This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
Core controller IP which provides statistics feature. The PMU is not a PCIe
Root Complex integrated End Point(RCiEP) device but only register counters
provided by each PCIe Root Port.
To facilitate collection of statistics the controller provides the
following two features for each Root Port:
- Time Based Analysis (RX/TX data throughput and time spent in each
low-power LTSSM state)
- Event counters (Error and Non-Error for lanes)
Note, only one counter for each type.
This driver adds a PMU device for each PCIe Root Port, and the PMU device is
named based on the BDF of the Root Port. For example,
10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
the PMU device name for this Root Port is pcie_bdf_100000.
Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
$# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
average RX bandwidth can be calculated like this:
PCIe RX Bandwidth = PCIE_RX_DATA * 16B / Measure_Time_Window
Signed-off-by: Shuai Xue <[email protected]>
---
drivers/perf/Kconfig | 7 +
drivers/perf/Makefile | 1 +
drivers/perf/dwc_pcie_pmu.c | 976 ++++++++++++++++++++++++++++++++++++
3 files changed, 984 insertions(+)
create mode 100644 drivers/perf/dwc_pcie_pmu.c
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 1e2d69453771..11ae99de5bbf 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -192,4 +192,11 @@ config MARVELL_CN10K_DDR_PMU
Enable perf support for Marvell DDR Performance monitoring
event on CN10K platform.
+config DWC_PCIE_PMU
+	tristate "Enable Synopsys DesignWare PCIe PMU Support"
+	depends on ARM64 || (COMPILE_TEST && 64BIT)
+	help
+	  Enable perf support for Synopsys DesignWare PCIe PMU Performance
+	  monitoring event on Yitian 710 platform.
+
endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 57a279c61df5..36f75cb0f320 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
+obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
new file mode 100644
index 000000000000..81e534be13fa
--- /dev/null
+++ b/drivers/perf/dwc_pcie_pmu.c
@@ -0,0 +1,976 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Synopsys DesignWare PCIe PMU driver
+ *
+ * Copyright (C) 2021, 2022 Alibaba Inc.
+ */
+
+#include <linux/pci.h>
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/cpuhotplug.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/smp.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+
+#define DRV_NAME "dwc_pcie_pmu"
+#define DEV_NAME "dwc_pcie_pmu"
+#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
+#define ATTRI_NAME_MAX_SIZE 32
+
+#define DWC_PCIE_VSEC_ID 0x02
+#define DWC_PCIE_VSEC_REV 0x04
+
+#define DWC_PCIE_LINK_CAPABILITIES_REG 0xC
+#define DWC_PCIE_LANE_SHIFT 4
+#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
+
+#define DWC_PCIE_EVENT_CNT_CTRL 0x8
+#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
+#define DWC_PCIE__CNT_EVENT_SELECT_MASK GENMASK(27, 16)
+#define DWC_PCIE__CNT_LANE_SELECT_SHIFT 8
+#define DWC_PCIE__CNT_LANE_SELECT_MASK GENMASK(11, 8)
+#define DWC_PCIE__CNT_STATUS_SHIFT 7
+#define DWC_PCIE__CNT_STATUS_MASK BIT(7)
+#define DWC_PCIE__CNT_ENABLE_SHIFT 2
+#define DWC_PCIE__CNT_ENABLE_MASK GENMASK(4, 2)
+#define DWC_PCIE_PER_EVENT_OFF (0x1 << DWC_PCIE__CNT_ENABLE_SHIFT)
+#define DWC_PCIE_PER_EVENT_ON (0x3 << DWC_PCIE__CNT_ENABLE_SHIFT)
+#define DWC_PCIE_EVENT_CLEAR_MASK GENMASK(1, 0)
+
+#define DWC_PCIE_EVENT_CNT_DATA 0xC
+
+#define DWC_PCIE_TIME_BASED_ANALYSIS_CTRL 0x10
+#define DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT 24
+#define DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK GENMASK(31, 24)
+#define DWC_PCIE__TIME_BASED_DURATION_SHIFT 8
+#define DWC_PCIE__TIME_BASED_DURATION_SELECT GENMASK(15, 8)
+#define DWC_PCIE_DURATION_MANUAL_CTRL 0x0
+#define DWC_PCIE_DURATION_1MS 0x1
+#define DWC_PCIE_DURATION_10MS 0x2
+#define DWC_PCIE_DURATION_100MS 0x3
+#define DWC_PCIE_DURATION_1S 0x4
+#define DWC_PCIE_DURATION_2S 0x5
+#define DWC_PCIE_DURATION_4S 0x6
+#define DWC_PCIE_DURATION_4US 0xff
+#define DWC_PCIE__TIME_BASED_COUNTER_ENABLE 1
+
+#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW 0x14
+#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH 0x18
+
+/* Event attributes */
+#define DWC_PCIE_CONFIG_EVENTID GENMASK(15, 0)
+#define DWC_PCIE_CONFIG_TYPE GENMASK(19, 16)
+#define DWC_PCIE_CONFIG_LANE GENMASK(27, 20)
+
+#define DWC_PCIE_EVENT_ID(event) FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
+#define DWC_PCIE_EVENT_TYPE(event) FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
+#define DWC_PCIE_EVENT_LANE(event) FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
+
+#define DWC_PCIE_PMU_HAS_REGISTER 1
+
+enum dwc_pcie_event_type {
+ DWC_PCIE_TYPE_INVALID,
+ DWC_PCIE_TIME_BASE_EVENT,
+ DWC_PCIE_LANE_EVENT,
+};
+
+struct dwc_event_counters {
+ const char name[32];
+ u32 event_id;
+};
+
+struct dwc_pcie_pmu {
+ struct hlist_node node;
+ unsigned int on_cpu;
+ struct pmu pmu;
+ struct device *dev;
+};
+
+struct dwc_pcie_info_table {
+ u32 bdf;
+ u32 cap_pos;
+ u32 num_lanes;
+ struct pci_dev *pdev;
+ struct dwc_pcie_pmu pcie_pmu;
+ u8 pmu_is_register;
+ struct perf_event *event;
+
+ struct dwc_pcie_event_attr *lane_event_attrs;
+ struct attribute **pcie_pmu_event_attrs;
+ struct attribute_group pcie_pmu_event_attrs_group;
+ const struct attribute_group *pcie_pmu_attr_groups[4];
+};
+
+struct dwc_pcie_pmu_priv {
+ struct device *dev;
+ u32 pcie_ctrl_num;
+ struct dwc_pcie_info_table *pcie_table;
+};
+
+#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func) \
+ (((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
+#define to_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
+
+static struct platform_device *dwc_pcie_pmu_dev;
+static char *event_attr_name = "events";
+
+/* sysfs "cpumask" show: report the single CPU this PMU's events run on. */
+static ssize_t dwc_pcie_pmu_cpumask_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
+	const struct cpumask *mask = cpumask_of(pcie_pmu->on_cpu);
+
+	return cpumap_print_to_pagebuf(true, buf, mask);
+}
+
+static struct device_attribute dwc_pcie_pmu_cpumask_attr =
+__ATTR(cpumask, 0444, dwc_pcie_pmu_cpumask_show, NULL);
+
+static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = {
+ &dwc_pcie_pmu_cpumask_attr.attr,
+ NULL
+};
+
+static struct attribute_group pcie_pmu_cpumask_attrs_group = {
+ .attrs = dwc_pcie_pmu_cpumask_attrs,
+};
+
+struct dwc_pcie_format_attr {
+ struct device_attribute attr;
+ u64 field;
+ int config;
+};
+
+/* sysfs "format" show: describe which config bits a format attr occupies. */
+static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct dwc_pcie_format_attr *fmt;
+	int lo, hi;
+
+	fmt = container_of(attr, typeof(*fmt), attr);
+	lo = __ffs(fmt->field);
+	hi = __fls(fmt->field);
+
+	if (lo == hi)
+		return snprintf(buf, PAGE_SIZE, "config:%d\n", lo);
+
+	if (fmt->config)
+		return snprintf(buf, PAGE_SIZE, "config%d:%d-%d\n",
+				fmt->config, lo, hi);
+
+	return snprintf(buf, PAGE_SIZE, "config:%d-%d\n", lo, hi);
+}
+
+#define _dwc_pcie_format_attr(_name, _cfg, _fld) \
+ (&((struct dwc_pcie_format_attr[]) {{ \
+ .attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL), \
+ .config = _cfg, \
+ .field = _fld, \
+ }})[0].attr.attr)
+
+#define dwc_pcie_format_attr(_name, _fld) _dwc_pcie_format_attr(_name, 0, _fld)
+
+static struct attribute *dwc_pcie_format_attrs[] = {
+ dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
+ dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
+ dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
+ NULL,
+};
+
+static struct attribute_group pcie_pmu_format_attrs_group = {
+ .name = "format",
+ .attrs = dwc_pcie_format_attrs,
+};
+
+struct dwc_pcie_event_attr {
+ struct device_attribute attr;
+ enum dwc_pcie_event_type type;
+ u16 eventid;
+ u8 lane;
+};
+
+/*
+ * sysfs show for event attributes: print the perf config fields the
+ * user should pass for this event.  Made static (file-local) and the
+ * non-lane branch gains its previously-missing trailing newline.
+ */
+static ssize_t dwc_pcie_event_show(struct device *dev,
+				   struct device_attribute *attr, char *page)
+{
+	struct dwc_pcie_event_attr *eattr;
+
+	eattr = container_of(attr, typeof(*eattr), attr);
+
+	if (eattr->type == DWC_PCIE_LANE_EVENT)
+		return sprintf(page, "eventid=0x%lx, type=0x%lx, lane=0x%lx\n",
+			       (unsigned long)eattr->eventid,
+			       (unsigned long)eattr->type,
+			       (unsigned long)eattr->lane);
+	else
+		return sprintf(page, "eventid=0x%lx, type=0x%lx\n",
+			       (unsigned long)eattr->eventid,
+			       (unsigned long)eattr->type);
+}
+
+#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane) \
+ (&((struct dwc_pcie_event_attr[]) {{ \
+ .attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL), \
+ .type = _type, \
+ .eventid = _eventid, \
+ .lane = _lane, \
+ }})[0].attr.attr)
+
+#define DWC_PCIE_PMU_BASE_TIME_ATTR(_name, _eventid) \
+ DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0)
+
+/*
+ * Time based analysis events.  Group #0 selects LTSSM-state residency
+ * reports; Group #1 selects data payload reports (units of 16 bytes).
+ */
+static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
+	/* Group #0 */
+	DWC_PCIE_PMU_BASE_TIME_ATTR(one_cycle, 0x00),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S, 0x01),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S, 0x02),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(L0, 0x03),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(L1, 0x04),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(L1_1, 0x05),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(L1_2, 0x06),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY, 0x07),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S, 0x08),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(L1_AUX, 0x09),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(ONE_cycle, 0x10),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S_, 0x11),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S_, 0x12),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(L0_, 0x13),
+	/*
+	 * NOTE(review): L1_ and CFG_RCVRY_ share event id 0x17 while
+	 * 0x14-0x16 are unused -- looks like a copy/paste slip; confirm
+	 * the ids against the controller databook.
+	 */
+	DWC_PCIE_PMU_BASE_TIME_ATTR(L1_, 0x17),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY_, 0x17),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S_, 0x18),
+	/* Group #1 */
+	DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
+	DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
+	NULL
+};
+
+/*
+ * All event attributes are unconditionally visible; just propagate the
+ * mode stored in the attribute itself.
+ */
+static inline umode_t pcie_pmu_event_attr_is_visible(struct kobject *kobj,
+						     struct attribute *attr,
+						     int unuse)
+{
+	return attr->mode;
+}
+
+/* True iff @pdev is a PCIe device whose port type is Root Port. */
+static inline bool pci_dev_is_rootport(struct pci_dev *pdev)
+{
+	if (!pci_is_pcie(pdev))
+		return false;
+
+	return pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT;
+}
+
+/* Pack segment/bus/device/function of @dev into the driver's BDF encoding. */
+static inline unsigned int dwc_pcie_get_bdf(struct pci_dev *dev)
+{
+	unsigned int seg = pci_domain_nr(dev->bus);
+	unsigned int bus = dev->bus->number;
+	unsigned int devno = PCI_SLOT(dev->devfn);
+	unsigned int func = PCI_FUNC(dev->devfn);
+
+	return DWC_PCIE_CREATE_BDF(seg, bus, devno, func);
+}
+
+/*
+ * Scan @pdev's extended config space for the Synopsys RAS D.E.S.
+ * Vendor Specific Extended Capability and return its offset via @pos.
+ *
+ * Returns 0 on success, -ENODEV if no matching VSEC is found.
+ * NOTE(review): matching an exact VSEC *revision* is fragile -- newer
+ * controller revisions would be skipped; confirm intent.
+ */
+static int dwc_pcie_find_ras_des_cap_position(struct pci_dev *pdev, int *pos)
+{
+	u32 header;
+	int vsec = 0;
+
+	while ((vsec = pci_find_next_ext_capability(pdev, vsec,
+						    PCI_EXT_CAP_ID_VNDR))) {
+		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &header);
+		/* Is the device part of a DesignWare Cores PCIe Controller ? */
+		if (PCI_VNDR_HEADER_ID(header) == DWC_PCIE_VSEC_ID &&
+		    PCI_VNDR_HEADER_REV(header) == DWC_PCIE_VSEC_REV) {
+			*pos = vsec;
+			return 0;
+		}
+	}
+
+	return -ENODEV;
+}
+
+/*
+ * Walk all PCI devices and record every Root Port that exposes the
+ * RAS D.E.S. capability, up to RP_NUM_MAX entries.
+ *
+ * Fixes vs. v1: alloc failure now returns -ENOMEM (was -EINVAL); the
+ * config read target is u32 (was int, a type mismatch for
+ * pci_read_config_dword()); the reference obtained by pci_get_device()
+ * is dropped if the table fills up mid-walk.
+ *
+ * NOTE(review): the pdev pointers stored in the table hold no
+ * reference of their own -- a hot-removed Root Port would leave a
+ * dangling pointer; confirm lifetime handling.
+ */
+static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
+{
+	int where, index = 0;
+	u32 val;
+	struct pci_dev *pdev = NULL;
+	struct dwc_pcie_info_table *pcie_info;
+
+	priv->pcie_table =
+		devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
+	if (!priv->pcie_table)
+		return -ENOMEM;
+
+	pcie_info = priv->pcie_table;
+	while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL) {
+		if (index >= RP_NUM_MAX) {
+			/* Drop the reference pci_get_device() just took. */
+			pci_dev_put(pdev);
+			break;
+		}
+
+		if (!pci_dev_is_rootport(pdev))
+			continue;
+
+		pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
+		pcie_info[index].pdev = pdev;
+
+		/* Only ports with the RAS D.E.S. capability are usable. */
+		if (dwc_pcie_find_ras_des_cap_position(pdev, &where))
+			continue;
+
+		pcie_info[index].cap_pos = where;
+
+		pci_read_config_dword(pdev,
+				      pdev->pcie_cap + DWC_PCIE_LINK_CAPABILITIES_REG,
+				      &val);
+		pcie_info[index].num_lanes =
+			(val & DWC_PCIE_LANE_MASK) >> DWC_PCIE_LANE_SHIFT;
+		index++;
+	}
+
+	if (!index)
+		return -ENODEV;
+
+	priv->pcie_ctrl_num = index;
+
+	return 0;
+}
+
+/* Read a 32-bit RAS D.E.S. register at offset @reg within the VSEC. */
+static inline int dwc_pcie_pmu_read_dword(struct dwc_pcie_info_table *pcie_info,
+					  u32 reg, u32 *val)
+{
+	return pci_read_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
+				     val);
+}
+
+/* Write a 32-bit RAS D.E.S. register at offset @reg within the VSEC. */
+static inline int dwc_pcie_pmu_write_dword(struct dwc_pcie_info_table
+					   *pcie_info, u32 reg, u32 val)
+{
+	return pci_write_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
+				      val);
+}
+
+/*
+ * Select which event the per-port lane counter counts.  The enable
+ * field is cleared first so the counter is not running while the event
+ * select changes.
+ *
+ * NOTE(review): @event_id is not masked against
+ * DWC_PCIE__CNT_EVENT_SELECT_MASK before shifting -- an out-of-range
+ * id would spill into adjacent fields; confirm callers stay in range.
+ */
+static int dwc_pcie_pmu_set_event_id(struct dwc_pcie_info_table *pcie_info,
+				     int event_id)
+{
+	int ret;
+	u32 val;
+
+	ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PCIe read fail\n");
+		return ret;
+	}
+
+	val &= ~DWC_PCIE__CNT_ENABLE_MASK;
+	val &= ~DWC_PCIE__CNT_EVENT_SELECT_MASK;
+	val |= event_id << DWC_PCIE__CNT_EVENT_SELECT_SHIFT;
+
+	ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
+	if (ret)
+		pci_err(pcie_info->pdev, "PCIe write fail\n");
+
+	return ret;
+}
+
+/*
+ * Select which lane the event counter observes.  Returns 0 on success
+ * or a negative config-access error code.
+ *
+ * Fix: @ret was declared u32 while the config accessors return a
+ * (possibly negative) int error code.
+ */
+static int dwc_pcie_pmu_write_event_lane(struct dwc_pcie_info_table *pcie_info,
+					 int lane, int event_id)
+{
+	int ret;
+	u32 val;
+
+	ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PCIe read fail\n");
+		return ret;
+	}
+
+	val &= ~DWC_PCIE__CNT_LANE_SELECT_MASK;
+	val |= lane << DWC_PCIE__CNT_LANE_SELECT_SHIFT;
+
+	ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
+	if (ret)
+		pci_err(pcie_info->pdev, "PCIe write fail\n");
+
+	return ret;
+}
+
+/*
+ * Turn the lane event counter on or off via the per-event enable
+ * field.  Returns 0 on success or a negative config-access error code.
+ *
+ * Fix: @ret was declared u32 while the config accessors return a
+ * (possibly negative) int error code.
+ */
+static int dwc_pcie_pmu_event_enable(struct dwc_pcie_info_table *pcie_info,
+				     u32 enable)
+{
+	int ret;
+	u32 val;
+
+	ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PCIe read fail\n");
+		return ret;
+	}
+
+	val &= ~DWC_PCIE__CNT_ENABLE_MASK;
+
+	if (enable)
+		val |= DWC_PCIE_PER_EVENT_ON;
+	else
+		val |= DWC_PCIE_PER_EVENT_OFF;
+
+	ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
+	if (ret)
+		pci_err(pcie_info->pdev, "PCIe write fail\n");
+
+	return ret;
+}
+
+/*
+ * Start or stop the time based analysis counter.  Returns 0 on success
+ * or a negative config-access error code.
+ *
+ * Fix: @ret was declared u32 while the config accessors return a
+ * (possibly negative) int error code.
+ */
+static int dwc_pcie_pmu_base_time_enable(struct dwc_pcie_info_table *pcie_info,
+					 u32 enable)
+{
+	int ret;
+	u32 val;
+
+	ret = dwc_pcie_pmu_read_dword(pcie_info,
+				      DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PCIe read fail\n");
+		return ret;
+	}
+
+	if (enable)
+		val |= DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
+	else
+		val &= ~DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
+
+	ret = dwc_pcie_pmu_write_dword(pcie_info,
+				       DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
+	if (ret)
+		pci_err(pcie_info->pdev, "PCIe write fail\n");
+
+	return ret;
+}
+
+/*
+ * Read the 32-bit lane event counter into @counter.  Returns 0 on
+ * success or a negative config-access error code.
+ *
+ * Fix: @ret was declared u32 while the config accessors return a
+ * (possibly negative) int error code.
+ */
+static int dwc_pcie_pmu_read_event_counter(struct dwc_pcie_info_table
+					   *pcie_info, u64 *counter)
+{
+	int ret;
+	u32 val;
+
+	ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_DATA, &val);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PCIe read fail\n");
+		return ret;
+	}
+	*counter = val;
+
+	return ret;
+}
+
+/*
+ * Read the 64-bit time based analysis counter (high word, then low
+ * word) into @counter.  Returns 0 on success or a negative
+ * config-access error code.
+ *
+ * Fix: @ret was declared u32 while the config accessors return a
+ * (possibly negative) int error code.
+ * NOTE(review): the two halves are read non-atomically; a carry
+ * between the reads can produce a torn value -- confirm whether the
+ * hardware latches the pair.
+ */
+static int dwc_pcie_pmu_read_base_time_counter(struct dwc_pcie_info_table
+					       *pcie_info, u64 *counter)
+{
+	int ret;
+	u32 val;
+
+	ret = dwc_pcie_pmu_read_dword(pcie_info,
+				      DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH,
+				      &val);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PCIe read fail\n");
+		return ret;
+	}
+
+	*counter = (u64)val << 32;
+
+	ret = dwc_pcie_pmu_read_dword(pcie_info,
+				      DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW,
+				      &val);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PCIe read fail\n");
+		return ret;
+	}
+
+	*counter |= val;
+
+	return ret;
+}
+
+/*
+ * Reset the lane event counter by writing into the event clear field.
+ * Returns 0 on success or a negative config-access error code.
+ *
+ * Fix: @ret was declared u32 while the config accessors return a
+ * (possibly negative) int error code.
+ */
+static int dwc_pcie_pmu_clear_event_counter(struct dwc_pcie_info_table
+					    *pcie_info)
+{
+	int ret;
+	u32 val;
+
+	ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PCIe read fail\n");
+		return ret;
+	}
+
+	val &= ~DWC_PCIE_EVENT_CLEAR_MASK;
+	val |= 1;	/* presumably the per-event clear command -- confirm */
+
+	ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
+	if (ret)
+		pci_err(pcie_info->pdev, "PCIe write fail\n");
+
+	return ret;
+}
+
+/*
+ * Program the time based analysis control register: select the report
+ * (event) to measure and put the duration under manual (software)
+ * control.  Returns 0 on success or a negative config-access error.
+ *
+ * Fixes: @ret was declared u32 while the config accessors return a
+ * (possibly negative) int error code; the DURATION_SELECT field was
+ * cleared twice.
+ */
+static int dwc_pcie_pmu_base_time_add_prepare(struct dwc_pcie_info_table
+					      *pcie_info, u32 event_id)
+{
+	int ret;
+	u32 val;
+
+	ret = dwc_pcie_pmu_read_dword(pcie_info,
+				      DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PCIe read fail\n");
+		return ret;
+	}
+
+	val &= ~DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK;
+	val |= event_id << DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT;
+
+	/*
+	 * TIME_BASED_ANALYSIS_DATA_REG is a 64 bit register, so it can
+	 * safely be used with a manually controlled duration.
+	 */
+	val &= ~DWC_PCIE__TIME_BASED_DURATION_SELECT;
+	val |= DWC_PCIE_DURATION_MANUAL_CTRL;
+
+	ret = dwc_pcie_pmu_write_dword(pcie_info,
+				       DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
+	if (ret)
+		pci_err(pcie_info->pdev, "PCIe write fail\n");
+
+	return ret;
+}
+
+/* Map the generic struct pmu back to its Root Port bookkeeping entry. */
+static struct dwc_pcie_info_table *pmu_to_pcie_info(struct pmu *pmu)
+{
+	struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
+
+	/*
+	 * container_of() on an embedded member can never yield NULL, so
+	 * the old NULL check (which itself dereferenced the pointer in
+	 * order to print an error) is dropped.
+	 */
+	return container_of(pcie_pmu, struct dwc_pcie_info_table, pcie_pmu);
+}
+
+/*
+ * Fold the current hardware count into event->count using the usual
+ * prev_count/cmpxchg scheme so concurrent readers cannot double count.
+ *
+ * Fix: @counter is now zero-initialized -- on an invalid event type
+ * (or a failed config read) it was previously used uninitialized.
+ */
+static void dwc_pcie_pmu_event_update(struct perf_event *event)
+{
+	struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+	u64 delta, prev, now;
+	u64 counter = 0;
+
+	do {
+		prev = local64_read(&hwc->prev_count);
+
+		if (type == DWC_PCIE_LANE_EVENT)
+			dwc_pcie_pmu_read_event_counter(pcie_info, &counter);
+		else if (type == DWC_PCIE_TIME_BASE_EVENT)
+			dwc_pcie_pmu_read_base_time_counter(pcie_info,
+							    &counter);
+		else
+			pci_err(pcie_info->pdev, "Input param is invalid\n");
+
+		now = counter;
+	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+	delta = now - prev;
+
+	local64_add(delta, &event->count);
+}
+
+/*
+ * perf core ->event_init: validate that the event targets this PMU, is
+ * CPU-bound and non-sampling, and is alone (or grouped only with
+ * software events) -- the hardware has a single counter per type.
+ *
+ * NOTE(review): the "Drive way" dev_dbg strings below are garbled
+ * English (presumably "driver only allows ...") and should be
+ * reworded.
+ */
+static int dwc_pcie_pmu_event_init(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct perf_event *sibling;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* Free-running counters: sampling is not supported. */
+	if (hwc->sample_period) {
+		dev_dbg(pcie_pmu->dev, "Sampling not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (event->cpu < 0) {
+		dev_dbg(pcie_pmu->dev, "Per-task mode not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* Uncore-style PMU: force all events onto the designated CPU. */
+	event->cpu = pcie_pmu->on_cpu;
+
+	if (event->group_leader != event &&
+	    !is_software_event(event->group_leader)) {
+		dev_dbg(pcie_pmu->dev, "Drive way only allow one event!\n");
+		return -EINVAL;
+	}
+
+	for_each_sibling_event(sibling, event->group_leader) {
+		if (sibling != event && !is_software_event(sibling)) {
+			dev_dbg(pcie_pmu->dev, "Drive way event not allowed!\n");
+			return -EINVAL;
+		}
+	}
+
+	hwc->idx = -1;
+
+	return 0;
+}
+
+/* Reset the event's software counter base before (re)starting it. */
+static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
+{
+	local64_set(&hwc->prev_count, 0);
+}
+
+/*
+ * perf core ->start: reset the software count base and enable the
+ * hardware counter matching the event's type.
+ */
+static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+
+	hwc->state = 0;
+	dwc_pcie_pmu_set_period(hwc);
+
+	if (type == DWC_PCIE_LANE_EVENT)
+		dwc_pcie_pmu_event_enable(pcie_info, 1);
+	else if (type == DWC_PCIE_TIME_BASE_EVENT)
+		dwc_pcie_pmu_base_time_enable(pcie_info, 1);
+	else
+		pci_err(pcie_info->pdev, "Input param is invalid\n");
+}
+
+/*
+ * perf core ->stop: disable the hardware counter and fold the final
+ * value into event->count.
+ *
+ * NOTE(review): PERF_HES_STOPPED/PERF_HES_UPTODATE are never set here,
+ * so the early return only helps if ->add set them -- compare with
+ * other uncore PMU drivers and confirm the state handling.
+ */
+static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
+{
+	struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
+	if (type == DWC_PCIE_LANE_EVENT)
+		dwc_pcie_pmu_event_enable(pcie_info, 0);
+	else if (type == DWC_PCIE_TIME_BASE_EVENT)
+		dwc_pcie_pmu_base_time_enable(pcie_info, 0);
+	else
+		pci_err(pcie_info->pdev, "Input param is invalid\n");
+
+	dwc_pcie_pmu_event_update(event);
+}
+
+/*
+ * perf core ->add: claim the single hardware counter of this Root Port
+ * and program it for the requested event.  Only one event can be
+ * active per port at a time, hence -ENOSPC when already busy.
+ *
+ * Fix: the invalid-type error path previously returned with
+ * pcie_info->event still set, permanently blocking the counter; it now
+ * releases the claim first.
+ */
+static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
+{
+	struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+	int event_id = DWC_PCIE_EVENT_ID(event);
+	int lane = DWC_PCIE_EVENT_LANE(event);
+
+	/* The single counter is already in use by another event. */
+	if (pcie_info->event)
+		return -ENOSPC;
+
+	pcie_info->event = event;
+
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (type == DWC_PCIE_LANE_EVENT) {
+		/* Disable, select lane and event, then clear the old count. */
+		dwc_pcie_pmu_event_enable(pcie_info, 0);
+		dwc_pcie_pmu_write_event_lane(pcie_info, lane, event_id);
+		dwc_pcie_pmu_set_event_id(pcie_info, event_id);
+		dwc_pcie_pmu_clear_event_counter(pcie_info);
+	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
+		dwc_pcie_pmu_base_time_enable(pcie_info, 0);
+		dwc_pcie_pmu_base_time_add_prepare(pcie_info, event_id);
+	} else {
+		pci_err(pcie_info->pdev, "Input param is invalid\n");
+		pcie_info->event = NULL;
+		return -EINVAL;
+	}
+
+	if (flags & PERF_EF_START)
+		dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD);
+
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+/* perf core ->del: stop the event, publish the final count, free the slot. */
+static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
+{
+	struct dwc_pcie_info_table *pcie_info;
+
+	pcie_info = pmu_to_pcie_info(event->pmu);
+	dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
+	perf_event_update_userpage(event);
+	pcie_info->event = NULL;
+}
+
+/* perf core ->read: refresh event->count from the hardware counter. */
+static void dwc_pcie_pmu_event_read(struct perf_event *event)
+{
+	dwc_pcie_pmu_event_update(event);
+}
+
+/*
+ * Per-lane countable events; event_id values follow the RAS D.E.S.
+ * event select encoding (0x6xx / 0x7xx groups).
+ * NOTE(review): "nulified" is presumably "nullified", but these
+ * strings become user-visible sysfs event names -- renaming them is an
+ * ABI change, so flagging rather than fixing here.
+ */
+static struct dwc_event_counters event_array[] = {
+	{"tx_ack_dllp", 0x600},
+	{"tx_update_fc_dllp", 0x601},
+	{"rx_ack_dllp", 0x602},
+	{"rx_update_fc_dllp", 0x603},
+	{"rx_nulified_tlp", 0x604},
+	{"tx_nulified_tlp", 0x605},
+	{"rx_duplicate_tlp", 0x606},
+	{"tx_memory_write", 0x700},
+	{"tx_memory_read", 0x701},
+	{"tx_configuration_write", 0x702},
+	{"tx_configuration_read", 0x703},
+	{"tx_io_write", 0x704},
+	{"tx_io_read", 0x705},
+	{"tx_completion_without_data", 0x706},
+	{"tx_completion_with_data", 0x707},
+	{"tx_message_tlp", 0x708},
+	{"tx_atomic", 0x709},
+	{"tx_tlp_with_prefix", 0x70A},
+	{"rx_memory_write", 0x70B},
+	{"rx_memory_read", 0x70C},
+	{"rx_io_write", 0x70F},
+	{"rx_io_read", 0x710},
+	{"rx_completion_without_data", 0x711},
+	{"rx_completion_with_data", 0x712},
+	{"rx_message_tlp", 0x713},
+	{"rx_atomic", 0x714},
+	{"rx_tlp_with_prefix", 0x715},
+	{"tx_ccix_tlp", 0x716},
+	{"rx_ccix_tlp", 0x717},
+};
+
+/*
+ * Build the sysfs attribute arrays for one Root Port PMU: one
+ * "<event>_lane<N>" attribute per lane event, followed by the shared
+ * time based events, plus the format and cpumask groups.
+ *
+ * Fixes vs. v1:
+ *  - names are now formatted with snprintf() straight into the
+ *    ATTRI_NAME_MAX_SIZE allocation; the old code strcat()'d into a
+ *    64-byte scratch buffer (declared const, written through a cast)
+ *    and then memcpy()'d all 64 bytes into a 32-byte allocation.
+ *  - dwc_pcie_pmu_time_event_attrs already ends in NULL, so the extra
+ *    NULL store (which wrote one element past the end of the
+ *    pcie_pmu_event_attrs allocation) is gone.
+ */
+static int dwc_pcie_pmu_attr_init(struct dwc_pcie_pmu_priv *priv,
+				  struct dwc_pcie_info_table *pcie_info)
+{
+	int i, j;
+	int events_per_lane;
+	int num_lane_events;
+	int time_base_count;
+	int num_attrs, attr_idx;
+	struct dwc_pcie_event_attr *lane_attrs;
+	struct attribute **pmu_attrs;
+
+	time_base_count = ARRAY_SIZE(dwc_pcie_pmu_time_event_attrs);
+	events_per_lane = ARRAY_SIZE(event_array);
+	num_lane_events = pcie_info->num_lanes * events_per_lane;
+	/* time_base_count already includes the NULL terminator slot. */
+	num_attrs = time_base_count + num_lane_events;
+
+	pcie_info->lane_event_attrs =
+		devm_kcalloc(priv->dev, num_lane_events,
+			     sizeof(struct dwc_pcie_event_attr),
+			     GFP_KERNEL);
+	if (!pcie_info->lane_event_attrs)
+		return -ENOMEM;
+	lane_attrs = pcie_info->lane_event_attrs;
+
+	pcie_info->pcie_pmu_event_attrs =
+		devm_kcalloc(priv->dev, num_attrs, sizeof(struct attribute *),
+			     GFP_KERNEL);
+	if (!pcie_info->pcie_pmu_event_attrs)
+		return -ENOMEM;
+	pmu_attrs = pcie_info->pcie_pmu_event_attrs;
+
+	attr_idx = 0;
+	for (i = 0; i < pcie_info->num_lanes; i++) {
+		for (j = 0; j < events_per_lane; j++) {
+			int pos = i * events_per_lane + j;
+			char *name;
+
+			name = devm_kzalloc(priv->dev, ATTRI_NAME_MAX_SIZE,
+					    GFP_KERNEL);
+			if (!name)
+				return -ENOMEM;
+			/* Bounded write: "<event>_lane<N>". */
+			snprintf(name, ATTRI_NAME_MAX_SIZE, "%s_lane%d",
+				 event_array[j].name, i);
+
+			lane_attrs[pos].attr.attr.name = name;
+			lane_attrs[pos].attr.attr.mode =
+				VERIFY_OCTAL_PERMISSIONS(0444);
+			lane_attrs[pos].attr.show = dwc_pcie_event_show;
+			lane_attrs[pos].attr.store = NULL;
+			lane_attrs[pos].type = DWC_PCIE_LANE_EVENT;
+			lane_attrs[pos].eventid = event_array[j].event_id;
+			lane_attrs[pos].lane = i;
+			pmu_attrs[attr_idx++] = &lane_attrs[pos].attr.attr;
+		}
+	}
+
+	/* Copies the time based events including their NULL terminator. */
+	for (i = 0; i < time_base_count; i++)
+		pmu_attrs[attr_idx++] = dwc_pcie_pmu_time_event_attrs[i];
+
+	pcie_info->pcie_pmu_event_attrs_group.name = event_attr_name;
+	pcie_info->pcie_pmu_event_attrs_group.is_visible =
+		pcie_pmu_event_attr_is_visible;
+	pcie_info->pcie_pmu_event_attrs_group.attrs =
+		pcie_info->pcie_pmu_event_attrs;
+
+	pcie_info->pcie_pmu_attr_groups[0] =
+		&pcie_info->pcie_pmu_event_attrs_group;
+	pcie_info->pcie_pmu_attr_groups[1] = &pcie_pmu_format_attrs_group;
+	pcie_info->pcie_pmu_attr_groups[2] = &pcie_pmu_cpumask_attrs_group;
+	pcie_info->pcie_pmu_attr_groups[3] = NULL;
+
+	return 0;
+}
+
+/*
+ * Set up and register one PMU for the Root Port described by
+ * @pcie_info: build its sysfs attributes, fill in the struct pmu
+ * callbacks and register it as "pcie_bdf_<bdf>".
+ *
+ * Fix: the old invalid-argument path called pci_err(pcie_info->pdev)
+ * in exactly the case where pcie_info may be NULL.
+ */
+static int __dwc_pcie_pmu_probe(struct dwc_pcie_pmu_priv *priv,
+				struct dwc_pcie_info_table *pcie_info)
+{
+	int ret;
+	char *name;
+	struct dwc_pcie_pmu *pcie_pmu;
+	struct device *dev;
+
+	if (!pcie_info || !pcie_info->pdev)
+		return -EINVAL;
+
+	pcie_pmu = &pcie_info->pcie_pmu;
+	dev = &pcie_info->pdev->dev;
+
+	ret = dwc_pcie_pmu_attr_init(priv, pcie_info);
+	if (ret) {
+		pci_err(pcie_info->pdev, "PMU attr init fail ret=%d\n", ret);
+		return ret;
+	}
+
+	pcie_pmu->dev = dev;
+	pcie_pmu->pmu = (struct pmu) {
+		.module = THIS_MODULE,
+		.task_ctx_nr = perf_invalid_context,
+		.pmu_enable = NULL,
+		.pmu_disable = NULL,
+		.event_init = dwc_pcie_pmu_event_init,
+		.add = dwc_pcie_pmu_event_add,
+		.del = dwc_pcie_pmu_event_del,
+		.start = dwc_pcie_pmu_event_start,
+		.stop = dwc_pcie_pmu_event_stop,
+		.read = dwc_pcie_pmu_event_read,
+		.attr_groups = pcie_info->pcie_pmu_attr_groups,
+		.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+	};
+
+	name = devm_kasprintf(priv->dev, GFP_KERNEL, "pcie_bdf_%x",
+			      pcie_info->bdf);
+	if (!name)
+		return -ENOMEM;
+
+	/*
+	 * Pick one CPU to be the preferred one to use.
+	 * NOTE(review): nothing migrates events if this CPU goes
+	 * offline -- a cpuhp notifier is needed; confirm.
+	 */
+	pcie_pmu->on_cpu = raw_smp_processor_id();
+
+	ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
+	if (ret) {
+		pci_err(pcie_info->pdev, "Error %d registering PMU @%x\n", ret,
+			pcie_info->bdf);
+		return ret;
+	}
+
+	pcie_info->pmu_is_register = DWC_PCIE_PMU_HAS_REGISTER;
+
+	return 0;
+}
+
+/* Platform driver ->remove: unregister every PMU that was registered. */
+static int dwc_pcie_pmu_remove(struct platform_device *pdev)
+{
+	struct dwc_pcie_pmu_priv *priv = platform_get_drvdata(pdev);
+	struct dwc_pcie_pmu *pcie_pmu;
+	int i;
+
+	for (i = 0; i < priv->pcie_ctrl_num; i++) {
+		if (!priv->pcie_table[i].pmu_is_register)
+			continue;
+
+		pcie_pmu = &priv->pcie_table[i].pcie_pmu;
+		perf_pmu_unregister(&pcie_pmu->pmu);
+	}
+
+	return 0;
+}
+
+/*
+ * Platform driver ->probe: enumerate DWC Root Ports carrying the RAS
+ * D.E.S. capability and register one PMU per port found.  Any partial
+ * registrations are rolled back on failure.
+ */
+static int dwc_pcie_pmu_probe(struct platform_device *pdev)
+{
+	int ret = 0;
+	int pcie_index;
+	struct dwc_pcie_pmu_priv *priv;
+
+	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+	priv->dev = &pdev->dev;
+	platform_set_drvdata(pdev, priv);
+
+	/* If the PMU is not supported on the current platform, keep silent. */
+	if (dwc_pcie_pmu_discover(priv))
+		return 0;
+
+	for (pcie_index = 0; pcie_index < priv->pcie_ctrl_num; pcie_index++) {
+		struct pci_dev *rp = priv->pcie_table[pcie_index].pdev;
+
+		ret = __dwc_pcie_pmu_probe(priv, &priv->pcie_table[pcie_index]);
+		if (ret) {
+			dev_err(&rp->dev, "PCIe PMU probe fail\n");
+			goto pmu_unregister;
+		}
+	}
+	dev_info(&pdev->dev, "PCIe PMUs registered\n");
+
+	return 0;
+
+pmu_unregister:
+	/* Roll back the PMUs that did register before the failure. */
+	dwc_pcie_pmu_remove(pdev);
+
+	return ret;
+}
+
+static struct platform_driver dwc_pcie_pmu_driver = {
+ .probe = dwc_pcie_pmu_probe,
+ .remove = dwc_pcie_pmu_remove,
+ .driver = {.name = DRV_NAME,},
+};
+
+/* Module init: register the platform driver, then a matching device. */
+static int __init dwc_pcie_pmu_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&dwc_pcie_pmu_driver);
+	if (ret)
+		return ret;
+
+	dwc_pcie_pmu_dev = platform_device_register_simple(DEV_NAME, -1,
+							   NULL, 0);
+	if (IS_ERR(dwc_pcie_pmu_dev)) {
+		platform_driver_unregister(&dwc_pcie_pmu_driver);
+		ret = PTR_ERR(dwc_pcie_pmu_dev);
+	}
+
+	return ret;
+}
+
+/* Module exit: tear down the device first, then the driver. */
+static void __exit dwc_pcie_pmu_exit(void)
+{
+	platform_device_unregister(dwc_pcie_pmu_dev);
+	platform_driver_unregister(&dwc_pcie_pmu_driver);
+}
+
+module_init(dwc_pcie_pmu_init);
+module_exit(dwc_pcie_pmu_exit);
+
+MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller");
+MODULE_AUTHOR("[email protected]");
+MODULE_AUTHOR("[email protected]");
+MODULE_LICENSE("GPL v2");
--
2.20.1.12.g72788fdb
On Sat, 17 Sep 2022 20:10:35 +0800
Shuai Xue <[email protected]> wrote:
> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> Core controller IP which provides statistics feature. The PMU is not a PCIe
> Root Complex integrated End Point(RCiEP) device but only register counters
> provided by each PCIe Root Port.
>
> To facilitate collection of statistics the controller provides the
> following two features for each Root Port:
>
> - Time Based Analysis (RX/TX data throughput and time spent in each
> low-power LTSSM state)
> - Event counters (Error and Non-Error for lanes)
>
> Note, only one counter for each type.
>
> This driver add PMU devices for each PCIe Root Port. And the PMU device is
> named based the BDF of Root Port. For example,
>
> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>
> the PMU device name for this Root Port is pcie_bdf_100000.
>
> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
>
> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>
> average RX bandwidth can be calculated like this:
>
> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
>
> Signed-off-by: Shuai Xue <[email protected]>
+CC linux-pci list and Bjorn.
Question in here which I've been meaning to address for other reasons
around how to register 'extra features' on pci ports.
This particular PMU is in config space in a Vendor Specific Extended
Capability.
I've focused on that aspect for this review rather than the perf parts.
We'll need to figure that story out first: doing this with a bus walk
triggered off a platform driver is not the way I'd expect to see
this work.
> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
> new file mode 100644
> index 000000000000..81e534be13fa
> --- /dev/null
> +++ b/drivers/perf/dwc_pcie_pmu.c
> @@ -0,0 +1,976 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Synopsys DesignWare PCIe PMU driver
> + *
> + * Copyright (C) 2021, 2022 Alibaba Inc.
> + */
> +?
> +#include <linux/pci.h>
> +#include <linux/bitfield.h>
> +#include <linux/bitops.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/perf_event.h>
> +#include <linux/platform_device.h>
> +#include <linux/smp.h>
> +#include <linux/sysfs.h>
> +#include <linux/types.h>
> +
> +#define DRV_NAME "dwc_pcie_pmu"
> +#define DEV_NAME "dwc_pcie_pmu"
Put these strings where they are used. That's where people will look for them...
> +#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
This driver is 'almost' generic. So if you an avoid defines based on a particular
platform that's definitely good!
> +#define ATTRI_NAME_MAX_SIZE 32
> +
> +#define DWC_PCIE_VSEC_ID 0x02
> +#define DWC_PCIE_VSEC_REV 0x04
I wouldn't define the REV like this. Put the number inline so we
can clearly see this is revision 4. VSEC_ID won't change so a
define for that is fine.
> +
> +#define DWC_PCIE_LINK_CAPABILITIES_REG 0xC
This is PCIE spec defined. Put these in a common header.
> +#define DWC_PCIE_LANE_SHIFT 4
> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
> +
> +#define DWC_PCIE_EVENT_CNT_CTRL 0x8
> +#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
Why double __? If point is to separate register from fields, then
naming works better
DWC_PCIE_EVENT_CNT_CTRL_REG
DWC_PCIE_EVENT_CNT_CTRL_EV_SELECT_MSK etc
> +#define DWC_PCIE__CNT_EVENT_SELECT_MASK GENMASK(27, 16)
> +#define DWC_PCIE__CNT_LANE_SELECT_SHIFT 8
> +#define DWC_PCIE__CNT_LANE_SELECT_MASK GENMASK(11, 8)
> +#define DWC_PCIE__CNT_STATUS_SHIFT 7
> +#define DWC_PCIE__CNT_STATUS_MASK BIT(7)
> +#define DWC_PCIE__CNT_ENABLE_SHIFT 2
With FIELD_PREP() / FIELD_GET() you should never need to define the shifts.
They will be extracted from the masks as needed.
> +#define DWC_PCIE__CNT_ENABLE_MASK GENMASK(4, 2)
> +#define DWC_PCIE_PER_EVENT_OFF (0x1 << DWC_PCIE__CNT_ENABLE_SHIFT)
FIELD_PREP() / FIELD_GET() combined with defines for the values.
#define DWC_PCIE_CNT_ENABLE_MASK ...
> +#define DWC_PCIE_PER_EVENT_ON (0x3 << DWC_PCIE__CNT_ENABLE_SHIFT)
> +#define DWC_PCIE_EVENT_CLEAR_MASK GENMASK(1, 0)
> +
> +#define DWC_PCIE_EVENT_CNT_DATA 0xC
> +
> +#define DWC_PCIE_TIME_BASED_ANALYSIS_CTRL 0x10
> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT 24
> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK GENMASK(31, 24)
> +#define DWC_PCIE__TIME_BASED_DURATION_SHIFT 8
> +#define DWC_PCIE__TIME_BASED_DURATION_SELECT GENMASK(15, 8)
> +#define DWC_PCIE_DURATION_MANUAL_CTRL 0x0
> +#define DWC_PCIE_DURATION_1MS 0x1
> +#define DWC_PCIE_DURATION_10MS 0x2
> +#define DWC_PCIE_DURATION_100MS 0x3
> +#define DWC_PCIE_DURATION_1S 0x4
> +#define DWC_PCIE_DURATION_2S 0x5
> +#define DWC_PCIE_DURATION_4S 0x6
> +#define DWC_PCIE_DURATION_4US 0xff
> +#define DWC_PCIE__TIME_BASED_COUNTER_ENABLE 1
> +
> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW 0x14
> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH 0x18
> +
> +/* Event attributes */
> +#define DWC_PCIE_CONFIG_EVENTID GENMASK(15, 0)
> +#define DWC_PCIE_CONFIG_TYPE GENMASK(19, 16)
> +#define DWC_PCIE_CONFIG_LANE GENMASK(27, 20)
> +
> +#define DWC_PCIE_EVENT_ID(event) FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
> +#define DWC_PCIE_EVENT_TYPE(event) FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
> +#define DWC_PCIE_EVENT_LANE(event) FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
> +
> +#define DWC_PCIE_PMU_HAS_REGISTER 1
> +
> +enum dwc_pcie_event_type {
> + DWC_PCIE_TYPE_INVALID,
> + DWC_PCIE_TIME_BASE_EVENT,
> + DWC_PCIE_LANE_EVENT,
> +};
> +
> +struct dwc_event_counters {
> + const char name[32];
> + u32 event_id;
> +};
> +
> +struct dwc_pcie_pmu {
> + struct hlist_node node;
> + unsigned int on_cpu;
> + struct pmu pmu;
> + struct device *dev;
> +};
> +
> +struct dwc_pcie_info_table {
> + u32 bdf;
> + u32 cap_pos;
> + u32 num_lanes;
> + struct pci_dev *pdev;
> + struct dwc_pcie_pmu pcie_pmu;
> + u8 pmu_is_register;
> + struct perf_event *event;
> +
> + struct dwc_pcie_event_attr *lane_event_attrs;
> + struct attribute **pcie_pmu_event_attrs;
> + struct attribute_group pcie_pmu_event_attrs_group;
> + const struct attribute_group *pcie_pmu_attr_groups[4];
> +};
> +
> +struct dwc_pcie_pmu_priv {
> + struct device *dev;
> + u32 pcie_ctrl_num;
> + struct dwc_pcie_info_table *pcie_table;
> +};
> +
> +#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func) \
> + (((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
Superficially this looks pretty standard. Why is it DWC specific?
> +#define to_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
Prefix that name. I'm hopeful we'll have a PCI SIG defined PMU one
day and when we do that macro belongs to that!
to_dwc_pcie_pmu() is possibly fine.
> +
> +static struct platform_device *dwc_pcie_pmu_dev;
> +static char *event_attr_name = "events";
> +
...
> +
> +static int dwc_pcie_find_ras_des_cap_position(struct pci_dev *pdev, int *pos)
> +{
> + u32 header;
> + int vsec = 0;
> +
> + while ((vsec = pci_find_next_ext_capability(pdev, vsec,
> + PCI_EXT_CAP_ID_VNDR))) {
This probably belongs in the PCI core in a similar fashion to the DVSEC
helper.
> + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &header);
> + /* Is the device part of a DesignWare Cores PCIe Controller ? */
Good question... This code doesn't check that. VSEC ID is matched only with
the Vendor ID of the devices - unlike DVSEC where this would all be nice
and local.
> + if (PCI_VNDR_HEADER_ID(header) == DWC_PCIE_VSEC_ID &&
> + PCI_VNDR_HEADER_REV(header) == DWC_PCIE_VSEC_REV) {
> + *pos = vsec;
> + return 0;
> + }
> + }
> +
> + return -ENODEV;
> +}
> +
> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
> +{
> + int val, where, index = 0;
> + struct pci_dev *pdev = NULL;
> + struct dwc_pcie_info_table *pcie_info;
> +
> + priv->pcie_table =
> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
> + if (!priv->pcie_table)
> + return -EINVAL;
> +
> + pcie_info = priv->pcie_table;
> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
> + index < RP_NUM_MAX) {
Having a driver that then walks the PCI topology to find root ports and adds
extra stuff to them is not a clean solution.
The probing should be driven from the existing PCI driver topology.
There are a bunch of new features we need to add to ports in the near future
anyway - this would just be another one.
Same problem exists for CXL CPMU perf devices - so far we only support those
on end points, partly because we need a clean way to probe them on pci ports.
Whatever we come up with there will apply here as well.
> + if (!pci_dev_is_rootport(pdev))
> + continue;
> +
> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
> + pcie_info[index].pdev = pdev;
Probably want a sanity check that this has a vendor ID appropriate to the VSEC you are
about to look for.
> +
> + if (dwc_pcie_find_ras_des_cap_position(pdev, &where))
> + continue;
> +
> + pcie_info[index].cap_pos = where;
> +
> + pci_read_config_dword(pdev,
> + pdev->pcie_cap + DWC_PCIE_LINK_CAPABILITIES_REG,
> + &val);
> + pcie_info[index].num_lanes =
> + (val & DWC_PCIE_LANE_MASK) >> DWC_PCIE_LANE_SHIFT;
FIELD_GET()
> + index++;
> + }
> +
> + if (!index)
> + return -ENODEV;
> +
> + priv->pcie_ctrl_num = index;
> +
> + return 0;
> +}
> +
> +static inline int dwc_pcie_pmu_read_dword(struct dwc_pcie_info_table *pcie_info,
> + u32 reg, u32 *val)
> +{
> + return pci_read_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
> + val);
> +}
> +
> +static inline int dwc_pcie_pmu_write_dword(struct dwc_pcie_info_table
> + *pcie_info, u32 reg, u32 val)
> +{
> + return pci_write_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
> + val);
> +}
These two wrappers don't add a lot so I would drop them.
> +
> +static int dwc_pcie_pmu_set_event_id(struct dwc_pcie_info_table *pcie_info,
> + int event_id)
> +{
> + int ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + val &= ~DWC_PCIE__CNT_ENABLE_MASK;
> + val &= ~DWC_PCIE__CNT_EVENT_SELECT_MASK;
> + val |= event_id << DWC_PCIE__CNT_EVENT_SELECT_SHIFT;
FIELD_PREP()
> +
> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
> + if (ret)
> + pci_err(pcie_info->pdev, "PCIe write fail\n");
> +
> + return ret;
> +}
...
> +
> +static int dwc_pcie_pmu_read_base_time_counter(struct dwc_pcie_info_table
> + *pcie_info, u64 *counter)
> +{
> + u32 ret, val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info,
> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH,
> + &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + *counter = val;
> + *counter <<= 32;
This looks like you could get tearing between the upper and lower dwords.
What prevents that? Perhaps a comment to say why that's not a problem?
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info,
> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW,
> + &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + *counter += val;
> +
> + return ret;
> +}
...
> +static int __dwc_pcie_pmu_probe(struct dwc_pcie_pmu_priv *priv,
> + struct dwc_pcie_info_table *pcie_info)
> +{
> + int ret;
> + char *name;
> + struct dwc_pcie_pmu *pcie_pmu;
> + struct device *dev;
> +
> + if (!pcie_info || !pcie_info->pdev) {
> + pci_err(pcie_info->pdev, "Input parameter is invalid\n");
> + return -EINVAL;
> + }
> +
> + pcie_pmu = &pcie_info->pcie_pmu;
> + dev = &pcie_info->pdev->dev;
> +
> + ret = dwc_pcie_pmu_attr_init(priv, pcie_info);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PMU attr init fail ret=%d\n", ret);
> + return ret;
> + }
> +
> + pcie_pmu->dev = dev;
> + pcie_pmu->pmu = (struct pmu) {
> + .module = THIS_MODULE,
> + .task_ctx_nr = perf_invalid_context,
> + .pmu_enable = NULL,
> + .pmu_disable = NULL,
> + .event_init = dwc_pcie_pmu_event_init,
> + .add = dwc_pcie_pmu_event_add,
> + .del = dwc_pcie_pmu_event_del,
> + .start = dwc_pcie_pmu_event_start,
> + .stop = dwc_pcie_pmu_event_stop,
> + .read = dwc_pcie_pmu_event_read,
> + .attr_groups = pcie_info->pcie_pmu_attr_groups,
> + .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
> + };
> +
> + name = devm_kasprintf(priv->dev, GFP_KERNEL, "pcie_bdf_%x",
> + pcie_info->bdf);
> + if (!name)
> + return -ENOMEM;
> +
> + /* Pick one CPU to be the preferred one to use */
> + pcie_pmu->on_cpu = raw_smp_processor_id();
Above there are references to multiple dies. Maybe at least make sure you
are on a nearby die? (I'm guessing at topology!)
> +
> + ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
> + if (ret) {
> + pci_err(pcie_info->pdev, "Error %d registering PMU @%x\n", ret,
> + pcie_info->bdf);
> + return ret;
> + }
> +
> + pcie_info->pmu_is_register = DWC_PCIE_PMU_HAS_REGISTER;
As below. I think you can drop this state info.
> +
> + return ret;
> +}
> +
> +static int dwc_pcie_pmu_remove(struct platform_device *pdev)
> +{
> + struct dwc_pcie_pmu_priv *priv = platform_get_drvdata(pdev);
> + int index;
> + struct dwc_pcie_pmu *pcie_pmu;
> +
> + for (index = 0; index < priv->pcie_ctrl_num; index++)
> + if (priv->pcie_table[index].pmu_is_register) {
> + pcie_pmu = &priv->pcie_table[index].pcie_pmu;
> + perf_pmu_unregister(&pcie_pmu->pmu);
> + }
> + return 0;
> +}
> +
> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
> +{
> + int ret = 0;
Initialized in all paths where it is used. Compiler should be able to tell
that so I doubt you need this to be set to 0 here.
> + int pcie_index;
> + struct dwc_pcie_pmu_priv *priv;
> +
> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
> + if (!priv)
> + return -ENOMEM;
> + priv->dev = &pdev->dev;
> + platform_set_drvdata(pdev, priv);
> +
> + /* If PMU is not support on current platform, keep slient */
> + if (dwc_pcie_pmu_discover(priv))
> + return 0;
> +
> + for (pcie_index = 0; pcie_index < priv->pcie_ctrl_num; pcie_index++) {
> + struct pci_dev *rp = priv->pcie_table[pcie_index].pdev;
> +
> + ret = __dwc_pcie_pmu_probe(priv, &priv->pcie_table[pcie_index]);
> + if (ret) {
> + dev_err(&rp->dev, "PCIe PMU probe fail\n");
> + goto pmu_unregister;
> + }
> + }
> + dev_info(&pdev->dev, "PCIe PMUs registered\n");
Noise in the logs. There are lots of ways to know if we reached this point
so this adds no value.
> +
> + return 0;
> +
> +pmu_unregister:
> + dwc_pcie_pmu_remove(pdev);
I'd much rather see the unwind here directly so we can clearly see that it undoes
the result of errors in this function. That removes the need to use the
is_registered flag in the remove() function simplifying that flow as well.
> +
> + return ret;
> +}
> +
> +static struct platform_driver dwc_pcie_pmu_driver = {
> + .probe = dwc_pcie_pmu_probe,
> + .remove = dwc_pcie_pmu_remove,
> + .driver = {.name = DRV_NAME,},
More common to format as
.driver = {
.name = "dwc_pcie_pmu",
},
};
Note use of string here. Using a define just forces people to
look for this in the wrong place.
> +};
> +
> +static int __init dwc_pcie_pmu_init(void)
> +{
> + int ret;
> +
> + ret = platform_driver_register(&dwc_pcie_pmu_driver);
> +
> + if (ret)
> + return ret;
> +
> + dwc_pcie_pmu_dev =
> + platform_device_register_simple(DEV_NAME, -1, NULL, 0);
I'd normally expect to see the device created as a result of firmware
description (ACPI DSDT / or Device tree)
It is unusual to create a 'real' device directly in the driver
init - that's normally reserved for various fake / software devices.
> + if (IS_ERR(dwc_pcie_pmu_dev)) {
> + platform_driver_unregister(&dwc_pcie_pmu_driver);
> + return PTR_ERR(dwc_pcie_pmu_dev);
> + }
> +
> + return 0;
> +}
> +
> +static void __exit dwc_pcie_pmu_exit(void)
> +{
> + platform_device_unregister(dwc_pcie_pmu_dev);
> + platform_driver_unregister(&dwc_pcie_pmu_driver);
> +}
> +
> +module_init(dwc_pcie_pmu_init);
> +module_exit(dwc_pcie_pmu_exit);
> +
> +MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller");
> +MODULE_AUTHOR("[email protected]");
> +MODULE_AUTHOR("[email protected]");
> +MODULE_LICENSE("GPL v2");
On Thu, Sep 22, 2022 at 04:58:20PM +0100, Jonathan Cameron wrote:
> On Sat, 17 Sep 2022 20:10:35 +0800
> Shuai Xue <[email protected]> wrote:
>
> > This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> > for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> > Core controller IP which provides statistics feature. The PMU is not a PCIe
> > Root Complex integrated End Point(RCiEP) device but only register counters
> > provided by each PCIe Root Port.
> >
> > To facilitate collection of statistics the controller provides the
> > following two features for each Root Port:
> >
> > - Time Based Analysis (RX/TX data throughput and time spent in each
> > low-power LTSSM state)
> > - Event counters (Error and Non-Error for lanes)
> >
> > Note, only one counter for each type.
> >
> > This driver add PMU devices for each PCIe Root Port. And the PMU device is
> > named based the BDF of Root Port. For example,
> >
> > 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
> >
> > the PMU device name for this Root Port is pcie_bdf_100000.
> >
> > Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
> >
> > $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
> >
> > average RX bandwidth can be calculated like this:
> >
> > PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
> >
> > Signed-off-by: Shuai Xue <[email protected]>
>
> +CC linux-pci list and Bjorn.
Thanks, this is definitely of interest to linux-pci.
> Question in here which I've been meaning to address for other reasons
> around how to register 'extra features' on pci ports.
>
> This particular PMU is in config space in a Vendor Specific Extended
> Capability.
>
> I've focused on that aspect for this review rather than the perf parts.
> We'll need to figure that story out first as doing this from a bus walk
> makes triggered of a platform driver is not the way I'd expect to see
> this work.
> > +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
> > +{
> > + int val, where, index = 0;
> > + struct pci_dev *pdev = NULL;
> > + struct dwc_pcie_info_table *pcie_info;
> > +
> > + priv->pcie_table =
> > + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
> > + if (!priv->pcie_table)
> > + return -EINVAL;
> > +
> > + pcie_info = priv->pcie_table;
> > + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
> > + index < RP_NUM_MAX) {
>
> This having a driver than then walks the pci topology to find root ports and add
> extra stuff to them is not a clean solution.
>
> The probing should be driven from the existing PCI driver topology.
> There are a bunch of new features we need to add to ports in the near future
> anyway - this would just be another one.
> Same problem exists for CXL CPMU perf devices - so far we only support those
> on end points, partly because we need a clean way to probe them on pci ports.
>
> Whatever we come up with there will apply here as well.
I agree, I don't like to see more uses of pci_get_device() because it
doesn't fit the driver model at all. For one thing, it really screws
up the hotplug model because this doesn't account for hot-added
devices and there's no clear cleanup path for removal.
Hotplug is likely not an issue in this particular case, but it gets
copied to places where it is an issue.
Maybe we need some kind of PCI core interface whereby drivers can
register their interest in VSEC and/or DVSEC capabilities.
Bjorn
[+cc linux-pci]
On Sat, Sep 17, 2022 at 08:10:35PM +0800, Shuai Xue wrote:
> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> Core controller IP which provides statistics feature. The PMU is not a PCIe
> Root Complex integrated End Point(RCiEP) device but only register counters
> provided by each PCIe Root Port.
>
> To facilitate collection of statistics the controller provides the
> following two features for each Root Port:
>
> - Time Based Analysis (RX/TX data throughput and time spent in each
> low-power LTSSM state)
> - Event counters (Error and Non-Error for lanes)
>
> Note, only one counter for each type.
>
> This driver add PMU devices for each PCIe Root Port. And the PMU device is
> named based the BDF of Root Port. For example,
>
> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>
> the PMU device name for this Root Port is pcie_bdf_100000.
>
> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
>
> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>
> average RX bandwidth can be calculated like this:
>
> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
>
> Signed-off-by: Shuai Xue <[email protected]>
> +++ b/drivers/perf/dwc_pcie_pmu.c
> ...
> +#define DWC_PCIE_VSEC_ID 0x02
I don't think DWC_PCIE_VSEC_ID is a very good name because it doesn't
tell us anything about the purpose of the capability. Something like
DWC_PCIE_RAS_DES_VSEC_ID would be more useful to readers.
> +#define DWC_PCIE_LINK_CAPABILITIES_REG 0xC
> +#define DWC_PCIE_LANE_SHIFT 4
> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
Shouldn't need these at all; see below.
> +struct dwc_pcie_info_table {
> + u32 bdf;
> + u32 cap_pos;
Would be useful to name this "ras_des" or similar so we have a hint
about what we're reading/writing when using "pcie_info->cap_pos" below.
> +static struct device_attribute dwc_pcie_pmu_cpumask_attr =
> +__ATTR(cpumask, 0444, dwc_pcie_pmu_cpumask_show, NULL);
DEVICE_ATTR_RO()?
> +#define _dwc_pcie_format_attr(_name, _cfg, _fld) \
> + (&((struct dwc_pcie_format_attr[]) {{ \
> + .attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL), \
Ditto.
> +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane) \
> + (&((struct dwc_pcie_event_attr[]) {{ \
> + .attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL), \
Ditto.
> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
> +{
> + int val, where, index = 0;
> + struct pci_dev *pdev = NULL;
> + struct dwc_pcie_info_table *pcie_info;
> +
> + priv->pcie_table =
> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
> + if (!priv->pcie_table)
> + return -EINVAL;
> +
> + pcie_info = priv->pcie_table;
> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
> + index < RP_NUM_MAX) {
> + if (!pci_dev_is_rootport(pdev))
> + continue;
> +
> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
> + pcie_info[index].pdev = pdev;
> +
> + if (dwc_pcie_find_ras_des_cap_position(pdev, &where))
> + continue;
> +
> + pcie_info[index].cap_pos = where;
> +
> + pci_read_config_dword(pdev,
> + pdev->pcie_cap + DWC_PCIE_LINK_CAPABILITIES_REG,
> + &val);
> + pcie_info[index].num_lanes =
> + (val & DWC_PCIE_LANE_MASK) >> DWC_PCIE_LANE_SHIFT;
I think you can use pcie_get_width_cap() here.
> +static int dwc_pcie_pmu_set_event_id(struct dwc_pcie_info_table *pcie_info,
> + int event_id)
> +{
> + int ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
Maybe #define dev_fmt above to add a prefix to these messages?
Otherwise I think they will look like:
pcieport 0000:00:1c.0: PCIe read fail
which suggests it's related to pcieport, but that's the wrong place to
look.
I think every caller of dwc_pcie_pmu_read_dword() makes the same check
and prints the same message; maybe the message should be moved inside
dwc_pcie_pmu_read_dword()?
Same with dwc_pcie_pmu_write_dword(); moving the message there would
simplify all callers.
> +static int dwc_pcie_pmu_event_enable(struct dwc_pcie_info_table *pcie_info,
> + u32 enable)
> +{
> + u32 ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + val &= ~(DWC_PCIE__CNT_ENABLE_MASK);
Superfluous parens.
> +static int dwc_pcie_pmu_base_time_add_prepare(struct dwc_pcie_info_table
> + *pcie_info, u32 event_id)
> +{
> + u32 ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info,
> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + val &= ~DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK;
> + val |= event_id << DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT;
> + val &= ~DWC_PCIE__TIME_BASED_DURATION_SELECT;
> +
> + /*
> + * TIME_BASED_ANALYSIS_DATA_REG is a 64 bit register, we can safely
> + * use it with any manually controllered duration.
s/controllered/controlled/ ? Not sure what this means. Maybe that
64 bits is wide enough you don't need to worry about rollover?
> +static struct dwc_pcie_info_table *pmu_to_pcie_info(struct pmu *pmu)
> +{
> + struct dwc_pcie_info_table *pcie_info;
> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
> +
> + pcie_info = container_of(pcie_pmu, struct dwc_pcie_info_table, pcie_pmu);
> + if (pcie_info == NULL)
> + pci_err(pcie_info->pdev, "Can't get pcie info\n");
It shouldn't be possible to get here for a pmu with no pcie_info, and
callers don't check for a NULL pointer return value before
dereferencing it, so I guess all this adds is an error message before
a NULL pointer oops? Not sure the code clutter is worth it.
> + return pcie_info;
> +}
> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
> + struct perf_event *sibling;
> +
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + if (hwc->sample_period) {
> + dev_dbg(pcie_pmu->dev, "Sampling not supported\n");
> + return -EOPNOTSUPP;
> + }
> +
> + if (event->cpu < 0) {
> + dev_dbg(pcie_pmu->dev, "Per-task mode not supported\n");
> + return -EOPNOTSUPP;
> + }
> +
> + event->cpu = pcie_pmu->on_cpu;
> +
> + if (event->group_leader != event &&
> + !is_software_event(event->group_leader)) {
> + dev_dbg(pcie_pmu->dev, "Drive way only allow one event!\n");
"Drive way"? -ENOPARSE for me :)
> + return -EINVAL;
> + }
> +
> + for_each_sibling_event(sibling, event->group_leader) {
> + if (sibling != event && !is_software_event(sibling)) {
> + dev_dbg(pcie_pmu->dev, "Drive way event not allowed!\n");
> + return -EINVAL;
> + }
> + }
> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
> +{
> + u64 new = 0;
Superfluous variable.
> + local64_set(&hwc->prev_count, new);
> +}
> +static int __dwc_pcie_pmu_probe(struct dwc_pcie_pmu_priv *priv,
> + struct dwc_pcie_info_table *pcie_info)
> +{
> + int ret;
> + char *name;
> + struct dwc_pcie_pmu *pcie_pmu;
> + struct device *dev;
> +
> + if (!pcie_info || !pcie_info->pdev) {
> + pci_err(pcie_info->pdev, "Input parameter is invalid\n");
There are a lot of "Input parameter is invalid" messages. If somebody
sees that, there's no hint about which one to look at. Messages that
are constant strings are usually a hint that they could include more
information.
> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
> +{
> + int ret = 0;
> + int pcie_index;
> + struct dwc_pcie_pmu_priv *priv;
> +
> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
> + if (!priv)
> + return -ENOMEM;
> + priv->dev = &pdev->dev;
> + platform_set_drvdata(pdev, priv);
> +
> + /* If PMU is not support on current platform, keep slient */
s/not support/not supported/
s/slient/silent/
Bjorn
On 2022/9/17 20:10, Shuai Xue wrote:
> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> Core controller IP which provides statistics feature. The PMU is not a PCIe
> Root Complex integrated End Point(RCiEP) device but only register counters
> provided by each PCIe Root Port.
>
> To facilitate collection of statistics the controller provides the
> following two features for each Root Port:
>
> - Time Based Analysis (RX/TX data throughput and time spent in each
> low-power LTSSM state)
> - Event counters (Error and Non-Error for lanes)
>
> Note, only one counter for each type.
>
> This driver add PMU devices for each PCIe Root Port. And the PMU device is
> named based the BDF of Root Port. For example,
>
> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>
> the PMU device name for this Root Port is pcie_bdf_100000.
>
> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
>
> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>
> average RX bandwidth can be calculated like this:
>
> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
>
> Signed-off-by: Shuai Xue <[email protected]>
> ---
> drivers/perf/Kconfig | 7 +
> drivers/perf/Makefile | 1 +
> drivers/perf/dwc_pcie_pmu.c | 976 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 984 insertions(+)
> create mode 100644 drivers/perf/dwc_pcie_pmu.c
>
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index 1e2d69453771..11ae99de5bbf 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -192,4 +192,11 @@ config MARVELL_CN10K_DDR_PMU
> Enable perf support for Marvell DDR Performance monitoring
> event on CN10K platform.
>
> +config CONFIG_DWC_PCIE_PMU
> + tristate "Enable Synopsys DesignWare PCIe PMU Support"
> + depends on ARM64 || (COMPILE_TEST && 64BIT)
> + help
> + Enable perf support for Synopsys DesignWare PCIe PMU Performance
> + monitoring event on Yitan 710 platform.
> +
> endmenu
> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index 57a279c61df5..36f75cb0f320 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -20,3 +20,4 @@ obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
> obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
> obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
> obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
> +obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
> new file mode 100644
> index 000000000000..81e534be13fa
> --- /dev/null
> +++ b/drivers/perf/dwc_pcie_pmu.c
> @@ -0,0 +1,976 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Synopsys DesignWare PCIe PMU driver
> + *
> + * Copyright (C) 2021, 2022 Alibaba Inc.
> + */
> +
> +#include <linux/pci.h>
> +#include <linux/bitfield.h>
> +#include <linux/bitops.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/perf_event.h>
> +#include <linux/platform_device.h>
> +#include <linux/smp.h>
> +#include <linux/sysfs.h>
> +#include <linux/types.h>
> +
> +#define DRV_NAME "dwc_pcie_pmu"
> +#define DEV_NAME "dwc_pcie_pmu"
> +#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
> +#define ATTRI_NAME_MAX_SIZE 32
> +
> +#define DWC_PCIE_VSEC_ID 0x02
> +#define DWC_PCIE_VSEC_REV 0x04
> +
> +#define DWC_PCIE_LINK_CAPABILITIES_REG 0xC
> +#define DWC_PCIE_LANE_SHIFT 4
> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
> +
> +#define DWC_PCIE_EVENT_CNT_CTRL 0x8
> +#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
> +#define DWC_PCIE__CNT_EVENT_SELECT_MASK GENMASK(27, 16)
> +#define DWC_PCIE__CNT_LANE_SELECT_SHIFT 8
> +#define DWC_PCIE__CNT_LANE_SELECT_MASK GENMASK(11, 8)
> +#define DWC_PCIE__CNT_STATUS_SHIFT 7
> +#define DWC_PCIE__CNT_STATUS_MASK BIT(7)
> +#define DWC_PCIE__CNT_ENABLE_SHIFT 2
> +#define DWC_PCIE__CNT_ENABLE_MASK GENMASK(4, 2)
> +#define DWC_PCIE_PER_EVENT_OFF (0x1 << DWC_PCIE__CNT_ENABLE_SHIFT)
> +#define DWC_PCIE_PER_EVENT_ON (0x3 << DWC_PCIE__CNT_ENABLE_SHIFT)
> +#define DWC_PCIE_EVENT_CLEAR_MASK GENMASK(1, 0)
> +
> +#define DWC_PCIE_EVENT_CNT_DATA 0xC
> +
> +#define DWC_PCIE_TIME_BASED_ANALYSIS_CTRL 0x10
> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT 24
> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK GENMASK(31, 24)
> +#define DWC_PCIE__TIME_BASED_DURATION_SHIFT 8
> +#define DWC_PCIE__TIME_BASED_DURATION_SELECT GENMASK(15, 8)
> +#define DWC_PCIE_DURATION_MANUAL_CTRL 0x0
> +#define DWC_PCIE_DURATION_1MS 0x1
> +#define DWC_PCIE_DURATION_10MS 0x2
> +#define DWC_PCIE_DURATION_100MS 0x3
> +#define DWC_PCIE_DURATION_1S 0x4
> +#define DWC_PCIE_DURATION_2S 0x5
> +#define DWC_PCIE_DURATION_4S 0x6
> +#define DWC_PCIE_DURATION_4US 0xff
> +#define DWC_PCIE__TIME_BASED_COUNTER_ENABLE 1
> +
> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW 0x14
> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH 0x18
> +
> +/* Event attributes */
> +#define DWC_PCIE_CONFIG_EVENTID GENMASK(15, 0)
> +#define DWC_PCIE_CONFIG_TYPE GENMASK(19, 16)
> +#define DWC_PCIE_CONFIG_LANE GENMASK(27, 20)
> +
> +#define DWC_PCIE_EVENT_ID(event) FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
> +#define DWC_PCIE_EVENT_TYPE(event) FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
> +#define DWC_PCIE_EVENT_LANE(event) FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
> +
> +#define DWC_PCIE_PMU_HAS_REGISTER 1
> +
> +enum dwc_pcie_event_type {
> + DWC_PCIE_TYPE_INVALID,
> + DWC_PCIE_TIME_BASE_EVENT,
> + DWC_PCIE_LANE_EVENT,
> +};
> +
> +struct dwc_event_counters {
> + const char name[32];
> + u32 event_id;
> +};
> +
> +struct dwc_pcie_pmu {
> + struct hlist_node node;
> + unsigned int on_cpu;
> + struct pmu pmu;
> + struct device *dev;
> +};
> +
> +struct dwc_pcie_info_table {
> + u32 bdf;
> + u32 cap_pos;
> + u32 num_lanes;
> + struct pci_dev *pdev;
> + struct dwc_pcie_pmu pcie_pmu;
> + u8 pmu_is_register;
> + struct perf_event *event;
> +
> + struct dwc_pcie_event_attr *lane_event_attrs;
> + struct attribute **pcie_pmu_event_attrs;
> + struct attribute_group pcie_pmu_event_attrs_group;
> + const struct attribute_group *pcie_pmu_attr_groups[4];
> +};
> +
> +struct dwc_pcie_pmu_priv {
> + struct device *dev;
> + u32 pcie_ctrl_num;
> + struct dwc_pcie_info_table *pcie_table;
> +};
> +
> +#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func) \
> + (((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
Just pass pdev->devfn and use PCI_DEVID() to simplify here.
> +#define to_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
> +
> +static struct platform_device *dwc_pcie_pmu_dev;
> +static char *event_attr_name = "events";
> +
> +static ssize_t dwc_pcie_pmu_cpumask_show(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
> +
> + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
> +}
> +
> +static struct device_attribute dwc_pcie_pmu_cpumask_attr =
> +__ATTR(cpumask, 0444, dwc_pcie_pmu_cpumask_show, NULL);
> +
> +static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = {
> + &dwc_pcie_pmu_cpumask_attr.attr,
> + NULL
> +};
> +
> +static struct attribute_group pcie_pmu_cpumask_attrs_group = {
> + .attrs = dwc_pcie_pmu_cpumask_attrs,
> +};
> +
> +struct dwc_pcie_format_attr {
> + struct device_attribute attr;
> + u64 field;
> + int config;
> +};
> +
> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
> + int lo = __ffs(fmt->field), hi = __fls(fmt->field);
> +
> + if (lo == hi)
> + return snprintf(buf, PAGE_SIZE, "config:%d\n", lo);
> +
> + if (!fmt->config)
> + return snprintf(buf, PAGE_SIZE, "config:%d-%d\n", lo, hi);
> +
> + return snprintf(buf, PAGE_SIZE, "config%d:%d-%d\n", fmt->config, lo,
> + hi);
> +}
> +
> +#define _dwc_pcie_format_attr(_name, _cfg, _fld) \
> + (&((struct dwc_pcie_format_attr[]) {{ \
> + .attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL), \
> + .config = _cfg, \
> + .field = _fld, \
> + }})[0].attr.attr)
> +
> +#define dwc_pcie_format_attr(_name, _fld) _dwc_pcie_format_attr(_name, 0, _fld)
> +
> +static struct attribute *dwc_pcie_format_attrs[] = {
> + dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
> + dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
> + dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
> + NULL,
> +};
> +
> +static struct attribute_group pcie_pmu_format_attrs_group = {
> + .name = "format",
> + .attrs = dwc_pcie_format_attrs,
> +};
> +
> +struct dwc_pcie_event_attr {
> + struct device_attribute attr;
> + enum dwc_pcie_event_type type;
> + u16 eventid;
> + u8 lane;
> +};
> +
> +ssize_t dwc_pcie_event_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + struct dwc_pcie_event_attr *eattr;
> +
> + eattr = container_of(attr, typeof(*eattr), attr);
> +
> + if (eattr->type == DWC_PCIE_LANE_EVENT)
> + return sprintf(page, "eventid=0x%lx, type=0x%lx, lane=0x%lx\n",
> + (unsigned long)eattr->eventid,
> + (unsigned long)eattr->type,
> + (unsigned long)eattr->lane);
> + else
> + return sprintf(page, "eventid=0x%lx, type=0x%lx",
> + (unsigned long)eattr->eventid,
> + (unsigned long)eattr->type);
> +}
As I remember, sysfs_emit() is preferred over sprintf() for sysfs show functions.
> +
> +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane) \
> + (&((struct dwc_pcie_event_attr[]) {{ \
> + .attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL), \
> + .type = _type, \
> + .eventid = _eventid, \
> + .lane = _lane, \
> + }})[0].attr.attr)
> +
> +#define DWC_PCIE_PMU_BASE_TIME_ATTR(_name, _eventid) \
> + DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0)
> +
> +static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
> + /* Group #0 */
> + DWC_PCIE_PMU_BASE_TIME_ATTR(one_cycle, 0x00),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S, 0x01),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S, 0x02),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(L0, 0x03),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1, 0x04),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_1, 0x05),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_2, 0x06),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY, 0x07),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S, 0x08),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_AUX, 0x09),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(ONE_cycle, 0x10),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S_, 0x11),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S_, 0x12),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(L0_, 0x13),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_, 0x17),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY_, 0x17),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S_, 0x18),
> + /* Group #1 */
> + DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
> + DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
> + NULL
> +};
> +
> +static inline umode_t pcie_pmu_event_attr_is_visible(struct kobject *kobj,
> + struct attribute *attr,
> + int unuse)
> +{
> + return attr->mode;
> +}
> +
> +static inline bool pci_dev_is_rootport(struct pci_dev *pdev)
> +{
> + return (pci_is_pcie(pdev) &&
> + pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT);
> +}
> +
> +static inline unsigned int dwc_pcie_get_bdf(struct pci_dev *dev)
> +{
> + return (DWC_PCIE_CREATE_BDF(pci_domain_nr(dev->bus), dev->bus->number,
> + PCI_SLOT(dev->devfn),
> + PCI_FUNC(dev->devfn)));
> +}
> +
> +static int dwc_pcie_find_ras_des_cap_position(struct pci_dev *pdev, int *pos)
> +{
> + u32 header;
> + int vsec = 0;
> +
> + while ((vsec = pci_find_next_ext_capability(pdev, vsec,
> + PCI_EXT_CAP_ID_VNDR))) {
> + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &header);
> + /* Is the device part of a DesignWare Cores PCIe Controller ? */
> + if (PCI_VNDR_HEADER_ID(header) == DWC_PCIE_VSEC_ID &&
> + PCI_VNDR_HEADER_REV(header) == DWC_PCIE_VSEC_REV) {
> + *pos = vsec;
> + return 0;
> + }
> + }
> +
> + return -ENODEV;
> +}
> +
> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
> +{
> + int val, where, index = 0;
> + struct pci_dev *pdev = NULL;
> + struct dwc_pcie_info_table *pcie_info;
> +
> + priv->pcie_table =
> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
> + if (!priv->pcie_table)
> + return -EINVAL;
> +
> + pcie_info = priv->pcie_table;
> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
I may have missed it, but I don't see a pci_dev_put() to balance the reference count.
> + index < RP_NUM_MAX) {
> + if (!pci_dev_is_rootport(pdev))
> + continue;
> +
> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
> + pcie_info[index].pdev = pdev;
> +
> + if (dwc_pcie_find_ras_des_cap_position(pdev, &where))
> + continue;
> +
> + pcie_info[index].cap_pos = where;
> +
> + pci_read_config_dword(pdev,
> + pdev->pcie_cap + DWC_PCIE_LINK_CAPABILITIES_REG,
> + &val);
> + pcie_info[index].num_lanes =
> + (val & DWC_PCIE_LANE_MASK) >> DWC_PCIE_LANE_SHIFT;
> + index++;
> + }
> +
> + if (!index)
> + return -ENODEV;
> +
> + priv->pcie_ctrl_num = index;
> +
> + return 0;
> +}
> +
> +static inline int dwc_pcie_pmu_read_dword(struct dwc_pcie_info_table *pcie_info,
> + u32 reg, u32 *val)
> +{
> + return pci_read_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
> + val);
> +}
> +
> +static inline int dwc_pcie_pmu_write_dword(struct dwc_pcie_info_table
> + *pcie_info, u32 reg, u32 val)
> +{
> + return pci_write_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
> + val);
> +}
> +
> +static int dwc_pcie_pmu_set_event_id(struct dwc_pcie_info_table *pcie_info,
> + int event_id)
> +{
> + int ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + val &= ~DWC_PCIE__CNT_ENABLE_MASK;
> + val &= ~DWC_PCIE__CNT_EVENT_SELECT_MASK;
> + val |= event_id << DWC_PCIE__CNT_EVENT_SELECT_SHIFT;
> +
> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
> + if (ret)
> + pci_err(pcie_info->pdev, "PCIe write fail\n");
> +
> + return ret;
> +}
> +
> +static int dwc_pcie_pmu_write_event_lane(struct dwc_pcie_info_table *pcie_info,
> + int lane, int event_id)
> +{
> + u32 ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + val &= ~DWC_PCIE__CNT_LANE_SELECT_MASK;
> + val |= lane << DWC_PCIE__CNT_LANE_SELECT_SHIFT;
> +
> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
> + if (ret)
> + pci_err(pcie_info->pdev, "PCIe write fail\n");
> +
> + return ret;
> +}
> +
> +static int dwc_pcie_pmu_event_enable(struct dwc_pcie_info_table *pcie_info,
> + u32 enable)
> +{
> + u32 ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
Somebody may have mentioned this already. Maybe you don't need to print these messages in the PMU ops; just
return the correct error code and let perf handle it. Or you should provide more
information in them, such as which function failed or which value was being read/written.
If they're only necessary for debugging, make them pci_dbg().
> + return ret;
> + }
> +
> + val &= ~(DWC_PCIE__CNT_ENABLE_MASK);
> +
> + if (enable)
> + val |= DWC_PCIE_PER_EVENT_ON;
> + else
> + val |= DWC_PCIE_PER_EVENT_OFF;
> +
> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
> + if (ret)
> + pci_err(pcie_info->pdev, "PCIe write fail\n");
> +
> + return ret;
> +}
> +
> +static int dwc_pcie_pmu_base_time_enable(struct dwc_pcie_info_table *pcie_info,
> + u32 enable)
> +{
> + u32 ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info,
> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + if (enable)
> + val |= DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
> + else
> + val &= ~DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
> +
> + ret = dwc_pcie_pmu_write_dword(pcie_info,
> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
> + if (ret)
> + pci_err(pcie_info->pdev, "PCIe write fail\n");
> +
> + return ret;
> +}
> +
> +static int dwc_pcie_pmu_read_event_counter(struct dwc_pcie_info_table
> + *pcie_info, u64 *counter)
> +{
> + u32 ret, val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_DATA, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> + *counter = val;
> +
> + return ret;
> +}
> +
> +static int dwc_pcie_pmu_read_base_time_counter(struct dwc_pcie_info_table
> + *pcie_info, u64 *counter)
> +{
> + u32 ret, val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info,
> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH,
> + &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + *counter = val;
> + *counter <<= 32;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info,
> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW,
> + &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + *counter += val;
> +
> + return ret;
> +}
> +
> +static int dwc_pcie_pmu_clear_event_counter(struct dwc_pcie_info_table
> + *pcie_info)
> +{
> + u32 ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + val &= ~DWC_PCIE_EVENT_CLEAR_MASK;
> + val |= 1;
It's better to use a macro for '1' to make it clearer.
> +
> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
> + if (ret)
> + pci_err(pcie_info->pdev, "PCIe write fail\n");
> +
> + return ret;
> +}
> +
> +static int dwc_pcie_pmu_base_time_add_prepare(struct dwc_pcie_info_table
> + *pcie_info, u32 event_id)
> +{
> + u32 ret;
> + u32 val;
> +
> + ret = dwc_pcie_pmu_read_dword(pcie_info,
> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> + return ret;
> + }
> +
> + val &= ~DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK;
> + val |= event_id << DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT;
> + val &= ~DWC_PCIE__TIME_BASED_DURATION_SELECT;
> +
> + /*
> + * TIME_BASED_ANALYSIS_DATA_REG is a 64 bit register, we can safely
> + * use it with any manually controllered duration.
> + */
> + val &= ~(DWC_PCIE__TIME_BASED_DURATION_SELECT);
> + val |= DWC_PCIE_DURATION_MANUAL_CTRL;
> +
> + ret = dwc_pcie_pmu_write_dword(pcie_info,
> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
> + if (ret)
> + pci_err(pcie_info->pdev, "PCIe write fail\n");
> +
> + return ret;
> +}
> +
> +static struct dwc_pcie_info_table *pmu_to_pcie_info(struct pmu *pmu)
> +{
> + struct dwc_pcie_info_table *pcie_info;
> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
> +
> + pcie_info = container_of(pcie_pmu, struct dwc_pcie_info_table, pcie_pmu);
> + if (pcie_info == NULL)
> + pci_err(pcie_info->pdev, "Can't get pcie info\n");
> +
> + return pcie_info;
> +}
> +
> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
> +{
> + u64 counter;
> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> + u64 delta, prev, now;
> +
> + do {
> + prev = local64_read(&hwc->prev_count);
> +
> + if (type == DWC_PCIE_LANE_EVENT)
> + dwc_pcie_pmu_read_event_counter(pcie_info, &counter);
> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
> + dwc_pcie_pmu_read_base_time_counter(pcie_info,
> + &counter);
> + else
> + pci_err(pcie_info->pdev, "Input param is invalid\n");
> +
For the messages in the PMU ops, you should print them on behalf of the PMU device
rather than the PCIe device. Same for the other places.
> + now = counter;
> + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> +
> + delta = now - prev;
> +
> + local64_add(delta, &event->count);
> +}
> +
> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
> + struct perf_event *sibling;
> +
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + if (hwc->sample_period) {
> + dev_dbg(pcie_pmu->dev, "Sampling not supported\n");
> + return -EOPNOTSUPP;
> + }
> +
> + if (event->cpu < 0) {
> + dev_dbg(pcie_pmu->dev, "Per-task mode not supported\n");
> + return -EOPNOTSUPP;
> + }
> +
> + event->cpu = pcie_pmu->on_cpu;
> +
> + if (event->group_leader != event &&
> + !is_software_event(event->group_leader)) {
> + dev_dbg(pcie_pmu->dev, "Drive way only allow one event!\n");
> + return -EINVAL;
> + }
> +
> + for_each_sibling_event(sibling, event->group_leader) {
> + if (sibling != event && !is_software_event(sibling)) {
> + dev_dbg(pcie_pmu->dev, "Drive way event not allowed!\n");
> + return -EINVAL;
> + }
> + }
> +
> + hwc->idx = -1;
> +
> + return 0;
> +}
> +
> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
> +{
> + u64 new = 0;
> +
The 'new' variable is redundant.
> + local64_set(&hwc->prev_count, new);
> +}
> +
> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> + hwc->state = 0;
> + dwc_pcie_pmu_set_period(hwc);
> +
> + if (type == DWC_PCIE_LANE_EVENT)
> + dwc_pcie_pmu_event_enable(pcie_info, 1);
> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
> + dwc_pcie_pmu_base_time_enable(pcie_info, 1);
> + else
> + pci_err(pcie_info->pdev, "Input param is invalid\n");
> +}
> +
> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
> +{
> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> + if (event->hw.state & PERF_HES_STOPPED)
> + return;
> +
> + if (type == DWC_PCIE_LANE_EVENT)
> + dwc_pcie_pmu_event_enable(pcie_info, 0);
> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
> + dwc_pcie_pmu_base_time_enable(pcie_info, 0);
> + else
> + pci_err(pcie_info->pdev, "Input param is invalid\n");
If the message is necessary, it will be more helpful to mention which param
is invalid.
> +
> + dwc_pcie_pmu_event_update(event);
> +}
> +
> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
> +{
> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> + int event_id = DWC_PCIE_EVENT_ID(event);
> + int lane = DWC_PCIE_EVENT_LANE(event);
> +
> + if (pcie_info->event)
> + return -ENOSPC;
> +
> + pcie_info->event = event;
> +
> + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +
> + if (type == DWC_PCIE_LANE_EVENT) {
> + dwc_pcie_pmu_event_enable(pcie_info, 0);
> + dwc_pcie_pmu_write_event_lane(pcie_info, lane, event_id);
> + dwc_pcie_pmu_set_event_id(pcie_info, event_id);
> + dwc_pcie_pmu_clear_event_counter(pcie_info);
> + } else if (type == DWC_PCIE_TIME_BASE_EVENT) {
> + dwc_pcie_pmu_base_time_enable(pcie_info, 0);
> + dwc_pcie_pmu_base_time_add_prepare(pcie_info, event_id);
> + } else {
> + pci_err(pcie_info->pdev, "Input param is invalid\n");
> + return -EINVAL;
> + }
> +
> + if (flags & PERF_EF_START)
> + dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD);
> +
> + perf_event_update_userpage(event);
> +
> + return 0;
> +}
> +
> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
> +{
> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
> +
> + dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
> + perf_event_update_userpage(event);
> + pcie_info->event = NULL;
> +}
> +
> +static void dwc_pcie_pmu_event_read(struct perf_event *event)
> +{
> + dwc_pcie_pmu_event_update(event);
> +}
> +
> +static struct dwc_event_counters event_array[] = {
> + {"tx_ack_dllp", 0x600},
> + {"tx_update_fc_dllp", 0x601},
> + {"rx_ack_dllp", 0x602},
> + {"rx_update_fc_dllp", 0x603},
> + {"rx_nulified_tlp", 0x604},
> + {"tx_nulified_tlp", 0x605},
> + {"rx_duplicate_tlp", 0x606},
> + {"tx_memory_write", 0x700},
> + {"tx_memory_read", 0x701},
> + {"tx_configuration_write", 0x702},
> + {"tx_configuration_read", 0x703},
> + {"tx_io_write", 0x704},
> + {"tx_io_read", 0x705},
> + {"tx_completion_without_data", 0x706},
> + {"tx_completion_with_data", 0x707},
> + {"tx_message_tlp", 0x708},
> + {"tx_atomic", 0x709},
> + {"tx_tlp_with_prefix", 0x70A},
> + {"rx_memory_write", 0x70B},
> + {"rx_memory_read", 0x70C},
> + {"rx_io_write", 0x70F},
> + {"rx_io_read", 0x710},
> + {"rx_completion_without_data", 0x711},
> + {"rx_completion_with_data", 0x712},
> + {"rx_message_tlp", 0x713},
> + {"rx_atomic", 0x714},
> + {"rx_tlp_with_prefix", 0x715},
> + {"tx_ccix_tlp", 0x716},
> + {"rx_ccix_tlp", 0x717},
> +};
> +
> +static int dwc_pcie_pmu_attr_init(struct dwc_pcie_pmu_priv *priv,
> + struct dwc_pcie_info_table *pcie_info)
> +{
> + int i, j;
> + char lane[8];
> + const char tmp[64];
> + int events_per_lane;
> + int num_lane_events;
> + int time_base_count;
> + int num_attrs, attr_idx;
> + struct dwc_pcie_event_attr *lane_attrs;
> + struct attribute **pmu_attrs;
> +
> + memset((void *)tmp, 0, sizeof(tmp));
> + memset((void *)lane, 0, sizeof(lane));
> + time_base_count = ARRAY_SIZE(dwc_pcie_pmu_time_event_attrs);
> + events_per_lane = ARRAY_SIZE(event_array);
> + num_lane_events = pcie_info->num_lanes * events_per_lane;
> + num_attrs = time_base_count + num_lane_events;
> +
> + pcie_info->lane_event_attrs =
> + devm_kcalloc(priv->dev, num_lane_events,
> + sizeof(struct dwc_pcie_event_attr),
> + GFP_KERNEL);
> + if (!pcie_info->lane_event_attrs)
> + return -ENOMEM;
> + lane_attrs = pcie_info->lane_event_attrs;
> + pcie_info->pcie_pmu_event_attrs =
> + devm_kcalloc(priv->dev, num_attrs, sizeof(struct attribute *),
> + GFP_KERNEL);
> + if (!pcie_info->pcie_pmu_event_attrs)
> + return -ENOMEM;
> + pmu_attrs = pcie_info->pcie_pmu_event_attrs;
> +
> + for (i = 0; i < num_lane_events; i++) {
> + lane_attrs[i].attr.attr.name =
> + devm_kzalloc(priv->dev, sizeof(char)
> + * ATTRI_NAME_MAX_SIZE, GFP_KERNEL);
> + if (!lane_attrs[i].attr.attr.name)
> + return -ENOMEM;
> + }
> +
> + attr_idx = 0;
> + for (i = 0; i < pcie_info->num_lanes; i++) {
> + sprintf(lane, "_lane%d", i);
> +
> + for (j = 0; j < events_per_lane; j++) {
> + int pos = i * events_per_lane + j;
> +
> + strcat((char *)tmp, event_array[j].name);
> + strcat((char *)tmp, lane);
> + memcpy((void *)lane_attrs[pos].attr.attr.name,
> + (void *)tmp,
> + sizeof(tmp));
> +
> + lane_attrs[pos].attr.attr.mode =
> + VERIFY_OCTAL_PERMISSIONS(0444);
> + lane_attrs[pos].attr.show = dwc_pcie_event_show;
> + lane_attrs[pos].attr.store = NULL;
> + lane_attrs[pos].type = DWC_PCIE_LANE_EVENT;
> + lane_attrs[pos].eventid = event_array[j].event_id;
> + lane_attrs[pos].lane = i;
> + pmu_attrs[attr_idx++] = &lane_attrs[pos].attr.attr;
> +
> + memset((void *)tmp, 0, sizeof(tmp));
> + }
> + }
> +
> + for (i = 0; i < ARRAY_SIZE(dwc_pcie_pmu_time_event_attrs); i++)
> + pmu_attrs[attr_idx++] = dwc_pcie_pmu_time_event_attrs[i];
> +
> + pcie_info->pcie_pmu_event_attrs[attr_idx++] = NULL;
> +
> + pcie_info->pcie_pmu_event_attrs_group.name = event_attr_name;
> + pcie_info->pcie_pmu_event_attrs_group.is_visible =
> + pcie_pmu_event_attr_is_visible;
> + pcie_info->pcie_pmu_event_attrs_group.attrs =
> + pcie_info->pcie_pmu_event_attrs;
> +
> + pcie_info->pcie_pmu_attr_groups[0] =
> + &pcie_info->pcie_pmu_event_attrs_group;
> + pcie_info->pcie_pmu_attr_groups[1] = &pcie_pmu_format_attrs_group;
> + pcie_info->pcie_pmu_attr_groups[2] = &pcie_pmu_cpumask_attrs_group;
> + pcie_info->pcie_pmu_attr_groups[3] = NULL;
> +
> + return 0;
> +}
> +
> +static int __dwc_pcie_pmu_probe(struct dwc_pcie_pmu_priv *priv,
> + struct dwc_pcie_info_table *pcie_info)
> +{
> + int ret;
> + char *name;
> + struct dwc_pcie_pmu *pcie_pmu;
> + struct device *dev;
> +
> + if (!pcie_info || !pcie_info->pdev) {
> + pci_err(pcie_info->pdev, "Input parameter is invalid\n");
> + return -EINVAL;
> + }
> +
> + pcie_pmu = &pcie_info->pcie_pmu;
> + dev = &pcie_info->pdev->dev;
> +
> + ret = dwc_pcie_pmu_attr_init(priv, pcie_info);
> + if (ret) {
> + pci_err(pcie_info->pdev, "PMU attr init fail ret=%d\n", ret);
> + return ret;
> + }
> +
> + pcie_pmu->dev = dev;
> + pcie_pmu->pmu = (struct pmu) {
> + .module = THIS_MODULE,
> + .task_ctx_nr = perf_invalid_context,
> + .pmu_enable = NULL,
> + .pmu_disable = NULL,
> + .event_init = dwc_pcie_pmu_event_init,
> + .add = dwc_pcie_pmu_event_add,
> + .del = dwc_pcie_pmu_event_del,
> + .start = dwc_pcie_pmu_event_start,
> + .stop = dwc_pcie_pmu_event_stop,
> + .read = dwc_pcie_pmu_event_read,
> + .attr_groups = pcie_info->pcie_pmu_attr_groups,
> + .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
> + };
> +
> + name = devm_kasprintf(priv->dev, GFP_KERNEL, "pcie_bdf_%x",
> + pcie_info->bdf);
> + if (!name)
> + return -ENOMEM;
> +
> + /* Pick one CPU to be the preferred one to use */
> + pcie_pmu->on_cpu = raw_smp_processor_id();
> +
So we'll probably bind all the PMUs to one single CPU — is that intended? Since it's
an uncore PMU, we can make it run on any CPU (or, for locality, on a CPU on the
controller's NUMA node).
Also, I didn't see you register a hotplug handler, so what happens if ->on_cpu is hot-removed?
> + ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
> + if (ret) {
> + pci_err(pcie_info->pdev, "Error %d registering PMU @%x\n", ret,
> + pcie_info->bdf);
It would be more helpful to print the BDF in the format <bus>:<dev>.<func>.
> + return ret;
> + }
> +
> + pcie_info->pmu_is_register = DWC_PCIE_PMU_HAS_REGISTER;
Making @pmu_is_register a boolean would be clearer.
> +
> + return ret;
> +}
> +
> +static int dwc_pcie_pmu_remove(struct platform_device *pdev)
> +{
> + struct dwc_pcie_pmu_priv *priv = platform_get_drvdata(pdev);
> + int index;
> + struct dwc_pcie_pmu *pcie_pmu;
Put the longer declaration line first (reverse Christmas tree order).
> +
> + for (index = 0; index < priv->pcie_ctrl_num; index++)
> + if (priv->pcie_table[index].pmu_is_register) {
> + pcie_pmu = &priv->pcie_table[index].pcie_pmu;
> + perf_pmu_unregister(&pcie_pmu->pmu);
> + }
> + return 0;
> +}
> +
> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
> +{
> + int ret = 0;
> + int pcie_index;
> + struct dwc_pcie_pmu_priv *priv;
> +
> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
> + if (!priv)
> + return -ENOMEM;
> + priv->dev = &pdev->dev;
> + platform_set_drvdata(pdev, priv);
> +
> + /* If PMU is not support on current platform, keep slient */
> + if (dwc_pcie_pmu_discover(priv))
> + return 0;
> +
> + for (pcie_index = 0; pcie_index < priv->pcie_ctrl_num; pcie_index++) {
> + struct pci_dev *rp = priv->pcie_table[pcie_index].pdev;
> +
> + ret = __dwc_pcie_pmu_probe(priv, &priv->pcie_table[pcie_index]);
> + if (ret) {
> + dev_err(&rp->dev, "PCIe PMU probe fail\n");
> + goto pmu_unregister;
> + }
> + }
> + dev_info(&pdev->dev, "PCIe PMUs registered\n");
> +
As Jonathan mentioned, this message may be unnecessary, but I would find it useful if you
printed how many PMUs were registered.
When one PMU registration fails, you just remove all the PMUs already registered. I wonder if
it's better to let the already-registered PMUs stay instead of removing them all.
Glad to see another PCIe PMU device!
Thanks,
Yicong
> + return 0;
> +
> +pmu_unregister:
> + dwc_pcie_pmu_remove(pdev);
> +
> + return ret;
> +}
> +
> +static struct platform_driver dwc_pcie_pmu_driver = {
> + .probe = dwc_pcie_pmu_probe,
> + .remove = dwc_pcie_pmu_remove,
> + .driver = {.name = DRV_NAME,},
> +};
> +
> +static int __init dwc_pcie_pmu_init(void)
> +{
> + int ret;
> +
> + ret = platform_driver_register(&dwc_pcie_pmu_driver);
> +
> + if (ret)
> + return ret;
> +
> + dwc_pcie_pmu_dev =
> + platform_device_register_simple(DEV_NAME, -1, NULL, 0);
> + if (IS_ERR(dwc_pcie_pmu_dev)) {
> + platform_driver_unregister(&dwc_pcie_pmu_driver);
> + return PTR_ERR(dwc_pcie_pmu_dev);
> + }
> +
> + return 0;
> +}
> +
> +static void __exit dwc_pcie_pmu_exit(void)
> +{
> + platform_device_unregister(dwc_pcie_pmu_dev);
> + platform_driver_unregister(&dwc_pcie_pmu_driver);
> +}
> +
> +module_init(dwc_pcie_pmu_init);
> +module_exit(dwc_pcie_pmu_exit);
> +
> +MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller");
> +MODULE_AUTHOR("[email protected]");
> +MODULE_AUTHOR("[email protected]");
> +MODULE_LICENSE("GPL v2");
>
On 2022/9/23 1:32, Bjorn Helgaas wrote:
> On Thu, Sep 22, 2022 at 04:58:20PM +0100, Jonathan Cameron wrote:
>> On Sat, 17 Sep 2022 20:10:35 +0800
>> Shuai Xue <[email protected]> wrote:
>>
>>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>>> Core controller IP which provides statistics feature. The PMU is not a PCIe
>>> Root Complex integrated End Point(RCiEP) device but only register counters
>>> provided by each PCIe Root Port.
>>>
>>> To facilitate collection of statistics the controller provides the
>>> following two features for each Root Port:
>>>
>>> - Time Based Analysis (RX/TX data throughput and time spent in each
>>> low-power LTSSM state)
>>> - Event counters (Error and Non-Error for lanes)
>>>
>>> Note, only one counter for each type.
>>>
>>> This driver add PMU devices for each PCIe Root Port. And the PMU device is
>>> named based the BDF of Root Port. For example,
>>>
>>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>>>
>>> the PMU device name for this Root Port is pcie_bdf_100000.
>>>
>>> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
>>>
>>> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>>>
>>> average RX bandwidth can be calculated like this:
>>>
>>> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
>>>
>>> Signed-off-by: Shuai Xue <[email protected]>
>>
>> +CC linux-pci list and Bjorn.
>
> Thanks, this is definitely of interest to linux-pci.
>
>> Question in here which I've been meaning to address for other reasons
>> around how to register 'extra features' on pci ports.
>>
>> This particular PMU is in config space in a Vendor Specific Extended
>> Capability.
>>
>> I've focused on that aspect for this review rather than the perf parts.
>> We'll need to figure that story out first as doing this from a bus walk
>> makes triggered of a platform driver is not the way I'd expect to see
>> this work.
>
>>> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
>>> +{
>>> + int val, where, index = 0;
>>> + struct pci_dev *pdev = NULL;
>>> + struct dwc_pcie_info_table *pcie_info;
>>> +
>>> + priv->pcie_table =
>>> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
>>> + if (!priv->pcie_table)
>>> + return -EINVAL;
>>> +
>>> + pcie_info = priv->pcie_table;
>>> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
>>> + index < RP_NUM_MAX) {
>>
>> This having a driver than then walks the pci topology to find root ports and add
>> extra stuff to them is not a clean solution.
>>
>> The probing should be driven from the existing PCI driver topology.
>> There are a bunch of new features we need to add to ports in the near future
>> anyway - this would just be another one.
>> Same problem exists for CXL CPMU perf devices - so far we only support those
>> on end points, partly because we need a clean way to probe them on pci ports.
>>
>> Whatever we come up with there will apply here as well.
>
> I agree, I don't like to see more uses of pci_get_device() because it
> doesn't fit the driver model at all. For one thing, it really screws
> up the hotplug model because this doesn't account for hot-added
> devices and there's no clear cleanup path for removal.
>
> Hotplug is likely not an issue in this particular case, but it gets
> copied to places where it is an issue.
>
> Maybe we need some kind of PCI core interface whereby drivers can
> register their interest in VSEC and/or DVSEC capabilities.
>
Considering this PMU is tied to each Root Port and has no real backing device, I'm
wondering whether we can extend the PCIe port bus and make use of it (though it's
currently used by the standard services).
Thanks.
On Fri, 23 Sep 2022 11:35:45 +0800
Yicong Yang <[email protected]> wrote:
> On 2022/9/23 1:32, Bjorn Helgaas wrote:
> > On Thu, Sep 22, 2022 at 04:58:20PM +0100, Jonathan Cameron wrote:
> >> On Sat, 17 Sep 2022 20:10:35 +0800
> >> Shuai Xue <[email protected]> wrote:
> >>
> >>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> >>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> >>> Core controller IP which provides statistics feature. The PMU is not a PCIe
> >>> Root Complex integrated End Point(RCiEP) device but only register counters
> >>> provided by each PCIe Root Port.
> >>>
> >>> To facilitate collection of statistics the controller provides the
> >>> following two features for each Root Port:
> >>>
> >>> - Time Based Analysis (RX/TX data throughput and time spent in each
> >>> low-power LTSSM state)
> >>> - Event counters (Error and Non-Error for lanes)
> >>>
> >>> Note, only one counter for each type.
> >>>
> >>> This driver add PMU devices for each PCIe Root Port. And the PMU device is
> >>> named based the BDF of Root Port. For example,
> >>>
> >>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
> >>>
> >>> the PMU device name for this Root Port is pcie_bdf_100000.
> >>>
> >>> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
> >>>
> >>> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
> >>>
> >>> average RX bandwidth can be calculated like this:
> >>>
> >>> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
> >>>
> >>> Signed-off-by: Shuai Xue <[email protected]>
> >>
> >> +CC linux-pci list and Bjorn.
> >
> > Thanks, this is definitely of interest to linux-pci.
> >
> >> Question in here which I've been meaning to address for other reasons
> >> around how to register 'extra features' on pci ports.
> >>
> >> This particular PMU is in config space in a Vendor Specific Extended
> >> Capability.
> >>
> >> I've focused on that aspect for this review rather than the perf parts.
> >> We'll need to figure that story out first as doing this from a bus walk
> >> makes triggered of a platform driver is not the way I'd expect to see
> >> this work.
> >
> >>> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
> >>> +{
> >>> + int val, where, index = 0;
> >>> + struct pci_dev *pdev = NULL;
> >>> + struct dwc_pcie_info_table *pcie_info;
> >>> +
> >>> + priv->pcie_table =
> >>> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
> >>> + if (!priv->pcie_table)
> >>> + return -EINVAL;
> >>> +
> >>> + pcie_info = priv->pcie_table;
> >>> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
> >>> + index < RP_NUM_MAX) {
> >>
> >> This having a driver than then walks the pci topology to find root ports and add
> >> extra stuff to them is not a clean solution.
> >>
> >> The probing should be driven from the existing PCI driver topology.
> >> There are a bunch of new features we need to add to ports in the near future
> >> anyway - this would just be another one.
> >> Same problem exists for CXL CPMU perf devices - so far we only support those
> >> on end points, partly because we need a clean way to probe them on pci ports.
> >>
> >> Whatever we come up with there will apply here as well.
> >
> > I agree, I don't like to see more uses of pci_get_device() because it
> > doesn't fit the driver model at all. For one thing, it really screws
> > up the hotplug model because this doesn't account for hot-added
> > devices and there's no clear cleanup path for removal.
> >
> > Hotplug is likely not an issue in this particular case, but it gets
> > copied to places where it is an issue.
> >
> > Maybe we need some kind of PCI core interface whereby drivers can
> > register their interest in VSEC and/or DVSEC capabilities.
Something along those lines works if the facility is constrained to just
VSEC / DVSEC.
* This one is.
* CMA / SPDM / IDE all are - but with complexity of interrupts.
After the Plumbers SPDM BoF, the resulting plan would not fit the
same model as this driver (it needs to be done earlier in the PCI registration
flow, I think). I need to write up and share some notes on what we
are planning around that to get wider feedback — but that might be a few
weeks!
Others are less well confined.
* CXL PMU uses registers in bar space - but is hanging off a DVSEC
description to tell you where to find it.
> >
>
> Considering this PMU is related to each Root Port without real backup device. I'm
> wondering whether we can extend the pcie port bus and make use of it (though it's
> currently used by the standard services).
I did that a few years back for our older PCI PMUs. It wasn't pretty.
https://lore.kernel.org/all/[email protected]/
We never took that driver forwards - it was mostly useful to understand what
might work for newer hardware - we went the RCiEP route at least partly to avoid
software complexity (and because of hardware topology - counters shared by multiple
RP)
We could do something more generic along the same lines as the portdrv framework
- that highlights some of the complexities however.
There are some nasty potential ordering issues in registering interest caused
by any attempt to make this work with modules.
I'd want to see a solution that works just as well for all the components that
might have DVSEC / VSEC entries - not just those covered by portdrv.
+CC Dan Williams and linux-cxl as they may also be interested in this discussion.
>
> Thanks.
>
在 2022/9/22 PM11:58, Jonathan Cameron 写道:
> On Sat, 17 Sep 2022 20:10:35 +0800
> Shuai Xue <[email protected]> wrote:
>
>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>> Core controller IP which provides statistics feature. The PMU is not a PCIe
>> Root Complex integrated End Point(RCiEP) device but only register counters
>> provided by each PCIe Root Port.
>>
>> To facilitate collection of statistics the controller provides the
>> following two features for each Root Port:
>>
>> - Time Based Analysis (RX/TX data throughput and time spent in each
>> low-power LTSSM state)
>> - Event counters (Error and Non-Error for lanes)
>>
>> Note, only one counter for each type.
>>
>> This driver add PMU devices for each PCIe Root Port. And the PMU device is
>> named based the BDF of Root Port. For example,
>>
>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>>
>> the PMU device name for this Root Port is pcie_bdf_100000.
>>
>> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
>>
>> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>>
>> average RX bandwidth can be calculated like this:
>>
>> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
>>
>> Signed-off-by: Shuai Xue <[email protected]>
>
> +CC linux-pci list and Bjorn.
>
> Question in here which I've been meaning to address for other reasons
> around how to register 'extra features' on pci ports.
>
> This particular PMU is in config space in a Vendor Specific Extended
> Capability.
>
> I've focused on that aspect for this review rather than the perf parts.
> We'll need to figure that story out first as doing this from a bus walk
> makes triggered of a platform driver is not the way I'd expect to see
> this work.
Thank you for your valuable comments. Please see my reply inline.
Best Regards,
Shuai
>> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
>> new file mode 100644
>> index 000000000000..81e534be13fa
>> --- /dev/null
>> +++ b/drivers/perf/dwc_pcie_pmu.c
>> @@ -0,0 +1,976 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Synopsys DesignWare PCIe PMU driver
>> + *
>> + * Copyright (C) 2021, 2022 Alibaba Inc.
>> + */
>> +
>> +#include <linux/pci.h>
>> +#include <linux/bitfield.h>
>> +#include <linux/bitops.h>
>> +#include <linux/cpuhotplug.h>
>> +#include <linux/cpumask.h>
>> +#include <linux/device.h>
>> +#include <linux/errno.h>
>> +#include <linux/kernel.h>
>> +#include <linux/list.h>
>> +#include <linux/perf_event.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/smp.h>
>> +#include <linux/sysfs.h>
>> +#include <linux/types.h>
>> +
>> +#define DRV_NAME "dwc_pcie_pmu"
>> +#define DEV_NAME "dwc_pcie_pmu"
> Put these strings where they are used. That's where people will look for them...
Got it. Will use strings directly in next version.
>
>> +#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
>
> This driver is 'almost' generic. So if you an avoid defines based on a particular
> platform that's definitely good!
Good idea. How about defining RP_NUM_MAX as 64? As far as I know,
some platforms use 2 sockets, with 2 dies per socket.
Then 2 sockets * 2 dies * 4 Root Complexes * 4 Root Ports.
>
>> +#define ATTRI_NAME_MAX_SIZE 32
>> +
>> +#define DWC_PCIE_VSEC_ID 0x02
>> +#define DWC_PCIE_VSEC_REV 0x04
>
> I wouldn't define the REV like this. Put the number inline so we
> can clearly see this is revision 4. VSEC_ID won't change so a
> define for that is fine.
I see. I will use 0x04 directly instead of the REV macro in the next version.
>> +
>> +#define DWC_PCIE_LINK_CAPABILITIES_REG 0xC
> This is PCIE spec defined. Put these in a common header.
Good catch, I will fix in next version.
>> +#define DWC_PCIE_LANE_SHIFT 4
>> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
>> +
>> +#define DWC_PCIE_EVENT_CNT_CTRL 0x8
>> +#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
>
> Why double __? If point is , then
> naming works better
> DWC_PCIE_EVENT_CNT_CTRL_REG
> DWC_PCIE_EVENT_CNT_CTRL_EV_SELECT_MSK etc
Yes, I intended to use a double `__` to indicate that it is a field of a register,
as the CMN and CCN drivers do. I also considered naming with REG explicitly,
but the macros become so long that I often have to wrap code across multiple lines.
Anyway, it's fine to rename them if you still suggest doing so.
>
>> +#define DWC_PCIE__CNT_EVENT_SELECT_MASK GENMASK(27, 16)
>> +#define DWC_PCIE__CNT_LANE_SELECT_SHIFT 8
>> +#define DWC_PCIE__CNT_LANE_SELECT_MASK GENMASK(11, 8)
>> +#define DWC_PCIE__CNT_STATUS_SHIFT 7
>> +#define DWC_PCIE__CNT_STATUS_MASK BIT(7)
>> +#define DWC_PCIE__CNT_ENABLE_SHIFT 2
>
> With FIELD_PREP() / FIELD_GET() you should never need to define the shifts.
> They will be extracted from the masks as needed.
>
>> +#define DWC_PCIE__CNT_ENABLE_MASK GENMASK(4, 2)
>> +#define DWC_PCIE_PER_EVENT_OFF (0x1 << DWC_PCIE__CNT_ENABLE_SHIFT)
> FIELD_PREP() / FIELD_GET() combined with defines for the values.
>
> #define DWC_PCIE_CNT_ENABLE_MASK ...
Got it, I will use FIELD_PREP() / FIELD_GET() to remove the SHIFT macros
and improve code readability.
>
>> +#define DWC_PCIE_PER_EVENT_ON (0x3 << DWC_PCIE__CNT_ENABLE_SHIFT)
>> +#define DWC_PCIE_EVENT_CLEAR_MASK GENMASK(1, 0)
>> +
>> +#define DWC_PCIE_EVENT_CNT_DATA 0xC
>> +
>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_CTRL 0x10
>> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT 24
>> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK GENMASK(31, 24)
>> +#define DWC_PCIE__TIME_BASED_DURATION_SHIFT 8
>> +#define DWC_PCIE__TIME_BASED_DURATION_SELECT GENMASK(15, 8)
>> +#define DWC_PCIE_DURATION_MANUAL_CTRL 0x0
>> +#define DWC_PCIE_DURATION_1MS 0x1
>> +#define DWC_PCIE_DURATION_10MS 0x2
>> +#define DWC_PCIE_DURATION_100MS 0x3
>> +#define DWC_PCIE_DURATION_1S 0x4
>> +#define DWC_PCIE_DURATION_2S 0x5
>> +#define DWC_PCIE_DURATION_4S 0x6
>> +#define DWC_PCIE_DURATION_4US 0xff
>> +#define DWC_PCIE__TIME_BASED_COUNTER_ENABLE 1
>> +
>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW 0x14
>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH 0x18
>> +
>> +/* Event attributes */
>> +#define DWC_PCIE_CONFIG_EVENTID GENMASK(15, 0)
>> +#define DWC_PCIE_CONFIG_TYPE GENMASK(19, 16)
>> +#define DWC_PCIE_CONFIG_LANE GENMASK(27, 20)
>> +
>> +#define DWC_PCIE_EVENT_ID(event) FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
>> +#define DWC_PCIE_EVENT_TYPE(event) FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
>> +#define DWC_PCIE_EVENT_LANE(event) FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
>> +
>> +#define DWC_PCIE_PMU_HAS_REGISTER 1
>> +
>> +enum dwc_pcie_event_type {
>> + DWC_PCIE_TYPE_INVALID,
>> + DWC_PCIE_TIME_BASE_EVENT,
>> + DWC_PCIE_LANE_EVENT,
>> +};
>> +
>> +struct dwc_event_counters {
>> + const char name[32];
>> + u32 event_id;
>> +};
>> +
>> +struct dwc_pcie_pmu {
>> + struct hlist_node node;
>> + unsigned int on_cpu;
>> + struct pmu pmu;
>> + struct device *dev;
>> +};
>> +
>> +struct dwc_pcie_info_table {
>> + u32 bdf;
>> + u32 cap_pos;
>> + u32 num_lanes;
>> + struct pci_dev *pdev;
>> + struct dwc_pcie_pmu pcie_pmu;
>> + u8 pmu_is_register;
>> + struct perf_event *event;
>> +
>> + struct dwc_pcie_event_attr *lane_event_attrs;
>> + struct attribute **pcie_pmu_event_attrs;
>> + struct attribute_group pcie_pmu_event_attrs_group;
>> + const struct attribute_group *pcie_pmu_attr_groups[4];
>> +};
>> +
>> +struct dwc_pcie_pmu_priv {
>> + struct device *dev;
>> + u32 pcie_ctrl_num;
>> + struct dwc_pcie_info_table *pcie_table;
>> +};
>> +
>> +#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func) \
>> + (((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
>
> Superficially this looks pretty standard. Why is is DWC specific?
You are right, it is not DWC specific.
I found a similar definition in arch/ia64/pci/pci.c .
#define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
(((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
Should we move it into a common header first?
>
>> +#define to_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
>
> Prefix that name. I'm hopeful we'll have a PCI SIG defined PMU one
> day and when we do that macro belongs to that!
> to_dwc_pcie_pmu() is possibly fine.
I entirely agree that a standard definition is preferred.
Will use to_dwc_pcie_pmu instead in next version.
>
>> +
>> +static struct platform_device *dwc_pcie_pmu_dev;
>> +static char *event_attr_name = "events";
>> +
>
>
> ...
>
>> +
>> +static int dwc_pcie_find_ras_des_cap_position(struct pci_dev *pdev, int *pos)
>> +{
>> + u32 header;
>> + int vsec = 0;
>> +
>> + while ((vsec = pci_find_next_ext_capability(pdev, vsec,
>> + PCI_EXT_CAP_ID_VNDR))) {
>
> This probably belongs in the PCI core in a similar fashion to the DVSEC
> helper.
>
>> + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &header);
>> + /* Is the device part of a DesignWare Cores PCIe Controller ? */
>
> Good question... This code doesn't check that. VSEC ID is matched only with
> the Vendor ID of the devices - unlike DVSEC where this would all be nice
> and local.
I think a similar fashion is
u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap)
As you see, I don't want to limit this driver to a specific vendor, like
Alibaba (0x1ded), because this driver is generic to all DesignWare Cores PCIe
Controller. Therefore, dwc_pcie_find_ras_des_cap_position does not check vendor
like pci_find_vsec_capability.
Do you mean to use DVSEC instead? I try to read out DVSEC with lspci:
# lspci -vvv
b0:00.0 PCI bridge: Alibaba (China) Co., Ltd. M1 Root Port (rev 01) (prog-if 00 [Normal decode])
[...snip...]
Capabilities: [374 v1] Vendor Specific Information: ID=0002 Rev=4 Len=100 <?>
Capabilities: [474 v1] Vendor Specific Information: ID=0001 Rev=1 Len=038 <?>
Capabilities: [4ac v1] Data Link Feature <?>
Capabilities: [4b8 v1] Designated Vendor-Specific: Vendor=0001 ID=0000 Rev=1 Len=64 <?>
Capabilities: [4fc v1] Vendor Specific Information: ID=0005 Rev=1 Len=018 <?>
How can we tell it's a DesignWare Cores PCIe Controller?
>> + if (PCI_VNDR_HEADER_ID(header) == DWC_PCIE_VSEC_ID &&
>> + PCI_VNDR_HEADER_REV(header) == DWC_PCIE_VSEC_REV) {
>> + *pos = vsec;
>> + return 0;
>> + }
>> + }
>> +
>> + return -ENODEV;
>> +}
>> +
>> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
>> +{
>> + int val, where, index = 0;
>> + struct pci_dev *pdev = NULL;
>> + struct dwc_pcie_info_table *pcie_info;
>> +
>> + priv->pcie_table =
>> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
>> + if (!priv->pcie_table)
>> + return -EINVAL;
>> +
>> + pcie_info = priv->pcie_table;
>> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
>> + index < RP_NUM_MAX) {
>
> This having a driver than then walks the pci topology to find root ports and add
> extra stuff to them is not a clean solution.
>
> The probing should be driven from the existing PCI driver topology.
> There are a bunch of new features we need to add to ports in the near future
> anyway - this would just be another one.
> Same problem exists for CXL CPMU perf devices - so far we only support those
> on end points, partly because we need a clean way to probe them on pci ports.
>
> Whatever we come up with there will apply here as well.
I see your point. Any link to reference?
>
>> + if (!pci_dev_is_rootport(pdev))
>> + continue;
>> +
>> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
>> + pcie_info[index].pdev = pdev;
> Probably want a sanity check this has a vendor ID appropriate the VSEC you are about
> to look for.
If I check the vendor ID here or in dwc_pcie_find_ras_des_cap_position, this driver
will only work for Alibaba as I mentioned before.
>
>> +
>> + if (dwc_pcie_find_ras_des_cap_position(pdev, &where))
>> + continue;
>> +
>> + pcie_info[index].cap_pos = where;
>> +
>> + pci_read_config_dword(pdev,
>> + pdev->pcie_cap + DWC_PCIE_LINK_CAPABILITIES_REG,
>> + &val);
>> + pcie_info[index].num_lanes =
>> + (val & DWC_PCIE_LANE_MASK) >> DWC_PCIE_LANE_SHIFT;
>
> FIELD_GET()
Will fix in next version.
>
>> + index++;
>> + }
>> +
>> + if (!index)
>> + return -ENODEV;
>> +
>> + priv->pcie_ctrl_num = index;
>> +
>> + return 0;
>> +}
>> +
>> +static inline int dwc_pcie_pmu_read_dword(struct dwc_pcie_info_table *pcie_info,
>> + u32 reg, u32 *val)
>> +{
>> + return pci_read_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
>> + val);
>> +}
>> +
>> +static inline int dwc_pcie_pmu_write_dword(struct dwc_pcie_info_table
>> + *pcie_info, u32 reg, u32 val)
>> +{
>> + return pci_write_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
>> + val);
>> +}
>
> These two wrappers don't add a lot so I would drop them.
I see, I will use pci_{write/read}_config_dword directly.
>
>> +
>> +static int dwc_pcie_pmu_set_event_id(struct dwc_pcie_info_table *pcie_info,
>> + int event_id)
>> +{
>> + int ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + val &= ~DWC_PCIE__CNT_ENABLE_MASK;
>> + val &= ~DWC_PCIE__CNT_EVENT_SELECT_MASK;
>> + val |= event_id << DWC_PCIE__CNT_EVENT_SELECT_SHIFT;
>
> FIELD_PREP()
Will fix in next version.
>> +
>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>> + if (ret)
>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>> +
>> + return ret;
>> +}
>
> ...
>
>> +
>> +static int dwc_pcie_pmu_read_base_time_counter(struct dwc_pcie_info_table
>> + *pcie_info, u64 *counter)
>> +{
>> + u32 ret, val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH,
>> + &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + *counter = val;
>> + *counter <<= 32;
>
> This looks like you could get ripping between the upper and lower dwords.
> What prevents that? Perhaps a comment to say why that's not a problem?
The Time-based Analysis Data, which contains the measurement results of
RX/TX data throughput and time spent in each low-power LTSSM state, is 64 bits wide.
The data is provided by two 32-bit registers, so I stitch them together. I will
add a comment here in the next version.
>
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW,
>> + &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + *counter += val;
>> +
>> + return ret;
>> +}
> ...
>
>
>> +static int __dwc_pcie_pmu_probe(struct dwc_pcie_pmu_priv *priv,
>> + struct dwc_pcie_info_table *pcie_info)
>> +{
>> + int ret;
>> + char *name;
>> + struct dwc_pcie_pmu *pcie_pmu;
>> + struct device *dev;
>> +
>> + if (!pcie_info || !pcie_info->pdev) {
>> + pci_err(pcie_info->pdev, "Input parameter is invalid\n");
>> + return -EINVAL;
>> + }
>> +
>> + pcie_pmu = &pcie_info->pcie_pmu;
>> + dev = &pcie_info->pdev->dev;
>> +
>> + ret = dwc_pcie_pmu_attr_init(priv, pcie_info);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PMU attr init fail ret=%d\n", ret);
>> + return ret;
>> + }
>> +
>> + pcie_pmu->dev = dev;
>> + pcie_pmu->pmu = (struct pmu) {
>> + .module = THIS_MODULE,
>> + .task_ctx_nr = perf_invalid_context,
>> + .pmu_enable = NULL,
>> + .pmu_disable = NULL,
>> + .event_init = dwc_pcie_pmu_event_init,
>> + .add = dwc_pcie_pmu_event_add,
>> + .del = dwc_pcie_pmu_event_del,
>> + .start = dwc_pcie_pmu_event_start,
>> + .stop = dwc_pcie_pmu_event_stop,
>> + .read = dwc_pcie_pmu_event_read,
>> + .attr_groups = pcie_info->pcie_pmu_attr_groups,
>> + .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
>> + };
>> +
>> + name = devm_kasprintf(priv->dev, GFP_KERNEL, "pcie_bdf_%x",
>> + pcie_info->bdf);
>> + if (!name)
>> + return -ENOMEM;
>> +
>> + /* Pick one CPU to be the preferred one to use */
>> + pcie_pmu->on_cpu = raw_smp_processor_id();
> Above there are references to multiple dies. Maybe at least make sure you
> are on a near by die? (I'm guessing at topology!)
Good idea, will fix in next version.
>> +
>> + ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "Error %d registering PMU @%x\n", ret,
>> + pcie_info->bdf);
>> + return ret;
>> + }
>> +
>> + pcie_info->pmu_is_register = DWC_PCIE_PMU_HAS_REGISTER;
>
> As below. I think you can drop this state info.
Please see my confusion bellow.
>
>> +
>> + return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_remove(struct platform_device *pdev)
>> +{
>> + struct dwc_pcie_pmu_priv *priv = platform_get_drvdata(pdev);
>> + int index;
>> + struct dwc_pcie_pmu *pcie_pmu;
>> +
>> + for (index = 0; index < priv->pcie_ctrl_num; index++)
>> + if (priv->pcie_table[index].pmu_is_register) {
>> + pcie_pmu = &priv->pcie_table[index].pcie_pmu;
>> + perf_pmu_unregister(&pcie_pmu->pmu);
>> + }
>> + return 0;
>> +}
>> +
>> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
>> +{
>> + int ret = 0;
>
> Initialized in all paths where it is used. Compiler should be able to tell
> that so I doubt you need this to be set to 0 here.
Agree, will leave it as uninitialized.
>
>> + int pcie_index;
>> + struct dwc_pcie_pmu_priv *priv;
>> +
>> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
>> + if (!priv)
>> + return -ENOMEM;
>> + priv->dev = &pdev->dev;
>> + platform_set_drvdata(pdev, priv);
>> +
>> + /* If PMU is not support on current platform, keep slient */
>> + if (dwc_pcie_pmu_discover(priv))
>> + return 0;
>> +
>> + for (pcie_index = 0; pcie_index < priv->pcie_ctrl_num; pcie_index++) {
>> + struct pci_dev *rp = priv->pcie_table[pcie_index].pdev;
>> +
>> + ret = __dwc_pcie_pmu_probe(priv, &priv->pcie_table[pcie_index]);
>> + if (ret) {
>> + dev_err(&rp->dev, "PCIe PMU probe fail\n");
>> + goto pmu_unregister;
>> + }
>> + }
>> + dev_info(&pdev->dev, "PCIe PMUs registered\n");
>
> Noise in the logs. There are lots of ways to know if we reached this point
> so this adds no value.
Got it, will drop this out in next version.
>
>> +
>> + return 0;
>> +
>> +pmu_unregister:
>> + dwc_pcie_pmu_remove(pdev);
>
> I'd much rather see the unwind here directly so we can clearly see that it undoes
> the result of errors in this function. That removes the need to use the
> is_registered flag in the remove() function simplifying that flow as well.
Do you mean that if perf_pmu_register() fails, we should jump to the pmu_unregister label directly?
How can we tell which PMU device failed to register?
>> +
>> + return ret;
>> +}
>> +
>> +static struct platform_driver dwc_pcie_pmu_driver = {
>> + .probe = dwc_pcie_pmu_probe,
>> + .remove = dwc_pcie_pmu_remove,
>> + .driver = {.name = DRV_NAME,},
> More common to format as
> .driver = {
> .name = "dwc_pcie_pmu",
> },
> };
> Note use of string here. Using a define just forces people to
> look for this in the wrong place.
I see, will use string here in next version.
>
>> +};
>> +
>> +static int __init dwc_pcie_pmu_init(void)
>> +{
>> + int ret;
>> +
>> + ret = platform_driver_register(&dwc_pcie_pmu_driver);
>> +
>> + if (ret)
>> + return ret;
>> +
>> + dwc_pcie_pmu_dev =
>> + platform_device_register_simple(DEV_NAME, -1, NULL, 0);
>
> I'd normally expect to see the device created as a result of firmware
> description (ACPI DSDT / or Device tree)
> It is unusual to create a 'real' device directly in the driver
> init - that's normally reserved for various fake / software devices.
I see your concerns. You mentioned that
> The probing should be driven from the existing PCI driver topology.
Should we add a fake device in firmware, or drive probing from the PCI driver topology?
Thank you.
Best Regards,
Shuai
>> + if (IS_ERR(dwc_pcie_pmu_dev)) {
>> + platform_driver_unregister(&dwc_pcie_pmu_driver);
>> + return PTR_ERR(dwc_pcie_pmu_dev);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static void __exit dwc_pcie_pmu_exit(void)
>> +{
>> + platform_device_unregister(dwc_pcie_pmu_dev);
>> + platform_driver_unregister(&dwc_pcie_pmu_driver);
>> +}
>> +
>> +module_init(dwc_pcie_pmu_init);
>> +module_exit(dwc_pcie_pmu_exit);
>> +
>> +MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller");
>> +MODULE_AUTHOR("[email protected]");
>> +MODULE_AUTHOR("[email protected]");
>> +MODULE_LICENSE("GPL v2");
在 2022/9/23 AM1:36, Bjorn Helgaas 写道:
> [+cc linux-pci]
>
> On Sat, Sep 17, 2022 at 08:10:35PM +0800, Shuai Xue wrote:
>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>> Core controller IP which provides statistics feature. The PMU is not a PCIe
>> Root Complex integrated End Point(RCiEP) device but only register counters
>> provided by each PCIe Root Port.
>>
>> To facilitate collection of statistics the controller provides the
>> following two features for each Root Port:
>>
>> - Time Based Analysis (RX/TX data throughput and time spent in each
>> low-power LTSSM state)
>> - Event counters (Error and Non-Error for lanes)
>>
>> Note, only one counter for each type.
>>
>> This driver add PMU devices for each PCIe Root Port. And the PMU device is
>> named based the BDF of Root Port. For example,
>>
>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>>
>> the PMU device name for this Root Port is pcie_bdf_100000.
>>
>> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
>>
>> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>>
>> average RX bandwidth can be calculated like this:
>>
>> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
>>
>> Signed-off-by: Shuai Xue <[email protected]>
>
>> +++ b/drivers/perf/dwc_pcie_pmu.c
>> ...
>> +#define DWC_PCIE_VSEC_ID 0x02
>
> I don't think DWC_PCIE_VSEC_ID is a very good name because it doesn't
> tell us anything about the purpose of the capability. Something like
> DWC_PCIE_RAS_DES_VSEC_ID would be more useful to readers.
Good idea, will use DWC_PCIE_RAS_DES_VSEC_ID instead in next version.
>
>> +#define DWC_PCIE_LINK_CAPABILITIES_REG 0xC
>> +#define DWC_PCIE_LANE_SHIFT 4
>> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
>
> Shouldn't need these at all; see below.
>
>> +struct dwc_pcie_info_table {
>> + u32 bdf;
>> + u32 cap_pos;
>
> Would be useful to name this "ras_des" or similar so we have a hint
> about what we're reading/writing when using "pcie_info->cap_pos" below.
Good idea, will use ras_des instead in next version.
>
>> +static struct device_attribute dwc_pcie_pmu_cpumask_attr =
>> +__ATTR(cpumask, 0444, dwc_pcie_pmu_cpumask_show, NULL);
>
> DEVICE_ATTR_RO()?
>
>> +#define _dwc_pcie_format_attr(_name, _cfg, _fld) \
>> + (&((struct dwc_pcie_format_attr[]) {{ \
>> + .attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL), \
>
> Ditto.
>
>> +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane) \
>> + (&((struct dwc_pcie_event_attr[]) {{ \
>> + .attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL), \
>
> Ditto.
DEVICE_ATTR_RO may be a good choice. But does it fit the code style in
drivers/perf to use DEVICE_ATTR_RO? As far as I know, CCN, CCI, SMMU, and
qcom_l2_pmu use "struct device_attribute" directly.
>
>> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
>> +{
>> + int val, where, index = 0;
>> + struct pci_dev *pdev = NULL;
>> + struct dwc_pcie_info_table *pcie_info;
>> +
>> + priv->pcie_table =
>> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
>> + if (!priv->pcie_table)
>> + return -EINVAL;
>> +
>> + pcie_info = priv->pcie_table;
>> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
>> + index < RP_NUM_MAX) {
>> + if (!pci_dev_is_rootport(pdev))
>> + continue;
>> +
>> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
>> + pcie_info[index].pdev = pdev;
>> +
>> + if (dwc_pcie_find_ras_des_cap_position(pdev, &where))
>> + continue;
>> +
>> + pcie_info[index].cap_pos = where;
>> +
>> + pci_read_config_dword(pdev,
>> + pdev->pcie_cap + DWC_PCIE_LINK_CAPABILITIES_REG,
>> + &val);
>> + pcie_info[index].num_lanes =
>> + (val & DWC_PCIE_LANE_MASK) >> DWC_PCIE_LANE_SHIFT;
>
> I think you can use pcie_get_width_cap() here.
You are right, will use pcie_get_width_cap() instead in next version.
>> +static int dwc_pcie_pmu_set_event_id(struct dwc_pcie_info_table *pcie_info,
>> + int event_id)
>> +{
>> + int ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>
> Maybe #define dev_fmt above to add a prefix to these messages?
> Otherwise I think they will look like:
>
> pcieport 0000:00:1c.0: PCIe read fail
>
> which suggests it's related to pcieport, but that's the wrong place to
> look.
>
> I think every caller of dwc_pcie_pmu_read_dword() makes the same check
> and prints the same message; maybe the message should be moved inside
> dwc_pcie_pmu_read_dword()?
>
> Same with dwc_pcie_pmu_write_dword(); moving the message there would
> simplify all callers.
I would like to drop the dwc_pcie_pmu_{read,write}_dword wrappers, use
pci_{read,write}_config_dword directly, and drop the sanity check of the return
value, as Jonathan suggests. How does that sound to you?
>
>> +static int dwc_pcie_pmu_event_enable(struct dwc_pcie_info_table *pcie_info,
>> + u32 enable)
>> +{
>> + u32 ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + val &= ~(DWC_PCIE__CNT_ENABLE_MASK);
>
> Superfluous parens.
Will use recap in next version.
>
>> +static int dwc_pcie_pmu_base_time_add_prepare(struct dwc_pcie_info_table
>> + *pcie_info, u32 event_id)
>> +{
>> + u32 ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + val &= ~DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK;
>> + val |= event_id << DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT;
>> + val &= ~DWC_PCIE__TIME_BASED_DURATION_SELECT;
>> +
>> + /*
>> + * TIME_BASED_ANALYSIS_DATA_REG is a 64 bit register, we can safely
>> + * use it with any manually controllered duration.
>
> s/controllered/controlled/ ? Not sure what this means. Maybe that
> 64 bits is wide enough you don't need to worry about rollover?
Yes, 64 bits is wide enough so we do not need to worry about rollover.
Sorry for this typo.
>> +static struct dwc_pcie_info_table *pmu_to_pcie_info(struct pmu *pmu)
>> +{
>> + struct dwc_pcie_info_table *pcie_info;
>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
>> +
>> + pcie_info = container_of(pcie_pmu, struct dwc_pcie_info_table, pcie_pmu);
>> + if (pcie_info == NULL)
>> + pci_err(pcie_info->pdev, "Can't get pcie info\n");
>
> It shouldn't be possible to get here for a pmu with no pcie_info, and
> callers don't check for a NULL pointer return value before
> dereferencing it, so I guess all this adds is an error message before
> a NULL pointer oops? Not sure the code clutter is worth it.
Do you mean to drop the sanity check of the container_of result?
>> + return pcie_info;
>> +}
>
>> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
>> +{
>> + struct hw_perf_event *hwc = &event->hw;
>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
>> + struct perf_event *sibling;
>> +
>> + if (event->attr.type != event->pmu->type)
>> + return -ENOENT;
>> +
>> + if (hwc->sample_period) {
>> + dev_dbg(pcie_pmu->dev, "Sampling not supported\n");
>> + return -EOPNOTSUPP;
>> + }
>> +
>> + if (event->cpu < 0) {
>> + dev_dbg(pcie_pmu->dev, "Per-task mode not supported\n");
>> + return -EOPNOTSUPP;
>> + }
>> +
>> + event->cpu = pcie_pmu->on_cpu;
>> +
>> + if (event->group_leader != event &&
>> + !is_software_event(event->group_leader)) {
>> + dev_dbg(pcie_pmu->dev, "Drive way only allow one event!\n");
>
> "Drive way"? -ENOPARSE for me :)
Good catch, it's a typo, and I used this message while debugging the DDR Driveway PMU. Will drop
it in the next version.
>
>> + return -EINVAL;
>> + }
>> +
>> + for_each_sibling_event(sibling, event->group_leader) {
>> + if (sibling != event && !is_software_event(sibling)) {
>> + dev_dbg(pcie_pmu->dev, "Drive way event not allowed!\n");
>> + return -EINVAL;
>> + }
>> + }
>
>> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
>> +{
>> + u64 new = 0;
>
> Superfluous variable.
>
>> + local64_set(&hwc->prev_count, new);
>> +}
I will set with 0 instead in next version.
>
>> +static int __dwc_pcie_pmu_probe(struct dwc_pcie_pmu_priv *priv,
>> + struct dwc_pcie_info_table *pcie_info)
>> +{
>> + int ret;
>> + char *name;
>> + struct dwc_pcie_pmu *pcie_pmu;
>> + struct device *dev;
>> +
>> + if (!pcie_info || !pcie_info->pdev) {
>> + pci_err(pcie_info->pdev, "Input parameter is invalid\n");
>
> There are a lot of "Input parameter is invalid" messages. If somebody
> sees that, there's no hint about which one to look at. Messages that
> are constant strings are usually a hint that they could include more
> information.
I see your points. Will give a more accurate hint.
>> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
>> +{
>> + int ret = 0;
>> + int pcie_index;
>> + struct dwc_pcie_pmu_priv *priv;
>> +
>> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
>> + if (!priv)
>> + return -ENOMEM;
>> + priv->dev = &pdev->dev;
>> + platform_set_drvdata(pdev, priv);
>> +
>> + /* If PMU is not support on current platform, keep slient */
>
> s/not support/not supported/
> s/slient/silent/
Sorry for these typos, will fix in next version.
>
> Bjorn
Thank you for your valuable comments.
Best Regards,
Shuai
在 2022/9/23 AM11:30, Yicong Yang 写道:
> On 2022/9/17 20:10, Shuai Xue wrote:
>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>> Core controller IP which provides statistics feature. The PMU is not a PCIe
>> Root Complex integrated End Point(RCiEP) device but only register counters
>> provided by each PCIe Root Port.
>>
>> To facilitate collection of statistics the controller provides the
>> following two features for each Root Port:
>>
>> - Time Based Analysis (RX/TX data throughput and time spent in each
>> low-power LTSSM state)
>> - Event counters (Error and Non-Error for lanes)
>>
>> Note, only one counter for each type.
>>
>> This driver add PMU devices for each PCIe Root Port. And the PMU device is
>> named based the BDF of Root Port. For example,
>>
>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>>
>> the PMU device name for this Root Port is pcie_bdf_100000.
>>
>> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
>>
>> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>>
>> average RX bandwidth can be calculated like this:
>>
>> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
>>
>> Signed-off-by: Shuai Xue <[email protected]>
>> ---
>> drivers/perf/Kconfig | 7 +
>> drivers/perf/Makefile | 1 +
>> drivers/perf/dwc_pcie_pmu.c | 976 ++++++++++++++++++++++++++++++++++++
>> 3 files changed, 984 insertions(+)
>> create mode 100644 drivers/perf/dwc_pcie_pmu.c
>>
>> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
>> index 1e2d69453771..11ae99de5bbf 100644
>> --- a/drivers/perf/Kconfig
>> +++ b/drivers/perf/Kconfig
>> @@ -192,4 +192,11 @@ config MARVELL_CN10K_DDR_PMU
>> Enable perf support for Marvell DDR Performance monitoring
>> event on CN10K platform.
>>
>> +config CONFIG_DWC_PCIE_PMU
>> + tristate "Enable Synopsys DesignWare PCIe PMU Support"
>> + depends on ARM64 || (COMPILE_TEST && 64BIT)
>> + help
>> + Enable perf support for Synopsys DesignWare PCIe PMU Performance
>> + monitoring event on Yitan 710 platform.
>> +
>> endmenu
>> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
>> index 57a279c61df5..36f75cb0f320 100644
>> --- a/drivers/perf/Makefile
>> +++ b/drivers/perf/Makefile
>> @@ -20,3 +20,4 @@ obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
>> obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
>> obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
>> obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
>> +obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
>> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
>> new file mode 100644
>> index 000000000000..81e534be13fa
>> --- /dev/null
>> +++ b/drivers/perf/dwc_pcie_pmu.c
>> @@ -0,0 +1,976 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Synopsys DesignWare PCIe PMU driver
>> + *
>> + * Copyright (C) 2021, 2022 Alibaba Inc.
>> + */
>> +
>> +#include <linux/pci.h>
>> +#include <linux/bitfield.h>
>> +#include <linux/bitops.h>
>> +#include <linux/cpuhotplug.h>
>> +#include <linux/cpumask.h>
>> +#include <linux/device.h>
>> +#include <linux/errno.h>
>> +#include <linux/kernel.h>
>> +#include <linux/list.h>
>> +#include <linux/perf_event.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/smp.h>
>> +#include <linux/sysfs.h>
>> +#include <linux/types.h>
>> +
>> +#define DRV_NAME "dwc_pcie_pmu"
>> +#define DEV_NAME "dwc_pcie_pmu"
>> +#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
>> +#define ATTRI_NAME_MAX_SIZE 32
>> +
>> +#define DWC_PCIE_VSEC_ID 0x02
>> +#define DWC_PCIE_VSEC_REV 0x04
>> +
>> +#define DWC_PCIE_LINK_CAPABILITIES_REG 0xC
>> +#define DWC_PCIE_LANE_SHIFT 4
>> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
>> +
>> +#define DWC_PCIE_EVENT_CNT_CTRL 0x8
>> +#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
>> +#define DWC_PCIE__CNT_EVENT_SELECT_MASK GENMASK(27, 16)
>> +#define DWC_PCIE__CNT_LANE_SELECT_SHIFT 8
>> +#define DWC_PCIE__CNT_LANE_SELECT_MASK GENMASK(11, 8)
>> +#define DWC_PCIE__CNT_STATUS_SHIFT 7
>> +#define DWC_PCIE__CNT_STATUS_MASK BIT(7)
>> +#define DWC_PCIE__CNT_ENABLE_SHIFT 2
>> +#define DWC_PCIE__CNT_ENABLE_MASK GENMASK(4, 2)
>> +#define DWC_PCIE_PER_EVENT_OFF (0x1 << DWC_PCIE__CNT_ENABLE_SHIFT)
>> +#define DWC_PCIE_PER_EVENT_ON (0x3 << DWC_PCIE__CNT_ENABLE_SHIFT)
>> +#define DWC_PCIE_EVENT_CLEAR_MASK GENMASK(1, 0)
>> +
>> +#define DWC_PCIE_EVENT_CNT_DATA 0xC
>> +
>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_CTRL 0x10
>> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT 24
>> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK GENMASK(31, 24)
>> +#define DWC_PCIE__TIME_BASED_DURATION_SHIFT 8
>> +#define DWC_PCIE__TIME_BASED_DURATION_SELECT GENMASK(15, 8)
>> +#define DWC_PCIE_DURATION_MANUAL_CTRL 0x0
>> +#define DWC_PCIE_DURATION_1MS 0x1
>> +#define DWC_PCIE_DURATION_10MS 0x2
>> +#define DWC_PCIE_DURATION_100MS 0x3
>> +#define DWC_PCIE_DURATION_1S 0x4
>> +#define DWC_PCIE_DURATION_2S 0x5
>> +#define DWC_PCIE_DURATION_4S 0x6
>> +#define DWC_PCIE_DURATION_4US 0xff
>> +#define DWC_PCIE__TIME_BASED_COUNTER_ENABLE 1
>> +
>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW 0x14
>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH 0x18
>> +
>> +/* Event attributes */
>> +#define DWC_PCIE_CONFIG_EVENTID GENMASK(15, 0)
>> +#define DWC_PCIE_CONFIG_TYPE GENMASK(19, 16)
>> +#define DWC_PCIE_CONFIG_LANE GENMASK(27, 20)
>> +
>> +#define DWC_PCIE_EVENT_ID(event) FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
>> +#define DWC_PCIE_EVENT_TYPE(event) FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
>> +#define DWC_PCIE_EVENT_LANE(event) FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
>> +
>> +#define DWC_PCIE_PMU_HAS_REGISTER 1
>> +
>> +enum dwc_pcie_event_type {
>> + DWC_PCIE_TYPE_INVALID,
>> + DWC_PCIE_TIME_BASE_EVENT,
>> + DWC_PCIE_LANE_EVENT,
>> +};
>> +
>> +struct dwc_event_counters {
>> + const char name[32];
>> + u32 event_id;
>> +};
>> +
>> +struct dwc_pcie_pmu {
>> + struct hlist_node node;
>> + unsigned int on_cpu;
>> + struct pmu pmu;
>> + struct device *dev;
>> +};
>> +
>> +struct dwc_pcie_info_table {
>> + u32 bdf;
>> + u32 cap_pos;
>> + u32 num_lanes;
>> + struct pci_dev *pdev;
>> + struct dwc_pcie_pmu pcie_pmu;
>> + u8 pmu_is_register;
>> + struct perf_event *event;
>> +
>> + struct dwc_pcie_event_attr *lane_event_attrs;
>> + struct attribute **pcie_pmu_event_attrs;
>> + struct attribute_group pcie_pmu_event_attrs_group;
>> + const struct attribute_group *pcie_pmu_attr_groups[4];
>> +};
>> +
>> +struct dwc_pcie_pmu_priv {
>> + struct device *dev;
>> + u32 pcie_ctrl_num;
>> + struct dwc_pcie_info_table *pcie_table;
>> +};
>> +
>> +#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func) \
>> + (((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
>
> Just pass pdev->devfn and use PCI_DEVID() to simplify here.
Sorry, as far as I know, the output of PCI_DEVID() is not exactly the BDF.
For example, BDF 300100 is decoded as 3008.
>
>> +#define to_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
>> +
>> +static struct platform_device *dwc_pcie_pmu_dev;
>> +static char *event_attr_name = "events";
>> +
>> +static ssize_t dwc_pcie_pmu_cpumask_show(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
>> +
>> + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
>> +}
>> +
>> +static struct device_attribute dwc_pcie_pmu_cpumask_attr =
>> +__ATTR(cpumask, 0444, dwc_pcie_pmu_cpumask_show, NULL);
>> +
>> +static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = {
>> + &dwc_pcie_pmu_cpumask_attr.attr,
>> + NULL
>> +};
>> +
>> +static struct attribute_group pcie_pmu_cpumask_attrs_group = {
>> + .attrs = dwc_pcie_pmu_cpumask_attrs,
>> +};
>> +
>> +struct dwc_pcie_format_attr {
>> + struct device_attribute attr;
>> + u64 field;
>> + int config;
>> +};
>> +
>> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
>> + int lo = __ffs(fmt->field), hi = __fls(fmt->field);
>> +
>> + if (lo == hi)
>> + return snprintf(buf, PAGE_SIZE, "config:%d\n", lo);
>> +
>> + if (!fmt->config)
>> + return snprintf(buf, PAGE_SIZE, "config:%d-%d\n", lo, hi);
>> +
>> + return snprintf(buf, PAGE_SIZE, "config%d:%d-%d\n", fmt->config, lo,
>> + hi);
>> +}
>> +
>> +#define _dwc_pcie_format_attr(_name, _cfg, _fld) \
>> + (&((struct dwc_pcie_format_attr[]) {{ \
>> + .attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL), \
>> + .config = _cfg, \
>> + .field = _fld, \
>> + }})[0].attr.attr)
>> +
>> +#define dwc_pcie_format_attr(_name, _fld) _dwc_pcie_format_attr(_name, 0, _fld)
>> +
>> +static struct attribute *dwc_pcie_format_attrs[] = {
>> + dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
>> + dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
>> + dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
>> + NULL,
>> +};
>> +
>> +static struct attribute_group pcie_pmu_format_attrs_group = {
>> + .name = "format",
>> + .attrs = dwc_pcie_format_attrs,
>> +};
>> +
>> +struct dwc_pcie_event_attr {
>> + struct device_attribute attr;
>> + enum dwc_pcie_event_type type;
>> + u16 eventid;
>> + u8 lane;
>> +};
>> +
>> +ssize_t dwc_pcie_event_show(struct device *dev,
>> + struct device_attribute *attr, char *page)
>> +{
>> + struct dwc_pcie_event_attr *eattr;
>> +
>> + eattr = container_of(attr, typeof(*eattr), attr);
>> +
>> + if (eattr->type == DWC_PCIE_LANE_EVENT)
>> + return sprintf(page, "eventid=0x%lx, type=0x%lx, lane=0x%lx\n",
>> + (unsigned long)eattr->eventid,
>> + (unsigned long)eattr->type,
>> + (unsigned long)eattr->lane);
>> + else
>> + return sprintf(page, "eventid=0x%lx, type=0x%lx",
>> + (unsigned long)eattr->eventid,
>> + (unsigned long)eattr->type);
>> +}
>
> I remember sysfs_emit() is preferred.
You are right, I will use sysfs_emit() in next version.
>
>> +
>> +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane) \
>> + (&((struct dwc_pcie_event_attr[]) {{ \
>> + .attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL), \
>> + .type = _type, \
>> + .eventid = _eventid, \
>> + .lane = _lane, \
>> + }})[0].attr.attr)
>> +
>> +#define DWC_PCIE_PMU_BASE_TIME_ATTR(_name, _eventid) \
>> + DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0)
>> +
>> +static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
>> + /* Group #0 */
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(one_cycle, 0x00),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S, 0x01),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S, 0x02),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L0, 0x03),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1, 0x04),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_1, 0x05),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_2, 0x06),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY, 0x07),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S, 0x08),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_AUX, 0x09),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(ONE_cycle, 0x10),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S_, 0x11),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S_, 0x12),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L0_, 0x13),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_, 0x17),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY_, 0x17),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S_, 0x18),
>> + /* Group #1 */
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
>> + NULL
>> +};
>> +
>> +static inline umode_t pcie_pmu_event_attr_is_visible(struct kobject *kobj,
>> + struct attribute *attr,
>> + int unuse)
>> +{
>> + return attr->mode;
>> +}
>> +
>> +static inline bool pci_dev_is_rootport(struct pci_dev *pdev)
>> +{
>> + return (pci_is_pcie(pdev) &&
>> + pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT);
>> +}
>> +
>> +static inline unsigned int dwc_pcie_get_bdf(struct pci_dev *dev)
>> +{
>> + return (DWC_PCIE_CREATE_BDF(pci_domain_nr(dev->bus), dev->bus->number,
>> + PCI_SLOT(dev->devfn),
>> + PCI_FUNC(dev->devfn)));
>> +}
>> +
>> +static int dwc_pcie_find_ras_des_cap_position(struct pci_dev *pdev, int *pos)
>> +{
>> + u32 header;
>> + int vsec = 0;
>> +
>> + while ((vsec = pci_find_next_ext_capability(pdev, vsec,
>> + PCI_EXT_CAP_ID_VNDR))) {
>> + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &header);
>> + /* Is the device part of a DesignWare Cores PCIe Controller ? */
>> + if (PCI_VNDR_HEADER_ID(header) == DWC_PCIE_VSEC_ID &&
>> + PCI_VNDR_HEADER_REV(header) == DWC_PCIE_VSEC_REV) {
>> + *pos = vsec;
>> + return 0;
>> + }
>> + }
>> +
>> + return -ENODEV;
>> +}
>> +
>> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
>> +{
>> + int val, where, index = 0;
>> + struct pci_dev *pdev = NULL;
>> + struct dwc_pcie_info_table *pcie_info;
>> +
>> + priv->pcie_table =
>> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
>> + if (!priv->pcie_table)
>> + return -EINVAL;
>> +
>> + pcie_info = priv->pcie_table;
>> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
>
> I may miss but I don't pci_dev_put() to balance the reference cnt.
As the comment on pci_get_device() explains, the reference count is
incremented and decremented automatically within the loop. So we do not
need to use pci_dev_put(), right?
Iterates through the list of known PCI devices. If a PCI device is
found with a matching @vendor and @device, *the reference count to the
device is incremented* and a pointer to its device structure is returned.
Otherwise, %NULL is returned. A new search is initiated by passing %NULL
as the @from argument. Otherwise if @from is not %NULL, searches continue
from next device on the global list. *The reference count for @from is
always decremented if it is not %NULL.*
>
>> + index < RP_NUM_MAX) {
>> + if (!pci_dev_is_rootport(pdev))
>> + continue;
>> +
>> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
>> + pcie_info[index].pdev = pdev;
>> +
>> + if (dwc_pcie_find_ras_des_cap_position(pdev, &where))
>> + continue;
>> +
>> + pcie_info[index].cap_pos = where;
>> +
>> + pci_read_config_dword(pdev,
>> + pdev->pcie_cap + DWC_PCIE_LINK_CAPABILITIES_REG,
>> + &val);
>> + pcie_info[index].num_lanes =
>> + (val & DWC_PCIE_LANE_MASK) >> DWC_PCIE_LANE_SHIFT;
>> + index++;
>> + }
>> +
>> + if (!index)
>> + return -ENODEV;
>> +
>> + priv->pcie_ctrl_num = index;
>> +
>> + return 0;
>> +}
>> +
>> +static inline int dwc_pcie_pmu_read_dword(struct dwc_pcie_info_table *pcie_info,
>> + u32 reg, u32 *val)
>> +{
>> + return pci_read_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
>> + val);
>> +}
>> +
>> +static inline int dwc_pcie_pmu_write_dword(struct dwc_pcie_info_table
>> + *pcie_info, u32 reg, u32 val)
>> +{
>> + return pci_write_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
>> + val);
>> +}
>> +
>> +static int dwc_pcie_pmu_set_event_id(struct dwc_pcie_info_table *pcie_info,
>> + int event_id)
>> +{
>> + int ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + val &= ~DWC_PCIE__CNT_ENABLE_MASK;
>> + val &= ~DWC_PCIE__CNT_EVENT_SELECT_MASK;
>> + val |= event_id << DWC_PCIE__CNT_EVENT_SELECT_SHIFT;
>> +
>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>> + if (ret)
>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>> +
>> + return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_write_event_lane(struct dwc_pcie_info_table *pcie_info,
>> + int lane, int event_id)
>> +{
>> + u32 ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + val &= ~DWC_PCIE__CNT_LANE_SELECT_MASK;
>> + val |= lane << DWC_PCIE__CNT_LANE_SELECT_SHIFT;
>> +
>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>> + if (ret)
>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>> +
>> + return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_event_enable(struct dwc_pcie_info_table *pcie_info,
>> + u32 enable)
>> +{
>> + u32 ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>
> Somebody may mentioned. Maybe you don't need to print these messages in PMU ops, just
> return the correct error code and let perf handle it. Or you should provide more
> information for these, like failed in which funcion or read/write which value.
> If it only necessary when debugging, make it pci_dbg().
Yep, you are right, I will drop the print info in next version.
>
>> + return ret;
>> + }
>> +
>> + val &= ~(DWC_PCIE__CNT_ENABLE_MASK);
>> +
>> + if (enable)
>> + val |= DWC_PCIE_PER_EVENT_ON;
>> + else
>> + val |= DWC_PCIE_PER_EVENT_OFF;
>> +
>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>> + if (ret)
>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>> +
>> + return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_base_time_enable(struct dwc_pcie_info_table *pcie_info,
>> + u32 enable)
>> +{
>> + u32 ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + if (enable)
>> + val |= DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
>> + else
>> + val &= ~DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
>> +
>> + ret = dwc_pcie_pmu_write_dword(pcie_info,
>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
>> + if (ret)
>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>> +
>> + return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_read_event_counter(struct dwc_pcie_info_table
>> + *pcie_info, u64 *counter)
>> +{
>> + u32 ret, val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_DATA, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> + *counter = val;
>> +
>> + return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_read_base_time_counter(struct dwc_pcie_info_table
>> + *pcie_info, u64 *counter)
>> +{
>> + u32 ret, val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH,
>> + &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + *counter = val;
>> + *counter <<= 32;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW,
>> + &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + *counter += val;
>> +
>> + return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_clear_event_counter(struct dwc_pcie_info_table
>> + *pcie_info)
>> +{
>> + u32 ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + val &= ~DWC_PCIE_EVENT_CLEAR_MASK;
>> + val |= 1;
>
> It's better to use a macro for '1' to make it more clear.
Good idea, will fix it in next version.
>
>> +
>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>> + if (ret)
>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>> +
>> + return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_base_time_add_prepare(struct dwc_pcie_info_table
>> + *pcie_info, u32 event_id)
>> +{
>> + u32 ret;
>> + u32 val;
>> +
>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>> + return ret;
>> + }
>> +
>> + val &= ~DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK;
>> + val |= event_id << DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT;
>> + val &= ~DWC_PCIE__TIME_BASED_DURATION_SELECT;
>> +
>> + /*
>> + * TIME_BASED_ANALYSIS_DATA_REG is a 64 bit register, we can safely
>> + * use it with any manually controllered duration.
>> + */
>> + val &= ~(DWC_PCIE__TIME_BASED_DURATION_SELECT);
>> + val |= DWC_PCIE_DURATION_MANUAL_CTRL;
>> +
>> + ret = dwc_pcie_pmu_write_dword(pcie_info,
>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
>> + if (ret)
>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>> +
>> + return ret;
>> +}
>> +
>> +static struct dwc_pcie_info_table *pmu_to_pcie_info(struct pmu *pmu)
>> +{
>> + struct dwc_pcie_info_table *pcie_info;
>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
>> +
>> + pcie_info = container_of(pcie_pmu, struct dwc_pcie_info_table, pcie_pmu);
>> + if (pcie_info == NULL)
>> + pci_err(pcie_info->pdev, "Can't get pcie info\n");
>> +
>> + return pcie_info;
>> +}
>> +
>> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
>> +{
>> + u64 counter;
>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>> + struct hw_perf_event *hwc = &event->hw;
>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> + u64 delta, prev, now;
>> +
>> + do {
>> + prev = local64_read(&hwc->prev_count);
>> +
>> + if (type == DWC_PCIE_LANE_EVENT)
>> + dwc_pcie_pmu_read_event_counter(pcie_info, &counter);
>> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> + dwc_pcie_pmu_read_base_time_counter(pcie_info,
>> + &counter);
>> + else
>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>> +
>
> For the messages in PMU ops, you should print the message on behalf of PMU device
> rather than PCIe device. Same for the other places.
Good idea, will fix it in next version.
>
>> + now = counter;
>> + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
>> +
>> + delta = now - prev;
>> +
>> + local64_add(delta, &event->count);
>> +}
>> +
>> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
>> +{
>> + struct hw_perf_event *hwc = &event->hw;
>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
>> + struct perf_event *sibling;
>> +
>> + if (event->attr.type != event->pmu->type)
>> + return -ENOENT;
>> +
>> + if (hwc->sample_period) {
>> + dev_dbg(pcie_pmu->dev, "Sampling not supported\n");
>> + return -EOPNOTSUPP;
>> + }
>> +
>> + if (event->cpu < 0) {
>> + dev_dbg(pcie_pmu->dev, "Per-task mode not supported\n");
>> + return -EOPNOTSUPP;
>> + }
>> +
>> + event->cpu = pcie_pmu->on_cpu;
>> +
>> + if (event->group_leader != event &&
>> + !is_software_event(event->group_leader)) {
>> + dev_dbg(pcie_pmu->dev, "Drive way only allow one event!\n");
>> + return -EINVAL;
>> + }
>> +
>> + for_each_sibling_event(sibling, event->group_leader) {
>> + if (sibling != event && !is_software_event(sibling)) {
>> + dev_dbg(pcie_pmu->dev, "Drive way event not allowed!\n");
>> + return -EINVAL;
>> + }
>> + }
>> +
>> + hwc->idx = -1;
>> +
>> + return 0;
>> +}
>> +
>> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
>> +{
>> + u64 new = 0;
>> +
>
> redundant 'new'.
>
>> + local64_set(&hwc->prev_count, new);
>> +}
Yep, will fix it in next version.
>> +
>> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
>> +{
>> + struct hw_perf_event *hwc = &event->hw;
>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +
>> + hwc->state = 0;
>> + dwc_pcie_pmu_set_period(hwc);
>> +
>> + if (type == DWC_PCIE_LANE_EVENT)
>> + dwc_pcie_pmu_event_enable(pcie_info, 1);
>> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> + dwc_pcie_pmu_base_time_enable(pcie_info, 1);
>> + else
>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>> +}
>> +
>> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
>> +{
>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +
>> + if (event->hw.state & PERF_HES_STOPPED)
>> + return;
>> +
>> + if (type == DWC_PCIE_LANE_EVENT)
>> + dwc_pcie_pmu_event_enable(pcie_info, 0);
>> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> + dwc_pcie_pmu_base_time_enable(pcie_info, 0);
>> + else
>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>
> If the message is necessary, it'll be more helpful to mention which param
> is invalid.
I see, I will give a more specific hint in the log message.
>
>> +
>> + dwc_pcie_pmu_event_update(event);
>> +}
>> +
>> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
>> +{
>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>> + struct hw_perf_event *hwc = &event->hw;
>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> + int event_id = DWC_PCIE_EVENT_ID(event);
>> + int lane = DWC_PCIE_EVENT_LANE(event);
>> +
>> + if (pcie_info->event)
>> + return -ENOSPC;
>> +
>> + pcie_info->event = event;
>> +
>> + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
>> +
>> + if (type == DWC_PCIE_LANE_EVENT) {
>> + dwc_pcie_pmu_event_enable(pcie_info, 0);
>> + dwc_pcie_pmu_write_event_lane(pcie_info, lane, event_id);
>> + dwc_pcie_pmu_set_event_id(pcie_info, event_id);
>> + dwc_pcie_pmu_clear_event_counter(pcie_info);
>> + } else if (type == DWC_PCIE_TIME_BASE_EVENT) {
>> + dwc_pcie_pmu_base_time_enable(pcie_info, 0);
>> + dwc_pcie_pmu_base_time_add_prepare(pcie_info, event_id);
>> + } else {
>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>> + return -EINVAL;
>> + }
>> +
>> + if (flags & PERF_EF_START)
>> + dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD);
>> +
>> + perf_event_update_userpage(event);
>> +
>> + return 0;
>> +}
>> +
>> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
>> +{
>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>> +
>> + dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
>> + perf_event_update_userpage(event);
>> + pcie_info->event = NULL;
>> +}
>> +
>> +static void dwc_pcie_pmu_event_read(struct perf_event *event)
>> +{
>> + dwc_pcie_pmu_event_update(event);
>> +}
>> +
>> +static struct dwc_event_counters event_array[] = {
>> + {"tx_ack_dllp", 0x600},
>> + {"tx_update_fc_dllp", 0x601},
>> + {"rx_ack_dllp", 0x602},
>> + {"rx_update_fc_dllp", 0x603},
>> + {"rx_nulified_tlp", 0x604},
>> + {"tx_nulified_tlp", 0x605},
>> + {"rx_duplicate_tlp", 0x606},
>> + {"tx_memory_write", 0x700},
>> + {"tx_memory_read", 0x701},
>> + {"tx_configuration_write", 0x702},
>> + {"tx_configuration_read", 0x703},
>> + {"tx_io_write", 0x704},
>> + {"tx_io_read", 0x705},
>> + {"tx_completion_without_data", 0x706},
>> + {"tx_completion_with_data", 0x707},
>> + {"tx_message_tlp", 0x708},
>> + {"tx_atomic", 0x709},
>> + {"tx_tlp_with_prefix", 0x70A},
>> + {"rx_memory_write", 0x70B},
>> + {"rx_memory_read", 0x70C},
>> + {"rx_io_write", 0x70F},
>> + {"rx_io_read", 0x710},
>> + {"rx_completion_without_data", 0x711},
>> + {"rx_completion_with_data", 0x712},
>> + {"rx_message_tlp", 0x713},
>> + {"rx_atomic", 0x714},
>> + {"rx_tlp_with_prefix", 0x715},
>> + {"tx_ccix_tlp", 0x716},
>> + {"rx_ccix_tlp", 0x717},
>> +};
>> +
>> +static int dwc_pcie_pmu_attr_init(struct dwc_pcie_pmu_priv *priv,
>> + struct dwc_pcie_info_table *pcie_info)
>> +{
>> + int i, j;
>> + char lane[8];
>> + const char tmp[64];
>> + int events_per_lane;
>> + int num_lane_events;
>> + int time_base_count;
>> + int num_attrs, attr_idx;
>> + struct dwc_pcie_event_attr *lane_attrs;
>> + struct attribute **pmu_attrs;
>> +
>> + memset((void *)tmp, 0, sizeof(tmp));
>> + memset((void *)lane, 0, sizeof(lane));
>> + time_base_count = ARRAY_SIZE(dwc_pcie_pmu_time_event_attrs);
>> + events_per_lane = ARRAY_SIZE(event_array);
>> + num_lane_events = pcie_info->num_lanes * events_per_lane;
>> + num_attrs = time_base_count + num_lane_events;
>> +
>> + pcie_info->lane_event_attrs =
>> + devm_kcalloc(priv->dev, num_lane_events,
>> + sizeof(struct dwc_pcie_event_attr),
>> + GFP_KERNEL);
>> + if (!pcie_info->lane_event_attrs)
>> + return -ENOMEM;
>> + lane_attrs = pcie_info->lane_event_attrs;
>> + pcie_info->pcie_pmu_event_attrs =
>> + devm_kcalloc(priv->dev, num_attrs, sizeof(struct attribute *),
>> + GFP_KERNEL);
>> + if (!pcie_info->pcie_pmu_event_attrs)
>> + return -ENOMEM;
>> + pmu_attrs = pcie_info->pcie_pmu_event_attrs;
>> +
>> + for (i = 0; i < num_lane_events; i++) {
>> + lane_attrs[i].attr.attr.name =
>> + devm_kzalloc(priv->dev, sizeof(char)
>> + * ATTRI_NAME_MAX_SIZE, GFP_KERNEL);
>> + if (!lane_attrs[i].attr.attr.name)
>> + return -ENOMEM;
>> + }
>> +
>> + attr_idx = 0;
>> + for (i = 0; i < pcie_info->num_lanes; i++) {
>> + sprintf(lane, "_lane%d", i);
>> +
>> + for (j = 0; j < events_per_lane; j++) {
>> + int pos = i * events_per_lane + j;
>> +
>> + strcat((char *)tmp, event_array[j].name);
>> + strcat((char *)tmp, lane);
>> + memcpy((void *)lane_attrs[pos].attr.attr.name,
>> + (void *)tmp,
>> + sizeof(tmp));
>> +
>> + lane_attrs[pos].attr.attr.mode =
>> + VERIFY_OCTAL_PERMISSIONS(0444);
>> + lane_attrs[pos].attr.show = dwc_pcie_event_show;
>> + lane_attrs[pos].attr.store = NULL;
>> + lane_attrs[pos].type = DWC_PCIE_LANE_EVENT;
>> + lane_attrs[pos].eventid = event_array[j].event_id;
>> + lane_attrs[pos].lane = i;
>> + pmu_attrs[attr_idx++] = &lane_attrs[pos].attr.attr;
>> +
>> + memset((void *)tmp, 0, sizeof(tmp));
>> + }
>> + }
>> +
>> + for (i = 0; i < ARRAY_SIZE(dwc_pcie_pmu_time_event_attrs); i++)
>> + pmu_attrs[attr_idx++] = dwc_pcie_pmu_time_event_attrs[i];
>> +
>> + pcie_info->pcie_pmu_event_attrs[attr_idx++] = NULL;
>> +
>> + pcie_info->pcie_pmu_event_attrs_group.name = event_attr_name;
>> + pcie_info->pcie_pmu_event_attrs_group.is_visible =
>> + pcie_pmu_event_attr_is_visible;
>> + pcie_info->pcie_pmu_event_attrs_group.attrs =
>> + pcie_info->pcie_pmu_event_attrs;
>> +
>> + pcie_info->pcie_pmu_attr_groups[0] =
>> + &pcie_info->pcie_pmu_event_attrs_group;
>> + pcie_info->pcie_pmu_attr_groups[1] = &pcie_pmu_format_attrs_group;
>> + pcie_info->pcie_pmu_attr_groups[2] = &pcie_pmu_cpumask_attrs_group;
>> + pcie_info->pcie_pmu_attr_groups[3] = NULL;
>> +
>> + return 0;
>> +}
>> +
>> +static int __dwc_pcie_pmu_probe(struct dwc_pcie_pmu_priv *priv,
>> + struct dwc_pcie_info_table *pcie_info)
>> +{
>> + int ret;
>> + char *name;
>> + struct dwc_pcie_pmu *pcie_pmu;
>> + struct device *dev;
>> +
>> + if (!pcie_info || !pcie_info->pdev) {
>> + pci_err(pcie_info->pdev, "Input parameter is invalid\n");
>> + return -EINVAL;
>> + }
>> +
>> + pcie_pmu = &pcie_info->pcie_pmu;
>> + dev = &pcie_info->pdev->dev;
>> +
>> + ret = dwc_pcie_pmu_attr_init(priv, pcie_info);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "PMU attr init fail ret=%d\n", ret);
>> + return ret;
>> + }
>> +
>> + pcie_pmu->dev = dev;
>> + pcie_pmu->pmu = (struct pmu) {
>> + .module = THIS_MODULE,
>> + .task_ctx_nr = perf_invalid_context,
>> + .pmu_enable = NULL,
>> + .pmu_disable = NULL,
>> + .event_init = dwc_pcie_pmu_event_init,
>> + .add = dwc_pcie_pmu_event_add,
>> + .del = dwc_pcie_pmu_event_del,
>> + .start = dwc_pcie_pmu_event_start,
>> + .stop = dwc_pcie_pmu_event_stop,
>> + .read = dwc_pcie_pmu_event_read,
>> + .attr_groups = pcie_info->pcie_pmu_attr_groups,
>> + .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
>> + };
>> +
>> + name = devm_kasprintf(priv->dev, GFP_KERNEL, "pcie_bdf_%x",
>> + pcie_info->bdf);
>> + if (!name)
>> + return -ENOMEM;
>> +
>> + /* Pick one CPU to be the preferred one to use */
>> + pcie_pmu->on_cpu = raw_smp_processor_id();
>> +
>
> So we'll probabley bind all the pmus on one single CPU, is it intended? Since it's
> an uncore PMU, we can make it run on any cpu (or for locality CPU on the controller's
> NUMA node).
>
> And I didn't see you register a hotplug handler, so what if the ->on_cpu is hot removed?
This PMU does not support interrupts at all, so we do not need to bind it to a CPU.
Should we remove this line?
>
>> + ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
>> + if (ret) {
>> + pci_err(pcie_info->pdev, "Error %d registering PMU @%x\n", ret,
>> + pcie_info->bdf);
>
> will be more helpful to print the bdf as format <bus>:<dev>:<func>.
Good idea, will fix in next version.
>
>> + return ret;
>> + }
>> +
>> + pcie_info->pmu_is_register = DWC_PCIE_PMU_HAS_REGISTER;
>
> Make @pmu_is_register a boolean will be more clear.
@pmu_is_register is also discussed in Jonathan's reply. Jonathan suggests
removing it, so let's discuss whether to keep this field first :) If we decide to keep
it, I will make it a boolean.
>
>> +
>> + return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_remove(struct platform_device *pdev)
>> +{
>> + struct dwc_pcie_pmu_priv *priv = platform_get_drvdata(pdev);
>> + int index;
>> + struct dwc_pcie_pmu *pcie_pmu;
>
> Make the long line first when declaring.
Agree, will change the code style.
>
>> +
>> + for (index = 0; index < priv->pcie_ctrl_num; index++)
>> + if (priv->pcie_table[index].pmu_is_register) {
>> + pcie_pmu = &priv->pcie_table[index].pcie_pmu;
>> + perf_pmu_unregister(&pcie_pmu->pmu);
>> + }
>> + return 0;
>> +}
>> +
>> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
>> +{
>> + int ret = 0;
>> + int pcie_index;
>> + struct dwc_pcie_pmu_priv *priv;
>> +
>> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
>> + if (!priv)
>> + return -ENOMEM;
>> + priv->dev = &pdev->dev;
>> + platform_set_drvdata(pdev, priv);
>> +
>> + /* If PMU is not support on current platform, keep slient */
>> + if (dwc_pcie_pmu_discover(priv))
>> + return 0;
>> +
>> + for (pcie_index = 0; pcie_index < priv->pcie_ctrl_num; pcie_index++) {
>> + struct pci_dev *rp = priv->pcie_table[pcie_index].pdev;
>> +
>> + ret = __dwc_pcie_pmu_probe(priv, &priv->pcie_table[pcie_index]);
>> + if (ret) {
>> + dev_err(&rp->dev, "PCIe PMU probe fail\n");
>> + goto pmu_unregister;
>> + }
>> + }
>> + dev_info(&pdev->dev, "PCIe PMUs registered\n");
>> +
>
> As Jonathan mentioned this message maybe unnecessary, but I may find it useful if you
> print how many PMU's registered.
Fine, I can add a count here.
>
> On one PMU registration failed, you just remove all the PMUs registered. I wonder if
> it's better to make already registered PMU stay instead of removing them all.
If perf_pmu_register fails, is it necessary to call perf_pmu_unregister? I did not find
a similar implementation that unregisters the pmu when perf_pmu_register fails.
>
> Glad to see another PCIe PMU device!
>
> Thanks,
> Yicong
Thank you for your valuable comments. I hope we can upstream this driver too :)
Cheers,
Shuai
>
>> + return 0;
>> +
>> +pmu_unregister:
>> + dwc_pcie_pmu_remove(pdev);
>> +
>> + return ret;
>> +}
>> +
>> +static struct platform_driver dwc_pcie_pmu_driver = {
>> + .probe = dwc_pcie_pmu_probe,
>> + .remove = dwc_pcie_pmu_remove,
>> + .driver = {.name = DRV_NAME,},
>> +};
>> +
>> +static int __init dwc_pcie_pmu_init(void)
>> +{
>> + int ret;
>> +
>> + ret = platform_driver_register(&dwc_pcie_pmu_driver);
>> +
>> + if (ret)
>> + return ret;
>> +
>> + dwc_pcie_pmu_dev =
>> + platform_device_register_simple(DEV_NAME, -1, NULL, 0);
>> + if (IS_ERR(dwc_pcie_pmu_dev)) {
>> + platform_driver_unregister(&dwc_pcie_pmu_driver);
>> + return PTR_ERR(dwc_pcie_pmu_dev);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static void __exit dwc_pcie_pmu_exit(void)
>> +{
>> + platform_device_unregister(dwc_pcie_pmu_dev);
>> + platform_driver_unregister(&dwc_pcie_pmu_driver);
>> +}
>> +
>> +module_init(dwc_pcie_pmu_init);
>> +module_exit(dwc_pcie_pmu_exit);
>> +
>> +MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller");
>> +MODULE_AUTHOR("[email protected]");
>> +MODULE_AUTHOR("[email protected]");
>> +MODULE_LICENSE("GPL v2");
>>
>
> >
> >> +#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
> >
> > This driver is 'almost' generic. So if you an avoid defines based on a particular
> > platform that's definitely good!
>
> Good idea. How about defining RP_NUM_MAX as 64? As far as I know,
> some platforms use 2 sockets, 2 dies per socket.
> Then 2 sockets * 2 dies * 4 Root Complexes * 4 root ports.
Setting a reasonable maximum is fine - but make sure the code then fails with
a suitable error message if there are more!
> >> +#define DWC_PCIE_LANE_SHIFT 4
> >> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
> >> +
> >> +#define DWC_PCIE_EVENT_CNT_CTRL 0x8
> >> +#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
> >
> > Why double __? If point is , then
> > naming works better
> > DWC_PCIE_EVENT_CNT_CTRL_REG
> > DWC_PCIE_EVENT_CNT_CTRL_EV_SELECT_MSK etc
>
> Yes, I intend to use double `__` to indicate it is a field of a register,
> as the CMN and CCN drivers do. I also considered naming with REG explicitly,
> but the macro is so long that I often have to wrap code into multiple lines.
> Anyway, it's fine to rename if you still suggest doing so.
I don't particularly mind. This convention was new to me.
> >> +struct dwc_pcie_pmu_priv {
> >> + struct device *dev;
> >> + u32 pcie_ctrl_num;
> >> + struct dwc_pcie_info_table *pcie_table;
> >> +};
> >> +
> >> +#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func) \
> >> + (((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
> >
> > Superficially this looks pretty standard. Why is is DWC specific?
>
> You are right, it is not DWC specific.
>
> I found a similar definition in arch/ia64/pci/pci.c .
>
> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
>
> Should we move it into a common header first?
Maybe. The bus, devfn, reg part is standard BDF, but I don't think
the PCI 6.0 spec defined a version with the seg in the upper bits.
I'm not sure if we want to adopt that in Linux.
> >
> >> + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &header);
> >> + /* Is the device part of a DesignWare Cores PCIe Controller ? */
> >
> > Good question... This code doesn't check that. VSEC ID is matched only with
> > the Vendor ID of the devices - unlike DVSEC where this would all be nice
> > and local.
>
> I think a similar fashion is
>
> u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap)
>
> As you see, I don't want to limit this driver to a specific vendor, like
> Alibaba (0x1ded), because this driver is generic to all DesignWare Cores PCIe
> Controller. Therefore, dwc_pcie_find_ras_des_cap_position does not check vendor
> like pci_find_vsec_capability.
You can't do that because another vendor could use the same VSEC ID for
an entirely different purpose. They are only valid in combination with the device VID.
The only way this can work is with a list of specific vendor ID / VSEC pairs for
known devices.
>
> Do you mean to use DVSEC instead? I try to read out DVSEC with lspci:
>
> # lspci -vvv
> b0:00.0 PCI bridge: Alibaba (China) Co., Ltd. M1 Root Port (rev 01) (prog-if 00 [Normal decode])
> [...snip...]
> Capabilities: [374 v1] Vendor Specific Information: ID=0002 Rev=4 Len=100 <?>
> Capabilities: [474 v1] Vendor Specific Information: ID=0001 Rev=1 Len=038 <?>
> Capabilities: [4ac v1] Data Link Feature <?>
> Capabilities: [4b8 v1] Designated Vendor-Specific: Vendor=0001 ID=0000 Rev=1 Len=64 <?>
> Capabilities: [4fc v1] Vendor Specific Information: ID=0005 Rev=1 Len=018 <?>
>
> How can we tell it's a DesignWare Cores PCIe Controller?
Gah. This is what DVSEC was defined to solve. It lets you have a common
vendor defined extended capability defined by a vendor, independent of the
VID of a given device. With a VSEC you can't write generic code.
>
>
> >> + if (PCI_VNDR_HEADER_ID(header) == DWC_PCIE_VSEC_ID &&
> >> + PCI_VNDR_HEADER_REV(header) == DWC_PCIE_VSEC_REV) {
> >> + *pos = vsec;
> >> + return 0;
> >> + }
> >> + }
> >> +
> >> + return -ENODEV;
> >> +}
> >> +
> >> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
> >> +{
> >> + int val, where, index = 0;
> >> + struct pci_dev *pdev = NULL;
> >> + struct dwc_pcie_info_table *pcie_info;
> >> +
> >> + priv->pcie_table =
> >> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
> >> + if (!priv->pcie_table)
> >> + return -EINVAL;
> >> +
> >> + pcie_info = priv->pcie_table;
> >> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
> >> + index < RP_NUM_MAX) {
> >
> > This having a driver than then walks the pci topology to find root ports and add
> > extra stuff to them is not a clean solution.
> >
> > The probing should be driven from the existing PCI driver topology.
> > There are a bunch of new features we need to add to ports in the near future
> > anyway - this would just be another one.
> > Same problem exists for CXL CPMU perf devices - so far we only support those
> > on end points, partly because we need a clean way to probe them on pci ports.
> >
> > Whatever we come up with there will apply here as well.
>
> I see your point. Any link to reference?
No, though hopefully we'll get to some sort of plan in the branch of this thread
that Bjorn commented in.
>
> >
> >> + if (!pci_dev_is_rootport(pdev))
> >> + continue;
> >> +
> >> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
> >> + pcie_info[index].pdev = pdev;
> > Probably want a sanity check this has a vendor ID appropriate the VSEC you are about
> > to look for.
>
> If I check the vendor ID here or in dwc_pcie_find_ras_des_cap_position, this driver
> will only work for Alibaba as I mentioned before.
Agreed. Unfortunately that's all you can do safely as VSEC IDs are not a global
namespace.
>
> >> +
> >> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
> >> + if (ret)
> >> + pci_err(pcie_info->pdev, "PCIe write fail\n");
> >> +
> >> + return ret;
> >> +}
> >
> > ...
> >
> >> +
> >> +static int dwc_pcie_pmu_read_base_time_counter(struct dwc_pcie_info_table
> >> + *pcie_info, u64 *counter)
> >> +{
> >> + u32 ret, val;
> >> +
> >> + ret = dwc_pcie_pmu_read_dword(pcie_info,
> >> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH,
> >> + &val);
> >> + if (ret) {
> >> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> >> + return ret;
> >> + }
> >> +
> >> + *counter = val;
> >> + *counter <<= 32;
> >
> > This looks like you could get ripping between the upper and lower dwords.
> > What prevents that? Perhaps a comment to say why that's not a problem?
>
> The Time-based Analysis Data which contains the measurement results of
> RX/TX data throughput and time spent in each low-power LTSSM state is 64 bit.
> The data is provided by two 32 bit registers so I rip them together. I will
> add a comment here in the next version.
If I understand correctly the only safe way to read this is in a try / retry loop.
Read the upper part, then the lower part, then reread the upper part.
If the upper part is unchanged you did not get ripping across the two registers.
If it changes, try again.
>
> >
> >> +
> >> + ret = dwc_pcie_pmu_read_dword(pcie_info,
> >> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW,
> >> + &val);
> >> + if (ret) {
> >> + pci_err(pcie_info->pdev, "PCIe read fail\n");
> >> + return ret;
> >> + }
> >> +
> >> + *counter += val;
> >> +
> >> + return ret;
> >> +}
> > ...
> >
> >> +
> >> + ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
> >> + if (ret) {
> >> + pci_err(pcie_info->pdev, "Error %d registering PMU @%x\n", ret,
> >> + pcie_info->bdf);
> >> + return ret;
> >> + }
> >> +
> >> + pcie_info->pmu_is_register = DWC_PCIE_PMU_HAS_REGISTER;
> >
> > As below. I think you can drop this state info.
>
> Please see my confusion bellow.
>
> >
> >> +
> >> + return ret;
> >> +}
> >> +
> >> +static int dwc_pcie_pmu_remove(struct platform_device *pdev)
> >> +{
> >> + struct dwc_pcie_pmu_priv *priv = platform_get_drvdata(pdev);
> >> + int index;
> >> + struct dwc_pcie_pmu *pcie_pmu;
> >> +
> >> + for (index = 0; index < priv->pcie_ctrl_num; index++)
> >> + if (priv->pcie_table[index].pmu_is_register) {
> >> + pcie_pmu = &priv->pcie_table[index].pcie_pmu;
> >> + perf_pmu_unregister(&pcie_pmu->pmu);
> >> + }
> >> + return 0;
> >> +}
> >> +
> >> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
> >> +{
> >> + int ret = 0;
> >
> > Initialized in all paths where it is used. Compiler should be able to tell
> > that so I doubt you need this to be set to 0 here.
>
> Agree, will leave it as uninitialized.
>
> >
> >> + int pcie_index;
> >> + struct dwc_pcie_pmu_priv *priv;
> >> +
> >> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
> >> + if (!priv)
> >> + return -ENOMEM;
> >> + priv->dev = &pdev->dev;
> >> + platform_set_drvdata(pdev, priv);
> >> +
> >> + /* If PMU is not support on current platform, keep slient */
> >> + if (dwc_pcie_pmu_discover(priv))
> >> + return 0;
> >> +
> >> + for (pcie_index = 0; pcie_index < priv->pcie_ctrl_num; pcie_index++) {
> >> + struct pci_dev *rp = priv->pcie_table[pcie_index].pdev;
> >> +
> >> + ret = __dwc_pcie_pmu_probe(priv, &priv->pcie_table[pcie_index]);
> >> + if (ret) {
> >> + dev_err(&rp->dev, "PCIe PMU probe fail\n");
> >> + goto pmu_unregister;
> >> + }
> >> + }
> >> + dev_info(&pdev->dev, "PCIe PMUs registered\n");
> >
> > Noise in the logs. There are lots of ways to know if we reached this point
> > so this adds no value.
>
> Got it, will drop this out in next version.
>
> >
> >> +
> >> + return 0;
> >> +
> >> +pmu_unregister:
> >> + dwc_pcie_pmu_remove(pdev);
> >
> > I'd much rather see the unwind here directly so we can clearly see that it undoes
> > the result of errors in this function. That removes the need to use the
> > is_registered flag in the remove() function simplifying that flow as well.
>
> Do you mean that if perf_pmu_register fails, then jump to the pmu_unregister label directly?
> How can we tell which PMU device failed to register?
pcie_index will be set to the index of the PMU device that failed - so loop backwards
from that index, removing them.
>
.
>
> >
> >> +};
> >> +
> >> +static int __init dwc_pcie_pmu_init(void)
> >> +{
> >> + int ret;
> >> +
> >> + ret = platform_driver_register(&dwc_pcie_pmu_driver);
> >> +
> >> + if (ret)
> >> + return ret;
> >> +
> >> + dwc_pcie_pmu_dev =
> >> + platform_device_register_simple(DEV_NAME, -1, NULL, 0);
> >
> > I'd normally expect to see the device created as a result of firmware
> > description (ACPI DSDT / or Device tree)
> > It is unusual to create a 'real' device directly in the driver
> > init - that's normally reserved for various fake / software devices.
>
> I see your concerns. You mentioned that
>
> > The probing should be driven from the existing PCI driver topology.
>
> Should we add a fake device in firmware or drive from PCI driver topology?
Ah. I was reviewing backwards, so when I wrote this I hadn't realized you walk
the PCI topology. PCI driver topology is the right solution here.
>
> Thank you.
>
> Best Regards,
> Shuai
>
On Fri, Sep 23, 2022 at 10:46:09PM +0800, Shuai Xue wrote:
> 在 2022/9/23 AM1:36, Bjorn Helgaas 写道:
> > On Sat, Sep 17, 2022 at 08:10:35PM +0800, Shuai Xue wrote:
> >> +static struct device_attribute dwc_pcie_pmu_cpumask_attr =
> >> +__ATTR(cpumask, 0444, dwc_pcie_pmu_cpumask_show, NULL);
> >
> > DEVICE_ATTR_RO()?
> DEVICE_ATTR_RO may be a good choice. But does it fit the code style to use
> DEVICE_ATTR_RO in drivers/perf? As far as I know, CCN, CCI, SMMU,
> qcom_l2_pmu use "struct device_attribute" directly.
DEVICE_ATTR_RO is just newer, and I think CCN, CCI, SMMU, etc. would
be using it if they were written today. Of course, the drivers/perf
maintainers may have a different opinion :)
> > I think every caller of dwc_pcie_pmu_read_dword() makes the same check
> > and prints the same message; maybe the message should be moved inside
> > dwc_pcie_pmu_read_dword()?
> >
> > Same with dwc_pcie_pmu_write_dword(); moving the message there would
> > simplify all callers.
>
> I would like to wrap dwc_pcie_pmu_{write}_dword out, use
> pci_{read}_config_dword and drop the sanity check of the return value as
> Jonathan suggests. How do you like it?
Sounds good. Not sure the error checking is worthwhile since
pci_read_config_dword() really doesn't return meaningful errors
anyway.
> >> +static struct dwc_pcie_info_table *pmu_to_pcie_info(struct pmu *pmu)
> >> +{
> >> + struct dwc_pcie_info_table *pcie_info;
> >> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
> >> +
> >> + pcie_info = container_of(pcie_pmu, struct dwc_pcie_info_table, pcie_pmu);
> >> + if (pcie_info == NULL)
> >> + pci_err(pcie_info->pdev, "Can't get pcie info\n");
> >
> > It shouldn't be possible to get here for a pmu with no pcie_info, and
> > callers don't check for a NULL pointer return value before
> > dereferencing it, so I guess all this adds is an error message before
> > a NULL pointer oops? Not sure the code clutter is worth it.
>
> Do you mean to drop the sanity check of container_of?
Yes. I'm suggesting that the NULL pointer oops itself has enough
information to debug this problem, even without the pci_err().
Bjorn
On 2022/9/23 23:43, Shuai Xue wrote:
>
>
> 在 2022/9/23 AM11:30, Yicong Yang 写道:
>> On 2022/9/17 20:10, Shuai Xue wrote:
>>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>>> Core controller IP which provides statistics feature. The PMU is not a PCIe
>>> Root Complex integrated End Point(RCiEP) device but only register counters
>>> provided by each PCIe Root Port.
>>>
>>> To facilitate collection of statistics the controller provides the
>>> following two features for each Root Port:
>>>
>>> - Time Based Analysis (RX/TX data throughput and time spent in each
>>> low-power LTSSM state)
>>> - Event counters (Error and Non-Error for lanes)
>>>
>>> Note, only one counter for each type.
>>>
>>> This driver add PMU devices for each PCIe Root Port. And the PMU device is
>>> named based the BDF of Root Port. For example,
>>>
>>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>>>
>>> the PMU device name for this Root Port is pcie_bdf_100000.
>>>
>>> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
>>>
>>> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>>>
>>> average RX bandwidth can be calculated like this:
>>>
>>> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
>>>
>>> Signed-off-by: Shuai Xue <[email protected]>
>>> ---
>>> drivers/perf/Kconfig | 7 +
>>> drivers/perf/Makefile | 1 +
>>> drivers/perf/dwc_pcie_pmu.c | 976 ++++++++++++++++++++++++++++++++++++
>>> 3 files changed, 984 insertions(+)
>>> create mode 100644 drivers/perf/dwc_pcie_pmu.c
>>>
>>> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
>>> index 1e2d69453771..11ae99de5bbf 100644
>>> --- a/drivers/perf/Kconfig
>>> +++ b/drivers/perf/Kconfig
>>> @@ -192,4 +192,11 @@ config MARVELL_CN10K_DDR_PMU
>>> Enable perf support for Marvell DDR Performance monitoring
>>> event on CN10K platform.
>>>
>>> +config CONFIG_DWC_PCIE_PMU
>>> + tristate "Enable Synopsys DesignWare PCIe PMU Support"
>>> + depends on ARM64 || (COMPILE_TEST && 64BIT)
>>> + help
>>> + Enable perf support for Synopsys DesignWare PCIe PMU Performance
>>> + monitoring event on Yitan 710 platform.
>>> +
>>> endmenu
>>> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
>>> index 57a279c61df5..36f75cb0f320 100644
>>> --- a/drivers/perf/Makefile
>>> +++ b/drivers/perf/Makefile
>>> @@ -20,3 +20,4 @@ obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
>>> obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
>>> obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
>>> obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
>>> +obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
>>> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
>>> new file mode 100644
>>> index 000000000000..81e534be13fa
>>> --- /dev/null
>>> +++ b/drivers/perf/dwc_pcie_pmu.c
>>> @@ -0,0 +1,976 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * Synopsys DesignWare PCIe PMU driver
>>> + *
>>> + * Copyright (C) 2021, 2022 Alibaba Inc.
>>> + */
>>> +
>>> +#include <linux/pci.h>
>>> +#include <linux/bitfield.h>
>>> +#include <linux/bitops.h>
>>> +#include <linux/cpuhotplug.h>
>>> +#include <linux/cpumask.h>
>>> +#include <linux/device.h>
>>> +#include <linux/errno.h>
>>> +#include <linux/kernel.h>
>>> +#include <linux/list.h>
>>> +#include <linux/perf_event.h>
>>> +#include <linux/platform_device.h>
>>> +#include <linux/smp.h>
>>> +#include <linux/sysfs.h>
>>> +#include <linux/types.h>
>>> +
>>> +#define DRV_NAME "dwc_pcie_pmu"
>>> +#define DEV_NAME "dwc_pcie_pmu"
>>> +#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
>>> +#define ATTRI_NAME_MAX_SIZE 32
>>> +
>>> +#define DWC_PCIE_VSEC_ID 0x02
>>> +#define DWC_PCIE_VSEC_REV 0x04
>>> +
>>> +#define DWC_PCIE_LINK_CAPABILITIES_REG 0xC
>>> +#define DWC_PCIE_LANE_SHIFT 4
>>> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
>>> +
>>> +#define DWC_PCIE_EVENT_CNT_CTRL 0x8
>>> +#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
>>> +#define DWC_PCIE__CNT_EVENT_SELECT_MASK GENMASK(27, 16)
>>> +#define DWC_PCIE__CNT_LANE_SELECT_SHIFT 8
>>> +#define DWC_PCIE__CNT_LANE_SELECT_MASK GENMASK(11, 8)
>>> +#define DWC_PCIE__CNT_STATUS_SHIFT 7
>>> +#define DWC_PCIE__CNT_STATUS_MASK BIT(7)
>>> +#define DWC_PCIE__CNT_ENABLE_SHIFT 2
>>> +#define DWC_PCIE__CNT_ENABLE_MASK GENMASK(4, 2)
>>> +#define DWC_PCIE_PER_EVENT_OFF (0x1 << DWC_PCIE__CNT_ENABLE_SHIFT)
>>> +#define DWC_PCIE_PER_EVENT_ON (0x3 << DWC_PCIE__CNT_ENABLE_SHIFT)
>>> +#define DWC_PCIE_EVENT_CLEAR_MASK GENMASK(1, 0)
>>> +
>>> +#define DWC_PCIE_EVENT_CNT_DATA 0xC
>>> +
>>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_CTRL 0x10
>>> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT 24
>>> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK GENMASK(31, 24)
>>> +#define DWC_PCIE__TIME_BASED_DURATION_SHIFT 8
>>> +#define DWC_PCIE__TIME_BASED_DURATION_SELECT GENMASK(15, 8)
>>> +#define DWC_PCIE_DURATION_MANUAL_CTRL 0x0
>>> +#define DWC_PCIE_DURATION_1MS 0x1
>>> +#define DWC_PCIE_DURATION_10MS 0x2
>>> +#define DWC_PCIE_DURATION_100MS 0x3
>>> +#define DWC_PCIE_DURATION_1S 0x4
>>> +#define DWC_PCIE_DURATION_2S 0x5
>>> +#define DWC_PCIE_DURATION_4S 0x6
>>> +#define DWC_PCIE_DURATION_4US 0xff
>>> +#define DWC_PCIE__TIME_BASED_COUNTER_ENABLE 1
>>> +
>>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW 0x14
>>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH 0x18
>>> +
>>> +/* Event attributes */
>>> +#define DWC_PCIE_CONFIG_EVENTID GENMASK(15, 0)
>>> +#define DWC_PCIE_CONFIG_TYPE GENMASK(19, 16)
>>> +#define DWC_PCIE_CONFIG_LANE GENMASK(27, 20)
>>> +
>>> +#define DWC_PCIE_EVENT_ID(event) FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
>>> +#define DWC_PCIE_EVENT_TYPE(event) FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
>>> +#define DWC_PCIE_EVENT_LANE(event) FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
>>> +
>>> +#define DWC_PCIE_PMU_HAS_REGISTER 1
>>> +
>>> +enum dwc_pcie_event_type {
>>> + DWC_PCIE_TYPE_INVALID,
>>> + DWC_PCIE_TIME_BASE_EVENT,
>>> + DWC_PCIE_LANE_EVENT,
>>> +};
>>> +
>>> +struct dwc_event_counters {
>>> + const char name[32];
>>> + u32 event_id;
>>> +};
>>> +
>>> +struct dwc_pcie_pmu {
>>> + struct hlist_node node;
>>> + unsigned int on_cpu;
>>> + struct pmu pmu;
>>> + struct device *dev;
>>> +};
>>> +
>>> +struct dwc_pcie_info_table {
>>> + u32 bdf;
>>> + u32 cap_pos;
>>> + u32 num_lanes;
>>> + struct pci_dev *pdev;
>>> + struct dwc_pcie_pmu pcie_pmu;
>>> + u8 pmu_is_register;
>>> + struct perf_event *event;
>>> +
>>> + struct dwc_pcie_event_attr *lane_event_attrs;
>>> + struct attribute **pcie_pmu_event_attrs;
>>> + struct attribute_group pcie_pmu_event_attrs_group;
>>> + const struct attribute_group *pcie_pmu_attr_groups[4];
>>> +};
>>> +
>>> +struct dwc_pcie_pmu_priv {
>>> + struct device *dev;
>>> + u32 pcie_ctrl_num;
>>> + struct dwc_pcie_info_table *pcie_table;
>>> +};
>>> +
>>> +#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func) \
>>> + (((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
>>
>> Just pass pdev->devfn and use PCI_DEVID() to simplify here.
>
> Sorry, as far as I know, PCI_DEVID() output is not exactly the bdf.
> For example, bdf 300100 is decoded as 3008.
>
See the standard's encoding of BDF (PCIe Spec 4.0 Figure 6-34: Routing IDs (RIDs) and Supported
Granularities). Also in uapi/linux/pci.h and include/linux/pci.h. Bus number is encoded in
BIT[15, 8], slot number in BIT[7, 3] and function number in BIT[2, 0].
You're using your own encoding of "BDF" here because you thought it's more convenient for the user to recognize, but
that's not what is known as BDF. Just using the standard encoding of BDF will be less ambiguous.
>>
>>> +#define to_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
>>> +
>>> +static struct platform_device *dwc_pcie_pmu_dev;
>>> +static char *event_attr_name = "events";
>>> +
>>> +static ssize_t dwc_pcie_pmu_cpumask_show(struct device *dev,
>>> + struct device_attribute *attr,
>>> + char *buf)
>>> +{
>>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
>>> +
>>> + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
>>> +}
>>> +
>>> +static struct device_attribute dwc_pcie_pmu_cpumask_attr =
>>> +__ATTR(cpumask, 0444, dwc_pcie_pmu_cpumask_show, NULL);
>>> +
>>> +static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = {
>>> + &dwc_pcie_pmu_cpumask_attr.attr,
>>> + NULL
>>> +};
>>> +
>>> +static struct attribute_group pcie_pmu_cpumask_attrs_group = {
>>> + .attrs = dwc_pcie_pmu_cpumask_attrs,
>>> +};
>>> +
>>> +struct dwc_pcie_format_attr {
>>> + struct device_attribute attr;
>>> + u64 field;
>>> + int config;
>>> +};
>>> +
>>> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
>>> + struct device_attribute *attr,
>>> + char *buf)
>>> +{
>>> + struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
>>> + int lo = __ffs(fmt->field), hi = __fls(fmt->field);
>>> +
>>> + if (lo == hi)
>>> + return snprintf(buf, PAGE_SIZE, "config:%d\n", lo);
>>> +
>>> + if (!fmt->config)
>>> + return snprintf(buf, PAGE_SIZE, "config:%d-%d\n", lo, hi);
>>> +
>>> + return snprintf(buf, PAGE_SIZE, "config%d:%d-%d\n", fmt->config, lo,
>>> + hi);
>>> +}
>>> +
>>> +#define _dwc_pcie_format_attr(_name, _cfg, _fld) \
>>> + (&((struct dwc_pcie_format_attr[]) {{ \
>>> + .attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL), \
>>> + .config = _cfg, \
>>> + .field = _fld, \
>>> + }})[0].attr.attr)
>>> +
>>> +#define dwc_pcie_format_attr(_name, _fld) _dwc_pcie_format_attr(_name, 0, _fld)
>>> +
>>> +static struct attribute *dwc_pcie_format_attrs[] = {
>>> + dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
>>> + dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
>>> + dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
>>> + NULL,
>>> +};
>>> +
>>> +static struct attribute_group pcie_pmu_format_attrs_group = {
>>> + .name = "format",
>>> + .attrs = dwc_pcie_format_attrs,
>>> +};
>>> +
>>> +struct dwc_pcie_event_attr {
>>> + struct device_attribute attr;
>>> + enum dwc_pcie_event_type type;
>>> + u16 eventid;
>>> + u8 lane;
>>> +};
>>> +
>>> +ssize_t dwc_pcie_event_show(struct device *dev,
>>> + struct device_attribute *attr, char *page)
>>> +{
>>> + struct dwc_pcie_event_attr *eattr;
>>> +
>>> + eattr = container_of(attr, typeof(*eattr), attr);
>>> +
>>> + if (eattr->type == DWC_PCIE_LANE_EVENT)
>>> + return sprintf(page, "eventid=0x%lx, type=0x%lx, lane=0x%lx\n",
>>> + (unsigned long)eattr->eventid,
>>> + (unsigned long)eattr->type,
>>> + (unsigned long)eattr->lane);
>>> + else
>>> + return sprintf(page, "eventid=0x%lx, type=0x%lx",
>>> + (unsigned long)eattr->eventid,
>>> + (unsigned long)eattr->type);
>>> +}
>>
>> I remember sysfs_emit() is preferred.
>
> You are right, I will use sysfs_emit() in next version.
>
>>
>>> +
>>> +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane) \
>>> + (&((struct dwc_pcie_event_attr[]) {{ \
>>> + .attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL), \
>>> + .type = _type, \
>>> + .eventid = _eventid, \
>>> + .lane = _lane, \
>>> + }})[0].attr.attr)
>>> +
>>> +#define DWC_PCIE_PMU_BASE_TIME_ATTR(_name, _eventid) \
>>> + DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0)
>>> +
>>> +static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
>>> + /* Group #0 */
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(one_cycle, 0x00),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S, 0x01),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S, 0x02),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L0, 0x03),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1, 0x04),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_1, 0x05),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_2, 0x06),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY, 0x07),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S, 0x08),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_AUX, 0x09),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(ONE_cycle, 0x10),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S_, 0x11),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S_, 0x12),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L0_, 0x13),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_, 0x17),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY_, 0x17),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S_, 0x18),
>>> + /* Group #1 */
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
>>> + NULL
>>> +};
>>> +
>>> +static inline umode_t pcie_pmu_event_attr_is_visible(struct kobject *kobj,
>>> + struct attribute *attr,
>>> + int unuse)
>>> +{
>>> + return attr->mode;
>>> +}
>>> +
>>> +static inline bool pci_dev_is_rootport(struct pci_dev *pdev)
>>> +{
>>> + return (pci_is_pcie(pdev) &&
>>> + pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT);
>>> +}
>>> +
>>> +static inline unsigned int dwc_pcie_get_bdf(struct pci_dev *dev)
>>> +{
>>> + return (DWC_PCIE_CREATE_BDF(pci_domain_nr(dev->bus), dev->bus->number,
>>> + PCI_SLOT(dev->devfn),
>>> + PCI_FUNC(dev->devfn)));
>>> +}
>>> +
>>> +static int dwc_pcie_find_ras_des_cap_position(struct pci_dev *pdev, int *pos)
>>> +{
>>> + u32 header;
>>> + int vsec = 0;
>>> +
>>> + while ((vsec = pci_find_next_ext_capability(pdev, vsec,
>>> + PCI_EXT_CAP_ID_VNDR))) {
>>> + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &header);
>>> + /* Is the device part of a DesignWare Cores PCIe Controller ? */
>>> + if (PCI_VNDR_HEADER_ID(header) == DWC_PCIE_VSEC_ID &&
>>> + PCI_VNDR_HEADER_REV(header) == DWC_PCIE_VSEC_REV) {
>>> + *pos = vsec;
>>> + return 0;
>>> + }
>>> + }
>>> +
>>> + return -ENODEV;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
>>> +{
>>> + int val, where, index = 0;
>>> + struct pci_dev *pdev = NULL;
>>> + struct dwc_pcie_info_table *pcie_info;
>>> +
>>> + priv->pcie_table =
>>> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
>>> + if (!priv->pcie_table)
>>> + return -EINVAL;
>>> +
>>> + pcie_info = priv->pcie_table;
>>> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
>>
>> I may miss but I don't pci_dev_put() to balance the reference cnt.
>
> As the comments in pci_get_device, the reference count is incremented and
> decremented in the loop automatically. So we do not need to use
> pci_dev_put(), right?
>
> Iterates through the list of known PCI devices. If a PCI device is
> found with a matching @vendor and @device, *the reference count to the
> device is incremented* and a pointer to its device structure is returned.
> Otherwise, %NULL is returned. A new search is initiated by passing %NULL
> as the @from argument. Otherwise if @from is not %NULL, searches continue
> from next device on the global list. *The reference count for @from is
> always decremented if it is not %NULL.*
Thanks for the explanation. The usage is right here. Can we use for_each_pci_dev() instead?
And is there any reason to limit the Root Port number to RP_NUM_MAX? Shouldn't we find all the
Root Ports with PMU counters and make use of them? Limiting it with RP_NUM_MAX is rather
platform-specific and you need to extend it if we have more Root Ports someday.
Another problem I see here is that you walk all the Root Ports with counters and register
a PMU for them. But you don't know whether they're removed later when you use them...
>>
>>> + index < RP_NUM_MAX) {
>>> + if (!pci_dev_is_rootport(pdev))
>>> + continue;
>>> +
>>> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
>>> + pcie_info[index].pdev = pdev;
...you store the *pdev and use them directly in the pmu_ops, but when the device is hot removed
you'll access an invalid address and crash.
A possible solution is to be notified when the corresponding device is removed/added and handle
it correctly. Or you can hold a reference count on the device to prevent it from being removed, but
this may not be a good option.
>>> +
>>> + if (dwc_pcie_find_ras_des_cap_position(pdev, &where))
>>> + continue;
>>> +
>>> + pcie_info[index].cap_pos = where;
>>> +
>>> + pci_read_config_dword(pdev,
>>> + pdev->pcie_cap + DWC_PCIE_LINK_CAPABILITIES_REG,
>>> + &val);
>>> + pcie_info[index].num_lanes =
>>> + (val & DWC_PCIE_LANE_MASK) >> DWC_PCIE_LANE_SHIFT;
>>> + index++;
>>> + }
>>> +
>>> + if (!index)
>>> + return -ENODEV;
>>> +
>>> + priv->pcie_ctrl_num = index;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static inline int dwc_pcie_pmu_read_dword(struct dwc_pcie_info_table *pcie_info,
>>> + u32 reg, u32 *val)
>>> +{
>>> + return pci_read_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
>>> + val);
>>> +}
>>> +
>>> +static inline int dwc_pcie_pmu_write_dword(struct dwc_pcie_info_table
>>> + *pcie_info, u32 reg, u32 val)
>>> +{
>>> + return pci_write_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
>>> + val);
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_set_event_id(struct dwc_pcie_info_table *pcie_info,
>>> + int event_id)
>>> +{
>>> + int ret;
>>> + u32 val;
>>> +
>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>> + return ret;
>>> + }
>>> +
>>> + val &= ~DWC_PCIE__CNT_ENABLE_MASK;
>>> + val &= ~DWC_PCIE__CNT_EVENT_SELECT_MASK;
>>> + val |= event_id << DWC_PCIE__CNT_EVENT_SELECT_SHIFT;
>>> +
>>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>>> + if (ret)
>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_write_event_lane(struct dwc_pcie_info_table *pcie_info,
>>> + int lane, int event_id)
>>> +{
>>> + u32 ret;
>>> + u32 val;
>>> +
>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>> + return ret;
>>> + }
>>> +
>>> + val &= ~DWC_PCIE__CNT_LANE_SELECT_MASK;
>>> + val |= lane << DWC_PCIE__CNT_LANE_SELECT_SHIFT;
>>> +
>>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>>> + if (ret)
>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_event_enable(struct dwc_pcie_info_table *pcie_info,
>>> + u32 enable)
>>> +{
>>> + u32 ret;
>>> + u32 val;
>>> +
>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>
>> Somebody may mentioned. Maybe you don't need to print these messages in PMU ops, just
>> return the correct error code and let perf handle it. Or you should provide more
>> information for these, like failed in which funcion or read/write which value.
>> If it only necessary when debugging, make it pci_dbg().
>
> Yep, you are right, I will drop the print info in next version.
>
>>
>>> + return ret;
>>> + }
>>> +
>>> + val &= ~(DWC_PCIE__CNT_ENABLE_MASK);
>>> +
>>> + if (enable)
>>> + val |= DWC_PCIE_PER_EVENT_ON;
>>> + else
>>> + val |= DWC_PCIE_PER_EVENT_OFF;
>>> +
>>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>>> + if (ret)
>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_base_time_enable(struct dwc_pcie_info_table *pcie_info,
>>> + u32 enable)
>>> +{
>>> + u32 ret;
>>> + u32 val;
>>> +
>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>> + return ret;
>>> + }
>>> +
>>> + if (enable)
>>> + val |= DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
>>> + else
>>> + val &= ~DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
>>> +
>>> + ret = dwc_pcie_pmu_write_dword(pcie_info,
>>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
>>> + if (ret)
>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_read_event_counter(struct dwc_pcie_info_table
>>> + *pcie_info, u64 *counter)
>>> +{
>>> + u32 ret, val;
>>> +
>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_DATA, &val);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>> + return ret;
>>> + }
>>> + *counter = val;
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_read_base_time_counter(struct dwc_pcie_info_table
>>> + *pcie_info, u64 *counter)
>>> +{
>>> + u32 ret, val;
>>> +
>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH,
>>> + &val);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>> + return ret;
>>> + }
>>> +
>>> + *counter = val;
>>> + *counter <<= 32;
>>> +
>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW,
>>> + &val);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>> + return ret;
>>> + }
>>> +
>>> + *counter += val;
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_clear_event_counter(struct dwc_pcie_info_table
>>> + *pcie_info)
>>> +{
>>> + u32 ret;
>>> + u32 val;
>>> +
>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>> + return ret;
>>> + }
>>> +
>>> + val &= ~DWC_PCIE_EVENT_CLEAR_MASK;
>>> + val |= 1;
>>
>> It's better to use a macro for '1' to make it more clear.
>
> Good idea, will fix it in next version.
>
>>
>>> +
>>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>>> + if (ret)
>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_base_time_add_prepare(struct dwc_pcie_info_table
>>> + *pcie_info, u32 event_id)
>>> +{
>>> + u32 ret;
>>> + u32 val;
>>> +
>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>> + return ret;
>>> + }
>>> +
>>> + val &= ~DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK;
>>> + val |= event_id << DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT;
>>> + val &= ~DWC_PCIE__TIME_BASED_DURATION_SELECT;
>>> +
>>> + /*
>>> + * TIME_BASED_ANALYSIS_DATA_REG is a 64 bit register, we can safely
>>> + * use it with any manually controllered duration.
>>> + */
>>> + val &= ~(DWC_PCIE__TIME_BASED_DURATION_SELECT);
>>> + val |= DWC_PCIE_DURATION_MANUAL_CTRL;
>>> +
>>> + ret = dwc_pcie_pmu_write_dword(pcie_info,
>>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
>>> + if (ret)
>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static struct dwc_pcie_info_table *pmu_to_pcie_info(struct pmu *pmu)
>>> +{
>>> + struct dwc_pcie_info_table *pcie_info;
>>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
>>> +
>>> + pcie_info = container_of(pcie_pmu, struct dwc_pcie_info_table, pcie_pmu);
>>> + if (pcie_info == NULL)
>>> + pci_err(pcie_info->pdev, "Can't get pcie info\n");
>>> +
>>> + return pcie_info;
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
>>> +{
>>> + u64 counter;
>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>> + struct hw_perf_event *hwc = &event->hw;
>>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> + u64 delta, prev, now;
>>> +
>>> + do {
>>> + prev = local64_read(&hwc->prev_count);
>>> +
>>> + if (type == DWC_PCIE_LANE_EVENT)
>>> + dwc_pcie_pmu_read_event_counter(pcie_info, &counter);
>>> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>> + dwc_pcie_pmu_read_base_time_counter(pcie_info,
>>> + &counter);
>>> + else
>>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>>> +
>>
>> For the messages in PMU ops, you should print the message on behalf of PMU device
>> rather than PCIe device. Same for the other places.
>
> Good idea, will fix it in next version.
>
>>
>>> + now = counter;
>>> + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
>>> +
>>> + delta = now - prev;
>>> +
>>> + local64_add(delta, &event->count);
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
>>> +{
>>> + struct hw_perf_event *hwc = &event->hw;
>>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
>>> + struct perf_event *sibling;
>>> +
>>> + if (event->attr.type != event->pmu->type)
>>> + return -ENOENT;
>>> +
>>> + if (hwc->sample_period) {
>>> + dev_dbg(pcie_pmu->dev, "Sampling not supported\n");
>>> + return -EOPNOTSUPP;
>>> + }
>>> +
>>> + if (event->cpu < 0) {
>>> + dev_dbg(pcie_pmu->dev, "Per-task mode not supported\n");
>>> + return -EOPNOTSUPP;
>>> + }
>>> +
>>> + event->cpu = pcie_pmu->on_cpu;
>>> +
>>> + if (event->group_leader != event &&
>>> + !is_software_event(event->group_leader)) {
>>> + dev_dbg(pcie_pmu->dev, "Drive way only allow one event!\n");
>>> + return -EINVAL;
>>> + }
>>> +
>>> + for_each_sibling_event(sibling, event->group_leader) {
>>> + if (sibling != event && !is_software_event(sibling)) {
>>> + dev_dbg(pcie_pmu->dev, "Drive way event not allowed!\n");
>>> + return -EINVAL;
>>> + }
>>> + }
>>> +
>>> + hwc->idx = -1;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
>>> +{
>>> + u64 new = 0;
>>> +
>>
>> redundant 'new'.
>>
>>> + local64_set(&hwc->prev_count, new);
>>> +}
>
> Yep, will fix it in next version.
>
>>> +
>>> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
>>> +{
>>> + struct hw_perf_event *hwc = &event->hw;
>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> +
>>> + hwc->state = 0;
>>> + dwc_pcie_pmu_set_period(hwc);
>>> +
>>> + if (type == DWC_PCIE_LANE_EVENT)
>>> + dwc_pcie_pmu_event_enable(pcie_info, 1);
>>> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>> + dwc_pcie_pmu_base_time_enable(pcie_info, 1);
>>> + else
>>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
>>> +{
>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> +
>>> + if (event->hw.state & PERF_HES_STOPPED)
>>> + return;
>>> +
>>> + if (type == DWC_PCIE_LANE_EVENT)
>>> + dwc_pcie_pmu_event_enable(pcie_info, 0);
>>> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>> + dwc_pcie_pmu_base_time_enable(pcie_info, 0);
>>> + else
>>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>>
>> If the message is necessary, it'll be more helpful to mention which param
>> is invalid.
>
> I see, will give more hint in log.
>
>>
>>> +
>>> + dwc_pcie_pmu_event_update(event);
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
>>> +{
>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>> + struct hw_perf_event *hwc = &event->hw;
>>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> + int event_id = DWC_PCIE_EVENT_ID(event);
>>> + int lane = DWC_PCIE_EVENT_LANE(event);
>>> +
>>> + if (pcie_info->event)
>>> + return -ENOSPC;
>>> +
>>> + pcie_info->event = event;
>>> +
>>> + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
>>> +
>>> + if (type == DWC_PCIE_LANE_EVENT) {
>>> + dwc_pcie_pmu_event_enable(pcie_info, 0);
>>> + dwc_pcie_pmu_write_event_lane(pcie_info, lane, event_id);
>>> + dwc_pcie_pmu_set_event_id(pcie_info, event_id);
>>> + dwc_pcie_pmu_clear_event_counter(pcie_info);
>>> + } else if (type == DWC_PCIE_TIME_BASE_EVENT) {
>>> + dwc_pcie_pmu_base_time_enable(pcie_info, 0);
>>> + dwc_pcie_pmu_base_time_add_prepare(pcie_info, event_id);
>>> + } else {
>>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>>> + return -EINVAL;
>>> + }
>>> +
>>> + if (flags & PERF_EF_START)
>>> + dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD);
>>> +
>>> + perf_event_update_userpage(event);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
>>> +{
>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>> +
>>> + dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
>>> + perf_event_update_userpage(event);
>>> + pcie_info->event = NULL;
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_event_read(struct perf_event *event)
>>> +{
>>> + dwc_pcie_pmu_event_update(event);
>>> +}
>>> +
>>> +static struct dwc_event_counters event_array[] = {
>>> + {"tx_ack_dllp", 0x600},
>>> + {"tx_update_fc_dllp", 0x601},
>>> + {"rx_ack_dllp", 0x602},
>>> + {"rx_update_fc_dllp", 0x603},
>>> + {"rx_nulified_tlp", 0x604},
>>> + {"tx_nulified_tlp", 0x605},
>>> + {"rx_duplicate_tlp", 0x606},
>>> + {"tx_memory_write", 0x700},
>>> + {"tx_memory_read", 0x701},
>>> + {"tx_configuration_write", 0x702},
>>> + {"tx_configuration_read", 0x703},
>>> + {"tx_io_write", 0x704},
>>> + {"tx_io_read", 0x705},
>>> + {"tx_completion_without_data", 0x706},
>>> + {"tx_completion_with_data", 0x707},
>>> + {"tx_message_tlp", 0x708},
>>> + {"tx_atomic", 0x709},
>>> + {"tx_tlp_with_prefix", 0x70A},
>>> + {"rx_memory_write", 0x70B},
>>> + {"rx_memory_read", 0x70C},
>>> + {"rx_io_write", 0x70F},
>>> + {"rx_io_read", 0x710},
>>> + {"rx_completion_without_data", 0x711},
>>> + {"rx_completion_with_data", 0x712},
>>> + {"rx_message_tlp", 0x713},
>>> + {"rx_atomic", 0x714},
>>> + {"rx_tlp_with_prefix", 0x715},
>>> + {"tx_ccix_tlp", 0x716},
>>> + {"rx_ccix_tlp", 0x717},
>>> +};
>>> +
>>> +static int dwc_pcie_pmu_attr_init(struct dwc_pcie_pmu_priv *priv,
>>> + struct dwc_pcie_info_table *pcie_info)
>>> +{
>>> + int i, j;
>>> + char lane[8];
>>> + const char tmp[64];
>>> + int events_per_lane;
>>> + int num_lane_events;
>>> + int time_base_count;
>>> + int num_attrs, attr_idx;
>>> + struct dwc_pcie_event_attr *lane_attrs;
>>> + struct attribute **pmu_attrs;
>>> +
>>> + memset((void *)tmp, 0, sizeof(tmp));
>>> + memset((void *)lane, 0, sizeof(lane));
>>> + time_base_count = ARRAY_SIZE(dwc_pcie_pmu_time_event_attrs);
>>> + events_per_lane = ARRAY_SIZE(event_array);
>>> + num_lane_events = pcie_info->num_lanes * events_per_lane;
>>> + num_attrs = time_base_count + num_lane_events;
>>> +
>>> + pcie_info->lane_event_attrs =
>>> + devm_kcalloc(priv->dev, num_lane_events,
>>> + sizeof(struct dwc_pcie_event_attr),
>>> + GFP_KERNEL);
>>> + if (!pcie_info->lane_event_attrs)
>>> + return -ENOMEM;
>>> + lane_attrs = pcie_info->lane_event_attrs;
>>> + pcie_info->pcie_pmu_event_attrs =
>>> + devm_kcalloc(priv->dev, num_attrs, sizeof(struct attribute *),
>>> + GFP_KERNEL);
>>> + if (!pcie_info->pcie_pmu_event_attrs)
>>> + return -ENOMEM;
>>> + pmu_attrs = pcie_info->pcie_pmu_event_attrs;
>>> +
>>> + for (i = 0; i < num_lane_events; i++) {
>>> + lane_attrs[i].attr.attr.name =
>>> + devm_kzalloc(priv->dev, sizeof(char)
>>> + * ATTRI_NAME_MAX_SIZE, GFP_KERNEL);
>>> + if (!lane_attrs[i].attr.attr.name)
>>> + return -ENOMEM;
>>> + }
>>> +
>>> + attr_idx = 0;
>>> + for (i = 0; i < pcie_info->num_lanes; i++) {
>>> + sprintf(lane, "_lane%d", i);
>>> +
>>> + for (j = 0; j < events_per_lane; j++) {
>>> + int pos = i * events_per_lane + j;
>>> +
>>> + strcat((char *)tmp, event_array[j].name);
>>> + strcat((char *)tmp, lane);
>>> + memcpy((void *)lane_attrs[pos].attr.attr.name,
>>> + (void *)tmp,
>>> + sizeof(tmp));
>>> +
>>> + lane_attrs[pos].attr.attr.mode =
>>> + VERIFY_OCTAL_PERMISSIONS(0444);
>>> + lane_attrs[pos].attr.show = dwc_pcie_event_show;
>>> + lane_attrs[pos].attr.store = NULL;
>>> + lane_attrs[pos].type = DWC_PCIE_LANE_EVENT;
>>> + lane_attrs[pos].eventid = event_array[j].event_id;
>>> + lane_attrs[pos].lane = i;
>>> + pmu_attrs[attr_idx++] = &lane_attrs[pos].attr.attr;
>>> +
>>> + memset((void *)tmp, 0, sizeof(tmp));
>>> + }
>>> + }
>>> +
>>> + for (i = 0; i < ARRAY_SIZE(dwc_pcie_pmu_time_event_attrs); i++)
>>> + pmu_attrs[attr_idx++] = dwc_pcie_pmu_time_event_attrs[i];
>>> +
>>> + pcie_info->pcie_pmu_event_attrs[attr_idx++] = NULL;
>>> +
>>> + pcie_info->pcie_pmu_event_attrs_group.name = event_attr_name;
>>> + pcie_info->pcie_pmu_event_attrs_group.is_visible =
>>> + pcie_pmu_event_attr_is_visible;
>>> + pcie_info->pcie_pmu_event_attrs_group.attrs =
>>> + pcie_info->pcie_pmu_event_attrs;
>>> +
>>> + pcie_info->pcie_pmu_attr_groups[0] =
>>> + &pcie_info->pcie_pmu_event_attrs_group;
>>> + pcie_info->pcie_pmu_attr_groups[1] = &pcie_pmu_format_attrs_group;
>>> + pcie_info->pcie_pmu_attr_groups[2] = &pcie_pmu_cpumask_attrs_group;
>>> + pcie_info->pcie_pmu_attr_groups[3] = NULL;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int __dwc_pcie_pmu_probe(struct dwc_pcie_pmu_priv *priv,
>>> + struct dwc_pcie_info_table *pcie_info)
>>> +{
>>> + int ret;
>>> + char *name;
>>> + struct dwc_pcie_pmu *pcie_pmu;
>>> + struct device *dev;
>>> +
>>> + if (!pcie_info || !pcie_info->pdev) {
>>> + pci_err(pcie_info->pdev, "Input parameter is invalid\n");
>>> + return -EINVAL;
>>> + }
>>> +
>>> + pcie_pmu = &pcie_info->pcie_pmu;
>>> + dev = &pcie_info->pdev->dev;
>>> +
>>> + ret = dwc_pcie_pmu_attr_init(priv, pcie_info);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "PMU attr init fail ret=%d\n", ret);
>>> + return ret;
>>> + }
>>> +
>>> + pcie_pmu->dev = dev;
>>> + pcie_pmu->pmu = (struct pmu) {
>>> + .module = THIS_MODULE,
>>> + .task_ctx_nr = perf_invalid_context,
>>> + .pmu_enable = NULL,
>>> + .pmu_disable = NULL,
>>> + .event_init = dwc_pcie_pmu_event_init,
>>> + .add = dwc_pcie_pmu_event_add,
>>> + .del = dwc_pcie_pmu_event_del,
>>> + .start = dwc_pcie_pmu_event_start,
>>> + .stop = dwc_pcie_pmu_event_stop,
>>> + .read = dwc_pcie_pmu_event_read,
>>> + .attr_groups = pcie_info->pcie_pmu_attr_groups,
>>> + .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
>>> + };
>>> +
>>> + name = devm_kasprintf(priv->dev, GFP_KERNEL, "pcie_bdf_%x",
>>> + pcie_info->bdf);
>>> + if (!name)
>>> + return -ENOMEM;
>>> +
>>> + /* Pick one CPU to be the preferred one to use */
>>> + pcie_pmu->on_cpu = raw_smp_processor_id();
>>> +
>>
>> So we'll probabley bind all the pmus on one single CPU, is it intended? Since it's
>> an uncore PMU, we can make it run on any cpu (or for locality CPU on the controller's
>> NUMA node).
>>
>> And I didn't see you register a hotplug handler, so what if the ->on_cpu is hot removed?
>
> This PMU does not support interrupt at all, so we do not need to bind it to CPU.
> Should we remove this line?
>
No, it's still needed to provide it and export it through the cpumask sysfs attribute; otherwise
perf cannot recognize it as an uncore PMU device. See [*]. It can be a single CPU mask and
the event will start only on the given CPU. Or make it a range of CPUs but only let one CPU really
start the event. You may still need to handle the case where the CPU that started the event goes
offline, even without interrupts.
BTW, since it doesn't support interrupts, can the counters overflow, and how is the overflow case handled?
[*] https://github.com/torvalds/linux/blob/v6.0-rc1/tools/perf/util/pmu.c#L615
>>
>>> + ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
>>> + if (ret) {
>>> + pci_err(pcie_info->pdev, "Error %d registering PMU @%x\n", ret,
>>> + pcie_info->bdf);
>>
>> will be more helpful to print the bdf as format <bus>:<dev>:<func>.
>
> Good idea, will fix in next version.
>
>>
>>> + return ret;
>>> + }
>>> +
>>> + pcie_info->pmu_is_register = DWC_PCIE_PMU_HAS_REGISTER;
>>
>> Make @pmu_is_register a boolean will be more clear.
>
> @pmu_is_register is also discussed in Jonathan' reply. Jonathan suggests to
> remove it, so let discuss if to keep this field first :) If we decide to keep
> it, I will let it be boolean.
>
>>
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_remove(struct platform_device *pdev)
>>> +{
>>> + struct dwc_pcie_pmu_priv *priv = platform_get_drvdata(pdev);
>>> + int index;
>>> + struct dwc_pcie_pmu *pcie_pmu;
>>
>> Make the long line first when declaring.
>
> Agree, will change the code style.
>
>>
>>> +
>>> + for (index = 0; index < priv->pcie_ctrl_num; index++)
>>> + if (priv->pcie_table[index].pmu_is_register) {
>>> + pcie_pmu = &priv->pcie_table[index].pcie_pmu;
>>> + perf_pmu_unregister(&pcie_pmu->pmu);
>>> + }
>>> + return 0;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
>>> +{
>>> + int ret = 0;
>>> + int pcie_index;
>>> + struct dwc_pcie_pmu_priv *priv;
>>> +
>>> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
>>> + if (!priv)
>>> + return -ENOMEM;
>>> + priv->dev = &pdev->dev;
>>> + platform_set_drvdata(pdev, priv);
>>> +
>>> + /* If PMU is not support on current platform, keep slient */
>>> + if (dwc_pcie_pmu_discover(priv))
>>> + return 0;
>>> +
>>> + for (pcie_index = 0; pcie_index < priv->pcie_ctrl_num; pcie_index++) {
>>> + struct pci_dev *rp = priv->pcie_table[pcie_index].pdev;
>>> +
>>> + ret = __dwc_pcie_pmu_probe(priv, &priv->pcie_table[pcie_index]);
>>> + if (ret) {
>>> + dev_err(&rp->dev, "PCIe PMU probe fail\n");
>>> + goto pmu_unregister;
>>> + }
>>> + }
>>> + dev_info(&pdev->dev, "PCIe PMUs registered\n");
>>> +
>>
>> As Jonathan mentioned this message maybe unnecessary, but I may find it useful if you
>> print how many PMU's registered.
>
> Fine, I can add a count here.
>
>>
>> On one PMU registration failed, you just remove all the PMUs registered. I wonder if
>> it's better to make already registered PMU stay instead of removing them all.
>
> If perf_pmu_register fails, is it necessary to call perf_pmu_unregister? I did not find
> similar implementation to unregister pmu when perf_pmu_register fails.
>
No need for this, but that's not what I mean here. If there should be M PMUs but only N of them
are probed successfully, maybe there's no need to fail the probe and remove them all. We can still
use them and just notify the user how many PMUs are registered and that the rest failed.
Anyway, it's up to you.
Thanks,
Yicong
在 2022/9/24 PM4:00, Yicong Yang 写道:
> On 2022/9/23 23:43, Shuai Xue wrote:
>>
>>
>> 在 2022/9/23 AM11:30, Yicong Yang 写道:
>>> On 2022/9/17 20:10, Shuai Xue wrote:
>>>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>>>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>>>> Core controller IP which provides statistics feature. The PMU is not a PCIe
>>>> Root Complex integrated End Point(RCiEP) device but only register counters
>>>> provided by each PCIe Root Port.
>>>>
>>>> To facilitate collection of statistics the controller provides the
>>>> following two features for each Root Port:
>>>>
>>>> - Time Based Analysis (RX/TX data throughput and time spent in each
>>>> low-power LTSSM state)
>>>> - Event counters (Error and Non-Error for lanes)
>>>>
>>>> Note, only one counter for each type.
>>>>
>>>> This driver add PMU devices for each PCIe Root Port. And the PMU device is
>>>> named based the BDF of Root Port. For example,
>>>>
>>>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>>>>
>>>> the PMU device name for this Root Port is pcie_bdf_100000.
>>>>
>>>> Example usage of counting PCIe RX TLP data payload (Units of 16 bytes)::
>>>>
>>>> $# perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>>>>
>>>> average RX bandwidth can be calculated like this:
>>>>
>>>> PCIe TX Bandwidth = PCIE_TX_DATA * 16B / Measure_Time_Window
>>>>
>>>> Signed-off-by: Shuai Xue <[email protected]>
>>>> ---
>>>> drivers/perf/Kconfig | 7 +
>>>> drivers/perf/Makefile | 1 +
>>>> drivers/perf/dwc_pcie_pmu.c | 976 ++++++++++++++++++++++++++++++++++++
>>>> 3 files changed, 984 insertions(+)
>>>> create mode 100644 drivers/perf/dwc_pcie_pmu.c
>>>>
>>>> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
>>>> index 1e2d69453771..11ae99de5bbf 100644
>>>> --- a/drivers/perf/Kconfig
>>>> +++ b/drivers/perf/Kconfig
>>>> @@ -192,4 +192,11 @@ config MARVELL_CN10K_DDR_PMU
>>>> Enable perf support for Marvell DDR Performance monitoring
>>>> event on CN10K platform.
>>>>
>>>> +config CONFIG_DWC_PCIE_PMU
>>>> + tristate "Enable Synopsys DesignWare PCIe PMU Support"
>>>> + depends on ARM64 || (COMPILE_TEST && 64BIT)
>>>> + help
>>>> + Enable perf support for Synopsys DesignWare PCIe PMU Performance
>>>> + monitoring event on Yitan 710 platform.
>>>> +
>>>> endmenu
>>>> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
>>>> index 57a279c61df5..36f75cb0f320 100644
>>>> --- a/drivers/perf/Makefile
>>>> +++ b/drivers/perf/Makefile
>>>> @@ -20,3 +20,4 @@ obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
>>>> obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
>>>> obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
>>>> obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
>>>> +obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
>>>> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
>>>> new file mode 100644
>>>> index 000000000000..81e534be13fa
>>>> --- /dev/null
>>>> +++ b/drivers/perf/dwc_pcie_pmu.c
>>>> @@ -0,0 +1,976 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * Synopsys DesignWare PCIe PMU driver
>>>> + *
>>>> + * Copyright (C) 2021, 2022 Alibaba Inc.
>>>> + */
>>>> +
>>>> +#include <linux/pci.h>
>>>> +#include <linux/bitfield.h>
>>>> +#include <linux/bitops.h>
>>>> +#include <linux/cpuhotplug.h>
>>>> +#include <linux/cpumask.h>
>>>> +#include <linux/device.h>
>>>> +#include <linux/errno.h>
>>>> +#include <linux/kernel.h>
>>>> +#include <linux/list.h>
>>>> +#include <linux/perf_event.h>
>>>> +#include <linux/platform_device.h>
>>>> +#include <linux/smp.h>
>>>> +#include <linux/sysfs.h>
>>>> +#include <linux/types.h>
>>>> +
>>>> +#define DRV_NAME "dwc_pcie_pmu"
>>>> +#define DEV_NAME "dwc_pcie_pmu"
>>>> +#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
>>>> +#define ATTRI_NAME_MAX_SIZE 32
>>>> +
>>>> +#define DWC_PCIE_VSEC_ID 0x02
>>>> +#define DWC_PCIE_VSEC_REV 0x04
>>>> +
>>>> +#define DWC_PCIE_LINK_CAPABILITIES_REG 0xC
>>>> +#define DWC_PCIE_LANE_SHIFT 4
>>>> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
>>>> +
>>>> +#define DWC_PCIE_EVENT_CNT_CTRL 0x8
>>>> +#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
>>>> +#define DWC_PCIE__CNT_EVENT_SELECT_MASK GENMASK(27, 16)
>>>> +#define DWC_PCIE__CNT_LANE_SELECT_SHIFT 8
>>>> +#define DWC_PCIE__CNT_LANE_SELECT_MASK GENMASK(11, 8)
>>>> +#define DWC_PCIE__CNT_STATUS_SHIFT 7
>>>> +#define DWC_PCIE__CNT_STATUS_MASK BIT(7)
>>>> +#define DWC_PCIE__CNT_ENABLE_SHIFT 2
>>>> +#define DWC_PCIE__CNT_ENABLE_MASK GENMASK(4, 2)
>>>> +#define DWC_PCIE_PER_EVENT_OFF (0x1 << DWC_PCIE__CNT_ENABLE_SHIFT)
>>>> +#define DWC_PCIE_PER_EVENT_ON (0x3 << DWC_PCIE__CNT_ENABLE_SHIFT)
>>>> +#define DWC_PCIE_EVENT_CLEAR_MASK GENMASK(1, 0)
>>>> +
>>>> +#define DWC_PCIE_EVENT_CNT_DATA 0xC
>>>> +
>>>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_CTRL 0x10
>>>> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT 24
>>>> +#define DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK GENMASK(31, 24)
>>>> +#define DWC_PCIE__TIME_BASED_DURATION_SHIFT 8
>>>> +#define DWC_PCIE__TIME_BASED_DURATION_SELECT GENMASK(15, 8)
>>>> +#define DWC_PCIE_DURATION_MANUAL_CTRL 0x0
>>>> +#define DWC_PCIE_DURATION_1MS 0x1
>>>> +#define DWC_PCIE_DURATION_10MS 0x2
>>>> +#define DWC_PCIE_DURATION_100MS 0x3
>>>> +#define DWC_PCIE_DURATION_1S 0x4
>>>> +#define DWC_PCIE_DURATION_2S 0x5
>>>> +#define DWC_PCIE_DURATION_4S 0x6
>>>> +#define DWC_PCIE_DURATION_4US 0xff
>>>> +#define DWC_PCIE__TIME_BASED_COUNTER_ENABLE 1
>>>> +
>>>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW 0x14
>>>> +#define DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH 0x18
>>>> +
>>>> +/* Event attributes */
>>>> +#define DWC_PCIE_CONFIG_EVENTID GENMASK(15, 0)
>>>> +#define DWC_PCIE_CONFIG_TYPE GENMASK(19, 16)
>>>> +#define DWC_PCIE_CONFIG_LANE GENMASK(27, 20)
>>>> +
>>>> +#define DWC_PCIE_EVENT_ID(event) FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
>>>> +#define DWC_PCIE_EVENT_TYPE(event) FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
>>>> +#define DWC_PCIE_EVENT_LANE(event) FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
>>>> +
>>>> +#define DWC_PCIE_PMU_HAS_REGISTER 1
>>>> +
>>>> +enum dwc_pcie_event_type {
>>>> + DWC_PCIE_TYPE_INVALID,
>>>> + DWC_PCIE_TIME_BASE_EVENT,
>>>> + DWC_PCIE_LANE_EVENT,
>>>> +};
>>>> +
>>>> +struct dwc_event_counters {
>>>> + const char name[32];
>>>> + u32 event_id;
>>>> +};
>>>> +
>>>> +struct dwc_pcie_pmu {
>>>> + struct hlist_node node;
>>>> + unsigned int on_cpu;
>>>> + struct pmu pmu;
>>>> + struct device *dev;
>>>> +};
>>>> +
>>>> +struct dwc_pcie_info_table {
>>>> + u32 bdf;
>>>> + u32 cap_pos;
>>>> + u32 num_lanes;
>>>> + struct pci_dev *pdev;
>>>> + struct dwc_pcie_pmu pcie_pmu;
>>>> + u8 pmu_is_register;
>>>> + struct perf_event *event;
>>>> +
>>>> + struct dwc_pcie_event_attr *lane_event_attrs;
>>>> + struct attribute **pcie_pmu_event_attrs;
>>>> + struct attribute_group pcie_pmu_event_attrs_group;
>>>> + const struct attribute_group *pcie_pmu_attr_groups[4];
>>>> +};
>>>> +
>>>> +struct dwc_pcie_pmu_priv {
>>>> + struct device *dev;
>>>> + u32 pcie_ctrl_num;
>>>> + struct dwc_pcie_info_table *pcie_table;
>>>> +};
>>>> +
>>>> +#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func) \
>>>> + (((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
>>>
>>> Just pass pdev->devfn and use PCI_DEVID() to simplify here.
>>
>> Sorry, as far as I know, PCI_DEVID() output is not exactly the bdf.
>> For example, bdf 300100 is decoded as 3008.
>>
>
> See the standard's encoding of BDF (PCIe Spec 4.0 Figure 6-34: Routing IDs (RIDs) and Supported
> Granularities). Also in uapi/linux/pci.h and include/linux/pci.h. Bus number is encoding in
> BIT[15, 8], slot number in BIT[7, 3] and function number for BIT[2, 0].
Yep, that's right. PCI_SLOT and PCI_FUNC defined in uapi/linux/pci.h follow the PCIe Spec
4.0 standard.
>
> You're use your coding of "BDF" here and thought it's more convenient to the user to recognize, but
> that's not what is known of BDF. Just use the standard coding of BDF will have less ambiguous.
My encoding of "BDF" is based on the output format of BDF: most of the kernel
code splits devfn into SLOT and FUNC when printing the PCIe address. For example:
// pci_addr_show in drivers/misc/habanalabs/common/sysfs.c
return sprintf(buf, "%04x:%02x:%02x.%x\n",
pci_domain_nr(hdev->pdev->bus),
hdev->pdev->bus->number,
PCI_SLOT(hdev->pdev->devfn),
PCI_FUNC(hdev->pdev->devfn));
And the PCI address is then consistent with the lspci output, which is what I intend.
Should we rename DWC_PCIE_CREATE_BDF to DWC_PCIE_CREATE_PCI_ADDRESS?
>
>>>
>>>> +#define to_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
>>>> +
>>>> +static struct platform_device *dwc_pcie_pmu_dev;
>>>> +static char *event_attr_name = "events";
>>>> +
>>>> +static ssize_t dwc_pcie_pmu_cpumask_show(struct device *dev,
>>>> + struct device_attribute *attr,
>>>> + char *buf)
>>>> +{
>>>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
>>>> +
>>>> + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
>>>> +}
>>>> +
>>>> +static struct device_attribute dwc_pcie_pmu_cpumask_attr =
>>>> +__ATTR(cpumask, 0444, dwc_pcie_pmu_cpumask_show, NULL);
>>>> +
>>>> +static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = {
>>>> + &dwc_pcie_pmu_cpumask_attr.attr,
>>>> + NULL
>>>> +};
>>>> +
>>>> +static struct attribute_group pcie_pmu_cpumask_attrs_group = {
>>>> + .attrs = dwc_pcie_pmu_cpumask_attrs,
>>>> +};
>>>> +
>>>> +struct dwc_pcie_format_attr {
>>>> + struct device_attribute attr;
>>>> + u64 field;
>>>> + int config;
>>>> +};
>>>> +
>>>> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
>>>> + struct device_attribute *attr,
>>>> + char *buf)
>>>> +{
>>>> + struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
>>>> + int lo = __ffs(fmt->field), hi = __fls(fmt->field);
>>>> +
>>>> + if (lo == hi)
>>>> + return snprintf(buf, PAGE_SIZE, "config:%d\n", lo);
>>>> +
>>>> + if (!fmt->config)
>>>> + return snprintf(buf, PAGE_SIZE, "config:%d-%d\n", lo, hi);
>>>> +
>>>> + return snprintf(buf, PAGE_SIZE, "config%d:%d-%d\n", fmt->config, lo,
>>>> + hi);
>>>> +}
>>>> +
>>>> +#define _dwc_pcie_format_attr(_name, _cfg, _fld) \
>>>> + (&((struct dwc_pcie_format_attr[]) {{ \
>>>> + .attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL), \
>>>> + .config = _cfg, \
>>>> + .field = _fld, \
>>>> + }})[0].attr.attr)
>>>> +
>>>> +#define dwc_pcie_format_attr(_name, _fld) _dwc_pcie_format_attr(_name, 0, _fld)
>>>> +
>>>> +static struct attribute *dwc_pcie_format_attrs[] = {
>>>> + dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
>>>> + dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
>>>> + dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
>>>> + NULL,
>>>> +};
>>>> +
>>>> +static struct attribute_group pcie_pmu_format_attrs_group = {
>>>> + .name = "format",
>>>> + .attrs = dwc_pcie_format_attrs,
>>>> +};
>>>> +
>>>> +struct dwc_pcie_event_attr {
>>>> + struct device_attribute attr;
>>>> + enum dwc_pcie_event_type type;
>>>> + u16 eventid;
>>>> + u8 lane;
>>>> +};
>>>> +
>>>> +ssize_t dwc_pcie_event_show(struct device *dev,
>>>> + struct device_attribute *attr, char *page)
>>>> +{
>>>> + struct dwc_pcie_event_attr *eattr;
>>>> +
>>>> + eattr = container_of(attr, typeof(*eattr), attr);
>>>> +
>>>> + if (eattr->type == DWC_PCIE_LANE_EVENT)
>>>> + return sprintf(page, "eventid=0x%lx, type=0x%lx, lane=0x%lx\n",
>>>> + (unsigned long)eattr->eventid,
>>>> + (unsigned long)eattr->type,
>>>> + (unsigned long)eattr->lane);
>>>> + else
>>>> + return sprintf(page, "eventid=0x%lx, type=0x%lx",
>>>> + (unsigned long)eattr->eventid,
>>>> + (unsigned long)eattr->type);
>>>> +}
>>>
>>> I remember sysfs_emit() is preferred.
>>
>> You are right, I will use sysfs_emit() in next version.
>>
>>>
>>>> +
>>>> +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane) \
>>>> + (&((struct dwc_pcie_event_attr[]) {{ \
>>>> + .attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL), \
>>>> + .type = _type, \
>>>> + .eventid = _eventid, \
>>>> + .lane = _lane, \
>>>> + }})[0].attr.attr)
>>>> +
>>>> +#define DWC_PCIE_PMU_BASE_TIME_ATTR(_name, _eventid) \
>>>> + DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0)
>>>> +
>>>> +static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
>>>> + /* Group #0 */
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(one_cycle, 0x00),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S, 0x01),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S, 0x02),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L0, 0x03),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1, 0x04),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_1, 0x05),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_2, 0x06),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY, 0x07),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S, 0x08),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_AUX, 0x09),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(ONE_cycle, 0x10),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_L0S_, 0x11),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(RX_L0S_, 0x12),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L0_, 0x13),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(L1_, 0x17),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(CFG_RCVRY_, 0x17),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(TX_RX_L0S_, 0x18),
>>>> + /* Group #1 */
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
>>>> + DWC_PCIE_PMU_BASE_TIME_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
>>>> + NULL
>>>> +};
>>>> +
>>>> +static inline umode_t pcie_pmu_event_attr_is_visible(struct kobject *kobj,
>>>> + struct attribute *attr,
>>>> + int unuse)
>>>> +{
>>>> + return attr->mode;
>>>> +}
>>>> +
>>>> +static inline bool pci_dev_is_rootport(struct pci_dev *pdev)
>>>> +{
>>>> + return (pci_is_pcie(pdev) &&
>>>> + pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT);
>>>> +}
>>>> +
>>>> +static inline unsigned int dwc_pcie_get_bdf(struct pci_dev *dev)
>>>> +{
>>>> + return (DWC_PCIE_CREATE_BDF(pci_domain_nr(dev->bus), dev->bus->number,
>>>> + PCI_SLOT(dev->devfn),
>>>> + PCI_FUNC(dev->devfn)));
>>>> +}
>>>> +
>>>> +static int dwc_pcie_find_ras_des_cap_position(struct pci_dev *pdev, int *pos)
>>>> +{
>>>> + u32 header;
>>>> + int vsec = 0;
>>>> +
>>>> + while ((vsec = pci_find_next_ext_capability(pdev, vsec,
>>>> + PCI_EXT_CAP_ID_VNDR))) {
>>>> + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &header);
>>>> + /* Is the device part of a DesignWare Cores PCIe Controller ? */
>>>> + if (PCI_VNDR_HEADER_ID(header) == DWC_PCIE_VSEC_ID &&
>>>> + PCI_VNDR_HEADER_REV(header) == DWC_PCIE_VSEC_REV) {
>>>> + *pos = vsec;
>>>> + return 0;
>>>> + }
>>>> + }
>>>> +
>>>> + return -ENODEV;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
>>>> +{
>>>> + int val, where, index = 0;
>>>> + struct pci_dev *pdev = NULL;
>>>> + struct dwc_pcie_info_table *pcie_info;
>>>> +
>>>> + priv->pcie_table =
>>>> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
>>>> + if (!priv->pcie_table)
>>>> + return -EINVAL;
>>>> +
>>>> + pcie_info = priv->pcie_table;
>>>> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
>>>
>>> I may miss but I don't pci_dev_put() to balance the reference cnt.
>>
>> As the comments in pci_get_device, the reference count is incremented and
>> decremented in the loop automatically. So we do not need to use
>> pci_dev_put(), right?
>>
>> Iterates through the list of known PCI devices. If a PCI device is
>> found with a matching @vendor and @device, *the reference count to the
>> device is incremented* and a pointer to its device structure is returned.
>> Otherwise, %NULL is returned. A new search is initiated by passing %NULL
>> as the @from argument. Otherwise if @from is not %NULL, searches continue
>> from next device on the global list. *The reference count for @from is
>> always decremented if it is not %NULL.*
>
> Thanks for the explanation. The usage is right here. Can we use for_each_pci_dev() instead?
Yes, for_each_pci_dev is easier. But as we discussed with Jonathan:
> This having a driver than then walks the pci topology to find root ports and add
> extra stuff to them is not a clean solution.
Do we have any plan to extend PCI core interface?
> And any reason to limit the Root Ports number to RP_NUM_MAX? Shouldn't we find all the
> Root Ports with PMU counters and make use of them? Limit it with RP_NUM_MAX is rather
> platform specific and you need to extend it if we have more Root Ports someday.
No. I'm sorry I didn't consider other platforms when developing. I will extend it to
discover the root devices at probe time.
>
> Another problem I see here is that you walk all the Root Ports with counters and register
> a PMU for them. But you don't know whether they're removed later when you use them...
>
>>>
>>>> + index < RP_NUM_MAX) {
>>>> + if (!pci_dev_is_rootport(pdev))
>>>> + continue;
>>>> +
>>>> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
>>>> + pcie_info[index].pdev = pdev;
>
> ...you store the *pdev and use them directly in the pmu_ops but when the device is hot removed
> you'll access an invalid address and crash.
>
> A possible solution is to be notified when the corresponding device is removed/added and handle
> correctly. Or you can get the reference count of the device to prevent it from being removed, but
> this may not be a good option.
I see your point. I will try to add a notifier for the hot-remove and hot-plug paths.
>
>>>> +
>>>> + if (dwc_pcie_find_ras_des_cap_position(pdev, &where))
>>>> + continue;
>>>> +
>>>> + pcie_info[index].cap_pos = where;
>>>> +
>>>> + pci_read_config_dword(pdev,
>>>> + pdev->pcie_cap + DWC_PCIE_LINK_CAPABILITIES_REG,
>>>> + &val);
>>>> + pcie_info[index].num_lanes =
>>>> + (val & DWC_PCIE_LANE_MASK) >> DWC_PCIE_LANE_SHIFT;
>>>> + index++;
>>>> + }
>>>> +
>>>> + if (!index)
>>>> + return -ENODEV;
>>>> +
>>>> + priv->pcie_ctrl_num = index;
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static inline int dwc_pcie_pmu_read_dword(struct dwc_pcie_info_table *pcie_info,
>>>> + u32 reg, u32 *val)
>>>> +{
>>>> + return pci_read_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
>>>> + val);
>>>> +}
>>>> +
>>>> +static inline int dwc_pcie_pmu_write_dword(struct dwc_pcie_info_table
>>>> + *pcie_info, u32 reg, u32 val)
>>>> +{
>>>> + return pci_write_config_dword(pcie_info->pdev, pcie_info->cap_pos + reg,
>>>> + val);
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_set_event_id(struct dwc_pcie_info_table *pcie_info,
>>>> + int event_id)
>>>> +{
>>>> + int ret;
>>>> + u32 val;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> +
>>>> + val &= ~DWC_PCIE__CNT_ENABLE_MASK;
>>>> + val &= ~DWC_PCIE__CNT_EVENT_SELECT_MASK;
>>>> + val |= event_id << DWC_PCIE__CNT_EVENT_SELECT_SHIFT;
>>>> +
>>>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>>>> + if (ret)
>>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_write_event_lane(struct dwc_pcie_info_table *pcie_info,
>>>> + int lane, int event_id)
>>>> +{
>>>> + u32 ret;
>>>> + u32 val;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> +
>>>> + val &= ~DWC_PCIE__CNT_LANE_SELECT_MASK;
>>>> + val |= lane << DWC_PCIE__CNT_LANE_SELECT_SHIFT;
>>>> +
>>>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>>>> + if (ret)
>>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_event_enable(struct dwc_pcie_info_table *pcie_info,
>>>> + u32 enable)
>>>> +{
>>>> + u32 ret;
>>>> + u32 val;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>
>>> Somebody may mentioned. Maybe you don't need to print these messages in PMU ops, just
>>> return the correct error code and let perf handle it. Or you should provide more
>>> information for these, like failed in which funcion or read/write which value.
>>> If it only necessary when debugging, make it pci_dbg().
>>
>> Yep, you are right, I will drop the print info in next version.
>>
>>>
>>>> + return ret;
>>>> + }
>>>> +
>>>> + val &= ~(DWC_PCIE__CNT_ENABLE_MASK);
>>>> +
>>>> + if (enable)
>>>> + val |= DWC_PCIE_PER_EVENT_ON;
>>>> + else
>>>> + val |= DWC_PCIE_PER_EVENT_OFF;
>>>> +
>>>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>>>> + if (ret)
>>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_base_time_enable(struct dwc_pcie_info_table *pcie_info,
>>>> + u32 enable)
>>>> +{
>>>> + u32 ret;
>>>> + u32 val;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> +
>>>> + if (enable)
>>>> + val |= DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
>>>> + else
>>>> + val &= ~DWC_PCIE__TIME_BASED_COUNTER_ENABLE;
>>>> +
>>>> + ret = dwc_pcie_pmu_write_dword(pcie_info,
>>>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
>>>> + if (ret)
>>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_read_event_counter(struct dwc_pcie_info_table
>>>> + *pcie_info, u64 *counter)
>>>> +{
>>>> + u32 ret, val;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_DATA, &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> + *counter = val;
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_read_base_time_counter(struct dwc_pcie_info_table
>>>> + *pcie_info, u64 *counter)
>>>> +{
>>>> + u32 ret, val;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH,
>>>> + &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> +
>>>> + *counter = val;
>>>> + *counter <<= 32;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW,
>>>> + &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> +
>>>> + *counter += val;
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_clear_event_counter(struct dwc_pcie_info_table
>>>> + *pcie_info)
>>>> +{
>>>> + u32 ret;
>>>> + u32 val;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> +
>>>> + val &= ~DWC_PCIE_EVENT_CLEAR_MASK;
>>>> + val |= 1;
>>>
>>> It's better to use a macro for '1' to make it more clear.
>>
>> Good idea, will fix it in next version.
>>
>>>
>>>> +
>>>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>>>> + if (ret)
>>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_base_time_add_prepare(struct dwc_pcie_info_table
>>>> + *pcie_info, u32 event_id)
>>>> +{
>>>> + u32 ret;
>>>> + u32 val;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> +
>>>> + val &= ~DWC_PCIE__TIME_BASED_REPORT_SELECT_MASK;
>>>> + val |= event_id << DWC_PCIE__TIME_BASED_REPORT_SELECT_SHIFT;
>>>> + val &= ~DWC_PCIE__TIME_BASED_DURATION_SELECT;
>>>> +
>>>> + /*
>>>> + * TIME_BASED_ANALYSIS_DATA_REG is a 64 bit register, we can safely
>>>> + * use it with any manually controllered duration.
>>>> + */
>>>> + val &= ~(DWC_PCIE__TIME_BASED_DURATION_SELECT);
>>>> + val |= DWC_PCIE_DURATION_MANUAL_CTRL;
>>>> +
>>>> + ret = dwc_pcie_pmu_write_dword(pcie_info,
>>>> + DWC_PCIE_TIME_BASED_ANALYSIS_CTRL, val);
>>>> + if (ret)
>>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static struct dwc_pcie_info_table *pmu_to_pcie_info(struct pmu *pmu)
>>>> +{
>>>> + struct dwc_pcie_info_table *pcie_info;
>>>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
>>>> +
>>>> + pcie_info = container_of(pcie_pmu, struct dwc_pcie_info_table, pcie_pmu);
>>>> + if (pcie_info == NULL)
>>>> + pci_err(pcie_info->pdev, "Can't get pcie info\n");
>>>> +
>>>> + return pcie_info;
>>>> +}
>>>> +
>>>> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
>>>> +{
>>>> + u64 counter;
>>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>>> + struct hw_perf_event *hwc = &event->hw;
>>>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>>> + u64 delta, prev, now;
>>>> +
>>>> + do {
>>>> + prev = local64_read(&hwc->prev_count);
>>>> +
>>>> + if (type == DWC_PCIE_LANE_EVENT)
>>>> + dwc_pcie_pmu_read_event_counter(pcie_info, &counter);
>>>> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>>> + dwc_pcie_pmu_read_base_time_counter(pcie_info,
>>>> + &counter);
>>>> + else
>>>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>>>> +
>>>
>>> For the messages in PMU ops, you should print the message on behalf of PMU device
>>> rather than PCIe device. Same for the other places.
>>
>> Good idea, will fix it in next version.
>>
>>>
>>>> + now = counter;
>>>> + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
>>>> +
>>>> + delta = now - prev;
>>>> +
>>>> + local64_add(delta, &event->count);
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
>>>> +{
>>>> + struct hw_perf_event *hwc = &event->hw;
>>>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
>>>> + struct perf_event *sibling;
>>>> +
>>>> + if (event->attr.type != event->pmu->type)
>>>> + return -ENOENT;
>>>> +
>>>> + if (hwc->sample_period) {
>>>> + dev_dbg(pcie_pmu->dev, "Sampling not supported\n");
>>>> + return -EOPNOTSUPP;
>>>> + }
>>>> +
>>>> + if (event->cpu < 0) {
>>>> + dev_dbg(pcie_pmu->dev, "Per-task mode not supported\n");
>>>> + return -EOPNOTSUPP;
>>>> + }
>>>> +
>>>> + event->cpu = pcie_pmu->on_cpu;
>>>> +
>>>> + if (event->group_leader != event &&
>>>> + !is_software_event(event->group_leader)) {
>>>> + dev_dbg(pcie_pmu->dev, "Drive way only allow one event!\n");
>>>> + return -EINVAL;
>>>> + }
>>>> +
>>>> + for_each_sibling_event(sibling, event->group_leader) {
>>>> + if (sibling != event && !is_software_event(sibling)) {
>>>> + dev_dbg(pcie_pmu->dev, "Drive way event not allowed!\n");
>>>> + return -EINVAL;
>>>> + }
>>>> + }
>>>> +
>>>> + hwc->idx = -1;
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
>>>> +{
>>>> + u64 new = 0;
>>>> +
>>>
>>> redundant 'new'.
>>>
>>>> + local64_set(&hwc->prev_count, new);
>>>> +}
>>
>> Yep, will fix it in next version.
>>
>>>> +
>>>> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
>>>> +{
>>>> + struct hw_perf_event *hwc = &event->hw;
>>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>>> +
>>>> + hwc->state = 0;
>>>> + dwc_pcie_pmu_set_period(hwc);
>>>> +
>>>> + if (type == DWC_PCIE_LANE_EVENT)
>>>> + dwc_pcie_pmu_event_enable(pcie_info, 1);
>>>> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>>> + dwc_pcie_pmu_base_time_enable(pcie_info, 1);
>>>> + else
>>>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>>>> +}
>>>> +
>>>> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
>>>> +{
>>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>>> +
>>>> + if (event->hw.state & PERF_HES_STOPPED)
>>>> + return;
>>>> +
>>>> + if (type == DWC_PCIE_LANE_EVENT)
>>>> + dwc_pcie_pmu_event_enable(pcie_info, 0);
>>>> + else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>>> + dwc_pcie_pmu_base_time_enable(pcie_info, 0);
>>>> + else
>>>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>>>
>>> If the message is necessary, it'll be more helpful to mention which param
>>> is invalid.
>>
>> I see, will give more hint in log.
>>
>>>
>>>> +
>>>> + dwc_pcie_pmu_event_update(event);
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
>>>> +{
>>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>>> + struct hw_perf_event *hwc = &event->hw;
>>>> + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>>> + int event_id = DWC_PCIE_EVENT_ID(event);
>>>> + int lane = DWC_PCIE_EVENT_LANE(event);
>>>> +
>>>> + if (pcie_info->event)
>>>> + return -ENOSPC;
>>>> +
>>>> + pcie_info->event = event;
>>>> +
>>>> + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
>>>> +
>>>> + if (type == DWC_PCIE_LANE_EVENT) {
>>>> + dwc_pcie_pmu_event_enable(pcie_info, 0);
>>>> + dwc_pcie_pmu_write_event_lane(pcie_info, lane, event_id);
>>>> + dwc_pcie_pmu_set_event_id(pcie_info, event_id);
>>>> + dwc_pcie_pmu_clear_event_counter(pcie_info);
>>>> + } else if (type == DWC_PCIE_TIME_BASE_EVENT) {
>>>> + dwc_pcie_pmu_base_time_enable(pcie_info, 0);
>>>> + dwc_pcie_pmu_base_time_add_prepare(pcie_info, event_id);
>>>> + } else {
>>>> + pci_err(pcie_info->pdev, "Input param is invalid\n");
>>>> + return -EINVAL;
>>>> + }
>>>> +
>>>> + if (flags & PERF_EF_START)
>>>> + dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD);
>>>> +
>>>> + perf_event_update_userpage(event);
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
>>>> +{
>>>> + struct dwc_pcie_info_table *pcie_info = pmu_to_pcie_info(event->pmu);
>>>> +
>>>> + dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
>>>> + perf_event_update_userpage(event);
>>>> + pcie_info->event = NULL;
>>>> +}
>>>> +
>>>> +static void dwc_pcie_pmu_event_read(struct perf_event *event)
>>>> +{
>>>> + dwc_pcie_pmu_event_update(event);
>>>> +}
>>>> +
>>>> +static struct dwc_event_counters event_array[] = {
>>>> + {"tx_ack_dllp", 0x600},
>>>> + {"tx_update_fc_dllp", 0x601},
>>>> + {"rx_ack_dllp", 0x602},
>>>> + {"rx_update_fc_dllp", 0x603},
>>>> + {"rx_nulified_tlp", 0x604},
>>>> + {"tx_nulified_tlp", 0x605},
>>>> + {"rx_duplicate_tlp", 0x606},
>>>> + {"tx_memory_write", 0x700},
>>>> + {"tx_memory_read", 0x701},
>>>> + {"tx_configuration_write", 0x702},
>>>> + {"tx_configuration_read", 0x703},
>>>> + {"tx_io_write", 0x704},
>>>> + {"tx_io_read", 0x705},
>>>> + {"tx_completion_without_data", 0x706},
>>>> + {"tx_completion_with_data", 0x707},
>>>> + {"tx_message_tlp", 0x708},
>>>> + {"tx_atomic", 0x709},
>>>> + {"tx_tlp_with_prefix", 0x70A},
>>>> + {"rx_memory_write", 0x70B},
>>>> + {"rx_memory_read", 0x70C},
>>>> + {"rx_io_write", 0x70F},
>>>> + {"rx_io_read", 0x710},
>>>> + {"rx_completion_without_data", 0x711},
>>>> + {"rx_completion_with_data", 0x712},
>>>> + {"rx_message_tlp", 0x713},
>>>> + {"rx_atomic", 0x714},
>>>> + {"rx_tlp_with_prefix", 0x715},
>>>> + {"tx_ccix_tlp", 0x716},
>>>> + {"rx_ccix_tlp", 0x717},
>>>> +};
>>>> +
>>>> +static int dwc_pcie_pmu_attr_init(struct dwc_pcie_pmu_priv *priv,
>>>> + struct dwc_pcie_info_table *pcie_info)
>>>> +{
>>>> + int i, j;
>>>> + char lane[8];
>>>> + const char tmp[64];
>>>> + int events_per_lane;
>>>> + int num_lane_events;
>>>> + int time_base_count;
>>>> + int num_attrs, attr_idx;
>>>> + struct dwc_pcie_event_attr *lane_attrs;
>>>> + struct attribute **pmu_attrs;
>>>> +
>>>> + memset((void *)tmp, 0, sizeof(tmp));
>>>> + memset((void *)lane, 0, sizeof(lane));
>>>> + time_base_count = ARRAY_SIZE(dwc_pcie_pmu_time_event_attrs);
>>>> + events_per_lane = ARRAY_SIZE(event_array);
>>>> + num_lane_events = pcie_info->num_lanes * events_per_lane;
>>>> + num_attrs = time_base_count + num_lane_events;
>>>> +
>>>> + pcie_info->lane_event_attrs =
>>>> + devm_kcalloc(priv->dev, num_lane_events,
>>>> + sizeof(struct dwc_pcie_event_attr),
>>>> + GFP_KERNEL);
>>>> + if (!pcie_info->lane_event_attrs)
>>>> + return -ENOMEM;
>>>> + lane_attrs = pcie_info->lane_event_attrs;
>>>> + pcie_info->pcie_pmu_event_attrs =
>>>> + devm_kcalloc(priv->dev, num_attrs, sizeof(struct attribute *),
>>>> + GFP_KERNEL);
>>>> + if (!pcie_info->pcie_pmu_event_attrs)
>>>> + return -ENOMEM;
>>>> + pmu_attrs = pcie_info->pcie_pmu_event_attrs;
>>>> +
>>>> + for (i = 0; i < num_lane_events; i++) {
>>>> + lane_attrs[i].attr.attr.name =
>>>> + devm_kzalloc(priv->dev, sizeof(char)
>>>> + * ATTRI_NAME_MAX_SIZE, GFP_KERNEL);
>>>> + if (!lane_attrs[i].attr.attr.name)
>>>> + return -ENOMEM;
>>>> + }
>>>> +
>>>> + attr_idx = 0;
>>>> + for (i = 0; i < pcie_info->num_lanes; i++) {
>>>> + sprintf(lane, "_lane%d", i);
>>>> +
>>>> + for (j = 0; j < events_per_lane; j++) {
>>>> + int pos = i * events_per_lane + j;
>>>> +
>>>> + strcat((char *)tmp, event_array[j].name);
>>>> + strcat((char *)tmp, lane);
>>>> + memcpy((void *)lane_attrs[pos].attr.attr.name,
>>>> + (void *)tmp,
>>>> + sizeof(tmp));
>>>> +
>>>> + lane_attrs[pos].attr.attr.mode =
>>>> + VERIFY_OCTAL_PERMISSIONS(0444);
>>>> + lane_attrs[pos].attr.show = dwc_pcie_event_show;
>>>> + lane_attrs[pos].attr.store = NULL;
>>>> + lane_attrs[pos].type = DWC_PCIE_LANE_EVENT;
>>>> + lane_attrs[pos].eventid = event_array[j].event_id;
>>>> + lane_attrs[pos].lane = i;
>>>> + pmu_attrs[attr_idx++] = &lane_attrs[pos].attr.attr;
>>>> +
>>>> + memset((void *)tmp, 0, sizeof(tmp));
>>>> + }
>>>> + }
>>>> +
>>>> + for (i = 0; i < ARRAY_SIZE(dwc_pcie_pmu_time_event_attrs); i++)
>>>> + pmu_attrs[attr_idx++] = dwc_pcie_pmu_time_event_attrs[i];
>>>> +
>>>> + pcie_info->pcie_pmu_event_attrs[attr_idx++] = NULL;
>>>> +
>>>> + pcie_info->pcie_pmu_event_attrs_group.name = event_attr_name;
>>>> + pcie_info->pcie_pmu_event_attrs_group.is_visible =
>>>> + pcie_pmu_event_attr_is_visible;
>>>> + pcie_info->pcie_pmu_event_attrs_group.attrs =
>>>> + pcie_info->pcie_pmu_event_attrs;
>>>> +
>>>> + pcie_info->pcie_pmu_attr_groups[0] =
>>>> + &pcie_info->pcie_pmu_event_attrs_group;
>>>> + pcie_info->pcie_pmu_attr_groups[1] = &pcie_pmu_format_attrs_group;
>>>> + pcie_info->pcie_pmu_attr_groups[2] = &pcie_pmu_cpumask_attrs_group;
>>>> + pcie_info->pcie_pmu_attr_groups[3] = NULL;
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static int __dwc_pcie_pmu_probe(struct dwc_pcie_pmu_priv *priv,
>>>> + struct dwc_pcie_info_table *pcie_info)
>>>> +{
>>>> + int ret;
>>>> + char *name;
>>>> + struct dwc_pcie_pmu *pcie_pmu;
>>>> + struct device *dev;
>>>> +
>>>> + if (!pcie_info || !pcie_info->pdev) {
>>>> + pci_err(pcie_info->pdev, "Input parameter is invalid\n");
>>>> + return -EINVAL;
>>>> + }
>>>> +
>>>> + pcie_pmu = &pcie_info->pcie_pmu;
>>>> + dev = &pcie_info->pdev->dev;
>>>> +
>>>> + ret = dwc_pcie_pmu_attr_init(priv, pcie_info);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PMU attr init fail ret=%d\n", ret);
>>>> + return ret;
>>>> + }
>>>> +
>>>> + pcie_pmu->dev = dev;
>>>> + pcie_pmu->pmu = (struct pmu) {
>>>> + .module = THIS_MODULE,
>>>> + .task_ctx_nr = perf_invalid_context,
>>>> + .pmu_enable = NULL,
>>>> + .pmu_disable = NULL,
>>>> + .event_init = dwc_pcie_pmu_event_init,
>>>> + .add = dwc_pcie_pmu_event_add,
>>>> + .del = dwc_pcie_pmu_event_del,
>>>> + .start = dwc_pcie_pmu_event_start,
>>>> + .stop = dwc_pcie_pmu_event_stop,
>>>> + .read = dwc_pcie_pmu_event_read,
>>>> + .attr_groups = pcie_info->pcie_pmu_attr_groups,
>>>> + .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
>>>> + };
>>>> +
>>>> + name = devm_kasprintf(priv->dev, GFP_KERNEL, "pcie_bdf_%x",
>>>> + pcie_info->bdf);
>>>> + if (!name)
>>>> + return -ENOMEM;
>>>> +
>>>> + /* Pick one CPU to be the preferred one to use */
>>>> + pcie_pmu->on_cpu = raw_smp_processor_id();
>>>> +
>>>
>>> So we'll probabley bind all the pmus on one single CPU, is it intended? Since it's
>>> an uncore PMU, we can make it run on any cpu (or for locality CPU on the controller's
>>> NUMA node).
>>>
>>> And I didn't see you register a hotplug handler, so what if the ->on_cpu is hot removed?
>>
>> This PMU does not support interrupt at all, so we do not need to bind it to CPU.
>> Should we remove this line?
>>
>
> No it's still needed to provide it and export it through the cpumask sysfs attribute, otherwise
> the perf cannot recognize it as an uncore PMU device. See [*]. It can be a single CPU mask and
> the event will start only on the given CPU. Or make it a range of CPUs but only let one CPU really
> start the event. It may still need to handle the case that the CPU start the event is offline,
> even without interrupt.
Thank you for the explanation.
>
> BTW, since it doesn't support interrupt, can counters overflow and how to handle the overflow case?
The counter is 64-bit, so we can safely use it and do not need to handle overflow.
>
> [*] https://github.com/torvalds/linux/blob/v6.0-rc1/tools/perf/util/pmu.c#L615
>
>>>
>>>> + ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "Error %d registering PMU @%x\n", ret,
>>>> + pcie_info->bdf);
>>>
>>> will be more helpful to print the bdf as format <bus>:<dev>:<func>.
>>
>> Good idea, will fix in next version.
>>
>>>
>>>> + return ret;
>>>> + }
>>>> +
>>>> + pcie_info->pmu_is_register = DWC_PCIE_PMU_HAS_REGISTER;
>>>
>>> Make @pmu_is_register a boolean will be more clear.
>>
>> @pmu_is_register is also discussed in Jonathan' reply. Jonathan suggests to
>> remove it, so let discuss if to keep this field first :) If we decide to keep
>> it, I will let it be boolean.
>>
>>>
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_remove(struct platform_device *pdev)
>>>> +{
>>>> + struct dwc_pcie_pmu_priv *priv = platform_get_drvdata(pdev);
>>>> + int index;
>>>> + struct dwc_pcie_pmu *pcie_pmu;
>>>
>>> Make the long line first when declaring.
>>
>> Agree, will change the code style.
>>
>>>
>>>> +
>>>> + for (index = 0; index < priv->pcie_ctrl_num; index++)
>>>> + if (priv->pcie_table[index].pmu_is_register) {
>>>> + pcie_pmu = &priv->pcie_table[index].pcie_pmu;
>>>> + perf_pmu_unregister(&pcie_pmu->pmu);
>>>> + }
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
>>>> +{
>>>> + int ret = 0;
>>>> + int pcie_index;
>>>> + struct dwc_pcie_pmu_priv *priv;
>>>> +
>>>> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
>>>> + if (!priv)
>>>> + return -ENOMEM;
>>>> + priv->dev = &pdev->dev;
>>>> + platform_set_drvdata(pdev, priv);
>>>> +
>>>> + /* If PMU is not support on current platform, keep slient */
>>>> + if (dwc_pcie_pmu_discover(priv))
>>>> + return 0;
>>>> +
>>>> + for (pcie_index = 0; pcie_index < priv->pcie_ctrl_num; pcie_index++) {
>>>> + struct pci_dev *rp = priv->pcie_table[pcie_index].pdev;
>>>> +
>>>> + ret = __dwc_pcie_pmu_probe(priv, &priv->pcie_table[pcie_index]);
>>>> + if (ret) {
>>>> + dev_err(&rp->dev, "PCIe PMU probe fail\n");
>>>> + goto pmu_unregister;
>>>> + }
>>>> + }
>>>> + dev_info(&pdev->dev, "PCIe PMUs registered\n");
>>>> +
>>>
>>> As Jonathan mentioned this message maybe unnecessary, but I may find it useful if you
>>> print how many PMU's registered.
>>
>> Fine, I can add a count here.
>>
>>>
>>> On one PMU registration failed, you just remove all the PMUs registered. I wonder if
>>> it's better to make already registered PMU stay instead of removing them all.
>>
>> If perf_pmu_register fails, is it necessary to call perf_pmu_unregister? I did not find
>> similar implementation to unregister pmu when perf_pmu_register fails.
>>
>
> No need for this but that's not what I mean here. If there should be M PMUs but only N of them
> are probed successfully, maybe no need to fail the probe and remove them all. We still can
> use them but just notify the user that how many PMUs are registered and the rest fail.
> Anyway it's up to you.
I see your point. I will try to keep the PMUs that were probed successfully.
Thank you for your valuable comments.
Best Regards,
Shuai
+ Bjorn Helgaas
在 2022/9/23 PM11:54, Jonathan Cameron 写道:
>
>>
>>>
>>>> +#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
>>>
>>> This driver is 'almost' generic. So if you an avoid defines based on a particular
>>> platform that's definitely good!
>>
>> Good idea. How about defining RP_NUM_MAX as 64? As fars as I know,
>> some platfrom use 2 sockets, 2 die per socket.
>> Then 2 sockets * 2 dies * 4 Root Complex * 4 root port.
>
> Setting a reasonable maximum is fine - but make sure the code then fails with
> a suitable error message if there are more!
OK, I will add a discovery logic here and count PMU number at runtime.
>
>
>>>> +#define DWC_PCIE_LANE_SHIFT 4
>>>> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
>>>> +
>>>> +#define DWC_PCIE_EVENT_CNT_CTRL 0x8
>>>> +#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
>>>
>>> Why double __? If point is , then
>>> naming works better
>>> DWC_PCIE_EVENT_CNT_CTRL_REG
>>> DWC_PCIE_EVENT_CNT_CTRL_EV_SELECT_MSK etc
>>
>> Yes, I point to use double `__` to indicate it is a field of register,
>> as CMN and CCN drivers do. I also considered naming with REG explicitly,
>> but the macro is so long that I often have to wrap code into multilines.
>> Any way, it's fine to rename if you still suggest to do so.
>
> I don't particularly mind. This convention was new to me.
Haha, then I will leave the double `__` as CMN and CCN drivers do.
>>>> +struct dwc_pcie_pmu_priv {
>>>> + struct device *dev;
>>>> + u32 pcie_ctrl_num;
>>>> + struct dwc_pcie_info_table *pcie_table;
>>>> +};
>>>> +
>>>> +#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func) \
>>>> + (((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
>>>
>>> Superficially this looks pretty standard. Why is is DWC specific?
>>
>> You are right, it is not DWC specific.
>>
>> I found a similar definition in arch/ia64/pci/pci.c .
>>
>> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
>> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
>>
>> Should we move it into a common header first?
>
> Maybe. The bus, devfn, reg part is standard bdf, but I don't think
> the PCI 6.0 spec defined a version with the seg in the upper bits.
> I'm not sure if we want to adopt that in LInux.
I found that lots of code uses seg,bus,devfn,reg with the format "%04x:%02x:%02x.%x".
I am not very familiar with the PCIe spec. What do you think about it, Bjorn?
>
>>>
>>>> + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &header);
>>>> + /* Is the device part of a DesignWare Cores PCIe Controller ? */
>>>
>>> Good question... This code doesn't check that. VSEC ID is matched only with
>>> the Vendor ID of the devices - unlike DVSEC where this would all be nice
>>> and local.
>>
>> I think a similar fashion is
>>
>> u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap)
>>
>> As you see, I don't want to limit this driver to a specific vendor, like
>> Alibaba (0x1ded), because this driver is generic to all DesignWare Cores PCIe
>> Controller. Therefore, dwc_pcie_find_ras_des_cap_position does not check vendor
>> like pci_find_vsec_capability.
>
> You can't do that because another vendor could use the same VSEC ID for
> an entirely different purpose. They are only valid in combination with the device VID.
It makes sense to me.
>
> The only way this can work is with a list of specific vendor ID / VSEC pairs for
> known devices.
>
>>
>> Do you mean to use DVSEC instead? I try to read out DVSEC with lspci:
>>
>> # lspci -vvv
>> b0:00.0 PCI bridge: Alibaba (China) Co., Ltd. M1 Root Port (rev 01) (prog-if 00 [Normal decode])
>> [...snip...]
>> Capabilities: [374 v1] Vendor Specific Information: ID=0002 Rev=4 Len=100 <?>
>> Capabilities: [474 v1] Vendor Specific Information: ID=0001 Rev=1 Len=038 <?>
>> Capabilities: [4ac v1] Data Link Feature <?>
>> Capabilities: [4b8 v1] Designated Vendor-Specific: Vendor=0001 ID=0000 Rev=1 Len=64 <?>
>> Capabilities: [4fc v1] Vendor Specific Information: ID=0005 Rev=1 Len=018 <?>
>>
>> How can we tell it's a DesignWare Cores PCIe Controller?
>
> Gah. This is what DVSEC was defined to solve. It lets you have a common
> vendor defined extended capability defined by a vendor, independent of the
> VID of a given device. With a VSEC you can't write generic code.
>
Got it. But I don't see any description of the RAS_DES_CAP register relating to DVSEC
in the PCIe Controller TRM. I will check this later.
>>
>>>> + if (PCI_VNDR_HEADER_ID(header) == DWC_PCIE_VSEC_ID &&
>>>> + PCI_VNDR_HEADER_REV(header) == DWC_PCIE_VSEC_REV) {
>>>> + *pos = vsec;
>>>> + return 0;
>>>> + }
>>>> + }
>>>> +
>>>> + return -ENODEV;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_discover(struct dwc_pcie_pmu_priv *priv)
>>>> +{
>>>> + int val, where, index = 0;
>>>> + struct pci_dev *pdev = NULL;
>>>> + struct dwc_pcie_info_table *pcie_info;
>>>> +
>>>> + priv->pcie_table =
>>>> + devm_kcalloc(priv->dev, RP_NUM_MAX, sizeof(*pcie_info), GFP_KERNEL);
>>>> + if (!priv->pcie_table)
>>>> + return -EINVAL;
>>>> +
>>>> + pcie_info = priv->pcie_table;
>>>> + while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL &&
>>>> + index < RP_NUM_MAX) {
>>>
>>> This having a driver than then walks the pci topology to find root ports and add
>>> extra stuff to them is not a clean solution.
>>>
>>> The probing should be driven from the existing PCI driver topology.
>>> There are a bunch of new features we need to add to ports in the near future
>>> anyway - this would just be another one.
>>> Same problem exists for CXL CPMU perf devices - so far we only support those
>>> on end points, partly because we need a clean way to probe them on pci ports.
>>>
>>> Whatever we come up with there will apply here as well.
>>
>> I see your point. Any link to reference?
>
> No, though hopefully we'll get to some sort of plan in the branch of this thread
> that Bjorn comment in.
>
OK.
>>
>>>
>>>> + if (!pci_dev_is_rootport(pdev))
>>>> + continue;
>>>> +
>>>> + pcie_info[index].bdf = dwc_pcie_get_bdf(pdev);
>>>> + pcie_info[index].pdev = pdev;
>>> Probably want a sanity check this has a vendor ID appropriate the VSEC you are about
>>> to look for.
>>
>> If I check the vendor ID here or in dwc_pcie_find_ras_des_cap_position, this driver
>> will only work for Alibaba as I mentioned before.
>
> Agreed. Unfortunately that's all you can do safely as VSEC IDs are not a global
> namespace.
Should we add a sanity check with a vendor list in dwc_pcie_find_ras_des_cap_position?
>>
>>>> +
>>>> + ret = dwc_pcie_pmu_write_dword(pcie_info, DWC_PCIE_EVENT_CNT_CTRL, val);
>>>> + if (ret)
>>>> + pci_err(pcie_info->pdev, "PCIe write fail\n");
>>>> +
>>>> + return ret;
>>>> +}
>>>
>>> ...
>>>
>>>> +
>>>> +static int dwc_pcie_pmu_read_base_time_counter(struct dwc_pcie_info_table
>>>> + *pcie_info, u64 *counter)
>>>> +{
>>>> + u32 ret, val;
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_HIGH,
>>>> + &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> +
>>>> + *counter = val;
>>>> + *counter <<= 32;
>>>
>>> This looks like you could get ripping between the upper and lower dwords.
>>> What prevents that? Perhaps a comment to say why that's not a problem?
>>
>> The Time-based Analysis Data which contains the measurement results of
>> RX/TX data throughput and time spent in each low-power LTSSM state is 64 bit.
>> The data is provided by two 32 bit registers so I rip them together. I will
>> add a comment here in next verison.
>
> If I understand correctly the only safe way to read this is in a try / retry loop.
> Read the upper part, then the lower part, then reread the upper part.
> If the upper part is unchanged you did not get ripping across the two registers.
> If it changes, try again.
It makes sense to me; I will fix it in the next version.
>
>>
>>>
>>>> +
>>>> + ret = dwc_pcie_pmu_read_dword(pcie_info,
>>>> + DWC_PCIE_TIME_BASED_ANALYSIS_DATA_REG_LOW,
>>>> + &val);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "PCIe read fail\n");
>>>> + return ret;
>>>> + }
>>>> +
>>>> + *counter += val;
>>>> +
>>>> + return ret;
>>>> +}
>>> ...
>>>
>>>> +
>>>> + ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
>>>> + if (ret) {
>>>> + pci_err(pcie_info->pdev, "Error %d registering PMU @%x\n", ret,
>>>> + pcie_info->bdf);
>>>> + return ret;
>>>> + }
>>>> +
>>>> + pcie_info->pmu_is_register = DWC_PCIE_PMU_HAS_REGISTER;
>>>
>>> As below. I think you can drop this state info.
>>
>> Please see my confusion bellow.
>>
>>>
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_remove(struct platform_device *pdev)
>>>> +{
>>>> + struct dwc_pcie_pmu_priv *priv = platform_get_drvdata(pdev);
>>>> + int index;
>>>> + struct dwc_pcie_pmu *pcie_pmu;
>>>> +
>>>> + for (index = 0; index < priv->pcie_ctrl_num; index++)
>>>> + if (priv->pcie_table[index].pmu_is_register) {
>>>> + pcie_pmu = &priv->pcie_table[index].pcie_pmu;
>>>> + perf_pmu_unregister(&pcie_pmu->pmu);
>>>> + }
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static int dwc_pcie_pmu_probe(struct platform_device *pdev)
>>>> +{
>>>> + int ret = 0;
>>>
>>> Initialized in all paths where it is used. Compiler should be able to tell
>>> that so I doubt you need this to be set to 0 here.
>>
>> Agree, will leave it as uninitialized.
>>
>>>
>>>> + int pcie_index;
>>>> + struct dwc_pcie_pmu_priv *priv;
>>>> +
>>>> + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
>>>> + if (!priv)
>>>> + return -ENOMEM;
>>>> + priv->dev = &pdev->dev;
>>>> + platform_set_drvdata(pdev, priv);
>>>> +
>>>> + /* If PMU is not support on current platform, keep slient */
>>>> + if (dwc_pcie_pmu_discover(priv))
>>>> + return 0;
>>>> +
>>>> + for (pcie_index = 0; pcie_index < priv->pcie_ctrl_num; pcie_index++) {
>>>> + struct pci_dev *rp = priv->pcie_table[pcie_index].pdev;
>>>> +
>>>> + ret = __dwc_pcie_pmu_probe(priv, &priv->pcie_table[pcie_index]);
>>>> + if (ret) {
>>>> + dev_err(&rp->dev, "PCIe PMU probe fail\n");
>>>> + goto pmu_unregister;
>>>> + }
>>>> + }
>>>> + dev_info(&pdev->dev, "PCIe PMUs registered\n");
>>>
>>> Noise in the logs. There are lots of ways to know if we reached this point
>>> so this adds no value.
>>
>> Got it, will drop this out in next version.
>>
>>>
>>>> +
>>>> + return 0;
>>>> +
>>>> +pmu_unregister:
>>>> + dwc_pcie_pmu_remove(pdev);
>>>
>>> I'd much rather see the unwind here directly so we can clearly see that it undoes
>>> the result of errors in this function. That removes the need to use the
>>> is_registered flag in the remove() function simplifying that flow as well.
>>
>> Do you mean that if perf_pmu_register fails, then jump to pmu_unregister lable directly?
>> How can we tell which PMU diveice fails to reigister?
>
> pcie_index will be set to the index of the PMU device that failed - so loops backwards
> from that removing them.
Good idea. I will fix it in next version.
>>
> .
>>
>>>
>>>> +};
>>>> +
>>>> +static int __init dwc_pcie_pmu_init(void)
>>>> +{
>>>> + int ret;
>>>> +
>>>> + ret = platform_driver_register(&dwc_pcie_pmu_driver);
>>>> +
>>>> + if (ret)
>>>> + return ret;
>>>> +
>>>> + dwc_pcie_pmu_dev =
>>>> + platform_device_register_simple(DEV_NAME, -1, NULL, 0);
>>>
>>> I'd normally expect to see the device created as a result of firmware
>>> description (ACPI DSDT / or Device tree)
>>> It is unusual to create a 'real' device directly in the driver
>>> init - that's normally reserved for various fake / software devices.
>>
>> I see your concerns. You mentioned that
>>
>> > The probing should be driven from the existing PCI driver topology.
>>
>> Should we add a fake device in firmware or drive from PCI driver topology?
>
> Ah. I was reviewing backwards so when I wrote this hadn't realized you walk
> the PCI topology. PCI driver topology is the right solution here.
I see, I will use PCI driver topology instead.
>
>>
>> Thank you.
>>
>> Best Regards,
>> Shuai
>>
On 2022-09-26 14:31, Shuai Xue wrote:
> + Bjorn Helgaas
>
> 在 2022/9/23 PM11:54, Jonathan Cameron 写道:
>>
>>>
>>>>
>>>>> +#define RP_NUM_MAX 32 /* 2die * 4RC * 4Ctrol */
>>>>
>>>> This driver is 'almost' generic. So if you an avoid defines based on a particular
>>>> platform that's definitely good!
>>>
>>> Good idea. How about defining RP_NUM_MAX as 64? As fars as I know,
>>> some platfrom use 2 sockets, 2 die per socket.
>>> Then 2 sockets * 2 dies * 4 Root Complex * 4 root port.
>>
>> Setting a reasonable maximum is fine - but make sure the code then fails with
>> a suitable error message if there are more!
>
> OK, I will add a discovery logic here and count PMU number at runtime.
>
>>
>>
>>>>> +#define DWC_PCIE_LANE_SHIFT 4
>>>>> +#define DWC_PCIE_LANE_MASK GENMASK(9, 4)
>>>>> +
>>>>> +#define DWC_PCIE_EVENT_CNT_CTRL 0x8
>>>>> +#define DWC_PCIE__CNT_EVENT_SELECT_SHIFT 16
>>>>
>>>> Why double __? If point is , then
>>>> naming works better
>>>> DWC_PCIE_EVENT_CNT_CTRL_REG
>>>> DWC_PCIE_EVENT_CNT_CTRL_EV_SELECT_MSK etc
>>>
>>> Yes, I point to use double `__` to indicate it is a field of register,
>>> as CMN and CCN drivers do. I also considered naming with REG explicitly,
>>> but the macro is so long that I often have to wrap code into multilines.
>>> Any way, it's fine to rename if you still suggest to do so.
>>
>> I don't particularly mind. This convention was new to me.
>
> Haha, then I will leave the double `__` as CMN and CCN drivers do.
FWIW I'm not sure there's really any convention. CCN seems to use
double-underscores as distinct separators in a consistent
CCN_REG_NAME__FIELD_NAME__SUFFIX pattern. Conversely in CMN I used it as
an indication of the usual CMN_REG_NAME_FIELD_NAME_VALUE pattern being
abbreviated where it would have been uncomfortably long otherwise (and
particularly where the field name reflects the register name anyway); it
just seemed like a good visual cue to imply that something was missing.
Robin.
On Mon, Sep 26, 2022 at 09:31:34PM +0800, Shuai Xue wrote:
> 在 2022/9/23 PM11:54, Jonathan Cameron 写道:
> >> I found a similar definition in arch/ia64/pci/pci.c .
> >>
> >> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
> >> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
> >>
> >> Should we move it into a common header first?
> >
> > Maybe. The bus, devfn, reg part is standard bdf, but I don't think
> > the PCI 6.0 spec defined a version with the seg in the upper bits.
> > I'm not sure if we want to adopt that in LInux.
>
> I found lots of code use seg,bus,devfn,reg with format "%04x:%02x:%02x.%x",
> I am not quite familiar with PCIe spec. What do you think about it, Bjorn?
The PCIe spec defines an address encoding for bus/device/function/reg
for the purposes of ECAM (PCIe r6.0, sec 7.2.2), but as far as I know,
it doesn't define anything similar that includes the segment. The
segment is really outside the scope of PCIe because each segment is a
completely separate PCIe hierarchy.
So I probably wouldn't make this a generic definition. But if/when
you print things like this out, please do use the format spec you
mentioned above so it matches the style used elsewhere.
Bjorn
在 2022/9/27 AM1:18, Bjorn Helgaas 写道:
> On Mon, Sep 26, 2022 at 09:31:34PM +0800, Shuai Xue wrote:
>> 在 2022/9/23 PM11:54, Jonathan Cameron 写道:
>>>> I found a similar definition in arch/ia64/pci/pci.c .
>>>>
>>>> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
>>>> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
>>>>
>>>> Should we move it into a common header first?
>>>
>>> Maybe. The bus, devfn, reg part is standard bdf, but I don't think
>>> the PCI 6.0 spec defined a version with the seg in the upper bits.
>>> I'm not sure if we want to adopt that in LInux.
>>
>> I found lots of code use seg,bus,devfn,reg with format "%04x:%02x:%02x.%x",
>> I am not quite familiar with PCIe spec. What do you think about it, Bjorn?
>
> The PCIe spec defines an address encoding for bus/device/function/reg
> for the purposes of ECAM (PCIe r6.0, sec 7.2.2), but as far as I know,
> it doesn't define anything similar that includes the segment. The
> segment is really outside the scope of PCIe because each segment is a
> completely separate PCIe hierarchy.
Thank you for your explanation.
>
> So I probably wouldn't make this a generic definition. But if/when
> you print things like this out, please do use the format spec you
> mentioned above so it matches the style used elsewhere.
>
Agree. The print format of bus/device/function/reg is "%04x:%02x:%02x.%x",
so I named the PMU as the same format. Then the usage flow would be:
- lspci to get the device root port in format seg/bus/device/function/reg.
10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
- select its PMU name pcie_bdf_100000.
- monitor with perf:
perf stat -a -e pcie_bdf_100000/Rx_PCIe_TLP_Data_Payload/
Bjorn and Jonathan, are you happy with this flow?
Thank you.
Best Regards,
Shuai
在 2022/9/24 AM2:51, Bjorn Helgaas 写道:
> On Fri, Sep 23, 2022 at 10:46:09PM +0800, Shuai Xue wrote:
>> 在 2022/9/23 AM1:36, Bjorn Helgaas 写道:
>>> On Sat, Sep 17, 2022 at 08:10:35PM +0800, Shuai Xue wrote:
>
>>>> +static struct device_attribute dwc_pcie_pmu_cpumask_attr =
>>>> +__ATTR(cpumask, 0444, dwc_pcie_pmu_cpumask_show, NULL);
>>>
>>> DEVICE_ATTR_RO()?
>
>> DEVICE_ATTR_RO may a good choice. But does it fit the code style to use
>> DEVICE_ATTR_RO in drivers/perf? As far as know, CCN, CCI, SMMU,
>> qcom_l2_pmu use "struct device_attribute" directly.
>
> DEVICE_ATTR_RO is just newer, and I think CCN, CCI, SMMU, etc. would
> be using it if they were written today. Of course, the drivers/perf
> maintainers may have a different opinion :)
Well, you are right, I will use DEVICE_ATTR_RO instead :)
>
>>> I think every caller of dwc_pcie_pmu_read_dword() makes the same check
>>> and prints the same message; maybe the message should be moved inside
>>> dwc_pcie_pmu_read_dword()?
>>>
>>> Same with dwc_pcie_pmu_write_dword(); moving the message there would
>>> simplify all callers.
>>
>> I would like to wrap dwc_pcie_pmu_{write}_dword out, use
>> pci_{read}_config_dword and drop the snaity check of return value as
>> Jonathan suggests. How did you like it?
>
> Sounds good. Not sure the error checking is worthwhile since
> pci_read_config_dword() really doesn't return meaningful errors
> anyway.
>
>>>> +static struct dwc_pcie_info_table *pmu_to_pcie_info(struct pmu *pmu)
>>>> +{
>>>> + struct dwc_pcie_info_table *pcie_info;
>>>> + struct dwc_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
>>>> +
>>>> + pcie_info = container_of(pcie_pmu, struct dwc_pcie_info_table, pcie_pmu);
>>>> + if (pcie_info == NULL)
>>>> + pci_err(pcie_info->pdev, "Can't get pcie info\n");
>>>
>>> It shouldn't be possible to get here for a pmu with no pcie_info, and
>>> callers don't check for a NULL pointer return value before
>>> dereferencing it, so I guess all this adds is an error message before
>>> a NULL pointer oops? Not sure the code clutter is worth it.
>>
>> Do you mean to drop the snaity check of container_of?
>
> Yes. I'm suggesting that the NULL pointer oops itself has enough
> information to debug this problem, even without the pci_err().
I will drop the sanity check in the next version.
Thank you for your valuable comments.
Best Regards,
Shuai
On Mon, 26 Sep 2022 12:18:57 -0500
Bjorn Helgaas <[email protected]> wrote:
> On Mon, Sep 26, 2022 at 09:31:34PM +0800, Shuai Xue wrote:
> > 在 2022/9/23 PM11:54, Jonathan Cameron 写道:
> > >> I found a similar definition in arch/ia64/pci/pci.c .
> > >>
> > >> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
> > >> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
> > >>
> > >> Should we move it into a common header first?
> > >
> > > Maybe. The bus, devfn, reg part is standard bdf, but I don't think
> > > the PCI 6.0 spec defined a version with the seg in the upper bits.
> > > I'm not sure if we want to adopt that in LInux.
> >
> > I found lots of code use seg,bus,devfn,reg with format "%04x:%02x:%02x.%x",
> > I am not quite familiar with PCIe spec. What do you think about it, Bjorn?
>
> The PCIe spec defines an address encoding for bus/device/function/reg
> for the purposes of ECAM (PCIe r6.0, sec 7.2.2), but as far as I know,
> it doesn't define anything similar that includes the segment. The
> segment is really outside the scope of PCIe because each segment is a
> completely separate PCIe hierarchy.
It's beginning to get exposed in PCIe 6.0 as a result of enabling cross
segment messages. Two places I know of that the segment can be seen in.
Captured TLP headers with certain AER reported errors.
Hierarchy ID Extended capability - this one takes some digging.
Specifically 7.9.17.3 Hierarchy ID Data Register which if you follow
link to 6.25 includes Segment Group Number.
Anyhow, not particularly relevant here and it never occurs next to
any of the BDF stuff but it is now (just about) in scope of PCIe.
Jonathan
>
> So I probably wouldn't make this a generic definition. But if/when
> you print things like this out, please do use the format spec you
> mentioned above so it matches the style used elsewhere.
>
> Bjorn
On Tue, 27 Sep 2022 13:13:29 +0800
Shuai Xue <[email protected]> wrote:
> 在 2022/9/27 AM1:18, Bjorn Helgaas 写道:
> > On Mon, Sep 26, 2022 at 09:31:34PM +0800, Shuai Xue wrote:
> >> 在 2022/9/23 PM11:54, Jonathan Cameron 写道:
> >>>> I found a similar definition in arch/ia64/pci/pci.c .
> >>>>
> >>>> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
> >>>> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
> >>>>
> >>>> Should we move it into a common header first?
> >>>
> >>> Maybe. The bus, devfn, reg part is standard bdf, but I don't think
> >>> the PCI 6.0 spec defined a version with the seg in the upper bits.
> >>> I'm not sure if we want to adopt that in LInux.
> >>
> >> I found lots of code use seg,bus,devfn,reg with format "%04x:%02x:%02x.%x",
> >> I am not quite familiar with PCIe spec. What do you think about it, Bjorn?
> >
> > The PCIe spec defines an address encoding for bus/device/function/reg
> > for the purposes of ECAM (PCIe r6.0, sec 7.2.2), but as far as I know,
> > it doesn't define anything similar that includes the segment. The
> > segment is really outside the scope of PCIe because each segment is a
> > completely separate PCIe hierarchy.
>
> Thank you for your explanation.
>
> >
> > So I probably wouldn't make this a generic definition. But if/when
> > you print things like this out, please do use the format spec you
> > mentioned above so it matches the style used elsewhere.
> >
>
> Agree. The print format of bus/device/function/reg is "%04x:%02x:%02x.%x",
> so I named the PMU as the same format. Then the usage flow would be:
>
> - lspci to get the device root port in format seg/bus/device/function/reg.
> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
> - select its PMU name pcie_bdf_100000.
> - monitor with perf:
> perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
I think you probably want something in there to indicate it's an RP
and the bdf part may be redundant...
Jonathan
>
> Bjorn and Jonathan, are you happy with this flow?
>
> Thank you.
>
> Best Regards,
> Shuai
>
On 2022-09-27 11:04, Jonathan Cameron wrote:
> On Tue, 27 Sep 2022 13:13:29 +0800
> Shuai Xue <[email protected]> wrote:
>
>> 在 2022/9/27 AM1:18, Bjorn Helgaas 写道:
>>> On Mon, Sep 26, 2022 at 09:31:34PM +0800, Shuai Xue wrote:
>>>> 在 2022/9/23 PM11:54, Jonathan Cameron 写道:
>>>>>> I found a similar definition in arch/ia64/pci/pci.c .
>>>>>>
>>>>>> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
>>>>>> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
>>>>>>
>>>>>> Should we move it into a common header first?
>>>>>
>>>>> Maybe. The bus, devfn, reg part is standard bdf, but I don't think
>>>>> the PCI 6.0 spec defined a version with the seg in the upper bits.
>>>>> I'm not sure if we want to adopt that in LInux.
>>>>
>>>> I found lots of code use seg,bus,devfn,reg with format "%04x:%02x:%02x.%x",
>>>> I am not quite familiar with PCIe spec. What do you think about it, Bjorn?
>>>
>>> The PCIe spec defines an address encoding for bus/device/function/reg
>>> for the purposes of ECAM (PCIe r6.0, sec 7.2.2), but as far as I know,
>>> it doesn't define anything similar that includes the segment. The
>>> segment is really outside the scope of PCIe because each segment is a
>>> completely separate PCIe hierarchy.
>>
>> Thank you for your explanation.
>>
>>>
>>> So I probably wouldn't make this a generic definition. But if/when
>>> you print things like this out, please do use the format spec you
>>> mentioned above so it matches the style used elsewhere.
>>>
>>
>> Agree. The print format of bus/device/function/reg is "%04x:%02x:%02x.%x",
>> so I named the PMU as the same format. Then the usage flow would be:
>>
>> - lspci to get the device root port in format seg/bus/device/function/reg.
>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>> - select its PMU name pcie_bdf_100000.
>> - monitor with perf:
>> perf stat -a -e pcie_bdf_200/Rx_PCIe_TLP_Data_Payload/
>
> I think you probably want something in there to indicate it's an RP
> and the bdf part may be redundant...
Indeed that seems horribly unclear; personally I reckon something like
"dw_pcie_200" would be more appropriate. The address is just a
disambiguator between multiple instances so doesn't need any further
emphasis, but what is crucial to the user is exactly what kind of PMU it
is (especially if there's potential for other unrelated PCIe functions
to start exposing their own different PMUs).
Thanks,
Robin.
在 2022/9/27 PM6:04, Jonathan Cameron 写道:
> On Tue, 27 Sep 2022 13:13:29 +0800
> Shuai Xue <[email protected]> wrote:
>
>> 在 2022/9/27 AM1:18, Bjorn Helgaas 写道:
>>> On Mon, Sep 26, 2022 at 09:31:34PM +0800, Shuai Xue wrote:
>>>> 在 2022/9/23 PM11:54, Jonathan Cameron 写道:
>>>>>> I found a similar definition in arch/ia64/pci/pci.c .
>>>>>>
>>>>>> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
>>>>>> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
>>>>>>
>>>>>> Should we move it into a common header first?
>>>>>
>>>>> Maybe. The bus, devfn, reg part is standard bdf, but I don't think
>>>>> the PCI 6.0 spec defined a version with the seg in the upper bits.
>>>>> I'm not sure if we want to adopt that in LInux.
>>>>
>>>> I found lots of code use seg,bus,devfn,reg with format "%04x:%02x:%02x.%x",
>>>> I am not quite familiar with PCIe spec. What do you think about it, Bjorn?
>>>
>>> The PCIe spec defines an address encoding for bus/device/function/reg
>>> for the purposes of ECAM (PCIe r6.0, sec 7.2.2), but as far as I know,
>>> it doesn't define anything similar that includes the segment. The
>>> segment is really outside the scope of PCIe because each segment is a
>>> completely separate PCIe hierarchy.
>>
>> Thank you for your explanation.
>>
>>>
>>> So I probably wouldn't make this a generic definition. But if/when
>>> you print things like this out, please do use the format spec you
>>> mentioned above so it matches the style used elsewhere.
>>>
>>
>> Agree. The print format of bus/device/function/reg is "%04x:%02x:%02x.%x",
>> so I named the PMU as the same format. Then the usage flow would be:
>>
>> - lspci to get the device root port in format seg/bus/device/function/reg.
>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>> - select its PMU name pcie_bdf_100000.
>> - monitor with perf:
>> perf stat -a -e pcie_bdf_100000/Rx_PCIe_TLP_Data_Payload/
>
> I think you probably want something in there to indicate it's an RP
> and the bdf part may be redundant...
Yes, I realized that the prefix `pcie_bdf` is not appropriate. Let's discuss
with Robin in his thread.
Thank you.
Best Regards,
Shuai
On Tue, 27 Sep 2022 20:49:26 +0800
Shuai Xue <[email protected]> wrote:
> + Jonathan
>
> 在 2022/9/27 PM6:14, Robin Murphy 写道:
> > On 2022-09-27 11:04, Jonathan Cameron wrote:
> >> On Tue, 27 Sep 2022 13:13:29 +0800
> >> Shuai Xue <[email protected]> wrote:
> >>
> >>> 在 2022/9/27 AM1:18, Bjorn Helgaas 写道:
> >>>> On Mon, Sep 26, 2022 at 09:31:34PM +0800, Shuai Xue wrote:
> >>>>> 在 2022/9/23 PM11:54, Jonathan Cameron 写道:
> >>>>>>> I found a similar definition in arch/ia64/pci/pci.c .
> >>>>>>>
> >>>>>>> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
> >>>>>>> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
> >>>>>>>
> >>>>>>> Should we move it into a common header first?
> >>>>>>
> >>>>>> Maybe. The bus, devfn, reg part is standard bdf, but I don't think
> >>>>>> the PCI 6.0 spec defined a version with the seg in the upper bits.
> >>>>>> I'm not sure if we want to adopt that in LInux.
> >>>>>
> >>>>> I found lots of code use seg,bus,devfn,reg with format "%04x:%02x:%02x.%x",
> >>>>> I am not quite familiar with PCIe spec. What do you think about it, Bjorn?
> >>>>
> >>>> The PCIe spec defines an address encoding for bus/device/function/reg
> >>>> for the purposes of ECAM (PCIe r6.0, sec 7.2.2), but as far as I know,
> >>>> it doesn't define anything similar that includes the segment. The
> >>>> segment is really outside the scope of PCIe because each segment is a
> >>>> completely separate PCIe hierarchy.
> >>>
> >>> Thank you for your explanation.
> >>>
> >>>>
> >>>> So I probably wouldn't make this a generic definition. But if/when
> >>>> you print things like this out, please do use the format spec you
> >>>> mentioned above so it matches the style used elsewhere.
> >>>>
> >>>
> >>> Agree. The print format of bus/device/function/reg is "%04x:%02x:%02x.%x",
> >>> so I named the PMU as the same format. Then the usage flow would be:
> >>>
> >>> - lspci to get the device root port in format seg/bus/device/function/reg.
> >>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
> >>> - select its PMU name pcie_bdf_100000.
> >>> - monitor with perf:
> >>> perf stat -a -e pcie_bdf_100000/Rx_PCIe_TLP_Data_Payload/
> >>
> >> I think you probably want something in there to indicate it's an RP
> >> and the bdf part may be redundant...
> >
> > Indeed that seems horribly unclear; personally I reckon something like "dw_pcie_200" would be more appropriate. The address is just a disambiguator between multiple instances so doesn't need any further emphasis, but what is crucial to the user is exactly what kind of PMU it is (especially if there's potential for other unrelated PCIe functions to start exposing their own different PMUs).
>
> I see your point. The current prefix `pcie_bdf` is not appropriate,
>
> - it does not indicate it is for a root point as Jonathan mentioned.
> - its prefix is not `dwc`
>
> Is dwc_rootport_100000 more appropriate?
>
> - `dwc` indicates the PMU is for Synopsys DesignWare Cores PCIe controller IP
> - `rootport` indicates the PMU is for a root port device
> - `100000` indicates the device address
Looks good to me.
J
>
>
> Thank you.
>
> Best Regards,
> Shuai
>
>
>
>
+ Jonathan
在 2022/9/27 PM6:14, Robin Murphy 写道:
> On 2022-09-27 11:04, Jonathan Cameron wrote:
>> On Tue, 27 Sep 2022 13:13:29 +0800
>> Shuai Xue <[email protected]> wrote:
>>
>>> 在 2022/9/27 AM1:18, Bjorn Helgaas 写道:
>>>> On Mon, Sep 26, 2022 at 09:31:34PM +0800, Shuai Xue wrote:
>>>>> 在 2022/9/23 PM11:54, Jonathan Cameron 写道:
>>>>>>> I found a similar definition in arch/ia64/pci/pci.c .
>>>>>>>
>>>>>>> #define PCI_SAL_ADDRESS(seg, bus, devfn, reg) \
>>>>>>> (((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
>>>>>>>
>>>>>>> Should we move it into a common header first?
>>>>>>
>>>>>> Maybe. The bus, devfn, reg part is standard bdf, but I don't think
>>>>>> the PCI 6.0 spec defined a version with the seg in the upper bits.
>>>>>> I'm not sure if we want to adopt that in LInux.
>>>>>
>>>>> I found lots of code use seg,bus,devfn,reg with format "%04x:%02x:%02x.%x",
>>>>> I am not quite familiar with PCIe spec. What do you think about it, Bjorn?
>>>>
>>>> The PCIe spec defines an address encoding for bus/device/function/reg
>>>> for the purposes of ECAM (PCIe r6.0, sec 7.2.2), but as far as I know,
>>>> it doesn't define anything similar that includes the segment. The
>>>> segment is really outside the scope of PCIe because each segment is a
>>>> completely separate PCIe hierarchy.
>>>
>>> Thank you for your explanation.
>>>
>>>>
>>>> So I probably wouldn't make this a generic definition. But if/when
>>>> you print things like this out, please do use the format spec you
>>>> mentioned above so it matches the style used elsewhere.
>>>>
>>>
>>> Agree. The print format of bus/device/function/reg is "%04x:%02x:%02x.%x",
>>> so I named the PMU as the same format. Then the usage flow would be:
>>>
>>> - lspci to get the device root port in format seg/bus/device/function/reg.
>>> 10:00.0 PCI bridge: Device 1ded:8000 (rev 01)
>>> - select its PMU name pcie_bdf_100000.
>>> - monitor with perf:
>>> perf stat -a -e pcie_bdf_100000/Rx_PCIe_TLP_Data_Payload/
>>
>> I think you probably want something in there to indicate it's an RP
>> and the bdf part may be redundant...
>
> Indeed that seems horribly unclear; personally I reckon something like "dw_pcie_200" would be more appropriate. The address is just a disambiguator between multiple instances so doesn't need any further emphasis, but what is crucial to the user is exactly what kind of PMU it is (especially if there's potential for other unrelated PCIe functions to start exposing their own different PMUs).
I see your point. The current prefix `pcie_bdf` is not appropriate,
- it does not indicate it is for a root port, as Jonathan mentioned.
- its prefix is not `dwc`
Is dwc_rootport_100000 more appropriate?
- `dwc` indicates the PMU is for Synopsys DesignWare Cores PCIe controller IP
- `rootport` indicates the PMU is for a root port device
- `100000` indicates the device address
Thank you.
Best Regards,
Shuai