2023-01-11 20:32:59

by Liang, Kan

Subject: [PATCH 0/7] iommu/vt-d: Support performance monitoring for IOMMU

From: Kan Liang <[email protected]>

A performance monitoring infrastructure, perfmon, is introduced with
the VT-d Spec 4.0. Perfmon supports the collection of information about
key events occurring during operation of the remapping hardware, to aid
performance tuning and debug. This patch series adds perfmon support
for the IOMMU.

To facilitate the perfmon support, the patch series also adds support
for two new generic features of VT-d Spec 4.0:
- The 'size' field of DRHD, used to retrieve the exact size of the
register set for each dmar device. (Patch 1)
- The new Enhanced Command Interface. (Patch 3)

With the patch series, users can collect the performance data of IOMMU
via Linux perf tool. For example,

$ perf stat -e dmar0/iommu_requests,filter_ats=0/ -a sleep 1

Performance counter stats for 'system wide':

2135 dmar0/iommu_requests,filter_ats=0/

1.001087695 seconds time elapsed

The IOMMU PMUs can be found under /sys/bus/event_source/devices/dmar*

The available filters and event format can be found in the format folder:
$ ls /sys/bus/event_source/devices/dmar0/format/
event event_group filter_ats filter_page_table

The supported events can be found in the events folder:

$ ls /sys/bus/event_source/devices/dmar0/events/
ats_blocked        int_cache_hit_nonposted  iommu_mrds       pasid_cache_lookup
ctxt_cache_hit     int_cache_hit_posted     iommu_requests   pg_req_posted
ctxt_cache_lookup  int_cache_lookup         iotlb_hit        pw_occupancy
fs_nonleaf_hit     iommu_clocks             iotlb_lookup     ss_nonleaf_hit
fs_nonleaf_lookup  iommu_mem_blocked        pasid_cache_hit  ss_nonleaf_lookup
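
For reference, each named event above is an alias for a raw
event_group/event encoding described by the format attributes. For
example, the following is equivalent to the iommu_requests example
above (illustrative, using the encodings defined later in the series):

$ perf stat -e dmar0/event_group=0x0,event=0x2/ -a sleep 1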

Kan Liang (7):
iommu/vt-d: Support size of the register set in DRHD
iommu/vt-d: Retrieve IOMMU perfmon capability information
iommu/vt-d: Support Enhanced Command Interface
iommu/vt-d: Add IOMMU perfmon support
iommu/vt-d: Support cpumask for IOMMU perfmon
iommu/vt-d: Add IOMMU perfmon overflow handler support
iommu/vt-d: Enable IOMMU perfmon support

.../sysfs-bus-event_source-devices-iommu | 32 +
drivers/iommu/intel/Kconfig | 9 +
drivers/iommu/intel/Makefile | 1 +
drivers/iommu/intel/dmar.c | 100 ++-
drivers/iommu/intel/iommu.c | 3 +
drivers/iommu/intel/iommu.h | 113 ++-
drivers/iommu/intel/perfmon.c | 836 ++++++++++++++++++
drivers/iommu/intel/perfmon.h | 65 ++
drivers/iommu/intel/svm.c | 2 +-
include/acpi/actbl1.h | 2 +-
include/linux/cpuhotplug.h | 1 +
include/linux/dmar.h | 1 +
12 files changed, 1155 insertions(+), 10 deletions(-)
create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
create mode 100644 drivers/iommu/intel/perfmon.c
create mode 100644 drivers/iommu/intel/perfmon.h

--
2.35.1


2023-01-11 20:33:08

by Liang, Kan

Subject: [PATCH 1/7] iommu/vt-d: Support size of the register set in DRHD

From: Kan Liang <[email protected]>

A new field, which indicates the size of the remapping hardware register
set for this remapping unit, is introduced in the DMA-remapping hardware
unit definition (DRHD) structure with the VT-d Spec 4.0.
With the information, SW doesn't need to 'guess' the size of the
register set anymore.
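For illustration, the field encodes the register-set size as 2 ^ N 4 KB
pages: a Size value of 2, for example, describes a 16 KB register set,
which is exactly what the new dmar_get_drhd_reg_size() helper computes.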

Update the struct acpi_dmar_hardware_unit to reflect the field.

Store the size of the register set in struct dmar_drhd_unit for each
dmar device.

The 'size' field is ResvZ (reserved, must be zero) on older BIOSes and
platforms. In that case, fall back to the old guessing method; nothing
changes for them.

Signed-off-by: Kan Liang <[email protected]>
---
drivers/iommu/intel/dmar.c | 25 ++++++++++++++++++-------
include/acpi/actbl1.h | 2 +-
include/linux/dmar.h | 1 +
3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index b00a0ceb2d13..6a411d964474 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -399,6 +399,13 @@ dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
return NULL;
}

+/* The size of the register set is 2 ^ N 4 KB pages. */
+static unsigned long
+dmar_get_drhd_reg_size(u8 npages)
+{
+ return 1UL << (npages + 12);
+}
+
/*
* dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
* structure which uniquely represent one DMA remapping hardware unit
@@ -427,6 +434,7 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
memcpy(dmaru->hdr, header, header->length);
dmaru->reg_base_addr = drhd->address;
dmaru->segment = drhd->segment;
+ dmaru->reg_size = dmar_get_drhd_reg_size(drhd->size);
dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
((void *)drhd) + drhd->header.length,
@@ -880,6 +888,7 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
struct acpi_dmar_hardware_unit *drhd;
void __iomem *addr;
u64 cap, ecap;
+ unsigned long size;

drhd = (void *)entry;
if (!drhd->address) {
@@ -887,10 +896,11 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
return -EINVAL;
}

+ size = dmar_get_drhd_reg_size(drhd->size);
if (arg)
- addr = ioremap(drhd->address, VTD_PAGE_SIZE);
+ addr = ioremap(drhd->address, size);
else
- addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
+ addr = early_ioremap(drhd->address, size);
if (!addr) {
pr_warn("Can't validate DRHD address: %llx\n", drhd->address);
return -EINVAL;
@@ -902,7 +912,7 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
if (arg)
iounmap(addr);
else
- early_iounmap(addr, VTD_PAGE_SIZE);
+ early_iounmap(addr, size);

if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
warn_invalid_dmar(drhd->address, " returns all ones");
@@ -956,17 +966,18 @@ static void unmap_iommu(struct intel_iommu *iommu)
/**
* map_iommu: map the iommu's registers
* @iommu: the iommu to map
- * @phys_addr: the physical address of the base resgister
+ * @drhd: DMA remapping hardware definition structure
*
* Memory map the iommu's registers. Start w/ a single page, and
* possibly expand if that turns out to be insufficent.
*/
-static int map_iommu(struct intel_iommu *iommu, u64 phys_addr)
+static int map_iommu(struct intel_iommu *iommu, struct dmar_drhd_unit *drhd)
{
+ u64 phys_addr = drhd->reg_base_addr;
int map_size, err=0;

iommu->reg_phys = phys_addr;
- iommu->reg_size = VTD_PAGE_SIZE;
+ iommu->reg_size = drhd->reg_size;

if (!request_mem_region(iommu->reg_phys, iommu->reg_size, iommu->name)) {
pr_err("Can't reserve memory\n");
@@ -1050,7 +1061,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
}
sprintf(iommu->name, "dmar%d", iommu->seq_id);

- err = map_iommu(iommu, drhd->reg_base_addr);
+ err = map_iommu(iommu, drhd);
if (err) {
pr_err("Failed to map %s\n", iommu->name);
goto error_free_seq_id;
diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 4175dce3967c..bdded0ac46eb 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -802,7 +802,7 @@ struct acpi_dmar_pci_path {
struct acpi_dmar_hardware_unit {
struct acpi_dmar_header header;
u8 flags;
- u8 reserved;
+ u8 size; /* Size of the register set */
u16 segment;
u64 address; /* Register Base Address */
};
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index d81a51978d01..51d498f0a02b 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -46,6 +46,7 @@ struct dmar_drhd_unit {
u8 include_all:1;
u8 gfx_dedicated:1; /* graphic dedicated */
struct intel_iommu *iommu;
+ unsigned long reg_size; /* size of register set */
};

struct dmar_pci_path {
--
2.35.1

2023-01-11 20:33:23

by Liang, Kan

Subject: [PATCH 2/7] iommu/vt-d: Retrieve IOMMU perfmon capability information

From: Kan Liang <[email protected]>

The performance monitoring infrastructure, perfmon, supports the
collection of information about key events occurring during operation of
the remapping hardware, to aid performance tuning and debug. Each
remapping hardware unit has capability registers that indicate support
for performance monitoring features and enumerate the capabilities.

Add alloc_iommu_pmu() to retrieve the IOMMU perfmon capability
information for each iommu unit. The information is stored in the
iommu->pmu data structure. Capability registers are read-only, so it's
safe to read them once and cache them in the pmu structure. This avoids
unnecessary VMEXITs when this code runs in a virtualization environment.
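
As a purely illustrative sketch (not part of this patch), the new
pcap_*() helpers decode the capability register like this; the values
printed are hypothetical:

        u64 perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG);

        /* e.g. "perfmon: 14 counters x 36 bits, 5 event groups, ..." */
        pr_info("perfmon: %u counters x %u bits, %u event groups, filter mask %#x, stride %llu bytes\n",
                (u32)pcap_num_cntr(perfcap), (u32)pcap_cntr_width(perfcap),
                (u32)pcap_num_event_group(perfcap), (u32)pcap_filters_mask(perfcap),
                (unsigned long long)pcap_cntr_stride(perfcap));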

Add free_iommu_pmu() to free the saved capability information when
freeing the iommu unit.

Add a kernel config option for the IOMMU perfmon feature and enable it
by default. Unless a user explicitly monitors IOMMU perfmon events with
the perf tool, there is no impact on the existing IOMMU behavior.

Signed-off-by: Kan Liang <[email protected]>
---
drivers/iommu/intel/Kconfig | 9 ++
drivers/iommu/intel/Makefile | 1 +
drivers/iommu/intel/dmar.c | 7 ++
drivers/iommu/intel/iommu.h | 41 +++++++++
drivers/iommu/intel/perfmon.c | 159 ++++++++++++++++++++++++++++++++++
drivers/iommu/intel/perfmon.h | 41 +++++++++
6 files changed, 258 insertions(+)
create mode 100644 drivers/iommu/intel/perfmon.c
create mode 100644 drivers/iommu/intel/perfmon.h

diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index b7dff5092fd2..1a4aebddc9a6 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -96,4 +96,13 @@ config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
passing intel_iommu=sm_on to the kernel. If not sure, please use
the default value.

+config INTEL_IOMMU_PERF_EVENTS
+ def_bool y
+ bool "Intel IOMMU performance events"
+ depends on INTEL_IOMMU && PERF_EVENTS
+ help
+ Include support for Intel IOMMU performance events. These are
+ available on modern processors which support Intel VT-d 4.0 and
+ later.
+
endif # INTEL_IOMMU
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
index fa0dae16441c..7af3b8a4f2a0 100644
--- a/drivers/iommu/intel/Makefile
+++ b/drivers/iommu/intel/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_DMAR_PERF) += perf.o
obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
+obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 6a411d964474..91bb48267df2 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -34,6 +34,7 @@
#include "../irq_remapping.h"
#include "perf.h"
#include "trace.h"
+#include "perfmon.h"

typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
struct dmar_res_callback {
@@ -1114,6 +1115,9 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
if (sts & DMA_GSTS_QIES)
iommu->gcmd |= DMA_GCMD_QIE;

+ if (alloc_iommu_pmu(iommu))
+ pr_debug("Cannot alloc PMU for iommu (seq_id = %d)\n", iommu->seq_id);
+
raw_spin_lock_init(&iommu->register_lock);

/*
@@ -1148,6 +1152,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
err_sysfs:
iommu_device_sysfs_remove(&iommu->iommu);
err_unmap:
+ free_iommu_pmu(iommu);
unmap_iommu(iommu);
error_free_seq_id:
ida_free(&dmar_seq_ids, iommu->seq_id);
@@ -1163,6 +1168,8 @@ static void free_iommu(struct intel_iommu *iommu)
iommu_device_sysfs_remove(&iommu->iommu);
}

+ free_iommu_pmu(iommu);
+
if (iommu->irq) {
if (iommu->pr_irq) {
free_irq(iommu->pr_irq, iommu);
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 06e61e474856..5bcefbea55c9 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -125,6 +125,11 @@
#define DMAR_MTRR_PHYSMASK8_REG 0x208
#define DMAR_MTRR_PHYSBASE9_REG 0x210
#define DMAR_MTRR_PHYSMASK9_REG 0x218
+#define DMAR_PERFCAP_REG 0x300
+#define DMAR_PERFCFGOFF_REG 0x310
+#define DMAR_PERFOVFOFF_REG 0x318
+#define DMAR_PERFCNTROFF_REG 0x31c
+#define DMAR_PERFEVNTCAP_REG 0x380
#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */
#define DMAR_VCMD_REG 0xe00 /* Virtual command register */
#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */
@@ -148,6 +153,7 @@
*/
#define cap_esrtps(c) (((c) >> 63) & 1)
#define cap_esirtps(c) (((c) >> 62) & 1)
+#define cap_ecmds(c) (((c) >> 61) & 1)
#define cap_fl5lp_support(c) (((c) >> 60) & 1)
#define cap_pi_support(c) (((c) >> 59) & 1)
#define cap_fl1gp_support(c) (((c) >> 56) & 1)
@@ -179,6 +185,7 @@
* Extended Capability Register
*/

+#define ecap_pms(e) (((e) >> 51) & 0x1)
#define ecap_rps(e) (((e) >> 49) & 0x1)
#define ecap_smpwc(e) (((e) >> 48) & 0x1)
#define ecap_flts(e) (((e) >> 47) & 0x1)
@@ -210,6 +217,22 @@
#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
#define ecap_sc_support(e) (((e) >> 7) & 0x1) /* Snooping Control */

+/*
+ * Decoding Perf Capability Register
+ */
+#define pcap_num_cntr(p) ((p) & 0xffff)
+#define pcap_cntr_width(p) (((p) >> 16) & 0x7f)
+#define pcap_num_event_group(p) (((p) >> 24) & 0x1f)
+#define pcap_filters_mask(p) (((p) >> 32) & 0x1f)
+#define pcap_interrupt(p) (((p) >> 50) & 0x1)
+/* The counter stride is calculated as 2 ^ (x+10) bytes */
+#define pcap_cntr_stride(p) (1ULL << ((((p) >> 52) & 0x7) + 10))
+
+/*
+ * Decoding Perf Event Capability Register
+ */
+#define pecap_es(p) ((p) & 0xfffffff)
+
/* Virtual command interface capability */
#define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */

@@ -554,6 +577,22 @@ struct dmar_domain {
iommu core */
};

+struct iommu_pmu {
+ struct intel_iommu *iommu;
+ u32 num_cntr; /* Number of counters */
+ u32 num_eg; /* Number of event group */
+ u32 cntr_width; /* Counter width */
+ u32 cntr_stride; /* Counter Stride */
+ u32 filter; /* Bitmask of filter support */
+ void __iomem *base; /* the PerfMon base address */
+ void __iomem *cfg_reg; /* counter configuration base address */
+ void __iomem *cntr_reg; /* counter 0 address*/
+ void __iomem *overflow; /* overflow status register */
+
+ u64 *evcap; /* Indicates all supported events */
+ u32 **cntr_evcap; /* Supported events of each counter. */
+};
+
struct intel_iommu {
void __iomem *reg; /* Pointer to hardware regs, virtual addr */
u64 reg_phys; /* physical address of hw register set */
@@ -600,6 +639,8 @@ struct intel_iommu {

struct dmar_drhd_unit *drhd;
void *perf_statistic;
+
+ struct iommu_pmu *pmu;
};

/* PCI domain-device relationship */
diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
new file mode 100644
index 000000000000..bc090f329c32
--- /dev/null
+++ b/drivers/iommu/intel/perfmon.c
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Support Intel IOMMU PerfMon
+ * Copyright(c) 2022 Intel Corporation.
+ */
+
+#include <linux/dmar.h>
+#include "iommu.h"
+#include "perfmon.h"
+
+static inline void __iomem *
+get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
+{
+ u32 off = dmar_readl(iommu->reg + offset);
+
+ return iommu->reg + off;
+}
+
+int alloc_iommu_pmu(struct intel_iommu *iommu)
+{
+ struct iommu_pmu *iommu_pmu;
+ int i, j, ret;
+ u64 perfcap;
+ u32 cap;
+
+ /* The IOMMU PMU requires the ECMD support as well */
+ if (!ecap_pms(iommu->ecap) || !cap_ecmds(iommu->cap))
+ return -ENODEV;
+
+ perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG);
+ /* The performance monitoring is not supported. */
+ if (!perfcap)
+ return -ENODEV;
+
+ /* Sanity check for the number of the counters and event groups */
+ if (!pcap_num_cntr(perfcap) || !pcap_num_event_group(perfcap))
+ return -ENODEV;
+
+ /* The interrupt on overflow is required */
+ if (!pcap_interrupt(perfcap))
+ return -ENODEV;
+
+ iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
+ if (!iommu_pmu)
+ return -ENOMEM;
+
+ iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
+ iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
+ iommu_pmu->filter = pcap_filters_mask(perfcap);
+ iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
+ iommu_pmu->num_eg = pcap_num_event_group(perfcap);
+
+ iommu_pmu->evcap = kcalloc(iommu_pmu->num_eg, sizeof(u64), GFP_KERNEL);
+ if (!iommu_pmu->evcap) {
+ ret = -ENOMEM;
+ goto free_pmu;
+ }
+
+ /* Parse event group capabilities */
+ for (i = 0; i < iommu_pmu->num_eg; i++) {
+ u64 pcap;
+
+ pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG +
+ i * IOMMU_PMU_CAP_REGS_STEP);
+ iommu_pmu->evcap[i] = pecap_es(pcap);
+ }
+
+ iommu_pmu->cntr_evcap = kcalloc(iommu_pmu->num_cntr, sizeof(u32 *), GFP_KERNEL);
+ if (!iommu_pmu->cntr_evcap) {
+ ret = -ENOMEM;
+ goto free_pmu_evcap;
+ }
+ for (i = 0; i < iommu_pmu->num_cntr; i++) {
+ iommu_pmu->cntr_evcap[i] = kcalloc(iommu_pmu->num_eg, sizeof(u32), GFP_KERNEL);
+ if (!iommu_pmu->cntr_evcap[i]) {
+ ret = -ENOMEM;
+ iommu_pmu->num_cntr = i;
+ goto free_pmu_cntr_evcap;
+ }
+ /*
+ * Set to the global capabilities, will adjust according
+ * to per-counter capabilities later.
+ */
+ for (j = 0; j < iommu_pmu->num_eg; j++)
+ iommu_pmu->cntr_evcap[i][j] = (u32)iommu_pmu->evcap[j];
+ }
+
+ iommu_pmu->cfg_reg = get_perf_reg_address(iommu, DMAR_PERFCFGOFF_REG);
+ iommu_pmu->cntr_reg = get_perf_reg_address(iommu, DMAR_PERFCNTROFF_REG);
+ iommu_pmu->overflow = get_perf_reg_address(iommu, DMAR_PERFOVFOFF_REG);
+
+ /*
+ * Check per-counter capabilities
+ * All counters should have the same capabilities on
+ * Interrupt on Overflow Support and Counter Width
+ */
+ for (i = 0; i < iommu_pmu->num_cntr; i++) {
+ cap = dmar_readl(iommu_pmu->cfg_reg +
+ i * IOMMU_PMU_CFG_OFFSET +
+ IOMMU_PMU_CFG_CNTRCAP_OFFSET);
+ if (!iommu_cntrcap_pcc(cap))
+ continue;
+ if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) ||
+ !iommu_cntrcap_ios(cap))
+ iommu_pmu->num_cntr = i;
+
+ /* Clear the pre-defined events group */
+ for (j = 0; j < iommu_pmu->num_eg; j++)
+ iommu_pmu->cntr_evcap[i][j] = 0;
+
+ /* Override with per-counter event capabilities */
+ for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) {
+ cap = dmar_readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET +
+ IOMMU_PMU_CFG_CNTREVCAP_OFFSET +
+ (j * IOMMU_PMU_OFF_REGS_STEP));
+ iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] = iommu_event_select(cap);
+ /*
+ * Some events may only be supported by a specific counter.
+ * Track them in the evcap as well.
+ */
+ iommu_pmu->evcap[iommu_event_group(cap)] |= iommu_event_select(cap);
+ }
+ }
+
+ iommu_pmu->iommu = iommu;
+ iommu->pmu = iommu_pmu;
+
+ return 0;
+
+free_pmu_cntr_evcap:
+ for (i = 0; i < iommu_pmu->num_cntr; i++)
+ kfree(iommu_pmu->cntr_evcap[i]);
+ kfree(iommu_pmu->cntr_evcap);
+free_pmu_evcap:
+ kfree(iommu_pmu->evcap);
+free_pmu:
+ kfree(iommu_pmu);
+
+ return ret;
+}
+
+void free_iommu_pmu(struct intel_iommu *iommu)
+{
+ struct iommu_pmu *iommu_pmu = iommu->pmu;
+
+ if (!iommu_pmu)
+ return;
+
+ if (iommu_pmu->evcap) {
+ int i;
+
+ for (i = 0; i < iommu_pmu->num_cntr; i++)
+ kfree(iommu_pmu->cntr_evcap[i]);
+ kfree(iommu_pmu->cntr_evcap);
+ }
+ kfree(iommu_pmu->evcap);
+ kfree(iommu_pmu);
+ iommu->pmu = NULL;
+}
diff --git a/drivers/iommu/intel/perfmon.h b/drivers/iommu/intel/perfmon.h
new file mode 100644
index 000000000000..8587c80501cd
--- /dev/null
+++ b/drivers/iommu/intel/perfmon.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * PERFCFGOFF_REG, PERFFRZOFF_REG
+ * PERFOVFOFF_REG, PERFCNTROFF_REG
+ */
+#define IOMMU_PMU_NUM_OFF_REGS 4
+#define IOMMU_PMU_OFF_REGS_STEP 4
+
+#define IOMMU_PMU_CFG_OFFSET 0x100
+#define IOMMU_PMU_CFG_CNTRCAP_OFFSET 0x80
+#define IOMMU_PMU_CFG_CNTREVCAP_OFFSET 0x84
+#define IOMMU_PMU_CFG_SIZE 0x8
+#define IOMMU_PMU_CFG_FILTERS_OFFSET 0x4
+
+
+#define IOMMU_PMU_CAP_REGS_STEP 8
+
+#define iommu_cntrcap_pcc(p) ((p) & 0x1)
+#define iommu_cntrcap_cw(p) ((p >> 8) & 0xff)
+#define iommu_cntrcap_ios(p) ((p >> 16) & 0x1)
+#define iommu_cntrcap_egcnt(p) ((p >> 28) & 0xf)
+
+#define iommu_event_select(p) ((p) & 0xfffffff)
+#define iommu_event_group(p) ((p >> 28) & 0xf)
+
+#ifdef CONFIG_INTEL_IOMMU_PERF_EVENTS
+int alloc_iommu_pmu(struct intel_iommu *iommu);
+void free_iommu_pmu(struct intel_iommu *iommu);
+#else
+static inline int
+alloc_iommu_pmu(struct intel_iommu *iommu)
+{
+ return 0;
+}
+
+static inline void
+free_iommu_pmu(struct intel_iommu *iommu)
+{
+}
+#endif /* CONFIG_INTEL_IOMMU_PERF_EVENTS */
--
2.35.1

2023-01-11 20:33:25

by Liang, Kan

Subject: [PATCH 3/7] iommu/vt-d: Support Enhanced Command Interface

From: Kan Liang <[email protected]>

The Enhanced Command Register is used to submit enhanced commands and
their operands to the DMA remapping hardware. Up to 256 enhanced
commands are supported.

A HW register indicates the availability of all 256 enhanced commands,
one bit per command, but there is no existing interface to read/write
all 256 bits at once. Introduce u64 ecmdcap[4] to store the availability
of each enhanced command, and read the register 4 times in map_iommu()
to populate it.

Add a helper to submit an enhanced command and wait for the hardware to
complete it.

Add a helper to check for the PMU-essential enhanced commands.

The helpers will be used later.
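
As an illustration only (this usage appears in a later patch), enabling
counter 0 with the new helper looks like:

        int ret = ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, 0, false, 0);

        if (ret)
                /* < 0: software error; > 0: status code from Table 48 */
                pr_warn("enhanced command failed: %d\n", ret);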

Signed-off-by: Kan Liang <[email protected]>
---
drivers/iommu/intel/dmar.c | 63 +++++++++++++++++++++++++++++++++++++
drivers/iommu/intel/iommu.h | 46 +++++++++++++++++++++++++++
2 files changed, 109 insertions(+)

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 91bb48267df2..dcafa32875c1 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1025,6 +1025,16 @@ static int map_iommu(struct intel_iommu *iommu, struct dmar_drhd_unit *drhd)
goto release;
}
}
+
+ if (cap_ecmds(iommu->cap)) {
+ int i;
+
+ for (i = 0; i < DMA_MAX_NUM_ECMDCAP; i++) {
+ iommu->ecmdcap[i] = dmar_readq(iommu->reg + DMAR_ECCAP_REG +
+ i * DMA_ECMD_REG_STEP);
+ }
+ }
+
err = 0;
goto out;

@@ -2434,3 +2444,56 @@ bool dmar_platform_optin(void)
return ret;
}
EXPORT_SYMBOL_GPL(dmar_platform_optin);
+
+#ifdef CONFIG_INTEL_IOMMU
+#define ecmd_get_status_code(res) ((res & 0xff) >> 1)
+
+/*
+ * Function to submit a command to the enhanced command interface. The
+ * valid enhanced command descriptions are defined in Table 47 of the
+ * VT-d spec. The VT-d hardware implementation may support some but not
+ * all commands, which can be determined by checking the Enhanced
+ * Command Capability Register.
+ *
+ * Return values:
+ * - 0: Command successful without any error;
+ * - Negative: software error value;
+ * - Nonzero positive: failure status code defined in Table 48.
+ */
+int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
+ u64 oa, bool has_ob, u64 ob)
+{
+ unsigned long flags;
+ u64 res;
+ int ret;
+
+ if (!cap_ecmds(iommu->cap))
+ return -ENODEV;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
+ if (res & DMA_ECMD_ECRSP_IP) {
+ ret = -EBUSY;
+ goto err;
+ }
+
+ if (has_ob)
+ dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
+ dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
+
+ IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
+ !(res & DMA_ECMD_ECRSP_IP), res);
+
+ if (res & DMA_ECMD_ECRSP_IP) {
+ ret = -ETIMEDOUT;
+ goto err;
+ }
+
+ ret = ecmd_get_status_code(res);
+err:
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ return ret;
+}
+#endif /* CONFIG_INTEL_IOMMU */
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 5bcefbea55c9..f227107434ac 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -130,6 +130,10 @@
#define DMAR_PERFOVFOFF_REG 0x318
#define DMAR_PERFCNTROFF_REG 0x31c
#define DMAR_PERFEVNTCAP_REG 0x380
+#define DMAR_ECMD_REG 0x400
+#define DMAR_ECEO_REG 0x408
+#define DMAR_ECRSP_REG 0x410
+#define DMAR_ECCAP_REG 0x430
#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */
#define DMAR_VCMD_REG 0xe00 /* Virtual command register */
#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */
@@ -304,6 +308,26 @@
#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))

+/* ECMD_REG */
+#define DMA_MAX_NUM_ECMD 256
+#define DMA_MAX_NUM_ECMDCAP (DMA_MAX_NUM_ECMD / 64)
+#define DMA_ECMD_REG_STEP 8
+#define DMA_ECMD_ENABLE 0xf0
+#define DMA_ECMD_DISABLE 0xf1
+#define DMA_ECMD_FREEZE 0xf4
+#define DMA_ECMD_UNFREEZE 0xf5
+#define DMA_ECMD_OA_SHIFT 16
+#define DMA_ECMD_ECRSP_IP 0x1
+#define DMA_ECMD_ECCAP3 3
+#define DMA_ECMD_ECCAP3_ECNTS (1ULL << 48)
+#define DMA_ECMD_ECCAP3_DCNTS (1ULL << 49)
+#define DMA_ECMD_ECCAP3_FCNTS (1ULL << 52)
+#define DMA_ECMD_ECCAP3_UFCNTS (1ULL << 53)
+#define DMA_ECMD_ECCAP3_ESSENTIAL (DMA_ECMD_ECCAP3_ECNTS | \
+ DMA_ECMD_ECCAP3_DCNTS | \
+ DMA_ECMD_ECCAP3_FCNTS | \
+ DMA_ECMD_ECCAP3_UFCNTS)
+
/* FECTL_REG */
#define DMA_FECTL_IM (((u32)1) << 31)

@@ -600,6 +624,7 @@ struct intel_iommu {
u64 cap;
u64 ecap;
u64 vccap;
+ u64 ecmdcap[DMA_MAX_NUM_ECMDCAP];
u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
raw_spinlock_t register_lock; /* protect register handling */
int seq_id; /* sequence id of the iommu */
@@ -841,6 +866,15 @@ extern const struct iommu_ops intel_iommu_ops;
extern int intel_iommu_sm;
extern int iommu_calculate_agaw(struct intel_iommu *iommu);
extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
+extern int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
+ u64 oa, bool has_ob, u64 ob);
+
+static inline bool ecmd_has_pmu_essential(struct intel_iommu *iommu)
+{
+ return (iommu->ecmdcap[DMA_ECMD_ECCAP3] & DMA_ECMD_ECCAP3_ESSENTIAL) ==
+ DMA_ECMD_ECCAP3_ESSENTIAL;
+}
+
extern int dmar_disabled;
extern int intel_iommu_enabled;
#else
@@ -852,6 +886,18 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
return 0;
}
+
+static inline int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
+ u64 oa, bool has_ob, u64 ob)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool ecmd_has_pmu_essential(struct intel_iommu *iommu)
+{
+ return false;
+}
+
#define dmar_disabled (1)
#define intel_iommu_enabled (0)
#define intel_iommu_sm (0)
--
2.35.1

2023-01-11 20:33:42

by Liang, Kan

Subject: [PATCH 4/7] iommu/vt-d: Add IOMMU perfmon support

From: Kan Liang <[email protected]>

Implement the IOMMU performance monitor capability, which supports the
collection of information about key events occurring during operation of
the remapping hardware, to aid performance tuning and debug.

The IOMMU perfmon support is implemented as part of the IOMMU driver and
interfaces with the Linux perf subsystem.

The IOMMU PMU has the following unique features compared with the other
PMUs:
- It supports counting only; sampling is not supported.
- It does not support per-thread counting; the scope is system-wide.
- It provides per-counter capability registers, so event constraints
can be enumerated.
- The available events and event groups can also be enumerated.
- Extra Enhanced Commands are introduced to control the counters.

Add a new field, struct iommu_pmu *pmu, to struct intel_iommu to track
the PMU-related information.

Add iommu_pmu_register() and iommu_pmu_unregister() to register and
unregister an IOMMU PMU. The register function sets up the IOMMU PMU ops
and invokes the standard perf_pmu_register() interface to register the
PMU with the perf subsystem. This patch only exposes the functions; the
following patch will enable them in the IOMMU driver.
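
For illustration only (the actual wiring is done by the last patch of
the series), the intended usage from the IOMMU driver is simply:

        /* once the IOMMU device is set up */
        iommu_pmu_register(iommu);

        /* ... and on teardown */
        iommu_pmu_unregister(iommu);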

The IOMMU PMUs can be found under /sys/bus/event_source/devices/dmar*

The available filters and event format can be found in the format folder:
$ ls /sys/bus/event_source/devices/dmar0/format/
event event_group filter_ats filter_page_table

The supported events can be found in the events folder:

$ ls /sys/bus/event_source/devices/dmar0/events/
ats_blocked        int_cache_hit_nonposted  iommu_mrds       pasid_cache_lookup
ctxt_cache_hit     int_cache_hit_posted     iommu_requests   pg_req_posted
ctxt_cache_lookup  int_cache_lookup         iotlb_hit        pw_occupancy
fs_nonleaf_hit     iommu_clocks             iotlb_lookup     ss_nonleaf_hit
fs_nonleaf_lookup  iommu_mem_blocked        pasid_cache_hit  ss_nonleaf_lookup

The command below illustrates filter usage with a simple example.

$ perf stat -e dmar0/iommu_requests,filter_ats=0/ -a sleep 1

Performance counter stats for 'system wide':

2135 dmar0/iommu_requests,filter_ats=0/

1.001087695 seconds time elapsed
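
Multiple events can be counted at the same time, as long as there are
enough counters to schedule them. For instance (illustrative):

$ perf stat -e dmar0/iommu_clocks/,dmar0/iommu_requests/ -a sleep 1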

Signed-off-by: Kan Liang <[email protected]>
---
.../sysfs-bus-event_source-devices-iommu | 24 +
drivers/iommu/intel/iommu.h | 15 +
drivers/iommu/intel/perfmon.c | 496 ++++++++++++++++++
drivers/iommu/intel/perfmon.h | 24 +
4 files changed, 559 insertions(+)
create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
new file mode 100644
index 000000000000..04e08851d8e6
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
@@ -0,0 +1,24 @@
+What: /sys/bus/event_source/devices/dmar*/format
+Date: Jan 2023
+KernelVersion: 6.3
+Contact: Kan Liang <[email protected]>
+Description: Read-only. Attribute group to describe the magic bits
+ that go into perf_event_attr.config,
+ perf_event_attr.config1 or perf_event_attr.config2 for
+ the IOMMU pmu. (See also
+ ABI/testing/sysfs-bus-event_source-devices-format).
+
+ Each attribute in this group defines a bit range in
+ perf_event_attr.config, perf_event_attr.config1,
+ or perf_event_attr.config2. All supported attributes
+ are listed below (See the VT-d Spec 4.0 for possible
+ attribute values)::
+
+ event = "config:0-27" - event ID
+ event_group = "config:28-31" - event group ID
+
+ filter_requester_id = "config1:0-15" - Requester ID filter
+ filter_domain = "config1:16-31" - Domain ID filter
+ filter_pasid = "config1:32-53" - PASID filter
+ filter_ats = "config2:0-4" - Address Type filter
+ filter_page_table = "config2:8-12" - Page Table Level filter
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index f227107434ac..bbc5dda903e9 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -22,6 +22,7 @@
#include <linux/ioasid.h>
#include <linux/bitfield.h>
#include <linux/xarray.h>
+#include <linux/perf_event.h>

#include <asm/cacheflush.h>
#include <asm/iommu.h>
@@ -601,6 +602,16 @@ struct dmar_domain {
iommu core */
};

+/*
+ * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
+ * But in practice, there are only 14 counters for the existing
+ * platform. Setting the max number of counters to 64 should be good
+ * enough for a long time. Also, supporting more than 64 counters
+ * requires more extras, e.g., extra freeze and overflow registers,
+ * which is not necessary for now.
+ */
+#define IOMMU_PMU_IDX_MAX 64
+
struct iommu_pmu {
struct intel_iommu *iommu;
u32 num_cntr; /* Number of counters */
@@ -615,6 +626,10 @@ struct iommu_pmu {

u64 *evcap; /* Indicates all supported events */
u32 **cntr_evcap; /* Supported events of each counter. */
+
+ struct pmu pmu;
+ DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
+ struct perf_event *event_list[IOMMU_PMU_IDX_MAX];
};

struct intel_iommu {
diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
index bc090f329c32..43a5075eaecd 100644
--- a/drivers/iommu/intel/perfmon.c
+++ b/drivers/iommu/intel/perfmon.c
@@ -8,6 +8,475 @@
#include "iommu.h"
#include "perfmon.h"

+PMU_FORMAT_ATTR(event, "config:0-27"); /* ES: Events Select */
+PMU_FORMAT_ATTR(event_group, "config:28-31"); /* EGI: Event Group Index */
+
+static struct attribute *iommu_pmu_format_attrs[] = {
+ &format_attr_event_group.attr,
+ &format_attr_event.attr,
+ NULL
+};
+
+static struct attribute_group iommu_pmu_format_attr_group = {
+ .name = "format",
+ .attrs = iommu_pmu_format_attrs,
+};
+
+/* The available events are added in attr_update later */
+static struct attribute *attrs_empty[] = {
+ NULL
+};
+
+static struct attribute_group iommu_pmu_events_attr_group = {
+ .name = "events",
+ .attrs = attrs_empty,
+};
+
+static const struct attribute_group *iommu_pmu_attr_groups[] = {
+ &iommu_pmu_format_attr_group,
+ &iommu_pmu_events_attr_group,
+ NULL
+};
+
+static inline struct iommu_pmu *dev_to_iommu_pmu(struct device *dev)
+{
+ /*
+ * The perf_event creates its own dev for each PMU.
+ * See pmu_dev_alloc()
+ */
+ return container_of(dev_get_drvdata(dev), struct iommu_pmu, pmu);
+}
+
+#define IOMMU_PMU_ATTR(_name, _format, _filter) \
+ PMU_FORMAT_ATTR(_name, _format); \
+ \
+static struct attribute *_name##_attr[] = { \
+ &format_attr_##_name.attr, \
+ NULL \
+}; \
+ \
+static umode_t \
+_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i) \
+{ \
+ struct device *dev = kobj_to_dev(kobj); \
+ struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev); \
+ \
+ if (!iommu_pmu) \
+ return 0; \
+ return (iommu_pmu->filter & _filter) ? attr->mode : 0; \
+} \
+ \
+static struct attribute_group _name = { \
+ .name = "format", \
+ .attrs = _name##_attr, \
+ .is_visible = _name##_is_visible, \
+};
+
+IOMMU_PMU_ATTR(filter_requester_id, "config1:0-15", IOMMU_PMU_FILTER_REQUESTER_ID);
+IOMMU_PMU_ATTR(filter_domain, "config1:16-31", IOMMU_PMU_FILTER_DOMAIN);
+IOMMU_PMU_ATTR(filter_pasid, "config1:32-53", IOMMU_PMU_FILTER_PASID);
+IOMMU_PMU_ATTR(filter_ats, "config2:0-4", IOMMU_PMU_FILTER_ATS);
+IOMMU_PMU_ATTR(filter_page_table, "config2:8-12", IOMMU_PMU_FILTER_PAGE_TABLE);
+
+#define iommu_pmu_get_requester_id(filter) ((filter) & 0xffff)
+#define iommu_pmu_get_domain(filter) (((filter) >> 16) & 0xffff)
+#define iommu_pmu_get_pasid(filter) (((filter) >> 32) & 0x3fffff)
+#define iommu_pmu_get_ats(filter) ((filter) & 0x1f)
+#define iommu_pmu_get_page_table(filter) (((filter) >> 8) & 0x1f)
+
+#define iommu_pmu_set_filter(_name, _config, _filter, _idx) \
+{ \
+ if ((iommu_pmu->filter & _filter) && iommu_pmu_get_##_name(_config)) { \
+ dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \
+ IOMMU_PMU_CFG_SIZE + \
+ (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \
+ iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN);\
+ } \
+}
+
+#define iommu_pmu_clear_filter(_filter, _idx) \
+{ \
+ if (iommu_pmu->filter & _filter) { \
+ dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \
+ IOMMU_PMU_CFG_SIZE + \
+ (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \
+ 0); \
+ } \
+}
+
+/*
+ * Define the event attr related functions
+ * Input: _name: event attr name
+ * _string: string of the event in sysfs
+ * _g_idx: event group encoding
+ * _event: event encoding
+ */
+#define IOMMU_PMU_EVENT_ATTR(_name, _string, _g_idx, _event) \
+ PMU_EVENT_ATTR_STRING(_name, event_attr_##_name, _string) \
+ \
+static struct attribute *_name##_attr[] = { \
+ &event_attr_##_name.attr.attr, \
+ NULL \
+}; \
+ \
+static umode_t \
+_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i) \
+{ \
+ struct device *dev = kobj_to_dev(kobj); \
+ struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev); \
+ \
+ if (!iommu_pmu) \
+ return 0; \
+ return (iommu_pmu->evcap[_g_idx] & _event) ? attr->mode : 0; \
+} \
+ \
+static struct attribute_group _name = { \
+ .name = "events", \
+ .attrs = _name##_attr, \
+ .is_visible = _name##_is_visible, \
+};
+
+IOMMU_PMU_EVENT_ATTR(iommu_clocks, "event_group=0x0,event=0x001", 0x0, 0x001)
+IOMMU_PMU_EVENT_ATTR(iommu_requests, "event_group=0x0,event=0x002", 0x0, 0x002)
+IOMMU_PMU_EVENT_ATTR(pw_occupancy, "event_group=0x0,event=0x004", 0x0, 0x004)
+IOMMU_PMU_EVENT_ATTR(ats_blocked, "event_group=0x0,event=0x008", 0x0, 0x008)
+IOMMU_PMU_EVENT_ATTR(iommu_mrds, "event_group=0x1,event=0x001", 0x1, 0x001)
+IOMMU_PMU_EVENT_ATTR(iommu_mem_blocked, "event_group=0x1,event=0x020", 0x1, 0x020)
+IOMMU_PMU_EVENT_ATTR(pg_req_posted, "event_group=0x1,event=0x040", 0x1, 0x040)
+IOMMU_PMU_EVENT_ATTR(ctxt_cache_lookup, "event_group=0x2,event=0x001", 0x2, 0x001)
+IOMMU_PMU_EVENT_ATTR(ctxt_cache_hit, "event_group=0x2,event=0x002", 0x2, 0x002)
+IOMMU_PMU_EVENT_ATTR(pasid_cache_lookup, "event_group=0x2,event=0x004", 0x2, 0x004)
+IOMMU_PMU_EVENT_ATTR(pasid_cache_hit, "event_group=0x2,event=0x008", 0x2, 0x008)
+IOMMU_PMU_EVENT_ATTR(ss_nonleaf_lookup, "event_group=0x2,event=0x010", 0x2, 0x010)
+IOMMU_PMU_EVENT_ATTR(ss_nonleaf_hit, "event_group=0x2,event=0x020", 0x2, 0x020)
+IOMMU_PMU_EVENT_ATTR(fs_nonleaf_lookup, "event_group=0x2,event=0x040", 0x2, 0x040)
+IOMMU_PMU_EVENT_ATTR(fs_nonleaf_hit, "event_group=0x2,event=0x080", 0x2, 0x080)
+IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_lookup, "event_group=0x2,event=0x100", 0x2, 0x100)
+IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_hit, "event_group=0x2,event=0x200", 0x2, 0x200)
+IOMMU_PMU_EVENT_ATTR(iotlb_lookup, "event_group=0x3,event=0x001", 0x3, 0x001)
+IOMMU_PMU_EVENT_ATTR(iotlb_hit, "event_group=0x3,event=0x002", 0x3, 0x002)
+IOMMU_PMU_EVENT_ATTR(hpt_leaf_lookup, "event_group=0x3,event=0x004", 0x3, 0x004)
+IOMMU_PMU_EVENT_ATTR(hpt_leaf_hit, "event_group=0x3,event=0x008", 0x3, 0x008)
+IOMMU_PMU_EVENT_ATTR(int_cache_lookup, "event_group=0x4,event=0x001", 0x4, 0x001)
+IOMMU_PMU_EVENT_ATTR(int_cache_hit_nonposted, "event_group=0x4,event=0x002", 0x4, 0x002)
+IOMMU_PMU_EVENT_ATTR(int_cache_hit_posted, "event_group=0x4,event=0x004", 0x4, 0x004)
+
+
+static const struct attribute_group *iommu_pmu_attr_update[] = {
+ &filter_requester_id,
+ &filter_domain,
+ &filter_pasid,
+ &filter_ats,
+ &filter_page_table,
+ &iommu_clocks,
+ &iommu_requests,
+ &pw_occupancy,
+ &ats_blocked,
+ &iommu_mrds,
+ &iommu_mem_blocked,
+ &pg_req_posted,
+ &ctxt_cache_lookup,
+ &ctxt_cache_hit,
+ &pasid_cache_lookup,
+ &pasid_cache_hit,
+ &ss_nonleaf_lookup,
+ &ss_nonleaf_hit,
+ &fs_nonleaf_lookup,
+ &fs_nonleaf_hit,
+ &hpt_nonleaf_lookup,
+ &hpt_nonleaf_hit,
+ &iotlb_lookup,
+ &iotlb_hit,
+ &hpt_leaf_lookup,
+ &hpt_leaf_hit,
+ &int_cache_lookup,
+ &int_cache_hit_nonposted,
+ &int_cache_hit_posted,
+ NULL
+};
+
+static inline void __iomem *
+iommu_event_base(struct iommu_pmu *iommu_pmu, int idx)
+{
+ return iommu_pmu->cntr_reg + idx * iommu_pmu->cntr_stride;
+}
+
+static inline void __iomem *
+iommu_config_base(struct iommu_pmu *iommu_pmu, int idx)
+{
+ return iommu_pmu->cfg_reg + idx * IOMMU_PMU_CFG_OFFSET;
+}
+
+static inline struct iommu_pmu *iommu_event_to_pmu(struct perf_event *event)
+{
+ return container_of(event->pmu, struct iommu_pmu, pmu);
+}
+
+static inline u64 iommu_event_config(struct perf_event *event)
+{
+ u64 config = event->attr.config;
+
+ return (iommu_event_select(config) << IOMMU_EVENT_CFG_ES_SHIFT) |
+ (iommu_event_group(config) << IOMMU_EVENT_CFG_EGI_SHIFT) |
+ IOMMU_EVENT_CFG_INT;
+}
+
+static inline bool is_iommu_pmu_event(struct iommu_pmu *iommu_pmu,
+ struct perf_event *event)
+{
+ return event->pmu == &iommu_pmu->pmu;
+}
+
+static int iommu_pmu_validate_event(struct perf_event *event)
+{
+ struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
+ u32 event_group = iommu_event_group(event->attr.config);
+
+ if (event_group >= iommu_pmu->num_eg)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int iommu_pmu_validate_group(struct perf_event *event)
+{
+ struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
+ struct perf_event *sibling;
+ int nr = 0;
+
+ /*
+ * All events in a group must be scheduled simultaneously.
+ * Check whether there is enough counters for all the events.
+ */
+ for_each_sibling_event(sibling, event->group_leader) {
+ if (!is_iommu_pmu_event(iommu_pmu, sibling) ||
+ sibling->state <= PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (++nr > iommu_pmu->num_cntr)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int iommu_pmu_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* sampling not supported */
+ if (event->attr.sample_period)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ if (iommu_pmu_validate_event(event))
+ return -EINVAL;
+
+ hwc->config = iommu_event_config(event);
+
+ return iommu_pmu_validate_group(event);
+}
+
+static void iommu_pmu_event_update(struct perf_event *event)
+{
+ struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_count, new_count, delta;
+ int shift = 64 - iommu_pmu->cntr_width;
+
+again:
+ prev_count = local64_read(&hwc->prev_count);
+ new_count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
+ if (local64_xchg(&hwc->prev_count, new_count) != prev_count)
+ goto again;
+
+ /*
+ * The counter width is enumerated.
+ * Always shift the counter before using it.
+ */
+ delta = (new_count << shift) - (prev_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+}
+
+static void iommu_pmu_start(struct perf_event *event, int flags)
+{
+ struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
+ struct intel_iommu *iommu = iommu_pmu->iommu;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ if (WARN_ON_ONCE(hwc->idx < 0 || hwc->idx >= IOMMU_PMU_IDX_MAX))
+ return;
+
+ if (flags & PERF_EF_RELOAD)
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+ hwc->state = 0;
+
+ /* Always reprogram the period */
+ count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
+ local64_set((&hwc->prev_count), count);
+
+ ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, false, 0);
+
+ perf_event_update_userpage(event);
+}
+
+static void iommu_pmu_stop(struct perf_event *event, int flags)
+{
+ struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
+ struct intel_iommu *iommu = iommu_pmu->iommu;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ ecmd_submit_sync(iommu, DMA_ECMD_DISABLE, hwc->idx, false, 0);
+
+ iommu_pmu_event_update(event);
+
+ hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+ }
+}
+
+static inline int
+iommu_pmu_validate_per_cntr_event(struct iommu_pmu *iommu_pmu,
+ int idx, struct perf_event *event)
+{
+ u32 event_group = iommu_event_group(event->attr.config);
+ u32 select = iommu_event_select(event->attr.config);
+
+ if (!(iommu_pmu->cntr_evcap[idx][event_group] & select))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int idx;
+
+ /*
+ * The counters which support limited events are usually at the end.
+ * Schedule them first to accommodate more events.
+ */
+ for (idx = iommu_pmu->num_cntr - 1; idx >= 0; idx--) {
+ if (test_and_set_bit(idx, iommu_pmu->used_mask))
+ continue;
+ /* Check per-counter event capabilities */
+ if (!iommu_pmu_validate_per_cntr_event(iommu_pmu, idx, event))
+ break;
+ clear_bit(idx, iommu_pmu->used_mask);
+ }
+ if (idx < 0)
+ return -EINVAL;
+
+ iommu_pmu->event_list[idx] = event;
+ hwc->idx = idx;
+
+ /* config events */
+ dmar_writeq(iommu_config_base(iommu_pmu, idx), hwc->config);
+
+ iommu_pmu_set_filter(requester_id, event->attr.config1,
+ IOMMU_PMU_FILTER_REQUESTER_ID, idx);
+ iommu_pmu_set_filter(domain, event->attr.config1,
+ IOMMU_PMU_FILTER_DOMAIN, idx);
+ iommu_pmu_set_filter(pasid, event->attr.config1,
+ IOMMU_PMU_FILTER_PASID, idx);
+ iommu_pmu_set_filter(ats, event->attr.config2,
+ IOMMU_PMU_FILTER_ATS, idx);
+ iommu_pmu_set_filter(page_table, event->attr.config2,
+ IOMMU_PMU_FILTER_PAGE_TABLE, idx);
+
+ return 0;
+}
+
+static int iommu_pmu_add(struct perf_event *event, int flags)
+{
+ struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret;
+
+ ret = iommu_pmu_assign_event(iommu_pmu, event);
+ if (ret < 0)
+ return ret;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (flags & PERF_EF_START)
+ iommu_pmu_start(event, 0);
+
+ return 0;
+}
+
+static void iommu_pmu_del(struct perf_event *event, int flags)
+{
+ struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
+ int idx = event->hw.idx;
+
+ iommu_pmu_stop(event, PERF_EF_UPDATE);
+
+ iommu_pmu_clear_filter(IOMMU_PMU_FILTER_REQUESTER_ID, idx);
+ iommu_pmu_clear_filter(IOMMU_PMU_FILTER_DOMAIN, idx);
+ iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PASID, idx);
+ iommu_pmu_clear_filter(IOMMU_PMU_FILTER_ATS, idx);
+ iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PAGE_TABLE, idx);
+
+ iommu_pmu->event_list[idx] = NULL;
+ event->hw.idx = -1;
+ clear_bit(idx, iommu_pmu->used_mask);
+
+ perf_event_update_userpage(event);
+}
+
+static void iommu_pmu_enable(struct pmu *pmu)
+{
+ struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
+ struct intel_iommu *iommu = iommu_pmu->iommu;
+
+ ecmd_submit_sync(iommu, DMA_ECMD_UNFREEZE, 0, false, 0);
+}
+
+static void iommu_pmu_disable(struct pmu *pmu)
+{
+ struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
+ struct intel_iommu *iommu = iommu_pmu->iommu;
+
+ ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
+}
+
+static int __iommu_pmu_register(struct intel_iommu *iommu)
+{
+ struct iommu_pmu *iommu_pmu = iommu->pmu;
+
+ iommu_pmu->pmu.name = iommu->name;
+ iommu_pmu->pmu.task_ctx_nr = perf_invalid_context;
+ iommu_pmu->pmu.event_init = iommu_pmu_event_init;
+ iommu_pmu->pmu.pmu_enable = iommu_pmu_enable;
+ iommu_pmu->pmu.pmu_disable = iommu_pmu_disable;
+ iommu_pmu->pmu.add = iommu_pmu_add;
+ iommu_pmu->pmu.del = iommu_pmu_del;
+ iommu_pmu->pmu.start = iommu_pmu_start;
+ iommu_pmu->pmu.stop = iommu_pmu_stop;
+ iommu_pmu->pmu.read = iommu_pmu_event_update;
+ iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups;
+ iommu_pmu->pmu.attr_update = iommu_pmu_attr_update;
+ iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+ iommu_pmu->pmu.module = THIS_MODULE;
+
+ return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
+}
+
static inline void __iomem *
get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
{
@@ -40,11 +509,21 @@ int alloc_iommu_pmu(struct intel_iommu *iommu)
if (!pcap_interrupt(perfcap))
return -ENODEV;

+ /* Check required Enhanced Command Capability */
+ if (!ecmd_has_pmu_essential(iommu))
+ return -ENODEV;
+
iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
if (!iommu_pmu)
return -ENOMEM;

iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
+ if (iommu_pmu->num_cntr > IOMMU_PMU_IDX_MAX) {
+ WARN_ONCE(1, "The number of IOMMU counters %d > max(%d), clipping!",
+ iommu_pmu->num_cntr, IOMMU_PMU_IDX_MAX);
+ iommu_pmu->num_cntr = IOMMU_PMU_IDX_MAX;
+ }
+
iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
iommu_pmu->filter = pcap_filters_mask(perfcap);
iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
@@ -157,3 +636,20 @@ void free_iommu_pmu(struct intel_iommu *iommu)
kfree(iommu_pmu);
iommu->pmu = NULL;
}
+
+void iommu_pmu_register(struct intel_iommu *iommu)
+{
+ if (!iommu->pmu)
+ return;
+
+ if (__iommu_pmu_register(iommu)) {
+ pr_err("Failed to register PMU for iommu (seq_id = %d)\n",
+ iommu->seq_id);
+ }
+}
+
+void iommu_pmu_unregister(struct intel_iommu *iommu)
+{
+ if (iommu->pmu)
+ perf_pmu_unregister(&iommu->pmu->pmu);
+}
diff --git a/drivers/iommu/intel/perfmon.h b/drivers/iommu/intel/perfmon.h
index 8587c80501cd..b60f0cad5bfd 100644
--- a/drivers/iommu/intel/perfmon.h
+++ b/drivers/iommu/intel/perfmon.h
@@ -7,6 +7,14 @@
#define IOMMU_PMU_NUM_OFF_REGS 4
#define IOMMU_PMU_OFF_REGS_STEP 4

+#define IOMMU_PMU_FILTER_REQUESTER_ID 0x01
+#define IOMMU_PMU_FILTER_DOMAIN 0x02
+#define IOMMU_PMU_FILTER_PASID 0x04
+#define IOMMU_PMU_FILTER_ATS 0x08
+#define IOMMU_PMU_FILTER_PAGE_TABLE 0x10
+
+#define IOMMU_PMU_FILTER_EN (1 << 31)
+
#define IOMMU_PMU_CFG_OFFSET 0x100
#define IOMMU_PMU_CFG_CNTRCAP_OFFSET 0x80
#define IOMMU_PMU_CFG_CNTREVCAP_OFFSET 0x84
@@ -21,12 +29,18 @@
#define iommu_cntrcap_ios(p) ((p >> 16) & 0x1)
#define iommu_cntrcap_egcnt(p) ((p >> 28) & 0xf)

+#define IOMMU_EVENT_CFG_EGI_SHIFT 8
+#define IOMMU_EVENT_CFG_ES_SHIFT 32
+#define IOMMU_EVENT_CFG_INT (1ULL << 1)
+
#define iommu_event_select(p) ((p) & 0xfffffff)
#define iommu_event_group(p) ((p >> 28) & 0xf)

#ifdef CONFIG_INTEL_IOMMU_PERF_EVENTS
int alloc_iommu_pmu(struct intel_iommu *iommu);
void free_iommu_pmu(struct intel_iommu *iommu);
+void iommu_pmu_register(struct intel_iommu *iommu);
+void iommu_pmu_unregister(struct intel_iommu *iommu);
#else
static inline int
alloc_iommu_pmu(struct intel_iommu *iommu)
@@ -38,4 +52,14 @@ static inline void
free_iommu_pmu(struct intel_iommu *iommu)
{
}
+
+static inline void
+iommu_pmu_register(struct intel_iommu *iommu)
+{
+}
+
+static inline void
+iommu_pmu_unregister(struct intel_iommu *iommu)
+{
+}
#endif /* CONFIG_INTEL_IOMMU_PERF_EVENTS */
--
2.35.1

2023-01-11 20:33:54

by Liang, Kan

Subject: [PATCH 6/7] iommu/vt-d: Add IOMMU perfmon overflow handler support

From: Kan Liang <[email protected]>

While a counter is enabled, an event occurrence that causes the counter
value to increment and roll over to or past zero is termed a counter
overflow. The overflow can trigger an interrupt, which the IOMMU perfmon
needs to handle properly.

A new HW IRQ is allocated for perfmon for each IOMMU device. The hwirq
IDs are placed after the SVM (page request) range.
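For example, an IOMMU with seq_id N uses hwirq ID
IOMMU_IRQ_ID_OFFSET_PRQ + N (= DMAR_UNITS_SUPPORTED + N) for its page
request queue and IOMMU_IRQ_ID_OFFSET_PERF + N (= 2 * DMAR_UNITS_SUPPORTED
+ N) for the perfmon overflow interrupt, so the two ranges cannot
collide.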

In the overflow handler, the counters are not frozen. It's very unlikely
that the same counter overflows again while the handler runs, but other
counters may overflow at the same time. Therefore, re-read the overflow
register at the end of the handler and check whether there are more
overflows to handle.

Signed-off-by: Kan Liang <[email protected]>
---
drivers/iommu/intel/dmar.c | 2 +
drivers/iommu/intel/iommu.h | 11 ++++-
drivers/iommu/intel/perfmon.c | 82 +++++++++++++++++++++++++++++++++++
drivers/iommu/intel/svm.c | 2 +-
4 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index dcafa32875c1..94e314320cd9 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1887,6 +1887,8 @@ static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
return DMAR_FECTL_REG;
else if (iommu->pr_irq == irq)
return DMAR_PECTL_REG;
+ else if (iommu->perf_irq == irq)
+ return DMAR_PERFINTRCTL_REG;
else
BUG();
}
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index bbc5dda903e9..f616e4f3d765 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -130,6 +130,8 @@
#define DMAR_PERFCFGOFF_REG 0x310
#define DMAR_PERFOVFOFF_REG 0x318
#define DMAR_PERFCNTROFF_REG 0x31c
+#define DMAR_PERFINTRSTS_REG 0x324
+#define DMAR_PERFINTRCTL_REG 0x328
#define DMAR_PERFEVNTCAP_REG 0x380
#define DMAR_ECMD_REG 0x400
#define DMAR_ECEO_REG 0x408
@@ -357,6 +359,9 @@

#define DMA_VCS_PAS ((u64)1)

+/* PERFINTRSTS_REG */
+#define DMA_PERFINTRSTS_PIS ((u32)1)
+
#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
do { \
cycles_t start_time = get_cycles(); \
@@ -630,8 +635,12 @@ struct iommu_pmu {
struct pmu pmu;
DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
struct perf_event *event_list[IOMMU_PMU_IDX_MAX];
+ unsigned char irq_name[16];
};

+#define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED)
+#define IOMMU_IRQ_ID_OFFSET_PERF (2 * DMAR_UNITS_SUPPORTED)
+
struct intel_iommu {
void __iomem *reg; /* Pointer to hardware regs, virtual addr */
u64 reg_phys; /* physical address of hw register set */
@@ -645,7 +654,7 @@ struct intel_iommu {
int seq_id; /* sequence id of the iommu */
int agaw; /* agaw of this iommu */
int msagaw; /* max sagaw of this iommu */
- unsigned int irq, pr_irq;
+ unsigned int irq, pr_irq, perf_irq;
u16 segment; /* PCI segment# */
unsigned char name[13]; /* Device Name */

diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
index f332232bb345..d0fbf784c827 100644
--- a/drivers/iommu/intel/perfmon.c
+++ b/drivers/iommu/intel/perfmon.c
@@ -476,6 +476,49 @@ static void iommu_pmu_disable(struct pmu *pmu)
ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
}

+static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
+{
+ struct perf_event *event;
+ int i, handled = 0;
+ u64 status;
+
+ /*
+ * Two counters may be overflowed very close.
+ * Always check whether there are more to handle.
+ */
+ while ((status = dmar_readq(iommu_pmu->overflow))) {
+ for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) {
+ handled++;
+
+ /*
+ * Find the assigned event of the counter.
+ * Accumulate the value into the event->count.
+ */
+ event = iommu_pmu->event_list[i];
+ if (WARN_ON_ONCE(!event))
+ continue;
+ iommu_pmu_event_update(event);
+ }
+
+ dmar_writeq(iommu_pmu->overflow, status);
+ }
+}
+
+static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
+{
+ struct intel_iommu *iommu = dev_id;
+
+ if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
+ return IRQ_NONE;
+
+ iommu_pmu_counter_overflow(iommu->pmu);
+
+ /* Clear the status bit */
+ dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);
+
+ return IRQ_HANDLED;
+}
+
static int __iommu_pmu_register(struct intel_iommu *iommu)
{
struct iommu_pmu *iommu_pmu = iommu->pmu;
@@ -658,6 +701,38 @@ void free_iommu_pmu(struct intel_iommu *iommu)
iommu->pmu = NULL;
}

+static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
+{
+ struct iommu_pmu *iommu_pmu = iommu->pmu;
+ int irq, ret;
+
+ irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu);
+ if (irq <= 0)
+ return -EINVAL;
+
+ snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id);
+
+ iommu->perf_irq = irq;
+ ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
+ IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
+ if (ret) {
+ dmar_free_hwirq(irq);
+ iommu->perf_irq = 0;
+ return ret;
+ }
+ return 0;
+}
+
+static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
+{
+ if (!iommu->perf_irq)
+ return;
+
+ free_irq(iommu->perf_irq, iommu);
+ dmar_free_hwirq(iommu->perf_irq);
+ iommu->perf_irq = 0;
+}
+
static int iommu_pmu_cpu_online(unsigned int cpu)
{
if (cpumask_empty(&iommu_pmu_cpu_mask))
@@ -734,8 +809,14 @@ void iommu_pmu_register(struct intel_iommu *iommu)
if (iommu_pmu_cpuhp_setup(iommu_pmu))
goto unregister;

+ /* Set interrupt for overflow */
+ if (iommu_pmu_set_interrupt(iommu))
+ goto cpuhp_free;
+
return;

+cpuhp_free:
+ iommu_pmu_cpuhp_free(iommu_pmu);
unregister:
perf_pmu_unregister(&iommu_pmu->pmu);
err:
@@ -749,6 +830,7 @@ void iommu_pmu_unregister(struct intel_iommu *iommu)
if (!iommu_pmu)
return;

+ iommu_pmu_unset_interrupt(iommu);
iommu_pmu_cpuhp_free(iommu_pmu);
perf_pmu_unregister(&iommu_pmu->pmu);
}
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index c76b66263467..b6c5edd80d5d 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -79,7 +79,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
}
iommu->prq = page_address(pages);

- irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
+ irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu);
if (irq <= 0) {
pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
iommu->name);
--
2.35.1

2023-01-11 20:34:12

by Liang, Kan

Subject: [PATCH 5/7] iommu/vt-d: Support cpumask for IOMMU perfmon

From: Kan Liang <[email protected]>

The perf subsystem assumes that all counters are per-CPU by default, so
the user-space tool reads a counter from each CPU. However, the IOMMU
counters are system-wide and can be read from any CPU. Use a CPU mask to
restrict counting to one CPU, with a CPU hotplug notifier to choose a
different CPU if the chosen one is taken offline.

The chosen CPU is exposed through
/sys/bus/event_source/devices/dmar*/cpumask for the user-space perf
tool.
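With the cpumask in place, the perf tool opens a system-wide event only
on the advertised CPU, so e.g. 'perf stat -a -e dmar0/iommu_clocks/'
creates a single event instance rather than one per CPU.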

Signed-off-by: Kan Liang <[email protected]>
---
.../sysfs-bus-event_source-devices-iommu | 8 ++
drivers/iommu/intel/perfmon.c | 113 ++++++++++++++++--
include/linux/cpuhotplug.h | 1 +
3 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
index 04e08851d8e6..3519954fe713 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
@@ -22,3 +22,11 @@ Description: Read-only. Attribute group to describe the magic bits
filter_pasid = "config1:32-53" - PASID filter
filter_ats = "config2:0-4" - Address Type filter
filter_page_table = "config2:8-12" - Page Table Level filter
+
+What: /sys/bus/event_source/devices/dmar*/cpumask
+Date: Jan 2023
+KernelVersion: 6.3
+Contact: Kan Liang <[email protected]>
+Description: Read-only. This file always returns the CPU to which the
+ IOMMU pmu is bound for access to all IOMMU pmu
+ performance monitoring events.
diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
index 43a5075eaecd..f332232bb345 100644
--- a/drivers/iommu/intel/perfmon.c
+++ b/drivers/iommu/intel/perfmon.c
@@ -32,9 +32,30 @@ static struct attribute_group iommu_pmu_events_attr_group = {
.attrs = attrs_empty,
};

+static cpumask_t iommu_pmu_cpu_mask;
+
+static ssize_t iommu_pmu_cpumask_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, iommu_pmu_cpumask_show, NULL);
+
+static struct attribute *iommu_pmu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL
+};
+
+static struct attribute_group iommu_pmu_cpumask_attr_group = {
+ .attrs = iommu_pmu_cpumask_attrs,
+};
+
static const struct attribute_group *iommu_pmu_attr_groups[] = {
&iommu_pmu_format_attr_group,
&iommu_pmu_events_attr_group,
+ &iommu_pmu_cpumask_attr_group,
NULL
};

@@ -637,19 +658,97 @@ void free_iommu_pmu(struct intel_iommu *iommu)
iommu->pmu = NULL;
}

+static int iommu_pmu_cpu_online(unsigned int cpu)
+{
+ if (cpumask_empty(&iommu_pmu_cpu_mask))
+ cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);
+
+ return 0;
+}
+
+static int iommu_pmu_cpu_offline(unsigned int cpu)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int target;
+
+ if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
+ return 0;
+
+ target = cpumask_any_but(cpu_online_mask, cpu);
+
+ if (target < nr_cpu_ids)
+ cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
+ else
+ target = -1;
+
+ rcu_read_lock();
+
+ for_each_iommu(iommu, drhd) {
+ if (!iommu->pmu)
+ continue;
+ perf_pmu_migrate_context(&iommu->pmu->pmu, cpu, target);
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int nr_iommu_pmu;
+
+static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
+{
+ int ret;
+
+ if (nr_iommu_pmu++)
+ return 0;
+
+ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE,
+ "driver/iommu/intel/perfmon:online",
+ iommu_pmu_cpu_online,
+ iommu_pmu_cpu_offline);
+ if (ret)
+ nr_iommu_pmu = 0;
+
+ return ret;
+}
+
+static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
+{
+ if (--nr_iommu_pmu)
+ return;
+
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE);
+}
+
void iommu_pmu_register(struct intel_iommu *iommu)
{
- if (!iommu->pmu)
+ struct iommu_pmu *iommu_pmu = iommu->pmu;
+
+ if (!iommu_pmu)
return;

- if (__iommu_pmu_register(iommu)) {
- pr_err("Failed to register PMU for iommu (seq_id = %d)\n",
- iommu->seq_id);
- }
+ if (__iommu_pmu_register(iommu))
+ goto err;
+
+ if (iommu_pmu_cpuhp_setup(iommu_pmu))
+ goto unregister;
+
+ return;
+
+unregister:
+ perf_pmu_unregister(&iommu_pmu->pmu);
+err:
+ pr_err("Failed to register PMU for iommu (seq_id = %d)\n", iommu->seq_id);
}

void iommu_pmu_unregister(struct intel_iommu *iommu)
{
- if (iommu->pmu)
- perf_pmu_unregister(&iommu->pmu->pmu);
+ struct iommu_pmu *iommu_pmu = iommu->pmu;
+
+ if (!iommu_pmu)
+ return;
+
+ iommu_pmu_cpuhp_free(iommu_pmu);
+ perf_pmu_unregister(&iommu_pmu->pmu);
}
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 6c6859bfc454..f2ea348ce3b0 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -221,6 +221,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_CQM_ONLINE,
CPUHP_AP_PERF_X86_CSTATE_ONLINE,
CPUHP_AP_PERF_X86_IDXD_ONLINE,
+ CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE,
CPUHP_AP_PERF_S390_CF_ONLINE,
CPUHP_AP_PERF_S390_SF_ONLINE,
CPUHP_AP_PERF_ARM_CCI_ONLINE,
--
2.35.1

2023-01-11 20:34:46

by Liang, Kan

[permalink] [raw]
Subject: [PATCH 7/7] iommu/vt-d: Enable IOMMU perfmon support

From: Kan Liang <[email protected]>

Register and enable an IOMMU perfmon for each active IOMMU device.

A failure to register the IOMMU perfmon doesn't impact the other
functionality of an IOMMU device.

Signed-off-by: Kan Liang <[email protected]>
---
drivers/iommu/intel/dmar.c | 3 +++
drivers/iommu/intel/iommu.c | 3 +++
2 files changed, 6 insertions(+)

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 94e314320cd9..5a33df9e19ce 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1152,6 +1152,8 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
err = iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
if (err)
goto err_sysfs;
+
+ iommu_pmu_register(iommu);
}

drhd->iommu = iommu;
@@ -1174,6 +1176,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
static void free_iommu(struct intel_iommu *iommu)
{
if (intel_iommu_enabled && !iommu->drhd->ignored) {
+ iommu_pmu_unregister(iommu);
iommu_device_unregister(&iommu->iommu);
iommu_device_sysfs_remove(&iommu->iommu);
}
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 59df7e42fd53..c57e60c5f353 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -30,6 +30,7 @@
#include "../iommu-sva.h"
#include "pasid.h"
#include "cap_audit.h"
+#include "perfmon.h"

#define ROOT_SIZE VTD_PAGE_SIZE
#define CONTEXT_SIZE VTD_PAGE_SIZE
@@ -4013,6 +4014,8 @@ int __init intel_iommu_init(void)
intel_iommu_groups,
"%s", iommu->name);
iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
+
+ iommu_pmu_register(iommu);
}
up_read(&dmar_global_lock);

--
2.35.1

2023-01-12 13:54:45

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH 1/7] iommu/vt-d: Support size of the register set in DRHD

On 2023/1/12 4:24, [email protected] wrote:
> From: Kan Liang <[email protected]>
>
> A new field, which indicates the size of the remapping hardware register
> set for this remapping unit, is introduced in the DMA-remapping hardware
> unit definition (DRHD) structure with the VT-d Spec 4.0.
> With the information, SW doesn't need to 'guess' the size of the
> register set anymore.
>
> Update the struct acpi_dmar_hardware_unit to reflect the field.
>
> Store the size of the register set in struct dmar_drhd_unit for each
> dmar device.
>
> The 'size' information is ResvZ for the old BIOS and platforms. Fall
> back to the old guessing method. There is nothing changed.
>
> Signed-off-by: Kan Liang <[email protected]>
> ---
> drivers/iommu/intel/dmar.c | 25 ++++++++++++++++++-------
> include/acpi/actbl1.h | 2 +-
> include/linux/dmar.h | 1 +
> 3 files changed, 20 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
> index b00a0ceb2d13..6a411d964474 100644
> --- a/drivers/iommu/intel/dmar.c
> +++ b/drivers/iommu/intel/dmar.c
> @@ -399,6 +399,13 @@ dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
> return NULL;
> }
>
> +/* The size of the register set is 2 ^ N 4 KB pages. */
> +static unsigned long
> +dmar_get_drhd_reg_size(u8 npages)

No need to divide it into two lines. Or put this line of code directly
where it is called?

> +{
> + return 1UL << (npages + 12);
> +}
> +
> /*
> * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
> * structure which uniquely represent one DMA remapping hardware unit
> @@ -427,6 +434,7 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
> memcpy(dmaru->hdr, header, header->length);
> dmaru->reg_base_addr = drhd->address;
> dmaru->segment = drhd->segment;
> + dmaru->reg_size = dmar_get_drhd_reg_size(drhd->size);
> dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
> dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
> ((void *)drhd) + drhd->header.length,
> @@ -880,6 +888,7 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
> struct acpi_dmar_hardware_unit *drhd;
> void __iomem *addr;
> u64 cap, ecap;
> + unsigned long size;
>
> drhd = (void *)entry;
> if (!drhd->address) {
> @@ -887,10 +896,11 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
> return -EINVAL;
> }
>
> + size = dmar_get_drhd_reg_size(drhd->size);
> if (arg)
> - addr = ioremap(drhd->address, VTD_PAGE_SIZE);
> + addr = ioremap(drhd->address, size);
> else
> - addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
> + addr = early_ioremap(drhd->address, size);
> if (!addr) {
> pr_warn("Can't validate DRHD address: %llx\n", drhd->address);
> return -EINVAL;
> @@ -902,7 +912,7 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg)
> if (arg)
> iounmap(addr);
> else
> - early_iounmap(addr, VTD_PAGE_SIZE);
> + early_iounmap(addr, size);
>
> if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
> warn_invalid_dmar(drhd->address, " returns all ones");

The cap and ecap registers are always in the first page, so mapping one
4K page is enough. There is no need to change it?

> @@ -956,17 +966,18 @@ static void unmap_iommu(struct intel_iommu *iommu)
> /**
> * map_iommu: map the iommu's registers
> * @iommu: the iommu to map
> - * @phys_addr: the physical address of the base resgister
> + * @drhd: DMA remapping hardware definition structure
> *
> * Memory map the iommu's registers. Start w/ a single page, and
> * possibly expand if that turns out to be insufficent.
> */
> -static int map_iommu(struct intel_iommu *iommu, u64 phys_addr)
> +static int map_iommu(struct intel_iommu *iommu, struct dmar_drhd_unit *drhd)
> {
> + u64 phys_addr = drhd->reg_base_addr;
> int map_size, err=0;
>
> iommu->reg_phys = phys_addr;
> - iommu->reg_size = VTD_PAGE_SIZE;
> + iommu->reg_size = drhd->reg_size;
>
> if (!request_mem_region(iommu->reg_phys, iommu->reg_size, iommu->name)) {
> pr_err("Can't reserve memory\n");
> @@ -1050,7 +1061,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
> }
> sprintf(iommu->name, "dmar%d", iommu->seq_id);
>
> - err = map_iommu(iommu, drhd->reg_base_addr);
> + err = map_iommu(iommu, drhd);
> if (err) {
> pr_err("Failed to map %s\n", iommu->name);
> goto error_free_seq_id;
> diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
> index 4175dce3967c..bdded0ac46eb 100644
> --- a/include/acpi/actbl1.h
> +++ b/include/acpi/actbl1.h
> @@ -802,7 +802,7 @@ struct acpi_dmar_pci_path {
> struct acpi_dmar_hardware_unit {
> struct acpi_dmar_header header;
> u8 flags;
> - u8 reserved;
> + u8 size; /* Size of the register set */
> u16 segment;
> u64 address; /* Register Base Address */
> };
> diff --git a/include/linux/dmar.h b/include/linux/dmar.h
> index d81a51978d01..51d498f0a02b 100644
> --- a/include/linux/dmar.h
> +++ b/include/linux/dmar.h
> @@ -46,6 +46,7 @@ struct dmar_drhd_unit {
> u8 include_all:1;
> u8 gfx_dedicated:1; /* graphic dedicated */
> struct intel_iommu *iommu;
> + unsigned long reg_size; /* size of register set */

Move it up and pair it with reg_base_addr.

> };
>
> struct dmar_pci_path {

--
Best regards,
baolu

2023-01-12 17:36:49

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH 1/7] iommu/vt-d: Support size of the register set in DRHD



On 2023-01-12 7:42 a.m., Baolu Lu wrote:
> On 2023/1/12 4:24, [email protected] wrote:
>> From: Kan Liang <[email protected]>
>>
>> A new field, which indicates the size of the remapping hardware register
>> set for this remapping unit, is introduced in the DMA-remapping hardware
>> unit definition (DRHD) structure with the VT-d Spec 4.0.
>> With the information, SW doesn't need to 'guess' the size of the
>> register set anymore.
>>
>> Update the struct acpi_dmar_hardware_unit to reflect the field.
>>
>> Store the size of the register set in struct dmar_drhd_unit for each
>> dmar device.
>>
>> The 'size' information is ResvZ for the old BIOS and platforms. Fall
>> back to the old guessing method. There is nothing changed.
>>
>> Signed-off-by: Kan Liang <[email protected]>
>> ---
>>   drivers/iommu/intel/dmar.c | 25 ++++++++++++++++++-------
>>   include/acpi/actbl1.h      |  2 +-
>>   include/linux/dmar.h       |  1 +
>>   3 files changed, 20 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
>> index b00a0ceb2d13..6a411d964474 100644
>> --- a/drivers/iommu/intel/dmar.c
>> +++ b/drivers/iommu/intel/dmar.c
>> @@ -399,6 +399,13 @@ dmar_find_dmaru(struct acpi_dmar_hardware_unit
>> *drhd)
>>       return NULL;
>>   }
>>   +/* The size of the register set is 2 ^ N 4 KB pages. */
>> +static unsigned long
>> +dmar_get_drhd_reg_size(u8 npages)
>
> No need to divide it into two lines. Or put this line of code directly
> where it is called?
>
>> +{
>> +    return 1UL << (npages + 12);
>> +}
>> +
>>   /*
>>    * dmar_parse_one_drhd - parses exactly one DMA remapping hardware
>> definition
>>    * structure which uniquely represent one DMA remapping hardware unit
>> @@ -427,6 +434,7 @@ static int dmar_parse_one_drhd(struct
>> acpi_dmar_header *header, void *arg)
>>       memcpy(dmaru->hdr, header, header->length);
>>       dmaru->reg_base_addr = drhd->address;
>>       dmaru->segment = drhd->segment;
>> +    dmaru->reg_size = dmar_get_drhd_reg_size(drhd->size);
>>       dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
>>       dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
>>                             ((void *)drhd) + drhd->header.length,
>> @@ -880,6 +888,7 @@ dmar_validate_one_drhd(struct acpi_dmar_header
>> *entry, void *arg)
>>       struct acpi_dmar_hardware_unit *drhd;
>>       void __iomem *addr;
>>       u64 cap, ecap;
>> +    unsigned long size;
>>         drhd = (void *)entry;
>>       if (!drhd->address) {
>> @@ -887,10 +896,11 @@ dmar_validate_one_drhd(struct acpi_dmar_header
>> *entry, void *arg)
>>           return -EINVAL;
>>       }
>>   +    size = dmar_get_drhd_reg_size(drhd->size);
>>       if (arg)
>> -        addr = ioremap(drhd->address, VTD_PAGE_SIZE);
>> +        addr = ioremap(drhd->address, size);
>>       else
>> -        addr = early_ioremap(drhd->address, VTD_PAGE_SIZE);
>> +        addr = early_ioremap(drhd->address, size);
>>       if (!addr) {
>>           pr_warn("Can't validate DRHD address: %llx\n", drhd->address);
>>           return -EINVAL;
>> @@ -902,7 +912,7 @@ dmar_validate_one_drhd(struct acpi_dmar_header
>> *entry, void *arg)
>>       if (arg)
>>           iounmap(addr);
>>       else
>> -        early_iounmap(addr, VTD_PAGE_SIZE);
>> +        early_iounmap(addr, size);
>>         if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) {
>>           warn_invalid_dmar(drhd->address, " returns all ones");
>
> The cap and ecap registers are always in the first page. Just map one
> 4K page is enough. There is no need to change it?

Right, one page should be enough for cap and ecap.

The perf cap will be checked in alloc_iommu_pmu() of patch 2. If that
fails, it should not impact the existing features, so there is no need
to change the mapping size here. I will update it in V2.

After that change, there will be only one place that invokes
dmar_get_drhd_reg_size(), so I will move the computation there as well.
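
For reference, a minimal sketch of the intended V2 change (illustrative
only; the final patch may differ):

	/* dmar_parse_one_drhd(): the register set spans 2 ^ N 4 KB pages */
	dmaru->reg_base_addr = drhd->address;
	dmaru->segment = drhd->segment;
	dmaru->reg_size = 1UL << (drhd->size + 12);

while dmar_validate_one_drhd() keeps mapping a single VTD_PAGE_SIZE page,
since cap and ecap live in the first page.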

>
>> @@ -956,17 +966,18 @@ static void unmap_iommu(struct intel_iommu *iommu)
>>   /**
>>    * map_iommu: map the iommu's registers
>>    * @iommu: the iommu to map
>> - * @phys_addr: the physical address of the base resgister
>> + * @drhd: DMA remapping hardware definition structure
>>    *
>>    * Memory map the iommu's registers.  Start w/ a single page, and
>>    * possibly expand if that turns out to be insufficent.
>>    */
>> -static int map_iommu(struct intel_iommu *iommu, u64 phys_addr)
>> +static int map_iommu(struct intel_iommu *iommu, struct dmar_drhd_unit
>> *drhd)
>>   {
>> +    u64 phys_addr = drhd->reg_base_addr;
>>       int map_size, err=0;
>>         iommu->reg_phys = phys_addr;
>> -    iommu->reg_size = VTD_PAGE_SIZE;
>> +    iommu->reg_size = drhd->reg_size;
>>         if (!request_mem_region(iommu->reg_phys, iommu->reg_size,
>> iommu->name)) {
>>           pr_err("Can't reserve memory\n");
>> @@ -1050,7 +1061,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
>>       }
>>       sprintf(iommu->name, "dmar%d", iommu->seq_id);
>>   -    err = map_iommu(iommu, drhd->reg_base_addr);
>> +    err = map_iommu(iommu, drhd);
>>       if (err) {
>>           pr_err("Failed to map %s\n", iommu->name);
>>           goto error_free_seq_id;
>> diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
>> index 4175dce3967c..bdded0ac46eb 100644
>> --- a/include/acpi/actbl1.h
>> +++ b/include/acpi/actbl1.h
>> @@ -802,7 +802,7 @@ struct acpi_dmar_pci_path {
>>   struct acpi_dmar_hardware_unit {
>>       struct acpi_dmar_header header;
>>       u8 flags;
>> -    u8 reserved;
>> +    u8 size;        /* Size of the register set */
>>       u16 segment;
>>       u64 address;        /* Register Base Address */
>>   };
>> diff --git a/include/linux/dmar.h b/include/linux/dmar.h
>> index d81a51978d01..51d498f0a02b 100644
>> --- a/include/linux/dmar.h
>> +++ b/include/linux/dmar.h
>> @@ -46,6 +46,7 @@ struct dmar_drhd_unit {
>>       u8    include_all:1;
>>       u8    gfx_dedicated:1;    /* graphic dedicated    */
>>       struct intel_iommu *iommu;
>> +    unsigned long reg_size;        /* size of register set */
>
> Move it up and pair it with reg_base_addr.

Sure.

Thanks,
Kan
>
>>   };
>>     struct dmar_pci_path {
>
> --
> Best regards,
> baolu

2023-01-13 13:53:26

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH 2/7] iommu/vt-d: Retrieve IOMMU perfmon capability information

On 2023/1/12 4:24, [email protected] wrote:
> From: Kan Liang <[email protected]>
>
> The performance monitoring infrastructure, perfmon, is to support
> collection of information about key events occurring during operation of
> the remapping hardware, to aid performance tuning and debug. Each
> remapping hardware unit has capability registers that indicate support
> for performance monitoring features and enumerate the capabilities.
>
> Add alloc_iommu_pmu() to retrieve IOMMU perfmon capability information
> for each iommu unit. The information is stored in the iommu->pmu data
> structure. Capability registers are read-only, so it's safe to prefetch
> and store them in the pmu structure. This could avoid unnecessary VMEXIT
> when this code is running in the virtualization environment.
>
> Add free_iommu_pmu() to free the saved capability information when
> freeing the iommu unit.
>
> Add a kernel config option for the IOMMU perfmon feature. Unless a user
> explicitly uses the perf tool to monitor the IOMMU perfmon event, there
> isn't any impact for the existing IOMMU. Enable it by default.
>
> Signed-off-by: Kan Liang <[email protected]>
> ---
> drivers/iommu/intel/Kconfig | 9 ++
> drivers/iommu/intel/Makefile | 1 +
> drivers/iommu/intel/dmar.c | 7 ++
> drivers/iommu/intel/iommu.h | 41 +++++++++
> drivers/iommu/intel/perfmon.c | 159 ++++++++++++++++++++++++++++++++++
> drivers/iommu/intel/perfmon.h | 41 +++++++++
> 6 files changed, 258 insertions(+)
> create mode 100644 drivers/iommu/intel/perfmon.c
> create mode 100644 drivers/iommu/intel/perfmon.h
>
> diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
> index b7dff5092fd2..1a4aebddc9a6 100644
> --- a/drivers/iommu/intel/Kconfig
> +++ b/drivers/iommu/intel/Kconfig
> @@ -96,4 +96,13 @@ config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
> passing intel_iommu=sm_on to the kernel. If not sure, please use
> the default value.
>
> +config INTEL_IOMMU_PERF_EVENTS
> + def_bool y
> + bool "Intel IOMMU performance events"
> + depends on INTEL_IOMMU && PERF_EVENTS
> + help
> + Include support for Intel IOMMU performance events. These are
> + available on modern processors which support Intel VT-d 4.0 and
> + later.
> +
> endif # INTEL_IOMMU
> diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
> index fa0dae16441c..7af3b8a4f2a0 100644
> --- a/drivers/iommu/intel/Makefile
> +++ b/drivers/iommu/intel/Makefile
> @@ -6,3 +6,4 @@ obj-$(CONFIG_DMAR_PERF) += perf.o
> obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
> obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
> obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
> +obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o
> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
> index 6a411d964474..91bb48267df2 100644
> --- a/drivers/iommu/intel/dmar.c
> +++ b/drivers/iommu/intel/dmar.c
> @@ -34,6 +34,7 @@
> #include "../irq_remapping.h"
> #include "perf.h"
> #include "trace.h"
> +#include "perfmon.h"
>
> typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
> struct dmar_res_callback {
> @@ -1114,6 +1115,9 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
> if (sts & DMA_GSTS_QIES)
> iommu->gcmd |= DMA_GCMD_QIE;
>
> + if (alloc_iommu_pmu(iommu))
> + pr_debug("Cannot alloc PMU for iommu (seq_id = %d)\n", iommu->seq_id);
> +
> raw_spin_lock_init(&iommu->register_lock);
>
> /*
> @@ -1148,6 +1152,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
> err_sysfs:
> iommu_device_sysfs_remove(&iommu->iommu);
> err_unmap:
> + free_iommu_pmu(iommu);
> unmap_iommu(iommu);
> error_free_seq_id:
> ida_free(&dmar_seq_ids, iommu->seq_id);
> @@ -1163,6 +1168,8 @@ static void free_iommu(struct intel_iommu *iommu)
> iommu_device_sysfs_remove(&iommu->iommu);
> }
>
> + free_iommu_pmu(iommu);
> +
> if (iommu->irq) {
> if (iommu->pr_irq) {
> free_irq(iommu->pr_irq, iommu);
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index 06e61e474856..5bcefbea55c9 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -125,6 +125,11 @@
> #define DMAR_MTRR_PHYSMASK8_REG 0x208
> #define DMAR_MTRR_PHYSBASE9_REG 0x210
> #define DMAR_MTRR_PHYSMASK9_REG 0x218
> +#define DMAR_PERFCAP_REG 0x300
> +#define DMAR_PERFCFGOFF_REG 0x310
> +#define DMAR_PERFOVFOFF_REG 0x318
> +#define DMAR_PERFCNTROFF_REG 0x31c
> +#define DMAR_PERFEVNTCAP_REG 0x380
> #define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */
> #define DMAR_VCMD_REG 0xe00 /* Virtual command register */
> #define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */
> @@ -148,6 +153,7 @@
> */
> #define cap_esrtps(c) (((c) >> 63) & 1)
> #define cap_esirtps(c) (((c) >> 62) & 1)
> +#define cap_ecmds(c) (((c) >> 61) & 1)
> #define cap_fl5lp_support(c) (((c) >> 60) & 1)
> #define cap_pi_support(c) (((c) >> 59) & 1)
> #define cap_fl1gp_support(c) (((c) >> 56) & 1)
> @@ -179,6 +185,7 @@
> * Extended Capability Register
> */
>
> +#define ecap_pms(e) (((e) >> 51) & 0x1)
> #define ecap_rps(e) (((e) >> 49) & 0x1)
> #define ecap_smpwc(e) (((e) >> 48) & 0x1)
> #define ecap_flts(e) (((e) >> 47) & 0x1)
> @@ -210,6 +217,22 @@
> #define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
> #define ecap_sc_support(e) (((e) >> 7) & 0x1) /* Snooping Control */
>
> +/*
> + * Decoding Perf Capability Register
> + */
> +#define pcap_num_cntr(p) ((p) & 0xffff)
> +#define pcap_cntr_width(p) (((p) >> 16) & 0x7f)
> +#define pcap_num_event_group(p) (((p) >> 24) & 0x1f)
> +#define pcap_filters_mask(p) (((p) >> 32) & 0x1f)
> +#define pcap_interrupt(p) (((p) >> 50) & 0x1)
> +/* The counter stride is calculated as 2 ^ (x+10) bytes */
> +#define pcap_cntr_stride(p) (1ULL << ((((p) >> 52) & 0x7) + 10))
> +
> +/*
> + * Decoding Perf Event Capability Register
> + */
> +#define pecap_es(p) ((p) & 0xfffffff)
> +
> /* Virtual command interface capability */
> #define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */
>
> @@ -554,6 +577,22 @@ struct dmar_domain {
> iommu core */
> };
>
> +struct iommu_pmu {
> + struct intel_iommu *iommu;
> + u32 num_cntr; /* Number of counters */
> + u32 num_eg; /* Number of event group */
> + u32 cntr_width; /* Counter width */
> + u32 cntr_stride; /* Counter Stride */
> + u32 filter; /* Bitmask of filter support */
> + void __iomem *base; /* the PerfMon base address */
> + void __iomem *cfg_reg; /* counter configuration base address */
> + void __iomem *cntr_reg; /* counter 0 address*/
> + void __iomem *overflow; /* overflow status register */
> +
> + u64 *evcap; /* Indicates all supported events */
> + u32 **cntr_evcap; /* Supported events of each counter. */
> +};
> +
> struct intel_iommu {
> void __iomem *reg; /* Pointer to hardware regs, virtual addr */
> u64 reg_phys; /* physical address of hw register set */
> @@ -600,6 +639,8 @@ struct intel_iommu {
>
> struct dmar_drhd_unit *drhd;
> void *perf_statistic;
> +
> + struct iommu_pmu *pmu;
> };
>
> /* PCI domain-device relationship */
> diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
> new file mode 100644
> index 000000000000..bc090f329c32
> --- /dev/null
> +++ b/drivers/iommu/intel/perfmon.c
> @@ -0,0 +1,159 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Support Intel IOMMU PerfMon
> + * Copyright(c) 2022 Intel Corporation.

Copyright 2023

> + */
> +
> +#include <linux/dmar.h>
> +#include "iommu.h"
> +#include "perfmon.h"
> +
> +static inline void __iomem *
> +get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
> +{
> + u32 off = dmar_readl(iommu->reg + offset);
> +
> + return iommu->reg + off;
> +}
> +
> +int alloc_iommu_pmu(struct intel_iommu *iommu)
> +{
> + struct iommu_pmu *iommu_pmu;
> + int i, j, ret;
> + u64 perfcap;
> + u32 cap;
> +
> + /* The IOMMU PMU requires the ECMD support as well */
> + if (!ecap_pms(iommu->ecap) || !cap_ecmds(iommu->cap))
> + return -ENODEV;

It's normal that PMS is not supported on an IOMMU, how about,

if (!ecap_pms(iommu->ecap))
return 0;

/* The IOMMU PMU requires the ECMD support as well */
if (!cap_ecmds(iommu->cap))
return -ENODEV;

> +
> + perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG);
> + /* The performance monitoring is not supported. */
> + if (!perfcap)
> + return -ENODEV;
> +
> + /* Sanity check for the number of the counters and event groups */
> + if (!pcap_num_cntr(perfcap) || !pcap_num_event_group(perfcap))
> + return -ENODEV;
> +
> + /* The interrupt on overflow is required */
> + if (!pcap_interrupt(perfcap))
> + return -ENODEV;
> +
> + iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
> + if (!iommu_pmu)
> + return -ENOMEM;
> +
> + iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
> + iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
> + iommu_pmu->filter = pcap_filters_mask(perfcap);
> + iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
> + iommu_pmu->num_eg = pcap_num_event_group(perfcap);
> +
> + iommu_pmu->evcap = kcalloc(iommu_pmu->num_eg, sizeof(u64), GFP_KERNEL);
> + if (!iommu_pmu->evcap) {
> + ret = -ENOMEM;
> + goto free_pmu;
> + }
> +
> + /* Parse event group capabilities */
> + for (i = 0; i < iommu_pmu->num_eg; i++) {
> + u64 pcap;
> +
> + pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG +
> + i * IOMMU_PMU_CAP_REGS_STEP);
> + iommu_pmu->evcap[i] = pecap_es(pcap);
> + }
> +
> + iommu_pmu->cntr_evcap = kcalloc(iommu_pmu->num_cntr, sizeof(u32 *), GFP_KERNEL);
> + if (!iommu_pmu->cntr_evcap) {
> + ret = -ENOMEM;
> + goto free_pmu_evcap;
> + }
> + for (i = 0; i < iommu_pmu->num_cntr; i++) {
> + iommu_pmu->cntr_evcap[i] = kcalloc(iommu_pmu->num_eg, sizeof(u32), GFP_KERNEL);
> + if (!iommu_pmu->cntr_evcap[i]) {
> + ret = -ENOMEM;
> + iommu_pmu->num_cntr = i;

Do we really need above line? kfree() is friendly to a NULL pointer,
right?

> + goto free_pmu_cntr_evcap;
> + }
> + /*
> + * Set to the global capabilities, will adjust according
> + * to per-counter capabilities later.
> + */
> + for (j = 0; j < iommu_pmu->num_eg; j++)
> + iommu_pmu->cntr_evcap[i][j] = (u32)iommu_pmu->evcap[j];
> + }
> +
> + iommu_pmu->cfg_reg = get_perf_reg_address(iommu, DMAR_PERFCFGOFF_REG);
> + iommu_pmu->cntr_reg = get_perf_reg_address(iommu, DMAR_PERFCNTROFF_REG);
> + iommu_pmu->overflow = get_perf_reg_address(iommu, DMAR_PERFOVFOFF_REG);
> +
> + /*
> + * Check per-counter capabilities
> + * All counters should have the same capabilities on
> + * Interrupt on Overflow Support and Counter Width
> + */

Please re-org this comment and make it neat.

> + for (i = 0; i < iommu_pmu->num_cntr; i++) {
> + cap = dmar_readl(iommu_pmu->cfg_reg +
> + i * IOMMU_PMU_CFG_OFFSET +
> + IOMMU_PMU_CFG_CNTRCAP_OFFSET);
> + if (!iommu_cntrcap_pcc(cap))
> + continue;
> + if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) ||
> + !iommu_cntrcap_ios(cap))
> + iommu_pmu->num_cntr = i;

Can you please add some words describing how you handle the
inconsistent capability case? It seems that you just truncate the number
of counters? Any rationale behind that?

> +
> + /* Clear the pre-defined events group */
> + for (j = 0; j < iommu_pmu->num_eg; j++)
> + iommu_pmu->cntr_evcap[i][j] = 0;
> +
> + /* Override with per-counter event capabilities */
> + for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) {
> + cap = dmar_readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET +
> + IOMMU_PMU_CFG_CNTREVCAP_OFFSET +
> + (j * IOMMU_PMU_OFF_REGS_STEP));
> + iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] = iommu_event_select(cap);
> + /*
> + * Some events may only be supported by a specific counter.
> + * Track them in the evcap as well.
> + */
> + iommu_pmu->evcap[iommu_event_group(cap)] |= iommu_event_select(cap);
> + }
> + }
> +
> + iommu_pmu->iommu = iommu;
> + iommu->pmu = iommu_pmu;
> +
> + return 0;
> +
> +free_pmu_cntr_evcap:
> + for (i = 0; i < iommu_pmu->num_cntr; i++)
> + kfree(iommu_pmu->cntr_evcap[i]);
> + kfree(iommu_pmu->cntr_evcap);
> +free_pmu_evcap:
> + kfree(iommu_pmu->evcap);
> +free_pmu:
> + kfree(iommu_pmu);
> +
> + return ret;
> +}
> +
> +void free_iommu_pmu(struct intel_iommu *iommu)
> +{
> + struct iommu_pmu *iommu_pmu = iommu->pmu;
> +
> + if (!iommu_pmu)
> + return;
> +
> + if (iommu_pmu->evcap) {
> + int i;
> +
> + for (i = 0; i < iommu_pmu->num_cntr; i++)
> + kfree(iommu_pmu->cntr_evcap[i]);
> + kfree(iommu_pmu->cntr_evcap);
> + }
> + kfree(iommu_pmu->evcap);
> + kfree(iommu_pmu);
> + iommu->pmu = NULL;
> +}
> diff --git a/drivers/iommu/intel/perfmon.h b/drivers/iommu/intel/perfmon.h
> new file mode 100644
> index 000000000000..8587c80501cd
> --- /dev/null
> +++ b/drivers/iommu/intel/perfmon.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * PERFCFGOFF_REG, PERFFRZOFF_REG
> + * PERFOVFOFF_REG, PERFCNTROFF_REG
> + */
> +#define IOMMU_PMU_NUM_OFF_REGS 4
> +#define IOMMU_PMU_OFF_REGS_STEP 4
> +
> +#define IOMMU_PMU_CFG_OFFSET 0x100
> +#define IOMMU_PMU_CFG_CNTRCAP_OFFSET 0x80
> +#define IOMMU_PMU_CFG_CNTREVCAP_OFFSET 0x84
> +#define IOMMU_PMU_CFG_SIZE 0x8
> +#define IOMMU_PMU_CFG_FILTERS_OFFSET 0x4
> +
> +
> +#define IOMMU_PMU_CAP_REGS_STEP 8
> +
> +#define iommu_cntrcap_pcc(p) ((p) & 0x1)
> +#define iommu_cntrcap_cw(p) ((p >> 8) & 0xff)
> +#define iommu_cntrcap_ios(p) ((p >> 16) & 0x1)
> +#define iommu_cntrcap_egcnt(p) ((p >> 28) & 0xf)
> +
> +#define iommu_event_select(p) ((p) & 0xfffffff)
> +#define iommu_event_group(p) ((p >> 28) & 0xf)
> +
> +#ifdef CONFIG_INTEL_IOMMU_PERF_EVENTS
> +int alloc_iommu_pmu(struct intel_iommu *iommu);
> +void free_iommu_pmu(struct intel_iommu *iommu);
> +#else
> +static inline int
> +alloc_iommu_pmu(struct intel_iommu *iommu)
> +{
> + return 0;
> +}
> +
> +static inline void
> +free_iommu_pmu(struct intel_iommu *iommu)
> +{
> +}
> +#endif /* CONFIG_INTEL_IOMMU_PERF_EVENTS */

--
Best regards,
baolu

2023-01-13 14:03:39

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH 3/7] iommu/vt-d: Support Enhanced Command Interface

On 2023/1/12 4:25, [email protected] wrote:
> From: Kan Liang <[email protected]>
>
> The Enhanced Command Register is to submit command and operand of
> enhanced commands to DMA Remapping hardware. It can supports upto 256
> enhanced commands.
>
> There is a HW register to indicate the availability of all 256 enhanced
> commands. Each bit stands for each command. But there isn't an existing
> interface to read/write all 256 bits. Introduce the u64 ecmdcap[4] to
> store the existence of each enhanced command. Read 4 times to get
> all of them in map_iommu().
>
> Add a helper to facilitate an enhanced command launch. Make sure hardware
> complete the command.
>
> Add a helper to facilitate the check of PMU essentials.
>
> The helpers will be used later.
>
> Signed-off-by: Kan Liang <[email protected]>
> ---
> drivers/iommu/intel/dmar.c | 63 +++++++++++++++++++++++++++++++++++++
> drivers/iommu/intel/iommu.h | 46 +++++++++++++++++++++++++++
> 2 files changed, 109 insertions(+)
>
> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
> index 91bb48267df2..dcafa32875c1 100644
> --- a/drivers/iommu/intel/dmar.c
> +++ b/drivers/iommu/intel/dmar.c
> @@ -1025,6 +1025,16 @@ static int map_iommu(struct intel_iommu *iommu, struct dmar_drhd_unit *drhd)
> goto release;
> }
> }
> +
> + if (cap_ecmds(iommu->cap)) {
> + int i;
> +
> + for (i = 0; i < DMA_MAX_NUM_ECMDCAP; i++) {
> + iommu->ecmdcap[i] = dmar_readq(iommu->reg + DMAR_ECCAP_REG +
> + i * DMA_ECMD_REG_STEP);
> + }
> + }
> +
> err = 0;
> goto out;
>
> @@ -2434,3 +2444,56 @@ bool dmar_platform_optin(void)
> return ret;
> }
> EXPORT_SYMBOL_GPL(dmar_platform_optin);
> +
> +#ifdef CONFIG_INTEL_IOMMU

Why do you need above #ifdef?

> +#define ecmd_get_status_code(res) ((res & 0xff) >> 1)
> +
> +/*
> + * Function to submit a command to the enhanced command interface. The
> + * valid enhanced command descriptions are defined in Table 47 of the
> + * VT-d spec. The VT-d hardware implementation may support some but not
> + * all commands, which can be determined by checking the Enhanced
> + * Command Capability Register.
> + *
> + * Return values:
> + * - 0: Command successful without any error;
> + * - Negative: software error value;
> + * - Nonzero positive: failure status code defined in Table 48.
> + */
> +int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
> + u64 oa, bool has_ob, u64 ob)
> +{
> + unsigned long flags;
> + u64 res;
> + int ret;
> +
> + if (!cap_ecmds(iommu->cap))
> + return -ENODEV;
> +
> + raw_spin_lock_irqsave(&iommu->register_lock, flags);
> +
> + res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
> + if (res & DMA_ECMD_ECRSP_IP) {
> + ret = -EBUSY;
> + goto err;
> + }
> +
> + if (has_ob)
> + dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);

The ecmds that require an Operand B are statically defined in the spec,
right? What will it look like if we define a static ignore_ob(ecmd)?

> + dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
> +
> + IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
> + !(res & DMA_ECMD_ECRSP_IP), res);
> +
> + if (res & DMA_ECMD_ECRSP_IP) {
> + ret = -ETIMEDOUT;
> + goto err;
> + }
> +
> + ret = ecmd_get_status_code(res);
> +err:
> + raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
> +
> + return ret;
> +}
> +#endif /* CONFIG_INTEL_IOMMU */
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index 5bcefbea55c9..f227107434ac 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -130,6 +130,10 @@
> #define DMAR_PERFOVFOFF_REG 0x318
> #define DMAR_PERFCNTROFF_REG 0x31c
> #define DMAR_PERFEVNTCAP_REG 0x380
> +#define DMAR_ECMD_REG 0x400
> +#define DMAR_ECEO_REG 0x408
> +#define DMAR_ECRSP_REG 0x410
> +#define DMAR_ECCAP_REG 0x430
> #define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */
> #define DMAR_VCMD_REG 0xe00 /* Virtual command register */
> #define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */
> @@ -304,6 +308,26 @@
> #define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
> #define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
>
> +/* ECMD_REG */
> +#define DMA_MAX_NUM_ECMD 256
> +#define DMA_MAX_NUM_ECMDCAP (DMA_MAX_NUM_ECMD / 64)
> +#define DMA_ECMD_REG_STEP 8
> +#define DMA_ECMD_ENABLE 0xf0
> +#define DMA_ECMD_DISABLE 0xf1
> +#define DMA_ECMD_FREEZE 0xf4
> +#define DMA_ECMD_UNFREEZE 0xf5
> +#define DMA_ECMD_OA_SHIFT 16
> +#define DMA_ECMD_ECRSP_IP 0x1
> +#define DMA_ECMD_ECCAP3 3
> +#define DMA_ECMD_ECCAP3_ECNTS (1ULL << 48)
> +#define DMA_ECMD_ECCAP3_DCNTS (1ULL << 49)
> +#define DMA_ECMD_ECCAP3_FCNTS (1ULL << 52)
> +#define DMA_ECMD_ECCAP3_UFCNTS (1ULL << 53)
> +#define DMA_ECMD_ECCAP3_ESSENTIAL (DMA_ECMD_ECCAP3_ECNTS | \
> + DMA_ECMD_ECCAP3_DCNTS | \
> + DMA_ECMD_ECCAP3_FCNTS | \
> + DMA_ECMD_ECCAP3_UFCNTS)
> +
> /* FECTL_REG */
> #define DMA_FECTL_IM (((u32)1) << 31)
>
> @@ -600,6 +624,7 @@ struct intel_iommu {
> u64 cap;
> u64 ecap;
> u64 vccap;
> + u64 ecmdcap[DMA_MAX_NUM_ECMDCAP];
> u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
> raw_spinlock_t register_lock; /* protect register handling */
> int seq_id; /* sequence id of the iommu */
> @@ -841,6 +866,15 @@ extern const struct iommu_ops intel_iommu_ops;
> extern int intel_iommu_sm;
> extern int iommu_calculate_agaw(struct intel_iommu *iommu);
> extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
> +extern int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
> + u64 oa, bool has_ob, u64 ob);
> +
> +static inline bool ecmd_has_pmu_essential(struct intel_iommu *iommu)
> +{
> + return (iommu->ecmdcap[DMA_ECMD_ECCAP3] & DMA_ECMD_ECCAP3_ESSENTIAL) ==
> + DMA_ECMD_ECCAP3_ESSENTIAL;
> +}
> +
> extern int dmar_disabled;
> extern int intel_iommu_enabled;
> #else
> @@ -852,6 +886,18 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
> {
> return 0;
> }
> +
> +static inline int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
> + u64 oa, bool has_ob, u64 ob)
> +{
> + return -EOPNOTSUPP;
> +}
> +
> +static inline bool ecmd_has_pmu_essential(struct intel_iommu *iommu)
> +{
> + return false;
> +}

Is there any need to call these helpers when INTEL_IOMMU is not
configured?

> +
> #define dmar_disabled (1)
> #define intel_iommu_enabled (0)
> #define intel_iommu_sm (0)

--
Best regards,
baolu

2023-01-13 14:28:06

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH 3/7] iommu/vt-d: Support Enhanced Command Interface

On 2023/1/13 21:55, Baolu Lu wrote:
>> +/*
>> + * Function to submit a command to the enhanced command interface. The
>> + * valid enhanced command descriptions are defined in Table 47 of the
>> + * VT-d spec. The VT-d hardware implementation may support some but not
>> + * all commands, which can be determined by checking the Enhanced
>> + * Command Capability Register.
>> + *
>> + * Return values:
>> + *  - 0: Command successful without any error;
>> + *  - Negative: software error value;
>> + *  - Nonzero positive: failure status code defined in Table 48.
>> + */
>> +int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
>> +             u64 oa, bool has_ob, u64 ob)
>> +{
>> +    unsigned long flags;
>> +    u64 res;
>> +    int ret;
>> +
>> +    if (!cap_ecmds(iommu->cap))
>> +        return -ENODEV;
>> +
>> +    raw_spin_lock_irqsave(&iommu->register_lock, flags);
>> +
>> +    res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
>> +    if (res & DMA_ECMD_ECRSP_IP) {
>> +        ret = -EBUSY;
>> +        goto err;
>> +    }
>> +
>> +    if (has_ob)
>> +        dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
>
> The ecmds that require a Operand B are statically defined in the spec,
> right? What will it look like if we define a static ignore_ob(ecmd)?

Or simply remove the has_ob parameter? The worst case is an unnecessary
write to a register. That's fine as far as I can see, since we should
avoid using this interface in any critical path.
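
For illustration, a minimal sketch of what such a static helper could
look like (which commands actually define an Operand B must come from
Table 47 of the spec; the cases below are only placeholders):

	/*
	 * Hypothetical helper: return true for enhanced commands that do
	 * not define an Operand B, so the DMAR_ECEO_REG write can be
	 * skipped in ecmd_submit_sync().
	 */
	static bool ecmd_ignore_ob(u8 ecmd)
	{
		switch (ecmd) {
		case DMA_ECMD_ENABLE:
		case DMA_ECMD_DISABLE:
		case DMA_ECMD_FREEZE:
		case DMA_ECMD_UNFREEZE:
			return true;	/* placeholder set */
		default:
			return false;
		}
	}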

--
Best regards,
baolu

2023-01-13 17:03:17

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH 2/7] iommu/vt-d: Retrieve IOMMU perfmon capability information



On 2023-01-13 7:58 a.m., Baolu Lu wrote:
> On 2023/1/12 4:24, [email protected] wrote:
>> From: Kan Liang <[email protected]>
>>
>> The performance monitoring infrastructure, perfmon, is to support
>> collection of information about key events occurring during operation of
>> the remapping hardware, to aid performance tuning and debug. Each
>> remapping hardware unit has capability registers that indicate support
>> for performance monitoring features and enumerate the capabilities.
>>
>> Add alloc_iommu_pmu() to retrieve IOMMU perfmon capability information
>> for each iommu unit. The information is stored in the iommu->pmu data
>> structure. Capability registers are read-only, so it's safe to prefetch
>> and store them in the pmu structure. This could avoid unnecessary VMEXIT
>> when this code is running in the virtualization environment.
>>
>> Add free_iommu_pmu() to free the saved capability information when
>> freeing the iommu unit.
>>
>> Add a kernel config option for the IOMMU perfmon feature. Unless a user
>> explicitly uses the perf tool to monitor the IOMMU perfmon event, there
>> isn't any impact for the existing IOMMU. Enable it by default.
>>
>> Signed-off-by: Kan Liang <[email protected]>
>> ---
>>   drivers/iommu/intel/Kconfig   |   9 ++
>>   drivers/iommu/intel/Makefile  |   1 +
>>   drivers/iommu/intel/dmar.c    |   7 ++
>>   drivers/iommu/intel/iommu.h   |  41 +++++++++
>>   drivers/iommu/intel/perfmon.c | 159 ++++++++++++++++++++++++++++++++++
>>   drivers/iommu/intel/perfmon.h |  41 +++++++++
>>   6 files changed, 258 insertions(+)
>>   create mode 100644 drivers/iommu/intel/perfmon.c
>>   create mode 100644 drivers/iommu/intel/perfmon.h
>>
>> diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
>> index b7dff5092fd2..1a4aebddc9a6 100644
>> --- a/drivers/iommu/intel/Kconfig
>> +++ b/drivers/iommu/intel/Kconfig
>> @@ -96,4 +96,13 @@ config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
>>         passing intel_iommu=sm_on to the kernel. If not sure, please use
>>         the default value.
>>   +config INTEL_IOMMU_PERF_EVENTS
>> +    def_bool y
>> +    bool "Intel IOMMU performance events"
>> +    depends on INTEL_IOMMU && PERF_EVENTS
>> +    help
>> +      Include support for Intel IOMMU performance events. These are
>> +      available on modern processors which support Intel VT-d 4.0 and
>> +      later.
>> +
>>   endif # INTEL_IOMMU
>> diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
>> index fa0dae16441c..7af3b8a4f2a0 100644
>> --- a/drivers/iommu/intel/Makefile
>> +++ b/drivers/iommu/intel/Makefile
>> @@ -6,3 +6,4 @@ obj-$(CONFIG_DMAR_PERF) += perf.o
>>   obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
>>   obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
>>   obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
>> +obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o
>> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
>> index 6a411d964474..91bb48267df2 100644
>> --- a/drivers/iommu/intel/dmar.c
>> +++ b/drivers/iommu/intel/dmar.c
>> @@ -34,6 +34,7 @@
>>   #include "../irq_remapping.h"
>>   #include "perf.h"
>>   #include "trace.h"
>> +#include "perfmon.h"
>>     typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
>>   struct dmar_res_callback {
>> @@ -1114,6 +1115,9 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
>>       if (sts & DMA_GSTS_QIES)
>>           iommu->gcmd |= DMA_GCMD_QIE;
>>   +    if (alloc_iommu_pmu(iommu))
>> +        pr_debug("Cannot alloc PMU for iommu (seq_id = %d)\n",
>> iommu->seq_id);
>> +
>>       raw_spin_lock_init(&iommu->register_lock);
>>         /*
>> @@ -1148,6 +1152,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
>>   err_sysfs:
>>       iommu_device_sysfs_remove(&iommu->iommu);
>>   err_unmap:
>> +    free_iommu_pmu(iommu);
>>       unmap_iommu(iommu);
>>   error_free_seq_id:
>>       ida_free(&dmar_seq_ids, iommu->seq_id);
>> @@ -1163,6 +1168,8 @@ static void free_iommu(struct intel_iommu *iommu)
>>           iommu_device_sysfs_remove(&iommu->iommu);
>>       }
>>   +    free_iommu_pmu(iommu);
>> +
>>       if (iommu->irq) {
>>           if (iommu->pr_irq) {
>>               free_irq(iommu->pr_irq, iommu);
>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>> index 06e61e474856..5bcefbea55c9 100644
>> --- a/drivers/iommu/intel/iommu.h
>> +++ b/drivers/iommu/intel/iommu.h
>> @@ -125,6 +125,11 @@
>>   #define DMAR_MTRR_PHYSMASK8_REG 0x208
>>   #define DMAR_MTRR_PHYSBASE9_REG 0x210
>>   #define DMAR_MTRR_PHYSMASK9_REG 0x218
>> +#define DMAR_PERFCAP_REG    0x300
>> +#define DMAR_PERFCFGOFF_REG    0x310
>> +#define DMAR_PERFOVFOFF_REG    0x318
>> +#define DMAR_PERFCNTROFF_REG    0x31c
>> +#define DMAR_PERFEVNTCAP_REG    0x380
>>   #define DMAR_VCCAP_REG        0xe30 /* Virtual command capability
>> register */
>>   #define DMAR_VCMD_REG        0xe00 /* Virtual command register */
>>   #define DMAR_VCRSP_REG        0xe10 /* Virtual command response
>> register */
>> @@ -148,6 +153,7 @@
>>    */
>>   #define cap_esrtps(c)        (((c) >> 63) & 1)
>>   #define cap_esirtps(c)        (((c) >> 62) & 1)
>> +#define cap_ecmds(c)        (((c) >> 61) & 1)
>>   #define cap_fl5lp_support(c)    (((c) >> 60) & 1)
>>   #define cap_pi_support(c)    (((c) >> 59) & 1)
>>   #define cap_fl1gp_support(c)    (((c) >> 56) & 1)
>> @@ -179,6 +185,7 @@
>>    * Extended Capability Register
>>    */
>>   +#define ecap_pms(e)        (((e) >> 51) & 0x1)
>>   #define    ecap_rps(e)        (((e) >> 49) & 0x1)
>>   #define ecap_smpwc(e)        (((e) >> 48) & 0x1)
>>   #define ecap_flts(e)        (((e) >> 47) & 0x1)
>> @@ -210,6 +217,22 @@
>>   #define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
>>   #define ecap_sc_support(e)    (((e) >> 7) & 0x1) /* Snooping Control */
>>   +/*
>> + * Decoding Perf Capability Register
>> + */
>> +#define pcap_num_cntr(p)    ((p) & 0xffff)
>> +#define pcap_cntr_width(p)    (((p) >> 16) & 0x7f)
>> +#define pcap_num_event_group(p)    (((p) >> 24) & 0x1f)
>> +#define pcap_filters_mask(p)    (((p) >> 32) & 0x1f)
>> +#define pcap_interrupt(p)    (((p) >> 50) & 0x1)
>> +/* The counter stride is calculated as 2 ^ (x+10) bytes */
>> +#define pcap_cntr_stride(p)    (1ULL << ((((p) >> 52) & 0x7) + 10))
>> +
>> +/*
>> + * Decoding Perf Event Capability Register
>> + */
>> +#define pecap_es(p)        ((p) & 0xfffffff)
>> +
>>   /* Virtual command interface capability */
>>   #define vccap_pasid(v)        (((v) & DMA_VCS_PAS)) /* PASID
>> allocation */
>>   @@ -554,6 +577,22 @@ struct dmar_domain {
>>                          iommu core */
>>   };
>>   +struct iommu_pmu {
>> +    struct intel_iommu    *iommu;
>> +    u32            num_cntr;    /* Number of counters */
>> +    u32            num_eg;        /* Number of event group */
>> +    u32            cntr_width;    /* Counter width */
>> +    u32            cntr_stride;    /* Counter Stride */
>> +    u32            filter;        /* Bitmask of filter support */
>> +    void __iomem        *base;        /* the PerfMon base address */
>> +    void __iomem        *cfg_reg;    /* counter configuration base
>> address */
>> +    void __iomem        *cntr_reg;    /* counter 0 address*/
>> +    void __iomem        *overflow;    /* overflow status register */
>> +
>> +    u64            *evcap;        /* Indicates all supported events */
>> +    u32            **cntr_evcap;    /* Supported events of each
>> counter. */
>> +};
>> +
>>   struct intel_iommu {
>>       void __iomem    *reg; /* Pointer to hardware regs, virtual addr */
>>       u64         reg_phys; /* physical address of hw register set */
>> @@ -600,6 +639,8 @@ struct intel_iommu {
>>         struct dmar_drhd_unit *drhd;
>>       void *perf_statistic;
>> +
>> +    struct iommu_pmu *pmu;
>>   };
>>     /* PCI domain-device relationship */
>> diff --git a/drivers/iommu/intel/perfmon.c
>> b/drivers/iommu/intel/perfmon.c
>> new file mode 100644
>> index 000000000000..bc090f329c32
>> --- /dev/null
>> +++ b/drivers/iommu/intel/perfmon.c
>> @@ -0,0 +1,159 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Support Intel IOMMU PerfMon
>> + * Copyright(c) 2022 Intel Corporation.
>
> Copyright 2023

Happy new year!

>
>> + */
>> +
>> +#include <linux/dmar.h>
>> +#include "iommu.h"
>> +#include "perfmon.h"
>> +
>> +static inline void __iomem *
>> +get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
>> +{
>> +    u32 off = dmar_readl(iommu->reg + offset);
>> +
>> +    return iommu->reg + off;
>> +}
>> +
>> +int alloc_iommu_pmu(struct intel_iommu *iommu)
>> +{
>> +    struct iommu_pmu *iommu_pmu;
>> +    int i, j, ret;
>> +    u64 perfcap;
>> +    u32 cap;
>> +
>> +    /* The IOMMU PMU requires the ECMD support as well */
>> +    if (!ecap_pms(iommu->ecap) || !cap_ecmds(iommu->cap))
>> +        return -ENODEV;
>
> It's normal that PMS is not supported on an IOMMU, how about,
>
>     if (!ecap_pms(iommu->ecap))
>         return 0;
>
>     /* The IOMMU PMU requires the ECMD support as well */
>     if (!cap_ecmds(iommu->cap))
>         return -ENODEV;

Sure

>
>> +
>> +    perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG);
>> +    /* The performance monitoring is not supported. */
>> +    if (!perfcap)
>> +        return -ENODEV;
>> +
>> +    /* Sanity check for the number of the counters and event groups */
>> +    if (!pcap_num_cntr(perfcap) || !pcap_num_event_group(perfcap))
>> +        return -ENODEV;
>> +
>> +    /* The interrupt on overflow is required */
>> +    if (!pcap_interrupt(perfcap))
>> +        return -ENODEV;
>> +
>> +    iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
>> +    if (!iommu_pmu)
>> +        return -ENOMEM;
>> +
>> +    iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
>> +    iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
>> +    iommu_pmu->filter = pcap_filters_mask(perfcap);
>> +    iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
>> +    iommu_pmu->num_eg = pcap_num_event_group(perfcap);
>> +
>> +    iommu_pmu->evcap = kcalloc(iommu_pmu->num_eg, sizeof(u64),
>> GFP_KERNEL);
>> +    if (!iommu_pmu->evcap) {
>> +        ret = -ENOMEM;
>> +        goto free_pmu;
>> +    }
>> +
>> +    /* Parse event group capabilities */
>> +    for (i = 0; i < iommu_pmu->num_eg; i++) {
>> +        u64 pcap;
>> +
>> +        pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG +
>> +                  i * IOMMU_PMU_CAP_REGS_STEP);
>> +        iommu_pmu->evcap[i] = pecap_es(pcap);
>> +    }
>> +
>> +    iommu_pmu->cntr_evcap = kcalloc(iommu_pmu->num_cntr, sizeof(u32
>> *), GFP_KERNEL);
>> +    if (!iommu_pmu->cntr_evcap) {
>> +        ret = -ENOMEM;
>> +        goto free_pmu_evcap;
>> +    }
>> +    for (i = 0; i < iommu_pmu->num_cntr; i++) {
>> +        iommu_pmu->cntr_evcap[i] = kcalloc(iommu_pmu->num_eg,
>> sizeof(u32), GFP_KERNEL);
>> +        if (!iommu_pmu->cntr_evcap[i]) {
>> +            ret = -ENOMEM;
>> +            iommu_pmu->num_cntr = i;
>
> Do we really need above line? kfree() is friendly to a NULL pointer,
> right?

OK. I will remove it.

>
>> +            goto free_pmu_cntr_evcap;
>> +        }
>> +        /*
>> +         * Set to the global capabilities, will adjust according
>> +         * to per-counter capabilities later.
>> +         */
>> +        for (j = 0; j < iommu_pmu->num_eg; j++)
>> +            iommu_pmu->cntr_evcap[i][j] = (u32)iommu_pmu->evcap[j];
>> +    }
>> +
>> +    iommu_pmu->cfg_reg = get_perf_reg_address(iommu,
>> DMAR_PERFCFGOFF_REG);
>> +    iommu_pmu->cntr_reg = get_perf_reg_address(iommu,
>> DMAR_PERFCNTROFF_REG);
>> +    iommu_pmu->overflow = get_perf_reg_address(iommu,
>> DMAR_PERFOVFOFF_REG);
>> +
>> +    /*
>> +     * Check per-counter capabilities
>> +     * All counters should have the same capabilities on
>> +     * Interrupt on Overflow Support and Counter Width
>> +     */
>
> Please re-org this comment and make it neat.

OK.

>
>> +    for (i = 0; i < iommu_pmu->num_cntr; i++) {
>> +        cap = dmar_readl(iommu_pmu->cfg_reg +
>> +                 i * IOMMU_PMU_CFG_OFFSET +
>> +                 IOMMU_PMU_CFG_CNTRCAP_OFFSET);
>> +        if (!iommu_cntrcap_pcc(cap))
>> +            continue;
>> +        if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) ||
>> +            !iommu_cntrcap_ios(cap))
>> +            iommu_pmu->num_cntr = i;
>
> Can you please add some words describing how you handle the
> inconsistent capability case? It seems that you just truncate the number
> of counters? Any rationale behind that?

In practice, all counters have the same capabilities for Interrupt on
Overflow Support and Counter Width. The current implementation is also
based on this assumption.

But it's possible that some counters have a different capability, e.g.,
because of a HW bug. We can check for such a corner case here and simply
drop those counters.

I will add some comments and print a warning for this rare case in V2.
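
Roughly something along these lines inside the per-counter loop
(illustrative only; the exact message and placement may change in V2).
Since the loop condition is i < iommu_pmu->num_cntr, setting num_cntr
to i drops the remaining counters:

	if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) ||
	    !iommu_cntrcap_ios(cap)) {
		/* All counters are expected to share these capabilities */
		pr_warn_once("Inconsistent counter capability, limit to %d counters\n", i);
		iommu_pmu->num_cntr = i;
	}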

Thanks,
Kan
>
>> +
>> +        /* Clear the pre-defined events group */
>> +        for (j = 0; j < iommu_pmu->num_eg; j++)
>> +            iommu_pmu->cntr_evcap[i][j] = 0;
>> +
>> +        /* Override with per-counter event capabilities */
>> +        for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) {
>> +            cap = dmar_readl(iommu_pmu->cfg_reg + i *
>> IOMMU_PMU_CFG_OFFSET +
>> +                     IOMMU_PMU_CFG_CNTREVCAP_OFFSET +
>> +                     (j * IOMMU_PMU_OFF_REGS_STEP));
>> +            iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] =
>> iommu_event_select(cap);
>> +            /*
>> +             * Some events may only be supported by a specific counter.
>> +             * Track them in the evcap as well.
>> +             */
>> +            iommu_pmu->evcap[iommu_event_group(cap)] |=
>> iommu_event_select(cap);
>> +        }
>> +    }
>> +
>> +    iommu_pmu->iommu = iommu;
>> +    iommu->pmu = iommu_pmu;
>> +
>> +    return 0;
>> +
>> +free_pmu_cntr_evcap:
>> +    for (i = 0; i < iommu_pmu->num_cntr; i++)
>> +        kfree(iommu_pmu->cntr_evcap[i]);
>> +    kfree(iommu_pmu->cntr_evcap);
>> +free_pmu_evcap:
>> +    kfree(iommu_pmu->evcap);
>> +free_pmu:
>> +    kfree(iommu_pmu);
>> +
>> +    return ret;
>> +}
>> +
>> +void free_iommu_pmu(struct intel_iommu *iommu)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu->pmu;
>> +
>> +    if (!iommu_pmu)
>> +        return;
>> +
>> +    if (iommu_pmu->evcap) {
>> +        int i;
>> +
>> +        for (i = 0; i < iommu_pmu->num_cntr; i++)
>> +            kfree(iommu_pmu->cntr_evcap[i]);
>> +        kfree(iommu_pmu->cntr_evcap);
>> +    }
>> +    kfree(iommu_pmu->evcap);
>> +    kfree(iommu_pmu);
>> +    iommu->pmu = NULL;
>> +}
>> diff --git a/drivers/iommu/intel/perfmon.h
>> b/drivers/iommu/intel/perfmon.h
>> new file mode 100644
>> index 000000000000..8587c80501cd
>> --- /dev/null
>> +++ b/drivers/iommu/intel/perfmon.h
>> @@ -0,0 +1,41 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +
>> +/*
>> + * PERFCFGOFF_REG, PERFFRZOFF_REG
>> + * PERFOVFOFF_REG, PERFCNTROFF_REG
>> + */
>> +#define IOMMU_PMU_NUM_OFF_REGS            4
>> +#define IOMMU_PMU_OFF_REGS_STEP            4
>> +
>> +#define IOMMU_PMU_CFG_OFFSET            0x100
>> +#define IOMMU_PMU_CFG_CNTRCAP_OFFSET        0x80
>> +#define IOMMU_PMU_CFG_CNTREVCAP_OFFSET        0x84
>> +#define IOMMU_PMU_CFG_SIZE            0x8
>> +#define IOMMU_PMU_CFG_FILTERS_OFFSET        0x4
>> +
>> +
>> +#define IOMMU_PMU_CAP_REGS_STEP            8
>> +
>> +#define iommu_cntrcap_pcc(p)            ((p) & 0x1)
>> +#define iommu_cntrcap_cw(p)            ((p >> 8) & 0xff)
>> +#define iommu_cntrcap_ios(p)            ((p >> 16) & 0x1)
>> +#define iommu_cntrcap_egcnt(p)            ((p >> 28) & 0xf)
>> +
>> +#define iommu_event_select(p)            ((p) & 0xfffffff)
>> +#define iommu_event_group(p)            ((p >> 28) & 0xf)
>> +
>> +#ifdef CONFIG_INTEL_IOMMU_PERF_EVENTS
>> +int alloc_iommu_pmu(struct intel_iommu *iommu);
>> +void free_iommu_pmu(struct intel_iommu *iommu);
>> +#else
>> +static inline int
>> +alloc_iommu_pmu(struct intel_iommu *iommu)
>> +{
>> +    return 0;
>> +}
>> +
>> +static inline void
>> +free_iommu_pmu(struct intel_iommu *iommu)
>> +{
>> +}
>> +#endif /* CONFIG_INTEL_IOMMU_PERF_EVENTS */
>
> --
> Best regards,
> baolu

2023-01-13 18:42:08

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH 3/7] iommu/vt-d: Support Enhanced Command Interface



On 2023-01-13 8:55 a.m., Baolu Lu wrote:
> On 2023/1/12 4:25, [email protected] wrote:
>> From: Kan Liang <[email protected]>
>>
>> The Enhanced Command Register is to submit command and operand of
>> enhanced commands to DMA Remapping hardware. It can supports upto 256
>> enhanced commands.
>>
>> There is a HW register to indicate the availability of all 256 enhanced
>> commands. Each bit stands for each command. But there isn't an existing
>> interface to read/write all 256 bits. Introduce the u64 ecmdcap[4] to
>> store the existence of each enhanced command. Read 4 times to get
>> all of them in map_iommu().
>>
>> Add a helper to facilitate an enhanced command launch. Make sure hardware
>> complete the command.
>>
>> Add a helper to facilitate the check of PMU essentials.
>>
>> The helpers will be used later.
>>
>> Signed-off-by: Kan Liang <[email protected]>
>> ---
>>   drivers/iommu/intel/dmar.c  | 63 +++++++++++++++++++++++++++++++++++++
>>   drivers/iommu/intel/iommu.h | 46 +++++++++++++++++++++++++++
>>   2 files changed, 109 insertions(+)
>>
>> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
>> index 91bb48267df2..dcafa32875c1 100644
>> --- a/drivers/iommu/intel/dmar.c
>> +++ b/drivers/iommu/intel/dmar.c
>> @@ -1025,6 +1025,16 @@ static int map_iommu(struct intel_iommu *iommu,
>> struct dmar_drhd_unit *drhd)
>>               goto release;
>>           }
>>       }
>> +
>> +    if (cap_ecmds(iommu->cap)) {
>> +        int i;
>> +
>> +        for (i = 0; i < DMA_MAX_NUM_ECMDCAP; i++) {
>> +            iommu->ecmdcap[i] = dmar_readq(iommu->reg + DMAR_ECCAP_REG +
>> +                               i * DMA_ECMD_REG_STEP);
>> +        }
>> +    }
>> +
>>       err = 0;
>>       goto out;
>>   @@ -2434,3 +2444,56 @@ bool dmar_platform_optin(void)
>>       return ret;
>>   }
>>   EXPORT_SYMBOL_GPL(dmar_platform_optin);
>> +
>> +#ifdef CONFIG_INTEL_IOMMU
>
> Why do you need above #ifdef?
>
>> +#define ecmd_get_status_code(res)    ((res & 0xff) >> 1)
>> +
>> +/*
>> + * Function to submit a command to the enhanced command interface. The
>> + * valid enhanced command descriptions are defined in Table 47 of the
>> + * VT-d spec. The VT-d hardware implementation may support some but not
>> + * all commands, which can be determined by checking the Enhanced
>> + * Command Capability Register.
>> + *
>> + * Return values:
>> + *  - 0: Command successful without any error;
>> + *  - Negative: software error value;
>> + *  - Nonzero positive: failure status code defined in Table 48.
>> + */
>> +int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
>> +             u64 oa, bool has_ob, u64 ob)
>> +{
>> +    unsigned long flags;
>> +    u64 res;
>> +    int ret;
>> +
>> +    if (!cap_ecmds(iommu->cap))
>> +        return -ENODEV;
>> +
>> +    raw_spin_lock_irqsave(&iommu->register_lock, flags);
>> +
>> +    res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
>> +    if (res & DMA_ECMD_ECRSP_IP) {
>> +        ret = -EBUSY;
>> +        goto err;
>> +    }
>> +
>> +    if (has_ob)
>> +        dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
>
> The ecmds that require an Operand B are statically defined in the spec,
> right? What will it look like if we define a static ignore_ob(ecmd)?
>
>> +    dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa <<
>> DMA_ECMD_OA_SHIFT));
>> +
>> +    IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
>> +              !(res & DMA_ECMD_ECRSP_IP), res);
>> +
>> +    if (res & DMA_ECMD_ECRSP_IP) {
>> +        ret = -ETIMEDOUT;
>> +        goto err;
>> +    }
>> +
>> +    ret = ecmd_get_status_code(res);
>> +err:
>> +    raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
>> +
>> +    return ret;
>> +}
>> +#endif /* CONFIG_INTEL_IOMMU */
>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>> index 5bcefbea55c9..f227107434ac 100644
>> --- a/drivers/iommu/intel/iommu.h
>> +++ b/drivers/iommu/intel/iommu.h
>> @@ -130,6 +130,10 @@
>>   #define DMAR_PERFOVFOFF_REG    0x318
>>   #define DMAR_PERFCNTROFF_REG    0x31c
>>   #define DMAR_PERFEVNTCAP_REG    0x380
>> +#define DMAR_ECMD_REG        0x400
>> +#define DMAR_ECEO_REG        0x408
>> +#define DMAR_ECRSP_REG        0x410
>> +#define DMAR_ECCAP_REG        0x430
>>   #define DMAR_VCCAP_REG        0xe30 /* Virtual command capability
>> register */
>>   #define DMAR_VCMD_REG        0xe00 /* Virtual command register */
>>   #define DMAR_VCRSP_REG        0xe10 /* Virtual command response
>> register */
>> @@ -304,6 +308,26 @@
>>   #define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
>>   #define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
>>   +/* ECMD_REG */
>> +#define DMA_MAX_NUM_ECMD        256
>> +#define DMA_MAX_NUM_ECMDCAP        (DMA_MAX_NUM_ECMD / 64)
>> +#define DMA_ECMD_REG_STEP        8
>> +#define DMA_ECMD_ENABLE            0xf0
>> +#define DMA_ECMD_DISABLE        0xf1
>> +#define DMA_ECMD_FREEZE            0xf4
>> +#define DMA_ECMD_UNFREEZE        0xf5
>> +#define DMA_ECMD_OA_SHIFT        16
>> +#define DMA_ECMD_ECRSP_IP        0x1
>> +#define DMA_ECMD_ECCAP3            3
>> +#define DMA_ECMD_ECCAP3_ECNTS        (1ULL << 48)
>> +#define DMA_ECMD_ECCAP3_DCNTS        (1ULL << 49)
>> +#define DMA_ECMD_ECCAP3_FCNTS        (1ULL << 52)
>> +#define DMA_ECMD_ECCAP3_UFCNTS        (1ULL << 53)
>> +#define DMA_ECMD_ECCAP3_ESSENTIAL    (DMA_ECMD_ECCAP3_ECNTS |    \
>> +                     DMA_ECMD_ECCAP3_DCNTS |    \
>> +                     DMA_ECMD_ECCAP3_FCNTS |    \
>> +                     DMA_ECMD_ECCAP3_UFCNTS)
>> +
>>   /* FECTL_REG */
>>   #define DMA_FECTL_IM (((u32)1) << 31)
>>   @@ -600,6 +624,7 @@ struct intel_iommu {
>>       u64        cap;
>>       u64        ecap;
>>       u64        vccap;
>> +    u64        ecmdcap[DMA_MAX_NUM_ECMDCAP];
>>       u32        gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
>>       raw_spinlock_t    register_lock; /* protect register handling */
>>       int        seq_id;    /* sequence id of the iommu */
>> @@ -841,6 +866,15 @@ extern const struct iommu_ops intel_iommu_ops;
>>   extern int intel_iommu_sm;
>>   extern int iommu_calculate_agaw(struct intel_iommu *iommu);
>>   extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
>> +extern int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
>> +                u64 oa, bool has_ob, u64 ob);
>> +
>> +static inline bool ecmd_has_pmu_essential(struct intel_iommu *iommu)
>> +{
>> +    return (iommu->ecmdcap[DMA_ECMD_ECCAP3] &
>> DMA_ECMD_ECCAP3_ESSENTIAL) ==
>> +        DMA_ECMD_ECCAP3_ESSENTIAL;
>> +}
>> +
>>   extern int dmar_disabled;
>>   extern int intel_iommu_enabled;
>>   #else
>> @@ -852,6 +886,18 @@ static inline int
>> iommu_calculate_max_sagaw(struct intel_iommu *iommu)
>>   {
>>       return 0;
>>   }
>> +
>> +static inline int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
>> +                   u64 oa, bool has_ob, u64 ob)
>> +{
>> +    return -EOPNOTSUPP;
>> +}
>> +
>> +static inline bool ecmd_has_pmu_essential(struct intel_iommu *iommu)
>> +{
>> +    return false;
>> +}
>
> Is there any need to call these helpers when INTEL_IOMMU is not
> configured?


No, I will remove them from non-INTEL_IOMMU.

Thanks,
Kan

>
>> +
>>   #define dmar_disabled    (1)
>>   #define intel_iommu_enabled (0)
>>   #define intel_iommu_sm (0)
>
> --
> Best regards,
> baolu
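
For illustration, a caller could map the three classes of return values
documented above onto behaviour roughly like this (a sketch only, not code
from the series):

/*
 * Sketch: enable the counter at 'idx' through the Enhanced Command
 * Interface and translate the documented return-value classes.
 */
static int pmu_enable_counter_example(struct intel_iommu *iommu, unsigned int idx)
{
	int ret = ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, idx, false, 0);

	if (ret < 0)		/* software error, e.g. -EBUSY or -ETIMEDOUT */
		return ret;
	if (ret)		/* positive: hardware status code from Table 48 */
		return -EIO;

	return 0;		/* command completed without error */
}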

2023-01-13 19:11:14

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH 3/7] iommu/vt-d: Support Enhanced Command Interface



On 2023-01-13 9:12 a.m., Baolu Lu wrote:
> On 2023/1/13 21:55, Baolu Lu wrote:
>>> +/*
>>> + * Function to submit a command to the enhanced command interface. The
>>> + * valid enhanced command descriptions are defined in Table 47 of the
>>> + * VT-d spec. The VT-d hardware implementation may support some but not
>>> + * all commands, which can be determined by checking the Enhanced
>>> + * Command Capability Register.
>>> + *
>>> + * Return values:
>>> + *  - 0: Command successful without any error;
>>> + *  - Negative: software error value;
>>> + *  - Nonzero positive: failure status code defined in Table 48.
>>> + */
>>> +int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd,
>>> +             u64 oa, bool has_ob, u64 ob)
>>> +{
>>> +    unsigned long flags;
>>> +    u64 res;
>>> +    int ret;
>>> +
>>> +    if (!cap_ecmds(iommu->cap))
>>> +        return -ENODEV;
>>> +
>>> +    raw_spin_lock_irqsave(&iommu->register_lock, flags);
>>> +
>>> +    res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
>>> +    if (res & DMA_ECMD_ECRSP_IP) {
>>> +        ret = -EBUSY;
>>> +        goto err;
>>> +    }
>>> +
>>> +    if (has_ob)
>>> +        dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
>>
>> The ecmds that require an Operand B are statically defined in the spec,
>> right? What will it look like if we define a static ignore_ob(ecmd)?
>

If so, I think we would have to maintain a table of ecmds in ignore_ob()
and check the given ecmd at runtime, right?
That sounds hard to maintain and inefficient as more and more ecmds
are introduced.

> Or simply remove the has_ob parameter? The worst case is an unnecessary
> write to a register. That's fine as far as I can see, since we should avoid
> using it in any critical path.

I was told in the internal review that an MMIO write may trigger a VM
exit when running in a guest. We should avoid such unnecessary MMIO writes.

For the PMU, right, I don't think we use it on a critical path. The PMU is
currently the only user of ecmd, so I think the extra MMIO write can be
tolerated.

I will remove has_ob and add some comments in V2.

Thanks,
Kan

>
> --
> Best regards,
> baolu
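
For context, the kind of table being discussed might look roughly like the
sketch below (purely hypothetical; the command list would have to track
Table 47 of the spec, which is exactly the maintenance concern raised above):

/* Hypothetical: true if the enhanced command takes no Operand B. */
static bool ignore_ob(u8 ecmd)
{
	switch (ecmd) {
	case DMA_ECMD_ENABLE:
	case DMA_ECMD_DISABLE:
	case DMA_ECMD_FREEZE:
	case DMA_ECMD_UNFREEZE:
		/* the PMU control commands pass has_ob == false today */
		return true;
	default:
		/* every newly supported ecmd would need an entry here */
		return false;
	}
}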

2023-01-14 09:12:08

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH 4/7] iommu/vt-d: Add IOMMU perfmon support

On 2023/1/12 4:25, [email protected] wrote:
> From: Kan Liang <[email protected]>
>
> Implement the IOMMU performance monitor capability, which supports the
> collection of information about key events occurring during operation of
> the remapping hardware, to aid performance tuning and debug.
>
> The IOMMU perfmon support is implemented as part of the IOMMU driver and
> interfaces with the Linux perf subsystem.
>
> The IOMMU PMU has the following unique features compared with the other
> PMUs.
> - Supports counting; does not support sampling.
> - Does not support per-thread counting. The scope is system-wide.
> - Supports a per-counter capability register, so the event constraints can
> be enumerated.
> - The available events and event groups can also be enumerated.
> - Extra Enhanced Commands are introduced to control the counters.
>
> Add a new variable, struct iommu_pmu *pmu, to the struct intel_iommu
> to track the PMU related information.
>
> Add iommu_pmu_register() and iommu_pmu_unregister() to register and
> unregister an IOMMU PMU. The register function sets up the IOMMU PMU ops
> and invokes the standard perf_pmu_register() interface to register a PMU
> in the perf subsystem. This patch only exposes the functions. The
> following patch will enable them in the IOMMU driver.
>
> The IOMMU PMUs can be found under /sys/bus/event_source/devices/dmar*
>
> The available filters and event format can be found at the format folder
> $ ls /sys/bus/event_source/devices/dmar0/format/
> event event_group filter_ats filter_page_table
>
> The supported events can be found at the events folder
>
> $ ls /sys/bus/event_source/devices/dmar0/events/
> ats_blocked int_cache_hit_nonposted iommu_mrds
> pasid_cache_lookup
> ctxt_cache_hit int_cache_hit_posted iommu_requests
> pg_req_posted
> ctxt_cache_lookup int_cache_lookup iotlb_hit
> pw_occupancy
> fs_nonleaf_hit iommu_clocks iotlb_lookup
> ss_nonleaf_hit
> fs_nonleaf_lookup iommu_mem_blocked pasid_cache_hit
> ss_nonleaf_lookup
>
> The command below illustrates filter usage with a simple example.
>
> $ perf stat -e dmar0/iommu_requests,filter_ats=0/ -a sleep 1
>
> Performance counter stats for 'system wide':
>
> 2135 dmar0/iommu_requests,filter_ats=0/
>
> 1.001087695 seconds time elapsed
>
> Signed-off-by: Kan Liang <[email protected]>
> ---
> .../sysfs-bus-event_source-devices-iommu | 24 +
> drivers/iommu/intel/iommu.h | 15 +
> drivers/iommu/intel/perfmon.c | 496 ++++++++++++++++++
> drivers/iommu/intel/perfmon.h | 24 +
> 4 files changed, 559 insertions(+)
> create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
>
> diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
> new file mode 100644
> index 000000000000..04e08851d8e6
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
> @@ -0,0 +1,24 @@
> +What: /sys/bus/event_source/devices/dmar*/format
> +Date: Jan 2023
> +KernelVersion: 6.3
> +Contact: Kan Liang <[email protected]>
> +Description: Read-only. Attribute group to describe the magic bits
> + that go into perf_event_attr.config,
> + perf_event_attr.config1 or perf_event_attr.config2 for
> + the IOMMU pmu. (See also
> + ABI/testing/sysfs-bus-event_source-devices-format).
> +
> + Each attribute in this group defines a bit range in
> + perf_event_attr.config, perf_event_attr.config1,
> + or perf_event_attr.config2. All supported attributes
> + are listed below (See the VT-d Spec 4.0 for possible
> + attribute values)::
> +
> + event = "config:0-27" - event ID
> + event_group = "config:28-31" - event group ID
> +
> + filter_requester_id = "config1:0-15" - Requester ID filter
> + filter_domain = "config1:16-31" - Domain ID filter
> + filter_pasid = "config1:32-53" - PASID filter

Just out of curiosity, PCI PASID is 20-bit wide, why do you need 22 bits
in config1 encoding?

> + filter_ats = "config2:0-4" - Address Type filter
> + filter_page_table = "config2:8-12" - Page Table Level filter
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index f227107434ac..bbc5dda903e9 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -22,6 +22,7 @@
> #include <linux/ioasid.h>
> #include <linux/bitfield.h>
> #include <linux/xarray.h>
> +#include <linux/perf_event.h>
>
> #include <asm/cacheflush.h>
> #include <asm/iommu.h>
> @@ -601,6 +602,16 @@ struct dmar_domain {
> iommu core */
> };
>
> +/*
> + * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
> + * But in practice, there are only 14 counters for the existing
> + * platform. Setting the max number of counters to 64 should be good
> + * enough for a long time. Also, supporting more than 64 counters
> + * requires more extras, e.g., extra freeze and overflow registers,
> + * which is not necessary for now.
> + */
> +#define IOMMU_PMU_IDX_MAX 64
> +
> struct iommu_pmu {
> struct intel_iommu *iommu;
> u32 num_cntr; /* Number of counters */
> @@ -615,6 +626,10 @@ struct iommu_pmu {
>
> u64 *evcap; /* Indicates all supported events */
> u32 **cntr_evcap; /* Supported events of each counter. */
> +
> + struct pmu pmu;
> + DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
> + struct perf_event *event_list[IOMMU_PMU_IDX_MAX];
> };
>
> struct intel_iommu {
> diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
> index bc090f329c32..43a5075eaecd 100644
> --- a/drivers/iommu/intel/perfmon.c
> +++ b/drivers/iommu/intel/perfmon.c
> @@ -8,6 +8,475 @@
> #include "iommu.h"
> #include "perfmon.h"
>
> +PMU_FORMAT_ATTR(event, "config:0-27"); /* ES: Events Select */
> +PMU_FORMAT_ATTR(event_group, "config:28-31"); /* EGI: Event Group Index */
> +
> +static struct attribute *iommu_pmu_format_attrs[] = {
> + &format_attr_event_group.attr,
> + &format_attr_event.attr,
> + NULL
> +};
> +
> +static struct attribute_group iommu_pmu_format_attr_group = {
> + .name = "format",
> + .attrs = iommu_pmu_format_attrs,
> +};
> +
> +/* The available events are added in attr_update later */
> +static struct attribute *attrs_empty[] = {
> + NULL
> +};
> +
> +static struct attribute_group iommu_pmu_events_attr_group = {
> + .name = "events",
> + .attrs = attrs_empty,
> +};
> +
> +static const struct attribute_group *iommu_pmu_attr_groups[] = {
> + &iommu_pmu_format_attr_group,
> + &iommu_pmu_events_attr_group,
> + NULL
> +};
> +
> +static inline struct iommu_pmu *dev_to_iommu_pmu(struct device *dev)
> +{
> + /*
> + * The perf_event creates its own dev for each PMU.
> + * See pmu_dev_alloc()
> + */
> + return container_of(dev_get_drvdata(dev), struct iommu_pmu, pmu);
> +}
> +
> +#define IOMMU_PMU_ATTR(_name, _format, _filter) \
> + PMU_FORMAT_ATTR(_name, _format); \
> + \
> +static struct attribute *_name##_attr[] = { \
> + &format_attr_##_name.attr, \
> + NULL \
> +}; \
> + \
> +static umode_t \
> +_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i) \
> +{ \
> + struct device *dev = kobj_to_dev(kobj); \
> + struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev); \
> + \
> + if (!iommu_pmu) \
> + return 0; \
> + return (iommu_pmu->filter & _filter) ? attr->mode : 0; \
> +} \
> + \
> +static struct attribute_group _name = { \
> + .name = "format", \
> + .attrs = _name##_attr, \
> + .is_visible = _name##_is_visible, \
> +};
> +
> +IOMMU_PMU_ATTR(filter_requester_id, "config1:0-15", IOMMU_PMU_FILTER_REQUESTER_ID);
> +IOMMU_PMU_ATTR(filter_domain, "config1:16-31", IOMMU_PMU_FILTER_DOMAIN);
> +IOMMU_PMU_ATTR(filter_pasid, "config1:32-53", IOMMU_PMU_FILTER_PASID);
> +IOMMU_PMU_ATTR(filter_ats, "config2:0-4", IOMMU_PMU_FILTER_ATS);
> +IOMMU_PMU_ATTR(filter_page_table, "config2:8-12", IOMMU_PMU_FILTER_PAGE_TABLE);
> +
> +#define iommu_pmu_get_requester_id(filter) ((filter) & 0xffff)
> +#define iommu_pmu_get_domain(filter) (((filter) >> 16) & 0xffff)
> +#define iommu_pmu_get_pasid(filter) (((filter) >> 32) & 0x3fffff)
> +#define iommu_pmu_get_ats(filter) ((filter) & 0x1f)
> +#define iommu_pmu_get_page_table(filter) (((filter) >> 8) & 0x1f)
> +
> +#define iommu_pmu_set_filter(_name, _config, _filter, _idx) \
> +{ \
> + if ((iommu_pmu->filter & _filter) && iommu_pmu_get_##_name(_config)) { \
> + dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \
> + IOMMU_PMU_CFG_SIZE + \
> + (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \
> + iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN);\
> + } \
> +}
> +
> +#define iommu_pmu_clear_filter(_filter, _idx) \
> +{ \
> + if (iommu_pmu->filter & _filter) { \
> + dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \
> + IOMMU_PMU_CFG_SIZE + \
> + (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \
> + 0); \
> + } \
> +}
> +
> +/*
> + * Define the event attr related functions
> + * Input: _name: event attr name
> + * _string: string of the event in sysfs
> + * _g_idx: event group encoding
> + * _event: event encoding
> + */
> +#define IOMMU_PMU_EVENT_ATTR(_name, _string, _g_idx, _event) \
> + PMU_EVENT_ATTR_STRING(_name, event_attr_##_name, _string) \
> + \
> +static struct attribute *_name##_attr[] = { \
> + &event_attr_##_name.attr.attr, \
> + NULL \
> +}; \
> + \
> +static umode_t \
> +_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i) \
> +{ \
> + struct device *dev = kobj_to_dev(kobj); \
> + struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev); \
> + \
> + if (!iommu_pmu) \
> + return 0; \
> + return (iommu_pmu->evcap[_g_idx] & _event) ? attr->mode : 0; \
> +} \
> + \
> +static struct attribute_group _name = { \
> + .name = "events", \
> + .attrs = _name##_attr, \
> + .is_visible = _name##_is_visible, \
> +};
> +
> +IOMMU_PMU_EVENT_ATTR(iommu_clocks, "event_group=0x0,event=0x001", 0x0, 0x001)
> +IOMMU_PMU_EVENT_ATTR(iommu_requests, "event_group=0x0,event=0x002", 0x0, 0x002)
> +IOMMU_PMU_EVENT_ATTR(pw_occupancy, "event_group=0x0,event=0x004", 0x0, 0x004)
> +IOMMU_PMU_EVENT_ATTR(ats_blocked, "event_group=0x0,event=0x008", 0x0, 0x008)
> +IOMMU_PMU_EVENT_ATTR(iommu_mrds, "event_group=0x1,event=0x001", 0x1, 0x001)
> +IOMMU_PMU_EVENT_ATTR(iommu_mem_blocked, "event_group=0x1,event=0x020", 0x1, 0x020)
> +IOMMU_PMU_EVENT_ATTR(pg_req_posted, "event_group=0x1,event=0x040", 0x1, 0x040)
> +IOMMU_PMU_EVENT_ATTR(ctxt_cache_lookup, "event_group=0x2,event=0x001", 0x2, 0x001)
> +IOMMU_PMU_EVENT_ATTR(ctxt_cache_hit, "event_group=0x2,event=0x002", 0x2, 0x002)
> +IOMMU_PMU_EVENT_ATTR(pasid_cache_lookup, "event_group=0x2,event=0x004", 0x2, 0x004)
> +IOMMU_PMU_EVENT_ATTR(pasid_cache_hit, "event_group=0x2,event=0x008", 0x2, 0x008)
> +IOMMU_PMU_EVENT_ATTR(ss_nonleaf_lookup, "event_group=0x2,event=0x010", 0x2, 0x010)
> +IOMMU_PMU_EVENT_ATTR(ss_nonleaf_hit, "event_group=0x2,event=0x020", 0x2, 0x020)
> +IOMMU_PMU_EVENT_ATTR(fs_nonleaf_lookup, "event_group=0x2,event=0x040", 0x2, 0x040)
> +IOMMU_PMU_EVENT_ATTR(fs_nonleaf_hit, "event_group=0x2,event=0x080", 0x2, 0x080)
> +IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_lookup, "event_group=0x2,event=0x100", 0x2, 0x100)
> +IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_hit, "event_group=0x2,event=0x200", 0x2, 0x200)
> +IOMMU_PMU_EVENT_ATTR(iotlb_lookup, "event_group=0x3,event=0x001", 0x3, 0x001)
> +IOMMU_PMU_EVENT_ATTR(iotlb_hit, "event_group=0x3,event=0x002", 0x3, 0x002)
> +IOMMU_PMU_EVENT_ATTR(hpt_leaf_lookup, "event_group=0x3,event=0x004", 0x3, 0x004)
> +IOMMU_PMU_EVENT_ATTR(hpt_leaf_hit, "event_group=0x3,event=0x008", 0x3, 0x008)
> +IOMMU_PMU_EVENT_ATTR(int_cache_lookup, "event_group=0x4,event=0x001", 0x4, 0x001)
> +IOMMU_PMU_EVENT_ATTR(int_cache_hit_nonposted, "event_group=0x4,event=0x002", 0x4, 0x002)
> +IOMMU_PMU_EVENT_ATTR(int_cache_hit_posted, "event_group=0x4,event=0x004", 0x4, 0x004)
> +
> +
> +static const struct attribute_group *iommu_pmu_attr_update[] = {
> + &filter_requester_id,
> + &filter_domain,
> + &filter_pasid,
> + &filter_ats,
> + &filter_page_table,
> + &iommu_clocks,
> + &iommu_requests,
> + &pw_occupancy,
> + &ats_blocked,
> + &iommu_mrds,
> + &iommu_mem_blocked,
> + &pg_req_posted,
> + &ctxt_cache_lookup,
> + &ctxt_cache_hit,
> + &pasid_cache_lookup,
> + &pasid_cache_hit,
> + &ss_nonleaf_lookup,
> + &ss_nonleaf_hit,
> + &fs_nonleaf_lookup,
> + &fs_nonleaf_hit,
> + &hpt_nonleaf_lookup,
> + &hpt_nonleaf_hit,
> + &iotlb_lookup,
> + &iotlb_hit,
> + &hpt_leaf_lookup,
> + &hpt_leaf_hit,
> + &int_cache_lookup,
> + &int_cache_hit_nonposted,
> + &int_cache_hit_posted,
> + NULL
> +};
> +
> +static inline void __iomem *
> +iommu_event_base(struct iommu_pmu *iommu_pmu, int idx)
> +{
> + return iommu_pmu->cntr_reg + idx * iommu_pmu->cntr_stride;
> +}
> +
> +static inline void __iomem *
> +iommu_config_base(struct iommu_pmu *iommu_pmu, int idx)
> +{
> + return iommu_pmu->cfg_reg + idx * IOMMU_PMU_CFG_OFFSET;
> +}
> +
> +static inline struct iommu_pmu *iommu_event_to_pmu(struct perf_event *event)
> +{
> + return container_of(event->pmu, struct iommu_pmu, pmu);
> +}
> +
> +static inline u64 iommu_event_config(struct perf_event *event)
> +{
> + u64 config = event->attr.config;
> +
> + return (iommu_event_select(config) << IOMMU_EVENT_CFG_ES_SHIFT) |
> + (iommu_event_group(config) << IOMMU_EVENT_CFG_EGI_SHIFT) |
> + IOMMU_EVENT_CFG_INT;
> +}
> +
> +static inline bool is_iommu_pmu_event(struct iommu_pmu *iommu_pmu,
> + struct perf_event *event)
> +{
> + return event->pmu == &iommu_pmu->pmu;
> +}
> +
> +static int iommu_pmu_validate_event(struct perf_event *event)
> +{
> + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
> + u32 event_group = iommu_event_group(event->attr.config);
> +
> + if (event_group >= iommu_pmu->num_eg)
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> +static int iommu_pmu_validate_group(struct perf_event *event)
> +{
> + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
> + struct perf_event *sibling;
> + int nr = 0;
> +
> + /*
> + * All events in a group must be scheduled simultaneously.
> + * Check whether there are enough counters for all the events.
> + */
> + for_each_sibling_event(sibling, event->group_leader) {
> + if (!is_iommu_pmu_event(iommu_pmu, sibling) ||
> + sibling->state <= PERF_EVENT_STATE_OFF)
> + continue;
> +
> + if (++nr > iommu_pmu->num_cntr)
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static int iommu_pmu_event_init(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> +
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + /* sampling not supported */
> + if (event->attr.sample_period)
> + return -EINVAL;
> +
> + if (event->cpu < 0)
> + return -EINVAL;
> +
> + if (iommu_pmu_validate_event(event))
> + return -EINVAL;
> +
> + hwc->config = iommu_event_config(event);
> +
> + return iommu_pmu_validate_group(event);
> +}
> +
> +static void iommu_pmu_event_update(struct perf_event *event)
> +{
> + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
> + struct hw_perf_event *hwc = &event->hw;
> + u64 prev_count, new_count, delta;
> + int shift = 64 - iommu_pmu->cntr_width;
> +
> +again:
> + prev_count = local64_read(&hwc->prev_count);
> + new_count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
> + if (local64_xchg(&hwc->prev_count, new_count) != prev_count)
> + goto again;
> +
> + /*
> + * The counter width is enumerated.
> + * Always shift the counter before using it.
> + */

Re-org above comments and make it neat.

> + delta = (new_count << shift) - (prev_count << shift);
> + delta >>= shift;
> +
> + local64_add(delta, &event->count);
> +}
> +
> +static void iommu_pmu_start(struct perf_event *event, int flags)
> +{
> + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
> + struct intel_iommu *iommu = iommu_pmu->iommu;
> + struct hw_perf_event *hwc = &event->hw;
> + u64 count;
> +
> + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
> + return;
> +
> + if (WARN_ON_ONCE(hwc->idx < 0 || hwc->idx >= IOMMU_PMU_IDX_MAX))
> + return;
> +
> + if (flags & PERF_EF_RELOAD)
> + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
> +
> + hwc->state = 0;
> +
> + /* Always reprogram the period */
> + count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
> + local64_set((&hwc->prev_count), count);
> +
> + ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, false, 0);

What happens when ecmd_submit_sync() returns an error? How should we
handle this case? The same question applies to other places in this patch.
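
One possible shape for such handling (an illustration only, not what the
patch currently does) would be to check the return value and warn once:

	/* Sketch: complain (once) if the hardware rejects the enable. */
	if (ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, false, 0))
		pr_warn_once("dmar: failed to enable perfmon counter %d\n",
			     hwc->idx);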

> +
> + perf_event_update_userpage(event);
> +}
> +
> +static void iommu_pmu_stop(struct perf_event *event, int flags)
> +{
> + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
> + struct intel_iommu *iommu = iommu_pmu->iommu;
> + struct hw_perf_event *hwc = &event->hw;
> +
> + if (!(hwc->state & PERF_HES_STOPPED)) {
> + ecmd_submit_sync(iommu, DMA_ECMD_DISABLE, hwc->idx, false, 0);
> +
> + iommu_pmu_event_update(event);
> +
> + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> + }
> +}
> +
> +static inline int
> +iommu_pmu_validate_per_cntr_event(struct iommu_pmu *iommu_pmu,
> + int idx, struct perf_event *event)
> +{
> + u32 event_group = iommu_event_group(event->attr.config);
> + u32 select = iommu_event_select(event->attr.config);
> +
> + if (!(iommu_pmu->cntr_evcap[idx][event_group] & select))
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> +static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu,
> + struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + int idx;
> +
> + /*
> + * The counters which support limited events are usually at the end.
> + * Schedule them first to accommodate more events.
> + */
> + for (idx = iommu_pmu->num_cntr - 1; idx >= 0; idx--) {
> + if (test_and_set_bit(idx, iommu_pmu->used_mask))
> + continue;
> + /* Check per-counter event capabilities */
> + if (!iommu_pmu_validate_per_cntr_event(iommu_pmu, idx, event))
> + break;
> + clear_bit(idx, iommu_pmu->used_mask);
> + }
> + if (idx < 0)
> + return -EINVAL;
> +
> + iommu_pmu->event_list[idx] = event;
> + hwc->idx = idx;
> +
> + /* config events */
> + dmar_writeq(iommu_config_base(iommu_pmu, idx), hwc->config);
> +
> + iommu_pmu_set_filter(requester_id, event->attr.config1,
> + IOMMU_PMU_FILTER_REQUESTER_ID, idx);
> + iommu_pmu_set_filter(domain, event->attr.config1,
> + IOMMU_PMU_FILTER_DOMAIN, idx);
> + iommu_pmu_set_filter(pasid, event->attr.config1,
> + IOMMU_PMU_FILTER_PASID, idx);
> + iommu_pmu_set_filter(ats, event->attr.config2,
> + IOMMU_PMU_FILTER_ATS, idx);
> + iommu_pmu_set_filter(page_table, event->attr.config2,
> + IOMMU_PMU_FILTER_PAGE_TABLE, idx);
> +
> + return 0;
> +}
> +
> +static int iommu_pmu_add(struct perf_event *event, int flags)
> +{
> + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
> + struct hw_perf_event *hwc = &event->hw;
> + int ret;
> +
> + ret = iommu_pmu_assign_event(iommu_pmu, event);
> + if (ret < 0)
> + return ret;
> +
> + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
> +
> + if (flags & PERF_EF_START)
> + iommu_pmu_start(event, 0);
> +
> + return 0;
> +}
> +
> +static void iommu_pmu_del(struct perf_event *event, int flags)
> +{
> + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
> + int idx = event->hw.idx;
> +
> + iommu_pmu_stop(event, PERF_EF_UPDATE);
> +
> + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_REQUESTER_ID, idx);
> + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_DOMAIN, idx);
> + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PASID, idx);
> + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_ATS, idx);
> + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PAGE_TABLE, idx);
> +
> + iommu_pmu->event_list[idx] = NULL;
> + event->hw.idx = -1;
> + clear_bit(idx, iommu_pmu->used_mask);
> +
> + perf_event_update_userpage(event);
> +}
> +
> +static void iommu_pmu_enable(struct pmu *pmu)
> +{
> + struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
> + struct intel_iommu *iommu = iommu_pmu->iommu;
> +
> + ecmd_submit_sync(iommu, DMA_ECMD_UNFREEZE, 0, false, 0);
> +}
> +
> +static void iommu_pmu_disable(struct pmu *pmu)
> +{
> + struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
> + struct intel_iommu *iommu = iommu_pmu->iommu;
> +
> + ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
> +}
> +
> +static int __iommu_pmu_register(struct intel_iommu *iommu)
> +{
> + struct iommu_pmu *iommu_pmu = iommu->pmu;
> +
> + iommu_pmu->pmu.name = iommu->name;
> + iommu_pmu->pmu.task_ctx_nr = perf_invalid_context;
> + iommu_pmu->pmu.event_init = iommu_pmu_event_init;
> + iommu_pmu->pmu.pmu_enable = iommu_pmu_enable;
> + iommu_pmu->pmu.pmu_disable = iommu_pmu_disable;
> + iommu_pmu->pmu.add = iommu_pmu_add;
> + iommu_pmu->pmu.del = iommu_pmu_del;
> + iommu_pmu->pmu.start = iommu_pmu_start;
> + iommu_pmu->pmu.stop = iommu_pmu_stop;
> + iommu_pmu->pmu.read = iommu_pmu_event_update;
> + iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups;
> + iommu_pmu->pmu.attr_update = iommu_pmu_attr_update;
> + iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
> + iommu_pmu->pmu.module = THIS_MODULE;
> +
> + return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
> +}
> +
> static inline void __iomem *
> get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
> {
> @@ -40,11 +509,21 @@ int alloc_iommu_pmu(struct intel_iommu *iommu)
> if (!pcap_interrupt(perfcap))
> return -ENODEV;
>
> + /* Check required Enhanced Command Capability */
> + if (!ecmd_has_pmu_essential(iommu))
> + return -ENODEV;
> +
> iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
> if (!iommu_pmu)
> return -ENOMEM;
>
> iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
> + if (iommu_pmu->num_cntr > IOMMU_PMU_IDX_MAX) {
> + WARN_ONCE(1, "The number of IOMMU counters %d > max(%d), clipping!",
> + iommu_pmu->num_cntr, IOMMU_PMU_IDX_MAX);

Do we really need a kernel trace here? This isn't a software flaw;
perhaps using pr_info() to let users know about this is enough?

> + iommu_pmu->num_cntr = IOMMU_PMU_IDX_MAX;
> + }
> +
> iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
> iommu_pmu->filter = pcap_filters_mask(perfcap);
> iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
> @@ -157,3 +636,20 @@ void free_iommu_pmu(struct intel_iommu *iommu)
> kfree(iommu_pmu);
> iommu->pmu = NULL;
> }
> +
> +void iommu_pmu_register(struct intel_iommu *iommu)
> +{
> + if (!iommu->pmu)
> + return;
> +
> + if (__iommu_pmu_register(iommu)) {
> + pr_err("Failed to register PMU for iommu (seq_id = %d)\n",
> + iommu->seq_id);

What will happen if iommu_pmu_unregister() is called after
__iommu_pmu_register() has returned an error?
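
One way to make that safe (an illustration only) is to tear the PMU down on
failure, so that the NULL check in iommu_pmu_unregister() covers this case:

	/* Sketch: drop the PMU on registration failure so a later
	 * iommu_pmu_unregister() sees iommu->pmu == NULL and does nothing. */
	if (__iommu_pmu_register(iommu)) {
		pr_err("Failed to register PMU for iommu (seq_id = %d)\n",
		       iommu->seq_id);
		free_iommu_pmu(iommu);
	}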

> + }
> +}
> +
> +void iommu_pmu_unregister(struct intel_iommu *iommu)
> +{
> + if (iommu->pmu)
> + perf_pmu_unregister(&iommu->pmu->pmu);
> +}
> diff --git a/drivers/iommu/intel/perfmon.h b/drivers/iommu/intel/perfmon.h
> index 8587c80501cd..b60f0cad5bfd 100644
> --- a/drivers/iommu/intel/perfmon.h
> +++ b/drivers/iommu/intel/perfmon.h
> @@ -7,6 +7,14 @@
> #define IOMMU_PMU_NUM_OFF_REGS 4
> #define IOMMU_PMU_OFF_REGS_STEP 4
>
> +#define IOMMU_PMU_FILTER_REQUESTER_ID 0x01
> +#define IOMMU_PMU_FILTER_DOMAIN 0x02
> +#define IOMMU_PMU_FILTER_PASID 0x04
> +#define IOMMU_PMU_FILTER_ATS 0x08
> +#define IOMMU_PMU_FILTER_PAGE_TABLE 0x10
> +
> +#define IOMMU_PMU_FILTER_EN (1 << 31)
> +
> #define IOMMU_PMU_CFG_OFFSET 0x100
> #define IOMMU_PMU_CFG_CNTRCAP_OFFSET 0x80
> #define IOMMU_PMU_CFG_CNTREVCAP_OFFSET 0x84
> @@ -21,12 +29,18 @@
> #define iommu_cntrcap_ios(p) ((p >> 16) & 0x1)
> #define iommu_cntrcap_egcnt(p) ((p >> 28) & 0xf)
>
> +#define IOMMU_EVENT_CFG_EGI_SHIFT 8
> +#define IOMMU_EVENT_CFG_ES_SHIFT 32
> +#define IOMMU_EVENT_CFG_INT (1ULL << 1)
> +
> #define iommu_event_select(p) ((p) & 0xfffffff)
> #define iommu_event_group(p) ((p >> 28) & 0xf)
>
> #ifdef CONFIG_INTEL_IOMMU_PERF_EVENTS
> int alloc_iommu_pmu(struct intel_iommu *iommu);
> void free_iommu_pmu(struct intel_iommu *iommu);
> +void iommu_pmu_register(struct intel_iommu *iommu);
> +void iommu_pmu_unregister(struct intel_iommu *iommu);
> #else
> static inline int
> alloc_iommu_pmu(struct intel_iommu *iommu)
> @@ -38,4 +52,14 @@ static inline void
> free_iommu_pmu(struct intel_iommu *iommu)
> {
> }
> +
> +static inline void
> +iommu_pmu_register(struct intel_iommu *iommu)
> +{
> +}
> +
> +static inline void
> +iommu_pmu_unregister(struct intel_iommu *iommu)
> +{
> +}
> #endif /* CONFIG_INTEL_IOMMU_PERF_EVENTS */

--
Best regards,
baolu
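
A side note on the counter-width handling in iommu_pmu_event_update() above:
because the counter width is enumerated at runtime, both raw register values
are shifted up to the top of the 64-bit word before subtracting, so a counter
wrap still produces the correct unsigned delta. A minimal worked example,
assuming a 48-bit counter (shift = 64 - 48 = 16):

	u64 prev_count = 0xffffffffffffULL;	/* counter about to wrap   */
	u64 new_count  = 0x000000000005ULL;	/* wrapped, 6 events later */
	int shift = 64 - 48;

	u64 delta = (new_count << shift) - (prev_count << shift);
	delta >>= shift;			/* delta == 6, as expected */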

2023-01-14 11:11:29

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH 6/7] iommu/vt-d: Add IOMMU perfmon overflow handler support

On 2023/1/12 4:25, [email protected] wrote:
> From: Kan Liang <[email protected]>
>
> While a counter is enabled to count events, an event occurrence that causes
> the counter value to increment and roll over to or past zero is termed a
> counter overflow. The overflow can trigger an interrupt. The IOMMU
> perfmon needs to handle the case properly.
>
> New HW IRQs are allocated for each IOMMU device for perfmon. The IRQ IDs
> are after the SVM range.
>
> In the overflow handler, the counter is not frozen. It's very unlikely
> that the same counter overflows again during the period. But it's
> possible that other counters overflow at the same time. Read the
> overflow register at the end of the handler and check whether there are
> more.
>
> Signed-off-by: Kan Liang <[email protected]>
> ---
> drivers/iommu/intel/dmar.c | 2 +
> drivers/iommu/intel/iommu.h | 11 ++++-
> drivers/iommu/intel/perfmon.c | 82 +++++++++++++++++++++++++++++++++++
> drivers/iommu/intel/svm.c | 2 +-
> 4 files changed, 95 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
> index dcafa32875c1..94e314320cd9 100644
> --- a/drivers/iommu/intel/dmar.c
> +++ b/drivers/iommu/intel/dmar.c
> @@ -1887,6 +1887,8 @@ static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
> return DMAR_FECTL_REG;
> else if (iommu->pr_irq == irq)
> return DMAR_PECTL_REG;
> + else if (iommu->perf_irq == irq)
> + return DMAR_PERFINTRCTL_REG;
> else
> BUG();
> }
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index bbc5dda903e9..f616e4f3d765 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -130,6 +130,8 @@
> #define DMAR_PERFCFGOFF_REG 0x310
> #define DMAR_PERFOVFOFF_REG 0x318
> #define DMAR_PERFCNTROFF_REG 0x31c
> +#define DMAR_PERFINTRSTS_REG 0x324
> +#define DMAR_PERFINTRCTL_REG 0x328
> #define DMAR_PERFEVNTCAP_REG 0x380
> #define DMAR_ECMD_REG 0x400
> #define DMAR_ECEO_REG 0x408
> @@ -357,6 +359,9 @@
>
> #define DMA_VCS_PAS ((u64)1)
>
> +/* PERFINTRSTS_REG */
> +#define DMA_PERFINTRSTS_PIS ((u32)1)
> +
> #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
> do { \
> cycles_t start_time = get_cycles(); \
> @@ -630,8 +635,12 @@ struct iommu_pmu {
> struct pmu pmu;
> DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
> struct perf_event *event_list[IOMMU_PMU_IDX_MAX];
> + unsigned char irq_name[16];
> };
>
> +#define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED)
> +#define IOMMU_IRQ_ID_OFFSET_PERF (2 * DMAR_UNITS_SUPPORTED)
> +
> struct intel_iommu {
> void __iomem *reg; /* Pointer to hardware regs, virtual addr */
> u64 reg_phys; /* physical address of hw register set */
> @@ -645,7 +654,7 @@ struct intel_iommu {
> int seq_id; /* sequence id of the iommu */
> int agaw; /* agaw of this iommu */
> int msagaw; /* max sagaw of this iommu */
> - unsigned int irq, pr_irq;
> + unsigned int irq, pr_irq, perf_irq;
> u16 segment; /* PCI segment# */
> unsigned char name[13]; /* Device Name */
>
> diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
> index f332232bb345..d0fbf784c827 100644
> --- a/drivers/iommu/intel/perfmon.c
> +++ b/drivers/iommu/intel/perfmon.c
> @@ -476,6 +476,49 @@ static void iommu_pmu_disable(struct pmu *pmu)
> ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
> }
>
> +static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
> +{
> + struct perf_event *event;
> + int i, handled = 0;
> + u64 status;
> +
> + /*
> + * Two counters may be overflowed very close.
> + * Always check whether there are more to handle.
> + */

Keep comment style consistent, like this

/*
* Two counters may be overflowed very close. Always check whether
* there are more to handle.
*/

Same to other places.

> + while ((status = dmar_readq(iommu_pmu->overflow))) {

Unnecessary double brackets?

> + for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) {
> + handled++;

"handled" isn't used anywhere. Please cleanup it.

> +
> + /*
> + * Find the assigned event of the counter.
> + * Accumulate the value into the event->count.
> + */
> + event = iommu_pmu->event_list[i];
> + if (WARN_ON_ONCE(!event))

Again, do we need a kernel trace here? This is only because the hardware
> reported a wrong event ID, right? How about a pr_warn() to let the user
> know about this?

> + continue;
> + iommu_pmu_event_update(event);
> + }
> +
> + dmar_writeq(iommu_pmu->overflow, status);
> + }
> +}
> +
> +static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
> +{
> + struct intel_iommu *iommu = dev_id;
> +
> + if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
> + return IRQ_NONE;
> +
> + iommu_pmu_counter_overflow(iommu->pmu);
> +
> + /* Clear the status bit */
> + dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);
> +
> + return IRQ_HANDLED;
> +}
> +
> static int __iommu_pmu_register(struct intel_iommu *iommu)
> {
> struct iommu_pmu *iommu_pmu = iommu->pmu;
> @@ -658,6 +701,38 @@ void free_iommu_pmu(struct intel_iommu *iommu)
> iommu->pmu = NULL;
> }
>
> +static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
> +{
> + struct iommu_pmu *iommu_pmu = iommu->pmu;
> + int irq, ret;
> +
> + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu);
> + if (irq <= 0)
> + return -EINVAL;
> +
> + snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id);
> +
> + iommu->perf_irq = irq;
> + ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
> + IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
> + if (ret) {
> + dmar_free_hwirq(irq);
> + iommu->perf_irq = 0;
> + return ret;
> + }
> + return 0;
> +}
> +
> +static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
> +{
> + if (!iommu->perf_irq)
> + return;
> +
> + free_irq(iommu->perf_irq, iommu);
> + dmar_free_hwirq(iommu->perf_irq);
> + iommu->perf_irq = 0;
> +}
> +
> static int iommu_pmu_cpu_online(unsigned int cpu)
> {
> if (cpumask_empty(&iommu_pmu_cpu_mask))
> @@ -734,8 +809,14 @@ void iommu_pmu_register(struct intel_iommu *iommu)
> if (iommu_pmu_cpuhp_setup(iommu_pmu))
> goto unregister;
>
> + /* Set interrupt for overflow */
> + if (iommu_pmu_set_interrupt(iommu))
> + goto cpuhp_free;
> +
> return;
>
> +cpuhp_free:
> + iommu_pmu_cpuhp_free(iommu_pmu);
> unregister:
> perf_pmu_unregister(&iommu_pmu->pmu);
> err:
> @@ -749,6 +830,7 @@ void iommu_pmu_unregister(struct intel_iommu *iommu)
> if (!iommu_pmu)
> return;
>
> + iommu_pmu_unset_interrupt(iommu);
> iommu_pmu_cpuhp_free(iommu_pmu);
> perf_pmu_unregister(&iommu_pmu->pmu);
> }
> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> index c76b66263467..b6c5edd80d5d 100644
> --- a/drivers/iommu/intel/svm.c
> +++ b/drivers/iommu/intel/svm.c
> @@ -79,7 +79,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
> }
> iommu->prq = page_address(pages);
>
> - irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
> + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu);
> if (irq <= 0) {
> pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
> iommu->name);

--
Best regards,
baolu
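
For reference, the hwirq ID space that the new IOMMU_IRQ_ID_OFFSET_* macros
carve up can be pictured as below (assuming, as before this patch, that the
fault-event IRQ of each unit uses its seq_id directly):

	/*
	 * N = DMAR_UNITS_SUPPORTED
	 *
	 *   [0,  N)   fault-event IRQ:      seq_id
	 *   [N, 2N)   page-request IRQ:     IOMMU_IRQ_ID_OFFSET_PRQ  + seq_id
	 *   [2N,3N)   perfmon overflow IRQ: IOMMU_IRQ_ID_OFFSET_PERF + seq_id
	 *
	 * e.g. if DMAR_UNITS_SUPPORTED were 1024, dmar3's perfmon IRQ would
	 * get hwirq ID 2 * 1024 + 3 = 2051.
	 */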

2023-01-16 15:29:06

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH 6/7] iommu/vt-d: Add IOMMU perfmon overflow handler support



On 2023-01-14 6:05 a.m., Baolu Lu wrote:
> On 2023/1/12 4:25, [email protected] wrote:
>> From: Kan Liang <[email protected]>
>>
>> While a counter is enabled to count events, an event occurrence that causes
>> the counter value to increment and roll over to or past zero is termed a
>> counter overflow. The overflow can trigger an interrupt. The IOMMU
>> perfmon needs to handle the case properly.
>>
>> New HW IRQs are allocated for each IOMMU device for perfmon. The IRQ IDs
>> are after the SVM range.
>>
>> In the overflow handler, the counter is not frozen. It's very unlikely
>> that the same counter overflows again during the period. But it's
>> possible that other counters overflow at the same time. Read the
>> overflow register at the end of the handler and check whether there are
>> more.
>>
>> Signed-off-by: Kan Liang <[email protected]>
>> ---
>>   drivers/iommu/intel/dmar.c    |  2 +
>>   drivers/iommu/intel/iommu.h   | 11 ++++-
>>   drivers/iommu/intel/perfmon.c | 82 +++++++++++++++++++++++++++++++++++
>>   drivers/iommu/intel/svm.c     |  2 +-
>>   4 files changed, 95 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
>> index dcafa32875c1..94e314320cd9 100644
>> --- a/drivers/iommu/intel/dmar.c
>> +++ b/drivers/iommu/intel/dmar.c
>> @@ -1887,6 +1887,8 @@ static inline int dmar_msi_reg(struct
>> intel_iommu *iommu, int irq)
>>           return DMAR_FECTL_REG;
>>       else if (iommu->pr_irq == irq)
>>           return DMAR_PECTL_REG;
>> +    else if (iommu->perf_irq == irq)
>> +        return DMAR_PERFINTRCTL_REG;
>>       else
>>           BUG();
>>   }
>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>> index bbc5dda903e9..f616e4f3d765 100644
>> --- a/drivers/iommu/intel/iommu.h
>> +++ b/drivers/iommu/intel/iommu.h
>> @@ -130,6 +130,8 @@
>>   #define DMAR_PERFCFGOFF_REG    0x310
>>   #define DMAR_PERFOVFOFF_REG    0x318
>>   #define DMAR_PERFCNTROFF_REG    0x31c
>> +#define DMAR_PERFINTRSTS_REG    0x324
>> +#define DMAR_PERFINTRCTL_REG    0x328
>>   #define DMAR_PERFEVNTCAP_REG    0x380
>>   #define DMAR_ECMD_REG        0x400
>>   #define DMAR_ECEO_REG        0x408
>> @@ -357,6 +359,9 @@
>>     #define DMA_VCS_PAS    ((u64)1)
>>   +/* PERFINTRSTS_REG */
>> +#define DMA_PERFINTRSTS_PIS    ((u32)1)
>> +
>>   #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)            \
>>   do {                                    \
>>       cycles_t start_time = get_cycles();                \
>> @@ -630,8 +635,12 @@ struct iommu_pmu {
>>       struct pmu        pmu;
>>       DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
>>       struct perf_event    *event_list[IOMMU_PMU_IDX_MAX];
>> +    unsigned char        irq_name[16];
>>   };
>>   +#define IOMMU_IRQ_ID_OFFSET_PRQ        (DMAR_UNITS_SUPPORTED)
>> +#define IOMMU_IRQ_ID_OFFSET_PERF    (2 * DMAR_UNITS_SUPPORTED)
>> +
>>   struct intel_iommu {
>>       void __iomem    *reg; /* Pointer to hardware regs, virtual addr */
>>       u64         reg_phys; /* physical address of hw register set */
>> @@ -645,7 +654,7 @@ struct intel_iommu {
>>       int        seq_id;    /* sequence id of the iommu */
>>       int        agaw; /* agaw of this iommu */
>>       int        msagaw; /* max sagaw of this iommu */
>> -    unsigned int     irq, pr_irq;
>> +    unsigned int    irq, pr_irq, perf_irq;
>>       u16        segment;     /* PCI segment# */
>>       unsigned char     name[13];    /* Device Name */
>>   diff --git a/drivers/iommu/intel/perfmon.c
>> b/drivers/iommu/intel/perfmon.c
>> index f332232bb345..d0fbf784c827 100644
>> --- a/drivers/iommu/intel/perfmon.c
>> +++ b/drivers/iommu/intel/perfmon.c
>> @@ -476,6 +476,49 @@ static void iommu_pmu_disable(struct pmu *pmu)
>>       ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
>>   }
>>   +static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
>> +{
>> +    struct perf_event *event;
>> +    int i, handled = 0;
>> +    u64 status;
>> +
>> +    /*
>> +     * Two counters may be overflowed very close.
>> +     * Always check whether there are more to handle.
>> +     */
>
> Keep comment style consistent, like this
>
>         /*
>          * Two counters may be overflowed very close. Always check whether
>          * there are more to handle.
>          */
>
> Same to other places.

Sure.

>
>> +    while ((status = dmar_readq(iommu_pmu->overflow))) {
>
> Unnecessary double brackets?
>
>> +        for_each_set_bit(i, (unsigned long *)&status,
>> iommu_pmu->num_cntr) {
>> +            handled++;
>
> "handled" isn't used anywhere. Please cleanup it.
>

Sure.

>> +
>> +            /*
>> +             * Find the assigned event of the counter.
>> +             * Accumulate the value into the event->count.
>> +             */
>> +            event = iommu_pmu->event_list[i];
>> +            if (WARN_ON_ONCE(!event))
>
> Again, do we need a kernel trace here? This is only because the hardware
> reported a wrong event ID, right? How about a pr_warn() to let the user
> know about this?

The hardware reported ID doesn't match the SW records. It's hard to say
which one is correct. But I guess pr_warn_once() may be enough. I will
change it in V2.

Thanks,
Kan

>
>> +                continue;
>> +            iommu_pmu_event_update(event);
>> +        }
>> +
>> +        dmar_writeq(iommu_pmu->overflow, status);
>> +    }
>> +}
>> +
>> +static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
>> +{
>> +    struct intel_iommu *iommu = dev_id;
>> +
>> +    if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
>> +        return IRQ_NONE;
>> +
>> +    iommu_pmu_counter_overflow(iommu->pmu);
>> +
>> +    /* Clear the status bit */
>> +    dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);
>> +
>> +    return IRQ_HANDLED;
>> +}
>> +
>>   static int __iommu_pmu_register(struct intel_iommu *iommu)
>>   {
>>       struct iommu_pmu *iommu_pmu = iommu->pmu;
>> @@ -658,6 +701,38 @@ void free_iommu_pmu(struct intel_iommu *iommu)
>>       iommu->pmu = NULL;
>>   }
>>   +static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu->pmu;
>> +    int irq, ret;
>> +
>> +    irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id,
>> iommu->node, iommu);
>> +    if (irq <= 0)
>> +        return -EINVAL;
>> +
>> +    snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name),
>> "dmar%d-perf", iommu->seq_id);
>> +
>> +    iommu->perf_irq = irq;
>> +    ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
>> +                   IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
>> +    if (ret) {
>> +        dmar_free_hwirq(irq);
>> +        iommu->perf_irq = 0;
>> +        return ret;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
>> +{
>> +    if (!iommu->perf_irq)
>> +        return;
>> +
>> +    free_irq(iommu->perf_irq, iommu);
>> +    dmar_free_hwirq(iommu->perf_irq);
>> +    iommu->perf_irq = 0;
>> +}
>> +
>>   static int iommu_pmu_cpu_online(unsigned int cpu)
>>   {
>>       if (cpumask_empty(&iommu_pmu_cpu_mask))
>> @@ -734,8 +809,14 @@ void iommu_pmu_register(struct intel_iommu *iommu)
>>       if (iommu_pmu_cpuhp_setup(iommu_pmu))
>>           goto unregister;
>>   +    /* Set interrupt for overflow */
>> +    if (iommu_pmu_set_interrupt(iommu))
>> +        goto cpuhp_free;
>> +
>>       return;
>>   +cpuhp_free:
>> +    iommu_pmu_cpuhp_free(iommu_pmu);
>>   unregister:
>>       perf_pmu_unregister(&iommu_pmu->pmu);
>>   err:
>> @@ -749,6 +830,7 @@ void iommu_pmu_unregister(struct intel_iommu *iommu)
>>       if (!iommu_pmu)
>>           return;
>>   +    iommu_pmu_unset_interrupt(iommu);
>>       iommu_pmu_cpuhp_free(iommu_pmu);
>>       perf_pmu_unregister(&iommu_pmu->pmu);
>>   }
>> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
>> index c76b66263467..b6c5edd80d5d 100644
>> --- a/drivers/iommu/intel/svm.c
>> +++ b/drivers/iommu/intel/svm.c
>> @@ -79,7 +79,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
>>       }
>>       iommu->prq = page_address(pages);
>>   -    irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id,
>> iommu->node, iommu);
>> +    irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id,
>> iommu->node, iommu);
>>       if (irq <= 0) {
>>           pr_err("IOMMU: %s: Failed to create IRQ vector for page
>> request queue\n",
>>                  iommu->name);
>
> --
> Best regards,
> baolu
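
For what it's worth, the V2 change mentioned above could look roughly like
this inside the overflow loop (a sketch, assuming the loop structure stays
as posted):

		event = iommu_pmu->event_list[i];
		if (!event) {
			/* HW overflow bit with no matching SW event */
			pr_warn_once("Cannot find the assigned event for counter %d\n", i);
			continue;
		}
		iommu_pmu_event_update(event);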

2023-01-16 15:33:56

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH 4/7] iommu/vt-d: Add IOMMU perfmon support



On 2023-01-14 4:00 a.m., Baolu Lu wrote:
> On 2023/1/12 4:25, [email protected] wrote:
>> From: Kan Liang <[email protected]>
>>
>> Implement the IOMMU performance monitor capability, which supports the
>> collection of information about key events occurring during operation of
>> the remapping hardware, to aid performance tuning and debug.
>>
>> The IOMMU perfmon support is implemented as part of the IOMMU driver and
>> interfaces with the Linux perf subsystem.
>>
>> The IOMMU PMU has the following unique features compared with the other
>> PMUs.
>> - Supports counting; does not support sampling.
>> - Does not support per-thread counting. The scope is system-wide.
>> - Supports a per-counter capability register, so the event constraints can
>>    be enumerated.
>> - The available events and event groups can also be enumerated.
>> - Extra Enhanced Commands are introduced to control the counters.
>>
>> Add a new variable, struct iommu_pmu *pmu, to the struct intel_iommu
>> to track the PMU related information.
>>
>> Add iommu_pmu_register() and iommu_pmu_unregister() to register and
>> unregister an IOMMU PMU. The register function sets up the IOMMU PMU ops
>> and invokes the standard perf_pmu_register() interface to register a PMU
>> in the perf subsystem. This patch only exposes the functions. The
>> following patch will enable them in the IOMMU driver.
>>
>> The IOMMU PMUs can be found under /sys/bus/event_source/devices/dmar*
>>
>> The available filters and event format can be found at the format folder
>>   $ ls /sys/bus/event_source/devices/dmar0/format/
>> event  event_group  filter_ats  filter_page_table
>>
>> The supported events can be found at the events folder
>>
>>   $ ls /sys/bus/event_source/devices/dmar0/events/
>> ats_blocked        int_cache_hit_nonposted  iommu_mrds
>> pasid_cache_lookup
>> ctxt_cache_hit     int_cache_hit_posted     iommu_requests
>> pg_req_posted
>> ctxt_cache_lookup  int_cache_lookup         iotlb_hit
>> pw_occupancy
>> fs_nonleaf_hit     iommu_clocks             iotlb_lookup
>> ss_nonleaf_hit
>> fs_nonleaf_lookup  iommu_mem_blocked        pasid_cache_hit
>> ss_nonleaf_lookup
>>
>> The command below illustrates filter usage with a simple example.
>>
>>   $ perf stat -e dmar0/iommu_requests,filter_ats=0/ -a sleep 1
>>
>>   Performance counter stats for 'system wide':
>>
>>                2135      dmar0/iommu_requests,filter_ats=0/
>>
>>         1.001087695 seconds time elapsed
>>
>> Signed-off-by: Kan Liang <[email protected]>
>> ---
>>   .../sysfs-bus-event_source-devices-iommu      |  24 +
>>   drivers/iommu/intel/iommu.h                   |  15 +
>>   drivers/iommu/intel/perfmon.c                 | 496 ++++++++++++++++++
>>   drivers/iommu/intel/perfmon.h                 |  24 +
>>   4 files changed, 559 insertions(+)
>>   create mode 100644
>> Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
>>
>> diff --git
>> a/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
>> b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
>> new file mode 100644
>> index 000000000000..04e08851d8e6
>> --- /dev/null
>> +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
>> @@ -0,0 +1,24 @@
>> +What:        /sys/bus/event_source/devices/dmar*/format
>> +Date:        Jan 2023
>> +KernelVersion:  6.3
>> +Contact:    Kan Liang <[email protected]>
>> +Description:    Read-only.  Attribute group to describe the magic bits
>> +        that go into perf_event_attr.config,
>> +        perf_event_attr.config1 or perf_event_attr.config2 for
>> +        the IOMMU pmu.  (See also
>> +        ABI/testing/sysfs-bus-event_source-devices-format).
>> +
>> +        Each attribute in this group defines a bit range in
>> +        perf_event_attr.config, perf_event_attr.config1,
>> +        or perf_event_attr.config2. All supported attributes
>> +        are listed below (See the VT-d Spec 4.0 for possible
>> +        attribute values)::
>> +
>> +            event        = "config:0-27"   - event ID
>> +            event_group        = "config:28-31"  - event group ID
>> +
>> +            filter_requester_id    = "config1:0-15"  - Requester ID
>> filter
>> +            filter_domain    = "config1:16-31" - Domain ID filter
>> +            filter_pasid    = "config1:32-53" - PASID filter
>
> Just out of curiosity, PCI PASID is 20-bit wide, why do you need 22 bits
> in config1 encoding?

The extra 2 bits are for PASID Filter Mode.

Please see Figure 11-60. Performance Monitoring PASID Filter
Configuration Registers of VT-d spec.
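
In other words, the 22-bit config1 field carries more than the bare PASID.
A rough way to picture the split (an assumption for illustration only; the
authoritative layout is Figure 11-60, assuming the 20-bit PASID occupies the
low bits of the field and the mode bits sit above it):

	/* Hypothetical helpers, for illustration only. */
	#define EXAMPLE_PASID_VALUE(cfg1) (iommu_pmu_get_pasid(cfg1) & 0xfffff) /* config1:32-51 */
	#define EXAMPLE_PASID_MODE(cfg1)  (iommu_pmu_get_pasid(cfg1) >> 20)     /* config1:52-53 */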

>
>> +            filter_ats        = "config2:0-4"   - Address Type filter
>> +            filter_page_table    = "config2:8-12"  - Page Table Level
>> filter
>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>> index f227107434ac..bbc5dda903e9 100644
>> --- a/drivers/iommu/intel/iommu.h
>> +++ b/drivers/iommu/intel/iommu.h
>> @@ -22,6 +22,7 @@
>>   #include <linux/ioasid.h>
>>   #include <linux/bitfield.h>
>>   #include <linux/xarray.h>
>> +#include <linux/perf_event.h>
>>     #include <asm/cacheflush.h>
>>   #include <asm/iommu.h>
>> @@ -601,6 +602,16 @@ struct dmar_domain {
>>                          iommu core */
>>   };
>>   +/*
>> + * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
>> + * But in practice, there are only 14 counters for the existing
>> + * platform. Setting the max number of counters to 64 should be good
>> + * enough for a long time. Also, supporting more than 64 counters
>> + * requires more extras, e.g., extra freeze and overflow registers,
>> + * which is not necessary for now.
>> + */
>> +#define IOMMU_PMU_IDX_MAX        64
>> +
>>   struct iommu_pmu {
>>       struct intel_iommu    *iommu;
>>       u32            num_cntr;    /* Number of counters */
>> @@ -615,6 +626,10 @@ struct iommu_pmu {
>>         u64            *evcap;        /* Indicates all supported
>> events */
>>       u32            **cntr_evcap;    /* Supported events of each
>> counter. */
>> +
>> +    struct pmu        pmu;
>> +    DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
>> +    struct perf_event    *event_list[IOMMU_PMU_IDX_MAX];
>>   };
>>     struct intel_iommu {
>> diff --git a/drivers/iommu/intel/perfmon.c
>> b/drivers/iommu/intel/perfmon.c
>> index bc090f329c32..43a5075eaecd 100644
>> --- a/drivers/iommu/intel/perfmon.c
>> +++ b/drivers/iommu/intel/perfmon.c
>> @@ -8,6 +8,475 @@
>>   #include "iommu.h"
>>   #include "perfmon.h"
>>   +PMU_FORMAT_ATTR(event,        "config:0-27");        /* ES: Events
>> Select */
>> +PMU_FORMAT_ATTR(event_group,    "config:28-31");    /* EGI: Event
>> Group Index */
>> +
>> +static struct attribute *iommu_pmu_format_attrs[] = {
>> +    &format_attr_event_group.attr,
>> +    &format_attr_event.attr,
>> +    NULL
>> +};
>> +
>> +static struct attribute_group iommu_pmu_format_attr_group = {
>> +    .name = "format",
>> +    .attrs = iommu_pmu_format_attrs,
>> +};
>> +
>> +/* The available events are added in attr_update later */
>> +static struct attribute *attrs_empty[] = {
>> +    NULL
>> +};
>> +
>> +static struct attribute_group iommu_pmu_events_attr_group = {
>> +    .name = "events",
>> +    .attrs = attrs_empty,
>> +};
>> +
>> +static const struct attribute_group *iommu_pmu_attr_groups[] = {
>> +    &iommu_pmu_format_attr_group,
>> +    &iommu_pmu_events_attr_group,
>> +    NULL
>> +};
>> +
>> +static inline struct iommu_pmu *dev_to_iommu_pmu(struct device *dev)
>> +{
>> +    /*
>> +     * The perf_event creates its own dev for each PMU.
>> +     * See pmu_dev_alloc()
>> +     */
>> +    return container_of(dev_get_drvdata(dev), struct iommu_pmu, pmu);
>> +}
>> +
>> +#define IOMMU_PMU_ATTR(_name, _format, _filter)                \
>> +    PMU_FORMAT_ATTR(_name, _format);                \
>> +                                    \
>> +static struct attribute *_name##_attr[] = {                \
>> +    &format_attr_##_name.attr,                    \
>> +    NULL                                \
>> +};                                    \
>> +                                    \
>> +static umode_t                                \
>> +_name##_is_visible(struct kobject *kobj, struct attribute *attr, int
>> i)    \
>> +{                                    \
>> +    struct device *dev = kobj_to_dev(kobj);                \
>> +    struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev);        \
>> +                                    \
>> +    if (!iommu_pmu)                            \
>> +        return 0;                        \
>> +    return (iommu_pmu->filter & _filter) ? attr->mode : 0;        \
>> +}                                    \
>> +                                    \
>> +static struct attribute_group _name = {                    \
>> +    .name        = "format",                    \
>> +    .attrs        = _name##_attr,                    \
>> +    .is_visible    = _name##_is_visible,                \
>> +};
>> +
>> +IOMMU_PMU_ATTR(filter_requester_id,    "config1:0-15",        IOMMU_PMU_FILTER_REQUESTER_ID);
>> +IOMMU_PMU_ATTR(filter_domain,        "config1:16-31",    IOMMU_PMU_FILTER_DOMAIN);
>> +IOMMU_PMU_ATTR(filter_pasid,        "config1:32-53",    IOMMU_PMU_FILTER_PASID);
>> +IOMMU_PMU_ATTR(filter_ats,        "config2:0-4",        IOMMU_PMU_FILTER_ATS);
>> +IOMMU_PMU_ATTR(filter_page_table,    "config2:8-12",        IOMMU_PMU_FILTER_PAGE_TABLE);
>> +
>> +#define iommu_pmu_get_requester_id(filter)    ((filter) & 0xffff)
>> +#define iommu_pmu_get_domain(filter)        (((filter) >> 16) & 0xffff)
>> +#define iommu_pmu_get_pasid(filter)        (((filter) >> 32) & 0x3fffff)
>> +#define iommu_pmu_get_ats(filter)        ((filter) & 0x1f)
>> +#define iommu_pmu_get_page_table(filter)    (((filter) >> 8) & 0x1f)
>> +
>> +#define iommu_pmu_set_filter(_name, _config, _filter, _idx)            \
>> +{                                        \
>> +    if ((iommu_pmu->filter & _filter) && iommu_pmu_get_##_name(_config)) {    \
>> +        dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET +    \
>> +                IOMMU_PMU_CFG_SIZE +                \
>> +                (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET,    \
>> +                iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN);\
>> +    }                                    \
>> +}
>> +
>> +#define iommu_pmu_clear_filter(_filter, _idx)                    \
>> +{                                        \
>> +    if (iommu_pmu->filter & _filter) {                    \
>> +        dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET +    \
>> +                IOMMU_PMU_CFG_SIZE +                \
>> +                (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET,    \
>> +                0);                            \
>> +    }                                    \
>> +}
>> +
>> +/*
>> + * Define the event attr related functions
>> + * Input: _name: event attr name
>> + *        _string: string of the event in sysfs
>> + *        _g_idx: event group encoding
>> + *        _event: event encoding
>> + */
>> +#define IOMMU_PMU_EVENT_ATTR(_name, _string, _g_idx, _event)            \
>> +    PMU_EVENT_ATTR_STRING(_name, event_attr_##_name, _string)        \
>> +                                        \
>> +static struct attribute *_name##_attr[] = {                    \
>> +    &event_attr_##_name.attr.attr,                        \
>> +    NULL                                    \
>> +};                                        \
>> +                                        \
>> +static umode_t                                    \
>> +_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i)        \
>> +{                                        \
>> +    struct device *dev = kobj_to_dev(kobj);                    \
>> +    struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev);            \
>> +                                        \
>> +    if (!iommu_pmu)                                \
>> +        return 0;                            \
>> +    return (iommu_pmu->evcap[_g_idx] & _event) ? attr->mode : 0;        \
>> +}                                        \
>> +                                        \
>> +static struct attribute_group _name = {                        \
>> +    .name        = "events",                        \
>> +    .attrs        = _name##_attr,                        \
>> +    .is_visible    = _name##_is_visible,                    \
>> +};
>> +
>> +IOMMU_PMU_EVENT_ATTR(iommu_clocks,        "event_group=0x0,event=0x001", 0x0, 0x001)
>> +IOMMU_PMU_EVENT_ATTR(iommu_requests,        "event_group=0x0,event=0x002", 0x0, 0x002)
>> +IOMMU_PMU_EVENT_ATTR(pw_occupancy,        "event_group=0x0,event=0x004", 0x0, 0x004)
>> +IOMMU_PMU_EVENT_ATTR(ats_blocked,        "event_group=0x0,event=0x008", 0x0, 0x008)
>> +IOMMU_PMU_EVENT_ATTR(iommu_mrds,        "event_group=0x1,event=0x001", 0x1, 0x001)
>> +IOMMU_PMU_EVENT_ATTR(iommu_mem_blocked,        "event_group=0x1,event=0x020", 0x1, 0x020)
>> +IOMMU_PMU_EVENT_ATTR(pg_req_posted,        "event_group=0x1,event=0x040", 0x1, 0x040)
>> +IOMMU_PMU_EVENT_ATTR(ctxt_cache_lookup,        "event_group=0x2,event=0x001", 0x2, 0x001)
>> +IOMMU_PMU_EVENT_ATTR(ctxt_cache_hit,        "event_group=0x2,event=0x002", 0x2, 0x002)
>> +IOMMU_PMU_EVENT_ATTR(pasid_cache_lookup,    "event_group=0x2,event=0x004", 0x2, 0x004)
>> +IOMMU_PMU_EVENT_ATTR(pasid_cache_hit,        "event_group=0x2,event=0x008", 0x2, 0x008)
>> +IOMMU_PMU_EVENT_ATTR(ss_nonleaf_lookup,        "event_group=0x2,event=0x010", 0x2, 0x010)
>> +IOMMU_PMU_EVENT_ATTR(ss_nonleaf_hit,        "event_group=0x2,event=0x020", 0x2, 0x020)
>> +IOMMU_PMU_EVENT_ATTR(fs_nonleaf_lookup,        "event_group=0x2,event=0x040", 0x2, 0x040)
>> +IOMMU_PMU_EVENT_ATTR(fs_nonleaf_hit,        "event_group=0x2,event=0x080", 0x2, 0x080)
>> +IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_lookup,    "event_group=0x2,event=0x100", 0x2, 0x100)
>> +IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_hit,        "event_group=0x2,event=0x200", 0x2, 0x200)
>> +IOMMU_PMU_EVENT_ATTR(iotlb_lookup,        "event_group=0x3,event=0x001", 0x3, 0x001)
>> +IOMMU_PMU_EVENT_ATTR(iotlb_hit,            "event_group=0x3,event=0x002", 0x3, 0x002)
>> +IOMMU_PMU_EVENT_ATTR(hpt_leaf_lookup,        "event_group=0x3,event=0x004", 0x3, 0x004)
>> +IOMMU_PMU_EVENT_ATTR(hpt_leaf_hit,        "event_group=0x3,event=0x008", 0x3, 0x008)
>> +IOMMU_PMU_EVENT_ATTR(int_cache_lookup,        "event_group=0x4,event=0x001", 0x4, 0x001)
>> +IOMMU_PMU_EVENT_ATTR(int_cache_hit_nonposted,    "event_group=0x4,event=0x002", 0x4, 0x002)
>> +IOMMU_PMU_EVENT_ATTR(int_cache_hit_posted,    "event_group=0x4,event=0x004", 0x4, 0x004)
>> +
>> +
>> +static const struct attribute_group *iommu_pmu_attr_update[] = {
>> +    &filter_requester_id,
>> +    &filter_domain,
>> +    &filter_pasid,
>> +    &filter_ats,
>> +    &filter_page_table,
>> +    &iommu_clocks,
>> +    &iommu_requests,
>> +    &pw_occupancy,
>> +    &ats_blocked,
>> +    &iommu_mrds,
>> +    &iommu_mem_blocked,
>> +    &pg_req_posted,
>> +    &ctxt_cache_lookup,
>> +    &ctxt_cache_hit,
>> +    &pasid_cache_lookup,
>> +    &pasid_cache_hit,
>> +    &ss_nonleaf_lookup,
>> +    &ss_nonleaf_hit,
>> +    &fs_nonleaf_lookup,
>> +    &fs_nonleaf_hit,
>> +    &hpt_nonleaf_lookup,
>> +    &hpt_nonleaf_hit,
>> +    &iotlb_lookup,
>> +    &iotlb_hit,
>> +    &hpt_leaf_lookup,
>> +    &hpt_leaf_hit,
>> +    &int_cache_lookup,
>> +    &int_cache_hit_nonposted,
>> +    &int_cache_hit_posted,
>> +    NULL
>> +};
>> +
>> +static inline void __iomem *
>> +iommu_event_base(struct iommu_pmu *iommu_pmu, int idx)
>> +{
>> +    return iommu_pmu->cntr_reg + idx * iommu_pmu->cntr_stride;
>> +}
>> +
>> +static inline void __iomem *
>> +iommu_config_base(struct iommu_pmu *iommu_pmu, int idx)
>> +{
>> +    return iommu_pmu->cfg_reg + idx * IOMMU_PMU_CFG_OFFSET;
>> +}
>> +
>> +static inline struct iommu_pmu *iommu_event_to_pmu(struct perf_event *event)
>> +{
>> +    return container_of(event->pmu, struct iommu_pmu, pmu);
>> +}
>> +
>> +static inline u64 iommu_event_config(struct perf_event *event)
>> +{
>> +    u64 config = event->attr.config;
>> +
>> +    return (iommu_event_select(config) << IOMMU_EVENT_CFG_ES_SHIFT) |
>> +           (iommu_event_group(config) << IOMMU_EVENT_CFG_EGI_SHIFT) |
>> +           IOMMU_EVENT_CFG_INT;
>> +}
>> +
>> +static inline bool is_iommu_pmu_event(struct iommu_pmu *iommu_pmu,
>> +                      struct perf_event *event)
>> +{
>> +    return event->pmu == &iommu_pmu->pmu;
>> +}
>> +
>> +static int iommu_pmu_validate_event(struct perf_event *event)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
>> +    u32 event_group = iommu_event_group(event->attr.config);
>> +
>> +    if (event_group >= iommu_pmu->num_eg)
>> +        return -EINVAL;
>> +
>> +    return 0;
>> +}
>> +
>> +static int iommu_pmu_validate_group(struct perf_event *event)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
>> +    struct perf_event *sibling;
>> +    int nr = 0;
>> +
>> +    /*
>> +     * All events in a group must be scheduled simultaneously.
>> +     * Check whether there is enough counters for all the events.
>> +     */
>> +    for_each_sibling_event(sibling, event->group_leader) {
>> +        if (!is_iommu_pmu_event(iommu_pmu, sibling) ||
>> +            sibling->state <= PERF_EVENT_STATE_OFF)
>> +            continue;
>> +
>> +        if (++nr > iommu_pmu->num_cntr)
>> +            return -EINVAL;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int iommu_pmu_event_init(struct perf_event *event)
>> +{
>> +    struct hw_perf_event *hwc = &event->hw;
>> +
>> +    if (event->attr.type != event->pmu->type)
>> +        return -ENOENT;
>> +
>> +    /* sampling not supported */
>> +    if (event->attr.sample_period)
>> +        return -EINVAL;
>> +
>> +    if (event->cpu < 0)
>> +        return -EINVAL;
>> +
>> +    if (iommu_pmu_validate_event(event))
>> +        return -EINVAL;
>> +
>> +    hwc->config = iommu_event_config(event);
>> +
>> +    return iommu_pmu_validate_group(event);
>> +}
>> +
>> +static void iommu_pmu_event_update(struct perf_event *event)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
>> +    struct hw_perf_event *hwc = &event->hw;
>> +    u64 prev_count, new_count, delta;
>> +    int shift = 64 - iommu_pmu->cntr_width;
>> +
>> +again:
>> +    prev_count = local64_read(&hwc->prev_count);
>> +    new_count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
>> +    if (local64_xchg(&hwc->prev_count, new_count) != prev_count)
>> +        goto again;
>> +
>> +    /*
>> +     * The counter width is enumerated.
>> +     * Always shift the counter before using it.
>> +     */
>
> Re-org above comments and make it neat.

Sure.
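
Maybe something like this (just an illustration of the comment plus the
shift trick it describes):

    /*
     * The counter width is enumerated. Always shift the counter left
     * and then right before using it, so that only the low cntr_width
     * bits of the delta are kept. E.g., for a 48-bit counter
     * (shift = 16), prev = 0xffffffffffff and new = 0x5 give
     * ((0x5 << 16) - (0xffffffffffff << 16)) >> 16 = 0x6.
     */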

>
>> +    delta = (new_count << shift) - (prev_count << shift);
>> +    delta >>= shift;
>> +
>> +    local64_add(delta, &event->count);
>> +}
>> +
>> +static void iommu_pmu_start(struct perf_event *event, int flags)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
>> +    struct intel_iommu *iommu = iommu_pmu->iommu;
>> +    struct hw_perf_event *hwc = &event->hw;
>> +    u64 count;
>> +
>> +    if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
>> +        return;
>> +
>> +    if (WARN_ON_ONCE(hwc->idx < 0 || hwc->idx >= IOMMU_PMU_IDX_MAX))
>> +        return;
>> +
>> +    if (flags & PERF_EF_RELOAD)
>> +        WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
>> +
>> +    hwc->state = 0;
>> +
>> +    /* Always reprogram the period */
>> +    count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
>> +    local64_set((&hwc->prev_count), count);
>> +
>> +    ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, false, 0);
>
> What happens when ecmd_submit_sync() returns an error? How should we
> handle this case? The same question applies to other places in this patch.
>

The existing perf_event subsystem doesn't handle the error, because
other PMUs never trigger such errors. Perf usually checks all the PMU
counters once at the beginning, when registering/initializing them.

For the IOMMU PMU, the error will be ignored. I think that should be OK,
because
- It's a corner case, which is very unlikely to happen.
- The worst case is that the user gets <not counted> from the perf
command, which gives the user a hint.

If that's not good enough, I think we can add a WARN_ON_ONCE() here and
at every other ecmd call site to dump the error in dmesg.

What do you think?

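For example, a minimal sketch of that idea, reusing the calls already in
the patch:

    /* e.g. in iommu_pmu_start() */
    WARN_ON_ONCE(ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, false, 0));

    /* and in iommu_pmu_stop() */
    WARN_ON_ONCE(ecmd_submit_sync(iommu, DMA_ECMD_DISABLE, hwc->idx, false, 0));

The failure itself would still only be visible to the user as
<not counted> in the perf output, but dmesg would then carry a hint.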

>> +
>> +    perf_event_update_userpage(event);
>> +}
>> +
>> +static void iommu_pmu_stop(struct perf_event *event, int flags)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
>> +    struct intel_iommu *iommu = iommu_pmu->iommu;
>> +    struct hw_perf_event *hwc = &event->hw;
>> +
>> +    if (!(hwc->state & PERF_HES_STOPPED)) {
>> +        ecmd_submit_sync(iommu, DMA_ECMD_DISABLE, hwc->idx, false, 0);
>> +
>> +        iommu_pmu_event_update(event);
>> +
>> +        hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
>> +    }
>> +}
>> +
>> +static inline int
>> +iommu_pmu_validate_per_cntr_event(struct iommu_pmu *iommu_pmu,
>> +                  int idx, struct perf_event *event)
>> +{
>> +    u32 event_group = iommu_event_group(event->attr.config);
>> +    u32 select = iommu_event_select(event->attr.config);
>> +
>> +    if (!(iommu_pmu->cntr_evcap[idx][event_group] & select))
>> +        return -EINVAL;
>> +
>> +    return 0;
>> +}
>> +
>> +static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu,
>> +                  struct perf_event *event)
>> +{
>> +    struct hw_perf_event *hwc = &event->hw;
>> +    int idx;
>> +
>> +    /*
>> +     * The counters which support limited events are usually at the end.
>> +     * Schedule them first to accommodate more events.
>> +     */
>> +    for (idx = iommu_pmu->num_cntr - 1; idx >= 0; idx--) {
>> +        if (test_and_set_bit(idx, iommu_pmu->used_mask))
>> +            continue;
>> +        /* Check per-counter event capabilities */
>> +        if (!iommu_pmu_validate_per_cntr_event(iommu_pmu, idx, event))
>> +            break;
>> +        clear_bit(idx, iommu_pmu->used_mask);
>> +    }
>> +    if (idx < 0)
>> +        return -EINVAL;
>> +
>> +    iommu_pmu->event_list[idx] = event;
>> +    hwc->idx = idx;
>> +
>> +    /* config events */
>> +    dmar_writeq(iommu_config_base(iommu_pmu, idx), hwc->config);
>> +
>> +    iommu_pmu_set_filter(requester_id, event->attr.config1,
>> +                 IOMMU_PMU_FILTER_REQUESTER_ID, idx);
>> +    iommu_pmu_set_filter(domain, event->attr.config1,
>> +                 IOMMU_PMU_FILTER_DOMAIN, idx);
>> +    iommu_pmu_set_filter(pasid, event->attr.config1,
>> +                 IOMMU_PMU_FILTER_PASID, idx);
>> +    iommu_pmu_set_filter(ats, event->attr.config2,
>> +                 IOMMU_PMU_FILTER_ATS, idx);
>> +    iommu_pmu_set_filter(page_table, event->attr.config2,
>> +                 IOMMU_PMU_FILTER_PAGE_TABLE, idx);
>> +
>> +    return 0;
>> +}
>> +
>> +static int iommu_pmu_add(struct perf_event *event, int flags)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
>> +    struct hw_perf_event *hwc = &event->hw;
>> +    int ret;
>> +
>> +    ret = iommu_pmu_assign_event(iommu_pmu, event);
>> +    if (ret < 0)
>> +        return ret;
>> +
>> +    hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
>> +
>> +    if (flags & PERF_EF_START)
>> +        iommu_pmu_start(event, 0);
>> +
>> +    return 0;
>> +}
>> +
>> +static void iommu_pmu_del(struct perf_event *event, int flags)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
>> +    int idx = event->hw.idx;
>> +
>> +    iommu_pmu_stop(event, PERF_EF_UPDATE);
>> +
>> +    iommu_pmu_clear_filter(IOMMU_PMU_FILTER_REQUESTER_ID, idx);
>> +    iommu_pmu_clear_filter(IOMMU_PMU_FILTER_DOMAIN, idx);
>> +    iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PASID, idx);
>> +    iommu_pmu_clear_filter(IOMMU_PMU_FILTER_ATS, idx);
>> +    iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PAGE_TABLE, idx);
>> +
>> +    iommu_pmu->event_list[idx] = NULL;
>> +    event->hw.idx = -1;
>> +    clear_bit(idx, iommu_pmu->used_mask);
>> +
>> +    perf_event_update_userpage(event);
>> +}
>> +
>> +static void iommu_pmu_enable(struct pmu *pmu)
>> +{
>> +    struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
>> +    struct intel_iommu *iommu = iommu_pmu->iommu;
>> +
>> +    ecmd_submit_sync(iommu, DMA_ECMD_UNFREEZE, 0, false, 0);
>> +}
>> +
>> +static void iommu_pmu_disable(struct pmu *pmu)
>> +{
>> +    struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
>> +    struct intel_iommu *iommu = iommu_pmu->iommu;
>> +
>> +    ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
>> +}
>> +
>> +static int __iommu_pmu_register(struct intel_iommu *iommu)
>> +{
>> +    struct iommu_pmu *iommu_pmu = iommu->pmu;
>> +
>> +    iommu_pmu->pmu.name        = iommu->name;
>> +    iommu_pmu->pmu.task_ctx_nr    = perf_invalid_context;
>> +    iommu_pmu->pmu.event_init    = iommu_pmu_event_init;
>> +    iommu_pmu->pmu.pmu_enable    = iommu_pmu_enable;
>> +    iommu_pmu->pmu.pmu_disable    = iommu_pmu_disable;
>> +    iommu_pmu->pmu.add        = iommu_pmu_add;
>> +    iommu_pmu->pmu.del        = iommu_pmu_del;
>> +    iommu_pmu->pmu.start        = iommu_pmu_start;
>> +    iommu_pmu->pmu.stop        = iommu_pmu_stop;
>> +    iommu_pmu->pmu.read        = iommu_pmu_event_update;
>> +    iommu_pmu->pmu.attr_groups    = iommu_pmu_attr_groups;
>> +    iommu_pmu->pmu.attr_update    = iommu_pmu_attr_update;
>> +    iommu_pmu->pmu.capabilities    = PERF_PMU_CAP_NO_EXCLUDE;
>> +    iommu_pmu->pmu.module        = THIS_MODULE;
>> +
>> +    return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
>> +}
>> +
>>   static inline void __iomem *
>>   get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
>>   {
>> @@ -40,11 +509,21 @@ int alloc_iommu_pmu(struct intel_iommu *iommu)
>>       if (!pcap_interrupt(perfcap))
>>           return -ENODEV;
>>   +    /* Check required Enhanced Command Capability */
>> +    if (!ecmd_has_pmu_essential(iommu))
>> +        return -ENODEV;
>> +
>>       iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
>>       if (!iommu_pmu)
>>           return -ENOMEM;
>>         iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
>> +    if (iommu_pmu->num_cntr > IOMMU_PMU_IDX_MAX) {
>> +        WARN_ONCE(1, "The number of IOMMU counters %d > max(%d), clipping!",
>> +              iommu_pmu->num_cntr, IOMMU_PMU_IDX_MAX);
>
> Do we really need a kernel trace here? This isn't a software flaw;
> perhaps a pr_info() to let users know about this is enough?

It's a software flaw. If the number of counters is > 64, we need to update
the driver to support the extra registers, e.g., the freeze and overflow
registers.

There are only 14 counters on the existing platform, which is far from the
64-counter limit. I would not expect the warning to be triggered any time soon.

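(For context on where the 64 comes from: the per-counter status is read
as a single 64-bit register, one bit per counter. For example, the
overflow handler later in the series does

    u64 status = dmar_readq(iommu_pmu->overflow);

    for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr)
        ...

so going beyond 64 counters would need additional registers and driver
changes.)
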
>
>> +        iommu_pmu->num_cntr = IOMMU_PMU_IDX_MAX;
>> +    }
>> +
>>       iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
>>       iommu_pmu->filter = pcap_filters_mask(perfcap);
>>       iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
>> @@ -157,3 +636,20 @@ void free_iommu_pmu(struct intel_iommu *iommu)
>>       kfree(iommu_pmu);
>>       iommu->pmu = NULL;
>>   }
>> +
>> +void iommu_pmu_register(struct intel_iommu *iommu)
>> +{
>> +    if (!iommu->pmu)
>> +        return;
>> +
>> +    if (__iommu_pmu_register(iommu)) {
>> +        pr_err("Failed to register PMU for iommu (seq_id = %d)\n",
>> +               iommu->seq_id);
>
> What will happen if iommu_pmu_unregister() is called after
> __iommu_pmu_register() has returned an error?


Yes, we should avoid calling iommu_pmu_unregister() in that case.

I will free iommu->pmu via free_iommu_pmu() when the registration fails.
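
Roughly (just a sketch of the planned change, not the final V2 code):

    void iommu_pmu_register(struct intel_iommu *iommu)
    {
        if (!iommu->pmu)
            return;

        if (__iommu_pmu_register(iommu)) {
            pr_err("Failed to register PMU for iommu (seq_id = %d)\n",
                   iommu->seq_id);
            /*
             * free_iommu_pmu() clears iommu->pmu, so a later
             * iommu_pmu_unregister() becomes a no-op.
             */
            free_iommu_pmu(iommu);
        }
    }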

Thanks,
Kan
>
>> +    }
>> +}
>> +
>> +void iommu_pmu_unregister(struct intel_iommu *iommu)
>> +{
>> +    if (iommu->pmu)
>> +        perf_pmu_unregister(&iommu->pmu->pmu);
>> +}
>> diff --git a/drivers/iommu/intel/perfmon.h b/drivers/iommu/intel/perfmon.h
>> index 8587c80501cd..b60f0cad5bfd 100644
>> --- a/drivers/iommu/intel/perfmon.h
>> +++ b/drivers/iommu/intel/perfmon.h
>> @@ -7,6 +7,14 @@
>>   #define IOMMU_PMU_NUM_OFF_REGS            4
>>   #define IOMMU_PMU_OFF_REGS_STEP            4
>>   +#define IOMMU_PMU_FILTER_REQUESTER_ID        0x01
>> +#define IOMMU_PMU_FILTER_DOMAIN            0x02
>> +#define IOMMU_PMU_FILTER_PASID            0x04
>> +#define IOMMU_PMU_FILTER_ATS            0x08
>> +#define IOMMU_PMU_FILTER_PAGE_TABLE        0x10
>> +
>> +#define IOMMU_PMU_FILTER_EN            (1 << 31)
>> +
>>   #define IOMMU_PMU_CFG_OFFSET            0x100
>>   #define IOMMU_PMU_CFG_CNTRCAP_OFFSET        0x80
>>   #define IOMMU_PMU_CFG_CNTREVCAP_OFFSET        0x84
>> @@ -21,12 +29,18 @@
>>   #define iommu_cntrcap_ios(p)            ((p >> 16) & 0x1)
>>   #define iommu_cntrcap_egcnt(p)            ((p >> 28) & 0xf)
>>   +#define IOMMU_EVENT_CFG_EGI_SHIFT        8
>> +#define IOMMU_EVENT_CFG_ES_SHIFT        32
>> +#define IOMMU_EVENT_CFG_INT            (1ULL << 1)
>> +
>>   #define iommu_event_select(p)            ((p) & 0xfffffff)
>>   #define iommu_event_group(p)            ((p >> 28) & 0xf)
>>     #ifdef CONFIG_INTEL_IOMMU_PERF_EVENTS
>>   int alloc_iommu_pmu(struct intel_iommu *iommu);
>>   void free_iommu_pmu(struct intel_iommu *iommu);
>> +void iommu_pmu_register(struct intel_iommu *iommu);
>> +void iommu_pmu_unregister(struct intel_iommu *iommu);
>>   #else
>>   static inline int
>>   alloc_iommu_pmu(struct intel_iommu *iommu)
>> @@ -38,4 +52,14 @@ static inline void
>>   free_iommu_pmu(struct intel_iommu *iommu)
>>   {
>>   }
>> +
>> +static inline void
>> +iommu_pmu_register(struct intel_iommu *iommu)
>> +{
>> +}
>> +
>> +static inline void
>> +iommu_pmu_unregister(struct intel_iommu *iommu)
>> +{
>> +}
>>   #endif /* CONFIG_INTEL_IOMMU_PERF_EVENTS */
>
> --
> Best regards,
> baolu

2023-01-17 08:52:06

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH 4/7] iommu/vt-d: Add IOMMU perfmon support

On 2023/1/16 23:12, Liang, Kan wrote:
>>> +static void iommu_pmu_start(struct perf_event *event, int flags)
>>> +{
>>> +    struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
>>> +    struct intel_iommu *iommu = iommu_pmu->iommu;
>>> +    struct hw_perf_event *hwc = &event->hw;
>>> +    u64 count;
>>> +
>>> +    if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
>>> +        return;
>>> +
>>> +    if (WARN_ON_ONCE(hwc->idx < 0 || hwc->idx >= IOMMU_PMU_IDX_MAX))
>>> +        return;
>>> +
>>> +    if (flags & PERF_EF_RELOAD)
>>> +        WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
>>> +
>>> +    hwc->state = 0;
>>> +
>>> +    /* Always reprogram the period */
>>> +    count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
>>> +    local64_set((&hwc->prev_count), count);
>>> +
>>> +    ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, false, 0);
>> What happens when ecmd_submit_sync() returns an error? How should we
>> handle this case? The same question applies to other places in this patch.
>>
> The existing perf_event subsystem doesn't handle the error, because
> other PMUs never trigger such errors. Perf usually checks all the PMU
> counters once at the beginning, when registering/initializing them.
>
> For the IOMMU PMU, the error will be ignored. I think that should be OK,
> because
> - It's a corner case, which is very unlikely to happen.
> - The worst case is that the user gets <not counted> from the perf
> command, which gives the user a hint.
>
> If that's not good enough, I think we can add a WARN_ON_ONCE() here and
> at every other ecmd call site to dump the error in dmesg.
>
> What do you think?

No need for a WARN() here. If the hardware is stuck, there should be
warnings everywhere.

It's fine with me if you add the above explanation as comments around the code.

--
Best regards,
baolu

2023-01-17 17:37:23

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH 6/7] iommu/vt-d: Add IOMMU perfmon overflow handler support



On 2023-01-16 10:20 a.m., Liang, Kan wrote:
>
>
> On 2023-01-14 6:05 a.m., Baolu Lu wrote:
>> On 2023/1/12 4:25, [email protected] wrote:
>>> From: Kan Liang <[email protected]>
>>>
>>> While enabled to count events and an event occurrence causes the counter
>>> value to increment and roll over to or past zero, this is termed a
>>> counter overflow. The overflow can trigger an interrupt. The IOMMU
>>> perfmon needs to handle the case properly.
>>>
>>> New HW IRQs are allocated for each IOMMU device for perfmon. The IRQ IDs
>>> are after the SVM range.
>>>
>>> In the overflow handler, the counter is not frozen. It's very unlikely
>>> that the same counter overflows again during the period. But it's
>>> possible that other counters overflow at the same time. Read the
>>> overflow register at the end of the handler and check whether there are
>>> more.
>>>
>>> Signed-off-by: Kan Liang <[email protected]>
>>> ---
>>>   drivers/iommu/intel/dmar.c    |  2 +
>>>   drivers/iommu/intel/iommu.h   | 11 ++++-
>>>   drivers/iommu/intel/perfmon.c | 82 +++++++++++++++++++++++++++++++++++
>>>   drivers/iommu/intel/svm.c     |  2 +-
>>>   4 files changed, 95 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
>>> index dcafa32875c1..94e314320cd9 100644
>>> --- a/drivers/iommu/intel/dmar.c
>>> +++ b/drivers/iommu/intel/dmar.c
>>> @@ -1887,6 +1887,8 @@ static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
>>>           return DMAR_FECTL_REG;
>>>       else if (iommu->pr_irq == irq)
>>>           return DMAR_PECTL_REG;
>>> +    else if (iommu->perf_irq == irq)
>>> +        return DMAR_PERFINTRCTL_REG;
>>>       else
>>>           BUG();
>>>   }
>>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>>> index bbc5dda903e9..f616e4f3d765 100644
>>> --- a/drivers/iommu/intel/iommu.h
>>> +++ b/drivers/iommu/intel/iommu.h
>>> @@ -130,6 +130,8 @@
>>>   #define DMAR_PERFCFGOFF_REG    0x310
>>>   #define DMAR_PERFOVFOFF_REG    0x318
>>>   #define DMAR_PERFCNTROFF_REG    0x31c
>>> +#define DMAR_PERFINTRSTS_REG    0x324
>>> +#define DMAR_PERFINTRCTL_REG    0x328
>>>   #define DMAR_PERFEVNTCAP_REG    0x380
>>>   #define DMAR_ECMD_REG        0x400
>>>   #define DMAR_ECEO_REG        0x408
>>> @@ -357,6 +359,9 @@
>>>     #define DMA_VCS_PAS    ((u64)1)
>>>   +/* PERFINTRSTS_REG */
>>> +#define DMA_PERFINTRSTS_PIS    ((u32)1)
>>> +
>>>   #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)            \
>>>   do {                                    \
>>>       cycles_t start_time = get_cycles();                \
>>> @@ -630,8 +635,12 @@ struct iommu_pmu {
>>>       struct pmu        pmu;
>>>       DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
>>>       struct perf_event    *event_list[IOMMU_PMU_IDX_MAX];
>>> +    unsigned char        irq_name[16];
>>>   };
>>>   +#define IOMMU_IRQ_ID_OFFSET_PRQ        (DMAR_UNITS_SUPPORTED)
>>> +#define IOMMU_IRQ_ID_OFFSET_PERF    (2 * DMAR_UNITS_SUPPORTED)
>>> +
>>>   struct intel_iommu {
>>>       void __iomem    *reg; /* Pointer to hardware regs, virtual addr */
>>>       u64         reg_phys; /* physical address of hw register set */
>>> @@ -645,7 +654,7 @@ struct intel_iommu {
>>>       int        seq_id;    /* sequence id of the iommu */
>>>       int        agaw; /* agaw of this iommu */
>>>       int        msagaw; /* max sagaw of this iommu */
>>> -    unsigned int     irq, pr_irq;
>>> +    unsigned int    irq, pr_irq, perf_irq;
>>>       u16        segment;     /* PCI segment# */
>>>       unsigned char     name[13];    /* Device Name */
>>> diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
>>> index f332232bb345..d0fbf784c827 100644
>>> --- a/drivers/iommu/intel/perfmon.c
>>> +++ b/drivers/iommu/intel/perfmon.c
>>> @@ -476,6 +476,49 @@ static void iommu_pmu_disable(struct pmu *pmu)
>>>       ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, false, 0);
>>>   }
>>>   +static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
>>> +{
>>> +    struct perf_event *event;
>>> +    int i, handled = 0;
>>> +    u64 status;
>>> +
>>> +    /*
>>> +     * Two counters may be overflowed very close.
>>> +     * Always check whether there are more to handle.
>>> +     */
>>
>> Keep comment style consistent, like this
>>
>>         /*
>>          * Two counters may be overflowed very close. Always check whether
>>          * there are more to handle.
>>          */
>>
>> Same to other places.
>
> Sure.
>
>>
>>> +    while ((status = dmar_readq(iommu_pmu->overflow))) {
>>
>> Unnecessary double brackets?

I think we need the double brackets to avoid the compiler warning -
"suggest parentheses around assignment used as truth value".

Thanks,
Kan
>>
>>> +        for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) {
>>> +            handled++;
>>
>> "handled" isn't used anywhere. Please cleanup it.
>>
>
> Sure.
>
>>> +
>>> +            /*
>>> +             * Find the assigned event of the counter.
>>> +             * Accumulate the value into the event->count.
>>> +             */
>>> +            event = iommu_pmu->event_list[i];
>>> +            if (WARN_ON_ONCE(!event))
>>
>> Again, do we need a kernel trace here? This is only because the hardware
>> reported a wrong event id, right? How about a pr_warn() to let the user
>> know this?
>
> The hardware-reported ID doesn't match the SW records, and it's hard to say
> which one is correct. But I guess pr_warn_once() may be enough. I will
> change it in V2.
>
> Thanks,
> Kan
>
>>
>>> +                continue;
>>> +            iommu_pmu_event_update(event);
>>> +        }
>>> +
>>> +        dmar_writeq(iommu_pmu->overflow, status);
>>> +    }
>>> +}
>>> +
>>> +static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
>>> +{
>>> +    struct intel_iommu *iommu = dev_id;
>>> +
>>> +    if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
>>> +        return IRQ_NONE;
>>> +
>>> +    iommu_pmu_counter_overflow(iommu->pmu);
>>> +
>>> +    /* Clear the status bit */
>>> +    dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);
>>> +
>>> +    return IRQ_HANDLED;
>>> +}
>>> +
>>>   static int __iommu_pmu_register(struct intel_iommu *iommu)
>>>   {
>>>       struct iommu_pmu *iommu_pmu = iommu->pmu;
>>> @@ -658,6 +701,38 @@ void free_iommu_pmu(struct intel_iommu *iommu)
>>>       iommu->pmu = NULL;
>>>   }
>>>   +static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
>>> +{
>>> +    struct iommu_pmu *iommu_pmu = iommu->pmu;
>>> +    int irq, ret;
>>> +
>>> +    irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu);
>>> +    if (irq <= 0)
>>> +        return -EINVAL;
>>> +
>>> +    snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id);
>>> +
>>> +    iommu->perf_irq = irq;
>>> +    ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
>>> +                   IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
>>> +    if (ret) {
>>> +        dmar_free_hwirq(irq);
>>> +        iommu->perf_irq = 0;
>>> +        return ret;
>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>> +static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
>>> +{
>>> +    if (!iommu->perf_irq)
>>> +        return;
>>> +
>>> +    free_irq(iommu->perf_irq, iommu);
>>> +    dmar_free_hwirq(iommu->perf_irq);
>>> +    iommu->perf_irq = 0;
>>> +}
>>> +
>>>   static int iommu_pmu_cpu_online(unsigned int cpu)
>>>   {
>>>       if (cpumask_empty(&iommu_pmu_cpu_mask))
>>> @@ -734,8 +809,14 @@ void iommu_pmu_register(struct intel_iommu *iommu)
>>>       if (iommu_pmu_cpuhp_setup(iommu_pmu))
>>>           goto unregister;
>>>   +    /* Set interrupt for overflow */
>>> +    if (iommu_pmu_set_interrupt(iommu))
>>> +        goto cpuhp_free;
>>> +
>>>       return;
>>>   +cpuhp_free:
>>> +    iommu_pmu_cpuhp_free(iommu_pmu);
>>>   unregister:
>>>       perf_pmu_unregister(&iommu_pmu->pmu);
>>>   err:
>>> @@ -749,6 +830,7 @@ void iommu_pmu_unregister(struct intel_iommu *iommu)
>>>       if (!iommu_pmu)
>>>           return;
>>>   +    iommu_pmu_unset_interrupt(iommu);
>>>       iommu_pmu_cpuhp_free(iommu_pmu);
>>>       perf_pmu_unregister(&iommu_pmu->pmu);
>>>   }
>>> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
>>> index c76b66263467..b6c5edd80d5d 100644
>>> --- a/drivers/iommu/intel/svm.c
>>> +++ b/drivers/iommu/intel/svm.c
>>> @@ -79,7 +79,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
>>>       }
>>>       iommu->prq = page_address(pages);
>>> -    irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
>>> +    irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu);
>>>       if (irq <= 0) {
>>>           pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
>>>                  iommu->name);
>>
>> --
>> Best regards,
>> baolu