This series adds support for the RISC-V IOMMU hardware performance monitor
and for nested IOMMU. It also introduces operations for the g-stage
table, which are required for nested IOMMU.
This patch set is implemented on top of the RISC-V IOMMU v4 series [1],
and it will be submitted as an RFC until the RISC-V IOMMU has been
merged. Additionally, it will be updated as needed when a new version
of the RISC-V IOMMU series is posted.
[1] link: https://lists.infradead.org/pipermail/linux-riscv/2024-May/053708.html
Zong Li (6):
iommu/riscv: Add RISC-V IOMMU PMU support
iommu/riscv: Support HPM and interrupt handling
iommu/riscv: support GSCID
iommu/riscv: support nested iommu for getting iommu hardware
information
iommu/riscv: support nested iommu for creating domains owned by
userspace
iommu/riscv: support nested iommu for flushing cache
drivers/iommu/riscv/Makefile | 4 +-
drivers/iommu/riscv/iommu-bits.h | 22 ++
drivers/iommu/riscv/iommu-pmu.c | 477 ++++++++++++++++++++++++++++++
drivers/iommu/riscv/iommu.c | 481 +++++++++++++++++++++++++++++--
drivers/iommu/riscv/iommu.h | 8 +
include/uapi/linux/iommufd.h | 39 +++
6 files changed, 1002 insertions(+), 29 deletions(-)
create mode 100644 drivers/iommu/riscv/iommu-pmu.c
--
2.17.1
This patch implements the RISC-V IOMMU hardware performance monitor; it
supports both counting and sampling modes.
The specification does not define an event ID for counting clock cycles
(iohpmcycles has no associated iohpmevt register), but perf still needs a
cycle-counting event, so reserve the maximum event ID value for it for now.
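For illustration only (not part of the change itself), the reserved cycle
event could be requested from user space roughly as sketched below. The PMU
type value is a placeholder for the id read from
/sys/bus/event_source/devices/<pmu>/type, and the config encoding follows the
format attributes added by this patch (the event field occupies config[14:0]):

  #include <linux/perf_event.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          struct perf_event_attr attr = {
                  .size     = sizeof(attr),
                  .type     = 42,         /* placeholder: PMU type id from sysfs */
                  .config   = 0x7fff,     /* config[14:0]: reserved cycle event ID */
                  .disabled = 1,
          };
          uint64_t count = 0;
          /* system-wide PMU: pid = -1, cpu = 0 */
          int fd = syscall(SYS_perf_event_open, &attr, -1, 0, -1, 0);

          if (fd < 0)
                  return 1;
          ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
          sleep(1);                       /* let some DMA traffic run */
          ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
          read(fd, &count, sizeof(count));
          printf("iommu cycles: %llu\n", (unsigned long long)count);
          return 0;
  }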
Signed-off-by: Zong Li <[email protected]>
---
drivers/iommu/riscv/Makefile | 4 +-
drivers/iommu/riscv/iommu-bits.h | 15 +
drivers/iommu/riscv/iommu-pmu.c | 477 +++++++++++++++++++++++++++++++
drivers/iommu/riscv/iommu.h | 8 +
4 files changed, 502 insertions(+), 2 deletions(-)
create mode 100644 drivers/iommu/riscv/iommu-pmu.c
diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
index f54c9ed17d41..1b02e07d83c9 100644
--- a/drivers/iommu/riscv/Makefile
+++ b/drivers/iommu/riscv/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o
-obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
+obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o iommu-pmu.o
+obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o iommu-pmu.o
diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
index 40c379222821..11351cf6c710 100644
--- a/drivers/iommu/riscv/iommu-bits.h
+++ b/drivers/iommu/riscv/iommu-bits.h
@@ -17,6 +17,7 @@
#include <linux/types.h>
#include <linux/bitfield.h>
#include <linux/bits.h>
+#include <linux/perf_event.h>
/*
* Chapter 5: Memory Mapped register interface
@@ -210,6 +211,7 @@ enum riscv_iommu_ddtp_modes {
/* 5.22 Performance monitoring event counters (31 * 64bits) */
#define RISCV_IOMMU_REG_IOHPMCTR_BASE 0x0068
#define RISCV_IOMMU_REG_IOHPMCTR(_n) (RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
+#define RISCV_IOMMU_IOHPMCTR_COUNTER GENMASK_ULL(63, 0)
/* 5.23 Performance monitoring event selectors (31 * 64bits) */
#define RISCV_IOMMU_REG_IOHPMEVT_BASE 0x0160
@@ -251,6 +253,19 @@ enum riscv_iommu_hpmevent_id {
RISCV_IOMMU_HPMEVENT_MAX = 9
};
+/* Use maximum event ID for cycle event */
+#define RISCV_IOMMU_HPMEVENT_CYCLE GENMASK_ULL(14, 0)
+
+#define RISCV_IOMMU_HPM_COUNTER_NUM 32
+
+struct riscv_iommu_pmu {
+ struct pmu pmu;
+ void __iomem *reg;
+ int num_counters;
+ struct perf_event *events[RISCV_IOMMU_HPM_COUNTER_NUM];
+ DECLARE_BITMAP(used_counters, RISCV_IOMMU_HPM_COUNTER_NUM);
+};
+
/* 5.24 Translation request IOVA (64bits) */
#define RISCV_IOMMU_REG_TR_REQ_IOVA 0x0258
#define RISCV_IOMMU_TR_REQ_IOVA_VPN GENMASK_ULL(63, 12)
diff --git a/drivers/iommu/riscv/iommu-pmu.c b/drivers/iommu/riscv/iommu-pmu.c
new file mode 100644
index 000000000000..6ab50763860f
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-pmu.c
@@ -0,0 +1,477 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 SiFive
+ *
+ * Authors
+ * Zong Li <[email protected]>
+ */
+
+#include <linux/io-64-nonatomic-hi-lo.h>
+
+#include "iommu.h"
+#include "iommu-bits.h"
+
+#define to_riscv_iommu_pmu(p) (container_of(p, struct riscv_iommu_pmu, pmu))
+
+#define RISCV_IOMMU_PMU_ATTR_EXTRACTOR(_name, _mask) \
+ static inline u32 get_##_name(struct perf_event *event) \
+ { \
+ return FIELD_GET(_mask, event->attr.config); \
+ } \
+
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(event, RISCV_IOMMU_IOHPMEVT_EVENT_ID);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(partial_matching, RISCV_IOMMU_IOHPMEVT_DMASK);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(pid_pscid, RISCV_IOMMU_IOHPMEVT_PID_PSCID);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(did_gscid, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_pid_pscid, RISCV_IOMMU_IOHPMEVT_PV_PSCV);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_did_gscid, RISCV_IOMMU_IOHPMEVT_DV_GSCV);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_id_type, RISCV_IOMMU_IOHPMEVT_IDT);
+
+/* Formats */
+PMU_FORMAT_ATTR(event, "config:0-14");
+PMU_FORMAT_ATTR(partial_matching, "config:15");
+PMU_FORMAT_ATTR(pid_pscid, "config:16-35");
+PMU_FORMAT_ATTR(did_gscid, "config:36-59");
+PMU_FORMAT_ATTR(filter_pid_pscid, "config:60");
+PMU_FORMAT_ATTR(filter_did_gscid, "config:61");
+PMU_FORMAT_ATTR(filter_id_type, "config:62");
+
+static struct attribute *riscv_iommu_pmu_formats[] = {
+ &format_attr_event.attr,
+ &format_attr_partial_matching.attr,
+ &format_attr_pid_pscid.attr,
+ &format_attr_did_gscid.attr,
+ &format_attr_filter_pid_pscid.attr,
+ &format_attr_filter_did_gscid.attr,
+ &format_attr_filter_id_type.attr,
+ NULL,
+};
+
+static const struct attribute_group riscv_iommu_pmu_format_group = {
+ .name = "format",
+ .attrs = riscv_iommu_pmu_formats,
+};
+
+/* Events */
+static ssize_t riscv_iommu_pmu_event_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+ return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+PMU_EVENT_ATTR(cycle, event_attr_cycle,
+ RISCV_IOMMU_HPMEVENT_CYCLE, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(dont_count, event_attr_dont_count,
+ RISCV_IOMMU_HPMEVENT_INVALID, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(untranslated_req, event_attr_untranslated_req,
+ RISCV_IOMMU_HPMEVENT_URQ, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(translated_req, event_attr_translated_req,
+ RISCV_IOMMU_HPMEVENT_TRQ, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(ats_trans_req, event_attr_ats_trans_req,
+ RISCV_IOMMU_HPMEVENT_ATS_RQ, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(tlb_miss, event_attr_tlb_miss,
+ RISCV_IOMMU_HPMEVENT_TLB_MISS, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(ddt_walks, event_attr_ddt_walks,
+ RISCV_IOMMU_HPMEVENT_DD_WALK, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(pdt_walks, event_attr_pdt_walks,
+ RISCV_IOMMU_HPMEVENT_PD_WALK, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(s_vs_pt_walks, event_attr_s_vs_pt_walks,
+ RISCV_IOMMU_HPMEVENT_S_VS_WALKS, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(g_pt_walks, event_attr_g_pt_walks,
+ RISCV_IOMMU_HPMEVENT_G_WALKS, riscv_iommu_pmu_event_show);
+
+static struct attribute *riscv_iommu_pmu_events[] = {
+ &event_attr_cycle.attr.attr,
+ &event_attr_dont_count.attr.attr,
+ &event_attr_untranslated_req.attr.attr,
+ &event_attr_translated_req.attr.attr,
+ &event_attr_ats_trans_req.attr.attr,
+ &event_attr_tlb_miss.attr.attr,
+ &event_attr_ddt_walks.attr.attr,
+ &event_attr_pdt_walks.attr.attr,
+ &event_attr_s_vs_pt_walks.attr.attr,
+ &event_attr_g_pt_walks.attr.attr,
+ NULL,
+};
+
+static const struct attribute_group riscv_iommu_pmu_events_group = {
+ .name = "events",
+ .attrs = riscv_iommu_pmu_events,
+};
+
+static const struct attribute_group *riscv_iommu_pmu_attr_grps[] = {
+ &riscv_iommu_pmu_format_group,
+ &riscv_iommu_pmu_events_group,
+ NULL,
+};
+
+/* PMU Operations */
+static void riscv_iommu_pmu_set_counter(struct riscv_iommu_pmu *pmu, u32 idx,
+ u64 value)
+{
+ void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
+
+ if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
+ return;
+
+ writeq(FIELD_PREP(RISCV_IOMMU_IOHPMCTR_COUNTER, value), addr + idx * 8);
+}
+
+static u64 riscv_iommu_pmu_get_counter(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+ void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
+ u64 value;
+
+ if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
+ return -EINVAL;
+
+ value = readq(addr + idx * 8);
+
+ return FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, value);
+}
+
+static u64 riscv_iommu_pmu_get_event(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+ void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
+
+ if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
+ return 0;
+
+ /* There is no associated IOHPMEVT0 for IOHPMCYCLES */
+ if (idx == 0)
+ return 0;
+
+ return readq(addr + (idx - 1) * 8);
+}
+
+static void riscv_iommu_pmu_set_event(struct riscv_iommu_pmu *pmu, u32 idx,
+ u64 value)
+{
+ void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
+
+ if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
+ return;
+
+ /* There is no associated IOHPMEVT0 for IOHPMCYCLES */
+ if (idx == 0)
+ return;
+
+ writeq(value, addr + (idx - 1) * 8);
+}
+
+static void riscv_iommu_pmu_enable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+ void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
+ u32 value = readl(addr);
+
+ writel(value & ~BIT(idx), addr);
+}
+
+static void riscv_iommu_pmu_disable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+ void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
+ u32 value = readl(addr);
+
+ writel(value | BIT(idx), addr);
+}
+
+static void riscv_iommu_pmu_enable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+ u64 value;
+
+ if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
+ value = riscv_iommu_pmu_get_counter(pmu, idx) & ~RISCV_IOMMU_IOHPMCYCLES_OVF;
+ writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);
+ } else {
+ value = riscv_iommu_pmu_get_event(pmu, idx) & ~RISCV_IOMMU_IOHPMEVT_OF;
+ writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
+ }
+}
+
+static void riscv_iommu_pmu_disable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+ u64 value;
+
+ if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
+ value = riscv_iommu_pmu_get_counter(pmu, idx) | RISCV_IOMMU_IOHPMCYCLES_OVF;
+ writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);
+ } else {
+ value = riscv_iommu_pmu_get_event(pmu, idx) | RISCV_IOMMU_IOHPMEVT_OF;
+ writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
+ }
+}
+
+static void riscv_iommu_pmu_start_all(struct riscv_iommu_pmu *pmu)
+{
+ int idx;
+
+ for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
+ riscv_iommu_pmu_enable_ovf_intr(pmu, idx);
+ riscv_iommu_pmu_enable_counter(pmu, idx);
+ }
+}
+
+static void riscv_iommu_pmu_stop_all(struct riscv_iommu_pmu *pmu)
+{
+ writel(GENMASK_ULL(pmu->num_counters - 1, 0),
+ pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH);
+}
+
+/* PMU APIs */
+static int riscv_iommu_pmu_set_period(struct perf_event *event)
+{
+ struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ u64 max_period = RISCV_IOMMU_IOHPMCTR_COUNTER;
+ int ret = 0;
+
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ /*
+ * Limit the maximum period to prevent the counter value
+ * from overtaking the one we are about to program. In
+ * effect we are reducing max_period to account for
+ * interrupt latency (and we are being very conservative).
+ */
+ if (left > (max_period >> 1))
+ left = (max_period >> 1);
+
+ local64_set(&hwc->prev_count, (u64)-left);
+ riscv_iommu_pmu_set_counter(pmu, hwc->idx, (u64)(-left) & max_period);
+ perf_event_update_userpage(event);
+
+ return ret;
+}
+
+static int riscv_iommu_pmu_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->idx = -1;
+ hwc->config = event->attr.config;
+
+ if (!is_sampling_event(event)) {
+ /*
+ * For non-sampling runs, limit the sample_period to half
+ * of the counter width. That way, the new counter value
+ * is far less likely to overtake the previous one unless
+ * you have some serious IRQ latency issues.
+ */
+ hwc->sample_period = RISCV_IOMMU_IOHPMCTR_COUNTER >> 1;
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ }
+
+ return 0;
+}
+
+static void riscv_iommu_pmu_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+ u64 delta, prev, now;
+ u32 idx = hwc->idx;
+
+ do {
+ prev = local64_read(&hwc->prev_count);
+ now = riscv_iommu_pmu_get_counter(pmu, idx);
+ } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+ delta = FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, now - prev);
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+}
+
+static void riscv_iommu_pmu_start(struct perf_event *event, int flags)
+{
+ struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ if (flags & PERF_EF_RELOAD)
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+ hwc->state = 0;
+ riscv_iommu_pmu_set_period(event);
+ riscv_iommu_pmu_set_event(pmu, hwc->idx, hwc->config);
+ riscv_iommu_pmu_enable_ovf_intr(pmu, hwc->idx);
+ riscv_iommu_pmu_enable_counter(pmu, hwc->idx);
+
+ perf_event_update_userpage(event);
+}
+
+static void riscv_iommu_pmu_stop(struct perf_event *event, int flags)
+{
+ struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->state & PERF_HES_STOPPED)
+ return;
+
+ riscv_iommu_pmu_set_event(pmu, hwc->idx, RISCV_IOMMU_HPMEVENT_INVALID);
+ riscv_iommu_pmu_disable_counter(pmu, hwc->idx);
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE))
+ riscv_iommu_pmu_update(event);
+
+ hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+static int riscv_iommu_pmu_add(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+ unsigned int num_counters = pmu->num_counters;
+ int idx;
+
+ /* Reserve index zero for iohpmcycles */
+ if (get_event(event) == RISCV_IOMMU_HPMEVENT_CYCLE)
+ idx = 0;
+ else
+ idx = find_next_zero_bit(pmu->used_counters, num_counters, 1);
+
+ if (idx == num_counters)
+ return -EAGAIN;
+
+ set_bit(idx, pmu->used_counters);
+
+ pmu->events[idx] = event;
+ hwc->idx = idx;
+ hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+ if (flags & PERF_EF_START)
+ riscv_iommu_pmu_start(event, flags);
+
+ /* Propagate changes to the userspace mapping. */
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void riscv_iommu_pmu_read(struct perf_event *event)
+{
+ riscv_iommu_pmu_update(event);
+}
+
+static void riscv_iommu_pmu_del(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+ int idx = hwc->idx;
+
+ riscv_iommu_pmu_stop(event, PERF_EF_UPDATE);
+ pmu->events[idx] = NULL;
+ clear_bit(idx, pmu->used_counters);
+ perf_event_update_userpage(event);
+}
+
+irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu)
+{
+ struct perf_sample_data data;
+ struct pt_regs *regs;
+ u32 ovf = readl(pmu->reg + RISCV_IOMMU_REG_IOCOUNTOVF);
+ int idx;
+
+ if (!ovf)
+ return IRQ_NONE;
+
+ riscv_iommu_pmu_stop_all(pmu);
+
+ regs = get_irq_regs();
+
+ for_each_set_bit(idx, (unsigned long *)&ovf, pmu->num_counters) {
+ struct perf_event *event = pmu->events[idx];
+ struct hw_perf_event *hwc;
+
+ if (WARN_ON_ONCE(!event) || !is_sampling_event(event))
+ continue;
+
+ hwc = &event->hw;
+
+ riscv_iommu_pmu_update(event);
+ perf_sample_data_init(&data, 0, hwc->last_period);
+ if (!riscv_iommu_pmu_set_period(event))
+ continue;
+
+ if (perf_event_overflow(event, &data, regs))
+ riscv_iommu_pmu_stop(event, 0);
+ }
+
+ riscv_iommu_pmu_start_all(pmu);
+
+ return IRQ_HANDLED;
+}
+
+int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
+ const char *dev_name)
+{
+ char *name;
+ int ret;
+
+ pmu->reg = reg;
+ pmu->num_counters = RISCV_IOMMU_HPM_COUNTER_NUM;
+
+ pmu->pmu = (struct pmu) {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = riscv_iommu_pmu_event_init,
+ .add = riscv_iommu_pmu_add,
+ .del = riscv_iommu_pmu_del,
+ .start = riscv_iommu_pmu_start,
+ .stop = riscv_iommu_pmu_stop,
+ .read = riscv_iommu_pmu_read,
+ .attr_groups = riscv_iommu_pmu_attr_grps,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .module = THIS_MODULE,
+ };
+
+ name = kasprintf(GFP_KERNEL, "riscv-iommu-pmu@%s", dev_name);
+
+ ret = perf_pmu_register(&pmu->pmu, name, -1);
+ if (ret) {
+ pr_err("Failed to register riscv-iommu-pmu@%s: %d\n",
+ dev_name, ret);
+ return ret;
+ }
+
+ /* Stop all counters and later start the counter with perf */
+ riscv_iommu_pmu_stop_all(pmu);
+
+ pr_info("riscv-iommu-pmu@%s: Registered with %d counters\n",
+ dev_name, pmu->num_counters);
+
+ return 0;
+}
+
+void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu)
+{
+ int idx;
+
+ /* Disable interrupt and functions */
+ for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
+ riscv_iommu_pmu_disable_counter(pmu, idx);
+ riscv_iommu_pmu_disable_ovf_intr(pmu, idx);
+ }
+
+ perf_pmu_unregister(&pmu->pmu);
+}
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index 03e0c45bc7e1..ff66822c1114 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -60,11 +60,19 @@ struct riscv_iommu_device {
unsigned int ddt_mode;
dma_addr_t ddt_phys;
u64 *ddt_root;
+
+ /* hardware performance monitor */
+ struct riscv_iommu_pmu pmu;
};
int riscv_iommu_init(struct riscv_iommu_device *iommu);
void riscv_iommu_remove(struct riscv_iommu_device *iommu);
+int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
+ const char *name);
+void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu);
+irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu);
+
#define riscv_iommu_readl(iommu, addr) \
readl_relaxed((iommu)->reg + (addr))
--
2.17.1
This patch implements the .hw_info operation and the related data
structures for reporting the IOMMU hardware capabilities to iommufd.
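For illustration only, a rough sketch of how a consumer (e.g. a VMM) might
interpret the reported data. The capability bit masks below are placeholders
standing in for the RISCV_IOMMU_CAP_* definitions in iommu-bits.h (i.e. the
capabilities register layout in the spec); only the two uAPI fields added by
this patch are used:

  #include <linux/iommufd.h>
  #include <stdbool.h>

  /* placeholder masks; the real values come from the capabilities register */
  #define CAP_S_SV39      (1ULL << 9)
  #define CAP_G_SV39X4    (1ULL << 17)

  /*
   * Decide whether Sv39 first-stage over Sv39x4 G-stage nesting can be
   * offered to a guest, based on what .hw_info reported.
   */
  static bool can_offer_sv39_nesting(const struct iommu_hw_info_riscv_iommu *info)
  {
          return (info->capability & CAP_S_SV39) &&
                 (info->capability & CAP_G_SV39X4);
  }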
Signed-off-by: Zong Li <[email protected]>
---
drivers/iommu/riscv/iommu.c | 23 +++++++++++++++++++++++
include/uapi/linux/iommufd.h | 13 +++++++++++++
2 files changed, 36 insertions(+)
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index d38e09b138b6..072251f6ad85 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -19,6 +19,7 @@
#include <linux/iopoll.h>
#include <linux/kernel.h>
#include <linux/pci.h>
+#include <uapi/linux/iommufd.h>
#include "../iommu-pages.h"
#include "iommu-bits.h"
@@ -1485,6 +1486,27 @@ static struct iommu_domain riscv_iommu_identity_domain = {
}
};
+static void *riscv_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct iommu_hw_info_riscv_iommu *info;
+
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+
+ info->capability = iommu->caps;
+ info->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
+
+ *length = sizeof(*info);
+ *type = IOMMU_HW_INFO_TYPE_RISCV_IOMMU;
+
+ return info;
+}
+
static int riscv_iommu_device_domain_type(struct device *dev)
{
return 0;
@@ -1560,6 +1582,7 @@ static void riscv_iommu_release_device(struct device *dev)
static const struct iommu_ops riscv_iommu_ops = {
.pgsize_bitmap = SZ_4K,
.of_xlate = riscv_iommu_of_xlate,
+ .hw_info = riscv_iommu_hw_info,
.identity_domain = &riscv_iommu_identity_domain,
.blocked_domain = &riscv_iommu_blocking_domain,
.release_domain = &riscv_iommu_blocking_domain,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 1dfeaa2e649e..ec9aafd7d373 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -475,15 +475,28 @@ struct iommu_hw_info_vtd {
__aligned_u64 ecap_reg;
};
+/**
+ * struct iommu_hw_info_riscv_iommu - RISCV IOMMU hardware information
+ *
+ * @capability: Value of RISC-V IOMMU capability register
+ * @fctl: Value of RISC-V IOMMU feature control register
+ */
+struct iommu_hw_info_riscv_iommu {
+ __aligned_u64 capability;
+ __u32 fctl;
+};
+
/**
* enum iommu_hw_info_type - IOMMU Hardware Info Types
* @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware
* info
* @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type
+ * @IOMMU_HW_INFO_TYPE_RISCV_IOMMU: RISC-V iommu info type
*/
enum iommu_hw_info_type {
IOMMU_HW_INFO_TYPE_NONE,
IOMMU_HW_INFO_TYPE_INTEL_VTD,
+ IOMMU_HW_INFO_TYPE_RISCV_IOMMU,
};
/**
--
2.17.1
This patch implements the cache_invalidate_user operation, which lets
userspace flush the hardware caches for a nested domain through iommufd.
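For illustration only, a rough VMM-side sketch of forwarding one guest
IOTINVAL command through the new per-entry format. The generic
IOMMU_HWPT_INVALIDATE ioctl layout is assumed from
include/uapi/linux/iommufd.h, and the guest command dwords are passed through
unmodified, since the host rewrites the PSCID/GSCID fields before issuing
them to hardware:

  #include <linux/iommufd.h>
  #include <sys/ioctl.h>
  #include <stdint.h>
  #include <string.h>

  static int forward_guest_iotinval(int iommufd, uint32_t hwpt_id,
                                    const uint64_t guest_cmd[2])
  {
          struct iommu_hwpt_riscv_iommu_invalidate ent;
          struct iommu_hwpt_invalidate cmd = {
                  .size = sizeof(cmd),
                  .hwpt_id = hwpt_id,             /* the nested (stage-1) hwpt */
                  .data_uptr = (uintptr_t)&ent,
                  .data_type = IOMMU_HWPT_DATA_RISCV_IOMMU,
                  .entry_len = sizeof(ent),
                  .entry_num = 1,
          };

          memcpy(ent.cmd, guest_cmd, sizeof(ent.cmd));
          return ioctl(iommufd, IOMMU_HWPT_INVALIDATE, &cmd);
  }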
Signed-off-by: Zong Li <[email protected]>
---
drivers/iommu/riscv/iommu.c | 91 ++++++++++++++++++++++++++++++++++++
include/uapi/linux/iommufd.h | 9 ++++
2 files changed, 100 insertions(+)
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 7eda850df475..4dd58fe2242d 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -1522,9 +1522,100 @@ static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
kfree(riscv_domain);
}
+static int riscv_iommu_fix_user_cmd(struct riscv_iommu_command *cmd,
+ unsigned int pscid, unsigned int gscid)
+{
+ u32 opcode = FIELD_GET(RISCV_IOMMU_CMD_OPCODE, cmd->dword0);
+
+ switch (opcode) {
+ case RISCV_IOMMU_CMD_IOTINVAL_OPCODE:
+ u32 func = FIELD_GET(RISCV_IOMMU_CMD_FUNC, cmd->dword0);
+
+ if (func != RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA &&
+ func != RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA) {
+ pr_warn("The IOTINVAL function: 0x%x is not supported\n",
+ func);
+ return -EOPNOTSUPP;
+ }
+
+ if (func == RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA) {
+ cmd->dword0 &= ~RISCV_IOMMU_CMD_FUNC;
+ cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
+ RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
+ }
+
+ cmd->dword0 &= ~(RISCV_IOMMU_CMD_IOTINVAL_PSCID |
+ RISCV_IOMMU_CMD_IOTINVAL_GSCID);
+ riscv_iommu_cmd_inval_set_pscid(cmd, pscid);
+ riscv_iommu_cmd_inval_set_gscid(cmd, gscid);
+ break;
+ case RISCV_IOMMU_CMD_IODIR_OPCODE:
+ /*
+ * Ensure the device ID is right. We expect that VMM has
+ * transferred the device ID to host's from guest's.
+ */
+ break;
+ default:
+ pr_warn("The user command: 0x%x is not supported\n", opcode);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int riscv_iommu_cache_invalidate_user(struct iommu_domain *domain,
+ struct iommu_user_data_array *array)
+{
+ struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
+ struct riscv_iommu_device *iommu;
+ struct riscv_iommu_bond *bond;
+ struct riscv_iommu_command cmd;
+ struct iommu_hwpt_riscv_iommu_invalidate inv_info;
+ int ret, index;
+
+ if (!riscv_domain)
+ return -EINVAL;
+
+ /* Assume attached devices in the domain go through the same IOMMU device */
+ spin_lock(&riscv_domain->lock);
+ list_for_each_entry_rcu(bond, &riscv_domain->bonds, list) {
+ if (bond->dev) {
+ iommu = dev_to_iommu(bond->dev);
+ break;
+ }
+ }
+ spin_unlock(&riscv_domain->lock);
+
+ if (!iommu)
+ return -EINVAL;
+
+ for (index = 0; index < array->entry_num; index++) {
+ ret = iommu_copy_struct_from_user_array(&inv_info, array,
+ IOMMU_HWPT_DATA_RISCV_IOMMU,
+ index, cmd);
+ if (ret)
+ break;
+
+ ret = riscv_iommu_fix_user_cmd((struct riscv_iommu_command *)inv_info.cmd,
+ riscv_domain->pscid,
+ riscv_domain->s2->gscid);
+ if (ret == -EOPNOTSUPP)
+ continue;
+
+ riscv_iommu_cmd_send(iommu, (struct riscv_iommu_command *)inv_info.cmd, 0);
+ riscv_iommu_cmd_iofence(&cmd);
+ riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
+ }
+
+ array->entry_num = index;
+
+ return ret;
+}
+
static const struct iommu_domain_ops riscv_iommu_nested_domain_ops = {
.attach_dev = riscv_iommu_attach_dev_nested,
.free = riscv_iommu_domain_free_nested,
+ .cache_invalidate_user = riscv_iommu_cache_invalidate_user,
};
static int
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index e10b6e236647..d93a8f11813d 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -689,6 +689,15 @@ struct iommu_hwpt_vtd_s1_invalidate {
__u32 __reserved;
};
+/**
+ * struct iommu_hwpt_riscv_iommu_invalidate - RISCV IOMMU cache invalidation
+ * (IOMMU_HWPT_TYPE_RISCV_IOMMU)
+ * @cmd: An array holds a command for cache invalidation
+ */
+struct iommu_hwpt_riscv_iommu_invalidate {
+ __aligned_u64 cmd[2];
+};
+
/**
* struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
* @size: sizeof(struct iommu_hwpt_invalidate)
--
2.17.1
This patch implements the .domain_alloc_user operation for creating domains
owned by userspace, e.g. through IOMMUFD. Add an s2 domain as the parent
domain for the second stage; an s1 domain will then serve as the first stage.
Don't remove the IOMMU private data of the device in the blocked domain,
because it holds the user data of the device, which is used when attaching
the device to an s1 domain.
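For illustration only, a rough VMM-side sketch of the two-step allocation
this patch enables: a stage-2 nest-parent HWPT on top of an IOAS, then a
nested stage-1 HWPT that hands the guest's device context to the host through
struct iommu_hwpt_riscv_iommu. The generic IOMMU_HWPT_ALLOC layout is assumed
from include/uapi/linux/iommufd.h and error handling is omitted:

  #include <linux/iommufd.h>
  #include <sys/ioctl.h>
  #include <stdint.h>

  static int alloc_nested_hwpts(int iommufd, uint32_t dev_id, uint32_t ioas_id,
                                struct iommu_hwpt_riscv_iommu *s1_data,
                                uint32_t *out_s2, uint32_t *out_s1)
  {
          struct iommu_hwpt_alloc s2 = {
                  .size = sizeof(s2),
                  .flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
                  .dev_id = dev_id,
                  .pt_id = ioas_id,       /* stage-2: guest-physical -> host-physical */
          };
          struct iommu_hwpt_alloc s1 = {
                  .size = sizeof(s1),
                  .dev_id = dev_id,
                  .data_type = IOMMU_HWPT_DATA_RISCV_IOMMU,
                  .data_len = sizeof(*s1_data),
                  .data_uptr = (uintptr_t)s1_data,
          };

          if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &s2))
                  return -1;
          *out_s2 = s2.out_hwpt_id;

          s1.pt_id = s2.out_hwpt_id;      /* parent is the stage-2 HWPT */
          if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &s1))
                  return -1;
          *out_s1 = s1.out_hwpt_id;
          return 0;
  }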
Signed-off-by: Zong Li <[email protected]>
---
drivers/iommu/riscv/iommu.c | 227 ++++++++++++++++++++++++++++++++++-
include/uapi/linux/iommufd.h | 17 +++
2 files changed, 243 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 072251f6ad85..7eda850df475 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -827,6 +827,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
/* This struct contains protection domain specific IOMMU driver data. */
struct riscv_iommu_domain {
+ struct riscv_iommu_domain *s2;
struct iommu_domain domain;
struct list_head bonds;
spinlock_t lock; /* protect bonds list updates. */
@@ -844,6 +845,7 @@ struct riscv_iommu_domain {
/* Private IOMMU data for managed devices, dev_iommu_priv_* */
struct riscv_iommu_info {
struct riscv_iommu_domain *domain;
+ struct riscv_iommu_dc dc_user;
};
/* Linkage between an iommu_domain and attached devices. */
@@ -1454,7 +1456,6 @@ static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0, 0);
riscv_iommu_bond_unlink(info->domain, dev);
- info->domain = NULL;
return 0;
}
@@ -1486,6 +1487,229 @@ static struct iommu_domain riscv_iommu_identity_domain = {
}
};
+/**
+ * Nested IOMMU operations
+ */
+
+static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev)
+{
+ struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+ if (riscv_domain->numa_node == NUMA_NO_NODE)
+ riscv_domain->numa_node = dev_to_node(iommu->dev);
+
+ riscv_iommu_bond_unlink(info->domain, dev);
+
+ if (riscv_iommu_bond_link(riscv_domain, dev))
+ return -ENOMEM;
+
+ riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX);
+
+ riscv_iommu_iodir_update(iommu, dev, info->dc_user.fsc, info->dc_user.ta,
+ info->dc_user.iohgatp);
+
+ info->domain = riscv_domain;
+
+ return 0;
+}
+
+static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
+{
+ struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
+
+ kfree(riscv_domain);
+}
+
+static const struct iommu_domain_ops riscv_iommu_nested_domain_ops = {
+ .attach_dev = riscv_iommu_attach_dev_nested,
+ .free = riscv_iommu_domain_free_nested,
+};
+
+static int
+riscv_iommu_get_dc_user(struct device *dev, struct iommu_hwpt_riscv_iommu *user_arg)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+ struct riscv_iommu_dc dc;
+ struct riscv_iommu_fq_record event;
+ u64 dc_len = sizeof(struct riscv_iommu_dc) >> (!(iommu->caps & RISCV_IOMMU_CAP_MSI_FLAT));
+ u64 event_len = sizeof(struct riscv_iommu_fq_record);
+ void __user *event_user = NULL;
+
+ for (int i = 0; i < fwspec->num_ids; i++) {
+ event.hdr =
+ FIELD_PREP(RISCV_IOMMU_FQ_HDR_CAUSE, RISCV_IOMMU_FQ_CAUSE_DDT_INVALID) |
+ FIELD_PREP(RISCV_IOMMU_FQ_HDR_DID, fwspec->ids[i]);
+
+ /* Sanity check DC of stage-1 from user data */
+ if (!user_arg->out_event_uptr || user_arg->event_len != event_len)
+ return -EINVAL;
+
+ event_user = u64_to_user_ptr(user_arg->out_event_uptr);
+
+ if (!user_arg->dc_uptr || user_arg->dc_len != dc_len)
+ return -EINVAL;
+
+ if (copy_from_user(&dc, u64_to_user_ptr(user_arg->dc_uptr), dc_len))
+ return -EFAULT;
+
+ if (!(dc.tc & RISCV_IOMMU_DDTE_VALID)) {
+ dev_dbg(dev, "Invalid DDT from user data\n");
+ if (copy_to_user(event_user, &event, event_len))
+ return -EFAULT;
+ }
+
+ if (!dc.fsc || dc.iohgatp) {
+ dev_dbg(dev, "Wrong page table from user data\n");
+ if (copy_to_user(event_user, &event, event_len))
+ return -EFAULT;
+ }
+
+ /* Save DC of stage-1 from user data */
+ memcpy(&info->dc_user,
+ riscv_iommu_get_dc(iommu, fwspec->ids[i]),
+ sizeof(struct riscv_iommu_dc));
+ info->dc_user.fsc = dc.fsc;
+ }
+
+ return 0;
+}
+
+static struct iommu_domain *
+riscv_iommu_domain_alloc_nested(struct device *dev,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent);
+ struct riscv_iommu_domain *s1_domain;
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct iommu_hwpt_riscv_iommu arg;
+ int ret, va_bits;
+
+ if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (parent->type != IOMMU_DOMAIN_UNMANAGED)
+ return ERR_PTR(-EINVAL);
+
+ ret = iommu_copy_struct_from_user(&arg,
+ user_data,
+ IOMMU_HWPT_DATA_RISCV_IOMMU,
+ out_event_uptr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL);
+ if (!s1_domain)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&s1_domain->lock);
+ INIT_LIST_HEAD_RCU(&s1_domain->bonds);
+
+ s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
+ RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
+ if (s1_domain->pscid < 0) {
+ iommu_free_page(s1_domain->pgd_root);
+ kfree(s1_domain);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Get device context of stage-1 from user */
+ ret = riscv_iommu_get_dc_user(dev, &arg);
+ if (ret) {
+ kfree(s1_domain);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!iommu) {
+ va_bits = VA_BITS;
+ } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV57) {
+ va_bits = 57;
+ } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV48) {
+ va_bits = 48;
+ } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV39) {
+ va_bits = 39;
+ } else {
+ dev_err(dev, "cannot find supported page table mode\n");
+ return ERR_PTR(-ENODEV);
+ }
+
+ /*
+ * The ops->domain_alloc_user could be called directly by the iommufd core,
+ * instead of the iommu core. So, this function needs to set the default
+ * values of the following data members:
+ * - domain->pgsize_bitmap
+ * - domain->geometry
+ * - domain->type
+ * - domain->ops
+ */
+ s1_domain->s2 = s2_domain;
+ s1_domain->domain.type = IOMMU_DOMAIN_NESTED;
+ s1_domain->domain.ops = &riscv_iommu_nested_domain_ops;
+ s1_domain->domain.pgsize_bitmap = SZ_4K;
+ s1_domain->domain.geometry.aperture_start = 0;
+ s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1);
+ s1_domain->domain.geometry.force_aperture = true;
+
+ return &s1_domain->domain;
+}
+
+static struct iommu_domain *
+riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct iommu_domain *domain;
+ struct riscv_iommu_domain *riscv_domain;
+
+ /* Allocate stage-1 domain if it has stage-2 parent domain */
+ if (parent)
+ return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
+
+ if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (user_data)
+ return ERR_PTR(-EINVAL);
+
+ /* domain_alloc_user op needs to be fully initialized */
+ domain = iommu_domain_alloc(dev->bus);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * We assume that only a nest-parent or g-stage domain will come here.
+ * TODO: Shadow page tables are not supported yet.
+ *       We currently can't distinguish a g-stage table from a shadow
+ *       page table here. A shadow page table shouldn't be
+ *       put at stage-2.
+ */
+ riscv_domain = iommu_domain_to_riscv(domain);
+
+ /* pgd_root may be allocated in .domain_alloc_paging */
+ if (riscv_domain->pgd_root)
+ iommu_free_page(riscv_domain->pgd_root);
+
+ riscv_domain->pgd_root = iommu_alloc_pages_node(riscv_domain->numa_node,
+ GFP_KERNEL_ACCOUNT,
+ 2);
+ if (!riscv_domain->pgd_root)
+ return ERR_PTR(-ENOMEM);
+
+ riscv_domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
+ RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
+ if (riscv_domain->gscid < 0) {
+ iommu_free_pages(riscv_domain->pgd_root, 2);
+ kfree(riscv_domain);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return domain;
+}
+
static void *riscv_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
{
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
@@ -1587,6 +1811,7 @@ static const struct iommu_ops riscv_iommu_ops = {
.blocked_domain = &riscv_iommu_blocking_domain,
.release_domain = &riscv_iommu_blocking_domain,
.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
+ .domain_alloc_user = riscv_iommu_domain_alloc_user,
.def_domain_type = riscv_iommu_device_domain_type,
.device_group = riscv_iommu_device_group,
.probe_device = riscv_iommu_probe_device,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index ec9aafd7d373..e10b6e236647 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -390,14 +390,31 @@ struct iommu_hwpt_vtd_s1 {
__u32 __reserved;
};
+/**
+ * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table
+ * info (IOMMU_HWPT_TYPE_RISCV_IOMMU)
+ * @dc_len: Length of device context
+ * @dc_uptr: User pointer to the address of device context
+ * @event_len: Length of an event record
+ * @out_event_uptr: User pointer to the address of event record
+ */
+struct iommu_hwpt_riscv_iommu {
+ __aligned_u64 dc_len;
+ __aligned_u64 dc_uptr;
+ __aligned_u64 event_len;
+ __aligned_u64 out_event_uptr;
+};
+
/**
* enum iommu_hwpt_data_type - IOMMU HWPT Data Type
* @IOMMU_HWPT_DATA_NONE: no data
* @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
+ * @IOMMU_HWPT_DATA_RISCV_IOMMU: RISC-V IOMMU device context table
*/
enum iommu_hwpt_data_type {
IOMMU_HWPT_DATA_NONE,
IOMMU_HWPT_DATA_VTD_S1,
+ IOMMU_HWPT_DATA_RISCV_IOMMU,
};
/**
--
2.17.1
This patch initializes the PMU and uninitializes it on driver removal.
It also provides the interrupt handling; the handler needs to be a
primary handler instead of a threaded function, because pt_regs is empty
when the IRQ is threaded, and pt_regs is required by perf_event_overflow.
Signed-off-by: Zong Li <[email protected]>
---
drivers/iommu/riscv/iommu.c | 59 +++++++++++++++++++++++++++++++++++++
1 file changed, 59 insertions(+)
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index ec701fde520f..e0bf74a9c64d 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -526,6 +526,56 @@ static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
return IRQ_HANDLED;
}
+/*
+ * IOMMU Hardware performance monitor
+ */
+
+/* HPM interrupt primary handler */
+static irqreturn_t riscv_iommu_hpm_irq_handler(int irq, void *dev_id)
+{
+ struct riscv_iommu_device *iommu = (struct riscv_iommu_device *)dev_id;
+
+ /* Process pmu irq */
+ riscv_iommu_pmu_handle_irq(&iommu->pmu);
+
+ /* Clear performance monitoring interrupt pending */
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, RISCV_IOMMU_IPSR_PMIP);
+
+ return IRQ_HANDLED;
+}
+
+/* HPM initialization */
+static int riscv_iommu_hpm_enable(struct riscv_iommu_device *iommu)
+{
+ int rc;
+
+ if (!(iommu->caps & RISCV_IOMMU_CAP_HPM))
+ return 0;
+
+ /*
+ * pt_regs is empty when the IRQ is threaded, but pt_regs is required
+ * by perf_event_overflow. Use a primary handler instead of a threaded
+ * function for the PM IRQ.
+ */
+ rc = devm_request_irq(iommu->dev, iommu->irqs[RISCV_IOMMU_IPSR_PMIP],
+ riscv_iommu_hpm_irq_handler, 0, NULL, iommu);
+ if (rc)
+ return rc;
+
+ return riscv_iommu_pmu_init(&iommu->pmu, iommu->reg, dev_name(iommu->dev));
+}
+
+/* HPM uninitialization */
+static void riscv_iommu_hpm_disable(struct riscv_iommu_device *iommu)
+{
+ if (!(iommu->caps & RISCV_IOMMU_CAP_HPM))
+ return;
+
+ devm_free_irq(iommu->dev, iommu->irqs[RISCV_IOMMU_IPSR_PMIP], iommu);
+
+ riscv_iommu_pmu_uninit(&iommu->pmu);
+}
+
/* Lookup and initialize device context info structure. */
static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
unsigned int devid)
@@ -1551,6 +1601,9 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
riscv_iommu_queue_disable(&iommu->cmdq);
riscv_iommu_queue_disable(&iommu->fltq);
+
+ if (iommu->caps & RISCV_IOMMU_CAP_HPM)
+ riscv_iommu_pmu_uninit(&iommu->pmu);
}
int riscv_iommu_init(struct riscv_iommu_device *iommu)
@@ -1590,6 +1643,10 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
if (rc)
goto err_queue_disable;
+ rc = riscv_iommu_hpm_enable(iommu);
+ if (rc)
+ goto err_hpm_disable;
+
rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
dev_name(iommu->dev));
if (rc) {
@@ -1608,6 +1665,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
err_remove_sysfs:
iommu_device_sysfs_remove(&iommu->iommu);
err_iodir_off:
+ riscv_iommu_hpm_disable(iommu);
+err_hpm_disable:
riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
err_queue_disable:
riscv_iommu_queue_disable(&iommu->fltq);
--
2.17.1
This patch adds a global ID allocator for GSCIDs and a wrapper for setting
the GSCID in the IOTLB invalidation command.
Set up iohgatp to enable the second-stage table and flush the stage-2
table when a GSCID is allocated.
The domain's GSCID should be freed when the domain is released. A GSCID
will be allocated for the parent domain in the nested IOMMU flow.
Signed-off-by: Zong Li <[email protected]>
---
drivers/iommu/riscv/iommu-bits.h | 7 +++
drivers/iommu/riscv/iommu.c | 81 ++++++++++++++++++++++----------
2 files changed, 62 insertions(+), 26 deletions(-)
diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
index 11351cf6c710..62b1ee387357 100644
--- a/drivers/iommu/riscv/iommu-bits.h
+++ b/drivers/iommu/riscv/iommu-bits.h
@@ -728,6 +728,13 @@ static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
cmd->dword1 = 0;
}
+static inline void riscv_iommu_cmd_inval_gvma(struct riscv_iommu_command *cmd)
+{
+ cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) |
+ FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA);
+ cmd->dword1 = 0;
+}
+
static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
u64 addr)
{
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index e0bf74a9c64d..d38e09b138b6 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -45,6 +45,10 @@
static DEFINE_IDA(riscv_iommu_pscids);
#define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1)
+/* IOMMU GSCID allocation namespace. */
+static DEFINE_IDA(riscv_iommu_gscids);
+#define RISCV_IOMMU_MAX_GSCID (BIT(16) - 1)
+
/* Device resource-managed allocations */
struct riscv_iommu_devres {
void *addr;
@@ -826,6 +830,7 @@ struct riscv_iommu_domain {
struct list_head bonds;
spinlock_t lock; /* protect bonds list updates. */
int pscid;
+ int gscid;
int numa_node;
int amo_enabled:1;
unsigned int pgd_mode;
@@ -919,29 +924,43 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
rcu_read_lock();
prev = NULL;
- list_for_each_entry_rcu(bond, &domain->bonds, list) {
- iommu = dev_to_iommu(bond->dev);
- /*
- * IOTLB invalidation request can be safely omitted if already sent
- * to the IOMMU for the same PSCID, and with domain->bonds list
- * arranged based on the device's IOMMU, it's sufficient to check
- * last device the invalidation was sent to.
- */
- if (iommu == prev)
- continue;
-
- riscv_iommu_cmd_inval_vma(&cmd);
- riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
- if (len && len >= RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
- for (iova = start; iova < end; iova += PAGE_SIZE) {
- riscv_iommu_cmd_inval_set_addr(&cmd, iova);
+ /*
+ * Host domain needs to flush entries in stage-2 for MSI mapping.
+ * However, device is bound to s1 domain instead of s2 domain.
+ * We need to flush mapping without looping devices of s2 domain
+ */
+ if (domain->gscid) {
+ riscv_iommu_cmd_inval_gvma(&cmd);
+ riscv_iommu_cmd_inval_set_gscid(&cmd, domain->gscid);
+ riscv_iommu_cmd_send(iommu, &cmd, 0);
+ riscv_iommu_cmd_iofence(&cmd);
+ riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
+ } else {
+ list_for_each_entry_rcu(bond, &domain->bonds, list) {
+ iommu = dev_to_iommu(bond->dev);
+
+ /*
+ * IOTLB invalidation request can be safely omitted if already sent
+ * to the IOMMU for the same PSCID, and with domain->bonds list
+ * arranged based on the device's IOMMU, it's sufficient to check
+ * last device the invalidation was sent to.
+ */
+ if (iommu == prev)
+ continue;
+
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+ if (len && len >= RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
+ for (iova = start; iova < end; iova += PAGE_SIZE) {
+ riscv_iommu_cmd_inval_set_addr(&cmd, iova);
+ riscv_iommu_cmd_send(iommu, &cmd, 0);
+ }
+ } else {
riscv_iommu_cmd_send(iommu, &cmd, 0);
}
- } else {
- riscv_iommu_cmd_send(iommu, &cmd, 0);
+ prev = iommu;
}
- prev = iommu;
}
prev = NULL;
@@ -972,7 +991,7 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
* interim translation faults.
*/
static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
- struct device *dev, u64 fsc, u64 ta)
+ struct device *dev, u64 fsc, u64 ta, u64 iohgatp)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct riscv_iommu_dc *dc;
@@ -1012,6 +1031,7 @@ static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
/* Update device context, write TC.V as the last step. */
WRITE_ONCE(dc->fsc, fsc);
WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
+ WRITE_ONCE(dc->iohgatp, iohgatp);
WRITE_ONCE(dc->tc, tc);
}
}
@@ -1271,6 +1291,9 @@ static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
if ((int)domain->pscid > 0)
ida_free(&riscv_iommu_pscids, domain->pscid);
+ if ((int)domain->gscid > 0)
+ ida_free(&riscv_iommu_gscids, domain->gscid);
+
riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
kfree(domain);
}
@@ -1296,7 +1319,7 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
- u64 fsc, ta;
+ u64 fsc = 0, iohgatp = 0, ta;
if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
return -ENODEV;
@@ -1314,12 +1337,18 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
*/
riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
- fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
- FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
+ if (domain->gscid)
+ iohgatp = FIELD_PREP(RISCV_IOMMU_DC_IOHGATP_MODE, domain->pgd_mode) |
+ FIELD_PREP(RISCV_IOMMU_DC_IOHGATP_GSCID, domain->gscid) |
+ FIELD_PREP(RISCV_IOMMU_DC_IOHGATP_PPN, virt_to_pfn(domain->pgd_root));
+ else
+ fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
+ FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
+
ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
RISCV_IOMMU_PC_TA_V;
- riscv_iommu_iodir_update(iommu, dev, fsc, ta);
+ riscv_iommu_iodir_update(iommu, dev, fsc, ta, iohgatp);
riscv_iommu_bond_unlink(info->domain, dev);
info->domain = domain;
@@ -1422,7 +1451,7 @@ static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
- riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
+ riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0, 0);
riscv_iommu_bond_unlink(info->domain, dev);
info->domain = NULL;
@@ -1442,7 +1471,7 @@ static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
- riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
+ riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V, 0);
riscv_iommu_bond_unlink(info->domain, dev);
info->domain = NULL;
--
2.17.1
On Tue, May 07, 2024 at 10:26:00PM +0800, Zong Li wrote:
> This patch implements the cache_invalidate_user operation, which lets
> userspace flush the hardware caches for a nested domain through iommufd.
>
> Signed-off-by: Zong Li <[email protected]>
> ---
> drivers/iommu/riscv/iommu.c | 91 ++++++++++++++++++++++++++++++++++++
> include/uapi/linux/iommufd.h | 9 ++++
> 2 files changed, 100 insertions(+)
>
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 7eda850df475..4dd58fe2242d 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -1522,9 +1522,100 @@ static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
> kfree(riscv_domain);
> }
>
> +static int riscv_iommu_fix_user_cmd(struct riscv_iommu_command *cmd,
> + unsigned int pscid, unsigned int gscid)
> +{
> + u32 opcode = FIELD_GET(RISCV_IOMMU_CMD_OPCODE, cmd->dword0);
> +
> + switch (opcode) {
> + case RISCV_IOMMU_CMD_IOTINVAL_OPCODE:
> + u32 func = FIELD_GET(RISCV_IOMMU_CMD_FUNC, cmd->dword0);
> +
> + if (func != RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA &&
> + func != RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA) {
> + pr_warn("The IOTINVAL function: 0x%x is not supported\n",
> + func);
> + return -EOPNOTSUPP;
> + }
> +
> + if (func == RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA) {
> + cmd->dword0 &= ~RISCV_IOMMU_CMD_FUNC;
> + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
> + RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
> + }
> +
> + cmd->dword0 &= ~(RISCV_IOMMU_CMD_IOTINVAL_PSCID |
> + RISCV_IOMMU_CMD_IOTINVAL_GSCID);
> + riscv_iommu_cmd_inval_set_pscid(cmd, pscid);
> + riscv_iommu_cmd_inval_set_gscid(cmd, gscid);
> + break;
> + case RISCV_IOMMU_CMD_IODIR_OPCODE:
> + /*
> + * Ensure the device ID is right. We expect that VMM has
> + * transferred the device ID to host's from guest's.
> + */
> + break;
> + default:
> + pr_warn("The user command: 0x%x is not supported\n", opcode);
> + return -EOPNOTSUPP;
No userspace triggerable warnings.
> +static int riscv_iommu_cache_invalidate_user(struct iommu_domain *domain,
> + struct iommu_user_data_array *array)
> +{
> + struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> + struct riscv_iommu_device *iommu;
> + struct riscv_iommu_bond *bond;
> + struct riscv_iommu_command cmd;
> + struct iommu_hwpt_riscv_iommu_invalidate inv_info;
> + int ret, index;
> +
> + if (!riscv_domain)
> + return -EINVAL;
> +
> + /* Assume attached devices in the domain go through the same IOMMU device */
No, you can't assume that.
Jason
On Tue, May 07, 2024 at 10:25:57PM +0800, Zong Li wrote:
> @@ -919,29 +924,43 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
> rcu_read_lock();
>
> prev = NULL;
> - list_for_each_entry_rcu(bond, &domain->bonds, list) {
> - iommu = dev_to_iommu(bond->dev);
>
> - /*
> - * IOTLB invalidation request can be safely omitted if already sent
> - * to the IOMMU for the same PSCID, and with domain->bonds list
> - * arranged based on the device's IOMMU, it's sufficient to check
> - * last device the invalidation was sent to.
> - */
> - if (iommu == prev)
> - continue;
> -
> - riscv_iommu_cmd_inval_vma(&cmd);
> - riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> - if (len && len >= RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
> - for (iova = start; iova < end; iova += PAGE_SIZE) {
> - riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> + /*
> + * Host domain needs to flush entries in stage-2 for MSI mapping.
> + * However, device is bound to s1 domain instead of s2 domain.
> + * We need to flush mapping without looping devices of s2 domain
> + */
> + if (domain->gscid) {
> + riscv_iommu_cmd_inval_gvma(&cmd);
> + riscv_iommu_cmd_inval_set_gscid(&cmd, domain->gscid);
> + riscv_iommu_cmd_send(iommu, &cmd, 0);
> + riscv_iommu_cmd_iofence(&cmd);
> + riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
Is iommu null here? Where did it come from?
This looks wrong too. The "bonds" list is sort of misnamed, it is
really a list of invalidation instructions. If you need a special
invalidation instruction for this case then you should allocate some
memory and add it to the bond list when the attach is done.
Invalidation should simply iterate over the bond list and do the
instructions it contains, always.
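Untested, but something along these lines to show the shape, extending the
existing bond struct (field names are only illustrative):

  /* One entry per attachment, describing how to invalidate for it. */
  struct riscv_iommu_bond {
          struct list_head list;
          struct rcu_head rcu;
          struct device *dev;             /* NULL for a domain-wide (e.g. GSCID) entry */
          unsigned int inval_func;        /* IOTINVAL.VMA or IOTINVAL.GVMA */
          unsigned int pscid;
          unsigned int gscid;
  };

Then riscv_iommu_iotlb_inval() just walks domain->bonds and emits whatever
each entry describes, without special casing.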
> static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
> - struct device *dev, u64 fsc, u64 ta)
> + struct device *dev, u64 fsc, u64 ta, u64 iohgatp)
I think you should make a struct to represent the dc entry.
Jason
On Tue, May 7, 2024 at 10:54 PM Jason Gunthorpe <[email protected]> wrote:
>
> On Tue, May 07, 2024 at 10:25:58PM +0800, Zong Li wrote:
> > +{
> > + struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > + struct iommu_hw_info_riscv_iommu *info;
> > +
> > + if (!iommu)
> > + return ERR_PTR(-ENODEV);
>
> This is not possible, don't include impossible checks like this.
Thanks for pointing this out, I will remove it in the next version.
>
> > + info = kzalloc(sizeof(*info), GFP_KERNEL);
> > + if (!info)
> > + return ERR_PTR(-ENOMEM);
> > +
> > + info->capability = iommu->caps;
> > + info->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
> > +
> > + *length = sizeof(*info);
> > + *type = IOMMU_HW_INFO_TYPE_RISCV_IOMMU;
> > +
> > + return info;
> > +}
> > +
> > static int riscv_iommu_device_domain_type(struct device *dev)
> > {
> > return 0;
> > @@ -1560,6 +1582,7 @@ static void riscv_iommu_release_device(struct device *dev)
> > static const struct iommu_ops riscv_iommu_ops = {
> > .pgsize_bitmap = SZ_4K,
> > .of_xlate = riscv_iommu_of_xlate,
> > + .hw_info = riscv_iommu_hw_info,
> > .identity_domain = &riscv_iommu_identity_domain,
> > .blocked_domain = &riscv_iommu_blocking_domain,
> > .release_domain = &riscv_iommu_blocking_domain,
> > diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> > index 1dfeaa2e649e..ec9aafd7d373 100644
> > --- a/include/uapi/linux/iommufd.h
> > +++ b/include/uapi/linux/iommufd.h
> > @@ -475,15 +475,28 @@ struct iommu_hw_info_vtd {
> > __aligned_u64 ecap_reg;
> > };
> >
> > +/**
> > + * struct iommu_hw_info_riscv_iommu - RISCV IOMMU hardware information
> > + *
> > + * @capability: Value of RISC-V IOMMU capability register
> > + * @fctl: Value of RISC-V IOMMU feature control register
> > + */
>
> Please call out explictly what spec these values come from.
Let me add a description of the spec section that defines them.
>
> > +struct iommu_hw_info_riscv_iommu {
> > + __aligned_u64 capability;
> > + __u32 fctl;
> > +};
>
> Add explicit padding here
I will add a __u32 reserved field here in the next version. Thanks.
>
> Jason
On Tue, May 07, 2024 at 10:25:58PM +0800, Zong Li wrote:
> +{
> + struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> + struct iommu_hw_info_riscv_iommu *info;
> +
> + if (!iommu)
> + return ERR_PTR(-ENODEV);
This is not possible, don't include impossible checks like this.
> + info = kzalloc(sizeof(*info), GFP_KERNEL);
> + if (!info)
> + return ERR_PTR(-ENOMEM);
> +
> + info->capability = iommu->caps;
> + info->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
> +
> + *length = sizeof(*info);
> + *type = IOMMU_HW_INFO_TYPE_RISCV_IOMMU;
> +
> + return info;
> +}
> +
> static int riscv_iommu_device_domain_type(struct device *dev)
> {
> return 0;
> @@ -1560,6 +1582,7 @@ static void riscv_iommu_release_device(struct device *dev)
> static const struct iommu_ops riscv_iommu_ops = {
> .pgsize_bitmap = SZ_4K,
> .of_xlate = riscv_iommu_of_xlate,
> + .hw_info = riscv_iommu_hw_info,
> .identity_domain = &riscv_iommu_identity_domain,
> .blocked_domain = &riscv_iommu_blocking_domain,
> .release_domain = &riscv_iommu_blocking_domain,
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 1dfeaa2e649e..ec9aafd7d373 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -475,15 +475,28 @@ struct iommu_hw_info_vtd {
> __aligned_u64 ecap_reg;
> };
>
> +/**
> + * struct iommu_hw_info_riscv_iommu - RISCV IOMMU hardware information
> + *
> + * @capability: Value of RISC-V IOMMU capability register
> + * @fctl: Value of RISC-V IOMMU feature control register
> + */
Please call out explictly what spec these values come from.
> +struct iommu_hw_info_riscv_iommu {
> + __aligned_u64 capability;
> + __u32 fctl;
> +};
Add explicit padding here
Jason
On Tue, May 7, 2024 at 11:08 PM Jason Gunthorpe <[email protected]> wrote:
>
> On Tue, May 07, 2024 at 10:26:00PM +0800, Zong Li wrote:
> > This patch implements the cache_invalidate_user operation, which lets
> > userspace flush the hardware caches for a nested domain through iommufd.
> >
> > Signed-off-by: Zong Li <[email protected]>
> > ---
> > drivers/iommu/riscv/iommu.c | 91 ++++++++++++++++++++++++++++++++++++
> > include/uapi/linux/iommufd.h | 9 ++++
> > 2 files changed, 100 insertions(+)
> >
> > diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> > index 7eda850df475..4dd58fe2242d 100644
> > --- a/drivers/iommu/riscv/iommu.c
> > +++ b/drivers/iommu/riscv/iommu.c
> > @@ -1522,9 +1522,100 @@ static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
> > kfree(riscv_domain);
> > }
> >
> > +static int riscv_iommu_fix_user_cmd(struct riscv_iommu_command *cmd,
> > + unsigned int pscid, unsigned int gscid)
> > +{
> > + u32 opcode = FIELD_GET(RISCV_IOMMU_CMD_OPCODE, cmd->dword0);
> > +
> > + switch (opcode) {
> > + case RISCV_IOMMU_CMD_IOTINVAL_OPCODE:
> > + u32 func = FIELD_GET(RISCV_IOMMU_CMD_FUNC, cmd->dword0);
> > +
> > + if (func != RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA &&
> > + func != RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA) {
> > + pr_warn("The IOTINVAL function: 0x%x is not supported\n",
> > + func);
> > + return -EOPNOTSUPP;
> > + }
> > +
> > + if (func == RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA) {
> > + cmd->dword0 &= ~RISCV_IOMMU_CMD_FUNC;
> > + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_FUNC,
> > + RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
> > + }
> > +
> > + cmd->dword0 &= ~(RISCV_IOMMU_CMD_IOTINVAL_PSCID |
> > + RISCV_IOMMU_CMD_IOTINVAL_GSCID);
> > + riscv_iommu_cmd_inval_set_pscid(cmd, pscid);
> > + riscv_iommu_cmd_inval_set_gscid(cmd, gscid);
> > + break;
> > + case RISCV_IOMMU_CMD_IODIR_OPCODE:
> > + /*
> > + * Ensure the device ID is right. We expect that VMM has
> > + * transferred the device ID to host's from guest's.
> > + */
> > + break;
> > + default:
> > + pr_warn("The user command: 0x%x is not supported\n", opcode);
> > + return -EOPNOTSUPP;
>
> No userspace triggerable warnings.
I don't completely understand this. Should we suppress the message and
return the error directly, or should we convert the warning to an error
(i.e. pr_err)?
>
> > +static int riscv_iommu_cache_invalidate_user(struct iommu_domain *domain,
> > + struct iommu_user_data_array *array)
> > +{
> > + struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> > + struct riscv_iommu_device *iommu;
> > + struct riscv_iommu_bond *bond;
> > + struct riscv_iommu_command cmd;
> > + struct iommu_hwpt_riscv_iommu_invalidate inv_info;
> > + int ret, index;
> > +
> > + if (!riscv_domain)
> > + return -EINVAL;
> > +
> > + /* Assume attached devices in the domain go through the same IOMMU device */
>
> No, you can't assume that.
Do you think it makes sense to keep a reference to the riscv_iommu_device
in riscv_iommu_domain? Or should we add some data structure that maps a
riscv_iommu_domain to its riscv_iommu_device, so that we can get the
corresponding riscv_iommu_device from a riscv_iommu_domain?
Thanks
>
> Jason
On Tue, May 07, 2024 at 10:25:59PM +0800, Zong Li wrote:
> This patch implements the .domain_alloc_user operation for creating domains
> owned by userspace, e.g. through IOMMUFD. Add an s2 domain as the parent
> domain for the second stage; the s1 domain will be the first stage.
>
> Don't remove the device's IOMMU private data in the blocked domain,
> because it holds the device's user data, which is used when attaching
> the device to the s1 domain.
>
> Signed-off-by: Zong Li <[email protected]>
> ---
> drivers/iommu/riscv/iommu.c | 227 ++++++++++++++++++++++++++++++++++-
> include/uapi/linux/iommufd.h | 17 +++
> 2 files changed, 243 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 072251f6ad85..7eda850df475 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -827,6 +827,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
>
> /* This struct contains protection domain specific IOMMU driver data. */
> struct riscv_iommu_domain {
> + struct riscv_iommu_domain *s2;
> struct iommu_domain domain;
> struct list_head bonds;
> spinlock_t lock; /* protect bonds list updates. */
> @@ -844,6 +845,7 @@ struct riscv_iommu_domain {
> /* Private IOMMU data for managed devices, dev_iommu_priv_* */
> struct riscv_iommu_info {
> struct riscv_iommu_domain *domain;
> + struct riscv_iommu_dc dc_user;
> };
>
> /* Linkage between an iommu_domain and attached devices. */
> @@ -1454,7 +1456,6 @@ static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
>
> riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0, 0);
> riscv_iommu_bond_unlink(info->domain, dev);
> - info->domain = NULL;
Nope, whatever is going on here, this can't be correct.
Once a domain is detached the driver must drop all references to it
immediately.
> return 0;
> }
> @@ -1486,6 +1487,229 @@ static struct iommu_domain riscv_iommu_identity_domain = {
> }
> };
>
> +/**
> + * Nested IOMMU operations
> + */
> +
> +static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev)
> +{
> + struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> + struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> + struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> +
> + if (riscv_domain->numa_node == NUMA_NO_NODE)
> + riscv_domain->numa_node = dev_to_node(iommu->dev);
Why? The kernel doesn't do any memory allocation for nested domains,
does it?
> + riscv_iommu_bond_unlink(info->domain, dev);
> +
> + if (riscv_iommu_bond_link(riscv_domain, dev))
> + return -ENOMEM;
> +
> + riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX);
Why? The cache tags should be in good shape before they are assigned
to the device. Wiping it on every attach is just needless flushing
that isn't useful.
> + riscv_iommu_iodir_update(iommu, dev, info->dc_user.fsc, info->dc_user.ta,
> + info->dc_user.iohgatp);
This isn't ideal. Ideally the driver should make changing from S2 to
S2+S1 and back hitless. Wiping the valid bit as part of the programming
is not good.
As I said before, this driver probably needs the programming sequencer
from ARM.
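Roughly along these lines (only a sketch of the direction; I'm assuming
the IODIR command helpers from this series, and the ordering still needs
to be checked against the spec):

	/* Keep dc->tc.V set; update the translation fields in place,
	 * then invalidate the cached device context. */
	WRITE_ONCE(dc->fsc, new_fsc);		/* stage-1 table / mode */
	WRITE_ONCE(dc->ta, new_ta);		/* PSCID */
	WRITE_ONCE(dc->iohgatp, new_iohgatp);	/* stage-2 table + GSCID */
	dma_wmb();
	riscv_iommu_cmd_iodir_inval_ddt(&cmd);
	riscv_iommu_cmd_iodir_set_did(&cmd, did);
	riscv_iommu_cmd_send(iommu, &cmd, 0);
	riscv_iommu_cmd_iofence(&cmd);
	riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);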
> + info->domain = riscv_domain;
> +
> + return 0;
> +}
> +
> +static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
> +{
> + struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> +
> + kfree(riscv_domain);
> +}
Doesn't the driver already have this function?
> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_nested(struct device *dev,
> + struct iommu_domain *parent,
> + const struct iommu_user_data *user_data)
> +{
> + struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent);
> + struct riscv_iommu_domain *s1_domain;
> + struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> + struct iommu_hwpt_riscv_iommu arg;
> + int ret, va_bits;
> +
> + if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU)
> + return ERR_PTR(-EOPNOTSUPP);
> +
> + if (parent->type != IOMMU_DOMAIN_UNMANAGED)
> + return ERR_PTR(-EINVAL);
> +
> + ret = iommu_copy_struct_from_user(&arg,
> + user_data,
> + IOMMU_HWPT_DATA_RISCV_IOMMU,
> + out_event_uptr);
> + if (ret)
> + return ERR_PTR(ret);
> +
> + s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL);
> + if (!s1_domain)
> + return ERR_PTR(-ENOMEM);
> +
> + spin_lock_init(&s1_domain->lock);
> + INIT_LIST_HEAD_RCU(&s1_domain->bonds);
> +
> + s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
> + RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
> + if (s1_domain->pscid < 0) {
> + iommu_free_page(s1_domain->pgd_root);
> + kfree(s1_domain);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + /* Get device context of stage-1 from user*/
> + ret = riscv_iommu_get_dc_user(dev, &arg);
> + if (ret) {
> + kfree(s1_domain);
> + return ERR_PTR(-EINVAL);
> + }
> +
> + if (!iommu) {
> + va_bits = VA_BITS;
> + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV57) {
> + va_bits = 57;
> + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV48) {
> + va_bits = 48;
> + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV39) {
> + va_bits = 39;
> + } else {
> + dev_err(dev, "cannot find supported page table mode\n");
> + return ERR_PTR(-ENODEV);
> + }
> +
> + /*
> + * The ops->domain_alloc_user could be directly called by the iommufd core,
> + * instead of iommu core. So, this function need to set the default value of
> + * following data member:
> + * - domain->pgsize_bitmap
> + * - domain->geometry
> + * - domain->type
> + * - domain->ops
> + */
> + s1_domain->s2 = s2_domain;
> + s1_domain->domain.type = IOMMU_DOMAIN_NESTED;
> + s1_domain->domain.ops = &riscv_iommu_nested_domain_ops;
> + s1_domain->domain.pgsize_bitmap = SZ_4K;
> + s1_domain->domain.geometry.aperture_start = 0;
> + s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1);
> + s1_domain->domain.geometry.force_aperture = true;
nested domains don't support paging/map, so they don't use any of
this. Don't set it. That also removes the confusing va_bits stuff.
> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
> + struct iommu_domain *parent,
> + const struct iommu_user_data *user_data)
> +{
> + struct iommu_domain *domain;
> + struct riscv_iommu_domain *riscv_domain;
> +
> + /* Allocate stage-1 domain if it has stage-2 parent domain */
> + if (parent)
> + return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
> +
> + if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
> + return ERR_PTR(-EOPNOTSUPP);
This if should be moved up to the top, and this driver does not support
dirty_tracking, so don't test it.
> + if (user_data)
> + return ERR_PTR(-EINVAL);
Really? Don't you need to ask for the s2?
> + /* domain_alloc_user op needs to be fully initialized */
> + domain = iommu_domain_alloc(dev->bus);
Please no, structure your driver properly so you can call only
internal functions. This was a hack for intel only.
> + if (!domain)
> + return ERR_PTR(-ENOMEM);
> +
> + /*
> + * We assume that nest-parent or g-stage only will come here
No... That isn't right. Normal non-virtualization paging domains come
here too.
The default of this function, with an empty user_data, should be to
create the same domain as alloc_domain_paging.
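Something like this at the top of the function (a sketch; I'm assuming
riscv_iommu_alloc_paging_domain(), the existing .domain_alloc_paging op
in this series, can be called directly with this signature):

static struct iommu_domain *
riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
			      struct iommu_domain *parent,
			      const struct iommu_user_data *user_data)
{
	/* Reject unsupported flags first; no dirty tracking support here. */
	if (flags & ~IOMMU_HWPT_ALLOC_NEST_PARENT)
		return ERR_PTR(-EOPNOTSUPP);

	/* Allocate a stage-1 domain if there is a stage-2 parent. */
	if (parent)
		return riscv_iommu_domain_alloc_nested(dev, parent, user_data);

	/* No parent and no user data: same domain as .domain_alloc_paging. */
	if (!user_data)
		return riscv_iommu_alloc_paging_domain(dev);

	return ERR_PTR(-EOPNOTSUPP);
}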
> + * TODO: Shadow page table isn't supported now.
> + * We currently can't distinguish g-stage and shadow
> + * page table here. Shadow page table shouldn't be
> + * put at stage-2.
> + */
> + riscv_domain = iommu_domain_to_riscv(domain);
What is a shadow page table?
> + /* pgd_root may be allocated in .domain_alloc_paging */
> + if (riscv_domain->pgd_root)
> + iommu_free_page(riscv_domain->pgd_root);
And this is gross, structure your driver right so you don't have to
undo what prior functions did.
> + riscv_domain->pgd_root = iommu_alloc_pages_node(riscv_domain->numa_node,
> + GFP_KERNEL_ACCOUNT,
> + 2);
> + if (!riscv_domain->pgd_root)
> + return ERR_PTR(-ENOMEM);
> +
> + riscv_domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
> + RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
> + if (riscv_domain->gscid < 0) {
> + iommu_free_pages(riscv_domain->pgd_root, 2);
> + kfree(riscv_domain);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + return domain;
> +}
> +
> static void *riscv_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
> {
> struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> @@ -1587,6 +1811,7 @@ static const struct iommu_ops riscv_iommu_ops = {
> .blocked_domain = &riscv_iommu_blocking_domain,
> .release_domain = &riscv_iommu_blocking_domain,
> .domain_alloc_paging = riscv_iommu_alloc_paging_domain,
> + .domain_alloc_user = riscv_iommu_domain_alloc_user,
> .def_domain_type = riscv_iommu_device_domain_type,
> .device_group = riscv_iommu_device_group,
> .probe_device = riscv_iommu_probe_device,
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index ec9aafd7d373..e10b6e236647 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -390,14 +390,31 @@ struct iommu_hwpt_vtd_s1 {
> __u32 __reserved;
> };
>
> +/**
> + * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table
> + * info (IOMMU_HWPT_TYPE_RISCV_IOMMU)
> + * @dc_len: Length of device context
> + * @dc_uptr: User pointer to the address of device context
> + * @event_len: Length of an event record
> + * @out_event_uptr: User pointer to the address of event record
> + */
> +struct iommu_hwpt_riscv_iommu {
> + __aligned_u64 dc_len;
> + __aligned_u64 dc_uptr;
So far other drivers have been inlining their "device context" in this
struct, why do you need a pointer and length?
> + __aligned_u64 event_len;
> + __aligned_u64 out_event_uptr;
Weird? Why?
Jason
On Tue, May 7, 2024 at 11:15 PM Jason Gunthorpe <[email protected]> wrote:
>
> On Tue, May 07, 2024 at 10:25:57PM +0800, Zong Li wrote:
> > @@ -919,29 +924,43 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
> > rcu_read_lock();
> >
> > prev = NULL;
> > - list_for_each_entry_rcu(bond, &domain->bonds, list) {
> > - iommu = dev_to_iommu(bond->dev);
> >
> > - /*
> > - * IOTLB invalidation request can be safely omitted if already sent
> > - * to the IOMMU for the same PSCID, and with domain->bonds list
> > - * arranged based on the device's IOMMU, it's sufficient to check
> > - * last device the invalidation was sent to.
> > - */
> > - if (iommu == prev)
> > - continue;
> > -
> > - riscv_iommu_cmd_inval_vma(&cmd);
> > - riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> > - if (len && len >= RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
> > - for (iova = start; iova < end; iova += PAGE_SIZE) {
> > - riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> > + /*
> > + * Host domain needs to flush entries in stage-2 for MSI mapping.
> > + * However, device is bound to s1 domain instead of s2 domain.
> > + * We need to flush mapping without looping devices of s2 domain
> > + */
> > + if (domain->gscid) {
> > + riscv_iommu_cmd_inval_gvma(&cmd);
> > + riscv_iommu_cmd_inval_set_gscid(&cmd, domain->gscid);
> > + riscv_iommu_cmd_send(iommu, &cmd, 0);
> > + riscv_iommu_cmd_iofence(&cmd);
> > + riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
>
> Is iommu null here? Where did it come from?
>
> This looks wrong too. The "bonds" list is sort of misnamed, it is
> really a list of invalidation instructions. If you need a special
> invalidation instruction for this case then you should allocate a
> memory and add it to the bond list when the attach is done.
>
> Invalidation should simply iterate over the bond list and do the
> instructions it contains, always.
I messed up this piece of code while cleaning it up. I will fix it in
the next version. However, after your tips, it seems to me that we
should allocate a new bond entry in the s2 domain's list. The original
bond entry becomes detached from the s2 domain and attached to the s1
domain when the device is passed through to the guest, so if we don't
create a new bond in the s2 domain, its list would lose that entry. Let
me modify the implementation here, roughly as sketched below. Thanks.
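For concreteness, roughly what I have in mind in
riscv_iommu_attach_dev_nested(), reusing the existing bond helpers
(sketch only):

	/* Keep an entry on the s2 domain's bond list as well, so a
	 * stage-2 flush still reaches this device after it moves to
	 * the s1 domain. */
	if (riscv_iommu_bond_link(riscv_domain->s2, dev))
		return -ENOMEM;
	if (riscv_iommu_bond_link(riscv_domain, dev)) {
		riscv_iommu_bond_unlink(riscv_domain->s2, dev);
		return -ENOMEM;
	}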
>
> > static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
> > - struct device *dev, u64 fsc, u64 ta)
> > + struct device *dev, u64 fsc, u64 ta, u64 iohgatp)
>
> I think you should make a struct to represent the dc entry.
>
> Jason
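For illustration, the struct-based interface could look roughly like
this (a sketch only, reusing struct riscv_iommu_dc from this series):

static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
				     struct device *dev,
				     const struct riscv_iommu_dc *dc)
{
	/* program dc->fsc, dc->ta and dc->iohgatp into the device
	 * context, instead of passing three separate u64 arguments */
}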
On Tue, May 07, 2024 at 11:52:15PM +0800, Zong Li wrote:
> On Tue, May 7, 2024 at 11:15 PM Jason Gunthorpe <[email protected]> wrote:
> >
> > On Tue, May 07, 2024 at 10:25:57PM +0800, Zong Li wrote:
> > > @@ -919,29 +924,43 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
> > > rcu_read_lock();
> > >
> > > prev = NULL;
> > > - list_for_each_entry_rcu(bond, &domain->bonds, list) {
> > > - iommu = dev_to_iommu(bond->dev);
> > >
> > > - /*
> > > - * IOTLB invalidation request can be safely omitted if already sent
> > > - * to the IOMMU for the same PSCID, and with domain->bonds list
> > > - * arranged based on the device's IOMMU, it's sufficient to check
> > > - * last device the invalidation was sent to.
> > > - */
> > > - if (iommu == prev)
> > > - continue;
> > > -
> > > - riscv_iommu_cmd_inval_vma(&cmd);
> > > - riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> > > - if (len && len >= RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
> > > - for (iova = start; iova < end; iova += PAGE_SIZE) {
> > > - riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> > > + /*
> > > + * Host domain needs to flush entries in stage-2 for MSI mapping.
> > > + * However, device is bound to s1 domain instead of s2 domain.
> > > + * We need to flush mapping without looping devices of s2 domain
> > > + */
> > > + if (domain->gscid) {
> > > + riscv_iommu_cmd_inval_gvma(&cmd);
> > > + riscv_iommu_cmd_inval_set_gscid(&cmd, domain->gscid);
> > > + riscv_iommu_cmd_send(iommu, &cmd, 0);
> > > + riscv_iommu_cmd_iofence(&cmd);
> > > + riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
> >
> > Is iommu null here? Where did it come from?
> >
> > This looks wrong too. The "bonds" list is sort of misnamed, it is
> > really a list of invalidation instructions. If you need a special
> > invalidation instruction for this case then you should allocate a
> > memory and add it to the bond list when the attach is done.
> >
> > Invalidation should simply iterate over the bond list and do the
> > instructions it contains, always.
>
> I messed up this piece of code while cleaning it. I will fix it in the
> next version. However, after your tips, it seems to me that we should
> allocate a new bond entry in the s2 domain's list.
Yes, when the nest is attached the S2's bond should get a GSCID
invalidation instruction and the S1's bond should get no invalidation
instruction. Bond is better understood as "paging domain invalidation
instructions".
(also if you follow this advice, then again, I don't see why the idr
allocators are global)
You have to make a decision on the user invalidation flow, and some of
that depends on what you plan to do for ATS invalidation.
It is OK to make the nested domain locked to a single iommu, enforced
at attach time, but don't use the bond list to do it.
For single iommu you'd just de-virtualize the PSCID and the ATS vRID
and stuff the commands into the single iommu. But this shouldn't
interact with the bond. The bond list is about invalidation
instructions for the paging domain. Just loosely store the owning
instance in the nesting domain struct.
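To sketch what I mean (illustrative only, not existing driver code):

struct riscv_iommu_bond {
	struct list_head list;
	struct rcu_head rcu;
	struct device *dev;
	unsigned int gscid;	/* non-zero: issue IOTINVAL.GVMA with this
				 * GSCID when the paging domain is flushed */
};

riscv_iommu_iotlb_inval() then simply walks the bond list and emits
whatever instruction each entry describes, with no special casing.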
Also please be careful that you don't advertise ATS support to the
guest before the kernel driver understands how to support it. You
should probably put a note to that effect in the uapi struct for the
get info patch.
Jason
On Tue, May 07, 2024 at 11:35:16PM +0800, Zong Li wrote:
> > > + default:
> > > + pr_warn("The user command: 0x%x is not supported\n", opcode);
> > > + return -EOPNOTSUPP;
> >
> > No userspace triggerable warnings.
>
> I don't completely understand this. Should we suppress the message and
> return the error directly, or should we convert the warning to an error
> (i.e. pr_err)?
A userspace system call should never print to dmesg. Return an error
code. Put a pr_debug if you really care.
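e.g. something like:

	default:
		/* userspace-triggerable path: no dmesg noise, just fail */
		pr_debug("unsupported user command opcode: 0x%x\n", opcode);
		return -EOPNOTSUPP;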
Jason