2023-07-19 19:43:34

by Tomasz Jeznach

Subject: [PATCH 08/11] RISC-V: drivers/iommu/riscv: Add page table support

Introduces I/O page level translation services, with 4K, 2M, 1G page
size support and enables page level iommu_map/unmap domain interfaces.

Co-developed-by: Sebastien Boeuf <[email protected]>
Signed-off-by: Sebastien Boeuf <[email protected]>
Signed-off-by: Tomasz Jeznach <[email protected]>
---
drivers/iommu/io-pgtable.c | 3 +
drivers/iommu/riscv/Makefile | 2 +-
drivers/iommu/riscv/io_pgtable.c | 266 +++++++++++++++++++++++++++++++
drivers/iommu/riscv/iommu.c | 40 +++--
drivers/iommu/riscv/iommu.h | 1 +
include/linux/io-pgtable.h | 2 +
6 files changed, 297 insertions(+), 17 deletions(-)
create mode 100644 drivers/iommu/riscv/io_pgtable.c

diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index b843fcd365d2..c4807175934f 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -32,6 +32,9 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
[AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
[AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
#endif
+#ifdef CONFIG_RISCV_IOMMU
+ [RISCV_IOMMU] = &io_pgtable_riscv_init_fns,
+#endif
};

struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
index 9523eb053cfc..13af452c3052 100644
--- a/drivers/iommu/riscv/Makefile
+++ b/drivers/iommu/riscv/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-pci.o iommu-platform.o iommu-sysfs.o
\ No newline at end of file
+obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-pci.o iommu-platform.o iommu-sysfs.o io_pgtable.o
diff --git a/drivers/iommu/riscv/io_pgtable.c b/drivers/iommu/riscv/io_pgtable.c
new file mode 100644
index 000000000000..b6e603e6726e
--- /dev/null
+++ b/drivers/iommu/riscv/io_pgtable.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2022-2023 Rivos Inc.
+ *
+ * RISC-V IOMMU page table allocator.
+ *
+ * Authors:
+ * Tomasz Jeznach <[email protected]>
+ * Sebastien Boeuf <[email protected]>
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/io-pgtable.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+
+#include "iommu.h"
+
+#define io_pgtable_to_domain(x) \
+ container_of((x), struct riscv_iommu_domain, pgtbl)
+
+#define io_pgtable_ops_to_domain(x) \
+ io_pgtable_to_domain(container_of((x), struct io_pgtable, ops))
+
+static inline size_t get_page_size(size_t size)
+{
+ if (size >= IOMMU_PAGE_SIZE_512G)
+ return IOMMU_PAGE_SIZE_512G;
+
+ if (size >= IOMMU_PAGE_SIZE_1G)
+ return IOMMU_PAGE_SIZE_1G;
+
+ if (size >= IOMMU_PAGE_SIZE_2M)
+ return IOMMU_PAGE_SIZE_2M;
+
+ return IOMMU_PAGE_SIZE_4K;
+}
+
+static void riscv_iommu_pt_walk_free(pmd_t * ptp, unsigned shift, bool root)
+{
+ pmd_t *pte, *pt_base;
+ int i;
+
+ if (shift == PAGE_SHIFT)
+ return;
+
+ if (root)
+ pt_base = ptp;
+ else
+ pt_base =
+ (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp)));
+
+ /* Recursively free all sub page table pages */
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pte = pt_base + i;
+ if (pmd_present(*pte) && !pmd_leaf(*pte))
+ riscv_iommu_pt_walk_free(pte, shift - 9, false);
+ }
+
+ /* Now free the current page table page */
+ if (!root && pmd_present(*pt_base))
+ free_page((unsigned long)pt_base);
+}
+
+static void riscv_iommu_free_pgtable(struct io_pgtable *iop)
+{
+ struct riscv_iommu_domain *domain = io_pgtable_to_domain(iop);
+ riscv_iommu_pt_walk_free((pmd_t *) domain->pgd_root, PGDIR_SHIFT, true);
+}
+
+static pte_t *riscv_iommu_pt_walk_alloc(pmd_t * ptp, unsigned long iova,
+ unsigned shift, bool root,
+ size_t pgsize,
+ unsigned long (*pd_alloc)(gfp_t),
+ gfp_t gfp)
+{
+ pmd_t *pte;
+ unsigned long pfn;
+
+ if (root)
+ pte = ptp + ((iova >> shift) & (PTRS_PER_PMD - 1));
+ else
+ pte = (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp))) +
+ ((iova >> shift) & (PTRS_PER_PMD - 1));
+
+ if ((1ULL << shift) <= pgsize) {
+ if (pmd_present(*pte) && !pmd_leaf(*pte))
+ riscv_iommu_pt_walk_free(pte, shift - 9, false);
+ return (pte_t *) pte;
+ }
+
+ if (pmd_none(*pte)) {
+ pfn = pd_alloc ? virt_to_pfn(pd_alloc(gfp)) : 0;
+ if (!pfn)
+ return NULL;
+ set_pmd(pte, __pmd((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
+ }
+
+ return riscv_iommu_pt_walk_alloc(pte, iova, shift - 9, false,
+ pgsize, pd_alloc, gfp);
+}
+
+static pte_t *riscv_iommu_pt_walk_fetch(pmd_t * ptp,
+ unsigned long iova, unsigned shift,
+ bool root)
+{
+ pmd_t *pte;
+
+ if (root)
+ pte = ptp + ((iova >> shift) & (PTRS_PER_PMD - 1));
+ else
+ pte = (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp))) +
+ ((iova >> shift) & (PTRS_PER_PMD - 1));
+
+ if (pmd_leaf(*pte))
+ return (pte_t *) pte;
+ else if (pmd_none(*pte))
+ return NULL;
+ else if (shift == PAGE_SHIFT)
+ return NULL;
+
+ return riscv_iommu_pt_walk_fetch(pte, iova, shift - 9, false);
+}
+
+static int riscv_iommu_map_pages(struct io_pgtable_ops *ops,
+ unsigned long iova, phys_addr_t phys,
+ size_t pgsize, size_t pgcount, int prot,
+ gfp_t gfp, size_t *mapped)
+{
+ struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
+ size_t size = 0;
+ size_t page_size = get_page_size(pgsize);
+ pte_t *pte;
+ pte_t pte_val;
+ pgprot_t pte_prot;
+
+ if (domain->domain.type == IOMMU_DOMAIN_BLOCKED)
+ return -ENODEV;
+
+ if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
+ *mapped = pgsize * pgcount;
+ return 0;
+ }
+
+ pte_prot = (prot & IOMMU_WRITE) ?
+ __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY) :
+ __pgprot(_PAGE_BASE | _PAGE_READ);
+
+ while (pgcount--) {
+ pte =
+ riscv_iommu_pt_walk_alloc((pmd_t *) domain->pgd_root, iova,
+ PGDIR_SHIFT, true, page_size,
+ get_zeroed_page, gfp);
+ if (!pte) {
+ *mapped = size;
+ return -ENOMEM;
+ }
+
+ pte_val = pfn_pte(phys_to_pfn(phys), pte_prot);
+
+ set_pte(pte, pte_val);
+
+ size += page_size;
+ iova += page_size;
+ phys += page_size;
+ }
+
+ *mapped = size;
+ return 0;
+}
+
+static size_t riscv_iommu_unmap_pages(struct io_pgtable_ops *ops,
+ unsigned long iova, size_t pgsize,
+ size_t pgcount,
+ struct iommu_iotlb_gather *gather)
+{
+ struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
+ size_t size = 0;
+ size_t page_size = get_page_size(pgsize);
+ pte_t *pte;
+
+ if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
+ return pgsize * pgcount;
+
+ while (pgcount--) {
+ pte = riscv_iommu_pt_walk_fetch((pmd_t *) domain->pgd_root,
+ iova, PGDIR_SHIFT, true);
+ if (!pte)
+ return size;
+
+ set_pte(pte, __pte(0));
+
+ iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
+ pgsize);
+
+ size += page_size;
+ iova += page_size;
+ }
+
+ return size;
+}
+
+static phys_addr_t riscv_iommu_iova_to_phys(struct io_pgtable_ops *ops,
+ unsigned long iova)
+{
+ struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
+ pte_t *pte;
+
+ if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
+ return (phys_addr_t) iova;
+
+ pte = riscv_iommu_pt_walk_fetch((pmd_t *) domain->pgd_root,
+ iova, PGDIR_SHIFT, true);
+ if (!pte || !pte_present(*pte))
+ return 0;
+
+ return (pfn_to_phys(pte_pfn(*pte)) | (iova & PAGE_MASK));
+}
+
+static void riscv_iommu_tlb_inv_all(void *cookie)
+{
+}
+
+static void riscv_iommu_tlb_inv_walk(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+}
+
+static void riscv_iommu_tlb_add_page(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+}
+
+static const struct iommu_flush_ops riscv_iommu_flush_ops = {
+ .tlb_flush_all = riscv_iommu_tlb_inv_all,
+ .tlb_flush_walk = riscv_iommu_tlb_inv_walk,
+ .tlb_add_page = riscv_iommu_tlb_add_page,
+};
+
+/* NOTE: cfg should point to riscv_iommu_domain structure member pgtbl.cfg */
+static struct io_pgtable *riscv_iommu_alloc_pgtable(struct io_pgtable_cfg *cfg,
+ void *cookie)
+{
+ struct io_pgtable *iop = container_of(cfg, struct io_pgtable, cfg);
+
+ cfg->pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G;
+ cfg->ias = 57; // va mode, SvXX -> ias
+ cfg->oas = 57; // pa mode, or SvXX+4 -> oas
+ cfg->tlb = &riscv_iommu_flush_ops;
+
+ iop->ops.map_pages = riscv_iommu_map_pages;
+ iop->ops.unmap_pages = riscv_iommu_unmap_pages;
+ iop->ops.iova_to_phys = riscv_iommu_iova_to_phys;
+
+ return iop;
+}
+
+struct io_pgtable_init_fns io_pgtable_riscv_init_fns = {
+ .alloc = riscv_iommu_alloc_pgtable,
+ .free = riscv_iommu_free_pgtable,
+};
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 9ee7d2b222b5..2ef6952a2109 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -807,7 +807,7 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
/* Initial DC pointer can be NULL if IOMMU is configured in OFF or BARE mode */
ep->dc = riscv_iommu_get_dc(iommu, ep->devid);

- dev_info(iommu->dev, "adding device to iommu with devid %i in domain %i\n",
+ dev_dbg(iommu->dev, "adding device to iommu with devid %i in domain %i\n",
ep->devid, ep->domid);

dev_iommu_priv_set(dev, ep);
@@ -874,7 +874,10 @@ static struct iommu_domain *riscv_iommu_domain_alloc(unsigned type)
{
struct riscv_iommu_domain *domain;

- if (type != IOMMU_DOMAIN_IDENTITY &&
+ if (type != IOMMU_DOMAIN_DMA &&
+ type != IOMMU_DOMAIN_DMA_FQ &&
+ type != IOMMU_DOMAIN_UNMANAGED &&
+ type != IOMMU_DOMAIN_IDENTITY &&
type != IOMMU_DOMAIN_BLOCKED)
return NULL;

@@ -890,7 +893,7 @@ static struct iommu_domain *riscv_iommu_domain_alloc(unsigned type)
domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);

- printk("domain type %x alloc %u\n", type, domain->pscid);
+ printk("domain alloc %u\n", domain->pscid);

return &domain->domain;
}
@@ -903,6 +906,9 @@ static void riscv_iommu_domain_free(struct iommu_domain *iommu_domain)
pr_warn("IOMMU domain is not empty!\n");
}

+ if (domain->pgtbl.cookie)
+ free_io_pgtable_ops(&domain->pgtbl.ops);
+
if (domain->pgd_root)
free_pages((unsigned long)domain->pgd_root, 0);

@@ -959,6 +965,9 @@ static int riscv_iommu_domain_finalize(struct riscv_iommu_domain *domain,
if (!domain->pgd_root)
return -ENOMEM;

+ if (!alloc_io_pgtable_ops(RISCV_IOMMU, &domain->pgtbl.cfg, domain))
+ return -ENOMEM;
+
return 0;
}

@@ -1006,9 +1015,8 @@ static int riscv_iommu_attach_dev(struct iommu_domain *iommu_domain, struct devi
return 0;
}

- if (!dc) {
+ if (!dc)
return -ENODEV;
- }

/*
* S-Stage translation table. G-Stage remains unmodified (BARE).
@@ -1104,12 +1112,11 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

- if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
- *mapped = pgsize * pgcount;
- return 0;
- }
+ if (!domain->pgtbl.ops.map_pages)
+ return -ENODEV;

- return -ENODEV;
+ return domain->pgtbl.ops.map_pages(&domain->pgtbl.ops, iova, phys,
+ pgsize, pgcount, prot, gfp, mapped);
}

static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
@@ -1118,10 +1125,11 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

- if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
- return pgsize * pgcount;
+ if (!domain->pgtbl.ops.unmap_pages)
+ return 0;

- return 0;
+ return domain->pgtbl.ops.unmap_pages(&domain->pgtbl.ops, iova, pgsize,
+ pgcount, gather);
}

static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
@@ -1129,10 +1137,10 @@ static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

- if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
- return (phys_addr_t) iova;
+ if (!domain->pgtbl.ops.iova_to_phys)
+ return 0;

- return 0;
+ return domain->pgtbl.ops.iova_to_phys(&domain->pgtbl.ops, iova);
}

/*
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index 9140df71e17b..fe32a4eff14e 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -88,6 +88,7 @@ struct riscv_iommu_device {

struct riscv_iommu_domain {
struct iommu_domain domain;
+ struct io_pgtable pgtbl;

struct list_head endpoints;
struct mutex lock;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 1b7a44b35616..8dd9d3a28e3a 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -19,6 +19,7 @@ enum io_pgtable_fmt {
AMD_IOMMU_V2,
APPLE_DART,
APPLE_DART2,
+ RISCV_IOMMU,
IO_PGTABLE_NUM_FMTS,
};

@@ -258,5 +259,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns;
extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns;
extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns;
extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_riscv_init_fns;

#endif /* __IO_PGTABLE_H */
--
2.34.1



2023-07-25 13:51:20

by Zong Li

Subject: Re: [PATCH 08/11] RISC-V: drivers/iommu/riscv: Add page table support

On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <[email protected]> wrote:
>
> Introduces I/O page level translation services, with 4K, 2M, 1G page
> size support and enables page level iommu_map/unmap domain interfaces.
>
> Co-developed-by: Sebastien Boeuf <[email protected]>
> Signed-off-by: Sebastien Boeuf <[email protected]>
> Signed-off-by: Tomasz Jeznach <[email protected]>
> ---
> drivers/iommu/io-pgtable.c | 3 +
> drivers/iommu/riscv/Makefile | 2 +-
> drivers/iommu/riscv/io_pgtable.c | 266 +++++++++++++++++++++++++++++++
> drivers/iommu/riscv/iommu.c | 40 +++--
> drivers/iommu/riscv/iommu.h | 1 +
> include/linux/io-pgtable.h | 2 +
> 6 files changed, 297 insertions(+), 17 deletions(-)
> create mode 100644 drivers/iommu/riscv/io_pgtable.c
>
> diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
> index b843fcd365d2..c4807175934f 100644
> --- a/drivers/iommu/io-pgtable.c
> +++ b/drivers/iommu/io-pgtable.c
> @@ -32,6 +32,9 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
> [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
> [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
> #endif
> +#ifdef CONFIG_RISCV_IOMMU
> + [RISCV_IOMMU] = &io_pgtable_riscv_init_fns,
> +#endif
> };
>
> struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
> diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
> index 9523eb053cfc..13af452c3052 100644
> --- a/drivers/iommu/riscv/Makefile
> +++ b/drivers/iommu/riscv/Makefile
> @@ -1 +1 @@
> -obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-pci.o iommu-platform.o iommu-sysfs.o
> \ No newline at end of file
> +obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-pci.o iommu-platform.o iommu-sysfs.o io_pgtable.o
> diff --git a/drivers/iommu/riscv/io_pgtable.c b/drivers/iommu/riscv/io_pgtable.c
> new file mode 100644
> index 000000000000..b6e603e6726e
> --- /dev/null
> +++ b/drivers/iommu/riscv/io_pgtable.c
> @@ -0,0 +1,266 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright © 2022-2023 Rivos Inc.
> + *
> + * RISC-V IOMMU page table allocator.
> + *
> + * Authors:
> + * Tomasz Jeznach <[email protected]>
> + * Sebastien Boeuf <[email protected]>
> + */
> +
> +#include <linux/atomic.h>
> +#include <linux/bitops.h>
> +#include <linux/io-pgtable.h>
> +#include <linux/kernel.h>
> +#include <linux/sizes.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/dma-mapping.h>
> +
> +#include "iommu.h"
> +
> +#define io_pgtable_to_domain(x) \
> + container_of((x), struct riscv_iommu_domain, pgtbl)
> +
> +#define io_pgtable_ops_to_domain(x) \
> + io_pgtable_to_domain(container_of((x), struct io_pgtable, ops))
> +
> +static inline size_t get_page_size(size_t size)
> +{
> + if (size >= IOMMU_PAGE_SIZE_512G)
> + return IOMMU_PAGE_SIZE_512G;
> +
> + if (size >= IOMMU_PAGE_SIZE_1G)
> + return IOMMU_PAGE_SIZE_1G;
> +
> + if (size >= IOMMU_PAGE_SIZE_2M)
> + return IOMMU_PAGE_SIZE_2M;
> +
> + return IOMMU_PAGE_SIZE_4K;
> +}
> +
> +static void riscv_iommu_pt_walk_free(pmd_t * ptp, unsigned shift, bool root)
> +{
> + pmd_t *pte, *pt_base;
> + int i;
> +
> + if (shift == PAGE_SHIFT)
> + return;
> +
> + if (root)
> + pt_base = ptp;
> + else
> + pt_base =
> + (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp)));
> +
> + /* Recursively free all sub page table pages */
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + pte = pt_base + i;
> + if (pmd_present(*pte) && !pmd_leaf(*pte))
> + riscv_iommu_pt_walk_free(pte, shift - 9, false);
> + }
> +
> + /* Now free the current page table page */
> + if (!root && pmd_present(*pt_base))
> + free_page((unsigned long)pt_base);
> +}
> +
> +static void riscv_iommu_free_pgtable(struct io_pgtable *iop)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_to_domain(iop);
> + riscv_iommu_pt_walk_free((pmd_t *) domain->pgd_root, PGDIR_SHIFT, true);
> +}
> +
> +static pte_t *riscv_iommu_pt_walk_alloc(pmd_t * ptp, unsigned long iova,
> + unsigned shift, bool root,
> + size_t pgsize,
> + unsigned long (*pd_alloc)(gfp_t),
> + gfp_t gfp)
> +{
> + pmd_t *pte;
> + unsigned long pfn;
> +
> + if (root)
> + pte = ptp + ((iova >> shift) & (PTRS_PER_PMD - 1));
> + else
> + pte = (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp))) +
> + ((iova >> shift) & (PTRS_PER_PMD - 1));
> +
> + if ((1ULL << shift) <= pgsize) {
> + if (pmd_present(*pte) && !pmd_leaf(*pte))
> + riscv_iommu_pt_walk_free(pte, shift - 9, false);
> + return (pte_t *) pte;
> + }
> +
> + if (pmd_none(*pte)) {
> + pfn = pd_alloc ? virt_to_pfn(pd_alloc(gfp)) : 0;
> + if (!pfn)
> + return NULL;
> + set_pmd(pte, __pmd((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> + }
> +
> + return riscv_iommu_pt_walk_alloc(pte, iova, shift - 9, false,
> + pgsize, pd_alloc, gfp);
> +}
> +
> +static pte_t *riscv_iommu_pt_walk_fetch(pmd_t * ptp,
> + unsigned long iova, unsigned shift,
> + bool root)
> +{
> + pmd_t *pte;
> +
> + if (root)
> + pte = ptp + ((iova >> shift) & (PTRS_PER_PMD - 1));
> + else
> + pte = (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp))) +
> + ((iova >> shift) & (PTRS_PER_PMD - 1));
> +
> + if (pmd_leaf(*pte))
> + return (pte_t *) pte;
> + else if (pmd_none(*pte))
> + return NULL;
> + else if (shift == PAGE_SHIFT)
> + return NULL;
> +
> + return riscv_iommu_pt_walk_fetch(pte, iova, shift - 9, false);
> +}
> +
> +static int riscv_iommu_map_pages(struct io_pgtable_ops *ops,
> + unsigned long iova, phys_addr_t phys,
> + size_t pgsize, size_t pgcount, int prot,
> + gfp_t gfp, size_t *mapped)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
> + size_t size = 0;
> + size_t page_size = get_page_size(pgsize);
> + pte_t *pte;
> + pte_t pte_val;
> + pgprot_t pte_prot;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_BLOCKED)
> + return -ENODEV;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
> + *mapped = pgsize * pgcount;
> + return 0;
> + }
> +
> + pte_prot = (prot & IOMMU_WRITE) ?
> + __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY) :
> + __pgprot(_PAGE_BASE | _PAGE_READ);
> +
> + while (pgcount--) {
> + pte =
> + riscv_iommu_pt_walk_alloc((pmd_t *) domain->pgd_root, iova,
> + PGDIR_SHIFT, true, page_size,
> + get_zeroed_page, gfp);
> + if (!pte) {
> + *mapped = size;
> + return -ENOMEM;
> + }
> +
> + pte_val = pfn_pte(phys_to_pfn(phys), pte_prot);
> +
> + set_pte(pte, pte_val);
> +
> + size += page_size;
> + iova += page_size;
> + phys += page_size;
> + }
> +
> + *mapped = size;
> + return 0;
> +}
> +
> +static size_t riscv_iommu_unmap_pages(struct io_pgtable_ops *ops,
> + unsigned long iova, size_t pgsize,
> + size_t pgcount,
> + struct iommu_iotlb_gather *gather)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
> + size_t size = 0;
> + size_t page_size = get_page_size(pgsize);
> + pte_t *pte;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> + return pgsize * pgcount;
> +
> + while (pgcount--) {
> + pte = riscv_iommu_pt_walk_fetch((pmd_t *) domain->pgd_root,
> + iova, PGDIR_SHIFT, true);
> + if (!pte)
> + return size;
> +
> + set_pte(pte, __pte(0));
> +
> + iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
> + pgsize);
> +
> + size += page_size;
> + iova += page_size;
> + }
> +
> + return size;
> +}
> +
> +static phys_addr_t riscv_iommu_iova_to_phys(struct io_pgtable_ops *ops,
> + unsigned long iova)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
> + pte_t *pte;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> + return (phys_addr_t) iova;
> +
> + pte = riscv_iommu_pt_walk_fetch((pmd_t *) domain->pgd_root,
> + iova, PGDIR_SHIFT, true);
> + if (!pte || !pte_present(*pte))
> + return 0;
> +
> + return (pfn_to_phys(pte_pfn(*pte)) | (iova & PAGE_MASK));

It should be (iova & ~PAGE_MASK) to get the low 12 bits.
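
i.e. something like (only the masking changes, the rest of the return stays
as it is):

	return pfn_to_phys(pte_pfn(*pte)) | (iova & ~PAGE_MASK);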

> +}
> +
> +static void riscv_iommu_tlb_inv_all(void *cookie)
> +{
> +}
> +
> +static void riscv_iommu_tlb_inv_walk(unsigned long iova, size_t size,
> + size_t granule, void *cookie)
> +{
> +}
> +
> +static void riscv_iommu_tlb_add_page(struct iommu_iotlb_gather *gather,
> + unsigned long iova, size_t granule,
> + void *cookie)
> +{
> +}
> +
> +static const struct iommu_flush_ops riscv_iommu_flush_ops = {
> + .tlb_flush_all = riscv_iommu_tlb_inv_all,
> + .tlb_flush_walk = riscv_iommu_tlb_inv_walk,
> + .tlb_add_page = riscv_iommu_tlb_add_page,
> +};
> +
> +/* NOTE: cfg should point to riscv_iommu_domain structure member pgtbl.cfg */
> +static struct io_pgtable *riscv_iommu_alloc_pgtable(struct io_pgtable_cfg *cfg,
> + void *cookie)
> +{
> + struct io_pgtable *iop = container_of(cfg, struct io_pgtable, cfg);
> +
> + cfg->pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G;
> + cfg->ias = 57; // va mode, SvXX -> ias
> + cfg->oas = 57; // pa mode, or SvXX+4 -> oas
> + cfg->tlb = &riscv_iommu_flush_ops;
> +
> + iop->ops.map_pages = riscv_iommu_map_pages;
> + iop->ops.unmap_pages = riscv_iommu_unmap_pages;
> + iop->ops.iova_to_phys = riscv_iommu_iova_to_phys;
> +
> + return iop;
> +}
> +
> +struct io_pgtable_init_fns io_pgtable_riscv_init_fns = {
> + .alloc = riscv_iommu_alloc_pgtable,
> + .free = riscv_iommu_free_pgtable,
> +};
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 9ee7d2b222b5..2ef6952a2109 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -807,7 +807,7 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
> /* Initial DC pointer can be NULL if IOMMU is configured in OFF or BARE mode */
> ep->dc = riscv_iommu_get_dc(iommu, ep->devid);
>
> - dev_info(iommu->dev, "adding device to iommu with devid %i in domain %i\n",
> + dev_dbg(iommu->dev, "adding device to iommu with devid %i in domain %i\n",
> ep->devid, ep->domid);
>
> dev_iommu_priv_set(dev, ep);
> @@ -874,7 +874,10 @@ static struct iommu_domain *riscv_iommu_domain_alloc(unsigned type)
> {
> struct riscv_iommu_domain *domain;
>
> - if (type != IOMMU_DOMAIN_IDENTITY &&
> + if (type != IOMMU_DOMAIN_DMA &&
> + type != IOMMU_DOMAIN_DMA_FQ &&
> + type != IOMMU_DOMAIN_UNMANAGED &&
> + type != IOMMU_DOMAIN_IDENTITY &&
> type != IOMMU_DOMAIN_BLOCKED)
> return NULL;
>
> @@ -890,7 +893,7 @@ static struct iommu_domain *riscv_iommu_domain_alloc(unsigned type)
> domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
> RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
>
> - printk("domain type %x alloc %u\n", type, domain->pscid);
> + printk("domain alloc %u\n", domain->pscid);
>
> return &domain->domain;
> }
> @@ -903,6 +906,9 @@ static void riscv_iommu_domain_free(struct iommu_domain *iommu_domain)
> pr_warn("IOMMU domain is not empty!\n");
> }
>
> + if (domain->pgtbl.cookie)
> + free_io_pgtable_ops(&domain->pgtbl.ops);
> +
> if (domain->pgd_root)
> free_pages((unsigned long)domain->pgd_root, 0);
>
> @@ -959,6 +965,9 @@ static int riscv_iommu_domain_finalize(struct riscv_iommu_domain *domain,
> if (!domain->pgd_root)
> return -ENOMEM;
>
> + if (!alloc_io_pgtable_ops(RISCV_IOMMU, &domain->pgtbl.cfg, domain))
> + return -ENOMEM;
> +
> return 0;
> }
>
> @@ -1006,9 +1015,8 @@ static int riscv_iommu_attach_dev(struct iommu_domain *iommu_domain, struct devi
> return 0;
> }
>
> - if (!dc) {
> + if (!dc)
> return -ENODEV;
> - }
>
> /*
> * S-Stage translation table. G-Stage remains unmodified (BARE).
> @@ -1104,12 +1112,11 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
> {
> struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
>
> - if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
> - *mapped = pgsize * pgcount;
> - return 0;
> - }
> + if (!domain->pgtbl.ops.map_pages)
> + return -ENODEV;
>
> - return -ENODEV;
> + return domain->pgtbl.ops.map_pages(&domain->pgtbl.ops, iova, phys,
> + pgsize, pgcount, prot, gfp, mapped);
> }
>
> static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
> @@ -1118,10 +1125,11 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
> {
> struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
>
> - if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> - return pgsize * pgcount;
> + if (!domain->pgtbl.ops.unmap_pages)
> + return 0;
>
> - return 0;
> + return domain->pgtbl.ops.unmap_pages(&domain->pgtbl.ops, iova, pgsize,
> + pgcount, gather);
> }
>
> static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
> @@ -1129,10 +1137,10 @@ static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
> {
> struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
>
> - if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> - return (phys_addr_t) iova;
> + if (!domain->pgtbl.ops.iova_to_phys)
> + return 0;
>
> - return 0;
> + return domain->pgtbl.ops.iova_to_phys(&domain->pgtbl.ops, iova);
> }
>
> /*
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index 9140df71e17b..fe32a4eff14e 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -88,6 +88,7 @@ struct riscv_iommu_device {
>
> struct riscv_iommu_domain {
> struct iommu_domain domain;
> + struct io_pgtable pgtbl;
>
> struct list_head endpoints;
> struct mutex lock;
> diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
> index 1b7a44b35616..8dd9d3a28e3a 100644
> --- a/include/linux/io-pgtable.h
> +++ b/include/linux/io-pgtable.h
> @@ -19,6 +19,7 @@ enum io_pgtable_fmt {
> AMD_IOMMU_V2,
> APPLE_DART,
> APPLE_DART2,
> + RISCV_IOMMU,
> IO_PGTABLE_NUM_FMTS,
> };
>
> @@ -258,5 +259,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns;
> extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns;
> extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns;
> extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns;
> +extern struct io_pgtable_init_fns io_pgtable_riscv_init_fns;
>
> #endif /* __IO_PGTABLE_H */
> --
> 2.34.1
>
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv

2023-07-31 09:50:39

by Zong Li

Subject: Re: [PATCH 08/11] RISC-V: drivers/iommu/riscv: Add page table support

On Thu, Jul 20, 2023 at 3:34 AM Tomasz Jeznach <[email protected]> wrote:
>
> Introduces I/O page level translation services, with 4K, 2M, 1G page
> size support and enables page level iommu_map/unmap domain interfaces.
>
> Co-developed-by: Sebastien Boeuf <[email protected]>
> Signed-off-by: Sebastien Boeuf <[email protected]>
> Signed-off-by: Tomasz Jeznach <[email protected]>
> ---
> drivers/iommu/io-pgtable.c | 3 +
> drivers/iommu/riscv/Makefile | 2 +-
> drivers/iommu/riscv/io_pgtable.c | 266 +++++++++++++++++++++++++++++++
> drivers/iommu/riscv/iommu.c | 40 +++--
> drivers/iommu/riscv/iommu.h | 1 +
> include/linux/io-pgtable.h | 2 +
> 6 files changed, 297 insertions(+), 17 deletions(-)
> create mode 100644 drivers/iommu/riscv/io_pgtable.c
>
> diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
> index b843fcd365d2..c4807175934f 100644
> --- a/drivers/iommu/io-pgtable.c
> +++ b/drivers/iommu/io-pgtable.c
> @@ -32,6 +32,9 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
> [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
> [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
> #endif
> +#ifdef CONFIG_RISCV_IOMMU
> + [RISCV_IOMMU] = &io_pgtable_riscv_init_fns,
> +#endif
> };
>
> struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
> diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
> index 9523eb053cfc..13af452c3052 100644
> --- a/drivers/iommu/riscv/Makefile
> +++ b/drivers/iommu/riscv/Makefile
> @@ -1 +1 @@
> -obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-pci.o iommu-platform.o iommu-sysfs.o
> \ No newline at end of file
> +obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-pci.o iommu-platform.o iommu-sysfs.o io_pgtable.o
> diff --git a/drivers/iommu/riscv/io_pgtable.c b/drivers/iommu/riscv/io_pgtable.c
> new file mode 100644
> index 000000000000..b6e603e6726e
> --- /dev/null
> +++ b/drivers/iommu/riscv/io_pgtable.c
> @@ -0,0 +1,266 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright © 2022-2023 Rivos Inc.
> + *
> + * RISC-V IOMMU page table allocator.
> + *
> + * Authors:
> + * Tomasz Jeznach <[email protected]>
> + * Sebastien Boeuf <[email protected]>
> + */
> +
> +#include <linux/atomic.h>
> +#include <linux/bitops.h>
> +#include <linux/io-pgtable.h>
> +#include <linux/kernel.h>
> +#include <linux/sizes.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/dma-mapping.h>
> +
> +#include "iommu.h"
> +
> +#define io_pgtable_to_domain(x) \
> + container_of((x), struct riscv_iommu_domain, pgtbl)
> +
> +#define io_pgtable_ops_to_domain(x) \
> + io_pgtable_to_domain(container_of((x), struct io_pgtable, ops))
> +
> +static inline size_t get_page_size(size_t size)
> +{
> + if (size >= IOMMU_PAGE_SIZE_512G)
> + return IOMMU_PAGE_SIZE_512G;
> +
> + if (size >= IOMMU_PAGE_SIZE_1G)
> + return IOMMU_PAGE_SIZE_1G;
> +
> + if (size >= IOMMU_PAGE_SIZE_2M)
> + return IOMMU_PAGE_SIZE_2M;
> +
> + return IOMMU_PAGE_SIZE_4K;
> +}
> +
> +static void riscv_iommu_pt_walk_free(pmd_t * ptp, unsigned shift, bool root)
> +{
> + pmd_t *pte, *pt_base;
> + int i;
> +
> + if (shift == PAGE_SHIFT)
> + return;
> +
> + if (root)
> + pt_base = ptp;
> + else
> + pt_base =
> + (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp)));
> +
> + /* Recursively free all sub page table pages */
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + pte = pt_base + i;
> + if (pmd_present(*pte) && !pmd_leaf(*pte))
> + riscv_iommu_pt_walk_free(pte, shift - 9, false);
> + }
> +
> + /* Now free the current page table page */
> + if (!root && pmd_present(*pt_base))
> + free_page((unsigned long)pt_base);
> +}
> +
> +static void riscv_iommu_free_pgtable(struct io_pgtable *iop)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_to_domain(iop);
> + riscv_iommu_pt_walk_free((pmd_t *) domain->pgd_root, PGDIR_SHIFT, true);
> +}
> +
> +static pte_t *riscv_iommu_pt_walk_alloc(pmd_t * ptp, unsigned long iova,
> + unsigned shift, bool root,
> + size_t pgsize,
> + unsigned long (*pd_alloc)(gfp_t),
> + gfp_t gfp)
> +{
> + pmd_t *pte;
> + unsigned long pfn;
> +
> + if (root)
> + pte = ptp + ((iova >> shift) & (PTRS_PER_PMD - 1));
> + else
> + pte = (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp))) +
> + ((iova >> shift) & (PTRS_PER_PMD - 1));
> +
> + if ((1ULL << shift) <= pgsize) {
> + if (pmd_present(*pte) && !pmd_leaf(*pte))
> + riscv_iommu_pt_walk_free(pte, shift - 9, false);
> + return (pte_t *) pte;
> + }
> +
> + if (pmd_none(*pte)) {
> + pfn = pd_alloc ? virt_to_pfn(pd_alloc(gfp)) : 0;
> + if (!pfn)
> + return NULL;
> + set_pmd(pte, __pmd((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> + }
> +
> + return riscv_iommu_pt_walk_alloc(pte, iova, shift - 9, false,
> + pgsize, pd_alloc, gfp);
> +}
> +
> +static pte_t *riscv_iommu_pt_walk_fetch(pmd_t * ptp,
> + unsigned long iova, unsigned shift,
> + bool root)
> +{
> + pmd_t *pte;
> +
> + if (root)
> + pte = ptp + ((iova >> shift) & (PTRS_PER_PMD - 1));
> + else
> + pte = (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp))) +
> + ((iova >> shift) & (PTRS_PER_PMD - 1));
> +
> + if (pmd_leaf(*pte))
> + return (pte_t *) pte;
> + else if (pmd_none(*pte))
> + return NULL;
> + else if (shift == PAGE_SHIFT)
> + return NULL;
> +
> + return riscv_iommu_pt_walk_fetch(pte, iova, shift - 9, false);
> +}
> +
> +static int riscv_iommu_map_pages(struct io_pgtable_ops *ops,
> + unsigned long iova, phys_addr_t phys,
> + size_t pgsize, size_t pgcount, int prot,
> + gfp_t gfp, size_t *mapped)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
> + size_t size = 0;
> + size_t page_size = get_page_size(pgsize);
> + pte_t *pte;
> + pte_t pte_val;
> + pgprot_t pte_prot;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_BLOCKED)
> + return -ENODEV;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
> + *mapped = pgsize * pgcount;
> + return 0;
> + }
> +
> + pte_prot = (prot & IOMMU_WRITE) ?
> + __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY) :
> + __pgprot(_PAGE_BASE | _PAGE_READ);
> +
> + while (pgcount--) {
> + pte =
> + riscv_iommu_pt_walk_alloc((pmd_t *) domain->pgd_root, iova,
> + PGDIR_SHIFT, true, page_size,
> + get_zeroed_page, gfp);
> + if (!pte) {
> + *mapped = size;
> + return -ENOMEM;
> + }
> +
> + pte_val = pfn_pte(phys_to_pfn(phys), pte_prot);
> +
> + set_pte(pte, pte_val);
> +
> + size += page_size;
> + iova += page_size;
> + phys += page_size;
> + }
> +
> + *mapped = size;
> + return 0;
> +}
> +
> +static size_t riscv_iommu_unmap_pages(struct io_pgtable_ops *ops,
> + unsigned long iova, size_t pgsize,
> + size_t pgcount,
> + struct iommu_iotlb_gather *gather)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
> + size_t size = 0;
> + size_t page_size = get_page_size(pgsize);
> + pte_t *pte;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> + return pgsize * pgcount;
> +
> + while (pgcount--) {
> + pte = riscv_iommu_pt_walk_fetch((pmd_t *) domain->pgd_root,
> + iova, PGDIR_SHIFT, true);
> + if (!pte)
> + return size;
> +
> + set_pte(pte, __pte(0));
> +
> + iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
> + pgsize);
> +
> + size += page_size;
> + iova += page_size;
> + }
> +
> + return size;
> +}
> +
> +static phys_addr_t riscv_iommu_iova_to_phys(struct io_pgtable_ops *ops,
> + unsigned long iova)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
> + pte_t *pte;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> + return (phys_addr_t) iova;
> +
> + pte = riscv_iommu_pt_walk_fetch((pmd_t *) domain->pgd_root,
> + iova, PGDIR_SHIFT, true);
> + if (!pte || !pte_present(*pte))
> + return 0;
> +
> + return (pfn_to_phys(pte_pfn(*pte)) | (iova & PAGE_MASK));

As I mentioned in the last mail, it should be (iova & ~PAGE_MASK) to get
the low 12 bits.

> +}
> +
> +static void riscv_iommu_tlb_inv_all(void *cookie)
> +{
> +}
> +
> +static void riscv_iommu_tlb_inv_walk(unsigned long iova, size_t size,
> + size_t granule, void *cookie)
> +{
> +}
> +

What could we do in these callbacks? Perhaps send the IOTINVAL command to the IOMMU?
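
For example, roughly something like this (just a sketch;
riscv_iommu_iotlb_inval_range() is a hypothetical helper that would post
IOTINVAL.VMA through the command queue, it doesn't exist in this series):

static void riscv_iommu_tlb_inv_walk(unsigned long iova, size_t size,
				     size_t granule, void *cookie)
{
	/* cookie is the riscv_iommu_domain passed to alloc_io_pgtable_ops() */
	struct riscv_iommu_domain *domain = cookie;

	/* hypothetical: invalidate this domain's PSCID for the IOVA range */
	riscv_iommu_iotlb_inval_range(domain, iova, size);
}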

> +static void riscv_iommu_tlb_add_page(struct iommu_iotlb_gather *gather,
> + unsigned long iova, size_t granule,
> + void *cookie)
> +{
> +}
> +
> +static const struct iommu_flush_ops riscv_iommu_flush_ops = {
> + .tlb_flush_all = riscv_iommu_tlb_inv_all,
> + .tlb_flush_walk = riscv_iommu_tlb_inv_walk,
> + .tlb_add_page = riscv_iommu_tlb_add_page,
> +};
> +
> +/* NOTE: cfg should point to riscv_iommu_domain structure member pgtbl.cfg */
> +static struct io_pgtable *riscv_iommu_alloc_pgtable(struct io_pgtable_cfg *cfg,
> + void *cookie)
> +{
> + struct io_pgtable *iop = container_of(cfg, struct io_pgtable, cfg);
> +
> + cfg->pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G;
> + cfg->ias = 57; // va mode, SvXX -> ias
> + cfg->oas = 57; // pa mode, or SvXX+4 -> oas

Is it possible to use VA_BITS instead of this magic number?
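
i.e. something along the lines of the following, assuming the host VA width
is an acceptable bound until the driver reports the actual SvXX mode it
supports:

	cfg->ias = VA_BITS;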

> + cfg->tlb = &riscv_iommu_flush_ops;
> +
> + iop->ops.map_pages = riscv_iommu_map_pages;
> + iop->ops.unmap_pages = riscv_iommu_unmap_pages;
> + iop->ops.iova_to_phys = riscv_iommu_iova_to_phys;
> +
> + return iop;
> +}
> +
> +struct io_pgtable_init_fns io_pgtable_riscv_init_fns = {
> + .alloc = riscv_iommu_alloc_pgtable,
> + .free = riscv_iommu_free_pgtable,
> +};
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 9ee7d2b222b5..2ef6952a2109 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -807,7 +807,7 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
> /* Initial DC pointer can be NULL if IOMMU is configured in OFF or BARE mode */
> ep->dc = riscv_iommu_get_dc(iommu, ep->devid);
>
> - dev_info(iommu->dev, "adding device to iommu with devid %i in domain %i\n",
> + dev_dbg(iommu->dev, "adding device to iommu with devid %i in domain %i\n",
> ep->devid, ep->domid);
>
> dev_iommu_priv_set(dev, ep);
> @@ -874,7 +874,10 @@ static struct iommu_domain *riscv_iommu_domain_alloc(unsigned type)
> {
> struct riscv_iommu_domain *domain;
>
> - if (type != IOMMU_DOMAIN_IDENTITY &&
> + if (type != IOMMU_DOMAIN_DMA &&
> + type != IOMMU_DOMAIN_DMA_FQ &&
> + type != IOMMU_DOMAIN_UNMANAGED &&
> + type != IOMMU_DOMAIN_IDENTITY &&
> type != IOMMU_DOMAIN_BLOCKED)
> return NULL;
>
> @@ -890,7 +893,7 @@ static struct iommu_domain *riscv_iommu_domain_alloc(unsigned type)
> domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
> RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
>
> - printk("domain type %x alloc %u\n", type, domain->pscid);
> + printk("domain alloc %u\n", domain->pscid);
>
> return &domain->domain;
> }
> @@ -903,6 +906,9 @@ static void riscv_iommu_domain_free(struct iommu_domain *iommu_domain)
> pr_warn("IOMMU domain is not empty!\n");
> }
>
> + if (domain->pgtbl.cookie)
> + free_io_pgtable_ops(&domain->pgtbl.ops);
> +
> if (domain->pgd_root)
> free_pages((unsigned long)domain->pgd_root, 0);
>
> @@ -959,6 +965,9 @@ static int riscv_iommu_domain_finalize(struct riscv_iommu_domain *domain,
> if (!domain->pgd_root)
> return -ENOMEM;
>
> + if (!alloc_io_pgtable_ops(RISCV_IOMMU, &domain->pgtbl.cfg, domain))
> + return -ENOMEM;
> +
> return 0;
> }
>
> @@ -1006,9 +1015,8 @@ static int riscv_iommu_attach_dev(struct iommu_domain *iommu_domain, struct devi
> return 0;
> }
>
> - if (!dc) {
> + if (!dc)
> return -ENODEV;
> - }
>
> /*
> * S-Stage translation table. G-Stage remains unmodified (BARE).
> @@ -1104,12 +1112,11 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
> {
> struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
>
> - if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
> - *mapped = pgsize * pgcount;
> - return 0;
> - }
> + if (!domain->pgtbl.ops.map_pages)
> + return -ENODEV;
>
> - return -ENODEV;
> + return domain->pgtbl.ops.map_pages(&domain->pgtbl.ops, iova, phys,
> + pgsize, pgcount, prot, gfp, mapped);
> }
>
> static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
> @@ -1118,10 +1125,11 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
> {
> struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
>
> - if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> - return pgsize * pgcount;
> + if (!domain->pgtbl.ops.unmap_pages)
> + return 0;
>
> - return 0;
> + return domain->pgtbl.ops.unmap_pages(&domain->pgtbl.ops, iova, pgsize,
> + pgcount, gather);
> }
>
> static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
> @@ -1129,10 +1137,10 @@ static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
> {
> struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
>
> - if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> - return (phys_addr_t) iova;
> + if (!domain->pgtbl.ops.iova_to_phys)
> + return 0;
>
> - return 0;
> + return domain->pgtbl.ops.iova_to_phys(&domain->pgtbl.ops, iova);
> }
>
> /*
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index 9140df71e17b..fe32a4eff14e 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -88,6 +88,7 @@ struct riscv_iommu_device {
>
> struct riscv_iommu_domain {
> struct iommu_domain domain;
> + struct io_pgtable pgtbl;
>
> struct list_head endpoints;
> struct mutex lock;
> diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
> index 1b7a44b35616..8dd9d3a28e3a 100644
> --- a/include/linux/io-pgtable.h
> +++ b/include/linux/io-pgtable.h
> @@ -19,6 +19,7 @@ enum io_pgtable_fmt {
> AMD_IOMMU_V2,
> APPLE_DART,
> APPLE_DART2,
> + RISCV_IOMMU,
> IO_PGTABLE_NUM_FMTS,
> };
>
> @@ -258,5 +259,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns;
> extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns;
> extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns;
> extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns;
> +extern struct io_pgtable_init_fns io_pgtable_riscv_init_fns;
>
> #endif /* __IO_PGTABLE_H */
> --
> 2.34.1
>
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv

2023-08-16 21:57:44

by Robin Murphy

Subject: Re: [PATCH 08/11] RISC-V: drivers/iommu/riscv: Add page table support

On 2023-07-19 20:33, Tomasz Jeznach wrote:
> Introduces I/O page level translation services, with 4K, 2M, 1G page
> size support and enables page level iommu_map/unmap domain interfaces.
>
> Co-developed-by: Sebastien Boeuf <[email protected]>
> Signed-off-by: Sebastien Boeuf <[email protected]>
> Signed-off-by: Tomasz Jeznach <[email protected]>
> ---
> drivers/iommu/io-pgtable.c | 3 +
> drivers/iommu/riscv/Makefile | 2 +-
> drivers/iommu/riscv/io_pgtable.c | 266 +++++++++++++++++++++++++++++++
> drivers/iommu/riscv/iommu.c | 40 +++--
> drivers/iommu/riscv/iommu.h | 1 +
> include/linux/io-pgtable.h | 2 +
> 6 files changed, 297 insertions(+), 17 deletions(-)
> create mode 100644 drivers/iommu/riscv/io_pgtable.c
>
> diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
> index b843fcd365d2..c4807175934f 100644
> --- a/drivers/iommu/io-pgtable.c
> +++ b/drivers/iommu/io-pgtable.c
> @@ -32,6 +32,9 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
> [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
> [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
> #endif
> +#ifdef CONFIG_RISCV_IOMMU
> + [RISCV_IOMMU] = &io_pgtable_riscv_init_fns,
> +#endif
> };
>
> struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
> diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
> index 9523eb053cfc..13af452c3052 100644
> --- a/drivers/iommu/riscv/Makefile
> +++ b/drivers/iommu/riscv/Makefile
> @@ -1 +1 @@
> -obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-pci.o iommu-platform.o iommu-sysfs.o
> \ No newline at end of file
> +obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-pci.o iommu-platform.o iommu-sysfs.o io_pgtable.o
> diff --git a/drivers/iommu/riscv/io_pgtable.c b/drivers/iommu/riscv/io_pgtable.c
> new file mode 100644
> index 000000000000..b6e603e6726e
> --- /dev/null
> +++ b/drivers/iommu/riscv/io_pgtable.c
> @@ -0,0 +1,266 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright © 2022-2023 Rivos Inc.
> + *
> + * RISC-V IOMMU page table allocator.
> + *
> + * Authors:
> + * Tomasz Jeznach <[email protected]>
> + * Sebastien Boeuf <[email protected]>
> + */
> +
> +#include <linux/atomic.h>
> +#include <linux/bitops.h>
> +#include <linux/io-pgtable.h>
> +#include <linux/kernel.h>
> +#include <linux/sizes.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/dma-mapping.h>

There's no DMA API usage here. Should there be?
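
(If the IOMMU can't be assumed coherent with the CPU caches, each newly
allocated table page would presumably need something along the lines of what
the LPAE io-pgtable code does - just a sketch, with "dev" standing in for
whatever struct device the driver would have to pass down:

	dma_addr_t dma = dma_map_single(dev, table, PAGE_SIZE, DMA_TO_DEVICE);

plus the matching unmap on free. Otherwise the include can simply go.)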

> +
> +#include "iommu.h"
> +
> +#define io_pgtable_to_domain(x) \
> + container_of((x), struct riscv_iommu_domain, pgtbl)
> +
> +#define io_pgtable_ops_to_domain(x) \
> + io_pgtable_to_domain(container_of((x), struct io_pgtable, ops))
> +
> +static inline size_t get_page_size(size_t size)
> +{
> + if (size >= IOMMU_PAGE_SIZE_512G)
> + return IOMMU_PAGE_SIZE_512G;
> +
> + if (size >= IOMMU_PAGE_SIZE_1G)
> + return IOMMU_PAGE_SIZE_1G;
> +
> + if (size >= IOMMU_PAGE_SIZE_2M)
> + return IOMMU_PAGE_SIZE_2M;
> +
> + return IOMMU_PAGE_SIZE_4K;
> +}
> +
> +static void riscv_iommu_pt_walk_free(pmd_t * ptp, unsigned shift, bool root)
> +{
> + pmd_t *pte, *pt_base;
> + int i;
> +
> + if (shift == PAGE_SHIFT)
> + return;
> +
> + if (root)
> + pt_base = ptp;
> + else
> + pt_base =
> + (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp)));
> +
> + /* Recursively free all sub page table pages */
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + pte = pt_base + i;
> + if (pmd_present(*pte) && !pmd_leaf(*pte))
> + riscv_iommu_pt_walk_free(pte, shift - 9, false);
> + }
> +
> + /* Now free the current page table page */

Without any TLB maintenance, even if it was live? Maybe walk caches and
speculative prefetching are a long way from anyone's mind if this is
still only running under QEMU, but it still makes me uncomfortable to
see a complete lack of appropriate-looking maintenance in the places I
would usually expect it. Especially if there's the prospect of the IOMMU
doing hardware pagetable updates itself (which I see is a thing, even if
it's not enabled here yet).

> + if (!root && pmd_present(*pt_base))
> + free_page((unsigned long)pt_base);
> +}
> +
> +static void riscv_iommu_free_pgtable(struct io_pgtable *iop)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_to_domain(iop);
> + riscv_iommu_pt_walk_free((pmd_t *) domain->pgd_root, PGDIR_SHIFT, true);
> +}
> +
> +static pte_t *riscv_iommu_pt_walk_alloc(pmd_t * ptp, unsigned long iova,
> + unsigned shift, bool root,
> + size_t pgsize,
> + unsigned long (*pd_alloc)(gfp_t),
> + gfp_t gfp)
> +{
> + pmd_t *pte;
> + unsigned long pfn;
> +
> + if (root)
> + pte = ptp + ((iova >> shift) & (PTRS_PER_PMD - 1));
> + else
> + pte = (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp))) +
> + ((iova >> shift) & (PTRS_PER_PMD - 1));
> +
> + if ((1ULL << shift) <= pgsize) {
> + if (pmd_present(*pte) && !pmd_leaf(*pte))
> + riscv_iommu_pt_walk_free(pte, shift - 9, false);
> + return (pte_t *) pte;
> + }
> +
> + if (pmd_none(*pte)) {
> + pfn = pd_alloc ? virt_to_pfn(pd_alloc(gfp)) : 0;
> + if (!pfn)
> + return NULL;
> + set_pmd(pte, __pmd((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> + }
> +
> + return riscv_iommu_pt_walk_alloc(pte, iova, shift - 9, false,
> + pgsize, pd_alloc, gfp);
> +}
> +
> +static pte_t *riscv_iommu_pt_walk_fetch(pmd_t * ptp,
> + unsigned long iova, unsigned shift,
> + bool root)
> +{
> + pmd_t *pte;
> +
> + if (root)
> + pte = ptp + ((iova >> shift) & (PTRS_PER_PMD - 1));
> + else
> + pte = (pmd_t *) pfn_to_virt(__page_val_to_pfn(pmd_val(*ptp))) +
> + ((iova >> shift) & (PTRS_PER_PMD - 1));
> +
> + if (pmd_leaf(*pte))
> + return (pte_t *) pte;
> + else if (pmd_none(*pte))
> + return NULL;
> + else if (shift == PAGE_SHIFT)
> + return NULL;
> +
> + return riscv_iommu_pt_walk_fetch(pte, iova, shift - 9, false);
> +}
> +
> +static int riscv_iommu_map_pages(struct io_pgtable_ops *ops,
> + unsigned long iova, phys_addr_t phys,
> + size_t pgsize, size_t pgcount, int prot,
> + gfp_t gfp, size_t *mapped)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
> + size_t size = 0;
> + size_t page_size = get_page_size(pgsize);
> + pte_t *pte;
> + pte_t pte_val;
> + pgprot_t pte_prot;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_BLOCKED)
> + return -ENODEV;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
> + *mapped = pgsize * pgcount;
> + return 0;
> + }

As before, these are utter nonsense, but cannot happen anyway.

> +
> + pte_prot = (prot & IOMMU_WRITE) ?
> + __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY) :
> + __pgprot(_PAGE_BASE | _PAGE_READ);
> +
> + while (pgcount--) {
> + pte =
> + riscv_iommu_pt_walk_alloc((pmd_t *) domain->pgd_root, iova,
> + PGDIR_SHIFT, true, page_size,
> + get_zeroed_page, gfp);
> + if (!pte) {
> + *mapped = size;
> + return -ENOMEM;
> + }
> +
> + pte_val = pfn_pte(phys_to_pfn(phys), pte_prot);
> +
> + set_pte(pte, pte_val);
> +
> + size += page_size;
> + iova += page_size;
> + phys += page_size;
> + }
> +
> + *mapped = size;
> + return 0;
> +}
> +
> +static size_t riscv_iommu_unmap_pages(struct io_pgtable_ops *ops,
> + unsigned long iova, size_t pgsize,
> + size_t pgcount,
> + struct iommu_iotlb_gather *gather)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
> + size_t size = 0;
> + size_t page_size = get_page_size(pgsize);
> + pte_t *pte;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> + return pgsize * pgcount;

"Yes, non-existent caller, those pages are definitely unmapped and
inaccessible now. Totally secure. Device couldn't possibly touch them if
it tried. Would I lie to you?"

> +
> + while (pgcount--) {
> + pte = riscv_iommu_pt_walk_fetch((pmd_t *) domain->pgd_root,
> + iova, PGDIR_SHIFT, true);
> + if (!pte)
> + return size;
> +
> + set_pte(pte, __pte(0));
> +
> + iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
> + pgsize);
> +
> + size += page_size;
> + iova += page_size;
> + }
> +
> + return size;
> +}
> +
> +static phys_addr_t riscv_iommu_iova_to_phys(struct io_pgtable_ops *ops,
> + unsigned long iova)
> +{
> + struct riscv_iommu_domain *domain = io_pgtable_ops_to_domain(ops);
> + pte_t *pte;
> +
> + if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> + return (phys_addr_t) iova;

I mean, even if it was still 2 years ago before the core code handled
this anyway (and it's only for a couple of broken network drivers doing
dumb things they shouldn't), why would a sane IOMMU driver even bother
going to the lengths of allocating io-pgtable ops for an identity domain
that by definition doesn't use a pagetable!?

> +
> + pte = riscv_iommu_pt_walk_fetch((pmd_t *) domain->pgd_root,
> + iova, PGDIR_SHIFT, true);
> + if (!pte || !pte_present(*pte))
> + return 0;
> +
> + return (pfn_to_phys(pte_pfn(*pte)) | (iova & PAGE_MASK));
> +}
> +
> +static void riscv_iommu_tlb_inv_all(void *cookie)
> +{
> +}
> +
> +static void riscv_iommu_tlb_inv_walk(unsigned long iova, size_t size,
> + size_t granule, void *cookie)
> +{
> +}
> +
> +static void riscv_iommu_tlb_add_page(struct iommu_iotlb_gather *gather,
> + unsigned long iova, size_t granule,
> + void *cookie)
> +{
> +}
> +
> +static const struct iommu_flush_ops riscv_iommu_flush_ops = {
> + .tlb_flush_all = riscv_iommu_tlb_inv_all,
> + .tlb_flush_walk = riscv_iommu_tlb_inv_walk,
> + .tlb_add_page = riscv_iommu_tlb_add_page,
> +};

...Why? Either implement them properly, or don't implement them at all.
And if they are implemented, it needs to be by the driver, so either way
they shouldn't be *here*.

> +
> +/* NOTE: cfg should point to riscv_iommu_domain structure member pgtbl.cfg */
> +static struct io_pgtable *riscv_iommu_alloc_pgtable(struct io_pgtable_cfg *cfg,
> + void *cookie)
> +{
> + struct io_pgtable *iop = container_of(cfg, struct io_pgtable, cfg);
> +
> + cfg->pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G;
> + cfg->ias = 57; // va mode, SvXX -> ias
> + cfg->oas = 57; // pa mode, or SvXX+4 -> oas

At least IAS should be passed by the driver based on what the IOMMU
actually supports (and isn't OAS 56?)

> + cfg->tlb = &riscv_iommu_flush_ops;
> +
> + iop->ops.map_pages = riscv_iommu_map_pages;
> + iop->ops.unmap_pages = riscv_iommu_unmap_pages;
> + iop->ops.iova_to_phys = riscv_iommu_iova_to_phys;
> +
> + return iop;
> +}
> +
> +struct io_pgtable_init_fns io_pgtable_riscv_init_fns = {
> + .alloc = riscv_iommu_alloc_pgtable,
> + .free = riscv_iommu_free_pgtable,
> +};
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 9ee7d2b222b5..2ef6952a2109 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -807,7 +807,7 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
> /* Initial DC pointer can be NULL if IOMMU is configured in OFF or BARE mode */
> ep->dc = riscv_iommu_get_dc(iommu, ep->devid);
>
> - dev_info(iommu->dev, "adding device to iommu with devid %i in domain %i\n",
> + dev_dbg(iommu->dev, "adding device to iommu with devid %i in domain %i\n",
> ep->devid, ep->domid);
>
> dev_iommu_priv_set(dev, ep);
> @@ -874,7 +874,10 @@ static struct iommu_domain *riscv_iommu_domain_alloc(unsigned type)
> {
> struct riscv_iommu_domain *domain;
>
> - if (type != IOMMU_DOMAIN_IDENTITY &&
> + if (type != IOMMU_DOMAIN_DMA &&
> + type != IOMMU_DOMAIN_DMA_FQ &&

IOMMU_DOMAIN_DMA_FQ isn't exposed to drivers any more.

> + type != IOMMU_DOMAIN_UNMANAGED &&
> + type != IOMMU_DOMAIN_IDENTITY &&
> type != IOMMU_DOMAIN_BLOCKED)

I might start believing you could support blocking domains if I saw some
kind of handling of them since patch #7, but no, still nothing.

AFAICS from the spec there's no super-convenient way if you did want to
do it, since you apparently can't suppress the faults from simply making
the DC invalid. I guess it might be a case of keeping a special
always-empty context so you can point any device's FSC at that while
setting DTF. But it's hardly critical, so for now I'd just remove the
broken non-support and leave the idea as something to revisit later.

> return NULL;
>
> @@ -890,7 +893,7 @@ static struct iommu_domain *riscv_iommu_domain_alloc(unsigned type)
> domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
> RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
>
> - printk("domain type %x alloc %u\n", type, domain->pscid);
> + printk("domain alloc %u\n", domain->pscid);
>
> return &domain->domain;
> }
> @@ -903,6 +906,9 @@ static void riscv_iommu_domain_free(struct iommu_domain *iommu_domain)
> pr_warn("IOMMU domain is not empty!\n");
> }
>
> + if (domain->pgtbl.cookie)
> + free_io_pgtable_ops(&domain->pgtbl.ops);
> +
> if (domain->pgd_root)
> free_pages((unsigned long)domain->pgd_root, 0);

Is there a reason for this weird pgd_root setup where the io-pgtable
implementation doesn't simply allocate and own the full pagetable itself?
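
(The more usual arrangement would be for the format's alloc() to grab the
root page itself and hand it back to the driver through the cfg, the way the
other formats export their TTBR equivalents - a rough sketch, where the
riscv_cfg field is hypothetical and doesn't exist in this series:

	void *pgd = (void *)get_zeroed_page(GFP_KERNEL);

	if (!pgd)
		return NULL;
	cfg->riscv_cfg.pgd_phys = virt_to_phys(pgd);

with the matching free_pages() in riscv_iommu_free_pgtable().)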

>
> @@ -959,6 +965,9 @@ static int riscv_iommu_domain_finalize(struct riscv_iommu_domain *domain,
> if (!domain->pgd_root)
> return -ENOMEM;
>
> + if (!alloc_io_pgtable_ops(RISCV_IOMMU, &domain->pgtbl.cfg, domain))
> + return -ENOMEM;
> +
> return 0;
> }
>
> @@ -1006,9 +1015,8 @@ static int riscv_iommu_attach_dev(struct iommu_domain *iommu_domain, struct devi
> return 0;
> }
>
> - if (!dc) {
> + if (!dc)
> return -ENODEV;
> - }

This is a great example of more of the kind of stuff I was getting at on
patch #1 - obviously unnecessary churn *within* a series is the ideal
way to overload and annoy reviewers... and then I look at the rest of my
screen below and see loads of code from an earlier patch being deleted
already, so apparently it was a waste of time reviewing it at all :(

Thanks,
Robin.

> /*
> * S-Stage translation table. G-Stage remains unmodified (BARE).
> @@ -1104,12 +1112,11 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
> {
> struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
>
> - if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
> - *mapped = pgsize * pgcount;
> - return 0;
> - }
> + if (!domain->pgtbl.ops.map_pages)
> + return -ENODEV;
>
> - return -ENODEV;
> + return domain->pgtbl.ops.map_pages(&domain->pgtbl.ops, iova, phys,
> + pgsize, pgcount, prot, gfp, mapped);
> }
>
> static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
> @@ -1118,10 +1125,11 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
> {
> struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
>
> - if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> - return pgsize * pgcount;
> + if (!domain->pgtbl.ops.unmap_pages)
> + return 0;
>
> - return 0;
> + return domain->pgtbl.ops.unmap_pages(&domain->pgtbl.ops, iova, pgsize,
> + pgcount, gather);
> }
>
> static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
> @@ -1129,10 +1137,10 @@ static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
> {
> struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
>
> - if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> - return (phys_addr_t) iova;
> + if (!domain->pgtbl.ops.iova_to_phys)
> + return 0;
>
> - return 0;
> + return domain->pgtbl.ops.iova_to_phys(&domain->pgtbl.ops, iova);
> }
>
> /*
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index 9140df71e17b..fe32a4eff14e 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -88,6 +88,7 @@ struct riscv_iommu_device {
>
> struct riscv_iommu_domain {
> struct iommu_domain domain;
> + struct io_pgtable pgtbl;
>
> struct list_head endpoints;
> struct mutex lock;
> diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
> index 1b7a44b35616..8dd9d3a28e3a 100644
> --- a/include/linux/io-pgtable.h
> +++ b/include/linux/io-pgtable.h
> @@ -19,6 +19,7 @@ enum io_pgtable_fmt {
> AMD_IOMMU_V2,
> APPLE_DART,
> APPLE_DART2,
> + RISCV_IOMMU,
> IO_PGTABLE_NUM_FMTS,
> };
>
> @@ -258,5 +259,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns;
> extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns;
> extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns;
> extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns;
> +extern struct io_pgtable_init_fns io_pgtable_riscv_init_fns;
>
> #endif /* __IO_PGTABLE_H */