2024-03-05 10:16:48

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 01/16] mm/hmm: let users to tag specific PFNs

From: Leon Romanovsky <[email protected]>

Introduce new sticky flag, which isn't overwritten by HMM range fault.
Such flag allows users to tag specific PFNs with extra data in addition
to already filled by HMM.

Signed-off-by: Leon Romanovsky <[email protected]>
---
include/linux/hmm.h | 3 +++
mm/hmm.c | 34 +++++++++++++++++++++-------------
2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 126a36571667..b90902baa593 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -23,6 +23,7 @@ struct mmu_interval_notifier;
* HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID)
* HMM_PFN_ERROR - accessing the pfn is impossible and the device should
* fail. ie poisoned memory, special pages, no vma, etc
+ * HMM_PFN_STICKY - Flag preserved on input-to-output transformation
*
* On input:
* 0 - Return the current state of the page, do not fault it.
@@ -36,6 +37,8 @@ enum hmm_pfn_flags {
HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1),
HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2),
HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3),
+ /* Sticky lag, carried from Input to Output */
+ HMM_PFN_STICKY = 1UL << (BITS_PER_LONG - 7),
HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8),

/* Input flags */
diff --git a/mm/hmm.c b/mm/hmm.c
index 277ddcab4947..9645a72beec0 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -44,8 +44,10 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end,
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;

- for (; addr < end; addr += PAGE_SIZE, i++)
- range->hmm_pfns[i] = cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ range->hmm_pfns[i] &= HMM_PFN_STICKY;
+ range->hmm_pfns[i] |= cpu_flags;
+ }
return 0;
}

@@ -202,8 +204,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);

pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_STICKY;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -236,7 +240,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *hmm_pfn = 0;
+ *hmm_pfn = *hmm_pfn & HMM_PFN_STICKY;
return 0;
}

@@ -253,14 +257,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_STICKY) | swp_offset_pfn(entry) | cpu_flags;
return 0;
}

required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (!required_fault) {
- *hmm_pfn = 0;
+ *hmm_pfn = *hmm_pfn & HMM_PFN_STICKY;
return 0;
}

@@ -304,11 +308,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_unmap(ptep);
return -EFAULT;
}
- *hmm_pfn = HMM_PFN_ERROR;
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_STICKY) | HMM_PFN_ERROR;
return 0;
}

- *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_STICKY) | pte_pfn(pte) | cpu_flags;
return 0;

fault:
@@ -453,8 +457,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}

pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_pfns[i] &= HMM_PFN_STICKY;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
goto out_unlock;
}

@@ -512,8 +518,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}

pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->hmm_pfns[i] = pfn | cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ range->hmm_pfns[i] &= HMM_PFN_STICKY;
+ range->hmm_pfns[i] |= pfn | cpu_flags;
+ }

spin_unlock(ptl);
return 0;
--
2.44.0



2024-03-05 10:17:34

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 02/16] dma-mapping: provide an interface to allocate IOVA

From: Leon Romanovsky <[email protected]>

Existing .map_page() callback provides two things at the same time:
allocates IOVA and links DMA pages. That combination works great for
most of the callers who use it in control paths, but less effective
in fast paths.

These advanced callers already manage their data in some sort of
database and can perform IOVA allocation in advance, leaving range
linkage operation to be in fast path.

Provide an interface to allocate/deallocate IOVA and next patch
link/unlink DMA ranges to that specific IOVA.

Signed-off-by: Leon Romanovsky <[email protected]>
---
include/linux/dma-map-ops.h | 3 +++
include/linux/dma-mapping.h | 20 ++++++++++++++++++++
kernel/dma/mapping.c | 30 ++++++++++++++++++++++++++++++
3 files changed, 53 insertions(+)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4abc60f04209..bd605b44bb57 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -83,6 +83,9 @@ struct dma_map_ops {
size_t (*max_mapping_size)(struct device *dev);
size_t (*opt_mapping_size)(void);
unsigned long (*get_merge_boundary)(struct device *dev);
+
+ dma_addr_t (*alloc_iova)(struct device *dev, size_t size);
+ void (*free_iova)(struct device *dev, dma_addr_t dma_addr, size_t size);
};

#ifdef CONFIG_DMA_OPS
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 4a658de44ee9..176fb8a86d63 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -91,6 +91,16 @@ static inline void debug_dma_map_single(struct device *dev, const void *addr,
}
#endif /* CONFIG_DMA_API_DEBUG */

+struct dma_iova_attrs {
+ /* OUT field */
+ dma_addr_t addr;
+ /* IN fields */
+ struct device *dev;
+ size_t size;
+ enum dma_data_direction dir;
+ unsigned long attrs;
+};
+
#ifdef CONFIG_HAS_DMA
static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
@@ -101,6 +111,9 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
return 0;
}

+int dma_alloc_iova(struct dma_iova_attrs *iova);
+void dma_free_iova(struct dma_iova_attrs *iova);
+
dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
size_t offset, size_t size, enum dma_data_direction dir,
unsigned long attrs);
@@ -159,6 +172,13 @@ void dma_vunmap_noncontiguous(struct device *dev, void *vaddr);
int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
size_t size, struct sg_table *sgt);
#else /* CONFIG_HAS_DMA */
+static inline int dma_alloc_iova(struct dma_iova_attrs *iova)
+{
+ return -EOPNOTSUPP;
+}
+static inline void dma_free_iova(struct dma_iova_attrs *iova)
+{
+}
static inline dma_addr_t dma_map_page_attrs(struct device *dev,
struct page *page, size_t offset, size_t size,
enum dma_data_direction dir, unsigned long attrs)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58db8fd70471..b6b27bab90f3 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -183,6 +183,36 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
}
EXPORT_SYMBOL(dma_unmap_page_attrs);

+int dma_alloc_iova(struct dma_iova_attrs *iova)
+{
+ struct device *dev = iova->dev;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_map_direct(dev, ops) || !ops->alloc_iova) {
+ iova->addr = 0;
+ return 0;
+ }
+
+ iova->addr = ops->alloc_iova(dev, iova->size);
+ if (dma_mapping_error(dev, iova->addr))
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(dma_alloc_iova);
+
+void dma_free_iova(struct dma_iova_attrs *iova)
+{
+ struct device *dev = iova->dev;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_map_direct(dev, ops) || !ops->free_iova)
+ return;
+
+ ops->free_iova(dev, iova->addr, iova->size);
+}
+EXPORT_SYMBOL(dma_free_iova);
+
static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
--
2.44.0


2024-03-05 10:18:26

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 03/16] dma-mapping: provide callbacks to link/unlink pages to specific IOVA

From: Leon Romanovsky <[email protected]>

Introduce new DMA link/unlink API to provide a way for advanced users
to directly map/unmap pages without ned to allocate IOVA on every map
call.

Signed-off-by: Leon Romanovsky <[email protected]>
---
include/linux/dma-map-ops.h | 10 +++++++
include/linux/dma-mapping.h | 13 +++++++++
kernel/dma/debug.h | 2 ++
kernel/dma/direct.h | 3 ++
kernel/dma/mapping.c | 57 +++++++++++++++++++++++++++++++++++++
5 files changed, 85 insertions(+)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index bd605b44bb57..fd03a080df1e 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -86,6 +86,13 @@ struct dma_map_ops {

dma_addr_t (*alloc_iova)(struct device *dev, size_t size);
void (*free_iova)(struct device *dev, dma_addr_t dma_addr, size_t size);
+ dma_addr_t (*link_range)(struct device *dev, struct page *page,
+ unsigned long offset, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs);
+ void (*unlink_range)(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs);
};

#ifdef CONFIG_DMA_OPS
@@ -428,6 +435,9 @@ bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
#define arch_dma_unmap_sg_direct(d, s, n) (false)
#endif

+#define arch_dma_link_range_direct arch_dma_map_page_direct
+#define arch_dma_unlink_range_direct arch_dma_unmap_page_direct
+
#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
bool coherent);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 176fb8a86d63..91cc084adb53 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -113,6 +113,9 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)

int dma_alloc_iova(struct dma_iova_attrs *iova);
void dma_free_iova(struct dma_iova_attrs *iova);
+dma_addr_t dma_link_range(struct page *page, unsigned long offset,
+ struct dma_iova_attrs *iova, dma_addr_t dma_offset);
+void dma_unlink_range(struct dma_iova_attrs *iova, dma_addr_t dma_offset);

dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
size_t offset, size_t size, enum dma_data_direction dir,
@@ -179,6 +182,16 @@ static inline int dma_alloc_iova(struct dma_iova_attrs *iova)
static inline void dma_free_iova(struct dma_iova_attrs *iova)
{
}
+static inline dma_addr_t dma_link_range(struct page *page, unsigned long offset,
+ struct dma_iova_attrs *iova,
+ dma_addr_t dma_offset)
+{
+ return DMA_MAPPING_ERROR;
+}
+static inline void dma_unlink_range(struct dma_iova_attrs *iova,
+ dma_addr_t dma_offset)
+{
+}
static inline dma_addr_t dma_map_page_attrs(struct device *dev,
struct page *page, size_t offset, size_t size,
enum dma_data_direction dir, unsigned long attrs)
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index f525197d3cae..3d529f355c6d 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -127,4 +127,6 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev,
{
}
#endif /* CONFIG_DMA_API_DEBUG */
+#define debug_dma_link_range debug_dma_map_page
+#define debug_dma_unlink_range debug_dma_unmap_page
#endif /* _KERNEL_DMA_DEBUG_H */
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 18d346118fe8..1c30e1cd607a 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -125,4 +125,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
+
+#define dma_direct_link_range dma_direct_map_page
+#define dma_direct_unlink_range dma_direct_unmap_page
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b6b27bab90f3..f989c64622c2 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -213,6 +213,63 @@ void dma_free_iova(struct dma_iova_attrs *iova)
}
EXPORT_SYMBOL(dma_free_iova);

+/**
+ * dma_link_range - Link a physical page to DMA address
+ * @page: The page to be mapped
+ * @offset: The offset within the page
+ * @iova: Preallocated IOVA attributes
+ * @dma_offset: DMA offset form which this page needs to be linked
+ *
+ * dma_alloc_iova() allocates IOVA based on the size specified by ther user in
+ * iova->size. Call this function after IOVA allocation to link @page from
+ * @offset to get the DMA address. Note that very first call to this function
+ * will have @dma_offset set to 0 in the IOVA space allocated from
+ * dma_alloc_iova(). For subsequent calls to this function on same @iova,
+ * @dma_offset needs to be advanced by the caller with the size of previous
+ * page that was linked + DMA address returned for the previous page that was
+ * linked by this function.
+ */
+dma_addr_t dma_link_range(struct page *page, unsigned long offset,
+ struct dma_iova_attrs *iova, dma_addr_t dma_offset)
+{
+ struct device *dev = iova->dev;
+ size_t size = iova->size;
+ enum dma_data_direction dir = iova->dir;
+ unsigned long attrs = iova->attrs;
+ dma_addr_t addr = iova->addr + dma_offset;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_link_range_direct(dev, page_to_phys(page) + offset + size))
+ addr = dma_direct_link_range(dev, page, offset, size, dir, attrs);
+ else if (ops->link_range)
+ addr = ops->link_range(dev, page, offset, addr, size, dir, attrs);
+
+ kmsan_handle_dma(page, offset, size, dir);
+ debug_dma_link_range(dev, page, offset, size, dir, addr, attrs);
+ return addr;
+}
+EXPORT_SYMBOL(dma_link_range);
+
+void dma_unlink_range(struct dma_iova_attrs *iova, dma_addr_t dma_offset)
+{
+ struct device *dev = iova->dev;
+ size_t size = iova->size;
+ enum dma_data_direction dir = iova->dir;
+ unsigned long attrs = iova->attrs;
+ dma_addr_t addr = iova->addr + dma_offset;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_unlink_range_direct(dev, addr + size))
+ dma_direct_unlink_range(dev, addr, size, dir, attrs);
+ else if (ops->unlink_range)
+ ops->unlink_range(dev, addr, size, dir, attrs);
+
+ debug_dma_unlink_range(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unlink_range);
+
static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
--
2.44.0


2024-03-05 10:19:24

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 04/16] iommu/dma: Provide an interface to allow preallocate IOVA

From: Leon Romanovsky <[email protected]>

Separate IOVA allocation to dedicated callback so it will allow
cache of IOVA and reuse it in fast paths for devices which support
ODP (on-demand-paging) mechanism.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/iommu/dma-iommu.c | 50 +++++++++++++++++++++++++++++----------
1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 50ccc4f1ef81..e55726783501 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -356,7 +356,7 @@ int iommu_dma_init_fq(struct iommu_domain *domain)
atomic_set(&cookie->fq_timer_on, 0);
/*
* Prevent incomplete fq state being observable. Pairs with path from
- * __iommu_dma_unmap() through iommu_dma_free_iova() to queue_iova()
+ * __iommu_dma_unmap() through __iommu_dma_free_iova() to queue_iova()
*/
smp_wmb();
WRITE_ONCE(cookie->fq_domain, domain);
@@ -760,7 +760,7 @@ static int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
}
}

-static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
+static dma_addr_t __iommu_dma_alloc_iova(struct iommu_domain *domain,
size_t size, u64 dma_limit, struct device *dev)
{
struct iommu_dma_cookie *cookie = domain->iova_cookie;
@@ -806,7 +806,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
return (dma_addr_t)iova << shift;
}

-static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
+static void __iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather)
{
struct iova_domain *iovad = &cookie->iovad;
@@ -843,7 +843,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,

if (!iotlb_gather.queued)
iommu_iotlb_sync(domain, &iotlb_gather);
- iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather);
+ __iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather);
}

static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
@@ -861,12 +861,12 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,

size = iova_align(iovad, size + iova_off);

- iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev);
+ iova = __iommu_dma_alloc_iova(domain, size, dma_mask, dev);
if (!iova)
return DMA_MAPPING_ERROR;

if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) {
- iommu_dma_free_iova(cookie, iova, size, NULL);
+ __iommu_dma_free_iova(cookie, iova, size, NULL);
return DMA_MAPPING_ERROR;
}
return iova + iova_off;
@@ -970,7 +970,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
return NULL;

size = iova_align(iovad, size);
- iova = iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev);
+ iova = __iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev);
if (!iova)
goto out_free_pages;

@@ -1004,7 +1004,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
out_free_sg:
sg_free_table(sgt);
out_free_iova:
- iommu_dma_free_iova(cookie, iova, size, NULL);
+ __iommu_dma_free_iova(cookie, iova, size, NULL);
out_free_pages:
__iommu_dma_free_pages(pages, count);
return NULL;
@@ -1436,7 +1436,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
if (!iova_len)
return __finalise_sg(dev, sg, nents, 0);

- iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
+ iova = __iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
if (!iova) {
ret = -ENOMEM;
goto out_restore_sg;
@@ -1453,7 +1453,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
return __finalise_sg(dev, sg, nents, iova);

out_free_iova:
- iommu_dma_free_iova(cookie, iova, iova_len, NULL);
+ __iommu_dma_free_iova(cookie, iova, iova_len, NULL);
out_restore_sg:
__invalidate_sg(sg, nents);
out:
@@ -1706,6 +1706,30 @@ static size_t iommu_dma_opt_mapping_size(void)
return iova_rcache_range();
}

+static dma_addr_t iommu_dma_alloc_iova(struct device *dev, size_t size)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ dma_addr_t dma_mask = dma_get_mask(dev);
+
+ size = iova_align(iovad, size);
+ return __iommu_dma_alloc_iova(domain, size, dma_mask, dev);
+}
+
+static void iommu_dma_free_iova(struct device *dev, dma_addr_t iova,
+ size_t size)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ struct iommu_iotlb_gather iotlb_gather;
+
+ size = iova_align(iovad, size);
+ iommu_iotlb_gather_init(&iotlb_gather);
+ __iommu_dma_free_iova(cookie, iova, size, &iotlb_gather);
+}
+
static const struct dma_map_ops iommu_dma_ops = {
.flags = DMA_F_PCI_P2PDMA_SUPPORTED,
.alloc = iommu_dma_alloc,
@@ -1728,6 +1752,8 @@ static const struct dma_map_ops iommu_dma_ops = {
.unmap_resource = iommu_dma_unmap_resource,
.get_merge_boundary = iommu_dma_get_merge_boundary,
.opt_mapping_size = iommu_dma_opt_mapping_size,
+ .alloc_iova = iommu_dma_alloc_iova,
+ .free_iova = iommu_dma_free_iova,
};

/*
@@ -1776,7 +1802,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
if (!msi_page)
return NULL;

- iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev);
+ iova = __iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev);
if (!iova)
goto out_free_page;

@@ -1790,7 +1816,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
return msi_page;

out_free_iova:
- iommu_dma_free_iova(cookie, iova, size, NULL);
+ __iommu_dma_free_iova(cookie, iova, size, NULL);
out_free_page:
kfree(msi_page);
return NULL;
--
2.44.0


2024-03-05 10:20:09

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 05/16] iommu/dma: Prepare map/unmap page functions to receive IOVA

From: Leon Romanovsky <[email protected]>

Extend the existing map_page/unmap_page function implementations to get
preallocated IOVA. In such case, the IOVA allocation needs to be
skipped, but rest of the code stays the same.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/iommu/dma-iommu.c | 68 ++++++++++++++++++++++++++-------------
1 file changed, 45 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index e55726783501..dbdd373a609a 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -824,7 +824,7 @@ static void __iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
}

static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
- size_t size)
+ size_t size, bool free_iova)
{
struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
@@ -843,17 +843,19 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,

if (!iotlb_gather.queued)
iommu_iotlb_sync(domain, &iotlb_gather);
- __iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather);
+ if (free_iova)
+ __iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather);
}

static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
- size_t size, int prot, u64 dma_mask)
+ dma_addr_t iova, size_t size, int prot,
+ u64 dma_mask)
{
struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iova_domain *iovad = &cookie->iovad;
size_t iova_off = iova_offset(iovad, phys);
- dma_addr_t iova;
+ bool no_iova = !iova;

if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
iommu_deferred_attach(dev, domain))
@@ -861,12 +863,14 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,

size = iova_align(iovad, size + iova_off);

- iova = __iommu_dma_alloc_iova(domain, size, dma_mask, dev);
+ if (no_iova)
+ iova = __iommu_dma_alloc_iova(domain, size, dma_mask, dev);
if (!iova)
return DMA_MAPPING_ERROR;

if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) {
- __iommu_dma_free_iova(cookie, iova, size, NULL);
+ if (no_iova)
+ __iommu_dma_free_iova(cookie, iova, size, NULL);
return DMA_MAPPING_ERROR;
}
return iova + iova_off;
@@ -1031,7 +1035,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
return vaddr;

out_unmap:
- __iommu_dma_unmap(dev, *dma_handle, size);
+ __iommu_dma_unmap(dev, *dma_handle, size, true);
__iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
return NULL;
}
@@ -1060,7 +1064,7 @@ static void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
{
struct dma_sgt_handle *sh = sgt_handle(sgt);

- __iommu_dma_unmap(dev, sgt->sgl->dma_address, size);
+ __iommu_dma_unmap(dev, sgt->sgl->dma_address, size, true);
__iommu_dma_free_pages(sh->pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
sg_free_table(&sh->sgt);
kfree(sh);
@@ -1131,9 +1135,11 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
}

-static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+static dma_addr_t __iommu_dma_map_pages(struct device *dev, struct page *page,
+ unsigned long offset, dma_addr_t iova,
+ size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
{
phys_addr_t phys = page_to_phys(page) + offset;
bool coherent = dev_is_dma_coherent(dev);
@@ -1141,7 +1147,7 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iova_domain *iovad = &cookie->iovad;
- dma_addr_t iova, dma_mask = dma_get_mask(dev);
+ dma_addr_t addr, dma_mask = dma_get_mask(dev);

/*
* If both the physical buffer start address and size are
@@ -1182,14 +1188,23 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
arch_sync_dma_for_device(phys, size, dir);

- iova = __iommu_dma_map(dev, phys, size, prot, dma_mask);
- if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys))
+ addr = __iommu_dma_map(dev, phys, iova, size, prot, dma_mask);
+ if (addr == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys))
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
- return iova;
+ return addr;
}

-static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
+static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __iommu_dma_map_pages(dev, page, offset, 0, size, dir, attrs);
+}
+
+static void __iommu_dma_unmap_pages(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs, bool free_iova)
{
struct iommu_domain *domain = iommu_get_dma_domain(dev);
phys_addr_t phys;
@@ -1201,12 +1216,19 @@ static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(phys, size, dir);

- __iommu_dma_unmap(dev, dma_handle, size);
+ __iommu_dma_unmap(dev, dma_handle, size, free_iova);

if (unlikely(is_swiotlb_buffer(dev, phys)))
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}

+static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ __iommu_dma_unmap_pages(dev, dma_handle, size, dir, attrs, true);
+}
+
/*
* Prepare a successfully-mapped scatterlist to give back to the caller.
*
@@ -1509,13 +1531,13 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
}

if (end)
- __iommu_dma_unmap(dev, start, end - start);
+ __iommu_dma_unmap(dev, start, end - start, true);
}

static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- return __iommu_dma_map(dev, phys, size,
+ return __iommu_dma_map(dev, phys, 0, size,
dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO,
dma_get_mask(dev));
}
@@ -1523,7 +1545,7 @@ static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- __iommu_dma_unmap(dev, handle, size);
+ __iommu_dma_unmap(dev, handle, size, true);
}

static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr)
@@ -1560,7 +1582,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr)
static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t handle, unsigned long attrs)
{
- __iommu_dma_unmap(dev, handle, size);
+ __iommu_dma_unmap(dev, handle, size, true);
__iommu_dma_free(dev, size, cpu_addr);
}

@@ -1626,7 +1648,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t size,
if (!cpu_addr)
return NULL;

- *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot,
+ *handle = __iommu_dma_map(dev, page_to_phys(page), 0, size, ioprot,
dev->coherent_dma_mask);
if (*handle == DMA_MAPPING_ERROR) {
__iommu_dma_free(dev, size, cpu_addr);
--
2.44.0


2024-03-05 10:20:56

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 06/16] iommu/dma: Implement link/unlink page callbacks

From: Leon Romanovsky <[email protected]>

Add an implementation of link/unlink interface to perform in map/unmap
pages in fast patch for pre-allocated IOVA.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/iommu/dma-iommu.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index dbdd373a609a..b683c4a4e9f8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1752,6 +1752,21 @@ static void iommu_dma_free_iova(struct device *dev, dma_addr_t iova,
__iommu_dma_free_iova(cookie, iova, size, &iotlb_gather);
}

+static dma_addr_t iommu_dma_link_range(struct device *dev, struct page *page,
+ unsigned long offset, dma_addr_t iova,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __iommu_dma_map_pages(dev, page, offset, iova, size, dir, attrs);
+}
+
+static void iommu_dma_unlink_range(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ __iommu_dma_unmap_pages(dev, addr, size, dir, attrs, false);
+}
+
static const struct dma_map_ops iommu_dma_ops = {
.flags = DMA_F_PCI_P2PDMA_SUPPORTED,
.alloc = iommu_dma_alloc,
@@ -1776,6 +1791,8 @@ static const struct dma_map_ops iommu_dma_ops = {
.opt_mapping_size = iommu_dma_opt_mapping_size,
.alloc_iova = iommu_dma_alloc_iova,
.free_iova = iommu_dma_free_iova,
+ .link_range = iommu_dma_link_range,
+ .unlink_range = iommu_dma_unlink_range,
};

/*
--
2.44.0


2024-03-05 10:21:42

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 07/16] RDMA/umem: Preallocate and cache IOVA for UMEM ODP

From: Leon Romanovsky <[email protected]>

As a preparation to provide two step interface to map pages,
preallocate IOVA when UMEM is initialized.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/infiniband/core/umem_odp.c | 16 +++++++++++++++-
include/rdma/ib_umem_odp.h | 1 +
include/rdma/ib_verbs.h | 18 ++++++++++++++++++
3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e9fa22d31c23..f69d1233dc82 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -50,6 +50,7 @@
static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
const struct mmu_interval_notifier_ops *ops)
{
+ struct ib_device *dev = umem_odp->umem.ibdev;
int ret;

umem_odp->umem.is_odp = 1;
@@ -87,15 +88,25 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
goto out_pfn_list;
}

+ umem_odp->iova.dev = dev->dma_device;
+ umem_odp->iova.size = end - start;
+ umem_odp->iova.dir = DMA_BIDIRECTIONAL;
+ ret = ib_dma_alloc_iova(dev, &umem_odp->iova);
+ if (ret)
+ goto out_dma_list;
+
+
ret = mmu_interval_notifier_insert(&umem_odp->notifier,
umem_odp->umem.owning_mm,
start, end - start, ops);
if (ret)
- goto out_dma_list;
+ goto out_free_iova;
}

return 0;

+out_free_iova:
+ ib_dma_free_iova(dev, &umem_odp->iova);
out_dma_list:
kvfree(umem_odp->dma_list);
out_pfn_list:
@@ -262,6 +273,8 @@ EXPORT_SYMBOL(ib_umem_odp_get);

void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+
/*
* Ensure that no more pages are mapped in the umem.
*
@@ -274,6 +287,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
ib_umem_end(umem_odp));
mutex_unlock(&umem_odp->umem_mutex);
mmu_interval_notifier_remove(&umem_odp->notifier);
+ ib_dma_free_iova(dev, &umem_odp->iova);
kvfree(umem_odp->dma_list);
kvfree(umem_odp->pfn_list);
}
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 0844c1d05ac6..bb2d7f2a5b04 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -23,6 +23,7 @@ struct ib_umem_odp {
* See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT.
*/
dma_addr_t *dma_list;
+ struct dma_iova_attrs iova;
/*
* The umem_mutex protects the page_list and dma_list fields of an ODP
* umem, allowing only a single thread to map/unmap pages. The mutex
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b7b6b58dd348..e71fa19187cc 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4077,6 +4077,24 @@ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
return dma_mapping_error(dev->dma_device, dma_addr);
}

+static inline int ib_dma_alloc_iova(struct ib_device *dev,
+ struct dma_iova_attrs *iova)
+{
+ if (ib_uses_virt_dma(dev))
+ return 0;
+
+ return dma_alloc_iova(iova);
+}
+
+static inline void ib_dma_free_iova(struct ib_device *dev,
+ struct dma_iova_attrs *iova)
+{
+ if (ib_uses_virt_dma(dev))
+ return;
+
+ dma_free_iova(iova);
+}
+
/**
* ib_dma_map_single - Map a kernel virtual address to DMA address
* @dev: The device for which the dma_addr is to be created
--
2.44.0


2024-03-05 10:23:30

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 09/16] RDMA/core: Separate DMA mapping to caching IOVA and page linkage

From: Leon Romanovsky <[email protected]>

Reuse newly added DMA API to cache IOVA and only link/unlink pages
in fast path.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/infiniband/core/umem_odp.c | 57 ++----------------------------
drivers/infiniband/hw/mlx5/odp.c | 22 +++++++++++-
include/rdma/ib_umem_odp.h | 8 +----
include/rdma/ib_verbs.h | 36 +++++++++++++++++++
4 files changed, 61 insertions(+), 62 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 3619fb78f786..1301009a6b78 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -81,20 +81,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
if (!umem_odp->pfn_list)
return -ENOMEM;

- umem_odp->dma_list = kvcalloc(
- ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
- if (!umem_odp->dma_list) {
- ret = -ENOMEM;
- goto out_pfn_list;
- }

umem_odp->iova.dev = dev->dma_device;
umem_odp->iova.size = end - start;
umem_odp->iova.dir = DMA_BIDIRECTIONAL;
ret = ib_dma_alloc_iova(dev, &umem_odp->iova);
if (ret)
- goto out_dma_list;
-
+ goto out_pfn_list;

ret = mmu_interval_notifier_insert(&umem_odp->notifier,
umem_odp->umem.owning_mm,
@@ -107,8 +100,6 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,

out_free_iova:
ib_dma_free_iova(dev, &umem_odp->iova);
-out_dma_list:
- kvfree(umem_odp->dma_list);
out_pfn_list:
kvfree(umem_odp->pfn_list);
return ret;
@@ -288,7 +279,6 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
mutex_unlock(&umem_odp->umem_mutex);
mmu_interval_notifier_remove(&umem_odp->notifier);
ib_dma_free_iova(dev, &umem_odp->iova);
- kvfree(umem_odp->dma_list);
kvfree(umem_odp->pfn_list);
}
put_pid(umem_odp->tgid);
@@ -296,40 +286,10 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
}
EXPORT_SYMBOL(ib_umem_odp_release);

-/*
- * Map for DMA and insert a single page into the on-demand paging page tables.
- *
- * @umem: the umem to insert the page to.
- * @dma_index: index in the umem to add the dma to.
- * @page: the page struct to map and add.
- * @access_mask: access permissions needed for this page.
- *
- * The function returns -EFAULT if the DMA mapping operation fails.
- *
- */
-static int ib_umem_odp_map_dma_single_page(
- struct ib_umem_odp *umem_odp,
- unsigned int dma_index,
- struct page *page)
-{
- struct ib_device *dev = umem_odp->umem.ibdev;
- dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
- *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
- DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(dev, *dma_addr)) {
- *dma_addr = 0;
- return -EFAULT;
- }
- umem_odp->npages++;
- return 0;
-}
-
/**
* ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
*
* Maps the range passed in the argument to DMA addresses.
- * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
* Upon success the ODP MR will be locked to let caller complete its device
* page table update.
*
@@ -437,15 +397,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
__func__, hmm_order, page_shift);
break;
}
-
- ret = ib_umem_odp_map_dma_single_page(
- umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
- if (ret < 0) {
- ibdev_dbg(umem_odp->umem.ibdev,
- "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
- break;
- }
- range.hmm_pfns[pfn_index] |= HMM_PFN_STICKY;
}
/* upon success lock should stay on hold for the callee */
if (!ret)
@@ -465,7 +416,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 bound)
{
- dma_addr_t dma;
int idx;
u64 addr;
struct ib_device *dev = umem_odp->umem.ibdev;
@@ -479,15 +429,14 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);

idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
- dma = umem_odp->dma_list[idx];

if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
continue;
if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_STICKY))
continue;

- ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
- DMA_BIDIRECTIONAL);
+ ib_dma_unlink_range(dev, &umem_odp->iova,
+ idx * (1 << umem_odp->page_shift));
if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
struct page *head_page = compound_head(page);
/*
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 5713fe25f4de..13d61f1ab40b 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -149,6 +149,7 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
+ struct ib_device *dev = odp->umem.ibdev;
unsigned long pfn;
dma_addr_t pa;
size_t i;
@@ -162,12 +163,31 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
/* Initial ODP init */
continue;

- pa = odp->dma_list[idx + i];
+ if (pfn & HMM_PFN_STICKY && odp->iova.addr)
+ /*
+ * We are in this flow when there is a need to resync flags,
+ * for example when page was already linked in prefetch call
+ * with READ flag and now we need to add WRITE flag
+ *
+ * This page was already programmed to HW and we don't want/need
+ * to unlink and link it again just to resync flags.
+ *
+ * The DMA address calculation below is based on the fact that
+ * RDMA UMEM doesn't work with swiotlb.
+ */
+ pa = odp->iova.addr + (idx + i) * (1 << odp->page_shift);
+ else
+ pa = ib_dma_link_range(dev, hmm_pfn_to_page(pfn), 0, &odp->iova,
+ (idx + i) * (1 << odp->page_shift));
+ WARN_ON_ONCE(ib_dma_mapping_error(dev, pa));
+
pa |= MLX5_IB_MTT_READ;
if ((pfn & HMM_PFN_WRITE) && !downgrade)
pa |= MLX5_IB_MTT_WRITE;

pas[i] = cpu_to_be64(pa);
+ odp->pfn_list[idx + i] |= HMM_PFN_STICKY;
+ odp->npages++;
}
}

diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 095b1297cfb1..a786556c65f9 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -17,15 +17,9 @@ struct ib_umem_odp {
/* An array of the pfns included in the on-demand paging umem. */
unsigned long *pfn_list;

- /*
- * An array with DMA addresses mapped for pfns in pfn_list.
- * The lower two bits designate access permissions.
- * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT.
- */
- dma_addr_t *dma_list;
struct dma_iova_attrs iova;
/*
- * The umem_mutex protects the page_list and dma_list fields of an ODP
+ * The umem_mutex protects the page_list field of an ODP
* umem, allowing only a single thread to map/unmap pages. The mutex
* also protects access to the mmu notifier counters.
*/
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e71fa19187cc..c9e2bcd5268a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4160,6 +4160,42 @@ static inline void ib_dma_unmap_page(struct ib_device *dev,
dma_unmap_page(dev->dma_device, addr, size, direction);
}

+/**
+ * ib_dma_link_range - Link a physical page to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @page: The page to be mapped
+ * @offset: The offset within the page
+ * @iova: Preallocated IOVA attributes
+ * @dma_offset: DMA offset
+ */
+static inline dma_addr_t ib_dma_link_range(struct ib_device *dev,
+ struct page *page,
+ unsigned long offset,
+ struct dma_iova_attrs *iova,
+ dma_addr_t dma_offset)
+{
+ if (ib_uses_virt_dma(dev))
+ return (uintptr_t)(page_address(page) + offset);
+
+ return dma_link_range(page, offset, iova, dma_offset);
+}
+
+/**
+ * ib_dma_unlink_range - Unlink a mapping created by ib_dma_link_page()
+ * @dev: The device for which the DMA address was created
+ * @iova: DMA IOVA properties
+ * @dma_offset: DMA offset
+ */
+static inline void ib_dma_unlink_range(struct ib_device *dev,
+ struct dma_iova_attrs *iova,
+ dma_addr_t dma_offset)
+{
+ if (ib_uses_virt_dma(dev))
+ return;
+
+ dma_unlink_range(iova, dma_offset);
+}
+
int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents);
static inline int ib_dma_map_sg_attrs(struct ib_device *dev,
struct scatterlist *sg, int nents,
--
2.44.0


2024-03-05 10:24:00

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 08/16] RDMA/umem: Store ODP access mask information in PFN

From: Leon Romanovsky <[email protected]>

As a preparation to remove of dma_list, store access mask in PFN pointer
and not in dma_addr_t.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/infiniband/core/umem_odp.c | 99 +++++++++++-----------------
drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 +
drivers/infiniband/hw/mlx5/odp.c | 37 ++++++-----
include/rdma/ib_umem_odp.h | 13 ----
4 files changed, 59 insertions(+), 91 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index f69d1233dc82..3619fb78f786 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -310,22 +310,11 @@ EXPORT_SYMBOL(ib_umem_odp_release);
static int ib_umem_odp_map_dma_single_page(
struct ib_umem_odp *umem_odp,
unsigned int dma_index,
- struct page *page,
- u64 access_mask)
+ struct page *page)
{
struct ib_device *dev = umem_odp->umem.ibdev;
dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];

- if (*dma_addr) {
- /*
- * If the page is already dma mapped it means it went through
- * a non-invalidating trasition, like read-only to writable.
- * Resync the flags.
- */
- *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
- return 0;
- }
-
*dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
DMA_BIDIRECTIONAL);
if (ib_dma_mapping_error(dev, *dma_addr)) {
@@ -333,7 +322,6 @@ static int ib_umem_odp_map_dma_single_page(
return -EFAULT;
}
umem_odp->npages++;
- *dma_addr |= access_mask;
return 0;
}

@@ -369,9 +357,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
struct hmm_range range = {};
unsigned long timeout;

- if (access_mask == 0)
- return -EINVAL;
-
if (user_virt < ib_umem_start(umem_odp) ||
user_virt + bcnt > ib_umem_end(umem_odp))
return -EFAULT;
@@ -397,7 +382,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
if (fault) {
range.default_flags = HMM_PFN_REQ_FAULT;

- if (access_mask & ODP_WRITE_ALLOWED_BIT)
+ if (access_mask & HMM_PFN_WRITE)
range.default_flags |= HMM_PFN_REQ_WRITE;
}

@@ -429,22 +414,17 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
for (pfn_index = 0; pfn_index < num_pfns;
pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {

- if (fault) {
- /*
- * Since we asked for hmm_range_fault() to populate
- * pages it shouldn't return an error entry on success.
- */
- WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
- WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
- } else {
- if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
- WARN_ON(umem_odp->dma_list[dma_index]);
- continue;
- }
- access_mask = ODP_READ_ALLOWED_BIT;
- if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
- }
+ /*
+ * Since we asked for hmm_range_fault() to populate
+ * pages it shouldn't return an error entry on success.
+ */
+ WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+ WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+ if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+ continue;
+
+ if (range.hmm_pfns[pfn_index] & HMM_PFN_STICKY)
+ continue;

hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -459,13 +439,13 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
}

ret = ib_umem_odp_map_dma_single_page(
- umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
- access_mask);
+ umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
if (ret < 0) {
ibdev_dbg(umem_odp->umem.ibdev,
"ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
break;
}
+ range.hmm_pfns[pfn_index] |= HMM_PFN_STICKY;
}
/* upon success lock should stay on hold for the callee */
if (!ret)
@@ -485,7 +465,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 bound)
{
- dma_addr_t dma_addr;
dma_addr_t dma;
int idx;
u64 addr;
@@ -496,34 +475,34 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+ unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
+ struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
+
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
dma = umem_odp->dma_list[idx];

- /* The access flags guaranteed a valid DMA address in case was NULL */
- if (dma) {
- unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
- struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
- dma_addr = dma & ODP_DMA_ADDR_MASK;
- ib_dma_unmap_page(dev, dma_addr,
- BIT(umem_odp->page_shift),
- DMA_BIDIRECTIONAL);
- if (dma & ODP_WRITE_ALLOWED_BIT) {
- struct page *head_page = compound_head(page);
- /*
- * set_page_dirty prefers being called with
- * the page lock. However, MMU notifiers are
- * called sometimes with and sometimes without
- * the lock. We rely on the umem_mutex instead
- * to prevent other mmu notifiers from
- * continuing and allowing the page mapping to
- * be removed.
- */
- set_page_dirty(head_page);
- }
- umem_odp->dma_list[idx] = 0;
- umem_odp->npages--;
+ if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
+ continue;
+ if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_STICKY))
+ continue;
+
+ ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
+ DMA_BIDIRECTIONAL);
+ if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
}
+ umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_STICKY;
+ umem_odp->npages--;
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index bbe79b86c717..4f368242680d 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -334,6 +334,7 @@ struct mlx5_ib_flow_db {
#define MLX5_IB_UPD_XLT_PD BIT(4)
#define MLX5_IB_UPD_XLT_ACCESS BIT(5)
#define MLX5_IB_UPD_XLT_INDIRECT BIT(6)
+#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7)

/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
*
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 4a04cbc5b78a..5713fe25f4de 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -34,6 +34,7 @@
#include <linux/kernel.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
+#include <linux/hmm.h>

#include "mlx5_ib.h"
#include "cmd.h"
@@ -143,22 +144,12 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
}
}

-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
- u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
- if (umem_dma & ODP_READ_ALLOWED_BIT)
- mtt_entry |= MLX5_IB_MTT_READ;
- if (umem_dma & ODP_WRITE_ALLOWED_BIT)
- mtt_entry |= MLX5_IB_MTT_WRITE;
-
- return mtt_entry;
-}
-
static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+ bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
+ unsigned long pfn;
dma_addr_t pa;
size_t i;

@@ -166,8 +157,17 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
return;

for (i = 0; i < nentries; i++) {
+ pfn = odp->pfn_list[idx + i];
+ if (!(pfn & HMM_PFN_VALID))
+ /* Initial ODP init */
+ continue;
+
pa = odp->dma_list[idx + i];
- pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+ pa |= MLX5_IB_MTT_READ;
+ if ((pfn & HMM_PFN_WRITE) && !downgrade)
+ pa |= MLX5_IB_MTT_WRITE;
+
+ pas[i] = cpu_to_be64(pa);
}
}

@@ -268,8 +268,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
* estimate the cost of another UMR vs. the cost of bigger
* UMR.
*/
- if (umem_odp->dma_list[idx] &
- (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+ if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
if (!in_block) {
blk_start_idx = idx;
in_block = 1;
@@ -555,7 +554,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
{
int page_shift, ret, np;
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
- u64 access_mask;
+ u64 access_mask = 0;
u64 start_idx;
bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
@@ -563,12 +562,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
if (flags & MLX5_PF_FLAGS_ENABLE)
xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;

+ if (flags & MLX5_PF_FLAGS_DOWNGRADE)
+ xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
+
page_shift = odp->page_shift;
start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
- access_mask = ODP_READ_ALLOWED_BIT;

if (odp->umem.writable && !downgrade)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
+ access_mask |= HMM_PFN_WRITE;

np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
if (np < 0)
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index bb2d7f2a5b04..095b1297cfb1 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -68,19 +68,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
umem_odp->page_shift;
}

-/*
- * The lower 2 bits of the DMA address signal the R/W permissions for
- * the entry. To upgrade the permissions, provide the appropriate
- * bitmask to the map_dma_pages function.
- *
- * Be aware that upgrading a mapped address might result in change of
- * the DMA address for the page.
- */
-#define ODP_READ_ALLOWED_BIT (1<<0ULL)
-#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
-
-#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
-
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING

struct ib_umem_odp *
--
2.44.0


2024-03-05 10:24:15

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 10/16] RDMA/umem: Prevent UMEM ODP creation with SWIOTLB

From: Leon Romanovsky <[email protected]>

RDMA UMEM never supported DMA addresses returned from SWIOTLB, as these
addresses should be programmed to the hardware which is not aware that
it is bounce buffers and not real ones.

Instead of silently leave broken system for the users who didn't
know it, let's be explicit and return an error to them.

Signed-off-by: Leon Romanovsky <[email protected]>
---
Documentation/core-api/dma-attributes.rst | 7 +++
drivers/infiniband/core/umem_odp.c | 77 +++++++++++------------
include/linux/dma-mapping.h | 6 ++
kernel/dma/direct.h | 4 +-
kernel/dma/mapping.c | 4 ++
5 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst
index 1887d92e8e92..b337ec65d506 100644
--- a/Documentation/core-api/dma-attributes.rst
+++ b/Documentation/core-api/dma-attributes.rst
@@ -130,3 +130,10 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged
subsystem that the buffer is fully accessible at the elevated privilege
level (and ideally inaccessible or at least read-only at the
lesser-privileged levels).
+
+DMA_ATTR_NO_TRANSLATION
+-----------------------
+
+This attribute is used to indicate to the DMA-mapping subsystem that the
+buffer is not subject to any address translation. This is used for devices
+that doesn't need buffer bouncing or fixing DMA addresses.
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 1301009a6b78..57c56000f60e 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -50,51 +50,50 @@
static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
const struct mmu_interval_notifier_ops *ops)
{
+ size_t page_size = 1UL << umem_odp->page_shift;
struct ib_device *dev = umem_odp->umem.ibdev;
+ size_t ndmas, npfns;
+ unsigned long start;
+ unsigned long end;
int ret;

umem_odp->umem.is_odp = 1;
mutex_init(&umem_odp->umem_mutex);

- if (!umem_odp->is_implicit_odp) {
- size_t page_size = 1UL << umem_odp->page_shift;
- unsigned long start;
- unsigned long end;
- size_t ndmas, npfns;
-
- start = ALIGN_DOWN(umem_odp->umem.address, page_size);
- if (check_add_overflow(umem_odp->umem.address,
- (unsigned long)umem_odp->umem.length,
- &end))
- return -EOVERFLOW;
- end = ALIGN(end, page_size);
- if (unlikely(end < page_size))
- return -EOVERFLOW;
-
- ndmas = (end - start) >> umem_odp->page_shift;
- if (!ndmas)
- return -EINVAL;
-
- npfns = (end - start) >> PAGE_SHIFT;
- umem_odp->pfn_list = kvcalloc(
- npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
- if (!umem_odp->pfn_list)
- return -ENOMEM;
-
-
- umem_odp->iova.dev = dev->dma_device;
- umem_odp->iova.size = end - start;
- umem_odp->iova.dir = DMA_BIDIRECTIONAL;
- ret = ib_dma_alloc_iova(dev, &umem_odp->iova);
- if (ret)
- goto out_pfn_list;
-
- ret = mmu_interval_notifier_insert(&umem_odp->notifier,
- umem_odp->umem.owning_mm,
- start, end - start, ops);
- if (ret)
- goto out_free_iova;
- }
+ if (umem_odp->is_implicit_odp)
+ return 0;
+
+ start = ALIGN_DOWN(umem_odp->umem.address, page_size);
+ if (check_add_overflow(umem_odp->umem.address,
+ (unsigned long)umem_odp->umem.length, &end))
+ return -EOVERFLOW;
+ end = ALIGN(end, page_size);
+ if (unlikely(end < page_size))
+ return -EOVERFLOW;
+
+ ndmas = (end - start) >> umem_odp->page_shift;
+ if (!ndmas)
+ return -EINVAL;
+
+ npfns = (end - start) >> PAGE_SHIFT;
+ umem_odp->pfn_list =
+ kvcalloc(npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
+ if (!umem_odp->pfn_list)
+ return -ENOMEM;
+
+ umem_odp->iova.dev = dev->dma_device;
+ umem_odp->iova.size = end - start;
+ umem_odp->iova.dir = DMA_BIDIRECTIONAL;
+ umem_odp->iova.attrs = DMA_ATTR_NO_TRANSLATION;
+ ret = ib_dma_alloc_iova(dev, &umem_odp->iova);
+ if (ret)
+ goto out_pfn_list;
+
+ ret = mmu_interval_notifier_insert(&umem_odp->notifier,
+ umem_odp->umem.owning_mm, start,
+ end - start, ops);
+ if (ret)
+ goto out_free_iova;

return 0;

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 91cc084adb53..89945e707a9b 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -62,6 +62,12 @@
*/
#define DMA_ATTR_PRIVILEGED (1UL << 9)

+/*
+ * DMA_ATTR_NO_TRANSLATION: used to indicate that the buffer should not be mapped
+ * through address translation.
+ */
+#define DMA_ATTR_NO_TRANSLATION (1UL << 10)
+
/*
* A dma_addr_t can hold any valid DMA or bus address for the platform. It can
* be given to a device to use as a DMA source or target. It is specific to a
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 1c30e1cd607a..1c9ec204c999 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -92,6 +92,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
if (is_swiotlb_force_bounce(dev)) {
if (is_pci_p2pdma_page(page))
return DMA_MAPPING_ERROR;
+ if (attrs & DMA_ATTR_NO_TRANSLATION)
+ return DMA_MAPPING_ERROR;
return swiotlb_map(dev, phys, size, dir, attrs);
}

@@ -99,7 +101,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
dma_kmalloc_needs_bounce(dev, size, dir)) {
if (is_pci_p2pdma_page(page))
return DMA_MAPPING_ERROR;
- if (is_swiotlb_active(dev))
+ if (is_swiotlb_active(dev) && !(attrs & DMA_ATTR_NO_TRANSLATION))
return swiotlb_map(dev, phys, size, dir, attrs);

dev_WARN_ONCE(dev, 1,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f989c64622c2..49b1fde510c5 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -188,6 +188,10 @@ int dma_alloc_iova(struct dma_iova_attrs *iova)
struct device *dev = iova->dev;
const struct dma_map_ops *ops = get_dma_ops(dev);

+ if (dma_map_direct(dev, ops) && is_swiotlb_force_bounce(dev) &&
+ iova->attrs & DMA_ATTR_NO_TRANSLATION)
+ return -EOPNOTSUPP;
+
if (dma_map_direct(dev, ops) || !ops->alloc_iova) {
iova->addr = 0;
return 0;
--
2.44.0


2024-03-05 10:25:14

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 11/16] vfio/mlx5: Explicitly use number of pages instead of allocated length

From: Leon Romanovsky <[email protected]>

allocated_length is a multiple of page size and number of pages, so let's
change the functions to accept number of pages. It opens us a venue to
combine receive and send paths together with code readability
improvement.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/vfio/pci/mlx5/cmd.c | 31 ++++++++---------
drivers/vfio/pci/mlx5/cmd.h | 10 +++---
drivers/vfio/pci/mlx5/main.c | 65 +++++++++++++++++++++++-------------
3 files changed, 62 insertions(+), 44 deletions(-)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index efd1d252cdc9..45104e47b7b2 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -305,8 +305,7 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
struct mlx5_vhca_recv_buf *recv_buf,
u32 *mkey)
{
- size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
- recv_buf->npages;
+ size_t npages = buf ? buf->npages : recv_buf->npages;
int err = 0, inlen;
__be64 *mtt;
void *mkc;
@@ -362,7 +361,7 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
if (mvdev->mdev_detach)
return -ENOTCONN;

- if (buf->dmaed || !buf->allocated_length)
+ if (buf->dmaed || !buf->npages)
return -EINVAL;

ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
@@ -403,8 +402,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
}

struct mlx5_vhca_data_buffer *
-mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length,
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
enum dma_data_direction dma_dir)
{
struct mlx5_vhca_data_buffer *buf;
@@ -416,9 +414,8 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,

buf->dma_dir = dma_dir;
buf->migf = migf;
- if (length) {
- ret = mlx5vf_add_migration_pages(buf,
- DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ if (npages) {
+ ret = mlx5vf_add_migration_pages(buf, npages);
if (ret)
goto end;

@@ -444,8 +441,8 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
}

struct mlx5_vhca_data_buffer *
-mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length, enum dma_data_direction dma_dir)
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+ enum dma_data_direction dma_dir)
{
struct mlx5_vhca_data_buffer *buf, *temp_buf;
struct list_head free_list;
@@ -460,7 +457,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
if (buf->dma_dir == dma_dir) {
list_del_init(&buf->buf_elm);
- if (buf->allocated_length >= length) {
+ if (buf->npages >= npages) {
spin_unlock_irq(&migf->list_lock);
goto found;
}
@@ -474,7 +471,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
}
}
spin_unlock_irq(&migf->list_lock);
- buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+ buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir);

found:
while ((temp_buf = list_first_entry_or_null(&free_list,
@@ -645,7 +642,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
MLX5_SET(save_vhca_state_in, in, op_mod, 0);
MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
- MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+ MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE);
MLX5_SET(save_vhca_state_in, in, incremental, inc);
MLX5_SET(save_vhca_state_in, in, set_track, track);

@@ -668,8 +665,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
}

if (!header_buf) {
- header_buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ u32 npages = DIV_ROUND_UP(
+ sizeof(struct mlx5_vf_migration_header),
+ PAGE_SIZE);
+
+ header_buf =
+ mlx5vf_get_data_buffer(migf, npages, DMA_NONE);
if (IS_ERR(header_buf)) {
err = PTR_ERR(header_buf);
goto err_free;
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index f2c7227fa683..887267ebbd8a 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -60,7 +60,7 @@ struct mlx5_vhca_data_buffer {
struct sg_append_table table;
loff_t start_pos;
u64 length;
- u64 allocated_length;
+ u32 npages;
u32 mkey;
enum dma_data_direction dma_dir;
u8 dmaed:1;
@@ -219,12 +219,12 @@ int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf);
void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf);
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf);
struct mlx5_vhca_data_buffer *
-mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length, enum dma_data_direction dma_dir);
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+ enum dma_data_direction dma_dir);
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf);
struct mlx5_vhca_data_buffer *
-mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length, enum dma_data_direction dma_dir);
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+ enum dma_data_direction dma_dir);
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
unsigned int npages);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index fe09a8c8af95..b11b1c27d284 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -94,7 +94,7 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,

if (ret)
goto err;
- buf->allocated_length += filled * PAGE_SIZE;
+ buf->npages += filled;
/* clean input for another bulk allocation */
memset(page_list, 0, filled * sizeof(*page_list));
to_fill = min_t(unsigned int, to_alloc,
@@ -352,6 +352,7 @@ static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
u8 index, size_t required_length)
{
+ u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE);
struct mlx5_vhca_data_buffer *buf = migf->buf[index];
u8 chunk_num;

@@ -359,12 +360,11 @@ mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
chunk_num = buf->stop_copy_chunk_num;
buf->migf->buf[index] = NULL;
/* Checking whether the pre-allocated buffer can fit */
- if (buf->allocated_length >= required_length)
+ if (buf->npages >= npages)
return buf;

mlx5vf_put_data_buffer(buf);
- buf = mlx5vf_get_data_buffer(buf->migf, required_length,
- DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE);
if (IS_ERR(buf))
return buf;

@@ -417,7 +417,9 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
u8 *to_buff;
int ret;

- header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
+ BUILD_BUG_ON(size > PAGE_SIZE);
+ header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(header_buf))
return PTR_ERR(header_buf);

@@ -432,7 +434,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
to_buff = kmap_local_page(page);
memcpy(to_buff, &header, sizeof(header));
header_buf->length = sizeof(header);
- data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
+ data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE);
memcpy(to_buff + sizeof(header), &data, sizeof(data));
header_buf->length += sizeof(data);
kunmap_local(to_buff);
@@ -481,15 +483,22 @@ static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,

num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
for (i = 0; i < num_chunks; i++) {
- buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(
+ migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE),
+ DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
}

+ BUILD_BUG_ON(sizeof(struct mlx5_vf_migration_header) >
+ PAGE_SIZE);
migf->buf[i] = buf;
- buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ buf = mlx5vf_get_data_buffer(
+ migf,
+ DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+ PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
@@ -597,7 +606,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
* We finished transferring the current state and the device has a
* dirty state, save a new state to be ready for.
*/
- buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
+ DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
mlx5vf_mark_err(migf);
@@ -718,8 +728,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)

if (track) {
/* leave the allocated buffer ready for the stop-copy phase */
- buf = mlx5vf_alloc_data_buffer(migf,
- migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
+ buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages,
+ DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_pd;
@@ -783,16 +793,15 @@ mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
const char __user **buf, size_t *len,
loff_t *pos, ssize_t *done)
{
+ u32 npages = DIV_ROUND_UP(requested_length, PAGE_SIZE);
int ret;

if (requested_length > MAX_LOAD_SIZE)
return -ENOMEM;

- if (vhca_buf->allocated_length < requested_length) {
- ret = mlx5vf_add_migration_pages(
- vhca_buf,
- DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
- PAGE_SIZE));
+ if (vhca_buf->npages < npages) {
+ ret = mlx5vf_add_migration_pages(vhca_buf,
+ npages - vhca_buf->npages);
if (ret)
return ret;
}
@@ -992,11 +1001,14 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
goto out_unlock;
break;
case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
- if (vhca_buf_header->allocated_length < migf->record_size) {
+ {
+ u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE);
+
+ if (vhca_buf_header->npages < npages) {
mlx5vf_free_data_buffer(vhca_buf_header);

- migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
- migf->record_size, DMA_NONE);
+ migf->buf_header[0] = mlx5vf_alloc_data_buffer(
+ migf, npages, DMA_NONE);
if (IS_ERR(migf->buf_header[0])) {
ret = PTR_ERR(migf->buf_header[0]);
migf->buf_header[0] = NULL;
@@ -1009,6 +1021,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
vhca_buf_header->start_pos = migf->max_pos;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
break;
+ }
case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
&buf, &len, pos, &done);
@@ -1019,12 +1032,13 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
{
u64 size = max(migf->record_size,
migf->stop_copy_prep_size);
+ u32 npages = DIV_ROUND_UP(size, PAGE_SIZE);

- if (vhca_buf->allocated_length < size) {
+ if (vhca_buf->npages < npages) {
mlx5vf_free_data_buffer(vhca_buf);

migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
- size, DMA_TO_DEVICE);
+ npages, DMA_TO_DEVICE);
if (IS_ERR(migf->buf[0])) {
ret = PTR_ERR(migf->buf[0]);
migf->buf[0] = NULL;
@@ -1115,8 +1129,11 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)

migf->buf[0] = buf;
if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
- buf = mlx5vf_alloc_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ buf = mlx5vf_alloc_data_buffer(
+ migf,
+ DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+ PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_buf;
--
2.44.0


2024-03-05 10:26:01

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 12/16] vfio/mlx5: Rewrite create mkey flow to allow better code reuse

From: Leon Romanovsky <[email protected]>

Change the creation of mkey to be performed in multiple steps:
data allocation, DMA setup and actual call to HW to create that mkey.

In this new flow, the whole input to MKEY command is saved to eliminate
the need to keep array of pointers for DMA addresses for receive list
and in the future patches for send list too.

In addition to memory size reduce and elimination of unnecessary data
movements to set MKEY input, the code is prepared for future reuse.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/vfio/pci/mlx5/cmd.c | 149 +++++++++++++++++++++---------------
drivers/vfio/pci/mlx5/cmd.h | 3 +-
2 files changed, 88 insertions(+), 64 deletions(-)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 45104e47b7b2..44762980fcb9 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -300,39 +300,21 @@ static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
return ret;
}

-static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
- struct mlx5_vhca_data_buffer *buf,
- struct mlx5_vhca_recv_buf *recv_buf,
- u32 *mkey)
+static u32 *alloc_mkey_in(u32 npages, u32 pdn)
{
- size_t npages = buf ? buf->npages : recv_buf->npages;
- int err = 0, inlen;
- __be64 *mtt;
+ int inlen;
void *mkc;
u32 *in;

inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
- sizeof(*mtt) * round_up(npages, 2);
+ sizeof(__be64) * round_up(npages, 2);

- in = kvzalloc(inlen, GFP_KERNEL);
+ in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT);
if (!in)
- return -ENOMEM;
+ return NULL;

MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
DIV_ROUND_UP(npages, 2));
- mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
-
- if (buf) {
- struct sg_dma_page_iter dma_iter;
-
- for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
- *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
- } else {
- int i;
-
- for (i = 0; i < npages; i++)
- *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
- }

mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
@@ -346,9 +328,30 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
- err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
- kvfree(in);
- return err;
+
+ return in;
+}
+
+static int create_mkey(struct mlx5_core_dev *mdev, u32 npages,
+ struct mlx5_vhca_data_buffer *buf, u32 *mkey_in,
+ u32 *mkey)
+{
+ __be64 *mtt;
+ int inlen;
+
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+
+ if (buf) {
+ struct sg_dma_page_iter dma_iter;
+
+ for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
+ *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
+ }
+
+ inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
+ sizeof(__be64) * round_up(npages, 2);
+
+ return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
}

static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
@@ -368,13 +371,22 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
if (ret)
return ret;

- ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
- if (ret)
+ buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn);
+ if (!buf->mkey_in) {
+ ret = -ENOMEM;
goto err;
+ }
+
+ ret = create_mkey(mdev, buf->npages, buf, buf->mkey_in, &buf->mkey);
+ if (ret)
+ goto err_create_mkey;

buf->dmaed = true;

return 0;
+
+err_create_mkey:
+ kvfree(buf->mkey_in);
err:
dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
return ret;
@@ -390,6 +402,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)

if (buf->dmaed) {
mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+ kvfree(buf->mkey_in);
dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
buf->dma_dir, 0);
}
@@ -1286,46 +1299,45 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
return -ENOMEM;
}

-static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
- struct mlx5_vhca_recv_buf *recv_buf)
+static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+ u32 *mkey_in)
{
- int i, j;
+ dma_addr_t addr;
+ __be64 *mtt;
+ int i;

- recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
- sizeof(*recv_buf->dma_addrs),
- GFP_KERNEL_ACCOUNT);
- if (!recv_buf->dma_addrs)
- return -ENOMEM;
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);

- for (i = 0; i < recv_buf->npages; i++) {
- recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
- recv_buf->page_list[i],
- 0, PAGE_SIZE,
- DMA_FROM_DEVICE);
- if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
- goto error;
+ for (i = npages - 1; i >= 0; i--) {
+ addr = be64_to_cpu(mtt[i]);
+ dma_unmap_single(mdev->device, addr, PAGE_SIZE,
+ DMA_FROM_DEVICE);
}
- return 0;
-
-error:
- for (j = 0; j < i; j++)
- dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
- PAGE_SIZE, DMA_FROM_DEVICE);
-
- kvfree(recv_buf->dma_addrs);
- return -ENOMEM;
}

-static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
- struct mlx5_vhca_recv_buf *recv_buf)
+static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+ struct page **page_list, u32 *mkey_in)
{
+ dma_addr_t addr;
+ __be64 *mtt;
int i;

- for (i = 0; i < recv_buf->npages; i++)
- dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
- PAGE_SIZE, DMA_FROM_DEVICE);
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+
+ for (i = 0; i < npages; i++) {
+ addr = dma_map_page(mdev->device, page_list[i], 0, PAGE_SIZE,
+ DMA_FROM_DEVICE);
+ if (dma_mapping_error(mdev->device, addr))
+ goto error;
+
+ *mtt++ = cpu_to_be64(addr);
+ }
+
+ return 0;

- kvfree(recv_buf->dma_addrs);
+error:
+ unregister_dma_pages(mdev, i, mkey_in);
+ return -ENOMEM;
}

static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
@@ -1334,7 +1346,8 @@ static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;

mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
- unregister_dma_recv_pages(mdev, recv_buf);
+ unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in);
+ kvfree(recv_buf->mkey_in);
free_recv_pages(&qp->recv_buf);
}

@@ -1350,18 +1363,28 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
if (err < 0)
return err;

- err = register_dma_recv_pages(mdev, recv_buf);
- if (err)
+ recv_buf->mkey_in = alloc_mkey_in(npages, pdn);
+ if (!recv_buf->mkey_in) {
+ err = -ENOMEM;
goto end;
+ }
+
+ err = register_dma_pages(mdev, npages, recv_buf->page_list,
+ recv_buf->mkey_in);
+ if (err)
+ goto err_register_dma;

- err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
+ err = create_mkey(mdev, npages, NULL, recv_buf->mkey_in,
+ &recv_buf->mkey);
if (err)
goto err_create_mkey;

return 0;

err_create_mkey:
- unregister_dma_recv_pages(mdev, recv_buf);
+ unregister_dma_pages(mdev, npages, recv_buf->mkey_in);
+err_register_dma:
+ kvfree(recv_buf->mkey_in);
end:
free_recv_pages(recv_buf);
return err;
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 887267ebbd8a..83728c0669e7 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -62,6 +62,7 @@ struct mlx5_vhca_data_buffer {
u64 length;
u32 npages;
u32 mkey;
+ u32 *mkey_in;
enum dma_data_direction dma_dir;
u8 dmaed:1;
u8 stop_copy_chunk_num;
@@ -137,8 +138,8 @@ struct mlx5_vhca_cq {
struct mlx5_vhca_recv_buf {
u32 npages;
struct page **page_list;
- dma_addr_t *dma_addrs;
u32 next_rq_offset;
+ u32 *mkey_in;
u32 mkey;
};

--
2.44.0


2024-03-05 10:26:53

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 13/16] vfio/mlx5: Explicitly store page list

From: Leon Romanovsky <[email protected]>

As a preparation to removal scatter-gather table and unifying
receive and send list, explicitly store page list.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/vfio/pci/mlx5/cmd.c | 1 +
drivers/vfio/pci/mlx5/cmd.h | 1 +
drivers/vfio/pci/mlx5/main.c | 35 +++++++++++++++++------------------
3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 44762980fcb9..5e2103042d9b 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -411,6 +411,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
__free_page(sg_page_iter_page(&sg_iter));
sg_free_append_table(&buf->table);
+ kvfree(buf->page_list);
kfree(buf);
}

diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 83728c0669e7..815fcb54494d 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -57,6 +57,7 @@ struct mlx5_vf_migration_header {
};

struct mlx5_vhca_data_buffer {
+ struct page **page_list;
struct sg_append_table table;
loff_t start_pos;
u64 length;
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index b11b1c27d284..7ffe24693a55 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -69,44 +69,43 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
unsigned int npages)
{
unsigned int to_alloc = npages;
+ size_t old_size, new_size;
struct page **page_list;
unsigned long filled;
unsigned int to_fill;
int ret;

- to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
- page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
+ to_fill = min_t(unsigned int, npages,
+ PAGE_SIZE / sizeof(*buf->page_list));
+ old_size = buf->npages * sizeof(*buf->page_list);
+ new_size = old_size + to_fill * sizeof(*buf->page_list);
+ page_list = kvrealloc(buf->page_list, old_size, new_size,
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page_list)
return -ENOMEM;

+ buf->page_list = page_list;
+
do {
filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
- page_list);
- if (!filled) {
- ret = -ENOMEM;
- goto err;
- }
+ buf->page_list + buf->npages);
+ if (!filled)
+ return -ENOMEM;
+
to_alloc -= filled;
ret = sg_alloc_append_table_from_pages(
- &buf->table, page_list, filled, 0,
+ &buf->table, buf->page_list + buf->npages, filled, 0,
filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
GFP_KERNEL_ACCOUNT);
-
if (ret)
- goto err;
+ return ret;
+
buf->npages += filled;
- /* clean input for another bulk allocation */
- memset(page_list, 0, filled * sizeof(*page_list));
to_fill = min_t(unsigned int, to_alloc,
- PAGE_SIZE / sizeof(*page_list));
+ PAGE_SIZE / sizeof(*buf->page_list));
} while (to_alloc > 0);

- kvfree(page_list);
return 0;
-
-err:
- kvfree(page_list);
- return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
--
2.44.0


2024-03-05 10:28:04

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 14/16] vfio/mlx5: Convert vfio to use DMA link API

From: Leon Romanovsky <[email protected]>

Remove intermediate scatter-gather table as it is not needed
if DMA link API is used. This conversion reduces drastically
the memory used to manage that table.

Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/vfio/pci/mlx5/cmd.c | 177 ++++++++++++++++-------------------
drivers/vfio/pci/mlx5/cmd.h | 8 +-
drivers/vfio/pci/mlx5/main.c | 50 ++--------
3 files changed, 91 insertions(+), 144 deletions(-)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 5e2103042d9b..cfae03f7b7da 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -332,26 +332,60 @@ static u32 *alloc_mkey_in(u32 npages, u32 pdn)
return in;
}

-static int create_mkey(struct mlx5_core_dev *mdev, u32 npages,
- struct mlx5_vhca_data_buffer *buf, u32 *mkey_in,
+static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in,
u32 *mkey)
{
+ int inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
+ sizeof(__be64) * round_up(npages, 2);
+
+ return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
+}
+
+static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+ u32 *mkey_in, struct dma_iova_attrs *iova)
+{
+ dma_addr_t addr;
__be64 *mtt;
- int inlen;
+ int i;

mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);

- if (buf) {
- struct sg_dma_page_iter dma_iter;
+ for (i = npages - 1; i >= 0; i--) {
+ addr = be64_to_cpu(mtt[i]);
+ dma_unlink_range(iova, addr);
+ }
+ dma_free_iova(iova);
+}
+
+static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+ struct page **page_list, u32 *mkey_in,
+ struct dma_iova_attrs *iova)
+{
+ dma_addr_t addr;
+ __be64 *mtt;
+ int i, err;
+
+ iova->dev = mdev->device;
+ iova->size = npages * PAGE_SIZE;
+ err = dma_alloc_iova(iova);
+ if (err)
+ return err;
+
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+
+ for (i = 0; i < npages; i++) {
+ addr = dma_link_range(page_list[i], 0, iova, i * PAGE_SIZE);
+ if (dma_mapping_error(mdev->device, addr))
+ goto error;

- for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
- *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
+ *mtt++ = cpu_to_be64(addr);
}

- inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
- sizeof(__be64) * round_up(npages, 2);
+ return 0;

- return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
+error:
+ unregister_dma_pages(mdev, i, mkey_in, iova);
+ return -ENOMEM;
}

static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
@@ -367,17 +401,16 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
if (buf->dmaed || !buf->npages)
return -EINVAL;

- ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
- if (ret)
- return ret;
-
buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn);
- if (!buf->mkey_in) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!buf->mkey_in)
+ return -ENOMEM;
+
+ ret = register_dma_pages(mdev, buf->npages, buf->page_list,
+ buf->mkey_in, &buf->iova);
+ if (ret)
+ goto err_register_dma;

- ret = create_mkey(mdev, buf->npages, buf, buf->mkey_in, &buf->mkey);
+ ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey);
if (ret)
goto err_create_mkey;

@@ -386,32 +419,39 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
return 0;

err_create_mkey:
+ unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->iova);
+err_register_dma:
kvfree(buf->mkey_in);
-err:
- dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
return ret;
}

+static void free_page_list(u32 npages, struct page **page_list)
+{
+ int i;
+
+ /* Undo alloc_pages_bulk_array() */
+ for (i = npages - 1; i >= 0; i--)
+ __free_page(page_list[i]);
+
+ kvfree(page_list);
+}
+
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
- struct mlx5_vf_migration_file *migf = buf->migf;
- struct sg_page_iter sg_iter;
+ struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+ struct mlx5_core_dev *mdev = mvdev->mdev;

- lockdep_assert_held(&migf->mvdev->state_mutex);
- WARN_ON(migf->mvdev->mdev_detach);
+ lockdep_assert_held(&mvdev->state_mutex);
+ WARN_ON(mvdev->mdev_detach);

if (buf->dmaed) {
- mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+ mlx5_core_destroy_mkey(mdev, buf->mkey);
+ unregister_dma_pages(mdev, buf->npages, buf->mkey_in,
+ &buf->iova);
kvfree(buf->mkey_in);
- dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
- buf->dma_dir, 0);
}

- /* Undo alloc_pages_bulk_array() */
- for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
- __free_page(sg_page_iter_page(&sg_iter));
- sg_free_append_table(&buf->table);
- kvfree(buf->page_list);
+ free_page_list(buf->npages, buf->page_list);
kfree(buf);
}

@@ -426,7 +466,7 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
if (!buf)
return ERR_PTR(-ENOMEM);

- buf->dma_dir = dma_dir;
+ buf->iova.dir = dma_dir;
buf->migf = migf;
if (npages) {
ret = mlx5vf_add_migration_pages(buf, npages);
@@ -469,7 +509,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,

spin_lock_irq(&migf->list_lock);
list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
- if (buf->dma_dir == dma_dir) {
+ if (buf->iova.dir == dma_dir) {
list_del_init(&buf->buf_elm);
if (buf->npages >= npages) {
spin_unlock_irq(&migf->list_lock);
@@ -1253,17 +1293,6 @@ static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
kfree(qp);
}

-static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
-{
- int i;
-
- /* Undo alloc_pages_bulk_array() */
- for (i = 0; i < recv_buf->npages; i++)
- __free_page(recv_buf->page_list[i]);
-
- kvfree(recv_buf->page_list);
-}
-
static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
unsigned int npages)
{
@@ -1300,56 +1329,16 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
return -ENOMEM;
}

-static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
- u32 *mkey_in)
-{
- dma_addr_t addr;
- __be64 *mtt;
- int i;
-
- mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
-
- for (i = npages - 1; i >= 0; i--) {
- addr = be64_to_cpu(mtt[i]);
- dma_unmap_single(mdev->device, addr, PAGE_SIZE,
- DMA_FROM_DEVICE);
- }
-}
-
-static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
- struct page **page_list, u32 *mkey_in)
-{
- dma_addr_t addr;
- __be64 *mtt;
- int i;
-
- mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
-
- for (i = 0; i < npages; i++) {
- addr = dma_map_page(mdev->device, page_list[i], 0, PAGE_SIZE,
- DMA_FROM_DEVICE);
- if (dma_mapping_error(mdev->device, addr))
- goto error;
-
- *mtt++ = cpu_to_be64(addr);
- }
-
- return 0;
-
-error:
- unregister_dma_pages(mdev, i, mkey_in);
- return -ENOMEM;
-}
-
static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
struct mlx5_vhca_qp *qp)
{
struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;

mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
- unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in);
+ unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in,
+ &recv_buf->iova);
kvfree(recv_buf->mkey_in);
- free_recv_pages(&qp->recv_buf);
+ free_page_list(recv_buf->npages, recv_buf->page_list);
}

static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
@@ -1370,24 +1359,24 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
goto end;
}

+ recv_buf->iova.dir = DMA_FROM_DEVICE;
err = register_dma_pages(mdev, npages, recv_buf->page_list,
- recv_buf->mkey_in);
+ recv_buf->mkey_in, &recv_buf->iova);
if (err)
goto err_register_dma;

- err = create_mkey(mdev, npages, NULL, recv_buf->mkey_in,
- &recv_buf->mkey);
+ err = create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey);
if (err)
goto err_create_mkey;

return 0;

err_create_mkey:
- unregister_dma_pages(mdev, npages, recv_buf->mkey_in);
+ unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->iova);
err_register_dma:
kvfree(recv_buf->mkey_in);
end:
- free_recv_pages(recv_buf);
+ free_page_list(npages, recv_buf->page_list);
return err;
}

diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 815fcb54494d..3a046166d9f2 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -57,22 +57,17 @@ struct mlx5_vf_migration_header {
};

struct mlx5_vhca_data_buffer {
+ struct dma_iova_attrs iova;
struct page **page_list;
- struct sg_append_table table;
loff_t start_pos;
u64 length;
u32 npages;
u32 mkey;
u32 *mkey_in;
- enum dma_data_direction dma_dir;
u8 dmaed:1;
u8 stop_copy_chunk_num;
struct list_head buf_elm;
struct mlx5_vf_migration_file *migf;
- /* Optimize mlx5vf_get_migration_page() for sequential access */
- struct scatterlist *last_offset_sg;
- unsigned int sg_last_entry;
- unsigned long last_offset;
};

struct mlx5vf_async_data {
@@ -137,6 +132,7 @@ struct mlx5_vhca_cq {
};

struct mlx5_vhca_recv_buf {
+ struct dma_iova_attrs iova;
u32 npages;
struct page **page_list;
u32 next_rq_offset;
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 7ffe24693a55..668c28bc429c 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -34,35 +34,10 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
core_device);
}

-struct page *
-mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
- unsigned long offset)
+struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+ unsigned long offset)
{
- unsigned long cur_offset = 0;
- struct scatterlist *sg;
- unsigned int i;
-
- /* All accesses are sequential */
- if (offset < buf->last_offset || !buf->last_offset_sg) {
- buf->last_offset = 0;
- buf->last_offset_sg = buf->table.sgt.sgl;
- buf->sg_last_entry = 0;
- }
-
- cur_offset = buf->last_offset;
-
- for_each_sg(buf->last_offset_sg, sg,
- buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
- if (offset < sg->length + cur_offset) {
- buf->last_offset_sg = sg;
- buf->sg_last_entry += i;
- buf->last_offset = cur_offset;
- return nth_page(sg_page(sg),
- (offset - cur_offset) / PAGE_SIZE);
- }
- cur_offset += sg->length;
- }
- return NULL;
+ return buf->page_list[offset / PAGE_SIZE];
}

int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
@@ -72,13 +47,9 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
size_t old_size, new_size;
struct page **page_list;
unsigned long filled;
- unsigned int to_fill;
- int ret;

- to_fill = min_t(unsigned int, npages,
- PAGE_SIZE / sizeof(*buf->page_list));
old_size = buf->npages * sizeof(*buf->page_list);
- new_size = old_size + to_fill * sizeof(*buf->page_list);
+ new_size = old_size + to_alloc * sizeof(*buf->page_list);
page_list = kvrealloc(buf->page_list, old_size, new_size,
GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page_list)
@@ -87,22 +58,13 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
buf->page_list = page_list;

do {
- filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
+ filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_alloc,
buf->page_list + buf->npages);
if (!filled)
return -ENOMEM;

to_alloc -= filled;
- ret = sg_alloc_append_table_from_pages(
- &buf->table, buf->page_list + buf->npages, filled, 0,
- filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
- GFP_KERNEL_ACCOUNT);
- if (ret)
- return ret;
-
buf->npages += filled;
- to_fill = min_t(unsigned int, to_alloc,
- PAGE_SIZE / sizeof(*buf->page_list));
} while (to_alloc > 0);

return 0;
@@ -164,7 +126,7 @@ static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
struct mlx5_vf_migration_file *migf = vhca_buf->migf;

if (vhca_buf->stop_copy_chunk_num) {
- bool is_header = vhca_buf->dma_dir == DMA_NONE;
+ bool is_header = vhca_buf->iova.dir == DMA_NONE;
u8 chunk_num = vhca_buf->stop_copy_chunk_num;
size_t next_required_umem_size = 0;

--
2.44.0


2024-03-05 10:28:52

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 15/16] block: add dma_link_range() based API

From: Chaitanya Kulkarni <[email protected]>

Add two helper functions that are needed to calculate the total DMA
length of the request blk_rq_get_dma_length() and to create DMA
mapping blk_rq_dma_map().

blk_rq_get_dma_length() is used to get the total length of the request,
when driver is allocating IOVA space for this request with the call to
dma_alloc_iova(). This length is then initialized to the iova->size and
passed to allocate iova call chain :-
dma_map_ops->allov_iova()
iommu_dma_alloc_iova()
alloc_iova_fast()
iova_rcache_get()
OR
alloc_iova()

blk_rq_dma_map() iterates through bvec list and creates DMA mapping
for each page using iova parameter with the help of dma_link_range().
Note that @iova is allocated & pre-initialized using dma_alloc_iova()
by the caller. After creating a mapping for each page, call into the
callback function @cb provided by the drive with a mapped DMA address
for this page, offset into the iova space (needed at the time of
unlink), length of the mapped page, and page number that is mapped in
this request. Driver is responsible for using this DMA address to
complete the mapping of underlying protocol-specific data structures,
such as NVMe PRPs or NVMe SGLs. This callback approach allows us to
iterate bvec list only once to create bvec to DMA mapping and use that
DMA address in driver to build the protocol-specific data structure,
essentially mapping one bvec page at a time to DMA address and using
that DMA address to create underlying protocol-specific data structures.
Finally, returning the number of linked count.

Signed-off-by: Chaitanya Kulkarni <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
---
block/blk-merge.c | 156 +++++++++++++++++++++++++++++++++++++++++
include/linux/blk-mq.h | 9 +++
2 files changed, 165 insertions(+)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2d470cf2173e..63effc8ac1db 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -583,6 +583,162 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL(__blk_rq_map_sg);

+static dma_addr_t blk_dma_link_page(struct page *page, unsigned int page_offset,
+ struct dma_iova_attrs *iova,
+ dma_addr_t dma_offset)
+{
+ dma_addr_t dma_addr;
+ int ret;
+
+ dma_addr = dma_link_range(page, page_offset, iova, dma_offset);
+ ret = dma_mapping_error(iova->dev, dma_addr);
+ if (ret) {
+ pr_err("dma_mapping_err %d dma_addr 0x%llx dma_offset %llu\n",
+ ret, dma_addr, dma_offset);
+ /* better way ? */
+ dma_addr = 0;
+ }
+ return dma_addr;
+}
+
+/**
+ * blk_rq_dma_map: block layer request to DMA mapping helper.
+ *
+ * @req : [in] request to be mapped
+ * @cb : [in] callback to be called for each bvec mapped bvec into
+ * underlaying driver.
+ * @cb_data : [in] callback data to be passed, privete to the underlaying
+ * driver.
+ * @iova : [in] iova to be used to create DMA mapping for this request's
+ * bvecs.
+ * Description:
+ * Iterates through bvec list and create dma mapping between each bvec page
+ * using @iova with dma_link_range(). Note that @iova needs to be allocated and
+ * pre-initialized using dma_alloc_iova() by the caller. After creating
+ * a mapping for each page, call into the callback function @cb provided by
+ * driver with mapped dma address for this bvec, offset into iova space, length
+ * of the mapped page, and bvec number that is mapped in this requets. Driver is
+ * responsible for using this dma address to complete the mapping of underlaying
+ * protocol specific data structure, such as NVMe PRPs or NVMe SGLs. This
+ * callback approach allows us to iterate bvec list only once to create bvec to
+ * DMA mapping & use that dma address in the driver to build the protocol
+ * specific data structure, essentially mapping one bvec page at a time to DMA
+ * address and use that DMA address to create underlaying protocol specific
+ * data structure.
+ *
+ * Caller needs to ensure @iova is initialized & allovated with using
+ * dma_alloc_iova().
+ */
+int blk_rq_dma_map(struct request *req, driver_map_cb cb, void *cb_data,
+ struct dma_iova_attrs *iova)
+{
+ dma_addr_t curr_dma_offset = 0;
+ dma_addr_t prev_dma_addr = 0;
+ dma_addr_t dma_addr;
+ size_t prev_dma_len = 0;
+ struct req_iterator iter;
+ struct bio_vec bv;
+ int linked_cnt = 0;
+
+ rq_for_each_bvec(bv, req, iter) {
+ if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+ curr_dma_offset = prev_dma_addr + prev_dma_len;
+
+ dma_addr = blk_dma_link_page(bv.bv_page, bv.bv_offset,
+ iova, curr_dma_offset);
+ if (!dma_addr)
+ break;
+
+ cb(cb_data, linked_cnt, dma_addr, curr_dma_offset,
+ bv.bv_len);
+
+ prev_dma_len = bv.bv_len;
+ prev_dma_addr = dma_addr;
+ linked_cnt++;
+ } else {
+ unsigned nbytes = bv.bv_len;
+ unsigned total = 0;
+ unsigned offset, len;
+
+ while (nbytes > 0) {
+ struct page *page = bv.bv_page;
+
+ offset = bv.bv_offset + total;
+ len = min(get_max_segment_size(&req->q->limits,
+ page, offset),
+ nbytes);
+
+ page += (offset >> PAGE_SHIFT);
+ offset &= ~PAGE_MASK;
+
+ curr_dma_offset = prev_dma_addr + prev_dma_len;
+
+ dma_addr = blk_dma_link_page(page, offset,
+ iova,
+ curr_dma_offset);
+ if (!dma_addr)
+ break;
+
+ cb(cb_data, linked_cnt, dma_addr,
+ curr_dma_offset, len);
+
+ total += len;
+ nbytes -= len;
+
+ prev_dma_len = len;
+ prev_dma_addr = dma_addr;
+ linked_cnt++;
+ }
+ }
+ }
+ return linked_cnt;
+}
+EXPORT_SYMBOL_GPL(blk_rq_dma_map);
+
+/*
+ * Calculate total DMA length needed to satisfy this request.
+ */
+size_t blk_rq_get_dma_length(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct bio *bio = rq->bio;
+ unsigned int offset, len;
+ struct bvec_iter iter;
+ size_t dma_length = 0;
+ struct bio_vec bvec;
+
+ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+ return rq->special_vec.bv_len;
+
+ if (!rq->bio)
+ return 0;
+
+ for_each_bio(bio) {
+ bio_for_each_bvec(bvec, bio, iter) {
+ unsigned int nbytes = bvec.bv_len;
+ unsigned int total = 0;
+
+ if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE) {
+ dma_length += bvec.bv_len;
+ continue;
+ }
+
+ while (nbytes > 0) {
+ offset = bvec.bv_offset + total;
+ len = min(get_max_segment_size(&q->limits,
+ bvec.bv_page,
+ offset), nbytes);
+ total += len;
+ nbytes -= len;
+ dma_length += len;
+ }
+ }
+ }
+
+ return dma_length;
+}
+EXPORT_SYMBOL(blk_rq_get_dma_length);
+
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7a8150a5f051..80b9c7f2c3a0 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -8,6 +8,7 @@
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
+#include <linux/dma-mapping.h>

struct blk_mq_tags;
struct blk_flush_queue;
@@ -1144,7 +1145,15 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,

return __blk_rq_map_sg(q, rq, sglist, &last_sg);
}
+
+typedef void (*driver_map_cb)(void *cb_data, u32 cnt, dma_addr_t dma_addr,
+ dma_addr_t offset, u32 len);
+
+int blk_rq_dma_map(struct request *req, driver_map_cb cb, void *cb_data,
+ struct dma_iova_attrs *iova);
+
void blk_dump_rq_flags(struct request *, char *);
+size_t blk_rq_get_dma_length(struct request *rq);

#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_rq_zone_no(struct request *rq)
--
2.44.0


2024-03-05 10:30:10

by Leon Romanovsky

[permalink] [raw]
Subject: [RFC 16/16] nvme-pci: use blk_rq_dma_map() for NVMe SGL

From: Chaitanya Kulkarni <[email protected]>

Update nvme_iod structure to hold iova, list of DMA linked addresses and
total linked count, first one is needed in the request submission path
to create a request to DMA mapping and last two are needed in the
request completion path to remove the DMA mapping. In nvme_map_data()
initialize iova with device, direction, and iova dma length with the
help of blk_rq_get_dma_length(). Allocate iova using dma_alloc_iova().
and call in nvme_pci_setup_sgls().

Call newly added blk_rq_dma_map() to create request to DMA mapping and
provide a callback function nvme_pci_sgl_map(). In the callback
function initialize NVMe SGL dma addresses.

Finally in nvme_unmap_data() unlink the dma address and free iova.

Full disclosure:-
-----------------

This is an RFC to demonstrate the newly added DMA APIs can be used to
map/unmap bvecs without the use of sg list, hence I've modified the pci
code to only handle SGLs for now. Once we have some agreement on the
structure of new DMA API I'll add support for PRPs along with all the
optimization that I've removed from the code for this RFC for NVMe SGLs
and PRPs.

I was able to run fio verification job successfully :-

$ fio fio/verify.fio --ioengine=io_uring --filename=/dev/nvme0n1
--loops=10
write-and-verify: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B,
(T) 8192B-8192B, ioengine=io_uring, iodepth=16
fio-3.36
Starting 1 process
Jobs: 1 (f=1): [V(1)][81.6%][r=12.2MiB/s][r=1559 IOPS][eta 03m:00s]
write-and-verify: (groupid=0, jobs=1): err= 0: pid=4435: Mon Mar 4 20:54:48 2024
read: IOPS=2789, BW=21.8MiB/s (22.9MB/s)(6473MiB/297008msec)
slat (usec): min=4, max=5124, avg=356.51, stdev=604.30
clat (nsec): min=1593, max=23376k, avg=5377076.99, stdev=2039189.93
lat (usec): min=493, max=23407, avg=5733.58, stdev=2103.22
clat percentiles (usec):
| 1.00th=[ 1172], 5.00th=[ 2114], 10.00th=[ 2835], 20.00th=[ 3654],
| 30.00th=[ 4228], 40.00th=[ 4752], 50.00th=[ 5276], 60.00th=[ 5800],
| 70.00th=[ 6325], 80.00th=[ 7046], 90.00th=[ 8094], 95.00th=[ 8979],
| 99.00th=[10421], 99.50th=[11076], 99.90th=[12780], 99.95th=[14222],
| 99.99th=[16909]
write: IOPS=2608, BW=20.4MiB/s (21.4MB/s)(10.0GiB/502571msec); 0 zone resets
slat (usec): min=4, max=5787, avg=382.68, stdev=649.01
clat (nsec): min=521, max=23650k, avg=5751363.17, stdev=2676065.35
lat (usec): min=95, max=23674, avg=6134.04, stdev=2813.48
clat percentiles (usec):
| 1.00th=[ 709], 5.00th=[ 1270], 10.00th=[ 1958], 20.00th=[ 3261],
| 30.00th=[ 4228], 40.00th=[ 5014], 50.00th=[ 5800], 60.00th=[ 6521],
| 70.00th=[ 7373], 80.00th=[ 8225], 90.00th=[ 9241], 95.00th=[ 9896],
| 99.00th=[11469], 99.50th=[11863], 99.90th=[13960], 99.95th=[15270],
| 99.99th=[17695]
bw ( KiB/s): min= 1440, max=132496, per=99.28%, avg=20715.88, stdev=13123.13, samples=1013
iops : min= 180, max=16562, avg=2589.34, stdev=1640.39, samples=1013
lat (nsec) : 750=0.01%
lat (usec) : 2=0.01%, 4=0.01%, 100=0.01%, 250=0.01%, 500=0.07%
lat (usec) : 750=0.79%, 1000=1.22%
lat (msec) : 2=5.94%, 4=18.87%, 10=69.53%, 20=3.58%, 50=0.01%
cpu : usr=1.01%, sys=98.95%, ctx=1591, majf=0, minf=2286
IO depths : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, >=64=0.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0%
issued rwts: total=828524,1310720,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
READ: bw=21.8MiB/s (22.9MB/s), 21.8MiB/s-21.8MiB/s (22.9MB/s-22.9MB/s),
io=6473MiB (6787MB), run=297008-297008msec
WRITE: bw=20.4MiB/s (21.4MB/s), 20.4MiB/s-20.4MiB/s (21.4MB/s-21.4MB/s),
io=10.0GiB (10.7GB), run=502571-502571msec

Disk stats (read/write):
nvme0n1: ios=829189/1310720, sectors=13293416/20971520, merge=0/0,
ticks=836561/1340351, in_queue=2176913, util=99.30%

Signed-off-by: Chaitanya Kulkarni <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/nvme/host/pci.c | 220 +++++++++-------------------------------
1 file changed, 49 insertions(+), 171 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e6267a6aa380..140939228409 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -236,7 +236,9 @@ struct nvme_iod {
unsigned int dma_len; /* length of single DMA segment mapping */
dma_addr_t first_dma;
dma_addr_t meta_dma;
- struct sg_table sgt;
+ struct dma_iova_attrs iova;
+ dma_addr_t dma_link_address[128];
+ u16 nr_dma_link_address;
union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
};

@@ -521,25 +523,10 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
return true;
}

-static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
-{
- const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- dma_addr_t dma_addr = iod->first_dma;
- int i;
-
- for (i = 0; i < iod->nr_allocations; i++) {
- __le64 *prp_list = iod->list[i].prp_list;
- dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
-
- dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
- dma_addr = next_dma_addr;
- }
-}
-
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ u16 i;

if (iod->dma_len) {
dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
@@ -547,9 +534,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
return;
}

- WARN_ON_ONCE(!iod->sgt.nents);
-
- dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
+ for (i = 0; i < iod->nr_dma_link_address; i++)
+ dma_unlink_range(&iod->iova, iod->dma_link_address[i]);

if (iod->nr_allocations == 0)
dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
@@ -557,120 +543,15 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
else if (iod->nr_allocations == 1)
dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
iod->first_dma);
- else
- nvme_free_prps(dev, req);
- mempool_free(iod->sgt.sgl, dev->iod_mempool);
-}
-
-static void nvme_print_sgl(struct scatterlist *sgl, int nents)
-{
- int i;
- struct scatterlist *sg;
-
- for_each_sg(sgl, sg, nents, i) {
- dma_addr_t phys = sg_phys(sg);
- pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
- "dma_address:%pad dma_length:%d\n",
- i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
- sg_dma_len(sg));
- }
-}
-
-static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
- int length = blk_rq_payload_bytes(req);
- struct scatterlist *sg = iod->sgt.sgl;
- int dma_len = sg_dma_len(sg);
- u64 dma_addr = sg_dma_address(sg);
- int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
- __le64 *prp_list;
- dma_addr_t prp_dma;
- int nprps, i;
-
- length -= (NVME_CTRL_PAGE_SIZE - offset);
- if (length <= 0) {
- iod->first_dma = 0;
- goto done;
- }
-
- dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
- if (dma_len) {
- dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
- } else {
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
- }
-
- if (length <= NVME_CTRL_PAGE_SIZE) {
- iod->first_dma = dma_addr;
- goto done;
- }
-
- nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
- if (nprps <= (256 / 8)) {
- pool = dev->prp_small_pool;
- iod->nr_allocations = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->nr_allocations = 1;
- }
-
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
- if (!prp_list) {
- iod->nr_allocations = -1;
- return BLK_STS_RESOURCE;
- }
- iod->list[0].prp_list = prp_list;
- iod->first_dma = prp_dma;
- i = 0;
- for (;;) {
- if (i == NVME_CTRL_PAGE_SIZE >> 3) {
- __le64 *old_prp_list = prp_list;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
- if (!prp_list)
- goto free_prps;
- iod->list[iod->nr_allocations++].prp_list = prp_list;
- prp_list[0] = old_prp_list[i - 1];
- old_prp_list[i - 1] = cpu_to_le64(prp_dma);
- i = 1;
- }
- prp_list[i++] = cpu_to_le64(dma_addr);
- dma_len -= NVME_CTRL_PAGE_SIZE;
- dma_addr += NVME_CTRL_PAGE_SIZE;
- length -= NVME_CTRL_PAGE_SIZE;
- if (length <= 0)
- break;
- if (dma_len > 0)
- continue;
- if (unlikely(dma_len < 0))
- goto bad_sgl;
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
- }
-done:
- cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl));
- cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
- return BLK_STS_OK;
-free_prps:
- nvme_free_prps(dev, req);
- return BLK_STS_RESOURCE;
-bad_sgl:
- WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
- "Invalid SGL for payload:%d nents:%d\n",
- blk_rq_payload_bytes(req), iod->sgt.nents);
- return BLK_STS_IOERR;
+ dma_free_iova(&iod->iova);
}

static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
- struct scatterlist *sg)
+ dma_addr_t dma_addr,
+ unsigned int dma_len)
{
- sge->addr = cpu_to_le64(sg_dma_address(sg));
- sge->length = cpu_to_le32(sg_dma_len(sg));
+ sge->addr = cpu_to_le64(dma_addr);
+ sge->length = cpu_to_le32(dma_len);
sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

@@ -682,25 +563,37 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}

+struct nvme_pci_sgl_map_data {
+ struct nvme_iod *iod;
+ struct nvme_sgl_desc *sgl_list;
+};
+
+static void nvme_pci_sgl_map(void *data, u32 cnt, dma_addr_t dma_addr,
+ dma_addr_t offset, u32 len)
+{
+ struct nvme_pci_sgl_map_data *d = data;
+ struct nvme_sgl_desc *sgl_list = d->sgl_list;
+ struct nvme_iod *iod = d->iod;
+
+ nvme_pci_sgl_set_data(&sgl_list[cnt], dma_addr, len);
+ iod->dma_link_address[cnt] = offset;
+ iod->nr_dma_link_address++;
+}
+
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
struct request *req, struct nvme_rw_command *cmd)
{
+ unsigned int entries = blk_rq_nr_phys_segments(req);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
struct nvme_sgl_desc *sg_list;
- struct scatterlist *sg = iod->sgt.sgl;
- unsigned int entries = iod->sgt.nents;
+ struct dma_pool *pool;
dma_addr_t sgl_dma;
- int i = 0;
+ int linked_count;
+ struct nvme_pci_sgl_map_data data;

/* setting the transfer type as SGL */
cmd->flags = NVME_CMD_SGL_METABUF;

- if (entries == 1) {
- nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
- return BLK_STS_OK;
- }
-
if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
pool = dev->prp_small_pool;
iod->nr_allocations = 0;
@@ -718,11 +611,13 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
iod->list[0].sg_list = sg_list;
iod->first_dma = sgl_dma;

- nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
- do {
- nvme_pci_sgl_set_data(&sg_list[i++], sg);
- sg = sg_next(sg);
- } while (--entries > 0);
+ data.iod = iod;
+ data.sgl_list = sg_list;
+
+ linked_count = blk_rq_dma_map(req, nvme_pci_sgl_map, &data,
+ &iod->iova);
+
+ nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, linked_count);

return BLK_STS_OK;
}
@@ -788,36 +683,20 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
&cmnd->rw, &bv);
}
}
-
- iod->dma_len = 0;
- iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
- if (!iod->sgt.sgl)
+ iod->iova.dev = dev->dev;
+ iod->iova.dir = rq_dma_dir(req);
+ iod->iova.attrs = DMA_ATTR_NO_WARN;
+ iod->iova.size = blk_rq_get_dma_length(req);
+ if (!iod->iova.size)
return BLK_STS_RESOURCE;
- sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
- iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl);
- if (!iod->sgt.orig_nents)
- goto out_free_sg;

- rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
- DMA_ATTR_NO_WARN);
- if (rc) {
- if (rc == -EREMOTEIO)
- ret = BLK_STS_TARGET;
- goto out_free_sg;
- }
+ rc = dma_alloc_iova(&iod->iova);
+ if (rc)
+ return BLK_STS_RESOURCE;

- if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
- else
- ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
- if (ret != BLK_STS_OK)
- goto out_unmap_sg;
- return BLK_STS_OK;
+ iod->dma_len = 0;

-out_unmap_sg:
- dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-out_free_sg:
- mempool_free(iod->sgt.sgl, dev->iod_mempool);
+ ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
return ret;
}

@@ -841,7 +720,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)

iod->aborted = false;
iod->nr_allocations = -1;
- iod->sgt.nents = 0;

ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
--
2.44.0