2021-06-29 08:41:26

by Leon Romanovsky

Subject: [PATCH rdma-next v1 0/2] SG fix together with update to RDMA umem

From: Leon Romanovsky <[email protected]>

Changelog:
v1:
* Fixed sg_page with a _dma_ API in the umem.c
v0: https://lore.kernel.org/lkml/[email protected]

Maor Gottlieb (2):
lib/scatterlist: Fix wrong update of orig_nents
RDMA: Use dma_map_sgtable for map umem pages

drivers/infiniband/core/umem.c | 29 +++++++++---------------
drivers/infiniband/core/umem_dmabuf.c | 1 -
drivers/infiniband/hw/mlx4/mr.c | 4 ++--
drivers/infiniband/hw/mlx5/mr.c | 3 ++-
drivers/infiniband/sw/rdmavt/mr.c | 2 +-
drivers/infiniband/sw/rxe/rxe_mr.c | 3 ++-
include/linux/scatterlist.h | 8 +++++--
include/rdma/ib_umem.h | 5 ++---
include/rdma/ib_verbs.h | 28 +++++++++++++++++++++++
lib/scatterlist.c | 32 +++++++--------------------
10 files changed, 62 insertions(+), 53 deletions(-)

--
2.31.1


2021-06-29 08:42:07

by Leon Romanovsky

Subject: [PATCH rdma-next v1 2/2] RDMA: Use dma_map_sgtable for map umem pages

From: Maor Gottlieb <[email protected]>

In order to avoid incorrect usage of the sg_table fields, change umem to
use dma_map_sgtable to map the pages for DMA. Since dma_map_sgtable
updates the nents field (the number of DMA entries), there is no longer
any need for the nmap variable, hence do some cleanups accordingly.
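
For readers less familiar with the sgtable helpers, the following is an
illustrative sketch (not part of the patch; the function name is made up)
of how a driver walks the DMA-mapped side of a umem after this change:

#include <linux/printk.h>
#include <linux/scatterlist.h>
#include <rdma/ib_umem.h>

/* Hypothetical example: iterate the DMA entries of an already-mapped umem.
 * dma_map_sgtable() stores the number of DMA entries in sg_head.nents, so
 * no separate nmap counter is needed anymore.
 */
static void example_walk_umem_dma(struct ib_umem *umem)
{
        struct scatterlist *sg;
        unsigned int i;

        for_each_sgtable_dma_sg(&umem->sg_head, sg, i) {
                dma_addr_t addr = sg_dma_address(sg);
                unsigned int len = sg_dma_len(sg);

                /* program addr/len into the device's address translation */
                pr_debug("dma chunk %pad + %u\n", &addr, len);
        }
}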

Signed-off-by: Maor Gottlieb <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
---
drivers/infiniband/core/umem.c | 29 ++++++++++-----------------
drivers/infiniband/core/umem_dmabuf.c | 1 -
drivers/infiniband/hw/mlx4/mr.c | 4 ++--
drivers/infiniband/hw/mlx5/mr.c | 3 ++-
drivers/infiniband/sw/rdmavt/mr.c | 2 +-
drivers/infiniband/sw/rxe/rxe_mr.c | 3 ++-
include/rdma/ib_umem.h | 5 ++---
include/rdma/ib_verbs.h | 28 ++++++++++++++++++++++++++
8 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 0eb40025075f..f620d5b6b0e1 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -51,11 +51,11 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
struct scatterlist *sg;
unsigned int i;

- if (umem->nmap > 0)
- ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
- DMA_BIDIRECTIONAL);
+ if (dirty)
+ ib_dma_unmap_sgtable_attrs(dev, &umem->sg_head,
+ DMA_BIDIRECTIONAL, 0);

- for_each_sg(umem->sg_head.sgl, sg, umem->sg_nents, i)
+ for_each_sgtable_sg(&umem->sg_head, sg, i)
unpin_user_page_range_dirty_lock(sg_page(sg),
DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);

@@ -111,7 +111,7 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
/* offset into first SGL */
pgoff = umem->address & ~PAGE_MASK;

- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+ for_each_sgtable_dma_sg(&umem->sg_head, sg, i) {
/* Walk SGL and reduce max page size if VA/PA bits differ
* for any address.
*/
@@ -121,7 +121,7 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
* the maximum possible page size as the low bits of the iova
* must be zero when starting the next chunk.
*/
- if (i != (umem->nmap - 1))
+ if (i != (umem->sg_head.nents - 1))
mask |= va;
pgoff = 0;
}
@@ -230,7 +230,6 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
0, ret << PAGE_SHIFT,
ib_dma_max_seg_size(device), sg, npages,
GFP_KERNEL);
- umem->sg_nents = umem->sg_head.nents;
if (IS_ERR(sg)) {
unpin_user_pages_dirty_lock(page_list, ret, 0);
ret = PTR_ERR(sg);
@@ -241,16 +240,10 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
if (access & IB_ACCESS_RELAXED_ORDERING)
dma_attr |= DMA_ATTR_WEAK_ORDERING;

- umem->nmap =
- ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
- DMA_BIDIRECTIONAL, dma_attr);
-
- if (!umem->nmap) {
- ret = -ENOMEM;
+ ret = ib_dma_map_sgtable_attrs(device, &umem->sg_head,
+ DMA_BIDIRECTIONAL, dma_attr);
+ if (ret)
goto umem_release;
- }
-
- ret = 0;
goto out;

umem_release:
@@ -310,8 +303,8 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
return -EINVAL;
}

- ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
- offset + ib_umem_offset(umem));
+ ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_head.orig_nents,
+ dst, length, offset + ib_umem_offset(umem));

if (ret < 0)
return ret;
diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
index 0d65ce146fc4..cd2dd1f39aa7 100644
--- a/drivers/infiniband/core/umem_dmabuf.c
+++ b/drivers/infiniband/core/umem_dmabuf.c
@@ -57,7 +57,6 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)

umem_dmabuf->umem.sg_head.sgl = umem_dmabuf->first_sg;
umem_dmabuf->umem.sg_head.nents = nmap;
- umem_dmabuf->umem.nmap = nmap;
umem_dmabuf->sgt = sgt;

wait_fence:
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 50becc0e4b62..ab5dc8eac7f8 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -200,7 +200,7 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
mtt_shift = mtt->page_shift;
mtt_size = 1ULL << mtt_shift;

- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+ for_each_sgtable_dma_sg(&umem->sg_head, sg, i) {
if (cur_start_addr + len == sg_dma_address(sg)) {
/* still the same block */
len += sg_dma_len(sg);
@@ -273,7 +273,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,

*num_of_mtts = ib_umem_num_dma_blocks(umem, PAGE_SIZE);

- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+ for_each_sgtable_dma_sg(&umem->sg_head, sg, i) {
/*
* Initialization - save the first chunk start as the
* current_block_start - block means contiguous pages.
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 3263851ea574..4954fb9eb6dc 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1226,7 +1226,8 @@ int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
orig_sg_length = sg.length;

cur_mtt = mtt;
- rdma_for_each_block (mr->umem->sg_head.sgl, &biter, mr->umem->nmap,
+ rdma_for_each_block (mr->umem->sg_head.sgl, &biter,
+ mr->umem->sg_head.nents,
BIT(mr->page_shift)) {
if (cur_mtt == (void *)mtt + sg.length) {
dma_sync_single_for_device(ddev, sg.addr, sg.length,
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 34b7af6ab9c2..d955c8c4acc4 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -410,7 +410,7 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->mr.page_shift = PAGE_SHIFT;
m = 0;
n = 0;
- for_each_sg_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ for_each_sg_page (umem->sg_head.sgl, &sg_iter, umem->sg_head.nents, 0) {
void *vaddr;

vaddr = page_address(sg_page_iter_page(&sg_iter));
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 6aabcb4de235..a269085e0946 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -142,7 +142,8 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
if (length > 0) {
buf = map[0]->buf;

- for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ for_each_sg_page(umem->sg_head.sgl, &sg_iter,
+ umem->sg_head.nents, 0) {
if (num_buf >= RXE_BUF_PER_MAP) {
map++;
buf = map[0]->buf;
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 676c57f5ca80..c754b1a31cc9 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -27,8 +27,6 @@ struct ib_umem {
u32 is_dmabuf : 1;
struct work_struct work;
struct sg_table sg_head;
- int nmap;
- unsigned int sg_nents;
};

struct ib_umem_dmabuf {
@@ -77,7 +75,8 @@ static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter,
struct ib_umem *umem,
unsigned long pgsz)
{
- __rdma_block_iter_start(biter, umem->sg_head.sgl, umem->nmap, pgsz);
+ __rdma_block_iter_start(biter, umem->sg_head.sgl, umem->sg_head.nents,
+ pgsz);
}

/**
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 371df1c80aeb..2dba30849731 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4057,6 +4057,34 @@ static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
dma_attrs);
}

+/**
+ * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses
+ * @dev: The device for which the DMA addresses are to be created
+ * @sgt: The sg_table object describing the buffer
+ * @direction: The direction of the DMA
+ * @attrs: Optional DMA attributes for the map operation
+ */
+static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev,
+ struct sg_table *sgt,
+ enum dma_data_direction direction,
+ unsigned long dma_attrs)
+{
+ if (ib_uses_virt_dma(dev)) {
+ ib_dma_virt_map_sg(dev, sgt->sgl, sgt->orig_nents);
+ return 0;
+ }
+ return dma_map_sgtable(dev->dma_device, sgt, direction, dma_attrs);
+}
+
+static inline void ib_dma_unmap_sgtable_attrs(struct ib_device *dev,
+ struct sg_table *sgt,
+ enum dma_data_direction direction,
+ unsigned long dma_attrs)
+{
+ if (!ib_uses_virt_dma(dev))
+ dma_unmap_sgtable(dev->dma_device, sgt, direction, dma_attrs);
+}
+
/**
* ib_dma_map_sg - Map a scatter/gather list to DMA addresses
* @dev: The device for which the DMA addresses are to be created
--
2.31.1

2021-06-29 08:42:56

by Leon Romanovsky

Subject: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

From: Maor Gottlieb <[email protected]>

orig_nents should represent the number of entries with pages,
but __sg_alloc_table_from_pages sets orig_nents to the total number
of entries in the table. This is wrong when the API is used for
dynamic allocation where not all the table entries are mapped with
pages. It wasn't observed until now, since the RDMA umem code, which
uses this API in the dynamic form, doesn't use orig_nents implicitly
or explicitly through the scatterlist APIs.

Fix it by:
1. Set orig_nents as the number of entries with pages also in
__sg_alloc_table_from_pages.
2. Add a new field, total_nents, to reflect the total number of entries
in the table. This is required for the release flow (sg_free_table).
This field should be used internally only by scatterlist.
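
For illustration only, a simplified sketch (hypothetical function, assumes
npages >= 2 and skips most error handling) of the append-style usage this
fix targets; after the fix, sgt->orig_nents counts only the entries that
actually received pages, while the size of the allocation that
sg_free_table() must release is tracked in the new internal total_nents:

#include <linux/err.h>
#include <linux/scatterlist.h>

/* Hypothetical sketch: add npages pinned pages to sgt in two batches,
 * the way RDMA umem appends to a dynamically grown table.
 */
static int example_append_pages(struct sg_table *sgt, struct page **pages,
                                unsigned int npages)
{
        unsigned int half = npages / 2;
        struct scatterlist *sg;

        /* First batch; a non-zero left_pages keeps the table open for appending. */
        sg = __sg_alloc_table_from_pages(sgt, pages, half, 0,
                                         (unsigned long)half << PAGE_SHIFT,
                                         UINT_MAX, NULL, npages - half,
                                         GFP_KERNEL);
        if (IS_ERR(sg))
                return PTR_ERR(sg);

        /* Second batch, appended after the entries filled so far. */
        sg = __sg_alloc_table_from_pages(sgt, pages + half, npages - half, 0,
                                         (unsigned long)(npages - half) << PAGE_SHIFT,
                                         UINT_MAX, sg, 0, GFP_KERNEL);
        return IS_ERR(sg) ? PTR_ERR(sg) : 0;
}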

Fixes: 07da1223ec93 ("lib/scatterlist: Add support in dynamic allocation of SG table from pages")
Signed-off-by: Maor Gottlieb <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
---
include/linux/scatterlist.h | 8 ++++++--
lib/scatterlist.c | 32 ++++++++------------------------
2 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 6f70572b2938..1c889141eb91 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -35,8 +35,12 @@ struct scatterlist {

struct sg_table {
struct scatterlist *sgl; /* the list */
- unsigned int nents; /* number of mapped entries */
- unsigned int orig_nents; /* original size of list */
+ unsigned int nents; /* number of DMA mapped entries */
+ unsigned int orig_nents; /* number of CPU mapped entries */
+ /* The fields below should be used internally only by
+ * scatterlist implementation.
+ */
+ unsigned int total_nents; /* number of total entries in the table */
};

/*
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index a59778946404..6db70a1e7dd0 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -192,33 +192,26 @@ static void sg_kfree(struct scatterlist *sg, unsigned int nents)
void __sg_free_table(struct sg_table *table, unsigned int max_ents,
unsigned int nents_first_chunk, sg_free_fn *free_fn)
{
- struct scatterlist *sgl, *next;
+ struct scatterlist *sgl, *next = NULL;
unsigned curr_max_ents = nents_first_chunk ?: max_ents;

if (unlikely(!table->sgl))
return;

sgl = table->sgl;
- while (table->orig_nents) {
- unsigned int alloc_size = table->orig_nents;
- unsigned int sg_size;
+ while (table->total_nents) {
+ unsigned int alloc_size = table->total_nents;

/*
* If we have more than max_ents segments left,
* then assign 'next' to the sg table after the current one.
- * sg_size is then one less than alloc size, since the last
- * element is the chain pointer.
*/
if (alloc_size > curr_max_ents) {
next = sg_chain_ptr(&sgl[curr_max_ents - 1]);
alloc_size = curr_max_ents;
- sg_size = alloc_size - 1;
- } else {
- sg_size = alloc_size;
- next = NULL;
}

- table->orig_nents -= sg_size;
+ table->total_nents -= alloc_size;
if (nents_first_chunk)
nents_first_chunk = 0;
else
@@ -301,20 +294,11 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
} else {
sg = alloc_fn(alloc_size, gfp_mask);
}
- if (unlikely(!sg)) {
- /*
- * Adjust entry count to reflect that the last
- * entry of the previous table won't be used for
- * linkage. Without this, sg_kfree() may get
- * confused.
- */
- if (prv)
- table->nents = ++table->orig_nents;
-
+ if (unlikely(!sg))
return -ENOMEM;
- }

sg_init_table(sg, alloc_size);
+ table->total_nents += alloc_size;
table->nents = table->orig_nents += sg_size;

/*
@@ -385,12 +369,11 @@ static struct scatterlist *get_next_sg(struct sg_table *table,
if (!new_sg)
return ERR_PTR(-ENOMEM);
sg_init_table(new_sg, alloc_size);
+ table->total_nents += alloc_size;
if (cur) {
__sg_chain(next_sg, new_sg);
- table->orig_nents += alloc_size - 1;
} else {
table->sgl = new_sg;
- table->orig_nents = alloc_size;
table->nents = 0;
}
return new_sg;
@@ -515,6 +498,7 @@ struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt,
cur_page = j;
}
sgt->nents += added_nents;
+ sgt->orig_nents = sgt->nents;
out:
if (!left_pages)
sg_mark_end(s);
--
2.31.1

2021-06-29 23:11:46

by Jason Gunthorpe

Subject: Re: [PATCH rdma-next v1 0/2] SG fix together with update to RDMA umem

On Tue, Jun 29, 2021 at 11:40:00AM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <[email protected]>
>
> Changelog:
> v1:
> * Fixed sg_page with a _dma_ API in the umem.c
> v0: https://lore.kernel.org/lkml/[email protected]
>
> Maor Gottlieb (2):
> lib/scatterlist: Fix wrong update of orig_nents
> RDMA: Use dma_map_sgtable for map umem pages

Though I would have liked to see some ack, I think fixing the semantic
bug here is important enough. Yell quick if there is any concern as
my PR will go tomorrow.

Applied to for-next.

Thanks,
Jason

2021-06-30 06:03:37

by Christoph Hellwig

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

On Tue, Jun 29, 2021 at 11:40:01AM +0300, Leon Romanovsky wrote:
> 2. Add a new field total_nents to reflect the total number of entries
> in the table. This is required for the release flow (sg_free_table).
> This filed should be used internally only by scatterlist.

No, please don't bloat the common structure.

> + /* The fields below should be used internally only by
> + * scatterlist implementation.
> + */

And this is not the way kernel comments work.

2021-06-30 06:31:45

by Leon Romanovsky

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

On Wed, Jun 30, 2021 at 06:59:11AM +0100, Christoph Hellwig wrote:
> On Tue, Jun 29, 2021 at 11:40:01AM +0300, Leon Romanovsky wrote:
> > 2. Add a new field total_nents to reflect the total number of entries
> > in the table. This is required for the release flow (sg_free_table).
> > This filed should be used internally only by scatterlist.
>
> No, please don't bloat the common structure.

Somehow we need to store that total_nents value and our internal
proposal was to wrap sg_table with another private structure that is
visible in lib/scatterlist.c only.

Something like that:
struct sg_table_private {
        struct sg_table table;
        unsigned int total_nents;
};

But it looks awkward.
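
To make the trade-off concrete, here is an illustrative sketch (purely
hypothetical helper name) of how lib/scatterlist.c could get back to the
private struct; it is only valid when every table passed in is embedded in
the wrapper, which is exactly what makes the approach awkward:

#include <linux/kernel.h>       /* container_of() */

/* Hypothetical helper: recover the wrapper from a plain sg_table pointer. */
static inline struct sg_table_private *to_sg_private(struct sg_table *table)
{
        return container_of(table, struct sg_table_private, table);
}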

>
> > + /* The fields below should be used internally only by
> > + * scatterlist implementation.
> > + */
>
> And this is not the way kernel comments work.

It is netdev style.

Thanks

2021-06-30 06:35:21

by Christoph Hellwig

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

On Wed, Jun 30, 2021 at 09:29:51AM +0300, Leon Romanovsky wrote:
> On Wed, Jun 30, 2021 at 06:59:11AM +0100, Christoph Hellwig wrote:
> > On Tue, Jun 29, 2021 at 11:40:01AM +0300, Leon Romanovsky wrote:
> > > 2. Add a new field total_nents to reflect the total number of entries
> > > in the table. This is required for the release flow (sg_free_table).
> > > This filed should be used internally only by scatterlist.
> >
> > No, please don't bloat the common structure.
>
> Somehow we need to store that total_nents value and our internal
> proposal was to wrap sg_table with another private structure that is
> visible in lib/scatterlist.c only.
>
> Something like that:
> struct sg_table_private {
> struct sg_table table;
> unsigned int total_nents;
> };
>
> But it looks awkward.

Well, the important point is that we only need it for the new
collapsing, appending style of allocation. We should not burden
all other users with it.

2021-06-30 07:06:18

by Leon Romanovsky

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

On Wed, Jun 30, 2021 at 07:33:17AM +0100, Christoph Hellwig wrote:
> On Wed, Jun 30, 2021 at 09:29:51AM +0300, Leon Romanovsky wrote:
> > On Wed, Jun 30, 2021 at 06:59:11AM +0100, Christoph Hellwig wrote:
> > > On Tue, Jun 29, 2021 at 11:40:01AM +0300, Leon Romanovsky wrote:
> > > > 2. Add a new field total_nents to reflect the total number of entries
> > > > in the table. This is required for the release flow (sg_free_table).
> > > > This filed should be used internally only by scatterlist.
> > >
> > > No, please don't bloat the common structure.
> >
> > Somehow we need to store that total_nents value and our internal
> > proposal was to wrap sg_table with another private structure that is
> > visible in lib/scatterlist.c only.
> >
> > Something like that:
> > struct sg_table_private {
> > struct sg_table table;
> > unsigned int total_nents;
> > };
> >
> > But it looks awkward.
>
> Well, the important point is that we only need it for the new
> way of collapsing, appending allocations. We should not burden
> it on all other users.

Another possible solution is to change __sg_alloc_table()/__sg_alloc_table_from_pages()
to return total_nents and expect the users to store it internally and pass
it later to __sg_free_table().

Something like that.
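
A minimal sketch of what such prototypes could look like (the extra
total_nents parameter is hypothetical, added here only to illustrate the
idea):

struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt,
                struct page **pages, unsigned int n_pages,
                unsigned int offset, unsigned long size,
                unsigned int max_segment, struct scatterlist *prv,
                unsigned int left_pages, gfp_t gfp_mask,
                unsigned int *total_nents);

void __sg_free_table(struct sg_table *table, unsigned int max_ents,
                     unsigned int nents_first_chunk, sg_free_fn *free_fn,
                     unsigned int total_nents);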

Thanks

2021-06-30 07:18:52

by Christoph Hellwig

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

On Wed, Jun 30, 2021 at 10:02:11AM +0300, Leon Romanovsky wrote:
> Another possible solution is to change __sg_alloc_table()/__sg_alloc_table_from_pages
> to return total_nents and expect from the users to store it internally and pass
> it later to the __sg_free_table().
>
> Something like that.

Yes, that sounds pretty reasonable.

2021-06-30 09:15:18

by Maor Gottlieb

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents


On 6/30/2021 10:16 AM, Christoph Hellwig wrote:
> On Wed, Jun 30, 2021 at 10:02:11AM +0300, Leon Romanovsky wrote:
>> Another possible solution is to change __sg_alloc_table()/__sg_alloc_table_from_pages
>> to return total_nents and expect from the users to store it internally and pass
>> it later to the __sg_free_table().
>>
>> Something like that.
> Yes, that sounds pretty reasonable.

OK. So we have two options:

1. Add another (output) argument to __sg_alloc_table_from_pages and
__sg_free_table.
   The thing is that __sg_free_table takes free_fn as an argument while
__sg_alloc_table_from_pages doesn't take an alloc function. Users will
pass NULL as free_fn and scatterlist will internally use sg_kfree.

2. Have dedicated functions for the appending allocations -
sg_alloc_table_from_pages_append and sg_free_table_append (sketched
below). With this approach, users that call __sg_alloc_table_from_pages
for non-appending allocations don't need to maintain the new argument.
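
One possible shape for option 2, sketched with hypothetical types and
argument lists (the wrapper struct and its fields are guesses for
illustration; only the two function names come from the proposal above):

/* Dedicated append state that carries the bookkeeping, so plain
 * sg_table users are not affected.
 */
struct sg_append_table {
        struct sg_table sgt;
        struct scatterlist *prv;        /* last populated entry */
        unsigned int total_nents;       /* entries allocated so far */
};

int sg_alloc_table_from_pages_append(struct sg_append_table *append,
                                     struct page **pages, unsigned int n_pages,
                                     unsigned int offset, unsigned long size,
                                     unsigned int max_segment,
                                     unsigned int left_pages, gfp_t gfp_mask);

void sg_free_table_append(struct sg_append_table *append);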

Christoph, what do you prefer? Do you see a better option?

2021-06-30 11:14:10

by Marek Szyprowski

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

Hi Leon,

On 29.06.2021 10:40, Leon Romanovsky wrote:
> From: Maor Gottlieb <[email protected]>
>
> orig_nents should represent the number of entries with pages,
> but __sg_alloc_table_from_pages sets orig_nents as the number of
> total entries in the table. This is wrong when the API is used for
> dynamic allocation where not all the table entries are mapped with
> pages. It wasn't observed until now, since RDMA umem who uses this
> API in the dynamic form doesn't use orig_nents implicit or explicit
> by the scatterlist APIs.
>
> Fix it by:
> 1. Set orig_nents as number of entries with pages also in
> __sg_alloc_table_from_pages.
> 2. Add a new field total_nents to reflect the total number of entries
> in the table. This is required for the release flow (sg_free_table).
> This filed should be used internally only by scatterlist.
>
> Fixes: 07da1223ec93 ("lib/scatterlist: Add support in dynamic allocation of SG table from pages")
> Signed-off-by: Maor Gottlieb <[email protected]>
> Signed-off-by: Leon Romanovsky <[email protected]>

This patch landed in linux-next 20210630 as commit a52724456928
("lib/scatterlist: Fix wrong update of orig_nents"). It causes serious
regression in DMA-IOMMU integration, which can be observed for example
on ARM Juno board during boot:

Unable to handle kernel paging request at virtual address 00376f42a6e40454
Mem abort info:
  ESR = 0x96000004
  EC = 0x25: DABT (current EL), IL = 32 bits
  SET = 0, FnV = 0
  EA = 0, S1PTW = 0
  FSC = 0x04: level 0 translation fault
Data abort info:
  ISV = 0, ISS = 0x00000004
  CM = 0, WnR = 0
[00376f42a6e40454] address between user and kernel address ranges
Internal error: Oops: 96000004 [#1] PREEMPT SMP
Modules linked in:
CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.13.0-next-20210630+ #3585
Hardware name: ARM Juno development board (r1) (DT)
pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
pc : __sg_free_table+0x60/0xa0
lr : __sg_free_table+0x7c/0xa0
..
Call trace:
 __sg_free_table+0x60/0xa0
 sg_free_table+0x1c/0x28
 iommu_dma_alloc+0xc8/0x388
 dma_alloc_attrs+0xcc/0xf0
 dmam_alloc_attrs+0x68/0xb8
 sil24_port_start+0x60/0xe0
 ata_host_start.part.32+0xbc/0x208
 ata_host_activate+0x64/0x150
 sil24_init_one+0x1e8/0x268
 local_pci_probe+0x3c/0xa0
 pci_device_probe+0x128/0x1c8
 really_probe+0x138/0x2d0
 __driver_probe_device+0x78/0xd8
 driver_probe_device+0x40/0x110
 __driver_attach+0xcc/0x118
 bus_for_each_dev+0x68/0xc8
 driver_attach+0x20/0x28
 bus_add_driver+0x168/0x1f8
 driver_register+0x60/0x110
 __pci_register_driver+0x5c/0x68
 sil24_pci_driver_init+0x20/0x28
 do_one_initcall+0x84/0x450
 kernel_init_freeable+0x31c/0x38c
 kernel_init+0x20/0x120
 ret_from_fork+0x10/0x18
Code: d37be885 6b01007f 52800004 540000a2 (f8656813)
---[ end trace 4ba4f0c9c48711a1 ]---
Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b

It looks like some changes to the scatterlist structures are missing
outside of lib/scatterlist.c.

For now I would suggest to revert this change.

> ---
> include/linux/scatterlist.h | 8 ++++++--
> lib/scatterlist.c | 32 ++++++++------------------------
> 2 files changed, 14 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
> index 6f70572b2938..1c889141eb91 100644
> --- a/include/linux/scatterlist.h
> +++ b/include/linux/scatterlist.h
> @@ -35,8 +35,12 @@ struct scatterlist {
>
> struct sg_table {
> struct scatterlist *sgl; /* the list */
> - unsigned int nents; /* number of mapped entries */
> - unsigned int orig_nents; /* original size of list */
> + unsigned int nents; /* number of DMA mapped entries */
> + unsigned int orig_nents; /* number of CPU mapped entries */
> + /* The fields below should be used internally only by
> + * scatterlist implementation.
> + */
> + unsigned int total_nents; /* number of total entries in the table */
> };
>
> /*
> diff --git a/lib/scatterlist.c b/lib/scatterlist.c
> index a59778946404..6db70a1e7dd0 100644
> --- a/lib/scatterlist.c
> +++ b/lib/scatterlist.c
> @@ -192,33 +192,26 @@ static void sg_kfree(struct scatterlist *sg, unsigned int nents)
> void __sg_free_table(struct sg_table *table, unsigned int max_ents,
> unsigned int nents_first_chunk, sg_free_fn *free_fn)
> {
> - struct scatterlist *sgl, *next;
> + struct scatterlist *sgl, *next = NULL;
> unsigned curr_max_ents = nents_first_chunk ?: max_ents;
>
> if (unlikely(!table->sgl))
> return;
>
> sgl = table->sgl;
> - while (table->orig_nents) {
> - unsigned int alloc_size = table->orig_nents;
> - unsigned int sg_size;
> + while (table->total_nents) {
> + unsigned int alloc_size = table->total_nents;
>
> /*
> * If we have more than max_ents segments left,
> * then assign 'next' to the sg table after the current one.
> - * sg_size is then one less than alloc size, since the last
> - * element is the chain pointer.
> */
> if (alloc_size > curr_max_ents) {
> next = sg_chain_ptr(&sgl[curr_max_ents - 1]);
> alloc_size = curr_max_ents;
> - sg_size = alloc_size - 1;
> - } else {
> - sg_size = alloc_size;
> - next = NULL;
> }
>
> - table->orig_nents -= sg_size;
> + table->total_nents -= alloc_size;
> if (nents_first_chunk)
> nents_first_chunk = 0;
> else
> @@ -301,20 +294,11 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
> } else {
> sg = alloc_fn(alloc_size, gfp_mask);
> }
> - if (unlikely(!sg)) {
> - /*
> - * Adjust entry count to reflect that the last
> - * entry of the previous table won't be used for
> - * linkage. Without this, sg_kfree() may get
> - * confused.
> - */
> - if (prv)
> - table->nents = ++table->orig_nents;
> -
> + if (unlikely(!sg))
> return -ENOMEM;
> - }
>
> sg_init_table(sg, alloc_size);
> + table->total_nents += alloc_size;
> table->nents = table->orig_nents += sg_size;
>
> /*
> @@ -385,12 +369,11 @@ static struct scatterlist *get_next_sg(struct sg_table *table,
> if (!new_sg)
> return ERR_PTR(-ENOMEM);
> sg_init_table(new_sg, alloc_size);
> + table->total_nents += alloc_size;
> if (cur) {
> __sg_chain(next_sg, new_sg);
> - table->orig_nents += alloc_size - 1;
> } else {
> table->sgl = new_sg;
> - table->orig_nents = alloc_size;
> table->nents = 0;
> }
> return new_sg;
> @@ -515,6 +498,7 @@ struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt,
> cur_page = j;
> }
> sgt->nents += added_nents;
> + sgt->orig_nents = sgt->nents;
> out:
> if (!left_pages)
> sg_mark_end(s);

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2021-06-30 11:17:42

by Leon Romanovsky

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

On Wed, Jun 30, 2021 at 01:12:26PM +0200, Marek Szyprowski wrote:
> Hi Leon,
>
> On 29.06.2021 10:40, Leon Romanovsky wrote:
> > From: Maor Gottlieb <[email protected]>
> >
> > orig_nents should represent the number of entries with pages,
> > but __sg_alloc_table_from_pages sets orig_nents as the number of
> > total entries in the table. This is wrong when the API is used for
> > dynamic allocation where not all the table entries are mapped with
> > pages. It wasn't observed until now, since RDMA umem who uses this
> > API in the dynamic form doesn't use orig_nents implicit or explicit
> > by the scatterlist APIs.
> >
> > Fix it by:
> > 1. Set orig_nents as number of entries with pages also in
> > __sg_alloc_table_from_pages.
> > 2. Add a new field total_nents to reflect the total number of entries
> > in the table. This is required for the release flow (sg_free_table).
> > This filed should be used internally only by scatterlist.
> >
> > Fixes: 07da1223ec93 ("lib/scatterlist: Add support in dynamic allocation of SG table from pages")
> > Signed-off-by: Maor Gottlieb <[email protected]>
> > Signed-off-by: Leon Romanovsky <[email protected]>

<...>

> For now I would suggest to revert this change.

Thanks for the report, we will drop this patch.

2021-06-30 17:43:36

by Thierry Reding

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

On Wed, Jun 30, 2021 at 01:12:26PM +0200, Marek Szyprowski wrote:
> Hi Leon,
>
> On 29.06.2021 10:40, Leon Romanovsky wrote:
> > From: Maor Gottlieb <[email protected]>
> >
> > orig_nents should represent the number of entries with pages,
> > but __sg_alloc_table_from_pages sets orig_nents as the number of
> > total entries in the table. This is wrong when the API is used for
> > dynamic allocation where not all the table entries are mapped with
> > pages. It wasn't observed until now, since RDMA umem who uses this
> > API in the dynamic form doesn't use orig_nents implicit or explicit
> > by the scatterlist APIs.
> >
> > Fix it by:
> > 1. Set orig_nents as number of entries with pages also in
> > __sg_alloc_table_from_pages.
> > 2. Add a new field total_nents to reflect the total number of entries
> > in the table. This is required for the release flow (sg_free_table).
> > This filed should be used internally only by scatterlist.
> >
> > Fixes: 07da1223ec93 ("lib/scatterlist: Add support in dynamic allocation of SG table from pages")
> > Signed-off-by: Maor Gottlieb <[email protected]>
> > Signed-off-by: Leon Romanovsky <[email protected]>
>
> This patch landed in linux-next 20210630 as commit a52724456928
> ("lib/scatterlist: Fix wrong update of orig_nents"). It causes serious
> regression in DMA-IOMMU integration, which can be observed for example
> on ARM Juno board during boot:
>
> Unable to handle kernel paging request at virtual address 00376f42a6e40454
> Mem abort info:
>   ESR = 0x96000004
>   EC = 0x25: DABT (current EL), IL = 32 bits
>   SET = 0, FnV = 0
>   EA = 0, S1PTW = 0
>   FSC = 0x04: level 0 translation fault
> Data abort info:
>   ISV = 0, ISS = 0x00000004
>   CM = 0, WnR = 0
> [00376f42a6e40454] address between user and kernel address ranges
> Internal error: Oops: 96000004 [#1] PREEMPT SMP
> Modules linked in:
> CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.13.0-next-20210630+ #3585
> Hardware name: ARM Juno development board (r1) (DT)
> pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
> pc : __sg_free_table+0x60/0xa0
> lr : __sg_free_table+0x7c/0xa0
> ..
> Call trace:
>  __sg_free_table+0x60/0xa0
>  sg_free_table+0x1c/0x28
>  iommu_dma_alloc+0xc8/0x388
>  dma_alloc_attrs+0xcc/0xf0
>  dmam_alloc_attrs+0x68/0xb8
>  sil24_port_start+0x60/0xe0
>  ata_host_start.part.32+0xbc/0x208
>  ata_host_activate+0x64/0x150
>  sil24_init_one+0x1e8/0x268
>  local_pci_probe+0x3c/0xa0
>  pci_device_probe+0x128/0x1c8
>  really_probe+0x138/0x2d0
>  __driver_probe_device+0x78/0xd8
>  driver_probe_device+0x40/0x110
>  __driver_attach+0xcc/0x118
>  bus_for_each_dev+0x68/0xc8
>  driver_attach+0x20/0x28
>  bus_add_driver+0x168/0x1f8
>  driver_register+0x60/0x110
>  __pci_register_driver+0x5c/0x68
>  sil24_pci_driver_init+0x20/0x28
>  do_one_initcall+0x84/0x450
>  kernel_init_freeable+0x31c/0x38c
>  kernel_init+0x20/0x120
>  ret_from_fork+0x10/0x18
> Code: d37be885 6b01007f 52800004 540000a2 (f8656813)
> ---[ end trace 4ba4f0c9c48711a1 ]---
> Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
>
> It looks that some changes to the scatterlist structures are missing
> outside of the lib/scatterlist.c.
>
> For now I would suggest to revert this change.

I see a very similar crash on Tegra during the HDA driver's probe.

Leon, let me know if you need a tester for a replacement patch.

Thanks,
Thierry



2021-06-30 17:51:11

by Leon Romanovsky

Subject: Re: [PATCH rdma-next v1 1/2] lib/scatterlist: Fix wrong update of orig_nents

On Wed, Jun 30, 2021 at 07:44:21PM +0200, Thierry Reding wrote:
> On Wed, Jun 30, 2021 at 01:12:26PM +0200, Marek Szyprowski wrote:
> > Hi Leon,
> >
> > On 29.06.2021 10:40, Leon Romanovsky wrote:
> > > From: Maor Gottlieb <[email protected]>
> > >
> > > orig_nents should represent the number of entries with pages,
> > > but __sg_alloc_table_from_pages sets orig_nents as the number of
> > > total entries in the table. This is wrong when the API is used for
> > > dynamic allocation where not all the table entries are mapped with
> > > pages. It wasn't observed until now, since RDMA umem who uses this
> > > API in the dynamic form doesn't use orig_nents implicit or explicit
> > > by the scatterlist APIs.
> > >
> > > Fix it by:
> > > 1. Set orig_nents as number of entries with pages also in
> > > __sg_alloc_table_from_pages.
> > > 2. Add a new field total_nents to reflect the total number of entries
> > > in the table. This is required for the release flow (sg_free_table).
> > > This filed should be used internally only by scatterlist.
> > >
> > > Fixes: 07da1223ec93 ("lib/scatterlist: Add support in dynamic allocation of SG table from pages")
> > > Signed-off-by: Maor Gottlieb <[email protected]>
> > > Signed-off-by: Leon Romanovsky <[email protected]>
> >
> > This patch landed in linux-next 20210630 as commit a52724456928
> > ("lib/scatterlist: Fix wrong update of orig_nents"). It causes serious
> > regression in DMA-IOMMU integration, which can be observed for example
> > on ARM Juno board during boot:
> >
> > Unable to handle kernel paging request at virtual address 00376f42a6e40454
> > Mem abort info:
> >   ESR = 0x96000004
> >   EC = 0x25: DABT (current EL), IL = 32 bits
> >   SET = 0, FnV = 0
> >   EA = 0, S1PTW = 0
> >   FSC = 0x04: level 0 translation fault
> > Data abort info:
> >   ISV = 0, ISS = 0x00000004
> >   CM = 0, WnR = 0
> > [00376f42a6e40454] address between user and kernel address ranges
> > Internal error: Oops: 96000004 [#1] PREEMPT SMP
> > Modules linked in:
> > CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.13.0-next-20210630+ #3585
> > Hardware name: ARM Juno development board (r1) (DT)
> > pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
> > pc : __sg_free_table+0x60/0xa0
> > lr : __sg_free_table+0x7c/0xa0
> > ..
> > Call trace:
> >  __sg_free_table+0x60/0xa0
> >  sg_free_table+0x1c/0x28
> >  iommu_dma_alloc+0xc8/0x388
> >  dma_alloc_attrs+0xcc/0xf0
> >  dmam_alloc_attrs+0x68/0xb8
> >  sil24_port_start+0x60/0xe0
> >  ata_host_start.part.32+0xbc/0x208
> >  ata_host_activate+0x64/0x150
> >  sil24_init_one+0x1e8/0x268
> >  local_pci_probe+0x3c/0xa0
> >  pci_device_probe+0x128/0x1c8
> >  really_probe+0x138/0x2d0
> >  __driver_probe_device+0x78/0xd8
> >  driver_probe_device+0x40/0x110
> >  __driver_attach+0xcc/0x118
> >  bus_for_each_dev+0x68/0xc8
> >  driver_attach+0x20/0x28
> >  bus_add_driver+0x168/0x1f8
> >  driver_register+0x60/0x110
> >  __pci_register_driver+0x5c/0x68
> >  sil24_pci_driver_init+0x20/0x28
> >  do_one_initcall+0x84/0x450
> >  kernel_init_freeable+0x31c/0x38c
> >  kernel_init+0x20/0x120
> >  ret_from_fork+0x10/0x18
> > Code: d37be885 6b01007f 52800004 540000a2 (f8656813)
> > ---[ end trace 4ba4f0c9c48711a1 ]---
> > Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
> >
> > It looks that some changes to the scatterlist structures are missing
> > outside of the lib/scatterlist.c.
> >
> > For now I would suggest to revert this change.
>
> I see a very similar crash on Tegra during the HDA driver's probe.
>
> Leon, let me know if you need a tester for a replacement patch.

With great pleasure, I'll contact you offline when we prepare it.

For now, this patch will be dropped.

Thanks

>
> Thanks,
> Thierry