2021-11-13 09:44:26

by Ivaylo Dimitrov

[permalink] [raw]
Subject: [PATCH] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Memory of BOs backed by TILER is not contiguous, but omap_gem_map_dma_buf()
exports it like it is. This leads to (possibly) invalid memory accesses if
another device imports such a BO.

Fix that by providing a scatterlist that correctly describes TILER memory
layout.

Suggested-by: Matthijs van Duin <[email protected]>
Signed-off-by: Ivaylo Dimitrov <[email protected]>
---
drivers/gpu/drm/omapdrm/omap_gem.c | 78 ++++++++++++++++++++++++++++++-
drivers/gpu/drm/omapdrm/omap_gem.h | 3 +-
drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 32 ++-----------
3 files changed, 83 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 97e5fe6..a1a18bb 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -48,7 +48,7 @@ struct omap_gem_object {
* OMAP_BO_MEM_DMA_API flag set)
*
* - buffers imported from dmabuf (with the OMAP_BO_MEM_DMABUF flag set)
- * if they are physically contiguous (when sgt->orig_nents == 1)
+ * if they are physically contiguous (when sgt->nents == 1)
*
* - buffers mapped through the TILER when dma_addr_cnt is not zero, in
* which case the DMA address points to the TILER aperture
@@ -862,6 +862,11 @@ static void omap_gem_unpin_locked(struct drm_gem_object *obj)
return;

if (refcount_dec_and_test(&omap_obj->dma_addr_cnt)) {
+ if (omap_obj->sgt) {
+ sg_free_table(omap_obj->sgt);
+ kfree(omap_obj->sgt);
+ omap_obj->sgt = NULL;
+ }
ret = tiler_unpin(omap_obj->block);
if (ret) {
dev_err(obj->dev->dev,
@@ -974,6 +979,77 @@ int omap_gem_put_pages(struct drm_gem_object *obj)
return 0;
}

+struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj)
+{
+ struct omap_gem_object *omap_obj = to_omap_bo(obj);
+ dma_addr_t addr;
+ struct sg_table *sgt;
+ struct scatterlist *sg;
+ unsigned int count, len, stride, i;
+ int ret;
+
+ ret = omap_gem_pin(obj, &addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ mutex_lock(&omap_obj->lock);
+
+ sgt = omap_obj->sgt;
+ if (sgt)
+ goto out;
+
+ sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!sgt)
+ goto out_unpin;
+
+ if (omap_obj->flags & OMAP_BO_TILED_MASK) {
+ enum tiler_fmt fmt = gem2fmt(omap_obj->flags);
+
+ len = omap_obj->width << (int)fmt;
+ count = omap_obj->height;
+ stride = tiler_stride(fmt, 0);
+ } else {
+ len = obj->size;
+ count = 1;
+ stride = 0;
+ }
+
+ ret = sg_alloc_table(sgt, count, GFP_KERNEL);
+ if (ret)
+ goto out_free;
+
+ for_each_sg(sgt->sgl, sg, count, i) {
+ sg_set_page(sg, phys_to_page(addr), len, offset_in_page(addr));
+ sg_dma_address(sg) = addr;
+ sg_dma_len(sg) = len;
+
+ addr += stride;
+ }
+
+ omap_obj->sgt = sgt;
+out:
+ mutex_unlock(&omap_obj->lock);
+ return sgt;
+
+out_free:
+ kfree(sgt);
+out_unpin:
+ mutex_unlock(&omap_obj->lock);
+ omap_gem_unpin(obj);
+ return ERR_PTR(ret);
+}
+
+void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt)
+{
+ struct omap_gem_object *omap_obj = to_omap_bo(obj);
+
+ if (WARN_ON(omap_obj->sgt != sgt))
+ return;
+
+ omap_gem_unpin(obj);
+}
+
#ifdef CONFIG_DRM_FBDEV_EMULATION
/*
* Get kernel virtual address for CPU access.. this more or less only
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.h b/drivers/gpu/drm/omapdrm/omap_gem.h
index eda9b48..3b61cfc 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.h
+++ b/drivers/gpu/drm/omapdrm/omap_gem.h
@@ -77,10 +77,11 @@ void omap_gem_dma_sync_buffer(struct drm_gem_object *obj,
int omap_gem_get_pages(struct drm_gem_object *obj, struct page ***pages,
bool remap);
int omap_gem_put_pages(struct drm_gem_object *obj);
-
u32 omap_gem_flags(struct drm_gem_object *obj);
int omap_gem_rotated_dma_addr(struct drm_gem_object *obj, u32 orient,
int x, int y, dma_addr_t *dma_addr);
int omap_gem_tiled_stride(struct drm_gem_object *obj, u32 orient);
+struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj);
+void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt);

#endif /* __OMAPDRM_GEM_H__ */
diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
index f4cde3a..9650416 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
@@ -21,45 +21,21 @@ static struct sg_table *omap_gem_map_dma_buf(
{
struct drm_gem_object *obj = attachment->dmabuf->priv;
struct sg_table *sg;
- dma_addr_t dma_addr;
- int ret;
-
- sg = kzalloc(sizeof(*sg), GFP_KERNEL);
- if (!sg)
- return ERR_PTR(-ENOMEM);
-
- /* camera, etc, need physically contiguous.. but we need a
- * better way to know this..
- */
- ret = omap_gem_pin(obj, &dma_addr);
- if (ret)
- goto out;
-
- ret = sg_alloc_table(sg, 1, GFP_KERNEL);
- if (ret)
- goto out;
-
- sg_init_table(sg->sgl, 1);
- sg_dma_len(sg->sgl) = obj->size;
- sg_set_page(sg->sgl, pfn_to_page(PFN_DOWN(dma_addr)), obj->size, 0);
- sg_dma_address(sg->sgl) = dma_addr;
+ sg = omap_gem_get_sg(obj);
+ if (IS_ERR(sg))
+ return sg;

/* this must be after omap_gem_pin() to ensure we have pages attached */
omap_gem_dma_sync_buffer(obj, dir);

return sg;
-out:
- kfree(sg);
- return ERR_PTR(ret);
}

static void omap_gem_unmap_dma_buf(struct dma_buf_attachment *attachment,
struct sg_table *sg, enum dma_data_direction dir)
{
struct drm_gem_object *obj = attachment->dmabuf->priv;
- omap_gem_unpin(obj);
- sg_free_table(sg);
- kfree(sg);
+ omap_gem_put_sg(obj, sg);
}

static int omap_gem_dmabuf_begin_cpu_access(struct dma_buf *buffer,
--
1.9.1



2021-11-13 09:49:47

by Ivaylo Dimitrov

[permalink] [raw]
Subject: Re: [PATCH] drm: omapdrm: Export correct scatterlist for TILER backed BOs



On 13.11.21 г. 11:40 ч., Ivaylo Dimitrov wrote:
> Memory of BOs backed by TILER is not contiguous, but omap_gem_map_dma_buf()
> exports it like it is. This leads to (possibly) invalid memory accesses if
> another device imports such a BO.
>
> Fix that by providing a scatterlist that correctly describes TILER memory
> layout.
>
> Suggested-by: Matthijs van Duin <[email protected]>
> Signed-off-by: Ivaylo Dimitrov <[email protected]>
> ---
> drivers/gpu/drm/omapdrm/omap_gem.c | 78 ++++++++++++++++++++++++++++++-
> drivers/gpu/drm/omapdrm/omap_gem.h | 3 +-
> drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 32 ++-----------
> 3 files changed, 83 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
> index 97e5fe6..a1a18bb 100644
> --- a/drivers/gpu/drm/omapdrm/omap_gem.c
> +++ b/drivers/gpu/drm/omapdrm/omap_gem.c
> @@ -48,7 +48,7 @@ struct omap_gem_object {
> * OMAP_BO_MEM_DMA_API flag set)
> *
> * - buffers imported from dmabuf (with the OMAP_BO_MEM_DMABUF flag set)
> - * if they are physically contiguous (when sgt->orig_nents == 1)
> + * if they are physically contiguous (when sgt->nents == 1)
> *

Ugh, this should not be here, will send new patch

> * - buffers mapped through the TILER when dma_addr_cnt is not zero, in
> * which case the DMA address points to the TILER aperture
> @@ -862,6 +862,11 @@ static void omap_gem_unpin_locked(struct drm_gem_object *obj)
> return;
>
> if (refcount_dec_and_test(&omap_obj->dma_addr_cnt)) {
> + if (omap_obj->sgt) {
> + sg_free_table(omap_obj->sgt);
> + kfree(omap_obj->sgt);
> + omap_obj->sgt = NULL;
> + }
> ret = tiler_unpin(omap_obj->block);
> if (ret) {
> dev_err(obj->dev->dev,
> @@ -974,6 +979,77 @@ int omap_gem_put_pages(struct drm_gem_object *obj)
> return 0;
> }
>
> +struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj)
> +{
> + struct omap_gem_object *omap_obj = to_omap_bo(obj);
> + dma_addr_t addr;
> + struct sg_table *sgt;
> + struct scatterlist *sg;
> + unsigned int count, len, stride, i;
> + int ret;
> +
> + ret = omap_gem_pin(obj, &addr);
> + if (ret)
> + return ERR_PTR(ret);
> +
> + mutex_lock(&omap_obj->lock);
> +
> + sgt = omap_obj->sgt;
> + if (sgt)
> + goto out;
> +
> + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> + ret = -ENOMEM;
> + if (!sgt)
> + goto out_unpin;
> +
> + if (omap_obj->flags & OMAP_BO_TILED_MASK) {
> + enum tiler_fmt fmt = gem2fmt(omap_obj->flags);
> +
> + len = omap_obj->width << (int)fmt;
> + count = omap_obj->height;
> + stride = tiler_stride(fmt, 0);
> + } else {
> + len = obj->size;
> + count = 1;
> + stride = 0;
> + }
> +
> + ret = sg_alloc_table(sgt, count, GFP_KERNEL);
> + if (ret)
> + goto out_free;
> +
> + for_each_sg(sgt->sgl, sg, count, i) {
> + sg_set_page(sg, phys_to_page(addr), len, offset_in_page(addr));
> + sg_dma_address(sg) = addr;
> + sg_dma_len(sg) = len;
> +
> + addr += stride;
> + }
> +
> + omap_obj->sgt = sgt;
> +out:
> + mutex_unlock(&omap_obj->lock);
> + return sgt;
> +
> +out_free:
> + kfree(sgt);
> +out_unpin:
> + mutex_unlock(&omap_obj->lock);
> + omap_gem_unpin(obj);
> + return ERR_PTR(ret);
> +}
> +
> +void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt)
> +{
> + struct omap_gem_object *omap_obj = to_omap_bo(obj);
> +
> + if (WARN_ON(omap_obj->sgt != sgt))
> + return;
> +
> + omap_gem_unpin(obj);
> +}
> +
> #ifdef CONFIG_DRM_FBDEV_EMULATION
> /*
> * Get kernel virtual address for CPU access.. this more or less only
> diff --git a/drivers/gpu/drm/omapdrm/omap_gem.h b/drivers/gpu/drm/omapdrm/omap_gem.h
> index eda9b48..3b61cfc 100644
> --- a/drivers/gpu/drm/omapdrm/omap_gem.h
> +++ b/drivers/gpu/drm/omapdrm/omap_gem.h
> @@ -77,10 +77,11 @@ void omap_gem_dma_sync_buffer(struct drm_gem_object *obj,
> int omap_gem_get_pages(struct drm_gem_object *obj, struct page ***pages,
> bool remap);
> int omap_gem_put_pages(struct drm_gem_object *obj);
> -
> u32 omap_gem_flags(struct drm_gem_object *obj);
> int omap_gem_rotated_dma_addr(struct drm_gem_object *obj, u32 orient,
> int x, int y, dma_addr_t *dma_addr);
> int omap_gem_tiled_stride(struct drm_gem_object *obj, u32 orient);
> +struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj);
> +void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt);
>
> #endif /* __OMAPDRM_GEM_H__ */
> diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
> index f4cde3a..9650416 100644
> --- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
> +++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
> @@ -21,45 +21,21 @@ static struct sg_table *omap_gem_map_dma_buf(
> {
> struct drm_gem_object *obj = attachment->dmabuf->priv;
> struct sg_table *sg;
> - dma_addr_t dma_addr;
> - int ret;
> -
> - sg = kzalloc(sizeof(*sg), GFP_KERNEL);
> - if (!sg)
> - return ERR_PTR(-ENOMEM);
> -
> - /* camera, etc, need physically contiguous.. but we need a
> - * better way to know this..
> - */
> - ret = omap_gem_pin(obj, &dma_addr);
> - if (ret)
> - goto out;
> -
> - ret = sg_alloc_table(sg, 1, GFP_KERNEL);
> - if (ret)
> - goto out;
> -
> - sg_init_table(sg->sgl, 1);
> - sg_dma_len(sg->sgl) = obj->size;
> - sg_set_page(sg->sgl, pfn_to_page(PFN_DOWN(dma_addr)), obj->size, 0);
> - sg_dma_address(sg->sgl) = dma_addr;
> + sg = omap_gem_get_sg(obj);
> + if (IS_ERR(sg))
> + return sg;
>
> /* this must be after omap_gem_pin() to ensure we have pages attached */
> omap_gem_dma_sync_buffer(obj, dir);
>
> return sg;
> -out:
> - kfree(sg);
> - return ERR_PTR(ret);
> }
>
> static void omap_gem_unmap_dma_buf(struct dma_buf_attachment *attachment,
> struct sg_table *sg, enum dma_data_direction dir)
> {
> struct drm_gem_object *obj = attachment->dmabuf->priv;
> - omap_gem_unpin(obj);
> - sg_free_table(sg);
> - kfree(sg);
> + omap_gem_put_sg(obj, sg);
> }
>
> static int omap_gem_dmabuf_begin_cpu_access(struct dma_buf *buffer,
>

2021-11-13 09:58:06

by Ivaylo Dimitrov

[permalink] [raw]
Subject: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Memory of BOs backed by TILER is not contiguous, but omap_gem_map_dma_buf()
exports it like it is. This leads to (possibly) invalid memory accesses if
another device imports such a BO.

Fix that by providing a scatterlist that correctly describes TILER memory
layout.

Suggested-by: Matthijs van Duin <[email protected]>
Signed-off-by: Ivaylo Dimitrov <[email protected]>
---
drivers/gpu/drm/omapdrm/omap_gem.c | 76 +++++++++++++++++++++++++++++++
drivers/gpu/drm/omapdrm/omap_gem.h | 3 +-
drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 32 ++-----------
3 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 97e5fe6..2ffcc37 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -862,6 +862,11 @@ static void omap_gem_unpin_locked(struct drm_gem_object *obj)
return;

if (refcount_dec_and_test(&omap_obj->dma_addr_cnt)) {
+ if (omap_obj->sgt) {
+ sg_free_table(omap_obj->sgt);
+ kfree(omap_obj->sgt);
+ omap_obj->sgt = NULL;
+ }
ret = tiler_unpin(omap_obj->block);
if (ret) {
dev_err(obj->dev->dev,
@@ -974,6 +979,77 @@ int omap_gem_put_pages(struct drm_gem_object *obj)
return 0;
}

+struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj)
+{
+ struct omap_gem_object *omap_obj = to_omap_bo(obj);
+ dma_addr_t addr;
+ struct sg_table *sgt;
+ struct scatterlist *sg;
+ unsigned int count, len, stride, i;
+ int ret;
+
+ ret = omap_gem_pin(obj, &addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ mutex_lock(&omap_obj->lock);
+
+ sgt = omap_obj->sgt;
+ if (sgt)
+ goto out;
+
+ sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!sgt)
+ goto out_unpin;
+
+ if (omap_obj->flags & OMAP_BO_TILED_MASK) {
+ enum tiler_fmt fmt = gem2fmt(omap_obj->flags);
+
+ len = omap_obj->width << (int)fmt;
+ count = omap_obj->height;
+ stride = tiler_stride(fmt, 0);
+ } else {
+ len = obj->size;
+ count = 1;
+ stride = 0;
+ }
+
+ ret = sg_alloc_table(sgt, count, GFP_KERNEL);
+ if (ret)
+ goto out_free;
+
+ for_each_sg(sgt->sgl, sg, count, i) {
+ sg_set_page(sg, phys_to_page(addr), len, offset_in_page(addr));
+ sg_dma_address(sg) = addr;
+ sg_dma_len(sg) = len;
+
+ addr += stride;
+ }
+
+ omap_obj->sgt = sgt;
+out:
+ mutex_unlock(&omap_obj->lock);
+ return sgt;
+
+out_free:
+ kfree(sgt);
+out_unpin:
+ mutex_unlock(&omap_obj->lock);
+ omap_gem_unpin(obj);
+ return ERR_PTR(ret);
+}
+
+void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt)
+{
+ struct omap_gem_object *omap_obj = to_omap_bo(obj);
+
+ if (WARN_ON(omap_obj->sgt != sgt))
+ return;
+
+ omap_gem_unpin(obj);
+}
+
#ifdef CONFIG_DRM_FBDEV_EMULATION
/*
* Get kernel virtual address for CPU access.. this more or less only
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.h b/drivers/gpu/drm/omapdrm/omap_gem.h
index eda9b48..3b61cfc 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.h
+++ b/drivers/gpu/drm/omapdrm/omap_gem.h
@@ -77,10 +77,11 @@ void omap_gem_dma_sync_buffer(struct drm_gem_object *obj,
int omap_gem_get_pages(struct drm_gem_object *obj, struct page ***pages,
bool remap);
int omap_gem_put_pages(struct drm_gem_object *obj);
-
u32 omap_gem_flags(struct drm_gem_object *obj);
int omap_gem_rotated_dma_addr(struct drm_gem_object *obj, u32 orient,
int x, int y, dma_addr_t *dma_addr);
int omap_gem_tiled_stride(struct drm_gem_object *obj, u32 orient);
+struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj);
+void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt);

#endif /* __OMAPDRM_GEM_H__ */
diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
index f4cde3a..9650416 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
@@ -21,45 +21,21 @@ static struct sg_table *omap_gem_map_dma_buf(
{
struct drm_gem_object *obj = attachment->dmabuf->priv;
struct sg_table *sg;
- dma_addr_t dma_addr;
- int ret;
-
- sg = kzalloc(sizeof(*sg), GFP_KERNEL);
- if (!sg)
- return ERR_PTR(-ENOMEM);
-
- /* camera, etc, need physically contiguous.. but we need a
- * better way to know this..
- */
- ret = omap_gem_pin(obj, &dma_addr);
- if (ret)
- goto out;
-
- ret = sg_alloc_table(sg, 1, GFP_KERNEL);
- if (ret)
- goto out;
-
- sg_init_table(sg->sgl, 1);
- sg_dma_len(sg->sgl) = obj->size;
- sg_set_page(sg->sgl, pfn_to_page(PFN_DOWN(dma_addr)), obj->size, 0);
- sg_dma_address(sg->sgl) = dma_addr;
+ sg = omap_gem_get_sg(obj);
+ if (IS_ERR(sg))
+ return sg;

/* this must be after omap_gem_pin() to ensure we have pages attached */
omap_gem_dma_sync_buffer(obj, dir);

return sg;
-out:
- kfree(sg);
- return ERR_PTR(ret);
}

static void omap_gem_unmap_dma_buf(struct dma_buf_attachment *attachment,
struct sg_table *sg, enum dma_data_direction dir)
{
struct drm_gem_object *obj = attachment->dmabuf->priv;
- omap_gem_unpin(obj);
- sg_free_table(sg);
- kfree(sg);
+ omap_gem_put_sg(obj, sg);
}

static int omap_gem_dmabuf_begin_cpu_access(struct dma_buf *buffer,
--
1.9.1


2021-11-15 08:45:31

by Tomi Valkeinen

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Hi,

On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
> Memory of BOs backed by TILER is not contiguous, but omap_gem_map_dma_buf()
> exports it like it is. This leads to (possibly) invalid memory accesses if
> another device imports such a BO.

This is one reason why TILER hasn't been officially supported. But the
above is not exactly right, or at least not the whole truth.

A BO's memory via the TILER memory is contiguous, although with
consistent gaps of memory that should not be accessed. That point is
important, as the IPs that might use TILER backed BOs only support
contiguous memory.

This means that the drivers for such IPs cannot use the BOs exported
like you do in this patch. I believe the drivers could be improved by
writing a helper function which studies the sg_table and concludes that
it's actually contiguous. I think we should have at least one such
driver fixed along with this patch so that we can be more confident that
this actually works.

But I'm not sure what that driver would be on droid4. I have DRA7 boards
which have VIP, CAL and VPE that I can use here. Perhaps it wouldn't be
too much effort for me to extend my tests a bit to include CAL, and try
to fix the driver. I just fear the driver changes won't be trivial.

Did you test this somehow?

Did you look at the userspace mmap of TILER buffers? I wonder if that
goes correctly or not. Isn't memory to userspace mapped per page, and
lengths of the TILER lines are not page aligned?

> Fix that by providing a scatterlist that correctly describes TILER memory
> layout.
>
> Suggested-by: Matthijs van Duin <[email protected]>
> Signed-off-by: Ivaylo Dimitrov <[email protected]>
> ---
> drivers/gpu/drm/omapdrm/omap_gem.c | 76 +++++++++++++++++++++++++++++++
> drivers/gpu/drm/omapdrm/omap_gem.h | 3 +-
> drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 32 ++-----------
> 3 files changed, 82 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
> index 97e5fe6..2ffcc37 100644
> --- a/drivers/gpu/drm/omapdrm/omap_gem.c
> +++ b/drivers/gpu/drm/omapdrm/omap_gem.c
> @@ -862,6 +862,11 @@ static void omap_gem_unpin_locked(struct drm_gem_object *obj)
> return;
>
> if (refcount_dec_and_test(&omap_obj->dma_addr_cnt)) {
> + if (omap_obj->sgt) {
> + sg_free_table(omap_obj->sgt);
> + kfree(omap_obj->sgt);
> + omap_obj->sgt = NULL;
> + }

This behavior is different than before, isn't it? The commit desc only
mentions changing the construction of the sg-list, not how they're
allocated and freed.

> ret = tiler_unpin(omap_obj->block);
> if (ret) {
> dev_err(obj->dev->dev,
> @@ -974,6 +979,77 @@ int omap_gem_put_pages(struct drm_gem_object *obj)
> return 0;
> }
>
> +struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj)
> +{
> + struct omap_gem_object *omap_obj = to_omap_bo(obj);
> + dma_addr_t addr;
> + struct sg_table *sgt;
> + struct scatterlist *sg;
> + unsigned int count, len, stride, i;
> + int ret;
> +
> + ret = omap_gem_pin(obj, &addr);
> + if (ret)
> + return ERR_PTR(ret);
> +
> + mutex_lock(&omap_obj->lock);
> +
> + sgt = omap_obj->sgt;
> + if (sgt)
> + goto out;
> +
> + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> + ret = -ENOMEM;
> + if (!sgt)
> + goto out_unpin;

I think you can move the ret = ... to be inside the if block.

> +
> + if (omap_obj->flags & OMAP_BO_TILED_MASK) {
> + enum tiler_fmt fmt = gem2fmt(omap_obj->flags);
> +
> + len = omap_obj->width << (int)fmt;
> + count = omap_obj->height;
> + stride = tiler_stride(fmt, 0);
> + } else {
> + len = obj->size;
> + count = 1;
> + stride = 0;
> + }
> +
> + ret = sg_alloc_table(sgt, count, GFP_KERNEL);
> + if (ret)
> + goto out_free;
> +
> + for_each_sg(sgt->sgl, sg, count, i) {
> + sg_set_page(sg, phys_to_page(addr), len, offset_in_page(addr));
> + sg_dma_address(sg) = addr;
> + sg_dma_len(sg) = len;
> +
> + addr += stride;
> + }
> +
> + omap_obj->sgt = sgt;
> +out:
> + mutex_unlock(&omap_obj->lock);
> + return sgt;
> +
> +out_free:
> + kfree(sgt);
> +out_unpin:

There are errors handlers, I suggest err_ prefix.

> + mutex_unlock(&omap_obj->lock);
> + omap_gem_unpin(obj);
> + return ERR_PTR(ret);
> +}
> +
> +void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt)
> +{
> + struct omap_gem_object *omap_obj = to_omap_bo(obj);
> +
> + if (WARN_ON(omap_obj->sgt != sgt))
> + return;
> +
> + omap_gem_unpin(obj);
> +}
> +
> #ifdef CONFIG_DRM_FBDEV_EMULATION
> /*
> * Get kernel virtual address for CPU access.. this more or less only
> diff --git a/drivers/gpu/drm/omapdrm/omap_gem.h b/drivers/gpu/drm/omapdrm/omap_gem.h
> index eda9b48..3b61cfc 100644
> --- a/drivers/gpu/drm/omapdrm/omap_gem.h
> +++ b/drivers/gpu/drm/omapdrm/omap_gem.h
> @@ -77,10 +77,11 @@ void omap_gem_dma_sync_buffer(struct drm_gem_object *obj,
> int omap_gem_get_pages(struct drm_gem_object *obj, struct page ***pages,
> bool remap);
> int omap_gem_put_pages(struct drm_gem_object *obj);
> -

Extra change here.

> u32 omap_gem_flags(struct drm_gem_object *obj);
> int omap_gem_rotated_dma_addr(struct drm_gem_object *obj, u32 orient,
> int x, int y, dma_addr_t *dma_addr);
> int omap_gem_tiled_stride(struct drm_gem_object *obj, u32 orient);
> +struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj);
> +void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt);
>
> #endif /* __OMAPDRM_GEM_H__ */
> diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
> index f4cde3a..9650416 100644
> --- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
> +++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
> @@ -21,45 +21,21 @@ static struct sg_table *omap_gem_map_dma_buf(
> {
> struct drm_gem_object *obj = attachment->dmabuf->priv;
> struct sg_table *sg;
> - dma_addr_t dma_addr;
> - int ret;
> -
> - sg = kzalloc(sizeof(*sg), GFP_KERNEL);
> - if (!sg)
> - return ERR_PTR(-ENOMEM);
> -
> - /* camera, etc, need physically contiguous.. but we need a
> - * better way to know this..
> - */
> - ret = omap_gem_pin(obj, &dma_addr);
> - if (ret)
> - goto out;
> -
> - ret = sg_alloc_table(sg, 1, GFP_KERNEL);
> - if (ret)
> - goto out;
> -
> - sg_init_table(sg->sgl, 1);
> - sg_dma_len(sg->sgl) = obj->size;
> - sg_set_page(sg->sgl, pfn_to_page(PFN_DOWN(dma_addr)), obj->size, 0);
> - sg_dma_address(sg->sgl) = dma_addr;
> + sg = omap_gem_get_sg(obj);
> + if (IS_ERR(sg))
> + return sg;
>
> /* this must be after omap_gem_pin() to ensure we have pages attached */
> omap_gem_dma_sync_buffer(obj, dir);
>
> return sg;
> -out:
> - kfree(sg);
> - return ERR_PTR(ret);
> }
>
> static void omap_gem_unmap_dma_buf(struct dma_buf_attachment *attachment,
> struct sg_table *sg, enum dma_data_direction dir)
> {
> struct drm_gem_object *obj = attachment->dmabuf->priv;
> - omap_gem_unpin(obj);
> - sg_free_table(sg);
> - kfree(sg);
> + omap_gem_put_sg(obj, sg);
> }
>
> static int omap_gem_dmabuf_begin_cpu_access(struct dma_buf *buffer,
>

2021-11-15 09:24:42

by Matthijs van Duin

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

On Mon, Nov 15, 2021 at 10:42:41AM +0200, Tomi Valkeinen wrote:
> A BO's memory via the TILER memory is
> contiguous, although with consistent gaps of
> memory that should not be accessed.

But pretending that these "gaps" are part of the buffer is a security
vulnerability, since that memory which "should not be accessed" may
belong to different security contexts, and exporting the entire
contiguous region covering the buffer allows untrusted contexts (e.g.
userspace) to access this memory.

> IPs that might use TILER
> backed BOs only support contiguous memory.
>
> This means that the drivers for such IPs cannot
> use the BOs exported like you do in this patch.
> I believe the drivers could be improved by
> writing a helper function which studies the
> sg_table and concludes that it's actually
> contiguous.

That indeed sounds like the proper solution for such importers, rather
than making the exporter lie about the buffer bounds to work around
limitations of these importers.

> Did you look at the userspace mmap of TILER
> buffers? I wonder if that goes correctly or not.
> Isn't memory to userspace mapped per page, and
> lengths of the TILER lines are not page aligned?

Mapping to userspace uses an ugly hack whereby small slabs of the
buffer (4096x64 (8bpp), 2048x32 (16bpp), or 1024x32 (32bpp) pixels) are
dynamically mapped to dedicated page-aligned regions of the TILER
virtual space. For each of the three bitdepths only two such slabs can
be mapped into userspace at any given time (on the entire system), so
using this mechanism to render graphics from userspace can easily cause
hundreds if not thousands of page faults per second.

The alternative (used e.g. in the pyra kernel) is to force all TILER
buffers to be page-aligned, at the cost of wasting some TILER space.
This will presumably also be necessary to allow SGX to import these
buffers since its MMU can obviously also not map data which is not
page-aligned, same for any other importer which uses an MMU to enforce
memory security (rather than being trusted to simply refrain from
accessing data outside the declared bounds).

Ideally such page-alignment should only be applied to buffers which are
intended to be consumed by importers which require this, though it's not
clear how that might be accomplished.

--
Matthijs van Duin

2021-11-15 10:38:03

by Tomi Valkeinen

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

On 15/11/2021 11:23, Matthijs van Duin wrote:
> On Mon, Nov 15, 2021 at 10:42:41AM +0200, Tomi Valkeinen wrote:
>> A BO's memory via the TILER memory is
>> contiguous, although with consistent gaps of
>> memory that should not be accessed.
>
> But pretending that these "gaps" are part of the buffer is a security
> vulnerability, since that memory which "should not be accessed" may
> belong to different security contexts, and exporting the entire
> contiguous region covering the buffer allows untrusted contexts (e.g.
> userspace) to access this memory.

Yes, I fully agree. I wasn't criticizing the patch, just wanted to
highlight the unmentioned aspects.

>> IPs that might use TILER
>> backed BOs only support contiguous memory.
>>
>> This means that the drivers for such IPs cannot
>> use the BOs exported like you do in this patch.
>> I believe the drivers could be improved by
>> writing a helper function which studies the
>> sg_table and concludes that it's actually
>> contiguous.
>
> That indeed sounds like the proper solution for such importers, rather
> than making the exporter lie about the buffer bounds to work around
> limitations of these importers.

The annoying thing with this solution is that we need to add extra code
to all the drivers that want to import TILER buffers, even if the
drivers shouldn't know anything about TILER.

Then again, the code is not really TILER or OMAP specific, and any IP on
any platform that only supports contiguous buffers, but supports stride,
could import such buffers. Which hints that maybe the code should be
somewhere in the framework, not in the driver. In practice it may be
better to just swallow the annoyance, add the code to the drivers and be
done with it =).

>> Did you look at the userspace mmap of TILER
>> buffers? I wonder if that goes correctly or not.
>> Isn't memory to userspace mapped per page, and
>> lengths of the TILER lines are not page aligned?
>
> Mapping to userspace uses an ugly hack whereby small slabs of the
> buffer (4096x64 (8bpp), 2048x32 (16bpp), or 1024x32 (32bpp) pixels) are
> dynamically mapped to dedicated page-aligned regions of the TILER
> virtual space. For each of the three bitdepths only two such slabs can
> be mapped into userspace at any given time (on the entire system), so
> using this mechanism to render graphics from userspace can easily cause
> hundreds if not thousands of page faults per second.

Ah, right, yes, now I remember. The userspace mmap of TILER buffers
isn't very usable.

> The alternative (used e.g. in the pyra kernel) is to force all TILER
> buffers to be page-aligned, at the cost of wasting some TILER space.
> This will presumably also be necessary to allow SGX to import these
> buffers since its MMU can obviously also not map data which is not
> page-aligned, same for any other importer which uses an MMU to enforce
> memory security (rather than being trusted to simply refrain from
> accessing data outside the declared bounds).
>
> Ideally such page-alignment should only be applied to buffers which are
> intended to be consumed by importers which require this, though it's not
> clear how that might be accomplished.

Do you mean that each TILER _line_ should be page aligned and the length
should be page divisible? Doesn't that cause quite a lot of wasted
space? Although that, of course, depends on the use. If the primary use
case is allocating a few full screen display buffers, perhaps the waste
is negligible.

In any case, I'm fine with that change, too, as it helps making TILER
usable.

And while speaking about usable, if I recall right, the
omap_bo_new_tiled() is pretty annoying to use. You need to give the
width and OMAP_BO_TILED_x flag as a parameter, and if I recall right,
it's all but obvious how those need to be set for, e.g. NV12. But it
works so perhaps better to keep it as it is...

There was also some particular YUV format with some rotations that I
never got working, even after discussing with TI DSS HW guys.

Tomi

2021-11-15 14:00:39

by Ivaylo Dimitrov

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Hi Tomi,

On 15.11.21 г. 10:42 ч., Tomi Valkeinen wrote:
> Hi,
>
> On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
>> Memory of BOs backed by TILER is not contiguous, but
>> omap_gem_map_dma_buf()
>> exports it like it is. This leads to (possibly) invalid memory
>> accesses if
>> another device imports such a BO.
>
> This is one reason why TILER hasn't been officially supported. But the
> above is not exactly right, or at least not the whole truth.
>

Definitely, not only these BOs lie about their memory layout, they lie
about size and alignment as well. I have 2 more patches here (one is to
align TILER memory on page, as proposed by Matthijs in the other mail,
the other to set the correct size when exporting TILER BO), but I wanted
to hear from you first, like, what is the general trend :) .

Also, I have another patch in mind, that will enable exporting of
buffers that are not TILER backed, but are not CMA backed either. SGX
for example does not need CMA memory to render to.

> A BO's memory via the TILER memory is contiguous, although with
> consistent gaps of memory that should not be accessed.

I think this more or less contradicts to the definition of contiguous,
no? :)

> That point is
> important, as the IPs that might use TILER backed BOs only support
> contiguous memory.
>

Well, every IP with MMU should be capable to use such BOs. SGX has one
for sure, IIRC IVA and ISP on omap3 has MMUs too.

> This means that the drivers for such IPs cannot use the BOs exported

Neither they can use them without the patch.

> like you do in this patch. I believe the drivers could be improved by
> writing a helper function which studies the sg_table and concludes that
> it's actually contiguous. I think we should have at least one such
> driver fixed along with this patch so that we can be more confident that
> this actually works.
>

Yes, I we have such driver, but unfortunately it is not upstream (PVR
driver that is). Right now I have droid4 in front of me with SGX
rendering hildon-desktop in landscape using TILER BO.

Sure I had 2 more patches applied and also had to teach PVR driver to
know how to use that scatterlist, but that was a trivial change (an old
version of the patch
https://github.com/tmlind/linux_openpvrsgx/commit/90f16ed906c8c6eb4893d3ff647ca7d921972495).

See glmark-es2 results (with a little help of a simple PVR EXA):

user@devuan-droid4:/root$ uname -a
Linux devuan-droid4 5.15.2-01783-g6ba3430a6fad-dirty #28 SMP PREEMPT Mon
Nov 15 08:48:21 EET 2021 armv7l GNU/Linux
user@devuan-droid4:/root$ xrandr -o 3
user@devuan-droid4:/root$ glmark2-es2 --fullscreen
=======================================================
glmark2 2020.04
=======================================================
OpenGL Information
GL_VENDOR: Imagination Technologies
GL_RENDERER: PowerVR SGX 540
GL_VERSION: OpenGL ES 2.0 build 1.17@4948957
=======================================================
[build] use-vbo=false: FPS: 107 FrameTime: 9.346 ms
[build] use-vbo=true: FPS: 136 FrameTime: 7.353 ms
[texture] texture-filter=nearest: FPS: 156 FrameTime: 6.410 ms
[texture] texture-filter=linear: FPS: 153 FrameTime: 6.536 ms
[texture] texture-filter=mipmap: FPS: 152 FrameTime: 6.579 ms
[shading] shading=gouraud: FPS: 111 FrameTime: 9.009 ms
[shading] shading=blinn-phong-inf: FPS: 116 FrameTime: 8.621 ms
[shading] shading=phong: FPS: 104 FrameTime: 9.615 ms
[shading] shading=cel: FPS: 96 FrameTime: 10.417 ms
[bump] bump-render=high-poly: FPS: 67 FrameTime: 14.925 ms
[bump] bump-render=normals: FPS: 140 FrameTime: 7.143 ms
[bump] bump-render=height: FPS: 131 FrameTime: 7.634 ms
[effect2d] kernel=0,1,0;1,-4,1;0,1,0;: FPS: 49 FrameTime: 20.408 ms
[effect2d] kernel=1,1,1,1,1;1,1,1,1,1;1,1,1,1,1;: FPS: 17 FrameTime:
58.824 ms
[pulsar] light=false:quads=5:texture=false: FPS: 152 FrameTime: 6.579 ms
[desktop] blur-radius=5:effect=blur:passes=1:separable=true:windows=4:
FPS: 23 FrameTime: 43.478 ms
[desktop] effect=shadow:windows=4: FPS: 64 FrameTime: 15.625 ms
[buffer]
columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=map:
FPS: 33 FrameTime: 30.303 ms
[buffer]
columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=subdata:
FPS: 33 FrameTime: 30.303 ms
[buffer]
columns=200:interleave=true:update-dispersion=0.9:update-fraction=0.5:update-method=map:
FPS: 60 FrameTime: 16.667 ms
[ideas] speed=duration: FPS: 115 FrameTime: 8.696 ms
[jellyfish] <default>: FPS: 48 FrameTime: 20.833 ms
[terrain] <default>:PVR:(Error): SGXKickTA: TA went out of Mem and SPM
occurred during last TA kick [0, ]
PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during last
TA kick [0, ]
PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during last
TA kick [0, ]
FPS: 1 FrameTime: 1000.000 ms
[shadow] <default>: FPS: 31 FrameTime: 32.258 ms
[refract] <default>: FPS: 13 FrameTime: 76.923 ms
[conditionals] fragment-steps=0:vertex-steps=0: FPS: 156 FrameTime: 6.410 ms
[conditionals] fragment-steps=5:vertex-steps=0: FPS: 100 FrameTime:
10.000 ms
[conditionals] fragment-steps=0:vertex-steps=5: FPS: 156 FrameTime: 6.410 ms
[function] fragment-complexity=low:fragment-steps=5: FPS: 128 FrameTime:
7.812 ms
[function] fragment-complexity=medium:fragment-steps=5: FPS: 83
FrameTime: 12.048 ms
[loop] fragment-loop=false:fragment-steps=5:vertex-steps=5: FPS: 127
FrameTime: 7.874 ms
[loop] fragment-steps=5:fragment-uniform=false:vertex-steps=5: FPS: 104
FrameTime: 9.615 ms
[loop] fragment-steps=5:fragment-uniform=true:vertex-steps=5: FPS: 77
FrameTime: 12.987 ms
=======================================================
glmark2 Score: 92
=======================================================

Impressive, ain't it ;)

> But I'm not sure what that driver would be on droid4. I have DRA7 boards
> which have VIP, CAL and VPE that I can use here. Perhaps it wouldn't be
> too much effort for me to extend my tests a bit to include CAL, and try
> to fix the driver. I just fear the driver changes won't be trivial.
>

At least PVR change was really trivial.

> Did you test this somehow?
>

Yes, on droid4, works with no issue, see above.

> Did you look at the userspace mmap of TILER buffers? I wonder if that
> goes correctly or not. Isn't memory to userspace mapped per page, and
> lengths of the TILER lines are not page aligned?
>

I really can't explain it any better than Matthijs, see the other mail.

So, what I think shall be done to have TILER BOs (and not only) in a
shape that's usable for anything else but a simple test-cases, if you
accept the $subject patch:

1. Make TILER BOs page-aligned (simple patch, I already have it). That
should fix possible invalid memory accesses for both mmap()-ed memory
and kernel drivers.

2. Set exp_info.size = omap_gem_mmap_size(obj); when exporting a BO.
That way importer knows the real BO memory size (including alignment
etc) so he will be able to calculate the number of pages he needs to map
the scatterlist.

3. Do not refuse to export non-TILER non-contiguous buffers (we can use
them for rendering, for example)

4. make VRFB functional and implement the same/similar logic as for
TILER BOs.

5. Teach omapdrm to know about different GBM allocator flags (like
allocate TILER/VRFB backed BO if GBM_BO_USE_LINEAR is not specified, but
GBM_BO_USE_SCANOUT is or something like that). I still don't have all
the details, but the ultimate goal is to get rid of omap_bo_xxx stuff
and to use GBM BOs. That will eventually allow modesetting to be fixed
to not do SW rotation. But that's too far in the future I guess.

Sounds like a plan, yeah :) ?

But, for now will send a v3 patch with the requested fixes, see below.

>> Fix that by providing a scatterlist that correctly describes TILER memory
>> layout.
>>
>> Suggested-by: Matthijs van Duin <[email protected]>
>> Signed-off-by: Ivaylo Dimitrov <[email protected]>
>> ---
>>   drivers/gpu/drm/omapdrm/omap_gem.c        | 76
>> +++++++++++++++++++++++++++++++
>>   drivers/gpu/drm/omapdrm/omap_gem.h        |  3 +-
>>   drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 32 ++-----------
>>   3 files changed, 82 insertions(+), 29 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c
>> b/drivers/gpu/drm/omapdrm/omap_gem.c
>> index 97e5fe6..2ffcc37 100644
>> --- a/drivers/gpu/drm/omapdrm/omap_gem.c
>> +++ b/drivers/gpu/drm/omapdrm/omap_gem.c
>> @@ -862,6 +862,11 @@ static void omap_gem_unpin_locked(struct
>> drm_gem_object *obj)
>>           return;
>>       if (refcount_dec_and_test(&omap_obj->dma_addr_cnt)) {
>> +        if (omap_obj->sgt) {
>> +            sg_free_table(omap_obj->sgt);
>> +            kfree(omap_obj->sgt);
>> +            omap_obj->sgt = NULL;
>> +        }
>
> This behavior is different than before, isn't it? The commit desc only
> mentions changing the construction of the sg-list, not how they're
> allocated and freed.
>

Will fix the commit description.

>>           ret = tiler_unpin(omap_obj->block);
>>           if (ret) {
>>               dev_err(obj->dev->dev,
>> @@ -974,6 +979,77 @@ int omap_gem_put_pages(struct drm_gem_object *obj)
>>       return 0;
>>   }
>> +struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj)
>> +{
>> +    struct omap_gem_object *omap_obj = to_omap_bo(obj);
>> +    dma_addr_t addr;
>> +    struct sg_table *sgt;
>> +    struct scatterlist *sg;
>> +    unsigned int count, len, stride, i;
>> +    int ret;
>> +
>> +    ret = omap_gem_pin(obj, &addr);
>> +    if (ret)
>> +        return ERR_PTR(ret);
>> +
>> +    mutex_lock(&omap_obj->lock);
>> +
>> +    sgt = omap_obj->sgt;
>> +    if (sgt)
>> +        goto out;
>> +
>> +    sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
>> +    ret = -ENOMEM;
>> +    if (!sgt)
>> +        goto out_unpin;
>
> I think you can move the ret = ... to be inside the if block.
>

Will do.

>> +
>> +    if (omap_obj->flags & OMAP_BO_TILED_MASK) {
>> +        enum tiler_fmt fmt = gem2fmt(omap_obj->flags);
>> +
>> +        len = omap_obj->width << (int)fmt;
>> +        count = omap_obj->height;
>> +        stride = tiler_stride(fmt, 0);
>> +    } else {
>> +        len = obj->size;
>> +        count = 1;
>> +        stride = 0;
>> +    }
>> +
>> +    ret = sg_alloc_table(sgt, count, GFP_KERNEL);
>> +    if (ret)
>> +        goto out_free;
>> +
>> +    for_each_sg(sgt->sgl, sg, count, i) {
>> +        sg_set_page(sg, phys_to_page(addr), len, offset_in_page(addr));
>> +        sg_dma_address(sg) = addr;
>> +        sg_dma_len(sg) = len;
>> +
>> +        addr += stride;
>> +    }
>> +
>> +    omap_obj->sgt = sgt;
>> +out:
>> +    mutex_unlock(&omap_obj->lock);
>> +    return sgt;
>> +
>> +out_free:
>> +    kfree(sgt);
>> +out_unpin:
>
> There are errors handlers, I suggest err_ prefix.
>

ok

>> +    mutex_unlock(&omap_obj->lock);
>> +    omap_gem_unpin(obj);
>> +    return ERR_PTR(ret);
>> +}
>> +
>> +void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt)
>> +{
>> +    struct omap_gem_object *omap_obj = to_omap_bo(obj);
>> +
>> +    if (WARN_ON(omap_obj->sgt != sgt))
>> +        return;
>> +
>> +    omap_gem_unpin(obj);
>> +}
>> +
>>   #ifdef CONFIG_DRM_FBDEV_EMULATION
>>   /*
>>    * Get kernel virtual address for CPU access.. this more or less only
>> diff --git a/drivers/gpu/drm/omapdrm/omap_gem.h
>> b/drivers/gpu/drm/omapdrm/omap_gem.h
>> index eda9b48..3b61cfc 100644
>> --- a/drivers/gpu/drm/omapdrm/omap_gem.h
>> +++ b/drivers/gpu/drm/omapdrm/omap_gem.h
>> @@ -77,10 +77,11 @@ void omap_gem_dma_sync_buffer(struct
>> drm_gem_object *obj,
>>   int omap_gem_get_pages(struct drm_gem_object *obj, struct page
>> ***pages,
>>           bool remap);
>>   int omap_gem_put_pages(struct drm_gem_object *obj);
>> -
>
> Extra change here.
>

will remove that.

>>   u32 omap_gem_flags(struct drm_gem_object *obj);
>>   int omap_gem_rotated_dma_addr(struct drm_gem_object *obj, u32 orient,
>>           int x, int y, dma_addr_t *dma_addr);
>>   int omap_gem_tiled_stride(struct drm_gem_object *obj, u32 orient);
>> +struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj);
>> +void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt);
>>   #endif /* __OMAPDRM_GEM_H__ */
>> diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
>> b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
>> index f4cde3a..9650416 100644
>> --- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
>> +++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
>> @@ -21,45 +21,21 @@ static struct sg_table *omap_gem_map_dma_buf(
>>   {
>>       struct drm_gem_object *obj = attachment->dmabuf->priv;
>>       struct sg_table *sg;
>> -    dma_addr_t dma_addr;
>> -    int ret;
>> -
>> -    sg = kzalloc(sizeof(*sg), GFP_KERNEL);
>> -    if (!sg)
>> -        return ERR_PTR(-ENOMEM);
>> -
>> -    /* camera, etc, need physically contiguous.. but we need a
>> -     * better way to know this..
>> -     */
>> -    ret = omap_gem_pin(obj, &dma_addr);
>> -    if (ret)
>> -        goto out;
>> -
>> -    ret = sg_alloc_table(sg, 1, GFP_KERNEL);
>> -    if (ret)
>> -        goto out;
>> -
>> -    sg_init_table(sg->sgl, 1);
>> -    sg_dma_len(sg->sgl) = obj->size;
>> -    sg_set_page(sg->sgl, pfn_to_page(PFN_DOWN(dma_addr)), obj->size, 0);
>> -    sg_dma_address(sg->sgl) = dma_addr;
>> +    sg = omap_gem_get_sg(obj);
>> +    if (IS_ERR(sg))
>> +        return sg;
>>       /* this must be after omap_gem_pin() to ensure we have pages
>> attached */
>>       omap_gem_dma_sync_buffer(obj, dir);
>>       return sg;
>> -out:
>> -    kfree(sg);
>> -    return ERR_PTR(ret);
>>   }
>>   static void omap_gem_unmap_dma_buf(struct dma_buf_attachment
>> *attachment,
>>           struct sg_table *sg, enum dma_data_direction dir)
>>   {
>>       struct drm_gem_object *obj = attachment->dmabuf->priv;
>> -    omap_gem_unpin(obj);
>> -    sg_free_table(sg);
>> -    kfree(sg);
>> +    omap_gem_put_sg(obj, sg);
>>   }
>>   static int omap_gem_dmabuf_begin_cpu_access(struct dma_buf *buffer,
>>

Thanks and regards,
Ivo

2021-11-15 14:10:28

by Ivaylo Dimitrov

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs


Hi,

On 15.11.21 г. 12:37 ч., Tomi Valkeinen wrote:
> On 15/11/2021 11:23, Matthijs van Duin wrote:
>> On Mon, Nov 15, 2021 at 10:42:41AM +0200, Tomi Valkeinen wrote:
>>> A BO's memory via the TILER memory is
>>> contiguous, although with consistent gaps of
>>> memory that should not be accessed.
>>
>> But pretending that these "gaps" are part of the buffer is a security
>> vulnerability, since that memory which "should not be accessed" may
>> belong to different security contexts, and exporting the entire
>> contiguous region covering the buffer allows untrusted contexts (e.g.
>> userspace) to access this memory.
>
> Yes, I fully agree. I wasn't criticizing the patch, just wanted to
> highlight the unmentioned aspects.
>
>>> IPs that might use TILER
>>> backed BOs only support contiguous memory.
>>>
>>> This means that the drivers for such IPs cannot
>>> use the BOs exported like you do in this patch.
>>> I believe the drivers could be improved by
>>> writing a helper function which studies the
>>> sg_table and concludes that it's actually
>>> contiguous.
>>
>> That indeed sounds like the proper solution for such importers, rather
>> than making the exporter lie about the buffer bounds to work around
>> limitations of these importers.
>
> The annoying thing with this solution is that we need to add extra code
> to all the drivers that want to import TILER buffers, even if the
> drivers shouldn't know anything about TILER.
>
> Then again, the code is not really TILER or OMAP specific, and any IP on
> any platform that only supports contiguous buffers, but supports stride,
> could import such buffers. Which hints that maybe the code should be
> somewhere in the framework, not in the driver. In practice it may be
> better to just swallow the annoyance, add the code to the drivers and be
> done with it =).
>
>>> Did you look at the userspace mmap of TILER
>>> buffers? I wonder if that goes correctly or not.
>>> Isn't memory to userspace mapped per page, and
>>> lengths of the TILER lines are not page aligned?
>>
>> Mapping to userspace uses an ugly hack whereby small slabs of the
>> buffer (4096x64 (8bpp), 2048x32 (16bpp), or 1024x32 (32bpp) pixels) are
>> dynamically mapped to dedicated page-aligned regions of the TILER
>> virtual space.  For each of the three bitdepths only two such slabs can
>> be mapped into userspace at any given time (on the entire system), so
>> using this mechanism to render graphics from userspace can easily cause
>> hundreds if not thousands of page faults per second.
>
> Ah, right, yes, now I remember. The userspace mmap of TILER buffers
> isn't very usable.
>
>> The alternative (used e.g. in the pyra kernel) is to force all TILER
>> buffers to be page-aligned, at the cost of wasting some TILER space.
>> This will presumably also be necessary to allow SGX to import these
>> buffers since its MMU can obviously also not map data which is not
>> page-aligned, same for any other importer which uses an MMU to enforce
>> memory security (rather than being trusted to simply refrain from
>> accessing data outside the declared bounds).
>>
>> Ideally such page-alignment should only be applied to buffers which are
>> intended to be consumed by importers which require this, though it's not
>> clear how that might be accomplished.
>
> Do you mean that each TILER _line_ should be page aligned and the length
> should be page divisible? Doesn't that cause quite a lot of wasted
> space? Although that, of course, depends on the use. If the primary use
> case is allocating a few full screen display buffers, perhaps the waste
> is negligible.
>

Yes, I think this is the idea, otherwise no MMU can be set correctly.

> In any case, I'm fine with that change, too, as it helps making TILER
> usable.
>

That's great, will send a patch ASAP.

> And while speaking about usable, if I recall right, the
> omap_bo_new_tiled() is pretty annoying to use. You need to give the
> width and OMAP_BO_TILED_x flag as a parameter, and if I recall right,
> it's all but obvious how those need to be set for, e.g. NV12. But it
> works so perhaps better to keep it as it is...
>

To me the whole omap_bo_xxx stuff should go away and be replaced by
gbm_bo_xxx stuff. The only issue there is with TILER BOs, but I think
we'll be able to get away with that with a little abuse of GBM_BO_XXX
flags (see the other mail)

> There was also some particular YUV format with some rotations that I
> never got working, even after discussing with TI DSS HW guys.
>
>  Tomi

Thanks,
Ivo

2021-11-15 15:37:35

by Tomi Valkeinen

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

On 15/11/2021 15:55, Ivaylo Dimitrov wrote:
> Hi Tomi,
>
> On 15.11.21 г. 10:42 ч., Tomi Valkeinen wrote:
>> Hi,
>>
>> On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
>>> Memory of BOs backed by TILER is not contiguous, but
>>> omap_gem_map_dma_buf()
>>> exports it like it is. This leads to (possibly) invalid memory
>>> accesses if
>>> another device imports such a BO.
>>
>> This is one reason why TILER hasn't been officially supported. But the
>> above is not exactly right, or at least not the whole truth.
>>
>
> Definitely, not only these BOs lie about their memory layout, they lie
> about size and alignment as well. I have 2 more patches here (one is to
> align TILER memory on page, as proposed by Matthijs in the other mail,
> the other to set the correct size when exporting TILER BO), but I wanted
> to hear from you first, like, what is the general trend :) .

My thoughts here are that the current code doesn't work in practice, so
if you get it fixed, it's great =).

> Also, I have another patch in mind, that will enable exporting of
> buffers that are not TILER backed, but are not CMA backed either. SGX
> for example does not need CMA memory to render to.

What do you mean with this? DSS needs contiguous memory, so the memory
has to be 1) physically contiguous, 2) mapped with DMM or 3) mapped with
TILER. There's no reason for the driver to export non-contiguous memory.

>> A BO's memory via the TILER memory is contiguous, although with
>> consistent gaps of memory that should not be accessed.
>
> I think this more or less contradicts to the definition of contiguous,
> no?  :)

Depends on the definition ;). The buffers can be handled with DMA that
only supports contiguous memory, so...

>> That point is important, as the IPs that might use TILER backed BOs
>> only support contiguous memory.
>>
>
> Well, every IP with MMU should be capable to use such BOs. SGX has one
> for sure, IIRC IVA and ISP on omap3 has MMUs too.

True, we have those. But none of the DRA7 capture IPs have MMU.

>> This means that the drivers for such IPs cannot use the BOs exported
>
> Neither they can use them without the patch.
>
>> like you do in this patch. I believe the drivers could be improved by
>> writing a helper function which studies the sg_table and concludes
>> that it's actually contiguous. I think we should have at least one
>> such driver fixed along with this patch so that we can be more
>> confident that this actually works.
>>
>
> Yes, I we have such driver, but unfortunately it is not upstream (PVR
> driver that is). Right now I have droid4 in front of me with SGX
> rendering hildon-desktop in landscape using TILER BO.
>
> Sure I had 2 more patches applied and also had to teach PVR driver to
> know how to use that scatterlist, but that was a trivial change (an old
> version of the patch
> https://github.com/tmlind/linux_openpvrsgx/commit/90f16ed906c8c6eb4893d3ff647ca7d921972495).
>
>
> See glmark-es2 results (with a little help of a simple PVR EXA):
>
> user@devuan-droid4:/root$ uname -a
> Linux devuan-droid4 5.15.2-01783-g6ba3430a6fad-dirty #28 SMP PREEMPT Mon
> Nov 15 08:48:21 EET 2021 armv7l GNU/Linux
> user@devuan-droid4:/root$ xrandr -o 3
> user@devuan-droid4:/root$ glmark2-es2 --fullscreen
> =======================================================
>     glmark2 2020.04
> =======================================================
>     OpenGL Information
>     GL_VENDOR:     Imagination Technologies
>     GL_RENDERER:   PowerVR SGX 540
>     GL_VERSION:    OpenGL ES 2.0 build 1.17@4948957
> =======================================================
> [build] use-vbo=false: FPS: 107 FrameTime: 9.346 ms
> [build] use-vbo=true: FPS: 136 FrameTime: 7.353 ms
> [texture] texture-filter=nearest: FPS: 156 FrameTime: 6.410 ms
> [texture] texture-filter=linear: FPS: 153 FrameTime: 6.536 ms
> [texture] texture-filter=mipmap: FPS: 152 FrameTime: 6.579 ms
> [shading] shading=gouraud: FPS: 111 FrameTime: 9.009 ms
> [shading] shading=blinn-phong-inf: FPS: 116 FrameTime: 8.621 ms
> [shading] shading=phong: FPS: 104 FrameTime: 9.615 ms
> [shading] shading=cel: FPS: 96 FrameTime: 10.417 ms
> [bump] bump-render=high-poly: FPS: 67 FrameTime: 14.925 ms
> [bump] bump-render=normals: FPS: 140 FrameTime: 7.143 ms
> [bump] bump-render=height: FPS: 131 FrameTime: 7.634 ms
> [effect2d] kernel=0,1,0;1,-4,1;0,1,0;: FPS: 49 FrameTime: 20.408 ms
> [effect2d] kernel=1,1,1,1,1;1,1,1,1,1;1,1,1,1,1;: FPS: 17 FrameTime:
> 58.824 ms
> [pulsar] light=false:quads=5:texture=false: FPS: 152 FrameTime: 6.579 ms
> [desktop] blur-radius=5:effect=blur:passes=1:separable=true:windows=4:
> FPS: 23 FrameTime: 43.478 ms
> [desktop] effect=shadow:windows=4: FPS: 64 FrameTime: 15.625 ms
> [buffer]
> columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=map:
> FPS: 33 FrameTime: 30.303 ms
> [buffer]
> columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=subdata:
> FPS: 33 FrameTime: 30.303 ms
> [buffer]
> columns=200:interleave=true:update-dispersion=0.9:update-fraction=0.5:update-method=map:
> FPS: 60 FrameTime: 16.667 ms
> [ideas] speed=duration: FPS: 115 FrameTime: 8.696 ms
> [jellyfish] <default>: FPS: 48 FrameTime: 20.833 ms
> [terrain] <default>:PVR:(Error): SGXKickTA: TA went out of Mem and SPM
> occurred during last TA kick [0, ]
> PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during last
> TA kick [0, ]
> PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during last
> TA kick [0, ]
>  FPS: 1 FrameTime: 1000.000 ms
> [shadow] <default>: FPS: 31 FrameTime: 32.258 ms
> [refract] <default>: FPS: 13 FrameTime: 76.923 ms
> [conditionals] fragment-steps=0:vertex-steps=0: FPS: 156 FrameTime:
> 6.410 ms
> [conditionals] fragment-steps=5:vertex-steps=0: FPS: 100 FrameTime:
> 10.000 ms
> [conditionals] fragment-steps=0:vertex-steps=5: FPS: 156 FrameTime:
> 6.410 ms
> [function] fragment-complexity=low:fragment-steps=5: FPS: 128 FrameTime:
> 7.812 ms
> [function] fragment-complexity=medium:fragment-steps=5: FPS: 83
> FrameTime: 12.048 ms
> [loop] fragment-loop=false:fragment-steps=5:vertex-steps=5: FPS: 127
> FrameTime: 7.874 ms
> [loop] fragment-steps=5:fragment-uniform=false:vertex-steps=5: FPS: 104
> FrameTime: 9.615 ms
> [loop] fragment-steps=5:fragment-uniform=true:vertex-steps=5: FPS: 77
> FrameTime: 12.987 ms
> =======================================================
>                                   glmark2 Score: 92
> =======================================================
>
> Impressive, ain't it ;)
>
>> But I'm not sure what that driver would be on droid4. I have DRA7
>> boards which have VIP, CAL and VPE that I can use here. Perhaps it
>> wouldn't be too much effort for me to extend my tests a bit to include
>> CAL, and try to fix the driver. I just fear the driver changes won't
>> be trivial.
>>
>
> At least PVR change was really trivial.
>
>> Did you test this somehow?
>>
>
> Yes, on droid4, works with no issue, see above.
>
>> Did you look at the userspace mmap of TILER buffers? I wonder if that
>> goes correctly or not. Isn't memory to userspace mapped per page, and
>> lengths of the TILER lines are not page aligned?
>>
>
> I really can't explain it any better than Matthijs, see the other mail.
>
> So, what I think shall be done to have TILER BOs (and not only) in a
> shape that's usable for anything else but a simple test-cases, if you
> accept the $subject patch:
>
> 1. Make TILER BOs page-aligned (simple patch, I already have it). That
> should fix possible invalid memory accesses for both mmap()-ed memory
> and kernel drivers.

Sounds good.

> 2. Set exp_info.size = omap_gem_mmap_size(obj); when exporting a BO.
> That way importer knows the real BO memory size (including alignment
> etc) so he will be able to calculate the number of pages he needs to map
> the scatterlist.

Can you elaborate what this means?

> 3. Do not refuse to export non-TILER non-contiguous buffers (we can use
> them for rendering, for example)

omapdrm should only allow allocation of buffers that are used for
display. Pure render buffers should be allocated from SGX.

> 4. make VRFB functional and implement the same/similar logic as for
> TILER BOs.

Sounds good.

> 5. Teach omapdrm to know about different GBM allocator flags (like
> allocate TILER/VRFB backed BO if GBM_BO_USE_LINEAR is not specified, but
> GBM_BO_USE_SCANOUT is or something like that). I still don't have all
> the details, but the ultimate goal is to get rid of omap_bo_xxx stuff
> and to use GBM BOs. That will eventually allow modesetting to be fixed
> to not do SW rotation. But that's too far in the future I guess.

What does this mean? New omapdrm ioctls, called from libgbm? Isn't
libgbm normally just using libdrm, which includes omap_bo_xxx stuff?

Tomi

2021-11-15 18:25:13

by Ivaylo Dimitrov

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Hi,

On 15.11.21 г. 17:37 ч., Tomi Valkeinen wrote:
> On 15/11/2021 15:55, Ivaylo Dimitrov wrote:
>> Hi Tomi,
>>
>> On 15.11.21 г. 10:42 ч., Tomi Valkeinen wrote:
>>> Hi,
>>>
>>> On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
>>>> Memory of BOs backed by TILER is not contiguous, but
>>>> omap_gem_map_dma_buf()
>>>> exports it like it is. This leads to (possibly) invalid memory
>>>> accesses if
>>>> another device imports such a BO.
>>>
>>> This is one reason why TILER hasn't been officially supported. But
>>> the above is not exactly right, or at least not the whole truth.
>>>
>>
>> Definitely, not only these BOs lie about their memory layout, they lie
>> about size and alignment as well. I have 2 more patches here (one is
>> to align TILER memory on page, as proposed by Matthijs in the other
>> mail, the other to set the correct size when exporting TILER BO), but
>> I wanted to hear from you first, like, what is the general trend :) .
>
> My thoughts here are that the current code doesn't work in practice, so
> if you get it fixed, it's great =).
>
>> Also, I have another patch in mind, that will enable exporting of
>> buffers that are not TILER backed, but are not CMA backed either. SGX
>> for example does not need CMA memory to render to.
>
> What do you mean with this? DSS needs contiguous memory, so the memory
> has to be 1) physically contiguous, 2) mapped with DMM or 3) mapped with
> TILER. There's no reason for the driver to export non-contiguous memory.
>

DSS yes, but, omapdrm is used to allocate non-scanout buffers as well,
which do not need to be (and in practice are not) contiguous. GPU (or
anyone with MMU) can render on them (DRI buffers for example) and later
on those buffers can be copied (blit) to the framebuffer. Yes, not
zero-copy, but if you're doing compositing, there is no option anyway.

Exactly this is done by omap-video driver for example. GBM BOs are
allocated through omapdrm as well.

>>> A BO's memory via the TILER memory is contiguous, although with
>>> consistent gaps of memory that should not be accessed.
>>
>> I think this more or less contradicts to the definition of contiguous,
>> no?  :)
>
> Depends on the definition ;). The buffers can be handled with DMA that
> only supports contiguous memory, so...
>
>>> That point is important, as the IPs that might use TILER backed BOs
>>> only support contiguous memory.
>>>
>>
>> Well, every IP with MMU should be capable to use such BOs. SGX has one
>> for sure, IIRC IVA and ISP on omap3 has MMUs too.
>
> True, we have those. But none of the DRA7 capture IPs have MMU.
>
>>> This means that the drivers for such IPs cannot use the BOs exported
>>
>> Neither they can use them without the patch.
>>
>>> like you do in this patch. I believe the drivers could be improved by
>>> writing a helper function which studies the sg_table and concludes
>>> that it's actually contiguous. I think we should have at least one
>>> such driver fixed along with this patch so that we can be more
>>> confident that this actually works.
>>>
>>
>> Yes, I we have such driver, but unfortunately it is not upstream (PVR
>> driver that is). Right now I have droid4 in front of me with SGX
>> rendering hildon-desktop in landscape using TILER BO.
>>
>> Sure I had 2 more patches applied and also had to teach PVR driver to
>> know how to use that scatterlist, but that was a trivial change (an
>> old version of the patch
>> https://github.com/tmlind/linux_openpvrsgx/commit/90f16ed906c8c6eb4893d3ff647ca7d921972495).
>>
>>
>> See glmark-es2 results (with a little help of a simple PVR EXA):
>>
>> user@devuan-droid4:/root$ uname -a
>> Linux devuan-droid4 5.15.2-01783-g6ba3430a6fad-dirty #28 SMP PREEMPT
>> Mon Nov 15 08:48:21 EET 2021 armv7l GNU/Linux
>> user@devuan-droid4:/root$ xrandr -o 3
>> user@devuan-droid4:/root$ glmark2-es2 --fullscreen
>> =======================================================
>>      glmark2 2020.04
>> =======================================================
>>      OpenGL Information
>>      GL_VENDOR:     Imagination Technologies
>>      GL_RENDERER:   PowerVR SGX 540
>>      GL_VERSION:    OpenGL ES 2.0 build 1.17@4948957
>> =======================================================
>> [build] use-vbo=false: FPS: 107 FrameTime: 9.346 ms
>> [build] use-vbo=true: FPS: 136 FrameTime: 7.353 ms
>> [texture] texture-filter=nearest: FPS: 156 FrameTime: 6.410 ms
>> [texture] texture-filter=linear: FPS: 153 FrameTime: 6.536 ms
>> [texture] texture-filter=mipmap: FPS: 152 FrameTime: 6.579 ms
>> [shading] shading=gouraud: FPS: 111 FrameTime: 9.009 ms
>> [shading] shading=blinn-phong-inf: FPS: 116 FrameTime: 8.621 ms
>> [shading] shading=phong: FPS: 104 FrameTime: 9.615 ms
>> [shading] shading=cel: FPS: 96 FrameTime: 10.417 ms
>> [bump] bump-render=high-poly: FPS: 67 FrameTime: 14.925 ms
>> [bump] bump-render=normals: FPS: 140 FrameTime: 7.143 ms
>> [bump] bump-render=height: FPS: 131 FrameTime: 7.634 ms
>> [effect2d] kernel=0,1,0;1,-4,1;0,1,0;: FPS: 49 FrameTime: 20.408 ms
>> [effect2d] kernel=1,1,1,1,1;1,1,1,1,1;1,1,1,1,1;: FPS: 17 FrameTime:
>> 58.824 ms
>> [pulsar] light=false:quads=5:texture=false: FPS: 152 FrameTime: 6.579 ms
>> [desktop] blur-radius=5:effect=blur:passes=1:separable=true:windows=4:
>> FPS: 23 FrameTime: 43.478 ms
>> [desktop] effect=shadow:windows=4: FPS: 64 FrameTime: 15.625 ms
>> [buffer]
>> columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=map:
>> FPS: 33 FrameTime: 30.303 ms
>> [buffer]
>> columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=subdata:
>> FPS: 33 FrameTime: 30.303 ms
>> [buffer]
>> columns=200:interleave=true:update-dispersion=0.9:update-fraction=0.5:update-method=map:
>> FPS: 60 FrameTime: 16.667 ms
>> [ideas] speed=duration: FPS: 115 FrameTime: 8.696 ms
>> [jellyfish] <default>: FPS: 48 FrameTime: 20.833 ms
>> [terrain] <default>:PVR:(Error): SGXKickTA: TA went out of Mem and SPM
>> occurred during last TA kick [0, ]
>> PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during
>> last TA kick [0, ]
>> PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during
>> last TA kick [0, ]
>>   FPS: 1 FrameTime: 1000.000 ms
>> [shadow] <default>: FPS: 31 FrameTime: 32.258 ms
>> [refract] <default>: FPS: 13 FrameTime: 76.923 ms
>> [conditionals] fragment-steps=0:vertex-steps=0: FPS: 156 FrameTime:
>> 6.410 ms
>> [conditionals] fragment-steps=5:vertex-steps=0: FPS: 100 FrameTime:
>> 10.000 ms
>> [conditionals] fragment-steps=0:vertex-steps=5: FPS: 156 FrameTime:
>> 6.410 ms
>> [function] fragment-complexity=low:fragment-steps=5: FPS: 128
>> FrameTime: 7.812 ms
>> [function] fragment-complexity=medium:fragment-steps=5: FPS: 83
>> FrameTime: 12.048 ms
>> [loop] fragment-loop=false:fragment-steps=5:vertex-steps=5: FPS: 127
>> FrameTime: 7.874 ms
>> [loop] fragment-steps=5:fragment-uniform=false:vertex-steps=5: FPS:
>> 104 FrameTime: 9.615 ms
>> [loop] fragment-steps=5:fragment-uniform=true:vertex-steps=5: FPS: 77
>> FrameTime: 12.987 ms
>> =======================================================
>>                                    glmark2 Score: 92
>> =======================================================
>>
>> Impressive, ain't it ;)
>>
>>> But I'm not sure what that driver would be on droid4. I have DRA7
>>> boards which have VIP, CAL and VPE that I can use here. Perhaps it
>>> wouldn't be too much effort for me to extend my tests a bit to
>>> include CAL, and try to fix the driver. I just fear the driver
>>> changes won't be trivial.
>>>
>>
>> At least PVR change was really trivial.
>>
>>> Did you test this somehow?
>>>
>>
>> Yes, on droid4, works with no issue, see above.
>>
>>> Did you look at the userspace mmap of TILER buffers? I wonder if that
>>> goes correctly or not. Isn't memory to userspace mapped per page, and
>>> lengths of the TILER lines are not page aligned?
>>>
>>
>> I really can't explain it any better than Matthijs, see the other mail.
>>
>> So, what I think shall be done to have TILER BOs (and not only) in a
>> shape that's usable for anything else but a simple test-cases, if you
>> accept the $subject patch:
>>
>> 1. Make TILER BOs page-aligned (simple patch, I already have it). That
>> should fix possible invalid memory accesses for both mmap()-ed memory
>> and kernel drivers.
>
> Sounds good.
>
>> 2. Set exp_info.size = omap_gem_mmap_size(obj); when exporting a BO.
>> That way importer knows the real BO memory size (including alignment
>> etc) so he will be able to calculate the number of pages he needs to
>> map the scatterlist.
>
> Can you elaborate what this means?
>

When we align to page, we shall report the size including the alignment,
no? Or, it is the importer that shall take care to calculate BO size(
including the alignment) based on scatterlist if he needs to?

>> 3. Do not refuse to export non-TILER non-contiguous buffers (we can
>> use them for rendering, for example)
>
> omapdrm should only allow allocation of buffers that are used for
> display. Pure render buffers should be allocated from SGX.
>

In practice everyone around uses omapdrm to allocate all kinds of
buffers, which kind of makes sense to me - that way you keep only one
DRM fd around. Take omap-video for example, it uses omap_bo_new() (or
omap_bo_tiled()) to allocate DRI buffers. GBM does the same
(gbm_bo_create() uses omapdrm, because that's the DRM fd you create gbm
with), IIUC. But those buffers should not be necessarily scanout capable.

The problem with contiguous buffers as I see it, is that CMA is limited,
so lets not waste it if we can use DMA memory and scatterlist.

To support that, we just need to simply create a scatterlist, in similar
way we do for TILER BOs in the $subject patch, so it should be few lines
of code only.

>> 4. make VRFB functional and implement the same/similar logic as for
>> TILER BOs.
>
> Sounds good.
>
>> 5. Teach omapdrm to know about different GBM allocator flags (like
>> allocate TILER/VRFB backed BO if GBM_BO_USE_LINEAR is not specified,
>> but GBM_BO_USE_SCANOUT is or something like that). I still don't have
>> all the details, but the ultimate goal is to get rid of omap_bo_xxx
>> stuff and to use GBM BOs. That will eventually allow modesetting to be
>> fixed to not do SW rotation. But that's too far in the future I guess.
>
> What does this mean? New omapdrm ioctls, called from libgbm? Isn't
> libgbm normally just using libdrm, which includes omap_bo_xxx stuff?
>

I need to dig into this a bit more, lets not discuss it now.

Regards,
Ivo

2021-11-16 00:04:04

by Ivaylo Dimitrov

[permalink] [raw]
Subject: [PATCH v3] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Memory of BOs backed by TILER is not contiguous, but omap_gem_map_dma_buf()
exports it like it is. This leads to (possibly) invalid memory accesses if
another device imports such a BO.

Fix that by providing a sg that correctly describes TILER memory layout.
Also, make sure to destroy it on unpin, as it is no longer valid.

Suggested-by: Matthijs van Duin <[email protected]>
Signed-off-by: Ivaylo Dimitrov <[email protected]>
---
drivers/gpu/drm/omapdrm/omap_gem.c | 77 +++++++++++++++++++++++++++++++
drivers/gpu/drm/omapdrm/omap_gem.h | 2 +
drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 32 ++-----------
3 files changed, 83 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 97e5fe6..cd4a31c 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -862,6 +862,11 @@ static void omap_gem_unpin_locked(struct drm_gem_object *obj)
return;

if (refcount_dec_and_test(&omap_obj->dma_addr_cnt)) {
+ if (omap_obj->sgt) {
+ sg_free_table(omap_obj->sgt);
+ kfree(omap_obj->sgt);
+ omap_obj->sgt = NULL;
+ }
ret = tiler_unpin(omap_obj->block);
if (ret) {
dev_err(obj->dev->dev,
@@ -974,6 +979,78 @@ int omap_gem_put_pages(struct drm_gem_object *obj)
return 0;
}

+struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj)
+{
+ struct omap_gem_object *omap_obj = to_omap_bo(obj);
+ dma_addr_t addr;
+ struct sg_table *sgt;
+ struct scatterlist *sg;
+ unsigned int count, len, stride, i;
+ int ret;
+
+ ret = omap_gem_pin(obj, &addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ mutex_lock(&omap_obj->lock);
+
+ sgt = omap_obj->sgt;
+ if (sgt)
+ goto out;
+
+ sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+ if (!sgt) {
+ ret = -ENOMEM;
+ goto err_unpin;
+ }
+
+ if (omap_obj->flags & OMAP_BO_TILED_MASK) {
+ enum tiler_fmt fmt = gem2fmt(omap_obj->flags);
+
+ len = omap_obj->width << (int)fmt;
+ count = omap_obj->height;
+ stride = tiler_stride(fmt, 0);
+ } else {
+ len = obj->size;
+ count = 1;
+ stride = 0;
+ }
+
+ ret = sg_alloc_table(sgt, count, GFP_KERNEL);
+ if (ret)
+ goto err_free;
+
+ for_each_sg(sgt->sgl, sg, count, i) {
+ sg_set_page(sg, phys_to_page(addr), len, offset_in_page(addr));
+ sg_dma_address(sg) = addr;
+ sg_dma_len(sg) = len;
+
+ addr += stride;
+ }
+
+ omap_obj->sgt = sgt;
+out:
+ mutex_unlock(&omap_obj->lock);
+ return sgt;
+
+err_free:
+ kfree(sgt);
+err_unpin:
+ mutex_unlock(&omap_obj->lock);
+ omap_gem_unpin(obj);
+ return ERR_PTR(ret);
+}
+
+void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt)
+{
+ struct omap_gem_object *omap_obj = to_omap_bo(obj);
+
+ if (WARN_ON(omap_obj->sgt != sgt))
+ return;
+
+ omap_gem_unpin(obj);
+}
+
#ifdef CONFIG_DRM_FBDEV_EMULATION
/*
* Get kernel virtual address for CPU access.. this more or less only
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.h b/drivers/gpu/drm/omapdrm/omap_gem.h
index eda9b48..19209e3 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.h
+++ b/drivers/gpu/drm/omapdrm/omap_gem.h
@@ -82,5 +82,7 @@ int omap_gem_get_pages(struct drm_gem_object *obj, struct page ***pages,
int omap_gem_rotated_dma_addr(struct drm_gem_object *obj, u32 orient,
int x, int y, dma_addr_t *dma_addr);
int omap_gem_tiled_stride(struct drm_gem_object *obj, u32 orient);
+struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj);
+void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt);

#endif /* __OMAPDRM_GEM_H__ */
diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
index f4cde3a..9650416 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
@@ -21,45 +21,21 @@ static struct sg_table *omap_gem_map_dma_buf(
{
struct drm_gem_object *obj = attachment->dmabuf->priv;
struct sg_table *sg;
- dma_addr_t dma_addr;
- int ret;
-
- sg = kzalloc(sizeof(*sg), GFP_KERNEL);
- if (!sg)
- return ERR_PTR(-ENOMEM);
-
- /* camera, etc, need physically contiguous.. but we need a
- * better way to know this..
- */
- ret = omap_gem_pin(obj, &dma_addr);
- if (ret)
- goto out;
-
- ret = sg_alloc_table(sg, 1, GFP_KERNEL);
- if (ret)
- goto out;
-
- sg_init_table(sg->sgl, 1);
- sg_dma_len(sg->sgl) = obj->size;
- sg_set_page(sg->sgl, pfn_to_page(PFN_DOWN(dma_addr)), obj->size, 0);
- sg_dma_address(sg->sgl) = dma_addr;
+ sg = omap_gem_get_sg(obj);
+ if (IS_ERR(sg))
+ return sg;

/* this must be after omap_gem_pin() to ensure we have pages attached */
omap_gem_dma_sync_buffer(obj, dir);

return sg;
-out:
- kfree(sg);
- return ERR_PTR(ret);
}

static void omap_gem_unmap_dma_buf(struct dma_buf_attachment *attachment,
struct sg_table *sg, enum dma_data_direction dir)
{
struct drm_gem_object *obj = attachment->dmabuf->priv;
- omap_gem_unpin(obj);
- sg_free_table(sg);
- kfree(sg);
+ omap_gem_put_sg(obj, sg);
}

static int omap_gem_dmabuf_begin_cpu_access(struct dma_buf *buffer,
--
1.9.1


2021-11-16 06:42:54

by Tomi Valkeinen

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

On 15/11/2021 19:15, Ivaylo Dimitrov wrote:
> Hi,
>
> On 15.11.21 г. 17:37 ч., Tomi Valkeinen wrote:
>> On 15/11/2021 15:55, Ivaylo Dimitrov wrote:
>>> Hi Tomi,
>>>
>>> On 15.11.21 г. 10:42 ч., Tomi Valkeinen wrote:
>>>> Hi,
>>>>
>>>> On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
>>>>> Memory of BOs backed by TILER is not contiguous, but
>>>>> omap_gem_map_dma_buf()
>>>>> exports it like it is. This leads to (possibly) invalid memory
>>>>> accesses if
>>>>> another device imports such a BO.
>>>>
>>>> This is one reason why TILER hasn't been officially supported. But
>>>> the above is not exactly right, or at least not the whole truth.
>>>>
>>>
>>> Definitely, not only these BOs lie about their memory layout, they
>>> lie about size and alignment as well. I have 2 more patches here (one
>>> is to align TILER memory on page, as proposed by Matthijs in the
>>> other mail, the other to set the correct size when exporting TILER
>>> BO), but I wanted to hear from you first, like, what is the general
>>> trend :) .
>>
>> My thoughts here are that the current code doesn't work in practice,
>> so if you get it fixed, it's great =).
>>
>>> Also, I have another patch in mind, that will enable exporting of
>>> buffers that are not TILER backed, but are not CMA backed either. SGX
>>> for example does not need CMA memory to render to.
>>
>> What do you mean with this? DSS needs contiguous memory, so the memory
>> has to be 1) physically contiguous, 2) mapped with DMM or 3) mapped
>> with TILER. There's no reason for the driver to export non-contiguous
>> memory.
>>
>
> DSS yes, but, omapdrm is used to allocate non-scanout buffers as well,
> which do not need to be (and in practice are not) contiguous. GPU (or
> anyone with MMU) can render on them (DRI buffers for example) and later
> on those buffers can be copied (blit) to the framebuffer. Yes, not
> zero-copy, but if you're doing compositing, there is no option anyway.
>
> Exactly this is done by omap-video driver for example. GBM BOs are
> allocated through omapdrm as well.

That is not correct and shouldn't be done. omapdrm is not a generic
memory allocator. We have real generic allocators, so those should be
used. Or, if the buffer is only used for a single device, the buffer
should be allocated from that device's driver.

>>>> A BO's memory via the TILER memory is contiguous, although with
>>>> consistent gaps of memory that should not be accessed.
>>>
>>> I think this more or less contradicts to the definition of
>>> contiguous, no?  :)
>>
>> Depends on the definition ;). The buffers can be handled with DMA that
>> only supports contiguous memory, so...
>>
>>>> That point is important, as the IPs that might use TILER backed BOs
>>>> only support contiguous memory.
>>>>
>>>
>>> Well, every IP with MMU should be capable to use such BOs. SGX has
>>> one for sure, IIRC IVA and ISP on omap3 has MMUs too.
>>
>> True, we have those. But none of the DRA7 capture IPs have MMU.
>>
>>>> This means that the drivers for such IPs cannot use the BOs exported
>>>
>>> Neither they can use them without the patch.
>>>
>>>> like you do in this patch. I believe the drivers could be improved
>>>> by writing a helper function which studies the sg_table and
>>>> concludes that it's actually contiguous. I think we should have at
>>>> least one such driver fixed along with this patch so that we can be
>>>> more confident that this actually works.
>>>>
>>>
>>> Yes, I we have such driver, but unfortunately it is not upstream (PVR
>>> driver that is). Right now I have droid4 in front of me with SGX
>>> rendering hildon-desktop in landscape using TILER BO.
>>>
>>> Sure I had 2 more patches applied and also had to teach PVR driver to
>>> know how to use that scatterlist, but that was a trivial change (an
>>> old version of the patch
>>> https://github.com/tmlind/linux_openpvrsgx/commit/90f16ed906c8c6eb4893d3ff647ca7d921972495).
>>>
>>>
>>> See glmark-es2 results (with a little help of a simple PVR EXA):
>>>
>>> user@devuan-droid4:/root$ uname -a
>>> Linux devuan-droid4 5.15.2-01783-g6ba3430a6fad-dirty #28 SMP PREEMPT
>>> Mon Nov 15 08:48:21 EET 2021 armv7l GNU/Linux
>>> user@devuan-droid4:/root$ xrandr -o 3
>>> user@devuan-droid4:/root$ glmark2-es2 --fullscreen
>>> =======================================================
>>>      glmark2 2020.04
>>> =======================================================
>>>      OpenGL Information
>>>      GL_VENDOR:     Imagination Technologies
>>>      GL_RENDERER:   PowerVR SGX 540
>>>      GL_VERSION:    OpenGL ES 2.0 build 1.17@4948957
>>> =======================================================
>>> [build] use-vbo=false: FPS: 107 FrameTime: 9.346 ms
>>> [build] use-vbo=true: FPS: 136 FrameTime: 7.353 ms
>>> [texture] texture-filter=nearest: FPS: 156 FrameTime: 6.410 ms
>>> [texture] texture-filter=linear: FPS: 153 FrameTime: 6.536 ms
>>> [texture] texture-filter=mipmap: FPS: 152 FrameTime: 6.579 ms
>>> [shading] shading=gouraud: FPS: 111 FrameTime: 9.009 ms
>>> [shading] shading=blinn-phong-inf: FPS: 116 FrameTime: 8.621 ms
>>> [shading] shading=phong: FPS: 104 FrameTime: 9.615 ms
>>> [shading] shading=cel: FPS: 96 FrameTime: 10.417 ms
>>> [bump] bump-render=high-poly: FPS: 67 FrameTime: 14.925 ms
>>> [bump] bump-render=normals: FPS: 140 FrameTime: 7.143 ms
>>> [bump] bump-render=height: FPS: 131 FrameTime: 7.634 ms
>>> [effect2d] kernel=0,1,0;1,-4,1;0,1,0;: FPS: 49 FrameTime: 20.408 ms
>>> [effect2d] kernel=1,1,1,1,1;1,1,1,1,1;1,1,1,1,1;: FPS: 17 FrameTime:
>>> 58.824 ms
>>> [pulsar] light=false:quads=5:texture=false: FPS: 152 FrameTime: 6.579 ms
>>> [desktop]
>>> blur-radius=5:effect=blur:passes=1:separable=true:windows=4: FPS: 23
>>> FrameTime: 43.478 ms
>>> [desktop] effect=shadow:windows=4: FPS: 64 FrameTime: 15.625 ms
>>> [buffer]
>>> columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=map:
>>> FPS: 33 FrameTime: 30.303 ms
>>> [buffer]
>>> columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=subdata:
>>> FPS: 33 FrameTime: 30.303 ms
>>> [buffer]
>>> columns=200:interleave=true:update-dispersion=0.9:update-fraction=0.5:update-method=map:
>>> FPS: 60 FrameTime: 16.667 ms
>>> [ideas] speed=duration: FPS: 115 FrameTime: 8.696 ms
>>> [jellyfish] <default>: FPS: 48 FrameTime: 20.833 ms
>>> [terrain] <default>:PVR:(Error): SGXKickTA: TA went out of Mem and
>>> SPM occurred during last TA kick [0, ]
>>> PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during
>>> last TA kick [0, ]
>>> PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during
>>> last TA kick [0, ]
>>>   FPS: 1 FrameTime: 1000.000 ms
>>> [shadow] <default>: FPS: 31 FrameTime: 32.258 ms
>>> [refract] <default>: FPS: 13 FrameTime: 76.923 ms
>>> [conditionals] fragment-steps=0:vertex-steps=0: FPS: 156 FrameTime:
>>> 6.410 ms
>>> [conditionals] fragment-steps=5:vertex-steps=0: FPS: 100 FrameTime:
>>> 10.000 ms
>>> [conditionals] fragment-steps=0:vertex-steps=5: FPS: 156 FrameTime:
>>> 6.410 ms
>>> [function] fragment-complexity=low:fragment-steps=5: FPS: 128
>>> FrameTime: 7.812 ms
>>> [function] fragment-complexity=medium:fragment-steps=5: FPS: 83
>>> FrameTime: 12.048 ms
>>> [loop] fragment-loop=false:fragment-steps=5:vertex-steps=5: FPS: 127
>>> FrameTime: 7.874 ms
>>> [loop] fragment-steps=5:fragment-uniform=false:vertex-steps=5: FPS:
>>> 104 FrameTime: 9.615 ms
>>> [loop] fragment-steps=5:fragment-uniform=true:vertex-steps=5: FPS: 77
>>> FrameTime: 12.987 ms
>>> =======================================================
>>>                                    glmark2 Score: 92
>>> =======================================================
>>>
>>> Impressive, ain't it ;)
>>>
>>>> But I'm not sure what that driver would be on droid4. I have DRA7
>>>> boards which have VIP, CAL and VPE that I can use here. Perhaps it
>>>> wouldn't be too much effort for me to extend my tests a bit to
>>>> include CAL, and try to fix the driver. I just fear the driver
>>>> changes won't be trivial.
>>>>
>>>
>>> At least PVR change was really trivial.
>>>
>>>> Did you test this somehow?
>>>>
>>>
>>> Yes, on droid4, works with no issue, see above.
>>>
>>>> Did you look at the userspace mmap of TILER buffers? I wonder if
>>>> that goes correctly or not. Isn't memory to userspace mapped per
>>>> page, and lengths of the TILER lines are not page aligned?
>>>>
>>>
>>> I really can't explain it any better than Matthijs, see the other mail.
>>>
>>> So, what I think shall be done to have TILER BOs (and not only) in a
>>> shape that's usable for anything else but a simple test-cases, if you
>>> accept the $subject patch:
>>>
>>> 1. Make TILER BOs page-aligned (simple patch, I already have it).
>>> That should fix possible invalid memory accesses for both mmap()-ed
>>> memory and kernel drivers.
>>
>> Sounds good.
>>
>>> 2. Set exp_info.size = omap_gem_mmap_size(obj); when exporting a BO.
>>> That way importer knows the real BO memory size (including alignment
>>> etc) so he will be able to calculate the number of pages he needs to
>>> map the scatterlist.
>>
>> Can you elaborate what this means?
>>
>
> When we align to page, we shall report the size including the alignment,
> no? Or, it is the importer that shall take care to calculate BO size(
> including the alignment) based on scatterlist if he needs to?

I'm not sure... But I guess the export size should include the alignment.

Hmm... I haven't had enough coffee yet, but how does this go... Let's
say we have a tiled fb, and the width gets expanded to a page. What
happens to reads/writes that happen outside the fb, but still within the
page? Those should cause an error or do nothing, but is it possible that
they go through TILER and get mapped to some real memory location?

>>> 3. Do not refuse to export non-TILER non-contiguous buffers (we can
>>> use them for rendering, for example)
>>
>> omapdrm should only allow allocation of buffers that are used for
>> display. Pure render buffers should be allocated from SGX.
>>
>
> In practice everyone around uses omapdrm to allocate all kinds of
> buffers, which kind of makes sense to me - that way you keep only one
> DRM fd around. Take omap-video for example, it uses omap_bo_new() (or
> omap_bo_tiled()) to allocate DRI buffers. GBM does the same
> (gbm_bo_create() uses omapdrm, because that's the DRM fd you create gbm
> with), IIUC. But those buffers should not be necessarily scanout capable.
>
> The problem with contiguous buffers as I see it, is that CMA is limited,
> so lets not waste it if we can use DMA memory and scatterlist.
>
> To support that, we just need to simply create a scatterlist, in similar
> way we do for TILER BOs in the $subject patch, so it should be few lines
> of code only.

Yep. Well, as I mentioned above, I disagree with this. We have proper
generic memory allocators.

Tomi

2021-11-16 08:28:00

by Ivaylo Dimitrov

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Hi,

On 16.11.21 г. 8:42 ч., Tomi Valkeinen wrote:
> On 15/11/2021 19:15, Ivaylo Dimitrov wrote:
>> Hi,
>>
>> On 15.11.21 г. 17:37 ч., Tomi Valkeinen wrote:
>>> On 15/11/2021 15:55, Ivaylo Dimitrov wrote:
>>>> Hi Tomi,
>>>>
>>>> On 15.11.21 г. 10:42 ч., Tomi Valkeinen wrote:
>>>>> Hi,
>>>>>
>>>>> On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
>>>>>> Memory of BOs backed by TILER is not contiguous, but
>>>>>> omap_gem_map_dma_buf()
>>>>>> exports it like it is. This leads to (possibly) invalid memory
>>>>>> accesses if
>>>>>> another device imports such a BO.
>>>>>
>>>>> This is one reason why TILER hasn't been officially supported. But
>>>>> the above is not exactly right, or at least not the whole truth.
>>>>>
>>>>
>>>> Definitely, not only these BOs lie about their memory layout, they
>>>> lie about size and alignment as well. I have 2 more patches here
>>>> (one is to align TILER memory on page, as proposed by Matthijs in
>>>> the other mail, the other to set the correct size when exporting
>>>> TILER BO), but I wanted to hear from you first, like, what is the
>>>> general trend :) .
>>>
>>> My thoughts here are that the current code doesn't work in practice,
>>> so if you get it fixed, it's great =).
>>>
>>>> Also, I have another patch in mind, that will enable exporting of
>>>> buffers that are not TILER backed, but are not CMA backed either.
>>>> SGX for example does not need CMA memory to render to.
>>>
>>> What do you mean with this? DSS needs contiguous memory, so the
>>> memory has to be 1) physically contiguous, 2) mapped with DMM or 3)
>>> mapped with TILER. There's no reason for the driver to export
>>> non-contiguous memory.
>>>
>>
>> DSS yes, but, omapdrm is used to allocate non-scanout buffers as well,
>> which do not need to be (and in practice are not) contiguous. GPU (or
>> anyone with MMU) can render on them (DRI buffers for example) and
>> later on those buffers can be copied (blit) to the framebuffer. Yes,
>> not zero-copy, but if you're doing compositing, there is no option
>> anyway.
>>
>> Exactly this is done by omap-video driver for example. GBM BOs are
>> allocated through omapdrm as well.
>
> That is not correct and shouldn't be done. omapdrm is not a generic
> memory allocator. We have real generic allocators, so those should be
> used. Or, if the buffer is only used for a single device, the buffer
> should be allocated from that device's driver.
>

Yes, I saw the comment in kernel headers that dumb buffers should not be
used for rendering
(https://elixir.bootlin.com/linux/latest/source/include/drm/drm_drv.h#L361).
This makes no sense to me at all, but maybe I am missing the point.

Also, it could be that the implementation of omap-video and/or PVR
userspace blobs is against the specs, but I see omap-video calling
DRM_IOCTL_OMAP_GEM_NEW for DRI buffers without OMAP_BO_SCANOUT and
libdbm.so calling DRM_IOCTL_MODE_CREATE_DUMB to create buffers then used
for rendering.

This is not an issue on omap4 an later, because when export of that
buffer is requested, omapdrm uses DMM and exports a single scatterlist
entry, IIUC.

But, on omap3, given there is no DMM, export is simply refused. I don't
see that as a consistent behaviour - we shall either a) export
non-scanout buffers (scattered ones) using whatever is supported (DMM
and single scatterlist entry on omap4 (and later), multiple-entry
scatterlist on omap3) or b) always require OMAP_BO_SCANOUT for BOs to be
exported and refuse to export if no such flag is set. I would say b) is
not a good option which leaves a) only.

BTW, I think DMM is not really needed unless userspace requests mmap(),
in theory we can provide userspace with view through DMM but give device
drivers multiple entry scatterlist, potentially saving DMM space.

I hope I made it clearer now why I think this feature shall be implemented.

>>>>> A BO's memory via the TILER memory is contiguous, although with
>>>>> consistent gaps of memory that should not be accessed.
>>>>
>>>> I think this more or less contradicts to the definition of
>>>> contiguous, no?  :)
>>>
>>> Depends on the definition ;). The buffers can be handled with DMA
>>> that only supports contiguous memory, so...
>>>
>>>>> That point is important, as the IPs that might use TILER backed BOs
>>>>> only support contiguous memory.
>>>>>
>>>>
>>>> Well, every IP with MMU should be capable to use such BOs. SGX has
>>>> one for sure, IIRC IVA and ISP on omap3 has MMUs too.
>>>
>>> True, we have those. But none of the DRA7 capture IPs have MMU.
>>>
>>>>> This means that the drivers for such IPs cannot use the BOs exported
>>>>
>>>> Neither they can use them without the patch.
>>>>
>>>>> like you do in this patch. I believe the drivers could be improved
>>>>> by writing a helper function which studies the sg_table and
>>>>> concludes that it's actually contiguous. I think we should have at
>>>>> least one such driver fixed along with this patch so that we can be
>>>>> more confident that this actually works.
>>>>>
>>>>
>>>> Yes, I we have such driver, but unfortunately it is not upstream
>>>> (PVR driver that is). Right now I have droid4 in front of me with
>>>> SGX rendering hildon-desktop in landscape using TILER BO.
>>>>
>>>> Sure I had 2 more patches applied and also had to teach PVR driver
>>>> to know how to use that scatterlist, but that was a trivial change
>>>> (an old version of the patch
>>>> https://github.com/tmlind/linux_openpvrsgx/commit/90f16ed906c8c6eb4893d3ff647ca7d921972495).
>>>>
>>>>
>>>> See glmark-es2 results (with a little help of a simple PVR EXA):
>>>>
>>>> user@devuan-droid4:/root$ uname -a
>>>> Linux devuan-droid4 5.15.2-01783-g6ba3430a6fad-dirty #28 SMP PREEMPT
>>>> Mon Nov 15 08:48:21 EET 2021 armv7l GNU/Linux
>>>> user@devuan-droid4:/root$ xrandr -o 3
>>>> user@devuan-droid4:/root$ glmark2-es2 --fullscreen
>>>> =======================================================
>>>>      glmark2 2020.04
>>>> =======================================================
>>>>      OpenGL Information
>>>>      GL_VENDOR:     Imagination Technologies
>>>>      GL_RENDERER:   PowerVR SGX 540
>>>>      GL_VERSION:    OpenGL ES 2.0 build 1.17@4948957
>>>> =======================================================
>>>> [build] use-vbo=false: FPS: 107 FrameTime: 9.346 ms
>>>> [build] use-vbo=true: FPS: 136 FrameTime: 7.353 ms
>>>> [texture] texture-filter=nearest: FPS: 156 FrameTime: 6.410 ms
>>>> [texture] texture-filter=linear: FPS: 153 FrameTime: 6.536 ms
>>>> [texture] texture-filter=mipmap: FPS: 152 FrameTime: 6.579 ms
>>>> [shading] shading=gouraud: FPS: 111 FrameTime: 9.009 ms
>>>> [shading] shading=blinn-phong-inf: FPS: 116 FrameTime: 8.621 ms
>>>> [shading] shading=phong: FPS: 104 FrameTime: 9.615 ms
>>>> [shading] shading=cel: FPS: 96 FrameTime: 10.417 ms
>>>> [bump] bump-render=high-poly: FPS: 67 FrameTime: 14.925 ms
>>>> [bump] bump-render=normals: FPS: 140 FrameTime: 7.143 ms
>>>> [bump] bump-render=height: FPS: 131 FrameTime: 7.634 ms
>>>> [effect2d] kernel=0,1,0;1,-4,1;0,1,0;: FPS: 49 FrameTime: 20.408 ms
>>>> [effect2d] kernel=1,1,1,1,1;1,1,1,1,1;1,1,1,1,1;: FPS: 17 FrameTime:
>>>> 58.824 ms
>>>> [pulsar] light=false:quads=5:texture=false: FPS: 152 FrameTime:
>>>> 6.579 ms
>>>> [desktop]
>>>> blur-radius=5:effect=blur:passes=1:separable=true:windows=4: FPS: 23
>>>> FrameTime: 43.478 ms
>>>> [desktop] effect=shadow:windows=4: FPS: 64 FrameTime: 15.625 ms
>>>> [buffer]
>>>> columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=map:
>>>> FPS: 33 FrameTime: 30.303 ms
>>>> [buffer]
>>>> columns=200:interleave=false:update-dispersion=0.9:update-fraction=0.5:update-method=subdata:
>>>> FPS: 33 FrameTime: 30.303 ms
>>>> [buffer]
>>>> columns=200:interleave=true:update-dispersion=0.9:update-fraction=0.5:update-method=map:
>>>> FPS: 60 FrameTime: 16.667 ms
>>>> [ideas] speed=duration: FPS: 115 FrameTime: 8.696 ms
>>>> [jellyfish] <default>: FPS: 48 FrameTime: 20.833 ms
>>>> [terrain] <default>:PVR:(Error): SGXKickTA: TA went out of Mem and
>>>> SPM occurred during last TA kick [0, ]
>>>> PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during
>>>> last TA kick [0, ]
>>>> PVR:(Error): SGXKickTA: TA went out of Mem and SPM occurred during
>>>> last TA kick [0, ]
>>>>   FPS: 1 FrameTime: 1000.000 ms
>>>> [shadow] <default>: FPS: 31 FrameTime: 32.258 ms
>>>> [refract] <default>: FPS: 13 FrameTime: 76.923 ms
>>>> [conditionals] fragment-steps=0:vertex-steps=0: FPS: 156 FrameTime:
>>>> 6.410 ms
>>>> [conditionals] fragment-steps=5:vertex-steps=0: FPS: 100 FrameTime:
>>>> 10.000 ms
>>>> [conditionals] fragment-steps=0:vertex-steps=5: FPS: 156 FrameTime:
>>>> 6.410 ms
>>>> [function] fragment-complexity=low:fragment-steps=5: FPS: 128
>>>> FrameTime: 7.812 ms
>>>> [function] fragment-complexity=medium:fragment-steps=5: FPS: 83
>>>> FrameTime: 12.048 ms
>>>> [loop] fragment-loop=false:fragment-steps=5:vertex-steps=5: FPS: 127
>>>> FrameTime: 7.874 ms
>>>> [loop] fragment-steps=5:fragment-uniform=false:vertex-steps=5: FPS:
>>>> 104 FrameTime: 9.615 ms
>>>> [loop] fragment-steps=5:fragment-uniform=true:vertex-steps=5: FPS:
>>>> 77 FrameTime: 12.987 ms
>>>> =======================================================
>>>>                                    glmark2 Score: 92
>>>> =======================================================
>>>>
>>>> Impressive, ain't it ;)
>>>>
>>>>> But I'm not sure what that driver would be on droid4. I have DRA7
>>>>> boards which have VIP, CAL and VPE that I can use here. Perhaps it
>>>>> wouldn't be too much effort for me to extend my tests a bit to
>>>>> include CAL, and try to fix the driver. I just fear the driver
>>>>> changes won't be trivial.
>>>>>
>>>>
>>>> At least PVR change was really trivial.
>>>>
>>>>> Did you test this somehow?
>>>>>
>>>>
>>>> Yes, on droid4, works with no issue, see above.
>>>>
>>>>> Did you look at the userspace mmap of TILER buffers? I wonder if
>>>>> that goes correctly or not. Isn't memory to userspace mapped per
>>>>> page, and lengths of the TILER lines are not page aligned?
>>>>>
>>>>
>>>> I really can't explain it any better than Matthijs, see the other mail.
>>>>
>>>> So, what I think shall be done to have TILER BOs (and not only) in a
>>>> shape that's usable for anything else but a simple test-cases, if
>>>> you accept the $subject patch:
>>>>
>>>> 1. Make TILER BOs page-aligned (simple patch, I already have it).
>>>> That should fix possible invalid memory accesses for both mmap()-ed
>>>> memory and kernel drivers.
>>>
>>> Sounds good.
>>>
>>>> 2. Set exp_info.size = omap_gem_mmap_size(obj); when exporting a BO.
>>>> That way importer knows the real BO memory size (including alignment
>>>> etc) so he will be able to calculate the number of pages he needs to
>>>> map the scatterlist.
>>>
>>> Can you elaborate what this means?
>>>
>>
>> When we align to page, we shall report the size including the
>> alignment, no? Or, it is the importer that shall take care to
>> calculate BO size( including the alignment) based on scatterlist if he
>> needs to?
>
> I'm not sure... But I guess the export size should include the alignment.
>

My understanding as well. Will sent that change as a part of page
alignment patch.

> Hmm... I haven't had enough coffee yet, but how does this go... Let's
> say we have a tiled fb, and the width gets expanded to a page. What
> happens to reads/writes that happen outside the fb, but still within the
> page? Those should cause an error or do nothing, but is it possible that
> they go through TILER and get mapped to some real memory location?
>

I lack the details here, but reading through TRM leaves me with the
impression that TILER smallest unit is a tile, and every tile is backed
by a real memory page (4KiB), so outside read-writes will end up in
memory that's there but unused and will do nothing.

omap_gem_new() calls tiler_align(), which in turn seems to return
page-aligned size, so I think there is no issue here.

>>>> 3. Do not refuse to export non-TILER non-contiguous buffers (we can
>>>> use them for rendering, for example)
>>>
>>> omapdrm should only allow allocation of buffers that are used for
>>> display. Pure render buffers should be allocated from SGX.
>>>
>>
>> In practice everyone around uses omapdrm to allocate all kinds of
>> buffers, which kind of makes sense to me - that way you keep only one
>> DRM fd around. Take omap-video for example, it uses omap_bo_new() (or
>> omap_bo_tiled()) to allocate DRI buffers. GBM does the same
>> (gbm_bo_create() uses omapdrm, because that's the DRM fd you create
>> gbm with), IIUC. But those buffers should not be necessarily scanout
>> capable.
>>
>> The problem with contiguous buffers as I see it, is that CMA is
>> limited, so lets not waste it if we can use DMA memory and scatterlist.
>>
>> To support that, we just need to simply create a scatterlist, in
>> similar way we do for TILER BOs in the $subject patch, so it should be
>> few lines of code only.
>
> Yep. Well, as I mentioned above, I disagree with this. We have proper
> generic memory allocators.
>

Maybe you could reconsider in the light of my latest explanations :)

Thanks and regards,
Ivo

2021-11-16 10:21:23

by Tomi Valkeinen

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

On 16/11/2021 10:27, Ivaylo Dimitrov wrote:
> Hi,
>
> On 16.11.21 г. 8:42 ч., Tomi Valkeinen wrote:
>> On 15/11/2021 19:15, Ivaylo Dimitrov wrote:
>>> Hi,
>>>
>>> On 15.11.21 г. 17:37 ч., Tomi Valkeinen wrote:
>>>> On 15/11/2021 15:55, Ivaylo Dimitrov wrote:
>>>>> Hi Tomi,
>>>>>
>>>>> On 15.11.21 г. 10:42 ч., Tomi Valkeinen wrote:
>>>>>> Hi,
>>>>>>
>>>>>> On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
>>>>>>> Memory of BOs backed by TILER is not contiguous, but
>>>>>>> omap_gem_map_dma_buf()
>>>>>>> exports it like it is. This leads to (possibly) invalid memory
>>>>>>> accesses if
>>>>>>> another device imports such a BO.
>>>>>>
>>>>>> This is one reason why TILER hasn't been officially supported. But
>>>>>> the above is not exactly right, or at least not the whole truth.
>>>>>>
>>>>>
>>>>> Definitely, not only these BOs lie about their memory layout, they
>>>>> lie about size and alignment as well. I have 2 more patches here
>>>>> (one is to align TILER memory on page, as proposed by Matthijs in
>>>>> the other mail, the other to set the correct size when exporting
>>>>> TILER BO), but I wanted to hear from you first, like, what is the
>>>>> general trend :) .
>>>>
>>>> My thoughts here are that the current code doesn't work in practice,
>>>> so if you get it fixed, it's great =).
>>>>
>>>>> Also, I have another patch in mind, that will enable exporting of
>>>>> buffers that are not TILER backed, but are not CMA backed either.
>>>>> SGX for example does not need CMA memory to render to.
>>>>
>>>> What do you mean with this? DSS needs contiguous memory, so the
>>>> memory has to be 1) physically contiguous, 2) mapped with DMM or 3)
>>>> mapped with TILER. There's no reason for the driver to export
>>>> non-contiguous memory.
>>>>
>>>
>>> DSS yes, but, omapdrm is used to allocate non-scanout buffers as
>>> well, which do not need to be (and in practice are not) contiguous.
>>> GPU (or anyone with MMU) can render on them (DRI buffers for example)
>>> and later on those buffers can be copied (blit) to the framebuffer.
>>> Yes, not zero-copy, but if you're doing compositing, there is no
>>> option anyway.
>>>
>>> Exactly this is done by omap-video driver for example. GBM BOs are
>>> allocated through omapdrm as well.
>>
>> That is not correct and shouldn't be done. omapdrm is not a generic
>> memory allocator. We have real generic allocators, so those should be
>> used. Or, if the buffer is only used for a single device, the buffer
>> should be allocated from that device's driver.
>>
>
> Yes, I saw the comment in kernel headers that dumb buffers should not be
> used for rendering
> (https://elixir.bootlin.com/linux/latest/source/include/drm/drm_drv.h#L361).
> This makes no sense to me at all, but maybe I am missing the point.

I believe that comments refers to another issue: a dumb buffer from may
not be usable for rendering. It's only guaranteed to be
readable/writable by the CPU.

What I'm talking about is that a driver must support memory allocations
for buffers that the device handled by the driver can use. In many cases
that allocated buffer also works with other devices, and thus dmabuf
export/import can be used. But a driver supporting memory allocations
for buffers that the device itself cannot use is just wrong.

> Also, it could be that the implementation of omap-video and/or PVR
> userspace blobs is against the specs, but I see omap-video calling
> DRM_IOCTL_OMAP_GEM_NEW for DRI buffers without OMAP_BO_SCANOUT and
> libdbm.so calling DRM_IOCTL_MODE_CREATE_DUMB to create buffers then used
> for rendering.

I think neither of those are exactly material to be used as examples on
how to do things. And there's lots of history there. We didn't have
generic allocators back then.

> This is not an issue on omap4 an later, because when export of that
> buffer is requested, omapdrm uses DMM and exports a single scatterlist
> entry, IIUC.
>
> But, on omap3, given there is no DMM, export is simply refused. I don't
> see that as a consistent behaviour - we shall either a) export
> non-scanout buffers (scattered ones) using whatever is supported (DMM
> and single scatterlist entry on omap4 (and later), multiple-entry
> scatterlist on omap3) or b) always require OMAP_BO_SCANOUT for BOs to be
> exported and refuse to export if no such flag is set. I would say b) is
> not a good option which leaves a) only.

I think we should always require OMAP_BO_SCANOUT, or rather, drop the
flag totally and always expect the buffer to be a scanout buffer. The
only use for DSS is scanout, and those are the only buffers that omapdrm
needs to support. But that would be breaking the uAPI, so I think we
just have to support what we do now.

> BTW, I think DMM is not really needed unless userspace requests mmap(),
> in theory we can provide userspace with view through DMM but give device
> drivers multiple entry scatterlist, potentially saving DMM space.

The userspace (CPU) doesn't need the DMM, the CPU has an MMU. I thought
we already skip the DMM when mapping to the userspace. But in TILER case
we always need TILER, even with the CPU.

> I hope I made it clearer now why I think this feature shall be implemented.

I think it's just adding more wrong on top of the old wrong =).

Also, if we need DMM/TILER allocations for other devices than DSS (but
so far this hasn't been mentioned), then I think the DMM/TILER
functionality should be separated from omapdrm and moved to (I think)
dma-heap.

>>>>> 2. Set exp_info.size = omap_gem_mmap_size(obj); when exporting a
>>>>> BO. That way importer knows the real BO memory size (including
>>>>> alignment etc) so he will be able to calculate the number of pages
>>>>> he needs to map the scatterlist.
>>>>
>>>> Can you elaborate what this means?
>>>>
>>>
>>> When we align to page, we shall report the size including the
>>> alignment, no? Or, it is the importer that shall take care to
>>> calculate BO size( including the alignment) based on scatterlist if
>>> he needs to?
>>
>> I'm not sure... But I guess the export size should include the alignment.
>>
>
> My understanding as well. Will sent that change as a part of page
> alignment patch.
>
>> Hmm... I haven't had enough coffee yet, but how does this go... Let's
>> say we have a tiled fb, and the width gets expanded to a page. What
>> happens to reads/writes that happen outside the fb, but still within
>> the page? Those should cause an error or do nothing, but is it
>> possible that they go through TILER and get mapped to some real memory
>> location?
>>
>
> I lack the details here, but reading through TRM leaves me with the
> impression that TILER smallest unit is a tile, and every tile is backed
> by a real memory page (4KiB), so outside read-writes will end up in
> memory that's there but unused and will do nothing.
>
> omap_gem_new() calls tiler_align(), which in turn seems to return
> page-aligned size, so I think there is no issue here.

Maybe, but, consider this example, with numbers totally out of thin air:
We have a fb with the width of 32 pixels, so 128 bytes. If we have tiles
which cover 32 x 32 pixels (so 4096 bytes with 4 bpp), we need one tile
to cover the width. But we have all the rest of the page mapped, so 3968
bytes that are not covered with a tile (or rather, we haven't configured
that tile, or maybe the tile contains old configuration).

I could be totally wrong here, as I don't remember the details. But I do
think that it's very easy to get this wrong, creating memory corruptions
and/or security violations.

Tomi

2021-11-16 11:12:42

by Ivaylo Dimitrov

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs

On 16.11.21 г. 12:20 ч., Tomi Valkeinen wrote:
> On 16/11/2021 10:27, Ivaylo Dimitrov wrote:
>> Hi,
>>
>> On 16.11.21 г. 8:42 ч., Tomi Valkeinen wrote:
>>> On 15/11/2021 19:15, Ivaylo Dimitrov wrote:
>>>> Hi,
>>>>
>>>> On 15.11.21 г. 17:37 ч., Tomi Valkeinen wrote:
>>>>> On 15/11/2021 15:55, Ivaylo Dimitrov wrote:
>>>>>> Hi Tomi,
>>>>>>
>>>>>> On 15.11.21 г. 10:42 ч., Tomi Valkeinen wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
>>>>>>>> Memory of BOs backed by TILER is not contiguous, but
>>>>>>>> omap_gem_map_dma_buf()
>>>>>>>> exports it like it is. This leads to (possibly) invalid memory
>>>>>>>> accesses if
>>>>>>>> another device imports such a BO.
>>>>>>>
>>>>>>> This is one reason why TILER hasn't been officially supported.
>>>>>>> But the above is not exactly right, or at least not the whole truth.
>>>>>>>
>>>>>>
>>>>>> Definitely, not only these BOs lie about their memory layout, they
>>>>>> lie about size and alignment as well. I have 2 more patches here
>>>>>> (one is to align TILER memory on page, as proposed by Matthijs in
>>>>>> the other mail, the other to set the correct size when exporting
>>>>>> TILER BO), but I wanted to hear from you first, like, what is the
>>>>>> general trend :) .
>>>>>
>>>>> My thoughts here are that the current code doesn't work in
>>>>> practice, so if you get it fixed, it's great =).
>>>>>
>>>>>> Also, I have another patch in mind, that will enable exporting of
>>>>>> buffers that are not TILER backed, but are not CMA backed either.
>>>>>> SGX for example does not need CMA memory to render to.
>>>>>
>>>>> What do you mean with this? DSS needs contiguous memory, so the
>>>>> memory has to be 1) physically contiguous, 2) mapped with DMM or 3)
>>>>> mapped with TILER. There's no reason for the driver to export
>>>>> non-contiguous memory.
>>>>>
>>>>
>>>> DSS yes, but, omapdrm is used to allocate non-scanout buffers as
>>>> well, which do not need to be (and in practice are not) contiguous.
>>>> GPU (or anyone with MMU) can render on them (DRI buffers for
>>>> example) and later on those buffers can be copied (blit) to the
>>>> framebuffer. Yes, not zero-copy, but if you're doing compositing,
>>>> there is no option anyway.
>>>>
>>>> Exactly this is done by omap-video driver for example. GBM BOs are
>>>> allocated through omapdrm as well.
>>>
>>> That is not correct and shouldn't be done. omapdrm is not a generic
>>> memory allocator. We have real generic allocators, so those should be
>>> used. Or, if the buffer is only used for a single device, the buffer
>>> should be allocated from that device's driver.
>>>
>>
>> Yes, I saw the comment in kernel headers that dumb buffers should not
>> be used for rendering
>> (https://elixir.bootlin.com/linux/latest/source/include/drm/drm_drv.h#L361).
>> This makes no sense to me at all, but maybe I am missing the point.
>
> I believe that comments refers to another issue: a dumb buffer from may
> not be usable for rendering. It's only guaranteed to be
> readable/writable by the CPU.
>
> What I'm talking about is that a driver must support memory allocations
> for buffers that the device handled by the driver can use. In many cases
> that allocated buffer also works with other devices, and thus dmabuf
> export/import can be used. But a driver supporting memory allocations
> for buffers that the device itself cannot use is just wrong.
>
>> Also, it could be that the implementation of omap-video and/or PVR
>> userspace blobs is against the specs, but I see omap-video calling
>> DRM_IOCTL_OMAP_GEM_NEW for DRI buffers without OMAP_BO_SCANOUT and
>> libdbm.so calling DRM_IOCTL_MODE_CREATE_DUMB to create buffers then
>> used for rendering.
>
> I think neither of those are exactly material to be used as examples on
> how to do things. And there's lots of history there. We didn't have
> generic allocators back then.
>
>> This is not an issue on omap4 an later, because when export of that
>> buffer is requested, omapdrm uses DMM and exports a single scatterlist
>> entry, IIUC.
>>
>> But, on omap3, given there is no DMM, export is simply refused. I
>> don't see that as a consistent behaviour - we shall either a) export
>> non-scanout buffers (scattered ones) using whatever is supported (DMM
>> and single scatterlist entry on omap4 (and later), multiple-entry
>> scatterlist on omap3) or b) always require OMAP_BO_SCANOUT for BOs to
>> be exported and refuse to export if no such flag is set. I would say
>> b) is not a good option which leaves a) only.
>
> I think we should always require OMAP_BO_SCANOUT, or rather, drop the
> flag totally and always expect the buffer to be a scanout buffer. The
> only use for DSS is scanout, and those are the only buffers that omapdrm
> needs to support. But that would be breaking the uAPI, so I think we
> just have to support what we do now.
>
>> BTW, I think DMM is not really needed unless userspace requests
>> mmap(), in theory we can provide userspace with view through DMM but
>> give device drivers multiple entry scatterlist, potentially saving DMM
>> space.
>
> The userspace (CPU) doesn't need the DMM, the CPU has an MMU. I thought
> we already skip the DMM when mapping to the userspace. But in TILER case
> we always need TILER, even with the CPU.
>
>> I hope I made it clearer now why I think this feature shall be
>> implemented.
>
> I think it's just adding more wrong on top of the old wrong =).
>
> Also, if we need DMM/TILER allocations for other devices than DSS (but
> so far this hasn't been mentioned), then I think the DMM/TILER
> functionality should be separated from omapdrm and moved to (I think)
> dma-heap.
>

I see. Ok, it is not that much of an issue for me to carry one
out-of-tree patch if needed.

>>>>>> 2. Set exp_info.size = omap_gem_mmap_size(obj); when exporting a
>>>>>> BO. That way importer knows the real BO memory size (including
>>>>>> alignment etc) so he will be able to calculate the number of pages
>>>>>> he needs to map the scatterlist.
>>>>>
>>>>> Can you elaborate what this means?
>>>>>
>>>>
>>>> When we align to page, we shall report the size including the
>>>> alignment, no? Or, it is the importer that shall take care to
>>>> calculate BO size( including the alignment) based on scatterlist if
>>>> he needs to?
>>>
>>> I'm not sure... But I guess the export size should include the
>>> alignment.
>>>
>>
>> My understanding as well. Will sent that change as a part of page
>> alignment patch.
>>
>>> Hmm... I haven't had enough coffee yet, but how does this go... Let's
>>> say we have a tiled fb, and the width gets expanded to a page. What
>>> happens to reads/writes that happen outside the fb, but still within
>>> the page? Those should cause an error or do nothing, but is it
>>> possible that they go through TILER and get mapped to some real
>>> memory location?
>>>
>>
>> I lack the details here, but reading through TRM leaves me with the
>> impression that TILER smallest unit is a tile, and every tile is
>> backed by a real memory page (4KiB), so outside read-writes will end
>> up in memory that's there but unused and will do nothing.
>>
>> omap_gem_new() calls tiler_align(), which in turn seems to return
>> page-aligned size, so I think there is no issue here.
>
> Maybe, but, consider this example, with numbers totally out of thin air:
> We have a fb with the width of 32 pixels, so 128 bytes. If we have tiles
> which cover 32 x 32 pixels (so 4096 bytes with 4 bpp), we need one tile
> to cover the width. But we have all the rest of the page mapped, so 3968
> bytes that are not covered with a tile (or rather, we haven't configured
> that tile, or maybe the tile contains old configuration).
>
> I could be totally wrong here, as I don't remember the details. But I do
> think that it's very easy to get this wrong, creating memory corruptions
> and/or security violations.
>

I see what you mean and I think you are right. What about configuring
(if we use your made-up values) those 31 'unused' TILER entries to point
to the same page tile 1 points to? If we need more than one tile (say N)
to cover the width, apply the same logic, like, tile N+1 points to page
1, N+2 to page 2 and so on. Or, allocate one extra page and setup all
'unused' tiles to point to it? Both will guarantee no other memory can
be accessed, IIUC, though I prefer the 'overlap' workaround.

Also, we can have a single page used to back all the 'unused' tiler
entries, for all the TILER BOs, but with such an attributes that it is
not actually accessible (not sure how such a page shall be allocated
though), so if someone tries to access memory outside of the allowed
region and hits that page, we'll either have SEGFAULT or OOPS.

Ivo

2021-11-16 16:11:11

by Ivaylo Dimitrov

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs



On 16.11.21 г. 13:12 ч., Ivaylo Dimitrov wrote:
> On 16.11.21 г. 12:20 ч., Tomi Valkeinen wrote:
>> On 16/11/2021 10:27, Ivaylo Dimitrov wrote:
>>> Hi,
>>>
>>> On 16.11.21 г. 8:42 ч., Tomi Valkeinen wrote:
>>>> On 15/11/2021 19:15, Ivaylo Dimitrov wrote:
>>>>> Hi,
>>>>>
>>>>> On 15.11.21 г. 17:37 ч., Tomi Valkeinen wrote:
>>>>>> On 15/11/2021 15:55, Ivaylo Dimitrov wrote:
>>>>>>> Hi Tomi,
>>>>>>>
>>>>>>> On 15.11.21 г. 10:42 ч., Tomi Valkeinen wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
>>>>>>>>> Memory of BOs backed by TILER is not contiguous, but
>>>>>>>>> omap_gem_map_dma_buf()
>>>>>>>>> exports it like it is. This leads to (possibly) invalid memory
>>>>>>>>> accesses if
>>>>>>>>> another device imports such a BO.
>>>>>>>>
>>>>>>>> This is one reason why TILER hasn't been officially supported.
>>>>>>>> But the above is not exactly right, or at least not the whole
>>>>>>>> truth.
>>>>>>>>
>>>>>>>
>>>>>>> Definitely, not only these BOs lie about their memory layout,
>>>>>>> they lie about size and alignment as well. I have 2 more patches
>>>>>>> here (one is to align TILER memory on page, as proposed by
>>>>>>> Matthijs in the other mail, the other to set the correct size
>>>>>>> when exporting TILER BO), but I wanted to hear from you first,
>>>>>>> like, what is the general trend :) .
>>>>>>
>>>>>> My thoughts here are that the current code doesn't work in
>>>>>> practice, so if you get it fixed, it's great =).
>>>>>>
>>>>>>> Also, I have another patch in mind, that will enable exporting of
>>>>>>> buffers that are not TILER backed, but are not CMA backed either.
>>>>>>> SGX for example does not need CMA memory to render to.
>>>>>>
>>>>>> What do you mean with this? DSS needs contiguous memory, so the
>>>>>> memory has to be 1) physically contiguous, 2) mapped with DMM or
>>>>>> 3) mapped with TILER. There's no reason for the driver to export
>>>>>> non-contiguous memory.
>>>>>>
>>>>>
>>>>> DSS yes, but, omapdrm is used to allocate non-scanout buffers as
>>>>> well, which do not need to be (and in practice are not) contiguous.
>>>>> GPU (or anyone with MMU) can render on them (DRI buffers for
>>>>> example) and later on those buffers can be copied (blit) to the
>>>>> framebuffer. Yes, not zero-copy, but if you're doing compositing,
>>>>> there is no option anyway.
>>>>>
>>>>> Exactly this is done by omap-video driver for example. GBM BOs are
>>>>> allocated through omapdrm as well.
>>>>
>>>> That is not correct and shouldn't be done. omapdrm is not a generic
>>>> memory allocator. We have real generic allocators, so those should
>>>> be used. Or, if the buffer is only used for a single device, the
>>>> buffer should be allocated from that device's driver.
>>>>
>>>
>>> Yes, I saw the comment in kernel headers that dumb buffers should not
>>> be used for rendering
>>> (https://elixir.bootlin.com/linux/latest/source/include/drm/drm_drv.h#L361).
>>> This makes no sense to me at all, but maybe I am missing the point.
>>
>> I believe that comments refers to another issue: a dumb buffer from
>> may not be usable for rendering. It's only guaranteed to be
>> readable/writable by the CPU.
>>
>> What I'm talking about is that a driver must support memory
>> allocations for buffers that the device handled by the driver can use.
>> In many cases that allocated buffer also works with other devices, and
>> thus dmabuf export/import can be used. But a driver supporting memory
>> allocations for buffers that the device itself cannot use is just wrong.
>>
>>> Also, it could be that the implementation of omap-video and/or PVR
>>> userspace blobs is against the specs, but I see omap-video calling
>>> DRM_IOCTL_OMAP_GEM_NEW for DRI buffers without OMAP_BO_SCANOUT and
>>> libdbm.so calling DRM_IOCTL_MODE_CREATE_DUMB to create buffers then
>>> used for rendering.
>>
>> I think neither of those are exactly material to be used as examples
>> on how to do things. And there's lots of history there. We didn't have
>> generic allocators back then.
>>
>>> This is not an issue on omap4 an later, because when export of that
>>> buffer is requested, omapdrm uses DMM and exports a single
>>> scatterlist entry, IIUC.
>>>
>>> But, on omap3, given there is no DMM, export is simply refused. I
>>> don't see that as a consistent behaviour - we shall either a) export
>>> non-scanout buffers (scattered ones) using whatever is supported (DMM
>>> and single scatterlist entry on omap4 (and later), multiple-entry
>>> scatterlist on omap3) or b) always require OMAP_BO_SCANOUT for BOs to
>>> be exported and refuse to export if no such flag is set. I would say
>>> b) is not a good option which leaves a) only.
>>
>> I think we should always require OMAP_BO_SCANOUT, or rather, drop the
>> flag totally and always expect the buffer to be a scanout buffer. The
>> only use for DSS is scanout, and those are the only buffers that
>> omapdrm needs to support. But that would be breaking the uAPI, so I
>> think we just have to support what we do now.
>>
>>> BTW, I think DMM is not really needed unless userspace requests
>>> mmap(), in theory we can provide userspace with view through DMM but
>>> give device drivers multiple entry scatterlist, potentially saving
>>> DMM space.
>>
>> The userspace (CPU) doesn't need the DMM, the CPU has an MMU. I
>> thought we already skip the DMM when mapping to the userspace. But in
>> TILER case we always need TILER, even with the CPU.
>>
>>> I hope I made it clearer now why I think this feature shall be
>>> implemented.
>>
>> I think it's just adding more wrong on top of the old wrong =).
>>
>> Also, if we need DMM/TILER allocations for other devices than DSS (but
>> so far this hasn't been mentioned), then I think the DMM/TILER
>> functionality should be separated from omapdrm and moved to (I think)
>> dma-heap.
>>
>
> I see. Ok, it is not that much of an issue for me to carry one
> out-of-tree patch if needed.
>
>>>>>>> 2. Set exp_info.size = omap_gem_mmap_size(obj); when exporting a
>>>>>>> BO. That way importer knows the real BO memory size (including
>>>>>>> alignment etc) so he will be able to calculate the number of
>>>>>>> pages he needs to map the scatterlist.
>>>>>>
>>>>>> Can you elaborate what this means?
>>>>>>
>>>>>
>>>>> When we align to page, we shall report the size including the
>>>>> alignment, no? Or, it is the importer that shall take care to
>>>>> calculate BO size( including the alignment) based on scatterlist if
>>>>> he needs to?
>>>>
>>>> I'm not sure... But I guess the export size should include the
>>>> alignment.
>>>>
>>>
>>> My understanding as well. Will sent that change as a part of page
>>> alignment patch.
>>>
>>>> Hmm... I haven't had enough coffee yet, but how does this go...
>>>> Let's say we have a tiled fb, and the width gets expanded to a page.
>>>> What happens to reads/writes that happen outside the fb, but still
>>>> within the page? Those should cause an error or do nothing, but is
>>>> it possible that they go through TILER and get mapped to some real
>>>> memory location?
>>>>
>>>
>>> I lack the details here, but reading through TRM leaves me with the
>>> impression that TILER smallest unit is a tile, and every tile is
>>> backed by a real memory page (4KiB), so outside read-writes will end
>>> up in memory that's there but unused and will do nothing.
>>>
>>> omap_gem_new() calls tiler_align(), which in turn seems to return
>>> page-aligned size, so I think there is no issue here.
>>
>> Maybe, but, consider this example, with numbers totally out of thin
>> air: We have a fb with the width of 32 pixels, so 128 bytes. If we
>> have tiles which cover 32 x 32 pixels (so 4096 bytes with 4 bpp), we
>> need one tile to cover the width. But we have all the rest of the page
>> mapped, so 3968 bytes that are not covered with a tile (or rather, we
>> haven't configured that tile, or maybe the tile contains old
>> configuration).
>>
>> I could be totally wrong here, as I don't remember the details. But I
>> do think that it's very easy to get this wrong, creating memory
>> corruptions and/or security violations.
>>
>
> I see what you mean and I think you are right. What about configuring
> (if we use your made-up values) those 31 'unused' TILER entries to point
> to the same page tile 1 points to? If we need more than one tile (say N)
> to cover the width, apply the same logic, like, tile N+1 points to page
> 1, N+2 to page 2 and so on. Or, allocate one extra page and setup all
> 'unused' tiles to point to it? Both will guarantee no other memory can
> be accessed, IIUC, though I prefer the 'overlap' workaround.
>

I see I am not the first one to come up with that 'extra page' idea:
https://elixir.bootlin.com/linux/v5.16-rc1/source/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c#L349
. So, we already have that dummy_page, but it seems the loop in
dmm_txn_append() needs some massaging to back 'unused' tiles with that
dummy page.

I will send an updated patch.

> Also, we can have a single page used to back all the 'unused' tiler
> entries, for all the TILER BOs, but with such an attributes that it is
> not actually accessible (not sure how such a page shall be allocated
> though), so if someone tries to access memory outside of the allowed
> region and hits that page, we'll either have SEGFAULT or OOPS.
>
> Ivo

2021-11-19 06:42:24

by Ivaylo Dimitrov

[permalink] [raw]
Subject: Re: [PATCH v2] drm: omapdrm: Export correct scatterlist for TILER backed BOs



On 16.11.21 г. 12:20 ч., Tomi Valkeinen wrote:
> On 16/11/2021 10:27, Ivaylo Dimitrov wrote:
>> Hi,
>>
>> On 16.11.21 г. 8:42 ч., Tomi Valkeinen wrote:
>>> On 15/11/2021 19:15, Ivaylo Dimitrov wrote:
>>>> Hi,
>>>>
>>>> On 15.11.21 г. 17:37 ч., Tomi Valkeinen wrote:
>>>>> On 15/11/2021 15:55, Ivaylo Dimitrov wrote:
>>>>>> Hi Tomi,
>>>>>>
>>>>>> On 15.11.21 г. 10:42 ч., Tomi Valkeinen wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On 13/11/2021 11:53, Ivaylo Dimitrov wrote:
>>>>>>>> Memory of BOs backed by TILER is not contiguous, but
>>>>>>>> omap_gem_map_dma_buf()
>>>>>>>> exports it like it is. This leads to (possibly) invalid memory
>>>>>>>> accesses if
>>>>>>>> another device imports such a BO.
>>>>>>>
>>>>>>> This is one reason why TILER hasn't been officially supported.
>>>>>>> But the above is not exactly right, or at least not the whole truth.
>>>>>>>
>>>>>>
>>>>>> Definitely, not only these BOs lie about their memory layout, they
>>>>>> lie about size and alignment as well. I have 2 more patches here
>>>>>> (one is to align TILER memory on page, as proposed by Matthijs in
>>>>>> the other mail, the other to set the correct size when exporting
>>>>>> TILER BO), but I wanted to hear from you first, like, what is the
>>>>>> general trend :) .
>>>>>
>>>>> My thoughts here are that the current code doesn't work in
>>>>> practice, so if you get it fixed, it's great =).
>>>>>
>>>>>> Also, I have another patch in mind, that will enable exporting of
>>>>>> buffers that are not TILER backed, but are not CMA backed either.
>>>>>> SGX for example does not need CMA memory to render to.
>>>>>
>>>>> What do you mean with this? DSS needs contiguous memory, so the
>>>>> memory has to be 1) physically contiguous, 2) mapped with DMM or 3)
>>>>> mapped with TILER. There's no reason for the driver to export
>>>>> non-contiguous memory.
>>>>>
>>>>
>>>> DSS yes, but, omapdrm is used to allocate non-scanout buffers as
>>>> well, which do not need to be (and in practice are not) contiguous.
>>>> GPU (or anyone with MMU) can render on them (DRI buffers for
>>>> example) and later on those buffers can be copied (blit) to the
>>>> framebuffer. Yes, not zero-copy, but if you're doing compositing,
>>>> there is no option anyway.
>>>>
>>>> Exactly this is done by omap-video driver for example. GBM BOs are
>>>> allocated through omapdrm as well.
>>>
>>> That is not correct and shouldn't be done. omapdrm is not a generic
>>> memory allocator. We have real generic allocators, so those should be
>>> used. Or, if the buffer is only used for a single device, the buffer
>>> should be allocated from that device's driver.
>>>
>>
>> Yes, I saw the comment in kernel headers that dumb buffers should not
>> be used for rendering
>> (https://elixir.bootlin.com/linux/latest/source/include/drm/drm_drv.h#L361).
>> This makes no sense to me at all, but maybe I am missing the point.
>
> I believe that comments refers to another issue: a dumb buffer from may
> not be usable for rendering. It's only guaranteed to be
> readable/writable by the CPU.
>
> What I'm talking about is that a driver must support memory allocations
> for buffers that the device handled by the driver can use. In many cases
> that allocated buffer also works with other devices, and thus dmabuf
> export/import can be used. But a driver supporting memory allocations
> for buffers that the device itself cannot use is just wrong.
>
>> Also, it could be that the implementation of omap-video and/or PVR
>> userspace blobs is against the specs, but I see omap-video calling
>> DRM_IOCTL_OMAP_GEM_NEW for DRI buffers without OMAP_BO_SCANOUT and
>> libdbm.so calling DRM_IOCTL_MODE_CREATE_DUMB to create buffers then
>> used for rendering.
>
> I think neither of those are exactly material to be used as examples on
> how to do things. And there's lots of history there. We didn't have
> generic allocators back then.
>
>> This is not an issue on omap4 an later, because when export of that
>> buffer is requested, omapdrm uses DMM and exports a single scatterlist
>> entry, IIUC.
>>
>> But, on omap3, given there is no DMM, export is simply refused. I
>> don't see that as a consistent behaviour - we shall either a) export
>> non-scanout buffers (scattered ones) using whatever is supported (DMM
>> and single scatterlist entry on omap4 (and later), multiple-entry
>> scatterlist on omap3) or b) always require OMAP_BO_SCANOUT for BOs to
>> be exported and refuse to export if no such flag is set. I would say
>> b) is not a good option which leaves a) only.
>
> I think we should always require OMAP_BO_SCANOUT, or rather, drop the
> flag totally and always expect the buffer to be a scanout buffer. The
> only use for DSS is scanout, and those are the only buffers that omapdrm
> needs to support. But that would be breaking the uAPI, so I think we
> just have to support what we do now.
>
>> BTW, I think DMM is not really needed unless userspace requests
>> mmap(), in theory we can provide userspace with view through DMM but
>> give device drivers multiple entry scatterlist, potentially saving DMM
>> space.
>
> The userspace (CPU) doesn't need the DMM, the CPU has an MMU. I thought
> we already skip the DMM when mapping to the userspace. But in TILER case
> we always need TILER, even with the CPU.
>
>> I hope I made it clearer now why I think this feature shall be
>> implemented.
>
> I think it's just adding more wrong on top of the old wrong =).
>
> Also, if we need DMM/TILER allocations for other devices than DSS (but
> so far this hasn't been mentioned), then I think the DMM/TILER
> functionality should be separated from omapdrm and moved to (I think)
> dma-heap.
>
>>>>>> 2. Set exp_info.size = omap_gem_mmap_size(obj); when exporting a
>>>>>> BO. That way importer knows the real BO memory size (including
>>>>>> alignment etc) so he will be able to calculate the number of pages
>>>>>> he needs to map the scatterlist.
>>>>>
>>>>> Can you elaborate what this means?
>>>>>
>>>>
>>>> When we align to page, we shall report the size including the
>>>> alignment, no? Or, it is the importer that shall take care to
>>>> calculate BO size( including the alignment) based on scatterlist if
>>>> he needs to?
>>>
>>> I'm not sure... But I guess the export size should include the
>>> alignment.
>>>
>>
>> My understanding as well. Will sent that change as a part of page
>> alignment patch.
>>
>>> Hmm... I haven't had enough coffee yet, but how does this go... Let's
>>> say we have a tiled fb, and the width gets expanded to a page. What
>>> happens to reads/writes that happen outside the fb, but still within
>>> the page? Those should cause an error or do nothing, but is it
>>> possible that they go through TILER and get mapped to some real
>>> memory location?
>>>
>>
>> I lack the details here, but reading through TRM leaves me with the
>> impression that TILER smallest unit is a tile, and every tile is
>> backed by a real memory page (4KiB), so outside read-writes will end
>> up in memory that's there but unused and will do nothing.
>>
>> omap_gem_new() calls tiler_align(), which in turn seems to return
>> page-aligned size, so I think there is no issue here.
>
> Maybe, but, consider this example, with numbers totally out of thin air:
> We have a fb with the width of 32 pixels, so 128 bytes. If we have tiles
> which cover 32 x 32 pixels (so 4096 bytes with 4 bpp), we need one tile
> to cover the width. But we have all the rest of the page mapped, so 3968
> bytes that are not covered with a tile (or rather, we haven't configured
> that tile, or maybe the tile contains old configuration).
>
> I could be totally wrong here, as I don't remember the details. But I do
> think that it's very easy to get this wrong, creating memory corruptions
> and/or security violations.
>

By further looking into this, I think we actually don't have any issue
here, see
https://elixir.bootlin.com/linux/v5.16-rc1/source/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c#L982.
So, on probe, all LUTs are initialised to point to dummy page, the same
happens when 2d block is released - its LUT entries are initialised to
point to dummy page -
https://elixir.bootlin.com/linux/v5.16-rc1/source/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c#L529.

I will send v3 that just incorporates 2d allocations page size alignment
and export size fix.

Ivo

2021-11-19 08:06:36

by Ivaylo Dimitrov

[permalink] [raw]
Subject: [PATCH v3] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Memory of BOs backed by TILER is not contiguous, but omap_gem_map_dma_buf()
exports it like it is. This leads to (possibly) invalid memory accesses if
another device imports such a BO.

Fix that by providing sg that correctly describes TILER memory layout.
Align TILER allocations to page, so importer to be able to correctly set
its MMU if have one. Set export size accounting for the alignment. Also,
make sure to destroy sg on unpin, as it is no longer valid.

Tested on Motorola Droid4 by using GPU (sgx540) to render.

Suggested-by: Matthijs van Duin <[email protected]>
Signed-off-by: Ivaylo Dimitrov <[email protected]>
---
drivers/gpu/drm/omapdrm/omap_gem.c | 79 ++++++++++++++++++++++++++++++-
drivers/gpu/drm/omapdrm/omap_gem.h | 2 +
drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 34 ++-----------
3 files changed, 85 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 97e5fe6..54cb6ce 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -800,7 +800,7 @@ int omap_gem_pin(struct drm_gem_object *obj, dma_addr_t *dma_addr)
if (omap_obj->flags & OMAP_BO_TILED_MASK) {
block = tiler_reserve_2d(fmt,
omap_obj->width,
- omap_obj->height, 0);
+ omap_obj->height, PAGE_SIZE);
} else {
block = tiler_reserve_1d(obj->size);
}
@@ -862,6 +862,11 @@ static void omap_gem_unpin_locked(struct drm_gem_object *obj)
return;

if (refcount_dec_and_test(&omap_obj->dma_addr_cnt)) {
+ if (omap_obj->sgt) {
+ sg_free_table(omap_obj->sgt);
+ kfree(omap_obj->sgt);
+ omap_obj->sgt = NULL;
+ }
ret = tiler_unpin(omap_obj->block);
if (ret) {
dev_err(obj->dev->dev,
@@ -974,6 +979,78 @@ int omap_gem_put_pages(struct drm_gem_object *obj)
return 0;
}

+struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj)
+{
+ struct omap_gem_object *omap_obj = to_omap_bo(obj);
+ dma_addr_t addr;
+ struct sg_table *sgt;
+ struct scatterlist *sg;
+ unsigned int count, len, stride, i;
+ int ret;
+
+ ret = omap_gem_pin(obj, &addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ mutex_lock(&omap_obj->lock);
+
+ sgt = omap_obj->sgt;
+ if (sgt)
+ goto out;
+
+ sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+ if (!sgt) {
+ ret = -ENOMEM;
+ goto err_unpin;
+ }
+
+ if (omap_obj->flags & OMAP_BO_TILED_MASK) {
+ enum tiler_fmt fmt = gem2fmt(omap_obj->flags);
+
+ len = omap_obj->width << (int)fmt;
+ count = omap_obj->height;
+ stride = tiler_stride(fmt, 0);
+ } else {
+ len = obj->size;
+ count = 1;
+ stride = 0;
+ }
+
+ ret = sg_alloc_table(sgt, count, GFP_KERNEL);
+ if (ret)
+ goto err_free;
+
+ for_each_sg(sgt->sgl, sg, count, i) {
+ sg_set_page(sg, phys_to_page(addr), len, offset_in_page(addr));
+ sg_dma_address(sg) = addr;
+ sg_dma_len(sg) = len;
+
+ addr += stride;
+ }
+
+ omap_obj->sgt = sgt;
+out:
+ mutex_unlock(&omap_obj->lock);
+ return sgt;
+
+err_free:
+ kfree(sgt);
+err_unpin:
+ mutex_unlock(&omap_obj->lock);
+ omap_gem_unpin(obj);
+ return ERR_PTR(ret);
+}
+
+void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt)
+{
+ struct omap_gem_object *omap_obj = to_omap_bo(obj);
+
+ if (WARN_ON(omap_obj->sgt != sgt))
+ return;
+
+ omap_gem_unpin(obj);
+}
+
#ifdef CONFIG_DRM_FBDEV_EMULATION
/*
* Get kernel virtual address for CPU access.. this more or less only
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.h b/drivers/gpu/drm/omapdrm/omap_gem.h
index eda9b48..19209e3 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.h
+++ b/drivers/gpu/drm/omapdrm/omap_gem.h
@@ -82,5 +82,7 @@ int omap_gem_get_pages(struct drm_gem_object *obj, struct page ***pages,
int omap_gem_rotated_dma_addr(struct drm_gem_object *obj, u32 orient,
int x, int y, dma_addr_t *dma_addr);
int omap_gem_tiled_stride(struct drm_gem_object *obj, u32 orient);
+struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj);
+void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt);

#endif /* __OMAPDRM_GEM_H__ */
diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
index f4cde3a..95a9a89 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
@@ -21,45 +21,21 @@ static struct sg_table *omap_gem_map_dma_buf(
{
struct drm_gem_object *obj = attachment->dmabuf->priv;
struct sg_table *sg;
- dma_addr_t dma_addr;
- int ret;
-
- sg = kzalloc(sizeof(*sg), GFP_KERNEL);
- if (!sg)
- return ERR_PTR(-ENOMEM);
-
- /* camera, etc, need physically contiguous.. but we need a
- * better way to know this..
- */
- ret = omap_gem_pin(obj, &dma_addr);
- if (ret)
- goto out;
-
- ret = sg_alloc_table(sg, 1, GFP_KERNEL);
- if (ret)
- goto out;
-
- sg_init_table(sg->sgl, 1);
- sg_dma_len(sg->sgl) = obj->size;
- sg_set_page(sg->sgl, pfn_to_page(PFN_DOWN(dma_addr)), obj->size, 0);
- sg_dma_address(sg->sgl) = dma_addr;
+ sg = omap_gem_get_sg(obj);
+ if (IS_ERR(sg))
+ return sg;

/* this must be after omap_gem_pin() to ensure we have pages attached */
omap_gem_dma_sync_buffer(obj, dir);

return sg;
-out:
- kfree(sg);
- return ERR_PTR(ret);
}

static void omap_gem_unmap_dma_buf(struct dma_buf_attachment *attachment,
struct sg_table *sg, enum dma_data_direction dir)
{
struct drm_gem_object *obj = attachment->dmabuf->priv;
- omap_gem_unpin(obj);
- sg_free_table(sg);
- kfree(sg);
+ omap_gem_put_sg(obj, sg);
}

static int omap_gem_dmabuf_begin_cpu_access(struct dma_buf *buffer,
@@ -112,7 +88,7 @@ struct dma_buf *omap_gem_prime_export(struct drm_gem_object *obj, int flags)
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);

exp_info.ops = &omap_dmabuf_ops;
- exp_info.size = obj->size;
+ exp_info.size = omap_gem_mmap_size(obj);
exp_info.flags = flags;
exp_info.priv = obj;

--
1.9.1


2021-11-25 08:24:12

by Ivaylo Dimitrov

[permalink] [raw]
Subject: Re: [PATCH v3] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Gentle ping.

On 19.11.21 г. 10:06 ч., Ivaylo Dimitrov wrote:
> Memory of BOs backed by TILER is not contiguous, but omap_gem_map_dma_buf()
> exports it like it is. This leads to (possibly) invalid memory accesses if
> another device imports such a BO.
>
> Fix that by providing sg that correctly describes TILER memory layout.
> Align TILER allocations to page, so importer to be able to correctly set
> its MMU if have one. Set export size accounting for the alignment. Also,
> make sure to destroy sg on unpin, as it is no longer valid.
>
> Tested on Motorola Droid4 by using GPU (sgx540) to render.
>
> Suggested-by: Matthijs van Duin <[email protected]>
> Signed-off-by: Ivaylo Dimitrov <[email protected]>
> ---
> drivers/gpu/drm/omapdrm/omap_gem.c | 79 ++++++++++++++++++++++++++++++-
> drivers/gpu/drm/omapdrm/omap_gem.h | 2 +
> drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 34 ++-----------
> 3 files changed, 85 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
> index 97e5fe6..54cb6ce 100644
> --- a/drivers/gpu/drm/omapdrm/omap_gem.c
> +++ b/drivers/gpu/drm/omapdrm/omap_gem.c
> @@ -800,7 +800,7 @@ int omap_gem_pin(struct drm_gem_object *obj, dma_addr_t *dma_addr)
> if (omap_obj->flags & OMAP_BO_TILED_MASK) {
> block = tiler_reserve_2d(fmt,
> omap_obj->width,
> - omap_obj->height, 0);
> + omap_obj->height, PAGE_SIZE);
> } else {
> block = tiler_reserve_1d(obj->size);
> }
> @@ -862,6 +862,11 @@ static void omap_gem_unpin_locked(struct drm_gem_object *obj)
> return;
>
> if (refcount_dec_and_test(&omap_obj->dma_addr_cnt)) {
> + if (omap_obj->sgt) {
> + sg_free_table(omap_obj->sgt);
> + kfree(omap_obj->sgt);
> + omap_obj->sgt = NULL;
> + }
> ret = tiler_unpin(omap_obj->block);
> if (ret) {
> dev_err(obj->dev->dev,
> @@ -974,6 +979,78 @@ int omap_gem_put_pages(struct drm_gem_object *obj)
> return 0;
> }
>
> +struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj)
> +{
> + struct omap_gem_object *omap_obj = to_omap_bo(obj);
> + dma_addr_t addr;
> + struct sg_table *sgt;
> + struct scatterlist *sg;
> + unsigned int count, len, stride, i;
> + int ret;
> +
> + ret = omap_gem_pin(obj, &addr);
> + if (ret)
> + return ERR_PTR(ret);
> +
> + mutex_lock(&omap_obj->lock);
> +
> + sgt = omap_obj->sgt;
> + if (sgt)
> + goto out;
> +
> + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> + if (!sgt) {
> + ret = -ENOMEM;
> + goto err_unpin;
> + }
> +
> + if (omap_obj->flags & OMAP_BO_TILED_MASK) {
> + enum tiler_fmt fmt = gem2fmt(omap_obj->flags);
> +
> + len = omap_obj->width << (int)fmt;
> + count = omap_obj->height;
> + stride = tiler_stride(fmt, 0);
> + } else {
> + len = obj->size;
> + count = 1;
> + stride = 0;
> + }
> +
> + ret = sg_alloc_table(sgt, count, GFP_KERNEL);
> + if (ret)
> + goto err_free;
> +
> + for_each_sg(sgt->sgl, sg, count, i) {
> + sg_set_page(sg, phys_to_page(addr), len, offset_in_page(addr));
> + sg_dma_address(sg) = addr;
> + sg_dma_len(sg) = len;
> +
> + addr += stride;
> + }
> +
> + omap_obj->sgt = sgt;
> +out:
> + mutex_unlock(&omap_obj->lock);
> + return sgt;
> +
> +err_free:
> + kfree(sgt);
> +err_unpin:
> + mutex_unlock(&omap_obj->lock);
> + omap_gem_unpin(obj);
> + return ERR_PTR(ret);
> +}
> +
> +void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt)
> +{
> + struct omap_gem_object *omap_obj = to_omap_bo(obj);
> +
> + if (WARN_ON(omap_obj->sgt != sgt))
> + return;
> +
> + omap_gem_unpin(obj);
> +}
> +
> #ifdef CONFIG_DRM_FBDEV_EMULATION
> /*
> * Get kernel virtual address for CPU access.. this more or less only
> diff --git a/drivers/gpu/drm/omapdrm/omap_gem.h b/drivers/gpu/drm/omapdrm/omap_gem.h
> index eda9b48..19209e3 100644
> --- a/drivers/gpu/drm/omapdrm/omap_gem.h
> +++ b/drivers/gpu/drm/omapdrm/omap_gem.h
> @@ -82,5 +82,7 @@ int omap_gem_get_pages(struct drm_gem_object *obj, struct page ***pages,
> int omap_gem_rotated_dma_addr(struct drm_gem_object *obj, u32 orient,
> int x, int y, dma_addr_t *dma_addr);
> int omap_gem_tiled_stride(struct drm_gem_object *obj, u32 orient);
> +struct sg_table *omap_gem_get_sg(struct drm_gem_object *obj);
> +void omap_gem_put_sg(struct drm_gem_object *obj, struct sg_table *sgt);
>
> #endif /* __OMAPDRM_GEM_H__ */
> diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
> index f4cde3a..95a9a89 100644
> --- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
> +++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
> @@ -21,45 +21,21 @@ static struct sg_table *omap_gem_map_dma_buf(
> {
> struct drm_gem_object *obj = attachment->dmabuf->priv;
> struct sg_table *sg;
> - dma_addr_t dma_addr;
> - int ret;
> -
> - sg = kzalloc(sizeof(*sg), GFP_KERNEL);
> - if (!sg)
> - return ERR_PTR(-ENOMEM);
> -
> - /* camera, etc, need physically contiguous.. but we need a
> - * better way to know this..
> - */
> - ret = omap_gem_pin(obj, &dma_addr);
> - if (ret)
> - goto out;
> -
> - ret = sg_alloc_table(sg, 1, GFP_KERNEL);
> - if (ret)
> - goto out;
> -
> - sg_init_table(sg->sgl, 1);
> - sg_dma_len(sg->sgl) = obj->size;
> - sg_set_page(sg->sgl, pfn_to_page(PFN_DOWN(dma_addr)), obj->size, 0);
> - sg_dma_address(sg->sgl) = dma_addr;
> + sg = omap_gem_get_sg(obj);
> + if (IS_ERR(sg))
> + return sg;
>
> /* this must be after omap_gem_pin() to ensure we have pages attached */
> omap_gem_dma_sync_buffer(obj, dir);
>
> return sg;
> -out:
> - kfree(sg);
> - return ERR_PTR(ret);
> }
>
> static void omap_gem_unmap_dma_buf(struct dma_buf_attachment *attachment,
> struct sg_table *sg, enum dma_data_direction dir)
> {
> struct drm_gem_object *obj = attachment->dmabuf->priv;
> - omap_gem_unpin(obj);
> - sg_free_table(sg);
> - kfree(sg);
> + omap_gem_put_sg(obj, sg);
> }
>
> static int omap_gem_dmabuf_begin_cpu_access(struct dma_buf *buffer,
> @@ -112,7 +88,7 @@ struct dma_buf *omap_gem_prime_export(struct drm_gem_object *obj, int flags)
> DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
>
> exp_info.ops = &omap_dmabuf_ops;
> - exp_info.size = obj->size;
> + exp_info.size = omap_gem_mmap_size(obj);
> exp_info.flags = flags;
> exp_info.priv = obj;
>
>

2021-12-02 09:13:26

by Tomi Valkeinen

[permalink] [raw]
Subject: Re: [PATCH v3] drm: omapdrm: Export correct scatterlist for TILER backed BOs

Hi,

On 19/11/2021 10:06, Ivaylo Dimitrov wrote:
> Memory of BOs backed by TILER is not contiguous, but omap_gem_map_dma_buf()
> exports it like it is. This leads to (possibly) invalid memory accesses if
> another device imports such a BO.
>
> Fix that by providing sg that correctly describes TILER memory layout.
> Align TILER allocations to page, so importer to be able to correctly set
> its MMU if have one. Set export size accounting for the alignment. Also,
> make sure to destroy sg on unpin, as it is no longer valid.
>
> Tested on Motorola Droid4 by using GPU (sgx540) to render.
>
> Suggested-by: Matthijs van Duin <[email protected]>
> Signed-off-by: Ivaylo Dimitrov <[email protected]>
> ---
> drivers/gpu/drm/omapdrm/omap_gem.c | 79 ++++++++++++++++++++++++++++++-
> drivers/gpu/drm/omapdrm/omap_gem.h | 2 +
> drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 34 ++-----------
> 3 files changed, 85 insertions(+), 30 deletions(-)

I'll pick this to my work branch which I use every day, and let it be
there for a few days. If I don't see any issues, I'll push to drm-misc-next.

Tomi