2020-11-14 15:24:40

by Jonathan Marek

Subject: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

This makes it possible to use the non-coherent cached MSM_BO_CACHED mode,
which otherwise doesn't provide any method for cleaning/invalidating the
cache to sync with the device.
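
For example, userspace that has written to a MSM_BO_CACHED buffer with the
CPU would clean the cache before handing the buffer to the GPU
(illustrative sketch, untested; drm_fd, bo_handle and bo_size are assumed
to exist):

	struct drm_msm_gem_sync_cache sync = {
		.handle = bo_handle,
		.flags = MSM_GEM_SYNC_FOR_DEVICE,
		.offset = 0,
		.end = bo_size, /* end = offset + size */
	};

	drmIoctl(drm_fd, DRM_IOCTL_MSM_GEM_SYNC_CACHE, &sync);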

Signed-off-by: Jonathan Marek <[email protected]>
---
drivers/gpu/drm/msm/msm_drv.c | 21 +++++++++++++++++++++
drivers/gpu/drm/msm/msm_drv.h | 2 ++
drivers/gpu/drm/msm/msm_gem.c | 23 +++++++++++++++++++++++
include/uapi/drm/msm_drm.h | 20 ++++++++++++++++++++
4 files changed, 66 insertions(+)

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index bae48afca82e..3f17acdf6594 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -959,6 +959,26 @@ static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data,
return msm_submitqueue_remove(file->driver_priv, id);
}

+static int msm_ioctl_gem_sync_cache(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct drm_msm_gem_sync_cache *args = data;
+	struct drm_gem_object *obj;
+
+	if (args->flags & ~MSM_GEM_SYNC_CACHE_FLAGS)
+		return -EINVAL;
+
+	obj = drm_gem_object_lookup(file, args->handle);
+	if (!obj)
+		return -ENOENT;
+
+	msm_gem_sync_cache(obj, args->flags, args->offset, args->end);
+
+	drm_gem_object_put(obj);
+
+	return 0;
+}
+
static const struct drm_ioctl_desc msm_ioctls[] = {
DRM_IOCTL_DEF_DRV(MSM_GET_PARAM, msm_ioctl_get_param, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(MSM_GEM_NEW, msm_ioctl_gem_new, DRM_RENDER_ALLOW),
@@ -971,6 +991,7 @@ static const struct drm_ioctl_desc msm_ioctls[] = {
DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_NEW, msm_ioctl_submitqueue_new, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_CLOSE, msm_ioctl_submitqueue_close, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_QUERY, msm_ioctl_submitqueue_query, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(MSM_GEM_SYNC_CACHE, msm_ioctl_gem_sync_cache, DRM_RENDER_ALLOW),
};

static const struct vm_operations_struct vm_ops = {
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 22ebecb28349..f170f843010e 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -318,6 +318,8 @@ void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu);
void msm_gem_active_put(struct drm_gem_object *obj);
int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout);
int msm_gem_cpu_fini(struct drm_gem_object *obj);
+void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
+		size_t range_start, size_t range_end);
void msm_gem_free_object(struct drm_gem_object *obj);
int msm_gem_new_handle(struct drm_device *dev, struct drm_file *file,
uint32_t size, uint32_t flags, uint32_t *handle, char *name);
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 3d8254b5de16..039738696f9a 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -797,6 +797,29 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj)
return 0;
}

+void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
+		size_t range_start, size_t range_end)
+{
+	struct msm_gem_object *msm_obj = to_msm_bo(obj);
+	struct device *dev = msm_obj->base.dev->dev;
+
+	/* exit early if get_pages() hasn't been called yet */
+	if (!msm_obj->pages)
+		return;
+
+	/* TODO: sync only the specified range */
+
+	if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
+		dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
+				msm_obj->sgt->nents, DMA_TO_DEVICE);
+	}
+
+	if (flags & MSM_GEM_SYNC_FOR_CPU) {
+		dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
+				msm_obj->sgt->nents, DMA_FROM_DEVICE);
+	}
+}
+
#ifdef CONFIG_DEBUG_FS
static void describe_fence(struct dma_fence *fence, const char *type,
struct seq_file *m)
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
index 474497e8743a..c8288f328528 100644
--- a/include/uapi/drm/msm_drm.h
+++ b/include/uapi/drm/msm_drm.h
@@ -319,6 +319,24 @@ struct drm_msm_submitqueue_query {
__u32 pad;
};

+/*
+ * Host cache maintenance (relevant for MSM_BO_CACHED)
+ * note: the driver may both clean and invalidate (flush) where only a clean is requested
+ */
+
+#define MSM_GEM_SYNC_FOR_DEVICE 0x1
+#define MSM_GEM_SYNC_FOR_CPU 0x2
+
+#define MSM_GEM_SYNC_CACHE_FLAGS	(MSM_GEM_SYNC_FOR_DEVICE | \
+					 MSM_GEM_SYNC_FOR_CPU)
+
+struct drm_msm_gem_sync_cache {
+	__u32 handle;
+	__u32 flags;
+	__u64 offset;
+	__u64 end; /* offset + size */
+};
+
#define DRM_MSM_GET_PARAM 0x00
/* placeholder:
#define DRM_MSM_SET_PARAM 0x01
@@ -336,6 +354,7 @@ struct drm_msm_submitqueue_query {
#define DRM_MSM_SUBMITQUEUE_NEW 0x0A
#define DRM_MSM_SUBMITQUEUE_CLOSE 0x0B
#define DRM_MSM_SUBMITQUEUE_QUERY 0x0C
+#define DRM_MSM_GEM_SYNC_CACHE 0x0D

#define DRM_IOCTL_MSM_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param)
#define DRM_IOCTL_MSM_GEM_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new)
@@ -348,6 +367,7 @@ struct drm_msm_submitqueue_query {
#define DRM_IOCTL_MSM_SUBMITQUEUE_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue)
#define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32)
#define DRM_IOCTL_MSM_SUBMITQUEUE_QUERY DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_QUERY, struct drm_msm_submitqueue_query)
+#define DRM_IOCTL_MSM_GEM_SYNC_CACHE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_SYNC_CACHE, struct drm_msm_gem_sync_cache)

#if defined(__cplusplus)
}
--
2.26.1


2020-11-14 16:26:52

by Christoph Hellwig

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> + size_t range_start, size_t range_end)
> +{
> + struct msm_gem_object *msm_obj = to_msm_bo(obj);
> + struct device *dev = msm_obj->base.dev->dev;
> +
> + /* exit early if get_pages() hasn't been called yet */
> + if (!msm_obj->pages)
> + return;
> +
> + /* TODO: sync only the specified range */
> +
> + if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> + dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> + msm_obj->sgt->nents, DMA_TO_DEVICE);
> + }
> +
> + if (flags & MSM_GEM_SYNC_FOR_CPU) {
> + dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> + msm_obj->sgt->nents, DMA_FROM_DEVICE);
> + }

Splitting this helper from the only caller is rather strange, especially
with the two unused arguments. And I think the way this is specified
to take a range, but ignoring it is actively dangerous. User space will
rely on it syncing everything sooner or later and then you are stuck.
So just define a sync all primitive for now, and if you really need a
range sync and have actually implemented it add a new ioctl for that.
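
I.e. for now just something like (sketch, dropping the range fields
entirely):

struct drm_msm_gem_sync_cache {
	__u32 handle;
	__u32 flags;
};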

2020-11-14 18:49:02

by Rob Clark

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <[email protected]> wrote:
>
> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> > +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> > + size_t range_start, size_t range_end)
> > +{
> > + struct msm_gem_object *msm_obj = to_msm_bo(obj);
> > + struct device *dev = msm_obj->base.dev->dev;
> > +
> > + /* exit early if get_pages() hasn't been called yet */
> > + if (!msm_obj->pages)
> > + return;
> > +
> > + /* TODO: sync only the specified range */
> > +
> > + if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> > + dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> > + msm_obj->sgt->nents, DMA_TO_DEVICE);
> > + }
> > +
> > + if (flags & MSM_GEM_SYNC_FOR_CPU) {
> > + dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> > + msm_obj->sgt->nents, DMA_FROM_DEVICE);
> > + }
>
> Splitting this helper from the only caller is rather strange, especially
> with the two unused arguments. And I think the way this is specified
> to take a range, but ignoring it is actively dangerous. User space will
> rely on it syncing everything sooner or later and then you are stuck.
> So just define a sync all primitive for now, and if you really need a
> range sync and have actually implemented it add a new ioctl for that.

We do already have a split of ioctl "layer" which enforces valid ioctl
params, etc, and gem (or other) module code which is called by the
ioctl func. So I think it is fine to keep this split here. (Also, I
think at some point there will be a uring type of ioctl alternative
which would re-use the same gem func.)

But I do agree that the range should be respected or added later..
drm_ioctl() dispatch is well prepared for extending ioctls.

And I assume there should be some validation that the range is aligned
to cache-line? Or can we flush a partial cache line?
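
Something like this in the ioctl func is what I have in mind, if partial
lines can't be flushed (untested sketch):

	size_t align = dma_get_cache_alignment();

	if (!IS_ALIGNED(args->offset, align) || !IS_ALIGNED(args->end, align))
		return -EINVAL;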

BR,
-R

2020-11-14 19:00:45

by Jonathan Marek

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On 11/14/20 1:46 PM, Rob Clark wrote:
> On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <[email protected]> wrote:
>>
>> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
>>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
>>> + size_t range_start, size_t range_end)
>>> +{
>>> + struct msm_gem_object *msm_obj = to_msm_bo(obj);
>>> + struct device *dev = msm_obj->base.dev->dev;
>>> +
>>> + /* exit early if get_pages() hasn't been called yet */
>>> + if (!msm_obj->pages)
>>> + return;
>>> +
>>> + /* TODO: sync only the specified range */
>>> +
>>> + if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
>>> + dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
>>> + msm_obj->sgt->nents, DMA_TO_DEVICE);
>>> + }
>>> +
>>> + if (flags & MSM_GEM_SYNC_FOR_CPU) {
>>> + dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
>>> + msm_obj->sgt->nents, DMA_FROM_DEVICE);
>>> + }
>>
>> Splitting this helper from the only caller is rather strange, especially
>> with the two unused arguments. And I think the way this is specified
>> to take a range, but ignoring it is actively dangerous. User space will
>> rely on it syncing everything sooner or later and then you are stuck.
>> So just define a sync all primitive for now, and if you really need a
>> range sync and have actually implemented it add a new ioctl for that.
>
> We do already have a split of ioctl "layer" which enforces valid ioctl
> params, etc, and gem (or other) module code which is called by the
> ioctl func. So I think it is fine to keep this split here. (Also, I
> think at some point there will be a uring type of ioctl alternative
> which would re-use the same gem func.)
>
> But I do agree that the range should be respected or added later..
> drm_ioctl() dispatch is well prepared for extending ioctls.
>
> And I assume there should be some validation that the range is aligned
> to cache-line? Or can we flush a partial cache line?
>

The range is intended to be "sync at least this range", so that
userspace doesn't have to worry about details like that.
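
The idea is that the kernel would expand the range outward to cache-line
boundaries itself, something like (sketch, not in the patch yet):

	size_t align = dma_get_cache_alignment();

	range_start = round_down(range_start, align);
	range_end = round_up(range_end, align);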

> BR,
> -R
>

2020-11-14 19:44:36

by Rob Clark

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On Sat, Nov 14, 2020 at 10:58 AM Jonathan Marek <[email protected]> wrote:
>
> On 11/14/20 1:46 PM, Rob Clark wrote:
> > On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <[email protected]> wrote:
> >>
> >> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> >>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> >>> + size_t range_start, size_t range_end)
> >>> +{
> >>> + struct msm_gem_object *msm_obj = to_msm_bo(obj);
> >>> + struct device *dev = msm_obj->base.dev->dev;
> >>> +
> >>> + /* exit early if get_pages() hasn't been called yet */
> >>> + if (!msm_obj->pages)
> >>> + return;
> >>> +
> >>> + /* TODO: sync only the specified range */
> >>> +
> >>> + if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> >>> + dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> >>> + msm_obj->sgt->nents, DMA_TO_DEVICE);
> >>> + }
> >>> +
> >>> + if (flags & MSM_GEM_SYNC_FOR_CPU) {
> >>> + dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> >>> + msm_obj->sgt->nents, DMA_FROM_DEVICE);
> >>> + }
> >>
> >> Splitting this helper from the only caller is rather strange, especially
> >> with the two unused arguments. And I think the way this is specified
> >> to take a range, but ignoring it is actively dangerous. User space will
> >> rely on it syncing everything sooner or later and then you are stuck.
> >> So just define a sync all primitive for now, and if you really need a
> >> range sync and have actually implemented it add a new ioctl for that.
> >
> > We do already have a split of ioctl "layer" which enforces valid ioctl
> > params, etc, and gem (or other) module code which is called by the
> > ioctl func. So I think it is fine to keep this split here. (Also, I
> > think at some point there will be a uring type of ioctl alternative
> > which would re-use the same gem func.)
> >
> > But I do agree that the range should be respected or added later..
> > drm_ioctl() dispatch is well prepared for extending ioctls.
> >
> > And I assume there should be some validation that the range is aligned
> > to cache-line? Or can we flush a partial cache line?
> >
>
> The range is intended to be "sync at least this range", so that
> userspace doesn't have to worry about details like that.
>

I don't think userspace can *not* worry about details like that.
Consider a case where the cpu and gpu are simultaneously accessing
different parts of a buffer (for ex, sub-allocation). There needs to
be cache-line separation between the two.

BR,
-R

2020-11-14 20:12:31

by Jonathan Marek

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On 11/14/20 2:39 PM, Rob Clark wrote:
> On Sat, Nov 14, 2020 at 10:58 AM Jonathan Marek <[email protected]> wrote:
>>
>> On 11/14/20 1:46 PM, Rob Clark wrote:
>>> On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <[email protected]> wrote:
>>>>
>>>> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
>>>>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
>>>>> + size_t range_start, size_t range_end)
>>>>> +{
>>>>> + struct msm_gem_object *msm_obj = to_msm_bo(obj);
>>>>> + struct device *dev = msm_obj->base.dev->dev;
>>>>> +
>>>>> + /* exit early if get_pages() hasn't been called yet */
>>>>> + if (!msm_obj->pages)
>>>>> + return;
>>>>> +
>>>>> + /* TODO: sync only the specified range */
>>>>> +
>>>>> + if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
>>>>> + dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
>>>>> + msm_obj->sgt->nents, DMA_TO_DEVICE);
>>>>> + }
>>>>> +
>>>>> + if (flags & MSM_GEM_SYNC_FOR_CPU) {
>>>>> + dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
>>>>> + msm_obj->sgt->nents, DMA_FROM_DEVICE);
>>>>> + }
>>>>
>>>> Splitting this helper from the only caller is rather strange, especially
>>>> with the two unused arguments. And I think the way this is specified
>>>> to take a range, but ignoring it is actively dangerous. User space will
>>>> rely on it syncing everything sooner or later and then you are stuck.
>>>> So just define a sync all primitive for now, and if you really need a
>>>> range sync and have actually implemented it add a new ioctl for that.
>>>
>>> We do already have a split of ioctl "layer" which enforces valid ioctl
>>> params, etc, and gem (or other) module code which is called by the
>>> ioctl func. So I think it is fine to keep this split here. (Also, I
>>> think at some point there will be a uring type of ioctl alternative
>>> which would re-use the same gem func.)
>>>
>>> But I do agree that the range should be respected or added later..
>>> drm_ioctl() dispatch is well prepared for extending ioctls.
>>>
>>> And I assume there should be some validation that the range is aligned
>>> to cache-line? Or can we flush a partial cache line?
>>>
>>
>> The range is intended to be "sync at least this range", so that
>> userspace doesn't have to worry about details like that.
>>
>
> I don't think userspace can *not* worry about details like that.
> Consider a case where the cpu and gpu are simultaneously accessing
> different parts of a buffer (for ex, sub-allocation). There needs to
> be cache-line separation between the two.
>

Right.. and it also seems like we can't get away with just
flushing/invalidating the whole thing.

qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
dma_sync_single_for_cpu() does deal in some way with the partial cache
line case, although I'm not sure that means we can have a
nonCoherentAtomSize=1.

> BR,
> -R
>

2020-11-14 20:50:47

by Rob Clark

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On Sat, Nov 14, 2020 at 12:10 PM Jonathan Marek <[email protected]> wrote:
>
> On 11/14/20 2:39 PM, Rob Clark wrote:
> > On Sat, Nov 14, 2020 at 10:58 AM Jonathan Marek <[email protected]> wrote:
> >>
> >> On 11/14/20 1:46 PM, Rob Clark wrote:
> >>> On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <[email protected]> wrote:
> >>>>
> >>>> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> >>>>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> >>>>> + size_t range_start, size_t range_end)
> >>>>> +{
> >>>>> + struct msm_gem_object *msm_obj = to_msm_bo(obj);
> >>>>> + struct device *dev = msm_obj->base.dev->dev;
> >>>>> +
> >>>>> + /* exit early if get_pages() hasn't been called yet */
> >>>>> + if (!msm_obj->pages)
> >>>>> + return;
> >>>>> +
> >>>>> + /* TODO: sync only the specified range */
> >>>>> +
> >>>>> + if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> >>>>> + dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> >>>>> + msm_obj->sgt->nents, DMA_TO_DEVICE);
> >>>>> + }
> >>>>> +
> >>>>> + if (flags & MSM_GEM_SYNC_FOR_CPU) {
> >>>>> + dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> >>>>> + msm_obj->sgt->nents, DMA_FROM_DEVICE);
> >>>>> + }
> >>>>
> >>>> Splitting this helper from the only caller is rather strange, especially
> >>>> with the two unused arguments. And I think the way this is specified
> >>>> to take a range, but ignoring it is actively dangerous. User space will
> >>>> rely on it syncing everything sooner or later and then you are stuck.
> >>>> So just define a sync all primitive for now, and if you really need a
> >>>> range sync and have actually implemented it add a new ioctl for that.
> >>>
> >>> We do already have a split of ioctl "layer" which enforces valid ioctl
> >>> params, etc, and gem (or other) module code which is called by the
> >>> ioctl func. So I think it is fine to keep this split here. (Also, I
> >>> think at some point there will be a uring type of ioctl alternative
> >>> which would re-use the same gem func.)
> >>>
> >>> But I do agree that the range should be respected or added later..
> >>> drm_ioctl() dispatch is well prepared for extending ioctls.
> >>>
> >>> And I assume there should be some validation that the range is aligned
> >>> to cache-line? Or can we flush a partial cache line?
> >>>
> >>
> >> The range is intended to be "sync at least this range", so that
> >> userspace doesn't have to worry about details like that.
> >>
> >
> > I don't think userspace can *not* worry about details like that.
> > Consider a case where the cpu and gpu are simultaneously accessing
> > different parts of a buffer (for ex, sub-allocation). There needs to
> > be cache-line separation between the two.
> >
>
> Right.. and it also seems like we can't get away with just
> flushing/invalidating the whole thing.
>
> qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
> dma_sync_single_for_cpu() does deal in some way with the partial cache
> line case, although I'm not sure that means we can have a
> nonCoherentAtomSize=1.
>

flush/inv the whole thing could be a useful first step, or at least I
can think of some uses for it. But if it isn't useful for how vk sees
the world, then maybe we should just implement the range properly from
the get-go. (And I *think* requiring the range to be aligned to
cacheline boundaries.. it is always easier from a kernel uabi PoV to
loosen restrictions later than the other way around.)

BR,
-R

2020-11-16 17:39:03

by Christoph Hellwig

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On Sat, Nov 14, 2020 at 03:07:20PM -0500, Jonathan Marek wrote:
> qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
> dma_sync_single_for_cpu() does deal in some way with the partial cache line
> case, although I'm not sure that means we can have a nonCoherentAtomSize=1.

No, it doesn't. You need to ensure ownership is managed at
dma_get_cache_alignment() granularity.

2020-11-16 17:51:28

by Rob Clark

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On Mon, Nov 16, 2020 at 9:33 AM Christoph Hellwig <[email protected]> wrote:
>
> On Sat, Nov 14, 2020 at 03:07:20PM -0500, Jonathan Marek wrote:
> > qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
> > dma_sync_single_for_cpu() does deal in some way with the partial cache line
> > case, although I'm not sure that means we can have a nonCoherentAtomSize=1.
>
> No, it doesn't. You need to ensure ownership is managed at
> dma_get_cache_alignment() granularity.

my guess is nonCoherentAtomSize=1 only works in the case of cache
coherent buffers

BR,
-R

2020-11-16 20:47:49

by Jordan Crouse

Subject: Re: [Freedreno] [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On Sat, Nov 14, 2020 at 11:39:45AM -0800, Rob Clark wrote:
> On Sat, Nov 14, 2020 at 10:58 AM Jonathan Marek <[email protected]> wrote:
> >
> > On 11/14/20 1:46 PM, Rob Clark wrote:
> > > On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <[email protected]> wrote:
> > >>
> > >> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> > >>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> > >>> + size_t range_start, size_t range_end)
> > >>> +{
> > >>> + struct msm_gem_object *msm_obj = to_msm_bo(obj);
> > >>> + struct device *dev = msm_obj->base.dev->dev;
> > >>> +
> > >>> + /* exit early if get_pages() hasn't been called yet */
> > >>> + if (!msm_obj->pages)
> > >>> + return;
> > >>> +
> > >>> + /* TODO: sync only the specified range */
> > >>> +
> > >>> + if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> > >>> + dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> > >>> + msm_obj->sgt->nents, DMA_TO_DEVICE);
> > >>> + }
> > >>> +
> > >>> + if (flags & MSM_GEM_SYNC_FOR_CPU) {
> > >>> + dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> > >>> + msm_obj->sgt->nents, DMA_FROM_DEVICE);
> > >>> + }
> > >>
> > >> Splitting this helper from the only caller is rather strange, especially
> > >> with the two unused arguments. And I think the way this is specified
> > >> to take a range, but ignoring it is actively dangerous. User space will
> > >> rely on it syncing everything sooner or later and then you are stuck.
> > >> So just define a sync all primitive for now, and if you really need a
> > >> range sync and have actually implemented it add a new ioctl for that.
> > >
> > > We do already have a split of ioctl "layer" which enforces valid ioctl
> > > params, etc, and gem (or other) module code which is called by the
> > > ioctl func. So I think it is fine to keep this split here. (Also, I
> > > think at some point there will be a uring type of ioctl alternative
> > > which would re-use the same gem func.)
> > >
> > > But I do agree that the range should be respected or added later..
> > > drm_ioctl() dispatch is well prepared for extending ioctls.
> > >
> > > And I assume there should be some validation that the range is aligned
> > > to cache-line? Or can we flush a partial cache line?
> > >
> >
> > The range is intended to be "sync at least this range", so that
> > userspace doesn't have to worry about details like that.
> >
>
> I don't think userspace can *not* worry about details like that.
> Consider a case where the cpu and gpu are simultaneously accessing
> different parts of a buffer (for ex, sub-allocation). There needs to
> be cache-line separation between the two.

There is at least one compute conformance test that I can think of that does
exactly this.

Jordan

> BR,
> -R

--
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

2020-11-16 20:49:04

by Jordan Crouse

Subject: Re: [Freedreno] [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> This makes it possible to use the non-coherent cached MSM_BO_CACHED mode,
> which otherwise doesn't provide any method for cleaning/invalidating the
> cache to sync with the device.
>
> Signed-off-by: Jonathan Marek <[email protected]>
> ---
> drivers/gpu/drm/msm/msm_drv.c | 21 +++++++++++++++++++++
> drivers/gpu/drm/msm/msm_drv.h | 2 ++
> drivers/gpu/drm/msm/msm_gem.c | 23 +++++++++++++++++++++++
> include/uapi/drm/msm_drm.h | 20 ++++++++++++++++++++
> 4 files changed, 66 insertions(+)
>
> diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
> index bae48afca82e..3f17acdf6594 100644
> --- a/drivers/gpu/drm/msm/msm_drv.c
> +++ b/drivers/gpu/drm/msm/msm_drv.c
> @@ -959,6 +959,26 @@ static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data,
> return msm_submitqueue_remove(file->driver_priv, id);
> }
>
> +static int msm_ioctl_gem_sync_cache(struct drm_device *dev, void *data,
> + struct drm_file *file)
> +{
> + struct drm_msm_gem_sync_cache *args = data;
> + struct drm_gem_object *obj;
> +
> + if (args->flags & ~MSM_GEM_SYNC_CACHE_FLAGS)
> + return -EINVAL;
> +
> + obj = drm_gem_object_lookup(file, args->handle);
> + if (!obj)
> + return -ENOENT;
> +
> + msm_gem_sync_cache(obj, args->flags, args->offset, args->end);
> +
> + drm_gem_object_put(obj);
> +
> + return 0;
> +}
> +
> static const struct drm_ioctl_desc msm_ioctls[] = {
> DRM_IOCTL_DEF_DRV(MSM_GET_PARAM, msm_ioctl_get_param, DRM_RENDER_ALLOW),
> DRM_IOCTL_DEF_DRV(MSM_GEM_NEW, msm_ioctl_gem_new, DRM_RENDER_ALLOW),
> @@ -971,6 +991,7 @@ static const struct drm_ioctl_desc msm_ioctls[] = {
> DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_NEW, msm_ioctl_submitqueue_new, DRM_RENDER_ALLOW),
> DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_CLOSE, msm_ioctl_submitqueue_close, DRM_RENDER_ALLOW),
> DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_QUERY, msm_ioctl_submitqueue_query, DRM_RENDER_ALLOW),
> + DRM_IOCTL_DEF_DRV(MSM_GEM_SYNC_CACHE, msm_ioctl_gem_sync_cache, DRM_RENDER_ALLOW),
> };
>
> static const struct vm_operations_struct vm_ops = {
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index 22ebecb28349..f170f843010e 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -318,6 +318,8 @@ void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu);
> void msm_gem_active_put(struct drm_gem_object *obj);
> int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout);
> int msm_gem_cpu_fini(struct drm_gem_object *obj);
> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> + size_t range_start, size_t range_end);
> void msm_gem_free_object(struct drm_gem_object *obj);
> int msm_gem_new_handle(struct drm_device *dev, struct drm_file *file,
> uint32_t size, uint32_t flags, uint32_t *handle, char *name);
> diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
> index 3d8254b5de16..039738696f9a 100644
> --- a/drivers/gpu/drm/msm/msm_gem.c
> +++ b/drivers/gpu/drm/msm/msm_gem.c
> @@ -797,6 +797,29 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj)
> return 0;
> }
>
> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> + size_t range_start, size_t range_end)
> +{
> + struct msm_gem_object *msm_obj = to_msm_bo(obj);
> + struct device *dev = msm_obj->base.dev->dev;
> +

On a6xx targets with I/O coherency you can skip this (assuming that the IOMMU
flag has been set for this buffer). I'm not sure if one of the other patches
already does the bypass but I mention it here since this is the point of
no-return.
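
Something like an early return here would do it (sketch; I'm assuming the
coherent BO flag name from earlier in this series):

	if (msm_obj->flags & MSM_BO_CACHED_COHERENT)
		return;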

Jordan

> + /* exit early if get_pages() hasn't been called yet */
> + if (!msm_obj->pages)
> + return;
> +
> + /* TODO: sync only the specified range */
> +
> + if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> + dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> + msm_obj->sgt->nents, DMA_TO_DEVICE);
> + }
> +
> + if (flags & MSM_GEM_SYNC_FOR_CPU) {
> + dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> + msm_obj->sgt->nents, DMA_FROM_DEVICE);
> + }
> +}
> +
> #ifdef CONFIG_DEBUG_FS
> static void describe_fence(struct dma_fence *fence, const char *type,
> struct seq_file *m)
> diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
> index 474497e8743a..c8288f328528 100644
> --- a/include/uapi/drm/msm_drm.h
> +++ b/include/uapi/drm/msm_drm.h
> @@ -319,6 +319,24 @@ struct drm_msm_submitqueue_query {
> __u32 pad;
> };
>
> +/*
> + * Host cache maintenance (relevant for MSM_BO_CACHED)
> + * note: the driver may both clean and invalidate (flush) where only a clean is requested
> + */
> +
> +#define MSM_GEM_SYNC_FOR_DEVICE 0x1
> +#define MSM_GEM_SYNC_FOR_CPU 0x2
> +
> +#define MSM_GEM_SYNC_CACHE_FLAGS (MSM_GEM_SYNC_FOR_DEVICE | \
> + MSM_GEM_SYNC_FOR_CPU)
> +
> +struct drm_msm_gem_sync_cache {
> + __u32 handle;
> + __u32 flags;
> + __u64 offset;
> + __u64 end; /* offset + size */
> +};
> +
> #define DRM_MSM_GET_PARAM 0x00
> /* placeholder:
> #define DRM_MSM_SET_PARAM 0x01
> @@ -336,6 +354,7 @@ struct drm_msm_submitqueue_query {
> #define DRM_MSM_SUBMITQUEUE_NEW 0x0A
> #define DRM_MSM_SUBMITQUEUE_CLOSE 0x0B
> #define DRM_MSM_SUBMITQUEUE_QUERY 0x0C
> +#define DRM_MSM_GEM_SYNC_CACHE 0x0D
>
> #define DRM_IOCTL_MSM_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param)
> #define DRM_IOCTL_MSM_GEM_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new)
> @@ -348,6 +367,7 @@ struct drm_msm_submitqueue_query {
> #define DRM_IOCTL_MSM_SUBMITQUEUE_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue)
> #define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32)
> #define DRM_IOCTL_MSM_SUBMITQUEUE_QUERY DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_QUERY, struct drm_msm_submitqueue_query)
> +#define DRM_IOCTL_MSM_GEM_SYNC_CACHE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_SYNC_CACHE, struct drm_msm_gem_sync_cache)
>
> #if defined(__cplusplus)
> }
> --
> 2.26.1
>

--
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

2020-11-17 01:58:23

by Jonathan Marek

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On 11/16/20 12:50 PM, Rob Clark wrote:
> On Mon, Nov 16, 2020 at 9:33 AM Christoph Hellwig <[email protected]> wrote:
>>
>> On Sat, Nov 14, 2020 at 03:07:20PM -0500, Jonathan Marek wrote:
>>> qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
>>> dma_sync_single_for_cpu() does deal in some way with the partial cache line
>>> case, although I'm not sure that means we can have a nonCoherentAtomSize=1.
>>
>> No, it doesn't. You need to ensure ownership is managed at
>> dma_get_cache_alignment() granularity.
>
> my guess is nonCoherentAtomSize=1 only works in the case of cache
> coherent buffers
>

nonCoherentAtomSize doesn't apply to coherent memory (as the name
implies), so I guess qcom's driver is just wrong about having
nonCoherentAtomSize=1.

Jordan just mentioned there is at least one conformance test for this; I
wonder if it just doesn't test it well enough, or just doesn't test the
non-coherent memory type?

2020-11-29 18:52:44

by Rob Clark

Subject: Re: [RESEND PATCH v2 4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

On Mon, Nov 16, 2020 at 9:55 AM Jonathan Marek <[email protected]> wrote:
>
> On 11/16/20 12:50 PM, Rob Clark wrote:
> > On Mon, Nov 16, 2020 at 9:33 AM Christoph Hellwig <[email protected]> wrote:
> >>
> >> On Sat, Nov 14, 2020 at 03:07:20PM -0500, Jonathan Marek wrote:
> >>> qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
> >>> dma_sync_single_for_cpu() does deal in some way with the partial cache line
> >>> case, although I'm not sure that means we can have a nonCoherentAtomSize=1.
> >>
> >> No, it doesn't. You need to ensure ownership is managed at
> >> dma_get_cache_alignment() granularity.
> >
> > my guess is nonCoherentAtomSize=1 only works in the case of cache
> > coherent buffers
> >
>
> nonCoherentAtomSize doesn't apply to coherent memory (as the name
> implies), I guess qcom's driver is just wrong about having
> nonCoherentAtomSize=1.
>
> Jordan just mentioned there is at least one conformance test for this, I
> wonder if it just doesn't test it well enough, or just doesn't test the
> non-coherent memory type?

I was *assuming* (but could be wrong) that Jordan was referring to an
opencl cts test?

At any rate, it is sounding like you should add a
`MSM_PARAM_CACHE_ALIGNMENT` type of param that returns
dma_get_cache_alignment(), and then properly implement offset/end
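
e.g. something like this in the get_param path (untested sketch, the param
name is just a suggestion):

	case MSM_PARAM_CACHE_ALIGNMENT:
		*value = dma_get_cache_alignment();
		return 0;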

BR,
-R