2024-01-26 13:56:31

by Alexander Lobakin

[permalink] [raw]
Subject: [PATCH net-next 1/7] dma: compile-out DMA sync op calls when not used

Some platforms do have DMA, but DMA there is always direct and coherent.
Currently, even on such platforms DMA sync operations are compiled and
called.
Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when
either sync operations are needed or there is DMA ops or swiotlb
enabled. Set dma_need_sync() and dma_skip_sync() (stub for now)
depending on this symbol state and don't call sync ops when
dma_skip_sync() is true.
The change allows for future optimizations of DMA sync calls depending
on compile-time or runtime conditions.

Signed-off-by: Alexander Lobakin <[email protected]>
---
kernel/dma/Kconfig | 4 ++
include/linux/dma-mapping.h | 92 +++++++++++++++++++++++++------------
kernel/dma/mapping.c | 26 ++++++-----
3 files changed, 81 insertions(+), 41 deletions(-)

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index d62f5957f36b..1c9ff05b1ecb 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -107,6 +107,10 @@ config DMA_BOUNCE_UNALIGNED_KMALLOC
bool
depends on SWIOTLB

+config DMA_NEED_SYNC
+ def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \
+ ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_OPS || SWIOTLB
+
config DMA_RESTRICTED_POOL
bool "DMA Restricted Pool"
depends on OF && OF_RESERVED_MEM && SWIOTLB
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 4a658de44ee9..9dd7e1578bf6 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -117,14 +117,14 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs);
void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
enum dma_data_direction dir, unsigned long attrs);
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
- enum dma_data_direction dir);
-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir);
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir);
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir);
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir);
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir);
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir);
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir);
void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t flag, unsigned long attrs);
void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
@@ -147,7 +147,6 @@ u64 dma_get_required_mask(struct device *dev);
bool dma_addressing_limited(struct device *dev);
size_t dma_max_mapping_size(struct device *dev);
size_t dma_opt_mapping_size(struct device *dev);
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
unsigned long dma_get_merge_boundary(struct device *dev);
struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
enum dma_data_direction dir, gfp_t gfp, unsigned long attrs);
@@ -195,20 +194,24 @@ static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
}
-static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir)
+static inline void __dma_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size,
+ enum dma_data_direction dir)
{
}
-static inline void dma_sync_single_for_device(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+static inline void __dma_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size,
+ enum dma_data_direction dir)
{
}
-static inline void dma_sync_sg_for_cpu(struct device *dev,
- struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+static inline void __dma_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sg, int nelems,
+ enum dma_data_direction dir)
{
}
-static inline void dma_sync_sg_for_device(struct device *dev,
- struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+static inline void __dma_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sg, int nelems,
+ enum dma_data_direction dir)
{
}
static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
@@ -277,10 +280,6 @@ static inline size_t dma_opt_mapping_size(struct device *dev)
{
return 0;
}
-static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
-{
- return false;
-}
static inline unsigned long dma_get_merge_boundary(struct device *dev)
{
return 0;
@@ -348,20 +347,55 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
return dma_unmap_page_attrs(dev, addr, size, dir, attrs);
}

-static inline void dma_sync_single_range_for_cpu(struct device *dev,
- dma_addr_t addr, unsigned long offset, size_t size,
- enum dma_data_direction dir)
+static inline void
+__dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t addr,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir)
{
- return dma_sync_single_for_cpu(dev, addr + offset, size, dir);
+ __dma_sync_single_for_cpu(dev, addr + offset, size, dir);
}

-static inline void dma_sync_single_range_for_device(struct device *dev,
- dma_addr_t addr, unsigned long offset, size_t size,
- enum dma_data_direction dir)
+static inline void
+__dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir)
{
- return dma_sync_single_for_device(dev, addr + offset, size, dir);
+ __dma_sync_single_for_device(dev, addr + offset, size, dir);
}

+#ifdef CONFIG_DMA_NEED_SYNC
+
+#define dma_skip_sync(dev) false
+
+bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
+
+#else /* !CONFIG_DMA_NEED_SYNC */
+
+#define dma_skip_sync(dev) true
+#define dma_need_sync(dev, dma_addr) false
+
+#endif /* !CONFIG_DMA_NEED_SYNC */
+
+#define dma_check_sync(op, dev, ...) \
+ do { \
+ if (!dma_skip_sync(dev)) \
+ op(dev, __VA_ARGS__); \
+ } while (0)
+
+#define dma_sync_single_for_cpu(d, a, s, r) \
+ dma_check_sync(__dma_sync_single_for_cpu, d, a, s, r)
+#define dma_sync_single_for_device(d, a, s, r) \
+ dma_check_sync(__dma_sync_single_for_device, d, a, s, r)
+#define dma_sync_sg_for_cpu(d, s, n, r) \
+ dma_check_sync(__dma_sync_sg_for_cpu, d, s, n, r)
+#define dma_sync_sg_for_device(d, s, n, r) \
+ dma_check_sync(__dma_sync_sg_for_device, d, s, n, r)
+
+#define dma_sync_single_range_for_cpu(d, a, o, s, r) \
+ dma_check_sync(__dma_sync_single_range_for_cpu, d, a, o, s, r)
+#define dma_sync_single_range_for_device(d, a, o, s, r) \
+ dma_check_sync(__dma_sync_single_range_for_device, d, a, o, s, r)
+
/**
* dma_unmap_sgtable - Unmap the given buffer for DMA
* @dev: The device for which to perform the DMA operation
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58db8fd70471..a30f37f9d4db 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -329,8 +329,8 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
}
EXPORT_SYMBOL(dma_unmap_resource);

-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
- enum dma_data_direction dir)
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
{
const struct dma_map_ops *ops = get_dma_ops(dev);

@@ -341,10 +341,10 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
ops->sync_single_for_cpu(dev, addr, size, dir);
debug_dma_sync_single_for_cpu(dev, addr, size, dir);
}
-EXPORT_SYMBOL(dma_sync_single_for_cpu);
+EXPORT_SYMBOL(__dma_sync_single_for_cpu);

-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir)
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
{
const struct dma_map_ops *ops = get_dma_ops(dev);

@@ -355,10 +355,10 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
ops->sync_single_for_device(dev, addr, size, dir);
debug_dma_sync_single_for_device(dev, addr, size, dir);
}
-EXPORT_SYMBOL(dma_sync_single_for_device);
+EXPORT_SYMBOL(__dma_sync_single_for_device);

-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir)
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
{
const struct dma_map_ops *ops = get_dma_ops(dev);

@@ -369,10 +369,10 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
ops->sync_sg_for_cpu(dev, sg, nelems, dir);
debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
}
-EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+EXPORT_SYMBOL(__dma_sync_sg_for_cpu);

-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir)
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
{
const struct dma_map_ops *ops = get_dma_ops(dev);

@@ -383,7 +383,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
ops->sync_sg_for_device(dev, sg, nelems, dir);
debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
}
-EXPORT_SYMBOL(dma_sync_sg_for_device);
+EXPORT_SYMBOL(__dma_sync_sg_for_device);

/*
* The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
@@ -841,6 +841,7 @@ size_t dma_opt_mapping_size(struct device *dev)
}
EXPORT_SYMBOL_GPL(dma_opt_mapping_size);

+#ifdef CONFIG_DMA_NEED_SYNC
bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -850,6 +851,7 @@ bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
return ops->sync_single_for_cpu || ops->sync_single_for_device;
}
EXPORT_SYMBOL_GPL(dma_need_sync);
+#endif /* CONFIG_DMA_NEED_SYNC */

unsigned long dma_get_merge_boundary(struct device *dev)
{
--
2.43.0



2024-01-29 06:11:59

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH net-next 1/7] dma: compile-out DMA sync op calls when not used

On Fri, Jan 26, 2024 at 02:54:50PM +0100, Alexander Lobakin wrote:
> Some platforms do have DMA, but DMA there is always direct and coherent.
> Currently, even on such platforms DMA sync operations are compiled and
> called.
> Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when
> either sync operations are needed or there is DMA ops or swiotlb
> enabled. Set dma_need_sync() and dma_skip_sync() (stub for now)
> depending on this symbol state and don't call sync ops when
> dma_skip_sync() is true.
> The change allows for future optimizations of DMA sync calls depending
> on compile-time or runtime conditions.

So the idea of compiling out the calls sounds fine to me. But what
is the point of the extra indirection through the __-prefixed calls?

And if we need that (please document it in the commit log), please
make the wrappers proper inline functions and not macros.


2024-01-29 11:09:23

by Alexander Lobakin

[permalink] [raw]
Subject: Re: [PATCH net-next 1/7] dma: compile-out DMA sync op calls when not used

From: Christoph Hellwig <[email protected]>
Date: Mon, 29 Jan 2024 07:11:36 +0100

> On Fri, Jan 26, 2024 at 02:54:50PM +0100, Alexander Lobakin wrote:
>> Some platforms do have DMA, but DMA there is always direct and coherent.
>> Currently, even on such platforms DMA sync operations are compiled and
>> called.
>> Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when
>> either sync operations are needed or there is DMA ops or swiotlb
>> enabled. Set dma_need_sync() and dma_skip_sync() (stub for now)
>> depending on this symbol state and don't call sync ops when
>> dma_skip_sync() is true.
>> The change allows for future optimizations of DMA sync calls depending
>> on compile-time or runtime conditions.
>
> So the idea of compiling out the calls sounds fine to me. But what
> is the point of the extra indirection through the __-prefixed calls?

Because dma_sync_* ops are external functions, not inlines, and in the
next patch I'm adding a check there.

>
> And if we need that (please document it in the commit log), please
> make the wrappers proper inline functions and not macros.

Thanks,
Olek

2024-01-29 12:15:33

by Robin Murphy

[permalink] [raw]
Subject: Re: [PATCH net-next 1/7] dma: compile-out DMA sync op calls when not used

On 2024-01-29 11:07 am, Alexander Lobakin wrote:
> From: Christoph Hellwig <[email protected]>
> Date: Mon, 29 Jan 2024 07:11:36 +0100
>
>> On Fri, Jan 26, 2024 at 02:54:50PM +0100, Alexander Lobakin wrote:
>>> Some platforms do have DMA, but DMA there is always direct and coherent.
>>> Currently, even on such platforms DMA sync operations are compiled and
>>> called.
>>> Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when
>>> either sync operations are needed or there is DMA ops or swiotlb
>>> enabled. Set dma_need_sync() and dma_skip_sync() (stub for now)
>>> depending on this symbol state and don't call sync ops when
>>> dma_skip_sync() is true.
>>> The change allows for future optimizations of DMA sync calls depending
>>> on compile-time or runtime conditions.
>>
>> So the idea of compiling out the calls sounds fine to me. But what
>> is the point of the extra indirection through the __-prefixed calls?
>
> Because dma_sync_* ops are external functions, not inlines, and in the
> next patch I'm adding a check there.
>
>>
>> And if we need that (please document it in the commit log), please
>> make the wrappers proper inline functions and not macros.

In fact those wrappers could perhaps subsume the existing stub
definitions, by starting with a refactor along these lines:

static inline dma_sync_x(...)
{
if (IS_ENABLED(CONFIG_NEED_DMA_SYNC))
__dma_sync_x(...);
}

Cheers,
Robin.

2024-01-31 16:56:53

by Simon Horman

[permalink] [raw]
Subject: Re: [PATCH net-next 1/7] dma: compile-out DMA sync op calls when not used

On Fri, Jan 26, 2024 at 02:54:50PM +0100, Alexander Lobakin wrote:
> Some platforms do have DMA, but DMA there is always direct and coherent.
> Currently, even on such platforms DMA sync operations are compiled and
> called.
> Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when
> either sync operations are needed or there is DMA ops or swiotlb
> enabled. Set dma_need_sync() and dma_skip_sync() (stub for now)
> depending on this symbol state and don't call sync ops when
> dma_skip_sync() is true.
> The change allows for future optimizations of DMA sync calls depending
> on compile-time or runtime conditions.
>
> Signed-off-by: Alexander Lobakin <[email protected]>

Hi Alexander,

This seems to cause x86_64 allmodconfig builds to fail:

../drivers/media/platform/ti/omap3isp/ispstat.c:82:35: error: ‘dma_sync_single_range_for_device’ undeclared (first use in this function); did you mean ‘dma_sync_sgtable_for_device’?
82 | dma_sync_single_range_for_device);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_sync_sgtable_for_device
../drivers/media/platform/ti/omap3isp/ispstat.c:82:35: note: each undeclared identifier is reported only once for each function it appears in
../drivers/media/platform/ti/omap3isp/ispstat.c: In function ‘isp_stat_buf_sync_magic_for_cpu’:
../drivers/media/platform/ti/omap3isp/ispstat.c:94:35: error: ‘dma_sync_single_range_for_cpu’ undeclared (first use in this function); did you mean ‘dma_sync_sgtable_for_cpu’?
94 | dma_sync_single_range_for_cpu);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_sync_sgtable_for_cpu

2024-01-31 17:14:30

by Robin Murphy

[permalink] [raw]
Subject: Re: [PATCH net-next 1/7] dma: compile-out DMA sync op calls when not used

On 31/01/2024 4:52 pm, Simon Horman wrote:
> On Fri, Jan 26, 2024 at 02:54:50PM +0100, Alexander Lobakin wrote:
>> Some platforms do have DMA, but DMA there is always direct and coherent.
>> Currently, even on such platforms DMA sync operations are compiled and
>> called.
>> Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when
>> either sync operations are needed or there is DMA ops or swiotlb
>> enabled. Set dma_need_sync() and dma_skip_sync() (stub for now)
>> depending on this symbol state and don't call sync ops when
>> dma_skip_sync() is true.
>> The change allows for future optimizations of DMA sync calls depending
>> on compile-time or runtime conditions.
>>
>> Signed-off-by: Alexander Lobakin <[email protected]>
>
> Hi Alexander,
>
> This seems to cause x86_64 allmodconfig builds to fail:

Oh yeah, the sync_single_range definitions shouldn't need touching at
all, since they're unconditional wrappers around regular sync_single
invocations (which already may or may not do anything).

Thanks,
Robin.

>
> ../drivers/media/platform/ti/omap3isp/ispstat.c:82:35: error: ‘dma_sync_single_range_for_device’ undeclared (first use in this function); did you mean ‘dma_sync_sgtable_for_device’?
> 82 | dma_sync_single_range_for_device);
> | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> | dma_sync_sgtable_for_device
> ../drivers/media/platform/ti/omap3isp/ispstat.c:82:35: note: each undeclared identifier is reported only once for each function it appears in
> ../drivers/media/platform/ti/omap3isp/ispstat.c: In function ‘isp_stat_buf_sync_magic_for_cpu’:
> ../drivers/media/platform/ti/omap3isp/ispstat.c:94:35: error: ‘dma_sync_single_range_for_cpu’ undeclared (first use in this function); did you mean ‘dma_sync_sgtable_for_cpu’?
> 94 | dma_sync_single_range_for_cpu);
> | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> | dma_sync_sgtable_for_cpu