2024-01-26 13:56:52

by Alexander Lobakin

[permalink] [raw]
Subject: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

From: Eric Dumazet <[email protected]>

Quite often, NIC devices do not need dma_sync operations on x86_64
at least.
Indeed, when dev_is_dma_coherent(dev) is true and
dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
and friends do nothing.

However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.

Add dev->skip_dma_sync boolean which is set during the device
initialization depending on the setup: dev_is_dma_coherent() for direct
DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
Then later, if/when swiotlb is used for the first time, the flag
is turned off, from swiotlb_tbl_map_single().

On iavf, the UDP trafficgen with XDP_DROP in skb mode test shows
+3-5% increase for direct DMA.

Signed-off-by: Eric Dumazet <[email protected]>
Co-developed-by: Alexander Lobakin <[email protected]>
Signed-off-by: Alexander Lobakin <[email protected]>
---
include/linux/device.h | 5 +++++
include/linux/dma-map-ops.h | 17 +++++++++++++++++
include/linux/dma-mapping.h | 12 ++++++++++--
drivers/base/dd.c | 2 ++
kernel/dma/mapping.c | 34 +++++++++++++++++++++++++++++++---
kernel/dma/swiotlb.c | 14 ++++++++++++++
6 files changed, 79 insertions(+), 5 deletions(-)

diff --git a/include/linux/device.h b/include/linux/device.h
index 97c4b046c09d..f23e6a32bea0 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -686,6 +686,8 @@ struct device_physical_location {
* other devices probe successfully.
* @dma_coherent: this particular device is dma coherent, even if the
* architecture supports non-coherent devices.
+ * @dma_skip_sync: DMA sync operations can be skipped for coherent non-SWIOTLB
+ * buffers.
* @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
* streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
* and optionall (if the coherent mask is large enough) also
@@ -800,6 +802,9 @@ struct device {
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
bool dma_coherent:1;
#endif
+#ifdef CONFIG_DMA_NEED_SYNC
+ bool dma_skip_sync:1;
+#endif
#ifdef CONFIG_DMA_OPS_BYPASS
bool dma_ops_bypass : 1;
#endif
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4abc60f04209..937c295e9da8 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -78,6 +78,7 @@ struct dma_map_ops {
int nents, enum dma_data_direction dir);
void (*cache_sync)(struct device *dev, void *vaddr, size_t size,
enum dma_data_direction direction);
+ bool (*can_skip_sync)(struct device *dev);
int (*dma_supported)(struct device *dev, u64 mask);
u64 (*get_required_mask)(struct device *dev);
size_t (*max_mapping_size)(struct device *dev);
@@ -111,6 +112,22 @@ static inline void set_dma_ops(struct device *dev,
}
#endif /* CONFIG_DMA_OPS */

+#ifdef CONFIG_DMA_NEED_SYNC
+
+static inline void dma_set_skip_sync(struct device *dev, bool skip)
+{
+ dev->dma_skip_sync = skip;
+}
+
+void dma_setup_skip_sync(struct device *dev);
+
+#else /* !CONFIG_DMA_NEED_SYNC */
+
+#define dma_set_skip_sync(dev, skip) do { } while (0)
+#define dma_setup_skip_sync(dev) do { } while (0)
+
+#endif /* !CONFIG_DMA_NEED_SYNC */
+
#ifdef CONFIG_DMA_CMA
extern struct cma *dma_contiguous_default_area;

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 9dd7e1578bf6..bc9f67e0c139 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -365,9 +365,17 @@ __dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr,

#ifdef CONFIG_DMA_NEED_SYNC

-#define dma_skip_sync(dev) false
+static inline bool dma_skip_sync(const struct device *dev)
+{
+ return dev->dma_skip_sync;
+}
+
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr);

-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
+static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_skip_sync(dev) ? false : __dma_need_sync(dev, dma_addr);
+}

#else /* !CONFIG_DMA_NEED_SYNC */

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 85152537dbf1..67ad3e1d51f6 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -642,6 +642,8 @@ static int really_probe(struct device *dev, struct device_driver *drv)
goto pinctrl_bind_failed;
}

+ dma_setup_skip_sync(dev);
+
ret = driver_sysfs_add(dev);
if (ret) {
pr_err("%s: driver_sysfs_add(%s) failed\n",
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a30f37f9d4db..8fa464b3954e 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -842,15 +842,43 @@ size_t dma_opt_mapping_size(struct device *dev)
EXPORT_SYMBOL_GPL(dma_opt_mapping_size);

#ifdef CONFIG_DMA_NEED_SYNC
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
const struct dma_map_ops *ops = get_dma_ops(dev);

if (dma_map_direct(dev, ops))
+ /*
+ * dma_skip_sync could've been set to false on first SWIOTLB
+ * buffer mapping, but @dma_addr is not necessary an SWIOTLB
+ * buffer. In this case, fall back to more granular check.
+ */
return dma_direct_need_sync(dev, dma_addr);
- return ops->sync_single_for_cpu || ops->sync_single_for_device;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(__dma_need_sync);
+
+void dma_setup_skip_sync(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ bool skip;
+
+ if (dma_map_direct(dev, ops))
+ /*
+ * dma_skip_sync will be set to false on first SWIOTLB buffer
+ * mapping, if any. During the device initialization, it's
+ * enough to check only for DMA coherence.
+ */
+ skip = dev_is_dma_coherent(dev);
+ else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu)
+ skip = true;
+ else if (ops->can_skip_sync)
+ skip = ops->can_skip_sync(dev);
+ else
+ skip = false;
+
+ dma_set_skip_sync(dev, skip);
}
-EXPORT_SYMBOL_GPL(dma_need_sync);
#endif /* CONFIG_DMA_NEED_SYNC */

unsigned long dma_get_merge_boundary(struct device *dev)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index b079a9a8e087..b62ea0a4f106 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1286,6 +1286,16 @@ static unsigned long mem_used(struct io_tlb_mem *mem)

#endif /* CONFIG_DEBUG_FS */

+static inline void swiotlb_disable_dma_skip_sync(struct device *dev)
+{
+ /*
+ * If dma_skip_sync was set, reset it to false on first SWIOTLB buffer
+ * mapping/allocation to always sync SWIOTLB buffers.
+ */
+ if (unlikely(dma_skip_sync(dev)))
+ dma_set_skip_sync(dev, false);
+}
+
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
unsigned int alloc_align_mask, enum dma_data_direction dir,
@@ -1323,6 +1333,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}

+ swiotlb_disable_dma_skip_sync(dev);
+
/*
* Save away the mapping from the original address to the DMA address.
* This is needed when we sync the memory. Then we sync the buffer if
@@ -1640,6 +1652,8 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
if (index == -1)
return NULL;

+ swiotlb_disable_dma_skip_sync(dev);
+
tlb_addr = slot_addr(pool->start, index);

return pfn_to_page(PFN_DOWN(tlb_addr));
--
2.43.0



2024-01-26 15:49:19

by Robin Murphy

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
> From: Eric Dumazet <[email protected]>
>
> Quite often, NIC devices do not need dma_sync operations on x86_64
> at least.
> Indeed, when dev_is_dma_coherent(dev) is true and
> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> and friends do nothing.
>
> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>
> Add dev->skip_dma_sync boolean which is set during the device
> initialization depending on the setup: dev_is_dma_coherent() for direct
> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
> Then later, if/when swiotlb is used for the first time, the flag
> is turned off, from swiotlb_tbl_map_single().

I think you could probably just promote the dma_uses_io_tlb flag from
SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.

Similarly I don't think a new op is necessary now that we have
dma_map_ops.flags. A simple static flag to indicate that sync may be
skipped under the same conditions as implied for dma-direct - i.e.
dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
to suffice.

Thanks,
Robin.

> On iavf, the UDP trafficgen with XDP_DROP in skb mode test shows
> +3-5% increase for direct DMA.
>
> Signed-off-by: Eric Dumazet <[email protected]>
> Co-developed-by: Alexander Lobakin <[email protected]>
> Signed-off-by: Alexander Lobakin <[email protected]>
> ---
> include/linux/device.h | 5 +++++
> include/linux/dma-map-ops.h | 17 +++++++++++++++++
> include/linux/dma-mapping.h | 12 ++++++++++--
> drivers/base/dd.c | 2 ++
> kernel/dma/mapping.c | 34 +++++++++++++++++++++++++++++++---
> kernel/dma/swiotlb.c | 14 ++++++++++++++
> 6 files changed, 79 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/device.h b/include/linux/device.h
> index 97c4b046c09d..f23e6a32bea0 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -686,6 +686,8 @@ struct device_physical_location {
> * other devices probe successfully.
> * @dma_coherent: this particular device is dma coherent, even if the
> * architecture supports non-coherent devices.
> + * @dma_skip_sync: DMA sync operations can be skipped for coherent non-SWIOTLB
> + * buffers.
> * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
> * streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
> * and optionall (if the coherent mask is large enough) also
> @@ -800,6 +802,9 @@ struct device {
> defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
> bool dma_coherent:1;
> #endif
> +#ifdef CONFIG_DMA_NEED_SYNC
> + bool dma_skip_sync:1;
> +#endif
> #ifdef CONFIG_DMA_OPS_BYPASS
> bool dma_ops_bypass : 1;
> #endif
> diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> index 4abc60f04209..937c295e9da8 100644
> --- a/include/linux/dma-map-ops.h
> +++ b/include/linux/dma-map-ops.h
> @@ -78,6 +78,7 @@ struct dma_map_ops {
> int nents, enum dma_data_direction dir);
> void (*cache_sync)(struct device *dev, void *vaddr, size_t size,
> enum dma_data_direction direction);
> + bool (*can_skip_sync)(struct device *dev);
> int (*dma_supported)(struct device *dev, u64 mask);
> u64 (*get_required_mask)(struct device *dev);
> size_t (*max_mapping_size)(struct device *dev);
> @@ -111,6 +112,22 @@ static inline void set_dma_ops(struct device *dev,
> }
> #endif /* CONFIG_DMA_OPS */
>
> +#ifdef CONFIG_DMA_NEED_SYNC
> +
> +static inline void dma_set_skip_sync(struct device *dev, bool skip)
> +{
> + dev->dma_skip_sync = skip;
> +}
> +
> +void dma_setup_skip_sync(struct device *dev);
> +
> +#else /* !CONFIG_DMA_NEED_SYNC */
> +
> +#define dma_set_skip_sync(dev, skip) do { } while (0)
> +#define dma_setup_skip_sync(dev) do { } while (0)
> +
> +#endif /* !CONFIG_DMA_NEED_SYNC */
> +
> #ifdef CONFIG_DMA_CMA
> extern struct cma *dma_contiguous_default_area;
>
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 9dd7e1578bf6..bc9f67e0c139 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -365,9 +365,17 @@ __dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr,
>
> #ifdef CONFIG_DMA_NEED_SYNC
>
> -#define dma_skip_sync(dev) false
> +static inline bool dma_skip_sync(const struct device *dev)
> +{
> + return dev->dma_skip_sync;
> +}
> +
> +bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr);
>
> -bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
> +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
> +{
> + return dma_skip_sync(dev) ? false : __dma_need_sync(dev, dma_addr);
> +}
>
> #else /* !CONFIG_DMA_NEED_SYNC */
>
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 85152537dbf1..67ad3e1d51f6 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -642,6 +642,8 @@ static int really_probe(struct device *dev, struct device_driver *drv)
> goto pinctrl_bind_failed;
> }
>
> + dma_setup_skip_sync(dev);
> +
> ret = driver_sysfs_add(dev);
> if (ret) {
> pr_err("%s: driver_sysfs_add(%s) failed\n",
> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> index a30f37f9d4db..8fa464b3954e 100644
> --- a/kernel/dma/mapping.c
> +++ b/kernel/dma/mapping.c
> @@ -842,15 +842,43 @@ size_t dma_opt_mapping_size(struct device *dev)
> EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
>
> #ifdef CONFIG_DMA_NEED_SYNC
> -bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
> +bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
> {
> const struct dma_map_ops *ops = get_dma_ops(dev);
>
> if (dma_map_direct(dev, ops))
> + /*
> + * dma_skip_sync could've been set to false on first SWIOTLB
> + * buffer mapping, but @dma_addr is not necessary an SWIOTLB
> + * buffer. In this case, fall back to more granular check.
> + */
> return dma_direct_need_sync(dev, dma_addr);
> - return ops->sync_single_for_cpu || ops->sync_single_for_device;
> +
> + return true;
> +}
> +EXPORT_SYMBOL_GPL(__dma_need_sync);
> +
> +void dma_setup_skip_sync(struct device *dev)
> +{
> + const struct dma_map_ops *ops = get_dma_ops(dev);
> + bool skip;
> +
> + if (dma_map_direct(dev, ops))
> + /*
> + * dma_skip_sync will be set to false on first SWIOTLB buffer
> + * mapping, if any. During the device initialization, it's
> + * enough to check only for DMA coherence.
> + */
> + skip = dev_is_dma_coherent(dev);
> + else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu)
> + skip = true;
> + else if (ops->can_skip_sync)
> + skip = ops->can_skip_sync(dev);
> + else
> + skip = false;
> +
> + dma_set_skip_sync(dev, skip);
> }
> -EXPORT_SYMBOL_GPL(dma_need_sync);
> #endif /* CONFIG_DMA_NEED_SYNC */
>
> unsigned long dma_get_merge_boundary(struct device *dev)
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index b079a9a8e087..b62ea0a4f106 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -1286,6 +1286,16 @@ static unsigned long mem_used(struct io_tlb_mem *mem)
>
> #endif /* CONFIG_DEBUG_FS */
>
> +static inline void swiotlb_disable_dma_skip_sync(struct device *dev)
> +{
> + /*
> + * If dma_skip_sync was set, reset it to false on first SWIOTLB buffer
> + * mapping/allocation to always sync SWIOTLB buffers.
> + */
> + if (unlikely(dma_skip_sync(dev)))
> + dma_set_skip_sync(dev, false);
> +}
> +
> phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> size_t mapping_size, size_t alloc_size,
> unsigned int alloc_align_mask, enum dma_data_direction dir,
> @@ -1323,6 +1333,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> return (phys_addr_t)DMA_MAPPING_ERROR;
> }
>
> + swiotlb_disable_dma_skip_sync(dev);
> +
> /*
> * Save away the mapping from the original address to the DMA address.
> * This is needed when we sync the memory. Then we sync the buffer if
> @@ -1640,6 +1652,8 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
> if (index == -1)
> return NULL;
>
> + swiotlb_disable_dma_skip_sync(dev);
> +
> tlb_addr = slot_addr(pool->start, index);
>
> return pfn_to_page(PFN_DOWN(tlb_addr));

2024-01-26 16:45:45

by Alexander Lobakin

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

From: Robin Murphy <[email protected]>
Date: Fri, 26 Jan 2024 15:48:54 +0000

> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>> From: Eric Dumazet <[email protected]>
>>
>> Quite often, NIC devices do not need dma_sync operations on x86_64
>> at least.
>> Indeed, when dev_is_dma_coherent(dev) is true and
>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>> and friends do nothing.
>>
>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>
>> Add dev->skip_dma_sync boolean which is set during the device
>> initialization depending on the setup: dev_is_dma_coherent() for direct
>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>> Then later, if/when swiotlb is used for the first time, the flag
>> is turned off, from swiotlb_tbl_map_single().
>
> I think you could probably just promote the dma_uses_io_tlb flag from
> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.

Nice catch!

>
> Similarly I don't think a new op is necessary now that we have
> dma_map_ops.flags. A simple static flag to indicate that sync may be
> skipped under the same conditions as implied for dma-direct - i.e.
> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
> to suffice.

In my initial implementation, I used a new dma_map_ops flag, but then I
realized different DMA ops may require or not require syncing under
different conditions, not only dev_is_dma_coherent().
Or am I wrong and they would always be the same?

>
> Thanks,
> Robin.

Thanks,
Olek

2024-01-26 17:21:53

by Robin Murphy

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

On 26/01/2024 4:45 pm, Alexander Lobakin wrote:
> From: Robin Murphy <[email protected]>
> Date: Fri, 26 Jan 2024 15:48:54 +0000
>
>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>> From: Eric Dumazet <[email protected]>
>>>
>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>> at least.
>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>> and friends do nothing.
>>>
>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>
>>> Add dev->skip_dma_sync boolean which is set during the device
>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>> Then later, if/when swiotlb is used for the first time, the flag
>>> is turned off, from swiotlb_tbl_map_single().
>>
>> I think you could probably just promote the dma_uses_io_tlb flag from
>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
>
> Nice catch!
>
>>
>> Similarly I don't think a new op is necessary now that we have
>> dma_map_ops.flags. A simple static flag to indicate that sync may be
>> skipped under the same conditions as implied for dma-direct - i.e.
>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
>> to suffice.
>
> In my initial implementation, I used a new dma_map_ops flag, but then I
> realized different DMA ops may require or not require syncing under
> different conditions, not only dev_is_dma_coherent().
> Or am I wrong and they would always be the same?

I think it's safe to assume that, as with P2P support, this will only
matter for dma-direct and iommu-dma for the foreseeable future, and
those do currently share the same conditions as above. Thus we may as
well keep things simple for now, and if anything ever does have cause to
change, it can be the future's problem to keep this mechanism working as
intended.

Thanks,
Robin.

2024-01-26 18:48:45

by Petr Tesařík

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

On Fri, 26 Jan 2024 17:21:24 +0000
Robin Murphy <[email protected]> wrote:

> On 26/01/2024 4:45 pm, Alexander Lobakin wrote:
> > From: Robin Murphy <[email protected]>
> > Date: Fri, 26 Jan 2024 15:48:54 +0000
> >
> >> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
> >>> From: Eric Dumazet <[email protected]>
> >>>
> >>> Quite often, NIC devices do not need dma_sync operations on x86_64
> >>> at least.
> >>> Indeed, when dev_is_dma_coherent(dev) is true and
> >>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> >>> and friends do nothing.
> >>>
> >>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
> >>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
> >>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
> >>>
> >>> Add dev->skip_dma_sync boolean which is set during the device
> >>> initialization depending on the setup: dev_is_dma_coherent() for direct
> >>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
> >>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
> >>> Then later, if/when swiotlb is used for the first time, the flag
> >>> is turned off, from swiotlb_tbl_map_single().
> >>
> >> I think you could probably just promote the dma_uses_io_tlb flag from
> >> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
> >
> > Nice catch!
> >
> >>
> >> Similarly I don't think a new op is necessary now that we have
> >> dma_map_ops.flags. A simple static flag to indicate that sync may be
> >> skipped under the same conditions as implied for dma-direct - i.e.
> >> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
> >> to suffice.
> >
> > In my initial implementation, I used a new dma_map_ops flag, but then I
> > realized different DMA ops may require or not require syncing under
> > different conditions, not only dev_is_dma_coherent().
> > Or am I wrong and they would always be the same?
>
> I think it's safe to assume that, as with P2P support, this will only
> matter for dma-direct and iommu-dma for the foreseeable future, and
> those do currently share the same conditions as above. Thus we may as
> well keep things simple for now, and if anything ever does have cause to
> change, it can be the future's problem to keep this mechanism working as
> intended.

Can we have a comment that states this assumption along with the flag?
Because when it breaks, it will keep someone cursing for days why DMA
sometimes fails on their device before they find out it's not synced.
And then wondering why the code makes such silly assumptions...

My two cents
Petr T

2024-01-26 19:13:27

by Robin Murphy

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

On 26/01/2024 6:48 pm, Petr Tesařík wrote:
> On Fri, 26 Jan 2024 17:21:24 +0000
> Robin Murphy <[email protected]> wrote:
>
>> On 26/01/2024 4:45 pm, Alexander Lobakin wrote:
>>> From: Robin Murphy <[email protected]>
>>> Date: Fri, 26 Jan 2024 15:48:54 +0000
>>>
>>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>>>> From: Eric Dumazet <[email protected]>
>>>>>
>>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>>>> at least.
>>>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>>>> and friends do nothing.
>>>>>
>>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>>>
>>>>> Add dev->skip_dma_sync boolean which is set during the device
>>>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>>>> Then later, if/when swiotlb is used for the first time, the flag
>>>>> is turned off, from swiotlb_tbl_map_single().
>>>>
>>>> I think you could probably just promote the dma_uses_io_tlb flag from
>>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
>>>
>>> Nice catch!
>>>
>>>>
>>>> Similarly I don't think a new op is necessary now that we have
>>>> dma_map_ops.flags. A simple static flag to indicate that sync may be
>>>> skipped under the same conditions as implied for dma-direct - i.e.
>>>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
>>>> to suffice.
>>>
>>> In my initial implementation, I used a new dma_map_ops flag, but then I
>>> realized different DMA ops may require or not require syncing under
>>> different conditions, not only dev_is_dma_coherent().
>>> Or am I wrong and they would always be the same?
>>
>> I think it's safe to assume that, as with P2P support, this will only
>> matter for dma-direct and iommu-dma for the foreseeable future, and
>> those do currently share the same conditions as above. Thus we may as
>> well keep things simple for now, and if anything ever does have cause to
>> change, it can be the future's problem to keep this mechanism working as
>> intended.
>
> Can we have a comment that states this assumption along with the flag?
> Because when it breaks, it will keep someone cursing for days why DMA
> sometimes fails on their device before they find out it's not synced.
> And then wondering why the code makes such silly assumptions...

Indeed, apologies if it wasn't totally clear, but I really was implying
a literal "may skip sync if coherent and not using SWIOTLB (which
matches dma-direct)" flag, documented as such, and not trying to dress
it up as anything more generic. I just can't suggest a suitably concise
name for that off the top of my head... :)

Thanks,
Robin.

2024-01-29 06:10:07

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

On Fri, Jan 26, 2024 at 07:13:05PM +0000, Robin Murphy wrote:
>> Can we have a comment that states this assumption along with the flag?
>> Because when it breaks, it will keep someone cursing for days why DMA
>> sometimes fails on their device before they find out it's not synced.
>> And then wondering why the code makes such silly assumptions...
>
> Indeed, apologies if it wasn't totally clear, but I really was implying a
> literal "may skip sync if coherent and not using SWIOTLB (which matches
> dma-direct)" flag, documented as such, and not trying to dress it up as
> anything more generic. I just can't suggest a suitably concise name for
> that off the top of my head... :)

Yes, that seems like the right way to go.


2024-01-29 14:08:53

by Alexander Lobakin

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

From: Alexander Lobakin <[email protected]>
Date: Fri, 26 Jan 2024 17:45:11 +0100

> From: Robin Murphy <[email protected]>
> Date: Fri, 26 Jan 2024 15:48:54 +0000
>
>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>> From: Eric Dumazet <[email protected]>
>>>
>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>> at least.
>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>> and friends do nothing.
>>>
>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>
>>> Add dev->skip_dma_sync boolean which is set during the device
>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>> Then later, if/when swiotlb is used for the first time, the flag
>>> is turned off, from swiotlb_tbl_map_single().
>>
>> I think you could probably just promote the dma_uses_io_tlb flag from
>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
>
> Nice catch!

BTW, this implies such hotpath check:

if (dev->dma_skip_sync && !READ_ONCE(dev->dma_uses_io_tlb))
// ...

This seems less effective than just resetting dma_skip_sync on first
allocation.

>
>>
>> Similarly I don't think a new op is necessary now that we have
>> dma_map_ops.flags. A simple static flag to indicate that sync may be
>> skipped under the same conditions as implied for dma-direct - i.e.
>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
>> to suffice.
>
> In my initial implementation, I used a new dma_map_ops flag, but then I
> realized different DMA ops may require or not require syncing under
> different conditions, not only dev_is_dma_coherent().
> Or am I wrong and they would always be the same?
>
>>
>> Thanks,
>> Robin.
>
> Thanks,
> Olek

Thanks,
Olek

2024-01-29 14:30:46

by Robin Murphy

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

On 2024-01-29 2:07 pm, Alexander Lobakin wrote:
> From: Alexander Lobakin <[email protected]>
> Date: Fri, 26 Jan 2024 17:45:11 +0100
>
>> From: Robin Murphy <[email protected]>
>> Date: Fri, 26 Jan 2024 15:48:54 +0000
>>
>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>>> From: Eric Dumazet <[email protected]>
>>>>
>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>>> at least.
>>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>>> and friends do nothing.
>>>>
>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>>
>>>> Add dev->skip_dma_sync boolean which is set during the device
>>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>>> Then later, if/when swiotlb is used for the first time, the flag
>>>> is turned off, from swiotlb_tbl_map_single().
>>>
>>> I think you could probably just promote the dma_uses_io_tlb flag from
>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
>>
>> Nice catch!
>
> BTW, this implies such hotpath check:
>
> if (dev->dma_skip_sync && !READ_ONCE(dev->dma_uses_io_tlb))
> // ...
>
> This seems less effective than just resetting dma_skip_sync on first
> allocation.

Well, my point is not to have a dma_skip_sync at all; I'm suggesting the
check would be:

if (dev_is_dma_coherent(dev) && dev_uses_io_tlb(dev))
...

where on the platform which cares about this most, that first condition
is a compile-time constant (and as implied, the second would want to be
similarly wrapped for !SWIOTLB configs).

Thanks,
Robin.

2024-01-29 14:38:35

by Alexander Lobakin

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

From: Petr Tesařík <[email protected]>
Date: Fri, 26 Jan 2024 19:48:19 +0100

> On Fri, 26 Jan 2024 17:21:24 +0000
> Robin Murphy <[email protected]> wrote:
>
>> On 26/01/2024 4:45 pm, Alexander Lobakin wrote:
>>> From: Robin Murphy <[email protected]>
>>> Date: Fri, 26 Jan 2024 15:48:54 +0000
>>>
>>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>>>> From: Eric Dumazet <[email protected]>
>>>>>
>>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>>>> at least.
>>>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>>>> and friends do nothing.
>>>>>
>>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
>>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
>>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>>>
>>>>> Add dev->skip_dma_sync boolean which is set during the device
>>>>> initialization depending on the setup: dev_is_dma_coherent() for direct
>>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
>>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
>>>>> Then later, if/when swiotlb is used for the first time, the flag
>>>>> is turned off, from swiotlb_tbl_map_single().
>>>>
>>>> I think you could probably just promote the dma_uses_io_tlb flag from
>>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
>>>
>>> Nice catch!
>>>
>>>>
>>>> Similarly I don't think a new op is necessary now that we have
>>>> dma_map_ops.flags. A simple static flag to indicate that sync may be
>>>> skipped under the same conditions as implied for dma-direct - i.e.
>>>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
>>>> to suffice.
>>>
>>> In my initial implementation, I used a new dma_map_ops flag, but then I
>>> realized different DMA ops may require or not require syncing under
>>> different conditions, not only dev_is_dma_coherent().
>>> Or am I wrong and they would always be the same?
>>
>> I think it's safe to assume that, as with P2P support, this will only
>> matter for dma-direct and iommu-dma for the foreseeable future, and
>> those do currently share the same conditions as above. Thus we may as
>> well keep things simple for now, and if anything ever does have cause to
>> change, it can be the future's problem to keep this mechanism working as
>> intended.
>
> Can we have a comment that states this assumption along with the flag?
> Because when it breaks, it will keep someone cursing for days why DMA
> sometimes fails on their device before they find out it's not synced.

BTW, dma_skip_sync is set right before driver->probe(), so that if any
problematic device appears, it could easily be fixed by adding one line
to its probe callback.

> And then wondering why the code makes such silly assumptions...
>
> My two cents
> Petr T

Thanks,
Olek

2024-01-29 14:40:50

by Alexander Lobakin

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

From: Robin Murphy <[email protected]>
Date: Mon, 29 Jan 2024 14:29:43 +0000

> On 2024-01-29 2:07 pm, Alexander Lobakin wrote:
>> From: Alexander Lobakin <[email protected]>
>> Date: Fri, 26 Jan 2024 17:45:11 +0100
>>
>>> From: Robin Murphy <[email protected]>
>>> Date: Fri, 26 Jan 2024 15:48:54 +0000
>>>
>>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
>>>>> From: Eric Dumazet <[email protected]>
>>>>>
>>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
>>>>> at least.
>>>>> Indeed, when dev_is_dma_coherent(dev) is true and
>>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>>>>> and friends do nothing.
>>>>>
>>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes
>>>>> about
>>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit
>>>>> rate.
>>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
>>>>>
>>>>> Add dev->skip_dma_sync boolean which is set during the device
>>>>> initialization depending on the setup: dev_is_dma_coherent() for
>>>>> direct
>>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive
>>>>> result
>>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA
>>>>> ops.
>>>>> Then later, if/when swiotlb is used for the first time, the flag
>>>>> is turned off, from swiotlb_tbl_map_single().
>>>>
>>>> I think you could probably just promote the dma_uses_io_tlb flag from
>>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
>>>
>>> Nice catch!
>>
>> BTW, this implies such hotpath check:
>>
>>     if (dev->dma_skip_sync && !READ_ONCE(dev->dma_uses_io_tlb))
>>         // ...
>>
>> This seems less effective than just resetting dma_skip_sync on first
>> allocation.
>
> Well, my point is not to have a dma_skip_sync at all; I'm suggesting the
> check would be:
>
>     if (dev_is_dma_coherent(dev) && dev_uses_io_tlb(dev))
>         ...

Aaah, okay, but what about dma_map_ops?
It would then become:

if ((!dev->dma_ops ||
(!dev->dma_ops->sync_single_for_device &&
!dev->dma_ops->sync_single_for_cpu)) ||
(dev->dma_ops->flags & DMA_F_SKIP_SYNC)) &&
dev_is_dma_coherent(dev) && !dev_uses_io_tlb(dev))
dma_sync_ ...

Quite a lot and everything except dev_uses_io_tlb() is known at device
probing time, that's why I decided to cache the result into a new flag...

>
> where on the platform which cares about this most, that first condition
> is a compile-time constant (and as implied, the second would want to be
> similarly wrapped for !SWIOTLB configs).
>
> Thanks,
> Robin.

Thanks,
Olek

2024-01-29 16:30:36

by Petr Tesařík

[permalink] [raw]
Subject: Re: [PATCH net-next 2/7] dma: avoid expensive redundant calls for sync operations

On Mon, 29 Jan 2024 15:36:35 +0100
Alexander Lobakin <[email protected]> wrote:

> From: Petr Tesařík <[email protected]>
> Date: Fri, 26 Jan 2024 19:48:19 +0100
>
> > On Fri, 26 Jan 2024 17:21:24 +0000
> > Robin Murphy <[email protected]> wrote:
> >
> >> On 26/01/2024 4:45 pm, Alexander Lobakin wrote:
> >>> From: Robin Murphy <[email protected]>
> >>> Date: Fri, 26 Jan 2024 15:48:54 +0000
> >>>
> >>>> On 26/01/2024 1:54 pm, Alexander Lobakin wrote:
> >>>>> From: Eric Dumazet <[email protected]>
> >>>>>
> >>>>> Quite often, NIC devices do not need dma_sync operations on x86_64
> >>>>> at least.
> >>>>> Indeed, when dev_is_dma_coherent(dev) is true and
> >>>>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> >>>>> and friends do nothing.
> >>>>>
> >>>>> However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
> >>>>> 10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
> >>>>> Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.
> >>>>>
> >>>>> Add dev->skip_dma_sync boolean which is set during the device
> >>>>> initialization depending on the setup: dev_is_dma_coherent() for direct
> >>>>> DMA, !(sync_single_for_device || sync_single_for_cpu) or positive result
> >>>>> from the new callback, dma_map_ops::can_skip_sync for non-NULL DMA ops.
> >>>>> Then later, if/when swiotlb is used for the first time, the flag
> >>>>> is turned off, from swiotlb_tbl_map_single().
> >>>>
> >>>> I think you could probably just promote the dma_uses_io_tlb flag from
> >>>> SWIOTLB_DYNAMIC to a general SWIOTLB thing to serve this purpose now.
> >>>
> >>> Nice catch!
> >>>
> >>>>
> >>>> Similarly I don't think a new op is necessary now that we have
> >>>> dma_map_ops.flags. A simple static flag to indicate that sync may be
> >>>> skipped under the same conditions as implied for dma-direct - i.e.
> >>>> dev_is_dma_coherent(dev) && !dev->dma_use_io_tlb - seems like it ought
> >>>> to suffice.
> >>>
> >>> In my initial implementation, I used a new dma_map_ops flag, but then I
> >>> realized different DMA ops may require or not require syncing under
> >>> different conditions, not only dev_is_dma_coherent().
> >>> Or am I wrong and they would always be the same?
> >>
> >> I think it's safe to assume that, as with P2P support, this will only
> >> matter for dma-direct and iommu-dma for the foreseeable future, and
> >> those do currently share the same conditions as above. Thus we may as
> >> well keep things simple for now, and if anything ever does have cause to
> >> change, it can be the future's problem to keep this mechanism working as
> >> intended.
> >
> > Can we have a comment that states this assumption along with the flag?
> > Because when it breaks, it will keep someone cursing for days why DMA
> > sometimes fails on their device before they find out it's not synced.
>
> BTW, dma_skip_sync is set right before driver->probe(), so that if any
> problematic device appears, it could easily be fixed by adding one line
> to its probe callback.

Ah, perfect!

Petr T