2021-09-05 03:28:15

by Leo Yan

[permalink] [raw]
Subject: [PATCH v4] coresight: tmc-etr: Speed up for bounce buffer in flat mode

The AUX bounce buffer is allocated with dma_alloc_coherent(). In the
low-level architecture code, e.g. for Arm64, this maps the memory with
the attribute "Normal non-cacheable"; this can be concluded from the
definition of pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.

Later, when the AUX bounce buffer is accessed, the non-cacheable
mapping makes the access inefficient, because every load instruction
must reach out to DRAM.

This patch changes the driver to allocate pages with
dma_alloc_noncoherent(), so that the driver can access the memory via a
cacheable mapping; load instructions can then fetch data from cache
lines rather than always reading it from DRAM, which boosts memory
performance. With the cacheable mapping, the driver uses
dma_sync_single_for_cpu() to invalidate the cache lines before reading
the bounce buffer, so that it avoids reading stale trace data.

Measuring the duration of tmc_update_etr_buffer() with the ftrace
function_graph tracer shows a significant performance improvement for
copying 4MiB of data from the bounce buffer:

# echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
# echo tmc_update_etr_buffer > set_graph_function
# echo function_graph > current_tracer

before:

# CPU DURATION FUNCTION CALLS
# | | | | | | |
2) | tmc_update_etr_buffer() {
...
2) # 8148.320 us | }

after:

# CPU DURATION FUNCTION CALLS
# | | | | | | |
2) | tmc_update_etr_buffer() {
...
2) # 2525.420 us | }

Signed-off-by: Leo Yan <[email protected]>
Reviewed-by: Suzuki K Poulose <[email protected]>
---

Changes from v3:
Refined change to use dma_alloc_noncoherent()/dma_free_noncoherent()
(Robin Murphy);
Retested functionality and performance on Juno-r2 board.

Changes from v2:
Sync the entire buffer in one go when the tracing wraps around
(Suzuki);
Add Suzuki's review tag.

.../hwtracing/coresight/coresight-tmc-etr.c | 26 ++++++++++++++++---
1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index acdb59e0e661..a049b525a274 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -609,8 +609,9 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
if (!flat_buf)
return -ENOMEM;

- flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
- &flat_buf->daddr, GFP_KERNEL);
+ flat_buf->vaddr = dma_alloc_noncoherent(real_dev, etr_buf->size,
+ &flat_buf->daddr,
+ DMA_FROM_DEVICE, GFP_KERNEL);
if (!flat_buf->vaddr) {
kfree(flat_buf);
return -ENOMEM;
@@ -631,14 +632,18 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
if (flat_buf && flat_buf->daddr) {
struct device *real_dev = flat_buf->dev->parent;

- dma_free_coherent(real_dev, flat_buf->size,
- flat_buf->vaddr, flat_buf->daddr);
+ dma_free_noncoherent(real_dev, etr_buf->size,
+ flat_buf->vaddr, flat_buf->daddr,
+ DMA_FROM_DEVICE);
}
kfree(flat_buf);
}

static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
{
+ struct etr_flat_buf *flat_buf = etr_buf->private;
+ struct device *real_dev = flat_buf->dev->parent;
+
/*
* Adjust the buffer to point to the beginning of the trace data
* and update the available trace data.
@@ -648,6 +653,19 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
etr_buf->len = etr_buf->size;
else
etr_buf->len = rwp - rrp;
+
+ /*
+ * The driver always starts tracing at the beginning of the buffer,
+ * the only reason why we would get a wrap around is when the buffer
+ * is full. Sync the entire buffer in one go for this case.
+ */
+ if (etr_buf->offset + etr_buf->len > etr_buf->size)
+ dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
+ etr_buf->size, DMA_FROM_DEVICE);
+ else
+ dma_sync_single_for_cpu(real_dev,
+ flat_buf->daddr + etr_buf->offset,
+ etr_buf->len, DMA_FROM_DEVICE);
}

static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
--
2.25.1


2021-09-14 00:49:50

by Mathieu Poirier

[permalink] [raw]
Subject: Re: [PATCH v4] coresight: tmc-etr: Speed up for bounce buffer in flat mode

On Sun, Sep 05, 2021 at 11:21:44AM +0800, Leo Yan wrote:
> The AUX bounce buffer is allocated with API dma_alloc_coherent(), in the
> low level's architecture code, e.g. for Arm64, it maps the memory with
> the attribution "Normal non-cacheable"; this can be concluded from the
> definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
>
> Later when access the AUX bounce buffer, since the memory mapping is
> non-cacheable, it's low efficiency due to every load instruction must
> reach out DRAM.
>
> This patch changes to allocate pages with dma_alloc_noncoherent(), the
> driver can access the memory via cacheable mapping; therefore, load
> instructions can fetch data from cache lines rather than always read
> data from DRAM, the driver can boost memory performance. After using
> the cacheable mapping, the driver uses dma_sync_single_for_cpu() to
> invalidate cacheline prior to read bounce buffer so can avoid read stale
> trace data.
>
> By measurement the duration for function tmc_update_etr_buffer() with
> ftrace function_graph tracer, it shows the performance significant
> improvement for copying 4MiB data from bounce buffer:
>
> # echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
> # echo tmc_update_etr_buffer > set_graph_function
> # echo function_graph > current_tracer
>
> before:
>
> # CPU DURATION FUNCTION CALLS
> # | | | | | | |
> 2) | tmc_update_etr_buffer() {
> ...
> 2) # 8148.320 us | }
>
> after:
>
> # CPU DURATION FUNCTION CALLS
> # | | | | | | |
> 2) | tmc_update_etr_buffer() {
> ...
> 2) # 2525.420 us | }
>
> Signed-off-by: Leo Yan <[email protected]>
> Reviewed-by: Suzuki K Poulose <[email protected]>
> ---
>
> Changes from v3:
> Refined change to use dma_alloc_noncoherent()/dma_free_noncoherent()
> (Robin Murphy);
> Retested functionality and performance on Juno-r2 board.
>
> Changes from v2:
> Sync the entire buffer in one go when the tracing is wrap around
> (Suzuki);
> Add Suzuki's review tage.
>
> .../hwtracing/coresight/coresight-tmc-etr.c | 26 ++++++++++++++++---
> 1 file changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> index acdb59e0e661..a049b525a274 100644
> --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
> +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> @@ -609,8 +609,9 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
> if (!flat_buf)
> return -ENOMEM;
>
> - flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
> - &flat_buf->daddr, GFP_KERNEL);
> + flat_buf->vaddr = dma_alloc_noncoherent(real_dev, etr_buf->size,
> + &flat_buf->daddr,
> + DMA_FROM_DEVICE, GFP_KERNEL);

Suzuki and Robin - are you guys good with this new revision?

Thanks,
Mathieu

PS: Suzuki - I know you've already provided your RB on this but with the change
in API I want to make sure before merging.

> if (!flat_buf->vaddr) {
> kfree(flat_buf);
> return -ENOMEM;
> @@ -631,14 +632,18 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
> if (flat_buf && flat_buf->daddr) {
> struct device *real_dev = flat_buf->dev->parent;
>
> - dma_free_coherent(real_dev, flat_buf->size,
> - flat_buf->vaddr, flat_buf->daddr);
> + dma_free_noncoherent(real_dev, etr_buf->size,
> + flat_buf->vaddr, flat_buf->daddr,
> + DMA_FROM_DEVICE);
> }
> kfree(flat_buf);
> }
>
> static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
> {
> + struct etr_flat_buf *flat_buf = etr_buf->private;
> + struct device *real_dev = flat_buf->dev->parent;
> +
> /*
> * Adjust the buffer to point to the beginning of the trace data
> * and update the available trace data.
> @@ -648,6 +653,19 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
> etr_buf->len = etr_buf->size;
> else
> etr_buf->len = rwp - rrp;
> +
> + /*
> + * The driver always starts tracing at the beginning of the buffer,
> + * the only reason why we would get a wrap around is when the buffer
> + * is full. Sync the entire buffer in one go for this case.
> + */
> + if (etr_buf->offset + etr_buf->len > etr_buf->size)
> + dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
> + etr_buf->size, DMA_FROM_DEVICE);
> + else
> + dma_sync_single_for_cpu(real_dev,
> + flat_buf->daddr + etr_buf->offset,
> + etr_buf->len, DMA_FROM_DEVICE);
> }
>
> static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
> --
> 2.25.1
>

2021-09-14 06:04:22

by Suzuki K Poulose

[permalink] [raw]
Subject: Re: [PATCH v4] coresight: tmc-etr: Speed up for bounce buffer in flat mode

On 13/09/2021 18:56, Mathieu Poirier wrote:
> On Sun, Sep 05, 2021 at 11:21:44AM +0800, Leo Yan wrote:
>> The AUX bounce buffer is allocated with API dma_alloc_coherent(), in the
>> low level's architecture code, e.g. for Arm64, it maps the memory with
>> the attribution "Normal non-cacheable"; this can be concluded from the
>> definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
>>
>> Later when access the AUX bounce buffer, since the memory mapping is
>> non-cacheable, it's low efficiency due to every load instruction must
>> reach out DRAM.
>>
>> This patch changes to allocate pages with dma_alloc_noncoherent(), the
>> driver can access the memory via cacheable mapping; therefore, load
>> instructions can fetch data from cache lines rather than always read
>> data from DRAM, the driver can boost memory performance. After using
>> the cacheable mapping, the driver uses dma_sync_single_for_cpu() to
>> invalidate cacheline prior to read bounce buffer so can avoid read stale
>> trace data.
>>
>> By measurement the duration for function tmc_update_etr_buffer() with
>> ftrace function_graph tracer, it shows the performance significant
>> improvement for copying 4MiB data from bounce buffer:
>>
>> # echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
>> # echo tmc_update_etr_buffer > set_graph_function
>> # echo function_graph > current_tracer
>>
>> before:
>>
>> # CPU DURATION FUNCTION CALLS
>> # | | | | | | |
>> 2) | tmc_update_etr_buffer() {
>> ...
>> 2) # 8148.320 us | }
>>
>> after:
>>
>> # CPU DURATION FUNCTION CALLS
>> # | | | | | | |
>> 2) | tmc_update_etr_buffer() {
>> ...
>> 2) # 2525.420 us | }
>>
>> Signed-off-by: Leo Yan <[email protected]>
>> Reviewed-by: Suzuki K Poulose <[email protected]>
>> ---
>>
>> Changes from v3:
>> Refined change to use dma_alloc_noncoherent()/dma_free_noncoherent()
>> (Robin Murphy);
>> Retested functionality and performance on Juno-r2 board.
>>
>> Changes from v2:
>> Sync the entire buffer in one go when the tracing is wrap around
>> (Suzuki);
>> Add Suzuki's review tage.
>>
>> .../hwtracing/coresight/coresight-tmc-etr.c | 26 ++++++++++++++++---
>> 1 file changed, 22 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
>> index acdb59e0e661..a049b525a274 100644
>> --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
>> +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
>> @@ -609,8 +609,9 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>> if (!flat_buf)
>> return -ENOMEM;
>>
>> - flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
>> - &flat_buf->daddr, GFP_KERNEL);
>> + flat_buf->vaddr = dma_alloc_noncoherent(real_dev, etr_buf->size,
>> + &flat_buf->daddr,
>> + DMA_FROM_DEVICE, GFP_KERNEL);
>
> Suzuki and Robin - are you guys good with this new revision?

Yes, fine by me.

Suzuki

2021-09-14 15:24:29

by Mathieu Poirier

[permalink] [raw]
Subject: Re: [PATCH v4] coresight: tmc-etr: Speed up for bounce buffer in flat mode

On Sun, Sep 05, 2021 at 11:21:44AM +0800, Leo Yan wrote:
> The AUX bounce buffer is allocated with API dma_alloc_coherent(), in the
> low level's architecture code, e.g. for Arm64, it maps the memory with
> the attribution "Normal non-cacheable"; this can be concluded from the
> definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
>
> Later when access the AUX bounce buffer, since the memory mapping is
> non-cacheable, it's low efficiency due to every load instruction must
> reach out DRAM.
>
> This patch changes to allocate pages with dma_alloc_noncoherent(), the
> driver can access the memory via cacheable mapping; therefore, load
> instructions can fetch data from cache lines rather than always read
> data from DRAM, the driver can boost memory performance. After using
> the cacheable mapping, the driver uses dma_sync_single_for_cpu() to
> invalidate cacheline prior to read bounce buffer so can avoid read stale
> trace data.
>
> By measurement the duration for function tmc_update_etr_buffer() with
> ftrace function_graph tracer, it shows the performance significant
> improvement for copying 4MiB data from bounce buffer:
>
> # echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
> # echo tmc_update_etr_buffer > set_graph_function
> # echo function_graph > current_tracer
>
> before:
>
> # CPU DURATION FUNCTION CALLS
> # | | | | | | |
> 2) | tmc_update_etr_buffer() {
> ...
> 2) # 8148.320 us | }
>
> after:
>
> # CPU DURATION FUNCTION CALLS
> # | | | | | | |
> 2) | tmc_update_etr_buffer() {
> ...
> 2) # 2525.420 us | }
>
> Signed-off-by: Leo Yan <[email protected]>
> Reviewed-by: Suzuki K Poulose <[email protected]>

I have merged this patch.

Thanks,
Mathieu

> ---
>
> Changes from v3:
> Refined change to use dma_alloc_noncoherent()/dma_free_noncoherent()
> (Robin Murphy);
> Retested functionality and performance on Juno-r2 board.
>
> Changes from v2:
> Sync the entire buffer in one go when the tracing is wrap around
> (Suzuki);
> Add Suzuki's review tage.
>
> .../hwtracing/coresight/coresight-tmc-etr.c | 26 ++++++++++++++++---
> 1 file changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> index acdb59e0e661..a049b525a274 100644
> --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
> +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> @@ -609,8 +609,9 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
> if (!flat_buf)
> return -ENOMEM;
>
> - flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
> - &flat_buf->daddr, GFP_KERNEL);
> + flat_buf->vaddr = dma_alloc_noncoherent(real_dev, etr_buf->size,
> + &flat_buf->daddr,
> + DMA_FROM_DEVICE, GFP_KERNEL);
> if (!flat_buf->vaddr) {
> kfree(flat_buf);
> return -ENOMEM;
> @@ -631,14 +632,18 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
> if (flat_buf && flat_buf->daddr) {
> struct device *real_dev = flat_buf->dev->parent;
>
> - dma_free_coherent(real_dev, flat_buf->size,
> - flat_buf->vaddr, flat_buf->daddr);
> + dma_free_noncoherent(real_dev, etr_buf->size,
> + flat_buf->vaddr, flat_buf->daddr,
> + DMA_FROM_DEVICE);
> }
> kfree(flat_buf);
> }
>
> static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
> {
> + struct etr_flat_buf *flat_buf = etr_buf->private;
> + struct device *real_dev = flat_buf->dev->parent;
> +
> /*
> * Adjust the buffer to point to the beginning of the trace data
> * and update the available trace data.
> @@ -648,6 +653,19 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
> etr_buf->len = etr_buf->size;
> else
> etr_buf->len = rwp - rrp;
> +
> + /*
> + * The driver always starts tracing at the beginning of the buffer,
> + * the only reason why we would get a wrap around is when the buffer
> + * is full. Sync the entire buffer in one go for this case.
> + */
> + if (etr_buf->offset + etr_buf->len > etr_buf->size)
> + dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
> + etr_buf->size, DMA_FROM_DEVICE);
> + else
> + dma_sync_single_for_cpu(real_dev,
> + flat_buf->daddr + etr_buf->offset,
> + etr_buf->len, DMA_FROM_DEVICE);
> }
>
> static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
> --
> 2.25.1
>