Subject: Re: [PATCH v2] coresight: tmc-etr: Speed up for bounce buffer in flat mode
To: Leo Yan, Mathieu Poirier, Mike Leach, Alexander Shishkin,
    coresight@lists.linaro.org, linux-arm-kernel@lists.infradead.org,
    linux-kernel@vger.kernel.org
References: <20210710070115.462674-1-leo.yan@linaro.org>
From: Suzuki K Poulose
Date: Mon, 12 Jul 2021 10:55:32 +0100
In-Reply-To: <20210710070115.462674-1-leo.yan@linaro.org>

Leo,

On 10/07/2021 08:01, Leo Yan wrote:
> The AUX bounce buffer is allocated with dma_alloc_coherent(). In the
> low-level architecture code, e.g. for arm64, this maps the memory with
> the "Normal non-cacheable" attribute, as can be seen from the
> definition of pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
>
> Because the memory mapping is non-cacheable, later accesses to the AUX
> bounce buffer are inefficient: every load instruction must go out to
> DRAM.
>
> This patch switches to allocating the pages with alloc_pages_node(),
> so the driver can access the memory through a cacheable mapping in the
> kernel linear virtual address range. Since load instructions can then
> fetch data from cache lines rather than always reading from DRAM, the
> driver's memory-copy performance improves. With the cacheable mapping,
> the driver uses dma_sync_single_for_cpu() to invalidate the cache
> lines before reading the bounce buffer, so it avoids reading stale
> trace data.
>
> Measuring the duration of tmc_update_etr_buffer() with the ftrace
> function_graph tracer shows a significant performance improvement for
> copying 4MiB of data from the bounce buffer:
>
>   # echo tmc_etr_get_data_flat_buf > set_graph_notrace   // avoid noise
>   # echo tmc_update_etr_buffer > set_graph_function
>   # echo function_graph > current_tracer
>
> Before:
>
>   # CPU  DURATION                  FUNCTION CALLS
>   # |     |   |                     |   |   |   |
>   2)               |  tmc_update_etr_buffer() {
>      ...
>   2) # 8148.320 us |  }
>
> After:
>
>   # CPU  DURATION                  FUNCTION CALLS
>   # |     |   |                     |   |   |   |
>   2)               |  tmc_update_etr_buffer() {
>      ...
>   2) # 2463.980 us |  }

That's a good speedup! Thanks for the patch. One minor comment below.

>
> Signed-off-by: Leo Yan
> ---
>
> Changes from v1:
> Set "flat_buf->daddr" to 0 when mapping the DMA region fails; dropped
> the unintended if-condition change in tmc_etr_free_flat_buf().
>
>  .../hwtracing/coresight/coresight-tmc-etr.c | 56 ++++++++++++++++---
>  1 file changed, 49 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> index acdb59e0e661..888b0f929d33 100644
> --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
> +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> @@ -21,6 +21,7 @@
>
>  struct etr_flat_buf {
>  	struct device	*dev;
> +	struct page	*pages;
>  	dma_addr_t	daddr;
>  	void		*vaddr;
>  	size_t		size;
> @@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>  {
>  	struct etr_flat_buf *flat_buf;
>  	struct device *real_dev = drvdata->csdev->dev.parent;
> +	ssize_t	aligned_size;
>
>  	/* We cannot reuse existing pages for flat buf */
>  	if (pages)
> @@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>  	if (!flat_buf)
>  		return -ENOMEM;
>
> -	flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
> -					     &flat_buf->daddr, GFP_KERNEL);
> -	if (!flat_buf->vaddr) {
> -		kfree(flat_buf);
> -		return -ENOMEM;
> +	aligned_size = PAGE_ALIGN(etr_buf->size);
> +	flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
> +					   get_order(aligned_size));
> +	if (!flat_buf->pages)
> +		goto fail_alloc_pages;
> +
> +	flat_buf->vaddr = page_address(flat_buf->pages);
> +	flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
> +				       aligned_size, DMA_FROM_DEVICE);
> +	if (dma_mapping_error(real_dev, flat_buf->daddr)) {
> +		flat_buf->daddr = 0;
> +		goto fail_dma_map_page;
>  	}
>
>  	flat_buf->size = etr_buf->size;
> @@ -622,6 +631,12 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>  	etr_buf->mode = ETR_MODE_FLAT;
>  	etr_buf->private = flat_buf;
>  	return 0;
> +
> +fail_dma_map_page:
> +	__free_pages(flat_buf->pages, get_order(aligned_size));
> +fail_alloc_pages:
> +	kfree(flat_buf);
> +	return -ENOMEM;
>  }
>
>  static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
> @@ -630,15 +645,20 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
>
>  	if (flat_buf && flat_buf->daddr) {
>  		struct device *real_dev = flat_buf->dev->parent;
> +		ssize_t	aligned_size = PAGE_ALIGN(etr_buf->size);
>
> -		dma_free_coherent(real_dev, flat_buf->size,
> -				  flat_buf->vaddr, flat_buf->daddr);
> +		dma_unmap_page(real_dev, flat_buf->daddr, aligned_size,
> +			       DMA_FROM_DEVICE);
> +		__free_pages(flat_buf->pages, get_order(aligned_size));
>  	}
>  	kfree(flat_buf);
>  }
>
>  static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
>  {
> +	struct etr_flat_buf *flat_buf = etr_buf->private;
> +	struct device *real_dev = flat_buf->dev->parent;
> +
>  	/*
>  	 * Adjust the buffer to point to the beginning of the trace data
>  	 * and update the available trace data.
> @@ -648,6 +668,28 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
>  		etr_buf->len = etr_buf->size;
>  	else
>  		etr_buf->len = rwp - rrp;
> +
> +	if (etr_buf->offset + etr_buf->len > etr_buf->size) {
> +		int len1, len2;
> +
> +		/*
> +		 * If the trace data wrapped around, sync the AUX bounce
> +		 * buffer in two chunks: "len1" is the trace data length
> +		 * at the tail of the bounce buffer, and "len2" is the
> +		 * length from the start of the buffer after wrapping
> +		 * around.
> +		 */
> +		len1 = etr_buf->size - etr_buf->offset;
> +		len2 = etr_buf->len - len1;
> +		dma_sync_single_for_cpu(real_dev,
> +					flat_buf->daddr + etr_buf->offset,
> +					len1, DMA_FROM_DEVICE);
> +		dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
> +					len2, DMA_FROM_DEVICE);

We always start tracing at the beginning of the buffer, and the only
reason we would get a wrap-around is when the buffer is full. So you
could as well sync the entire buffer in one go:

	dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
				etr_buf->len, DMA_FROM_DEVICE);

> +	} else {
> +		dma_sync_single_for_cpu(real_dev,
> +					flat_buf->daddr + etr_buf->offset,
> +					etr_buf->len, DMA_FROM_DEVICE);
> +	}
>  }
>
>  static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
>

Either way,

Reviewed-by: Suzuki K Poulose
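
[Editorial note: for readers less familiar with the streaming DMA API,
the pattern the patch adopts reduces to the sketch below. It is a
minimal illustration under the same assumptions as the patch (the
device writes into the buffer, hence DMA_FROM_DEVICE); the demo_* names
are hypothetical and are not part of the driver.]

	#include <linux/device.h>
	#include <linux/dma-mapping.h>
	#include <linux/gfp.h>
	#include <linux/mm.h>

	/* Allocate cacheable pages and map them for device DMA. */
	static void *demo_buf_alloc(struct device *dev, size_t size,
				    int node, dma_addr_t *daddr)
	{
		size_t aligned = PAGE_ALIGN(size);
		struct page *pages;

		/* Cacheable pages in the kernel linear map */
		pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
					 get_order(aligned));
		if (!pages)
			return NULL;

		/* Hand the pages to the device for DMA */
		*daddr = dma_map_page(dev, pages, 0, aligned,
				      DMA_FROM_DEVICE);
		if (dma_mapping_error(dev, *daddr)) {
			__free_pages(pages, get_order(aligned));
			return NULL;
		}
		return page_address(pages);
	}

	/* Prepare a region for CPU reads after the device wrote to it. */
	static void demo_buf_read_prepare(struct device *dev,
					  dma_addr_t daddr, size_t len)
	{
		/*
		 * Invalidate the CPU cache lines covering the region, so
		 * the CPU sees what the device wrote rather than stale
		 * cached data. Subsequent loads hit cacheable memory.
		 */
		dma_sync_single_for_cpu(dev, daddr, len, DMA_FROM_DEVICE);
	}

	/* Tear down: unmap from the device, then free the pages. */
	static void demo_buf_free(struct device *dev, void *vaddr,
				  dma_addr_t daddr, size_t size)
	{
		size_t aligned = PAGE_ALIGN(size);

		dma_unmap_page(dev, daddr, aligned, DMA_FROM_DEVICE);
		__free_pages(virt_to_page(vaddr), get_order(aligned));
	}

The trade-off versus dma_alloc_coherent() is that the CPU must
explicitly invalidate its caches before every read, but each load then
comes from cacheable memory instead of going out to DRAM, which is
where the measured speedup comes from.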