2016-10-24 07:14:00

by Michael Zoran

[permalink] [raw]
Subject: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

From: Michael Zoran <[email protected]>

The original arm implementation uses dmac_map_area which is not
portable. Replace it with an architecture neutral version
which uses dma_map_sg.

As you can see, for larger page sizes the dma_map_sg
implementation is faster than the original unportable dmac_map_area
implementation.

Test dmac_map_area dma_map_page dma_map_sg
vchiq_test -b 4 10000 51us/iter 76us/iter 76us
vchiq_test -b 8 10000 70us/iter 82us/iter 91us
vchiq_test -b 16 10000 94us/iter 118us/iter 121us
vchiq_test -b 32 10000 146us/iter 173us/iter 187us
vchiq_test -b 64 10000 263us/iter 328us/iter 299us
vchiq_test -b 128 10000 529us/iter 631us/iter 595us
vchiq_test -b 256 10000 2285us/iter 2275us/iter 2001us
vchiq_test -b 512 10000 4372us/iter 4616us/iter 4123us

For message sizes >= 64KB, dma_map_sg is faster than dma_map_page.

For message sizes >= 256KB, dma_map_sg is the fastest
implementation.

Signed-off-by: Michael Zoran <[email protected]>
---
.../interface/vchiq_arm/vchiq_2835_arm.c | 153 ++++++++++++---------
1 file changed, 91 insertions(+), 62 deletions(-)

diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
index 98c6819..5ca66ee 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
@@ -45,13 +45,8 @@
#include <asm/pgtable.h>
#include <soc/bcm2835/raspberrypi-firmware.h>

-#define dmac_map_area __glue(_CACHE,_dma_map_area)
-#define dmac_unmap_area __glue(_CACHE,_dma_unmap_area)
-
#define TOTAL_SLOTS (VCHIQ_SLOT_ZERO_SLOTS + 2 * 32)

-#define VCHIQ_ARM_ADDRESS(x) ((void *)((char *)x + g_virt_to_bus_offset))
-
#include "vchiq_arm.h"
#include "vchiq_2835.h"
#include "vchiq_connected.h"
@@ -73,7 +68,7 @@ static unsigned int g_fragments_size;
static char *g_fragments_base;
static char *g_free_fragments;
static struct semaphore g_free_fragments_sema;
-static unsigned long g_virt_to_bus_offset;
+static struct device *g_dev;

extern int vchiq_arm_log_level;

@@ -84,10 +79,11 @@ vchiq_doorbell_irq(int irq, void *dev_id);

static int
create_pagelist(char __user *buf, size_t count, unsigned short type,
- struct task_struct *task, PAGELIST_T ** ppagelist);
+ struct task_struct *task, PAGELIST_T **ppagelist,
+ dma_addr_t *dma_addr);

static void
-free_pagelist(PAGELIST_T *pagelist, int actual);
+free_pagelist(dma_addr_t dma_addr, PAGELIST_T *pagelist, int actual);

int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
{
@@ -101,8 +97,6 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
int slot_mem_size, frag_mem_size;
int err, irq, i;

- g_virt_to_bus_offset = virt_to_dma(dev, (void *)0);
-
(void)of_property_read_u32(dev->of_node, "cache-line-size",
&g_cache_line_size);
g_fragments_size = 2 * g_cache_line_size;
@@ -118,7 +112,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
return -ENOMEM;
}

- WARN_ON(((int)slot_mem & (PAGE_SIZE - 1)) != 0);
+ WARN_ON(((unsigned long)slot_mem & (PAGE_SIZE - 1)) != 0);

vchiq_slot_zero = vchiq_init_slots(slot_mem, slot_mem_size);
if (!vchiq_slot_zero)
@@ -170,6 +164,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
return err ? : -ENXIO;
}

+ g_dev = dev;
vchiq_log_info(vchiq_arm_log_level,
"vchiq_init - done (slots %pK, phys %pad)",
vchiq_slot_zero, &slot_phys);
@@ -233,6 +228,7 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
{
PAGELIST_T *pagelist;
int ret;
+ dma_addr_t dma_addr;

WARN_ON(memhandle != VCHI_MEM_HANDLE_INVALID);

@@ -241,12 +237,14 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
? PAGELIST_READ
: PAGELIST_WRITE,
current,
- &pagelist);
+ &pagelist,
+ &dma_addr);
+
if (ret != 0)
return VCHIQ_ERROR;

bulk->handle = memhandle;
- bulk->data = VCHIQ_ARM_ADDRESS(pagelist);
+ bulk->data = (void *)(unsigned long)dma_addr;

/* Store the pagelist address in remote_data, which isn't used by the
slave. */
@@ -259,7 +257,8 @@ void
vchiq_complete_bulk(VCHIQ_BULK_T *bulk)
{
if (bulk && bulk->remote_data && bulk->actual)
- free_pagelist((PAGELIST_T *)bulk->remote_data, bulk->actual);
+ free_pagelist((dma_addr_t)(unsigned long)bulk->data,
+ (PAGELIST_T *)bulk->remote_data, bulk->actual);
}

void
@@ -353,38 +352,44 @@ vchiq_doorbell_irq(int irq, void *dev_id)
** obscuring the new data underneath. We can solve this by transferring the
** partial cache lines separately, and allowing the ARM to copy into the
** cached area.
-
-** N.B. This implementation plays slightly fast and loose with the Linux
-** driver programming rules, e.g. its use of dmac_map_area instead of
-** dma_map_single, but it isn't a multi-platform driver and it benefits
-** from increased speed as a result.
*/

static int
create_pagelist(char __user *buf, size_t count, unsigned short type,
- struct task_struct *task, PAGELIST_T ** ppagelist)
+ struct task_struct *task, PAGELIST_T **ppagelist,
+ dma_addr_t *dma_addr)
{
PAGELIST_T *pagelist;
struct page **pages;
- unsigned long *addrs;
- unsigned int num_pages, offset, i;
- char *addr, *base_addr, *next_addr;
- int run, addridx, actual_pages;
+ u32 *addrs;
+ unsigned int num_pages, offset, i, j, k;
+ int actual_pages;
unsigned long *need_release;
+ size_t pagelist_size;
+ struct scatterlist *scatterlist;
+ int dma_buffers;
+ int dir;

- offset = (unsigned int)buf & (PAGE_SIZE - 1);
+ offset = ((unsigned int)(unsigned long)buf & (PAGE_SIZE - 1));
num_pages = (count + offset + PAGE_SIZE - 1) / PAGE_SIZE;

+ pagelist_size = sizeof(PAGELIST_T) +
+ (num_pages * sizeof(unsigned long)) +
+ sizeof(unsigned long) +
+ (num_pages * sizeof(pages[0]) +
+ (num_pages * sizeof(struct scatterlist)));
+
*ppagelist = NULL;

+ dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
/* Allocate enough storage to hold the page pointers and the page
** list
*/
- pagelist = kmalloc(sizeof(PAGELIST_T) +
- (num_pages * sizeof(unsigned long)) +
- sizeof(unsigned long) +
- (num_pages * sizeof(pages[0])),
- GFP_KERNEL);
+ pagelist = dma_zalloc_coherent(g_dev,
+ pagelist_size,
+ dma_addr,
+ GFP_KERNEL);

vchiq_log_trace(vchiq_arm_log_level, "create_pagelist - %pK",
pagelist);
@@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
addrs = pagelist->addrs;
need_release = (unsigned long *)(addrs + num_pages);
pages = (struct page **)(addrs + num_pages + 1);
+ scatterlist = (struct scatterlist *)(pages + num_pages);

if (is_vmalloc_addr(buf)) {
- int dir = (type == PAGELIST_WRITE) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE;
unsigned long length = count;
unsigned int off = offset;

@@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
if (bytes > length)
bytes = length;
pages[actual_pages] = pg;
- dmac_map_area(page_address(pg) + off, bytes, dir);
length -= bytes;
off = 0;
}
@@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
actual_pages--;
put_page(pages[actual_pages]);
}
- kfree(pagelist);
+ dma_free_coherent(g_dev, pagelist_size,
+ pagelist, *dma_addr);
if (actual_pages == 0)
actual_pages = -ENOMEM;
return actual_pages;
@@ -450,29 +454,38 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
pagelist->type = type;
pagelist->offset = offset;

- /* Group the pages into runs of contiguous pages */
-
- base_addr = VCHIQ_ARM_ADDRESS(page_address(pages[0]));
- next_addr = base_addr + PAGE_SIZE;
- addridx = 0;
- run = 0;
-
- for (i = 1; i < num_pages; i++) {
- addr = VCHIQ_ARM_ADDRESS(page_address(pages[i]));
- if ((addr == next_addr) && (run < (PAGE_SIZE - 1))) {
- next_addr += PAGE_SIZE;
- run++;
- } else {
- addrs[addridx] = (unsigned long)base_addr + run;
- addridx++;
- base_addr = addr;
- next_addr = addr + PAGE_SIZE;
- run = 0;
- }
+ for (i = 0; i < num_pages; i++)
+ sg_set_page(scatterlist + i, pages[i], PAGE_SIZE, 0);
+
+ dma_buffers = dma_map_sg(g_dev,
+ scatterlist,
+ num_pages,
+ dir);
+
+ if (dma_buffers == 0) {
+ dma_free_coherent(g_dev, pagelist_size,
+ pagelist, *dma_addr);
+
+ return -EINTR;
}

- addrs[addridx] = (unsigned long)base_addr + run;
- addridx++;
+ k = 0;
+ for (i = 0; i < dma_buffers;) {
+ u32 section_length = 0;
+
+ for (j = i + 1; j < dma_buffers; j++) {
+ if (sg_dma_address(scatterlist + j) !=
+ sg_dma_address(scatterlist + j - 1) + PAGE_SIZE) {
+ break;
+ }
+ section_length++;
+ }
+
+ addrs[k] = (u32)sg_dma_address(scatterlist + i) |
+ section_length;
+ i = j;
+ k++;
+ }

/* Partial cache lines (fragments) require special measures */
if ((type == PAGELIST_READ) &&
@@ -482,7 +495,10 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
char *fragments;

if (down_interruptible(&g_free_fragments_sema) != 0) {
- kfree(pagelist);
+ dma_unmap_sg(g_dev, scatterlist, num_pages,
+ DMA_BIDIRECTIONAL);
+ dma_free_coherent(g_dev, pagelist_size,
+ pagelist, *dma_addr);
return -EINTR;
}

@@ -497,29 +513,42 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
(fragments - g_fragments_base) / g_fragments_size;
}

- dmac_flush_range(pagelist, addrs + num_pages);
-
*ppagelist = pagelist;

return 0;
}

static void
-free_pagelist(PAGELIST_T *pagelist, int actual)
+free_pagelist(dma_addr_t dma_addr, PAGELIST_T *pagelist, int actual)
{
unsigned long *need_release;
struct page **pages;
unsigned int num_pages, i;
+ size_t pagelist_size;
+ struct scatterlist *scatterlist;
+ int dir;

vchiq_log_trace(vchiq_arm_log_level, "free_pagelist - %pK, %d",
pagelist, actual);

+ dir = (pagelist->type == PAGELIST_WRITE) ? DMA_TO_DEVICE :
+ DMA_FROM_DEVICE;
+
num_pages =
(pagelist->length + pagelist->offset + PAGE_SIZE - 1) /
PAGE_SIZE;

+ pagelist_size = sizeof(PAGELIST_T) +
+ (num_pages * sizeof(unsigned long)) +
+ sizeof(unsigned long) +
+ (num_pages * sizeof(pages[0]) +
+ (num_pages * sizeof(struct scatterlist)));
+
need_release = (unsigned long *)(pagelist->addrs + num_pages);
pages = (struct page **)(pagelist->addrs + num_pages + 1);
+ scatterlist = (struct scatterlist *)(pages + num_pages);
+
+ dma_unmap_sg(g_dev, scatterlist, num_pages, dir);

/* Deal with any partial cache lines (fragments) */
if (pagelist->type >= PAGELIST_READ_WITH_FRAGMENTS) {
@@ -569,8 +598,7 @@ free_pagelist(PAGELIST_T *pagelist, int actual)

if (bytes > length)
bytes = length;
- dmac_unmap_area(page_address(pg) + offset,
- bytes, DMA_FROM_DEVICE);
+
length -= bytes;
offset = 0;
set_page_dirty(pg);
@@ -579,5 +607,6 @@ free_pagelist(PAGELIST_T *pagelist, int actual)
}
}

- kfree(pagelist);
+ dma_free_coherent(g_dev, pagelist_size,
+ pagelist, dma_addr);
}
--
2.9.3


2016-10-24 13:24:17

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

On Sun, Oct 23, 2016 at 10:29:32PM -0700, [email protected] wrote:
> From: Michael Zoran <[email protected]>
>
> The original arm implementation uses dmac_map_area which is not
> portable. Replace it with an architecture neutral version
> which uses dma_map_sg.
>
> As you can see that for larger page sizes, the dma_map_sg
> implementation is faster then the original unportable dma_map_area
> implementation.
>
> Test dmac_map_area dma_map_page dma_map_sg
> vchiq_test -b 4 10000 51us/iter 76us/iter 76us
> vchiq_test -b 8 10000 70us/iter 82us/iter 91us
> vchiq_test -b 16 10000 94us/iter 118us/iter 121us
> vchiq_test -b 32 10000 146us/iter 173us/iter 187us
> vchiq_test -b 64 10000 263us/iter 328us/iter 299us
> vchiq_test -b 128 10000 529us/iter 631us/iter 595us
> vchiq_test -b 256 10000 2285us/iter 2275us/iter 2001us
> vchiq_test -b 512 10000 4372us/iter 4616us/iter 4123us
>
> For message sizes >= 64KB, dma_map_sg is faster then dma_map_page.
>
> For message size >= 256KB, the dma_map_sg is the fastest
> implementation.

What is the "normal" message size value when using this driver?

thanks,

greg k-h

2016-10-24 14:56:12

by Michael Zoran

[permalink] [raw]
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

On Mon, 2016-10-24 at 15:24 +0200, Greg KH wrote:
> On Sun, Oct 23, 2016 at 10:29:32PM -0700, [email protected] wrote:
> > From: Michael Zoran <[email protected]>
> >
> > The original arm implementation uses dmac_map_area which is not
> > portable.  Replace it with an architecture neutral version
> > which uses dma_map_sg.
> >
> > As you can see that for larger page sizes, the dma_map_sg
> > implementation is faster then the original unportable dma_map_area
> > implementation.  
> >
> > Test                       dmac_map_area   dma_map_page dma_map_sg
> > vchiq_test -b 4 10000      51us/iter       76us/iter    76us
> > vchiq_test -b 8 10000      70us/iter       82us/iter    91us
> > vchiq_test -b 16 10000     94us/iter       118us/iter   121us
> > vchiq_test -b 32 10000     146us/iter      173us/iter   187us
> > vchiq_test -b 64 10000     263us/iter      328us/iter   299us
> > vchiq_test -b 128 10000    529us/iter      631us/iter   595us
> > vchiq_test -b 256 10000    2285us/iter     2275us/iter  2001us
> > vchiq_test -b 512 10000    4372us/iter     4616us/iter  4123us
> >
> > For message sizes >= 64KB, dma_map_sg is faster then dma_map_page.
> >
> > For message size >= 256KB, the dma_map_sg is the fastest
> > implementation.
>
> What is the "normal" message size value when using this driver?
>
> thanks,
>
> greg k-h

I honestly have no idea. From what I understand, the only code that
actually uses this code path is the closed source multimedia drivers
which I know nothing about.

Obviously, one approach would be to have the kernel collect data on
what the typical size is after running some benchmarks.


2016-10-24 16:56:40

by Eric Anholt

[permalink] [raw]
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

Greg KH <[email protected]> writes:

> On Sun, Oct 23, 2016 at 10:29:32PM -0700, [email protected] wrote:
>> From: Michael Zoran <[email protected]>
>>
>> The original arm implementation uses dmac_map_area which is not
>> portable. Replace it with an architecture neutral version
>> which uses dma_map_sg.
>>
>> As you can see that for larger page sizes, the dma_map_sg
>> implementation is faster then the original unportable dma_map_area
>> implementation.
>>
>> Test dmac_map_area dma_map_page dma_map_sg
>> vchiq_test -b 4 10000 51us/iter 76us/iter 76us
>> vchiq_test -b 8 10000 70us/iter 82us/iter 91us
>> vchiq_test -b 16 10000 94us/iter 118us/iter 121us
>> vchiq_test -b 32 10000 146us/iter 173us/iter 187us
>> vchiq_test -b 64 10000 263us/iter 328us/iter 299us
>> vchiq_test -b 128 10000 529us/iter 631us/iter 595us
>> vchiq_test -b 256 10000 2285us/iter 2275us/iter 2001us
>> vchiq_test -b 512 10000 4372us/iter 4616us/iter 4123us
>>
>> For message sizes >= 64KB, dma_map_sg is faster then dma_map_page.
>>
>> For message size >= 256KB, the dma_map_sg is the fastest
>> implementation.
>
> What is the "normal" message size value when using this driver?

For the contents of a bulk transfer, the performance case you care about
is moving an image (video decode, camera, etc.), so on the order of 1MB.


Attachments:
signature.asc (800.00 B)

2016-10-24 17:31:58

by Eric Anholt

[permalink] [raw]
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

[email protected] writes:

> From: Michael Zoran <[email protected]>
>
> The original arm implementation uses dmac_map_area which is not
> portable. Replace it with an architecture neutral version
> which uses dma_map_sg.
>
> As you can see that for larger page sizes, the dma_map_sg
> implementation is faster then the original unportable dma_map_area
> implementation.
>
> Test dmac_map_area dma_map_page dma_map_sg
> vchiq_test -b 4 10000 51us/iter 76us/iter 76us
> vchiq_test -b 8 10000 70us/iter 82us/iter 91us
> vchiq_test -b 16 10000 94us/iter 118us/iter 121us
> vchiq_test -b 32 10000 146us/iter 173us/iter 187us
> vchiq_test -b 64 10000 263us/iter 328us/iter 299us
> vchiq_test -b 128 10000 529us/iter 631us/iter 595us
> vchiq_test -b 256 10000 2285us/iter 2275us/iter 2001us
> vchiq_test -b 512 10000 4372us/iter 4616us/iter 4123us
>
> For message sizes >= 64KB, dma_map_sg is faster then dma_map_page.
>
> For message size >= 256KB, the dma_map_sg is the fastest
> implementation.
>
> Signed-off-by: Michael Zoran <[email protected]>
> ---
> .../interface/vchiq_arm/vchiq_2835_arm.c | 153 ++++++++++++---------
> 1 file changed, 91 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
> index 98c6819..5ca66ee 100644
> --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
> +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
> @@ -45,13 +45,8 @@
> #include <asm/pgtable.h>
> #include <soc/bcm2835/raspberrypi-firmware.h>
>
> -#define dmac_map_area __glue(_CACHE,_dma_map_area)
> -#define dmac_unmap_area __glue(_CACHE,_dma_unmap_area)
> -
> #define TOTAL_SLOTS (VCHIQ_SLOT_ZERO_SLOTS + 2 * 32)
>
> -#define VCHIQ_ARM_ADDRESS(x) ((void *)((char *)x + g_virt_to_bus_offset))
> -
> #include "vchiq_arm.h"
> #include "vchiq_2835.h"
> #include "vchiq_connected.h"
> @@ -73,7 +68,7 @@ static unsigned int g_fragments_size;
> static char *g_fragments_base;
> static char *g_free_fragments;
> static struct semaphore g_free_fragments_sema;
> -static unsigned long g_virt_to_bus_offset;
> +static struct device *g_dev;
>
> extern int vchiq_arm_log_level;
>
> @@ -84,10 +79,11 @@ vchiq_doorbell_irq(int irq, void *dev_id);
>
> static int
> create_pagelist(char __user *buf, size_t count, unsigned short type,
> - struct task_struct *task, PAGELIST_T ** ppagelist);
> + struct task_struct *task, PAGELIST_T **ppagelist,
> + dma_addr_t *dma_addr);
>
> static void
> -free_pagelist(PAGELIST_T *pagelist, int actual);
> +free_pagelist(dma_addr_t dma_addr, PAGELIST_T *pagelist, int actual);
>
> int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
> {
> @@ -101,8 +97,6 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
> int slot_mem_size, frag_mem_size;
> int err, irq, i;
>
> - g_virt_to_bus_offset = virt_to_dma(dev, (void *)0);
> -
> (void)of_property_read_u32(dev->of_node, "cache-line-size",
> &g_cache_line_size);
> g_fragments_size = 2 * g_cache_line_size;
> @@ -118,7 +112,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
> return -ENOMEM;
> }
>
> - WARN_ON(((int)slot_mem & (PAGE_SIZE - 1)) != 0);
> + WARN_ON(((unsigned long)slot_mem & (PAGE_SIZE - 1)) != 0);
>
> vchiq_slot_zero = vchiq_init_slots(slot_mem, slot_mem_size);
> if (!vchiq_slot_zero)
> @@ -170,6 +164,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
> return err ? : -ENXIO;
> }
>
> + g_dev = dev;
> vchiq_log_info(vchiq_arm_log_level,
> "vchiq_init - done (slots %pK, phys %pad)",
> vchiq_slot_zero, &slot_phys);
> @@ -233,6 +228,7 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
> {
> PAGELIST_T *pagelist;
> int ret;
> + dma_addr_t dma_addr;
>
> WARN_ON(memhandle != VCHI_MEM_HANDLE_INVALID);
>
> @@ -241,12 +237,14 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
> ? PAGELIST_READ
> : PAGELIST_WRITE,
> current,
> - &pagelist);
> + &pagelist,
> + &dma_addr);
> +
> if (ret != 0)
> return VCHIQ_ERROR;
>
> bulk->handle = memhandle;
> - bulk->data = VCHIQ_ARM_ADDRESS(pagelist);
> + bulk->data = (void *)(unsigned long)dma_addr;
>
> /* Store the pagelist address in remote_data, which isn't used by the
> slave. */
> @@ -259,7 +257,8 @@ void
> vchiq_complete_bulk(VCHIQ_BULK_T *bulk)
> {
> if (bulk && bulk->remote_data && bulk->actual)
> - free_pagelist((PAGELIST_T *)bulk->remote_data, bulk->actual);
> + free_pagelist((dma_addr_t)(unsigned long)bulk->data,
> + (PAGELIST_T *)bulk->remote_data, bulk->actual);
> }
>
> void
> @@ -353,38 +352,44 @@ vchiq_doorbell_irq(int irq, void *dev_id)
> ** obscuring the new data underneath. We can solve this by transferring the
> ** partial cache lines separately, and allowing the ARM to copy into the
> ** cached area.
> -
> -** N.B. This implementation plays slightly fast and loose with the Linux
> -** driver programming rules, e.g. its use of dmac_map_area instead of
> -** dma_map_single, but it isn't a multi-platform driver and it benefits
> -** from increased speed as a result.
> */
>
> static int
> create_pagelist(char __user *buf, size_t count, unsigned short type,
> - struct task_struct *task, PAGELIST_T ** ppagelist)
> + struct task_struct *task, PAGELIST_T **ppagelist,
> + dma_addr_t *dma_addr)
> {
> PAGELIST_T *pagelist;
> struct page **pages;
> - unsigned long *addrs;
> - unsigned int num_pages, offset, i;
> - char *addr, *base_addr, *next_addr;
> - int run, addridx, actual_pages;
> + u32 *addrs;
> + unsigned int num_pages, offset, i, j, k;
> + int actual_pages;
> unsigned long *need_release;
> + size_t pagelist_size;
> + struct scatterlist *scatterlist;
> + int dma_buffers;
> + int dir;
>
> - offset = (unsigned int)buf & (PAGE_SIZE - 1);
> + offset = ((unsigned int)(unsigned long)buf & (PAGE_SIZE - 1));
> num_pages = (count + offset + PAGE_SIZE - 1) / PAGE_SIZE;
>
> + pagelist_size = sizeof(PAGELIST_T) +
> + (num_pages * sizeof(unsigned long)) +
> + sizeof(unsigned long) +
> + (num_pages * sizeof(pages[0]) +
> + (num_pages * sizeof(struct scatterlist)));
> +
> *ppagelist = NULL;
>
> + dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
> +
> /* Allocate enough storage to hold the page pointers and the page
> ** list
> */
> - pagelist = kmalloc(sizeof(PAGELIST_T) +
> - (num_pages * sizeof(unsigned long)) +
> - sizeof(unsigned long) +
> - (num_pages * sizeof(pages[0])),
> - GFP_KERNEL);
> + pagelist = dma_zalloc_coherent(g_dev,
> + pagelist_size,
> + dma_addr,
> + GFP_KERNEL);
>
> vchiq_log_trace(vchiq_arm_log_level, "create_pagelist - %pK",
> pagelist);
> @@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
> addrs = pagelist->addrs;
> need_release = (unsigned long *)(addrs + num_pages);
> pages = (struct page **)(addrs + num_pages + 1);
> + scatterlist = (struct scatterlist *)(pages + num_pages);
>
> if (is_vmalloc_addr(buf)) {
> - int dir = (type == PAGELIST_WRITE) ?
> - DMA_TO_DEVICE : DMA_FROM_DEVICE;
> unsigned long length = count;
> unsigned int off = offset;
>
> @@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
> if (bytes > length)
> bytes = length;
> pages[actual_pages] = pg;
> - dmac_map_area(page_address(pg) + off, bytes, dir);
> length -= bytes;
> off = 0;
> }
> @@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
> actual_pages--;
> put_page(pages[actual_pages]);
> }
> - kfree(pagelist);
> + dma_free_coherent(g_dev, pagelist_size,
> + pagelist, *dma_addr);
> if (actual_pages == 0)
> actual_pages = -ENOMEM;
> return actual_pages;
> @@ -450,29 +454,38 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
> pagelist->type = type;
> pagelist->offset = offset;
>
> - /* Group the pages into runs of contiguous pages */
> -
> - base_addr = VCHIQ_ARM_ADDRESS(page_address(pages[0]));
> - next_addr = base_addr + PAGE_SIZE;
> - addridx = 0;
> - run = 0;
> -
> - for (i = 1; i < num_pages; i++) {
> - addr = VCHIQ_ARM_ADDRESS(page_address(pages[i]));
> - if ((addr == next_addr) && (run < (PAGE_SIZE - 1))) {
> - next_addr += PAGE_SIZE;
> - run++;
> - } else {
> - addrs[addridx] = (unsigned long)base_addr + run;
> - addridx++;
> - base_addr = addr;
> - next_addr = addr + PAGE_SIZE;
> - run = 0;
> - }
> + for (i = 0; i < num_pages; i++)
> + sg_set_page(scatterlist + i, pages[i], PAGE_SIZE, 0);
> +
> + dma_buffers = dma_map_sg(g_dev,
> + scatterlist,
> + num_pages,
> + dir);
> +
> + if (dma_buffers == 0) {
> + dma_free_coherent(g_dev, pagelist_size,
> + pagelist, *dma_addr);
> +
> + return -EINTR;
> }
>
> - addrs[addridx] = (unsigned long)base_addr + run;
> - addridx++;
> + k = 0;
> + for (i = 0; i < dma_buffers;) {
> + u32 section_length = 0;
> +
> + for (j = i + 1; j < dma_buffers; j++) {
> + if (sg_dma_address(scatterlist + j) !=
> + sg_dma_address(scatterlist + j - 1) + PAGE_SIZE) {
> + break;
> + }
> + section_length++;
> + }
> +
> + addrs[k] = (u32)sg_dma_address(scatterlist + i) |
> + section_length;

It looks like scatterlist may not be just an array, so I think this
whole loop wants to be something like:

for_each_sg(scatterlist, sg, num_pages, i) {
u32 len = sg_dma_len(sg);
u32 addr = sg_dma_address(sg);

if (k > 0 && addrs[k - 1] & PAGE_MASK +
(addrs[k - 1] & ~PAGE_MASK) << PAGE_SHIFT) == addr) {
addrs[k - 1] += len;
} else {
addrs[k++] = addr | len;
}
}

Note the use of sg_dma_len(), since sg entries may be more than one
page. I don't think any merging is actually happening on our platform
currently, thus why I left the inner loop.

Thanks for taking on doing this conversion! This code's going to be so
much nicer when you're done.


Attachments:
signature.asc (800.00 B)

2016-10-25 15:07:21

by Michael Zoran

[permalink] [raw]
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

On Tue, 2016-10-25 at 08:00 -0700, Michael Zoran wrote:
> On Mon, 2016-10-24 at 10:31 -0700, Eric Anholt wrote:
> > [email protected] writes:
> >
> > >  */
> > >  
> > >  static int
> > >  create_pagelist(char __user *buf, size_t count, unsigned short
> > > type,
> > > - struct task_struct *task, PAGELIST_T ** ppagelist)
> > > + struct task_struct *task, PAGELIST_T
> > > **ppagelist,
> > > + dma_addr_t *dma_addr)
> > >  {
> > >   PAGELIST_T *pagelist;
> > >   struct page **pages;
> > > - unsigned long *addrs;
> > > - unsigned int num_pages, offset, i;
> > > - char *addr, *base_addr, *next_addr;
> > > - int run, addridx, actual_pages;
> > > + u32 *addrs;
> > > + unsigned int num_pages, offset, i, j, k;
> > > + int actual_pages;
> > >          unsigned long *need_release;
> > > + size_t pagelist_size;
> > > + struct scatterlist *scatterlist;
> > > + int dma_buffers;
> > > + int dir;
> > >  
> > > - offset = (unsigned int)buf & (PAGE_SIZE - 1);
> > > + offset = ((unsigned int)(unsigned long)buf & (PAGE_SIZE
> > > -
> > > 1));
> > >   num_pages = (count + offset + PAGE_SIZE - 1) /
> > > PAGE_SIZE;
> > >  
> > > + pagelist_size = sizeof(PAGELIST_T) +
> > > + (num_pages * sizeof(unsigned long)) +
> > > + sizeof(unsigned long) +
> > > + (num_pages * sizeof(pages[0]) +
> > > + (num_pages * sizeof(struct
> > > scatterlist)));
> > > +
> > >   *ppagelist = NULL;
> > >  
> > > + dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE :
> > > DMA_FROM_DEVICE;
> > > +
> > >   /* Allocate enough storage to hold the page pointers and
> > > the page
> > >   ** list
> > >   */
> > > - pagelist = kmalloc(sizeof(PAGELIST_T) +
> > > -                           (num_pages * sizeof(unsigned long)) +
> > > -                           sizeof(unsigned long) +
> > > -                           (num_pages * sizeof(pages[0])),
> > > -                           GFP_KERNEL);
> > > + pagelist = dma_zalloc_coherent(g_dev,
> > > +        pagelist_size,
> > > +        dma_addr,
> > > +        GFP_KERNEL);
> > >  
> > >   vchiq_log_trace(vchiq_arm_log_level, "create_pagelist -
> > > %pK",
> > >   pagelist);
> > > @@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t
> > > count, unsigned short type,
> > >   addrs = pagelist->addrs;
> > >          need_release = (unsigned long *)(addrs + num_pages);
> > >   pages = (struct page **)(addrs + num_pages + 1);
> > > + scatterlist = (struct scatterlist *)(pages + num_pages);
> > >  
> > >   if (is_vmalloc_addr(buf)) {
> > > - int dir = (type == PAGELIST_WRITE) ?
> > > - DMA_TO_DEVICE : DMA_FROM_DEVICE;
> > >   unsigned long length = count;
> > >   unsigned int off = offset;
> > >  
> > > @@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t
> > > count,
> > > unsigned short type,
> > >   if (bytes > length)
> > >   bytes = length;
> > >   pages[actual_pages] = pg;
> > > - dmac_map_area(page_address(pg) + off,
> > > bytes, dir);
> > >   length -= bytes;
> > >   off = 0;
> > >   }
> > > @@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t
> > > count,
> > > unsigned short type,
> > >   actual_pages--;
> > >   put_page(pages[actual_pages]);
> > >   }
> > > - kfree(pagelist);
> > > + dma_free_coherent(g_dev, pagelist_size,
> > > +   pagelist, *dma_addr);
> > >   if (actual_pages == 0)
> > >   actual_pages = -ENOMEM;
> > >   return actual_pages;
> > > @@ -450,29 +454,38 @@ create_pagelist(char __user *buf, size_t
> > > count, unsigned short type,
> > >   pagelist->type = type;
> > >   pagelist->offset = offset;
> > >  
> > > - /* Group the pages into runs of contiguous pages */
> > > -
> > > - base_addr = VCHIQ_ARM_ADDRESS(page_address(pages[0]));
> > > - next_addr = base_addr + PAGE_SIZE;
> > > - addridx = 0;
> > > - run = 0;
> > > -
> > > - for (i = 1; i < num_pages; i++) {
> > > - addr =
> > > VCHIQ_ARM_ADDRESS(page_address(pages[i]));
> > > - if ((addr == next_addr) && (run < (PAGE_SIZE -
> > > 1))) {
> > > - next_addr += PAGE_SIZE;
> > > - run++;
> > > - } else {
> > > - addrs[addridx] = (unsigned
> > > long)base_addr
> > > + run;
> > > - addridx++;
> > > - base_addr = addr;
> > > - next_addr = addr + PAGE_SIZE;
> > > - run = 0;
> > > - }
> > > + for (i = 0; i < num_pages; i++)
> > > + sg_set_page(scatterlist + i, pages[i],
> > > PAGE_SIZE,
> > > 0);
> > > +
> > > + dma_buffers = dma_map_sg(g_dev,
> > > +  scatterlist,
> > > +  num_pages,
> > > +  dir);
> > > +
> > > + if (dma_buffers == 0) {
> > > + dma_free_coherent(g_dev, pagelist_size,
> > > +   pagelist, *dma_addr);
> > > +
> > > + return -EINTR;
> > >   }
> > >  
> > > - addrs[addridx] = (unsigned long)base_addr + run;
> > > - addridx++;
> > > + k = 0;
> > > + for (i = 0; i < dma_buffers;) {
> > > + u32 section_length = 0;
> > > +
> > > + for (j = i + 1; j < dma_buffers; j++) {
> > > + if (sg_dma_address(scatterlist + j) !=
> > > +     sg_dma_address(scatterlist + j - 1)
> > > +
> > > PAGE_SIZE) {
> > > + break;
> > > + }
> > > + section_length++;
> > > + }
> > > +
> > > + addrs[k] = (u32)sg_dma_address(scatterlist + i)
> > > |
> > > + section_length;
> >
> > It looks like scatterlist may not be just an array, so I think this
> > whole loop wants to be something like:
> >
> > for_each_sg(scatterlist, sg, num_pages, i) {
> > u32 len = sg_dma_len(sg);
> >         u32 addr = sg_dma_address(sg);
> >
> >         if (k > 0 && addrs[k - 1] & PAGE_MASK +
> >             (addrs[k - 1] & ~PAGE_MASK) << PAGE_SHIFT) == addr) {
> >          addrs[k - 1] += len;
> >         } else {
> >          addrs[k++] = addr | len;
> >         }
> > }
> >
> > Note the use of sg_dma_len(), since sg entries may be more than one
> > page.  I don't think any merging is actually happening on our
> > platform
> > currently, thus why I left the inner loop.
> >
> > Thanks for taking on doing this conversion!  This code's going to
> > be
> > so
> > much nicer when you're done.
>
> Thanks for looking at this.  
>
> While I understand the use of sg_dma_len, I don't understand totally
> the need for "for_each_sg". The scatterlist can be part of a scatter
> table with all kinds of complex chaining, but in this case it is
> directly allocated as an array at the top of the function.
>
> Also note that the addrs[] list is passed to the firmware and it
> requires all the items of the list be paged aligned and be a multiple
> of the page size.  So I'm also a bit confused about the need for
> handling scatterlists that are not page aligned or a multiple of
> pages.
>
>

Sorry, but I forgot to add that the addrs list is a address anded with
a page count, not a byte count. The example you have sent appears at
a glance to look like it is anding a byte count with the address.

2016-10-25 15:10:01

by Michael Zoran

[permalink] [raw]
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

On Mon, 2016-10-24 at 10:31 -0700, Eric Anholt wrote:
> [email protected] writes:
>
> >  */
> >  
> >  static int
> >  create_pagelist(char __user *buf, size_t count, unsigned short
> > type,
> > - struct task_struct *task, PAGELIST_T ** ppagelist)
> > + struct task_struct *task, PAGELIST_T **ppagelist,
> > + dma_addr_t *dma_addr)
> >  {
> >   PAGELIST_T *pagelist;
> >   struct page **pages;
> > - unsigned long *addrs;
> > - unsigned int num_pages, offset, i;
> > - char *addr, *base_addr, *next_addr;
> > - int run, addridx, actual_pages;
> > + u32 *addrs;
> > + unsigned int num_pages, offset, i, j, k;
> > + int actual_pages;
> >          unsigned long *need_release;
> > + size_t pagelist_size;
> > + struct scatterlist *scatterlist;
> > + int dma_buffers;
> > + int dir;
> >  
> > - offset = (unsigned int)buf & (PAGE_SIZE - 1);
> > + offset = ((unsigned int)(unsigned long)buf & (PAGE_SIZE -
> > 1));
> >   num_pages = (count + offset + PAGE_SIZE - 1) / PAGE_SIZE;
> >  
> > + pagelist_size = sizeof(PAGELIST_T) +
> > + (num_pages * sizeof(unsigned long)) +
> > + sizeof(unsigned long) +
> > + (num_pages * sizeof(pages[0]) +
> > + (num_pages * sizeof(struct scatterlist)));
> > +
> >   *ppagelist = NULL;
> >  
> > + dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE :
> > DMA_FROM_DEVICE;
> > +
> >   /* Allocate enough storage to hold the page pointers and
> > the page
> >   ** list
> >   */
> > - pagelist = kmalloc(sizeof(PAGELIST_T) +
> > -                           (num_pages * sizeof(unsigned long)) +
> > -                           sizeof(unsigned long) +
> > -                           (num_pages * sizeof(pages[0])),
> > -                           GFP_KERNEL);
> > + pagelist = dma_zalloc_coherent(g_dev,
> > +        pagelist_size,
> > +        dma_addr,
> > +        GFP_KERNEL);
> >  
> >   vchiq_log_trace(vchiq_arm_log_level, "create_pagelist -
> > %pK",
> >   pagelist);
> > @@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t
> > count, unsigned short type,
> >   addrs = pagelist->addrs;
> >          need_release = (unsigned long *)(addrs + num_pages);
> >   pages = (struct page **)(addrs + num_pages + 1);
> > + scatterlist = (struct scatterlist *)(pages + num_pages);
> >  
> >   if (is_vmalloc_addr(buf)) {
> > - int dir = (type == PAGELIST_WRITE) ?
> > - DMA_TO_DEVICE : DMA_FROM_DEVICE;
> >   unsigned long length = count;
> >   unsigned int off = offset;
> >  
> > @@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t count,
> > unsigned short type,
> >   if (bytes > length)
> >   bytes = length;
> >   pages[actual_pages] = pg;
> > - dmac_map_area(page_address(pg) + off,
> > bytes, dir);
> >   length -= bytes;
> >   off = 0;
> >   }
> > @@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t count,
> > unsigned short type,
> >   actual_pages--;
> >   put_page(pages[actual_pages]);
> >   }
> > - kfree(pagelist);
> > + dma_free_coherent(g_dev, pagelist_size,
> > +   pagelist, *dma_addr);
> >   if (actual_pages == 0)
> >   actual_pages = -ENOMEM;
> >   return actual_pages;
> > @@ -450,29 +454,38 @@ create_pagelist(char __user *buf, size_t
> > count, unsigned short type,
> >   pagelist->type = type;
> >   pagelist->offset = offset;
> >  
> > - /* Group the pages into runs of contiguous pages */
> > -
> > - base_addr = VCHIQ_ARM_ADDRESS(page_address(pages[0]));
> > - next_addr = base_addr + PAGE_SIZE;
> > - addridx = 0;
> > - run = 0;
> > -
> > - for (i = 1; i < num_pages; i++) {
> > - addr = VCHIQ_ARM_ADDRESS(page_address(pages[i]));
> > - if ((addr == next_addr) && (run < (PAGE_SIZE -
> > 1))) {
> > - next_addr += PAGE_SIZE;
> > - run++;
> > - } else {
> > - addrs[addridx] = (unsigned long)base_addr
> > + run;
> > - addridx++;
> > - base_addr = addr;
> > - next_addr = addr + PAGE_SIZE;
> > - run = 0;
> > - }
> > + for (i = 0; i < num_pages; i++)
> > + sg_set_page(scatterlist + i, pages[i], PAGE_SIZE,
> > 0);
> > +
> > + dma_buffers = dma_map_sg(g_dev,
> > +  scatterlist,
> > +  num_pages,
> > +  dir);
> > +
> > + if (dma_buffers == 0) {
> > + dma_free_coherent(g_dev, pagelist_size,
> > +   pagelist, *dma_addr);
> > +
> > + return -EINTR;
> >   }
> >  
> > - addrs[addridx] = (unsigned long)base_addr + run;
> > - addridx++;
> > + k = 0;
> > + for (i = 0; i < dma_buffers;) {
> > + u32 section_length = 0;
> > +
> > + for (j = i + 1; j < dma_buffers; j++) {
> > + if (sg_dma_address(scatterlist + j) !=
> > +     sg_dma_address(scatterlist + j - 1) +
> > PAGE_SIZE) {
> > + break;
> > + }
> > + section_length++;
> > + }
> > +
> > + addrs[k] = (u32)sg_dma_address(scatterlist + i) |
> > + section_length;
>
> It looks like scatterlist may not be just an array, so I think this
> whole loop wants to be something like:
>
> for_each_sg(scatterlist, sg, num_pages, i) {
> u32 len = sg_dma_len(sg);
>         u32 addr = sg_dma_address(sg);
>
>         if (k > 0 && addrs[k - 1] & PAGE_MASK +
>             (addrs[k - 1] & ~PAGE_MASK) << PAGE_SHIFT) == addr) {
>          addrs[k - 1] += len;
>         } else {
>          addrs[k++] = addr | len;
>         }
> }
>
> Note the use of sg_dma_len(), since sg entries may be more than one
> page.  I don't think any merging is actually happening on our
> platform
> currently, thus why I left the inner loop.
>
> Thanks for taking on doing this conversion!  This code's going to be
> so
> much nicer when you're done.

Thanks for looking at this.  

While I understand the use of sg_dma_len, I don't understand totally
the need for "for_each_sg". The scatterlist can be part of a scatter
table with all kinds of complex chaining, but in this case it is
directly allocated as an array at the top of the function.

Also note that the addrs[] list is passed to the firmware and it
requires all the items of the list be page aligned and be a multiple
of the page size. So I'm also a bit confused about the need for
handling scatterlists that are not page aligned or a multiple of pages.





2016-10-25 16:17:03

by Eric Anholt

[permalink] [raw]
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

Michael Zoran <[email protected]> writes:

> On Tue, 2016-10-25 at 08:00 -0700, Michael Zoran wrote:
>> On Mon, 2016-10-24 at 10:31 -0700, Eric Anholt wrote:
>> > [email protected] writes:
>> >
>> > >  */
>> > >  
>> > >  static int
>> > >  create_pagelist(char __user *buf, size_t count, unsigned short
>> > > type,
>> > > - struct task_struct *task, PAGELIST_T ** ppagelist)
>> > > + struct task_struct *task, PAGELIST_T
>> > > **ppagelist,
>> > > + dma_addr_t *dma_addr)
>> > >  {
>> > >   PAGELIST_T *pagelist;
>> > >   struct page **pages;
>> > > - unsigned long *addrs;
>> > > - unsigned int num_pages, offset, i;
>> > > - char *addr, *base_addr, *next_addr;
>> > > - int run, addridx, actual_pages;
>> > > + u32 *addrs;
>> > > + unsigned int num_pages, offset, i, j, k;
>> > > + int actual_pages;
>> > >          unsigned long *need_release;
>> > > + size_t pagelist_size;
>> > > + struct scatterlist *scatterlist;
>> > > + int dma_buffers;
>> > > + int dir;
>> > >  
>> > > - offset = (unsigned int)buf & (PAGE_SIZE - 1);
>> > > + offset = ((unsigned int)(unsigned long)buf & (PAGE_SIZE
>> > > -
>> > > 1));
>> > >   num_pages = (count + offset + PAGE_SIZE - 1) /
>> > > PAGE_SIZE;
>> > >  
>> > > + pagelist_size = sizeof(PAGELIST_T) +
>> > > + (num_pages * sizeof(unsigned long)) +
>> > > + sizeof(unsigned long) +
>> > > + (num_pages * sizeof(pages[0]) +
>> > > + (num_pages * sizeof(struct
>> > > scatterlist)));
>> > > +
>> > >   *ppagelist = NULL;
>> > >  
>> > > + dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE :
>> > > DMA_FROM_DEVICE;
>> > > +
>> > >   /* Allocate enough storage to hold the page pointers and
>> > > the page
>> > >   ** list
>> > >   */
>> > > - pagelist = kmalloc(sizeof(PAGELIST_T) +
>> > > -                           (num_pages * sizeof(unsigned long)) +
>> > > -                           sizeof(unsigned long) +
>> > > -                           (num_pages * sizeof(pages[0])),
>> > > -                           GFP_KERNEL);
>> > > + pagelist = dma_zalloc_coherent(g_dev,
>> > > +        pagelist_size,
>> > > +        dma_addr,
>> > > +        GFP_KERNEL);
>> > >  
>> > >   vchiq_log_trace(vchiq_arm_log_level, "create_pagelist -
>> > > %pK",
>> > >   pagelist);
>> > > @@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t
>> > > count, unsigned short type,
>> > >   addrs = pagelist->addrs;
>> > >          need_release = (unsigned long *)(addrs + num_pages);
>> > >   pages = (struct page **)(addrs + num_pages + 1);
>> > > + scatterlist = (struct scatterlist *)(pages + num_pages);
>> > >  
>> > >   if (is_vmalloc_addr(buf)) {
>> > > - int dir = (type == PAGELIST_WRITE) ?
>> > > - DMA_TO_DEVICE : DMA_FROM_DEVICE;
>> > >   unsigned long length = count;
>> > >   unsigned int off = offset;
>> > >  
>> > > @@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t
>> > > count,
>> > > unsigned short type,
>> > >   if (bytes > length)
>> > >   bytes = length;
>> > >   pages[actual_pages] = pg;
>> > > - dmac_map_area(page_address(pg) + off,
>> > > bytes, dir);
>> > >   length -= bytes;
>> > >   off = 0;
>> > >   }
>> > > @@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t
>> > > count,
>> > > unsigned short type,
>> > >   actual_pages--;
>> > >   put_page(pages[actual_pages]);
>> > >   }
>> > > - kfree(pagelist);
>> > > + dma_free_coherent(g_dev, pagelist_size,
>> > > +   pagelist, *dma_addr);
>> > >   if (actual_pages == 0)
>> > >   actual_pages = -ENOMEM;
>> > >   return actual_pages;
>> > > @@ -450,29 +454,38 @@ create_pagelist(char __user *buf, size_t
>> > > count, unsigned short type,
>> > >   pagelist->type = type;
>> > >   pagelist->offset = offset;
>> > >  
>> > > - /* Group the pages into runs of contiguous pages */
>> > > -
>> > > - base_addr = VCHIQ_ARM_ADDRESS(page_address(pages[0]));
>> > > - next_addr = base_addr + PAGE_SIZE;
>> > > - addridx = 0;
>> > > - run = 0;
>> > > -
>> > > - for (i = 1; i < num_pages; i++) {
>> > > - addr =
>> > > VCHIQ_ARM_ADDRESS(page_address(pages[i]));
>> > > - if ((addr == next_addr) && (run < (PAGE_SIZE -
>> > > 1))) {
>> > > - next_addr += PAGE_SIZE;
>> > > - run++;
>> > > - } else {
>> > > - addrs[addridx] = (unsigned
>> > > long)base_addr
>> > > + run;
>> > > - addridx++;
>> > > - base_addr = addr;
>> > > - next_addr = addr + PAGE_SIZE;
>> > > - run = 0;
>> > > - }
>> > > + for (i = 0; i < num_pages; i++)
>> > > + sg_set_page(scatterlist + i, pages[i],
>> > > PAGE_SIZE,
>> > > 0);
>> > > +
>> > > + dma_buffers = dma_map_sg(g_dev,
>> > > +  scatterlist,
>> > > +  num_pages,
>> > > +  dir);
>> > > +
>> > > + if (dma_buffers == 0) {
>> > > + dma_free_coherent(g_dev, pagelist_size,
>> > > +   pagelist, *dma_addr);
>> > > +
>> > > + return -EINTR;
>> > >   }
>> > >  
>> > > - addrs[addridx] = (unsigned long)base_addr + run;
>> > > - addridx++;
>> > > + k = 0;
>> > > + for (i = 0; i < dma_buffers;) {
>> > > + u32 section_length = 0;
>> > > +
>> > > + for (j = i + 1; j < dma_buffers; j++) {
>> > > + if (sg_dma_address(scatterlist + j) !=
> > > > +     sg_dma_address(scatterlist + j - 1) +
> > > > PAGE_SIZE) {
>> > > + break;
>> > > + }
>> > > + section_length++;
>> > > + }
>> > > +
> > > > + addrs[k] = (u32)sg_dma_address(scatterlist + i) |
> > > > + section_length;
>> >
>> > It looks like scatterlist may not be just an array, so I think this
>> > whole loop wants to be something like:
>> >
>> > for_each_sg(scatterlist, sg, num_pages, i) {
>> > u32 len = sg_dma_len(sg);
>> >         u32 addr = sg_dma_address(sg);
>> >
>> >         if (k > 0 && addrs[k - 1] & PAGE_MASK +
>> >             (addrs[k - 1] & ~PAGE_MASK) << PAGE_SHIFT) == addr) {
>> >          addrs[k - 1] += len;
>> >         } else {
>> >          addrs[k++] = addr | len;
>> >         }
>> > }
>> >
>> > Note the use of sg_dma_len(), since sg entries may be more than one
>> > page.  I don't think any merging is actually happening on our
>> > platform
>> > currently, thus why I left the inner loop.
>> >
>> > Thanks for taking on doing this conversion!  This code's going to
>> > be
>> > so
>> > much nicer when you're done.
>>
>> Thanks for looking at this.  
>>
>> While I understand the use of sg_dma_len, I don't understand totally
>> the need for "for_each_sg". The scatterlist can be part of a scatter
>> table with all kinds of complex chaining, but in this case it is
>> directly allocated as an array at the top of the function.
>>
>> Also note that the addrs[] list is passed to the firmware and it
>> requires all the items of the list be page aligned and be a multiple
>> of the page size.  So I'm also a bit confused about the need for
>> handling scatterlists that are not page aligned or a multiple of
>> pages.

I'm sure we can assume that dma_map_sg is going to give us back
page-aligned mappings, because we only asked for page aligned mappings.

I'm just making suggestions that follow what is described in
DMA-API-HOWTO.txt here. Following the documentation seems like a good
idea unless there's a reason not to.

> Sorry, but I forgot to add that the addrs list is an address anded with
> a page count, not a byte count. The example you have sent appears at
> a glance to look like it is anding a byte count with the address.

Looking at the type of the field being read, it looked like a page count
to me, but I was unsure and the header didn't clarify. Looking again,
it must be bytes, so shifting would be necessary.


Attachments:
signature.asc (800.00 B)

2016-10-25 16:30:57

by Michael Zoran

[permalink] [raw]
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg

On Tue, 2016-10-25 at 09:16 -0700, Eric Anholt wrote:
> Michael Zoran <[email protected]> writes:
>
> > On Tue, 2016-10-25 at 08:00 -0700, Michael Zoran wrote:
> > > On Mon, 2016-10-24 at 10:31 -0700, Eric Anholt wrote:
> > > > [email protected] writes:
> > > >
> > > > >  */
> > > > >  
> > > > >  static int
> > > > >  create_pagelist(char __user *buf, size_t count, unsigned
> > > > > short
> > > > > type,
> > > > > - struct task_struct *task, PAGELIST_T ** ppagelist)
> > > > > + struct task_struct *task, PAGELIST_T
> > > > > **ppagelist,
> > > > > + dma_addr_t *dma_addr)
> > > > >  {
> > > > >   PAGELIST_T *pagelist;
> > > > >   struct page **pages;
> > > > > - unsigned long *addrs;
> > > > > - unsigned int num_pages, offset, i;
> > > > > - char *addr, *base_addr, *next_addr;
> > > > > - int run, addridx, actual_pages;
> > > > > + u32 *addrs;
> > > > > + unsigned int num_pages, offset, i, j, k;
> > > > > + int actual_pages;
> > > > >          unsigned long *need_release;
> > > > > + size_t pagelist_size;
> > > > > + struct scatterlist *scatterlist;
> > > > > + int dma_buffers;
> > > > > + int dir;
> > > > >  
> > > > > - offset = (unsigned int)buf & (PAGE_SIZE - 1);
> > > > > + offset = ((unsigned int)(unsigned long)buf &
> > > > > (PAGE_SIZE
> > > > > -
> > > > > 1));
> > > > >   num_pages = (count + offset + PAGE_SIZE - 1) /
> > > > > PAGE_SIZE;
> > > > >  
> > > > > + pagelist_size = sizeof(PAGELIST_T) +
> > > > > + (num_pages * sizeof(unsigned long))
> > > > > +
> > > > > + sizeof(unsigned long) +
> > > > > + (num_pages * sizeof(pages[0]) +
> > > > > + (num_pages * sizeof(struct
> > > > > scatterlist)));
> > > > > +
> > > > >   *ppagelist = NULL;
> > > > >  
> > > > > + dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE :
> > > > > DMA_FROM_DEVICE;
> > > > > +
> > > > >   /* Allocate enough storage to hold the page pointers
> > > > > and
> > > > > the page
> > > > >   ** list
> > > > >   */
> > > > > - pagelist = kmalloc(sizeof(PAGELIST_T) +
> > > > > -                           (num_pages * sizeof(unsigned
> > > > > long)) +
> > > > > -                           sizeof(unsigned long) +
> > > > > -                           (num_pages * sizeof(pages[0])),
> > > > > -                           GFP_KERNEL);
> > > > > + pagelist = dma_zalloc_coherent(g_dev,
> > > > > +        pagelist_size,
> > > > > +        dma_addr,
> > > > > +        GFP_KERNEL);
> > > > >  
> > > > >   vchiq_log_trace(vchiq_arm_log_level,
> > > > > "create_pagelist -
> > > > > %pK",
> > > > >   pagelist);
> > > > > @@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t
> > > > > count, unsigned short type,
> > > > >   addrs = pagelist->addrs;
> > > > >          need_release = (unsigned long *)(addrs + num_pages);
> > > > >   pages = (struct page **)(addrs + num_pages + 1);
> > > > > + scatterlist = (struct scatterlist *)(pages +
> > > > > num_pages);
> > > > >  
> > > > >   if (is_vmalloc_addr(buf)) {
> > > > > - int dir = (type == PAGELIST_WRITE) ?
> > > > > - DMA_TO_DEVICE : DMA_FROM_DEVICE;
> > > > >   unsigned long length = count;
> > > > >   unsigned int off = offset;
> > > > >  
> > > > > @@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t
> > > > > count,
> > > > > unsigned short type,
> > > > >   if (bytes > length)
> > > > >   bytes = length;
> > > > >   pages[actual_pages] = pg;
> > > > > - dmac_map_area(page_address(pg) +
> > > > > off,
> > > > > bytes, dir);
> > > > >   length -= bytes;
> > > > >   off = 0;
> > > > >   }
> > > > > @@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t
> > > > > count,
> > > > > unsigned short type,
> > > > >   actual_pages--;
> > > > >   put_page(pages[actual_pages]
> > > > > );
> > > > >   }
> > > > > - kfree(pagelist);
> > > > > + dma_free_coherent(g_dev,
> > > > > pagelist_size,
> > > > > +   pagelist,
> > > > > *dma_addr);
> > > > >   if (actual_pages == 0)
> > > > >   actual_pages = -ENOMEM;
> > > > >   return actual_pages;
> > > > > @@ -450,29 +454,38 @@ create_pagelist(char __user *buf,
> > > > > size_t
> > > > > count, unsigned short type,
> > > > >   pagelist->type = type;
> > > > >   pagelist->offset = offset;
> > > > >  
> > > > > - /* Group the pages into runs of contiguous pages */
> > > > > -
> > > > > - base_addr =
> > > > > VCHIQ_ARM_ADDRESS(page_address(pages[0]));
> > > > > - next_addr = base_addr + PAGE_SIZE;
> > > > > - addridx = 0;
> > > > > - run = 0;
> > > > > -
> > > > > - for (i = 1; i < num_pages; i++) {
> > > > > - addr =
> > > > > VCHIQ_ARM_ADDRESS(page_address(pages[i]));
> > > > > - if ((addr == next_addr) && (run < (PAGE_SIZE
> > > > > -
> > > > > 1))) {
> > > > > - next_addr += PAGE_SIZE;
> > > > > - run++;
> > > > > - } else {
> > > > > - addrs[addridx] = (unsigned
> > > > > long)base_addr
> > > > > + run;
> > > > > - addridx++;
> > > > > - base_addr = addr;
> > > > > - next_addr = addr + PAGE_SIZE;
> > > > > - run = 0;
> > > > > - }
> > > > > + for (i = 0; i < num_pages; i++)
> > > > > + sg_set_page(scatterlist + i, pages[i],
> > > > > PAGE_SIZE,
> > > > > 0);
> > > > > +
> > > > > + dma_buffers = dma_map_sg(g_dev,
> > > > > +  scatterlist,
> > > > > +  num_pages,
> > > > > +  dir);
> > > > > +
> > > > > + if (dma_buffers == 0) {
> > > > > + dma_free_coherent(g_dev, pagelist_size,
> > > > > +   pagelist, *dma_addr);
> > > > > +
> > > > > + return -EINTR;
> > > > >   }
> > > > >  
> > > > > - addrs[addridx] = (unsigned long)base_addr + run;
> > > > > - addridx++;
> > > > > + k = 0;
> > > > > + for (i = 0; i < dma_buffers;) {
> > > > > + u32 section_length = 0;
> > > > > +
> > > > > + for (j = i + 1; j < dma_buffers; j++) {
> > > > > + if (sg_dma_address(scatterlist + j) !=
> > > > > +     sg_dma_address(scatterlist + j - 1) +
> > > > > PAGE_SIZE) {
> > > > > + break;
> > > > > + }
> > > > > + section_length++;
> > > > > + }
> > > > > +
> > > > > + addrs[k] = (u32)sg_dma_address(scatterlist + i) |
> > > > > + section_length;
> > > >
> > > > It looks like scatterlist may not be just an array, so I think
> > > > this
> > > > whole loop wants to be something like:
> > > >
> > > > for_each_sg(scatterlist, sg, num_pages, i) {
> > > > u32 len = sg_dma_len(sg);
> > > >         u32 addr = sg_dma_address(sg);
> > > >
> > > >         if (k > 0 && addrs[k - 1] & PAGE_MASK +
> > > >             (addrs[k - 1] & ~PAGE_MASK) << PAGE_SHIFT) == addr)
> > > > {
> > > >          addrs[k - 1] += len;
> > > >         } else {
> > > >          addrs[k++] = addr | len;
> > > >         }
> > > > }
> > > >
> > > > Note the use of sg_dma_len(), since sg entries may be more than
> > > > one
> > > > page.  I don't think any merging is actually happening on our
> > > > platform
> > > > currently, thus why I left the inner loop.
> > > >
> > > > Thanks for taking on doing this conversion!  This code's going
> > > > to
> > > > be
> > > > so
> > > > much nicer when you're done.
> > >
> > > Thanks for looking at this.  
> > >
> > > While I understand the use of sg_dma_len, I don't understand
> > > totally
> > > the need for "for_each_sg". The scatterlist can be part of a
> > > scatter
> > > table with all kinds of complex chaining, but in this case it is
> > > directly allocated as an array at the top of the function.
> > >
> > > Also note that the addrs[] list is passed to the firmware and it
> > > requires all the items of the list be page aligned and be a
> > > multiple
> > > of the page size.  So I'm also a bit confused about the need for
> > > handling scatterlists that are not page aligned or a multiple of
> > > pages.
>
> I'm sure we can assume that dma_map_sg is going to give us back
> page-aligned mappings, because we only asked for page aligned
> mappings.
>
> I'm just making suggestions that follow what is described in
> DMA-API-HOWTO.txt here.  Following the documentation seems like a
> good
> idea unless there's a reason not to.
>
> > Sorry, but I forgot to add that the addrs list is an address anded
> > with
> > a page count, not a byte count.   The example you have sent appears
> > at
> > a glance to look like it is anding a byte count with the address.
>
> Looking at the type of the field being read, it looked like a page
> count
> to me, but I was unsure and the header didn't clarify.  Looking
> again,
> it must be bytes, so shifting would be necessary.

Please see the pagelist_struct structure. It contains a comment
that the addrs field is in pages.

drivers/staging/vc04_services/interface/vchiq_arm/vchiq_pagelist.h

typedef struct pagelist_struct {
unsigned long length;
unsigned short type;
unsigned short offset;
unsigned long addrs[1]; /* N.B. 12 LSBs hold the number
of following
   pages at consecutive addresses. */
} PAGELIST_T;

If the DMA API can return lengths that aren't page lengths, then I
don't think this is really a good API to be using especially since the
firmware requires all lengths to be pages.

It might be good to reference the original code that is currently
checked into the staging tree. It also strongly suggests the length
is in pages. 

I tested the changes with vchiq_test and added some temp print outs and
I know that the count in pages works and that for larger message sizes
the changes I sent are combining pages at consecutive addresses.
Sometimes a lot of pages are being combined.

I wish meld or something similar had a way to load a diff so that it's
easier to review these patches. But I digress...