From: Eric Anholt
To: mzoran@crowfest.net, gregkh@linuxfoundation.org
Cc: mzoran@crowfest.net, daniels@collabora.com, noralf@tronnes.org, popcornmix@gmail.com, devel@driverdev.osuosl.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 2/2] staging: vc04_services: Replace dmac_map_area with dmac_map_sg
In-Reply-To: <20161024052932.28259-1-mzoran@crowfest.net>
References: <20161024052932.28259-1-mzoran@crowfest.net>
Date: Mon, 24 Oct 2016 10:31:52 -0700
Message-ID: <878ttdslfb.fsf@eliezer.anholt.net>

mzoran@crowfest.net writes:

> From: Michael Zoran
>
> The original arm implementation uses dmac_map_area, which is not
> portable. Replace it with an architecture-neutral version that
> uses dma_map_sg.
>
> As the measurements below show, for larger message sizes the
> dma_map_sg implementation is faster than the original unportable
> dmac_map_area implementation.
>
> Test                      dmac_map_area  dma_map_page  dma_map_sg
> vchiq_test -b 4 10000     51us/iter      76us/iter     76us/iter
> vchiq_test -b 8 10000     70us/iter      82us/iter     91us/iter
> vchiq_test -b 16 10000    94us/iter      118us/iter    121us/iter
> vchiq_test -b 32 10000    146us/iter     173us/iter    187us/iter
> vchiq_test -b 64 10000    263us/iter     328us/iter    299us/iter
> vchiq_test -b 128 10000   529us/iter     631us/iter    595us/iter
> vchiq_test -b 256 10000   2285us/iter    2275us/iter   2001us/iter
> vchiq_test -b 512 10000   4372us/iter    4616us/iter   4123us/iter
>
> For message sizes >= 64KB, dma_map_sg is faster than dma_map_page.
>
> For message sizes >= 256KB, dma_map_sg is the fastest
> implementation.
>
> Signed-off-by: Michael Zoran
> ---
>  .../interface/vchiq_arm/vchiq_2835_arm.c | 153 ++++++++++++---------
>  1 file changed, 91 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
> index 98c6819..5ca66ee 100644
> --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
> +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
> @@ -45,13 +45,8 @@
>  #include
>  #include
>
> -#define dmac_map_area __glue(_CACHE,_dma_map_area)
> -#define dmac_unmap_area __glue(_CACHE,_dma_unmap_area)
> -
>  #define TOTAL_SLOTS (VCHIQ_SLOT_ZERO_SLOTS + 2 * 32)
>
> -#define VCHIQ_ARM_ADDRESS(x) ((void *)((char *)x + g_virt_to_bus_offset))
> -
>  #include "vchiq_arm.h"
>  #include "vchiq_2835.h"
>  #include "vchiq_connected.h"
> @@ -73,7 +68,7 @@ static unsigned int g_fragments_size;
>  static char *g_fragments_base;
>  static char *g_free_fragments;
>  static struct semaphore g_free_fragments_sema;
> -static unsigned long g_virt_to_bus_offset;
> +static struct device *g_dev;
>
>  extern int vchiq_arm_log_level;
>
> @@ -84,10 +79,11 @@ vchiq_doorbell_irq(int irq, void *dev_id);
>
>  static int
>  create_pagelist(char __user *buf, size_t count, unsigned short type,
> -        struct task_struct *task, PAGELIST_T ** ppagelist);
> +        struct task_struct *task, PAGELIST_T **ppagelist,
> +        dma_addr_t *dma_addr);
>
>  static void
> -free_pagelist(PAGELIST_T *pagelist, int actual);
> +free_pagelist(dma_addr_t dma_addr, PAGELIST_T *pagelist, int actual);
>
>  int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
>  {
> @@ -101,8 +97,6 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
>          int slot_mem_size, frag_mem_size;
>          int err, irq, i;
>
> -        g_virt_to_bus_offset = virt_to_dma(dev, (void *)0);
> -
>          (void)of_property_read_u32(dev->of_node, "cache-line-size",
>                                     &g_cache_line_size);
>          g_fragments_size = 2 * g_cache_line_size;
> @@ -118,7 +112,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
>                  return -ENOMEM;
>          }
>
> -        WARN_ON(((int)slot_mem & (PAGE_SIZE - 1)) != 0);
> +        WARN_ON(((unsigned long)slot_mem & (PAGE_SIZE - 1)) != 0);
>
>          vchiq_slot_zero = vchiq_init_slots(slot_mem, slot_mem_size);
>          if (!vchiq_slot_zero)
> @@ -170,6 +164,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state)
>                  return err ? : -ENXIO;
>          }
>
> +        g_dev = dev;
>          vchiq_log_info(vchiq_arm_log_level,
>                  "vchiq_init - done (slots %pK, phys %pad)",
>                  vchiq_slot_zero, &slot_phys);
> @@ -233,6 +228,7 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
>  {
>          PAGELIST_T *pagelist;
>          int ret;
> +        dma_addr_t dma_addr;
>
>          WARN_ON(memhandle != VCHI_MEM_HANDLE_INVALID);
>
> @@ -241,12 +237,14 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle,
>                          ? PAGELIST_READ
>                          : PAGELIST_WRITE,
>                          current,
> -                        &pagelist);
> +                        &pagelist,
> +                        &dma_addr);
> +
>          if (ret != 0)
>                  return VCHIQ_ERROR;
>
>          bulk->handle = memhandle;
> -        bulk->data = VCHIQ_ARM_ADDRESS(pagelist);
> +        bulk->data = (void *)(unsigned long)dma_addr;
>
>          /* Store the pagelist address in remote_data, which isn't used by the
>             slave. */
> @@ -259,7 +257,8 @@ void
>  vchiq_complete_bulk(VCHIQ_BULK_T *bulk)
>  {
>          if (bulk && bulk->remote_data && bulk->actual)
> -                free_pagelist((PAGELIST_T *)bulk->remote_data, bulk->actual);
> +                free_pagelist((dma_addr_t)(unsigned long)bulk->data,
> +                        (PAGELIST_T *)bulk->remote_data, bulk->actual);
>  }
>
>  void
> @@ -353,38 +352,44 @@ vchiq_doorbell_irq(int irq, void *dev_id)
>  ** obscuring the new data underneath. We can solve this by transferring the
>  ** partial cache lines separately, and allowing the ARM to copy into the
>  ** cached area.
> -
> -** N.B. This implementation plays slightly fast and loose with the Linux
> -** driver programming rules, e.g. its use of dmac_map_area instead of
> -** dma_map_single, but it isn't a multi-platform driver and it benefits
> -** from increased speed as a result.
>  */
>
>  static int
>  create_pagelist(char __user *buf, size_t count, unsigned short type,
> -        struct task_struct *task, PAGELIST_T ** ppagelist)
> +        struct task_struct *task, PAGELIST_T **ppagelist,
> +        dma_addr_t *dma_addr)
>  {
>          PAGELIST_T *pagelist;
>          struct page **pages;
> -        unsigned long *addrs;
> -        unsigned int num_pages, offset, i;
> -        char *addr, *base_addr, *next_addr;
> -        int run, addridx, actual_pages;
> +        u32 *addrs;
> +        unsigned int num_pages, offset, i, j, k;
> +        int actual_pages;
>          unsigned long *need_release;
> +        size_t pagelist_size;
> +        struct scatterlist *scatterlist;
> +        int dma_buffers;
> +        int dir;
>
> -        offset = (unsigned int)buf & (PAGE_SIZE - 1);
> +        offset = ((unsigned int)(unsigned long)buf & (PAGE_SIZE - 1));
>          num_pages = (count + offset + PAGE_SIZE - 1) / PAGE_SIZE;
>
> +        pagelist_size = sizeof(PAGELIST_T) +
> +                        (num_pages * sizeof(unsigned long)) +
> +                        sizeof(unsigned long) +
> +                        (num_pages * sizeof(pages[0]) +
> +                        (num_pages * sizeof(struct scatterlist)));
> +
>          *ppagelist = NULL;
>
> +        dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
> +
>          /* Allocate enough storage to hold the page pointers and the page
>          ** list
>          */
> -        pagelist = kmalloc(sizeof(PAGELIST_T) +
> -                (num_pages * sizeof(unsigned long)) +
> -                sizeof(unsigned long) +
> -                (num_pages * sizeof(pages[0])),
> -                GFP_KERNEL);
> +        pagelist = dma_zalloc_coherent(g_dev,
> +                                       pagelist_size,
> +                                       dma_addr,
> +                                       GFP_KERNEL);
>
>          vchiq_log_trace(vchiq_arm_log_level, "create_pagelist - %pK",
>                          pagelist);
> @@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
>          addrs = pagelist->addrs;
>          need_release = (unsigned long *)(addrs + num_pages);
>          pages = (struct page **)(addrs + num_pages + 1);
> +        scatterlist = (struct scatterlist *)(pages + num_pages);
>
>          if (is_vmalloc_addr(buf)) {
> -                int dir = (type == PAGELIST_WRITE) ?
> -                        DMA_TO_DEVICE : DMA_FROM_DEVICE;
>                  unsigned long length = count;
>                  unsigned int off = offset;
>
> @@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
>                          if (bytes > length)
>                                  bytes = length;
>                          pages[actual_pages] = pg;
> -                        dmac_map_area(page_address(pg) + off, bytes, dir);
>                          length -= bytes;
>                          off = 0;
>                  }
> @@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
>                          actual_pages--;
>                          put_page(pages[actual_pages]);
>                  }
> -                kfree(pagelist);
> +                dma_free_coherent(g_dev, pagelist_size,
> +                                  pagelist, *dma_addr);
>                  if (actual_pages == 0)
>                          actual_pages = -ENOMEM;
>                  return actual_pages;
> @@ -450,29 +454,38 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
>          pagelist->type = type;
>          pagelist->offset = offset;
>
> -        /* Group the pages into runs of contiguous pages */
> -
> -        base_addr = VCHIQ_ARM_ADDRESS(page_address(pages[0]));
> -        next_addr = base_addr + PAGE_SIZE;
> -        addridx = 0;
> -        run = 0;
> -
> -        for (i = 1; i < num_pages; i++) {
> -                addr = VCHIQ_ARM_ADDRESS(page_address(pages[i]));
> -                if ((addr == next_addr) && (run < (PAGE_SIZE - 1))) {
> -                        next_addr += PAGE_SIZE;
> -                        run++;
> -                } else {
> -                        addrs[addridx] = (unsigned long)base_addr + run;
> -                        addridx++;
> -                        base_addr = addr;
> -                        next_addr = addr + PAGE_SIZE;
> -                        run = 0;
> -                }
> +        for (i = 0; i < num_pages; i++)
> +                sg_set_page(scatterlist + i, pages[i], PAGE_SIZE, 0);
> +
> +        dma_buffers = dma_map_sg(g_dev,
> +                                 scatterlist,
> +                                 num_pages,
> +                                 dir);
> +
> +        if (dma_buffers == 0) {
> +                dma_free_coherent(g_dev, pagelist_size,
> +                                  pagelist, *dma_addr);
> +
> +                return -EINTR;
>          }
>
> -        addrs[addridx] = (unsigned long)base_addr + run;
> -        addridx++;
> +        k = 0;
> +        for (i = 0; i < dma_buffers;) {
> +                u32 section_length = 0;
> +
> +                for (j = i + 1; j < dma_buffers; j++) {
> +                        if (sg_dma_address(scatterlist + j) !=
> +                            sg_dma_address(scatterlist + j - 1) + PAGE_SIZE) {
> +                                break;
> +                        }
> +                        section_length++;
> +                }
> +
> +                addrs[k] = (u32)sg_dma_address(scatterlist + i) |
> +                        section_length;

It looks like a scatterlist may not be just a flat array (entries can be
chained), so I think this whole loop wants to be something like:

for_each_sg(scatterlist, sg, num_pages, i) {
        u32 len = sg_dma_len(sg);
        u32 addr = sg_dma_address(sg);

        /* Extend the previous packed (address | length) entry if this
         * sg entry starts right where it ends, otherwise start a new
         * entry.
         */
        if (k > 0 &&
            ((addrs[k - 1] & PAGE_MASK) +
             ((addrs[k - 1] & ~PAGE_MASK) << PAGE_SHIFT)) == addr) {
                addrs[k - 1] += len;
        } else {
                addrs[k++] = addr | len;
        }
}

Note the use of sg_dma_len(), since sg entries may cover more than one
page. I don't think any merging is actually happening on our platform
at the moment, which is why I left out the inner loop.

Thanks for taking on this conversion! This code's going to be so much
nicer when you're done.
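P.S. One more thought, purely as a sketch and not code from your patch:
once create_pagelist() maps with dma_map_sg(), the teardown in
free_pagelist() has to mirror it. The names below (g_dev, scatterlist,
num_pages, dir, pagelist_size) are just assumed to be recovered the
same way create_pagelist() lays them out; the DMA-API detail that
matters is that dma_unmap_sg() takes the nents you passed to
dma_map_sg(), not the count it returned, and that the coherent
allocation now has to be released with dma_free_coherent() instead of
kfree():

        /* Undo the dma_map_sg() from create_pagelist(); pass the
         * original nents (num_pages), not the returned dma_buffers.
         */
        dma_unmap_sg(g_dev, scatterlist, num_pages, dir);

        /* ... copy back the partial-cache-line fragments and
         * put_page() the user pages, as before ...
         */

        /* The pagelist came from dma_zalloc_coherent(), so free it
         * with the matching coherent free.
         */
        dma_free_coherent(g_dev, pagelist_size, pagelist, dma_addr);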