2015-04-15 10:04:20

by Noralf Trønnes

[permalink] [raw]
Subject: [PATCH] dmaengine: bcm2835: Add slave dma support

Add slave transfer capability to BCM2835 dmaengine driver.
This patch is pulled from the bcm2708-dmaengine driver in the
Raspberry Pi repo. The work was done by Gellert Weisz.

Tested with the bcm2835-mmc driver from the same repo.

Signed-off-by: Noralf Trønnes <[email protected]>
---

Gellert Weisz has ended his internship with Raspberry Pi Trading and
was not available to sign off this patch.

Changes from original code:
Remove slave_id use.
SDHCI_BCM_DMA_WAITS changed to BCM2835_DMA_WAIT_CYCLES.
Use SZ_* macros instead of decimal values.
Change MAX_LITE_TRANSFER from 32k to 64K - 1.
Fix several whitespace issues.

Lee Jones' comments in previous email to Piotr Król:
Remove __func__ from dev_err message.
Cleanup comments.


Quick testing:
Enable CONFIG_DMA_BCM2835
Apply patch: https://gist.github.com/notro/0c9184df1b43a793f006


drivers/dma/bcm2835-dma.c | 200 ++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 186 insertions(+), 14 deletions(-)

diff --git a/drivers/dma/bcm2835-dma.c b/drivers/dma/bcm2835-dma.c
index c92d6a7..4230aac 100644
--- a/drivers/dma/bcm2835-dma.c
+++ b/drivers/dma/bcm2835-dma.c
@@ -1,11 +1,10 @@
/*
* BCM2835 DMA engine support
*
- * This driver only supports cyclic DMA transfers
- * as needed for the I2S module.
- *
* Author: Florian Meier <[email protected]>
* Copyright 2013
+ * Gellert Weisz <[email protected]>
+ * Copyright 2013-2014
*
* Based on
* OMAP DMAengine support by Russell King
@@ -89,6 +88,8 @@ struct bcm2835_desc {
size_t size;
};

+#define BCM2835_DMA_WAIT_CYCLES 0 /* Slow down DMA transfers: 0-31 */
+
#define BCM2835_DMA_CS 0x00
#define BCM2835_DMA_ADDR 0x04
#define BCM2835_DMA_SOURCE_AD 0x0c
@@ -105,12 +106,16 @@ struct bcm2835_desc {
#define BCM2835_DMA_RESET BIT(31) /* WO, self clearing */

#define BCM2835_DMA_INT_EN BIT(0)
+#define BCM2835_DMA_WAIT_RESP BIT(3)
#define BCM2835_DMA_D_INC BIT(4)
+#define BCM2835_DMA_D_WIDTH BIT(5)
#define BCM2835_DMA_D_DREQ BIT(6)
#define BCM2835_DMA_S_INC BIT(8)
+#define BCM2835_DMA_S_WIDTH BIT(9)
#define BCM2835_DMA_S_DREQ BIT(10)

#define BCM2835_DMA_PER_MAP(x) ((x) << 16)
+#define BCM2835_DMA_WAITS(x) (((x) & 0x1f) << 21)

#define BCM2835_DMA_DATA_TYPE_S8 1
#define BCM2835_DMA_DATA_TYPE_S16 2
@@ -124,6 +129,9 @@ struct bcm2835_desc {
#define BCM2835_DMA_CHAN(n) ((n) << 8) /* Base address */
#define BCM2835_DMA_CHANIO(base, n) ((base) + BCM2835_DMA_CHAN(n))

+#define MAX_LITE_TRANSFER (SZ_64K - 1)
+#define MAX_NORMAL_TRANSFER SZ_1G
+
static inline struct bcm2835_dmadev *to_bcm2835_dma_dev(struct dma_device *d)
{
return container_of(d, struct bcm2835_dmadev, ddev);
@@ -217,12 +225,18 @@ static irqreturn_t bcm2835_dma_callback(int irq, void *data)
d = c->desc;

if (d) {
- /* TODO Only works for cyclic DMA */
- vchan_cyclic_callback(&d->vd);
- }
+ if (c->cyclic) {
+ vchan_cyclic_callback(&d->vd);

- /* Keep the DMA engine running */
- writel(BCM2835_DMA_ACTIVE, c->chan_base + BCM2835_DMA_CS);
+ /* Keep the DMA engine running */
+ writel(BCM2835_DMA_ACTIVE,
+ c->chan_base + BCM2835_DMA_CS);
+
+ } else {
+ vchan_cookie_complete(&c->desc->vd);
+ bcm2835_dma_start_desc(c);
+ }
+ }

spin_unlock_irqrestore(&c->vc.lock, flags);

@@ -323,8 +337,6 @@ static void bcm2835_dma_issue_pending(struct dma_chan *chan)
struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
unsigned long flags;

- c->cyclic = true; /* Nothing else is implemented */
-
spin_lock_irqsave(&c->vc.lock, flags);
if (vchan_issue_pending(&c->vc) && !c->desc)
bcm2835_dma_start_desc(c);
@@ -342,7 +354,7 @@ static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
struct bcm2835_desc *d;
dma_addr_t dev_addr;
unsigned int es, sync_type;
- unsigned int frame;
+ unsigned int frame, max_size;

/* Grab configuration */
if (!is_slave_direction(direction)) {
@@ -375,7 +387,12 @@ static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
return NULL;

d->dir = direction;
- d->frames = buf_len / period_len;
+ if (c->ch >= 8) /* LITE channel */
+ max_size = MAX_LITE_TRANSFER;
+ else
+ max_size = MAX_NORMAL_TRANSFER;
+ period_len = min(period_len, max_size);
+ d->frames = (buf_len - 1) / (period_len + 1);

/* Allocate memory for control blocks */
d->control_block_size = d->frames * sizeof(struct bcm2835_dma_cb);
@@ -420,12 +437,16 @@ static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
BCM2835_DMA_PER_MAP(c->dreq);

/* Length of a frame */
- control_block->length = period_len;
+ if (frame != d->frames - 1)
+ control_block->length = period_len;
+ else
+ control_block->length = buf_len - (d->frames - 1) *
+ period_len;
d->size += control_block->length;

/*
* Next block is the next frame.
- * This DMA engine driver currently only supports cyclic DMA.
+ * This function is called on cyclic DMA transfers.
* Therefore, wrap around at number of frames.
*/
control_block->next = d->control_block_base_phys +
@@ -433,6 +454,155 @@ static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
* ((frame + 1) % d->frames);
}

+ c->cyclic = true;
+
+ return vchan_tx_prep(&c->vc, &d->vd, flags);
+}
+
+static struct dma_async_tx_descriptor *
+bcm2835_dma_prep_slave_sg(struct dma_chan *chan,
+ struct scatterlist *sgl,
+ unsigned int sg_len,
+ enum dma_transfer_direction direction,
+ unsigned long flags, void *context)
+{
+ struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
+ enum dma_slave_buswidth dev_width;
+ struct bcm2835_desc *d;
+ dma_addr_t dev_addr;
+ struct scatterlist *sgent;
+ unsigned int es, sync_type;
+ unsigned int i, j, splitct, max_size;
+
+ if (!is_slave_direction(direction)) {
+ dev_err(chan->device->dev, "direction not supported\n");
+ return NULL;
+ }
+
+ if (direction == DMA_DEV_TO_MEM) {
+ dev_addr = c->cfg.src_addr;
+ dev_width = c->cfg.src_addr_width;
+ sync_type = BCM2835_DMA_S_DREQ;
+ } else {
+ dev_addr = c->cfg.dst_addr;
+ dev_width = c->cfg.dst_addr_width;
+ sync_type = BCM2835_DMA_D_DREQ;
+ }
+
+ /* Bus width translates to the element size (ES) */
+ switch (dev_width) {
+ case DMA_SLAVE_BUSWIDTH_4_BYTES:
+ es = BCM2835_DMA_DATA_TYPE_S32;
+ break;
+ default:
+ return NULL;
+ }
+
+ /* Allocate and setup the descriptor. */
+ d = kzalloc(sizeof(*d), GFP_NOWAIT);
+ if (!d)
+ return NULL;
+
+ d->dir = direction;
+
+ if (c->ch >= 8) /* LITE channel */
+ max_size = MAX_LITE_TRANSFER;
+ else
+ max_size = MAX_NORMAL_TRANSFER;
+
+ /*
+ * Store the length of the SG list in d->frames
+ * taking care to account for splitting up transfers
+ * too large for a LITE channel
+ */
+ d->frames = 0;
+ for_each_sg(sgl, sgent, sg_len, i) {
+ unsigned int len = sg_dma_len(sgent);
+
+ d->frames += 1 + len / max_size;
+ }
+
+ /* Allocate memory for control blocks */
+ d->control_block_size = d->frames * sizeof(struct bcm2835_dma_cb);
+ d->control_block_base = dma_zalloc_coherent(chan->device->dev,
+ d->control_block_size, &d->control_block_base_phys,
+ GFP_NOWAIT);
+ if (!d->control_block_base) {
+ kfree(d);
+ return NULL;
+ }
+
+ /*
+ * Iterate over all SG entries, create a control block
+ * for each frame and link them together.
+ * Count the number of times an SG entry had to be split
+ * as a result of using a LITE channel
+ */
+ splitct = 0;
+
+ for_each_sg(sgl, sgent, sg_len, i) {
+ dma_addr_t addr = sg_dma_address(sgent);
+ unsigned int len = sg_dma_len(sgent);
+
+ for (j = 0; j < len; j += max_size) {
+ struct bcm2835_dma_cb *control_block =
+ &d->control_block_base[i + splitct];
+
+ /* Setup addresses */
+ if (d->dir == DMA_DEV_TO_MEM) {
+ control_block->info = BCM2835_DMA_D_INC |
+ BCM2835_DMA_D_WIDTH |
+ BCM2835_DMA_S_DREQ;
+ control_block->src = dev_addr;
+ control_block->dst = addr + (dma_addr_t)j;
+ } else {
+ control_block->info = BCM2835_DMA_S_INC |
+ BCM2835_DMA_S_WIDTH |
+ BCM2835_DMA_D_DREQ;
+ control_block->src = addr + (dma_addr_t)j;
+ control_block->dst = dev_addr;
+ }
+
+ /* Common part */
+ control_block->info |=
+ BCM2835_DMA_WAITS(BCM2835_DMA_WAIT_CYCLES);
+ control_block->info |= BCM2835_DMA_WAIT_RESP;
+
+ /* Enable */
+ if (i == sg_len - 1 && len - j <= max_size)
+ control_block->info |= BCM2835_DMA_INT_EN;
+
+ /* Setup synchronization */
+ if (sync_type != 0)
+ control_block->info |= sync_type;
+
+ /* Setup DREQ channel */
+ if (c->dreq)
+ control_block->info |=
+ BCM2835_DMA_PER_MAP(c->dreq);
+
+ /* Length of a frame */
+ control_block->length = min(len - j, max_size);
+ d->size += control_block->length;
+
+ if (i < sg_len - 1 || len - j > max_size) {
+ /* Next block is the next frame. */
+ control_block->next =
+ d->control_block_base_phys +
+ sizeof(struct bcm2835_dma_cb) *
+ (i + splitct + 1);
+ } else {
+ /* Next block is empty. */
+ control_block->next = 0;
+ }
+
+ if (len - j > max_size)
+ splitct++;
+ }
+ }
+
+ c->cyclic = false;
+
return vchan_tx_prep(&c->vc, &d->vd, flags);
}

@@ -590,6 +760,7 @@ static int bcm2835_dma_probe(struct platform_device *pdev)
od->ddev.device_tx_status = bcm2835_dma_tx_status;
od->ddev.device_issue_pending = bcm2835_dma_issue_pending;
od->ddev.device_prep_dma_cyclic = bcm2835_dma_prep_dma_cyclic;
+ od->ddev.device_prep_slave_sg = bcm2835_dma_prep_slave_sg;
od->ddev.device_config = bcm2835_dma_slave_config;
od->ddev.device_terminate_all = bcm2835_dma_terminate_all;
od->ddev.src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
@@ -678,4 +849,5 @@ module_platform_driver(bcm2835_dma_driver);
MODULE_ALIAS("platform:bcm2835-dma");
MODULE_DESCRIPTION("BCM2835 DMA engine driver");
MODULE_AUTHOR("Florian Meier <[email protected]>");
+MODULE_AUTHOR("Gellert Weisz <[email protected]>");
MODULE_LICENSE("GPL v2");
--
2.2.2


2015-04-15 15:37:58

by Martin Sperl

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support


> On 15.04.2015, at 11:56, Noralf Trønnes <[email protected]> wrote:
> +#define MAX_LITE_TRANSFER (SZ_64K - 1)
> +#define MAX_NORMAL_TRANSFER SZ_1G
...
> + if (c->ch >= 8) /* LITE channel */
> + max_size = MAX_LITE_TRANSFER;
> + else
> + max_size = MAX_NORMAL_TRANSFER;
> + period_len = min(period_len, max_size);
> + d->frames = (buf_len - 1) / (period_len + 1);

I wonder if it is wise to split the transfers on 65535 bytes for the
Lite DMA-channels - especially if you are transferring to word size
registers (like SPI_FIFO), you still push 16384 words into the register
and the last word of this transfer (word 16384) still is assumed 4 valid
bytes by the device and thus gets operated upon - even if the last byte
contains garbage from the DMA-transfer point of view.

So maybe it is better to separate on SZ_64K-4 or better still SZ_32K to
be on a power of 2 address boundary.-

2015-04-15 18:53:25

by Noralf Trønnes

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support


Den 15.04.2015 16:37, skrev Martin Sperl:
>> On 15.04.2015, at 11:56, Noralf Trønnes <[email protected]> wrote:
>> +#define MAX_LITE_TRANSFER (SZ_64K - 1)
>> +#define MAX_NORMAL_TRANSFER SZ_1G
> ...
>> + if (c->ch >= 8) /* LITE channel */
>> + max_size = MAX_LITE_TRANSFER;
>> + else
>> + max_size = MAX_NORMAL_TRANSFER;
>> + period_len = min(period_len, max_size);
>> + d->frames = (buf_len - 1) / (period_len + 1);
> I wonder if it is wise to split the transfers on 65535 bytes for the
> Lite DMA-channels - especially if you are transferring to word size
> registers (like SPI_FIFO), you still push 16384 words into the register
> and the last word of this transfer (word 16384) still is assumed 4 valid
> bytes by the device and thus gets operated upon - even if the last byte
> contains garbage from the DMA-transfer point of view.
>
> So maybe it is better to separate on SZ_64K-4 or better still SZ_32K to
> be on a power of 2 address boundary.

The datasheet is contradictory:
BCM2835 ARM Peripherals - 4.5 DMA LITE Engines
3. The DMA length register is now 16 bits, limiting the maximum
transferrable length to 65536 bytes.

A 16-bit register can't hold a value of 65536.
Either the max value is 65535 or the register is 17-bits wide.

There is currently no driver that we can use to test >32k buffers.
Unless someone disagrees, I will change this back to the SZ_32K
value from the original driver (and add a comment to explain that
32k is chosen to stay on the safe side).

2015-04-15 19:33:32

by Stefan Wahren

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support

Hi Noralf,

Am 15.04.2015 um 11:56 schrieb Noralf Trønnes:
> Add slave transfer capability to BCM2835 dmaengine driver.
> This patch is pulled from the bcm2708-dmaengine driver in the
> Raspberry Pi repo. The work was done by Gellert Weisz.
>
> Tested with the bcm2835-mmc driver from the same repo.

why not with the upstream kernel?

>
> Signed-off-by: Noralf Trønnes <[email protected]>
> ---
>
> Gellert Weisz has ended his internship with Raspberry Pi Trading and
> was not available to sign off this patch.
>
> Changes from original code:
> Remove slave_id use.
> SDHCI_BCM_DMA_WAITS changed to BCM2835_DMA_WAIT_CYCLES.
> Use SZ_* macros instead of decimal values.
> Change MAX_LITE_TRANSFER from 32k to 64K - 1.
> Fix several whitespace issues.
>
> Lee Jones' comments in previous email to Piotr Król:
> Remove __func__ from dev_err message.
> Cleanup comments.
>
>
> Quick testing:
> Enable CONFIG_DMA_BCM2835
> Apply patch: https://gist.github.com/notro/0c9184df1b43a793f006
>
>
> drivers/dma/bcm2835-dma.c | 200 ++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 186 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/dma/bcm2835-dma.c b/drivers/dma/bcm2835-dma.c
> index c92d6a7..4230aac 100644
> --- a/drivers/dma/bcm2835-dma.c
> +++ b/drivers/dma/bcm2835-dma.c
> @@ -1,11 +1,10 @@
> [...]
> +
> +static struct dma_async_tx_descriptor *
> +bcm2835_dma_prep_slave_sg(struct dma_chan *chan,
> + struct scatterlist *sgl,
> + unsigned int sg_len,
> + enum dma_transfer_direction direction,
> + unsigned long flags, void *context)
> +{
> + struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
> + enum dma_slave_buswidth dev_width;
> + struct bcm2835_desc *d;
> + dma_addr_t dev_addr;
> + struct scatterlist *sgent;
> + unsigned int es, sync_type;
> + unsigned int i, j, splitct, max_size;

I think "split_cnt" would be better.

> +
> + if (!is_slave_direction(direction)) {
> + dev_err(chan->device->dev, "direction not supported\n");
> + return NULL;
> + }
> +
> + if (direction == DMA_DEV_TO_MEM) {
> + dev_addr = c->cfg.src_addr;
> + dev_width = c->cfg.src_addr_width;
> + sync_type = BCM2835_DMA_S_DREQ;
> + } else {
> + dev_addr = c->cfg.dst_addr;
> + dev_width = c->cfg.dst_addr_width;
> + sync_type = BCM2835_DMA_D_DREQ;
> + }
> +
> + /* Bus width translates to the element size (ES) */
> + switch (dev_width) {
> + case DMA_SLAVE_BUSWIDTH_4_BYTES:
> + es = BCM2835_DMA_DATA_TYPE_S32;

Looks like "es" is never used.

> + break;
> + default:

A dev_err() might be useful here.

> + return NULL;
> + }
> +
> + /* Allocate and setup the descriptor. */
> + d = kzalloc(sizeof(*d), GFP_NOWAIT);
> + if (!d)
> + return NULL;
> +
> + d->dir = direction;
> +
> + if (c->ch >= 8) /* LITE channel */
> + max_size = MAX_LITE_TRANSFER;
> + else
> + max_size = MAX_NORMAL_TRANSFER;
> +
> + /*
> + * Store the length of the SG list in d->frames
> + * taking care to account for splitting up transfers
> + * too large for a LITE channel
> + */
> + d->frames = 0;
> + for_each_sg(sgl, sgent, sg_len, i) {
> + unsigned int len = sg_dma_len(sgent);
> +
> + d->frames += 1 + len / max_size;

If it's correct this should be more intuitive:

d->frames += len / max_size + 1;

> + }
> +
> + /* Allocate memory for control blocks */
> + d->control_block_size = d->frames * sizeof(struct bcm2835_dma_cb);
> + d->control_block_base = dma_zalloc_coherent(chan->device->dev,
> + d->control_block_size, &d->control_block_base_phys,
> + GFP_NOWAIT);
> + if (!d->control_block_base) {
> + kfree(d);
> + return NULL;
> + }
> +
> + /*
> + * Iterate over all SG entries, create a control block
> + * for each frame and link them together.
> + * Count the number of times an SG entry had to be split
> + * as a result of using a LITE channel
> + */
> + splitct = 0;
> +
> + for_each_sg(sgl, sgent, sg_len, i) {
> + dma_addr_t addr = sg_dma_address(sgent);
> + unsigned int len = sg_dma_len(sgent);
> +
> + for (j = 0; j < len; j += max_size) {

It should be possible to move declaration of "j" down here.

> + struct bcm2835_dma_cb *control_block =
> + &d->control_block_base[i + splitct];
> +
> + /* Setup addresses */
> + if (d->dir == DMA_DEV_TO_MEM) {
> + control_block->info = BCM2835_DMA_D_INC |
> + BCM2835_DMA_D_WIDTH |
> + BCM2835_DMA_S_DREQ;
> + control_block->src = dev_addr;
> + control_block->dst = addr + (dma_addr_t)j;
> + } else {
> + control_block->info = BCM2835_DMA_S_INC |
> + BCM2835_DMA_S_WIDTH |
> + BCM2835_DMA_D_DREQ;
> + control_block->src = addr + (dma_addr_t)j;
> + control_block->dst = dev_addr;
> + }
> +
> + /* Common part */
> + control_block->info |=
> + BCM2835_DMA_WAITS(BCM2835_DMA_WAIT_CYCLES);
> + control_block->info |= BCM2835_DMA_WAIT_RESP;
> +
> + /* Enable */
> + if (i == sg_len - 1 && len - j <= max_size)
> + control_block->info |= BCM2835_DMA_INT_EN;
> +
> + /* Setup synchronization */
> + if (sync_type != 0)

if (sync_type) ?

> + control_block->info |= sync_type;
> +
> + /* Setup DREQ channel */
> + if (c->dreq)
> + control_block->info |=
> + BCM2835_DMA_PER_MAP(c->dreq);
> +

Best regards
Stefan

2015-04-16 06:40:54

by Rogier Wolff

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support

On Wed, Apr 15, 2015 at 08:53:07PM +0200, Noralf Tr?nnes wrote:

> A 16-bit register can't hold a value of 65536.
> Either the max value is 65535 or the register is 17-bits wide.

It is common for hardware registers to have the value "0" mean 65536
in case of a 16-bit register.

The hardware would then FIRST decrement the register and THEN check
for zero. This results in the behaviour that "1" requires one cycle to
complete, "10" requires ten cycles, and "0" means the same as the
total number of bitpatterns possible in the register. (256 for an
8-bit register, 65536 for a 16-bit register).

Another way to implement such a register in hardware would "check for
zero" first, and not do antyhing if the register equals zero. This
results in differnet behaviour for the "0" value.

That said: IMHO, the overhead of setting up 2 transfers for each 64k
block as opposed to only one results in such a small performance
penalty that I'd prefer to play it safe unless you're very sure you
can adequately test it. (Another option would be to set the maximum
transfer size to 0xf000: 60kbytes. Less than 10% extra transfers in
the long run than when aiming for the edge...)

Roger.

--
** [email protected] ** http://www.BitWizard.nl/ ** +31-15-2600998 **
** Delftechpark 26 2628 XH Delft, The Netherlands. KVK: 27239233 **
*-- BitWizard writes Linux device drivers for any device you may have! --*
The plan was simple, like my brother-in-law Phil. But unlike
Phil, this plan just might work.

2015-04-16 17:28:44

by Noralf Trønnes

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support


Den 16.04.2015 08:30, skrev Rogier Wolff:
> On Wed, Apr 15, 2015 at 08:53:07PM +0200, Noralf Tr?nnes wrote:
>
>> A 16-bit register can't hold a value of 65536.
>> Either the max value is 65535 or the register is 17-bits wide.
> It is common for hardware registers to have the value "0" mean 65536
> in case of a 16-bit register.
>
> The hardware would then FIRST decrement the register and THEN check
> for zero. This results in the behaviour that "1" requires one cycle to
> complete, "10" requires ten cycles, and "0" means the same as the
> total number of bitpatterns possible in the register. (256 for an
> 8-bit register, 65536 for a 16-bit register).
>
> Another way to implement such a register in hardware would "check for
> zero" first, and not do antyhing if the register equals zero. This
> results in differnet behaviour for the "0" value.
>
> That said: IMHO, the overhead of setting up 2 transfers for each 64k
> block as opposed to only one results in such a small performance
> penalty that I'd prefer to play it safe unless you're very sure you
> can adequately test it. (Another option would be to set the maximum
> transfer size to 0xf000: 60kbytes. Less than 10% extra transfers in
> the long run than when aiming for the edge...)

Dom Cobley (Raspberry Pi) has just been in contact with
the hardware designer. He said:
65535 is the maximum transfer length of a LITE channel.
65536 will be treated as zero which is undefined
(it will actually do one transfer then stop)

Additional info from the datasheet about Lite channels:
The internal data structure is 128 bits instead of 256 bits.
This means that if you do a 128 bit wide read burst of more
than 1 beat, the DMA input register will be full and the read
bus will be stalled. The normal DMA engine can accept a read
burst of 2 without stalling. If you do a narrow 32 bit read
burst from the peripherals then the lite engine can cope with
a burst of 4 as opposed to a burst of 8 for the normal engine.

This suggest to me that we could go as far as the last 128-bit
boundary like this: (SZ_64K - 16)

2015-04-16 19:07:40

by Alexander Stein

[permalink] [raw]
Subject: Re: Re: [PATCH] dmaengine: bcm2835: Add slave dma support

Hi Stefan,

On Wednesday 15 April 2015, 21:00:26 wrote Stefan Wahren:
> Am 15.04.2015 um 11:56 schrieb Noralf Trønnes:
> > Add slave transfer capability to BCM2835 dmaengine driver.
> > This patch is pulled from the bcm2708-dmaengine driver in the
> > Raspberry Pi repo. The work was done by Gellert Weisz.
> >
> > Tested with the bcm2835-mmc driver from the same repo.
>
> why not with the upstream kernel?

I also looked at slave dma support, especially for use in mmc. It turns our that bcm2835-mmc is written more or less completly new.
Mainline linux uses sdhci "framework" which internally uses the SDMA and/or ADMA (both internal, to SD/MMC controller, DMA units) which can be supported by an SDHCI compatible controller.
AFAIK the SD/MMC controller in bcm2835 lacks both that is why the driver only uses PIO. I dunno if external DMA usage can so easily be integrated into the sdhci, I have my doubts.

Best regards,
Alexander

2015-04-16 22:03:52

by Noralf Trønnes

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support


Den 16.04.2015 21:06, skrev Alexander Stein:
> Hi Stefan,
>
> On Wednesday 15 April 2015, 21:00:26 wrote Stefan Wahren:
>> Am 15.04.2015 um 11:56 schrieb Noralf Trønnes:
>>> Add slave transfer capability to BCM2835 dmaengine driver.
>>> This patch is pulled from the bcm2708-dmaengine driver in the
>>> Raspberry Pi repo. The work was done by Gellert Weisz.
>>>
>>> Tested with the bcm2835-mmc driver from the same repo.
>> why not with the upstream kernel?
> I also looked at slave dma support, especially for use in mmc. It turns our that bcm2835-mmc is written more or less completly new.
> Mainline linux uses sdhci "framework" which internally uses the SDMA and/or ADMA (both internal, to SD/MMC controller, DMA units) which can be supported by an SDHCI compatible controller.
> AFAIK the SD/MMC controller in bcm2835 lacks both that is why the driver only uses PIO. I dunno if external DMA usage can so easily be integrated into the sdhci, I have my doubts.

I asked Jonathan Bell (Raspberry Pi) about why a new driver was made
instead of extending sdhci-bcm2835.

On 10.04.2015 20:02, Jonathan Bell wrote:
> Basically, it's impossible to integrate platform DMA channel support
> within the SDHCI framework. The Arasan controller (and the Broadcom
> MMCI controller) both use platform DMA channels to pump data to/from
> the host FIFO. Our old "sdhci-bcm2708" driver basically hacked sdhci.c
> to allow platform DMA support in a way that was guaranteed to cause
> merge conflicts with every new kernel branch. The reasoning behind
> creating an MMC-level driver was to minimise disruption of incorporating
> platform DMA and to have additional control e.g. on sequencing of
commands
> that are known to have bugs/problems. There are drivers in the source
> tree that are "SDHCI compliant" but have their own various idiosyncrasies
> - e.g. :
http://lxr.free-electrons.com/source/drivers/mmc/host/davinci_mmc.c
> Implementing an MMC-level driver was the easiest way to incorporate all
> our various bits of baggage (random necessary delays here, busy-wait
there)
> without disrupting the rest of the codebase. I agree that some functions
> could just substitute the sdhci.c equivalents and deduplicate some of
the code.


Stephen Warren made this comment on a previous attempt to upstream the
bcm2835-mmc driver:

On Tue, 28 Oct 2014 19:55:20 -0600, Stephen Warren wrote:
> On 10/28/2014 06:00 PM, Piotr Król wrote:
> > This is driver for Arasan External Mass Media Controller provided in
> > Raspberry Pi single board computer.
>
> We should not have multiple drivers for the same HW. The correct
> approach would be to enhance the existing sdhci-bcm2835.c to support any
> new features or bug-fixes embodied within this driver. Presumably that
> way, you'd also end up with a lot of small feature patches, which would
> make patch review easier. Consequently I haven't reviewed this patch
much.

2015-04-16 22:09:44

by Noralf Trønnes

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support


Den 15.04.2015 21:00, skrev Stefan Wahren:
> Hi Noralf,
>
> Am 15.04.2015 um 11:56 schrieb Noralf Trønnes:
>> Add slave transfer capability to BCM2835 dmaengine driver.
>> This patch is pulled from the bcm2708-dmaengine driver in the
>> Raspberry Pi repo. The work was done by Gellert Weisz.
>>
>> Tested with the bcm2835-mmc driver from the same repo.
>
> why not with the upstream kernel?
>

See my answer to Alexander Stein.

>> + unsigned int i, j, splitct, max_size;
>
> I think "split_cnt" would be better.
>
>> + es = BCM2835_DMA_DATA_TYPE_S32;
>
> Looks like "es" is never used.
>
>> + break;
>> + default:
>
> A dev_err() might be useful here.
>
>> + d->frames += 1 + len / max_size;
>
> If it's correct this should be more intuitive:
>
> d->frames += len / max_size + 1;
>
>> + for_each_sg(sgl, sgent, sg_len, i) {
>> + dma_addr_t addr = sg_dma_address(sgent);
>> + unsigned int len = sg_dma_len(sgent);
>> +
>> + for (j = 0; j < len; j += max_size) {
>
> It should be possible to move declaration of "j" down here.
>
>> + if (sync_type != 0)
>
> if (sync_type) ?
>

Thanks for your comments Stefan, I'll make a new version of the patch.


Noralf.

2015-04-17 17:09:00

by Stefan Wahren

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support

Hi Noralf,

Am 17.04.2015 um 00:09 schrieb Noralf Trønnes:
>
> Den 15.04.2015 21:00, skrev Stefan Wahren:
>> Hi Noralf,
>>
>> Am 15.04.2015 um 11:56 schrieb Noralf Trønnes:
>>> Add slave transfer capability to BCM2835 dmaengine driver.
>>> This patch is pulled from the bcm2708-dmaengine driver in the
>>> Raspberry Pi repo. The work was done by Gellert Weisz.
>>>
>>> Tested with the bcm2835-mmc driver from the same repo.
>>
>> why not with the upstream kernel?
>>
>
> See my answer to Alexander Stein.

i read the mail, but i'm still confused. Please let me paraphrase my
last question:

Is this patch testable with upstream kernel?

It would be helpful to put those facts from the email to Alexander into
the patch description. Please clarify the intension of your patch.

Thanks
Stefan

2015-04-17 17:19:50

by Martin Sperl

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support


> On 17.04.2015, at 19:08, Stefan Wahren <[email protected]> wrote:
> i read the mail, but i'm still confused. Please let me paraphrase my last question:
>
> Is this patch testable with upstream kernel?
>
> It would be helpful to put those facts from the email to Alexander into
> the patch description. Please clarify the intension of your patch.

The spi-bcm2835 driver will probably be the first “consumer” of this
patch. But that development is has just started and it obviously
requires scatter/gather support in dma-engine to work.

2015-04-17 17:20:59

by Noralf Trønnes

[permalink] [raw]
Subject: Re: [PATCH] dmaengine: bcm2835: Add slave dma support

Hi Stefan,

Den 17.04.2015 19:08, skrev Stefan Wahren:
> Hi Noralf,
>
> Am 17.04.2015 um 00:09 schrieb Noralf Trønnes:
>>
>> Den 15.04.2015 21:00, skrev Stefan Wahren:
>>> Hi Noralf,
>>>
>>> Am 15.04.2015 um 11:56 schrieb Noralf Trønnes:
>>>> Add slave transfer capability to BCM2835 dmaengine driver.
>>>> This patch is pulled from the bcm2708-dmaengine driver in the
>>>> Raspberry Pi repo. The work was done by Gellert Weisz.
>>>>
>>>> Tested with the bcm2835-mmc driver from the same repo.
>>>
>>> why not with the upstream kernel?
>>>
>>
>> See my answer to Alexander Stein.
>
> i read the mail, but i'm still confused. Please let me paraphrase my
> last question:
>
> Is this patch testable with upstream kernel?
>

Sorry, I misread you.
This patch was made against mainline 4.0-rc7, not the Raspberry Pi repo.
I then used the bcm2835-mmc driver in mainline to be able to test the
functionality.

> It would be helpful to put those facts from the email to Alexander into
> the patch description. Please clarify the intension of your patch.
>

From my point of view, the mmc driver is a discussion of it's own.
This patch provides functionality that other drivers can make use of as
well.
Martin Sperl will soon start working on DMA support for spi-bcm2835,
relying on this patch to make that happen.


Noralf.