2015-04-18 11:07:28

by Noralf Trønnes

[permalink] [raw]
Subject: [PATCH v2] dmaengine: bcm2835: Add slave dma support

Add slave transfer capability to BCM2835 dmaengine driver.
This patch is pulled from the bcm2708-dmaengine driver in the
Raspberry Pi repo. The work was done by Gellert Weisz.

Tested using the bcm2835-mmc driver from the same repo.

Signed-off-by: Noralf Trønnes <[email protected]>
---

Gellert Weisz has ended his internship with Raspberry Pi Trading and
was not available to sign off this patch.

The patch is made against mainline 4.0-rc7.

Changes from v1:
Martin Sperl, Dom Cobley:
MAX_LITE_TRANSFER has to be 32-bit aligned

Stefan Wahren:
Variable es is not used
Change splitct to split_cnt
Add dev_err for unsupported buswidth
Rearrange d->frames formula for readability
Move j variable definition to loop

Changes from original code:
Remove slave_id use.
SDHCI_BCM_DMA_WAITS changed to BCM2835_DMA_WAIT_CYCLES.
Use SZ_* macros instead of decimal values.
Change MAX_LITE_TRANSFER from 32k to 64K - 1.
Fix several whitespace issues.

Lee Jones' comments in previous email to Piotr Król:
Remove __func__ from dev_err message.
Cleanup comments.


Quick testing:
Enable CONFIG_DMA_BCM2835
Apply patch: https://gist.github.com/notro/0c9184df1b43a793f006

drivers/dma/bcm2835-dma.c | 206 ++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 192 insertions(+), 14 deletions(-)

diff --git a/drivers/dma/bcm2835-dma.c b/drivers/dma/bcm2835-dma.c
index c92d6a7..0d93a90 100644
--- a/drivers/dma/bcm2835-dma.c
+++ b/drivers/dma/bcm2835-dma.c
@@ -1,11 +1,10 @@
/*
* BCM2835 DMA engine support
*
- * This driver only supports cyclic DMA transfers
- * as needed for the I2S module.
- *
* Author: Florian Meier <[email protected]>
* Copyright 2013
+ * Gellert Weisz <[email protected]>
+ * Copyright 2013-2014
*
* Based on
* OMAP DMAengine support by Russell King
@@ -89,6 +88,8 @@ struct bcm2835_desc {
size_t size;
};

+#define BCM2835_DMA_WAIT_CYCLES 0 /* Slow down DMA transfers: 0-31 */
+
#define BCM2835_DMA_CS 0x00
#define BCM2835_DMA_ADDR 0x04
#define BCM2835_DMA_SOURCE_AD 0x0c
@@ -105,12 +106,16 @@ struct bcm2835_desc {
#define BCM2835_DMA_RESET BIT(31) /* WO, self clearing */

#define BCM2835_DMA_INT_EN BIT(0)
+#define BCM2835_DMA_WAIT_RESP BIT(3)
#define BCM2835_DMA_D_INC BIT(4)
+#define BCM2835_DMA_D_WIDTH BIT(5)
#define BCM2835_DMA_D_DREQ BIT(6)
#define BCM2835_DMA_S_INC BIT(8)
+#define BCM2835_DMA_S_WIDTH BIT(9)
#define BCM2835_DMA_S_DREQ BIT(10)

#define BCM2835_DMA_PER_MAP(x) ((x) << 16)
+#define BCM2835_DMA_WAITS(x) (((x) & 0x1f) << 21)

#define BCM2835_DMA_DATA_TYPE_S8 1
#define BCM2835_DMA_DATA_TYPE_S16 2
@@ -124,6 +129,14 @@ struct bcm2835_desc {
#define BCM2835_DMA_CHAN(n) ((n) << 8) /* Base address */
#define BCM2835_DMA_CHANIO(base, n) ((base) + BCM2835_DMA_CHAN(n))

+#define MAX_NORMAL_TRANSFER SZ_1G
+/*
+ * Max length on a Lite channel is 65535 bytes.
+ * DMA handles byte-enables on SDRAM reads and writes even on 128-bit accesses,
+ * but byte-enables don't exist on peripheral addresses, so align to 32-bit.
+ */
+#define MAX_LITE_TRANSFER (SZ_64K - 4)
+
static inline struct bcm2835_dmadev *to_bcm2835_dma_dev(struct dma_device *d)
{
return container_of(d, struct bcm2835_dmadev, ddev);
@@ -217,12 +230,18 @@ static irqreturn_t bcm2835_dma_callback(int irq, void *data)
d = c->desc;

if (d) {
- /* TODO Only works for cyclic DMA */
- vchan_cyclic_callback(&d->vd);
- }
+ if (c->cyclic) {
+ vchan_cyclic_callback(&d->vd);

- /* Keep the DMA engine running */
- writel(BCM2835_DMA_ACTIVE, c->chan_base + BCM2835_DMA_CS);
+ /* Keep the DMA engine running */
+ writel(BCM2835_DMA_ACTIVE,
+ c->chan_base + BCM2835_DMA_CS);
+
+ } else {
+ vchan_cookie_complete(&c->desc->vd);
+ bcm2835_dma_start_desc(c);
+ }
+ }

spin_unlock_irqrestore(&c->vc.lock, flags);

@@ -323,8 +342,6 @@ static void bcm2835_dma_issue_pending(struct dma_chan *chan)
struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
unsigned long flags;

- c->cyclic = true; /* Nothing else is implemented */
-
spin_lock_irqsave(&c->vc.lock, flags);
if (vchan_issue_pending(&c->vc) && !c->desc)
bcm2835_dma_start_desc(c);
@@ -342,7 +359,7 @@ static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
struct bcm2835_desc *d;
dma_addr_t dev_addr;
unsigned int es, sync_type;
- unsigned int frame;
+ unsigned int frame, max_size;

/* Grab configuration */
if (!is_slave_direction(direction)) {
@@ -375,7 +392,12 @@ static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
return NULL;

d->dir = direction;
- d->frames = buf_len / period_len;
+ if (c->ch >= 8) /* LITE channel */
+ max_size = MAX_LITE_TRANSFER;
+ else
+ max_size = MAX_NORMAL_TRANSFER;
+ period_len = min(period_len, max_size);
+ d->frames = (buf_len - 1) / (period_len + 1);

/* Allocate memory for control blocks */
d->control_block_size = d->frames * sizeof(struct bcm2835_dma_cb);
@@ -420,12 +442,16 @@ static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
BCM2835_DMA_PER_MAP(c->dreq);

/* Length of a frame */
- control_block->length = period_len;
+ if (frame != d->frames - 1)
+ control_block->length = period_len;
+ else
+ control_block->length = buf_len - (d->frames - 1) *
+ period_len;
d->size += control_block->length;

/*
* Next block is the next frame.
- * This DMA engine driver currently only supports cyclic DMA.
+ * This function is called on cyclic DMA transfers.
* Therefore, wrap around at number of frames.
*/
control_block->next = d->control_block_base_phys +
@@ -433,6 +459,156 @@ static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
* ((frame + 1) % d->frames);
}

+ c->cyclic = true;
+
+ return vchan_tx_prep(&c->vc, &d->vd, flags);
+}
+
+static struct dma_async_tx_descriptor *
+bcm2835_dma_prep_slave_sg(struct dma_chan *chan,
+ struct scatterlist *sgl,
+ unsigned int sg_len,
+ enum dma_transfer_direction direction,
+ unsigned long flags, void *context)
+{
+ struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
+ enum dma_slave_buswidth dev_width;
+ struct bcm2835_desc *d;
+ dma_addr_t dev_addr;
+ struct scatterlist *sgent;
+ unsigned int i, sync_type, split_cnt, max_size;
+
+ if (!is_slave_direction(direction)) {
+ dev_err(chan->device->dev, "direction not supported\n");
+ return NULL;
+ }
+
+ if (direction == DMA_DEV_TO_MEM) {
+ dev_addr = c->cfg.src_addr;
+ dev_width = c->cfg.src_addr_width;
+ sync_type = BCM2835_DMA_S_DREQ;
+ } else {
+ dev_addr = c->cfg.dst_addr;
+ dev_width = c->cfg.dst_addr_width;
+ sync_type = BCM2835_DMA_D_DREQ;
+ }
+
+ /* Bus width translates to the element size (ES) */
+ switch (dev_width) {
+ case DMA_SLAVE_BUSWIDTH_4_BYTES:
+ break;
+ default:
+ dev_err(chan->device->dev, "buswidth not supported: %i\n",
+ dev_width);
+ return NULL;
+ }
+
+ /* Allocate and setup the descriptor. */
+ d = kzalloc(sizeof(*d), GFP_NOWAIT);
+ if (!d)
+ return NULL;
+
+ d->dir = direction;
+
+ if (c->ch >= 8) /* LITE channel */
+ max_size = MAX_LITE_TRANSFER;
+ else
+ max_size = MAX_NORMAL_TRANSFER;
+
+ /*
+ * Store the length of the SG list in d->frames
+ * taking care to account for splitting up transfers
+ * too large for a LITE channel
+ */
+ d->frames = 0;
+ for_each_sg(sgl, sgent, sg_len, i) {
+ unsigned int len = sg_dma_len(sgent);
+
+ d->frames += len / max_size + 1;
+ }
+
+ /* Allocate memory for control blocks */
+ d->control_block_size = d->frames * sizeof(struct bcm2835_dma_cb);
+ d->control_block_base = dma_zalloc_coherent(chan->device->dev,
+ d->control_block_size, &d->control_block_base_phys,
+ GFP_NOWAIT);
+ if (!d->control_block_base) {
+ kfree(d);
+ return NULL;
+ }
+
+ /*
+ * Iterate over all SG entries, create a control block
+ * for each frame and link them together.
+ * Count the number of times an SG entry had to be split
+ * as a result of using a LITE channel
+ */
+ split_cnt = 0;
+
+ for_each_sg(sgl, sgent, sg_len, i) {
+ unsigned int j;
+ dma_addr_t addr = sg_dma_address(sgent);
+ unsigned int len = sg_dma_len(sgent);
+
+ for (j = 0; j < len; j += max_size) {
+ struct bcm2835_dma_cb *control_block =
+ &d->control_block_base[i + split_cnt];
+
+ /* Setup addresses */
+ if (d->dir == DMA_DEV_TO_MEM) {
+ control_block->info = BCM2835_DMA_D_INC |
+ BCM2835_DMA_D_WIDTH |
+ BCM2835_DMA_S_DREQ;
+ control_block->src = dev_addr;
+ control_block->dst = addr + (dma_addr_t)j;
+ } else {
+ control_block->info = BCM2835_DMA_S_INC |
+ BCM2835_DMA_S_WIDTH |
+ BCM2835_DMA_D_DREQ;
+ control_block->src = addr + (dma_addr_t)j;
+ control_block->dst = dev_addr;
+ }
+
+ /* Common part */
+ control_block->info |=
+ BCM2835_DMA_WAITS(BCM2835_DMA_WAIT_CYCLES);
+ control_block->info |= BCM2835_DMA_WAIT_RESP;
+
+ /* Enable */
+ if (i == sg_len - 1 && len - j <= max_size)
+ control_block->info |= BCM2835_DMA_INT_EN;
+
+ /* Setup synchronization */
+ if (sync_type)
+ control_block->info |= sync_type;
+
+ /* Setup DREQ channel */
+ if (c->dreq)
+ control_block->info |=
+ BCM2835_DMA_PER_MAP(c->dreq);
+
+ /* Length of a frame */
+ control_block->length = min(len - j, max_size);
+ d->size += control_block->length;
+
+ if (i < sg_len - 1 || len - j > max_size) {
+ /* Next block is the next frame. */
+ control_block->next =
+ d->control_block_base_phys +
+ sizeof(struct bcm2835_dma_cb) *
+ (i + split_cnt + 1);
+ } else {
+ /* Next block is empty. */
+ control_block->next = 0;
+ }
+
+ if (len - j > max_size)
+ split_cnt++;
+ }
+ }
+
+ c->cyclic = false;
+
return vchan_tx_prep(&c->vc, &d->vd, flags);
}

@@ -590,6 +766,7 @@ static int bcm2835_dma_probe(struct platform_device *pdev)
od->ddev.device_tx_status = bcm2835_dma_tx_status;
od->ddev.device_issue_pending = bcm2835_dma_issue_pending;
od->ddev.device_prep_dma_cyclic = bcm2835_dma_prep_dma_cyclic;
+ od->ddev.device_prep_slave_sg = bcm2835_dma_prep_slave_sg;
od->ddev.device_config = bcm2835_dma_slave_config;
od->ddev.device_terminate_all = bcm2835_dma_terminate_all;
od->ddev.src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
@@ -678,4 +855,5 @@ module_platform_driver(bcm2835_dma_driver);
MODULE_ALIAS("platform:bcm2835-dma");
MODULE_DESCRIPTION("BCM2835 DMA engine driver");
MODULE_AUTHOR("Florian Meier <[email protected]>");
+MODULE_AUTHOR("Gellert Weisz <[email protected]>");
MODULE_LICENSE("GPL v2");
--
2.2.2


2015-05-01 21:07:40

by Martin Sperl

[permalink] [raw]
Subject: Re: [PATCH v2] dmaengine: bcm2835: Add slave dma support

Tests with the initial (and incomplete) version of the spi-bcm2835 driver
with DMA transfer support show that the dma-engine works as expected with
this patch.

There is one one observation:

> On 18.04.2015, at 13:06, Noralf Trønnes <[email protected]> wrote:
> +static struct dma_async_tx_descriptor *
> +bcm2835_dma_prep_slave_sg(struct dma_chan *chan,
> + struct scatterlist *sgl,
> + unsigned int sg_len,
> + enum dma_transfer_direction direction,
> + unsigned long flags, void *context)
> +{
...
> + /* Enable */
> + if (i == sg_len - 1 && len - j <= max_size)
> + control_block->info |= BCM2835_DMA_INT_EN;

The observation is that an interrupt is always triggered - even in the case
where flags does NOT have DMA_PREP_INTERRUPT set.
This may not be necessary and avoid interrupts.

So maybe the above if clause should get extended by:
&& (flags & DMA_PREP_INTERRUPT)
to only trigger an interrupt when really requested.

I am not sure if there are any side-effects because of this besides having the
requirement on the client to run dmaengine_terminate_all() on that specific dma
channel without interrupts when the transfer is finished.

In the case of SPI we have TX feed the fifo - which finishes early - , but we
only need to the interrupt when RX finishes reading the fifo, which indicates
that the SPI-transfer is fully finished.
So having an interrupt on TX is not necessary for the process.

The same observations may also apply to bcm2835_dma_prep_dma_cyclic (which is
outside of this patch provided by Noralf).

Martin

2015-05-04 06:59:25

by Martin Sperl

[permalink] [raw]
Subject: Re: [PATCH v2] dmaengine: bcm2835: Add slave dma support

I did tests with the (almost) finished dma-support of spi-bcm2835
against a TFT device and it work fine there.

Upstreaming this spi-bcm2835 patch obviously requires that the DMA
support is in the dma-engine driver first...

Please also see prior note about unnecessary interrupts when none
are requested for possible improvements.

> On 18.04.2015, at 13:06, Noralf Trønnes <[email protected]> wrote:
>
> Add slave transfer capability to BCM2835 dmaengine driver.
> This patch is pulled from the bcm2708-dmaengine driver in the
> Raspberry Pi repo. The work was done by Gellert Weisz.
>
> Tested using the bcm2835-mmc driver from the same repo.
>
> Signed-off-by: Noralf Trønnes <[email protected]>

Tested-by: Martin Sperl <[email protected]>

2015-05-06 19:22:15

by Noralf Trønnes

[permalink] [raw]
Subject: Re: [PATCH v2] dmaengine: bcm2835: Add slave dma support


Den 01.05.2015 23:07, skrev Martin Sperl:
> Tests with the initial (and incomplete) version of the spi-bcm2835 driver
> with DMA transfer support show that the dma-engine works as expected with
> this patch.
>
> There is one one observation:
>
>> On 18.04.2015, at 13:06, Noralf Trønnes <[email protected]> wrote:
>> +static struct dma_async_tx_descriptor *
>> +bcm2835_dma_prep_slave_sg(struct dma_chan *chan,
>> + struct scatterlist *sgl,
>> + unsigned int sg_len,
>> + enum dma_transfer_direction direction,
>> + unsigned long flags, void *context)
>> +{
> ...
>> + /* Enable */
>> + if (i == sg_len - 1 && len - j <= max_size)
>> + control_block->info |= BCM2835_DMA_INT_EN;
> The observation is that an interrupt is always triggered - even in the case
> where flags does NOT have DMA_PREP_INTERRUPT set.
> This may not be necessary and avoid interrupts.
>
> So maybe the above if clause should get extended by:
> && (flags & DMA_PREP_INTERRUPT)
> to only trigger an interrupt when really requested.
>
> I am not sure if there are any side-effects because of this besides having the
> requirement on the client to run dmaengine_terminate_all() on that specific dma
> channel without interrupts when the transfer is finished.
>
> In the case of SPI we have TX feed the fifo - which finishes early - , but we
> only need to the interrupt when RX finishes reading the fifo, which indicates
> that the SPI-transfer is fully finished.
> So having an interrupt on TX is not necessary for the process.
>
> The same observations may also apply to bcm2835_dma_prep_dma_cyclic (which is
> outside of this patch provided by Noralf).

Jonathan, can you comment on this?


Noralf.

2015-05-08 11:26:50

by Jonathan Bell

[permalink] [raw]
Subject: Re: [PATCH v2] dmaengine: bcm2835: Add slave dma support

On 06/05/2015 20:21, Noralf Trønnes wrote:
>
> Den 01.05.2015 23:07, skrev Martin Sperl:
>> Tests with the initial (and incomplete) version of the spi-bcm2835
>> driver
>> with DMA transfer support show that the dma-engine works as expected
>> with
>> this patch.
>>
>> There is one one observation:
>>
>>> On 18.04.2015, at 13:06, Noralf Trønnes <[email protected]> wrote:
>>> +static struct dma_async_tx_descriptor *
>>> +bcm2835_dma_prep_slave_sg(struct dma_chan *chan,
>>> + struct scatterlist *sgl,
>>> + unsigned int sg_len,
>>> + enum dma_transfer_direction direction,
>>> + unsigned long flags, void *context)
>>> +{
>> ...
>>> + /* Enable */
>>> + if (i == sg_len - 1 && len - j <= max_size)
>>> + control_block->info |= BCM2835_DMA_INT_EN;
>> The observation is that an interrupt is always triggered - even in
>> the case
>> where flags does NOT have DMA_PREP_INTERRUPT set.
>> This may not be necessary and avoid interrupts.
>>
>> So maybe the above if clause should get extended by:
>> && (flags & DMA_PREP_INTERRUPT)
>> to only trigger an interrupt when really requested.
>>
>> I am not sure if there are any side-effects because of this besides
>> having the
>> requirement on the client to run dmaengine_terminate_all() on that
>> specific dma
>> channel without interrupts when the transfer is finished.
>>
>> In the case of SPI we have TX feed the fifo - which finishes early -
>> , but we
>> only need to the interrupt when RX finishes reading the fifo, which
>> indicates
>> that the SPI-transfer is fully finished.
>> So having an interrupt on TX is not necessary for the process.
>>
>> The same observations may also apply to bcm2835_dma_prep_dma_cyclic
>> (which is
>> outside of this patch provided by Noralf).
>
> Jonathan, can you comment on this?
>
>
> Noralf.
>
>
I agree that the interrupt generated would be spurious - in the case
where it is not required.

However if you do && (flags & DMA_PREP_INTERRUPT) then all users of this
driver need to explicitly set interrupt flags when doing a
scatter-gather transfer. As I understand it, currently the only upstream
client of this driver is the I2S driver which only uses cyclic anyway.

Not requiring an interrupt on completion is a bit of an edge case - the
default among other dmaengine drivers appears to be to enable interrupts
unconditionally.


2015-05-11 05:11:49

by Martin Sperl

[permalink] [raw]
Subject: Re: [PATCH v2] dmaengine: bcm2835: Add slave dma support


> On 08.05.2015, at 13:20, Jonathan Bell <[email protected]> wrote:
>>
> I agree that the interrupt generated would be spurious - in the case where it is not required.
>
> However if you do && (flags & DMA_PREP_INTERRUPT) then all users of this driver need to explicitly set interrupt flags when doing a scatter-gather transfer. As I understand it, currently the only upstream client of this driver is the I2S driver which only uses cyclic anyway.
>
> Not requiring an interrupt on completion is a bit of an edge case - the default among other dmaengine drivers appears to be to enable interrupts unconditionally.

I have now submitted a patch for spi-bcm2835 to make use of dma,
so there is one candidate for this kind of behavior.
So please go forward with the merge.

Also note that with the spi-HW dma support of the bcm2835
it is necessary to do a RX transfer even if the data is not
used (similar for TX).

Right now we have to allocate a dummy buffer to run these kind
of “one-way” transfers where we need 2 DMA channels.

The bcm2835 dma hw supports such dummy-transfer modes via
BCM2835_DMA_S_IGNORE and BCM2835_DMA_D_IGNORE.

So maybe we can add a “flag” to the dmaengine_prep_slave_sg
that will allow such kind of behavior to get implemented?

That is not a necessity, but would be a welcome improvement.

Tested-by: Martin Sperl <[email protected]>

2015-05-12 15:58:26

by Noralf Trønnes

[permalink] [raw]
Subject: Re: [PATCH v2] dmaengine: bcm2835: Add slave dma support

Hi,

Den 18.04.2015 13:06, skrev Noralf Trønnes:
> Add slave transfer capability to BCM2835 dmaengine driver.
> This patch is pulled from the bcm2708-dmaengine driver in the
> Raspberry Pi repo. The work was done by Gellert Weisz.
>
> Tested using the bcm2835-mmc driver from the same repo.
>
> Signed-off-by: Noralf Trønnes <[email protected]>
> ---
>
> Gellert Weisz has ended his internship with Raspberry Pi Trading and
> was not available to sign off this patch.
>
> The patch is made against mainline 4.0-rc7.
>
> Changes from v1:
> Martin Sperl, Dom Cobley:
> MAX_LITE_TRANSFER has to be 32-bit aligned
>
> Stefan Wahren:
> Variable es is not used
> Change splitct to split_cnt
> Add dev_err for unsupported buswidth
> Rearrange d->frames formula for readability
> Move j variable definition to loop
>
> Changes from original code:
> Remove slave_id use.
> SDHCI_BCM_DMA_WAITS changed to BCM2835_DMA_WAIT_CYCLES.
> Use SZ_* macros instead of decimal values.
> Change MAX_LITE_TRANSFER from 32k to 64K - 1.
> Fix several whitespace issues.
>
> Lee Jones' comments in previous email to Piotr Król:
> Remove __func__ from dev_err message.
> Cleanup comments.

Is there something missing for this patch to get accepted?
spi-bcm2835 has now DMA support that depends on this patch.


Noralf.

2015-05-16 17:31:45

by Martin Sperl

[permalink] [raw]
Subject: Re: [PATCH v2] dmaengine: bcm2835: Add slave dma support


> On 12.05.2015, at 17:58, Noralf Trønnes <[email protected]> wrote:
>
> Is there something missing for this patch to get accepted?
> spi-bcm2835 has now DMA support that depends on this patch.

As the spi-bcm2835.c patch using DMA (relying on this) has gone into
spi/for-next (but so far without the DT changes to enable DMA mode)
can we also get this dmaengine patch merged into for-next?

Note that I played BigBuckBunny movie on my spi TFT for the last
24 hours using DMA only and it works without any hickups using
the V2 patch provided by Noralf - it transferred 24GB so far.
I also did test it with spi-mmc and enc28j60 - all of which use DMA
with different access pattern, so that test should be quite
comprehensive.

Thanks,
Martin