Hi Vinod,
Here is a patch series that adds a new dmaengine operation for
scatter-gathered memset.
Indeed, doing a memset over a discontiguous buffer is quite
inefficient at the moment, since you have to create and submit each
chunk separately, which might result in a huge list of transfers,
even though some controllers can handle such a transfer just fine.
Let me know what you think,
Maxime
Maxime Ripard (2):
dmaengine: Add scatter-gathered memset
dmaengine: xdmac: Add scatter gathered memset support
drivers/dma/at_xdmac.c | 166 +++++++++++++++++++++++++++++++++++++++++++++-
include/linux/dmaengine.h | 5 ++
2 files changed, 170 insertions(+), 1 deletion(-)
--
2.4.5
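To make the inefficiency concrete, here is a minimal sketch of a
hypothetical client, assuming a DMA-mapped scatterlist "sgl" with
"nents" entries and a channel "chan" with the relevant capabilities;
the function names are made up, the calls are the existing dmaengine
ones plus the operation added by this series:

#include <linux/dmaengine.h>
#include <linux/scatterlist.h>

/* Today: one descriptor prepared and submitted per chunk. */
static int memset_sg_chunk_by_chunk(struct dma_chan *chan,
                                    struct scatterlist *sgl,
                                    unsigned int nents, int value)
{
        struct dma_async_tx_descriptor *tx;
        struct scatterlist *sg;
        int i;

        for_each_sg(sgl, sg, nents, i) {
                tx = chan->device->device_prep_dma_memset(chan,
                                                          sg_dma_address(sg),
                                                          value,
                                                          sg_dma_len(sg), 0);
                if (!tx)
                        return -ENOMEM;
                dmaengine_submit(tx);
        }

        return 0;
}

/* With this series: a single descriptor covering the whole list. */
static int memset_sg_at_once(struct dma_chan *chan, struct scatterlist *sgl,
                             unsigned int nents, int value)
{
        struct dma_async_tx_descriptor *tx;

        tx = chan->device->device_prep_dma_memset_sg(chan, sgl, nents,
                                                     value, 0);
        if (!tx)
                return -ENOMEM;
        dmaengine_submit(tx);

        return 0;
}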
The current API allows the driver to accelerate memset by using the DMA
controller.
However, it only does so over a contiguous memory area, which might prove
inefficient when you have to do it over a non-contiguous yet repetitive
pattern, since you have to create a number of descriptors and then submit
each of them separately.
Add a memset operation going over a scatter list to handle such cases in a
single call.
Signed-off-by: Maxime Ripard <[email protected]>
---
include/linux/dmaengine.h | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index e2f5eb419976..8da299cc7361 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -66,6 +66,7 @@ enum dma_transaction_type {
DMA_XOR_VAL,
DMA_PQ_VAL,
DMA_MEMSET,
+ DMA_MEMSET_SG,
DMA_INTERRUPT,
DMA_SG,
DMA_PRIVATE,
@@ -616,6 +617,7 @@ struct dma_tx_state {
* @device_prep_dma_pq: prepares a pq operation
* @device_prep_dma_pq_val: prepares a pqzero_sum operation
* @device_prep_dma_memset: prepares a memset operation
+ * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list
* @device_prep_dma_interrupt: prepares an end of chain interrupt operation
* @device_prep_slave_sg: prepares a slave dma operation
* @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio.
@@ -682,6 +684,9 @@ struct dma_device {
struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
unsigned long flags);
+ struct dma_async_tx_descriptor *(*device_prep_dma_memset_sg)(
+ struct dma_chan *chan, struct scatterlist *sg,
+ unsigned int nents, int value, unsigned long flags);
struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)(
struct dma_chan *chan, unsigned long flags);
struct dma_async_tx_descriptor *(*device_prep_dma_sg)(
--
2.4.5
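As with the other mask bits, a client would check for the new capability
before using the operation; a short sketch (the requesting function is
hypothetical, the dmaengine calls are the existing ones):

#include <linux/dmaengine.h>

/* Sketch: grab any channel advertising the new DMA_MEMSET_SG capability. */
static struct dma_chan *request_memset_sg_chan(void)
{
        dma_cap_mask_t mask;

        dma_cap_zero(mask);
        dma_cap_set(DMA_MEMSET_SG, mask);

        /* NULL filter function and parameter: any matching channel will do. */
        return dma_request_channel(mask, NULL, NULL);
}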
The XDMAC also supports memset operations over discontiguous areas. Add the
necessary logic to support this.
Signed-off-by: Maxime Ripard <[email protected]>
---
drivers/dma/at_xdmac.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 165 insertions(+), 1 deletion(-)
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index cf1213de7865..1a2d9a39ff25 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1133,7 +1133,7 @@ static struct at_xdmac_desc *at_xdmac_memset_create_desc(struct dma_chan *chan,
* SAMA5D4x), so we can use the same interface for source and dest,
* that solves the fact we don't know the direction.
*/
- u32 chan_cc = AT_XDMAC_CC_DAM_INCREMENTED_AM
+ u32 chan_cc = AT_XDMAC_CC_DAM_UBS_AM
| AT_XDMAC_CC_SAM_INCREMENTED_AM
| AT_XDMAC_CC_DIF(0)
| AT_XDMAC_CC_SIF(0)
@@ -1201,6 +1201,168 @@ at_xdmac_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value,
return &desc->tx_dma_desc;
}
+static struct dma_async_tx_descriptor *
+at_xdmac_prep_dma_memset_sg(struct dma_chan *chan, struct scatterlist *sgl,
+ unsigned int sg_len, int value,
+ unsigned long flags)
+{
+ struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan);
+ struct at_xdmac_desc *desc, *pdesc = NULL,
+ *ppdesc = NULL, *first = NULL;
+ struct scatterlist *sg, *psg = NULL, *ppsg = NULL;
+ size_t stride = 0, pstride = 0, len = 0;
+ int i;
+
+ if (!sgl)
+ return NULL;
+
+ dev_dbg(chan2dev(chan), "%s: sg_len=%d, value=0x%x, flags=0x%lx\n",
+ __func__, sg_len, value, flags);
+
+ /* Prepare descriptors. */
+ for_each_sg(sgl, sg, sg_len, i) {
+ dev_dbg(chan2dev(chan), "%s: dest=0x%08x, len=%d, pattern=0x%x, flags=0x%lx\n",
+ __func__, sg_dma_address(sg), sg_dma_len(sg),
+ value, flags);
+ desc = at_xdmac_memset_create_desc(chan, atchan,
+ sg_dma_address(sg),
+ sg_dma_len(sg),
+ value);
+ if (!desc && first)
+ list_splice_init(&first->descs_list,
+ &atchan->free_descs_list);
+
+ if (!first)
+ first = desc;
+
+ /* Update our strides */
+ pstride = stride;
+ if (psg)
+ stride = sg_dma_address(sg) -
+ (sg_dma_address(psg) + sg_dma_len(psg));
+
+ /*
+ * The scatterlist API gives us only the address and
+ * length of each element.
+ *
+ * Unfortunately, we don't have the stride, which we
+ * will need to compute.
+ *
+ * That makes us end up in a situation like this one:
+ *    len    stride  len    stride  len
+ * +-------+        +-------+        +-------+
+ * |  N-2  |        |  N-1  |        |   N   |
+ * +-------+        +-------+        +-------+
+ *
+ * We need all three of these elements (N-2, N-1 and N)
+ * to decide whether we need to queue N-1 or reuse N-2.
+ *
+ * We will only consider N if it is the last element.
+ */
+ if (ppdesc && pdesc) {
+ if ((stride == pstride) &&
+ (sg_dma_len(ppsg) == sg_dma_len(psg))) {
+ dev_dbg(chan2dev(chan),
+ "%s: desc 0x%p can be merged with desc 0x%p\n",
+ __func__, pdesc, ppdesc);
+
+ /*
+ * Increment the block count of the
+ * N-2 descriptor
+ */
+ at_xdmac_increment_block_count(chan, ppdesc);
+ ppdesc->lld.mbr_dus = stride;
+
+ /*
+ * Put back the N-1 descriptor in the
+ * free descriptor list
+ */
+ list_add_tail(&pdesc->desc_node,
+ &atchan->free_descs_list);
+
+ /*
+ * Make our N-1 descriptor pointer
+ * point to the N-2 since they were
+ * actually merged.
+ */
+ pdesc = ppdesc;
+
+ /*
+ * Rule out the case where we don't have
+ * pstride computed yet (our second sg
+ * element)
+ *
+ * We also want to catch the case where there
+ * would be a negative stride.
+ */
+ } else if (pstride ||
+ sg_dma_address(sg) < sg_dma_address(psg)) {
+ /*
+ * Queue the N-1 descriptor after the
+ * N-2
+ */
+ at_xdmac_queue_desc(chan, ppdesc, pdesc);
+
+ /*
+ * Add the N-1 descriptor to the list
+ * of the descriptors used for this
+ * transfer
+ */
+ list_add_tail(&desc->desc_node,
+ &first->descs_list);
+ dev_dbg(chan2dev(chan),
+ "%s: add desc 0x%p to descs_list 0x%p\n",
+ __func__, desc, first);
+ }
+ }
+
+ /*
+ * If we are the last element, just see if we have the
+ * same size as the previous element.
+ *
+ * If so, we can merge it with the previous descriptor
+ * since we don't care about the stride anymore.
+ */
+ if ((i == (sg_len - 1)) &&
+ sg_dma_len(ppsg) == sg_dma_len(psg)) {
+ dev_dbg(chan2dev(chan),
+ "%s: desc 0x%p can be merged with desc 0x%p\n",
+ __func__, desc, pdesc);
+
+ /*
+ * Increment the block count of the N-1
+ * descriptor
+ */
+ at_xdmac_increment_block_count(chan, pdesc);
+ pdesc->lld.mbr_dus = stride;
+
+ /*
+ * Put back the N descriptor in the free
+ * descriptor list
+ */
+ list_add_tail(&desc->desc_node,
+ &atchan->free_descs_list);
+ }
+
+ /* Update our descriptors */
+ ppdesc = pdesc;
+ pdesc = desc;
+
+ /* Update our scatter pointers */
+ ppsg = psg;
+ psg = sg;
+
+ len += sg_dma_len(sg);
+ }
+
+ first->tx_dma_desc.cookie = -EBUSY;
+ first->tx_dma_desc.flags = flags;
+ first->xfer_size = len;
+
+ return &first->tx_dma_desc;
+}
+
static enum dma_status
at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
struct dma_tx_state *txstate)
@@ -1734,6 +1896,7 @@ static int at_xdmac_probe(struct platform_device *pdev)
dma_cap_set(DMA_INTERLEAVE, atxdmac->dma.cap_mask);
dma_cap_set(DMA_MEMCPY, atxdmac->dma.cap_mask);
dma_cap_set(DMA_MEMSET, atxdmac->dma.cap_mask);
+ dma_cap_set(DMA_MEMSET_SG, atxdmac->dma.cap_mask);
dma_cap_set(DMA_SLAVE, atxdmac->dma.cap_mask);
/*
* Without DMA_PRIVATE the driver is not able to allocate more than
@@ -1749,6 +1912,7 @@ static int at_xdmac_probe(struct platform_device *pdev)
atxdmac->dma.device_prep_interleaved_dma = at_xdmac_prep_interleaved;
atxdmac->dma.device_prep_dma_memcpy = at_xdmac_prep_dma_memcpy;
atxdmac->dma.device_prep_dma_memset = at_xdmac_prep_dma_memset;
+ atxdmac->dma.device_prep_dma_memset_sg = at_xdmac_prep_dma_memset_sg;
atxdmac->dma.device_prep_slave_sg = at_xdmac_prep_slave_sg;
atxdmac->dma.device_config = at_xdmac_device_config;
atxdmac->dma.device_pause = at_xdmac_device_pause;
--
2.4.5
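To see what the preparation loop does on a concrete list, here is a
hypothetical walk-through, assuming three 4 KiB elements at 0x1000,
0x3000 and 0x5000 (i.e. a 4 KiB gap between each pair):

/*
 * i = 0: first element, no stride known yet; create descriptor D0.
 * i = 1: stride = 0x3000 - (0x1000 + 0x1000) = 0x1000; create D1,
 *        but take no decision yet (we need N-2, N-1 and N).
 * i = 2: stride == pstride and sg_dma_len(ppsg) == sg_dma_len(psg),
 *        so D1 is merged into D0: D0's block count is bumped and its
 *        mbr_dus is set to the 0x1000 stride, and D1 goes back to the
 *        free list. Since this is also the last element and lengths
 *        still match, D2 is merged the same way, leaving a single
 *        descriptor that the XDMAC repeats three times.
 */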
On Mon, Jul 06, 2015 at 12:19:24PM +0200, Maxime Ripard wrote:
> The XDMAC also supports memset operations over discontiguous areas. Add the
> necessary logic to support this.
>
> Signed-off-by: Maxime Ripard <[email protected]>
> ---
> drivers/dma/at_xdmac.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 165 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
> index cf1213de7865..1a2d9a39ff25 100644
> --- a/drivers/dma/at_xdmac.c
> +++ b/drivers/dma/at_xdmac.c
> @@ -1133,7 +1133,7 @@ static struct at_xdmac_desc *at_xdmac_memset_create_desc(struct dma_chan *chan,
> * SAMA5D4x), so we can use the same interface for source and dest,
> * that solves the fact we don't know the direction.
> */
> - u32 chan_cc = AT_XDMAC_CC_DAM_INCREMENTED_AM
> + u32 chan_cc = AT_XDMAC_CC_DAM_UBS_AM
> | AT_XDMAC_CC_SAM_INCREMENTED_AM
> | AT_XDMAC_CC_DIF(0)
> | AT_XDMAC_CC_SIF(0)
> @@ -1201,6 +1201,168 @@ at_xdmac_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value,
> return &desc->tx_dma_desc;
> }
>
> +static struct dma_async_tx_descriptor *
> +at_xdmac_prep_dma_memset_sg(struct dma_chan *chan, struct scatterlist *sgl,
> + unsigned int sg_len, int value,
> + unsigned long flags)
> +{
> + struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan);
> + struct at_xdmac_desc *desc, *pdesc = NULL,
> + *ppdesc = NULL, *first = NULL;
> + struct scatterlist *sg, *psg = NULL, *ppsg = NULL;
> + size_t stride = 0, pstride = 0, len = 0;
> + int i;
> +
> + if (!sgl)
> + return NULL;
> +
> + dev_dbg(chan2dev(chan), "%s: sg_len=%d, value=0x%x, flags=0x%lx\n",
> + __func__, sg_len, value, flags);
> +
> + /* Prepare descriptors. */
> + for_each_sg(sgl, sg, sg_len, i) {
> + dev_dbg(chan2dev(chan), "%s: dest=0x%08x, len=%d, pattern=0x%x, flags=0x%lx\n",
> + __func__, sg_dma_address(sg), sg_dma_len(sg),
> + value, flags);
> + desc = at_xdmac_memset_create_desc(chan, atchan,
> + sg_dma_address(sg),
> + sg_dma_len(sg),
> + value);
> + if (!desc && first)
> + list_splice_init(&first->descs_list,
> + &atchan->free_descs_list);
> +
> + if (!first)
> + first = desc;
> +
> + /* Update our strides */
> + pstride = stride;
> + if (psg)
> + stride = sg_dma_address(sg) -
> + (sg_dma_address(psg) + sg_dma_len(psg));
> +
> + /*
> + * The scatterlist API gives us only the address and
> + * length of each element.
> + *
> + * Unfortunately, we don't have the stride, which we
> + * will need to compute.
> + *
> + * That makes us end up in a situation like this one:
> + *    len    stride  len    stride  len
> + * +-------+        +-------+        +-------+
> + * |  N-2  |        |  N-1  |        |   N   |
> + * +-------+        +-------+        +-------+
> + *
> + * We need all three of these elements (N-2, N-1 and N)
> + * to decide whether we need to queue N-1 or reuse N-2.
> + *
> + * We will only consider N if it is the last element.
> + */
Why do you need stride?
This is a scatterlist, so the computation of a stride sounds odd here.
Ideally you should take the scatterlist and program the LLIs for the
controller.
--
~Vinod
> + if (ppdesc && pdesc) {
> + if ((stride == pstride) &&
> + (sg_dma_len(ppsg) == sg_dma_len(psg))) {
> + dev_dbg(chan2dev(chan),
> + "%s: desc 0x%p can be merged with desc 0x%p\n",
> + __func__, pdesc, ppdesc);
> +
> + /*
> + * Increment the block count of the
> + * N-2 descriptor
> + */
> + at_xdmac_increment_block_count(chan, ppdesc);
> + ppdesc->lld.mbr_dus = stride;
> +
> + /*
> + * Put back the N-1 descriptor in the
> + * free descriptor list
> + */
> + list_add_tail(&pdesc->desc_node,
> + &atchan->free_descs_list);
> +
> + /*
> + * Make our N-1 descriptor pointer
> + * point to the N-2 since they were
> + * actually merged.
> + */
> + pdesc = ppdesc;
> +
> + /*
> + * Rule out the case where we don't have
> + * pstride computed yet (our second sg
> + * element)
> + *
> + * We also want to catch the case where there
> + * would be a negative stride.
> + */
> + } else if (pstride ||
> + sg_dma_address(sg) < sg_dma_address(psg)) {
> + /*
> + * Queue the N-1 descriptor after the
> + * N-2
> + */
> + at_xdmac_queue_desc(chan, ppdesc, pdesc);
> +
> + /*
> + * Add the N-1 descriptor to the list
> + * of the descriptors used for this
> + * transfer
> + */
> + list_add_tail(&desc->desc_node,
> + &first->descs_list);
> + dev_dbg(chan2dev(chan),
> + "%s: add desc 0x%p to descs_list 0x%p\n",
> + __func__, desc, first);
> + }
> + }
> +
> + /*
> + * If we are the last element, just see if we have the
> + * same size as the previous element.
> + *
> + * If so, we can merge it with the previous descriptor
> + * since we don't care about the stride anymore.
> + */
> + if ((i == (sg_len - 1)) &&
> + sg_dma_len(ppsg) == sg_dma_len(psg)) {
> + dev_dbg(chan2dev(chan),
> + "%s: desc 0x%p can be merged with desc 0x%p\n",
> + __func__, desc, pdesc);
> +
> + /*
> + * Increment the block count of the N-1
> + * descriptor
> + */
> + at_xdmac_increment_block_count(chan, pdesc);
> + pdesc->lld.mbr_dus = stride;
> +
> + /*
> + * Put back the N descriptor in the free
> + * descriptor list
> + */
> + list_add_tail(&desc->desc_node,
> + &atchan->free_descs_list);
> + }
> +
> + /* Update our descriptors */
> + ppdesc = pdesc;
> + pdesc = desc;
> +
> + /* Update our scatter pointers */
> + ppsg = psg;
> + psg = sg;
> +
> + len += sg_dma_len(sg);
> + }
> +
> + first->tx_dma_desc.cookie = -EBUSY;
> + first->tx_dma_desc.flags = flags;
> + first->xfer_size = len;
> +
> + return &first->tx_dma_desc;
> +}
> +
> static enum dma_status
> at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
> struct dma_tx_state *txstate)
> @@ -1734,6 +1896,7 @@ static int at_xdmac_probe(struct platform_device *pdev)
> dma_cap_set(DMA_INTERLEAVE, atxdmac->dma.cap_mask);
> dma_cap_set(DMA_MEMCPY, atxdmac->dma.cap_mask);
> dma_cap_set(DMA_MEMSET, atxdmac->dma.cap_mask);
> + dma_cap_set(DMA_MEMSET_SG, atxdmac->dma.cap_mask);
> dma_cap_set(DMA_SLAVE, atxdmac->dma.cap_mask);
> /*
> * Without DMA_PRIVATE the driver is not able to allocate more than
> @@ -1749,6 +1912,7 @@ static int at_xdmac_probe(struct platform_device *pdev)
> atxdmac->dma.device_prep_interleaved_dma = at_xdmac_prep_interleaved;
> atxdmac->dma.device_prep_dma_memcpy = at_xdmac_prep_dma_memcpy;
> atxdmac->dma.device_prep_dma_memset = at_xdmac_prep_dma_memset;
> + atxdmac->dma.device_prep_dma_memset_sg = at_xdmac_prep_dma_memset_sg;
> atxdmac->dma.device_prep_slave_sg = at_xdmac_prep_slave_sg;
> atxdmac->dma.device_config = at_xdmac_device_config;
> atxdmac->dma.device_pause = at_xdmac_device_pause;
> --
> 2.4.5
>
--
Hi Vinod,
> > + /*
> > + * The scatterlist API gives us only the address and
> > + * length of each element.
> > + *
> > + * Unfortunately, we don't have the stride, which we
> > + * will need to compute.
> > + *
> > + * That makes us end up in a situation like this one:
> > + *    len    stride  len    stride  len
> > + * +-------+        +-------+        +-------+
> > + * |  N-2  |        |  N-1  |        |   N   |
> > + * +-------+        +-------+        +-------+
> > + *
> > + * We need all three of these elements (N-2, N-1 and N)
> > + * to decide whether we need to queue N-1 or reuse N-2.
> > + *
> > + * We will only consider N if it is the last element.
> > + */
>
> Why do you need stride?
>
> This is a scatterlist, so the computation of a stride sounds odd here.
> Ideally you should take the scatterlist and program the LLIs for the
> controller.
Because it is sub-optimal if the length and stride are equal from one
descriptor to another. The XDMAC is able to repeat any given
descriptor a given number of times (which is one by default), which
means that if the parameters of the transfer don't change, we simply
have to increment the number of times the descriptor has to be used,
instead of creating a new one that the controller will have to fetch.
In the non-optimal case (i.e. the length and/or stride change from one
scatterlist element to another), we simply fall back to one LLI per
scatterlist element.
Maxime
--
Maxime Ripard, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com
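In other words, the merge decision in the patch boils down to comparing
element N-1 with N-2; a simplified, hypothetical restatement of the test
(the real code also special-cases the first two elements and the last
one):

#include <linux/scatterlist.h>

/*
 * Hypothetical helper restating the merge test: if the previous two
 * elements have the same length and the same gap (stride) to their
 * successor, the N-2 descriptor can simply be replayed once more by
 * bumping its block count, instead of fetching a fresh LLI.
 */
static bool can_merge_with_prev(struct scatterlist *ppsg,
                                struct scatterlist *psg,
                                size_t pstride, size_t stride)
{
        return sg_dma_len(ppsg) == sg_dma_len(psg) && stride == pstride;
}

When this test passes, the patch increments the reused descriptor's
block count with at_xdmac_increment_block_count() and stores the stride
in its mbr_dus field; otherwise it chains a fresh descriptor with
at_xdmac_queue_desc().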
On Thu, Jul 16, 2015 at 04:11:05PM +0200, Maxime Ripard wrote:
> Hi Vinod,
>
> > > + /*
> > > + * The scatterlist API gives us only the address and
> > > + * length of each element.
> > > + *
> > > + * Unfortunately, we don't have the stride, which we
> > > + * will need to compute.
> > > + *
> > > + * That makes us end up in a situation like this one:
> > > + *    len    stride  len    stride  len
> > > + * +-------+        +-------+        +-------+
> > > + * |  N-2  |        |  N-1  |        |   N   |
> > > + * +-------+        +-------+        +-------+
> > > + *
> > > + * We need all three of these elements (N-2, N-1 and N)
> > > + * to decide whether we need to queue N-1 or reuse N-2.
> > > + *
> > > + * We will only consider N if it is the last element.
> > > + */
> >
> > Why do you need stride?
> >
> > This is a scatterlist, so the computation of a stride sounds odd here.
> > Ideally you should take the scatterlist and program the LLIs for the
> > controller.
>
> Because it is sub-optimal if the length and stride are equal from one
> descriptor to another. The XDMAC is able to repeat any given
> descriptor a given number of times (which is one by default), which
> means that if the parameters of the transfer don't change, we simply
> have to increment the number of times the descriptor has to be used,
> instead of creating a new one that the controller will have to fetch.
>
> In the non-optimal case (i.e. the length and/or stride change from one
> scatterlist element to another), we simply fall back to one LLI per
> scatterlist element.
Sounds fine to me. The optimization is a good one then...
Another question though: do you expect the stride to be linear for your
cases? If so, have you actually thought about using the interleaved API
for these cases?
--
~Vinod
On Mon, Jul 06, 2015 at 12:19:22PM +0200, Maxime Ripard wrote:
> Hi Vinod,
>
> Here is a patch series that adds a new dmaengine operation for
> scatter-gathered memset.
>
> Indeed, doing a memset over a discontiguous buffer is quite
> inefficient at the moment, since you have to create and submit each
> chunk separately, which might result in a huge list of transfers,
> even though some controllers can handle such a transfer just fine.
>
> Let me know what you think,
> Maxime
>
> Maxime Ripard (2):
> dmaengine: Add scatter-gathered memset
> dmaengine: xdmac: Add scatter gathered memset support
>
I forgot to add it, but of course:
Acked-by: Ludovic Desroches <[email protected]>
Thanks
> drivers/dma/at_xdmac.c | 166 +++++++++++++++++++++++++++++++++++++++++++++-
> include/linux/dmaengine.h | 5 ++
> 2 files changed, 170 insertions(+), 1 deletion(-)
>
> --
> 2.4.5
>
On Fri, Jul 17, 2015 at 08:39:26AM +0530, Vinod Koul wrote:
> On Thu, Jul 16, 2015 at 04:11:05PM +0200, Maxime Ripard wrote:
> > Hi Vinod,
> >
> > > > + /*
> > > > + * The scatterlist API gives us only the address and
> > > > + * length of each element.
> > > > + *
> > > > + * Unfortunately, we don't have the stride, which we
> > > > + * will need to compute.
> > > > + *
> > > > + * That makes us end up in a situation like this one:
> > > > + *    len    stride  len    stride  len
> > > > + * +-------+        +-------+        +-------+
> > > > + * |  N-2  |        |  N-1  |        |   N   |
> > > > + * +-------+        +-------+        +-------+
> > > > + *
> > > > + * We need all three of these elements (N-2, N-1 and N)
> > > > + * to decide whether we need to queue N-1 or reuse N-2.
> > > > + *
> > > > + * We will only consider N if it is the last element.
> > > > + */
> > >
> > > Why do you need stride?
> > >
> > > This is a scatterlist, so the computation of a stride sounds odd here.
> > > Ideally you should take the scatterlist and program the LLIs for the
> > > controller.
> >
> > Because it is sub-optimal if the length and stride are equal from one
> > descriptor to another. The XDMAC is able to repeat any given
> > descriptor a given number of times (which is one by default), which
> > means that if the parameters of the transfer don't change, we simply
> > have to increment the number of times the descriptor has to be used,
> > instead of creating a new one that the controller will have to fetch.
> >
> > In the non-optimal case (i.e. the length and/or stride change from one
> > scatterlist element to another), we simply fall back to one LLI per
> > scatterlist element.
>
> Sounds fine to me. The optimization is a good one then...
>
> Another question though: do you expect the stride to be linear for your
> cases? If so, have you actually thought about using the interleaved API
> for these cases?
What do you mean by linear? I guess, at least from an API point of
view, we should expect any stride and length. The scatter-gather
variants of the dmaengine API were already using scatterlists, and the
interleaved template doesn't really fit our use case here, so I chose
a scatterlist, mostly for consistency.
--
Maxime Ripard, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com
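For reference, the reason the interleaved template only fits a constant
("linear") stride is visible in its definition: it describes numf
repetitions of one fixed frame. Abridged from include/linux/dmaengine.h:

struct data_chunk {
        size_t size;    /* bytes to read/write before the gap */
        size_t icg;     /* gap to the next chunk */
};

struct dma_interleaved_template {
        dma_addr_t src_start;
        dma_addr_t dst_start;
        enum dma_transfer_direction dir;
        bool src_inc;
        bool dst_inc;
        bool src_sgl;
        bool dst_sgl;
        size_t numf;            /* number of frames */
        size_t frame_size;      /* chunks per frame */
        struct data_chunk sgl[0];
};

A scatterlist, by contrast, carries an independent (address, length)
pair per element, so it can express the arbitrary strides and lengths
discussed here.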
On Mon, Jul 06, 2015 at 12:19:22PM +0200, Maxime Ripard wrote:
> Hi Vinod,
>
> Here is a patch series that adds a new dmaengine operation for
> scatter-gathered memset.
>
> Indeed, doing a memset over a discontiguous buffer is quite
> inefficient at the moment, since you have to create and submit each
> chunk separately, which might result in a huge list of transfers,
> even though some controllers can handle such a transfer just fine.
Applied, thanks
--
~Vinod