2010-02-10 20:37:38

by Abhijeet Dharmapurikar

Subject: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

From: Abhijeet Dharmapurikar <[email protected]>

Please refer to the post here
http://lkml.org/lkml/2010/1/4/347

These changes introduce barrierless dma_map_area and dma_unmap_area and use
them to map the buffers in the scatterlist. For the last buffer, the normal
dma_map_area (i.e. with barriers) is called, effectively executing the barrier
once at the end of the whole operation.

Note that the barrierless operations are implemented for only a few ARM
architectures; I will implement them for the others once these changes are
okayed by the community.
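
As a rough sketch of the resulting flow (error handling omitted;
dma_map_page_nobarrier() is the helper added in patch 2/2), dma_map_sg()
effectively becomes:

    int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
                   enum dma_data_direction dir)
    {
            struct scatterlist *s;
            int i;

            for_each_sg(sg, s, nents, i) {
                    if (i == nents - 1)
                            /* last buffer: normal (barriered) map, so the
                             * barrier runs exactly once per list */
                            s->dma_address = dma_map_page(dev, sg_page(s),
                                            s->offset, s->length, dir);
                    else
                            /* all other buffers: cache maintenance only,
                             * no barrier */
                            s->dma_address = dma_map_page_nobarrier(dev,
                                            sg_page(s), s->offset,
                                            s->length, dir);
            }

            return nents;
    }

so a list of N buffers pays for one barrier instead of N.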

Abhijeet Dharmapurikar (2):
dma: define barrierless versions of map and unmap area
dma: fix scatter-gather api to use barrierless map/unmap functions

arch/arm/include/asm/cacheflush.h | 9 +++
arch/arm/include/asm/dma-mapping.h | 82 +++++++++++++++++++++
arch/arm/mm/cache-v3.S | 6 ++
arch/arm/mm/cache-v4.S | 6 ++
arch/arm/mm/cache-v4wb.S | 94 +++++++++++++++++-------
arch/arm/mm/cache-v4wt.S | 6 ++
arch/arm/mm/cache-v6.S | 139 +++++++++++++++++++++++++----------
arch/arm/mm/cache-v7.S | 120 +++++++++++++++++++++++--------
arch/arm/mm/dma-mapping.c | 55 +++++++++++++--
9 files changed, 414 insertions(+), 103 deletions(-)


2010-02-10 20:37:40

by Abhijeet Dharmapurikar

Subject: [PATCH 1/2] dma: define barrierless versions of map and unmap area

From: Abhijeet Dharmapurikar <[email protected]>

Barrierless versions of dma_map_area and dma_unmap_area will be used in
the scatter-gather mapping and unmapping functions.

Signed-off-by: Abhijeet Dharmapurikar <[email protected]>
---
arch/arm/include/asm/cacheflush.h | 9 +++
arch/arm/mm/cache-v3.S | 6 ++
arch/arm/mm/cache-v4.S | 6 ++
arch/arm/mm/cache-v4wb.S | 94 +++++++++++++++++--------
arch/arm/mm/cache-v4wt.S | 6 ++
arch/arm/mm/cache-v6.S | 139 ++++++++++++++++++++++++++-----------
arch/arm/mm/cache-v7.S | 120 ++++++++++++++++++++++++--------
7 files changed, 283 insertions(+), 97 deletions(-)

diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 8148a00..e91e014 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -215,6 +215,9 @@ struct cpu_cache_fns {
void (*dma_map_area)(const void *, size_t, int);
void (*dma_unmap_area)(const void *, size_t, int);

+ void (*dma_map_area_nobarrier)(const void *, size_t, int);
+ void (*dma_unmap_area_nobarrier)(const void *, size_t, int);
+
void (*dma_flush_range)(const void *, const void *);
};

@@ -246,6 +249,8 @@ extern struct cpu_cache_fns cpu_cache;
*/
#define dmac_map_area cpu_cache.dma_map_area
#define dmac_unmap_area cpu_cache.dma_unmap_area
+#define dmac_map_area_nobarrier cpu_cache.dma_map_area_nobarrier
+#define dmac_unmap_area_nobarrier cpu_cache.dma_unmap_area_nobarrier
#define dmac_flush_range cpu_cache.dma_flush_range

#else
@@ -272,10 +277,14 @@ extern void __cpuc_flush_dcache_area(void *, size_t);
*/
#define dmac_map_area __glue(_CACHE,_dma_map_area)
#define dmac_unmap_area __glue(_CACHE,_dma_unmap_area)
+#define dmac_map_area_nobarrier __glue(_CACHE,_dma_map_area_nobarrier)
+#define dmac_unmap_area_nobarrier __glue(_CACHE,_dma_unmap_area_nobarrier)
#define dmac_flush_range __glue(_CACHE,_dma_flush_range)

extern void dmac_map_area(const void *, size_t, int);
extern void dmac_unmap_area(const void *, size_t, int);
+extern void dmac_map_area_nobarrier(const void *, size_t, int);
+extern void dmac_unmap_area_nobarrier(const void *, size_t, int);
extern void dmac_flush_range(const void *, const void *);

#endif
diff --git a/arch/arm/mm/cache-v3.S b/arch/arm/mm/cache-v3.S
index c2ff3c5..5ba5b9b 100644
--- a/arch/arm/mm/cache-v3.S
+++ b/arch/arm/mm/cache-v3.S
@@ -103,6 +103,7 @@ ENTRY(v3_dma_flush_range)
* - dir - DMA direction
*/
ENTRY(v3_dma_unmap_area)
+ENTRY(v3_dma_unmap_area_nobarrier)
teq r2, #DMA_TO_DEVICE
bne v3_dma_flush_range
/* FALLTHROUGH */
@@ -114,9 +115,12 @@ ENTRY(v3_dma_unmap_area)
* - dir - DMA direction
*/
ENTRY(v3_dma_map_area)
+ENTRY(v3_dma_map_area_nobarrier)
mov pc, lr
ENDPROC(v3_dma_unmap_area)
+ENDPROC(v3_dma_unmap_area_nobarrier)
ENDPROC(v3_dma_map_area)
+ENDPROC(v3_dma_map_area_nobarrier)

__INITDATA

@@ -130,5 +134,7 @@ ENTRY(v3_cache_fns)
.long v3_flush_kern_dcache_area
.long v3_dma_map_area
.long v3_dma_unmap_area
+ .long v3_dma_map_area_nobarrier
+ .long v3_dma_unmap_area_nobarrier
.long v3_dma_flush_range
.size v3_cache_fns, . - v3_cache_fns
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index 4810f7e..a914c5f 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -115,6 +115,7 @@ ENTRY(v4_dma_flush_range)
* - dir - DMA direction
*/
ENTRY(v4_dma_unmap_area)
+ENTRY(v4_dma_unmap_area_nobarrier)
teq r2, #DMA_TO_DEVICE
bne v4_dma_flush_range
/* FALLTHROUGH */
@@ -126,9 +127,12 @@ ENTRY(v4_dma_unmap_area)
* - dir - DMA direction
*/
ENTRY(v4_dma_map_area)
+ENTRY(v4_dma_map_area_nobarrier)
mov pc, lr
ENDPROC(v4_dma_unmap_area)
+ENDPROC(v4_dma_unmap_area_nobarrier)
ENDPROC(v4_dma_map_area)
+ENDPROC(v4_dma_map_area_nobarrier)

__INITDATA

@@ -142,5 +146,7 @@ ENTRY(v4_cache_fns)
.long v4_flush_kern_dcache_area
.long v4_dma_map_area
.long v4_dma_unmap_area
+ .long v4_dma_map_area_nobarrier
+ .long v4_dma_unmap_area_nobarrier
.long v4_dma_flush_range
.size v4_cache_fns, . - v4_cache_fns
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index df8368a..dff8248 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -113,6 +113,37 @@ ENTRY(v4wb_flush_user_cache_range)
mcrne p15, 0, ip, c7, c10, 4 @ drain write buffer
mov pc, lr

+ .macro v4wb_dma_flush_range_macro, start, end
+ bic \start, \start, #CACHE_DLINESIZE - 1
+1: mcr p15, 0, \start, c7, c10, 1 @ clean D entry
+ mcr p15, 0, \start, c7, c6, 1 @ invalidate D entry
+ add \start, \start, #CACHE_DLINESIZE
+ cmp \start, \end
+ blo 1b
+ mov ip, #0
+ mcr p15, 0, ip, c7, c5, 0 @ invalidate I cache
+ .endm
+
+ .macro v4wb_dma_inv_range, start, end
+ tst \start, #CACHE_DLINESIZE - 1
+ bic \start, \start, #CACHE_DLINESIZE - 1
+ mcrne p15, 0, \start, c7, c10, 1 @ clean D entry
+ tst \end, #CACHE_DLINESIZE - 1
+ mcrne p15, 0, \end, c7, c10, 1 @ clean D entry
+1: mcr p15, 0, \start, c7, c6, 1 @ invalidate D entry
+ add \start, \start, #CACHE_DLINESIZE
+ cmp \start, \end
+ blo 1b
+ .endm
+
+ .macro v4wb_dma_clean_range, start, end
+ bic \start, \start, #CACHE_DLINESIZE - 1
+1: mcr p15, 0, \start, c7, c10, 1 @ clean D entry
+ add \start, \start, #CACHE_DLINESIZE
+ cmp \start, \end
+ blo 1b
+ .endm
+
/*
* flush_kern_dcache_area(void *addr, size_t size)
*
@@ -150,20 +181,12 @@ ENTRY(v4wb_coherent_kern_range)
* - end - virtual end address
*/
ENTRY(v4wb_coherent_user_range)
- bic r0, r0, #CACHE_DLINESIZE - 1
-1: mcr p15, 0, r0, c7, c10, 1 @ clean D entry
- mcr p15, 0, r0, c7, c6, 1 @ invalidate D entry
- add r0, r0, #CACHE_DLINESIZE
- cmp r0, r1
- blo 1b
- mov ip, #0
- mcr p15, 0, ip, c7, c5, 0 @ invalidate I cache
+ v4wb_dma_flush_range_macro r0, r1
mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

-
/*
- * dma_inv_range(start, end)
+ * dma_inv_range_barrier(start, end)
*
* Invalidate (discard) the specified virtual address range.
* May not write back any entries. If 'start' or 'end'
@@ -173,16 +196,8 @@ ENTRY(v4wb_coherent_user_range)
* - start - virtual start address
* - end - virtual end address
*/
-v4wb_dma_inv_range:
- tst r0, #CACHE_DLINESIZE - 1
- bic r0, r0, #CACHE_DLINESIZE - 1
- mcrne p15, 0, r0, c7, c10, 1 @ clean D entry
- tst r1, #CACHE_DLINESIZE - 1
- mcrne p15, 0, r1, c7, c10, 1 @ clean D entry
-1: mcr p15, 0, r0, c7, c6, 1 @ invalidate D entry
- add r0, r0, #CACHE_DLINESIZE
- cmp r0, r1
- blo 1b
+v4wb_dma_inv_range_barrier:
+ v4wb_dma_inv_range r0, r1
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

@@ -194,12 +209,8 @@ v4wb_dma_inv_range:
* - start - virtual start address
* - end - virtual end address
*/
-v4wb_dma_clean_range:
- bic r0, r0, #CACHE_DLINESIZE - 1
-1: mcr p15, 0, r0, c7, c10, 1 @ clean D entry
- add r0, r0, #CACHE_DLINESIZE
- cmp r0, r1
- blo 1b
+v4wb_dma_clean_range_barrier:
+ v4wb_dma_clean_range r0, r1
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

@@ -216,17 +227,32 @@ v4wb_dma_clean_range:
.globl v4wb_dma_flush_range
.set v4wb_dma_flush_range, v4wb_coherent_kern_range

+
+v4wb_dma_inv_range_nobarrier:
+ v4wb_dma_inv_range r0, r1
+ mov pc, lr
+
+v4wb_dma_clean_range_nobarrier:
+ v4wb_dma_clean_range r0, r1
+ mov pc, lr
+
+v4wb_dma_flush_range_nobarrier:
+ v4wb_dma_flush_range_macro r0, r1
+ mov pc, lr
+
+
/*
* dma_map_area(start, size, dir)
* - start - kernel virtual start address
* - size - size of region
* - dir - DMA direction
*/
+
ENTRY(v4wb_dma_map_area)
add r1, r1, r0
cmp r2, #DMA_TO_DEVICE
- beq v4wb_dma_clean_range
- bcs v4wb_dma_inv_range
+ beq v4wb_dma_clean_range_barrier
+ bcs v4wb_dma_inv_range_barrier
b v4wb_dma_flush_range
ENDPROC(v4wb_dma_map_area)

@@ -237,8 +263,18 @@ ENDPROC(v4wb_dma_map_area)
* - dir - DMA direction
*/
ENTRY(v4wb_dma_unmap_area)
+ENTRY(v4wb_dma_unmap_area_nobarrier)
mov pc, lr
ENDPROC(v4wb_dma_unmap_area)
+ENDPROC(v4wb_dma_unmap_area_nobarrier)
+
+ENTRY(v4wb_dma_map_area_nobarrier)
+ add r1, r1, r0
+ cmp r2, #DMA_TO_DEVICE
+ beq v4wb_dma_clean_range_nobarrier
+ bcs v4wb_dma_inv_range_nobarrier
+ b v4wb_dma_flush_range_nobarrier
+ENDPROC(v4wb_dma_map_area_nobarrier)

__INITDATA

@@ -252,5 +288,7 @@ ENTRY(v4wb_cache_fns)
.long v4wb_flush_kern_dcache_area
.long v4wb_dma_map_area
.long v4wb_dma_unmap_area
+ .long v4wb_dma_map_area_nobarrier
+ .long v4wb_dma_unmap_area_nobarrier
.long v4wb_dma_flush_range
.size v4wb_cache_fns, . - v4wb_cache_fns
diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S
index 45c7031..df587b6 100644
--- a/arch/arm/mm/cache-v4wt.S
+++ b/arch/arm/mm/cache-v4wt.S
@@ -168,6 +168,7 @@ v4wt_dma_inv_range:
* - dir - DMA direction
*/
ENTRY(v4wt_dma_unmap_area)
+ENTRY(v4wt_dma_unmap_area_nobarrier)
add r1, r1, r0
teq r2, #DMA_TO_DEVICE
bne v4wt_dma_inv_range
@@ -180,9 +181,12 @@ ENTRY(v4wt_dma_unmap_area)
* - dir - DMA direction
*/
ENTRY(v4wt_dma_map_area)
+ENTRY(v4wt_dma_map_area_nobarrier)
mov pc, lr
ENDPROC(v4wt_dma_unmap_area)
+ENDPROC(v4wt_dma_unmap_area_nobarrier)
ENDPROC(v4wt_dma_map_area)
+ENDPROC(v4wt_dma_map_area_nobarrier)

__INITDATA

@@ -196,5 +200,7 @@ ENTRY(v4wt_cache_fns)
.long v4wt_flush_kern_dcache_area
.long v4wt_dma_map_area
.long v4wt_dma_unmap_area
+ .long v4wt_dma_map_area_nobarrier
+ .long v4wt_dma_unmap_area_nobarrier
.long v4wt_dma_flush_range
.size v4wt_cache_fns, . - v4wt_cache_fns
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 9d89c67..0e3f9b9 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -185,65 +185,96 @@ ENTRY(v6_flush_kern_dcache_area)
mov pc, lr


-/*
- * v6_dma_inv_range(start,end)
- *
- * Invalidate the data cache within the specified region; we will
- * be performing a DMA operation in this region and we want to
- * purge old data in the cache.
- *
- * - start - virtual start address of region
- * - end - virtual end address of region
- */
-v6_dma_inv_range:
- tst r0, #D_CACHE_LINE_SIZE - 1
- bic r0, r0, #D_CACHE_LINE_SIZE - 1
+ .macro v6_dma_inv_range, start,end
+ tst \start, #D_CACHE_LINE_SIZE - 1
+ bic \start, \start, #D_CACHE_LINE_SIZE - 1
#ifdef HARVARD_CACHE
- mcrne p15, 0, r0, c7, c10, 1 @ clean D line
+ mcrne p15, 0, \start, c7, c10, 1 @ clean D line
#else
- mcrne p15, 0, r0, c7, c11, 1 @ clean unified line
+ mcrne p15, 0, \start, c7, c11, 1 @ clean unified line
#endif
- tst r1, #D_CACHE_LINE_SIZE - 1
- bic r1, r1, #D_CACHE_LINE_SIZE - 1
+ tst \end, #D_CACHE_LINE_SIZE - 1
+ bic \end, \end, #D_CACHE_LINE_SIZE - 1
#ifdef HARVARD_CACHE
- mcrne p15, 0, r1, c7, c14, 1 @ clean & invalidate D line
+ mcrne p15, 0, \end, c7, c14, 1 @ clean & invalidate D line
#else
- mcrne p15, 0, r1, c7, c15, 1 @ clean & invalidate unified line
+ mcrne p15, 0, \end, c7, c15, 1 @ clean & invalidate unified line
#endif
1:
#ifdef HARVARD_CACHE
- mcr p15, 0, r0, c7, c6, 1 @ invalidate D line
+ mcr p15, 0, \start, c7, c6, 1 @ invalidate D line
#else
- mcr p15, 0, r0, c7, c7, 1 @ invalidate unified line
+ mcr p15, 0, \start, c7, c7, 1 @ invalidate unified line
#endif
- add r0, r0, #D_CACHE_LINE_SIZE
- cmp r0, r1
+ add \start, \start, #D_CACHE_LINE_SIZE
+ cmp \start, \end
blo 1b
- mov r0, #0
+ mov \start, #0
+ .endm
+
+ .macro v6_dma_clean_range, start, end
+ bic \start, \start, #D_CACHE_LINE_SIZE - 1
+1:
+#ifdef HARVARD_CACHE
+ mcr p15, 0, \start, c7, c10, 1 @ clean D line
+#else
+ mcr p15, 0, \start, c7, c11, 1 @ clean unified line
+#endif
+ add \start, \start, #D_CACHE_LINE_SIZE
+ cmp \start, \end
+ blo 1b
+ mov \start, #0
+ .endm
+
+/*
+ * v6_dma_inv_range_barrier(start,end)
+ *
+ * Invalidate the data cache within the specified region; we will
+ * be performing a DMA operation in this region and we want to
+ * purge old data in the cache.
+ *
+ * - start - virtual start address of region
+ * - end - virtual end address of region
+ */
+v6_dma_inv_range_barrier:
+ v6_dma_inv_range r0, r1
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
- * v6_dma_clean_range(start,end)
+ * v6_dma_clean_range_barrier(start,end)
* - start - virtual start address of region
* - end - virtual end address of region
*/
-v6_dma_clean_range:
- bic r0, r0, #D_CACHE_LINE_SIZE - 1
-1:
-#ifdef HARVARD_CACHE
- mcr p15, 0, r0, c7, c10, 1 @ clean D line
-#else
- mcr p15, 0, r0, c7, c11, 1 @ clean unified line
-#endif
- add r0, r0, #D_CACHE_LINE_SIZE
- cmp r0, r1
- blo 1b
- mov r0, #0
+v6_dma_clean_range_barrier:
+ v6_dma_clean_range r0, r1
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
+ * v6_dma_inv_range_nobarrier(start,end)
+ *
+ * Invalidate the data cache within the specified region; we will
+ * be performing a DMA operation in this region and we want to
+ * purge old data in the cache.
+ *
+ * - start - virtual start address of region
+ * - end - virtual end address of region
+ */
+v6_dma_inv_range_nobarrier:
+ v6_dma_inv_range r0, r1
+ mov pc, lr
+
+/*
+ * v6_dma_clean_range_nobarrier(start,end)
+ * - start - virtual start address of region
+ * - end - virtual end address of region
+ */
+v6_dma_clean_range_nobarrier:
+ v6_dma_clean_range r0, r1
+ mov pc, lr
+
+/*
* v6_dma_flush_range(start,end)
* - start - virtual start address of region
* - end - virtual end address of region
@@ -272,8 +303,8 @@ ENTRY(v6_dma_flush_range)
ENTRY(v6_dma_map_area)
add r1, r1, r0
teq r2, #DMA_FROM_DEVICE
- beq v6_dma_inv_range
- b v6_dma_clean_range
+ beq v6_dma_inv_range_barrier
+ b v6_dma_clean_range_barrier
ENDPROC(v6_dma_map_area)

/*
@@ -285,10 +316,36 @@ ENDPROC(v6_dma_map_area)
ENTRY(v6_dma_unmap_area)
add r1, r1, r0
teq r2, #DMA_TO_DEVICE
- bne v6_dma_inv_range
+ bne v6_dma_inv_range_barrier
mov pc, lr
ENDPROC(v6_dma_unmap_area)

+/*
+ * dma_map_area_nobarrier(start, size, dir)
+ * - start - kernel virtual start address
+ * - size - size of region
+ * - dir - DMA direction
+ */
+ENTRY(v6_dma_map_area_nobarrier)
+ add r1, r1, r0
+ teq r2, #DMA_FROM_DEVICE
+ beq v6_dma_inv_range_nobarrier
+ b v6_dma_clean_range_nobarrier
+ENDPROC(v6_dma_map_area_nobarrier)
+
+/*
+ * dma_unmap_area_nobarrier(start, size, dir)
+ * - start - kernel virtual start address
+ * - size - size of region
+ * - dir - DMA direction
+ */
+ENTRY(v6_dma_unmap_area_nobarrier)
+ add r1, r1, r0
+ teq r2, #DMA_TO_DEVICE
+ bne v6_dma_inv_range_nobarrier
+ mov pc, lr
+ENDPROC(v6_dma_unmap_area_nobarrier)
+
__INITDATA

.type v6_cache_fns, #object
@@ -301,5 +358,7 @@ ENTRY(v6_cache_fns)
.long v6_flush_kern_dcache_area
.long v6_dma_map_area
.long v6_dma_unmap_area
+ .long v6_dma_map_area_nobarrier
+ .long v6_dma_unmap_area_nobarrier
.long v6_dma_flush_range
.size v6_cache_fns, . - v6_cache_fns
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index bcd64f2..d748137 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -206,8 +206,33 @@ ENTRY(v7_flush_kern_dcache_area)
mov pc, lr
ENDPROC(v7_flush_kern_dcache_area)

+ .macro v7_dma_inv_range, start, end, line_size, tmp
+ sub \tmp, \line_size, #1
+ tst \start, \line_size
+ bic \start, \start, \tmp
+ mcrne p15, 0, \start, c7, c14, 1 @ clean & invalidate D / U line
+
+ tst \end, \tmp
+ bic \end, \end, \tmp
+ mcrne p15, 0, \end, c7, c14, 1 @ clean & invalidate D / U line
+1:
+ mcr p15, 0, \start, c7, c6, 1 @ invalidate D / U line
+ add \start, \start, \line_size
+ cmp \start, \end
+ blo 1b
+ .endm
+
+ .macro v7_dma_clean_range, start, end, line_size, tmp
+ sub \tmp, \line_size, #1
+ bic \start, \start, \tmp
+1:
+ mcr p15, 0, \start, c7, c10, 1 @ clean D / U line
+ add \start, \start, \line_size
+ cmp \start, \end
+ blo 1b
+ .endm
/*
- * v7_dma_inv_range(start,end)
+ * v7_dma_inv_range_barrier(start,end)
*
* Invalidate the data cache within the specified region; we will
* be performing a DMA operation in this region and we want to
@@ -216,42 +241,51 @@ ENDPROC(v7_flush_kern_dcache_area)
* - start - virtual start address of region
* - end - virtual end address of region
*/
-v7_dma_inv_range:
+v7_dma_inv_range_barrier:
dcache_line_size r2, r3
- sub r3, r2, #1
- tst r0, r3
- bic r0, r0, r3
- mcrne p15, 0, r0, c7, c14, 1 @ clean & invalidate D / U line
-
- tst r1, r3
- bic r1, r1, r3
- mcrne p15, 0, r1, c7, c14, 1 @ clean & invalidate D / U line
-1:
- mcr p15, 0, r0, c7, c6, 1 @ invalidate D / U line
- add r0, r0, r2
- cmp r0, r1
- blo 1b
+ v7_dma_inv_range r0, r1, r2, r3
dsb
mov pc, lr
-ENDPROC(v7_dma_inv_range)
+ENDPROC(v7_dma_inv_range_barrier)

/*
- * v7_dma_clean_range(start,end)
+ * v7_dma_clean_range_barrier(start,end)
* - start - virtual start address of region
* - end - virtual end address of region
*/
-v7_dma_clean_range:
+v7_dma_clean_range_barrier:
dcache_line_size r2, r3
- sub r3, r2, #1
- bic r0, r0, r3
-1:
- mcr p15, 0, r0, c7, c10, 1 @ clean D / U line
- add r0, r0, r2
- cmp r0, r1
- blo 1b
+ v7_dma_clean_range r0, r1, r2, r3
dsb
mov pc, lr
-ENDPROC(v7_dma_clean_range)
+ENDPROC(v7_dma_clean_range_barrier)
+
+/*
+ * v7_dma_inv_range_nobarrier(start,end)
+ *
+ * Invalidate the data cache within the specified region; we will
+ * be performing a DMA operation in this region and we want to
+ * purge old data in the cache.
+ *
+ * - start - virtual start address of region
+ * - end - virtual end address of region
+ */
+v7_dma_inv_range_nobarrier:
+ dcache_line_size r2, r3
+ v7_dma_inv_range r0, r1, r2, r3
+ mov pc, lr
+ENDPROC(v7_dma_inv_range_nobarrier)
+
+/*
+ * v7_dma_clean_range_nobarrier(start,end)
+ * - start - virtual start address of region
+ * - end - virtual end address of region
+ */
+v7_dma_clean_range_nobarrier:
+ dcache_line_size r2, r3
+ v7_dma_clean_range r0, r1, r2, r3
+ mov pc, lr
+ENDPROC(v7_dma_clean_range_nobarrier)

/*
* v7_dma_flush_range(start,end)
@@ -280,8 +314,8 @@ ENDPROC(v7_dma_flush_range)
ENTRY(v7_dma_map_area)
add r1, r1, r0
teq r2, #DMA_FROM_DEVICE
- beq v7_dma_inv_range
- b v7_dma_clean_range
+ beq v7_dma_inv_range_barrier
+ b v7_dma_clean_range_barrier
ENDPROC(v7_dma_map_area)

/*
@@ -293,10 +327,36 @@ ENDPROC(v7_dma_map_area)
ENTRY(v7_dma_unmap_area)
add r1, r1, r0
teq r2, #DMA_TO_DEVICE
- bne v7_dma_inv_range
+ bne v7_dma_inv_range_barrier
mov pc, lr
ENDPROC(v7_dma_unmap_area)

+/*
+ * dma_map_area_nobarrier(start, size, dir)
+ * - start - kernel virtual start address
+ * - size - size of region
+ * - dir - DMA direction
+ */
+ENTRY(v7_dma_map_area_nobarrier)
+ add r1, r1, r0
+ teq r2, #DMA_FROM_DEVICE
+ beq v7_dma_inv_range_nobarrier
+ b v7_dma_clean_range_nobarrier
+ENDPROC(v7_dma_map_area_nobarrier)
+
+/*
+ * dma_unmap_area_nobarrier(start, size, dir)
+ * - start - kernel virtual start address
+ * - size - size of region
+ * - dir - DMA direction
+ */
+ENTRY(v7_dma_unmap_area_nobarrier)
+ add r1, r1, r0
+ teq r2, #DMA_TO_DEVICE
+ bne v7_dma_inv_range_nobarrier
+ mov pc, lr
+ENDPROC(v7_dma_unmap_area_nobarrier)
+
__INITDATA

.type v7_cache_fns, #object
@@ -309,5 +369,7 @@ ENTRY(v7_cache_fns)
.long v7_flush_kern_dcache_area
.long v7_dma_map_area
.long v7_dma_unmap_area
+ .long v7_dma_map_area_nobarrier
+ .long v7_dma_unmap_area_nobarrier
.long v7_dma_flush_range
.size v7_cache_fns, . - v7_cache_fns
--
1.5.6.3

2010-02-10 20:37:56

by Abhijeet Dharmapurikar

Subject: [PATCH 2/2] dma: fix scatter-gather api to use barrierless map/unmap functions

From: Abhijeet Dharmapurikar <[email protected]>

dma_map/unmap_sg need to execute a barrier only after the last buffer has been
mapped/unmapped. This improves performance in situations where multiple
buffers need to be mapped for a single DMA operation.

Signed-off-by: Abhijeet Dharmapurikar <[email protected]>
---
arch/arm/include/asm/dma-mapping.h | 87 ++++++++++++++++++++++++++++++++++++
arch/arm/mm/dma-mapping.c | 59 +++++++++++++++++++++---
2 files changed, 139 insertions(+), 7 deletions(-)

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 256ee1c..06b528d 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -110,6 +110,26 @@ static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
___dma_page_dev_to_cpu(page, off, size, dir);
}

+static inline void __dma_page_cpu_to_dev_nobarrier(struct page *page,
+ unsigned long off, size_t size, enum dma_data_direction dir)
+{
+ extern void ___dma_page_cpu_to_dev_nobarrier(struct page *,
+ unsigned long, size_t, enum dma_data_direction);
+
+ if (!arch_is_coherent())
+ ___dma_page_cpu_to_dev_nobarrier(page, off, size, dir);
+}
+
+static inline void __dma_page_dev_to_cpu_nobarrier(struct page *page,
+ unsigned long off, size_t size, enum dma_data_direction dir)
+{
+ extern void ___dma_page_dev_to_cpu_nobarrier(struct page *,
+ unsigned long, size_t, enum dma_data_direction);
+
+ if (!arch_is_coherent())
+ ___dma_page_dev_to_cpu_nobarrier(page, off, size, dir);
+}
+
/*
* Return whether the given device DMA address mask can be supported
* properly. For example, if your device can only drive the low 24-bits
@@ -305,6 +325,23 @@ extern void dma_unmap_page(struct device *, dma_addr_t, size_t,
enum dma_data_direction);

/*
+ * For DMABOUNCE we keep the nobarrier versions the same as their
+ * barriered counterparts
+ */
+static inline dma_addr_t dma_map_page_nobarrier(struct device *dev,
+ struct page *page, unsigned long offset, size_t size,
+ enum dma_data_direction dir)
+{
+ return dma_map_page(dev, page, offset, size, dir);
+}
+
+static inline void dma_unmap_page_nobarrier(struct device *dev,
+ dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+ return dma_unmap_page(dev, handle, size, dir);
+}
+
+/*
* Private functions
*/
int dmabounce_sync_for_cpu(struct device *, dma_addr_t, unsigned long,
@@ -374,6 +411,34 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
}

/**
+ * dma_map_page_nobarrier - map a portion of a page for streaming DMA without a
+ * barrier
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @page: page that buffer resides in
+ * @offset: offset into page for start of buffer
+ * @size: size of buffer to map
+ * @dir: DMA transfer direction
+ *
+ * Once this call is followed by a barrier, any data held in the cache is
+ * guaranteed to have been appropriately discarded or written back.
+ *
+ * The device owns this memory once this call has completed and a barrier
+ * has been executed. The CPU can regain ownership by calling
+ * dma_unmap_page() or dma_unmap_page_nobarrier() followed by a
+ * barrier.
+ */
+static inline dma_addr_t dma_map_page_nobarrier(struct device *dev,
+ struct page *page, unsigned long offset, size_t size,
+ enum dma_data_direction dir)
+{
+ BUG_ON(!valid_dma_direction(dir));
+
+ __dma_page_cpu_to_dev_nobarrier(page, offset, size, dir);
+
+ return page_to_dma(dev, page) + offset;
+}
+
+/**
* dma_unmap_single - unmap a single buffer previously mapped
* @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
* @handle: DMA address of buffer
@@ -413,6 +478,28 @@ static inline void dma_unmap_page(struct device *dev, dma_addr_t handle,
__dma_page_dev_to_cpu(dma_to_page(dev, handle), handle & ~PAGE_MASK,
size, dir);
}
+
+/**
+ * dma_unmap_page_nobarrier - unmap a buffer previously mapped through dma_map_page()
+ * or dma_map_page_nobarrier() followed by a barrier
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @handle: DMA address of buffer
+ * @size: size of buffer (same as passed to dma_map_page)
+ * @dir: DMA transfer direction (same as passed to dma_map_page)
+ *
+ * Unmap a page streaming mode DMA translation. The handle and size
+ * must match what was provided in the previous dma_map_page() call.
+ * All other usages are undefined.
+ *
+ * After this call, followed by a barrier (dsb/dmb), reads by the CPU to the
+ * buffer are guaranteed to see whatever the device wrote there.
+ */
+static inline void dma_unmap_page_nobarrier(struct device *dev,
+ dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+ __dma_page_dev_to_cpu_nobarrier(dma_to_page(dev, handle),
+ handle & ~PAGE_MASK, size, dir);
+}
#endif /* CONFIG_DMABOUNCE */

/**
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 64daef2..23556ab 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -509,6 +509,37 @@ void ___dma_page_dev_to_cpu(struct page *page, unsigned long off,
}
EXPORT_SYMBOL(___dma_page_dev_to_cpu);

+
+void ___dma_page_cpu_to_dev_nobarrier(struct page *page, unsigned long off,
+ size_t size, enum dma_data_direction dir)
+{
+ unsigned long paddr;
+
+ dma_cache_maint_page(page, off, size, dir, dmac_map_area_nobarrier);
+
+ paddr = page_to_phys(page) + off;
+ if (dir == DMA_FROM_DEVICE) {
+ outer_inv_range(paddr, paddr + size);
+ } else {
+ outer_clean_range(paddr, paddr + size);
+ }
+ /* FIXME: non-speculating: flush on bidirectional mappings? */
+}
+EXPORT_SYMBOL(___dma_page_cpu_to_dev_nobarrier);
+
+void ___dma_page_dev_to_cpu_nobarrier(struct page *page, unsigned long off,
+ size_t size, enum dma_data_direction dir)
+{
+ unsigned long paddr = page_to_phys(page) + off;
+
+ /* FIXME: non-speculating: not required */
+ /* don't bother invalidating if DMA to device */
+ if (dir != DMA_TO_DEVICE)
+ outer_inv_range(paddr, paddr + size);
+
+ dma_cache_maint_page(page, off, size, dir, dmac_unmap_area_nobarrier);
+}
+EXPORT_SYMBOL(___dma_page_dev_to_cpu_nobarrier);
/**
* dma_map_sg - map a set of SG buffers for streaming mode DMA
* @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -531,17 +562,28 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
struct scatterlist *s;
int i, j;

- for_each_sg(sg, s, nents, i) {
- s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
- s->length, dir);
+ for_each_sg(sg, s, nents - 1, i) {
+ s->dma_address = dma_map_page_nobarrier(dev, sg_page(s),
+ s->offset, s->length, dir);
if (dma_mapping_error(dev, s->dma_address))
goto bad_mapping;
}
+
+ s = sg_next(s);
+ i++;
+ s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
+ s->length, dir);
+ if (dma_mapping_error(dev, s->dma_address))
+ goto bad_mapping;
+
return nents;

bad_mapping:
- for_each_sg(sg, s, i, j)
- dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+ for_each_sg(sg, s, i - 1, j)
+ dma_unmap_page_nobarrier(dev, sg_dma_address(s),
+ sg_dma_len(s), dir);
+ s = sg_next(s);
+ dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
return 0;
}
EXPORT_SYMBOL(dma_map_sg);
@@ -562,8 +604,11 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
struct scatterlist *s;
int i;

- for_each_sg(sg, s, nents, i)
- dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+ for_each_sg(sg, s, nents - 1, i)
+ dma_unmap_page_nobarrier(dev, sg_dma_address(s),
+ sg_dma_len(s), dir);
+ s = sg_next(s);
+ dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
}
EXPORT_SYMBOL(dma_unmap_sg);

--
1.5.6.3

2010-02-10 21:27:50

by Randy Dunlap

Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On 02/10/10 12:37, [email protected] wrote:
> From: Abhijeet Dharmapurikar <[email protected]>
>
> Please refer to the post here
> http://lkml.org/lkml/2010/1/4/347
>
> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> use them to map the buffers in the scatterlist. For the last buffer, call
> the normal dma_map_area(aka with barriers) effectively executing the barrier
> at the end of the operation.
>
> Note that the barrierless operations are implemented for few arm
> architectures only and I would implement for others once these are okayed by the
> community.

So when you add these interfaces for other architectures, you will also
update Documentation/DMA-API.txt, right??


> Abhijeet Dharmapurikar (2):
> dma: define barrierless versions of map and unmap area
> dma: fix scatter-gather api to use barrierless map/unmap functions
>
> arch/arm/include/asm/cacheflush.h | 9 +++
> arch/arm/include/asm/dma-mapping.h | 82 +++++++++++++++++++++
> arch/arm/mm/cache-v3.S | 6 ++
> arch/arm/mm/cache-v4.S | 6 ++
> arch/arm/mm/cache-v4wb.S | 94 +++++++++++++++++-------
> arch/arm/mm/cache-v4wt.S | 6 ++
> arch/arm/mm/cache-v6.S | 139 +++++++++++++++++++++++++----------
> arch/arm/mm/cache-v7.S | 120 +++++++++++++++++++++++--------
> arch/arm/mm/dma-mapping.c | 55 +++++++++++++--
> 9 files changed, 414 insertions(+), 103 deletions(-)


--
~Randy

2010-02-10 21:28:31

by Russell King - ARM Linux

Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On Wed, Feb 10, 2010 at 12:37:28PM -0800, [email protected] wrote:
> From: Abhijeet Dharmapurikar <[email protected]>
>
> Please refer to the post here
> http://lkml.org/lkml/2010/1/4/347
>
> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> use them to map the buffers in the scatterlist. For the last buffer, call
> the normal dma_map_area(aka with barriers) effectively executing the barrier
> at the end of the operation.

What if we make dma_map_area and dma_unmap_area both be barrier-less,
and instead have a separate dma_barrier method - eg, something like the
attached?

This might allow for better I-cache usage by not having to duplicate the
DMA cache coherence functions.
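
In the dma_*_sg paths the effect would be along these lines (rough sketch
only, ignoring dmabounce and error handling; __dma_barrier() is the helper
added by the attached patch, __dma_page_cpu_to_dev() already exists):

    int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
                   enum dma_data_direction dir)
    {
            struct scatterlist *s;
            int i;

            for_each_sg(sg, s, nents, i) {
                    /* per-buffer cache maintenance, no barrier */
                    __dma_page_cpu_to_dev(sg_page(s), s->offset,
                                          s->length, dir);
                    s->dma_address = page_to_dma(dev, sg_page(s)) + s->offset;
            }

            /* one barrier for the whole scatterlist */
            __dma_barrier(dir);

            return nents;
    }

The cache maintenance loops themselves stay single-copy, and the sg paths
still pay for only one barrier per call.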

PS, you haven't sorted out all the processor support files for your change.

arch/arm/include/asm/cacheflush.h | 4 ++++
arch/arm/include/asm/dma-mapping.h | 8 ++++++++
arch/arm/mm/cache-fa.S | 13 +++++++------
arch/arm/mm/cache-v3.S | 3 +++
arch/arm/mm/cache-v4.S | 3 +++
arch/arm/mm/cache-v4wb.S | 9 +++++++--
arch/arm/mm/cache-v4wt.S | 3 +++
arch/arm/mm/cache-v6.S | 13 +++++++------
arch/arm/mm/cache-v7.S | 9 ++++++---
arch/arm/mm/dma-mapping.c | 16 ++++++++++++++++
arch/arm/mm/proc-arm1020e.S | 10 +++++++---
arch/arm/mm/proc-arm1022.S | 10 +++++++---
arch/arm/mm/proc-arm1026.S | 10 +++++++---
arch/arm/mm/proc-arm920.S | 10 +++++++---
arch/arm/mm/proc-arm922.S | 10 +++++++---
arch/arm/mm/proc-arm925.S | 10 +++++++---
arch/arm/mm/proc-arm926.S | 10 +++++++---
arch/arm/mm/proc-arm940.S | 10 +++++++---
arch/arm/mm/proc-arm946.S | 10 +++++++---
arch/arm/mm/proc-feroceon.S | 13 ++++++++-----
arch/arm/mm/proc-mohawk.S | 10 +++++++---
arch/arm/mm/proc-xsc3.S | 10 +++++++---
arch/arm/mm/proc-xscale.S | 10 +++++++---
23 files changed, 156 insertions(+), 58 deletions(-)

diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index e290885..5928e78 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -200,6 +200,7 @@ struct cpu_cache_fns {

void (*dma_map_area)(const void *, size_t, int);
void (*dma_unmap_area)(const void *, size_t, int);
+ void (*dma_barrier)(void);

void (*dma_flush_range)(const void *, const void *);
};
@@ -232,6 +233,7 @@ extern struct cpu_cache_fns cpu_cache;
*/
#define dmac_map_area cpu_cache.dma_map_area
#define dmac_unmap_area cpu_cache.dma_unmap_area
+#define dmac_barrier cpu_cache.dma_barrier
#define dmac_flush_range cpu_cache.dma_flush_range

#else
@@ -258,10 +260,12 @@ extern void __cpuc_flush_dcache_area(void *, size_t);
*/
#define dmac_map_area __glue(_CACHE,_dma_map_area)
#define dmac_unmap_area __glue(_CACHE,_dma_unmap_area)
+#define dmac_barrier __glue(_CACHE,_dma_barrier)
#define dmac_flush_range __glue(_CACHE,_dma_flush_range)

extern void dmac_map_area(const void *, size_t, int);
extern void dmac_unmap_area(const void *, size_t, int);
+extern void dmac_barrier(void);
extern void dmac_flush_range(const void *, const void *);

#endif
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 256ee1c..4a0824c 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -110,6 +110,8 @@ static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
___dma_page_dev_to_cpu(page, off, size, dir);
}

+extern void __dma_barrier(enum dma_data_direction);
+
/*
* Return whether the given device DMA address mask can be supported
* properly. For example, if your device can only drive the low 24-bits
@@ -345,6 +347,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
BUG_ON(!valid_dma_direction(dir));

__dma_single_cpu_to_dev(cpu_addr, size, dir);
+ __dma_barrier(dir);

return virt_to_dma(dev, cpu_addr);
}
@@ -369,6 +372,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
BUG_ON(!valid_dma_direction(dir));

__dma_page_cpu_to_dev(page, offset, size, dir);
+ __dma_barrier(dir);

return page_to_dma(dev, page) + offset;
}
@@ -391,6 +395,7 @@ static inline void dma_unmap_single(struct device *dev, dma_addr_t handle,
size_t size, enum dma_data_direction dir)
{
__dma_single_dev_to_cpu(dma_to_virt(dev, handle), size, dir);
+ __dma_barrier(dir);
}

/**
@@ -412,6 +417,7 @@ static inline void dma_unmap_page(struct device *dev, dma_addr_t handle,
{
__dma_page_dev_to_cpu(dma_to_page(dev, handle), handle & ~PAGE_MASK,
size, dir);
+ __dma_barrier(dir);
}
#endif /* CONFIG_DMABOUNCE */

@@ -443,6 +449,7 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
return;

__dma_single_dev_to_cpu(dma_to_virt(dev, handle) + offset, size, dir);
+ __dma_barrier(dir);
}

static inline void dma_sync_single_range_for_device(struct device *dev,
@@ -455,6 +462,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
return;

__dma_single_cpu_to_dev(dma_to_virt(dev, handle) + offset, size, dir);
+ __dma_barrier(dir);
}

static inline void dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
index 7148e53..cdcfae2 100644
--- a/arch/arm/mm/cache-fa.S
+++ b/arch/arm/mm/cache-fa.S
@@ -168,8 +168,6 @@ fa_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -186,8 +184,6 @@ fa_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -201,8 +197,6 @@ ENTRY(fa_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -229,6 +223,12 @@ ENTRY(fa_dma_unmap_area)
mov pc, lr
ENDPROC(fa_dma_unmap_area)

+ENTRY(fa_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+ENDPROC(fa_dma_barrier)
+
__INITDATA

.type fa_cache_fns, #object
@@ -241,5 +241,6 @@ ENTRY(fa_cache_fns)
.long fa_flush_kern_dcache_area
.long fa_dma_map_area
.long fa_dma_unmap_area
+ .long fa_dma_barrier
.long fa_dma_flush_range
.size fa_cache_fns, . - fa_cache_fns
diff --git a/arch/arm/mm/cache-v3.S b/arch/arm/mm/cache-v3.S
index c2ff3c5..df34458 100644
--- a/arch/arm/mm/cache-v3.S
+++ b/arch/arm/mm/cache-v3.S
@@ -114,9 +114,11 @@ ENTRY(v3_dma_unmap_area)
* - dir - DMA direction
*/
ENTRY(v3_dma_map_area)
+ENTRY(v3_dma_barrier)
mov pc, lr
ENDPROC(v3_dma_unmap_area)
ENDPROC(v3_dma_map_area)
+ENDPROC(v3_dma_barrier)

__INITDATA

@@ -130,5 +132,6 @@ ENTRY(v3_cache_fns)
.long v3_flush_kern_dcache_area
.long v3_dma_map_area
.long v3_dma_unmap_area
+ .long v3_dma_barrier
.long v3_dma_flush_range
.size v3_cache_fns, . - v3_cache_fns
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index 4810f7e..20260b1 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -126,9 +126,11 @@ ENTRY(v4_dma_unmap_area)
* - dir - DMA direction
*/
ENTRY(v4_dma_map_area)
+ENTRY(v4_dma_barrier)
mov pc, lr
ENDPROC(v4_dma_unmap_area)
ENDPROC(v4_dma_map_area)
+ENDPROC(v4_dma_barrier)

__INITDATA

@@ -142,5 +144,6 @@ ENTRY(v4_cache_fns)
.long v4_flush_kern_dcache_area
.long v4_dma_map_area
.long v4_dma_unmap_area
+ .long v4_dma_barrier
.long v4_dma_flush_range
.size v4_cache_fns, . - v4_cache_fns
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index df8368a..9c9c875 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -183,7 +183,6 @@ v4wb_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -200,7 +199,6 @@ v4wb_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -240,6 +238,12 @@ ENTRY(v4wb_dma_unmap_area)
mov pc, lr
ENDPROC(v4wb_dma_unmap_area)

+ENTRY(v4wb_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+ENDPROC(v4wb_dma_barrier)
+
__INITDATA

.type v4wb_cache_fns, #object
@@ -252,5 +256,6 @@ ENTRY(v4wb_cache_fns)
.long v4wb_flush_kern_dcache_area
.long v4wb_dma_map_area
.long v4wb_dma_unmap_area
+ .long v4wb_dma_barrier
.long v4wb_dma_flush_range
.size v4wb_cache_fns, . - v4wb_cache_fns
diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S
index 45c7031..223eea4 100644
--- a/arch/arm/mm/cache-v4wt.S
+++ b/arch/arm/mm/cache-v4wt.S
@@ -180,9 +180,11 @@ ENTRY(v4wt_dma_unmap_area)
* - dir - DMA direction
*/
ENTRY(v4wt_dma_map_area)
+ENTRY(v4wt_dma_barrier)
mov pc, lr
ENDPROC(v4wt_dma_unmap_area)
ENDPROC(v4wt_dma_map_area)
+ENDPROC(v4wt_dma_barrier)

__INITDATA

@@ -196,5 +198,6 @@ ENTRY(v4wt_cache_fns)
.long v4wt_flush_kern_dcache_area
.long v4wt_dma_map_area
.long v4wt_dma_unmap_area
+ .long v4wt_dma_barrier
.long v4wt_dma_flush_range
.size v4wt_cache_fns, . - v4wt_cache_fns
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 9d89c67..b294854 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -219,8 +219,6 @@ v6_dma_inv_range:
add r0, r0, #D_CACHE_LINE_SIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -239,8 +237,6 @@ v6_dma_clean_range:
add r0, r0, #D_CACHE_LINE_SIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -259,8 +255,6 @@ ENTRY(v6_dma_flush_range)
add r0, r0, #D_CACHE_LINE_SIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -289,6 +283,12 @@ ENTRY(v6_dma_unmap_area)
mov pc, lr
ENDPROC(v6_dma_unmap_area)

+ENTRY(v6_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+ENDPROC(v6_dma_barrier)
+
__INITDATA

.type v6_cache_fns, #object
@@ -301,5 +301,6 @@ ENTRY(v6_cache_fns)
.long v6_flush_kern_dcache_area
.long v6_dma_map_area
.long v6_dma_unmap_area
+ .long v6_dma_barrier
.long v6_dma_flush_range
.size v6_cache_fns, . - v6_cache_fns
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index bcd64f2..d89d55a 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -231,7 +231,6 @@ v7_dma_inv_range:
add r0, r0, r2
cmp r0, r1
blo 1b
- dsb
mov pc, lr
ENDPROC(v7_dma_inv_range)

@@ -249,7 +248,6 @@ v7_dma_clean_range:
add r0, r0, r2
cmp r0, r1
blo 1b
- dsb
mov pc, lr
ENDPROC(v7_dma_clean_range)

@@ -267,7 +265,6 @@ ENTRY(v7_dma_flush_range)
add r0, r0, r2
cmp r0, r1
blo 1b
- dsb
mov pc, lr
ENDPROC(v7_dma_flush_range)

@@ -297,6 +294,11 @@ ENTRY(v7_dma_unmap_area)
mov pc, lr
ENDPROC(v7_dma_unmap_area)

+ENTRY(v7_dma_barrier)
+ dsb
+ mov pc, lr
+ENDPROC(v7_dma_barrier)
+
__INITDATA

.type v7_cache_fns, #object
@@ -309,5 +311,6 @@ ENTRY(v7_cache_fns)
.long v7_flush_kern_dcache_area
.long v7_dma_map_area
.long v7_dma_unmap_area
+ .long v7_dma_barrier
.long v7_dma_flush_range
.size v7_cache_fns, . - v7_cache_fns
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 64daef2..debe7cb 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -108,6 +108,7 @@ static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gf
memset(ptr, 0, size);
dmac_flush_range(ptr, ptr + size);
outer_flush_range(__pa(ptr), __pa(ptr) + size);
+ dmac_barrier();

return page;
}
@@ -509,6 +510,12 @@ void ___dma_page_dev_to_cpu(struct page *page, unsigned long off,
}
EXPORT_SYMBOL(___dma_page_dev_to_cpu);

+void __dma_barrier(enum dma_data_direction dir)
+{
+ dmac_barrier();
+}
+EXPORT_SYMBOL(__dma_barrier);
+
/**
* dma_map_sg - map a set of SG buffers for streaming mode DMA
* @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -537,6 +544,9 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
if (dma_mapping_error(dev, s->dma_address))
goto bad_mapping;
}
+
+ __dma_barrier(dir);
+
return nents;

bad_mapping:
@@ -564,6 +574,8 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,

for_each_sg(sg, s, nents, i)
dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+
+ __dma_barrier(dir);
}
EXPORT_SYMBOL(dma_unmap_sg);

@@ -588,6 +600,8 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
__dma_page_dev_to_cpu(sg_page(s), s->offset,
s->length, dir);
}
+
+ __dma_barrier(dir);
}
EXPORT_SYMBOL(dma_sync_sg_for_cpu);

@@ -612,5 +626,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
__dma_page_cpu_to_dev(sg_page(s), s->offset,
s->length, dir);
}
+
+ __dma_barrier(dir);
}
EXPORT_SYMBOL(dma_sync_sg_for_device);
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index d278298..fea33c9 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -271,7 +271,6 @@ arm1020e_dma_inv_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -293,7 +292,6 @@ arm1020e_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -313,7 +311,6 @@ ENTRY(arm1020e_dma_flush_range)
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -340,6 +337,12 @@ ENTRY(arm1020e_dma_unmap_area)
mov pc, lr
ENDPROC(arm1020e_dma_unmap_area)

+ENTRY(arm1020e_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm1020e_dma_barrier)
+
ENTRY(arm1020e_cache_fns)
.long arm1020e_flush_kern_cache_all
.long arm1020e_flush_user_cache_all
@@ -349,6 +352,7 @@ ENTRY(arm1020e_cache_fns)
.long arm1020e_flush_kern_dcache_area
.long arm1020e_dma_map_area
.long arm1020e_dma_unmap_area
+ .long arm1020e_dma_barrier
.long arm1020e_dma_flush_range

.align 5
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index ce13e4a..ba1a7df 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -260,7 +260,6 @@ arm1022_dma_inv_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -282,7 +281,6 @@ arm1022_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -302,7 +300,6 @@ ENTRY(arm1022_dma_flush_range)
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -329,6 +326,12 @@ ENTRY(arm1022_dma_unmap_area)
mov pc, lr
ENDPROC(arm1022_dma_unmap_area)

+ENTRY(arm1022_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm1022_dma_barrier)
+
ENTRY(arm1022_cache_fns)
.long arm1022_flush_kern_cache_all
.long arm1022_flush_user_cache_all
@@ -338,6 +341,7 @@ ENTRY(arm1022_cache_fns)
.long arm1022_flush_kern_dcache_area
.long arm1022_dma_map_area
.long arm1022_dma_unmap_area
+ .long arm1022_dma_barrier
.long arm1022_dma_flush_range

.align 5
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index 636672a..de648f1 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -254,7 +254,6 @@ arm1026_dma_inv_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -276,7 +275,6 @@ arm1026_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -296,7 +294,6 @@ ENTRY(arm1026_dma_flush_range)
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -323,6 +320,12 @@ ENTRY(arm1026_dma_unmap_area)
mov pc, lr
ENDPROC(arm1026_dma_unmap_area)

+ENTRY(arm1026_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm1026_dma_barrier)
+
ENTRY(arm1026_cache_fns)
.long arm1026_flush_kern_cache_all
.long arm1026_flush_user_cache_all
@@ -332,6 +335,7 @@ ENTRY(arm1026_cache_fns)
.long arm1026_flush_kern_dcache_area
.long arm1026_dma_map_area
.long arm1026_dma_unmap_area
+ .long arm1026_dma_barrier
.long arm1026_dma_flush_range

.align 5
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index 8be8199..ec74093 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -249,7 +249,6 @@ arm920_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -268,7 +267,6 @@ arm920_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -285,7 +283,6 @@ ENTRY(arm920_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -312,6 +309,12 @@ ENTRY(arm920_dma_unmap_area)
mov pc, lr
ENDPROC(arm920_dma_unmap_area)

+ENTRY(arm920_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm920_dma_barrier)
+
ENTRY(arm920_cache_fns)
.long arm920_flush_kern_cache_all
.long arm920_flush_user_cache_all
@@ -321,6 +324,7 @@ ENTRY(arm920_cache_fns)
.long arm920_flush_kern_dcache_area
.long arm920_dma_map_area
.long arm920_dma_unmap_area
+ .long arm920_dma_barrier
.long arm920_dma_flush_range

#endif
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index c0ff8e4..474d4c6 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -251,7 +251,6 @@ arm922_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -270,7 +269,6 @@ arm922_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -287,7 +285,6 @@ ENTRY(arm922_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -314,6 +311,12 @@ ENTRY(arm922_dma_unmap_area)
mov pc, lr
ENDPROC(arm922_dma_unmap_area)

+ENTRY(arm922_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm922_dma_barrier)
+
ENTRY(arm922_cache_fns)
.long arm922_flush_kern_cache_all
.long arm922_flush_user_cache_all
@@ -323,6 +326,7 @@ ENTRY(arm922_cache_fns)
.long arm922_flush_kern_dcache_area
.long arm922_dma_map_area
.long arm922_dma_unmap_area
+ .long arm922_dma_barrier
.long arm922_dma_flush_range

#endif
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index 3c6cffe..0336ae3 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -295,7 +295,6 @@ arm925_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -316,7 +315,6 @@ arm925_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -338,7 +336,6 @@ ENTRY(arm925_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -365,6 +362,12 @@ ENTRY(arm925_dma_unmap_area)
mov pc, lr
ENDPROC(arm925_dma_unmap_area)

+ENTRY(arm925_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm925_dma_barrier)
+
ENTRY(arm925_cache_fns)
.long arm925_flush_kern_cache_all
.long arm925_flush_user_cache_all
@@ -374,6 +377,7 @@ ENTRY(arm925_cache_fns)
.long arm925_flush_kern_dcache_area
.long arm925_dma_map_area
.long arm925_dma_unmap_area
+ .long arm925_dma_barrier
.long arm925_dma_flush_range

ENTRY(cpu_arm925_dcache_clean_area)
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index 75b707c..473bbe6 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -258,7 +258,6 @@ arm926_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -279,7 +278,6 @@ arm926_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -301,7 +299,6 @@ ENTRY(arm926_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -328,6 +325,12 @@ ENTRY(arm926_dma_unmap_area)
mov pc, lr
ENDPROC(arm926_dma_unmap_area)

+ENTRY(arm926_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm926_dma_barrier)
+
ENTRY(arm926_cache_fns)
.long arm926_flush_kern_cache_all
.long arm926_flush_user_cache_all
@@ -337,6 +340,7 @@ ENTRY(arm926_cache_fns)
.long arm926_flush_kern_dcache_area
.long arm926_dma_map_area
.long arm926_dma_unmap_area
+ .long arm926_dma_barrier
.long arm926_dma_flush_range

ENTRY(cpu_arm926_dcache_clean_area)
diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S
index 1af1657..c44c963 100644
--- a/arch/arm/mm/proc-arm940.S
+++ b/arch/arm/mm/proc-arm940.S
@@ -180,7 +180,6 @@ arm940_dma_inv_range:
bcs 2b @ entries 63 to 0
subs r1, r1, #1 << 4
bcs 1b @ segments 7 to 0
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -204,7 +203,6 @@ ENTRY(cpu_arm940_dcache_clean_area)
subs r1, r1, #1 << 4
bcs 1b @ segments 7 to 0
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -230,7 +228,6 @@ ENTRY(arm940_dma_flush_range)
bcs 2b @ entries 63 to 0
subs r1, r1, #1 << 4
bcs 1b @ segments 7 to 0
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -257,6 +254,12 @@ ENTRY(arm940_dma_unmap_area)
mov pc, lr
ENDPROC(arm940_dma_unmap_area)

+ENTRY(arm940_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm940_dma_barrier)
+
ENTRY(arm940_cache_fns)
.long arm940_flush_kern_cache_all
.long arm940_flush_user_cache_all
@@ -266,6 +269,7 @@ ENTRY(arm940_cache_fns)
.long arm940_flush_kern_dcache_area
.long arm940_dma_map_area
.long arm940_dma_unmap_area
+ .long arm940_dma_barrier
.long arm940_dma_flush_range

__INIT
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index 1664b6a..11e9ad7 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -227,7 +227,6 @@ arm946_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -248,7 +247,6 @@ arm946_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -272,7 +270,6 @@ ENTRY(arm946_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -299,6 +296,12 @@ ENTRY(arm946_dma_unmap_area)
mov pc, lr
ENDPROC(arm946_dma_unmap_area)

+ENTRY(arm946_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm946_dma_barrier)
+
ENTRY(arm946_cache_fns)
.long arm946_flush_kern_cache_all
.long arm946_flush_user_cache_all
@@ -308,6 +311,7 @@ ENTRY(arm946_cache_fns)
.long arm946_flush_kern_dcache_area
.long arm946_dma_map_area
.long arm946_dma_unmap_area
+ .long arm946_dma_barrier
.long arm946_dma_flush_range


diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index 53e6323..50a309e 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -284,7 +284,6 @@ feroceon_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

.align 5
@@ -320,7 +319,6 @@ feroceon_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

.align 5
@@ -333,7 +331,6 @@ feroceon_range_dma_clean_range:
mcr p15, 5, r0, c15, c13, 0 @ D clean range start
mcr p15, 5, r1, c15, c13, 1 @ D clean range top
msr cpsr_c, r2 @ restore interrupts
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -351,7 +348,6 @@ ENTRY(feroceon_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

.align 5
@@ -364,7 +360,6 @@ ENTRY(feroceon_range_dma_flush_range)
mcr p15, 5, r0, c15, c15, 0 @ D clean/inv range start
mcr p15, 5, r1, c15, c15, 1 @ D clean/inv range top
msr cpsr_c, r2 @ restore interrupts
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -405,6 +400,12 @@ ENTRY(feroceon_dma_unmap_area)
mov pc, lr
ENDPROC(feroceon_dma_unmap_area)

+ENTRY(feroceon_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(feroceon_dma_barrier)
+
ENTRY(feroceon_cache_fns)
.long feroceon_flush_kern_cache_all
.long feroceon_flush_user_cache_all
@@ -414,6 +415,7 @@ ENTRY(feroceon_cache_fns)
.long feroceon_flush_kern_dcache_area
.long feroceon_dma_map_area
.long feroceon_dma_unmap_area
+ .long feroceon_dma_barrier
.long feroceon_dma_flush_range

ENTRY(feroceon_range_cache_fns)
@@ -425,6 +427,7 @@ ENTRY(feroceon_range_cache_fns)
.long feroceon_range_flush_kern_dcache_area
.long feroceon_range_dma_map_area
.long feroceon_dma_unmap_area
+ .long feroceon_dma_barrier
.long feroceon_range_dma_flush_range

.align 5
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index caa3115..09e8883 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -228,7 +228,6 @@ mohawk_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -247,7 +246,6 @@ mohawk_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -265,7 +263,6 @@ ENTRY(mohawk_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -292,6 +289,12 @@ ENTRY(mohawk_dma_unmap_area)
mov pc, lr
ENDPROC(mohawk_dma_unmap_area)

+ENTRY(mohawk_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(mohawk_dma_barrier)
+
ENTRY(mohawk_cache_fns)
.long mohawk_flush_kern_cache_all
.long mohawk_flush_user_cache_all
@@ -301,6 +304,7 @@ ENTRY(mohawk_cache_fns)
.long mohawk_flush_kern_dcache_area
.long mohawk_dma_map_area
.long mohawk_dma_unmap_area
+ .long mohawk_dma_barrier
.long mohawk_dma_flush_range

ENTRY(cpu_mohawk_dcache_clean_area)
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index 046b3d8..d033ed4 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -267,7 +267,6 @@ xsc3_dma_inv_range:
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ data write barrier
mov pc, lr

/*
@@ -284,7 +283,6 @@ xsc3_dma_clean_range:
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ data write barrier
mov pc, lr

/*
@@ -301,7 +299,6 @@ ENTRY(xsc3_dma_flush_range)
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ data write barrier
mov pc, lr

/*
@@ -328,6 +325,12 @@ ENTRY(xsc3_dma_unmap_area)
mov pc, lr
ENDPROC(xsc3_dma_unmap_area)

+ENTRY(xsc3_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ data write barrier
+ mov pc, lr
+ENDPROC(xsc3_dma_barrier)
+
ENTRY(xsc3_cache_fns)
.long xsc3_flush_kern_cache_all
.long xsc3_flush_user_cache_all
@@ -337,6 +340,7 @@ ENTRY(xsc3_cache_fns)
.long xsc3_flush_kern_dcache_area
.long xsc3_dma_map_area
.long xsc3_dma_unmap_area
+ .long xsc3_dma_barrier
.long xsc3_dma_flush_range

ENTRY(cpu_xsc3_dcache_clean_area)
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index 63037e2..e390ae6 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -325,7 +325,6 @@ xscale_dma_inv_range:
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ Drain Write (& Fill) Buffer
mov pc, lr

/*
@@ -342,7 +341,6 @@ xscale_dma_clean_range:
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ Drain Write (& Fill) Buffer
mov pc, lr

/*
@@ -360,7 +358,6 @@ ENTRY(xscale_dma_flush_range)
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ Drain Write (& Fill) Buffer
mov pc, lr

/*
@@ -400,6 +397,12 @@ ENTRY(xscale_dma_unmap_area)
mov pc, lr
ENDPROC(xscale_dma_unmap_area)

+ENTRY(xscale_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ Drain Write (& Fill) Buffer
+ mov pc, lr
+ENDPROC(xscale_dma_barrier)
+
ENTRY(xscale_cache_fns)
.long xscale_flush_kern_cache_all
.long xscale_flush_user_cache_all
@@ -409,6 +412,7 @@ ENTRY(xscale_cache_fns)
.long xscale_flush_kern_dcache_area
.long xscale_dma_map_area
.long xscale_dma_unmap_area
+ .long xscale_dma_barrier
.long xscale_dma_flush_range

/*

2010-02-10 22:40:57

by Russell King - ARM Linux

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On Wed, Feb 10, 2010 at 01:27:47PM -0800, Randy Dunlap wrote:
> On 02/10/10 12:37, [email protected] wrote:
> > From: Abhijeet Dharmapurikar <[email protected]>
> >
> > Please refer to the post here
> > http://lkml.org/lkml/2010/1/4/347
> >
> > These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> > use them to map the buffers in the scatterlist. For the last buffer, call
> > the normal dma_map_area(aka with barriers) effectively executing the barrier
> > at the end of the operation.
> >
> > Note that the barrierless operations are implemented for few arm
> > architectures only and I would implement for others once these are okayed by the
> > community.
>
> So when you add these interfaces for other architectures, you will also
> update Documentation/DMA-API.txt, right??

Do we need barrier-less interfaces for anything other than the dma_*_sg
functions?

2010-02-10 23:10:42

by Abhijeet Dharmapurikar

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

Russell King - ARM Linux wrote:
> On Wed, Feb 10, 2010 at 01:27:47PM -0800, Randy Dunlap wrote:
>> On 02/10/10 12:37, [email protected] wrote:
>>> From: Abhijeet Dharmapurikar <[email protected]>
>>>
>>> Please refer to the post here
>>> http://lkml.org/lkml/2010/1/4/347
>>>
>>> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
>>> use them to map the buffers in the scatterlist. For the last buffer, call
>>> the normal dma_map_area(aka with barriers) effectively executing the barrier
>>> at the end of the operation.
>>>
>>> Note that the barrierless operations are implemented for few arm
>>> architectures only and I would implement for others once these are okayed by the
>>> community.
>> So when you add these interfaces for other architectures, you will also
>> update Documentation/DMA-API.txt, right??
>
> Do we need barrier-less interfaces for anything other than the dma_*_sg
> functions?

I think dma_*_sg are the only ones that could benefit from barrier-less
interfaces.


2010-02-10 23:28:21

by Abhijeet Dharmapurikar

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

Russell King - ARM Linux wrote:
> On Wed, Feb 10, 2010 at 12:37:28PM -0800, [email protected] wrote:
>> From: Abhijeet Dharmapurikar <[email protected]>
>>
>> Please refer to the post here
>> http://lkml.org/lkml/2010/1/4/347
>>
>> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
>> use them to map the buffers in the scatterlist. For the last buffer, call
>> the normal dma_map_area(aka with barriers) effectively executing the barrier
>> at the end of the operation.
>
> What if we make dma_map_area and dma_unmap_area both be barrier-less,
> and instead have a separate dma_barrier method - eg, something like the
> attached?
>
> This might allow for better I-cache usage by not having to duplicate the
> DMA cache coherence functions.

Agreed, thanks for pointing this out and for the patch.


>
> @@ -369,6 +372,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
> BUG_ON(!valid_dma_direction(dir));
>
> __dma_page_cpu_to_dev(page, offset, size, dir);
> + __dma_barrier(dir);
>
> return page_to_dma(dev, page) + offset;
> }

dma_map_page is going to execute the barrier here.


> /**
> * dma_map_sg - map a set of SG buffers for streaming mode DMA
> * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
> @@ -537,6 +544,9 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
> if (dma_mapping_error(dev, s->dma_address))
> goto bad_mapping;
> }
> +
> + __dma_barrier(dir);
> +
> return nents;

This would call the barrier in addition to the ones executed by
dma_map_page.

We would need to call __dma_page_cpu_to_dev instead of dma_map_page and
do the barrier before returning.
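
For illustration, a minimal sketch of what this suggests -- not part of the
posted patches. Helper names follow the existing arch/arm/mm code plus the
__dma_barrier() from the patch quoted above; dma_mapping_error handling and
the dmabounce case are omitted.

int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
	       enum dma_data_direction dir)
{
	struct scatterlist *s;
	int i;

	for_each_sg(sg, s, nents, i) {
		/* barrier-less per-buffer cache maintenance */
		__dma_page_cpu_to_dev(sg_page(s), s->offset,
				      s->length, dir);
		s->dma_address = page_to_dma(dev, sg_page(s)) + s->offset;
	}

	__dma_barrier(dir);	/* one barrier for the whole list */

	return nents;
}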

2010-02-10 23:57:26

by Russell King - ARM Linux

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On Wed, Feb 10, 2010 at 03:28:17PM -0800, Abhijeet Dharmapurikar wrote:
> This would call the barrier in addition to the ones executed by
> dma_map_page.
>
> We would need to call __dma_page_cpu_to_dev instead of dma_map_page and
> do the barrier before returning.

It's not that simple because of the dmabounce crap. Ho hum, let's add
yet another layer of indirection for it.

arch/arm/include/asm/cacheflush.h | 4 ++++
arch/arm/include/asm/dma-mapping.h | 19 +++++++++++++++++--
arch/arm/mm/cache-fa.S | 13 +++++++------
arch/arm/mm/cache-v3.S | 3 +++
arch/arm/mm/cache-v4.S | 3 +++
arch/arm/mm/cache-v4wb.S | 9 +++++++--
arch/arm/mm/cache-v4wt.S | 3 +++
arch/arm/mm/cache-v6.S | 13 +++++++------
arch/arm/mm/cache-v7.S | 9 ++++++---
arch/arm/mm/dma-mapping.c | 18 +++++++++++++++++-
arch/arm/mm/proc-arm1020e.S | 10 +++++++---
arch/arm/mm/proc-arm1022.S | 10 +++++++---
arch/arm/mm/proc-arm1026.S | 10 +++++++---
arch/arm/mm/proc-arm920.S | 10 +++++++---
arch/arm/mm/proc-arm922.S | 10 +++++++---
arch/arm/mm/proc-arm925.S | 10 +++++++---
arch/arm/mm/proc-arm926.S | 10 +++++++---
arch/arm/mm/proc-arm940.S | 10 +++++++---
arch/arm/mm/proc-arm946.S | 10 +++++++---
arch/arm/mm/proc-feroceon.S | 13 ++++++++-----
arch/arm/mm/proc-mohawk.S | 10 +++++++---
arch/arm/mm/proc-xsc3.S | 10 +++++++---
arch/arm/mm/proc-xscale.S | 10 +++++++---
23 files changed, 166 insertions(+), 61 deletions(-)

diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index e290885..5928e78 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -200,6 +200,7 @@ struct cpu_cache_fns {

void (*dma_map_area)(const void *, size_t, int);
void (*dma_unmap_area)(const void *, size_t, int);
+ void (*dma_barrier)(void);

void (*dma_flush_range)(const void *, const void *);
};
@@ -232,6 +233,7 @@ extern struct cpu_cache_fns cpu_cache;
*/
#define dmac_map_area cpu_cache.dma_map_area
#define dmac_unmap_area cpu_cache.dma_unmap_area
+#define dmac_barrier cpu_cache.dma_barrier
#define dmac_flush_range cpu_cache.dma_flush_range

#else
@@ -258,10 +260,12 @@ extern void __cpuc_flush_dcache_area(void *, size_t);
*/
#define dmac_map_area __glue(_CACHE,_dma_map_area)
#define dmac_unmap_area __glue(_CACHE,_dma_unmap_area)
+#define dmac_barrier __glue(_CACHE,_dma_barrier)
#define dmac_flush_range __glue(_CACHE,_dma_flush_range)

extern void dmac_map_area(const void *, size_t, int);
extern void dmac_unmap_area(const void *, size_t, int);
+extern void dmac_barrier(void);
extern void dmac_flush_range(const void *, const void *);

#endif
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 256ee1c..1371db7 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -110,6 +110,8 @@ static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
___dma_page_dev_to_cpu(page, off, size, dir);
}

+extern void __dma_barrier(enum dma_data_direction);
+
/*
* Return whether the given device DMA address mask can be supported
* properly. For example, if your device can only drive the low 24-bits
@@ -299,7 +301,7 @@ extern dma_addr_t dma_map_single(struct device *, void *, size_t,
enum dma_data_direction);
extern void dma_unmap_single(struct device *, dma_addr_t, size_t,
enum dma_data_direction);
-extern dma_addr_t dma_map_page(struct device *, struct page *,
+extern dma_addr_t __dma_map_page(struct device *, struct page *,
unsigned long, size_t, enum dma_data_direction);
extern void dma_unmap_page(struct device *, dma_addr_t, size_t,
enum dma_data_direction);
@@ -345,6 +347,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
BUG_ON(!valid_dma_direction(dir));

__dma_single_cpu_to_dev(cpu_addr, size, dir);
+ __dma_barrier(dir);

return virt_to_dma(dev, cpu_addr);
}
@@ -363,7 +366,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
* The device owns this memory once this call has completed. The CPU
* can regain ownership by calling dma_unmap_page().
*/
-static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
+static inline dma_addr_t __dma_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir)
{
BUG_ON(!valid_dma_direction(dir));
@@ -373,6 +376,14 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
return page_to_dma(dev, page) + offset;
}

+static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir)
+{
+ dma_addr_t addr = __dma_map_page(dev, page, offset, size, dir);
+ __dma_barrier(dir);
+ return addr;
+}
+
/**
* dma_unmap_single - unmap a single buffer previously mapped
* @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -391,6 +402,7 @@ static inline void dma_unmap_single(struct device *dev, dma_addr_t handle,
size_t size, enum dma_data_direction dir)
{
__dma_single_dev_to_cpu(dma_to_virt(dev, handle), size, dir);
+ __dma_barrier(dir);
}

/**
@@ -412,6 +424,7 @@ static inline void dma_unmap_page(struct device *dev, dma_addr_t handle,
{
__dma_page_dev_to_cpu(dma_to_page(dev, handle), handle & ~PAGE_MASK,
size, dir);
+ __dma_barrier(dir);
}
#endif /* CONFIG_DMABOUNCE */

@@ -443,6 +456,7 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
return;

__dma_single_dev_to_cpu(dma_to_virt(dev, handle) + offset, size, dir);
+ __dma_barrier(dir);
}

static inline void dma_sync_single_range_for_device(struct device *dev,
@@ -455,6 +469,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
return;

__dma_single_cpu_to_dev(dma_to_virt(dev, handle) + offset, size, dir);
+ __dma_barrier(dir);
}

static inline void dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
index 7148e53..cdcfae2 100644
--- a/arch/arm/mm/cache-fa.S
+++ b/arch/arm/mm/cache-fa.S
@@ -168,8 +168,6 @@ fa_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -186,8 +184,6 @@ fa_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -201,8 +197,6 @@ ENTRY(fa_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -229,6 +223,12 @@ ENTRY(fa_dma_unmap_area)
mov pc, lr
ENDPROC(fa_dma_unmap_area)

+ENTRY(fa_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+ENDPROC(fa_dma_barrier)
+
__INITDATA

.type fa_cache_fns, #object
@@ -241,5 +241,6 @@ ENTRY(fa_cache_fns)
.long fa_flush_kern_dcache_area
.long fa_dma_map_area
.long fa_dma_unmap_area
+ .long fa_dma_barrier
.long fa_dma_flush_range
.size fa_cache_fns, . - fa_cache_fns
diff --git a/arch/arm/mm/cache-v3.S b/arch/arm/mm/cache-v3.S
index c2ff3c5..df34458 100644
--- a/arch/arm/mm/cache-v3.S
+++ b/arch/arm/mm/cache-v3.S
@@ -114,9 +114,11 @@ ENTRY(v3_dma_unmap_area)
* - dir - DMA direction
*/
ENTRY(v3_dma_map_area)
+ENTRY(v3_dma_barrier)
mov pc, lr
ENDPROC(v3_dma_unmap_area)
ENDPROC(v3_dma_map_area)
+ENDPROC(v3_dma_barrier)

__INITDATA

@@ -130,5 +132,6 @@ ENTRY(v3_cache_fns)
.long v3_flush_kern_dcache_area
.long v3_dma_map_area
.long v3_dma_unmap_area
+ .long v3_dma_barrier
.long v3_dma_flush_range
.size v3_cache_fns, . - v3_cache_fns
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index 4810f7e..20260b1 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -126,9 +126,11 @@ ENTRY(v4_dma_unmap_area)
* - dir - DMA direction
*/
ENTRY(v4_dma_map_area)
+ENTRY(v4_dma_barrier)
mov pc, lr
ENDPROC(v4_dma_unmap_area)
ENDPROC(v4_dma_map_area)
+ENDPROC(v4_dma_barrier)

__INITDATA

@@ -142,5 +144,6 @@ ENTRY(v4_cache_fns)
.long v4_flush_kern_dcache_area
.long v4_dma_map_area
.long v4_dma_unmap_area
+ .long v4_dma_barrier
.long v4_dma_flush_range
.size v4_cache_fns, . - v4_cache_fns
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index df8368a..9c9c875 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -183,7 +183,6 @@ v4wb_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -200,7 +199,6 @@ v4wb_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -240,6 +238,12 @@ ENTRY(v4wb_dma_unmap_area)
mov pc, lr
ENDPROC(v4wb_dma_unmap_area)

+ENTRY(v4wb_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+ENDPROC(v4wb_dma_barrier)
+
__INITDATA

.type v4wb_cache_fns, #object
@@ -252,5 +256,6 @@ ENTRY(v4wb_cache_fns)
.long v4wb_flush_kern_dcache_area
.long v4wb_dma_map_area
.long v4wb_dma_unmap_area
+ .long v4wb_dma_barrier
.long v4wb_dma_flush_range
.size v4wb_cache_fns, . - v4wb_cache_fns
diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S
index 45c7031..223eea4 100644
--- a/arch/arm/mm/cache-v4wt.S
+++ b/arch/arm/mm/cache-v4wt.S
@@ -180,9 +180,11 @@ ENTRY(v4wt_dma_unmap_area)
* - dir - DMA direction
*/
ENTRY(v4wt_dma_map_area)
+ENTRY(v4wt_dma_barrier)
mov pc, lr
ENDPROC(v4wt_dma_unmap_area)
ENDPROC(v4wt_dma_map_area)
+ENDPROC(v4wt_dma_barrier)

__INITDATA

@@ -196,5 +198,6 @@ ENTRY(v4wt_cache_fns)
.long v4wt_flush_kern_dcache_area
.long v4wt_dma_map_area
.long v4wt_dma_unmap_area
+ .long v4wt_dma_barrier
.long v4wt_dma_flush_range
.size v4wt_cache_fns, . - v4wt_cache_fns
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 9d89c67..b294854 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -219,8 +219,6 @@ v6_dma_inv_range:
add r0, r0, #D_CACHE_LINE_SIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -239,8 +237,6 @@ v6_dma_clean_range:
add r0, r0, #D_CACHE_LINE_SIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -259,8 +255,6 @@ ENTRY(v6_dma_flush_range)
add r0, r0, #D_CACHE_LINE_SIZE
cmp r0, r1
blo 1b
- mov r0, #0
- mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr

/*
@@ -289,6 +283,12 @@ ENTRY(v6_dma_unmap_area)
mov pc, lr
ENDPROC(v6_dma_unmap_area)

+ENTRY(v6_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+ENDPROC(v6_dma_barrier)
+
__INITDATA

.type v6_cache_fns, #object
@@ -301,5 +301,6 @@ ENTRY(v6_cache_fns)
.long v6_flush_kern_dcache_area
.long v6_dma_map_area
.long v6_dma_unmap_area
+ .long v6_dma_barrier
.long v6_dma_flush_range
.size v6_cache_fns, . - v6_cache_fns
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index bcd64f2..d89d55a 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -231,7 +231,6 @@ v7_dma_inv_range:
add r0, r0, r2
cmp r0, r1
blo 1b
- dsb
mov pc, lr
ENDPROC(v7_dma_inv_range)

@@ -249,7 +248,6 @@ v7_dma_clean_range:
add r0, r0, r2
cmp r0, r1
blo 1b
- dsb
mov pc, lr
ENDPROC(v7_dma_clean_range)

@@ -267,7 +265,6 @@ ENTRY(v7_dma_flush_range)
add r0, r0, r2
cmp r0, r1
blo 1b
- dsb
mov pc, lr
ENDPROC(v7_dma_flush_range)

@@ -297,6 +294,11 @@ ENTRY(v7_dma_unmap_area)
mov pc, lr
ENDPROC(v7_dma_unmap_area)

+ENTRY(v7_dma_barrier)
+ dsb
+ mov pc, lr
+ENDPROC(v7_dma_barrier)
+
__INITDATA

.type v7_cache_fns, #object
@@ -309,5 +311,6 @@ ENTRY(v7_cache_fns)
.long v7_flush_kern_dcache_area
.long v7_dma_map_area
.long v7_dma_unmap_area
+ .long v7_dma_barrier
.long v7_dma_flush_range
.size v7_cache_fns, . - v7_cache_fns
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 64daef2..d807f38 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -108,6 +108,7 @@ static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gf
memset(ptr, 0, size);
dmac_flush_range(ptr, ptr + size);
outer_flush_range(__pa(ptr), __pa(ptr) + size);
+ dmac_barrier();

return page;
}
@@ -509,6 +510,12 @@ void ___dma_page_dev_to_cpu(struct page *page, unsigned long off,
}
EXPORT_SYMBOL(___dma_page_dev_to_cpu);

+void __dma_barrier(enum dma_data_direction dir)
+{
+ dmac_barrier();
+}
+EXPORT_SYMBOL(__dma_barrier);
+
/**
* dma_map_sg - map a set of SG buffers for streaming mode DMA
* @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -532,11 +539,14 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
int i, j;

for_each_sg(sg, s, nents, i) {
- s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
+ s->dma_address = __dma_map_page(dev, sg_page(s), s->offset,
s->length, dir);
if (dma_mapping_error(dev, s->dma_address))
goto bad_mapping;
}
+
+ __dma_barrier(dir);
+
return nents;

bad_mapping:
@@ -564,6 +574,8 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,

for_each_sg(sg, s, nents, i)
dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+
+ __dma_barrier(dir);
}
EXPORT_SYMBOL(dma_unmap_sg);

@@ -588,6 +600,8 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
__dma_page_dev_to_cpu(sg_page(s), s->offset,
s->length, dir);
}
+
+ __dma_barrier(dir);
}
EXPORT_SYMBOL(dma_sync_sg_for_cpu);

@@ -612,5 +626,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
__dma_page_cpu_to_dev(sg_page(s), s->offset,
s->length, dir);
}
+
+ __dma_barrier(dir);
}
EXPORT_SYMBOL(dma_sync_sg_for_device);
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index d278298..fea33c9 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -271,7 +271,6 @@ arm1020e_dma_inv_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -293,7 +292,6 @@ arm1020e_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -313,7 +311,6 @@ ENTRY(arm1020e_dma_flush_range)
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -340,6 +337,12 @@ ENTRY(arm1020e_dma_unmap_area)
mov pc, lr
ENDPROC(arm1020e_dma_unmap_area)

+ENTRY(arm1020e_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm1020e_dma_barrier)
+
ENTRY(arm1020e_cache_fns)
.long arm1020e_flush_kern_cache_all
.long arm1020e_flush_user_cache_all
@@ -349,6 +352,7 @@ ENTRY(arm1020e_cache_fns)
.long arm1020e_flush_kern_dcache_area
.long arm1020e_dma_map_area
.long arm1020e_dma_unmap_area
+ .long arm1020e_dma_barrier
.long arm1020e_dma_flush_range

.align 5
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index ce13e4a..ba1a7df 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -260,7 +260,6 @@ arm1022_dma_inv_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -282,7 +281,6 @@ arm1022_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -302,7 +300,6 @@ ENTRY(arm1022_dma_flush_range)
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -329,6 +326,12 @@ ENTRY(arm1022_dma_unmap_area)
mov pc, lr
ENDPROC(arm1022_dma_unmap_area)

+ENTRY(arm1022_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm1022_dma_barrier)
+
ENTRY(arm1022_cache_fns)
.long arm1022_flush_kern_cache_all
.long arm1022_flush_user_cache_all
@@ -338,6 +341,7 @@ ENTRY(arm1022_cache_fns)
.long arm1022_flush_kern_dcache_area
.long arm1022_dma_map_area
.long arm1022_dma_unmap_area
+ .long arm1022_dma_barrier
.long arm1022_dma_flush_range

.align 5
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index 636672a..de648f1 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -254,7 +254,6 @@ arm1026_dma_inv_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -276,7 +275,6 @@ arm1026_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -296,7 +294,6 @@ ENTRY(arm1026_dma_flush_range)
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -323,6 +320,12 @@ ENTRY(arm1026_dma_unmap_area)
mov pc, lr
ENDPROC(arm1026_dma_unmap_area)

+ENTRY(arm1026_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm1026_dma_barrier)
+
ENTRY(arm1026_cache_fns)
.long arm1026_flush_kern_cache_all
.long arm1026_flush_user_cache_all
@@ -332,6 +335,7 @@ ENTRY(arm1026_cache_fns)
.long arm1026_flush_kern_dcache_area
.long arm1026_dma_map_area
.long arm1026_dma_unmap_area
+ .long arm1026_dma_barrier
.long arm1026_dma_flush_range

.align 5
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index 8be8199..ec74093 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -249,7 +249,6 @@ arm920_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -268,7 +267,6 @@ arm920_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -285,7 +283,6 @@ ENTRY(arm920_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -312,6 +309,12 @@ ENTRY(arm920_dma_unmap_area)
mov pc, lr
ENDPROC(arm920_dma_unmap_area)

+ENTRY(arm920_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm920_dma_barrier)
+
ENTRY(arm920_cache_fns)
.long arm920_flush_kern_cache_all
.long arm920_flush_user_cache_all
@@ -321,6 +324,7 @@ ENTRY(arm920_cache_fns)
.long arm920_flush_kern_dcache_area
.long arm920_dma_map_area
.long arm920_dma_unmap_area
+ .long arm920_dma_barrier
.long arm920_dma_flush_range

#endif
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index c0ff8e4..474d4c6 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -251,7 +251,6 @@ arm922_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -270,7 +269,6 @@ arm922_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -287,7 +285,6 @@ ENTRY(arm922_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -314,6 +311,12 @@ ENTRY(arm922_dma_unmap_area)
mov pc, lr
ENDPROC(arm922_dma_unmap_area)

+ENTRY(arm922_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm922_dma_barrier)
+
ENTRY(arm922_cache_fns)
.long arm922_flush_kern_cache_all
.long arm922_flush_user_cache_all
@@ -323,6 +326,7 @@ ENTRY(arm922_cache_fns)
.long arm922_flush_kern_dcache_area
.long arm922_dma_map_area
.long arm922_dma_unmap_area
+ .long arm922_dma_barrier
.long arm922_dma_flush_range

#endif
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index 3c6cffe..0336ae3 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -295,7 +295,6 @@ arm925_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -316,7 +315,6 @@ arm925_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -338,7 +336,6 @@ ENTRY(arm925_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -365,6 +362,12 @@ ENTRY(arm925_dma_unmap_area)
mov pc, lr
ENDPROC(arm925_dma_unmap_area)

+ENTRY(arm925_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm925_dma_barrier)
+
ENTRY(arm925_cache_fns)
.long arm925_flush_kern_cache_all
.long arm925_flush_user_cache_all
@@ -374,6 +377,7 @@ ENTRY(arm925_cache_fns)
.long arm925_flush_kern_dcache_area
.long arm925_dma_map_area
.long arm925_dma_unmap_area
+ .long arm925_dma_barrier
.long arm925_dma_flush_range

ENTRY(cpu_arm925_dcache_clean_area)
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index 75b707c..473bbe6 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -258,7 +258,6 @@ arm926_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -279,7 +278,6 @@ arm926_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -301,7 +299,6 @@ ENTRY(arm926_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -328,6 +325,12 @@ ENTRY(arm926_dma_unmap_area)
mov pc, lr
ENDPROC(arm926_dma_unmap_area)

+ENTRY(arm926_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm926_dma_barrier)
+
ENTRY(arm926_cache_fns)
.long arm926_flush_kern_cache_all
.long arm926_flush_user_cache_all
@@ -337,6 +340,7 @@ ENTRY(arm926_cache_fns)
.long arm926_flush_kern_dcache_area
.long arm926_dma_map_area
.long arm926_dma_unmap_area
+ .long arm926_dma_barrier
.long arm926_dma_flush_range

ENTRY(cpu_arm926_dcache_clean_area)
diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S
index 1af1657..c44c963 100644
--- a/arch/arm/mm/proc-arm940.S
+++ b/arch/arm/mm/proc-arm940.S
@@ -180,7 +180,6 @@ arm940_dma_inv_range:
bcs 2b @ entries 63 to 0
subs r1, r1, #1 << 4
bcs 1b @ segments 7 to 0
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -204,7 +203,6 @@ ENTRY(cpu_arm940_dcache_clean_area)
subs r1, r1, #1 << 4
bcs 1b @ segments 7 to 0
#endif
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -230,7 +228,6 @@ ENTRY(arm940_dma_flush_range)
bcs 2b @ entries 63 to 0
subs r1, r1, #1 << 4
bcs 1b @ segments 7 to 0
- mcr p15, 0, ip, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -257,6 +254,12 @@ ENTRY(arm940_dma_unmap_area)
mov pc, lr
ENDPROC(arm940_dma_unmap_area)

+ENTRY(arm940_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm940_dma_barrier)
+
ENTRY(arm940_cache_fns)
.long arm940_flush_kern_cache_all
.long arm940_flush_user_cache_all
@@ -266,6 +269,7 @@ ENTRY(arm940_cache_fns)
.long arm940_flush_kern_dcache_area
.long arm940_dma_map_area
.long arm940_dma_unmap_area
+ .long arm940_dma_barrier
.long arm940_dma_flush_range

__INIT
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index 1664b6a..11e9ad7 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -227,7 +227,6 @@ arm946_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -248,7 +247,6 @@ arm946_dma_clean_range:
cmp r0, r1
blo 1b
#endif
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -272,7 +270,6 @@ ENTRY(arm946_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -299,6 +296,12 @@ ENTRY(arm946_dma_unmap_area)
mov pc, lr
ENDPROC(arm946_dma_unmap_area)

+ENTRY(arm946_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(arm946_dma_barrier)
+
ENTRY(arm946_cache_fns)
.long arm946_flush_kern_cache_all
.long arm946_flush_user_cache_all
@@ -308,6 +311,7 @@ ENTRY(arm946_cache_fns)
.long arm946_flush_kern_dcache_area
.long arm946_dma_map_area
.long arm946_dma_unmap_area
+ .long arm946_dma_barrier
.long arm946_dma_flush_range


diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index 53e6323..50a309e 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -284,7 +284,6 @@ feroceon_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

.align 5
@@ -320,7 +319,6 @@ feroceon_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

.align 5
@@ -333,7 +331,6 @@ feroceon_range_dma_clean_range:
mcr p15, 5, r0, c15, c13, 0 @ D clean range start
mcr p15, 5, r1, c15, c13, 1 @ D clean range top
msr cpsr_c, r2 @ restore interrupts
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -351,7 +348,6 @@ ENTRY(feroceon_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

.align 5
@@ -364,7 +360,6 @@ ENTRY(feroceon_range_dma_flush_range)
mcr p15, 5, r0, c15, c15, 0 @ D clean/inv range start
mcr p15, 5, r1, c15, c15, 1 @ D clean/inv range top
msr cpsr_c, r2 @ restore interrupts
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -405,6 +400,12 @@ ENTRY(feroceon_dma_unmap_area)
mov pc, lr
ENDPROC(feroceon_dma_unmap_area)

+ENTRY(feroceon_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(feroceon_dma_barrier)
+
ENTRY(feroceon_cache_fns)
.long feroceon_flush_kern_cache_all
.long feroceon_flush_user_cache_all
@@ -414,6 +415,7 @@ ENTRY(feroceon_cache_fns)
.long feroceon_flush_kern_dcache_area
.long feroceon_dma_map_area
.long feroceon_dma_unmap_area
+ .long feroceon_dma_barrier
.long feroceon_dma_flush_range

ENTRY(feroceon_range_cache_fns)
@@ -425,6 +427,7 @@ ENTRY(feroceon_range_cache_fns)
.long feroceon_range_flush_kern_dcache_area
.long feroceon_range_dma_map_area
.long feroceon_dma_unmap_area
+ .long feroceon_dma_barrier
.long feroceon_range_dma_flush_range

.align 5
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index caa3115..09e8883 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -228,7 +228,6 @@ mohawk_dma_inv_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -247,7 +246,6 @@ mohawk_dma_clean_range:
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -265,7 +263,6 @@ ENTRY(mohawk_dma_flush_range)
add r0, r0, #CACHE_DLINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ drain WB
mov pc, lr

/*
@@ -292,6 +289,12 @@ ENTRY(mohawk_dma_unmap_area)
mov pc, lr
ENDPROC(mohawk_dma_unmap_area)

+ENTRY(mohawk_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ drain WB
+ mov pc, lr
+ENDPROC(mohawk_dma_barrier)
+
ENTRY(mohawk_cache_fns)
.long mohawk_flush_kern_cache_all
.long mohawk_flush_user_cache_all
@@ -301,6 +304,7 @@ ENTRY(mohawk_cache_fns)
.long mohawk_flush_kern_dcache_area
.long mohawk_dma_map_area
.long mohawk_dma_unmap_area
+ .long mohawk_dma_barrier
.long mohawk_dma_flush_range

ENTRY(cpu_mohawk_dcache_clean_area)
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index 046b3d8..d033ed4 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -267,7 +267,6 @@ xsc3_dma_inv_range:
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ data write barrier
mov pc, lr

/*
@@ -284,7 +283,6 @@ xsc3_dma_clean_range:
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ data write barrier
mov pc, lr

/*
@@ -301,7 +299,6 @@ ENTRY(xsc3_dma_flush_range)
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ data write barrier
mov pc, lr

/*
@@ -328,6 +325,12 @@ ENTRY(xsc3_dma_unmap_area)
mov pc, lr
ENDPROC(xsc3_dma_unmap_area)

+ENTRY(xsc3_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ data write barrier
+ mov pc, lr
+ENDPROC(xsc3_dma_barrier)
+
ENTRY(xsc3_cache_fns)
.long xsc3_flush_kern_cache_all
.long xsc3_flush_user_cache_all
@@ -337,6 +340,7 @@ ENTRY(xsc3_cache_fns)
.long xsc3_flush_kern_dcache_area
.long xsc3_dma_map_area
.long xsc3_dma_unmap_area
+ .long xsc3_dma_barrier
.long xsc3_dma_flush_range

ENTRY(cpu_xsc3_dcache_clean_area)
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index 63037e2..e390ae6 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -325,7 +325,6 @@ xscale_dma_inv_range:
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ Drain Write (& Fill) Buffer
mov pc, lr

/*
@@ -342,7 +341,6 @@ xscale_dma_clean_range:
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ Drain Write (& Fill) Buffer
mov pc, lr

/*
@@ -360,7 +358,6 @@ ENTRY(xscale_dma_flush_range)
add r0, r0, #CACHELINESIZE
cmp r0, r1
blo 1b
- mcr p15, 0, r0, c7, c10, 4 @ Drain Write (& Fill) Buffer
mov pc, lr

/*
@@ -400,6 +397,12 @@ ENTRY(xscale_dma_unmap_area)
mov pc, lr
ENDPROC(xscale_dma_unmap_area)

+ENTRY(xscale_dma_barrier)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c10, 4 @ Drain Write (& Fill) Buffer
+ mov pc, lr
+ENDPROC(xscale_dma_barrier)
+
ENTRY(xscale_cache_fns)
.long xscale_flush_kern_cache_all
.long xscale_flush_user_cache_all
@@ -409,6 +412,7 @@ ENTRY(xscale_cache_fns)
.long xscale_flush_kern_dcache_area
.long xscale_dma_map_area
.long xscale_dma_unmap_area
+ .long xscale_dma_barrier
.long xscale_dma_flush_range

/*

2010-02-11 00:40:36

by FUJITA Tomonori

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On Wed, 10 Feb 2010 13:27:47 -0800
Randy Dunlap <[email protected]> wrote:

> On 02/10/10 12:37, [email protected] wrote:
> > From: Abhijeet Dharmapurikar <[email protected]>
> >
> > Please refer to the post here
> > http://lkml.org/lkml/2010/1/4/347
> >
> > These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> > use them to map the buffers in the scatterlist. For the last buffer, call
> > the normal dma_map_area(aka with barriers) effectively executing the barrier
> > at the end of the operation.
> >
> > Note that the barrierless operations are implemented for few arm
> > architectures only and I would implement for others once these are okayed by the
> > community.
>
> So when you add these interfaces for other architectures, you will also
> update Documentation/DMA-API.txt, right??

It seems that you misunderstood him.

He is talking about other "arm" architectures. His patchset improves
arm's internal implementation (dma_map_area and dma_unmap_area are not
the DMA API; not exported for driver writers). He meant that the
patchset doesn't cover all arm architectures.

This is about arm's implementation details and is not related to other
non-arm architectures. So there is no need to update Documentation/DMA-API.txt.

2010-02-11 00:41:40

by Randy Dunlap

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On 02/10/10 16:39, FUJITA Tomonori wrote:
> On Wed, 10 Feb 2010 13:27:47 -0800
> Randy Dunlap <[email protected]> wrote:
>
>> On 02/10/10 12:37, [email protected] wrote:
>>> From: Abhijeet Dharmapurikar <[email protected]>
>>>
>>> Please refer to the post here
>>> http://lkml.org/lkml/2010/1/4/347
>>>
>>> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
>>> use them to map the buffers in the scatterlist. For the last buffer, call
>>> the normal dma_map_area(aka with barriers) effectively executing the barrier
>>> at the end of the operation.
>>>
>>> Note that the barrierless operations are implemented for few arm
>>> architectures only and I would implement for others once these are okayed by the
>>> community.
>>
>> So when you add these interfaces for other architectures, you will also
>> update Documentation/DMA-API.txt, right??
>
> Seems that you misunderstand him.
>
> He is talking about other "arm" architectures. His patchset improves
> arm's internal implementation (dma_map_area and dma_unmap_area are not
> the DMA API; not exported for driver writers). He meant that the
> patchset doesn't cover all arm architectures.
>
> This is about arm's implementation details and not related with other
> non arm architectures. So no need to update Documentation/DMA-API.txt.

OK, in that case I did misunderstand. Thanks for the info.

--
~Randy

2010-02-11 10:45:32

by Catalin Marinas

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On Wed, 2010-02-10 at 21:21 +0000, Russell King - ARM Linux wrote:
> On Wed, Feb 10, 2010 at 12:37:28PM -0800, [email protected] wrote:
> > From: Abhijeet Dharmapurikar <[email protected]>
> >
> > Please refer to the post here
> > http://lkml.org/lkml/2010/1/4/347
> >
> > These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> > use them to map the buffers in the scatterlist. For the last buffer, call
> > the normal dma_map_area(aka with barriers) effectively executing the barrier
> > at the end of the operation.
>
> What if we make dma_map_area and dma_unmap_area both be barrier-less,
> and instead have a separate dma_barrier method - eg, something like the
> attached?

I was just writing the reply when I noticed yours :). Yes, that's a
better approach.

> diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
> index e290885..5928e78 100644
> --- a/arch/arm/include/asm/cacheflush.h
> +++ b/arch/arm/include/asm/cacheflush.h
> @@ -200,6 +200,7 @@ struct cpu_cache_fns {
>
> void (*dma_map_area)(const void *, size_t, int);
> void (*dma_unmap_area)(const void *, size_t, int);
> + void (*dma_barrier)(void);

Alternatively we could use the dsb() macro. I don't think we need more
than this and we would not (well, not easily) compile ARMv5 and ARMv6 in
the same kernel.

Anyway, an additional branch and return would probably be negligible
compared to the cache flushing operation.
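
As an illustrative sketch only (not from the thread), this alternative
amounts to mapping the machine-independent barrier straight onto the kernel's
existing dsb() barrier macro instead of going through a cpu_cache_fns
pointer:

/* compile-time barrier instead of a per-CPU function pointer */
#define dmac_barrier()	dsb()

Russell's reply below points out why a compile-time macro cannot cover the
older CPUs that lack a drain-write-buffer instruction.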

> @@ -345,6 +347,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
> BUG_ON(!valid_dma_direction(dir));
>
> __dma_single_cpu_to_dev(cpu_addr, size, dir);
> + __dma_barrier(dir);
>
> return virt_to_dma(dev, cpu_addr);
> }

The ___dma_single_cpu_to_dev() covers both inner and outer caches but I
haven't seen it touched by this patch (nor the other you posted). When
you clean the L1 cache, you need to make sure that there is a barrier
(DSB) so that it completes before cleaning the L2, otherwise you clean
the L2 but data keeps coming from L1.

For the *_sg functions, you either use barrier between L1 and L2 for
each page or you do the for_each_sg() loop twice, once for L1 and
another for L2.
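
A rough sketch of the second option -- illustrative only, not from the
thread. Helper names follow the existing dma-mapping.c code plus the
dmac_barrier() proposed above; highmem and dmabounce are ignored.

	/* pass 1: inner (L1) maintenance, no barrier per buffer */
	for_each_sg(sg, s, nents, i)
		dmac_map_area(page_address(sg_page(s)) + s->offset,
			      s->length, dir);

	dmac_barrier();		/* L1 operations complete before L2 starts */

	/* pass 2: outer (L2) maintenance */
	for_each_sg(sg, s, nents, i) {
		unsigned long paddr = page_to_phys(sg_page(s)) + s->offset;

		if (dir == DMA_FROM_DEVICE)
			outer_inv_range(paddr, paddr + s->length);
		else
			outer_clean_range(paddr, paddr + s->length);
	}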

--
Catalin

2010-02-11 10:53:26

by Catalin Marinas

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On Thu, 2010-02-11 at 10:45 +0000, Catalin Marinas wrote:
> On Wed, 2010-02-10 at 21:21 +0000, Russell King - ARM Linux wrote:
> > @@ -345,6 +347,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
> > BUG_ON(!valid_dma_direction(dir));
> >
> > __dma_single_cpu_to_dev(cpu_addr, size, dir);
> > + __dma_barrier(dir);
> >
> > return virt_to_dma(dev, cpu_addr);
> > }
>
> The ___dma_single_cpu_to_dev() covers both inner and outer caches but I
> haven't seen it touched by this patch (nor the other you posted). When
> you clean the L1 cache, you need to make sure that there is a barrier
> (DSB) so that it completes before cleaning the L2, otherwise you clean
> the L2 but data keeps coming from L1.

Actually after L2 maintenance we don't even need the __dma_barrier(), we
need an outer_cache.sync() function.

I can do the outer cache optimisations together with a few others for
PL310 (which does not require the cache_wait() call for line
operations).
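
Illustrative sketch only: an outer_cache.sync() hook does not exist at this
point in the thread. The idea would be one more function pointer in
outer_cache_fns, plus a trivial wrapper, so the final drain is done by the
outer cache itself rather than by __dma_barrier():

struct outer_cache_fns {
	void (*inv_range)(unsigned long, unsigned long);
	void (*clean_range)(unsigned long, unsigned long);
	void (*flush_range)(unsigned long, unsigned long);
	void (*sync)(void);	/* hypothetical: drain outer write buffers */
};

static inline void outer_sync(void)
{
	if (outer_cache.sync)
		outer_cache.sync();
}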

--
Catalin

2010-02-11 10:57:00

by Russell King - ARM Linux

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On Thu, Feb 11, 2010 at 10:45:01AM +0000, Catalin Marinas wrote:
> Alternatively we could use the dsb() macro. I don't think we need more
> than this and we would not (well, not easily) compile ARMv5 and ARMv6 in
> the same kernel.

That doesn't work - ARMv3 and some ARMv4 don't have a 'drain write
buffer' instruction but others do - executing that instruction on
older CPUs which don't have a write buffer causes an illegal
instruction fault.

> The ___dma_single_cpu_to_dev() covers both inner and outer caches but I
> haven't seen it touched by this patch (nor the other you posted). When
> you clean the L1 cache, you need to make sure that there is a barrier
> (DSB) so that it completes before cleaning the L2, otherwise you clean
> the L2 but data keeps coming from L1.
>
> For the *_sg functions, you either use barrier between L1 and L2 for
> each page or you do the for_each_sg() loop twice, once for L1 and
> another for L2.

Okay, that's a fundamental problem with this approach. Spanner in the
works kind of thing. I think that's a problem for Abhijeet's patch
as well - since the same comment appears to apply there too.

Sounds like it needs a totally different approach then.

2010-02-11 11:01:40

by Russell King - ARM Linux

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On Thu, Feb 11, 2010 at 10:53:05AM +0000, Catalin Marinas wrote:
> Actually after L2 maintenance we don't even need the __dma_barrier(), we
> need an outer_cache.sync() function.
>
> I can do the outer cache optimisations together with a few others for
> PL310 (which does not require the cache_wait() call for line
> operations).

I'm in half a mind to say "stop everything for the DMA API and wait
until the next merge window" - what we have at the moment is a big
shake up of how the API is implemented, which has had very little
attributable testing.

Let's get the current code (which missed the last merge window) tested,
acked and merged, and only then sort out these kinds of optimizations
after that. As it is, these DMA patches have had very little in the
way of attributable feedback so far.

2010-02-11 11:03:57

by Catalin Marinas

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

On Thu, 2010-02-11 at 11:01 +0000, Russell King - ARM Linux wrote:
> On Thu, Feb 11, 2010 at 10:53:05AM +0000, Catalin Marinas wrote:
> > Actually after L2 maintenance we don't even need the __dma_barrier(), we
> > need an outer_cache.sync() function.
> >
> > I can do the outer cache optimisations together with a few others for
> > PL310 (which does not require the cache_wait() call for line
> > operations).
>
> I'm in half a mind to say "stop everything for the DMA API and wait
> until the next merge window" - what we have at the moment is a big
> shake up of how the API is implemented, which has had very little
> attributable testing.
>
> Let's get the current code (which missed the last merge window) tested,
> acked and merged, and only then sort out these kinds of optimizations
> after that. As it is, these DMA patches have had very little in the
> way of attributable feedback so far.

I agree, I wasn't planning to submit anything for 2.6.34. These
optimisations should probably get in 2.6.35.

--
Catalin

2010-02-11 19:13:27

by Abhijeet Dharmapurikar

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer

Russell King - ARM Linux wrote:
> On Thu, Feb 11, 2010 at 10:45:01AM +0000, Catalin Marinas wrote:
>> Alternatively we could use the dsb() macro. I don't think we need more
>> than this and we would not (well, not easily) compile ARMv5 and ARMv6 in
>> the same kernel.
>
> That doesn't work - ARMv3 and some ARMv4 don't have a 'drain write
> buffer' instruction but others do - executing that instruction on
> older CPUs which don't have a write buffer causes an illegal
> instruction fault.
>
>> The ___dma_single_cpu_to_dev() covers both inner and outer caches but I
>> haven't seen it touched by this patch (nor the other you posted). When
>> you clean the L1 cache, you need to make sure that there is a barrier
>> (DSB) so that it completes before cleaning the L2, otherwise you clean
>> the L2 but data keeps coming from L1.
>>
>> For the *_sg functions, you either use barrier between L1 and L2 for
>> each page or you do the for_each_sg() loop twice, once for L1 and
>> another for L2.
>
> Okay, that's a fundamental problem with this approach. Spanner in the
> works kind of thing. I think that's a problem for Abhijeet's patch
> as well - since the same comment appears to apply there too.

The problem applies to my patch as well; however, my board has a unified
cache and I didn't think about ordering operations on the outer caches.

> Sounds like it needs a totally different approach then.
How about the following?


From ea746d981f6f7291fd0f8b3f51bdd3747ca976c5 Mon Sep 17 00:00:00 2001
From: Abhijeet Dharmapurikar <[email protected]>
Date: Thu, 11 Feb 2010 10:29:19 -0800
Subject: [PATCH] dma: define map/unmap functions for outer cache

Define map and unmap functions for the outer cache and execute barriers
at the appropriate places within them. For architectures without outer
caches these functions are no-ops.

Signed-off-by: Abhijeet Dharmapurikar <[email protected]>
---
arch/arm/include/asm/cacheflush.h | 39 +++++++++++++++++++++++++++++++++++++
arch/arm/mm/dma-mapping.c | 17 +--------------
2 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 8148a00..3474a54 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -11,6 +11,7 @@
#define _ASMARM_CACHEFLUSH_H

#include <linux/mm.h>
+#include <linux/dma-mapping.h>

#include <asm/glue.h>
#include <asm/shmparam.h>
@@ -300,6 +301,38 @@ static inline void outer_flush_range(unsigned long start, unsigned long end)
outer_cache.flush_range(start, end);
}

+static inline void dmac_outer_map_area(const void *kaddr, size_t size,
+ enum dma_data_direction dir)
+{
+ unsigned long paddr;
+
+ /* complete all the prior L1 operations */
+ dmac_barrier();
+ paddr = __pa(kaddr);
+ if (dir == DMA_FROM_DEVICE) {
+ outer_inv_range(paddr, paddr + size);
+ } else {
+ outer_clean_range(paddr, paddr + size);
+ }
+ /* FIXME: non-speculating: flush on bidirectional mappings? */
+}
+
+static inline void dmac_outer_unmap_area(const void *kaddr, size_t size,
+ enum dma_data_direction dir)
+{
+
+ /* FIXME: non-speculating: not required */
+ /* don't bother invalidating if DMA to device */
+ if (dir != DMA_TO_DEVICE) {
+ unsigned long paddr = __pa(kaddr);
+ outer_inv_range(paddr, paddr + size);
+ }
+
+ /* complete all the outer cache operations */
+ dmac_barrier();
+}
+
+
#else

static inline void outer_inv_range(unsigned long start, unsigned long end)
@@ -308,6 +341,12 @@ static inline void outer_clean_range(unsigned long start, unsigned long end)
{ }
static inline void outer_flush_range(unsigned long start, unsigned long end)
{ }
+static inline void dmac_outer_map_area(const void *kaddr, size_t size,
+ enum dma_data_direction dir)
+{ }
+static inline void dmac_outer_unmap_area(const void *kaddr, size_t size,
+ enum dma_data_direction dir)
+{ }

#endif

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 64daef2..6fff111 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -407,19 +407,11 @@ EXPORT_SYMBOL(dma_free_coherent);
void ___dma_single_cpu_to_dev(const void *kaddr, size_t size,
enum dma_data_direction dir)
{
- unsigned long paddr;
-
BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));

dmac_map_area(kaddr, size, dir);

- paddr = __pa(kaddr);
- if (dir == DMA_FROM_DEVICE) {
- outer_inv_range(paddr, paddr + size);
- } else {
- outer_clean_range(paddr, paddr + size);
- }
- /* FIXME: non-speculating: flush on bidirectional mappings? */
+ dmac_outer_map_area(kaddr, size, dir);
}
EXPORT_SYMBOL(___dma_single_cpu_to_dev);

@@ -428,12 +420,7 @@ void ___dma_single_dev_to_cpu(const void *kaddr, size_t size,
{
BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));

- /* FIXME: non-speculating: not required */
- /* don't bother invalidating if DMA to device */
- if (dir != DMA_TO_DEVICE) {
- unsigned long paddr = __pa(kaddr);
- outer_inv_range(paddr, paddr + size);
- }
+ dmac_outer_unmap_area(kaddr, size, dir);

dmac_unmap_area(kaddr, size, dir);
}
--
1.5.6.3





2010-02-11 21:36:39

by Abhijeet Dharmapurikar

[permalink] [raw]
Subject: Re: [RFC 0/2] fix dma_map_sg not to do barriers for each buffer


> }
> @@ -363,7 +366,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
> * The device owns this memory once this call has completed. The CPU
> * can regain ownership by calling dma_unmap_page().
> */
> -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
> +static inline dma_addr_t __dma_map_page(struct device *dev, struct page *page,
> unsigned long offset, size_t size, enum dma_data_direction dir)
> {
> BUG_ON(!valid_dma_direction(dir));
> @@ -373,6 +376,14 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
> return page_to_dma(dev, page) + offset;
> }
>
> +static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
> + unsigned long offset, size_t size, enum dma_data_direction dir)
> +{
> + dma_addr_t addr = __dma_map_page(dev, page, offset, size, dir);
> + __dma_barrier(dir);
> + return addr;
> +}
> +
> /**
.
.
.
.
> /**
> * dma_map_sg - map a set of SG buffers for streaming mode DMA
> * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
> @@ -532,11 +539,14 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
> int i, j;
>
> for_each_sg(sg, s, nents, i) {
> - s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
> + s->dma_address = __dma_map_page(dev, sg_page(s), s->offset,
> s->length, dir);
> if (dma_mapping_error(dev, s->dma_address))
> goto bad_mapping;
> }
> +
> + __dma_barrier(dir);
> +
> return nents;
>
> bad_mapping:
> @@ -564,6 +574,8 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
>
> for_each_sg(sg, s, nents, i)
> dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
> +
> + __dma_barrier(dir);
> }
> EXPORT_SYMBOL(dma_unmap_sg);

dma_unmap_sg could also use the same indirection as dma_map_sg.
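
A minimal sketch of that -- not part of the posted patch; __dma_unmap_page()
is a hypothetical barrier-less counterpart of dma_unmap_page, mirroring the
__dma_map_page() above:

void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
		  enum dma_data_direction dir)
{
	struct scatterlist *s;
	int i;

	for_each_sg(sg, s, nents, i)
		__dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);

	__dma_barrier(dir);
}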

Thanks for the patch.
Abhijeet