stnp performs a non-temporal store, giving the memory system a hint that
caching this data is not useful. However, the scenarios in which
copy_page() is used may not match that assumption. Admittedly, there are
cases where stnp does help performance; in those cases we can rely on the
HW write streaming mechanism present in some implementations, such as
Cortex-A55, to detect the streaming pattern and take the appropriate action.
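
For reference, a minimal illustrative sketch of the two store forms (not
taken from copy_page.S): both store the register pair x2/x3 to [x0], and
stnp only adds the non-temporal hint.

	stp	x2, x3, [x0]	// regular store pair: data may be allocated into the cache
	stnp	x2, x3, [x0]	// non-temporal store pair: hints that the data need not be cached
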
Testing with https://github.com/apinski-cavium/copy_page_benchmark shows
that this patch reduces the copy time by about 3% on Cortex-A55 platforms.
Signed-off-by: Jisheng Zhang <[email protected]>
---
arch/arm64/lib/copy_page.S | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index 6a56d7cf309d..4c74fe2d8bd6 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -32,21 +32,21 @@ SYM_FUNC_START(__pi_copy_page)
1:
tst x0, #(PAGE_SIZE - 1)
- stnp x2, x3, [x0, #-256]
+ stp x2, x3, [x0, #-256]
ldp x2, x3, [x1]
- stnp x4, x5, [x0, #16 - 256]
+ stp x4, x5, [x0, #16 - 256]
ldp x4, x5, [x1, #16]
- stnp x6, x7, [x0, #32 - 256]
+ stp x6, x7, [x0, #32 - 256]
ldp x6, x7, [x1, #32]
- stnp x8, x9, [x0, #48 - 256]
+ stp x8, x9, [x0, #48 - 256]
ldp x8, x9, [x1, #48]
- stnp x10, x11, [x0, #64 - 256]
+ stp x10, x11, [x0, #64 - 256]
ldp x10, x11, [x1, #64]
- stnp x12, x13, [x0, #80 - 256]
+ stp x12, x13, [x0, #80 - 256]
ldp x12, x13, [x1, #80]
- stnp x14, x15, [x0, #96 - 256]
+ stp x14, x15, [x0, #96 - 256]
ldp x14, x15, [x1, #96]
- stnp x16, x17, [x0, #112 - 256]
+ stp x16, x17, [x0, #112 - 256]
ldp x16, x17, [x1, #112]
add x0, x0, #128
@@ -54,14 +54,14 @@ SYM_FUNC_START(__pi_copy_page)
b.ne 1b
- stnp x2, x3, [x0, #-256]
- stnp x4, x5, [x0, #16 - 256]
- stnp x6, x7, [x0, #32 - 256]
- stnp x8, x9, [x0, #48 - 256]
- stnp x10, x11, [x0, #64 - 256]
- stnp x12, x13, [x0, #80 - 256]
- stnp x14, x15, [x0, #96 - 256]
- stnp x16, x17, [x0, #112 - 256]
+ stp x2, x3, [x0, #-256]
+ stp x4, x5, [x0, #16 - 256]
+ stp x6, x7, [x0, #32 - 256]
+ stp x8, x9, [x0, #48 - 256]
+ stp x10, x11, [x0, #64 - 256]
+ stp x12, x13, [x0, #80 - 256]
+ stp x14, x15, [x0, #96 - 256]
+ stp x16, x17, [x0, #112 - 256]
ret
SYM_FUNC_END(__pi_copy_page)
--
2.43.0