0) Background:
We found that AmpereOne benefits from aggressive prefetches when
using 4K page size.
1) This patch:
1.1) adds a new WORKAROUND_AMPERE_AC03_PREFETCH capability.
1.2) uses MIDR_AMPERE1 to match the processor.
1.3) uses alternative_if to patch in the alternative code path
for AmpereOne.
1.4) adds software prefetches to the large-copy loop.
It also adds an add_prefetch macro.
2) Test result:
With hugetlbfs or tmpfs, we see a large sequential-read performance
improvement of up to 1.3x ~ 1.4x.
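For reference, below is a minimal sketch of the kind of sequential-read
test described above; the file path, buffer size and timing method are
illustrative placeholders, not the exact setup used for the numbers.
Reads from a tmpfs/hugetlbfs-backed file always hit the page cache, so
the read() time is dominated by the kernel copy routine this series
touches (copy_to_user()).

/* seqread.c - sketch of a sequential-read microbenchmark (assumed setup) */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/dev/shm/seqread.dat";      /* pre-created tmpfs file */
        const size_t bufsz = 1UL << 20;                  /* 1 MiB per read() */
        char *buf = malloc(bufsz);
        int fd = open(path, O_RDONLY);
        struct timespec t0, t1;
        ssize_t n;
        size_t total = 0;

        if (fd < 0 || !buf) {
                perror("setup");
                return 1;
        }

        clock_gettime(CLOCK_MONOTONIC, &t0);
        while ((n = read(fd, buf, bufsz)) > 0)           /* exercises copy_to_user() */
                total += n;
        clock_gettime(CLOCK_MONOTONIC, &t1);

        double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
        printf("%zu bytes in %.3f s (%.2f GiB/s)\n",
               total, secs, total / secs / (1UL << 30));

        free(buf);
        close(fd);
        return 0;
}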
Huang Shijie (4):
extable: add __sort_main_extable
arm64: alternative: handle the kernel exception table
arm64: copy_template.S: add loop_for_copy_128_bytes macro
arm64: add software prefetches for AmpereOne
arch/arm64/Kconfig.platforms | 7 +++
arch/arm64/kernel/alternative.c | 18 +++++++
arch/arm64/kernel/cpu_errata.c | 9 ++++
arch/arm64/lib/copy_template.S | 87 +++++++++++++++++++++++----------
arch/arm64/tools/cpucaps | 1 +
include/linux/extable.h | 2 +
kernel/extable.c | 8 ++-
7 files changed, 105 insertions(+), 27 deletions(-)
--
2.40.1
The AmpereOne (arm64) chip may change the kernel exception table at
boot time, so the table needs to be re-sorted during boot.
Introduce __sort_main_extable(), which sorts the kernel exception
table.
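For context, exception-table lookups are binary searches over the
sorted __ex_table array, which is why any boot-time modification of an
entry must be followed by a re-sort. A rough userspace sketch of that
lookup shape follows; it is not a verbatim copy of kernel/extable.c or
lib/extable.c, and the arm64-specific type/data fields are omitted.

#include <stdlib.h>

struct exception_table_entry {
        int insn;       /* faulting instruction, as an offset from &insn */
        int fixup;      /* fixup code, as an offset from &fixup */
};

static int cmp_ex_search(const void *key, const void *elt)
{
        const struct exception_table_entry *e = elt;
        unsigned long pc = *(const unsigned long *)key;
        unsigned long entry_pc = (unsigned long)&e->insn + e->insn;

        if (pc < entry_pc)
                return -1;
        if (pc > entry_pc)
                return 1;
        return 0;
}

const struct exception_table_entry *
search_extable_sketch(const struct exception_table_entry *base,
                      int num_entries, unsigned long pc)
{
        /* bsearch() is only correct if the entries are sorted by entry_pc. */
        return bsearch(&pc, base, num_entries, sizeof(*base), cmp_ex_search);
}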
Signed-off-by: Huang Shijie <[email protected]>
---
include/linux/extable.h | 2 ++
kernel/extable.c | 8 +++++++-
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/include/linux/extable.h b/include/linux/extable.h
index 4ab9e78f313b..ef70ec3633b0 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -15,6 +15,8 @@ search_extable(const struct exception_table_entry *base,
void sort_extable(struct exception_table_entry *start,
struct exception_table_entry *finish);
void sort_main_extable(void);
+void __sort_main_extable(void);
+
void trim_init_extable(struct module *m);
/* Given an address, look for it in the exception tables */
diff --git a/kernel/extable.c b/kernel/extable.c
index 71f482581cab..0fbe0ccb1c3a 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -32,13 +32,19 @@ extern struct exception_table_entry __stop___ex_table[];
/* Cleared by build time tools if the table is already sorted. */
u32 __initdata __visible main_extable_sort_needed = 1;
+void __sort_main_extable(void)
+{
+ if (&__stop___ex_table > &__start___ex_table)
+ sort_extable(__start___ex_table, __stop___ex_table);
+}
+
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
if (main_extable_sort_needed &&
&__stop___ex_table > &__start___ex_table) {
pr_notice("Sorting __ex_table...\n");
- sort_extable(__start___ex_table, __stop___ex_table);
+ __sort_main_extable();
}
}
--
2.40.1
0) Background:
We found that AmpereOne benefits from aggressive prefetches when
using 4K page size.
1) This patch:
1.1) adds a new WORKAROUND_AMPERE_AC03_PREFETCH capability.
1.2) uses MIDR_AMPERE1 to match the processor.
1.3) uses alternative_if to patch in the alternative code path
for AmpereOne.
1.4) adds software prefetches to the large-copy loop.
It also adds an add_prefetch macro.
2) Test result:
With hugetlbfs or tmpfs, we see a large sequential-read performance
improvement of up to 1.3x ~ 1.4x.
Signed-off-by: Huang Shijie <[email protected]>
---
arch/arm64/Kconfig.platforms | 7 +++++++
arch/arm64/kernel/cpu_errata.c | 9 +++++++++
arch/arm64/lib/copy_template.S | 31 +++++++++++++++++++++++++++++++
arch/arm64/tools/cpucaps | 1 +
4 files changed, 48 insertions(+)
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 6069120199bb..74ab8bea0019 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -8,6 +8,13 @@ config ARCH_ACTIONS
help
This enables support for the Actions Semiconductor S900 SoC family.
+config ARCH_AMPEREONE
+ bool "AmpereOne Platforms"
+ help
+ This enables support for the ARMv8 based AmpereOne chipsets.
+ AmpereOne is the next generation of Cloud Native Processors from
+ Ampere.
+
config ARCH_SUNXI
bool "Allwinner sunxi 64-bit SoC Family"
select ARCH_HAS_RESET_CONTROLLER
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 5706e74c5578..c0060d3086d0 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -744,6 +744,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
.capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38,
ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1),
},
+#endif
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+ {
+ .desc = "Optimization for AmpereOne chip",
+ .capability = ARM64_WORKAROUND_AMPERE_AC03_PREFETCH,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = is_affected_midr_range,
+ .midr_range = MIDR_ALL_VERSIONS(MIDR_AMPERE1)
+ },
#endif
{
}
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 79b32569260c..b707c3ec6820 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -41,6 +41,18 @@
b.ne .Ltail63
.endm
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+.macro add_prefetch
+ /*
+ * Prefetch two cache lines ahead with prfm to improve
+ * performance. The 2K offset gave the best results in
+ * our tests.
+ */
+ prfm pldl2keep, [src, #2048]
+ prfm pldl2keep, [src, #2112]
+.endm
+#endif
+
/*
* Copy a buffer from src to dest (alignment handled by the hardware)
*
@@ -156,6 +168,13 @@ D_h .req x14
b .Lexitfunc
.Lcpy_over64:
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+alternative_if ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
+ cmp count, #PAGE_SIZE
+ b.ge .Lcpy_over_pagesize
+alternative_else_nop_endif
+#endif
+
subs count, count, #128
b.ge .Lcpy_body_large
/*
@@ -182,4 +201,16 @@ D_h .req x14
.p2align L1_CACHE_SHIFT
.Lcpy_body_large:
loop_for_copy_128_bytes
+
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+ b .Lexitfunc
+
+ .p2align L1_CACHE_SHIFT
+.Lcpy_over_pagesize:
+alternative_if ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
+ subs count, count, #128
+ loop_for_copy_128_bytes add_prefetch
+alternative_else_nop_endif
+#endif
+
.Lexitfunc:
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index dea3dc89234b..13e197abf249 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -100,3 +100,4 @@ WORKAROUND_NVIDIA_CARMEL_CNP
WORKAROUND_QCOM_FALKOR_E1003
WORKAROUND_REPEAT_TLBI
WORKAROUND_SPECULATIVE_AT
+WORKAROUND_AMPERE_AC03_PREFETCH
--
2.40.1
In arch/arm64/lib/copy_template.S, some macros, such as
ldp1/stp1/strh1/str1, create kernel exception table entries. The current
alternative_if does not support embedding them.
This patch adds support for embedding them in alternative_if:
1.) Fix up @insn in the entry so it points at the right address.
2.) Re-sort the kernel exception table.
See the sketch below for the offset arithmetic behind step 1.
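A sketch of why the fix-up works, assuming the relative
exception_table_entry layout used on arm64 (the real arm64 structure
also carries type and data fields, omitted here):

struct exception_table_entry {
        int insn;       /* faulting instruction, as an offset from &insn */
        int fixup;      /* fixup handler, as an offset from &fixup */
};

/* Absolute address of the instruction this entry currently refers to. */
static unsigned long ex_insn_addr(const struct exception_table_entry *e)
{
        return (unsigned long)&e->insn + e->insn;
}

/*
 * An ldp1/stp1-style macro used inside an alternative creates an entry
 * that refers to the replacement instruction in .altinstr_replacement.
 * Once the alternative is applied, that instruction actually lives at
 * @origptr, so the entry has to be re-pointed there:
 */
static void repoint_entry(struct exception_table_entry *e, void *origptr)
{
        e->insn = (int)((long)origptr - (long)&e->insn);
}

/*
 * Because the table is sorted by the address ex_insn_addr() yields,
 * rewriting e->insn can break the ordering; that is what the
 * __sort_main_extable() call from the previous patch fixes up.
 */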
Signed-off-by: Huang Shijie <[email protected]>
---
arch/arm64/kernel/alternative.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 8ff6610af496..4c73f9cc9a85 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -19,6 +19,7 @@
#include <asm/sections.h>
#include <asm/vdso.h>
#include <linux/stop_machine.h>
+#include <linux/extable.h>
#define __ALT_PTR(a, f) ((void *)&(a)->f + (a)->f)
#define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset)
@@ -101,6 +102,22 @@ static __always_inline u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr,
return insn;
}
+/* Check the kernel exception table */
+static void check_extable(__le32 *origptr, __le32 *updptr, __le32 *replptr)
+{
+ struct exception_table_entry *e;
+ unsigned long addr = (unsigned long)replptr;
+
+ e = (struct exception_table_entry *)search_kernel_exception_table(addr);
+ if (e) {
+ /* Modify the @insn to the right address */
+ e->insn = cpu_to_le32((int)((long)(origptr) - (long)(&e->insn)));
+
+ /* Sort the kernel exception table */
+ __sort_main_extable();
+ }
+}
+
static noinstr void patch_alternative(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst)
{
@@ -112,6 +129,7 @@ static noinstr void patch_alternative(struct alt_instr *alt,
u32 insn;
insn = get_alt_insn(alt, origptr + i, replptr + i);
+ check_extable(origptr + i, updptr + i, replptr + i);
updptr[i] = cpu_to_le32(insn);
}
}
--
2.40.1
Add the loop_for_copy_128_bytes macro to make the code cleaner,
and to prepare for the next patch.
Signed-off-by: Huang Shijie <[email protected]>
---
arch/arm64/lib/copy_template.S | 58 ++++++++++++++++++----------------
1 file changed, 31 insertions(+), 27 deletions(-)
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..79b32569260c 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -10,6 +10,36 @@
* files/head:/src/aarch64/
*/
+.macro loop_for_copy_128_bytes extra_ops
+ /* pre-get 64 bytes data. */
+ ldp1 A_l, A_h, src, #16
+ ldp1 B_l, B_h, src, #16
+ ldp1 C_l, C_h, src, #16
+ ldp1 D_l, D_h, src, #16
+1:
+ \extra_ops
+ /*
+ * interlace the load of next 64 bytes data block with store of the last
+ * loaded 64 bytes data.
+ */
+ stp1 A_l, A_h, dst, #16
+ ldp1 A_l, A_h, src, #16
+ stp1 B_l, B_h, dst, #16
+ ldp1 B_l, B_h, src, #16
+ stp1 C_l, C_h, dst, #16
+ ldp1 C_l, C_h, src, #16
+ stp1 D_l, D_h, dst, #16
+ ldp1 D_l, D_h, src, #16
+ subs count, count, #64
+ b.ge 1b
+ stp1 A_l, A_h, dst, #16
+ stp1 B_l, B_h, dst, #16
+ stp1 C_l, C_h, dst, #16
+ stp1 D_l, D_h, dst, #16
+
+ tst count, #0x3f
+ b.ne .Ltail63
+.endm
/*
* Copy a buffer from src to dest (alignment handled by the hardware)
@@ -151,31 +181,5 @@ D_h .req x14
*/
.p2align L1_CACHE_SHIFT
.Lcpy_body_large:
- /* pre-get 64 bytes data. */
- ldp1 A_l, A_h, src, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- ldp1 D_l, D_h, src, #16
-1:
- /*
- * interlace the load of next 64 bytes data block with store of the last
- * loaded 64 bytes data.
- */
- stp1 A_l, A_h, dst, #16
- ldp1 A_l, A_h, src, #16
- stp1 B_l, B_h, dst, #16
- ldp1 B_l, B_h, src, #16
- stp1 C_l, C_h, dst, #16
- ldp1 C_l, C_h, src, #16
- stp1 D_l, D_h, dst, #16
- ldp1 D_l, D_h, src, #16
- subs count, count, #64
- b.ge 1b
- stp1 A_l, A_h, dst, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- stp1 D_l, D_h, dst, #16
-
- tst count, #0x3f
- b.ne .Ltail63
+ loop_for_copy_128_bytes
.Lexitfunc:
--
2.40.1
On Wed, Nov 22, 2023 at 05:28:51PM +0800, Huang Shijie wrote:
> 0) Background:
> We found that AmpereOne benefits from aggressive prefetches when
> using 4K page size.
We tend to shy away from micro-architecture specific optimisations in
the arm64 kernel as they're pretty unmaintainable, hard to test properly,
generally lead to bloat and add additional obstacles to updating our
library routines.
Admittedly, we have something for Thunder-X1 in copy_page() (disguised
as ARM64_HAS_NO_HW_PREFETCH) but, frankly, that machine needed all the
help it could get and given where it is today I suspect we could drop
that code without any material consequences.
So I'd really prefer not to merge this; modern CPUs should do better at
copying data. It's copy_to_user(), not rocket science.
Will
On 2023-11-22 9:28 am, Huang Shijie wrote:
> 0) Background:
> We found that AmpereOne benefits from aggressive prefetches when
> using 4K page size.
>
> 1) This patch:
> 1.1) adds a new WORKAROUND_AMPERE_AC03_PREFETCH capability.
> 1.2) uses MIDR_AMPERE1 to match the processor.
> 1.3) uses alternative_if to patch in the alternative code path
> for AmpereOne.
> 1.4) adds software prefetches to the large-copy loop.
> It also adds an add_prefetch macro.
>
> 2) Test result:
> With hugetlbfs or tmpfs, we see a large sequential-read performance
> improvement of up to 1.3x ~ 1.4x.
Frankly the copy_template code is pretty terrible anyway, so the fact
that you're not touching anything *else* (memcpy(), copy_page(), etc.)
makes me wonder whether you'd benefit from just a better baseline to
begin with (unless the underlying concern really is something more
specific like the hardware prefetcher failing to recognise LDTR/STTR).
The last attempt to improve this derailed into questioning the usercopy
API semantics themselves, but for reference that would be my original
patches at [0] (more optimised, but some copy_to_user() fault fixups are
buggy), and/or Mark's follow-up at [1] (less aggressive but still better
than the current code, and doesn't touch copy_from_user()).
Thanks,
Robin.
[0]
https://lore.kernel.org/linux-arm-kernel/[email protected]/
[1]
https://lore.kernel.org/linux-arch/[email protected]/
> Signed-off-by: Huang Shijie <[email protected]>
> ---
> arch/arm64/Kconfig.platforms | 7 +++++++
> arch/arm64/kernel/cpu_errata.c | 9 +++++++++
> arch/arm64/lib/copy_template.S | 31 +++++++++++++++++++++++++++++++
> arch/arm64/tools/cpucaps | 1 +
> 4 files changed, 48 insertions(+)
>
> diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
> index 6069120199bb..74ab8bea0019 100644
> --- a/arch/arm64/Kconfig.platforms
> +++ b/arch/arm64/Kconfig.platforms
> @@ -8,6 +8,13 @@ config ARCH_ACTIONS
> help
> This enables support for the Actions Semiconductor S900 SoC family.
>
> +config ARCH_AMPEREONE
> + bool "AmpereOne Platforms"
> + help
> + This enables support for the ARMv8 based AmpereOne chipsets.
> + AmpereOne is the next generation of Cloud Native Processors from
> + Ampere.
> +
> config ARCH_SUNXI
> bool "Allwinner sunxi 64-bit SoC Family"
> select ARCH_HAS_RESET_CONTROLLER
> diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
> index 5706e74c5578..c0060d3086d0 100644
> --- a/arch/arm64/kernel/cpu_errata.c
> +++ b/arch/arm64/kernel/cpu_errata.c
> @@ -744,6 +744,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
> .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38,
> ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1),
> },
> +#endif
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> + {
> + .desc = "Optimization for AmpereOne chip",
> + .capability = ARM64_WORKAROUND_AMPERE_AC03_PREFETCH,
> + .type = ARM64_CPUCAP_SYSTEM_FEATURE,
> + .matches = is_affected_midr_range,
> + .midr_range = MIDR_ALL_VERSIONS(MIDR_AMPERE1)
> + },
> #endif
> {
> }
> diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
> index 79b32569260c..b707c3ec6820 100644
> --- a/arch/arm64/lib/copy_template.S
> +++ b/arch/arm64/lib/copy_template.S
> @@ -41,6 +41,18 @@
> b.ne .Ltail63
> .endm
>
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> +.macro add_prefetch
> + /*
> + * Prefetch two cache lines ahead with prfm to improve
> + * performance. The 2K offset gave the best results in
> + * our tests.
> + */
> + prfm pldl2keep, [src, #2048]
> + prfm pldl2keep, [src, #2112]
> +.endm
> +#endif
> +
> /*
> * Copy a buffer from src to dest (alignment handled by the hardware)
> *
> @@ -156,6 +168,13 @@ D_h .req x14
> b .Lexitfunc
>
> .Lcpy_over64:
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> +alternative_if ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
> + cmp count, #PAGE_SIZE
> + b.ge .Lcpy_over_pagesize
> +alternative_else_nop_endif
> +#endif
> +
> subs count, count, #128
> b.ge .Lcpy_body_large
> /*
> @@ -182,4 +201,16 @@ D_h .req x14
> .p2align L1_CACHE_SHIFT
> .Lcpy_body_large:
> loop_for_copy_128_bytes
> +
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> + b .Lexitfunc
> +
> + .p2align L1_CACHE_SHIFT
> +.Lcpy_over_pagesize:
> +alternative_if ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
> + subs count, count, #128
> + loop_for_copy_128_bytes add_prefetch
> +alternative_else_nop_endif
> +#endif
> +
> .Lexitfunc:
> diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
> index dea3dc89234b..13e197abf249 100644
> --- a/arch/arm64/tools/cpucaps
> +++ b/arch/arm64/tools/cpucaps
> @@ -100,3 +100,4 @@ WORKAROUND_NVIDIA_CARMEL_CNP
> WORKAROUND_QCOM_FALKOR_E1003
> WORKAROUND_REPEAT_TLBI
> WORKAROUND_SPECULATIVE_AT
> +WORKAROUND_AMPERE_AC03_PREFETCH
On Wed, Nov 22, 2023 at 09:48:57AM +0000, Will Deacon wrote:
> On Wed, Nov 22, 2023 at 05:28:51PM +0800, Huang Shijie wrote:
> > 0) Background:
> > We found that AmpereOne benefits from aggressive prefetches when
> > using 4K page size.
>
> We tend to shy away from micro-architecture specific optimisations in
> the arm64 kernel as they're pretty unmaintainable, hard to test properly,
> generally lead to bloat and add additional obstacles to updating our
> library routines.
>
> Admittedly, we have something for Thunder-X1 in copy_page() (disguised
> as ARM64_HAS_NO_HW_PREFETCH) but, frankly, that machine needed all the
> help it could get and given where it is today I suspect we could drop
> that code without any material consequences.
>
> So I'd really prefer not to merge this; modern CPUs should do better at
> copying data. It's copy_to_user(), not rocket science.
I agree, and I'd also like to drop ARM64_HAS_NO_HW_PREFETCH.
Mark.
On Wed, 22 Nov 2023 11:40:09 +0000,
Mark Rutland <[email protected]> wrote:
>
> On Wed, Nov 22, 2023 at 09:48:57AM +0000, Will Deacon wrote:
> > On Wed, Nov 22, 2023 at 05:28:51PM +0800, Huang Shijie wrote:
> > > 0) Background:
> > > We found that AmpereOne benefits from aggressive prefetches when
> > > using 4K page size.
> >
> > We tend to shy away from micro-architecture specific optimisations in
> > the arm64 kernel as they're pretty unmaintainable, hard to test properly,
> > generally lead to bloat and add additional obstacles to updating our
> > library routines.
> >
> > Admittedly, we have something for Thunder-X1 in copy_page() (disguised
> > as ARM64_HAS_NO_HW_PREFETCH) but, frankly, that machine needed all the
> > help it could get and given where it is today I suspect we could drop
> > that code without any material consequences.
> >
> > So I'd really prefer not to merge this; modern CPUs should do better at
> > copying data. It's copy_to_user(), not rocket science.
>
> I agree, and I'd also like to drop ARM64_HAS_NO_HW_PREFETCH.
+1. Also, as the (most probably) sole user of this remarkable
implementation, I hacked -rc2 to drop ARM64_HAS_NO_HW_PREFETCH. The
result is that a kernel compilation job regressed by 0.4%, something
that I consider to be pure noise.
If nobody beats me to it, I'll send the patch.
M.
--
Without deviation from the norm, progress is not possible.
On Wed, Nov 22, 2023 at 10:29 AM Huang Shijie
<[email protected]> wrote:
> 0) Background:
> We found that AmpereOne benefits from aggressive prefetches when
> using 4K page size.
>
> 1) This patch:
> 1.1) adds a new WORKAROUND_AMPERE_AC03_PREFETCH capability.
> 1.2) uses MIDR_AMPERE1 to match the processor.
> 1.3) uses alternative_if to patch in the alternative code path
> for AmpereOne.
> 1.4) adds software prefetches to the large-copy loop.
> It also adds an add_prefetch macro.
>
> 2) Test result:
> With hugetlbfs or tmpfs, we see a large sequential-read performance
> improvement of up to 1.3x ~ 1.4x.
In June 2022, Fujitsu tried to add a similar feature for A64FX; here is
the essence of my feedback from back then, which applies here as well:
https://lore.kernel.org/linux-arm-kernel/CACRpkdbPLFOoPdX4L6ABV8GKpC8cQGP3s2aN2AvRHEK49U9VMg@mail.gmail.com/#t
TL;DR: this is a hack; if you want to accelerate the memory hierarchy,
work with the MM developers to figure out how to do that in a
structured and scientific way that will work with any prefetching hardware
on any CPU.
Yours,
Linus Walleij