LinuxLists.cc - [PATCH v3] kasan: speed up mte_set_mem_tag

2021-05-19 09:20:26

Subject: [PATCH v3] kasan: speed up mte_set_mem_tag_range

Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.

The first cacheline is always tagged using STG/STZG even if the address is
cacheline-aligned, as benchmarks show it is faster than a conditional
branch.

Signed-off-by: Evgenii Stepanov <[email protected]>
Co-developed-by: Peter Collingbourne <[email protected]>
Signed-off-by: Peter Collingbourne <[email protected]>
---
Changelog since v1:
- Added Co-developed-by.

Changelog since v2:
- Added Signed-off-by.

arch/arm64/include/asm/mte-kasan.h | 40 +------------------
arch/arm64/lib/Makefile | 2 +
arch/arm64/lib/mte-kasan.S | 63 ++++++++++++++++++++++++++++++
3 files changed, 66 insertions(+), 39 deletions(-)
create mode 100644 arch/arm64/lib/mte-kasan.S

diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index ddd4d17cf9a0..e29a0e2ab35c 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -48,45 +48,7 @@ static inline u8 mte_get_random_tag(void)
return mte_get_ptr_tag(addr);
}

-/*
- * Assign allocation tags for a region of memory based on the pointer tag.
- * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
- * size must be non-zero and MTE_GRANULE_SIZE aligned.
- */
-static inline void mte_set_mem_tag_range(void *addr, size_t size,
- u8 tag, bool init)
-{
- u64 curr, end;
-
- if (!size)
- return;
-
- curr = (u64)__tag_set(addr, tag);
- end = curr + size;
-
- /*
- * 'asm volatile' is required to prevent the compiler to move
- * the statement outside of the loop.
- */
- if (init) {
- do {
- asm volatile(__MTE_PREAMBLE "stzg %0, [%0]"
- :
- : "r" (curr)
- : "memory");
- curr += MTE_GRANULE_SIZE;
- } while (curr != end);
- } else {
- do {
- asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
- :
- : "r" (curr)
- : "memory");
- curr += MTE_GRANULE_SIZE;
- } while (curr != end);
- }
-}
-
+void mte_set_mem_tag_range(void *addr, size_t size, u8 tag, bool init);
void mte_enable_kernel_sync(void);
void mte_enable_kernel_async(void);
void mte_init_tags(u64 max_tag);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index d31e1169d9b8..c06ada79a437 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o

obj-$(CONFIG_ARM64_MTE) += mte.o
+
+obj-$(CONFIG_KASAN_HW_TAGS) += mte-kasan.o
diff --git a/arch/arm64/lib/mte-kasan.S b/arch/arm64/lib/mte-kasan.S
new file mode 100644
index 000000000000..9f6975e2af60
--- /dev/null
+++ b/arch/arm64/lib/mte-kasan.S
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Google Inc.
+ */
+#include <linux/const.h>
+#include <linux/linkage.h>
+
+#include <asm/mte-def.h>
+
+ .arch armv8.5-a+memtag
+
+ .macro __set_mem_tag_range, stg, gva, start, size, linesize, tmp1, tmp2, tmp3
+ add \tmp3, \start, \size
+ cmp \size, \linesize, lsl #1
+ b.lt .Lsmtr3_\@
+
+ sub \tmp1, \linesize, #1
+ bic \tmp2, \tmp3, \tmp1
+ orr \tmp1, \start, \tmp1
+
+.Lsmtr1_\@:
+ \stg \start, [\start], #MTE_GRANULE_SIZE
+ cmp \start, \tmp1
+ b.lt .Lsmtr1_\@
+
+.Lsmtr2_\@:
+ dc \gva, \start
+ add \start, \start, \linesize
+ cmp \start, \tmp2
+ b.lt .Lsmtr2_\@
+
+.Lsmtr3_\@:
+ cmp \start, \tmp3
+ b.ge .Lsmtr4_\@
+ \stg \start, [\start], #MTE_GRANULE_SIZE
+ b .Lsmtr3_\@
+.Lsmtr4_\@:
+ .endm
+
+/*
+ * Assign allocation tags for a region of memory based on the pointer tag.
+ * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
+ * size must be non-zero and MTE_GRANULE_SIZE aligned.
+ * x0 - start address
+ * x1 - region size
+ * x2 - tag
+ * x3 - bool init
+ */
+SYM_FUNC_START(mte_set_mem_tag_range)
+ mrs x4, dczid_el0
+ and w4, w4, #0xf
+ mov x5, #4
+ lsl x4, x5, x4
+
+ bfi x0, x2, #56, #8
+
+ cbz x3, .Lnoinit
+ __set_mem_tag_range stzg, gzva, x0, x1, x4, x2, x3, x5
+ ret
+.Lnoinit:
+ __set_mem_tag_range stg, gva, x0, x1, x4, x2, x3, x5
+ ret
+SYM_FUNC_END(mte_set_mem_tag_range)
--
2.31.1.751.gd2f1c929bd-goog

2021-05-19 18:26:50

by Catalin Marinas

[permalink] [raw]

Subject: Re: [PATCH v3] kasan: speed up mte_set_mem_tag_range

On Mon, May 17, 2021 at 04:55:46PM -0700, Evgenii Stepanov wrote:
> Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.
>
> The first cacheline is always tagged using STG/STZG even if the address is
> cacheline-aligned, as benchmarks show it is faster than a conditional
> branch.
[...]
> diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
> index ddd4d17cf9a0..e29a0e2ab35c 100644
> --- a/arch/arm64/include/asm/mte-kasan.h
> +++ b/arch/arm64/include/asm/mte-kasan.h
> @@ -48,45 +48,7 @@ static inline u8 mte_get_random_tag(void)
> return mte_get_ptr_tag(addr);
> }
>
> -/*
> - * Assign allocation tags for a region of memory based on the pointer tag.
> - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
> - * size must be non-zero and MTE_GRANULE_SIZE aligned.
> - */
> -static inline void mte_set_mem_tag_range(void *addr, size_t size,
> - u8 tag, bool init)

With commit 2cb34276427a ("arm64: kasan: simplify and inline MTE
functions") you wanted this inlined for performance. Does this not
matter much that it's now out of line?

> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index d31e1169d9b8..c06ada79a437 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
> obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
>
> obj-$(CONFIG_ARM64_MTE) += mte.o
> +
> +obj-$(CONFIG_KASAN_HW_TAGS) += mte-kasan.o
> diff --git a/arch/arm64/lib/mte-kasan.S b/arch/arm64/lib/mte-kasan.S
> new file mode 100644
> index 000000000000..9f6975e2af60
> --- /dev/null
> +++ b/arch/arm64/lib/mte-kasan.S
> @@ -0,0 +1,63 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2021 Google Inc.
> + */
> +#include <linux/const.h>
> +#include <linux/linkage.h>
> +
> +#include <asm/mte-def.h>
> +
> + .arch armv8.5-a+memtag
> +
> + .macro __set_mem_tag_range, stg, gva, start, size, linesize, tmp1, tmp2, tmp3
> + add \tmp3, \start, \size
> + cmp \size, \linesize, lsl #1
> + b.lt .Lsmtr3_\@

We could do with some comments here. Why the lsl #1? I think I get it
but it would be good to make this more readable.

It may be easier if you placed it in a file on its own (as it is now but
with a less generic file name) and use a few .req instead of the tmpX.
You can use the macro args only for the stg/gva.

> +
> + sub \tmp1, \linesize, #1
> + bic \tmp2, \tmp3, \tmp1
> + orr \tmp1, \start, \tmp1
> +
> +.Lsmtr1_\@:
> + \stg \start, [\start], #MTE_GRANULE_SIZE
> + cmp \start, \tmp1
> + b.lt .Lsmtr1_\@
> +
> +.Lsmtr2_\@:
> + dc \gva, \start
> + add \start, \start, \linesize
> + cmp \start, \tmp2
> + b.lt .Lsmtr2_\@
> +
> +.Lsmtr3_\@:
> + cmp \start, \tmp3
> + b.ge .Lsmtr4_\@
> + \stg \start, [\start], #MTE_GRANULE_SIZE
> + b .Lsmtr3_\@
> +.Lsmtr4_\@:
> + .endm

If we want to get the best performance out of this, we should look at
the memset implementation and do something similar. In principle it's
not that far from a memzero, though depending on the microarchitecture
it may behave slightly differently.

Anyway, before that I wonder if we wrote all this in C + inline asm
(three while loops or maybe two and some goto), what's the performance
difference? It has the advantage of being easier to maintain even if we
used some C macros to generate gva/gzva variants.

--
Catalin

2021-05-19 18:30:53

by Peter Collingbourne

[permalink] [raw]

Subject: Re: [PATCH v3] kasan: speed up mte_set_mem_tag_range

On Tue, May 18, 2021 at 10:44 AM Catalin Marinas
<[email protected]> wrote:
>
> On Mon, May 17, 2021 at 04:55:46PM -0700, Evgenii Stepanov wrote:
> > Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.
> >
> > The first cacheline is always tagged using STG/STZG even if the address is
> > cacheline-aligned, as benchmarks show it is faster than a conditional
> > branch.
> [...]
> > diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
> > index ddd4d17cf9a0..e29a0e2ab35c 100644
> > --- a/arch/arm64/include/asm/mte-kasan.h
> > +++ b/arch/arm64/include/asm/mte-kasan.h
> > @@ -48,45 +48,7 @@ static inline u8 mte_get_random_tag(void)
> > return mte_get_ptr_tag(addr);
> > }
> >
> > -/*
> > - * Assign allocation tags for a region of memory based on the pointer tag.
> > - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
> > - * size must be non-zero and MTE_GRANULE_SIZE aligned.
> > - */
> > -static inline void mte_set_mem_tag_range(void *addr, size_t size,
> > - u8 tag, bool init)
>
> With commit 2cb34276427a ("arm64: kasan: simplify and inline MTE
> functions") you wanted this inlined for performance. Does this not
> matter much that it's now out of line?
>
> > diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> > index d31e1169d9b8..c06ada79a437 100644
> > --- a/arch/arm64/lib/Makefile
> > +++ b/arch/arm64/lib/Makefile
> > @@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
> > obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> >
> > obj-$(CONFIG_ARM64_MTE) += mte.o
> > +
> > +obj-$(CONFIG_KASAN_HW_TAGS) += mte-kasan.o
> > diff --git a/arch/arm64/lib/mte-kasan.S b/arch/arm64/lib/mte-kasan.S
> > new file mode 100644
> > index 000000000000..9f6975e2af60
> > --- /dev/null
> > +++ b/arch/arm64/lib/mte-kasan.S
> > @@ -0,0 +1,63 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (C) 2021 Google Inc.
> > + */
> > +#include <linux/const.h>
> > +#include <linux/linkage.h>
> > +
> > +#include <asm/mte-def.h>
> > +
> > + .arch armv8.5-a+memtag
> > +
> > + .macro __set_mem_tag_range, stg, gva, start, size, linesize, tmp1, tmp2, tmp3
> > + add \tmp3, \start, \size
> > + cmp \size, \linesize, lsl #1
> > + b.lt .Lsmtr3_\@
>
> We could do with some comments here. Why the lsl #1? I think I get it
> but it would be good to make this more readable.
>
> It may be easier if you placed it in a file on its own (as it is now but
> with a less generic file name) and use a few .req instead of the tmpX.
> You can use the macro args only for the stg/gva.

Yes, I think we could use more comments and .req here, like the
userspace version of this function that we use in Scudo:
https://cs.android.com/android/platform/superproject/+/master:external/scudo/standalone/memtag.h;l=150;drc=34c8857fef28eab205c22cbfb4bfda2f848e5a80

> > +
> > + sub \tmp1, \linesize, #1
> > + bic \tmp2, \tmp3, \tmp1
> > + orr \tmp1, \start, \tmp1
> > +
> > +.Lsmtr1_\@:
> > + \stg \start, [\start], #MTE_GRANULE_SIZE
> > + cmp \start, \tmp1
> > + b.lt .Lsmtr1_\@
> > +
> > +.Lsmtr2_\@:
> > + dc \gva, \start
> > + add \start, \start, \linesize
> > + cmp \start, \tmp2
> > + b.lt .Lsmtr2_\@
> > +
> > +.Lsmtr3_\@:
> > + cmp \start, \tmp3
> > + b.ge .Lsmtr4_\@
> > + \stg \start, [\start], #MTE_GRANULE_SIZE
> > + b .Lsmtr3_\@
> > +.Lsmtr4_\@:
> > + .endm
>
> If we want to get the best performance out of this, we should look at
> the memset implementation and do something similar. In principle it's
> not that far from a memzero, though depending on the microarchitecture
> it may behave slightly differently.

For Scudo I compared our storeTags implementation linked above against
__mtag_tag_zero_region from the arm-optimized-routines repository
(which I think is basically an improved version of that memset
implementation rewritten to use STG and DC GZVA), and our
implementation performed better on the hardware that we have access
to.

> Anyway, before that I wonder if we wrote all this in C + inline asm
> (three while loops or maybe two and some goto), what's the performance
> difference? It has the advantage of being easier to maintain even if we
> used some C macros to generate gva/gzva variants.

I'm not sure I agree that it will be easier to maintain. Due to the
number of "unusual" instructions required here it seems more readable
to have the code in pure assembly than to require readers to switch
contexts between C and asm. If we did move it to inline asm then I
think it should basically be a large blob of asm like the Scudo code
that I linked.

Peter

2021-05-19 20:19:56

by Catalin Marinas

[permalink] [raw]

Subject: Re: [PATCH v3] kasan: speed up mte_set_mem_tag_range

On Tue, May 18, 2021 at 11:11:52AM -0700, Peter Collingbourne wrote:
> On Tue, May 18, 2021 at 10:44 AM Catalin Marinas
> <[email protected]> wrote:
> > If we want to get the best performance out of this, we should look at
> > the memset implementation and do something similar. In principle it's
> > not that far from a memzero, though depending on the microarchitecture
> > it may behave slightly differently.
>
> For Scudo I compared our storeTags implementation linked above against
> __mtag_tag_zero_region from the arm-optimized-routines repository
> (which I think is basically an improved version of that memset
> implementation rewritten to use STG and DC GZVA), and our
> implementation performed better on the hardware that we have access
> to.

That's the advantage of having hardware early ;).

> > Anyway, before that I wonder if we wrote all this in C + inline asm
> > (three while loops or maybe two and some goto), what's the performance
> > difference? It has the advantage of being easier to maintain even if we
> > used some C macros to generate gva/gzva variants.
>
> I'm not sure I agree that it will be easier to maintain. Due to the
> number of "unusual" instructions required here it seems more readable
> to have the code in pure assembly than to require readers to switch
> contexts between C and asm. If we did move it to inline asm then I
> think it should basically be a large blob of asm like the Scudo code
> that I linked.

I was definitely not thinking of a big asm block, that's even less
readable than separate .S file. It's more like adding dedicated macros
for single STG or DC GVA uses and using them in while loops.

Anyway, let's see a better commented .S implementation first. Given that
tagging is very sensitive to the performance of this function, we'd
probably benefit from a (few percent I suspect) perf improvement with
the hand-coded assembly.

--
Catalin

2021-05-19 20:23:57

by Evgenii Stepanov

[permalink] [raw]

Subject: Re: [PATCH v3] kasan: speed up mte_set_mem_tag_range

On Wed, May 19, 2021 at 11:13 AM Catalin Marinas
<[email protected]> wrote:
>
> On Tue, May 18, 2021 at 11:11:52AM -0700, Peter Collingbourne wrote:
> > On Tue, May 18, 2021 at 10:44 AM Catalin Marinas
> > <[email protected]> wrote:
> > > If we want to get the best performance out of this, we should look at
> > > the memset implementation and do something similar. In principle it's
> > > not that far from a memzero, though depending on the microarchitecture
> > > it may behave slightly differently.
> >
> > For Scudo I compared our storeTags implementation linked above against
> > __mtag_tag_zero_region from the arm-optimized-routines repository
> > (which I think is basically an improved version of that memset
> > implementation rewritten to use STG and DC GZVA), and our
> > implementation performed better on the hardware that we have access
> > to.
>
> That's the advantage of having hardware early ;).
>
> > > Anyway, before that I wonder if we wrote all this in C + inline asm
> > > (three while loops or maybe two and some goto), what's the performance
> > > difference? It has the advantage of being easier to maintain even if we
> > > used some C macros to generate gva/gzva variants.
> >
> > I'm not sure I agree that it will be easier to maintain. Due to the
> > number of "unusual" instructions required here it seems more readable
> > to have the code in pure assembly than to require readers to switch
> > contexts between C and asm. If we did move it to inline asm then I
> > think it should basically be a large blob of asm like the Scudo code
> > that I linked.
>
> I was definitely not thinking of a big asm block, that's even less
> readable than separate .S file. It's more like adding dedicated macros
> for single STG or DC GVA uses and using them in while loops.

I've got a C version with 4 single-instruction asm blocks, and it
looks pretty nice. The assembly is almost identical to the hand
written variant, and performance is 3% better, presumably because of
the inlining. Also, the C version allows more potential optimizations,
like specialization on the value of "init" - which is not happening
right now because it is not constant in any of the callers.

I'll upload a v4 shortly.

>
> Anyway, let's see a better commented .S implementation first. Given that
> tagging is very sensitive to the performance of this function, we'd
> probably benefit from a (few percent I suspect) perf improvement with
> the hand-coded assembly.
>
> --
> Catalin