2012-11-08 21:48:07

by Jim Kukunas

Subject: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

Optimize RAID6 recovery functions to take advantage of
the 256-bit YMM integer instructions introduced in AVX2.

Signed-off-by: Jim Kukunas <[email protected]>
---
arch/x86/Makefile | 5 +-
include/linux/raid/pq.h | 1 +
lib/raid6/Makefile | 2 +-
lib/raid6/algos.c | 3 +
lib/raid6/recov_avx2.c | 327 ++++++++++++++++++++++++++++++++++++++++++++++++
lib/raid6/test/Makefile | 2 +-
lib/raid6/x86.h | 14 ++-
7 files changed, 345 insertions(+), 9 deletions(-)
create mode 100644 lib/raid6/recov_avx2.c
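For context, the new AVX2 routines implement the same recovery algebra as the
existing scalar reference in lib/raid6/recov.c: the delta-P and delta-Q
syndromes are regenerated with the failed data blocks zeroed, and the missing
blocks are then rebuilt with two GF(2^8) multiplier tables. A simplified scalar
sketch of that inner loop (the helper name is illustrative and not part of this
patch):

	/* pbmul and qmul are 256-entry GF(2^8) multiplier tables chosen from
	 * faila/failb exactly as in raid6_2data_recov_avx2() below; dp/dq are
	 * the delta-P/delta-Q blocks produced by gen_syndrome() with the
	 * failed data blocks replaced by the zero page.
	 */
	static void raid6_2data_recov_sketch(size_t bytes, u8 *p, u8 *q,
					     u8 *dp, u8 *dq,
					     const u8 *pbmul, const u8 *qmul)
	{
		while (bytes--) {
			u8 px = *p++ ^ *dp;		/* delta P           */
			u8 qx = qmul[*q++ ^ *dq];	/* qmul * delta Q    */
			u8 db = pbmul[px] ^ qx;		/* reconstructed D_b */

			*dq++ = db;
			*dp++ = db ^ px;		/* reconstructed D_a */
		}
	}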

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 682e9c2..f24c037 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -123,9 +123,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
+avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)

-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)

LDFLAGS := -m elf_$(UTS_MACHINE)

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 640c69c..3156347 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -109,6 +109,7 @@ struct raid6_recov_calls {

extern const struct raid6_recov_calls raid6_recov_intx1;
extern const struct raid6_recov_calls raid6_recov_ssse3;
+extern const struct raid6_recov_calls raid6_recov_avx2;

/* Algorithm list */
extern const struct raid6_calls * const raid6_algos[];
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index de06dfe..8c2e22b 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -1,6 +1,6 @@
obj-$(CONFIG_RAID6_PQ) += raid6_pq.o

-raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \
+raid6_pq-y += algos.o recov.o recov_ssse3.o recov_avx2.o tables.o int1.o int2.o int4.o \
int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \
altivec8.o mmx.o sse1.o sse2.o
hostprogs-y += mktables
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 589f5f5..8b7f55c 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -72,6 +72,9 @@ EXPORT_SYMBOL_GPL(raid6_datap_recov);

const struct raid6_recov_calls *const raid6_recov_algos[] = {
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+#ifdef CONFIG_AS_AVX2
+ &raid6_recov_avx2,
+#endif
&raid6_recov_ssse3,
#endif
&raid6_recov_intx1,
diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c
new file mode 100644
index 0000000..43a9bab
--- /dev/null
+++ b/lib/raid6/recov_avx2.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+
+#ifdef CONFIG_AS_AVX2
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_avx2(void)
+{
+ return boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX);
+}
+
+static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+ const u8 x0f = 0x0f;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Compute syndrome with zero for the missing data pages
+ Use the dead data pages as temporary storage for
+ delta p and delta q */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks-2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks-2] = p;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+ raid6_gfexp[failb]]];
+
+ kernel_fpu_begin();
+
+ /* ymm7 = x0f[32] */
+ asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
+
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0]));
+ asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32]));
+ asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0]));
+ asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32]));
+ asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0]));
+ asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32]));
+ asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0]));
+ asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32]));
+
+ /*
+ * 1 = dq[0] ^ q[0]
+ * 9 = dq[32] ^ q[32]
+ * 0 = dp[0] ^ p[0]
+ * 8 = dp[32] ^ p[32]
+ */
+
+ asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
+ asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));
+
+ asm volatile("vpsraw $4, %ymm1, %ymm3");
+ asm volatile("vpsraw $4, %ymm9, %ymm12");
+ asm volatile("vpand %ymm7, %ymm1, %ymm1");
+ asm volatile("vpand %ymm7, %ymm9, %ymm9");
+ asm volatile("vpand %ymm7, %ymm3, %ymm3");
+ asm volatile("vpand %ymm7, %ymm12, %ymm12");
+ asm volatile("vpshufb %ymm9, %ymm4, %ymm14");
+ asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
+ asm volatile("vpshufb %ymm12, %ymm5, %ymm15");
+ asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
+ asm volatile("vpxor %ymm14, %ymm15, %ymm15");
+ asm volatile("vpxor %ymm4, %ymm5, %ymm5");
+
+ /*
+ * 5 = qx[0]
+ * 15 = qx[32]
+ */
+
+ asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
+ asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));
+ asm volatile("vpsraw $4, %ymm0, %ymm2");
+ asm volatile("vpsraw $4, %ymm8, %ymm6");
+ asm volatile("vpand %ymm7, %ymm0, %ymm3");
+ asm volatile("vpand %ymm7, %ymm8, %ymm14");
+ asm volatile("vpand %ymm7, %ymm2, %ymm2");
+ asm volatile("vpand %ymm7, %ymm6, %ymm6");
+ asm volatile("vpshufb %ymm14, %ymm4, %ymm12");
+ asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
+ asm volatile("vpshufb %ymm6, %ymm1, %ymm13");
+ asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm4, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm12, %ymm13, %ymm13");
+
+ /*
+ * 1 = pbmul[px[0]]
+ * 13 = pbmul[px[32]]
+ */
+ asm volatile("vpxor %ymm5, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm15, %ymm13, %ymm13");
+
+ /*
+ * 1 = db = DQ
+ * 13 = db[32] = DQ[32]
+ */
+ asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+ asm volatile("vmovdqa %%ymm13,%0" : "=m" (dq[32]));
+ asm volatile("vpxor %ymm1, %ymm0, %ymm0");
+ asm volatile("vpxor %ymm13, %ymm8, %ymm8");
+
+ asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));
+ asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dp += 64;
+ dq += 64;
+#else
+ asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q));
+ asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p));
+ asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq));
+ asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp));
+
+ /* 1 = dq ^ q; 0 = dp ^ p */
+
+ asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
+ asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));
+
+ /*
+ * 1 = dq ^ q
+ * 3 = dq ^ q >> 4
+ */
+ asm volatile("vpsraw $4, %ymm1, %ymm3");
+ asm volatile("vpand %ymm7, %ymm1, %ymm1");
+ asm volatile("vpand %ymm7, %ymm3, %ymm3");
+ asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
+ asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
+ asm volatile("vpxor %ymm4, %ymm5, %ymm5");
+
+ /* 5 = qx */
+
+ asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
+ asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));
+
+ asm volatile("vpsraw $4, %ymm0, %ymm2");
+ asm volatile("vpand %ymm7, %ymm0, %ymm3");
+ asm volatile("vpand %ymm7, %ymm2, %ymm2");
+ asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
+ asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm4, %ymm1, %ymm1");
+
+ /* 1 = pbmul[px] */
+ asm volatile("vpxor %ymm5, %ymm1, %ymm1");
+ /* 1 = db = DQ */
+ asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+
+ asm volatile("vpxor %ymm1, %ymm0, %ymm0");
+ asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));
+
+ bytes -= 32;
+ p += 32;
+ q += 32;
+ dp += 32;
+ dq += 32;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+ const u8 x0f = 0x0f;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Compute syndrome with zero for the missing data page
+ Use the dead data page as temporary storage for delta q */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_fpu_begin();
+
+ asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
+
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
+ asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32]));
+ asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));
+ asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32]));
+
+ /*
+ * 3 = q[0] ^ dq[0]
+ * 8 = q[32] ^ dq[32]
+ */
+ asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
+ asm volatile("vmovapd %ymm0, %ymm13");
+ asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));
+ asm volatile("vmovapd %ymm1, %ymm14");
+
+ asm volatile("vpsraw $4, %ymm3, %ymm6");
+ asm volatile("vpsraw $4, %ymm8, %ymm12");
+ asm volatile("vpand %ymm7, %ymm3, %ymm3");
+ asm volatile("vpand %ymm7, %ymm8, %ymm8");
+ asm volatile("vpand %ymm7, %ymm6, %ymm6");
+ asm volatile("vpand %ymm7, %ymm12, %ymm12");
+ asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
+ asm volatile("vpshufb %ymm8, %ymm13, %ymm13");
+ asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
+ asm volatile("vpshufb %ymm12, %ymm14, %ymm14");
+ asm volatile("vpxor %ymm0, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm13, %ymm14, %ymm14");
+
+ /*
+ * 1 = qmul[q[0] ^ dq[0]]
+ * 14 = qmul[q[32] ^ dq[32]]
+ */
+ asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
+ asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32]));
+ asm volatile("vpxor %ymm1, %ymm2, %ymm2");
+ asm volatile("vpxor %ymm14, %ymm12, %ymm12");
+
+ /*
+ * 2 = p[0] ^ qmul[q[0] ^ dq[0]]
+ * 12 = p[32] ^ qmul[q[32] ^ dq[32]]
+ */
+
+ asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+ asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32]));
+ asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0]));
+ asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dq += 64;
+#else
+ asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
+ asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));
+
+ /* 3 = q ^ dq */
+
+ asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
+ asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));
+
+ asm volatile("vpsraw $4, %ymm3, %ymm6");
+ asm volatile("vpand %ymm7, %ymm3, %ymm3");
+ asm volatile("vpand %ymm7, %ymm6, %ymm6");
+ asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
+ asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm0, %ymm1, %ymm1");
+
+ /* 1 = qmul[q ^ dq] */
+
+ asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
+ asm volatile("vpxor %ymm1, %ymm2, %ymm2");
+
+ /* 2 = p ^ qmul[q ^ dq] */
+
+ asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+ asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0]));
+
+ bytes -= 32;
+ p += 32;
+ q += 32;
+ dq += 32;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx2 = {
+ .data2 = raid6_2data_recov_avx2,
+ .datap = raid6_datap_recov_avx2,
+ .valid = raid6_has_avx2,
+#ifdef CONFIG_X86_64
+ .name = "avx2x2",
+#else
+ .name = "avx2x1",
+#endif
+ .priority = 2,
+};
+
+#else
+#warning "your version of binutils lacks AVX2 support"
+#endif
+
+#endif
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index c76151d..d919c98 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -23,7 +23,7 @@ RANLIB = ranlib
all: raid6.a raid6test

raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \
- altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \
+ altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o recov_avx2.o algos.o \
tables.o
rm -f $@
$(AR) cq $@ $^
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index d55d632..b759548 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -45,19 +45,23 @@ static inline void kernel_fpu_end(void)
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */

/* Should work well enough on modern CPUs for testing */
static inline int boot_cpu_has(int flag)
{
- u32 eax = (flag & 0x20) ? 0x80000001 : 1;
- u32 ecx, edx;
+ u32 eax, ebx, ecx, edx;
+
+ eax = (flag & 0x100) ? 7 :
+ (flag & 0x20) ? 0x80000001 : 1;
+ ecx = 0;

asm volatile("cpuid"
- : "+a" (eax), "=d" (edx), "=c" (ecx)
- : : "ebx");
+ : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx));

- return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1;
+ return ((flag & 0x100 ? ebx :
+ (flag & 0x80) ? ecx : edx) >> (flag & 31)) & 1;
}

#endif /* ndef __KERNEL__ */
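A worked example of the flag encoding this user-space test helper decodes
(illustrative, derived from the definitions above):

	/* X86_FEATURE_AVX2 == 9*32 + 5 == 0x125:
	 *   0x125 & 0x100 is nonzero, so CPUID leaf 7 is queried (ecx = 0)
	 *   and the result is taken from ebx; bit (0x125 & 31) == 5 is the
	 *   AVX2 bit in CPUID.(EAX=7,ECX=0):EBX.
	 * X86_FEATURE_AVX == 4*32 + 28 == 0x9c:
	 *   neither 0x100 nor 0x20 is set, so leaf 1 is queried; 0x9c & 0x80
	 *   selects ecx, and bit 28 is the AVX bit in CPUID.1:ECX.
	 */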
--
1.8.0


2012-11-09 11:35:51

by Paul Menzel

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

Dear Jim,


On Thursday, 08.11.2012, 13:47 -0800, Jim Kukunas wrote:
> Optimize RAID6 recovery functions to take advantage of
> the 256-bit YMM integer instructions introduced in AVX2.

in my experience, optimizations always have to be backed up by
benchmarks. Could you add those to the commit message, please?

> Signed-off-by: Jim Kukunas <[email protected]>

[…]


Thanks,

Paul



2012-11-09 11:39:28

by H. Peter Anvin

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

Sorry, we cannot share those at this time since the hardware is not yet released.

Paul Menzel <[email protected]> wrote:

>Dear Jim,
>
>
>On Thursday, 08.11.2012, 13:47 -0800, Jim Kukunas wrote:
>> Optimize RAID6 recovery functions to take advantage of
>> the 256-bit YMM integer instructions introduced in AVX2.
>
>in my experience, optimizations always have to be backed up by
>benchmarks. Could you add those to the commit message, please?
>
>> Signed-off-by: Jim Kukunas <[email protected]>
>
>[…]
>
>
>Thanks,
>
>Paul

--
Sent from my mobile phone. Please excuse brevity and lack of formatting.

2012-11-09 11:46:43

by Paul Menzel

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

On Friday, 09.11.2012, 12:39 +0100, H. Peter Anvin wrote:
> Sorry, we cannot share those at this time since the hardware is not yet released.

Too bad. Then I suggest an additional run-time switch to enable and
disable that code path, so that people can easily test it for themselves later.


Thanks,

Paul



2012-11-09 11:50:51

by NeilBrown

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

On Fri, 09 Nov 2012 12:39:05 +0100 "H. Peter Anvin" <[email protected]> wrote:

> Sorry, we cannot share those at this time since the hardware is not yet released.

Can I take that to imply "Acked-by: "H. Peter Anvin" <[email protected]>" ??

It would be nice to have at least a statement like:
These patches have been tested both with the user-space testing tool and in
a RAID6 md array and they pass all tests. While we cannot release performance
numbers as the hardware is not released, we can confirm that on that hardware
the performance with these patches is faster than without.

I guess I should be able to assume that - surely the patches would not be
posted if it were not true... But I like to avoid assuming when I can.

Thanks,
NeilBrown


>
> Paul Menzel <[email protected]> wrote:
>
> >Dear Jim,
> >
> >
> >On Thursday, 08.11.2012, 13:47 -0800, Jim Kukunas wrote:
> >> Optimize RAID6 recovery functions to take advantage of
> >> the 256-bit YMM integer instructions introduced in AVX2.
> >
> >in my experience, optimizations always have to be backed up by
> >benchmarks. Could you add those to the commit message, please?
> >
> >> Signed-off-by: Jim Kukunas <[email protected]>
> >
> >[…]
> >
> >
> >Thanks,
> >
> >Paul
>



2012-11-09 12:25:16

by H. Peter Anvin

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

Yes, consider it an implicit Acked-by.

NeilBrown <[email protected]> wrote:

>On Fri, 09 Nov 2012 12:39:05 +0100 "H. Peter Anvin" <[email protected]>
>wrote:
>
>> Sorry, we cannot share those at this time since the hardware is not
>> yet released.
>
>Can I take that to imply "Acked-by: "H. Peter Anvin" <[email protected]>"
>??
>
>It would be nice to have at least a statement like:
>These patches have been tested both with the user-space testing tool and in
>a RAID6 md array and they pass all tests. While we cannot release performance
>numbers as the hardware is not released, we can confirm that on that hardware
>the performance with these patches is faster than without.
>
>I guess I should be able to assume that - surely the patches would not be
>posted if it were not true... But I like to avoid assuming when I can.
>
>Thanks,
>NeilBrown
>
>
>>
>> Paul Menzel <[email protected]> wrote:
>>
>> >Dear Jim,
>> >
>> >
>> >On Thursday, 08.11.2012, 13:47 -0800, Jim Kukunas wrote:
>> >> Optimize RAID6 recovery functions to take advantage of
>> >> the 256-bit YMM integer instructions introduced in AVX2.
>> >
>> >in my experience, optimizations always have to be backed up by
>> >benchmarks. Could you add those to the commit message, please?
>> >
>> >> Signed-off-by: Jim Kukunas <[email protected]>
>> >
>> >[…]
>> >
>> >
>> >Thanks,
>> >
>> >Paul
>>

--
Sent from my mobile phone. Please excuse brevity and lack of formatting.

2012-11-09 19:56:12

by Jim Kukunas

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

On Fri, Nov 09, 2012 at 10:50:25PM +1100, Neil Brown wrote:
> On Fri, 09 Nov 2012 12:39:05 +0100 "H. Peter Anvin" <[email protected]> wrote:
>
> > Sorry, we cannot share those at this time since the hardware is not yet released.
>
> Can I take that to imply "Acked-by: "H. Peter Anvin" <[email protected]>" ??
>
> It would be nice to have at least a statement like:
> These patches have been tested both with the user-space testing tool and in
> > a RAID6 md array and they pass all tests. While we cannot release performance
> > numbers as the hardware is not released, we can confirm that on that hardware
> the performance with these patches is faster than without.
>
> I guess I should be able to assume that - surely the patches would not be
> posted if it were not true... But I like to avoid assuming when I can.

Hi Neil,

That assumption is correct. The patch was tested and benchmarked before submission.

You'll notice that this code is very similar to the SSSE3-optimized
recovery routines I wrote earlier. This implementation extends that same
algorithm from 128-bit registers to 256-bit registers.
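For context, the core of that shared algorithm is a GF(2^8) multiply by a
constant done as two table lookups per byte (one per nibble) with the byte
shuffle instruction; moving from SSSE3 to AVX2 keeps the same sequence but
operates on 32 bytes per register, with vbroadcasti128 replicating the 16-byte
lookup tables into both lanes. A rough illustrative fragment, not the exact
code from either patch (register numbers and table pointers are placeholders):

	/* SSSE3: multiply the 16 bytes in xmm1 by a constant.
	 * xmm7 holds the 0x0f mask; qmul[0]/qmul[16] are the low-/high-nibble
	 * product tables. */
	asm volatile("movdqa %0, %%xmm4" : : "m" (qmul[0]));
	asm volatile("movdqa %0, %%xmm5" : : "m" (qmul[16]));
	asm volatile("movdqa %xmm1, %xmm3");
	asm volatile("psraw $4, %xmm3");
	asm volatile("pand %xmm7, %xmm1");
	asm volatile("pand %xmm7, %xmm3");
	asm volatile("pshufb %xmm1, %xmm4");	/* low-nibble products  */
	asm volatile("pshufb %xmm3, %xmm5");	/* high-nibble products */
	asm volatile("pxor %xmm4, %xmm5");	/* xmm5 = product       */

	/* AVX2: the same idea on 32 bytes at a time. */
	asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
	asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));
	asm volatile("vpsraw $4, %ymm1, %ymm3");
	asm volatile("vpand %ymm7, %ymm1, %ymm1");
	asm volatile("vpand %ymm7, %ymm3, %ymm3");
	asm volatile("vpshufb %ymm1, %ymm4, %ymm4");	/* low-nibble products  */
	asm volatile("vpshufb %ymm3, %ymm5, %ymm5");	/* high-nibble products */
	asm volatile("vpxor %ymm4, %ymm5, %ymm5");	/* ymm5 = product       */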

Thanks.

--
Jim Kukunas
Intel Open Source Technology Center



2012-11-19 00:57:44

by NeilBrown

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

On Fri, 9 Nov 2012 11:56:10 -0800 Jim Kukunas
<[email protected]> wrote:

> On Fri, Nov 09, 2012 at 10:50:25PM +1100, Neil Brown wrote:
> > On Fri, 09 Nov 2012 12:39:05 +0100 "H. Peter Anvin" <[email protected]> wrote:
> >
> > > Sorry, we cannot share those at this time since the hardware is not yet released.
> >
> > Can I take that to imply "Acked-by: "H. Peter Anvin" <[email protected]>" ??
> >
> > It would be nice to have at least a statement like:
> > These patches have been tested both with the user-space testing tool and in
> > a RAID6 md array and they pass all tests. While we cannot release performance
> > numbers as the hardware is not released, we can confirm that on that hardware
> > the performance with these patches is faster than without.
> >
> > I guess I should be able to assume that - surely the patches would not be
> > posted if it were not true... But I like to avoid assuming when I can.
>
> Hi Neil,
>
> That assumption is correct. The patch was tested and benchmarked before submission.

Thanks. I've queued the patch up.
>
> You'll notice that this code is very similar to the SSSE3-optimized
> recovery routines I wrote earlier. This implementation extends that same
> algorithm from 128-bit registers to 256-bit registers.

I might notice that if I actually looked, but it all starts swimming before
my eyes when I try :-)
If both you and hpa like it, then that is good enough for me.

Thanks,
NeilBrown


>
> Thanks.
>



2012-11-29 20:10:07

by Andi Kleen

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

Jim Kukunas <[email protected]> writes:
> +
> + /* ymm0 = x0f[16] */
> + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
> +
> + while (bytes) {
> +#ifdef CONFIG_X86_64
> + asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0]));
> + asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32]));
> + asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0]));
> + asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32]));

It is somewhat dangerous to assume that registers do not get changed
between assembler statements, or that assembler statements do not get
reordered. Better to always put such values into explicit variables or
merge them into a single asm statement.

asm volatile is also not enough to prevent reordering. If anything
you would need a memory clobber.
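As an illustration of that suggestion, the four loads at the top of the
64-bit loop could be expressed as a single statement with a memory clobber,
along these lines (a sketch, not a proposed replacement):

	asm volatile("vmovdqa %0, %%ymm1\n\t"
		     "vmovdqa %1, %%ymm9\n\t"
		     "vmovdqa %2, %%ymm0\n\t"
		     "vmovdqa %3, %%ymm8"
		     : /* no outputs */
		     : "m" (q[0]), "m" (q[32]), "m" (p[0]), "m" (p[32])
		     : "memory");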

-Andi


--
[email protected] -- Speaking for myself only

2012-11-29 21:14:31

by H. Peter Anvin

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

On 11/29/2012 12:09 PM, Andi Kleen wrote:
> Jim Kukunas <[email protected]> writes:
>> +
>> + /* ymm0 = x0f[16] */
>> + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
>> +
>> + while (bytes) {
>> +#ifdef CONFIG_X86_64
>> + asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0]));
>> + asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32]));
>> + asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0]));
>> + asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32]));
>
> It is somewhat dangerous to assume that registers do not get changed
> between assembler statements, or that assembler statements do not get
> reordered. Better to always put such values into explicit variables or
> merge them into a single asm statement.
>
> asm volatile is also not enough to prevent reordering. If anything
> you would need a memory clobber.
>

The code is compiled so that the xmm/ymm registers are not available to
the compiler. Do you have any known examples of asm volatiles being
reordered *with respect to each other*? My understanding of gcc is
that volatile operations are ordered with respect to each other (not
necessarily with respect to non-volatile operations, though.)

Either way, this implementation technique has been used for the MMX/SSE
implementations without any problems for nine years now.

-hpa

2012-11-29 21:18:34

by Andi Kleen

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

> The code is compiled so that the xmm/ymm registers are not available to
> the compiler. Do you have any known examples of asm volatiles being
> reordered *with respect to each other*? My understandings of gcc is
> that volatile operations are ordered with respect to each other (not
> necessarily with respect to non-volatile operations, though.)

Can you quote it from the manual? As I understand it, volatile as usual
is not clearly defined.

gcc has a lot of optimization passes and volatile bugs are common.


> Either way, this implementation technique has been used for the MMX/SSE
> implementations without any problems for nine years now.

It's still wrong.

Lying to the compiler usually bites you at some point.

-Andi

2012-11-29 22:28:27

by H. Peter Anvin

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

On 11/29/2012 01:18 PM, Andi Kleen wrote:
>
> Can you quote it from the manual? As I understand it, volatile as usual
> is not clearly defined.
>
> gcc has a lot of optimization passes and volatile bugs are common.
>

I talked it over with H.J. and we walked through the code. The
documentation is lacking, of course, as is common with gcc.

However, that piece was pretty clear.

-hpa

2012-11-29 22:29:16

by H. Peter Anvin

Subject: Re: [PATCH] lib/raid6: Add AVX2 optimized recovery functions

On 11/29/2012 01:18 PM, Andi Kleen wrote:
>
>> Either way, this implementation technique has been used for the MMX/SSE
>> implementations without any problems for nine years now.
>
> It's still wrong.
>
> Lying to the compiler usually bites you at some point.
>

It's not lying to the compiler. It is telling the compiler "I am doing
something that you have no way of knowing about".

It would be lying to the compiler if the compiler was allowed to touch
the FPU state, but it's not.

-hpa