2018-03-19 14:25:04

by Rahul Lakkireddy

Subject: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

This series of patches adds support for 256-bit IO reads and writes.
The APIs are readqq and writeqq (quad quadword - 4 x 64), which read
and write 256 bits at a time from IO, respectively.

Patch 1 adds the u256 type and the necessary non-atomic accessors. It
also adds byteorder conversion APIs.

Patch 2 adds 256-bit read and write support to x86 via the VMOVDQU AVX
CPU instruction.

Patch 3 updates the cxgb4 driver to use the readqq API to speed up
reading on-chip memory, 256 bits at a time.
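
For illustration, a minimal sketch of the kind of read loop the new
accessor enables (the function and variable names below are made up for
this example; the real conversion is in patch 3):

static void copy_chip_mem(void *dst, const void __iomem *src, size_t len)
{
        u256 *out = dst;

        /* assumes len is a multiple of 32 bytes */
        while (len >= sizeof(u256)) {
                *out++ = readqq(src);
                src += sizeof(u256);
                len -= sizeof(u256);
        }
}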

Feedback and suggestions will be much appreciated.

Thanks,
Rahul

Rahul Lakkireddy (3):
include/linux: add 256-bit IO accessors
x86/io: implement 256-bit IO read and write
cxgb4: read on-chip memory 256-bits at a time

arch/x86/include/asm/io.h | 57 ++++++++++++++++++++++++-
drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 16 +++----
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 6 +++
include/linux/byteorder/generic.h | 48 +++++++++++++++++++++
include/linux/io-64-nonatomic-hi-lo.h | 59 ++++++++++++++++++++++++++
include/linux/io-64-nonatomic-lo-hi.h | 59 ++++++++++++++++++++++++++
include/linux/types.h | 7 +++
7 files changed, 243 insertions(+), 9 deletions(-)

--
2.14.1



2018-03-19 14:24:00

by Rahul Lakkireddy

Subject: [RFC PATCH 1/3] include/linux: add 256-bit IO accessors

Add readqq and writeqq (quad quadword - 4 x 64) non-atomic IO
accessors. Also add byteorder conversions for 256-bit values.

Signed-off-by: Rahul Lakkireddy <[email protected]>
Signed-off-by: Ganesh Goudar <[email protected]>
---
include/linux/byteorder/generic.h | 48 ++++++++++++++++++++++++++++
include/linux/io-64-nonatomic-hi-lo.h | 59 +++++++++++++++++++++++++++++++++++
include/linux/io-64-nonatomic-lo-hi.h | 59 +++++++++++++++++++++++++++++++++++
include/linux/types.h | 7 +++++
4 files changed, 173 insertions(+)

diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index 451aaa0786ae..c6c936818a3a 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -187,4 +187,52 @@ static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len)
dst[i] = be32_to_cpu(src[i]);
}

+static inline __le256 cpu_to_le256(u256 val)
+{
+ __le256 ret;
+
+ ret.a = cpu_to_le64(val.a);
+ ret.b = cpu_to_le64(val.b);
+ ret.c = cpu_to_le64(val.c);
+ ret.d = cpu_to_le64(val.d);
+
+ return ret;
+}
+
+static inline u256 le256_to_cpu(__le256 val)
+{
+ u256 ret;
+
+ ret.a = le64_to_cpu(val.a);
+ ret.b = le64_to_cpu(val.b);
+ ret.c = le64_to_cpu(val.c);
+ ret.d = le64_to_cpu(val.d);
+
+ return ret;
+}
+
+static inline __be256 cpu_to_be256(u256 val)
+{
+ __be256 ret;
+
+ ret.a = cpu_to_be64(val.d);
+ ret.b = cpu_to_be64(val.c);
+ ret.c = cpu_to_be64(val.b);
+ ret.d = cpu_to_be64(val.a);
+
+ return ret;
+}
+
+static inline u256 be256_to_cpu(__be256 val)
+{
+ u256 ret;
+
+ ret.a = be64_to_cpu(val.d);
+ ret.b = be64_to_cpu(val.c);
+ ret.c = be64_to_cpu(val.b);
+ ret.d = be64_to_cpu(val.a);
+
+ return ret;
+}
+
#endif /* _LINUX_BYTEORDER_GENERIC_H */
diff --git a/include/linux/io-64-nonatomic-hi-lo.h b/include/linux/io-64-nonatomic-hi-lo.h
index 862d786a904f..9bdc42132020 100644
--- a/include/linux/io-64-nonatomic-hi-lo.h
+++ b/include/linux/io-64-nonatomic-hi-lo.h
@@ -55,4 +55,63 @@ static inline void hi_lo_writeq_relaxed(__u64 val, volatile void __iomem *addr)
#define writeq_relaxed hi_lo_writeq_relaxed
#endif

+static inline __u256 hi_lo_readqq(const volatile void __iomem *addr)
+{
+ const volatile u64 __iomem *p = addr;
+ __u256 ret;
+
+ ret.d = readq(p + 3);
+ ret.c = readq(p + 2);
+ ret.b = readq(p + 1);
+ ret.a = readq(p);
+
+ return ret;
+}
+
+static inline void hi_lo_writeqq(__u256 val, volatile void __iomem *addr)
+{
+ writeq(val.d, addr + 24);
+ writeq(val.c, addr + 16);
+ writeq(val.b, addr + 8);
+ writeq(val.a, addr);
+}
+
+static inline __u256 hi_lo_readqq_relaxed(const volatile void __iomem *addr)
+{
+ const volatile u64 __iomem *p = addr;
+ __u256 ret;
+
+ ret.d = readq_relaxed(p + 3);
+ ret.c = readq_relaxed(p + 2);
+ ret.b = readq_relaxed(p + 1);
+ ret.a = readq_relaxed(p);
+
+ return ret;
+}
+
+static inline void hi_lo_writeqq_relaxed(__u256 val,
+ volatile void __iomem *addr)
+{
+ writeq_relaxed(val.d, addr + 24);
+ writeq_relaxed(val.c, addr + 16);
+ writeq_relaxed(val.b, addr + 8);
+ writeq_relaxed(val.a, addr);
+}
+
+#ifndef readqq
+#define readqq hi_lo_readqq
+#endif
+
+#ifndef writeqq
+#define writeqq hi_lo_writeqq
+#endif
+
+#ifndef readqq_relaxed
+#define readqq_relaxed hi_lo_readqq_relaxed
+#endif
+
+#ifndef writeqq_relaxed
+#define writeqq_relaxed hi_lo_writeqq_relaxed
+#endif
+
#endif /* _LINUX_IO_64_NONATOMIC_HI_LO_H_ */
diff --git a/include/linux/io-64-nonatomic-lo-hi.h b/include/linux/io-64-nonatomic-lo-hi.h
index d042e7bb5adb..8aaf734e1d51 100644
--- a/include/linux/io-64-nonatomic-lo-hi.h
+++ b/include/linux/io-64-nonatomic-lo-hi.h
@@ -55,4 +55,63 @@ static inline void lo_hi_writeq_relaxed(__u64 val, volatile void __iomem *addr)
#define writeq_relaxed lo_hi_writeq_relaxed
#endif

+static inline __u256 lo_hi_readqq(const volatile void __iomem *addr)
+{
+ const volatile u64 __iomem *p = addr;
+ __u256 ret;
+
+ ret.a = readq(p);
+ ret.b = readq(p + 1);
+ ret.c = readq(p + 2);
+ ret.d = readq(p + 3);
+
+ return ret;
+}
+
+static inline void lo_hi_writeqq(__u256 val, volatile void __iomem *addr)
+{
+ writeq(val.a, addr);
+ writeq(val.b, addr + 8);
+ writeq(val.c, addr + 16);
+ writeq(val.d, addr + 24);
+}
+
+static inline __u256 lo_hi_readqq_relaxed(const volatile void __iomem *addr)
+{
+ const volatile u64 __iomem *p = addr;
+ __u256 ret;
+
+ ret.a = readq_relaxed(p);
+ ret.b = readq_relaxed(p + 1);
+ ret.c = readq_relaxed(p + 2);
+ ret.d = readq_relaxed(p + 3);
+
+ return ret;
+}
+
+static inline void lo_hi_writeqq_relaxed(__u256 val,
+ volatile void __iomem *addr)
+{
+ writeq_relaxed(val.a, addr);
+ writeq_relaxed(val.b, addr + 8);
+ writeq_relaxed(val.c, addr + 16);
+ writeq_relaxed(val.d, addr + 24);
+}
+
+#ifndef readqq
+#define readqq lo_hi_readqq
+#endif
+
+#ifndef writeqq
+#define writeqq lo_hi_writeqq
+#endif
+
+#ifndef readqq_relaxed
+#define readqq_relaxed lo_hi_readqq_relaxed
+#endif
+
+#ifndef writeqq_relaxed
+#define writeqq_relaxed lo_hi_writeqq_relaxed
+#endif
+
#endif /* _LINUX_IO_64_NONATOMIC_LO_HI_H_ */
diff --git a/include/linux/types.h b/include/linux/types.h
index c94d59ef96cc..f4ee2857d253 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -114,6 +114,13 @@ typedef __u64 u_int64_t;
typedef __s64 int64_t;
#endif

+typedef struct {
+ __u64 a, b, c, d;
+} __u256;
+typedef __u256 u256;
+typedef __u256 __le256;
+typedef __u256 __be256;
+
/* this is a special 64bit data type that is 8-byte aligned */
#define aligned_u64 __u64 __attribute__((aligned(8)))
#define aligned_be64 __be64 __attribute__((aligned(8)))
--
2.14.1


2018-03-19 14:24:41

by Rahul Lakkireddy

Subject: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

Use the VMOVDQU AVX CPU instruction when available to do 256-bit
IO reads and writes.

Signed-off-by: Rahul Lakkireddy <[email protected]>
Signed-off-by: Ganesh Goudar <[email protected]>
---
arch/x86/include/asm/io.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 95e948627fd0..b04f417b3374 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -109,7 +109,62 @@ build_mmio_write(__writeq, "q", unsigned long, "r", )
#define readq readq
#define writeq writeq

-#endif
+#ifdef CONFIG_AS_AVX
+#include <asm/fpu/api.h>
+
+static inline u256 __readqq(const volatile void __iomem *addr)
+{
+ u256 ret;
+
+ kernel_fpu_begin();
+ asm volatile("vmovdqu %0, %%ymm0" :
+ : "m" (*(volatile u256 __force *)addr));
+ asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
+ kernel_fpu_end();
+ return ret;
+}
+
+static inline u256 readqq(const volatile void __iomem *addr)
+{
+ u256 ret;
+
+ kernel_fpu_begin();
+ asm volatile("vmovdqu %0, %%ymm0" :
+ : "m" (*(volatile u256 __force *)addr));
+ asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret) : : "memory");
+ kernel_fpu_end();
+ return ret;
+}
+
+#define __raw_readqq __readqq
+#define readqq_relaxed(a) __readqq(a)
+#define readqq readqq
+
+static inline void __writeqq(u256 val, volatile void __iomem *addr)
+{
+ kernel_fpu_begin();
+ asm volatile("vmovdqu %0, %%ymm0" : : "m" (val));
+ asm volatile("vmovdqu %%ymm0, %0"
+ : "=m" (*(volatile u256 __force *)addr));
+ kernel_fpu_end();
+}
+
+static inline void writeqq(u256 val, volatile void __iomem *addr)
+{
+ kernel_fpu_begin();
+ asm volatile("vmovdqu %0, %%ymm0" : : "m" (val));
+ asm volatile("vmovdqu %%ymm0, %0"
+ : "=m" (*(volatile u256 __force *)addr)
+ : : "memory");
+ kernel_fpu_end();
+}
+
+#define __raw_writeqq __writeqq
+#define writeqq_relaxed(v, a) __writeqq(v, a)
+#define writeqq writeqq
+#endif /* CONFIG_AS_AVX */
+
+#endif /* CONFIG_X86_64 */

#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
--
2.14.1


2018-03-19 14:45:15

by Thomas Gleixner

Subject: Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:

> Use VMOVDQU AVX CPU instruction when available to do 256-bit
> IO read and write.

That's not what the patch does. See below.

> Signed-off-by: Rahul Lakkireddy <[email protected]>
> Signed-off-by: Ganesh Goudar <[email protected]>

That Signed-off-by chain is wrong....

> +#ifdef CONFIG_AS_AVX
> +#include <asm/fpu/api.h>
> +
> +static inline u256 __readqq(const volatile void __iomem *addr)
> +{
> + u256 ret;
> +
> + kernel_fpu_begin();
> + asm volatile("vmovdqu %0, %%ymm0" :
> + : "m" (*(volatile u256 __force *)addr));
> + asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
> + kernel_fpu_end();
> + return ret;

You _cannot_ assume that the instruction is available just because
CONFIG_AS_AVX is set. The availability is determined by the runtime
evaluated CPU feature flags, i.e. X86_FEATURE_AVX.

Aside of that I very much doubt that this is faster than 4 consecutive
64bit reads/writes as you have the full overhead of
kernel_fpu_begin()/end() for each access.

You did not provide any numbers for this so it's even harder to
determine.

As far as I can tell the code where you are using this is a debug
facility. What's the point? Debug is hardly a performance critical problem.

Thanks,

tglx





2018-03-19 14:53:52

by Rahul Lakkireddy

Subject: [RFC PATCH 3/3] cxgb4: read on-chip memory 256-bits at a time

Use readqq() to read on-chip memory 256-bits at a time.

Signed-off-by: Rahul Lakkireddy <[email protected]>
Signed-off-by: Ganesh Goudar <[email protected]>
---
drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 16 ++++++++--------
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 6 ++++++
2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
index 9da6f57901a9..3a319b16c8a3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
@@ -885,7 +885,7 @@ static int cudbg_memory_read(struct cudbg_init *pdbg_init, int win,
struct adapter *adap = pdbg_init->adap;
u32 pos, offset, resid;
u32 *res_buf;
- u64 *buf;
+ u256 *buf;
int ret;

/* Argument sanity checks ...
@@ -893,10 +893,10 @@ static int cudbg_memory_read(struct cudbg_init *pdbg_init, int win,
if (addr & 0x3 || (uintptr_t)hbuf & 0x3)
return -EINVAL;

- buf = (u64 *)hbuf;
+ buf = (u256 *)hbuf;

- /* Try to do 64-bit reads. Residual will be handled later. */
- resid = len & 0x7;
+ /* Try to do 256-bit reads. Residual will be handled later. */
+ resid = len & 0x1f;
len -= resid;

ret = t4_memory_rw_init(adap, win, mtype, &memoffset, &mem_base,
@@ -917,10 +917,10 @@ static int cudbg_memory_read(struct cudbg_init *pdbg_init, int win,

/* Transfer data from the adapter */
while (len > 0) {
- *buf++ = le64_to_cpu((__force __le64)
- t4_read_reg64(adap, mem_base + offset));
- offset += sizeof(u64);
- len -= sizeof(u64);
+ *buf++ = le256_to_cpu((__force __le256)
+ t4_read_reg256(adap, mem_base + offset));
+ offset += sizeof(u256);
+ len -= sizeof(u256);

/* If we've reached the end of our current window aperture,
* move the PCI-E Memory Window on to the next.
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index a5c0a649f3c7..0035ed0a2ec9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -51,6 +51,7 @@
#include <linux/ptp_clock_kernel.h>
#include <linux/ptp_classify.h>
#include <asm/io.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
#include "t4_chip_type.h"
#include "cxgb4_uld.h"

@@ -1234,6 +1235,11 @@ static inline u64 t4_read_reg64(struct adapter *adap, u32 reg_addr)
return readq(adap->regs + reg_addr);
}

+static inline u256 t4_read_reg256(struct adapter *adap, u32 reg_addr)
+{
+ return readqq(adap->regs + reg_addr);
+}
+
static inline void t4_write_reg64(struct adapter *adap, u32 reg_addr, u64 val)
{
writeq(val, adap->regs + reg_addr);
--
2.14.1


2018-03-19 14:55:55

by David Laight

Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

From: Rahul Lakkireddy
> Sent: 19 March 2018 14:21
>
> This series of patches add support for 256-bit IO read and write.
> The APIs are readqq and writeqq (quad quadword - 4 x 64), that read
> and write 256-bits at a time from IO, respectively.

Why not use the AVX2 registers to get 512bit accesses.

> Patch 1 adds u256 type and adds necessary non-atomic accessors. Also
> adds byteorder conversion APIs.
>
> Patch 2 adds 256-bit read and write to x86 via VMOVDQU AVX CPU
> instructions.
>
> Patch 3 updates cxgb4 driver to use the readqq API to speed up
> reading on-chip memory 256-bits at a time.

Calling kernel_fpu_begin() is likely to be slow.
I doubt you want to do it every time around a loop of accesses.

In principle it ought to be possible to get access to one or two
(eg) AVX registers by saving them to stack and telling the fpu
save code where you've put them.
The IPI fp save code could then copy the saved values over
the current values if asked to save the fp state for a process.
This should be reasonably cheap - especially if there isn't an
fp save IPI.

OTOH, for x86, if the code always runs in process context (eg from a
system call) then, since the ABI defines the AVX(2) registers as
caller-saved, it is only necessary to ensure that the current
FPU registers belong to the current process once.
The registers can be set to zero by an 'invalidate' instruction on
system call entry (hope this is done) and after use.

David


2018-03-19 15:07:24

by Thomas Gleixner

Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Mon, 19 Mar 2018, David Laight wrote:
> From: Rahul Lakkireddy
> In principle it ought to be possible to get access to one or two
> (eg) AVX registers by saving them to stack and telling the fpu
> save code where you've put them.

No. We have functions for this and we are not adding new ad hoc magic.

> OTOH, for x86, if the code always runs in process context (eg from a
> system call) then, since the ABI defines them all as caller-saved
> the AVX(2) registers, it is only necessary to ensure that the current
> FPU registers belong to the current process once.
> The registers can be set to zero by an 'invalidate' instruction on
> system call entry (hope this is done) and after use.

Why would a system call touch the FPU registers? The kernel normally does
not use FPU instructions and the code which explicitly does has to take
care of save/restore. It would be performance madness to fiddle with the
FPU stuff unconditionally if nothing uses it.

Thanks,

tglx

2018-03-19 15:19:50

by David Laight

Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

From: Thomas Gleixner
> Sent: 19 March 2018 15:05
>
> On Mon, 19 Mar 2018, David Laight wrote:
> > From: Rahul Lakkireddy
> > In principle it ought to be possible to get access to one or two
> > (eg) AVX registers by saving them to stack and telling the fpu
> > save code where you've put them.
>
> No. We have functions for this and we are not adding new ad hoc magic.

I was thinking that a real API might do this...
Useful also for code that needs AVX-like registers to do things like CRCs.

> > OTOH, for x86, if the code always runs in process context (eg from a
> > system call) then, since the ABI defines them all as caller-saved
> > the AVX(2) registers, it is only necessary to ensure that the current
> > FPU registers belong to the current process once.
> > The registers can be set to zero by an 'invalidate' instruction on
> > system call entry (hope this is done) and after use.
>
> Why would a system call touch the FPU registers? The kernel normally does
> not use FPU instructions and the code which explicitely does has to take
> care of save/restore. It would be performance madness to fiddle with the
> FPU stuff unconditionally if nothing uses it.

If system call entry reset the AVX registers then any FP save/restore
would be faster because the AVX registers wouldn't need to be saved
(and the cpu won't save them).
I believe the instruction to reset the AVX registers is fast.
The AVX registers only ever need saving if the process enters the
kernel through an interrupt.

David


2018-03-19 15:28:49

by Christoph Hellwig

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Mon, Mar 19, 2018 at 07:50:33PM +0530, Rahul Lakkireddy wrote:
> This series of patches add support for 256-bit IO read and write.
> The APIs are readqq and writeqq (quad quadword - 4 x 64), that read
> and write 256-bits at a time from IO, respectively.

What a horrible name. Please encode the actual number of bits instead.

2018-03-19 15:38:51

by Thomas Gleixner

Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Mon, 19 Mar 2018, David Laight wrote:
> From: Thomas Gleixner
> > Sent: 19 March 2018 15:05
> >
> > On Mon, 19 Mar 2018, David Laight wrote:
> > > From: Rahul Lakkireddy
> > > In principle it ought to be possible to get access to one or two
> > > (eg) AVX registers by saving them to stack and telling the fpu
> > > save code where you've put them.
> >
> > No. We have functions for this and we are not adding new ad hoc magic.
>
> I was thinking that a real API might do this...

We have a real API and that's good enough for the stuff we have using AVX
in the kernel.

> Useful also for code that needs AVX-like registers to do things like CRCs.

x86/crypto/ has a lot of AVX optimized code.

> > > OTOH, for x86, if the code always runs in process context (eg from a
> > > system call) then, since the ABI defines them all as caller-saved
> > > the AVX(2) registers, it is only necessary to ensure that the current
> > > FPU registers belong to the current process once.
> > > The registers can be set to zero by an 'invalidate' instruction on
> > > system call entry (hope this is done) and after use.
> >
> > Why would a system call touch the FPU registers? The kernel normally does
> > not use FPU instructions and the code which explicitely does has to take
> > care of save/restore. It would be performance madness to fiddle with the
> > FPU stuff unconditionally if nothing uses it.
>
> If system call entry reset the AVX registers then any FP save/restore
> would be faster because the AVX registers wouldn't need to be saved
> (and the cpu won't save them).
> I believe the instruction to reset the AVX registers is fast.
> The AVX registers only ever need saving if the process enters the
> kernel through an interrupt.

Wrong. The x86-64 ABI clearly states:

Linux Kernel code is not allowed to change the x87 and SSE units. If
those are changed by kernel code, they have to be restored properly
before sleeping or leaving the kernel.

That means the syscall interface relies on FPU state being not changed by
the kernel. So if you want to clear AVX on syscall entry you need to save
it first and then restore before returning. That would be a huge
performance hit.

Thanks,

tglx

2018-03-19 15:55:24

by David Laight

Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

From: Thomas Gleixner
> Sent: 19 March 2018 15:37
...
> > If system call entry reset the AVX registers then any FP save/restore
> > would be faster because the AVX registers wouldn't need to be saved
> > (and the cpu won't save them).
> > I believe the instruction to reset the AVX registers is fast.
> > The AVX registers only ever need saving if the process enters the
> > kernel through an interrupt.
>
> Wrong. The x8664 ABI clearly states:
>
> Linux Kernel code is not allowed to change the x87 and SSE units. If
> those are changed by kernel code, they have to be restored properly
> before sleeping or leaving the kernel.
>
> That means the syscall interface relies on FPU state being not changed by
> the kernel. So if you want to clear AVX on syscall entry you need to save
> it first and then restore before returning. That would be a huge
> performance hit.

The x87 and SSE registers can't be changed - they can contain callee-saved
registers.
But (IIRC) the AVX and AVX2 registers are all caller-saved.
So the system call entry stub functions are allowed to change them.
Which means that the syscall entry code can also change them.
Of course it must not leak kernel values back to userspace.

It is a few years since I looked at the AVX and fpu save code.

David

2018-03-19 16:31:38

by Linus Torvalds

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Mon, Mar 19, 2018 at 8:53 AM, David Laight <[email protected]> wrote:
>
> The x87 and SSE registers can't be changed - they can contain callee-saved
> registers.
> But (IIRC) the AVX and AVX2 registers are all caller-saved.

No.

The kernel entry is not the usual function call.

On kernel entry, *all* registers are callee-saved.

Of course, some may be return values, and I have a slight hope that I
can trash %eflags. But basically, a system call is simply not a
function call. We have different calling conventions on the argument
side too. In fact, the arguments are in different registers depending
on just *which* system call you take.

Linus

2018-03-20 08:28:27

by Ingo Molnar

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access


* Thomas Gleixner <[email protected]> wrote:

> > Useful also for code that needs AVX-like registers to do things like CRCs.
>
> x86/crypto/ has a lot of AVX optimized code.

Yeah, that's true, but the crypto code is processing fundamentally bigger blocks
of data, which amortizes the cost of using kernel_fpu_begin()/_end().

kernel_fpu_begin()/_end() is a pretty heavy operation because it does a full FPU
save/restore via the XSAVE[S] and XRSTOR[S] instructions, which can easily copy a
thousand bytes around! So kernel_fpu_begin()/_end() is probably a non-starter for
something small, like a single 256-bit or 512-bit word access.

But there's actually a new thing in modern kernels: we got rid of (most of) lazy
save/restore FPU code, our new x86 FPU model is very "direct" with no FPU faults
taken normally.

So assuming the target driver will only load on modern FPUs I *think* it should
actually be possible to do something like (pseudocode):

vmovdqa %ymm0, 40(%rsp)
vmovdqa %ymm1, 80(%rsp)

...
# use ymm0 and ymm1
...

vmovdqa 80(%rsp), %ymm1
vmovdqa 40(%rsp), %ymm0

... without using the heavy XSAVE/XRSTOR instructions.

Note that preemption probably still needs to be disabled and possibly there are
other details as well, but there should be no 'heavy' FPU operations.

I think this should still preserve all user-space FPU state and shouldn't muck up
any 'weird' user-space FPU state (such as pending exceptions, legacy x87 running
code, NaN registers or weird FPU control word settings) we might have interrupted
either.

But I could be wrong, it should be checked whether this sequence is safe.
Worst-case we might have to save/restore the FPU control and tag words - but those
operations should still be much faster than a full XSAVE/XRSTOR pair.

So I do think we could do more in this area to improve driver performance, if the
code is correct and if there's actual benchmarks that are showing real benefits.

Thanks,

Ingo

2018-03-20 08:40:34

by Thomas Gleixner

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Tue, 20 Mar 2018, Ingo Molnar wrote:
> * Thomas Gleixner <[email protected]> wrote:
>
> > > Useful also for code that needs AVX-like registers to do things like CRCs.
> >
> > x86/crypto/ has a lot of AVX optimized code.
>
> Yeah, that's true, but the crypto code is processing fundamentally bigger blocks
> of data, which amortizes the cost of using kernel_fpu_begin()/_end().

Correct.

> So assuming the target driver will only load on modern FPUs I *think* it should
> actually be possible to do something like (pseudocode):
>
> vmovdqa %ymm0, 40(%rsp)
> vmovdqa %ymm1, 80(%rsp)
>
> ...
> # use ymm0 and ymm1
> ...
>
> vmovdqa 80(%rsp), %ymm1
> vmovdqa 40(%rsp), %ymm0
>
> ... without using the heavy XSAVE/XRSTOR instructions.
>
> Note that preemption probably still needs to be disabled and possibly there are
> other details as well, but there should be no 'heavy' FPU operations.

Emphasis on should :)

> I think this should still preserve all user-space FPU state and shouldn't muck up
> any 'weird' user-space FPU state (such as pending exceptions, legacy x87 running
> code, NaN registers or weird FPU control word settings) we might have interrupted
> either.
>
> But I could be wrong, it should be checked whether this sequence is safe.
> Worst-case we might have to save/restore the FPU control and tag words - but those
> operations should still be much faster than a full XSAVE/XRSTOR pair.

Fair enough.

> So I do think we could do more in this area to improve driver performance, if the
> code is correct and if there's actual benchmarks that are showing real benefits.

If it's about hotpath performance I'm all for it, but the use case here is
a debug facility...

And if we go down that road then we want an AVX-based memcpy()
implementation which is runtime conditional on the feature bit(s) and
length dependent. Just slapping a readqq() at it and using it in a loop does
not make any sense.

Thanks,

tglx

2018-03-20 09:09:31

by Ingo Molnar

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access


* Thomas Gleixner <[email protected]> wrote:

> > So I do think we could do more in this area to improve driver performance, if the
> > code is correct and if there's actual benchmarks that are showing real benefits.
>
> If it's about hotpath performance I'm all for it, but the use case here is
> a debug facility...
>
> And if we go down that road then we want a AVX based memcpy()
> implementation which is runtime conditional on the feature bit(s) and
> length dependent. Just slapping a readqq() at it and use it in a loop does
> not make any sense.

Yeah, so generic memcpy() replacement is only feasible I think if the most
optimistic implementation is actually correct:

- if no preempt disable()/enable() is required

- if direct access to the AVX[2] registers does not disturb legacy FPU state in
any fashion

- if direct access to the AVX[2] registers cannot raise weird exceptions or have
weird behavior if the FPU control word is modified to non-standard values by
untrusted user-space

If we have to touch the FPU tag or control words then it's probably only good for
a specialized API.

Thanks,

Ingo

2018-03-20 09:42:53

by Thomas Gleixner

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Tue, 20 Mar 2018, Ingo Molnar wrote:
> * Thomas Gleixner <[email protected]> wrote:
>
> > > So I do think we could do more in this area to improve driver performance, if the
> > > code is correct and if there's actual benchmarks that are showing real benefits.
> >
> > If it's about hotpath performance I'm all for it, but the use case here is
> > a debug facility...
> >
> > And if we go down that road then we want a AVX based memcpy()
> > implementation which is runtime conditional on the feature bit(s) and
> > length dependent. Just slapping a readqq() at it and use it in a loop does
> > not make any sense.
>
> Yeah, so generic memcpy() replacement is only feasible I think if the most
> optimistic implementation is actually correct:
>
> - if no preempt disable()/enable() is required
>
> - if direct access to the AVX[2] registers does not disturb legacy FPU state in
> any fashion
>
> - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> weird behavior if the FPU control word is modified to non-standard values by
> untrusted user-space
>
> If we have to touch the FPU tag or control words then it's probably only good for
> a specialized API.

I did not mean to have a general memcpy replacement. Rather something like
magic_memcpy() which falls back to memcpy when AVX is not usable or the
length does not justify the AVX stuff at all.
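
As a rough sketch of that shape (illustrative only: magic_memcpy() is
just the placeholder name from this mail, the 256-byte cut-over is an
arbitrary assumption, and the AVX path simply reuses the vmovdqu
pattern from patch 2):

#define AVX_COPY_MIN_LEN        256     /* assumed cut-over, would need benchmarking */

static void *magic_memcpy(void *dst, const void *src, size_t len)
{
        size_t i, chunks = len / 32;

        if (!boot_cpu_has(X86_FEATURE_AVX) || len < AVX_COPY_MIN_LEN)
                return memcpy(dst, src, len);

        kernel_fpu_begin();
        for (i = 0; i < chunks; i++) {
                /* copy one 32-byte chunk per iteration via ymm0 */
                asm volatile("vmovdqu %0, %%ymm0"
                             : : "m" (*((const u256 *)src + i)));
                asm volatile("vmovdqu %%ymm0, %0"
                             : "=m" (*((u256 *)dst + i)));
        }
        kernel_fpu_end();

        /* tail of up to 31 bytes */
        memcpy((char *)dst + chunks * 32, (const char *)src + chunks * 32, len & 31);
        return dst;
}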

Thanks,

tglx

2018-03-20 10:02:28

by David Laight

Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

From: Thomas Gleixner
> Sent: 20 March 2018 09:41
> On Tue, 20 Mar 2018, Ingo Molnar wrote:
> > * Thomas Gleixner <[email protected]> wrote:
...
> > > And if we go down that road then we want a AVX based memcpy()
> > > implementation which is runtime conditional on the feature bit(s) and
> > > length dependent. Just slapping a readqq() at it and use it in a loop does
> > > not make any sense.
> >
> > Yeah, so generic memcpy() replacement is only feasible I think if the most
> > optimistic implementation is actually correct:
> >
> > - if no preempt disable()/enable() is required
> >
> > - if direct access to the AVX[2] registers does not disturb legacy FPU state in
> > any fashion
> >
> > - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> > weird behavior if the FPU control word is modified to non-standard values by
> > untrusted user-space
> >
> > If we have to touch the FPU tag or control words then it's probably only good for
> > a specialized API.
>
> I did not mean to have a general memcpy replacement. Rather something like
> magic_memcpy() which falls back to memcpy when AVX is not usable or the
> length does not justify the AVX stuff at all.

There is probably no point for memcpy().

Where it would make a big difference is memcpy_fromio() for PCIe devices
(where longer TLP make a big difference).
But any code belongs in its implementation not in every driver.
The implementation of memcpy_toio() is nothing like as critical.

It might be that the code would need to fall back to 64bit accesses
if the AVX(2) registers can't currently be accessed - maybe some
obscure state....

However memcpy_to/fromio() are both horrid at the moment because
they result in byte copies!
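
For reference, even a simple 64-bit-at-a-time variant would already
avoid the byte copies (a sketch only; the name is made up and the
destination is assumed to be naturally aligned):

static void memcpy_fromio_64(void *dst, const volatile void __iomem *src,
                             size_t count)
{
        u64 *d = dst;

        while (count >= 8) {
                *d++ = readq(src);
                src += 8;
                count -= 8;
        }
        if (count)
                memcpy_fromio(d, src, count);   /* byte-wise tail */
}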

David



2018-03-20 10:56:17

by Ingo Molnar

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access


* Thomas Gleixner <[email protected]> wrote:

> On Tue, 20 Mar 2018, Ingo Molnar wrote:
> > * Thomas Gleixner <[email protected]> wrote:
> >
> > > > So I do think we could do more in this area to improve driver performance, if the
> > > > code is correct and if there's actual benchmarks that are showing real benefits.
> > >
> > > If it's about hotpath performance I'm all for it, but the use case here is
> > > a debug facility...
> > >
> > > And if we go down that road then we want a AVX based memcpy()
> > > implementation which is runtime conditional on the feature bit(s) and
> > > length dependent. Just slapping a readqq() at it and use it in a loop does
> > > not make any sense.
> >
> > Yeah, so generic memcpy() replacement is only feasible I think if the most
> > optimistic implementation is actually correct:
> >
> > - if no preempt disable()/enable() is required
> >
> > - if direct access to the AVX[2] registers does not disturb legacy FPU state in
> > any fashion
> >
> > - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> > weird behavior if the FPU control word is modified to non-standard values by
> > untrusted user-space
> >
> > If we have to touch the FPU tag or control words then it's probably only good for
> > a specialized API.
>
> I did not mean to have a general memcpy replacement. Rather something like
> magic_memcpy() which falls back to memcpy when AVX is not usable or the
> length does not justify the AVX stuff at all.

OK, fair enough.

Note that a generic version might still be worth trying out, if and only if it's
safe to access those vector registers directly: modern x86 CPUs will do their
non-constant memcpy()s via the common memcpy_erms() function - which could in
theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if
size (and alignment, perhaps) is a multiple of 32 bytes or so.

Assuming it's correct with arbitrary user-space FPU state and if it results in any
measurable speedups, which might not be the case: ERMS is supposed to be very
fast.

So even if it's possible (which it might not be), it could end up being slower
than the ERMS version.

Thanks,

Ingo

2018-03-20 13:34:01

by David Laight

Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

From: Ingo Molnar
> Sent: 20 March 2018 10:54
...
> Note that a generic version might still be worth trying out, if and only if it's
> safe to access those vector registers directly: modern x86 CPUs will do their
> non-constant memcpy()s via the common memcpy_erms() function - which could in
> theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if
> size (and alignment, perhaps) is a multiple of 32 bytes or so.
>
> Assuming it's correct with arbitrary user-space FPU state and if it results in any
> measurable speedups, which might not be the case: ERMS is supposed to be very
> fast.
>
> So even if it's possible (which it might not be), it could end up being slower
> than the ERMS version.

Last I checked memcpy() was implemented as 'rep movsb' on the latest Intel cpus.
Since memcpy_to/fromio() get aliased to memcpy() this generates byte copies.
The previous 'fastest' version of memcpy() was ok for uncached locations.

For PCIe I suspect that the actual instructions don't make a massive difference.
I'm not even sure interleaving two transfers makes any difference.
What makes a huge difference for memcpy_fromio() is the size of the register:
the time taken for a single read is largely independent of the width of the
register used, so wider registers mean fewer reads for the same data.

David


2018-03-20 13:35:58

by Rahul Lakkireddy

Subject: Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:
>
> > Use VMOVDQU AVX CPU instruction when available to do 256-bit
> > IO read and write.
>
> That's not what the patch does. See below.
>
> > Signed-off-by: Rahul Lakkireddy <[email protected]>
> > Signed-off-by: Ganesh Goudar <[email protected]>
>
> That Signed-off-by chain is wrong....
>
> > +#ifdef CONFIG_AS_AVX
> > +#include <asm/fpu/api.h>
> > +
> > +static inline u256 __readqq(const volatile void __iomem *addr)
> > +{
> > + u256 ret;
> > +
> > + kernel_fpu_begin();
> > + asm volatile("vmovdqu %0, %%ymm0" :
> > + : "m" (*(volatile u256 __force *)addr));
> > + asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
> > + kernel_fpu_end();
> > + return ret;
>
> You _cannot_ assume that the instruction is available just because
> CONFIG_AS_AVX is set. The availability is determined by the runtime
> evaluated CPU feature flags, i.e. X86_FEATURE_AVX.
>

Ok. Will add boot_cpu_has(X86_FEATURE_AVX) check as well.
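
Roughly along these lines (sketch only; __readqq_avx() is a
hypothetical name for the AVX helper, and lo_hi_readqq() is the plain
4 x 64-bit fallback from patch 1):

static inline u256 readqq(const volatile void __iomem *addr)
{
        if (!boot_cpu_has(X86_FEATURE_AVX))
                return lo_hi_readqq(addr);      /* 4 x 64-bit reads */

        return __readqq_avx(addr);              /* AVX path, kernel_fpu_begin()/end() inside */
}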

> Aside of that I very much doubt that this is faster than 4 consecutive
> 64bit reads/writes as you have the full overhead of
> kernel_fpu_begin()/end() for each access.
>
> You did not provide any numbers for this so its even harder to
> determine.
>

Sorry about that. Here are the numbers with and without this series.

When reading up to 2 GB on-chip memory via MMIO, the time taken:

Without Series With Series
(64-bit read) (256-bit read)

52 seconds 26 seconds

As can be seen, we see good improvement with doing 256-bits at a
time.

> As far as I can tell the code where you are using this is a debug
> facility. What's the point? Debug is hardly a performance critical problem.
>

On a High Availability Server, the logs of the failing system must be
collected as quickly as possible. So, we're concerned with the amount
of time taken to collect our large on-chip memory. We see improvement
in doing 256-bit reads at a time.

Thanks,
Rahul

2018-03-20 13:45:52

by Andy Shevchenko

Subject: Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

On Tue, Mar 20, 2018 at 3:32 PM, Rahul Lakkireddy
<[email protected]> wrote:
> On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
>> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:

>> Aside of that I very much doubt that this is faster than 4 consecutive
>> 64bit reads/writes as you have the full overhead of
>> kernel_fpu_begin()/end() for each access.
>>
>> You did not provide any numbers for this so its even harder to
>> determine.
>>
>
> Sorry about that. Here are the numbers with and without this series.
>
> When reading up to 2 GB on-chip memory via MMIO, the time taken:
>
> Without Series With Series
> (64-bit read) (256-bit read)
>
> 52 seconds 26 seconds
>
> As can be seen, we see good improvement with doing 256-bits at a
> time.

But this is kind of a synthetic test, right?
If you run a normal use case where the kernel is not only collecting logs,
but doing something else, especially with frequent userspace
interaction, would the trend be the same?

--
With Best Regards,
Andy Shevchenko

2018-03-20 13:47:54

by Rahul Lakkireddy

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Monday, March 03/19/18, 2018 at 20:57:22 +0530, Christoph Hellwig wrote:
> On Mon, Mar 19, 2018 at 07:50:33PM +0530, Rahul Lakkireddy wrote:
> > This series of patches add support for 256-bit IO read and write.
> > The APIs are readqq and writeqq (quad quadword - 4 x 64), that read
> > and write 256-bits at a time from IO, respectively.
>
> What a horrible name. please encode the actual number of bits instead.

Ok. Will change it to read256() and write256().

Thanks,
Rahul

2018-03-20 14:40:48

by David Laight

Subject: RE: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

From: Rahul Lakkireddy
> Sent: 20 March 2018 13:32
...
> On High Availability Server, the logs of the failing system must be
> collected as quickly as possible. So, we're concerned with the amount
> of time taken to collect our large on-chip memory. We see improvement
> in doing 256-bit reads at a time.

Two other options:

1) Get the device to DMA into host memory.

2) Use mmap() (and vm_iomap_memory() in your driver) to get direct
userspace access to the (I assume) PCIe memory space.
You can then use whatever copy instructions the cpu has.
(Just don't use memcpy().)
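
A sketch of what option 2 might look like in the driver (illustrative;
the fop name and the mem_phys_base/mem_len fields are made up, but
vm_iomap_memory() is the existing helper):

static int chip_mem_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct adapter *adap = file->private_data;

        /* map the on-chip memory BAR uncached into userspace */
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        return vm_iomap_memory(vma, adap->mem_phys_base, adap->mem_len);
}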

David


2018-03-20 14:43:32

by Alexander Duyck

Subject: Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

On Tue, Mar 20, 2018 at 6:32 AM, Rahul Lakkireddy
<[email protected]> wrote:
> On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
>> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:
>>
>> > Use VMOVDQU AVX CPU instruction when available to do 256-bit
>> > IO read and write.
>>
>> That's not what the patch does. See below.
>>
>> > Signed-off-by: Rahul Lakkireddy <[email protected]>
>> > Signed-off-by: Ganesh Goudar <[email protected]>
>>
>> That Signed-off-by chain is wrong....
>>
>> > +#ifdef CONFIG_AS_AVX
>> > +#include <asm/fpu/api.h>
>> > +
>> > +static inline u256 __readqq(const volatile void __iomem *addr)
>> > +{
>> > + u256 ret;
>> > +
>> > + kernel_fpu_begin();
>> > + asm volatile("vmovdqu %0, %%ymm0" :
>> > + : "m" (*(volatile u256 __force *)addr));
>> > + asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
>> > + kernel_fpu_end();
>> > + return ret;
>>
>> You _cannot_ assume that the instruction is available just because
>> CONFIG_AS_AVX is set. The availability is determined by the runtime
>> evaluated CPU feature flags, i.e. X86_FEATURE_AVX.
>>
>
> Ok. Will add boot_cpu_has(X86_FEATURE_AVX) check as well.
>
>> Aside of that I very much doubt that this is faster than 4 consecutive
>> 64bit reads/writes as you have the full overhead of
>> kernel_fpu_begin()/end() for each access.
>>
>> You did not provide any numbers for this so its even harder to
>> determine.
>>
>
> Sorry about that. Here are the numbers with and without this series.
>
> When reading up to 2 GB on-chip memory via MMIO, the time taken:
>
> Without Series With Series
> (64-bit read) (256-bit read)
>
> 52 seconds 26 seconds
>
> As can be seen, we see good improvement with doing 256-bits at a
> time.

Instead of framing this as an enhanced version of the read/write ops
why not look at replacing or extending something like the
memcpy_fromio or memcpy_toio operations? It would probably be more
comparable to what you are doing if you are wanting to move large
chunks of memory from one region to another, and it should translate
into something like AVX instructions once the CPU optimizations kick
in for a memcpy.
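
For comparison, the cudbg read loop rewritten around memcpy_fromio()
might look roughly like this (illustrative; window advancing and the
byte-order conversion of the original loop are elided, and
mem_aperture is assumed to hold the window size):

        u8 *dst = hbuf;

        while (len > 0) {
                u32 chunk = min_t(u32, len, mem_aperture);

                memcpy_fromio(dst, adap->regs + mem_base + offset, chunk);
                dst += chunk;
                offset += chunk;
                len -= chunk;
                /* move the PCI-E memory window here, as in the original loop */
        }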

- Alex

2018-03-20 14:59:34

by Andy Lutomirski

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Tue, Mar 20, 2018 at 8:26 AM, Ingo Molnar <[email protected]> wrote:
>
> * Thomas Gleixner <[email protected]> wrote:
>
>> > Useful also for code that needs AVX-like registers to do things like CRCs.
>>
>> x86/crypto/ has a lot of AVX optimized code.
>
> Yeah, that's true, but the crypto code is processing fundamentally bigger blocks
> of data, which amortizes the cost of using kernel_fpu_begin()/_end().
>
> kernel_fpu_begin()/_end() is a pretty heavy operation because it does a full FPU
> save/restore via the XSAVE[S] and XRSTOR[S] instructions, which can easily copy a
> thousand bytes around! So kernel_fpu_begin()/_end() is probably a non-starter for
> something small, like a single 256-bit or 512-bit word access.
>
> But there's actually a new thing in modern kernels: we got rid of (most of) lazy
> save/restore FPU code, our new x86 FPU model is very "direct" with no FPU faults
> taken normally.
>
> So assuming the target driver will only load on modern FPUs I *think* it should
> actually be possible to do something like (pseudocode):
>
> vmovdqa %ymm0, 40(%rsp)
> vmovdqa %ymm1, 80(%rsp)
>
> ...
> # use ymm0 and ymm1
> ...
>
> vmovdqa 80(%rsp), %ymm1
> vmovdqa 40(%rsp), %ymm0
>

I think this kinda sorta works, but only kinda sorta:

- I'm a bit worried about newer CPUs where, say, a 256-bit vector
operation will implicitly clear the high 768 bits of the regs. (IIRC
that's how it works for the new VEX stuff.)

- This code will cause XINUSE to be set, which is user-visible and
may have performance and correctness effects. I think the kernel may
already have some bugs where it occasionally sets XINUSE on its own,
and this caused problems for rr in the past. This might not be a
total showstopper, but it's odd.

I'd rather see us finally finish the work that Rik started to rework
this differently. I'd like kernel_fpu_begin() to look like:

if (test_thread_flag(TIF_NEED_FPU_RESTORE)) {
return; // we're already okay. maybe we need to check
in_interrupt() or something, though?
} else {
XSAVES/XSAVEOPT/XSAVE;
set_thread_flag(TIF_NEED_FPU_RESTORE):
}

and kernel_fpu_end() does nothing at all.

We take the full performance hit for a *single* kernel_fpu_begin() on
an otherwise short syscall or interrupt, but there's no additional
cost for more of them or for long-enough-running things that we
schedule in the middle.

As I remember, the main hangup was that this interacts a bit oddly
with PKRU, but that's manageable.

--Andy

2018-03-20 15:11:45

by David Laight

Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

From: Andy Lutomirski
> Sent: 20 March 2018 14:57
...
> I'd rather see us finally finish the work that Rik started to rework
> this differently. I'd like kernel_fpu_begin() to look like:
>
> if (test_thread_flag(TIF_NEED_FPU_RESTORE)) {
> return; // we're already okay. maybe we need to check
> in_interrupt() or something, though?
> } else {
> XSAVES/XSAVEOPT/XSAVE;
> set_thread_flag(TIF_NEED_FPU_RESTORE):
> }
>
> and kernel_fpu_end() does nothing at all.

I guess it might need to set (clear?) the CFLAGS bit for a process
that isn't using the fpu at all - which seems a sensible feature.

> We take the full performance hit for a *single* kernel_fpu_begin() on
> an otherwise short syscall or interrupt, but there's no additional
> cost for more of them or for long-enough-running things that we
> schedule in the middle.

It might be worth adding a parameter to kernel_fpu_begin() to indicate
which registers are needed, and a return value to say what has been
granted.
Then a driver could request AVX2 (for example) and use a fallback path
if the register set isn't available (for any reason).
A call from an ISR could always fail.
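
Shape-wise, something like this hypothetical API (purely illustrative;
neither kernel_fpu_begin_features() nor the KFPU_* flags exist today):

        /* returns the subset of the requested register sets actually granted */
        unsigned int granted = kernel_fpu_begin_features(KFPU_AVX2);

        if (granted & KFPU_AVX2)
                copy_with_avx2(dst, src, len);  /* hypothetical fast path */
        else
                copy_with_u64(dst, src, len);   /* hypothetical fallback */
        kernel_fpu_end();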

> As I remember, the main hangup was that this interacts a bit oddly
> with PKRU, but that's manageable.

WTF PKRU ??

David

2018-03-20 18:02:59

by Linus Torvalds

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Tue, Mar 20, 2018 at 1:26 AM, Ingo Molnar <[email protected]> wrote:
>
> So assuming the target driver will only load on modern FPUs I *think* it should
> actually be possible to do something like (pseudocode):
>
> vmovdqa %ymm0, 40(%rsp)
> vmovdqa %ymm1, 80(%rsp)
>
> ...
> # use ymm0 and ymm1
> ...
>
> vmovdqa 80(%rsp), %ymm1
> vmovdqa 40(%rsp), %ymm0
>
> ... without using the heavy XSAVE/XRSTOR instructions.

No. The above is buggy. It may *work*, but it won't work in the long run.

Pretty much every single vector extension has traditionally meant that
touching "old" registers breaks any new register use. Even if you save
the registers carefully like in your example code, it will do magic
and random things to the "future extended" version.

So I absolutely *refuse* to have anything to do with the vector unit.
You can only touch it in the kernel if you own it entirely (ie that
"kernel_fpu_begin()/_end()" thing). Anything else is absolutely
guaranteed to cause problems down the line.

And even if you ignore that "maintenance problems down the line" issue
("we can fix them when they happen") I don't want to see games like
this, because I'm pretty sure it breaks the optimized xsave by tagging
the state as being dirty.

So no. Don't use vector stuff in the kernel. It's not worth the pain.

The *only* valid use is pretty much crypto, and even there it has had
issues. Benchmarks use big arrays and/or dense working sets etc to
"prove" how good the vector version is, and then you end up in
situations where it's used once per fairly small packet for an
interrupt, and it's actually much worse than doing it by hand.

Linus

2018-03-21 00:41:44

by Andy Lutomirski

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Tue, Mar 20, 2018 at 3:10 PM, David Laight <[email protected]> wrote:
> From: Andy Lutomirski
>> Sent: 20 March 2018 14:57
> ...
>> I'd rather see us finally finish the work that Rik started to rework
>> this differently. I'd like kernel_fpu_begin() to look like:
>>
>> if (test_thread_flag(TIF_NEED_FPU_RESTORE)) {
>> return; // we're already okay. maybe we need to check
>> in_interrupt() or something, though?
>> } else {
>> XSAVES/XSAVEOPT/XSAVE;
>> set_thread_flag(TIF_NEED_FPU_RESTORE):
>> }
>>
>> and kernel_fpu_end() does nothing at all.
>
> I guess it might need to set (clear?) the CFLAGS bit for a process
> that isn't using the fpu at all - which seems a sensible feature.

What do you mean "CFLAGS"?

But we no longer have any concept of "isn't using the fpu at all" --
we got rid of that.

>
>> We take the full performance hit for a *single* kernel_fpu_begin() on
>> an otherwise short syscall or interrupt, but there's no additional
>> cost for more of them or for long-enough-running things that we
>> schedule in the middle.
>
> It might be worth adding a parameter to kernel_fpu_begin() to indicate
> which registers are needed, and a return value to say what has been
> granted.
> Then a driver could request AVX2 (for example) and use a fallback path
> if the register set isn't available (for any reason).
> A call from an ISR could always fail.

Last time I benchmarked it, XSAVEC on none of the state wasn't a whole
lot faster than XSAVEC for all of it.

>
>> As I remember, the main hangup was that this interacts a bit oddly
>> with PKRU, but that's manageable.
>
> WTF PKRU ??

PKRU is uniquely demented. All the rest of the XSAVEC state only
affects code that explicitly references that state. PKRU affects
every single access to user pages, so we need PKRU to match the
current task at all times in the kernel. This means that, if we start
deferring XRSTORS until prepare_exit_to_usermode(), we need to start
restoring PKRU using WRPKRU in __switch_to(). Of course, *that*
interacts a bit oddly with XINUSE, but maybe we don't care.

Phooey on you, Intel, for putting PKRU into xstate and not giving a
fast instruction to control XINUSE.

2018-03-21 06:36:21

by Ingo Molnar

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access


* Linus Torvalds <[email protected]> wrote:

> And even if you ignore that "maintenance problems down the line" issue
> ("we can fix them when they happen") I don't want to see games like
> this, because I'm pretty sure it breaks the optimized xsave by tagging
> the state as being dirty.

That's true - and it would penalize the context switch cost of the affected task
for the rest of its lifetime, as I don't think there's much that clears XINUSE
other than a FINIT, which is rarely done by user-space.

> So no. Don't use vector stuff in the kernel. It's not worth the pain.

I agree, but:

> The *only* valid use is pretty much crypto, and even there it has had issues.
> Benchmarks use big arrays and/or dense working sets etc to "prove" how good the
> vector version is, and then you end up in situations where it's used once per
> fairly small packet for an interrupt, and it's actually much worse than doing it
> by hand.

That's mainly because the XSAVE/XRSTOR done by kernel_fpu_begin()/end() is so
expensive, so this argument is somewhat circular.

IFF it was safe to just use the vector unit then vector unit based crypto would be
very fast for small buffers as well, and would be even faster for larger buffer
sizes. Saving and restoring up to ~1.5K of context is not cheap.

Thanks,

Ingo

2018-03-21 07:48:49

by Ingo Molnar

Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access


So I poked around a bit and I'm having second thoughts:

* Linus Torvalds <[email protected]> wrote:

> On Tue, Mar 20, 2018 at 1:26 AM, Ingo Molnar <[email protected]> wrote:
> >
> > So assuming the target driver will only load on modern FPUs I *think* it should
> > actually be possible to do something like (pseudocode):
> >
> > vmovdqa %ymm0, 40(%rsp)
> > vmovdqa %ymm1, 80(%rsp)
> >
> > ...
> > # use ymm0 and ymm1
> > ...
> >
> > vmovdqa 80(%rsp), %ymm1
> > vmovdqa 40(%rsp), %ymm0
> >
> > ... without using the heavy XSAVE/XRSTOR instructions.
>
> No. The above is buggy. It may *work*, but it won't work in the long run.
>
> Pretty much every single vector extension has traditionally meant that
> touching "old" registers breaks any new register use. Even if you save
> the registers carefully like in your example code, it will do magic
> and random things to the "future extended" version.

This should be relatively straightforward to solve via a proper CPU features
check: for example by only patching in the AVX routines for 'known compatible'
fpu_kernel_xstate_size values. Future extensions of register width will extend
the XSAVE area.

It's not fool-proof: in theory there could be semantic extensions to the vector
unit that do not increase the size of the context - but the normal pattern is to
increase the number of XINUSE bits and bump up the maximum context area size.

If that's a worry then an even safer compatibility check would be to explicitly
list CPU models - we do track them pretty accurately anyway these days, mostly due
to perf PMU support defaulting to a safe but dumb variant if a CPU model is not
specifically listed.

That method, although more maintenance-intense, should be pretty fool-proof
AFAICS.

> So I absolutely *refuse* to have anything to do with the vector unit.
> You can only touch it in the kernel if you own it entirely (ie that
> "kernel_fpu_begin()/_end()" thing). Anything else is absolutely
> guaranteed to cause problems down the line.
>
> And even if you ignore that "maintenance problems down the line" issue
> ("we can fix them when they happen") I don't want to see games like
> this, because I'm pretty sure it breaks the optimized xsave by tagging
> the state as being dirty.

So I added a bit of instrumentation and the current state of things is that on
64-bit x86 every single task has an initialized FPU, every task has the exact
same, fully filled in xfeatures (XINUSE) value:

[root@galatea ~]# grep -h fpu /proc/*/task/*/fpu | sort | uniq -c
504 x86/fpu: initialized : 1
504 x86/fpu: xfeatures_mask : 7

So our latest FPU model is *really* simple and user-space should not be able to
observe any changes in the XINUSE bits of the XSAVE header, because (at least for
the basic vector CPU features) all bits are maxed out all the time.

Note that this is with an AVX (128-bit) supporting CPU:

[ 0.000000] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
[ 0.000000] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[ 0.000000] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
[ 0.000000] x86/fpu: xstate_offset[2]: 576, xstate_sizes[2]: 256
[ 0.000000] x86/fpu: Enabled xstate features 0x7, context size is 832 bytes, using 'standard' format.

But note that it probably wouldn't make sense to make use of XINUSE optimizations
on most systems for the AVX space, as glibc will use the highest-bitness vector
ops for its regular memcpy(), and every user task makes use of memcpy.

It does make sense for some of the more optional XSAVE based features such as
pkeys. But I don't have any newer Intel system with a wider xsave feature set to
check.

> So no. Don't use vector stuff in the kernel. It's not worth the pain.

That might still be true, but still I'm torn:

- Broad areas of user-space have seamlessly integrated vector ops and use
them whenever they can find an excuse to.

- The vector registers are fundamentally callee-saved, so in most synchronous
calls the vector unit registers are unused. Asynchronous interruptions of
context (interrupts, faults, preemption, etc.) can still use them as well, as
long as they save/restore register contents.

So other than Intel not making it particularly easy to make a forwards compatible
vector register granular save/restore pattern (but see above for how we could
handle that) for asynchronous contexts, I don't see too many other complications.

Thanks,

Ingo

2018-03-21 12:31:34

by Rahul Lakkireddy

Subject: Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

On Tuesday, March 03/20/18, 2018 at 20:12:15 +0530, Alexander Duyck wrote:
> On Tue, Mar 20, 2018 at 6:32 AM, Rahul Lakkireddy
> <[email protected]> wrote:
> > On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
> >> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:
> >>
> >> > Use VMOVDQU AVX CPU instruction when available to do 256-bit
> >> > IO read and write.
> >>
> >> That's not what the patch does. See below.
> >>
> >> > Signed-off-by: Rahul Lakkireddy <[email protected]>
> >> > Signed-off-by: Ganesh Goudar <[email protected]>
> >>
> >> That Signed-off-by chain is wrong....
> >>
> >> > +#ifdef CONFIG_AS_AVX
> >> > +#include <asm/fpu/api.h>
> >> > +
> >> > +static inline u256 __readqq(const volatile void __iomem *addr)
> >> > +{
> >> > + u256 ret;
> >> > +
> >> > + kernel_fpu_begin();
> >> > + asm volatile("vmovdqu %0, %%ymm0" :
> >> > + : "m" (*(volatile u256 __force *)addr));
> >> > + asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
> >> > + kernel_fpu_end();
> >> > + return ret;
> >>
> >> You _cannot_ assume that the instruction is available just because
> >> CONFIG_AS_AVX is set. The availability is determined by the runtime
> >> evaluated CPU feature flags, i.e. X86_FEATURE_AVX.
> >>
> >
> > Ok. Will add boot_cpu_has(X86_FEATURE_AVX) check as well.
> >
> >> Aside of that I very much doubt that this is faster than 4 consecutive
> >> 64bit reads/writes as you have the full overhead of
> >> kernel_fpu_begin()/end() for each access.
> >>
> >> You did not provide any numbers for this so its even harder to
> >> determine.
> >>
> >
> > Sorry about that. Here are the numbers with and without this series.
> >
> > When reading up to 2 GB on-chip memory via MMIO, the time taken:
> >
> > Without Series With Series
> > (64-bit read) (256-bit read)
> >
> > 52 seconds 26 seconds
> >
> > As can be seen, we see good improvement with doing 256-bits at a
> > time.
>
> Instead of framing this as an enhanced version of the read/write ops
> why not look at replacing or extending something like the
> memcpy_fromio or memcpy_toio operations? It would probably be more
> comparable to what you are doing if you are wanting to move large
> chunks of memory from one region to another, and it should translate
> into something like AVX instructions once the CPU optimizations kick
> in for a memcpy.
>

Ok. Will look into this approach.

Thanks,
Rahul

2018-03-21 12:31:46

by Rahul Lakkireddy

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

On Tuesday, March 03/20/18, 2018 at 19:14:46 +0530, Andy Shevchenko wrote:
> On Tue, Mar 20, 2018 at 3:32 PM, Rahul Lakkireddy
> <[email protected]> wrote:
> > On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
> >> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:
>
> >> Aside of that I very much doubt that this is faster than 4 consecutive
> >> 64bit reads/writes as you have the full overhead of
> >> kernel_fpu_begin()/end() for each access.
> >>
> >> You did not provide any numbers for this so its even harder to
> >> determine.
> >>
> >
> > Sorry about that. Here are the numbers with and without this series.
> >
> > When reading up to 2 GB on-chip memory via MMIO, the time taken:
> >
> > Without Series With Series
> > (64-bit read) (256-bit read)
> >
> > 52 seconds 26 seconds
> >
> > As can be seen, we see good improvement with doing 256-bits at a
> > time.
>
> But this is kind of a synthetic test, right?
> If you run a normal use case where the kernel is not only collecting logs,
> but also doing something else, especially with frequent userspace
> interaction, would the trend be the same?
>

We see the same improvement when collecting logs while running
heavy IO with iozone.

Thanks,
Rahul

2018-03-21 12:32:13

by Rahul Lakkireddy

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

On Tuesday, March 03/20/18, 2018 at 20:10:19 +0530, David Laight wrote:
> From: Rahul Lakkireddy
> > Sent: 20 March 2018 13:32
> ...
> > On High Availability Server, the logs of the failing system must be
> > collected as quickly as possible. So, we're concerned with the amount
> > of time taken to collect our large on-chip memory. We see improvement
> > in doing 256-bit reads at a time.
>
> Two other options:
>
> 1) Get the device to DMA into host memory.
>

Unfortunately, our device doesn't support doing DMA of on-chip memory.

> 2) Use mmap() (and vm_iomap_memory() in your driver) to get direct
> userspace access to the (I assume) PCIe memory space.
> You can then use whatever copy instructions the cpu has.
> (Just don't use memcpy().)
>

We also need to collect this in kernel space, i.e. from the crash recovery
kernel.

Thanks,
Rahul

2018-03-21 15:49:35

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Wed, Mar 21, 2018 at 6:32 AM, Ingo Molnar <[email protected]> wrote:
>
> * Linus Torvalds <[email protected]> wrote:
>
>> And even if you ignore that "maintenance problems down the line" issue
>> ("we can fix them when they happen") I don't want to see games like
>> this, because I'm pretty sure it breaks the optimized xsave by tagging
>> the state as being dirty.
>
> That's true - and it would penalize the context switch cost of the affected task
> for the rest of its lifetime, as I don't think there's much that clears XINUSE
> other than a FINIT, which is rarely done by user-space.
>
>> So no. Don't use vector stuff in the kernel. It's not worth the pain.
>
> I agree, but:
>
>> The *only* valid use is pretty much crypto, and even there it has had issues.
>> Benchmarks use big arrays and/or dense working sets etc to "prove" how good the
>> vector version is, and then you end up in situations where it's used once per
>> fairly small packet for an interrupt, and it's actually much worse than doing it
>> by hand.
>
> That's mainly because the XSAVE/XRESTOR done by kernel_fpu_begin()/end() is so
> expensive, so this argument is somewhat circular.

If we do the deferred restore, then the XSAVE/XRSTOR happens at most
once per kernel entry, which isn't so bad IMO. Also, with PTI, kernel
entries are already so slow that this will be mostly in the noise :(

--Andy

2018-03-21 18:17:01

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Wed, Mar 21, 2018 at 12:46 AM, Ingo Molnar <[email protected]> wrote:
>
> So I added a bit of instrumentation and the current state of things is that on
> 64-bit x86 every single task has an initialized FPU, every task has the exact
> same, fully filled in xfeatures (XINUSE) value:

Bah. Your CPU is apparently some old crud that barely has AVX. Of
course all those features are enabled.

> Note that this is with an AVX (128-bit) supporting CPU:

That's weak even by modern standards.

I have MPX bounds and MPX CSR on my old Skylake.

And the real worry is things like AVX-512 etc, which is exactly when
things like "save and restore one ymm register" will quite likely
clear the upper bits of the zmm register.

And yes, we can have some statically patched code that takes that into
account, and saves the whole zmm register when AVX512 is on, but the
whole *point* of the dynamic XSAVES thing is actually that Intel wants
to be able to enable new user-space features without having to wait for
OS support. Literally. That's why and how it was designed.

And saving a couple of zmm registers is actually pretty hard. They're
big. Do you want to allocate 128 bytes of stack space, preferably
64-byte aligned, for a save area? No. So now it needs to be some kind
of per-thread (or maybe per-CPU, if we're willing to continue to not
preempt) special save area too.

And even then, it doesn't solve the real worry of "maybe there will be
odd interactions with future extensions that we don't even know of".

All this to do a 32-byte PIO access, with absolutely zero data right
now on what the win is?

Yes, yes, I can find an Intel white-paper that talks about setting WC
and then using xmm and ymm instructions to write a single 64-byte
burst over PCIe, and I assume that is where the code and idea came
from. But I don't actually see any reason why a burst of 8 regular
quad-word writes wouldn't cause a 64-byte burst write too.

So right now this is for _one_ odd rdma controller, with absolutely
_zero_ performance numbers, and a very high likelihood that it does
not matter in the least.

And if there are some atomicity concerns ("we need to guarantee a
single atomic access for race conditions with the hardware"), they are
probably bogus and misplaced anyway, since

(a) we can't guarantee that AVX2 exists in the first place

(b) we can't guarantee that %ymm register write will show up on any
bus as a single write transaction anyway

So as far as I can tell, there are basically *zero* upsides, and a lot
of potential downsides.

Linus

2018-03-22 01:28:22

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

On Tue, Mar 20, 2018 at 7:42 AM, Alexander Duyck
<[email protected]> wrote:
>
> Instead of framing this as an enhanced version of the read/write ops
> why not look at replacing or extending something like the
> memcpy_fromio or memcpy_toio operations?

Yes, doing something like "memcpy_fromio_avx()" is much more
palatable, in that it works like the crypto functions do - if you do
big chunks, the "kernel_fpu_begin/end()" isn't nearly the issue it can
be otherwise.

Note that we definitely have seen hardware that *depends* on the
regular memcpy_fromio() not doing big reads. I don't know how
hardware people screw it up, but it's clearly possible.

So it really needs to be an explicitly named function that basically a
driver can use to say "my hardware really likes big aligned accesses"
and explicitly ask for some AVX version if possible.
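
For illustration only, a rough and completely untested sketch of what such an
explicitly named, opt-in helper could look like - the name memcpy_fromio_avx()
is an assumption here, not an existing kernel API, and it reuses the u256 type
from patch 1 of this series:

#ifdef CONFIG_AS_AVX
#include <asm/fpu/api.h>

/*
 * Untested sketch: explicitly named, opt-in wide-read helper. Falls
 * back to the regular memcpy_fromio() when AVX isn't usable at runtime
 * or when the pointers/length aren't 32-byte aligned.
 */
static void memcpy_fromio_avx(void *to, const volatile void __iomem *from,
                              size_t count)
{
        if (!boot_cpu_has(X86_FEATURE_AVX) ||
            !IS_ALIGNED((unsigned long)from | (unsigned long)to | count, 32)) {
                memcpy_fromio(to, from, count);
                return;
        }

        kernel_fpu_begin();
        while (count) {
                /* 32-byte MMIO load into ymm0, then store to the buffer */
                asm volatile("vmovdqu %0, %%ymm0"
                             : : "m" (*(const volatile u256 __force *)from));
                asm volatile("vmovdqu %%ymm0, %0"
                             : "=m" (*(u256 *)to));
                from += 32;
                to += 32;
                count -= 32;
        }
        kernel_fpu_end();
}
#endif

(It uses the unaligned vmovdqu form, so the alignment check above is about
keeping the accesses 32-byte granular rather than about correctness.)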

Linus

2018-03-22 10:08:02

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access


* Linus Torvalds <[email protected]> wrote:

> And the real worry is things like AVX-512 etc, which is exactly when
> things like "save and restore one ymm register" will quite likely
> clear the upper bits of the zmm register.

Yeah, I think the only valid save/restore pattern is to 100% correctly enumerate
the width of the vector registers, and use full width instructions.

Using partial registers, even though it's possible in some cases, is probably a bad
idea not just due to most instructions auto-zeroing the upper portion to reduce
false dependencies, but also because 'mixed' use of partial and full register
access is known to result in penalties on a wide range of Intel CPUs, at least
according to the Agner PDFs. On AMD CPUs there's no penalty.

So what I think could be done at best is to define a full register save/restore
API, which falls back to XSAVE*/XRSTOR* if we don't have the routines for the
native vector register width. (I.e. if old kernel is used on very new CPU.)

Note that the actual AVX code could still use partial width; it's the save/restore
primitives that have to handle full-width registers.

> And yes, we can have some statically patched code that takes that into account,
> and saves the whole zmm register when AVX512 is on, but the whole *point* of the
> dynamic XSAVES thing is actually that Intel wants to be able enable new
> user-space features without having to wait for OS support. Literally. That's why
> and how it was designed.

This aspect wouldn't be hurt AFAICS: to me it appears that due to glibc using
vector instructions in its memset() the AVX bits get used early on and to the
maximum, so the XINUSE for them is set for every task.

The optionality of other XSAVE based features like MPX wouldn't be hurt if the
kernel only uses vector registers.

> And saving a couple of zmm registers is actually pretty hard. They're big. Do
> you want to allocate 128 bytes of stack space, preferably 64-byte aligned, for a
> save area? No. So now it needs to be some kind of per-thread (or maybe per-CPU,
> if we're willing to continue to not preempt) special save area too.

Hm, that's indeed a nasty complication:

 - While a single 128-byte slot might work - in practice at least two vector
registers are needed to have enough parallelism and hide latencies.

- &current->thread.fpu.state.xsave is available almost all the time: with our
current 'direct' FPU context switching code the only time there's live data in
&current->thread.fpu is when the task is not running. But it's not IRQ-safe.

We could probably allow irq save/restore sections to use it, as
local_irq_save()/restore() is still *much* faster than a 1-1.5K FPU context
save/restore pattern.

But I was hoping for a less restrictive model ... :-/

To have a better model and avoid the local_irq_save()/restore we could perhaps
change the IRQ model to have a per IRQ 'current' value (we have separate IRQ
stacks already), but that's quite a bit of work to transform all code that
operates on the interrupted task (scheduler and timer code).

But it's work that would be useful for other reasons as well.

With such a separation in place &current->thread.fpu.state.xsave would become a
generic, natural vector register save area.
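
To make the intended usage pattern a bit more concrete, here is a rough,
untested sketch of what an irq-safe, register-granular primitive could look
like on top of the current model - the helper names and the per-CPU scratch
slot are made up, and this is of course exactly the kind of direct vector
register access whose safety is being debated in this thread:

/*
 * Untested sketch, made-up names: borrow a single YMM register in an
 * IRQ-safe way by spilling it to a small per-CPU slot instead of doing
 * a full XSAVE/XRSTOR. The caller must have checked
 * boot_cpu_has(X86_FEATURE_AVX) beforehand.
 */
struct ymm_scratch {
        u8 bytes[32];
} __aligned(32);

static DEFINE_PER_CPU(struct ymm_scratch, ymm_scratch);

static inline unsigned long vector_reg_begin(void)
{
        unsigned long flags;

        local_irq_save(flags);
        /* spill the interrupted context's ymm0 value */
        asm volatile("vmovdqu %%ymm0, %0"
                     : "=m" (*this_cpu_ptr(&ymm_scratch)));
        return flags;
}

static inline void vector_reg_end(unsigned long flags)
{
        /* put the spilled value back before re-enabling interrupts */
        asm volatile("vmovdqu %0, %%ymm0"
                     : : "m" (*this_cpu_ptr(&ymm_scratch)));
        local_irq_restore(flags);
}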

> And even then, it doesn't solve the real worry of "maybe there will be odd
> interactions with future extensions that we don't even know of".

Yes, that's true, but I think we could avoid these dangers by using CPU model
based enumeration. The cost would be that vector ops would only be available on
new CPU models after an explicit opt-in. In many cases it will be a single new
constant to an existing switch() statement, easily backported as well.

> All this to do a 32-byte PIO access, with absolutely zero data right
> now on what the win is?

Ok, so that's not what I'd use it for, I'd use it:

- Speed up existing AVX (crypto, RAID) routines for smaller buffer sizes.
Right now the XSAVE*+XRSTOR* cost is significant:

x86/fpu: Cost of: XSAVE insn: 104 cycles
x86/fpu: Cost of: XRSTOR insn: 80 cycles

... and that's with just 128-bit AVX and a ~0.8K XSAVE area. The Agner PDF
lists Skylake XSAVE+XRSTOR costs at 107+122 cycles, plus there's probably a
significant amount of L1 cache churn caused by XSAVE/XRSTOR.

Most of the relevant vector instructions have a single cycle cost
on the other hand.

 - To use vector ops in bulk, well-aligned memcpy(), which in many workloads
   is a fair chunk of all memcpy() activity. A usage profile on a typical system:

galatea:~> cat /proc/sched_debug | grep hist | grep -E '[[:digit:]]{4,}$' | grep '0\]'
hist[0x0000]: 1514272
hist[0x0010]: 1905248
hist[0x0020]: 99471
hist[0x0030]: 343309
hist[0x0040]: 177874
hist[0x0080]: 190052
hist[0x00a0]: 5258
hist[0x00b0]: 2387
hist[0x00c0]: 6975
hist[0x00d0]: 5872
hist[0x0100]: 3229
hist[0x0140]: 4813
hist[0x0160]: 9323
hist[0x0200]: 12540
hist[0x0230]: 37488
hist[0x1000]: 17136
hist[0x1d80]: 225199

    The first column is the length of the area copied, the second column is the usage count.

    To do this I wouldn't complicate the main memcpy()/memset() interfaces in any
    way to branch off to vector ops; I'd isolate specific memcpy()s and memset()s
(such as page table copying and page clearing) and use the simpler
vector register based primitives there.

    For example we have clear_page(), which is used for GFP_ZERO allocations and
    in other places, and is implemented on modern x86 CPUs as:

ENTRY(clear_page_erms)
movl $4096,%ecx
xorl %eax,%eax
rep stosb
ret

    While for such buffer sizes the enhanced-REP string instructions are supposed
    to be slightly faster than 128-bit AVX ops, for such exact page granular ops
    I'm pretty sure 256-bit (and 512-bit) vector ops are faster. (A rough sketch
    of such a variant follows after this list.)

 - For page granular memset/memcpy it would also be interesting to investigate
   whether non-temporal, cache-preserving vector ops such as VMOVNTDQA are
   beneficial for such known-large bulk ops in certain circumstances.

    On x86 there's only a single non-temporal instruction involving GP registers:
    MOVNTI, and it is for stores only.

The vector instructions space is a lot richer in that regard, allowing
non-temporal loads as well which utilize fill buffers to move chunks of memory
into vector registers.

Random example: in do_cow_fault() we use copy_user_highpage() to copy the page,
which uses copy_user_page() -> copy_page(), which uses:

ENTRY(copy_page)
ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
ret

    But in this COW copy case it's pretty obvious that we shouldn't keep the
    _source_ page in cache. So we could use non-temporal loads, which appear to
    make a difference on more recent uarchs even on write-back memory ranges
    (see also the second sketch after this list):

https://stackoverflow.com/questions/40096894/do-current-x86-architectures-support-non-temporal-loads-from-normal-memory

See the final graph in that entry and now the 'NT load' variant results in the
best execution time in the 4K case - and this is a limited benchmark that
doesn't measure the lower cache eviction pressure by NT loads.

( The store part is probably better done into the cache, not just due to the
SFENCE cost (which is relatively low at 40 cycles), but because it's probably
beneficial to prime the cache with a freshly COW-ed page, it's going to get
used in the near future once we return from the fault. )

etc.

- But more broadly, if we open up vector ops for smaller buffer sizes as well
then I think other kernel code would start using them as well:

    - I think the BPF JIT, whose bytecode machine language is used by an
increasing number of kernel subsystems, could benefit from having vector ops.
It would possibly allow the handling of floating point types.

- We could consider implementing vector ops based copy-to-user and copy-from-user
primitives as well, for cases where we know that the dominant usage pattern is
for larger, well-aligned chunks of memory.

- Maybe we could introduce a floating point library (which falls back to a C
implementation) and simplify scheduler math. We go to ridiculous lengths to
maintain precision across a wide range of parameters, essentially implementing
128-bit fixed point math. Even 32-bit floating point math would possibly be
better than that, even if it was done via APIs.

etc.: I think the large vector processor available in modern x86 CPUs could be
utilized by the kernel as well for various purposes.
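
To make the clear_page() example above concrete (see the note in that item),
here is a rough and completely untested sketch of a 256-bit variant - still
bracketed with kernel_fpu_begin()/end(), since that is what the current rules
require; the function name is made up:

/*
 * Untested sketch of a 256-bit clear_page() variant, for comparison
 * with clear_page_erms(). Plain AVX only (vxorps/vmovaps), no AVX2
 * or AVX-512 needed.
 */
static void clear_page_avx(void *page)
{
        void *end = page + PAGE_SIZE;

        kernel_fpu_begin();
        /* zero ymm0 once, then store 128 bytes per loop iteration */
        asm volatile("vxorps %ymm0, %ymm0, %ymm0");
        for (; page < end; page += 128) {
                asm volatile("vmovaps %%ymm0, 0x00(%0)\n\t"
                             "vmovaps %%ymm0, 0x20(%0)\n\t"
                             "vmovaps %%ymm0, 0x40(%0)\n\t"
                             "vmovaps %%ymm0, 0x60(%0)"
                             : : "r" (page) : "memory");
        }
        kernel_fpu_end();
}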

But I think that's only worth doing if vector registers and their save areas are
easily accessible and the accesses are fundamentally IRQ safe.
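
Similarly, tying the COW example above to code (see the note in that item), an
untested sketch of a copy that uses non-temporal loads for the source page and
regular cached stores for the destination - whether the NT hint actually helps
on write-back memory is exactly what would have to be measured:

/*
 * Untested sketch, made-up name: copy a page with non-temporal loads
 * from the source (which we don't want to keep in cache) and regular
 * cached stores to the destination (which we're about to use).
 * VMOVNTDQA on ymm registers needs AVX2 and 32-byte aligned addresses;
 * page-aligned buffers satisfy that.
 */
static void copy_page_nt_src(void *to, const void *from)
{
        const void *end = from + PAGE_SIZE;

        kernel_fpu_begin();
        for (; from < end; from += 64, to += 64) {
                asm volatile("vmovntdqa 0x00(%0), %%ymm0\n\t"
                             "vmovntdqa 0x20(%0), %%ymm1\n\t"
                             "vmovdqa   %%ymm0, 0x00(%1)\n\t"
                             "vmovdqa   %%ymm1, 0x20(%1)"
                             : : "r" (from), "r" (to) : "memory");
        }
        kernel_fpu_end();
}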

> Yes, yes, I can find an Intel white-paper that talks about setting WC and then
> using xmm and ymm instructions to write a single 64-byte burst over PCIe, and I
> assume that is where the code and idea came from. But I don't actually see any
> reason why a burst of 8 regular quad-word bytes wouldn't cause a 64-byte burst
> write too.

Yeah, I'm not too convinced about the wide readq/writeq usecase either, I just
used the opportunity to outline these (very vague) plans about utilizing vector
instructions more broadly within the kernel.

> So as far as I can tell, there are basically *zero* upsides, and a lot of
> potential downsides.

I agree about the potential downsides and I think most of them can be addressed
adequately - and I think my list of upsides above is potentially significant,
especially once we have lightweight APIs to utilize individual vector registers
without having to do a full save/restore of ~1K large vector register context.

Thanks,

Ingo

2018-03-22 10:09:03

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access


* Andy Lutomirski <[email protected]> wrote:

> On Wed, Mar 21, 2018 at 6:32 AM, Ingo Molnar <[email protected]> wrote:
> >
> > * Linus Torvalds <[email protected]> wrote:
> >
> >> And even if you ignore that "maintenance problems down the line" issue
> >> ("we can fix them when they happen") I don't want to see games like
> >> this, because I'm pretty sure it breaks the optimized xsave by tagging
> >> the state as being dirty.
> >
> > That's true - and it would penalize the context switch cost of the affected task
> > for the rest of its lifetime, as I don't think there's much that clears XINUSE
> > other than a FINIT, which is rarely done by user-space.
> >
> >> So no. Don't use vector stuff in the kernel. It's not worth the pain.
> >
> > I agree, but:
> >
> >> The *only* valid use is pretty much crypto, and even there it has had issues.
> >> Benchmarks use big arrays and/or dense working sets etc to "prove" how good the
> >> vector version is, and then you end up in situations where it's used once per
> >> fairly small packet for an interrupt, and it's actually much worse than doing it
> >> by hand.
> >
> > That's mainly because the XSAVE/XRESTOR done by kernel_fpu_begin()/end() is so
> > expensive, so this argument is somewhat circular.
>
> If we do the deferred restore, then the XSAVE/XRSTOR happens at most
> once per kernel entry, which isn't so bad IMO. Also, with PTI, kernel
> entries are already so slow that this will be mostly in the noise :(

For performance/scalability work we should just ignore the PTI overhead: it
doesn't exist on AMD CPUs and Intel has announced Meltdown-fixed CPUs, to be
released later this year:

https://www.anandtech.com/show/12533/intel-spectre-meltdown

By the time any kernel changes we are talking about today get to distros and users
the newest hardware won't have the Meltdown bug.

Thanks,

Ingo

2018-03-22 10:36:16

by David Laight

[permalink] [raw]
Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

From: Linus Torvalds
> Sent: 21 March 2018 18:16
> To: Ingo Molnar
...
> All this to do a 32-byte PIO access, with absolutely zero data right
> now on what the win is?
>
> Yes, yes, I can find an Intel white-paper that talks about setting WC
> and then using xmm and ymm instructions to write a single 64-byte
> burst over PCIe, and I assume that is where the code and idea came
> from. But I don't actually see any reason why a burst of 8 regular
> quad-word bytes wouldn't cause a 64-byte burst write too.

The big gain is from wide PCIe reads, not writes.
Writes to uncached locations (without WC) are almost certainly required
to generate the 'obvious' PCIe TLP, otherwise things are likely to break.

> So right now this is for _one_ odd rdma controller, with absolutely
> _zero_ performance numbers, and a very high likelihood that it does
> not matter in the least.
>
> And if there are some atomicity concerns ("we need to guarantee a
> single atomic access for race conditions with the hardware"), they are
> probably bogus and misplaced anyway, since
>
> (a) we can't guarantee that AVX2 exists in the first place

Any code would need to be in memcpy_fromio(), not in every driver that
might benefit.
Then fallback code can be used if the registers aren't available.

> (b) we can't guarantee that %ymm register write will show up on any
> bus as a single write transaction anyway

Misaligned 8 byte accesses generate a single PCIe TLP.
I can look at what happens for AVX2 transfers later.
I've got code that mmap()s PCIe addresses into user space, and can
look at the TLP (indirectly through tracing on an fpga target).
Just need to set something up that uses AVX copies.
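
For reference, the user-space side of such a test can be pretty small - a
minimal, untested sketch with a made-up device path and no error handling,
built with -mavx:

/*
 * Minimal user-space sketch: mmap a PCIe BAR through sysfs and issue
 * 32-byte reads with AVX intrinsics so the resulting TLPs can be
 * observed on the fpga side. The device path is made up and there is
 * no error handling.
 */
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>
#include <immintrin.h>

int main(void)
{
        int fd = open("/sys/bus/pci/devices/0000:01:00.0/resource0", O_RDWR);
        uint8_t *bar = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                            MAP_SHARED, fd, 0);
        __m256d acc = _mm256_setzero_pd();
        double out[4];
        int off;

        for (off = 0; off < 4096; off += 32)
                acc = _mm256_xor_pd(acc,
                        _mm256_loadu_pd((const double *)(bar + off)));

        _mm256_storeu_pd(out, acc);     /* keep the loads from being elided */
        munmap(bar, 4096);
        close(fd);
        return out[0] != 0.0;
}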

> So as far as I can tell, there are basically *zero* upsides, and a lot
> of potential downsides.

There are some upsides.
I'll do a performance measurement for reads.

David

2018-03-22 10:57:48

by David Laight

[permalink] [raw]
Subject: RE: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

From: Linus Torvalds
> Sent: 22 March 2018 01:27
> On Tue, Mar 20, 2018 at 7:42 AM, Alexander Duyck
> <[email protected]> wrote:
> >
> > Instead of framing this as an enhanced version of the read/write ops
> > why not look at replacing or extending something like the
> > memcpy_fromio or memcpy_toio operations?
>
> Yes, doing something like "memcpy_fromio_avx()" is much more
> palatable, in that it works like the crypto functions do - if you do
> big chunks, the "kernel_fpu_begin/end()" isn't nearly the issue it can
> be otherwise.
>
> Note that we definitely have seen hardware that *depends* on the
> regular memcpy_fromio()" not doing big reads. I don't know how
> hardware people screw it up, but it's clearly possible.

I wonder if that hardware works with the current kernel on recent cpus?
I bet it doesn't like the byte accesses that get generated either.

> So it really needs to be an explicitly named function that basically a
> driver can use to say "my hardware really likes big aligned accesses"
> and explicitly ask for some AVX version if possible.

For x86 being able to request a copy done as 'rep movsx' (for each x)
would be useful.
For io copies the cost of the memory access is probably much smaller
than the io access, so really fancy copies are unlikely to make much
difference unless the width of the io access changes.

David

2018-03-22 12:49:20

by David Laight

[permalink] [raw]
Subject: RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

From: David Laight
> Sent: 22 March 2018 10:36
...
> Any code would need to be in memcpy_fromio(), not in every driver that
> might benefit.
> Then fallback code can be used if the registers aren't available.
>
> > (b) we can't guarantee that %ymm register write will show up on any
> > bus as a single write transaction anyway
>
> Misaligned 8 byte accesses generate a single PCIe TLP.
> I can look at what happens for AVX2 transfers later.
> I've got code that mmap()s PCIe addresses into user space, and can
> look at the TLP (indirectly through tracing on an fpga target).
> Just need to set something up that uses AVX copies.

On my i7-7700 reads into xmm registers generate 16-byte TLPs and
reads into ymm registers 32-byte TLPs.
I don't think I have a system that supports AVX-512.

With my lethargic fpga slave, 32-byte reads happen every 144 clocks
and 16-byte ones every 126 (+/- the odd clock).
Single-byte ones happen every 108, 8-byte ones every 117.
So I have (about) 110 clocks of overhead on every read cycle.
These clocks are the 62.5MHz clock on the fpga.

So if we needed to do PIO reads, using the AVX2 (or better, AVX-512)
registers would make a significant difference.
Fortunately we can 'dma' most of the data we need to transfer.

I've traced writes before, they are a lot faster and are limited
by things in the fpga fabric (they appear back to back).

David

2018-03-22 17:08:41

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Thu, Mar 22, 2018 at 5:48 AM, David Laight <[email protected]> wrote:
>
> So if we needed to do PIO reads using the AVX2 (or better AVX-512)
> registers would make a significant difference.
> Fortunately we can 'dma' most of the data we need to transfer.

I think this is the really fundamental issue.

A device that expects PIO to do some kind of high-performance
transaction is a *broken* device.

It really is that simple. We don't bend over for misdesigned hardware
crap unless it is really common.

> I've traced writes before, they are a lot faster and are limited
> by things in the fpga fabric (they appear back to back).

The write combine buffer really should be much more effective than any
AVX or similar can ever be.

Linus

2018-03-22 17:22:55

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write

On Thu, Mar 22, 2018 at 3:48 AM, David Laight <[email protected]> wrote:
> From: Linus Torvalds
>>
>> Note that we definitely have seen hardware that *depends* on the
>> regular memcpy_fromio()" not doing big reads. I don't know how
>> hardware people screw it up, but it's clearly possible.

[ That should have been "big writes" ]

> I wonder if that hardware works with the current kernel on recent cpus?
> I bet it doesn't like the byte accesses that get generated either.

The one case we knew about we just fixed to use the special "16-bit
word at a time" scr_memcpyw().

If I recall correctly, it was some "enterprise graphics console".

If there's one thing I've found over the years, it is that "enterprise"
hardware is absolutely the dregs.
almost nobody uses and where the customer is used to the whole notion
of boutique hardware, so they're used to having to do special things
for special hardware.

And I very much use the word "special" in the "my mom told me I was
special" sense. It's not the *good* kind of "special". It's the "short
bus" kind of special.

> For x86 being able to request a copy done as 'rep movsx' (for each x)
> would be useful.

The portability issues are horrendous. Particularly if the memory area
(source or dest) might be unaligned.

The rule of thumb should be: don't use PIO, and if you *do* use PIO,
don't be picky about what you get.

And most *definitely* don't complain about performance to software
people. Blame the hardware people. Get a competent piece of hardware,
or a competent hardware designer.

Let's face it, PIO is stupid. Use it for command and status words. Not
for data transfer.

Linus

2018-03-22 17:42:00

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Thu, Mar 22, 2018 at 10:33:43AM +0100, Ingo Molnar wrote:
>
> - I think the BPF JIT, whose byte code machine languge is used by an
> increasing number of kernel subsystems, could benefit from having vector ops.
> It would possibly allow the handling of floating point types.

this is on our todo list already.
To process certain traffic inside BPF in XDP we'd like to have access to
floating point. The current workaround is to precompute the math and do
bpf map lookup instead.
Since XDP processing of packets is batched (typically up to napi budget
of 64 packets at a time), we can, in theory, wrap the loop with
kernel_fpu_begin/end and it will be cleaner and faster,
but the work hasn't started yet.
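A hypothetical, untested sketch of that shape - the driver helpers around the
BPF call are made up:

/*
 * Hypothetical, untested sketch: amortize kernel_fpu_begin()/end()
 * over a whole NAPI batch so the XDP/BPF program could use FP or
 * vector instructions. The two driver helpers around the BPF call
 * are made up.
 */
static int xdp_napi_poll(struct napi_struct *napi, int budget)
{
        int done = 0;

        kernel_fpu_begin();
        while (done < budget) {
                struct xdp_buff xdp;

                if (!driver_fetch_rx_frame(napi, &xdp))         /* made up */
                        break;
                driver_run_xdp_and_complete(napi, &xdp);        /* made up */
                done++;
        }
        kernel_fpu_end();

        return done;
}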
The microbenchmark numbers you quoted for xsave/xrestore look promising,
so we probably will focus on it soon.

Another use case for vector insns is to accelerate fib/lpm lookups
which is likely beneficial for kernel overall regardless of bpf usage.


2018-03-22 17:45:56

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

On Thu, Mar 22, 2018 at 5:40 PM, Alexei Starovoitov
<[email protected]> wrote:
> On Thu, Mar 22, 2018 at 10:33:43AM +0100, Ingo Molnar wrote:
>>
>> - I think the BPF JIT, whose byte code machine languge is used by an
>> increasing number of kernel subsystems, could benefit from having vector ops.
>> It would possibly allow the handling of floating point types.
>
> this is on our todo list already.
> To process certain traffic inside BPF in XDP we'd like to have access to
> floating point. The current workaround is to precompute the math and do
> bpf map lookup instead.
> Since XDP processing of packets is batched (typically up to napi budget
> of 64 packets at a time), we can, in theory, wrap the loop with
> kernel_fpu_begin/end and it will be cleaner and faster,
> but the work hasn't started yet.
> The microbenchmark numbers you quoted for xsave/xrestore look promising,
> so we probably will focus on it soon.
>
> Another use case for vector insns is to accelerate fib/lpm lookups
> which is likely beneficial for kernel overall regardless of bpf usage.
>

This is a further argument for the deferred restore approach IMO.
With deferred restore, kernel_fpu_begin() + kernel_fpu_end() has very
low amortized cost as long as we do lots of work in the kernel before
re-entering userspace. For XDP workloads, this could be a pretty big
win, I imagine.

Someone just needs to do the nasty x86 work.

2018-04-03 08:51:11

by Pavel Machek

[permalink] [raw]
Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access

Hi!

> > On Tue, 20 Mar 2018, Ingo Molnar wrote:
> > > * Thomas Gleixner <[email protected]> wrote:
> > >
> > > > > So I do think we could do more in this area to improve driver performance, if the
> > > > > code is correct and if there's actual benchmarks that are showing real benefits.
> > > >
> > > > If it's about hotpath performance I'm all for it, but the use case here is
> > > > a debug facility...
> > > >
> > > > And if we go down that road then we want a AVX based memcpy()
> > > > implementation which is runtime conditional on the feature bit(s) and
> > > > length dependent. Just slapping a readqq() at it and use it in a loop does
> > > > not make any sense.
> > >
> > > Yeah, so generic memcpy() replacement is only feasible I think if the most
> > > optimistic implementation is actually correct:
> > >
> > > - if no preempt disable()/enable() is required
> > >
> > > - if direct access to the AVX[2] registers does not disturb legacy FPU state in
> > > any fashion
> > >
> > > - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> > > weird behavior if the FPU control word is modified to non-standard values by
> > > untrusted user-space
> > >
> > > If we have to touch the FPU tag or control words then it's probably only good for
> > > a specialized API.
> >
> > I did not mean to have a general memcpy replacement. Rather something like
> > magic_memcpy() which falls back to memcpy when AVX is not usable or the
> > length does not justify the AVX stuff at all.
>
> OK, fair enough.
>
> Note that a generic version might still be worth trying out, if and only if it's
> safe to access those vector registers directly: modern x86 CPUs will do their
> non-constant memcpy()s via the common memcpy_erms() function - which could in
> theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if
> size (and alignment, perhaps) is a multiple of 32 bytes or so.

How is AVX2 supposed to help the memcpy speed?

If the copy is small, constant overhead will dominate, and I don't
think AVX2 is going to be a win there.

If the copy is big, well, the copy loop will likely run out of L1 and
maybe even out of L2, and at that point speed of the loop does not
matter because memory is slow...?

Best regards,
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html



2018-04-03 10:39:14

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access


* Pavel Machek <[email protected]> wrote:

> > > > Yeah, so generic memcpy() replacement is only feasible I think if the most
> > > > optimistic implementation is actually correct:
> > > >
> > > > - if no preempt disable()/enable() is required
> > > >
> > > > - if direct access to the AVX[2] registers does not disturb legacy FPU state in
> > > > any fashion
> > > >
> > > > - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> > > > weird behavior if the FPU control word is modified to non-standard values by
> > > > untrusted user-space
> > > >
> > > > If we have to touch the FPU tag or control words then it's probably only good for
> > > > a specialized API.
> > >
> > > I did not mean to have a general memcpy replacement. Rather something like
> > > magic_memcpy() which falls back to memcpy when AVX is not usable or the
> > > length does not justify the AVX stuff at all.
> >
> > OK, fair enough.
> >
> > Note that a generic version might still be worth trying out, if and only if it's
> > safe to access those vector registers directly: modern x86 CPUs will do their
> > non-constant memcpy()s via the common memcpy_erms() function - which could in
> > theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if
> > size (and alignment, perhaps) is a multiple of 32 bytes or so.
>
> How is AVX2 supposed to help the memcpy speed?
>
> If the copy is small, constant overhead will dominate, and I don't
> think AVX2 is going to be win there.

There are several advantages:

1)

"REP; MOVS" (also called ERMS) has a significant constant "setup cost".

In the scheme I suggested (if it's possible at all), single-register AVX2 access
on the other hand has a setup cost on the order of a few cycles.

2)

AVX2 has various non-temporal load and store behavioral variants - while "REP;
MOVS" doesn't (or rather, any such caching optimizations, to the extent they
exist, are hidden in the microcode).

> If the copy is big, well, the copy loop will likely run out of L1 and maybe even
> out of L2, and at that point speed of the loop does not matter because memory is
> slow...?

In many cases "memory" will be something very fast, such as another level of
cache. Also, on NUMA "memory" can also be something locally wired to the CPU -
again accessible at ridiculous bandwidths.

Nevertheless ERMS probably wins for the regular bulk memcpy by a few percentage
points, so I don't think AVX2 is a win in the generic large-memcpy case, as long
as continued caching of both the loads and the stores is beneficial.

Thanks,

Ingo