2021-12-03 00:36:46

by Jiaxun Yang

Subject: [RFC PATCH 00/10] x86: Allocate AVX512 xstate on demand

This series allocates the AVX512 xstate buffer on demand.
It can save some memory (~2k for a thread not using AVX512).
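
For reference, the ~2k figure roughly matches the sum of the AVX512
xstate component sizes enumerated by CPUID leaf 0xD (a back-of-the-envelope
sketch; exact padding depends on each CPU's alignment bits):

	XFEATURE_OPMASK:      64 bytes (8 x 64-bit opmask registers)
	XFEATURE_ZMM_Hi256:  512 bytes (16 x 256-bit upper ZMM halves)
	XFEATURE_Hi16_ZMM:  1024 bytes (16 x 512-bit registers ZMM16-ZMM31)
	                    ----------
	                    1600 bytes, plus 64-byte alignment padding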

We are also going to see heterogeneous processors on which only some
cores support AVX512; on-demand allocation can be helpful when dealing
with such processors.

Patches 1~6 are preparations.
Patch 7 moves reallocation of the xstate buffer into save_fpregs_to_fpstate().
Patches 8~10 are cleanup and enablement work.

Tested on a Skylake-X system with the AVX512-intensive y-cruncher and
NumPy workloads; the performance impact seems negligible.

Any suggestions are welcome.

Cheers!
---
Jiaxun Yang
Year 1 ECS Undergraduate
University of Edinburgh

Jiaxun Yang (10):
x86/fpu: Remove duplicated declaration of __fpu_state_size_dynamic
x86/fpu: Split fpu_xfd_enabled from fpu_state_size_dynamic
x86/fpu: Calculate xsave state addr on the fly
x86/fpu: Remove xstate_comp_offsets cache
x86/fpu: Cache xstate_is_aligned
x86/fpu/xcr: Return all enabled xfeatures if CPU doesn't support XGETBV1
x86/fpu: Reallocate fpstate in save_fpregs_to_fpstate
x86/fpu: Don't keep state_size in perm struct
x86/fpu: Split out XFEATURE_MASK_KERNEL_DYNAMIC
x86/fpu: Mark AVX512 xfeatures as kernel dynamic

arch/x86/include/asm/fpu/types.h | 8 -
arch/x86/include/asm/fpu/xcr.h | 16 +-
arch/x86/include/asm/fpu/xstate.h | 19 +-
arch/x86/kernel/fpu/core.c | 21 ++-
arch/x86/kernel/fpu/xstate.c | 281 ++++++++++++++----------------
arch/x86/kernel/fpu/xstate.h | 3 +-
6 files changed, 181 insertions(+), 167 deletions(-)

--
2.30.2



2021-12-03 00:36:50

by Jiaxun Yang

Subject: [RFC PATCH 01/10] x86/fpu: Remove duplicated declaration of __fpu_state_size_dynamic

Somehow it is declared twice in arch/x86/include/asm/fpu/xstate.h.

Signed-off-by: Jiaxun Yang <[email protected]>
---
arch/x86/include/asm/fpu/xstate.h | 4 ----
1 file changed, 4 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index cd3dd170e23a..fb83534e92f6 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -111,10 +111,6 @@ void xrstors(struct xregs_state *xsave, u64 mask);

int xfd_enable_feature(u64 xfd_err);

-#ifdef CONFIG_X86_64
-DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
-#endif
-
#ifdef CONFIG_X86_64
DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);

--
2.30.2


2021-12-03 00:36:57

by Jiaxun Yang

Subject: [RFC PATCH 02/10] x86/fpu: Split fpu_xfd_enabled from fpu_state_size_dynamic

Following patches will make fpu_state_size_dynamic() no longer depend
on XFD, so split the XFD-related static branch out of
fpu_state_size_dynamic().

Also make fpu_state_size_dynamic() available on both 32-bit and 64-bit x86.

Signed-off-by: Jiaxun Yang <[email protected]>
---
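Note: a minimal sketch of the static-key pattern used here, with
placeholder names that are not part of this patch:

	#include <linux/init.h>
	#include <linux/jump_label.h>

	static bool example_boot_condition __initdata;

	DEFINE_STATIC_KEY_FALSE(__example_key);

	static __always_inline bool example_enabled(void)
	{
		/* Compiles down to a patched jmp/nop: no load on the fast path */
		return static_branch_unlikely(&__example_key);
	}

	static int __init example_init(void)
	{
		if (example_boot_condition)
			static_branch_enable(&__example_key);
		return 0;
	}
	arch_initcall(example_init);

Each key defaults to false, so the two branches split below each stay
patched out until their own condition enables them at boot.
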
arch/x86/include/asm/fpu/xstate.h | 11 +++++++++--
arch/x86/kernel/fpu/core.c | 3 ++-
arch/x86/kernel/fpu/xstate.c | 31 +++++++++++++++++++------------
arch/x86/kernel/fpu/xstate.h | 2 +-
4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index fb83534e92f6..c46d3dd591bd 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -111,15 +111,22 @@ void xrstors(struct xregs_state *xsave, u64 mask);

int xfd_enable_feature(u64 xfd_err);

-#ifdef CONFIG_X86_64
DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);

static __always_inline __pure bool fpu_state_size_dynamic(void)
{
return static_branch_unlikely(&__fpu_state_size_dynamic);
}
+
+#ifdef CONFIG_X86_64
+DECLARE_STATIC_KEY_FALSE(__fpu_xfd_enabled);
+
+static __always_inline __pure bool fpu_xfd_enabled(void)
+{
+ return static_branch_unlikely(&__fpu_xfd_enabled);
+}
#else
-static __always_inline __pure bool fpu_state_size_dynamic(void)
+static __always_inline __pure bool fpu_xfd_enabled(void)
{
return false;
}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index dd3777ac0443..88dbbbde7a3a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -26,8 +26,9 @@
#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>

-#ifdef CONFIG_X86_64
DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+#ifdef CONFIG_X86_64
+DEFINE_STATIC_KEY_FALSE(__fpu_xfd_enabled);
DEFINE_PER_CPU(u64, xfd_state);
#endif

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d28829403ed0..353c661f8027 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -995,7 +995,7 @@ void fpu__resume_cpu(void)
xfeatures_mask_independent());
}

- if (fpu_state_size_dynamic())
+ if (fpu_xfd_enabled())
wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
}

@@ -1422,6 +1422,23 @@ void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
#endif

+static int __init xstate_update_static_branch(void)
+{
+ if (fpu_kernel_cfg.max_features != fpu_kernel_cfg.default_features)
+ static_branch_enable(&__fpu_state_size_dynamic);
+
+#ifdef CONFIG_X86_64
+ /*
+ * If init_fpstate.xfd has bits set then dynamic features are
+ * available and the dynamic sizing must be enabled.
+ */
+ if (init_fpstate.xfd)
+ static_branch_enable(&__fpu_xfd_enabled);
+#endif
+ return 0;
+}
+arch_initcall(xstate_update_static_branch)
+
#ifdef CONFIG_X86_64

#ifdef CONFIG_X86_DEBUG_FPU
@@ -1481,17 +1498,7 @@ void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
}
#endif /* CONFIG_X86_DEBUG_FPU */

-static int __init xfd_update_static_branch(void)
-{
- /*
- * If init_fpstate.xfd has bits set then dynamic features are
- * available and the dynamic sizing must be enabled.
- */
- if (init_fpstate.xfd)
- static_branch_enable(&__fpu_state_size_dynamic);
- return 0;
-}
-arch_initcall(xfd_update_static_branch)
+

void fpstate_free(struct fpu *fpu)
{
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 86ea7c0fa2f6..f6bebaeea4ad 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -140,7 +140,7 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs
#ifdef CONFIG_X86_64
static inline void xfd_update_state(struct fpstate *fpstate)
{
- if (fpu_state_size_dynamic()) {
+ if (fpu_xfd_enabled()) {
u64 xfd = fpstate->xfd;

if (__this_cpu_read(xfd_state) != xfd) {
--
2.30.2


2021-12-03 00:36:57

by Jiaxun Yang

Subject: [RFC PATCH 03/10] x86/fpu: Calculate xsave state addr on the fly

As we are going to have xsave buffers with different xcomp_bv values,
the compacted layout of those buffers can differ.

Calculate the state address on the fly instead of caching it at boot.

Signed-off-by: Jiaxun Yang <[email protected]>
---
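A worked example of the walk below, assuming a compacted buffer whose
xcomp_bv contains only FP, SSE and YMM (sizes from CPUID leaf 0xD):

	legacy area:	FXSAVE_SIZE (512) + XSAVE_HDR_SIZE (64) = 576
	XFEATURE_YMM:	first extended feature in xcomp_bv -> offset 576

A buffer whose xcomp_bv also contains the AVX512 components would place
XFEATURE_OPMASK at 576 + 256 (the YMM state size) = 832 instead, which
is why per-buffer calculation replaces a global offset cache.
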
arch/x86/kernel/fpu/xstate.c | 22 +++++++++++++++++++++-
1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 353c661f8027..06f6214e9dd7 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1006,12 +1006,32 @@ void fpu__resume_cpu(void)
*/
static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
+ int i;
+ unsigned int offset;
+
if (!xfeature_enabled(xfeature_nr)) {
WARN_ON_FPU(1);
return NULL;
}

- return (void *)xsave + xstate_comp_offsets[xfeature_nr];
+ /* Handle legacy states and standard-format extended states */
+ if (xfeature_nr < FIRST_EXTENDED_XFEATURE ||
+ !(xsave->header.xcomp_bv & XCOMP_BV_COMPACTED_FORMAT))
+ return (void *)xsave + xstate_offsets[xfeature_nr];
+
+ /* Calculate compact ext state offsets */
+ offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ for_each_extended_xfeature(i, xsave->header.xcomp_bv) {
+ if (xfeature_is_aligned(i))
+ offset = ALIGN(offset, 64);
+
+ if (i == xfeature_nr)
+ return (void *)xsave + offset;
+
+ offset += xstate_sizes[i];
+ }
+
+ return NULL;
}
/*
* Given the xsave area and a state inside, this function returns the
--
2.30.2


2021-12-03 00:36:58

by Jiaxun Yang

Subject: [RFC PATCH 04/10] x86/fpu: Remove xstate_comp_offsets cache

As the xsave state address is now calculated on the fly, the cache no longer works.

Signed-off-by: Jiaxun Yang <[email protected]>
---
arch/x86/kernel/fpu/xstate.c | 58 +-----------------------------------
1 file changed, 1 insertion(+), 57 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 06f6214e9dd7..e3ed3d0f3741 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -81,8 +81,6 @@ static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
{ [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init =
- { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init =
{ [ 0 ... XFEATURE_MAX - 1] = -1};

@@ -288,49 +286,10 @@ static int xfeature_is_aligned(int xfeature_nr)
return !!(ecx & 2);
}

-/*
- * This function sets up offsets and sizes of all extended states in
- * xsave area. This supports both standard format and compacted format
- * of the xsave area.
- */
-static void __init setup_xstate_comp_offsets(void)
-{
- unsigned int next_offset;
- int i;
-
- /*
- * The FP xstates and SSE xstates are legacy states. They are always
- * in the fixed offsets in the xsave area in either compacted form
- * or standard form.
- */
- xstate_comp_offsets[XFEATURE_FP] = 0;
- xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state,
- xmm_space);
-
- if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) {
- for_each_extended_xfeature(i, fpu_kernel_cfg.max_features)
- xstate_comp_offsets[i] = xstate_offsets[i];
- return;
- }
-
- next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-
- for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
- if (xfeature_is_aligned(i))
- next_offset = ALIGN(next_offset, 64);
-
- xstate_comp_offsets[i] = next_offset;
- next_offset += xstate_sizes[i];
- }
-}
-
/*
* Setup offsets of a supervisor-state-only XSAVES buffer:
*
- * The offsets stored in xstate_comp_offsets[] only work for one specific
- * value of the Requested Feature BitMap (RFBM). In cases where a different
- * RFBM value is used, a different set of offsets is required. This set of
- * offsets is for when RFBM=xfeatures_mask_supervisor().
+ * This set of offsets is for when RFBM=xfeatures_mask_supervisor().
*/
static void __init setup_supervisor_only_offsets(void)
{
@@ -351,19 +310,6 @@ static void __init setup_supervisor_only_offsets(void)
}
}

-/*
- * Print out xstate component offsets and sizes
- */
-static void __init print_xstate_offset_size(void)
-{
- int i;
-
- for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
- pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
- i, xstate_comp_offsets[i], i, xstate_sizes[i]);
- }
-}
-
/*
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
@@ -950,7 +896,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
fpu_user_cfg.max_features);

setup_init_fpu_buf();
- setup_xstate_comp_offsets();
setup_supervisor_only_offsets();

/*
@@ -963,7 +908,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}

- print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
fpu_kernel_cfg.max_features,
fpu_kernel_cfg.max_size,
--
2.30.2


2021-12-03 00:37:00

by Jiaxun Yang

Subject: [RFC PATCH 05/10] x86/fpu: Cache xstate_is_aligned

As we're now calculating the xstate address on the fly, we are going
to call xfeature_is_aligned() frequently. Cache its result to speed up
the calculation.

Signed-off-by: Jiaxun Yang <[email protected]>
---
arch/x86/kernel/fpu/xstate.c | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index e3ed3d0f3741..df8a70a055a9 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -83,6 +83,8 @@ static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init =
{ [ 0 ... XFEATURE_MAX - 1] = -1};
+static bool xstate_is_aligned[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = false};

/*
* Return whether the system supports a given xfeature.
@@ -261,10 +263,6 @@ static void __init print_xstate_features(void)
WARN_ON(nr >= XFEATURE_MAX); \
} while (0)

-/*
- * We could cache this like xstate_size[], but we only use
- * it here, so it would be a waste of space.
- */
static int xfeature_is_aligned(int xfeature_nr)
{
u32 eax, ebx, ecx, edx;
@@ -286,6 +284,18 @@ static int xfeature_is_aligned(int xfeature_nr)
return !!(ecx & 2);
}

+/*
+ * Set up a cache that tells whether an xstate component needs 64-byte alignment:
+ */
+static void __init setup_is_aligned(void)
+{
+ int i;
+
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ xstate_is_aligned[i] = xfeature_is_aligned(i);
+ }
+}
+
/*
* Setup offsets of a supervisor-state-only XSAVES buffer:
*
@@ -302,7 +312,7 @@ static void __init setup_supervisor_only_offsets(void)
if (!xfeature_is_supervisor(i))
continue;

- if (xfeature_is_aligned(i))
+ if (xstate_is_aligned[i])
next_offset = ALIGN(next_offset, 64);

xstate_supervisor_only_offsets[i] = next_offset;
@@ -595,7 +605,7 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)

for_each_extended_xfeature(i, xfeatures) {
/* Align from the end of the previous feature */
- if (xfeature_is_aligned(i))
+ if (xstate_is_aligned[i])
size = ALIGN(size, 64);
/*
* In compacted format the enabled features are packed,
@@ -881,6 +891,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)

/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
+ setup_is_aligned();
err = init_xstate_size();
if (err)
goto out_disable;
@@ -966,7 +977,7 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
/* Calculate compact ext state offsets */
offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
for_each_extended_xfeature(i, xsave->header.xcomp_bv) {
- if (xfeature_is_aligned(i))
+ if (xstate_is_aligned[i])
offset = ALIGN(offset, 64);

if (i == xfeature_nr)
--
2.30.2


2021-12-03 00:37:02

by Jiaxun Yang

Subject: [RFC PATCH 06/10] x86/fpu/xcr: Return all enabled xfeatures if CPU doesn't support XGETBV1

Read XCR0 when the CPU doesn't support X86_FEATURE_XGETBV1; this
amounts to assuming that all enabled features are in use.

Signed-off-by: Jiaxun Yang <[email protected]>
---
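The ALTERNATIVE below selects ECX=1 (the XINUSE mask) only when
X86_FEATURE_XGETBV1 is set; in plain C the intent is roughly this
sketch (not the actual patched code):

	static inline u64 xfeatures_in_use_sketch(void)
	{
		if (boot_cpu_has(X86_FEATURE_XGETBV1))
			return xgetbv(XCR_XFEATURE_IN_USE_MASK);	/* XINUSE */
		/* Fallback: treat every enabled feature as in use. */
		return xgetbv(XCR_XFEATURE_ENABLED_MASK);		/* XCR0 */
	}

The asm ALTERNATIVE is used instead so the selection is patched in once
at boot and costs nothing on this hot path.
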
arch/x86/include/asm/fpu/xcr.h | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h
index 9656a5bc6fea..e718074ef141 100644
--- a/arch/x86/include/asm/fpu/xcr.h
+++ b/arch/x86/include/asm/fpu/xcr.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_FPU_XCR_H
#define _ASM_X86_FPU_XCR_H

+#include <asm/alternative.h>
+
#define XCR_XFEATURE_ENABLED_MASK 0x00000000
#define XCR_XFEATURE_IN_USE_MASK 0x00000001

@@ -25,11 +27,21 @@ static inline void xsetbv(u32 index, u64 value)
* Return a mask of xfeatures which are currently being tracked
* by the processor as being in the initial configuration.
*
- * Callers should check X86_FEATURE_XGETBV1.
+ * It returns XCR0, i.e. all enabled xfeatures, in case the
+ * CPU doesn't support X86_FEATURE_XGETBV1.
*/
static inline u64 xfeatures_in_use(void)
{
- return xgetbv(XCR_XFEATURE_IN_USE_MASK);
+ u32 eax, edx;
+
+ asm volatile(ALTERNATIVE(
+ "mov $0, %%ecx",
+ "mov $1, %%ecx",
+ X86_FEATURE_XGETBV1)
+ "xgetbv"
+ : "=a" (eax), "=d" (edx) :: "ecx");
+
+ return eax + ((u64)edx << 32);
}

#endif /* _ASM_X86_FPU_XCR_H */
--
2.30.2


2021-12-03 00:37:04

by Jiaxun Yang

Subject: [RFC PATCH 07/10] x86/fpu: Reallocate fpstate in save_fpregs_to_fpstate

The general idea is to compare XINUSE with the xfeatures of the fpstate
buffer on every save. If there are newly in-use states not present in
the buffer's xfeatures, reallocate the buffer to enlarge it.

fpstate_realloc() is reworked to calculate the kernel buffer size itself
and to be usable from interrupt context.

Signed-off-by: Jiaxun Yang <[email protected]>
---
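Note on the allocation strategy: since the buffer may now be allocated
from interrupt context, it can come from either allocator, so the frees
are switched to kvfree(). A minimal sketch of the pairing (illustrative
helpers only, not the patch's exact code):

	#include <linux/mm.h>		/* kvfree() */
	#include <linux/preempt.h>	/* in_interrupt() */
	#include <linux/slab.h>		/* kzalloc() */
	#include <linux/vmalloc.h>	/* vzalloc() */

	static void *alloc_fpstate_buf(size_t sz)
	{
		/* Atomic allocation when sleeping is not allowed */
		return in_interrupt() ? kzalloc(sz, GFP_ATOMIC) : vzalloc(sz);
	}

	static void free_fpstate_buf(void *buf)
	{
		kvfree(buf);	/* dispatches to kfree() or vfree() */
	}
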
arch/x86/kernel/fpu/core.c | 17 ++++
arch/x86/kernel/fpu/xstate.c | 176 ++++++++++++++++++-----------------
arch/x86/kernel/fpu/xstate.h | 1 +
3 files changed, 109 insertions(+), 85 deletions(-)

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 88dbbbde7a3a..861cbfc51c17 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -112,6 +112,22 @@ static void update_avx_timestamp(struct fpu *fpu)
fpu->avx512_timestamp = jiffies;
}

+/* Update the xstate size if more dynamic features are opted in. */
+static inline void xstate_update_size(struct fpu *fpu)
+{
+ int err;
+ struct fpstate *fpstate = fpu->fpstate;
+ u64 fpsmask = fpstate->xfeatures;
+ u64 curmask = fpsmask | xfeatures_in_use();
+
+ if (fpu_state_size_dynamic()) {
+ if (fpsmask != curmask) {
+ err = fpstate_realloc(fpu, curmask);
+ WARN_ON_FPU(err);
+ }
+ }
+}
+
/*
* Save the FPU register state in fpu->fpstate->regs. The register state is
* preserved.
@@ -129,6 +145,7 @@ static void update_avx_timestamp(struct fpu *fpu)
void save_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
+ xstate_update_size(fpu);
os_xsave(fpu->fpstate);
update_avx_timestamp(fpu);
return;
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index df8a70a055a9..8519a6286d0d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -7,10 +7,12 @@
#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
+#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
+#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>

@@ -1414,71 +1416,10 @@ static int __init xstate_update_static_branch(void)
}
arch_initcall(xstate_update_static_branch)

-#ifdef CONFIG_X86_64
-
-#ifdef CONFIG_X86_DEBUG_FPU
-/*
- * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
- * can safely operate on the @fpstate buffer.
- */
-static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
-{
- u64 xfd = __this_cpu_read(xfd_state);
-
- if (fpstate->xfd == xfd)
- return true;
-
- /*
- * The XFD MSR does not match fpstate->xfd. That's invalid when
- * the passed in fpstate is current's fpstate.
- */
- if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
- return false;
-
- /*
- * XRSTOR(S) from init_fpstate are always correct as it will just
- * bring all components into init state and not read from the
- * buffer. XSAVE(S) raises #PF after init.
- */
- if (fpstate == &init_fpstate)
- return rstor;
-
- /*
- * XSAVE(S): clone(), fpu_swap_kvm_fpu()
- * XRSTORS(S): fpu_swap_kvm_fpu()
- */
-
- /*
- * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
- * the buffer area for XFD-disabled state components.
- */
- mask &= ~xfd;
-
- /*
- * Remove features which are valid in fpstate. They
- * have space allocated in fpstate.
- */
- mask &= ~fpstate->xfeatures;
-
- /*
- * Any remaining state components in 'mask' might be written
- * by XSAVE/XRSTOR. Fail validation if found.
- */
- return !mask;
-}
-
-void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
-{
- WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
-}
-#endif /* CONFIG_X86_DEBUG_FPU */
-
-
-
void fpstate_free(struct fpu *fpu)
{
if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
- vfree(fpu->fpstate);
+ kvfree(fpu->fpstate);
}

/**
@@ -1507,10 +1448,9 @@ static struct fpstate *fpu_install_fpstate(struct fpu *fpu,
/**
* fpstate_realloc - Reallocate struct fpstate for the requested new features
*
- * @xfeatures: A bitmap of xstate features which extend the enabled features
- * of that task
- * @ksize: The required size for the kernel buffer
- * @usize: The required size for user space buffers
+ * @fpu: Pointer to the struct fpu to be modified.
+ * @xfeatures: A bitmap of xstate features which should be enabled in the
+ * new buffer.
*
* Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
* terminates quickly, vfree()-induced IPIs may be a concern, but tasks
@@ -1518,22 +1458,35 @@ static struct fpstate *fpu_install_fpstate(struct fpu *fpu,
*
* Returns: 0 on success, -ENOMEM on allocation error.
*/
-static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
- unsigned int usize)
+int fpstate_realloc(struct fpu *fpu, u64 xfeatures)
{
- struct fpu *fpu = &current->thread.fpu;
struct fpstate *curfps, *newfps = NULL;
- unsigned int fpsize;
+ unsigned int ksize, fpsize;

curfps = fpu->fpstate;
+ xfeatures |= curfps->xfeatures;
+ ksize = xstate_calculate_size(xfeatures, true);
fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);

- newfps = vzalloc(fpsize);
+ /* vzalloc() may sleep, so fall back to an atomic allocation in interrupt */
+ if (in_interrupt())
+ newfps = kzalloc(fpsize, GFP_ATOMIC);
+ else
+ newfps = vzalloc(fpsize);
+
if (!newfps)
return -ENOMEM;
newfps->size = ksize;
- newfps->user_size = usize;
newfps->is_valloc = true;
+ newfps->xfeatures = xfeatures;
+
+ /*
+ * Leave the user and XFD state untouched; they are modified via the
+ * permission request syscall.
+ */
+ newfps->user_xfeatures = curfps->user_xfeatures;
+ newfps->user_size = curfps->user_size;
+ newfps->xfd = curfps->xfd;

fpregs_lock();
/*
@@ -1541,12 +1494,6 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
* swapping fpstate as that might invalidate it due to layout
* changes.
*/
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- fpregs_restore_userregs();
-
- newfps->xfeatures = curfps->xfeatures | xfeatures;
- newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
- newfps->xfd = curfps->xfd & ~xfeatures;

curfps = fpu_install_fpstate(fpu, newfps);

@@ -1556,9 +1503,67 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize,

fpregs_unlock();

- vfree(curfps);
+ kvfree(curfps);
return 0;
}
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
+ * can safely operate on the @fpstate buffer.
+ */
+static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ u64 xfd = __this_cpu_read(xfd_state);
+
+ if (fpstate->xfd == xfd)
+ return true;
+
+ /*
+ * The XFD MSR does not match fpstate->xfd. That's invalid when
+ * the passed in fpstate is current's fpstate.
+ */
+ if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
+ return false;
+
+ /*
+ * XRSTOR(S) from init_fpstate are always correct as it will just
+ * bring all components into init state and not read from the
+ * buffer. XSAVE(S) raises #PF after init.
+ */
+ if (fpstate == &init_fpstate)
+ return rstor;
+
+ /*
+ * XSAVE(S): clone(), fpu_swap_kvm_fpu()
+ * XRSTORS(S): fpu_swap_kvm_fpu()
+ */
+
+ /*
+ * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
+ * the buffer area for XFD-disabled state components.
+ */
+ mask &= ~xfd;
+
+ /*
+ * Remove features which are valid in fpstate. They
+ * have space allocated in fpstate.
+ */
+ mask &= ~fpstate->xfeatures;
+
+ /*
+ * Any remaining state components in 'mask' might be written
+ * by XSAVE/XRSTOR. Fail validation if found.
+ */
+ return !mask;
+}
+
+void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
+}
+#endif /* CONFIG_X86_DEBUG_FPU */

static int validate_sigaltstack(unsigned int usize)
{
@@ -1686,12 +1691,13 @@ int xfd_enable_feature(u64 xfd_err)
*/
spin_unlock_irq(&current->sighand->siglock);

- /*
- * Try to allocate a new fpstate. If that fails there is no way
- * out.
- */
- if (fpstate_realloc(xfd_event, ksize, usize))
- return -EFAULT;
+ /* Update user state size */
+ fpu->fpstate->user_size = usize;
+ /* Enable feature for user */
+ fpu->fpstate->user_xfeatures |= xfd_event;
+ /* Disarm XFD for this feature */
+ fpu->fpstate->xfd &= ~xfd_event;
+
return 0;
}
#else /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index f6bebaeea4ad..473af864b5d1 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -40,6 +40,7 @@ extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf);
extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf);

+extern int fpstate_realloc(struct fpu *fpu, u64 xfeatures);

extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);
--
2.30.2


2021-12-03 00:37:06

by Jiaxun Yang

Subject: [RFC PATCH 08/10] x86/fpu: Don't keep state_size in perm struct

As we now calculate the kernel fpstate size according to the
features actually in use, there is no need to keep the size of
the permitted state.

Signed-off-by: Jiaxun Yang <[email protected]>
---
arch/x86/include/asm/fpu/types.h | 8 --------
arch/x86/kernel/fpu/core.c | 1 -
arch/x86/kernel/fpu/xstate.c | 8 ++------
3 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 3c06c82ab355..a2cadaa9c305 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -409,14 +409,6 @@ struct fpu_state_perm {
*/
u64 __state_perm;

- /*
- * @__state_size:
- *
- * The size required for @__state_perm. Only valid to access
- * with sighand locked.
- */
- unsigned int __state_size;
-
/*
* @__user_state_size:
*
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 861cbfc51c17..b257e2dac39a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -473,7 +473,6 @@ void fpstate_reset(struct fpu *fpu)

/* Initialize the permission related info in fpu */
fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
- fpu->perm.__state_size = fpu_kernel_cfg.default_size;
fpu->perm.__user_state_size = fpu_user_cfg.default_size;
}

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 8519a6286d0d..4621b51d1b20 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1590,9 +1590,8 @@ static int __xstate_request_perm(u64 permitted, u64 requested)
* vendors into extending XFD for the pre AMX states, especially
* AVX512.
*/
- bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
struct fpu *fpu = &current->group_leader->thread.fpu;
- unsigned int ksize, usize;
+ unsigned int usize;
u64 mask;
int ret;

@@ -1602,7 +1601,6 @@ static int __xstate_request_perm(u64 permitted, u64 requested)

/* Calculate the resulting kernel state size */
mask = permitted | requested;
- ksize = xstate_calculate_size(mask, compacted);

/* Calculate the resulting user state size */
mask &= XFEATURE_MASK_USER_SUPPORTED;
@@ -1615,7 +1613,6 @@ static int __xstate_request_perm(u64 permitted, u64 requested)
/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
WRITE_ONCE(fpu->perm.__state_perm, requested);
/* Protected by sighand lock */
- fpu->perm.__state_size = ksize;
fpu->perm.__user_state_size = usize;
return ret;
}
@@ -1663,7 +1660,7 @@ static int xstate_request_perm(unsigned long idx)
int xfd_enable_feature(u64 xfd_err)
{
u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
- unsigned int ksize, usize;
+ unsigned int usize;
struct fpu *fpu;

if (!xfd_event) {
@@ -1681,7 +1678,6 @@ int xfd_enable_feature(u64 xfd_err)
}

fpu = &current->group_leader->thread.fpu;
- ksize = fpu->perm.__state_size;
usize = fpu->perm.__user_state_size;
/*
* The feature is permitted. State size is sufficient. Dropping
--
2.30.2


2021-12-03 00:37:08

by Jiaxun Yang

Subject: [RFC PATCH 10/10] x86/fpu: Mark AVX512 xfeatures as kernel dynamic

As only a few user space processes use AVX512, allocating the AVX512
state buffer only when it is in use can save some memory.

We are also going to see heterogeneous processors on which only some
cores support AVX512; on-demand allocation can be helpful when dealing
with such processors.

Signed-off-by: Jiaxun Yang <[email protected]>
---
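For reference, XFEATURE_MASK_AVX512 is the kernel's existing shorthand
for the three AVX512 components:

	#define XFEATURE_MASK_AVX512	(XFEATURE_MASK_OPMASK \
					 | XFEATURE_MASK_ZMM_Hi256 \
					 | XFEATURE_MASK_Hi16_ZMM)

so with this patch those three states join XTILE_DATA in being
allocated only on first use.
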
arch/x86/include/asm/fpu/xstate.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index d3a8c0855a5d..ac2744732264 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -50,7 +50,8 @@
#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA

/* Features which are dynamically allocated only when in use */
-#define XFEATURE_MASK_KERNEL_DYNAMIC XFEATURE_MASK_XTILE_DATA
+#define XFEATURE_MASK_KERNEL_DYNAMIC \
+ (XFEATURE_MASK_AVX512 | XFEATURE_MASK_XTILE_DATA)

/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
--
2.30.2


2021-12-03 00:37:13

by Jiaxun Yang

Subject: [RFC PATCH 09/10] x86/fpu: Split out XFEATURE_MASK_KERNEL_DYNAMIC

This allows features that are not XFD-armed to be dynamically
allocated in the fpstate buffer as well.

Split out XFEATURE_MASK_KERNEL_DYNAMIC and make it depend on XSAVES
(strictly speaking XSAVEC, but os_xsave() only generates the compacted
format when XSAVES is present).

Signed-off-by: Jiaxun Yang <[email protected]>
---
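Why the XSAVES dependency matters for sizing, as a simplified sketch
using this series' own helper (values shown for comparison only and
vary by CPU):

	/* Standard format: fixed CPUID offsets, full size regardless of use */
	unsigned int std  = xstate_calculate_size(fpu_kernel_cfg.max_features, false);
	/* Compacted format: a smaller xcomp_bv shrinks the buffer */
	unsigned int comp = xstate_calculate_size(fpu_kernel_cfg.default_features, true);

Only the compacted (XSAVES/XSAVEC) case lets the default buffer omit the
KERNEL_DYNAMIC features, hence the cpu_feature_enabled() check below.
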
arch/x86/include/asm/fpu/xstate.h | 3 +++
arch/x86/kernel/fpu/xstate.c | 3 ++-
2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index c46d3dd591bd..d3a8c0855a5d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -49,6 +49,9 @@
/* Features which are dynamically enabled for a process on request */
#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA

+/* Features which are dynamically allocated only when in use */
+#define XFEATURE_MASK_KERNEL_DYNAMIC XFEATURE_MASK_XTILE_DATA
+
/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 4621b51d1b20..1837e379777f 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -875,7 +875,8 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)

/* Clean out dynamic features from default */
fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
- fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_KERNEL_DYNAMIC;

fpu_user_cfg.default_features = fpu_user_cfg.max_features;
fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
--
2.30.2


2021-12-03 00:40:07

by Dave Hansen

Subject: Re: [RFC PATCH 00/10] x86: Allocate AVX512 xstate on demand

On 12/2/21 4:36 PM, Jiaxun Yang wrote:
> We are also going to see heterogeneous processors on which only some
> cores support AVX512; on-demand allocation can be helpful when dealing
> with such processors.

Really? Which x86 vendor is doing that?

2021-12-03 00:45:00

by Dave Hansen

Subject: Re: [RFC PATCH 07/10] x86/fpu: Reallocate fpstate in save_fpregs_to_fpstate

On 12/2/21 4:36 PM, Jiaxun Yang wrote:
> --- a/arch/x86/kernel/fpu/core.c
> +++ b/arch/x86/kernel/fpu/core.c
> @@ -112,6 +112,22 @@ static void update_avx_timestamp(struct fpu *fpu)
> fpu->avx512_timestamp = jiffies;
> }
>
> +/* Update the xstate size if more dynamic features are opted in. */
> +static inline void xstate_update_size(struct fpu *fpu)
> +{
> + int err;
> + struct fpstate *fpstate = fpu->fpstate;
> + u64 fpsmask = fpstate->xfeatures;
> + u64 curmask = fpsmask | xfeatures_in_use();
> +
> + if (fpu_state_size_dynamic()) {
> + if (fpsmask != curmask) {
> + err = fpstate_realloc(fpu, curmask);
> + WARN_ON_FPU(err);
> + }
> + }
> +}
> +
> /*
> * Save the FPU register state in fpu->fpstate->regs. The register state is
> * preserved.
> @@ -129,6 +145,7 @@ static void update_avx_timestamp(struct fpu *fpu)
> void save_fpregs_to_fpstate(struct fpu *fpu)
> {
> if (likely(use_xsave())) {
> + xstate_update_size(fpu);
> os_xsave(fpu->fpstate);
> update_avx_timestamp(fpu);
> return;

Have you considered what exactly happens when you hit that WARN_ON_FPU()
which otherwise ignores the allocation error? Have you considered what
happens on the os_xsave() that follows it immediately? How about what
happens the next time this task runs after that failure?


2021-12-03 00:46:09

by Jiaxun Yang

Subject: Re: [RFC PATCH 00/10] x86: Allocate AVX512 xstate on demand



On December 3, 2021 at 12:40 AM, Dave Hansen wrote:
> On 12/2/21 4:36 PM, Jiaxun Yang wrote:
>> We are also going to see heterogeneous processors on which only some
>> cores support AVX512; on-demand allocation can be helpful when dealing
>> with such processors.
>
> Really? Which x86 vendor is doing that?

Clearing the lower two bits of MSR 0xAF on Alder Lake gave me some surprise :-)

--
- Jiaxun

2021-12-03 00:58:45

by Dave Hansen

Subject: Re: [RFC PATCH 00/10] x86: Allocate AVX512 xstate on demand

On 12/2/21 4:45 PM, Jiaxun Yang wrote:
> On December 3, 2021 at 12:40 AM, Dave Hansen wrote:
>> On 12/2/21 4:36 PM, Jiaxun Yang wrote:
>>> We are also going to see heterogeneous processors on which only some
>>> cores support AVX512; on-demand allocation can be helpful when dealing
>>> with such processors.
>> Really? Which x86 vendor is doing that?
> Clearing the lower two bits of MSR 0xAF on Alder Lake gave me some surprise :-)

I honestly don't know what you're talking about. Does poking that MSR
allow AVX512 instructions to be executed on cores where it was
otherwise disabled? That's entertaining, but it's far from "supported"
or even support*able* in Linux.

2021-12-03 09:20:08

by David Laight

Subject: RE: [RFC PATCH 00/10] x86: Allocate AVX512 xstate on demand

From: Dave Hansen
> Sent: 03 December 2021 00:59
>
> On 12/2/21 4:45 PM, Jiaxun Yang wrote:
> > On December 3, 2021 at 12:40 AM, Dave Hansen wrote:
> >> On 12/2/21 4:36 PM, Jiaxun Yang wrote:
> >>> We are also going to see heterogeneous processors on which only some
> >>> cores support AVX512; on-demand allocation can be helpful when dealing
> >>> with such processors.
> >> Really? Which x86 vendor is doing that?

> > Clearing the lower two bits of MSR 0xAF on Alder Lake gave me some surprise :-)
>
> I honestly don't know what you're talking about. Does poking that MSR
> allow AVX512 instructions to be executed on cores where it was
> otherwise disabled? That's entertaining, but it's far from "supported"
> or even support*able* in Linux.

I can also imagine chips being tested and tested and marked
differently depending on what works.

So a part with faulty AVX512 will be marked as BIG+little with AVX512 disabled.
A part with working AVX512 will have the 'little' cpu disabled.

A lot of this is actually setup by the BIOS code.
Cheating will cause grief.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2021-12-03 11:39:44

by Jiaxun Yang

Subject: Re: [RFC PATCH 07/10] x86/fpu: Reallocate fpstate in save_fpregs_to_fpstate



On December 3, 2021 at 12:44 AM, Dave Hansen wrote:
> On 12/2/21 4:36 PM, Jiaxun Yang wrote:
>> --- a/arch/x86/kernel/fpu/core.c
>> +++ b/arch/x86/kernel/fpu/core.c
>> @@ -112,6 +112,22 @@ static void update_avx_timestamp(struct fpu *fpu)
>> fpu->avx512_timestamp = jiffies;
>> }
>>
>> +/* Update the xstate size if more dynamic features are opted in. */
>> +static inline void xstate_update_size(struct fpu *fpu)
>> +{
>> + int err;
>> + struct fpstate *fpstate = fpu->fpstate;
>> + u64 fpsmask = fpstate->xfeatures;
>> + u64 curmask = fpsmask | xfeatures_in_use();
>> +
>> + if (fpu_state_size_dynamic()) {
>> + if (fpsmask != curmask) {
>> + err = fpstate_realloc(fpu, curmask);
>> + WARN_ON_FPU(err);
>> + }
>> + }
>> +}
>> +
>> /*
>> * Save the FPU register state in fpu->fpstate->regs. The register state is
>> * preserved.
>> @@ -129,6 +145,7 @@ static void update_avx_timestamp(struct fpu *fpu)
>> void save_fpregs_to_fpstate(struct fpu *fpu)
>> {
>> if (likely(use_xsave())) {
>> + xstate_update_size(fpu);
>> os_xsave(fpu->fpstate);
>> update_avx_timestamp(fpu);
>> return;
>
> Have you considered what exactly happens when you hit that WARN_ON_FPU()
> which otherwise ignores the allocation error? Have you considered what
> happens on the os_xsave() that follows it immediately? How about what
> happens the next time this task runs after that failure?

Thank you for the catch.
These are questions I don't have answers to, which is why this is an RFC.

I thought it is unlikely to happen since kmalloc has an emergency pool.
But in case it happens, I guess the best way to handle it is to send
SIGILL to the corresponding user process, or to panic if it's kernel
FPU use?

Thanks.
--
- Jiaxun

2021-12-03 11:42:37

by Jiaxun Yang

Subject: Re: [RFC PATCH 00/10] x86: Allocate AVX512 xstate on demand



On December 3, 2021 at 12:58 AM, Dave Hansen wrote:
> On 12/2/21 4:45 PM, Jiaxun Yang wrote:
>> On December 3, 2021 at 12:40 AM, Dave Hansen wrote:
>>> On 12/2/21 4:36 PM, Jiaxun Yang wrote:
>>>> We are also going to see heterogeneous processors on which only some
>>>> cores support AVX512; on-demand allocation can be helpful when dealing
>>>> with such processors.
>>> Really? Which x86 vendor is doing that?
>> Clearing the lower two bits of MSR 0xAF on Alder Lake gave me some surprise :-)
>
> I honestly don't know what you're talking about. Does poking that MSR
> allow AVX512 instructions to be executed on cores where it was
> otherwise disabled? That's entertaining, but it's far from "supported"
> or even support*able* in Linux.

If it's improper I'll drop the statement from the commit message :-)

Yep, I'm entertained by kernel hacking.

Thanks

--
- Jiaxun

2021-12-03 11:51:10

by Borislav Petkov

Subject: Re: [RFC PATCH 00/10] x86: Allocate AVX512 xstate on demand

On Fri, Dec 03, 2021 at 11:42:10AM +0000, Jiaxun Yang wrote:
> If it's improper I'll drop the statement from the commit message :-)

Actually you should answer his question about what happens if you clear
those two bits in that MSR.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2021-12-03 11:56:26

by Jiaxun Yang

Subject: Re: [RFC PATCH 00/10] x86: Allocate AVX512 xstate on demand



On December 3, 2021 at 11:51 AM, Borislav Petkov wrote:
> On Fri, Dec 03, 2021 at 11:42:10AM +0000, Jiaxun Yang wrote:
>> If it's improper I'll drop the statement from the commit message :-)
>
> Actually you should answer his question about what happens if you clear
> those two bits in that MSR.

It made the AVX512 states available in XCR0. I'm still doing some
experiments to see whether AVX512 is actually enabled by this.

Thanks.

>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

--
- Jiaxun

2021-12-03 15:18:52

by Dave Hansen

Subject: Re: [RFC PATCH 07/10] x86/fpu: Reallocate fpstate in save_fpregs_to_fpstate

On 12/3/21 3:39 AM, Jiaxun Yang wrote:
>>> if (likely(use_xsave())) {
>>> + xstate_update_size(fpu);
>>> os_xsave(fpu->fpstate);
>>> update_avx_timestamp(fpu);
>>> return;
>> Have you considered what exactly happens when you hit that WARN_ON_FPU()
>> which otherwise ignores the allocation error? Have you considered what
>> happens on the os_xsave() that follows it immediately? How about what
>> happens the next time this task runs after that failure?
> Thank you for the catch.
> This is a few questions that I don't have answer, so it's a RFC.
>
> I thought it is unlikely to happen as kmalloc has emergency pool.
> But in case it happens, I guess the best way to handle it is just
> send SIGILL to corresponding user process or panic if it's kernel
> fpu use?

We've thought a *LOT* about this exact problem over the past few years.

Intel even added hardware (XFD) to prevent the situation where you land
in the context switch code, fail a memory allocation, and have to
destroy user data in registers. Without XFD, there are also zero ways
to avoid this happening to apps, *other* than preallocating the memory
in the first place.
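
The XFD mechanism referred to works roughly as follows (a simplified
sketch, not a normative description):

	1. A task touches an XFD-armed feature (e.g. AMX tile registers).
	2. The CPU raises #NM instead of executing the instruction.
	3. The kernel allocates the larger fpstate in the #NM handler, a
	   sleepable context where allocation failure can be handled cleanly.
	4. XFD is disarmed for the task and the instruction is retried.

Without XFD, the first save of new state happens in atomic
context-switch context, where an allocation failure has no clean
recovery.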

I don't think there is *any* viable path forward with this series.

2021-12-03 15:52:02

by Jiaxun Yang

Subject: Re: [RFC PATCH 07/10] x86/fpu: Reallocate fpstate in save_fpregs_to_fpstate



On December 3, 2021 at 3:18 PM, Dave Hansen wrote:
> On 12/3/21 3:39 AM, Jiaxun Yang wrote:
>>>> if (likely(use_xsave())) {
>>>> + xstate_update_size(fpu);
>>>> os_xsave(fpu->fpstate);
>>>> update_avx_timestamp(fpu);
>>>> return;
>>> Have you considered what exactly happens when you hit that WARN_ON_FPU()
>>> which otherwise ignores the allocation error? Have you considered what
>>> happens on the os_xsave() that follows it immediately? How about what
>>> happens the next time this task runs after that failure?
>> Thank you for the catch.
>> These are questions I don't have answers to, which is why this is an RFC.
>>
>> I thought it is unlikely to happen since kmalloc has an emergency pool.
>> But in case it happens, I guess the best way to handle it is to send
>> SIGILL to the corresponding user process, or to panic if it's kernel
>> FPU use?
>
> We've thought a *LOT* about this exact problem over the past few years.
>
> Intel even added hardware (XFD) to prevent the situation where you land
> in the context switch code, fail a memory allocation, and have to
> destroy user data in registers. Without XFD, there are also zero ways
> to avoid this happening to apps, *other* than preallocating the memory
> in the first place.
>
> I don't think there is *any* viable path forward with this series.

Hmm, actually I can come up with some ways to work around it.
For example, we could keep some sort of preallocated emergency pool
sized for max_features and draw from it on allocation failure during
context switch. We'd get a chance to refill the pool after returning
from interrupt context :-)

But maybe you are right; it's not for me, a first-year undergraduate
student, to second-guess solutions from thousands of brilliant brains
at Intel.

I appreciate your comments; they helped me understand the nature of the problem.

Thanks.
--
- Jiaxun

2021-12-04 11:56:08

by Peter Zijlstra

Subject: Re: [RFC PATCH 00/10] x86: Allocate AVX512 xstate on demand

On Fri, Dec 03, 2021 at 12:36:26AM +0000, Jiaxun Yang wrote:
> This series allocates the AVX512 xstate buffer on demand.
> It can save some memory (~2k for a thread not using AVX512).
>
> We are also going to see heterogeneous processors on which only some
> cores support AVX512; on-demand allocation can be helpful when dealing
> with such processors.
>
> Patches 1~6 are preparations.
> Patch 7 moves reallocation of the xstate buffer into save_fpregs_to_fpstate().
> Patches 8~10 are cleanup and enablement work.
>
> Tested on a Skylake-X system with the AVX512-intensive y-cruncher and
> NumPy workloads; the performance impact seems negligible.
>
> Any suggestions are welcome.

If we're going to do asymmetric AVX512, it needs to come with a prctl()
interface like AMX.
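
For reference, the AMX opt-in pattern referred to looks roughly like
this from userspace (a sketch; the constants are copied from the uapi
headers and are assumptions rather than part of this thread):

	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef ARCH_REQ_XCOMP_PERM
	#define ARCH_REQ_XCOMP_PERM	0x1023
	#endif
	#define XFEATURE_XTILEDATA	18

	/* Ask the kernel for permission to use the dynamic feature. */
	static int request_amx(void)
	{
		return syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM,
			       XFEATURE_XTILEDATA);
	}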