2023-10-20 16:20:55

by Uros Bizjak

Subject: [PATCH] x86/percpu: Introduce const-qualified const_pcpu_hot

Some variables in pcpu_hot (currently current_task and top_of_stack)
are actually per-thread variables implemented as per-cpu variables,
and are thus stable for the duration of the respective task. There is
already an attempt to eliminate redundant reads from these variables
using the this_cpu_read_stable() asm macro, which hides the dependency
on the read memory address. However, the compiler has only a limited
ability to eliminate common subexpressions across asm statements, so
this approach meets with limited success.
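
For reference, a simplified sketch of the pattern (schematic only --
the real implementation is the percpu_stable_op() macro in
<asm/percpu.h>, which handles all operand sizes):

	/*
	 * The "p" constraint passes only the symbolic address of the
	 * variable, so the asm body declares no dependency on the
	 * memory contents -- but the compiler merges two identical
	 * asm statements only in limited circumstances.
	 */
	#define read_stable(var)				\
	({							\
		typeof(var) val__;				\
		asm("mov %%gs:%a[addr], %[val]"			\
		    : [val] "=r" (val__)			\
		    : [addr] "p" (&(var)));			\
		val__;						\
	})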

The solution is to allow more aggressive elimination by aliasing
pcpu_hot into a const-qualified const_pcpu_hot, and to read stable
per-cpu variables from this constant copy.

The current per-cpu infrastructure does not support reads from
const-qualified variables. However, when the compiler supports segment
qualifiers, it is possible to declare the const-aliased variable in
the relevant named address space. The compiler then treats accesses
to a variable declared this way as reads from a constant location,
and optimizes reads from the variable accordingly.
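
A minimal compile-only illustration of the mechanism, using GCC's
__seg_gs named address space on x86-64 (all names below are made up
for the example):

	struct hot {
		void		*task;
		unsigned long	top;
	};

	/* const: the compiler may assume the object never changes. */
	extern const struct hot __seg_gs hot_view;

	extern void opaque_call(void);

	unsigned long two_reads(void)
	{
		unsigned long a = hot_view.top;		/* one load */

		opaque_call();
		/*
		 * hot_view is a constant location, so the compiler is
		 * free to reuse the first load here instead of
		 * reloading after the opaque call.
		 */
		return a + hot_view.top;
	}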

With const-qualified const_pcpu_hot, the compiler can eliminate
redundant reads from the constant variables, reducing the number
of loads from current_task from 3766 to 3217 on a test build,
a -14.6% reduction.
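
To illustrate the kind of redundancy involved, consider a hypothetical
function (not from this patch) that mentions current twice; previously
each mention expanded to its own asm statement and its own load, while
reading through const_pcpu_hot lets the second use reuse the register
holding the first %gs-relative load:

	static bool is_named_kthread(const char *name)
	{
		/*
		 * Both uses of current now read the "constant"
		 * const_pcpu_hot.current_task, so the optimizer can
		 * fold them into a single load.
		 */
		return (current->flags & PF_KTHREAD) &&
		       !strcmp(current->comm, name);
	}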

The reduction of loads translates to the following code savings:

    text    data    bss      dec     hex filename
25477353 4389456 808452 30675261 1d4113d vmlinux-old.o
25476074 4389440 808452 30673966 1d40c2e vmlinux-new.o

representing a code size reduction of 1279 bytes.

Cc: Andy Lutomirski <[email protected]>
Cc: Brian Gerst <[email protected]>
Cc: Denys Vlasenko <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Co-developed-by: Nadav Amit <[email protected]>
Signed-off-by: Nadav Amit <[email protected]>
Signed-off-by: Uros Bizjak <[email protected]>
---
 arch/x86/include/asm/current.h   | 7 +++++++
 arch/x86/include/asm/percpu.h    | 6 +++---
 arch/x86/include/asm/processor.h | 3 +++
 arch/x86/kernel/vmlinux.lds.S    | 1 +
 include/linux/compiler.h         | 2 +-
 5 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index a1168e7b69e5..0538d2436673 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -36,8 +36,15 @@ static_assert(sizeof(struct pcpu_hot) == 64);

DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);

+/* const-qualified alias to pcpu_hot, aliased by linker. */
+DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
+			const_pcpu_hot);
+
static __always_inline struct task_struct *get_current(void)
{
+	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+		return const_pcpu_hot.current_task;
+
 	return this_cpu_read_stable(pcpu_hot.current_task);
}

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index bbcc1ca737f0..630bb912a46b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -413,9 +413,9 @@ do { \
* accessed while this_cpu_read_stable() allows the value to be cached.
* this_cpu_read_stable() is more efficient and can be used if its value
* is guaranteed to be valid across cpus. The current users include
- * get_current() and get_thread_info() both of which are actually
- * per-thread variables implemented as per-cpu variables and thus
- * stable for the duration of the respective task.
+ * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
+ * actually per-thread variables implemented as per-cpu variables and
+ * thus stable for the duration of the respective task.
*/
#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ae81a7191c1c..a807025a4dee 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -533,6 +533,9 @@ static __always_inline unsigned long current_top_of_stack(void)
* and around vm86 mode and sp0 on x86_64 is special because of the
* entry trampoline.
*/
+	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+		return pcpu_hot.top_of_stack;
+
 	return this_cpu_read_stable(pcpu_hot.top_of_stack);
}

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 54a5596adaa6..1239be7cc8d8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -46,6 +46,7 @@ ENTRY(phys_startup_64)
#endif

jiffies = jiffies_64;
+const_pcpu_hot = pcpu_hot;

#if defined(CONFIG_X86_64)
/*
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index d7779a18b24f..bf9815eaf4aa 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -212,7 +212,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
*/
#define ___ADDRESSABLE(sym, __attrs) \
 	static void * __used __attrs					\
-		__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym;
+		__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym;
 #define __ADDRESSABLE(sym) \
 	___ADDRESSABLE(sym, __section(".discard.addressable"))

--
2.41.0


2023-10-21 14:27:49

by Nadav Amit

Subject: Re: [PATCH] x86/percpu: Introduce const-qualified const_pcpu_hot



> On Oct 20, 2023, at 7:19 PM, Uros Bizjak <[email protected]> wrote:
>
> With const-qualified const_pcpu_hot, the compiler can eliminate
> redundant reads from the constant variables, reducing the number
> of loads from current_task from 3766 to 3217 on a test build,
> a -14.6% reduction.

Thanks again for doing that. LGTM! [*]

--

[*] The negative sign here is unneeded, but that’s just me being
negative. ;-)

Subject: [tip: x86/percpu] x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation

The following commit has been merged into the x86/percpu branch of tip:

Commit-ID: ed2f752e0e0a21d941ca0ee539ef3d4cd576bc5e
Gitweb: https://git.kernel.org/tip/ed2f752e0e0a21d941ca0ee539ef3d4cd576bc5e
Author: Uros Bizjak <[email protected]>
AuthorDate: Fri, 20 Oct 2023 18:19:20 +02:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Mon, 23 Oct 2023 11:27:35 +02:00

x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation

Some variables in pcpu_hot (currently current_task and top_of_stack)
are actually per-thread variables implemented as per-CPU variables,
and are thus stable for the duration of the respective task. There is
already an attempt to eliminate redundant reads from these variables
using the this_cpu_read_stable() asm macro, which hides the dependency
on the read memory address. However, the compiler has only a limited
ability to eliminate common subexpressions across asm statements, so
this approach meets with limited success.

The solution is to allow more aggressive elimination by aliasing
pcpu_hot into a const-qualified const_pcpu_hot, and to read stable
per-CPU variables from this constant copy.

The current per-CPU infrastructure does not support reads from
const-qualified variables. However, when the compiler supports segment
qualifiers, it is possible to declare the const-aliased variable in
the relevant named address space. The compiler then treats accesses
to a variable declared this way as reads from a constant location,
and optimizes reads from the variable accordingly.
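
Schematically, the intended effect on code generation looks like this
(hand-written illustration, not actual compiler output from the
measured build):

	# before: each this_cpu_read_stable() is a separate asm
	# statement, so two uses of current emit two loads
	movq	%gs:pcpu_hot, %rax
	...
	movq	%gs:pcpu_hot, %rdx

	# after: ordinary loads from a "constant" location are subject
	# to common subexpression elimination; one load is emitted and
	# its value is reused from the register
	movq	%gs:const_pcpu_hot, %rax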

With const-qualified const_pcpu_hot, the compiler can eliminate
redundant reads from the constant variables, reducing the number
of loads from current_task from 3766 to 3217 on a test build,
a 14.6% reduction.

The reduction of loads translates to the following code savings:

    text    data    bss      dec     hex filename
25477353 4389456 808452 30675261 1d4113d vmlinux-old.o
25476074 4389440 808452 30673966 1d40c2e vmlinux-new.o

representing a code size reduction of 1279 bytes.

[ mingo: Updated the changelog, EXPORT(const_pcpu_hot). ]

Co-developed-by: Nadav Amit <[email protected]>
Signed-off-by: Nadav Amit <[email protected]>
Signed-off-by: Uros Bizjak <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
 arch/x86/include/asm/current.h   | 7 +++++++
 arch/x86/include/asm/percpu.h    | 6 +++---
 arch/x86/include/asm/processor.h | 3 +++
 arch/x86/kernel/cpu/common.c     | 1 +
 arch/x86/kernel/vmlinux.lds.S    | 1 +
 include/linux/compiler.h         | 2 +-
 6 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index a1168e7..0538d24 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -36,8 +36,15 @@ static_assert(sizeof(struct pcpu_hot) == 64);

DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);

+/* const-qualified alias to pcpu_hot, aliased by linker. */
+DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
+			const_pcpu_hot);
+
static __always_inline struct task_struct *get_current(void)
{
+	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+		return const_pcpu_hot.current_task;
+
 	return this_cpu_read_stable(pcpu_hot.current_task);
}

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index bbcc1ca..b86b27d 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -413,9 +413,9 @@ do { \
* accessed while this_cpu_read_stable() allows the value to be cached.
* this_cpu_read_stable() is more efficient and can be used if its value
* is guaranteed to be valid across cpus. The current users include
- * get_current() and get_thread_info() both of which are actually
- * per-thread variables implemented as per-cpu variables and thus
- * stable for the duration of the respective task.
+ * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
+ * actually per-thread variables implemented as per-CPU variables and
+ * thus stable for the duration of the respective task.
*/
#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0086920..b47a997 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -518,6 +518,9 @@ static __always_inline unsigned long current_top_of_stack(void)
* and around vm86 mode and sp0 on x86_64 is special because of the
* entry trampoline.
*/
+	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+		return pcpu_hot.top_of_stack;
+
 	return this_cpu_read_stable(pcpu_hot.top_of_stack);
}

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 382d4e6..4cc0ab0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2051,6 +2051,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
	.top_of_stack	= TOP_OF_INIT_STACK,
};
EXPORT_PER_CPU_SYMBOL(pcpu_hot);
+EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);

#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 83d41c2..e701f2d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -46,6 +46,7 @@ ENTRY(phys_startup_64)
#endif

jiffies = jiffies_64;
+const_pcpu_hot = pcpu_hot;

#if defined(CONFIG_X86_64)
/*
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index d7779a1..bf9815e 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -212,7 +212,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
*/
#define ___ADDRESSABLE(sym, __attrs) \
 	static void * __used __attrs					\
-		__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym;
+		__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym;
 #define __ADDRESSABLE(sym) \
 	___ADDRESSABLE(sym, __section(".discard.addressable"))