2019-08-24 06:07:22

by Nadav Amit

Subject: [PATCH 0/7] x86/percpu: Use segment qualifiers

GCC 6+ supports segment qualifiers. Using them makes it possible to
implement several optimizations:

1. Avoid unnecessary instructions when an operation is carried out on a
per-cpu value that is read or written, by allowing the compiler to emit
instructions that access the per-cpu value directly (see the sketch
after this list).

2. Make this_cpu_ptr() more efficient and allow its value to be cached,
since preemption must be disabled when this_cpu_ptr() is used.

3. Provide a better alternative to this_cpu_read_stable() that caches
values more efficiently, using the alias attribute to create a const
variable.

4. Allow the compiler to perform other optimizations (e.g. CSE).

5. Use RIP-relative addressing in this_cpu_read_stable(), which makes
it PIE-ready.
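
For reference, this is roughly what the qualifier enables (a minimal
sketch, not the kernel macros themselves; read_gs_relative() is a
made-up name for illustration):

	/*
	 * A load through a __seg_gs-qualified pointer compiles to a single
	 * %gs-prefixed instruction, which the compiler can then CSE or
	 * cache like any ordinary memory access.
	 */
	typedef int __seg_gs gs_int;

	static inline int read_gs_relative(gs_int *p)
	{
		return *p;	/* e.g. "movl %gs:(%rdi), %eax" */
	}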

"size" and Peter's compare do not seem to show the impact on code size
reduction correctly. Summing the code size according to nm on defconfig
shows a minor reduction from 11451310 to 11451310 (0.09%).

RFC->v1:
* Fixing i386 build bug
* Moving chunk to the right place [Peter]

Nadav Amit (7):
compiler: Report x86 segment support
x86/percpu: Use compiler segment prefix qualifier
x86/percpu: Use C for percpu accesses when possible
x86: Fix possible caching of current_task
percpu: Assume preemption is disabled on per_cpu_ptr()
x86/percpu: Optimized arch_raw_cpu_ptr()
x86/current: Aggressive caching of current

arch/x86/include/asm/current.h | 30 +++
arch/x86/include/asm/fpu/internal.h | 7 +-
arch/x86/include/asm/percpu.h | 293 +++++++++++++++++++------
arch/x86/include/asm/preempt.h | 3 +-
arch/x86/include/asm/resctrl_sched.h | 14 +-
arch/x86/kernel/cpu/Makefile | 1 +
arch/x86/kernel/cpu/common.c | 7 +-
arch/x86/kernel/cpu/current.c | 16 ++
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 +-
arch/x86/kernel/process_32.c | 4 +-
arch/x86/kernel/process_64.c | 4 +-
include/asm-generic/percpu.h | 12 +
include/linux/compiler-gcc.h | 4 +
include/linux/compiler.h | 2 +-
include/linux/percpu-defs.h | 33 ++-
15 files changed, 346 insertions(+), 88 deletions(-)
create mode 100644 arch/x86/kernel/cpu/current.c

--
2.17.1


2019-08-24 06:07:47

by Nadav Amit

Subject: [PATCH 5/7] percpu: Assume preemption is disabled on per_cpu_ptr()

When per_cpu_ptr() is used, the caller should have preemption disabled,
as otherwise the pointer is meaningless. A user who wants an "unstable"
pointer should call raw_cpu_ptr() instead.

Add an assertion to check that preemption is indeed disabled, and
distinguish between the two cases to allow further, per-arch
optimizations.
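
As an illustration of the asserted contract, a caller is expected to
look roughly like this (a hypothetical sketch; demo_counter and
demo_update() are made-up names):

	#include <linux/percpu.h>
	#include <linux/preempt.h>

	/*
	 * this_cpu_ptr() is only meaningful while preemption is disabled;
	 * callers that knowingly tolerate migration use raw_cpu_ptr().
	 */
	static DEFINE_PER_CPU(int, demo_counter);

	static void demo_update(void)
	{
		int *p;

		preempt_disable();
		p = this_cpu_ptr(&demo_counter);	/* now assertion-checked */
		(*p)++;
		preempt_enable();
	}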

Signed-off-by: Nadav Amit <[email protected]>
---
include/asm-generic/percpu.h | 12 ++++++++++++
include/linux/percpu-defs.h | 33 ++++++++++++++++++++++++++++++++-
2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index c2de013b2cf4..7853605f4210 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -36,6 +36,14 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
#define my_cpu_offset __my_cpu_offset
#endif

+/*
+ * Determine the offset of the currently active processor when preemption is
+ * disabled. Can be overridden by arch code.
+ */
+#ifndef __raw_my_cpu_offset
+#define __raw_my_cpu_offset __my_cpu_offset
+#endif
+
/*
* Arch may define arch_raw_cpu_ptr() to provide more efficient address
* translations for raw_cpu_ptr().
@@ -44,6 +52,10 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
#define arch_raw_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, __my_cpu_offset)
#endif

+#ifndef arch_raw_cpu_ptr_preemption_disabled
+#define arch_raw_cpu_ptr_preemption_disabled(ptr) SHIFT_PERCPU_PTR(ptr, __raw_my_cpu_offset)
+#endif
+
#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
extern void setup_per_cpu_areas(void);
#endif
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index a6fabd865211..13afca8a37e7 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -237,20 +237,51 @@ do { \
SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))); \
})

+#ifndef arch_raw_cpu_ptr_preemption_disabled
+#define arch_raw_cpu_ptr_preemption_disabled(ptr) \
+ arch_raw_cpu_ptr(ptr)
+#endif
+
+#define raw_cpu_ptr_preemption_disabled(ptr) \
+({ \
+ __verify_pcpu_ptr(ptr); \
+ arch_raw_cpu_ptr_preemption_disabled(ptr); \
+})
+
+/*
+ * If preemption is enabled, we need to read the pointer atomically on
+ * raw_cpu_ptr(). However, if preemption is disabled, we can use
+ * raw_cpu_ptr_preemption_disabled(), which is potentially more efficient.
+ * Similarly, we can use the preemption-disabled version if the kernel is
+ * non-preemptible or if voluntary preemption is used.
+ */
+#ifdef CONFIG_PREEMPT
+
#define raw_cpu_ptr(ptr) \
({ \
__verify_pcpu_ptr(ptr); \
arch_raw_cpu_ptr(ptr); \
})

+#else
+
+#define raw_cpu_ptr(ptr) raw_cpu_ptr_preemption_disabled(ptr)
+
+#endif
+
#ifdef CONFIG_DEBUG_PREEMPT
+/*
+ * Unlike other this_cpu_* operations, this_cpu_ptr() requires that preemption
+ * be disabled. In contrast, raw_cpu_ptr() does not require that.
+ */
#define this_cpu_ptr(ptr) \
({ \
+ __this_cpu_preempt_check("ptr"); \
__verify_pcpu_ptr(ptr); \
SHIFT_PERCPU_PTR(ptr, my_cpu_offset); \
})
#else
-#define this_cpu_ptr(ptr) raw_cpu_ptr(ptr)
+#define this_cpu_ptr(ptr) raw_cpu_ptr_preemption_disabled(ptr)
#endif

#else /* CONFIG_SMP */
--
2.17.1

2019-08-24 06:08:03

by Nadav Amit

Subject: [PATCH 1/7] compiler: Report x86 segment support

GCC v6+ supports the x86 segment qualifiers __seg_gs and __seg_fs.
Define COMPILER_HAS_X86_SEG_SUPPORT when they are supported.
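
For reference, GCC also advertises the feature itself via preprocessor
symbols, so a version-independent probe would also be possible (a
sketch only; this patch keeps the GCC_VERSION check):

	/*
	 * GCC defines __SEG_FS and __SEG_GS whenever the named address
	 * spaces are available, independently of the version number.
	 */
	#if defined(__SEG_FS) && defined(__SEG_GS)
	#define COMPILER_HAS_X86_SEG_SUPPORT 1
	#endif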

Signed-off-by: Nadav Amit <[email protected]>
---
include/linux/compiler-gcc.h | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index d7ee4c6bad48..5967590a18c6 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -149,6 +149,10 @@
#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
#endif

+#if GCC_VERSION >= 60000
+#define COMPILER_HAS_X86_SEG_SUPPORT 1
+#endif
+
/*
* Turn individual warnings and errors on and off locally, depending
* on version.
--
2.17.1

2019-08-24 06:08:08

by Nadav Amit

Subject: [PATCH 7/7] x86/current: Aggressive caching of current

current_task is supposed to be constant within each thread and
therefore does not need to be reread. There is already an attempt to
cache it using inline assembly, via this_cpu_read_stable(), which hides
the dependency on the memory address being read.

However, this caching does not work very well. For example,
sync_mm_rss() still reads current_task twice for no reason.

Allow more aggressive caching by aliasing current_task to a constant,
const_current_task, and reading from the constant copy. Doing so
requires the compiler to support x86 segment qualifiers. Hide
const_current_task in a separate compilation unit to prevent the
compiler from assuming that the value is constant at compile time.
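
To illustrate the intended effect, consider a caller such as this (a
hypothetical sketch, not from the series; demo_owns_mm() is a made-up
name):

	#include <linux/sched.h>

	/*
	 * With the const alias, the compiler may read current once and keep
	 * it in a register, instead of re-reading %gs:current_task for each
	 * use of "current".
	 */
	static bool demo_owns_mm(struct mm_struct *mm)
	{
		return current->mm == mm && !(current->flags & PF_KTHREAD);
	}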

Signed-off-by: Nadav Amit <[email protected]>
---
arch/x86/include/asm/current.h | 30 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/Makefile | 1 +
arch/x86/kernel/cpu/common.c | 7 +------
arch/x86/kernel/cpu/current.c | 16 ++++++++++++++++
include/linux/compiler.h | 2 +-
5 files changed, 49 insertions(+), 7 deletions(-)
create mode 100644 arch/x86/kernel/cpu/current.c

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 3e204e6140b5..7f093e81a647 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -10,11 +10,41 @@ struct task_struct;

DECLARE_PER_CPU(struct task_struct *, current_task);

+#if USE_X86_SEG_SUPPORT
+
+/*
+ * Hold a constant alias for current_task, which allows the compiler to avoid
+ * re-reading the current task.
+ *
+ * We must mark const_current_task with the segment qualifiers, as otherwise gcc
+ * would do redundant reads of const_current_task.
+ */
+DECLARE_PER_CPU(struct task_struct * const __percpu_seg_override, const_current_task);
+
+static __always_inline struct task_struct *get_current(void)
+{
+ /*
+ * GCC lacks the functionality to strip segment qualifiers, which
+ * interferes with the per-cpu infrastructure that holds local copies.
+ * Use __raw_cpu_read() to avoid holding any copy.
+ */
+ return __raw_cpu_read(, const_current_task);
+}
+
+#else /* USE_X86_SEG_SUPPORT */
+
+/*
+ * Without segment qualifier support, the per-cpu infrastructure is not
+ * suitable for reading constants, so use this_cpu_read_stable() in this case.
+ */
static __always_inline struct task_struct *get_current(void)
{
return this_cpu_read_stable(current_task);
}

+#endif /* USE_X86_SEG_SUPPORT */
+
#define current get_current()

#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7a1e5a9331c..d816f03a37d7 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -19,6 +19,7 @@ CFLAGS_common.o := $(nostackp)

obj-y := cacheinfo.o scattered.o topology.o
obj-y += common.o
+obj-y += current.o
obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5cc2d51cc25e..5f7c9ee57802 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1619,13 +1619,8 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);

/*
- * The following percpu variables are hot. Align current_task to
- * cacheline size such that they fall in the same cacheline.
+ * The following percpu variables are hot.
*/
-DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
- &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;

diff --git a/arch/x86/kernel/cpu/current.c b/arch/x86/kernel/cpu/current.c
new file mode 100644
index 000000000000..3238c6e34984
--- /dev/null
+++ b/arch/x86/kernel/cpu/current.c
@@ -0,0 +1,16 @@
+#include <linux/sched/task.h>
+#include <asm/current.h>
+
+/*
+ * Align current_task to cacheline size such that it starts a new cacheline
+ * and does not share it with unrelated data.
+ */
+DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
+ &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+#if USE_X86_SEG_SUPPORT
+DECLARE_PER_CPU(struct task_struct * const __percpu_seg_override, const_current_task)
+ __attribute__((alias("current_task")));
+EXPORT_PER_CPU_SYMBOL(const_current_task);
+#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index f0fd5636fddb..1b6ee9ab6373 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -299,7 +299,7 @@ unsigned long read_word_at_a_time(const void *addr)
*/
#define __ADDRESSABLE(sym) \
static void * __section(".discard.addressable") __used \
- __PASTE(__addressable_##sym, __LINE__) = (void *)&sym;
+ __PASTE(__addressable_##sym, __LINE__) = (void *)(uintptr_t)&sym;

/**
* offset_to_ptr - convert a relative memory offset to an absolute pointer
--
2.17.1