2013-08-14 13:35:06

by Peter Zijlstra

Subject: [RFC][PATCH 0/5] preempt_count rework

These patches optimize preempt_enable by firstly folding the preempt and
need_resched tests into one -- this should work for all architectures. And
secondly by providing per-arch preempt_count implementations; with x86 using
per-cpu preempt_count for fastest access.
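
The fold itself, roughly (an illustrative sketch, not the actual patch
code): keep the need_resched state as one bit of preempt_count, stored
inverted, so that the whole word reading zero means "preemptible and a
reschedule is pending" and preempt_enable() reduces to a single
decrement-and-test:

/*
 * Illustration only, not the patch code.
 * One bit of preempt_count is reserved for "need resched" and kept
 * inverted: the word being exactly 0 then means "no preempt count
 * held and a reschedule is wanted", so one test covers both.
 */
#define PREEMPT_NEED_RESCHED	0x80000000

static inline void sketch_set_need_resched(int *count)
{
	*count &= ~PREEMPT_NEED_RESCHED;	/* inverted: clearing the bit requests resched */
}

static inline void sketch_clear_need_resched(int *count)
{
	*count |= PREEMPT_NEED_RESCHED;
}

static inline bool sketch_preempt_enable_should_schedule(int *count)
{
	return --(*count) == 0;			/* one dec-and-test for both conditions */
}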

These patches have so far only been compiled for defconfig-x86_64 +
CONFIG_PREEMPT=y and boot tested with kvm -smp 4 up to wanting to mount root.

It still needs asm volatile("call preempt_schedule": : :"memory"); as per
Andi's other patches to avoid the C calling convention cluttering the
preempt_enable() sites.

---
arch/alpha/include/asm/Kbuild | 1 +
arch/arc/include/asm/Kbuild | 1 +
arch/arm/include/asm/Kbuild | 1 +
arch/arm64/include/asm/Kbuild | 1 +
arch/avr32/include/asm/Kbuild | 1 +
arch/blackfin/include/asm/Kbuild | 1 +
arch/c6x/include/asm/Kbuild | 1 +
arch/cris/include/asm/Kbuild | 1 +
arch/frv/include/asm/Kbuild | 1 +
arch/h8300/include/asm/Kbuild | 1 +
arch/hexagon/include/asm/Kbuild | 1 +
arch/ia64/include/asm/Kbuild | 1 +
arch/m32r/include/asm/Kbuild | 1 +
arch/m68k/include/asm/Kbuild | 1 +
arch/metag/include/asm/Kbuild | 1 +
arch/microblaze/include/asm/Kbuild | 1 +
arch/mips/include/asm/Kbuild | 1 +
arch/mn10300/include/asm/Kbuild | 1 +
arch/openrisc/include/asm/Kbuild | 1 +
arch/parisc/include/asm/Kbuild | 1 +
arch/powerpc/include/asm/Kbuild | 1 +
arch/s390/include/asm/Kbuild | 1 +
arch/score/include/asm/Kbuild | 1 +
arch/sh/include/asm/Kbuild | 1 +
arch/sparc/include/asm/Kbuild | 1 +
arch/tile/include/asm/Kbuild | 1 +
arch/um/include/asm/Kbuild | 1 +
arch/unicore32/include/asm/Kbuild | 1 +
arch/x86/include/asm/thread_info.h | 5 ++---
arch/x86/kernel/asm-offsets.c | 1 -
arch/x86/kernel/cpu/common.c | 5 +++++
arch/x86/kernel/entry_32.S | 7 ++-----
arch/x86/kernel/entry_64.S | 4 +---
arch/x86/kernel/process_32.c | 10 ++++++++++
arch/x86/kernel/process_64.c | 10 ++++++++++
arch/xtensa/include/asm/Kbuild | 1 +
include/linux/preempt.h | 49 ++++++++++++++++++++++++++++++++++++++-----------
include/linux/sched.h | 2 +-
include/linux/thread_info.h | 1 +
include/trace/events/sched.h | 2 +-
init/main.c | 2 +-
kernel/context_tracking.c | 3 +--
kernel/cpu/idle.c | 10 ++++++++++
kernel/sched/core.c | 31 +++++++++++++++++++------------
kernel/softirq.c | 4 ++--
kernel/timer.c | 8 ++++----
lib/smp_processor_id.c | 3 +--
47 files changed, 138 insertions(+), 48 deletions(-)


2013-08-14 13:48:49

by H. Peter Anvin

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On 08/14/2013 06:15 AM, Peter Zijlstra wrote:
> These patches optimize preempt_enable by firstly folding the preempt and
> need_resched tests into one -- this should work for all architectures. And
> secondly by providing per-arch preempt_count implementations; with x86 using
> per-cpu preempt_count for fastest access.
>
> These patches have so far only been compiled for defconfig-x86_64 +
> CONFIG_PREEMPT=y and boot tested with kvm -smp 4 up to wanting to mount root.
>
> It still needs asm volatile("call preempt_schedule": : :"memory"); as per
> Andi's other patches to avoid the C calling convention cluttering the
> preempt_enable() sites.

Hi,

I still don't see this using a decrement of the percpu variable
anywhere. The C compiler doesn't know how to generate those, so if I'm
not completely wet we will end up relying on sub_preempt_count()...
which, because it relies on taking the address of the percpu variable
will generate absolutely horrific code.

On x86, you never want to take the address of a percpu variable if you
can avoid it, as you end up generating code like:

movq %fs:0,%rax
subl $1,(%rax)

... for absolutely no good reason. You can use the existing accessors
for percpu variables, but that would make you lose the flags output
which was part of the point, so I think the whole sequence needs to be
in assembly (note that once you are manipulating percpu state you are
already in assembly.)
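
Something along these lines is what I mean -- just a sketch (untested,
reusing the __percpu_arg() helper from asm/percpu.h and the per-cpu
__preempt_count from your series): keep the decrement and the flags test
in one asm block, so the address of the per-cpu word is never formed and
the ZF from the decl is still usable:

static __always_inline bool __preempt_count_dec_and_test(void)
{
	unsigned char c;

	asm volatile("decl " __percpu_arg(0) "; setz %1"
		     : "+m" (__preempt_count), "=qm" (c)
		     : : "memory");
	return c;
}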

-hpa

2013-08-14 15:39:30

by Mike Galbraith

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On Wed, 2013-08-14 at 06:47 -0700, H. Peter Anvin wrote:

> On x86, you never want to take the address of a percpu variable if you
> can avoid it, as you end up generating code like:
>
> movq %fs:0,%rax
> subl $1,(%rax)

Hmmm..

#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))

ffffffff81438c7f: 48 c7 c3 80 11 01 00 mov $0x11180,%rbx
/*
* this_rq must be evaluated again because prev may have moved
* CPUs since it called schedule(), thus the 'rq' on its stack
* frame will be invalid.
*/
finish_task_switch(this_rq(), prev);
ffffffff81438c86: e8 25 b4 c0 ff callq ffffffff810440b0 <finish_task_switch>
* The context switch have flipped the stack from under us
* and restored the local variables which were saved when
* this task called schedule() in the past. prev == current
* is still correct, but it can be moved to another cpu/rq.
*/
cpu = smp_processor_id();
ffffffff81438c8b: 65 8b 04 25 b8 c5 00 mov %gs:0xc5b8,%eax
ffffffff81438c92: 00
rq = cpu_rq(cpu);
ffffffff81438c93: 48 98 cltq
ffffffff81438c95: 48 03 1c c5 00 f3 bb add -0x7e440d00(,%rax,8),%rbx

..so could the rq = cpu_rq(cpu) sequence be improved cycle expenditure
wise by squirreling rq pointer away in a percpu this_rq, and replacing
cpu_rq(cpu) above with a __this_cpu_read(this_rq) version of this_rq()?

-Mike

2013-08-14 15:44:34

by H. Peter Anvin

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On 08/14/2013 08:39 AM, Mike Galbraith wrote:
>
> ..so could the rq = cpu_rq(cpu) sequence be improved cycle expenditure
> wise by squirreling rq pointer away in a percpu this_rq, and replacing
> cpu_rq(cpu) above with a __this_cpu_read(this_rq) version of this_rq()?
>

Yes.

-hpa

2013-08-14 16:04:31

by Peter Zijlstra

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On Wed, Aug 14, 2013 at 06:47:45AM -0700, H. Peter Anvin wrote:

> I still don't see this using a decrement of the percpu variable
> anywhere. The C compiler doesn't know how to generate those, so if I'm
> not completely wet we will end up relying on sub_preempt_count()...
> which, because it relies on taking the address of the percpu variable
> will generate absolutely horrific code.
>
> On x86, you never want to take the address of a percpu variable if you
> can avoid it, as you end up generating code like:
>
> movq %fs:0,%rax
> subl $1,(%rax)
>

Urgh, yes you're right. I keep forgetting GCC doesn't know how to merge
those :/

OK, so something like the below would cure the worst of that I suppose.
It compiles but doesn't boot; must've done something wrong.

Someone please look at it because my asm-foo blows. I pretty much
copy/pasted this from asm/percpu.h.

---
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -20,6 +20,28 @@ static __always_inline int *preempt_coun
return &__raw_get_cpu_var(__preempt_count);
}

+#define __preempt_count_add(x) do { \
+ asm("addl %1," __percpu_arg(0) \
+ : "+m" (__preempt_count) \
+ : "ri" ((int)x) \
+ : "memory"); \
+} while (0)
+
+#define __preempt_count_sub(x) do { \
+ asm("subl %1," __percpu_arg(0) \
+ : "+m" (__preempt_count) \
+ : "ri" ((int)x) \
+ : "memory"); \
+} while (0)
+
+#define preempt_enable() do { \
+ asm("\nsubl $1," __percpu_arg(0) \
+ "\njnz 1f" \
+ "\ncall preempt_schedule" \
+ "\n1:" : "+m" (__preempt_count) \
+ : : "memory"); \
+} while (0)
+
/*
* must be macros to avoid header recursion hell
*/
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -17,6 +17,9 @@ static __always_inline int *preempt_coun
return &current_thread_info()->preempt_count;
}

+#define __preempt_count_add(x) do { current_thread_info()->preempt_count += (x); } while (0)
+#define __preempt_count_sub(x) __preempt_count_add(-(x))
+
/*
* must be macros to avoid header recursion hell
*/
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -45,8 +45,8 @@ static __always_inline bool test_preempt
extern void add_preempt_count(int val);
extern void sub_preempt_count(int val);
#else
-# define add_preempt_count(val) do { *preempt_count_ptr() += (val); } while (0)
-# define sub_preempt_count(val) do { *preempt_count_ptr() -= (val); } while (0)
+# define add_preempt_count(val) __preempt_count_add(val)
+# define sub_preempt_count(val) __preempt_count_sub(val)
#endif

#define inc_preempt_count() add_preempt_count(1)
@@ -101,17 +101,17 @@ do { \

#define preempt_enable_no_resched() sched_preempt_enable_no_resched()

+#ifndef preempt_enable
#define preempt_enable() \
do { \
preempt_enable_no_resched(); \
preempt_check_resched(); \
} while (0)
+#endif

/* For debugging and tracer internals only! */
-#define add_preempt_count_notrace(val) \
- do { *preempt_count_ptr() += (val); } while (0)
-#define sub_preempt_count_notrace(val) \
- do { *preempt_count_ptr() -= (val); } while (0)
+#define add_preempt_count_notrace(val) __preempt_count_add(val)
+#define sub_preempt_count_notrace(val) __preempt_count_sub(val)
#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)

2013-08-14 16:06:44

by Peter Zijlstra

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On Wed, Aug 14, 2013 at 05:39:11PM +0200, Mike Galbraith wrote:
> On Wed, 2013-08-14 at 06:47 -0700, H. Peter Anvin wrote:
>
> > On x86, you never want to take the address of a percpu variable if you
> > can avoid it, as you end up generating code like:
> >
> > movq %fs:0,%rax
> > subl $1,(%rax)
>
> Hmmm..
>
> #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
> #define this_rq() (&__get_cpu_var(runqueues))
>
> ffffffff81438c7f: 48 c7 c3 80 11 01 00 mov $0x11180,%rbx
> /*
> * this_rq must be evaluated again because prev may have moved
> * CPUs since it called schedule(), thus the 'rq' on its stack
> * frame will be invalid.
> */
> finish_task_switch(this_rq(), prev);
> ffffffff81438c86: e8 25 b4 c0 ff callq ffffffff810440b0 <finish_task_switch>
> * The context switch have flipped the stack from under us
> * and restored the local variables which were saved when
> * this task called schedule() in the past. prev == current
> * is still correct, but it can be moved to another cpu/rq.
> */
> cpu = smp_processor_id();
> ffffffff81438c8b: 65 8b 04 25 b8 c5 00 mov %gs:0xc5b8,%eax
> ffffffff81438c92: 00
> rq = cpu_rq(cpu);
> ffffffff81438c93: 48 98 cltq
> ffffffff81438c95: 48 03 1c c5 00 f3 bb add -0x7e440d00(,%rax,8),%rbx
>
> ..so could the rq = cpu_rq(cpu) sequence be improved cycle expenditure
> wise by squirreling rq pointer away in a percpu this_rq, and replacing
> cpu_rq(cpu) above with a __this_cpu_read(this_rq) version of this_rq()?

Well, this_rq() should already get you that. The above code sucks for
using cpu_rq() when we know cpu == smp_processor_id().

2013-08-14 16:16:16

by H. Peter Anvin

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On 08/14/2013 09:06 AM, Peter Zijlstra wrote:
> On Wed, Aug 14, 2013 at 05:39:11PM +0200, Mike Galbraith wrote:
>> On Wed, 2013-08-14 at 06:47 -0700, H. Peter Anvin wrote:
>>
>>> On x86, you never want to take the address of a percpu variable if you
>>> can avoid it, as you end up generating code like:
>>>
>>> movq %fs:0,%rax
>>> subl $1,(%rax)
>>
>> Hmmm..
>>
>> #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
>> #define this_rq() (&__get_cpu_var(runqueues))
>>
>> ffffffff81438c7f: 48 c7 c3 80 11 01 00 mov $0x11180,%rbx
>> /*
>> * this_rq must be evaluated again because prev may have moved
>> * CPUs since it called schedule(), thus the 'rq' on its stack
>> * frame will be invalid.
>> */
>> finish_task_switch(this_rq(), prev);
>> ffffffff81438c86: e8 25 b4 c0 ff callq ffffffff810440b0 <finish_task_switch>
>> * The context switch have flipped the stack from under us
>> * and restored the local variables which were saved when
>> * this task called schedule() in the past. prev == current
>> * is still correct, but it can be moved to another cpu/rq.
>> */
>> cpu = smp_processor_id();
>> ffffffff81438c8b: 65 8b 04 25 b8 c5 00 mov %gs:0xc5b8,%eax
>> ffffffff81438c92: 00
>> rq = cpu_rq(cpu);
>> ffffffff81438c93: 48 98 cltq
>> ffffffff81438c95: 48 03 1c c5 00 f3 bb add -0x7e440d00(,%rax,8),%rbx
>>
>> ..so could the rq = cpu_rq(cpu) sequence be improved cycle expenditure
>> wise by squirreling rq pointer away in a percpu this_rq, and replacing
>> cpu_rq(cpu) above with a __this_cpu_read(this_rq) version of this_rq()?
>
> Well, this_rq() should already get you that. The above code sucks for
> using cpu_rq() when we know cpu == smp_processor_id().
>

Even so, this_rq() uses __get_cpu_var() and takes its address, which
turns into a sequence like:

leaq __percpu_runqueues(%rip),%rax
addq %gs:this_cpu_off,%rax

... which is better than the above but still more heavyweight than it
would be if the pointer was itself a percpu variable.
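
Something like this (names made up, just to illustrate):

DECLARE_PER_CPU(struct rq *, this_rq_ptr);

#define this_rq_fast()	__this_cpu_read(this_rq_ptr)

... where the read compiles down to a single gs-relative load, roughly

	movq %gs:this_rq_ptr,%rax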

-hpa

2013-08-14 16:48:54

by Andi Kleen

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On Wed, Aug 14, 2013 at 03:15:39PM +0200, Peter Zijlstra wrote:
> These patches optimize preempt_enable by firstly folding the preempt and
> need_resched tests into one -- this should work for all architectures. And
> secondly by providing per-arch preempt_count implementations; with x86 using
> per-cpu preempt_count for fastest access.
>
> These patches have so far only been compiled for defconfig-x86_64 +
> CONFIG_PREEMPT=y and boot tested with kvm -smp 4 up to wanting to mount root.
>
> It still needs asm volatile("call preempt_schedule": : :"memory"); as per
> Andi's other patches to avoid the C calling convention cluttering the
> preempt_enable() sites.

FWIW I removed the user_schedule in v2 because I don't need it anymore.
Feel free to pick it up from v1 though.

It needs two patches: the one adding SAVE_ALL for 32-bit, and
the parts of the put_user() patch adding user_schedule.

When it's not used in user_ anymore it should probably
be renamed too, to preempt_schedule or somesuch,
and probably moved to a different file.

-Andi

2013-08-14 16:52:41

by Peter Zijlstra

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On Wed, Aug 14, 2013 at 09:14:34AM -0700, H. Peter Anvin wrote:
> Even so, this_rq() uses __get_cpu_var() and takes its address, which
> turns into a sequence like:
>
> leaq __percpu_runqueues(%rip),%rax
> addq %gs:this_cpu_off,%rax
>
> ... which is better than the above but still more heavyweight than it
> would be if the pointer was itself a percpu variable.

Oh curses, this is because lea can't do segment offsets? So there's no
sane way to get addresses of per-cpu variables.

Because ideally we'd have something like:

lea %gs:__percpu_runqueues,%rax

So in this case it makes sense to also store the actual pointer; how
unfortunate.

2013-08-14 16:55:15

by Peter Zijlstra

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On Wed, Aug 14, 2013 at 09:48:27AM -0700, Andi Kleen wrote:

> FWIW I removed the user_schedule in v2 because I don't need it anymore.
> Feel free to pick it up from v1 though.

Ah, I had a quick look through your v2 because I got a link into it from
Ingo but didn't find it. I'll have to ask Google to locate this v1. I
suppose that's what's wrong with my last patch. It directly does a call
preempt_schedule -- which I had hoped would work due to its asmlinkage,
but apparently there's more to it.

Also, I suppose there's a few volatile thingies missing.

2013-08-14 16:58:58

by H. Peter Anvin

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On 08/14/2013 09:52 AM, Peter Zijlstra wrote:
>
> Oh curses, this is because lea can't do segment offsets? So there's no
> sane way to get addresses of per-cpu variables.
>
> Because ideally we'd have something like:
>
> lea %gs:__percpu_runqueues,%rax
>
> So in this case it makes sense to also store the actual pointer; how
> unfortunate.
>

Correct; the segment offset is not part of the effective address, even
in 64-bit mode.

-hpa

2013-08-14 17:12:23

by Andi Kleen

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On Wed, Aug 14, 2013 at 06:55:05PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 14, 2013 at 09:48:27AM -0700, Andi Kleen wrote:
>
> > FWIW I removed the user_schedule in v2 because I don't need it anymore.
> > Feel free to pick it up from v1 though.
>
> Ah, I had a quick look through your v2 because I got a link into it from
> Ingo but didn't find it. I'll have to ask Google to locate this v1. I

Sorry, it's the thread starting with

http://permalink.gmane.org/gmane.linux.kernel/1541950

I also pushed the branch to git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc.git

uaccess-opt311 is v1
uaccess-opt311-2 is v2 (without user_schedule)


> suppose that's what's wrong with my last patch. It directly does a call
> preempt_schedule -- which I had hoped would work due to its asmlinkage,
> but apparently there's more to it.

preempt_schedule does not preserve registers, so yes that would
explain a lot of problems.
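
To make a direct call from inline asm safe you would have to declare the
whole caller-clobbered set yourself. A 64-bit-only sketch (macro name made
up, not from either patch series), just to show the cost:

#define preempt_enable_direct_call() do {				\
	asm volatile("subl $1," __percpu_arg(0) "\n\t"			\
		     "jnz 1f\n\t"					\
		     "call preempt_schedule\n"				\
		     "1:"						\
		     : "+m" (__preempt_count)				\
		     : : "rax", "rcx", "rdx", "rsi", "rdi",		\
		       "r8", "r9", "r10", "r11", "memory", "cc");	\
} while (0)

All that clobbering is what a small assembly thunk (like the
user_schedule wrapper in v1) avoids: it saves and restores the registers
around the real call, so the inline asm needs no clobbers at all.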

-Andi
--
[email protected] -- Speaking for myself only

2013-08-14 17:31:31

by Peter Zijlstra

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On Wed, Aug 14, 2013 at 06:04:16PM +0200, Peter Zijlstra wrote:
> OK, so something like the below would cure the worst of that I suppose.
> It compiles but doesn't boot; must've done something wrong.
>
> Someone please look at it because my asm-foo blows. I pretty much
> copy/pasted this from asm/percpu.h.

OK, another hatchet job, now with bits borrowed from Andi's
voluntary-preempt v1. This one actually boots.

---
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -48,6 +48,8 @@ For 32-bit we have the following convent

#include <asm/dwarf2.h>

+#ifdef CONFIG_X86_64
+
/*
* 64-bit system call stack frame layout defines and helpers,
* for assembly code:
@@ -192,3 +194,51 @@ For 32-bit we have the following convent
.macro icebp
.byte 0xf1
.endm
+
+#else /* CONFIG_X86_64 */
+
+/*
+ * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
+ * are different from the entry_32.S versions in not changing the segment
+ * registers. So only suitable for in kernel use, not when transitioning
+ * from or to user space. The resulting stack frame is not a standard
+ * pt_regs frame. The main use case is calling C code from assembler
+ * when all the registers need to be preserved.
+ */
+
+ .macro SAVE_ALL
+ pushl_cfi %eax
+ CFI_REL_OFFSET eax, 0
+ pushl_cfi %ebp
+ CFI_REL_OFFSET ebp, 0
+ pushl_cfi %edi
+ CFI_REL_OFFSET edi, 0
+ pushl_cfi %esi
+ CFI_REL_OFFSET esi, 0
+ pushl_cfi %edx
+ CFI_REL_OFFSET edx, 0
+ pushl_cfi %ecx
+ CFI_REL_OFFSET ecx, 0
+ pushl_cfi %ebx
+ CFI_REL_OFFSET ebx, 0
+ .endm
+
+ .macro RESTORE_ALL
+ popl_cfi %ebx
+ CFI_RESTORE ebx
+ popl_cfi %ecx
+ CFI_RESTORE ecx
+ popl_cfi %edx
+ CFI_RESTORE edx
+ popl_cfi %esi
+ CFI_RESTORE esi
+ popl_cfi %edi
+ CFI_RESTORE edi
+ popl_cfi %ebp
+ CFI_RESTORE ebp
+ popl_cfi %eax
+ CFI_RESTORE eax
+ .endm
+
+#endif /* CONFIG_X86_64 */
+
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -20,6 +20,28 @@ static __always_inline int *preempt_coun
return &__raw_get_cpu_var(__preempt_count);
}

+#define __preempt_count_add(x) do { \
+ asm volatile("addl %1," __percpu_arg(0) \
+ : "+m" (__preempt_count) \
+ : "ri" ((int)x) \
+ : "memory"); \
+} while (0)
+
+#define __preempt_count_sub(x) do { \
+ asm volatile("subl %1," __percpu_arg(0) \
+ : "+m" (__preempt_count) \
+ : "ri" ((int)x) \
+ : "memory"); \
+} while (0)
+
+#define __preempt_enable() do { \
+ asm volatile("\nsubl $1," __percpu_arg(0) \
+ "\njnz 1f" \
+ "\ncall __preempt_schedule" \
+ "\n1:" \
+ : "+m" (__preempt_count) : : "memory"); \
+} while (0)
+
/*
* must be macros to avoid header recursion hell
*/
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -36,6 +36,8 @@ obj-y += tsc.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o

+obj-$(CONFIG_PREEMPT) += preempt.o
+
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
--- /dev/null
+++ b/arch/x86/kernel/preempt.S
@@ -0,0 +1,25 @@
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/asm.h>
+#include <asm/calling.h>
+
+ENTRY(__preempt_schedule)
+ CFI_STARTPROC
+ SAVE_ALL
+ call preempt_schedule
+ RESTORE_ALL
+ ret
+ CFI_ENDPROC
+
+#ifdef CONFIG_CONTEXT_TRACKING
+
+ENTRY(__preempt_schedule_context)
+ CFI_STARTPROC
+ SAVE_ALL
+ call preempt_schedule_context
+ RESTORE_ALL
+ ret
+ CFI_ENDPROC
+
+#endif
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -17,6 +17,9 @@ static __always_inline int *preempt_coun
return &current_thread_info()->preempt_count;
}

+#define __preempt_count_add(x) do { current_thread_info()->preempt_count += (x); } while (0)
+#define __preempt_count_sub(x) __preempt_count_add(-(x))
+
/*
* must be macros to avoid header recursion hell
*/
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -45,8 +45,8 @@ static __always_inline bool test_preempt
extern void add_preempt_count(int val);
extern void sub_preempt_count(int val);
#else
-# define add_preempt_count(val) do { *preempt_count_ptr() += (val); } while (0)
-# define sub_preempt_count(val) do { *preempt_count_ptr() -= (val); } while (0)
+# define add_preempt_count(val) __preempt_count_add(val)
+# define sub_preempt_count(val) __preempt_count_sub(val)
#endif

#define inc_preempt_count() add_preempt_count(1)
@@ -64,7 +64,7 @@ do { \

#ifdef CONFIG_CONTEXT_TRACKING

-void preempt_schedule_context(void);
+asmlinkage void preempt_schedule_context(void);

#define preempt_check_resched_context() \
do { \
@@ -101,17 +101,19 @@ do { \

#define preempt_enable_no_resched() sched_preempt_enable_no_resched()

+#ifndef __preempt_enable
#define preempt_enable() \
do { \
preempt_enable_no_resched(); \
preempt_check_resched(); \
} while (0)
+#else
+#define preempt_enable() __preempt_enable()
+#endif

/* For debugging and tracer internals only! */
-#define add_preempt_count_notrace(val) \
- do { *preempt_count_ptr() += (val); } while (0)
-#define sub_preempt_count_notrace(val) \
- do { *preempt_count_ptr() -= (val); } while (0)
+#define add_preempt_count_notrace(val) __preempt_count_add(val)
+#define sub_preempt_count_notrace(val) __preempt_count_sub(val)
#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)

2013-08-15 09:03:01

by Mike Galbraith

Subject: Re: [RFC][PATCH 0/5] preempt_count rework

On Wed, 2013-08-14 at 08:43 -0700, H. Peter Anvin wrote:
> On 08/14/2013 08:39 AM, Mike Galbraith wrote:
> >
> > ..so could the rq = cpu_rq(cpu) sequence be improved cycle expenditure
> > wise by squirreling rq pointer away in a percpu this_rq, and replacing
> > cpu_rq(cpu) above with a __this_cpu_read(this_rq) version of this_rq()?
> >
>
> Yes.

Oh darn, that worked out about as you'd expect. Cycles are so far down
in the frog hair as to be invisible, so it's not worth the space cost.

pinned sched_yield proggy, switches/sec, 3 boots/5 runs each:

        run 1    run 2    run 3    run 4    run 5       boot avg
pre:  1650522  1580422  1604430  1611697  1612928      1611999.8
      1682789  1609103  1603866  1559040  1607424      1612444.4
      1608265  1607513  1606730  1607079  1635914      1613100.2
                                        avg of avgs    1612514.8   1.000

post: 1649396  1595364  1621720  1643665  1641829      1630394.8
      1571322  1591638  1575406  1629960  1592129      1592091.0
      1641807  1622591  1620581  1651145  1663025      1639829.8
                                        avg of avgs    1620771.8   1.005

---
kernel/sched/core.c | 8 ++++----
kernel/sched/sched.h | 12 +++++++++---
2 files changed, 13 insertions(+), 7 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -111,6 +111,7 @@ void start_bandwidth_timer(struct hrtime

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq *, runqueue);

static void update_rq_clock_task(struct rq *rq, s64 delta);

@@ -2390,7 +2391,7 @@ static void __sched __schedule(void)
need_resched:
preempt_disable();
cpu = smp_processor_id();
- rq = cpu_rq(cpu);
+ rq = this_rq();
rcu_note_context_switch(cpu);
prev = rq->curr;

@@ -2447,8 +2448,7 @@ static void __sched __schedule(void)
* this task called schedule() in the past. prev == current
* is still correct, but it can be moved to another cpu/rq.
*/
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
+ rq = this_rq();
} else
raw_spin_unlock_irq(&rq->lock);

@@ -6470,7 +6470,7 @@ void __init sched_init(void)
for_each_possible_cpu(i) {
struct rq *rq;

- rq = cpu_rq(i);
+ rq = per_cpu(runqueue, i) = &per_cpu(runqueues, i);
raw_spin_lock_init(&rq->lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -537,11 +537,17 @@ static inline int cpu_of(struct rq *rq)

DECLARE_PER_CPU(struct rq, runqueues);

-#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
-#define this_rq() (&__get_cpu_var(runqueues))
+/*
+ * Runqueue pointer for use by macros to avoid costly code generated
+ * by taking the address of percpu variables.
+ */
+DECLARE_PER_CPU(struct rq *, runqueue);
+
+#define cpu_rq(cpu) (per_cpu(runqueue, (cpu)))
+#define this_rq() (__this_cpu_read(runqueue))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define raw_rq() (&__raw_get_cpu_var(runqueues))
+#define raw_rq() (__raw_get_cpu_var(runqueue))

static inline u64 rq_clock(struct rq *rq)
{