... generating slightly smaller code.
Signed-off-by: Jan Beulich <[email protected]>
---
arch/x86/include/asm/hardirq.h | 2 +-
arch/x86/include/asm/percpu.h | 24 ++++++++++++++++++++++++
arch/x86/kernel/cpu/mcheck/mce.c | 4 ++--
3 files changed, 27 insertions(+), 3 deletions(-)
--- linux-2.6.34-rc5/arch/x86/include/asm/hardirq.h 2010-02-24 19:52:17.000000000 +0100
+++ 2.6.34-rc5-x86-inc-percpu/arch/x86/include/asm/hardirq.h 2010-04-15 11:58:06.000000000 +0200
@@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpust
#define __ARCH_IRQ_STAT
-#define inc_irq_stat(member) percpu_add(irq_stat.member, 1)
+#define inc_irq_stat(member) percpu_inc(irq_stat.member)
#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
--- linux-2.6.34-rc5/arch/x86/include/asm/percpu.h 2010-04-20 16:07:41.000000000 +0200
+++ 2.6.34-rc5-x86-inc-percpu/arch/x86/include/asm/percpu.h 2010-04-15 11:58:06.000000000 +0200
@@ -190,6 +190,29 @@ do { \
pfo_ret__; \
})
+#define percpu_unary_op(op, var) \
+({ \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 2: \
+ asm(op "w "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 4: \
+ asm(op "l "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+})
+
/*
* percpu_read() makes gcc load the percpu variable every time it is
* accessed while percpu_read_stable() allows the value to be cached.
@@ -207,6 +230,7 @@ do { \
#define percpu_and(var, val) percpu_to_op("and", var, val)
#define percpu_or(var, val) percpu_to_op("or", var, val)
#define percpu_xor(var, val) percpu_to_op("xor", var, val)
+#define percpu_inc(var) percpu_unary_op("inc", var)
#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
--- linux-2.6.34-rc5/arch/x86/kernel/cpu/mcheck/mce.c 2010-04-20 16:07:42.000000000 +0200
+++ 2.6.34-rc5-x86-inc-percpu/arch/x86/kernel/cpu/mcheck/mce.c 2010-04-15 11:58:06.000000000 +0200
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags f
struct mce m;
int i;
- __get_cpu_var(mce_poll_count)++;
+ percpu_inc(mce_poll_count);
mce_setup(&m);
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *re
atomic_inc(&mce_entry);
- __get_cpu_var(mce_exception_count)++;
+ percpu_inc(mce_exception_count);
if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
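As an aside (not part of the patch): a minimal user-space sketch of what the
new interface gives callers. The switch on sizeof() in percpu_unary_op() only
selects the operand-size suffix, so a single percpu_inc() spelling covers all
widths; the %gs:/%fs: segment prefix that __percpu_arg(0) adds to make the
access per-CPU is omitted here so the snippet compiles on its own (GCC, x86-64).

/* Sketch only: size dispatch as in percpu_unary_op(), without the
 * per-CPU segment prefix normally supplied by __percpu_arg(0). */
#define sketch_inc(var) do {				\
	switch (sizeof(var)) {				\
	case 4: asm("incl %0" : "+m" (var)); break;	\
	case 8: asm("incq %0" : "+m" (var)); break;	\
	}						\
} while (0)

static unsigned int hits;		/* 4 bytes -> "incl" */
static unsigned long long bytes;	/* 8 bytes -> "incq" */

int main(void)
{
	sketch_inc(hits);
	sketch_inc(bytes);
	return (int)(hits + bytes);	/* 2 */
}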
On 04/21/2010 07:21 AM, Jan Beulich wrote:
> ... generating slightly smaller code.
>
> Signed-off-by: Jan Beulich <[email protected]>
How much smaller? Keep in mind that although INC is smaller than ADD,
the former has flag dependencies that the latter doesn't...
-hpa
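To make the flags point concrete, a small illustration of my own (not from the
thread; GCC inline asm): ADD rewrites all six arithmetic flags, so its flags
output stands on its own, whereas INC preserves CF, and that preserved bit is
where the extra dependency comes from.

static unsigned int v;

static void add_vs_inc(void)
{
	/* addl writes CF, OF, SF, ZF, AF and PF, so its flags result
	 * does not depend on any earlier flag-setting instruction. */
	asm("addl $1, %0" : "+m" (v));

	/* incl writes the same flags except CF, which it leaves alone;
	 * on some cores (the Pentium 4 in particular) that turns into a
	 * merge dependency on whatever last wrote CF. */
	asm("incl %0" : "+m" (v));
}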
>>> "H. Peter Anvin" 04/21/10 7:57 PM >>>
>On 04/21/2010 07:21 AM, Jan Beulich wrote:
>> ... generating slightly smaller code.
>>
>> Signed-off-by: Jan Beulich
>
>How much smaller?
The percpu_add(..., 1) -> percpu_inc() conversion saves just a single
byte per site (the immediate operand of the add; see the encoding
sketch after this mail); the other one (where percpu_...() wasn't
used at all) is certainly a bigger win, though most of that could
obviously also be achieved using percpu_add().
>Keep in mind that although INC is smaller than ADD,
>the former has flag dependencies that the latter doesn't...
Wasn't that a problem only on Pentium 4s, which (I was told when I
submitted another related patch a couple of months back) aren't a
primary target anymore?
Jan
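For the record, that single byte can be read straight off the usual encodings
(my illustration, not part of the mail; both forms need the same bytes for the
%gs-relative memory operand, so only the immediate differs):

  83 /0 ib   add r/m32, imm8   opcode + modrm/addressing + 1-byte immediate
  ff /0      inc r/m32         opcode + modrm/addressing

So percpu_inc() is exactly one byte shorter than percpu_add(..., 1) at each
call site, while the __get_cpu_var()++ conversions also save the separate
per-CPU address computation the compiler would otherwise emit.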
On 04/21/2010 11:26 PM, Jan Beulich wrote:
>
>> Keep in mind that although INC is smaller than ADD,
>> the former has flag dependencies that the latter doesn't...
>
> Wasn't that a problem only on Pentium 4s, which (I was told when I
> submitted another related patch a couple of months back) aren't a
> primary target anymore?
>
It doesn't look like it, from what I could tell after the abovementioned
conversation... I was wondering mostly how many bytes this saved across
the kernel. If it saves a byte in one place it's not so interesting,
but if it is all over it might make more sense.
It bit much worse on P4 than on most other cores, though.
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
>>> "H. Peter Anvin" <[email protected]> 22.04.10 08:48 >>>
>I was wondering mostly how many bytes this saved across
>the kernel. If it saves a byte in one place it's not so interesting,
>but if it is all over it might make more sense.
It's about twenty places where this would get used.
Jan
Commit-ID: 402af0d7c692ddcfa2333e93d3f275ebd0487926
Gitweb: http://git.kernel.org/tip/402af0d7c692ddcfa2333e93d3f275ebd0487926
Author: Jan Beulich <[email protected]>
AuthorDate: Wed, 21 Apr 2010 15:21:51 +0100
Committer: H. Peter Anvin <[email protected]>
CommitDate: Wed, 28 Apr 2010 16:58:49 -0700
x86, asm: Introduce and use percpu_inc()
... generating slightly smaller code.
Signed-off-by: Jan Beulich <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: H. Peter Anvin <[email protected]>
---
arch/x86/include/asm/hardirq.h | 2 +-
arch/x86/include/asm/percpu.h | 24 ++++++++++++++++++++++++
arch/x86/kernel/cpu/mcheck/mce.c | 4 ++--
3 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f85764..aeab29a 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
#define __ARCH_IRQ_STAT
-#define inc_irq_stat(member) percpu_add(irq_stat.member, 1)
+#define inc_irq_stat(member) percpu_inc(irq_stat.member)
#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 66a272d..0ec6d12 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -190,6 +190,29 @@ do { \
pfo_ret__; \
})
+#define percpu_unary_op(op, var) \
+({ \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 2: \
+ asm(op "w "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 4: \
+ asm(op "l "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+})
+
/*
* percpu_read() makes gcc load the percpu variable every time it is
* accessed while percpu_read_stable() allows the value to be cached.
@@ -207,6 +230,7 @@ do { \
#define percpu_and(var, val) percpu_to_op("and", var, val)
#define percpu_or(var, val) percpu_to_op("or", var, val)
#define percpu_xor(var, val) percpu_to_op("xor", var, val)
+#define percpu_inc(var) percpu_unary_op("inc", var)
#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0af..7a355dd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- __get_cpu_var(mce_poll_count)++;
+ percpu_inc(mce_poll_count);
mce_setup(&m);
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
atomic_inc(&mce_entry);
- __get_cpu_var(mce_exception_count)++;
+ percpu_inc(mce_exception_count);
if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)