Except on x86, preempt_count is always accessed with READ_ONCE.
Repeated invocations in macros like irq_count() produce repeated loads.
These redundant instructions appear in various fast paths. In the one
shown below, for example, irq_count() is evaluated during kernel entry
if !tick_nohz_full_cpu(smp_processor_id()).
0001ed0a <irq_enter_rcu>:
1ed0a: 4e56 0000 linkw %fp,#0
1ed0e: 200f movel %sp,%d0
1ed10: 0280 ffff e000 andil #-8192,%d0
1ed16: 2040 moveal %d0,%a0
1ed18: 2028 0008 movel %a0@(8),%d0
1ed1c: 0680 0001 0000 addil #65536,%d0
1ed22: 2140 0008 movel %d0,%a0@(8)
1ed26: 082a 0001 000f btst #1,%a2@(15)
1ed2c: 670c beqs 1ed3a <irq_enter_rcu+0x30>
1ed2e: 2028 0008 movel %a0@(8),%d0
1ed32: 2028 0008 movel %a0@(8),%d0
1ed36: 2028 0008 movel %a0@(8),%d0
1ed3a: 4e5e unlk %fp
1ed3c: 4e75 rts
This patch doesn't prevent the pointless btst and beqs instructions
above, but it does eliminate 2 of the 3 pointless move instructions
here and elsewhere.
On x86, preempt_count is per-cpu data and the problem does not arise,
perhaps because the compiler is free to perform similar optimizations.
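
For reference, the repeated loads come from the generic accessor, which
on !x86 is roughly the following (paraphrasing include/asm-generic/preempt.h;
exact definitions vary by architecture):

    static __always_inline int preempt_count(void)
    {
            return READ_ONCE(current_thread_info()->preempt_count);
    }

so the old definition,

    #define irq_count() (nmi_count() | hardirq_count() | softirq_count())

expands to three separate READ_ONCE() accesses. Since READ_ONCE() is a
volatile access, the compiler must emit a load for each one, which is
where the three movel instructions above come from. Reading
preempt_count() once and masking with NMI_MASK | HARDIRQ_MASK |
SOFTIRQ_MASK leaves a single load.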
Cc: Thomas Gleixner <[email protected]>
Fixes: 15115830c887 ("preempt: Cleanup the macro maze a bit")
Signed-off-by: Finn Thain <[email protected]>
---
This patch was tested on m68k and x86. I was expecting no changes
to object code for x86 and mostly that's what I saw. However, there
were a few places where code generation was perturbed for some reason.
---
include/linux/preempt.h | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 0df425bf9bd7..953358e40291 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -102,10 +102,11 @@ static __always_inline unsigned char interrupt_context_level(void)
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#ifdef CONFIG_PREEMPT_RT
# define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
+# define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
#else
# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
+# define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
#endif
-#define irq_count() (nmi_count() | hardirq_count() | softirq_count())
/*
* Macros to retrieve the current execution context:
@@ -118,7 +119,11 @@ static __always_inline unsigned char interrupt_context_level(void)
#define in_nmi() (nmi_count())
#define in_hardirq() (hardirq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
-#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq()))
+#ifdef CONFIG_PREEMPT_RT
+# define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
+#else
+# define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+#endif
/*
* The following macros are deprecated and should not be used in new code:
--
2.39.3
On Fri, Jul 14, 2023 at 07:18:31PM +1000, Finn Thain wrote:
> Except on x86, preempt_count is always accessed with READ_ONCE.
> Repeated invocations in macros like irq_count() produce repeated loads.
> These redundant instructions appear in various fast paths. In the one
> shown below, for example, irq_count() is evaluated during kernel entry
> if !tick_nohz_full_cpu(smp_processor_id()).
>
> 0001ed0a <irq_enter_rcu>:
> 1ed0a: 4e56 0000 linkw %fp,#0
> 1ed0e: 200f movel %sp,%d0
> 1ed10: 0280 ffff e000 andil #-8192,%d0
> 1ed16: 2040 moveal %d0,%a0
> 1ed18: 2028 0008 movel %a0@(8),%d0
> 1ed1c: 0680 0001 0000 addil #65536,%d0
> 1ed22: 2140 0008 movel %d0,%a0@(8)
> 1ed26: 082a 0001 000f btst #1,%a2@(15)
> 1ed2c: 670c beqs 1ed3a <irq_enter_rcu+0x30>
> 1ed2e: 2028 0008 movel %a0@(8),%d0
> 1ed32: 2028 0008 movel %a0@(8),%d0
> 1ed36: 2028 0008 movel %a0@(8),%d0
> 1ed3a: 4e5e unlk %fp
> 1ed3c: 4e75 rts
>
> This patch doesn't prevent the pointless btst and beqs instructions
> above, but it does eliminate 2 of the 3 pointless move instructions
> here and elsewhere.
>
> On x86, preempt_count is per-cpu data and the problem does not arise,
> perhaps because the compiler is free to perform similar optimizations.
>
> Cc: Thomas Gleixner <[email protected]>
> Fixes: 15115830c887 ("preempt: Cleanup the macro maze a bit")
Does this optimization really deserve a "Fixes:" tag?
> Signed-off-by: Finn Thain <[email protected]>
> ---
> This patch was tested on m68k and x86. I was expecting no changes
> to object code for x86 and mostly that's what I saw. However, there
> were a few places where code generation was perturbed for some reason.
> ---
> include/linux/preempt.h | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> index 0df425bf9bd7..953358e40291 100644
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -102,10 +102,11 @@ static __always_inline unsigned char interrupt_context_level(void)
> #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
> #ifdef CONFIG_PREEMPT_RT
> # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
> +# define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
> #else
> # define softirq_count() (preempt_count() & SOFTIRQ_MASK)
> +# define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
> #endif
> -#define irq_count() (nmi_count() | hardirq_count() | softirq_count())
Perhaps add a comment as to why you're making these two versions (i.e. because
that avoids three consecutive reads); otherwise people may be tempted to roll
that back again in the future to make the code shorter.
>
> /*
> * Macros to retrieve the current execution context:
> @@ -118,7 +119,11 @@ static __always_inline unsigned char interrupt_context_level(void)
> #define in_nmi() (nmi_count())
> #define in_hardirq() (hardirq_count())
> #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
> -#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq()))
> +#ifdef CONFIG_PREEMPT_RT
> +# define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
> +#else
> +# define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
> +#endif
Same here, thanks!
>
> /*
> * The following macros are deprecated and should not be used in new code:
> --
> 2.39.3
>
Hello Frederic,
Thanks for your review.
On Fri, 14 Jul 2023, Frederic Weisbecker wrote:
> On Fri, Jul 14, 2023 at 07:18:31PM +1000, Finn Thain wrote:
> > Except on x86, preempt_count is always accessed with READ_ONCE.
> > Repeated invocations in macros like irq_count() produce repeated loads.
> > These redundant instructions appear in various fast paths. In the one
> > shown below, for example, irq_count() is evaluated during kernel entry
> > if !tick_nohz_full_cpu(smp_processor_id()).
> >
> > 0001ed0a <irq_enter_rcu>:
> > 1ed0a: 4e56 0000 linkw %fp,#0
> > 1ed0e: 200f movel %sp,%d0
> > 1ed10: 0280 ffff e000 andil #-8192,%d0
> > 1ed16: 2040 moveal %d0,%a0
> > 1ed18: 2028 0008 movel %a0@(8),%d0
> > 1ed1c: 0680 0001 0000 addil #65536,%d0
> > 1ed22: 2140 0008 movel %d0,%a0@(8)
> > 1ed26: 082a 0001 000f btst #1,%a2@(15)
> > 1ed2c: 670c beqs 1ed3a <irq_enter_rcu+0x30>
> > 1ed2e: 2028 0008 movel %a0@(8),%d0
> > 1ed32: 2028 0008 movel %a0@(8),%d0
> > 1ed36: 2028 0008 movel %a0@(8),%d0
> > 1ed3a: 4e5e unlk %fp
> > 1ed3c: 4e75 rts
> >
> > This patch doesn't prevent the pointless btst and beqs instructions
> > above, but it does eliminate 2 of the 3 pointless move instructions
> > here and elsewhere.
> >
> > On x86, preempt_count is per-cpu data and the problem does not arise,
> > perhaps because the compiler is free to perform similar optimizations.
> >
> > Cc: Thomas Gleixner <[email protected]>
> > Fixes: 15115830c887 ("preempt: Cleanup the macro maze a bit")
>
> Does this optimization really deserve a "Fixes:" tag?
>
If commit 15115830c887 produced a performance regression, then the Fixes
tag is required by Documentation/process/handling-regressions.rst.
I didn't tackle that question because the m68k port doesn't have
high-resolution timers or hardware instrumentation like those available
on, say, arm64 or ppc64.
Perhaps someone could submit this patch for automated testing on a
suitable architecture?
> > Signed-off-by: Finn Thain <[email protected]>
> > ---
> > This patch was tested on m68k and x86. I was expecting no changes
> > to object code for x86 and mostly that's what I saw. However, there
> > were a few places where code generation was perturbed for some reason.
> > ---
> > include/linux/preempt.h | 9 +++++++--
> > 1 file changed, 7 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> > index 0df425bf9bd7..953358e40291 100644
> > --- a/include/linux/preempt.h
> > +++ b/include/linux/preempt.h
> > @@ -102,10 +102,11 @@ static __always_inline unsigned char interrupt_context_level(void)
> > #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
> > #ifdef CONFIG_PREEMPT_RT
> > # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
> > +# define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
> > #else
> > # define softirq_count() (preempt_count() & SOFTIRQ_MASK)
> > +# define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
> > #endif
> > -#define irq_count() (nmi_count() | hardirq_count() | softirq_count())
>
> Perhaps add a comment as to why you're making these two versions (i.e.
> because that avoids three consecutive reads); otherwise people may be
> tempted to roll that back again in the future to make the code shorter.
>
OK.
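
Something like this above the #ifdef, perhaps (just a sketch of the
wording, to be refined):

    /*
     * The irq_count() definitions below deliberately read preempt_count()
     * only once. On most architectures preempt_count() is a READ_ONCE()
     * access, so writing this as nmi_count() | hardirq_count() |
     * softirq_count() would force the compiler to emit three separate
     * loads.
     */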
> >
> > /*
> > * Macros to retrieve the current execution context:
> > @@ -118,7 +119,11 @@ static __always_inline unsigned char interrupt_context_level(void)
> > #define in_nmi() (nmi_count())
> > #define in_hardirq() (hardirq_count())
> > #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
> > -#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq()))
> > +#ifdef CONFIG_PREEMPT_RT
> > +# define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
> > +#else
> > +# define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
> > +#endif
>
> Same here, thanks!
>
Will do.
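
Probably just a short note there, e.g. something like:

    /* Open-coded to avoid redundant preempt_count() loads; see irq_count(). */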
> >
> > /*
> > * The following macros are deprecated and should not be used in new code:
> > --
> > 2.39.3
> >
>