arch_timer.h includes linux/smp.h to use DEFINE_PER_CPU() and it works
because smp.h includes percpu.h. The next commit will remove percpu.h
from smp.h and it will break this usage.
Explicitly include percpu.h and remove smp.h
Signed-off-by: Puranjay Mohan <[email protected]>
---
arch/arm64/include/asm/arch_timer.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
index 934c658ee947..f5794d50f51d 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -15,7 +15,7 @@
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/jump_label.h>
-#include <linux/smp.h>
+#include <linux/percpu.h>
#include <linux/types.h>
#include <clocksource/arm_arch_timer.h>
--
2.40.1
Historically, arm64 implemented raw_smp_processor_id() as a read of
current_thread_info()->cpu. This changed when arm64 moved thread_info
into task struct, as at the time CONFIG_THREAD_INFO_IN_TASK made core
code use thread_struct::cpu for the cpu number, and due to header
dependencies prevented using this in raw_smp_processor_id(). As a
workaround, we moved to using a percpu variable in commit:
commit 57c82954e77f ("arm64: make cpu number a percpu variable")
Since then, thread_info::cpu was reintroduced, and core code was made to
use this in commits:
commit 001430c1910d ("arm64: add CPU field to struct thread_info")
commit bcf9033e5449 ("sched: move CPU field back into thread_info if
THREAD_INFO_IN_TASK=y")
Consequently it is possible to use current_thread_info()->cpu again.
This decreases the number of emitted instructions like in the following
example:
Dump of assembler code for function bpf_get_smp_processor_id:
0xffff8000802cd608 <+0>: nop
0xffff8000802cd60c <+4>: nop
0xffff8000802cd610 <+8>: adrp x0, 0xffff800082138000
0xffff8000802cd614 <+12>: mrs x1, tpidr_el1
0xffff8000802cd618 <+16>: add x0, x0, #0x8
0xffff8000802cd61c <+20>: ldrsw x0, [x0, x1]
0xffff8000802cd620 <+24>: ret
After this patch:
Dump of assembler code for function bpf_get_smp_processor_id:
0xffff8000802c9130 <+0>: nop
0xffff8000802c9134 <+4>: nop
0xffff8000802c9138 <+8>: mrs x0, sp_el0
0xffff8000802c913c <+12>: ldr w0, [x0, #24]
0xffff8000802c9140 <+16>: ret
A microbenchmark[1] was built to measure the performance improvement
provided by this change. It calls the following function given number of
times and finds the runtime overhead:
static noinline int get_cpu_id(void)
{
return smp_processor_id();
}
Run the benchmark like:
modprobe smp_processor_id nr_function_calls=1000000000
+--------------------------+------------------------+
| | Number of Calls | Time taken |
+--------+-----------------+------------------------+
| Before | 1000000000 | 1602888401ns |
+--------+-----------------+------------------------+
| After | 1000000000 | 1206212658ns |
+--------+-----------------+------------------------+
| Difference (decrease) | 396675743ns (24.74%) |
+---------------------------------------------------+
Remove the percpu variable cpu_number as it is used only in
set_smp_ipi_range() as a dummy variable to be passed to ipi_handler().
Use irq_stat in place of cpu_number here.
[1] https://github.com/puranjaymohan/linux/commit/77d3fdd
Signed-off-by: Puranjay Mohan <[email protected]>
---
Changes in v1 -> v2:
v1: https://lore.kernel.org/all/[email protected]/
- Remove the percpu variable cpu_number
- Add more information to the commit message.
---
arch/arm64/include/asm/smp.h | 13 +------------
arch/arm64/kernel/smp.c | 9 ++-------
2 files changed, 3 insertions(+), 19 deletions(-)
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index efb13112b408..2510eec026f7 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -25,22 +25,11 @@
#ifndef __ASSEMBLY__
-#include <asm/percpu.h>
-
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/thread_info.h>
-DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
-
-/*
- * We don't use this_cpu_read(cpu_number) as that has implicit writes to
- * preempt_count, and associated (compiler) barriers, that we'd like to avoid
- * the expense of. If we're preemptible, the value can be stale at use anyway.
- * And we can't use this_cpu_ptr() either, as that winds up recursing back
- * here under CONFIG_DEBUG_PREEMPT=y.
- */
-#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number))
+#define raw_smp_processor_id() (current_thread_info()->cpu)
/*
* Logical CPU mapping.
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 4ced34f62dab..98d4e352c3d0 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -55,9 +55,6 @@
#include <trace/events/ipi.h>
-DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
/*
* as from 2.5, kernels no longer have an init_tasks structure
* so we need some other way of telling a new secondary core
@@ -742,8 +739,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
*/
for_each_possible_cpu(cpu) {
- per_cpu(cpu_number, cpu) = cpu;
-
if (cpu == smp_processor_id())
continue;
@@ -1021,12 +1016,12 @@ void __init set_smp_ipi_range(int ipi_base, int n)
if (ipi_should_be_nmi(i)) {
err = request_percpu_nmi(ipi_base + i, ipi_handler,
- "IPI", &cpu_number);
+ "IPI", &irq_stat);
WARN(err, "Could not request IPI %d as NMI, err=%d\n",
i, err);
} else {
err = request_percpu_irq(ipi_base + i, ipi_handler,
- "IPI", &cpu_number);
+ "IPI", &irq_stat);
WARN(err, "Could not request IPI %d as IRQ, err=%d\n",
i, err);
}
--
2.40.1
On 5/2/24 18:04, Puranjay Mohan wrote:
> arch_timer.h includes linux/smp.h to use DEFINE_PER_CPU() and it works
> because smp.h includes percpu.h. The next commit will remove percpu.h
> from smp.h and it will break this usage.
>
> Explicitly include percpu.h and remove smp.h
But this particular change does not seem to be necessary for changing
raw_smp_processor_id() as current_thread_info()->cpu being done in the
later patch ? You might still leave header <asm/percpu.h> inclusion in
arch/arm64/include/asm/smp.h while dropping the per cpu cpu_number ?
>
> Signed-off-by: Puranjay Mohan <[email protected]>
> ---
> arch/arm64/include/asm/arch_timer.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
> index 934c658ee947..f5794d50f51d 100644
> --- a/arch/arm64/include/asm/arch_timer.h
> +++ b/arch/arm64/include/asm/arch_timer.h
> @@ -15,7 +15,7 @@
> #include <linux/bug.h>
> #include <linux/init.h>
> #include <linux/jump_label.h>
> -#include <linux/smp.h>
> +#include <linux/percpu.h>
> #include <linux/types.h>
>
> #include <clocksource/arm_arch_timer.h>
On 5/2/24 18:04, Puranjay Mohan wrote:
> Historically, arm64 implemented raw_smp_processor_id() as a read of
> current_thread_info()->cpu. This changed when arm64 moved thread_info
> into task struct, as at the time CONFIG_THREAD_INFO_IN_TASK made core
> code use thread_struct::cpu for the cpu number, and due to header
> dependencies prevented using this in raw_smp_processor_id(). As a
> workaround, we moved to using a percpu variable in commit:
>
> commit 57c82954e77f ("arm64: make cpu number a percpu variable")
>
> Since then, thread_info::cpu was reintroduced, and core code was made to
> use this in commits:
>
> commit 001430c1910d ("arm64: add CPU field to struct thread_info")
> commit bcf9033e5449 ("sched: move CPU field back into thread_info if
> THREAD_INFO_IN_TASK=y")
>
> Consequently it is possible to use current_thread_info()->cpu again.
This captures Mark's earlier suggestion on the commit message.
>
> This decreases the number of emitted instructions like in the following
> example:
>
> Dump of assembler code for function bpf_get_smp_processor_id:
> 0xffff8000802cd608 <+0>: nop
> 0xffff8000802cd60c <+4>: nop
> 0xffff8000802cd610 <+8>: adrp x0, 0xffff800082138000
> 0xffff8000802cd614 <+12>: mrs x1, tpidr_el1
> 0xffff8000802cd618 <+16>: add x0, x0, #0x8
> 0xffff8000802cd61c <+20>: ldrsw x0, [x0, x1]
> 0xffff8000802cd620 <+24>: ret
>
> After this patch:
>
> Dump of assembler code for function bpf_get_smp_processor_id:
> 0xffff8000802c9130 <+0>: nop
> 0xffff8000802c9134 <+4>: nop
> 0xffff8000802c9138 <+8>: mrs x0, sp_el0
> 0xffff8000802c913c <+12>: ldr w0, [x0, #24]
> 0xffff8000802c9140 <+16>: ret
>
> A microbenchmark[1] was built to measure the performance improvement
> provided by this change. It calls the following function given number of
> times and finds the runtime overhead:
>
> static noinline int get_cpu_id(void)
> {
> return smp_processor_id();
> }
>
> Run the benchmark like:
> modprobe smp_processor_id nr_function_calls=1000000000
>
> +--------------------------+------------------------+
> | | Number of Calls | Time taken |
> +--------+-----------------+------------------------+
> | Before | 1000000000 | 1602888401ns |
> +--------+-----------------+------------------------+
> | After | 1000000000 | 1206212658ns |
> +--------+-----------------+------------------------+
> | Difference (decrease) | 396675743ns (24.74%) |
> +---------------------------------------------------+
>
> Remove the percpu variable cpu_number as it is used only in
> set_smp_ipi_range() as a dummy variable to be passed to ipi_handler().
> Use irq_stat in place of cpu_number here.
>
> [1] https://github.com/puranjaymohan/linux/commit/77d3fdd
>
> Signed-off-by: Puranjay Mohan <[email protected]>
> ---
> Changes in v1 -> v2:
> v1: https://lore.kernel.org/all/[email protected]/
> - Remove the percpu variable cpu_number
> - Add more information to the commit message.
> ---
> arch/arm64/include/asm/smp.h | 13 +------------
> arch/arm64/kernel/smp.c | 9 ++-------
> 2 files changed, 3 insertions(+), 19 deletions(-)
>
> diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> index efb13112b408..2510eec026f7 100644
> --- a/arch/arm64/include/asm/smp.h
> +++ b/arch/arm64/include/asm/smp.h
> @@ -25,22 +25,11 @@
>
> #ifndef __ASSEMBLY__
>
> -#include <asm/percpu.h>
> -
> #include <linux/threads.h>
> #include <linux/cpumask.h>
> #include <linux/thread_info.h>
>
> -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
> -
> -/*
> - * We don't use this_cpu_read(cpu_number) as that has implicit writes to
> - * preempt_count, and associated (compiler) barriers, that we'd like to avoid
> - * the expense of. If we're preemptible, the value can be stale at use anyway.
> - * And we can't use this_cpu_ptr() either, as that winds up recursing back
> - * here under CONFIG_DEBUG_PREEMPT=y.
> - */
> -#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number))
> +#define raw_smp_processor_id() (current_thread_info()->cpu)
>
> /*
> * Logical CPU mapping.
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 4ced34f62dab..98d4e352c3d0 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -55,9 +55,6 @@
>
> #include <trace/events/ipi.h>
>
> -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
> -EXPORT_PER_CPU_SYMBOL(cpu_number);
> -
> /*
> * as from 2.5, kernels no longer have an init_tasks structure
> * so we need some other way of telling a new secondary core
> @@ -742,8 +739,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> */
> for_each_possible_cpu(cpu) {
>
> - per_cpu(cpu_number, cpu) = cpu;
> -
> if (cpu == smp_processor_id())
> continue;
>
> @@ -1021,12 +1016,12 @@ void __init set_smp_ipi_range(int ipi_base, int n)
>
> if (ipi_should_be_nmi(i)) {
> err = request_percpu_nmi(ipi_base + i, ipi_handler,
> - "IPI", &cpu_number);
> + "IPI", &irq_stat);
> WARN(err, "Could not request IPI %d as NMI, err=%d\n",
> i, err);
> } else {
> err = request_percpu_irq(ipi_base + i, ipi_handler,
> - "IPI", &cpu_number);
> + "IPI", &irq_stat);
> WARN(err, "Could not request IPI %d as IRQ, err=%d\n",
> i, err);
> }
set_smp_ipi_range() now looks similar to what we have on arm32 platform.
Besides the arch/arm64/include/asm/smp.h changes (not sure if is just a
clean up, then might be a stand alone patch on its own) this patch LGTM.
Anshuman Khandual <[email protected]> writes:
> On 5/2/24 18:04, Puranjay Mohan wrote:
>> arch_timer.h includes linux/smp.h to use DEFINE_PER_CPU() and it works
>> because smp.h includes percpu.h. The next commit will remove percpu.h
>> from smp.h and it will break this usage.
>>
>> Explicitly include percpu.h and remove smp.h
>
> But this particular change does not seem to be necessary for changing
> raw_smp_processor_id() as current_thread_info()->cpu being done in the
> later patch ? You might still leave header <asm/percpu.h> inclusion in
> arch/arm64/include/asm/smp.h while dropping the per cpu cpu_number ?
commit 57c82954e77f ("arm64: make cpu number a percpu variable")
created this percpu variable and included <asm/percpu.h> in <asm/smp.h>
Now we are removing the percpu variable cpu_number from smp.h, so there
is no need to keep percpu.h in smp.h
I feel users of DECLARE_PER_CPU_[...], etc. should include percpu.h and
not smp.h as it makes reading the code more easier and can thwart build
issues in the future, when headers are changed.
Thanks,
Puranjay
On Thu, May 02, 2024 at 12:34:48PM +0000, Puranjay Mohan wrote:
> arch_timer.h includes linux/smp.h to use DEFINE_PER_CPU() and it works
> because smp.h includes percpu.h. The next commit will remove percpu.h
> from smp.h and it will break this usage.
>
> Explicitly include percpu.h and remove smp.h
>
> Signed-off-by: Puranjay Mohan <[email protected]>
Looks like this was a thinko in commit:
6acc71ccac7187fc ("arm64: arch_timer: Allows a CPU-specific erratum to only affect a subset of CPUs")
.. which, as you say, should have included <linux/percpu.h> rather than
<linux/smp.h>.
This is a cleanup regardless of the next patch.
Acked-by: Mark Rutland <[email protected]>
Mark.
> ---
> arch/arm64/include/asm/arch_timer.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
> index 934c658ee947..f5794d50f51d 100644
> --- a/arch/arm64/include/asm/arch_timer.h
> +++ b/arch/arm64/include/asm/arch_timer.h
> @@ -15,7 +15,7 @@
> #include <linux/bug.h>
> #include <linux/init.h>
> #include <linux/jump_label.h>
> -#include <linux/smp.h>
> +#include <linux/percpu.h>
> #include <linux/types.h>
>
> #include <clocksource/arm_arch_timer.h>
> --
> 2.40.1
>
On Fri, May 03, 2024 at 02:37:45PM +0530, Anshuman Khandual wrote:
>
>
> On 5/2/24 18:04, Puranjay Mohan wrote:
> > arch_timer.h includes linux/smp.h to use DEFINE_PER_CPU() and it works
> > because smp.h includes percpu.h. The next commit will remove percpu.h
> > from smp.h and it will break this usage.
> >
> > Explicitly include percpu.h and remove smp.h
>
> But this particular change does not seem to be necessary for changing
> raw_smp_processor_id() as current_thread_info()->cpu being done in the
> later patch ? You might still leave header <asm/percpu.h> inclusion in
> arch/arm64/include/asm/smp.h while dropping the per cpu cpu_number ?
Why would that be preferable?
The general rule is that if a file uses something explicitly, it should include
the relevant header directly rather than something that happens to transitively
include that header.
We made a mistake and included the wrong header in commit:
6acc71ccac7187fc ("arm64: arch_timer: Allows a CPU-specific erratum to only affect a subset of CPUs")
.. so we should fix that regardless of the next patch.
The point of the next patch is to effectively revert commit:
57c82954e77fa12c ("arm64: make cpu number a percpu variable")
.. and reverting that means we should stop including <asm/percpu.h> from
<asm/smp.h>; anything depending on that is already doing something wrong, and
leaving the include there only serves to paper over bugs.
Mark.
>
> >
> > Signed-off-by: Puranjay Mohan <[email protected]>
> > ---
> > arch/arm64/include/asm/arch_timer.h | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
> > index 934c658ee947..f5794d50f51d 100644
> > --- a/arch/arm64/include/asm/arch_timer.h
> > +++ b/arch/arm64/include/asm/arch_timer.h
> > @@ -15,7 +15,7 @@
> > #include <linux/bug.h>
> > #include <linux/init.h>
> > #include <linux/jump_label.h>
> > -#include <linux/smp.h>
> > +#include <linux/percpu.h>
> > #include <linux/types.h>
> >
> > #include <clocksource/arm_arch_timer.h>
On Thu, May 02, 2024 at 12:34:49PM +0000, Puranjay Mohan wrote:
> Historically, arm64 implemented raw_smp_processor_id() as a read of
> current_thread_info()->cpu. This changed when arm64 moved thread_info
> into task struct, as at the time CONFIG_THREAD_INFO_IN_TASK made core
> code use thread_struct::cpu for the cpu number, and due to header
> dependencies prevented using this in raw_smp_processor_id(). As a
> workaround, we moved to using a percpu variable in commit:
>
> commit 57c82954e77f ("arm64: make cpu number a percpu variable")
>
> Since then, thread_info::cpu was reintroduced, and core code was made to
> use this in commits:
>
> commit 001430c1910d ("arm64: add CPU field to struct thread_info")
> commit bcf9033e5449 ("sched: move CPU field back into thread_info if
> THREAD_INFO_IN_TASK=y")
>
> Consequently it is possible to use current_thread_info()->cpu again.
Minor nits:
* There's no need to say "commit" before each of these when the previous line
ends with "commit:" or "commits:"
* It'd be better for these to each be single lines, even if they go over the
usual line limit.
* I'd deliberately indented those commit lines with double-spaces to
distinguish them from regular text in the commit message.
.. so if you could use that text as-is (minus the "| " prefix on each line)
from:
https://lore.kernel.org/linux-arm-kernel/[email protected]/
.. that'd be preferable.
> This decreases the number of emitted instructions like in the following
> example:
>
> Dump of assembler code for function bpf_get_smp_processor_id:
> 0xffff8000802cd608 <+0>: nop
> 0xffff8000802cd60c <+4>: nop
> 0xffff8000802cd610 <+8>: adrp x0, 0xffff800082138000
> 0xffff8000802cd614 <+12>: mrs x1, tpidr_el1
> 0xffff8000802cd618 <+16>: add x0, x0, #0x8
> 0xffff8000802cd61c <+20>: ldrsw x0, [x0, x1]
> 0xffff8000802cd620 <+24>: ret
>
> After this patch:
>
> Dump of assembler code for function bpf_get_smp_processor_id:
> 0xffff8000802c9130 <+0>: nop
> 0xffff8000802c9134 <+4>: nop
> 0xffff8000802c9138 <+8>: mrs x0, sp_el0
> 0xffff8000802c913c <+12>: ldr w0, [x0, #24]
> 0xffff8000802c9140 <+16>: ret
>
> A microbenchmark[1] was built to measure the performance improvement
> provided by this change. It calls the following function given number of
> times and finds the runtime overhead:
>
> static noinline int get_cpu_id(void)
> {
> return smp_processor_id();
> }
>
> Run the benchmark like:
> modprobe smp_processor_id nr_function_calls=1000000000
>
> +--------------------------+------------------------+
> | | Number of Calls | Time taken |
> +--------+-----------------+------------------------+
> | Before | 1000000000 | 1602888401ns |
> +--------+-----------------+------------------------+
> | After | 1000000000 | 1206212658ns |
> +--------+-----------------+------------------------+
> | Difference (decrease) | 396675743ns (24.74%) |
> +---------------------------------------------------+
>
> Remove the percpu variable cpu_number as it is used only in
> set_smp_ipi_range() as a dummy variable to be passed to ipi_handler().
> Use irq_stat in place of cpu_number here.
>
> [1] https://github.com/puranjaymohan/linux/commit/77d3fdd
>
> Signed-off-by: Puranjay Mohan <[email protected]>
> ---
> Changes in v1 -> v2:
> v1: https://lore.kernel.org/all/[email protected]/
> - Remove the percpu variable cpu_number
> - Add more information to the commit message.
> ---
> arch/arm64/include/asm/smp.h | 13 +------------
> arch/arm64/kernel/smp.c | 9 ++-------
> 2 files changed, 3 insertions(+), 19 deletions(-)
>
> diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> index efb13112b408..2510eec026f7 100644
> --- a/arch/arm64/include/asm/smp.h
> +++ b/arch/arm64/include/asm/smp.h
> @@ -25,22 +25,11 @@
>
> #ifndef __ASSEMBLY__
>
> -#include <asm/percpu.h>
> -
> #include <linux/threads.h>
> #include <linux/cpumask.h>
> #include <linux/thread_info.h>
>
> -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
> -
> -/*
> - * We don't use this_cpu_read(cpu_number) as that has implicit writes to
> - * preempt_count, and associated (compiler) barriers, that we'd like to avoid
> - * the expense of. If we're preemptible, the value can be stale at use anyway.
> - * And we can't use this_cpu_ptr() either, as that winds up recursing back
> - * here under CONFIG_DEBUG_PREEMPT=y.
> - */
> -#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number))
> +#define raw_smp_processor_id() (current_thread_info()->cpu)
>
> /*
> * Logical CPU mapping.
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 4ced34f62dab..98d4e352c3d0 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -55,9 +55,6 @@
>
> #include <trace/events/ipi.h>
>
> -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
> -EXPORT_PER_CPU_SYMBOL(cpu_number);
> -
> /*
> * as from 2.5, kernels no longer have an init_tasks structure
> * so we need some other way of telling a new secondary core
> @@ -742,8 +739,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> */
> for_each_possible_cpu(cpu) {
>
> - per_cpu(cpu_number, cpu) = cpu;
> -
> if (cpu == smp_processor_id())
> continue;
>
> @@ -1021,12 +1016,12 @@ void __init set_smp_ipi_range(int ipi_base, int n)
>
> if (ipi_should_be_nmi(i)) {
> err = request_percpu_nmi(ipi_base + i, ipi_handler,
> - "IPI", &cpu_number);
> + "IPI", &irq_stat);
> WARN(err, "Could not request IPI %d as NMI, err=%d\n",
> i, err);
> } else {
> err = request_percpu_irq(ipi_base + i, ipi_handler,
> - "IPI", &cpu_number);
> + "IPI", &irq_stat);
I was going to say that it might be worth having a dummy percpu variable
specifically for these, but given this is what 32-bit arm does, it makse sense
to do the same thing.
This looks good to me, so:
Acked-by: Mark Rutland <[email protected]>
Mark.
> WARN(err, "Could not request IPI %d as IRQ, err=%d\n",
> i, err);
> }
> --
> 2.40.1
>
On 5/3/24 15:14, Puranjay Mohan wrote:
> Anshuman Khandual <[email protected]> writes:
>
>> On 5/2/24 18:04, Puranjay Mohan wrote:
>>> arch_timer.h includes linux/smp.h to use DEFINE_PER_CPU() and it works
>>> because smp.h includes percpu.h. The next commit will remove percpu.h
>>> from smp.h and it will break this usage.
>>>
>>> Explicitly include percpu.h and remove smp.h
>>
>> But this particular change does not seem to be necessary for changing
>> raw_smp_processor_id() as current_thread_info()->cpu being done in the
>> later patch ? You might still leave header <asm/percpu.h> inclusion in
>> arch/arm64/include/asm/smp.h while dropping the per cpu cpu_number ?
>
> commit 57c82954e77f ("arm64: make cpu number a percpu variable")
> created this percpu variable and included <asm/percpu.h> in <asm/smp.h>
>
> Now we are removing the percpu variable cpu_number from smp.h, so there
> is no need to keep percpu.h in smp.h
Fair enough.
>
> I feel users of DECLARE_PER_CPU_[...], etc. should include percpu.h and
> not smp.h as it makes reading the code more easier and can thwart build
> issues in the future, when headers are changed.
Right, makes sense, hope there is no more such cases using smp.h to pull
in DECLARE_PER_CPU_[...]. A quick build on defconfig is successful after
this patch.
On 5/2/24 18:04, Puranjay Mohan wrote:
> arch_timer.h includes linux/smp.h to use DEFINE_PER_CPU() and it works
> because smp.h includes percpu.h. The next commit will remove percpu.h
> from smp.h and it will break this usage.
>
> Explicitly include percpu.h and remove smp.h
>
> Signed-off-by: Puranjay Mohan <[email protected]>
Reviewed-by: Anshuman Khandual <[email protected]>
> ---
> arch/arm64/include/asm/arch_timer.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
> index 934c658ee947..f5794d50f51d 100644
> --- a/arch/arm64/include/asm/arch_timer.h
> +++ b/arch/arm64/include/asm/arch_timer.h
> @@ -15,7 +15,7 @@
> #include <linux/bug.h>
> #include <linux/init.h>
> #include <linux/jump_label.h>
> -#include <linux/smp.h>
> +#include <linux/percpu.h>
> #include <linux/types.h>
>
> #include <clocksource/arm_arch_timer.h>
On 5/2/24 18:04, Puranjay Mohan wrote:
> Historically, arm64 implemented raw_smp_processor_id() as a read of
> current_thread_info()->cpu. This changed when arm64 moved thread_info
> into task struct, as at the time CONFIG_THREAD_INFO_IN_TASK made core
> code use thread_struct::cpu for the cpu number, and due to header
> dependencies prevented using this in raw_smp_processor_id(). As a
> workaround, we moved to using a percpu variable in commit:
>
> commit 57c82954e77f ("arm64: make cpu number a percpu variable")
>
> Since then, thread_info::cpu was reintroduced, and core code was made to
> use this in commits:
>
> commit 001430c1910d ("arm64: add CPU field to struct thread_info")
> commit bcf9033e5449 ("sched: move CPU field back into thread_info if
> THREAD_INFO_IN_TASK=y")
>
> Consequently it is possible to use current_thread_info()->cpu again.
>
> This decreases the number of emitted instructions like in the following
> example:
>
> Dump of assembler code for function bpf_get_smp_processor_id:
> 0xffff8000802cd608 <+0>: nop
> 0xffff8000802cd60c <+4>: nop
> 0xffff8000802cd610 <+8>: adrp x0, 0xffff800082138000
> 0xffff8000802cd614 <+12>: mrs x1, tpidr_el1
> 0xffff8000802cd618 <+16>: add x0, x0, #0x8
> 0xffff8000802cd61c <+20>: ldrsw x0, [x0, x1]
> 0xffff8000802cd620 <+24>: ret
>
> After this patch:
>
> Dump of assembler code for function bpf_get_smp_processor_id:
> 0xffff8000802c9130 <+0>: nop
> 0xffff8000802c9134 <+4>: nop
> 0xffff8000802c9138 <+8>: mrs x0, sp_el0
> 0xffff8000802c913c <+12>: ldr w0, [x0, #24]
> 0xffff8000802c9140 <+16>: ret
>
> A microbenchmark[1] was built to measure the performance improvement
> provided by this change. It calls the following function given number of
> times and finds the runtime overhead:
>
> static noinline int get_cpu_id(void)
> {
> return smp_processor_id();
> }
>
> Run the benchmark like:
> modprobe smp_processor_id nr_function_calls=1000000000
>
> +--------------------------+------------------------+
> | | Number of Calls | Time taken |
> +--------+-----------------+------------------------+
> | Before | 1000000000 | 1602888401ns |
> +--------+-----------------+------------------------+
> | After | 1000000000 | 1206212658ns |
> +--------+-----------------+------------------------+
> | Difference (decrease) | 396675743ns (24.74%) |
> +---------------------------------------------------+
>
> Remove the percpu variable cpu_number as it is used only in
> set_smp_ipi_range() as a dummy variable to be passed to ipi_handler().
> Use irq_stat in place of cpu_number here.
>
> [1] https://github.com/puranjaymohan/linux/commit/77d3fdd
>
> Signed-off-by: Puranjay Mohan <[email protected]>
Reviewed-by: Anshuman Khandual <[email protected]>
> ---
> Changes in v1 -> v2:
> v1: https://lore.kernel.org/all/[email protected]/
> - Remove the percpu variable cpu_number
> - Add more information to the commit message.
> ---
> arch/arm64/include/asm/smp.h | 13 +------------
> arch/arm64/kernel/smp.c | 9 ++-------
> 2 files changed, 3 insertions(+), 19 deletions(-)
>
> diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> index efb13112b408..2510eec026f7 100644
> --- a/arch/arm64/include/asm/smp.h
> +++ b/arch/arm64/include/asm/smp.h
> @@ -25,22 +25,11 @@
>
> #ifndef __ASSEMBLY__
>
> -#include <asm/percpu.h>
> -
> #include <linux/threads.h>
> #include <linux/cpumask.h>
> #include <linux/thread_info.h>
>
> -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
> -
> -/*
> - * We don't use this_cpu_read(cpu_number) as that has implicit writes to
> - * preempt_count, and associated (compiler) barriers, that we'd like to avoid
> - * the expense of. If we're preemptible, the value can be stale at use anyway.
> - * And we can't use this_cpu_ptr() either, as that winds up recursing back
> - * here under CONFIG_DEBUG_PREEMPT=y.
> - */
> -#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number))
> +#define raw_smp_processor_id() (current_thread_info()->cpu)
>
> /*
> * Logical CPU mapping.
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 4ced34f62dab..98d4e352c3d0 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -55,9 +55,6 @@
>
> #include <trace/events/ipi.h>
>
> -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
> -EXPORT_PER_CPU_SYMBOL(cpu_number);
> -
> /*
> * as from 2.5, kernels no longer have an init_tasks structure
> * so we need some other way of telling a new secondary core
> @@ -742,8 +739,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> */
> for_each_possible_cpu(cpu) {
>
> - per_cpu(cpu_number, cpu) = cpu;
> -
> if (cpu == smp_processor_id())
> continue;
>
> @@ -1021,12 +1016,12 @@ void __init set_smp_ipi_range(int ipi_base, int n)
>
> if (ipi_should_be_nmi(i)) {
> err = request_percpu_nmi(ipi_base + i, ipi_handler,
> - "IPI", &cpu_number);
> + "IPI", &irq_stat);
> WARN(err, "Could not request IPI %d as NMI, err=%d\n",
> i, err);
> } else {
> err = request_percpu_irq(ipi_base + i, ipi_handler,
> - "IPI", &cpu_number);
> + "IPI", &irq_stat);
> WARN(err, "Could not request IPI %d as IRQ, err=%d\n",
> i, err);
> }