2021-10-07 15:49:37

by Eric Dumazet

Subject: [PATCH v2] x86/apic: reduce cache line misses in __x2apic_send_IPI_mask()

From: Eric Dumazet <[email protected]>

Using per-cpu storage for @x86_cpu_to_logical_apicid
is not optimal.

Broadcast IPI will need at least one cache line
per cpu to access this field.

__x2apic_send_IPI_mask() is using standard bitmask operators.

By converting x86_cpu_to_logical_apicid to an array,
we divide the number of needed cache lines by 16,
because 16 values fit in each cache line. The CPU
prefetcher can kick in nicely.
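
As an illustration of the arithmetic, here is a minimal user-space
sketch (assuming 64-byte cache lines and 4-byte entries; the defines
below are local to the example, not the kernel symbols):

#include <stdio.h>

#define L1_CACHE_BYTES	64	/* assumed cache line size */
#define NR_CPUS		256	/* matches the test host below */

int main(void)
{
	/* 16 u32 apicids fit in one 64-byte cache line */
	unsigned long per_line = L1_CACHE_BYTES / sizeof(unsigned int);
	/* per-cpu layout: every cpu's value sits in its own cache line */
	unsigned long percpu_lines = NR_CPUS;
	/* dense array layout: values are packed back to back */
	unsigned long array_lines = (NR_CPUS + per_line - 1) / per_line;

	printf("per-cpu layout: %lu cache lines\n", percpu_lines);	/* 256 */
	printf("array layout:   %lu cache lines\n", array_lines);	/* 16 */
	return 0;
}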

Also move @cluster_masks to READ_MOSTLY section to avoid false sharing.

Tested on a dual socket host with 256 cpus,
cost for a full broadcast is now 11 usec instead of 33 usec.

v2: use a dynamically allocated array, as suggested by Peter.

Signed-off-by: Eric Dumazet <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Peter Zijlstra (Intel) <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/x2apic_cluster.c | 27 +++++++++++++++++++++------
1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index f4da9bb69a8859ff10824315388aeb49c2ccfad9..e696e22d0531976f7cba72ed17443592eac72c13 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -15,9 +15,15 @@ struct cluster_mask {
struct cpumask mask;
};

-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+/*
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online cpus in a sequential way.
+ * Using per cpu variable would cost one cache line per cpu.
+ */
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
+
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
static struct cluster_mask *cluster_hotplug_mask;

static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)

static void x2apic_send_IPI(int cpu, int vector)
{
- u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+ u32 dest = x86_cpu_to_logical_apicid[cpu];

/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
@@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)

dest = 0;
for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
- dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+ dest |= x86_cpu_to_logical_apicid[clustercpu];

if (!dest)
continue;
@@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector)

static u32 x2apic_calc_apicid(unsigned int cpu)
{
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ return x86_cpu_to_logical_apicid[cpu];
}

static void init_x2apic_ldr(void)
@@ -103,7 +109,7 @@ static void init_x2apic_ldr(void)
u32 cluster, apicid = apic_read(APIC_LDR);
unsigned int cpu;

- this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+ x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;

if (cmsk)
goto update;
@@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)

static int x2apic_cluster_probe(void)
{
+ u32 slots;
+
if (!x2apic_mode)
return 0;

+ slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+ x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+ if (!x86_cpu_to_logical_apicid)
+ return 0;
+
if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
pr_err("Failed to register X2APIC_PREPARE\n");
+ kfree(x86_cpu_to_logical_apicid);
+ x86_cpu_to_logical_apicid = NULL;
return 0;
}
init_x2apic_ldr();
--
2.33.0.882.g93a45727a2-goog


2021-10-12 12:52:35

by Peter Zijlstra

Subject: Re: [PATCH v2] x86/apic: reduce cache line misses in __x2apic_send_IPI_mask()

On Thu, Oct 07, 2021 at 07:35:56AM -0700, Eric Dumazet wrote:
> From: Eric Dumazet <[email protected]>
>
> Using per-cpu storage for @x86_cpu_to_logical_apicid
> is not optimal.
>
> Broadcast IPI will need at least one cache line
> per cpu to access this field.
>
> __x2apic_send_IPI_mask() is using standard bitmask operators.
>
> By converting x86_cpu_to_logical_apicid to an array,
> we divide the number of needed cache lines by 16,
> because 16 values fit in each cache line. The CPU
> prefetcher can kick in nicely.
>
> Also move @cluster_masks to READ_MOSTLY section to avoid false sharing.
>
> Tested on a dual socket host with 256 cpus,
> cost for a full broadcast is now 11 usec instead of 33 usec.
>
> v2: use a dynamically allocated array, as suggested by Peter.
>
> Signed-off-by: Eric Dumazet <[email protected]>

Acked-by: Peter Zijlstra (Intel) <[email protected]>

2021-10-29 00:16:02

by Eric Dumazet

Subject: Re: [PATCH v2] x86/apic: reduce cache line misses in __x2apic_send_IPI_mask()

On Tue, Oct 12, 2021 at 5:47 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Oct 07, 2021 at 07:35:56AM -0700, Eric Dumazet wrote:
> > From: Eric Dumazet <[email protected]>
> >
> > Using per-cpu storage for @x86_cpu_to_logical_apicid
> > is not optimal.
> >
> > Broadcast IPI will need at least one cache line
> > per cpu to access this field.
> >
> > __x2apic_send_IPI_mask() is using standard bitmask operators.
> >
> > By converting x86_cpu_to_logical_apicid to an array,
> > we divide the number of needed cache lines by 16,
> > because 16 values fit in each cache line. The CPU
> > prefetcher can kick in nicely.
> >
> > Also move @cluster_masks to READ_MOSTLY section to avoid false sharing.
> >
> > Tested on a dual socket host with 256 cpus,
> > cost for a full broadcast is now 11 usec instead of 33 usec.
> >
> > v2: use a dynamically allocated array, as suggested by Peter.
> >
> > Signed-off-by: Eric Dumazet <[email protected]>
>
> Acked-by: Peter Zijlstra (Intel) <[email protected]>

Has this patch been merged into the x86 maintainers tree yet?

Thanks for reviewing!

2021-10-29 08:13:22

by tip-bot2 for Eric Dumazet

Subject: [tip: x86/apic] x86/apic: Reduce cache line misses in __x2apic_send_IPI_mask()

The following commit has been merged into the x86/apic branch of tip:

Commit-ID: cc95a07fef06a2c7917acd827b3a8322772969eb
Gitweb: https://git.kernel.org/tip/cc95a07fef06a2c7917acd827b3a8322772969eb
Author: Eric Dumazet <[email protected]>
AuthorDate: Thu, 07 Oct 2021 07:35:56 -07:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Fri, 29 Oct 2021 10:02:17 +02:00

x86/apic: Reduce cache line misses in __x2apic_send_IPI_mask()

Using per-cpu storage for @x86_cpu_to_logical_apicid is not optimal.

Broadcast IPI will need at least one cache line per cpu to access this
field.

__x2apic_send_IPI_mask() is using standard bitmask operators.

By converting x86_cpu_to_logical_apicid to an array, we divide the
number of needed cache lines by 16, because 16 values fit in each
cache line. The CPU prefetcher can kick in nicely.

Also move @cluster_masks to READ_MOSTLY section to avoid false sharing.

Tested on a dual socket host with 256 cpus, cost for a full broadcast
is now 11 usec instead of 33 usec.

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/kernel/apic/x2apic_cluster.c | 27 ++++++++++++++++++++------
1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index f4da9bb..e696e22 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -15,9 +15,15 @@ struct cluster_mask {
struct cpumask mask;
};

-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+/*
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online cpus in a sequential way.
+ * Using per cpu variable would cost one cache line per cpu.
+ */
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
+
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
static struct cluster_mask *cluster_hotplug_mask;

static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)

static void x2apic_send_IPI(int cpu, int vector)
{
- u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+ u32 dest = x86_cpu_to_logical_apicid[cpu];

/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
@@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)

dest = 0;
for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
- dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+ dest |= x86_cpu_to_logical_apicid[clustercpu];

if (!dest)
continue;
@@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector)

static u32 x2apic_calc_apicid(unsigned int cpu)
{
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ return x86_cpu_to_logical_apicid[cpu];
}

static void init_x2apic_ldr(void)
@@ -103,7 +109,7 @@ static void init_x2apic_ldr(void)
u32 cluster, apicid = apic_read(APIC_LDR);
unsigned int cpu;

- this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+ x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;

if (cmsk)
goto update;
@@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)

static int x2apic_cluster_probe(void)
{
+ u32 slots;
+
if (!x2apic_mode)
return 0;

+ slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+ x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+ if (!x86_cpu_to_logical_apicid)
+ return 0;
+
if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
pr_err("Failed to register X2APIC_PREPARE\n");
+ kfree(x86_cpu_to_logical_apicid);
+ x86_cpu_to_logical_apicid = NULL;
return 0;
}
init_x2apic_ldr();