2019-07-03 11:06:42

by Thomas Gleixner

Subject: [patch 16/18] x86/apic: Convert 32bit to IPI shorthand static key

Broadcast now depends on the fact that all present CPUs have been booted
once, so handling broadcast IPIs does no harm. If a CPU is offline, it does
not react to regular IPIs and the NMI handler returns early.

Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/ipi.c | 12 +++---------
1 file changed, 3 insertions(+), 9 deletions(-)

--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -8,13 +8,7 @@
DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);

#ifdef CONFIG_SMP
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI (1)
-#else
-#define DEFAULT_SEND_IPI (0)
-#endif
-
-static int apic_ipi_shorthand_off __ro_after_init = DEFAULT_SEND_IPI;
+static int apic_ipi_shorthand_off __ro_after_init;

static __init int apic_ipi_shorthand(char *str)
{
@@ -250,7 +244,7 @@ void default_send_IPI_allbutself(int vec
if (num_online_cpus() < 2)
return;

- if (apic_ipi_shorthand_off || vector == NMI_VECTOR) {
+ if (static_branch_likely(&apic_use_ipi_shorthand)) {
apic->send_IPI_mask_allbutself(cpu_online_mask, vector);
} else {
__default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
@@ -259,7 +253,7 @@ void default_send_IPI_allbutself(int vec

void default_send_IPI_all(int vector)
{
- if (apic_ipi_shorthand_off || vector == NMI_VECTOR) {
+ if (static_branch_likely(&apic_use_ipi_shorthand)) {
apic->send_IPI_mask(cpu_online_mask, vector);
} else {
__default_send_IPI_shortcut(APIC_DEST_ALLINC, vector);



2019-07-03 18:08:31

by Nadav Amit

Subject: Re: [patch 16/18] x86/apic: Convert 32bit to IPI shorthand static key

> On Jul 3, 2019, at 3:54 AM, Thomas Gleixner <[email protected]> wrote:
>
> Broadcast now depends on the fact that all present CPUs have been booted
> once, so handling broadcast IPIs does no harm. If a CPU is offline, it does
> not react to regular IPIs and the NMI handler returns early.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> arch/x86/kernel/apic/ipi.c | 12 +++---------
> 1 file changed, 3 insertions(+), 9 deletions(-)
>
> --- a/arch/x86/kernel/apic/ipi.c
> +++ b/arch/x86/kernel/apic/ipi.c
> @@ -8,13 +8,7 @@
> DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);
>
> #ifdef CONFIG_SMP
> -#ifdef CONFIG_HOTPLUG_CPU
> -#define DEFAULT_SEND_IPI (1)
> -#else
> -#define DEFAULT_SEND_IPI (0)
> -#endif
> -
> -static int apic_ipi_shorthand_off __ro_after_init = DEFAULT_SEND_IPI;
> +static int apic_ipi_shorthand_off __ro_after_init;
>
> static __init int apic_ipi_shorthand(char *str)
> {
> @@ -250,7 +244,7 @@ void default_send_IPI_allbutself(int vec
> if (num_online_cpus() < 2)
> return;
>
> - if (apic_ipi_shorthand_off || vector == NMI_VECTOR) {
> + if (static_branch_likely(&apic_use_ipi_shorthand)) {
> apic->send_IPI_mask_allbutself(cpu_online_mask, vector);
> } else {
> __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
> @@ -259,7 +253,7 @@ void default_send_IPI_allbutself(int vec
>
> void default_send_IPI_all(int vector)
> {
> - if (apic_ipi_shorthand_off || vector == NMI_VECTOR) {
> + if (static_branch_likely(&apic_use_ipi_shorthand)) {
> apic->send_IPI_mask(cpu_online_mask, vector);
> } else {
> __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector);

It may be better to check the static-key in native_send_call_func_ipi() (and
other callers if there are any), and remove all the other checks in
default_send_IPI_all(), x2apic_send_IPI_mask_allbutself(), etc.
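
A minimal sketch of what that could look like (hypothetical, not taken from
the posted patches; ipi_shorthand_usable() and ipi_mask_is_allbutself() are
made-up helpers, the first wrapping the static key check and the second the
"mask covers all online CPUs but the sender" comparison):

void native_send_call_func_ipi(const struct cpumask *mask)
{
	/*
	 * Decide about the shorthand once in the generic caller so the
	 * APIC callbacks can stay plain mask/allbutself sends.
	 */
	if (ipi_shorthand_usable() && ipi_mask_is_allbutself(mask))
		apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
	else
		apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
}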

This would allow removing potentially unnecessary checks in
native_send_call_func_ipi(). I also have this patch, which I have not sent
yet, that slightly improves the test in native_send_call_func_ipi().

-- >8 --

From: Nadav Amit <[email protected]>
Date: Fri, 7 Jun 2019 15:11:44 -0700
Subject: [PATCH] x86/smp: Better check of allbutself

Avoid the cpumask allocation, copy and comparison in
native_send_call_func_ipi() when checking whether the mask covers all online
CPUs but the sender. Introduce for_each_cpu_and_not() for this purpose.

Signed-off-by: Nadav Amit <[email protected]>
---
arch/x86/kernel/smp.c | 22 +++++++++++-----------
include/asm-generic/bitops/find.h | 17 +++++++++++++++++
include/linux/cpumask.h | 17 +++++++++++++++++
lib/cpumask.c | 20 ++++++++++++++++++++
lib/find_bit.c | 21 ++++++++++++++++++---
5 files changed, 83 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 96421f97e75c..7972ab593397 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -136,23 +136,23 @@ void native_send_call_func_single_ipi(int cpu)

void native_send_call_func_ipi(const struct cpumask *mask)
{
- cpumask_var_t allbutself;
-
- if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
- apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
- return;
+ int cpu, this_cpu = smp_processor_id();
+ bool allbutself = true;
+ bool self = false;
+
+ for_each_cpu_and_not(cpu, cpu_online_mask, mask) {
+ if (cpu != this_cpu) {
+ allbutself = false;
+ break;
+ }
+ self = true;
}

- cpumask_copy(allbutself, cpu_online_mask);
- __cpumask_clear_cpu(smp_processor_id(), allbutself);
-
- if (cpumask_equal(mask, allbutself) &&
+ if (allbutself && !self &&
cpumask_equal(cpu_online_mask, cpu_callout_mask))
apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
else
apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
- free_cpumask_var(allbutself);
}

static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h
index 8a1ee10014de..e5f19eec2737 100644
--- a/include/asm-generic/bitops/find.h
+++ b/include/asm-generic/bitops/find.h
@@ -32,6 +32,23 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1,
unsigned long offset);
#endif

+#ifndef find_next_and_not_bit
+/**
+ * find_next_and_not_bit - find the next set bit in the first region
+ * which is clear in the second region
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+extern unsigned long find_next_and_not_bit(const unsigned long *addr1,
+ const unsigned long *addr2, unsigned long size,
+ unsigned long offset);
+#endif
+
#ifndef find_next_zero_bit
/**
* find_next_zero_bit - find the next cleared bit in a memory region
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 693124900f0a..4648add54fad 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -229,6 +229,7 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
}

int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
+int __pure cpumask_next_and_not(int n, const struct cpumask *, const struct cpumask *);
int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
unsigned int cpumask_local_spread(unsigned int i, int node);

@@ -291,6 +292,22 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool
for ((cpu) = -1; \
(cpu) = cpumask_next_and((cpu), (mask), (and)), \
(cpu) < nr_cpu_ids;)
+
+
+/**
+ * for_each_cpu_and_not - iterate over every cpu in @mask & ~@and_not
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the first cpumask pointer
+ * @and_not: the second cpumask pointer
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_and_not(cpu, mask, and_not) \
+ for ((cpu) = -1; \
+ (cpu) = cpumask_next_and_not((cpu), (mask), (and_not)), \
+ (cpu) < nr_cpu_ids;)
+
+
#endif /* SMP */

#define CPU_BITS_NONE \
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 0cb672eb107c..59c98f55c308 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -42,6 +42,26 @@ int cpumask_next_and(int n, const struct cpumask *src1p,
}
EXPORT_SYMBOL(cpumask_next_and);

+/**
+ * cpumask_next_and_not - get the next cpu in *src1p & ~*src2p
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @src1p: the first cpumask pointer
+ * @src2p: the second cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus are set in *src1p & ~*src2p.
+ */
+int cpumask_next_and_not(int n, const struct cpumask *src1p,
+ const struct cpumask *src2p)
+{
+ /* -1 is a legal arg here. */
+ if (n != -1)
+ cpumask_check(n);
+ return find_next_and_not_bit(cpumask_bits(src1p), cpumask_bits(src2p),
+ nr_cpumask_bits, n + 1);
+}
+EXPORT_SYMBOL(cpumask_next_and_not);
+
+
/**
* cpumask_any_but - return a "random" in a cpumask, but not this one.
* @mask: the cpumask to search
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 5c51eb45178a..7bd2c567287e 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -23,7 +23,7 @@
/*
* This is a common helper function for find_next_bit, find_next_zero_bit, and
* find_next_and_bit. The differences are:
- * - The "invert" argument, which is XORed with each fetched word before
+ * - The "invert" argument, which is XORed with each fetched first word before
* searching it for one bits.
* - The optional "addr2", which is anded with "addr1" if present.
*/
@@ -37,9 +37,9 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1,
return nbits;

tmp = addr1[start / BITS_PER_LONG];
+ tmp ^= invert;
if (addr2)
tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;

/* Handle 1st word. */
tmp &= BITMAP_FIRST_WORD_MASK(start);
@@ -51,9 +51,9 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1,
return nbits;

tmp = addr1[start / BITS_PER_LONG];
+ tmp ^= invert;
if (addr2)
tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;
}

return min(start + __ffs(tmp), nbits);
@@ -91,6 +91,21 @@ unsigned long find_next_and_bit(const unsigned long *addr1,
EXPORT_SYMBOL(find_next_and_bit);
#endif

+#if !defined(find_next_and_not_bit)
+unsigned long find_next_and_not_bit(const unsigned long *addr1,
+ const unsigned long *addr2, unsigned long size,
+ unsigned long offset)
+{
+ /*
+ * addr1 and addr2 are swapped, since the first argument is the one that
+ * gets inverted: the result is addr1 & ~addr2.
+ */
+ return _find_next_bit(addr2, addr1, size, offset, ~0UL);
+}
+EXPORT_SYMBOL(find_next_and_not_bit);
+#endif
+
+
#ifndef find_first_bit
/*
* Find the first set bit in a memory region.
--
2.20.1
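
For illustration only, not part of the patch above: with invert == ~0UL the
helper computes per word (addr2[i] ^ ~0UL) & addr1[i], i.e. addr1[i] & ~addr2[i],
so the iterator visits the CPUs which are set in the first mask and clear in
the second one. A hypothetical usage sketch:

	int cpu;

	/* Visit every CPU which is online but not a target in @mask. */
	for_each_cpu_and_not(cpu, cpu_online_mask, mask)
		pr_info("CPU %d is online but not in the IPI mask\n", cpu);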

2019-07-03 20:35:51

by Thomas Gleixner

Subject: Re: [patch 16/18] x86/apic: Convert 32bit to IPI shorthand static key

Nadav,

On Wed, 3 Jul 2019, Nadav Amit wrote:
> > On Jul 3, 2019, at 3:54 AM, Thomas Gleixner <[email protected]> wrote:
> > void default_send_IPI_all(int vector)
> > {
> > - if (apic_ipi_shorthand_off || vector == NMI_VECTOR) {
> > + if (static_branch_likely(&apic_use_ipi_shorthand)) {
> > apic->send_IPI_mask(cpu_online_mask, vector);
> > } else {
> > __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector);
>
> It may be better to check the static-key in native_send_call_func_ipi() (and
> other callers if there are any), and remove all the other checks in
> default_send_IPI_all(), x2apic_send_IPI_mask_allbutself(), etc.

That makes sense. Should have thought about that myself, but hunting that
APIC emulation issue was affecting my brain obviously :)

> void native_send_call_func_ipi(const struct cpumask *mask)
> {
> - cpumask_var_t allbutself;
> -
> - if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
> - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
> - return;
> + int cpu, this_cpu = smp_processor_id();
> + bool allbutself = true;
> + bool self = false;
> +
> + for_each_cpu_and_not(cpu, cpu_online_mask, mask) {
> +
> + if (cpu != this_cpu) {
> + allbutself = false;
> + break;
> + }
> + self = true;

That accumulates to a large iteration in the worst case.

> }
>
> - cpumask_copy(allbutself, cpu_online_mask);
> - __cpumask_clear_cpu(smp_processor_id(), allbutself);
> -
> - if (cpumask_equal(mask, allbutself) &&
> + if (allbutself && !self &&
> cpumask_equal(cpu_online_mask, cpu_callout_mask))

Hmm. I overlooked that one. Need to take a deeper look.

> apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
> else
> apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);

Let me think about it for a while.

Thanks,

tglx

2019-07-03 21:16:59

by Nadav Amit

Subject: Re: [patch 16/18] x86/apic: Convert 32bit to IPI shorthand static key

> On Jul 3, 2019, at 1:34 PM, Thomas Gleixner <[email protected]> wrote:
>
> Nadav,
>
> On Wed, 3 Jul 2019, Nadav Amit wrote:
>>> On Jul 3, 2019, at 3:54 AM, Thomas Gleixner <[email protected]> wrote:
>>> void default_send_IPI_all(int vector)
>>> {
>>> - if (apic_ipi_shorthand_off || vector == NMI_VECTOR) {
>>> + if (static_branch_likely(&apic_use_ipi_shorthand)) {
>>> apic->send_IPI_mask(cpu_online_mask, vector);
>>> } else {
>>> __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector);
>>
>> It may be better to check the static-key in native_send_call_func_ipi() (and
>> other callers if there are any), and remove all the other checks in
>> default_send_IPI_all(), x2apic_send_IPI_mask_allbutself(), etc.
>
> That makes sense. Should have thought about that myself, but hunting that
> APIC emulation issue was affecting my brain obviously :)

Well, if you used VMware and not KVM... ;-)

(allow me to reemphasize that I am joking and save myself from spam)

>> void native_send_call_func_ipi(const struct cpumask *mask)
>> {
>> - cpumask_var_t allbutself;
>> -
>> - if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
>> - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
>> - return;
>> + int cpu, this_cpu = smp_processor_id();
>> + bool allbutself = true;
>> + bool self = false;
>> +
>> + for_each_cpu_and_not(cpu, cpu_online_mask, mask) {
>> +
>> + if (cpu != this_cpu) {
>> + allbutself = false;
>> + break;
>> + }
>> + self = true;
>
> That accumulates to a large iteration in the worst case.

I don’t understand why. There should be at most two iterations - one for
self and one for another core. So _find_next_bit() will be called at most
twice. _find_next_bit() has its own loop, but I don’t think overall it is as
bad as calling alloc_cpumask_var(), cpumask_copy() and cpumask_equal(),
which also have loops.

I don’t have numbers (and I doubt they are very significant), but the cpumask
allocation showed up when I was profiling my microbenchmark.

2019-07-03 21:32:41

by Thomas Gleixner

Subject: Re: [patch 16/18] x86/apic: Convert 32bit to IPI shorthand static key

On Wed, 3 Jul 2019, Nadav Amit wrote:
> > On Jul 3, 2019, at 1:34 PM, Thomas Gleixner <[email protected]> wrote:
> > On Wed, 3 Jul 2019, Nadav Amit wrote:
> >>> On Jul 3, 2019, at 3:54 AM, Thomas Gleixner <[email protected]> wrote:
> >>> void default_send_IPI_all(int vector)
> >>> {
> >>> - if (apic_ipi_shorthand_off || vector == NMI_VECTOR) {
> >>> + if (static_branch_likely(&apic_use_ipi_shorthand)) {
> >>> apic->send_IPI_mask(cpu_online_mask, vector);
> >>> } else {
> >>> __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector);
> >>
> >> It may be better to check the static-key in native_send_call_func_ipi() (and
> >> other callers if there are any), and remove all the other checks in
> >> default_send_IPI_all(), x2apic_send_IPI_mask_allbutself(), etc.
> >
> > That makes sense. Should have thought about that myself, but hunting that
> > APIC emulation issue was affecting my brain obviously :)
>
> Well, if you used VMware and not KVM... ;-)

Then I would have hunted some other bug probably :)

> >> void native_send_call_func_ipi(const struct cpumask *mask)
> >> {
> >> - cpumask_var_t allbutself;
> >> -
> >> - if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
> >> - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
> >> - return;
> >> + int cpu, this_cpu = smp_processor_id();
> >> + bool allbutself = true;
> >> + bool self = false;
> >> +
> >> + for_each_cpu_and_not(cpu, cpu_online_mask, mask) {
> >> +
> >> + if (cpu != this_cpu) {
> >> + allbutself = false;
> >> + break;
> >> + }
> >> + self = true;
> >
> > That accumulates to a large iteration in the worst case.
>
> I don’t understand why. There should be at most two iterations - one for
> self and one for another core. So _find_next_bit() will be called at most
> twice. _find_next_bit() has its own loop, but I don’t think overall it is as

Indeed, I misread the code and you are right, the bit search should be fast.

> bad as calling alloc_cpumask_var(), cpumask_copy() and cpumask_equal(),
> which also have loops.
>
> I don’t have numbers (and I doubt they are very significant), but the cpumask
> allocation showed when I was profiling my microbenchmark.

Yes, that alloc/free part is completely bogus.

Thanks,

tglx