2019-06-18 08:04:11

by Jan Kiszka

Subject: [PATCH] x86: Optimize load_mm_cr4 to load_mm_cr4_irqsoff

From: Jan Kiszka <[email protected]>

We only call load_mm_cr4 with interrupts disabled:
- switch_mm_irqs_off
- refresh_pce (on_each_cpu callback)

Thus, we can avoid disabling interrupts again in cr4_set/clear_bits.

Instead, provide cr4_set/clear_bits_irqsoff and call those helpers from
load_mm_cr4_irqsoff. The renaming, in combination with the checks
in __cr4_set, ensures that any changes in the boundary conditions at
the call sites will be detected.

Signed-off-by: Jan Kiszka <[email protected]>
---

Found while porting Xenomai with its virtualized interrupt
infrastructure to a newer kernel.
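
For reference, here is why refresh_pce runs with interrupts disabled: a rough
sketch of its caller, paraphrased from arch/x86/events/core.c of this era (the
surrounding guard details are abridged and may differ between kernel versions):

static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	/*
	 * on_each_cpu() invokes the callback via IPI on remote CPUs and
	 * inside a local_irq_save() section on the local CPU, so
	 * refresh_pce() always runs with interrupts disabled.
	 */
	if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
		on_each_cpu(refresh_pce, NULL, 1);
}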

arch/x86/events/core.c | 2 +-
arch/x86/include/asm/mmu_context.h | 8 ++++----
arch/x86/include/asm/tlbflush.h | 30 +++++++++++++++++++++++-------
arch/x86/mm/tlb.c | 2 +-
4 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f315425d8468..78a3fba28c62 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2161,7 +2161,7 @@ static int x86_pmu_event_init(struct perf_event *event)

static void refresh_pce(void *ignored)
{
- load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
+ load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm));
}

static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 9024236693d2..16ae821483c8 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -28,16 +28,16 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,

DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);

-static inline void load_mm_cr4(struct mm_struct *mm)
+static inline void load_mm_cr4_irqsoff(struct mm_struct *mm)
{
if (static_branch_unlikely(&rdpmc_always_available_key) ||
atomic_read(&mm->context.perf_rdpmc_allowed))
- cr4_set_bits(X86_CR4_PCE);
+ cr4_set_bits_irqsoff(X86_CR4_PCE);
else
- cr4_clear_bits(X86_CR4_PCE);
+ cr4_clear_bits_irqsoff(X86_CR4_PCE);
}
#else
-static inline void load_mm_cr4(struct mm_struct *mm) {}
+static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) {}
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index dee375831962..6f66d841262d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -290,26 +290,42 @@ static inline void __cr4_set(unsigned long cr4)
}

/* Set in this cpu's CR4. */
-static inline void cr4_set_bits(unsigned long mask)
+static inline void cr4_set_bits_irqsoff(unsigned long mask)
{
- unsigned long cr4, flags;
+ unsigned long cr4;

- local_irq_save(flags);
cr4 = this_cpu_read(cpu_tlbstate.cr4);
if ((cr4 | mask) != cr4)
__cr4_set(cr4 | mask);
- local_irq_restore(flags);
}

/* Clear in this cpu's CR4. */
-static inline void cr4_clear_bits(unsigned long mask)
+static inline void cr4_clear_bits_irqsoff(unsigned long mask)
{
- unsigned long cr4, flags;
+ unsigned long cr4;

- local_irq_save(flags);
cr4 = this_cpu_read(cpu_tlbstate.cr4);
if ((cr4 & ~mask) != cr4)
__cr4_set(cr4 & ~mask);
+}
+
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ cr4_set_bits_irqsoff(mask);
+ local_irq_restore(flags);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ cr4_clear_bits_irqsoff(mask);
local_irq_restore(flags);
}

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 91f6db92554c..8fc1eaa55b6e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -440,7 +440,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);

if (next != real_prev) {
- load_mm_cr4(next);
+ load_mm_cr4_irqsoff(next);
switch_ldt(real_prev, next);
}
}
--
2.16.4


2019-06-18 14:41:56

by Dave Hansen

Subject: Re: [PATCH] x86: Optimize load_mm_cr4 to load_mm_cr4_irqsoff

On 6/18/19 12:32 AM, Jan Kiszka wrote:
> Thus, we can avoid disabling interrupts again in cr4_set/clear_bits.

Seems reasonable.

Your *_irqsoff() variants need lockdep_assert_irqs_disabled(), at least,
though.

Can you talk a bit about the motivation here, though? Did you encounter
some performance issue that led you to make this patch, or was it simply
an improvement you realized you could make from code inspection?

2019-06-18 14:59:19

by Jan Kiszka

Subject: Re: [PATCH] x86: Optimize load_mm_cr4 to load_mm_cr4_irqsoff

On 18.06.19 16:41, Dave Hansen wrote:
> On 6/18/19 12:32 AM, Jan Kiszka wrote:
>> Thus, we can avoid disabling interrupts again in cr4_set/clear_bits.
>
> Seems reasonable.
>
> Your *_irqsoff() variants need lockdep_assert_irqs_disabled(), at least,
> though.

It inherits this check via __cr4_set, so I didn't add that to the outer path.
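
For reference, __cr4_set already looks roughly like this (a sketch paraphrased
from arch/x86/include/asm/tlbflush.h; exact wording may differ between kernel
versions):

static inline void __cr4_set(unsigned long cr4)
{
	/* Both the _irqsoff helpers and the irq-disabling wrappers end up here. */
	lockdep_assert_irqs_disabled();
	this_cpu_write(cpu_tlbstate.cr4, cr4);
	__write_cr4(cr4);
}

So cr4_set/clear_bits_irqsoff still trip lockdep if someone calls them with
interrupts enabled.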

>
> Can you talk a bit about the motivation here, though? Did you encounter
> some performance issue that led you to make this patch, or was it simply
> an improvement you realized you could make from code inspection?
>

I ran into a downstream issue while adjusting patches to this code. In a
nutshell, Xenomai runs something like an NMI context over most of Linux, but
that context still shares some code paths with the kernel. One of them is
switch_mm_irqs_off, and there our checking logic warned. The rest was code
inspection.

For upstream, this is a micro-optimization. But given that something like
switch_mm is in the path, I thought it worth proposing.

Jan

--
Siemens AG, Corporate Technology, CT RDA IOT SES-DE
Corporate Competence Center Embedded Linux

2019-07-24 02:27:01

by Jan Kiszka

Subject: Re: [PATCH] x86: Optimize load_mm_cr4 to load_mm_cr4_irqsoff

On 18.06.19 09:32, Jan Kiszka wrote:
> From: Jan Kiszka <[email protected]>
>
> We only call load_mm_cr4 with interrupts disabled:
> - switch_mm_irqs_off
> - refresh_pce (on_each_cpu callback)
>
> Thus, we can avoid disabling interrupts again in cr4_set/clear_bits.
>
> Instead, provide cr4_set/clear_bits_irqsoff and call those helpers from
> load_mm_cr4_irqsoff. The renaming, in combination with the checks
> in __cr4_set, ensures that any changes in the boundary conditions at
> the call sites will be detected.
>
> Signed-off-by: Jan Kiszka <[email protected]>
> ---
>
> Found while porting Xenomai with its virtualized interrupt
> infrastructure to a newer kernel.
>
> arch/x86/events/core.c | 2 +-
> arch/x86/include/asm/mmu_context.h | 8 ++++----
> arch/x86/include/asm/tlbflush.h | 30 +++++++++++++++++++++++-------
> arch/x86/mm/tlb.c | 2 +-
> 4 files changed, 29 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index f315425d8468..78a3fba28c62 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2161,7 +2161,7 @@ static int x86_pmu_event_init(struct perf_event *event)
>
> static void refresh_pce(void *ignored)
> {
> - load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
> + load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm));
> }
>
> static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index 9024236693d2..16ae821483c8 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -28,16 +28,16 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
>
> DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
>
> -static inline void load_mm_cr4(struct mm_struct *mm)
> +static inline void load_mm_cr4_irqsoff(struct mm_struct *mm)
> {
> if (static_branch_unlikely(&rdpmc_always_available_key) ||
> atomic_read(&mm->context.perf_rdpmc_allowed))
> - cr4_set_bits(X86_CR4_PCE);
> + cr4_set_bits_irqsoff(X86_CR4_PCE);
> else
> - cr4_clear_bits(X86_CR4_PCE);
> + cr4_clear_bits_irqsoff(X86_CR4_PCE);
> }
> #else
> -static inline void load_mm_cr4(struct mm_struct *mm) {}
> +static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) {}
> #endif
>
> #ifdef CONFIG_MODIFY_LDT_SYSCALL
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index dee375831962..6f66d841262d 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -290,26 +290,42 @@ static inline void __cr4_set(unsigned long cr4)
> }
>
> /* Set in this cpu's CR4. */
> -static inline void cr4_set_bits(unsigned long mask)
> +static inline void cr4_set_bits_irqsoff(unsigned long mask)
> {
> - unsigned long cr4, flags;
> + unsigned long cr4;
>
> - local_irq_save(flags);
> cr4 = this_cpu_read(cpu_tlbstate.cr4);
> if ((cr4 | mask) != cr4)
> __cr4_set(cr4 | mask);
> - local_irq_restore(flags);
> }
>
> /* Clear in this cpu's CR4. */
> -static inline void cr4_clear_bits(unsigned long mask)
> +static inline void cr4_clear_bits_irqsoff(unsigned long mask)
> {
> - unsigned long cr4, flags;
> + unsigned long cr4;
>
> - local_irq_save(flags);
> cr4 = this_cpu_read(cpu_tlbstate.cr4);
> if ((cr4 & ~mask) != cr4)
> __cr4_set(cr4 & ~mask);
> +}
> +
> +/* Set in this cpu's CR4. */
> +static inline void cr4_set_bits(unsigned long mask)
> +{
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + cr4_set_bits_irqsoff(mask);
> + local_irq_restore(flags);
> +}
> +
> +/* Clear in this cpu's CR4. */
> +static inline void cr4_clear_bits(unsigned long mask)
> +{
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + cr4_clear_bits_irqsoff(mask);
> local_irq_restore(flags);
> }
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 91f6db92554c..8fc1eaa55b6e 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -440,7 +440,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
>
> if (next != real_prev) {
> - load_mm_cr4(next);
> + load_mm_cr4_irqsoff(next);
> switch_ldt(real_prev, next);
> }
> }
>

Ping. I think Dave's only remark has been answered.

Jan

--
Siemens AG, Corporate Technology, CT RDA IOT SES-DE
Corporate Competence Center Embedded Linux

2019-07-24 12:39:34

by Thomas Gleixner

Subject: Re: [PATCH] x86: Optimize load_mm_cr4 to load_mm_cr4_irqsoff

On Tue, 23 Jul 2019, Jan Kiszka wrote:
>
> Ping. I think the only remark of Dave was answered.

It's in that large append-only file (TODO) already.

Subject: [tip:x86/mm] x86/mm: Avoid redundant interrupt disable in load_mm_cr4()

Commit-ID: 21e450d21ccad4cb7c7984c29ff145012b47736d
Gitweb: https://git.kernel.org/tip/21e450d21ccad4cb7c7984c29ff145012b47736d
Author: Jan Kiszka <[email protected]>
AuthorDate: Tue, 18 Jun 2019 09:32:11 +0200
Committer: Thomas Gleixner <[email protected]>
CommitDate: Wed, 24 Jul 2019 14:43:37 +0200

x86/mm: Avoid redundant interrupt disable in load_mm_cr4()

load_mm_cr4() is always called with interrupts disabled from:

- switch_mm_irqs_off()
- refresh_pce(), which is an on_each_cpu() callback

Thus, disabling interrupts in cr4_set/clear_bits() is redundant.

Implement cr4_set/clear_bits_irqsoff() helpers, rename load_mm_cr4() to
load_mm_cr4_irqsoff() and use the new helpers. The new helpers do not need
a lockdep assert as __cr4_set() has one already.

The renaming in combination with the checks in __cr4_set() ensures that any
changes in the boundary conditions at the call sites will be detected.

[ tglx: Massaged change log ]

Signed-off-by: Jan Kiszka <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/events/core.c | 2 +-
arch/x86/include/asm/mmu_context.h | 8 ++++----
arch/x86/include/asm/tlbflush.h | 30 +++++++++++++++++++++++-------
arch/x86/mm/tlb.c | 2 +-
4 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 81b005e4c7d9..cfe256ca76df 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2087,7 +2087,7 @@ static int x86_pmu_event_init(struct perf_event *event)

static void refresh_pce(void *ignored)
{
- load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
+ load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm));
}

static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 9024236693d2..16ae821483c8 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -28,16 +28,16 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,

DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);

-static inline void load_mm_cr4(struct mm_struct *mm)
+static inline void load_mm_cr4_irqsoff(struct mm_struct *mm)
{
if (static_branch_unlikely(&rdpmc_always_available_key) ||
atomic_read(&mm->context.perf_rdpmc_allowed))
- cr4_set_bits(X86_CR4_PCE);
+ cr4_set_bits_irqsoff(X86_CR4_PCE);
else
- cr4_clear_bits(X86_CR4_PCE);
+ cr4_clear_bits_irqsoff(X86_CR4_PCE);
}
#else
-static inline void load_mm_cr4(struct mm_struct *mm) {}
+static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) {}
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index dee375831962..6f66d841262d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -290,26 +290,42 @@ static inline void __cr4_set(unsigned long cr4)
}

/* Set in this cpu's CR4. */
-static inline void cr4_set_bits(unsigned long mask)
+static inline void cr4_set_bits_irqsoff(unsigned long mask)
{
- unsigned long cr4, flags;
+ unsigned long cr4;

- local_irq_save(flags);
cr4 = this_cpu_read(cpu_tlbstate.cr4);
if ((cr4 | mask) != cr4)
__cr4_set(cr4 | mask);
- local_irq_restore(flags);
}

/* Clear in this cpu's CR4. */
-static inline void cr4_clear_bits(unsigned long mask)
+static inline void cr4_clear_bits_irqsoff(unsigned long mask)
{
- unsigned long cr4, flags;
+ unsigned long cr4;

- local_irq_save(flags);
cr4 = this_cpu_read(cpu_tlbstate.cr4);
if ((cr4 & ~mask) != cr4)
__cr4_set(cr4 & ~mask);
+}
+
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ cr4_set_bits_irqsoff(mask);
+ local_irq_restore(flags);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ cr4_clear_bits_irqsoff(mask);
local_irq_restore(flags);
}

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4de9704c4aaf..e6a9edc5baaf 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -440,7 +440,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);

if (next != real_prev) {
- load_mm_cr4(next);
+ load_mm_cr4_irqsoff(next);
switch_ldt(real_prev, next);
}
}