Move flush_tlb_info variables off the stack. This allows flush_tlb_info
to be cache-line aligned, avoiding potentially unnecessary cache-line
movements. It also gives the variables a fixed virtual-to-physical
translation, which reduces TLB misses.

Use a per-CPU struct for flush_tlb_mm_range() and
flush_tlb_kernel_range(). Add debug assertions to ensure there are
no nested TLB flushes that might overwrite the per-CPU data. For
arch_tlbbatch_flush(), use a const struct.

Results when running a microbenchmark that performs 10^6 MADV_DONTNEED
operations and touches a page, while 3 additional threads run a
busy-wait loop (5 runs, PTI and retpolines are turned off):

              base        off-stack
              ----        ---------
avg (usec/op) 1.629       1.570 (-3%)
stddev        0.014       0.009

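For illustration only (this is not part of the patch and not the actual
benchmark harness), each measured operation is roughly of this shape:

	#include <sys/mman.h>

	/*
	 * One iteration: MADV_DONTNEED drops the page (which ends up
	 * flushing the TLB), and the subsequent store faults it back in.
	 */
	static void one_op(char *page, size_t page_size)
	{
		madvise(page, page_size, MADV_DONTNEED);
		page[0] = 1;
	}
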
Cc: Peter Zijlstra <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Signed-off-by: Nadav Amit <[email protected]>
---
v1->v2:
- Initialize all flush_tlb_info fields [Andy]
---
arch/x86/mm/tlb.c | 100 ++++++++++++++++++++++++++++++++++------------
1 file changed, 74 insertions(+), 26 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 487b8474c01c..aac191eb2b90 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
-static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
+static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
{
const struct flush_tlb_info *f = info;
@@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
*/
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
+
+#ifdef CONFIG_DEBUG_VM
+static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
+#endif
+
+static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ unsigned int stride_shift, bool freed_tables,
+ u64 new_tlb_gen)
+{
+ struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
+
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Ensure that the following code is non-reentrant and flush_tlb_info
+ * is not overwritten. This means no TLB flushing is initiated by
+ * interrupt handlers and machine-check exception handlers.
+ */
+ BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
+#endif
+
+ info->start = start;
+ info->end = end;
+ info->mm = mm;
+ info->stride_shift = stride_shift;
+ info->freed_tables = freed_tables;
+ info->new_tlb_gen = new_tlb_gen;
+
+ return info;
+}
+
+static inline void put_flush_tlb_info(void)
+{
+#ifdef CONFIG_DEBUG_VM
+ /* Complete reentrancy prevention checks */
+ barrier();
+ this_cpu_dec(flush_tlb_info_idx);
+#endif
+}
+
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
+ struct flush_tlb_info *info;
+ u64 new_tlb_gen;
int cpu;
- struct flush_tlb_info info = {
- .mm = mm,
- .stride_shift = stride_shift,
- .freed_tables = freed_tables,
- };
-
cpu = get_cpu();
- /* This is also a barrier that synchronizes with switch_mm(). */
- info.new_tlb_gen = inc_mm_tlb_gen(mm);
-
/* Should we flush just the requested range? */
- if ((end != TLB_FLUSH_ALL) &&
- ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
- info.start = start;
- info.end = end;
- } else {
- info.start = 0UL;
- info.end = TLB_FLUSH_ALL;
+ if ((end == TLB_FLUSH_ALL) ||
+ ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
+ start = 0UL;
+ end = TLB_FLUSH_ALL;
}
+ /* This is also a barrier that synchronizes with switch_mm(). */
+ new_tlb_gen = inc_mm_tlb_gen(mm);
+
+ info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
+ new_tlb_gen);
+
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
- VM_WARN_ON(irqs_disabled());
+ lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
+ flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), &info);
+ flush_tlb_others(mm_cpumask(mm), info);
+ put_flush_tlb_info();
put_cpu();
}
@@ -787,22 +825,32 @@ static void do_kernel_range_flush(void *info)
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
-
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
} else {
- struct flush_tlb_info info;
- info.start = start;
- info.end = end;
- on_each_cpu(do_kernel_range_flush, &info, 1);
+ struct flush_tlb_info *info;
+
+ preempt_disable();
+
+ info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
+
+ info = this_cpu_ptr(&flush_tlb_info);
+ info->start = start;
+ info->end = end;
+
+ on_each_cpu(do_kernel_range_flush, info, 1);
+
+ put_flush_tlb_info();
+
+ preempt_enable();
}
}
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
- struct flush_tlb_info info = {
+ static const struct flush_tlb_info info = {
.mm = NULL,
.start = 0UL,
.end = TLB_FLUSH_ALL,
--
2.19.1
On Thu, Apr 25, 2019 at 11:08 AM Nadav Amit <[email protected]> wrote:
>
> Move flush_tlb_info variables off the stack. This allows flush_tlb_info
> to be cache-line aligned, avoiding potentially unnecessary cache-line
> movements. It also gives the variables a fixed virtual-to-physical
> translation, which reduces TLB misses.
>
> Use a per-CPU struct for flush_tlb_mm_range() and
> flush_tlb_kernel_range(). Add debug assertions to ensure there are
> no nested TLB flushes that might overwrite the per-CPU data. For
> arch_tlbbatch_flush(), use a const struct.
>
I like this version of the patch.
> On Apr 25, 2019, at 11:08 AM, Nadav Amit <[email protected]> wrote:
>
> Move flush_tlb_info variables off the stack. This allows flush_tlb_info
> to be cache-line aligned, avoiding potentially unnecessary cache-line
> movements. It also gives the variables a fixed virtual-to-physical
> translation, which reduces TLB misses.
>
> Use a per-CPU struct for flush_tlb_mm_range() and
> flush_tlb_kernel_range(). Add debug assertions to ensure there are
> no nested TLB flushes that might overwrite the per-CPU data. For
> arch_tlbbatch_flush(), use a const struct.
>
> Results when running a microbenchmark that performs 10^6 MADV_DONTNEED
> operations and touches a page, while 3 additional threads run a
> busy-wait loop (5 runs, PTI and retpolines are turned off):
>
> base off-stack
> ---- ---------
> avg (usec/op) 1.629 1.570 (-3%)
> stddev 0.014 0.009
>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Signed-off-by: Nadav Amit <[email protected]>
>
> ---
>
> v1->v2:
> - Initialize all flush_tlb_info fields [Andy]
> ---
> arch/x86/mm/tlb.c | 100 ++++++++++++++++++++++++++++++++++------------
> 1 file changed, 74 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 487b8474c01c..aac191eb2b90 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
> this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
> }
>
> -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
> +static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
> {
> const struct flush_tlb_info *f = info;
>
> @@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
> */
> unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
>
> +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
> +
> +#ifdef CONFIG_DEBUG_VM
> +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
> +#endif
> +
> +static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
> + unsigned long start, unsigned long end,
> + unsigned int stride_shift, bool freed_tables,
> + u64 new_tlb_gen)
> +{
> + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
> +
> +#ifdef CONFIG_DEBUG_VM
> + /*
> + * Ensure that the following code is non-reentrant and flush_tlb_info
> + * is not overwritten. This means no TLB flushing is initiated by
> + * interrupt handlers and machine-check exception handlers.
> + */
> + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
> +#endif
> +
> + info->start = start;
> + info->end = end;
> + info->mm = mm;
> + info->stride_shift = stride_shift;
> + info->freed_tables = freed_tables;
> + info->new_tlb_gen = new_tlb_gen;
> +
> + return info;
> +}
> +
> +static inline void put_flush_tlb_info(void)
> +{
> +#ifdef CONFIG_DEBUG_VM
> + /* Complete reentrancy prevention checks */
> + barrier();
> + this_cpu_dec(flush_tlb_info_idx);
> +#endif
> +}
> +
> void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
> unsigned long end, unsigned int stride_shift,
> bool freed_tables)
> {
> + struct flush_tlb_info *info;
> + u64 new_tlb_gen;
> int cpu;
>
> - struct flush_tlb_info info = {
> - .mm = mm,
> - .stride_shift = stride_shift,
> - .freed_tables = freed_tables,
> - };
> -
> cpu = get_cpu();
>
> - /* This is also a barrier that synchronizes with switch_mm(). */
> - info.new_tlb_gen = inc_mm_tlb_gen(mm);
> -
> /* Should we flush just the requested range? */
> - if ((end != TLB_FLUSH_ALL) &&
> - ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
> - info.start = start;
> - info.end = end;
> - } else {
> - info.start = 0UL;
> - info.end = TLB_FLUSH_ALL;
> + if ((end == TLB_FLUSH_ALL) ||
> + ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
> + start = 0UL;
> + end = TLB_FLUSH_ALL;
> }
>
> + /* This is also a barrier that synchronizes with switch_mm(). */
> + new_tlb_gen = inc_mm_tlb_gen(mm);
> +
> + info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
> + new_tlb_gen);
> +
> if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
> - VM_WARN_ON(irqs_disabled());
> + lockdep_assert_irqs_enabled();
> local_irq_disable();
> - flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
> + flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
> local_irq_enable();
> }
>
> if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
> - flush_tlb_others(mm_cpumask(mm), &info);
> + flush_tlb_others(mm_cpumask(mm), info);
>
> + put_flush_tlb_info();
> put_cpu();
> }
>
> @@ -787,22 +825,32 @@ static void do_kernel_range_flush(void *info)
>
> void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> {
> -
> /* Balance as user space task's flush, a bit conservative */
> if (end == TLB_FLUSH_ALL ||
> (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
> on_each_cpu(do_flush_tlb_all, NULL, 1);
> } else {
> - struct flush_tlb_info info;
> - info.start = start;
> - info.end = end;
> - on_each_cpu(do_kernel_range_flush, &info, 1);
> + struct flush_tlb_info *info;
> +
> + preempt_disable();
> +
> + info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
> +
> + info = this_cpu_ptr(&flush_tlb_info);
> + info->start = start;
> + info->end = end;
Err.. This is wrong. I will send v3 shortly.
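The fix is presumably just to drop the leftover reassignment of info,
so the else-branch would read roughly:

	struct flush_tlb_info *info;

	preempt_disable();

	info = get_flush_tlb_info(NULL, start, end, 0, false, 0);

	on_each_cpu(do_kernel_range_flush, info, 1);

	put_flush_tlb_info();

	preempt_enable();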
* Nadav Amit <[email protected]> wrote:
> Move flush_tlb_info variables off the stack. This allows flush_tlb_info
> to be cache-line aligned, avoiding potentially unnecessary cache-line
> movements. It also gives the variables a fixed virtual-to-physical
> translation, which reduces TLB misses.
>
> Use a per-CPU struct for flush_tlb_mm_range() and
> flush_tlb_kernel_range(). Add debug assertions to ensure there are
> no nested TLB flushes that might overwrite the per-CPU data. For
> arch_tlbbatch_flush(), use a const struct.
>
> Results when running a microbenchmark that performs 10^6 MADV_DONTNEED
> operations and touches a page, while 3 additional threads run a
> busy-wait loop (5 runs, PTI and retpolines are turned off):
>
> base off-stack
> ---- ---------
> avg (usec/op) 1.629 1.570 (-3%)
> stddev 0.014 0.009
>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Signed-off-by: Nadav Amit <[email protected]>
>
> ---
>
> v1->v2:
> - Initialize all flush_tlb_info fields [Andy]
> ---
> arch/x86/mm/tlb.c | 100 ++++++++++++++++++++++++++++++++++------------
> 1 file changed, 74 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 487b8474c01c..aac191eb2b90 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
> this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
> }
>
> -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
> +static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
> {
> const struct flush_tlb_info *f = info;
>
> @@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
> */
> unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
>
> +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
> +
> +#ifdef CONFIG_DEBUG_VM
> +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
> +#endif
> +
> +static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
> + unsigned long start, unsigned long end,
> + unsigned int stride_shift, bool freed_tables,
> + u64 new_tlb_gen)
> +{
> + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
> +
> +#ifdef CONFIG_DEBUG_VM
> + /*
> + * Ensure that the following code is non-reentrant and flush_tlb_info
> + * is not overwritten. This means no TLB flushing is initiated by
> + * interrupt handlers and machine-check exception handlers.
> + */
> + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
> +#endif
isn't this effectively VM_BUG_ON()?
> +
> + info->start = start;
> + info->end = end;
> + info->mm = mm;
> + info->stride_shift = stride_shift;
> + info->freed_tables = freed_tables;
> + info->new_tlb_gen = new_tlb_gen;
This pattern is really asking for a bit more tabulation:
	info->start        = start;
	info->end          = end;
	info->mm           = mm;
	info->stride_shift = stride_shift;
	info->freed_tables = freed_tables;
	info->new_tlb_gen  = new_tlb_gen;
> +static inline void put_flush_tlb_info(void)
> +{
> +#ifdef CONFIG_DEBUG_VM
> + /* Complete reentrancy prevention checks */
> + barrier();
> + this_cpu_dec(flush_tlb_info_idx);
> +#endif
In principle this_cpu_dec() should imply a compiler barrier?
> + if ((end == TLB_FLUSH_ALL) ||
> + ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
> + start = 0UL;
Any reason why we cannot use the '0' numeric literal here?
> + struct flush_tlb_info *info;
> +
> + preempt_disable();
> +
> + info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
> +
> + info = this_cpu_ptr(&flush_tlb_info);
> + info->start = start;
> + info->end = end;
> +
> + on_each_cpu(do_kernel_range_flush, info, 1);
> +
> + put_flush_tlb_info();
> +
> + preempt_enable();
> }
> }
>
> void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
> {
> - struct flush_tlb_info info = {
> + static const struct flush_tlb_info info = {
> .mm = NULL,
> .start = 0UL,
> .end = TLB_FLUSH_ALL,
Could you please move this now-.data global variable just before the
function, to make it more apparent what's going on?
Thanks,
Ingo
> On Apr 25, 2019, at 12:29 PM, Ingo Molnar <[email protected]> wrote:
>
>
> * Nadav Amit <[email protected]> wrote:
>
>> Move flush_tlb_info variables off the stack. This allows flush_tlb_info
>> to be cache-line aligned, avoiding potentially unnecessary cache-line
>> movements. It also gives the variables a fixed virtual-to-physical
>> translation, which reduces TLB misses.
>>
>> Use a per-CPU struct for flush_tlb_mm_range() and
>> flush_tlb_kernel_range(). Add debug assertions to ensure there are
>> no nested TLB flushes that might overwrite the per-CPU data. For
>> arch_tlbbatch_flush(), use a const struct.
>>
>> Results when running a microbenchmark that performs 10^6 MADV_DONTNEED
>> operations and touches a page, while 3 additional threads run a
>> busy-wait loop (5 runs, PTI and retpolines are turned off):
>>
>> base off-stack
>> ---- ---------
>> avg (usec/op) 1.629 1.570 (-3%)
>> stddev 0.014 0.009
>>
>> Cc: Peter Zijlstra <[email protected]>
>> Cc: Andy Lutomirski <[email protected]>
>> Cc: Dave Hansen <[email protected]>
>> Cc: Borislav Petkov <[email protected]>
>> Cc: Thomas Gleixner <[email protected]>
>> Signed-off-by: Nadav Amit <[email protected]>
>>
>> ---
>>
>> v1->v2:
>> - Initialize all flush_tlb_info fields [Andy]
>> ---
>> arch/x86/mm/tlb.c | 100 ++++++++++++++++++++++++++++++++++------------
>> 1 file changed, 74 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
>> index 487b8474c01c..aac191eb2b90 100644
>> --- a/arch/x86/mm/tlb.c
>> +++ b/arch/x86/mm/tlb.c
>> @@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
>> this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
>> }
>>
>> -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
>> +static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
>> {
>> const struct flush_tlb_info *f = info;
>>
>> @@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
>> */
>> unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
>>
>> +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
>> +
>> +#ifdef CONFIG_DEBUG_VM
>> +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
>> +#endif
>> +
>> +static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
>> + unsigned long start, unsigned long end,
>> + unsigned int stride_shift, bool freed_tables,
>> + u64 new_tlb_gen)
>> +{
>> + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
>> +
>> +#ifdef CONFIG_DEBUG_VM
>> + /*
>> + * Ensure that the following code is non-reentrant and flush_tlb_info
>> + * is not overwritten. This means no TLB flushing is initiated by
>> + * interrupt handlers and machine-check exception handlers.
>> + */
>> + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
>> +#endif
>
> isn't this effectively VM_BUG_ON()?
Not exactly. When CONFIG_DEBUG_VM is off we get
#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
This will cause the build to fail since flush_tlb_info_idx is not defined
when CONFIG_DEBUG_VM is off.
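For reference, the relevant definitions are roughly (from
include/linux/mmdebug.h and include/linux/build_bug.h):

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond)		BUG_ON(cond)
#else
#define VM_BUG_ON(cond)		BUILD_BUG_ON_INVALID(cond)
#endif

/* The condition is still compiled and type-checked, just never run: */
#define BUILD_BUG_ON_INVALID(e)	((void)(sizeof((__force long)(e))))

so the expression may only reference symbols that also exist when
CONFIG_DEBUG_VM is off.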
>> +static inline void put_flush_tlb_info(void)
>> +{
>> +#ifdef CONFIG_DEBUG_VM
>> + /* Complete reentrancy prevention checks */
>> + barrier();
>> + this_cpu_dec(flush_tlb_info_idx);
>> +#endif
>
> In principle this_cpu_dec() should imply a compiler barrier?
this_cpu_dec() eventually expands to percpu_add_op(), and its inline
assembly does not have a “memory” clobber, so I don’t think so.
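Simplified (not the verbatim macros from the tree), the difference is:

/* barrier() is an explicit compiler barrier: */
#define barrier()	__asm__ __volatile__("" : : : "memory")

/* this_cpu_dec() boils down to something like this on x86-64: */
asm("decl %%gs:%0" : "+m" (var));	/* clobbers only the variable itself */

Without the "memory" clobber the compiler is free to cache or reorder
other memory accesses across the this_cpu_dec().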
I will address your other comments. Thanks!
* Nadav Amit <[email protected]> wrote:
> > On Apr 25, 2019, at 12:29 PM, Ingo Molnar <[email protected]> wrote:
> >
> >
> > * Nadav Amit <[email protected]> wrote:
> >
> >> Move flush_tlb_info variables off the stack. This allows flush_tlb_info
> >> to be cache-line aligned, avoiding potentially unnecessary cache-line
> >> movements. It also gives the variables a fixed virtual-to-physical
> >> translation, which reduces TLB misses.
> >>
> >> Use a per-CPU struct for flush_tlb_mm_range() and
> >> flush_tlb_kernel_range(). Add debug assertions to ensure there are
> >> no nested TLB flushes that might overwrite the per-CPU data. For
> >> arch_tlbbatch_flush(), use a const struct.
> >>
> >> Results when running a microbenchmark that performs 10^6 MADV_DONTNEED
> >> operations and touches a page, while 3 additional threads run a
> >> busy-wait loop (5 runs, PTI and retpolines are turned off):
> >>
> >> base off-stack
> >> ---- ---------
> >> avg (usec/op) 1.629 1.570 (-3%)
> >> stddev 0.014 0.009
> >>
> >> Cc: Peter Zijlstra <[email protected]>
> >> Cc: Andy Lutomirski <[email protected]>
> >> Cc: Dave Hansen <[email protected]>
> >> Cc: Borislav Petkov <[email protected]>
> >> Cc: Thomas Gleixner <[email protected]>
> >> Signed-off-by: Nadav Amit <[email protected]>
> >>
> >> ---
> >>
> >> v1->v2:
> >> - Initialize all flush_tlb_info fields [Andy]
> >> ---
> >> arch/x86/mm/tlb.c | 100 ++++++++++++++++++++++++++++++++++------------
> >> 1 file changed, 74 insertions(+), 26 deletions(-)
> >>
> >> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> >> index 487b8474c01c..aac191eb2b90 100644
> >> --- a/arch/x86/mm/tlb.c
> >> +++ b/arch/x86/mm/tlb.c
> >> @@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
> >> this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
> >> }
> >>
> >> -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
> >> +static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
> >> {
> >> const struct flush_tlb_info *f = info;
> >>
> >> @@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
> >> */
> >> unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
> >>
> >> +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
> >> +
> >> +#ifdef CONFIG_DEBUG_VM
> >> +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
> >> +#endif
> >> +
> >> +static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
> >> + unsigned long start, unsigned long end,
> >> + unsigned int stride_shift, bool freed_tables,
> >> + u64 new_tlb_gen)
> >> +{
> >> + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
> >> +
> >> +#ifdef CONFIG_DEBUG_VM
> >> + /*
> >> + * Ensure that the following code is non-reentrant and flush_tlb_info
> >> + * is not overwritten. This means no TLB flushing is initiated by
> >> + * interrupt handlers and machine-check exception handlers.
> >> + */
> >> + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
> >> +#endif
> >
> > isn't this effectively VM_BUG_ON()?
>
> Not exactly. When CONFIG_DEBUG_VM is off we get
>
> #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
>
> This will cause the build to fail since flush_tlb_info_idx is not defined
> when CONFIG_DEBUG_VM is off.
Ugh, so VM_BUG_ON() should really be named VM_BUILD_BUG_ON()?
Anyway, agreed.
> >> +static inline void put_flush_tlb_info(void)
> >> +{
> >> +#ifdef CONFIG_DEBUG_VM
> >> + /* Complete reentrancy prevention checks */
> >> + barrier();
> >> + this_cpu_dec(flush_tlb_info_idx);
> >> +#endif
> >
> > In principle this_cpu_dec() should imply a compiler barrier?
>
> this_cpu_dec() eventually expands to percpu_add_op(), and its inline
> assembly does not have a “memory” clobber, so I don’t think so.
I think that's a bug and PeterZ is fixing those.
Thanks,
Ingo
> On Apr 25, 2019, at 12:48 PM, Ingo Molnar <[email protected]> wrote:
>
>
> * Nadav Amit <[email protected]> wrote:
>
>>> On Apr 25, 2019, at 12:29 PM, Ingo Molnar <[email protected]> wrote:
>>>
>>>
>>> * Nadav Amit <[email protected]> wrote:
>>>
>>>> Move flush_tlb_info variables off the stack. This allows flush_tlb_info
>>>> to be cache-line aligned, avoiding potentially unnecessary cache-line
>>>> movements. It also gives the variables a fixed virtual-to-physical
>>>> translation, which reduces TLB misses.
>>>>
>>>> Use a per-CPU struct for flush_tlb_mm_range() and
>>>> flush_tlb_kernel_range(). Add debug assertions to ensure there are
>>>> no nested TLB flushes that might overwrite the per-CPU data. For
>>>> arch_tlbbatch_flush(), use a const struct.
>>>>
>>>> Results when running a microbenchmark that performs 10^6 MADV_DONTNEED
>>>> operations and touches a page, while 3 additional threads run a
>>>> busy-wait loop (5 runs, PTI and retpolines are turned off):
>>>>
>>>> base off-stack
>>>> ---- ---------
>>>> avg (usec/op) 1.629 1.570 (-3%)
>>>> stddev 0.014 0.009
>>>>
>>>> Cc: Peter Zijlstra <[email protected]>
>>>> Cc: Andy Lutomirski <[email protected]>
>>>> Cc: Dave Hansen <[email protected]>
>>>> Cc: Borislav Petkov <[email protected]>
>>>> Cc: Thomas Gleixner <[email protected]>
>>>> Signed-off-by: Nadav Amit <[email protected]>
>>>>
>>>> ---
>>>>
>>>> v1->v2:
>>>> - Initialize all flush_tlb_info fields [Andy]
>>>> ---
>>>> arch/x86/mm/tlb.c | 100 ++++++++++++++++++++++++++++++++++------------
>>>> 1 file changed, 74 insertions(+), 26 deletions(-)
>>>>
>>>> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
>>>> index 487b8474c01c..aac191eb2b90 100644
>>>> --- a/arch/x86/mm/tlb.c
>>>> +++ b/arch/x86/mm/tlb.c
>>>> @@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
>>>> this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
>>>> }
>>>>
>>>> -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
>>>> +static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
>>>> {
>>>> const struct flush_tlb_info *f = info;
>>>>
>>>> @@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
>>>> */
>>>> unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
>>>>
>>>> +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
>>>> +
>>>> +#ifdef CONFIG_DEBUG_VM
>>>> +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
>>>> +#endif
>>>> +
>>>> +static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
>>>> + unsigned long start, unsigned long end,
>>>> + unsigned int stride_shift, bool freed_tables,
>>>> + u64 new_tlb_gen)
>>>> +{
>>>> + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
>>>> +
>>>> +#ifdef CONFIG_DEBUG_VM
>>>> + /*
>>>> + * Ensure that the following code is non-reentrant and flush_tlb_info
>>>> + * is not overwritten. This means no TLB flushing is initiated by
>>>> + * interrupt handlers and machine-check exception handlers.
>>>> + */
>>>> + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
>>>> +#endif
>>>
>>> isn't this effectively VM_BUG_ON()?
>>
>> Not exactly. When CONFIG_DEBUG_VM is off we get
>>
>> #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
>>
>> This will cause the build to fail since flush_tlb_info_idx is not defined
>> when CONFIG_DEBUG_VM is off.
>
> Ugh, so VM_BUG_ON() should really be named VM_BUILD_BUG_ON()?
>
> Anyway, agreed.
>
>>>> +static inline void put_flush_tlb_info(void)
>>>> +{
>>>> +#ifdef CONFIG_DEBUG_VM
>>>> + /* Complete reentrancy prevention checks */
>>>> + barrier();
>>>> + this_cpu_dec(flush_tlb_info_idx);
>>>> +#endif
>>>
>>> In principle this_cpu_dec() should imply a compiler barrier?
>>
>> this_cpu_dec() eventually expands to percpu_add_op(), and its inline
>> assembly does not have a “memory” clobber, so I don’t think so.
>
> I think that's a bug and PeterZ is fixing those.
This would be quite surprising. Even atomic_dec() does not imply a compiler
barrier. I think I should leave it as is for now, and let Peter change it
later if he decides to do so. Let me know if you disagree.
On Thu, Apr 25, 2019 at 09:20:24PM +0000, Nadav Amit wrote:
> > I think that's a bug and PeterZ is fixing those.
>
> This would be quite surprising.
I need to get back to that percpu series .... :/
> Even atomic_dec() does not imply a compiler
> barrier. I think I should leave it as is for now, and let Peter change it
> later if he decides to do so. Let me know if you disagree.
https://lkml.kernel.org/r/[email protected]
> On Apr 26, 2019, at 12:53 AM, Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Apr 25, 2019 at 09:20:24PM +0000, Nadav Amit wrote:
>
>>> I think that's a bug and PeterZ is fixing those.
>>
>> This would be quite surprising.
>
> I need to get back to that percpu series .... :/
>
>> Even atomic_dec() does not imply a compiler
>> barrier. I think I should leave it as is for now, and let Peter change it
>> later if he decides to do so. Let me know if you disagree.
>
> https://lkml.kernel.org/r/20190424124421.808471451@infradead.org
Interesting! (and thanks for the reference). Well, I said it would be quite
surprising, and I see you wrote the same thing in the patch ;-)
But correct me if I’m wrong - it does sound as if you “screw” all the uses
of atomic_inc() and atomic_dec() (~4000 instances) for the fewer uses of
smp_mb__after_atomic() and smp_mb__before_atomic() (~400 instances).
Do you intend to at least introduce a variant of atomic_inc() without a
memory barrier?
On Fri, Apr 26, 2019 at 08:37:37AM +0000, Nadav Amit wrote:
> Interesting! (and thanks for the reference). Well, I said it would be quite
> surprising, and I see you wrote the same thing in the patch ;-)
>
> But correct me if I’m wrong - it does sound as if you “screw” all the uses
> of atomic_inc() and atomic_dec() (~4000 instances) for the fewer uses of
> smp_mb__after_atomic() and smp_mb__before_atomic() (~400 instances).
>
> Do you intend to at least introduce a variant of atomic_inc() without a
> memory barrier?
Based on defconfig build changes that that patch caused, no.
https://lkml.kernel.org/r/[email protected]
Also note that, except for x86 and MIPS, the others (ia64, sparc, s390 and
xtensa) already have this exact behaviour. Also note, as the patch notes,
that on x86 only atomic_{inc,dec,add,sub}() have this;
atomic_{and,or,xor}() already have the memory clobber.

Also, that would complicate the API too much; people are already getting
it wrong _a_lot_.
https://lkml.kernel.org/r/[email protected]
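(For illustration, the x86 definitions in arch/x86/include/asm/atomic.h
looked roughly like this at the time, simplified:)

static __always_inline void arch_atomic_inc(atomic_t *v)
{
	asm volatile(LOCK_PREFIX "incl %0"
		     : "+m" (v->counter));	/* no "memory" clobber */
}

static __always_inline void arch_atomic_and(int i, atomic_t *v)
{
	asm volatile(LOCK_PREFIX "andl %1,%0"
		     : "+m" (v->counter)
		     : "ir" (i)
		     : "memory");		/* doubles as a compiler barrier */
}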