2013-08-02 19:46:43

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH] Add per-process flag to control thp

This patch implements functionality to allow processes to disable the use of
transparent hugepages through the prctl syscall.

We've determined that some jobs perform significantly better with thp disabled,
and we need a way to control thp on a per-process basis, without relying on
madvise.
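
For example (illustrative only; the prctl constants are the ones added by this
patch), a launcher could disable THP for itself before exec'ing the real,
possibly static, binary, and all forked children would inherit the setting:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_THP_DISABLED
#define PR_SET_THP_DISABLED 41          /* values from the prctl.h hunk below */
#define PR_GET_THP_DISABLED 42
#endif

int main(int argc, char **argv)
{
        int flag = -1;

        if (prctl(PR_SET_THP_DISABLED, 0, 0, 0, 0))
                perror("PR_SET_THP_DISABLED");
        if (prctl(PR_GET_THP_DISABLED, (unsigned long)&flag, 0, 0, 0))
                perror("PR_GET_THP_DISABLED");
        printf("thp_disabled = %d\n", flag);

        /* exec the real workload here; the flag is copied at fork time */
        return 0;
}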

---
include/linux/huge_mm.h | 14 +++++++++++++-
include/linux/init_task.h | 8 ++++++++
include/linux/sched.h | 3 +++
include/uapi/linux/prctl.h | 3 +++
kernel/fork.c | 4 ++++
kernel/sys.c | 31 +++++++++++++++++++++++++++++++
6 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b60de92..53af3ca 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,6 +1,8 @@
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H

+#include <linux/sched.h>
+
extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
@@ -66,7 +68,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,

extern bool is_vma_temporary_stack(struct vm_area_struct *vma);

-#define transparent_hugepage_enabled(__vma) \
+#define _transparent_hugepage_enabled(__vma) \
((transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_FLAG) || \
(transparent_hugepage_flags & \
@@ -177,6 +179,11 @@ static inline struct page *compound_trans_head(struct page *page)
return page;
}

+static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ return !current->thp_disabled & _transparent_hugepage_enabled(vma);
+}
+
extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp);

@@ -230,6 +237,11 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str
return 0;
}

+static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ return _transparent_hugepage_enabled(vma);
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5cd0f09..aae74fd 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -152,6 +152,13 @@ extern struct task_group root_task_group;
# define INIT_VTIME(tsk)
#endif

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+# define INIT_THP_DISABLED \
+ .thp_disabled = 0,
+#else
+# define INIT_THP_DISABLED
+#endif
+
#define INIT_TASK_COMM "swapper"

/*
@@ -222,6 +229,7 @@ extern struct task_group root_task_group;
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_CPUSET_SEQ \
INIT_VTIME(tsk) \
+ INIT_THP_DISABLED \
}


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 50d04b9..f084c76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1406,6 +1406,9 @@ struct task_struct {
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int thp_disabled;
+#endif
};

/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 289760f..f69780d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -149,4 +149,7 @@

#define PR_GET_TID_ADDRESS 40

+#define PR_SET_THP_DISABLED 41
+#define PR_GET_THP_DISABLED 42
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 403d2bb..0b4afb5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1311,6 +1311,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->sequential_io_avg = 0;
#endif

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ p->thp_disabled = current->thp_disabled;
+#endif
+
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);

diff --git a/kernel/sys.c b/kernel/sys.c
index 771129b..416c8a6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1836,6 +1836,31 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int prctl_set_thp_disabled(struct task_struct *me)
+{
+ me->thp_disabled = 1;
+ return 0;
+}
+
+static int prctl_get_thp_disabled(struct task_struct *me,
+ int __user *thp_disabled)
+{
+ return put_user(me->thp_disabled, thp_disabled);
+}
+#else
+static int prctl_set_thp_disabled(struct task_struct *me)
+{
+ return -EINVAL;
+}
+
+static int prctl_get_thp_disabled(struct task_struct *me,
+ int __user *thp_disabled)
+{
+ return -EINVAL;
+}
+#endif
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -1999,6 +2024,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
return current->no_new_privs ? 1 : 0;
+ case PR_SET_THP_DISABLED:
+ error = prctl_set_thp_disabled(me);
+ break;
+ case PR_GET_THP_DISABLED:
+ error = prctl_get_thp_disabled(me, (int __user *) arg2);
+ break;
default:
error = -EINVAL;
break;
--
1.7.12.4


2013-08-02 19:55:08

by Dave Jones

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

On Fri, Aug 02, 2013 at 02:46:59PM -0500, Alex Thorlton wrote:

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 50d04b9..f084c76 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1406,6 +1406,9 @@ struct task_struct {
> unsigned int sequential_io;
> unsigned int sequential_io_avg;
> #endif
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + int thp_disabled;
> +#endif
> };

Instead of blowing a whole int on this bool, we could add it
to the bitfield a few pages up where we have other prctl things like..

unsigned no_new_privs:1;
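
so the new flag would just be one more bit there, e.g. (sketch, name up to you):

        unsigned thp_disabled:1;

with the prctl handlers reading/writing that bit instead of a separate int.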

Dave

2013-08-02 20:00:29

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

> Instead of blowing a whole int on this bool, we could add it
> to the bitfield a few pages up where we have other prctl things like..
>
> unsigned no_new_privs:1;

Definitely a better way to go. I'll tweak that and float another
version of the patch. Thanks for the input, Dave!

2013-08-02 20:10:13

by Kirill A. Shutemov

[permalink] [raw]
Subject: RE: [PATCH] Add per-process flag to control thp

Alex Thorlton wrote:
> This patch implements functionality to allow processes to disable the use of
> transparent hugepages through the prctl syscall.
>
> We've determined that some jobs perform significantly better with thp disabled,
> and we need a way to control thp on a per-process basis, without relying on
> madvise.

What kind of workloads are you talking about?

What's wrong with madvise? Could you elaborate?

And I think thp_disabled should be reset to 0 on exec.
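
Something in the exec path would do, e.g. (rough sketch, exact spot debatable):

        /* e.g. in setup_new_exec(), next to the other per-task resets */
        current->thp_disabled = 0;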

--
Kirill A. Shutemov

2013-08-02 20:34:40

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

> What kind of workloads are you talking about?

Our benchmarking team has a list of several of the SPEC OMP benchmarks
that perform significantly better when THP is disabled. I tried to get
the list but one of our servers is acting up and I can't get to it
right now :/

> What's wrong with madvise? Could you elaborate?

The main issue with using madvise is that it's not an option with static
binaries, but there are also some users who have legacy Fortran code
that they're not willing/able to change.

> And I think thp_disabled should be reset to 0 on exec.

The main purpose for this getting carried down from the parent process
is that we'd like to be able to have a userland program set this flag on
itself, and then spawn off children who will also carry the flag.
This allows us to set the flag for programs where we're unable to modify
the code, thus resolving the issues detailed above.

2013-08-02 23:56:38

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

Alex Thorlton wrote:
> > What kind of workloads are you talking about?
>
> Our benchmarking team has a list of several of the SPEC OMP benchmarks
> that perform significantly better when THP is disabled. I tried to get
> the list but one of our servers is acting up and I can't get to it
> right now :/

Have you tracked down the reason why it's slow with THP?

--
Kirill A. Shutemov

2013-08-03 19:35:26

by Kees Cook

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

On Fri, Aug 2, 2013 at 1:34 PM, Alex Thorlton <[email protected]> wrote:
>> What kind of workloads are you talking about?
>
> Our benchmarking team has a list of several of the SPEC OMP benchmarks
> that perform significantly better when THP is disabled. I tried to get
> the list but one of our servers is acting up and I can't get to it
> right now :/
>
>> What's wrong with madvise? Could you elaborate?
>
> The main issue with using madvise is that it's not an option with static
> binaries, but there are also some users who have legacy Fortran code
> that they're not willing/able to change.
>
>> And I think thp_disabled should be reset to 0 on exec.
>
> The main purpose for this getting carried down from the parent process
> is that we'd like to be able to have a userland program set this flag on
> itself, and then spawn off children who will also carry the flag.
> This allows us to set the flag for programs where we're unable to modify
> the code, thus resolving the issues detailed above.

This could be done with LD_PRELOAD for uncontrolled binaries. Though
it does seem sensible to make it act like most personality flags do
(i.e. inherited).
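
A minimal preload shim would look something like this (sketch, untested;
constants as in the patch):

/* gcc -shared -fPIC -o nothp.so nothp.c ; LD_PRELOAD=./nothp.so ./app */
#include <sys/prctl.h>

#ifndef PR_SET_THP_DISABLED
#define PR_SET_THP_DISABLED 41
#endif

__attribute__((constructor))
static void nothp_init(void)
{
        /* runs before main(); with the patch as posted, fork()ed children
         * keep the flag, so the whole job inherits it */
        prctl(PR_SET_THP_DISABLED, 0, 0, 0, 0);
}

It only helps dynamically linked binaries, of course, which is why an
inherited per-process knob is still interesting for the static case.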

-Kees

--
Kees Cook
Chrome OS Security

2013-08-04 14:25:45

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

On 08/03, Kees Cook wrote:
>
> On Fri, Aug 2, 2013 at 1:34 PM, Alex Thorlton <[email protected]> wrote:
> >
> >> And I think thp_disabled should be reset to 0 on exec.
> >
> > The main purpose for this getting carried down from the parent process
> > is that we'd like to be able to have a userland program set this flag on
> > itself, and then spawn off children who will also carry the flag.
> > This allows us to set the flag for programs where we're unable to modify
> > the code, thus resolving the issues detailed above.
>
> This could be done with LD_PRELOAD for uncontrolled binaries. Though
> it does seem sensible to make it act like most personality flags do
> (i.e. inherited).

It is only inheritable if the process is single-threaded.

And even if it is single-threaded I can't understand how it can really
help.

OK, transparent_hugepage_enabled() checks ->thp_disabled, but this is
the fault path. But this flag can't hide the task from khugepaged and
collapse_huge_page ?

I still think this should be mm flag. And we have MMF_INIT_MASK, it
can be inheritable.
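
Roughly (flag name and bit number made up):

        #define MMF_THP_DISABLE         21      /* hypothetical free bit */

        /* prctl side */
        set_bit(MMF_THP_DISABLE, &me->mm->flags);

        /* transparent_hugepage_enabled() */
        return !test_bit(MMF_THP_DISABLE, &vma->vm_mm->flags) &&
               _transparent_hugepage_enabled(vma);

plus OR'ing its mask into MMF_INIT_MASK so fork() copies it.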

Oleg.

2013-08-04 14:44:25

by Rasmus Villemoes

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

Alex Thorlton <[email protected]> writes:

> This patch implements functionality to allow processes to disable the use of
> transparent hugepages through the prctl syscall.

A few comments:

Is there a reason it shouldn't be possible for a process to un-disable/reenable thp?

> +static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
> +{
> + return !current->thp_disabled & _transparent_hugepage_enabled(vma);
> +}

Should probably be &&.

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 50d04b9..f084c76 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1406,6 +1406,9 @@ struct task_struct {
> unsigned int sequential_io;
> unsigned int sequential_io_avg;
> #endif
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + int thp_disabled;
> +#endif
> };

Is there a reason this needs a whole int in task_struct and not just
a single bit?

Rasmus

2013-08-05 02:36:38

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

Alex Thorlton <[email protected]> writes:

>> What kind of workloads are you talking about?
>
> Our benchmarking team has a list of several of the SPEC OMP benchmarks
> that perform significantly better when THP is disabled. I tried to get
> the list but one of our servers is acting up and I can't get to it
> right now :/

Please try it with the vclear patches.

https://lkml.org/lkml/2012/7/20/165
et.al.

-Andi


--
[email protected] -- Speaking for myself only

2013-08-05 15:07:07

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

> Please try it with the vclear patches.

Thanks, Andi. I'll give that a shot and see if it makes any difference.

- Alex

2013-08-16 14:34:36

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH 3/8] THP: Pass real, not rounded, address to clear_huge_page

Conflicts:
mm/huge_memory.c
---
mm/huge_memory.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 55ec681..2e6f106 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -701,7 +701,8 @@ static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)

static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
+ unsigned long haddr,
+ unsigned long address, pmd_t *pmd,
struct page *page)
{
pgtable_t pgtable;
@@ -711,12 +712,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
if (unlikely(!pgtable))
return VM_FAULT_OOM;

- clear_huge_page(page, haddr, HPAGE_PMD_NR);
/*
* The memory barrier inside __SetPageUptodate makes sure that
* clear_huge_page writes become visible before the set_pmd_at()
* write.
*/
+ clear_huge_page(page, address, HPAGE_PMD_NR);
__SetPageUptodate(page);

spin_lock(&mm->page_table_lock);
@@ -825,8 +826,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
put_page(page);
goto out;
}
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
- page))) {
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr,
+ address, pmd, page))) {
mem_cgroup_uncharge_page(page);
put_page(page);
goto out;
--
1.7.12.4

2013-08-16 14:34:39

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp

Here are the results from one of the benchmarks that performs
particularly poorly when thp is enabled. Unfortunately the vclear
patches don't seem to provide a performance boost. I've attached
the patches that include the changes I had to make to get the vclear
patches applied to the latest kernel.

This first set of tests was run on the latest community kernel, with the
vclear patches:

Kernel string: Kernel 3.11.0-rc5-medusa-00021-g1a15a96-dirty
harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled
[always] madvise never
harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh
...
Done. Terminating the simulation.

real 25m34.052s
user 10769m7.948s
sys 37m46.524s

harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# echo never > /sys/kernel/mm/transparent_hugepage/enabled
harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled
always madvise [never]
harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh
...
Done. Terminating the simulation.

real 5m0.377s
user 2202m0.684s
sys 108m31.816s

Here are the same tests on the clean kernel:

Kernel string: Kernel 3.11.0-rc5-medusa-00013-g584d88b
athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> cat /sys/kernel/mm/transparent_hugepage/enabled
[always] madvise never
athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> time ./run.sh
...
Done. Terminating the simulation.

real 21m44.052s
user 10809m55.356s
sys 39m58.300s


harp31-sys:~ # echo never > /sys/kernel/mm/transparent_hugepage/enabled
athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> cat /sys/kernel/mm/transparent_hugepage/enabled
always madvise [never]
athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> time ./run.sh
...
Done. Terminating the simulation.

real 4m52.502s
user 2127m18.548s
sys 104m50.828s

Working on getting some more information about the root of the
performance issues now...

Alex Thorlton (8):
THP: Use real address for NUMA policy
mm: make clear_huge_page tolerate non aligned address
THP: Pass real, not rounded, address to clear_huge_page
x86: Add clear_page_nocache
mm: make clear_huge_page cache clear only around the fault address
x86: switch the 64bit uncached page clear to SSE/AVX v2
remove KM_USER0 from kmap_atomic call
fix up references to kernel_fpu_begin/end

arch/x86/include/asm/page.h | 2 +
arch/x86/include/asm/string_32.h | 5 ++
arch/x86/include/asm/string_64.h | 5 ++
arch/x86/lib/Makefile | 1 +
arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++
arch/x86/lib/clear_page_nocache_64.S | 92 ++++++++++++++++++++++++++++++++++++
arch/x86/mm/fault.c | 7 +++
mm/huge_memory.c | 17 +++----
mm/memory.c | 31 ++++++++++--
9 files changed, 179 insertions(+), 11 deletions(-)
create mode 100644 arch/x86/lib/clear_page_nocache_32.S
create mode 100644 arch/x86/lib/clear_page_nocache_64.S

--
1.7.12.4

2013-08-16 14:35:05

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH 6/8] x86: switch the 64bit uncached page clear to SSE/AVX v2

---
arch/x86/lib/clear_page_nocache_64.S | 91 ++++++++++++++++++++++++++++++------
1 file changed, 77 insertions(+), 14 deletions(-)

diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S
index ee16d15..a6d938c 100644
--- a/arch/x86/lib/clear_page_nocache_64.S
+++ b/arch/x86/lib/clear_page_nocache_64.S
@@ -1,29 +1,92 @@
+/*
+ * Clear pages with cache bypass.
+ *
+ * Copyright (C) 2011, 2012 Intel Corporation
+ * Author: Andi Kleen
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 only as published by the
+ * Free Software Foundation.
+ */
+
#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

+#define SSE_UNROLL 128
+
/*
* Zero a page avoiding the caches
* rdi page
*/
ENTRY(clear_page_nocache)
CFI_STARTPROC
- xorl %eax,%eax
- movl $4096/64,%ecx
+ push %rdi
+ call kernel_fpu_begin
+ pop %rdi
+ sub $16,%rsp
+ CFI_ADJUST_CFA_OFFSET 16
+ movdqu %xmm0,(%rsp)
+ xorpd %xmm0,%xmm0
+ movl $4096/SSE_UNROLL,%ecx
.p2align 4
.Lloop:
decl %ecx
-#define PUT(x) movnti %rax,x*8(%rdi)
- movnti %rax,(%rdi)
- PUT(1)
- PUT(2)
- PUT(3)
- PUT(4)
- PUT(5)
- PUT(6)
- PUT(7)
- leaq 64(%rdi),%rdi
+ .set x,0
+ .rept SSE_UNROLL/16
+ movntdq %xmm0,x(%rdi)
+ .set x,x+16
+ .endr
+ leaq SSE_UNROLL(%rdi),%rdi
jnz .Lloop
- nop
- ret
+ movdqu (%rsp),%xmm0
+ addq $16,%rsp
+ CFI_ADJUST_CFA_OFFSET -16
+ jmp kernel_fpu_end
CFI_ENDPROC
ENDPROC(clear_page_nocache)
+
+#ifdef CONFIG_AS_AVX
+
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_nocache_avx - clear_page_nocache) - (2f - 1b)
+ /* offset */
+2:
+ .previous
+ .section .altinstructions,"a"
+ altinstruction_entry clear_page_nocache,1b,X86_FEATURE_AVX,\
+ 16, 2b-1b
+ .previous
+
+#define AVX_UNROLL 256 /* TUNE ME */
+
+ENTRY(clear_page_nocache_avx)
+ CFI_STARTPROC
+ push %rdi
+ call kernel_fpu_begin
+ pop %rdi
+ sub $32,%rsp
+ CFI_ADJUST_CFA_OFFSET 32
+ vmovdqu %ymm0,(%rsp)
+ vxorpd %ymm0,%ymm0,%ymm0
+ movl $4096/AVX_UNROLL,%ecx
+ .p2align 4
+.Lloop_avx:
+ decl %ecx
+ .set x,0
+ .rept AVX_UNROLL/32
+ vmovntdq %ymm0,x(%rdi)
+ .set x,x+32
+ .endr
+ leaq AVX_UNROLL(%rdi),%rdi
+ jnz .Lloop_avx
+ vmovdqu (%rsp),%ymm0
+ addq $32,%rsp
+ CFI_ADJUST_CFA_OFFSET -32
+ jmp kernel_fpu_end
+ CFI_ENDPROC
+ENDPROC(clear_page_nocache_avx)
+
+#endif
--
1.7.12.4

2013-08-16 14:35:10

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH 7/8] remove KM_USER0 from kmap_atomic call

---
arch/x86/mm/fault.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a088ed7..d3f6dc7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1233,7 +1233,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)

void clear_user_highpage_nocache(struct page *page, unsigned long vaddr)
{
- void *p = kmap_atomic(page, KM_USER0);
+ void *p = kmap_atomic(page);
clear_page_nocache(p);
kunmap_atomic(p);
}
--
1.7.12.4

2013-08-16 14:35:18

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH 1/8] THP: Use real address for NUMA policy

---
mm/huge_memory.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a92012a..55ec681 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -746,11 +746,11 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)

static inline struct page *alloc_hugepage_vma(int defrag,
struct vm_area_struct *vma,
- unsigned long haddr, int nd,
+ unsigned long address, int nd,
gfp_t extra_gfp)
{
return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
- HPAGE_PMD_ORDER, vma, haddr, nd);
+ HPAGE_PMD_ORDER, vma, address, nd);
}

#ifndef CONFIG_NUMA
@@ -815,7 +815,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
+ vma, address, numa_node_id(), 0);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
goto out;
@@ -1160,7 +1160,7 @@ alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
+ vma, address, numa_node_id(), 0);
else
new_page = NULL;

--
1.7.12.4

2013-08-16 14:35:23

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH 2/8] mm: make clear_huge_page tolerate non aligned address

---
mm/memory.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 1ce2e2a..69a5a38 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4249,16 +4249,17 @@ void clear_huge_page(struct page *page,
unsigned long addr, unsigned int pages_per_huge_page)
{
int i;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;

if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
- clear_gigantic_page(page, addr, pages_per_huge_page);
+ clear_gigantic_page(page, haddr, pages_per_huge_page);
return;
}

might_sleep();
for (i = 0; i < pages_per_huge_page; i++) {
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ clear_user_highpage(page + i, haddr + i * PAGE_SIZE);
}
}

--
1.7.12.4

2013-08-16 14:35:28

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address

---
mm/memory.c | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 69a5a38..17d61f0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4231,18 +4231,35 @@ EXPORT_SYMBOL(might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+
+#ifndef ARCH_HAS_USER_NOCACHE
+#define ARCH_HAS_USER_NOCACHE 0
+#endif
+
+#if ARCH_HAS_USER_NOCACHE == 0
+#define clear_user_highpage_nocache clear_user_highpage
+#endif
+
static void clear_gigantic_page(struct page *page,
unsigned long addr,
unsigned int pages_per_huge_page)
{
int i;
struct page *p = page;
+ unsigned long vaddr;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int target = (addr - haddr) >> PAGE_SHIFT;

might_sleep();
+ vaddr = haddr;
for (i = 0; i < pages_per_huge_page;
i++, p = mem_map_next(p, page, i)) {
cond_resched();
- clear_user_highpage(p, addr + i * PAGE_SIZE);
+ vaddr = haddr + i*PAGE_SIZE;
+ if (!ARCH_HAS_USER_NOCACHE || i == target)
+ clear_user_highpage(p, vaddr);
+ else
+ clear_user_highpage_nocache(p, vaddr);
}
}
void clear_huge_page(struct page *page,
@@ -4250,6 +4267,8 @@ void clear_huge_page(struct page *page,
{
int i;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long vaddr;
+ int target = (addr - haddr) >> PAGE_SHIFT;

if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, haddr, pages_per_huge_page);
@@ -4257,9 +4276,14 @@ void clear_huge_page(struct page *page,
}

might_sleep();
+ vaddr = haddr;
for (i = 0; i < pages_per_huge_page; i++) {
cond_resched();
- clear_user_highpage(page + i, haddr + i * PAGE_SIZE);
+ vaddr = haddr + i*PAGE_SIZE;
+ if (!ARCH_HAS_USER_NOCACHE || i == target)
+ clear_user_highpage(page + i, vaddr);
+ else
+ clear_user_highpage_nocache(page + i, vaddr);
}
}

--
1.7.12.4

2013-08-16 14:35:15

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH 8/8] fix up references to kernel_fpu_begin/end

---
arch/x86/lib/clear_page_nocache_64.S | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S
index a6d938c..d03408e 100644
--- a/arch/x86/lib/clear_page_nocache_64.S
+++ b/arch/x86/lib/clear_page_nocache_64.S
@@ -23,7 +23,7 @@
ENTRY(clear_page_nocache)
CFI_STARTPROC
push %rdi
- call kernel_fpu_begin
+ call __kernel_fpu_begin
pop %rdi
sub $16,%rsp
CFI_ADJUST_CFA_OFFSET 16
@@ -43,7 +43,7 @@ ENTRY(clear_page_nocache)
movdqu (%rsp),%xmm0
addq $16,%rsp
CFI_ADJUST_CFA_OFFSET -16
- jmp kernel_fpu_end
+ jmp __kernel_fpu_end
CFI_ENDPROC
ENDPROC(clear_page_nocache)

@@ -65,7 +65,7 @@ ENDPROC(clear_page_nocache)
ENTRY(clear_page_nocache_avx)
CFI_STARTPROC
push %rdi
- call kernel_fpu_begin
+ call __kernel_fpu_begin
pop %rdi
sub $32,%rsp
CFI_ADJUST_CFA_OFFSET 32
@@ -85,7 +85,7 @@ ENTRY(clear_page_nocache_avx)
vmovdqu (%rsp),%ymm0
addq $32,%rsp
CFI_ADJUST_CFA_OFFSET -32
- jmp kernel_fpu_end
+ jmp __kernel_fpu_end
CFI_ENDPROC
ENDPROC(clear_page_nocache_avx)

--
1.7.12.4

2013-08-16 14:37:13

by Alex Thorlton

[permalink] [raw]
Subject: [PATCH 4/8] x86: Add clear_page_nocache

Conflicts:
arch/x86/mm/fault.c
---
arch/x86/include/asm/page.h | 2 ++
arch/x86/include/asm/string_32.h | 5 +++++
arch/x86/include/asm/string_64.h | 5 +++++
arch/x86/lib/Makefile | 1 +
arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++++++++++++++++++++
arch/x86/lib/clear_page_nocache_64.S | 29 +++++++++++++++++++++++++++++
arch/x86/mm/fault.c | 7 +++++++
7 files changed, 79 insertions(+)
create mode 100644 arch/x86/lib/clear_page_nocache_32.S
create mode 100644 arch/x86/lib/clear_page_nocache_64.S

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index c878924..ccec9c3 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -33,6 +33,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
copy_page(to, from);
}

+void clear_user_highpage_nocache(struct page *page, unsigned long vaddr);
+
#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 3d3e835..3f2fbcf 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -3,6 +3,8 @@

#ifdef __KERNEL__

+#include <linux/linkage.h>
+
/* Let gcc decide whether to inline or use the out of line functions */

#define __HAVE_ARCH_STRCPY
@@ -337,6 +339,9 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern,
#define __HAVE_ARCH_MEMSCAN
extern void *memscan(void *addr, int c, size_t size);

+#define ARCH_HAS_USER_NOCACHE 1
+asmlinkage void clear_page_nocache(void *page);
+
#endif /* __KERNEL__ */

#endif /* _ASM_X86_STRING_32_H */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c46..ca23d1d 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -3,6 +3,8 @@

#ifdef __KERNEL__

+#include <linux/linkage.h>
+
/* Written 2002 by Andi Kleen */

/* Only used for special circumstances. Stolen from i386/string.h */
@@ -63,6 +65,9 @@ char *strcpy(char *dest, const char *src);
char *strcat(char *dest, const char *src);
int strcmp(const char *cs, const char *ct);

+#define ARCH_HAS_USER_NOCACHE 1
+asmlinkage void clear_page_nocache(void *page);
+
#endif /* __KERNEL__ */

#endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 96b2c66..e48776c 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -23,6 +23,7 @@ lib-y += memcpy_$(BITS).o
lib-$(CONFIG_SMP) += rwlock.o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
+lib-y += clear_page_nocache_$(BITS).o

obj-y += msr.o msr-reg.o msr-reg-export.o

diff --git a/arch/x86/lib/clear_page_nocache_32.S b/arch/x86/lib/clear_page_nocache_32.S
new file mode 100644
index 0000000..2394e0c
--- /dev/null
+++ b/arch/x86/lib/clear_page_nocache_32.S
@@ -0,0 +1,30 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Zero a page avoiding the caches
+ * rdi page
+ */
+ENTRY(clear_page_nocache)
+ CFI_STARTPROC
+ mov %eax,%edi
+ xorl %eax,%eax
+ movl $4096/64,%ecx
+ .p2align 4
+.Lloop:
+ decl %ecx
+#define PUT(x) movnti %eax,x*8(%edi) ; movnti %eax,x*8+4(%edi)
+ PUT(0)
+ PUT(1)
+ PUT(2)
+ PUT(3)
+ PUT(4)
+ PUT(5)
+ PUT(6)
+ PUT(7)
+ lea 64(%edi),%edi
+ jnz .Lloop
+ nop
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_nocache)
diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S
new file mode 100644
index 0000000..ee16d15
--- /dev/null
+++ b/arch/x86/lib/clear_page_nocache_64.S
@@ -0,0 +1,29 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Zero a page avoiding the caches
+ * rdi page
+ */
+ENTRY(clear_page_nocache)
+ CFI_STARTPROC
+ xorl %eax,%eax
+ movl $4096/64,%ecx
+ .p2align 4
+.Lloop:
+ decl %ecx
+#define PUT(x) movnti %rax,x*8(%rdi)
+ movnti %rax,(%rdi)
+ PUT(1)
+ PUT(2)
+ PUT(3)
+ PUT(4)
+ PUT(5)
+ PUT(6)
+ PUT(7)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop
+ nop
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_nocache)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 654be4a..a088ed7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1230,3 +1230,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
__do_page_fault(regs, error_code);
exception_exit(prev_state);
}
+
+void clear_user_highpage_nocache(struct page *page, unsigned long vaddr)
+{
+ void *p = kmap_atomic(page, KM_USER0);
+ clear_page_nocache(p);
+ kunmap_atomic(p);
+}
--
1.7.12.4

2013-08-16 14:48:09

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp

On Fri, Aug 16, 2013 at 09:33:56AM -0500, Alex Thorlton wrote:

> This first set of tests was run on the latest community kernel, with the
> vclear patches:
>
> Kernel string: Kernel 3.11.0-rc5-medusa-00021-g1a15a96-dirty
> harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled
> [always] madvise never
> harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh
> ...
> Done. Terminating the simulation.
>
> real 25m34.052s
> user 10769m7.948s
> sys 37m46.524s
>
> harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# echo never > /sys/kernel/mm/transparent_hugepage/enabled
> harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled
> always madvise [never]
> harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh
> ...
> Done. Terminating the simulation.
>
> real 5m0.377s
> user 2202m0.684s
> sys 108m31.816s
>

This 321.equake_l thing is not public, right? Do you have anything that
is public that shows the same problem?

Do you have any idea where all the extra time is spent?

2013-08-16 15:04:36

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp

> This 321.equake_l thing is not public, right? Do you have anything
> that
> is public that shows the same problem?

The quake test comes from the SPEC OMP benchmarks. While I believe this
suite is available to anybody, I don't think it's free. I was given
the test by our benchmarking team, I'll have to ask them if this is
publicly available/if they know of any publicly available tests that can
reproduce the issue.

> Do you have any idea where all the extra time is spent?

I'm working on that as we speak. I was trying to eliminate the
possibility that the vclear patches would help solve the problem,
before digging any deeper. I should have some more information on
exactly where it's getting hung up in a few hours.

- Alex

2013-08-16 17:53:58

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

On 08/16/2013 07:33 AM, Alex Thorlton wrote:
> ---
> mm/huge_memory.c | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index a92012a..55ec681 100644

Could you add some actual descriptions to these patches that say why you
are doing this, and why this particular patch is needed and implemented
this way?

You mention that THP is slow for you, then go on to implement some
non-cached page zero'ing, but you never quite connect the dots.

2013-08-16 18:03:06

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address

On 08/16/2013 07:34 AM, Alex Thorlton wrote:
> +#if ARCH_HAS_USER_NOCACHE == 0
> +#define clear_user_highpage_nocache clear_user_highpage
> +#endif
...
> cond_resched();
> - clear_user_highpage(p, addr + i * PAGE_SIZE);
> + vaddr = haddr + i*PAGE_SIZE;
> + if (!ARCH_HAS_USER_NOCACHE || i == target)
> + clear_user_highpage(p, vaddr);
> + else
> + clear_user_highpage_nocache(p, vaddr);
> }
> }

Is the check for !ARCH_HAS_USER_NOCACHE in there really necessary? That
logic seems like it would be taken care of by that #define.

Plus, if you don't check 'ARCH_HAS_USER_NOCACHE' like this, you don't
need to define a value for it, and you can axe the:

+#ifndef ARCH_HAS_USER_NOCACHE
+#define ARCH_HAS_USER_NOCACHE 0
+#endif

2013-08-16 18:17:13

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

> Could you add some actual descriptions to these patches that say why you
> are doing this, and why this particular patch is needed and implemented
> this way?
>
> You mention that THP is slow for you, then go on to implement some
> non-cached page zero'ing, but you never quite connect the dots.

I actually didn't write these patches (made a few tweaks to get them
running on the latest kernel though). They were submitted last July by
Peter Zijlstra. Andi Kleen suggested that I re-run some of my tests
using these patches to see whether they solved my issue. I just
included my updated patches so that people could confirm that I'd pulled
them forward properly.

The messages from the original submission can be found here:
https://lkml.org/lkml/2012/7/20/165

I did write the patch that these were submitted in response to, to
control THP on a per-process basis, but it looks like we're probably
going to end up taking this in a different direction, pending some more
test results.

- Alex

2013-08-16 18:51:49

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

On Fri, Aug 16, 2013 at 01:17:28PM -0500, Alex Thorlton wrote:
> > Could you add some actual descriptions to these patches that say why you
> > are doing this, and why this particular patch is needed and implemented
> > this way?
> >
> > You mention that THP is slow for you, then go on to implement some
> > non-cached page zero'ing, but you never quite connect the dots.
>
> I actually didn't write these patches (made a few tweaks to get them
> running on the latest kernel though). They were submitted last July by
> Peter Zijlstra. Andi Kleen suggested that I re-run some of my tests
> using these patches to see whether they solved my issue. I just
> included my updated patches so that people could confirm that I'd pulled
> them forward properly.
>
> The messages from the original submission can be found here:
> https://lkml.org/lkml/2012/7/20/165

Here's a more up-to-date version: https://lkml.org/lkml/2012/8/20/337

--
Kirill A. Shutemov

2013-08-16 22:46:22

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

On Fri, Aug 16, 2013 at 09:46:35PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 16, 2013 at 01:17:28PM -0500, Alex Thorlton wrote:
> > I actually didn't write these patches (made a few tweaks to get them
> > running on the latest kernel though). They were submitted last July by
> > Peter Zijlstra.
>
> By Kirill, I don't think I've ever touched them.
>
> > The messages from the original submission can be found here:
> > https://lkml.org/lkml/2012/7/20/165
>
> As can be seen by this posting you reference ;-)

Sorry about that! Not sure where I got the idea that you wrote them,
must've gotten some names mixed up somewhere.

2013-08-17 00:52:40

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

On Fri, Aug 16, 2013 at 01:17:28PM -0500, Alex Thorlton wrote:
> I actually didn't write these patches (made a few tweaks to get them
> running on the latest kernel though). They were submitted last July by
> Peter Zijlstra.

By Kirill, I don't think I've ever touched them.

> The messages from the original submission can be found here:
> https://lkml.org/lkml/2012/7/20/165

As can be seen by this posting you reference ;-)

2013-08-27 16:50:42

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

> Here's a more up-to-date version: https://lkml.org/lkml/2012/8/20/337

These don't seem to give us a noticeable performance change either:

With THP:

real 22m34.279s
user 10797m35.984s
sys 39m18.188s

Without THP:

real 4m48.957s
user 2118m23.208s
sys 113m12.740s

Looks like we got a few minutes faster on the with THP case, but it's
still significantly slower, and that could just be a fluke result; we're
still floating at about a 5x performance degradation.

I talked with one of our performance/benchmarking experts last week and
he's done a bit more research into the actual problem here, so I've got
a bit more information:

The real performance hit, based on our testing, seems to be coming from
the increased latency that comes into play on large NUMA systems when a
process has to go off-node to read from/write to memory.

To give an extreme example, say we have a 16 node system with 8 cores
per node. If we have a job that shares a 2MB data structure between 128
threads, with THP on, the first thread to touch the structure will
allocate all 2MB of space for that structure in a 2MB page, local to its
socket. This means that all the memory accesses for the other 120
threads will be remote accesses. With THP off, each thread could locally
allocate a number of 4K pages sufficient to hold the chunk of the
structure on which it needs to work, significantly reducing the number
of remote accesses that each thread will need to perform.

So, with that in mind, do we agree that a per-process tunable (or
something similar) to control THP seems like a reasonable method to
handle this issue?

Just want to confirm that everyone likes this approach before moving
forward with another revision of the patch. I'm currently in favor of
moving this to a per-mm tunable, since that seems to make more sense
when it comes to threaded jobs. Also, a decent chunk of the code I've
already written can be reused with this approach, and prctl will still
be an appropriate place from which to control the behavior. Andrew
Morton suggested possibly controlling this through the ELF header, but
I'm going to lean towards the per-mm route unless anyone has a major
objection to it.

- Alex

2013-08-27 17:01:05

by Robin Holt

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

Alex,

Although the explanation seems plausible, have you verified this is
actually possible? You could make a simple pthread test case which
allocates a getpagesize() * <number-of-threads> area, prints its
address and then each thread migrate and reference their page. Have
the task then sleep(<long-time>) before exit. Look at the physical
address space with dlook for those virtual addresses in both the THP
and non-THP cases.
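
If dlook isn't handy, move_pages(2) with a NULL nodes array will report the
node backing each page, e.g. (sketch, link with -lnuma):

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

/* print the node backing each page of [buf, buf + len) */
static void print_nodes(char *buf, size_t len, size_t pagesize)
{
        unsigned long i, npages = len / pagesize;
        void **pages = malloc(npages * sizeof(*pages));
        int *status = malloc(npages * sizeof(*status));

        for (i = 0; i < npages; i++)
                pages[i] = buf + i * pagesize;
        /* status[i] is the node number, or a negative errno for that page */
        if (move_pages(0 /* self */, npages, pages, NULL, status, 0) == 0)
                for (i = 0; i < npages; i++)
                        printf("page %lu -> node %d\n", i, status[i]);
        free(pages);
        free(status);
}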

Thanks,
Robin

On Tue, Aug 27, 2013 at 11:50 AM, Alex Thorlton <[email protected]> wrote:
>> Here's a more up-to-date version: https://lkml.org/lkml/2012/8/20/337
>
> These don't seem to give us a noticeable performance change either:
>
> With THP:
>
> real 22m34.279s
> user 10797m35.984s
> sys 39m18.188s
>
> Without THP:
>
> real 4m48.957s
> user 2118m23.208s
> sys 113m12.740s
>
> Looks like we got a few minutes faster on the with THP case, but it's
> still significantly slower, and that could just be a fluke result; we're
> still floating at about a 5x performance degradation.
>
> I talked with one of our performance/benchmarking experts last week and
> he's done a bit more research into the actual problem here, so I've got
> a bit more information:
>
> The real performance hit, based on our testing, seems to be coming from
> the increased latency that comes into play on large NUMA systems when a
> process has to go off-node to read from/write to memory.
>
> To give an extreme example, say we have a 16 node system with 8 cores
> per node. If we have a job that shares a 2MB data structure between 128
> threads, with THP on, the first thread to touch the structure will
> allocate all 2MB of space for that structure in a 2MB page, local to its
> socket. This means that all the memory accesses for the other 120
> threads will be remote accesses. With THP off, each thread could locally
> allocate a number of 4K pages sufficient to hold the chunk of the
> structure on which it needs to work, significantly reducing the number
> of remote accesses that each thread will need to perform.
>
> So, with that in mind, do we agree that a per-process tunable (or
> something similar) to control THP seems like a reasonable method to
> handle this issue?
>
> Just want to confirm that everyone likes this approach before moving
> forward with another revision of the patch. I'm currently in favor of
> moving this to a per-mm tunable, since that seems to make more sense
> when it comes to threaded jobs. Also, a decent chunk of the code I've
> already written can be reused with this approach, and prctl will still
> be an appropriate place from which to control the behavior. Andrew
> Morton suggested possibly controlling this through the ELF header, but
> I'm going to lean towards the per-mm route unless anyone has a major
> objection to it.
>
> - Alex

2013-08-28 09:32:46

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

On Wed, Aug 28, 2013 at 11:18:14AM +0200, Ingo Molnar wrote:
> But ideally THP should detect cases where a hugepage is heavily used from
> multiple nodes and un-HP the page in question. Or not turn it into a
> hugepage in the first place. (We actually have a memory access pattern
> sampling facility in place upstream, the new CONFIG_NUMA_BALANCING=y code,
> which could in theory be utilized.)

Mel and I spoke about doing just that a few weeks ago. Breaking up THP
pages when we get shared faults from different nodes and avoiding
coalescing when the pages are from different nodes.

Rik seemed a little skeptical about the entire thing saying that the
huge tlb gain is about the same order as the locality thing -- while
this is more or less true for the 'tiny' machines most of us have, this
is clearly not the case for these big machines.

2013-08-28 13:57:21

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] Add per-process flag to control thp

Hi everyone,

On Fri, Aug 02, 2013 at 02:46:59PM -0500, Alex Thorlton wrote:
> This patch implements functionality to allow processes to disable the use of
> transparent hugepages through the prctl syscall.
>
> We've determined that some jobs perform significantly better with thp disabled,
> and we need a way to control thp on a per-process basis, without relying on
> madvise.

Are you using automatic NUMA balancing or CPU pinning or was this a
short lived computation?

If you're not using automatic NUMA balancing or if you're using
automatic NUMA balancing and this is a fairly short lived computation,
zone_reclaim_mode was broken with THP enabled as it never called into
compaction, so with THP enabled you would have a random NUMA placement
during page fault to satisfy hugepage allocation (first priority was
to get any hugepage from any wrong node, only then it would call
compaction).

Did you verify the numa placement that you were getting?

I fixed the zone_reclaim_mode for THP and posted the fixes on
linux-mm, so you may want to try that fix just in case it helps your
workload.

Chances are you work with a big system where zone_reclaim_mode is
enabled by default, and for a test I would suggest you to enable it
once even if it's not enabled by default (after applying the fixes) to
see if it makes any difference.
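
(zone_reclaim_mode is the usual sysctl, so for the test it's just:

        echo 1 > /proc/sys/vm/zone_reclaim_mode

or vm.zone_reclaim_mode=1 via sysctl.)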

I just did a push of my latest tree to the master branch of aa.git on
kernel.org, so you can just pull that for a quick test with
zone_reclaim_mode enabled.

NOTE: it's not guaranteed to help but it's worth a try. As you pointed
out, if the threads are working on separate 4k fragments, it won't
help. But as things stand now, THP enabled breaks
zone_reclaim_mode, and so without hard memory pinning it wouldn't
perform as well as with THP disabled.

2013-09-04 15:43:08

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

On Tue, Aug 27, 2013 at 12:01:01PM -0500, Robin Holt wrote:
> Alex,
>
> Although the explanation seems plausible, have you verified this is
> actually possible? You could make a simple pthread test case which
> allocates a getpagesize() * <number-of-threads> area, prints its
> address and then each thread migrate and reference their page. Have
> the task then sleep(<long-time>) before exit. Look at the physical
> address space with dlook for those virtual addresses in both the THP
> and non-THP cases.
>
> Thanks,
> Robin

Robin,

I tweaked one of our other tests to behave pretty much exactly as I
described, and I can see a very significant increase in performance with
THP turned off. The test behaves as follows:

- malloc a large array
- Spawn a specified number of threads
- Have each thread touch small, evenly spaced chunks of the array (e.g.
for 128 threads, the array is divided into 128 chunks, and each thread
touches 1/128th of each chunk, dividing the array into 16,384 pieces)

With THP off, the majority of each thread's pages are node local. With
THP on, most of the pages end up as THPs on the first thread's nodes,
since it is touching chunks that are close enough together to be
collapsed into THPs which will, of course, remain on the first node for
the duration of the test.

Here are some timings for 128 threads, allocating a total of 64gb:

THP on:

real 1m6.394s
user 16m1.160s
sys 75m25.232s

THP off:

real 0m35.251s
user 26m37.316s
sys 3m28.472s

The performance hit here isn't as severe as shown with the SPEC workload
that we originally used, but it still appears to consistently take about
twice as long with THP enabled.

2013-09-04 17:15:33

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

> Robin,
>
> I tweaked one of our other tests to behave pretty much exactly as I
> - malloc a large array
> - Spawn a specified number of threads
> - Have each thread touch small, evenly spaced chunks of the array (e.g.
> for 128 threads, the array is divided into 128 chunks, and each thread
> touches 1/128th of each chunk, dividing the array into 16,384 pieces)

Forgot to mention that the threads don't touch their chunks of memory
concurrently, i.e. thread 2 has to wait for thread 1 to finish first.
This is important to note, since the pages won't all get stuck on the
first node without this behavior.
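
In rough C the pattern looks something like this (simplified sketch, not the
actual test source):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

#define NTHREADS 128

static char *array;
static size_t array_sz;                 /* e.g. the 64gb total from the runs above */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int turn;                        /* which thread may touch memory now */

static void *worker(void *arg)
{
        int id = (int)(long)arg;
        size_t chunk = array_sz / NTHREADS;     /* array split into 128 chunks */
        size_t slice = chunk / NTHREADS;        /* my 1/128th of each chunk */
        int i;

        pthread_mutex_lock(&lock);
        while (turn != id)                      /* strictly one thread at a time */
                pthread_cond_wait(&cond, &lock);
        for (i = 0; i < NTHREADS; i++)          /* touch my slice of every chunk */
                memset(array + i * chunk + id * slice, 1, slice);
        turn++;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t tid[NTHREADS];
        long i;

        array_sz = 64UL << 30;
        array = malloc(array_sz);
        if (!array)
                return 1;
        for (i = 0; i < NTHREADS; i++)
                pthread_create(&tid[i], NULL, worker, (void *)i);
        for (i = 0; i < NTHREADS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}

The turn/cond ordering is the important part: without it the pages wouldn't
all get stuck on the first node.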

2013-09-05 11:15:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy


* Alex Thorlton <[email protected]> wrote:

> > Robin,
> >
> > I tweaked one of our other tests to behave pretty much exactly as I
> > - malloc a large array
> > - Spawn a specified number of threads
> > - Have each thread touch small, evenly spaced chunks of the array (e.g.
> > for 128 threads, the array is divided into 128 chunks, and each thread
> > touches 1/128th of each chunk, dividing the array into 16,384 pieces)
>
> Forgot to mention that the threads don't touch their chunks of memory
> concurrently, i.e. thread 2 has to wait for thread 1 to finish first.
> This is important to note, since the pages won't all get stuck on the
> first node without this behavior.

Could you post the testcase please?

Thanks,

Ingo

2013-09-09 16:48:18

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH 1/8] THP: Use real address for NUMA policy

On Thu, Sep 05, 2013 at 01:15:10PM +0200, Ingo Molnar wrote:
>
> * Alex Thorlton <[email protected]> wrote:
>
> > > Robin,
> > >
> > > I tweaked one of our other tests to behave pretty much exactly as I
> > > - malloc a large array
> > > - Spawn a specified number of threads
> > > - Have each thread touch small, evenly spaced chunks of the array (e.g.
> > > for 128 threads, the array is divided into 128 chunks, and each thread
> > > touches 1/128th of each chunk, dividing the array into 16,384 pieces)
> >
> > Forgot to mention that the threads don't touch their chunks of memory
> > concurrently, i.e. thread 2 has to wait for thread 1 to finish first.
> > This is important to note, since the pages won't all get stuck on the
> > first node without this behavior.
>
> Could you post the testcase please?
>
> Thanks,
>
> Ingo

Sorry for the delay here, had to make sure that everything in my tests
was okay to push out to the public. Here's a pointer to the test I
wrote:

ftp://shell.sgi.com/collect/appsx_test/pthread_test.tar.gz

Everything to compile the test should be there (just run make in the
thp_pthread directory). To run the test use something like:

time ./thp_pthread -C 0 -m 0 -c <max_cores> -b <memory>

I ran:

time ./thp_pthread -C 0 -m 0 -c 128 -b 128g

On a 256 core machine, with ~500gb of memory and got these results:

THP off:

real 0m57.797s
user 46m22.156s
sys 6m14.220s

THP on:

real 1m36.906s
user 0m2.612s
sys 143m13.764s

I snagged some code from another test we use, so I can't vouch for the
usefulness/accuracy of all the output (actually, I know some of it is
wrong). I've mainly been looking at the total run time.

Don't want to bloat this e-mail up with too many test results, but I
found this one really interesting. Same machine, using all the cores,
with the same amount of memory. This means that each cpu is actually
doing *less* work, since the chunk we reserve gets divided up evenly
amongst the cpus:

time ./thp_pthread -C 0 -m 0 -c 256 -b 128g

THP off:

real 1m1.028s
user 104m58.448s
sys 8m52.908s

THP on:

real 2m26.072s
user 60m39.404s
sys 337m10.072s

Seems that the test scales really well in the THP off case, but, once
again, with THP on, we really see the performance start to degrade.

I'm planning to start investigating possible ways to split up THPs, if
we detect that the majority of the references to a THP are off-node.
I've heard some horror stories about migrating pages in this situation
(i.e., process switches cpu and then all the pages follow it), but I
think we might be able to get some better results if we can cleverly
determine an appropriate time to split up pages. I've heard a bit of
talk about doing something similar to this from a few people, but
haven't seen any code/test results. If anybody has any input on that
topic, it would be greatly appreciated.

- Alex

2013-09-10 07:47:54

by Ingo Molnar

[permalink] [raw]
Subject: [benchmark] THP performance testcase


( Changed the subject line to make it stand out better on lkml. Mail with
link & results quoted below. )

* Alex Thorlton <[email protected]> wrote:

> [...] Here's a pointer to the test I wrote:
>
> ftp://shell.sgi.com/collect/appsx_test/pthread_test.tar.gz
>
> Everything to compile the test should be there (just run make in the
> thp_pthread directory). To run the test use something like:
>
> time ./thp_pthread -C 0 -m 0 -c <max_cores> -b <memory>
>
> I ran:
>
> time ./thp_pthread -C 0 -m 0 -c 128 -b 128g
>
> On a 256 core machine, with ~500gb of memory and got these results:
>
> THP off:
>
> real 0m57.797s
> user 46m22.156s
> sys 6m14.220s
>
> THP on:
>
> real 1m36.906s
> user 0m2.612s
> sys 143m13.764s
>
> I snagged some code from another test we use, so I can't vouch for the
> usefulness/accuracy of all the output (actually, I know some of it is
> wrong). I've mainly been looking at the total run time.
>
> Don't want to bloat this e-mail up with too many test results, but I
> found this one really interesting. Same machine, using all the cores,
> with the same amount of memory. This means that each cpu is actually
> doing *less* work, since the chunk we reserve gets divided up evenly
> amongst the cpus:
>
> time ./thp_pthread -C 0 -m 0 -c 256 -b 128g
>
> THP off:
>
> real 1m1.028s
> user 104m58.448s
> sys 8m52.908s
>
> THP on:
>
> real 2m26.072s
> user 60m39.404s
> sys 337m10.072s
>
> Seems that the test scales really well in the THP off case, but, once
> again, with THP on, we really see the performance start to degrade.
>
> I'm planning to start investigating possible ways to split up THPs, if
> we detect that the majority of the references to a THP are off-node.
> I've heard some horror stories about migrating pages in this situation
> (i.e., process switches cpu and then all the pages follow it), but I
> think we might be able to get some better results if we can cleverly
> determine an appropriate time to split up pages. I've heard a bit of
> talk about doing something similar to this from a few people, but
> haven't seen any code/test results. If anybody has any input on that
> topic, it would be greatly appreciated.

2013-09-13 13:06:36

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 2/9] mm: convert mm->nr_ptes to atomic_t

With split page table lock for PMD level we can't hold
mm->page_table_lock while updating nr_ptes.

Let's convert it to atomic_t to avoid races.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
fs/proc/task_mmu.c | 2 +-
include/linux/mm_types.h | 2 +-
kernel/fork.c | 2 +-
mm/huge_memory.c | 10 +++++-----
mm/memory.c | 4 ++--
mm/mmap.c | 3 ++-
mm/oom_kill.c | 6 +++---
7 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7366e9d..d45d423 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -62,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+ (PTRS_PER_PTE*sizeof(pte_t)*atomic_read(&mm->nr_ptes)) >> 10,
swap << (PAGE_SHIFT-10));
}

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fe0a4bb..1c64730 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -338,6 +338,7 @@ struct mm_struct {
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
+ atomic_t nr_ptes; /* Page table pages */
int map_count; /* number of VMAs */

spinlock_t page_table_lock; /* Protects page tables and some counters */
@@ -359,7 +360,6 @@ struct mm_struct {
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */
unsigned long stack_vm; /* VM_GROWSUP/DOWN */
unsigned long def_flags;
- unsigned long nr_ptes; /* Page table pages */
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
diff --git a/kernel/fork.c b/kernel/fork.c
index 81ccb4f..4c8b986 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm->flags = (current->mm) ?
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
- mm->nr_ptes = 0;
+ atomic_set(&mm->nr_ptes, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
mm_init_aio(mm);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7489884..bbd41a2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -737,7 +737,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
- mm->nr_ptes++;
+ atomic_inc(&mm->nr_ptes);
spin_unlock(&mm->page_table_lock);
}

@@ -778,7 +778,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- mm->nr_ptes++;
+ atomic_inc(&mm->nr_ptes);
return true;
}

@@ -903,7 +903,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd = pmd_mkold(pmd_wrprotect(pmd));
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- dst_mm->nr_ptes++;
+ atomic_inc(&dst_mm->nr_ptes);

ret = 0;
out_unlock:
@@ -1358,7 +1358,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
- tlb->mm->nr_ptes--;
+ atomic_dec(&tlb->mm->nr_ptes);
spin_unlock(&tlb->mm->page_table_lock);
put_huge_zero_page();
} else {
@@ -1367,7 +1367,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(page_mapcount(page) < 0);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
- tlb->mm->nr_ptes--;
+ atomic_dec(&tlb->mm->nr_ptes);
spin_unlock(&tlb->mm->page_table_lock);
tlb_remove_page(tlb, page);
}
diff --git a/mm/memory.c b/mm/memory.c
index ca00039..1046396 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
pgtable_t token = pmd_pgtable(*pmd);
pmd_clear(pmd);
pte_free_tlb(tlb, token, addr);
- tlb->mm->nr_ptes--;
+ atomic_dec(&tlb->mm->nr_ptes);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -575,7 +575,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mm->page_table_lock);
wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
- mm->nr_ptes++;
+ atomic_inc(&mm->nr_ptes);
pmd_populate(mm, pmd, new);
new = NULL;
} else if (unlikely(pmd_trans_splitting(*pmd)))
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d54851..1d0efbc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2726,7 +2726,8 @@ void exit_mmap(struct mm_struct *mm)
}
vm_unacct_memory(nr_accounted);

- WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+ WARN_ON(atomic_read(&mm->nr_ptes) >
+ (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}

/* Insert vm structure into process list sorted by address
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d2..7ab394e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
- points = get_mm_rss(p->mm) + p->mm->nr_ptes +
+ points = get_mm_rss(p->mm) + atomic_read(&p->mm->nr_ptes) +
get_mm_counter(p->mm, MM_SWAPENTS);
task_unlock(p);

@@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
continue;
}

- pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
+ pr_info("[%5d] %5d %5d %8lu %8lu %7d %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- task->mm->nr_ptes,
+ atomic_read(&task->mm->nr_ptes),
get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
--
1.8.4.rc3
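
As a rough userspace sketch of the pattern above (C11 atomics stand in
for the kernel's atomic_t; fault_path() and the counter name are
illustrative, not kernel API), several threads bump the counter with no
shared lock held, which is the situation a split PMD lock creates for
nr_ptes:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for mm->nr_ptes: updated from many threads without any
 * mm-wide lock held, as happens once the PMD ptlock is split. */
static atomic_int nr_ptes;

static void *fault_path(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++)
                atomic_fetch_add(&nr_ptes, 1);  /* cf. atomic_inc(&mm->nr_ptes) */
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, fault_path, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);

        /* cf. atomic_read(&mm->nr_ptes) */
        printf("nr_ptes = %d\n", atomic_load(&nr_ptes));
        return 0;
}

With a plain unsigned long the concurrent increments could be lost;
atomic_fetch_add() keeps the count exact, which is what the
atomic_inc()/atomic_dec()/atomic_read() conversion buys the kernel.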

2013-09-13 13:06:49

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 6/9] mm, thp: do not access mm->pmd_huge_pte directly

Currently, mm->pmd_huge_pte is protected by the page table lock. That
will not work with a split lock: we need a per-pmd pmd_huge_pte for
proper access serialization.

For now, let's just introduce a wrapper to access mm->pmd_huge_pte.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/s390/mm/pgtable.c | 12 ++++++------
arch/sparc/mm/tlb.c | 12 ++++++------
include/linux/mm.h | 1 +
mm/pgtable-generic.c | 12 ++++++------
4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index de8cbc3..c463e5c 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1225,11 +1225,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
assert_spin_locked(&mm->page_table_lock);

/* FIFO */
- if (!mm->pmd_huge_pte)
+ if (!pmd_huge_pte(mm, pmdp))
INIT_LIST_HEAD(lh);
else
- list_add(lh, (struct list_head *) mm->pmd_huge_pte);
- mm->pmd_huge_pte = pgtable;
+ list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+ pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
@@ -1241,12 +1241,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
assert_spin_locked(&mm->page_table_lock);

/* FIFO */
- pgtable = mm->pmd_huge_pte;
+ pgtable = pmd_huge_pte(mm, pmdp);
lh = (struct list_head *) pgtable;
if (list_empty(lh))
- mm->pmd_huge_pte = NULL;
+ pmd_huge_pte(mm, pmdp) = NULL;
else {
- mm->pmd_huge_pte = (pgtable_t) lh->next;
+ pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
list_del(lh);
}
ptep = (pte_t *) pgtable;
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 7a91f28..656cc46 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -196,11 +196,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
assert_spin_locked(&mm->page_table_lock);

/* FIFO */
- if (!mm->pmd_huge_pte)
+ if (!pmd_huge_pte(mm, pmdp))
INIT_LIST_HEAD(lh);
else
- list_add(lh, (struct list_head *) mm->pmd_huge_pte);
- mm->pmd_huge_pte = pgtable;
+ list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+ pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
@@ -211,12 +211,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
assert_spin_locked(&mm->page_table_lock);

/* FIFO */
- pgtable = mm->pmd_huge_pte;
+ pgtable = pmd_huge_pte(mm, pmdp);
lh = (struct list_head *) pgtable;
if (list_empty(lh))
- mm->pmd_huge_pte = NULL;
+ pmd_huge_pte(mm, pmdp) = NULL;
else {
- mm->pmd_huge_pte = (pgtable_t) lh->next;
+ pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
list_del(lh);
}
pte_val(pgtable[0]) = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d4361e7..d2f8a50 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1299,6 +1299,7 @@ static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
return &mm->page_table_lock;
}

+#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)

static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 3929a40..41fee3e 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -154,11 +154,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
assert_spin_locked(&mm->page_table_lock);

/* FIFO */
- if (!mm->pmd_huge_pte)
+ if (!pmd_huge_pte(mm, pmdp))
INIT_LIST_HEAD(&pgtable->lru);
else
- list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
- mm->pmd_huge_pte = pgtable;
+ list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
+ pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
@@ -173,11 +173,11 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
assert_spin_locked(&mm->page_table_lock);

/* FIFO */
- pgtable = mm->pmd_huge_pte;
+ pgtable = pmd_huge_pte(mm, pmdp);
if (list_empty(&pgtable->lru))
- mm->pmd_huge_pte = NULL;
+ pmd_huge_pte(mm, pmdp) = NULL;
else {
- mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+ pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
struct page, lru);
list_del(&pgtable->lru);
}
--
1.8.4.rc3
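
A minimal sketch of why the wrapper helps, in plain userspace C
(struct mm_struct_sketch, deposit() and withdraw() are made-up names,
not kernel API): every caller goes through one lvalue macro that today
expands to the per-mm field, so later switching the expansion to
per-PMD-page storage does not touch any call site.

#include <stdio.h>

typedef struct pgtable { int filler; } *pgtable_t;

/* Per-mm deposit slot, as in today's struct mm_struct. */
struct mm_struct_sketch {
        pgtable_t pmd_huge_pte;
};

/*
 * Accessor in the style of the patch: it is an lvalue, so callers can
 * both read and assign through it. Only this line changes when the
 * storage moves from the mm to the PMD table's struct page.
 */
#define pmd_huge_pte(mm, pmdp) ((mm)->pmd_huge_pte)

static void deposit(struct mm_struct_sketch *mm, void *pmdp, pgtable_t pg)
{
        pmd_huge_pte(mm, pmdp) = pg;
}

static pgtable_t withdraw(struct mm_struct_sketch *mm, void *pmdp)
{
        pgtable_t pg = pmd_huge_pte(mm, pmdp);

        pmd_huge_pte(mm, pmdp) = NULL;
        return pg;
}

int main(void)
{
        struct mm_struct_sketch mm = { NULL };
        struct pgtable pg = { 0 };

        deposit(&mm, NULL, &pg);
        printf("withdrawn: %p\n", (void *)withdraw(&mm, NULL));
        return 0;
}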

2013-09-13 13:06:45

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 8/9] mm: implement split page table lock for PMD level

The basic idea is the same as at the PTE level: the lock is embedded in
the struct page of the table's page.

A split PMD page table lock only makes sense on big machines.
Let's say >= 32 CPUs for now.

We can't use mm->pmd_huge_pte to store pgtables for THP, since we don't
take mm->page_table_lock anymore. Let's reuse page->lru of the table's
page for that.

hugetlbfs hasn't been converted to split locking: disable split locking
if hugetlbfs is enabled.

Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
include/linux/mm.h | 31 +++++++++++++++++++++++++++++++
include/linux/mm_types.h | 5 +++++
kernel/fork.c | 4 ++--
mm/Kconfig | 10 ++++++++++
4 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d2f8a50..5b3922d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1294,13 +1294,44 @@ static inline void pgtable_page_dtor(struct page *page)
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
NULL: pte_offset_kernel(pmd, address))

+#if USE_SPLIT_PMD_PTLOCKS
+
+static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
+{
+ return &virt_to_page(pmd)->ptl;
+}
+
+static inline void pgtable_pmd_page_ctor(struct page *page)
+{
+ spin_lock_init(&page->ptl);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ page->pmd_huge_pte = NULL;
+#endif
+}
+
+static inline void pgtable_pmd_page_dtor(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ VM_BUG_ON(page->pmd_huge_pte);
+#endif
+}
+
+#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte)
+
+#else
+
static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
return &mm->page_table_lock;
}

+static inline void pgtable_pmd_page_ctor(struct page *page) {}
+static inline void pgtable_pmd_page_dtor(struct page *page) {}
+
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)

+#endif
+
static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1c64730..5706ddf 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -24,6 +24,8 @@
struct address_space;

#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTE_PTLOCK_CPUS)
+#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
+ NR_CPUS >= CONFIG_SPLIT_PMD_PTLOCK_CPUS)

/*
* Each physical page in the system has a struct page associated with
@@ -130,6 +132,9 @@ struct page {

struct list_head list; /* slobs list of pages */
struct slab *slab_page; /* slab fields */
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
+ pgtable_t pmd_huge_pte; /* protected by page->ptl */
+#endif
};

/* Remainder is not double word aligned */
diff --git a/kernel/fork.c b/kernel/fork.c
index 4c8b986..1670af7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm)
"mm:%p idx:%d val:%ld\n", mm, i, x);
}

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
VM_BUG_ON(mm->pmd_huge_pte);
#endif
}
@@ -814,7 +814,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
mm_init_cpumask(mm);

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
#ifdef CONFIG_NUMA_BALANCING
diff --git a/mm/Kconfig b/mm/Kconfig
index 1977a33..ab32eda 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -214,6 +214,16 @@ config SPLIT_PTE_PTLOCK_CPUS
default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
default "4"

+config ARCH_ENABLE_SPLIT_PMD_PTLOCK
+ boolean
+
+config SPLIT_PMD_PTLOCK_CPUS
+ int
+ # hugetlb hasn't converted to split locking yet
+ default "999999" if HUGETLB_PAGE
+ default "32" if ARCH_ENABLE_SPLIT_PMD_PTLOCK
+ default "999999"
+
#
# support for memory balloon compaction
config BALLOON_COMPACTION
--
1.8.4.rc3
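
The effect of the split can be sketched in userspace C (pthread mutexes
stand in for spinlock_t, and struct ptpage for struct page; none of
these names are kernel API): each table page carries its own lock,
huge_pmd_lockptr() derives the lock from the table rather than from the
mm, and faults touching different PMD tables no longer serialize on one
mm-wide lock.

#include <pthread.h>
#include <stdio.h>

struct ptpage {
        pthread_mutex_t ptl;    /* per-table lock, cf. page->ptl */
        void *pmd_huge_pte;     /* deposited pgtable, protected by ptl */
};

static pthread_mutex_t *huge_pmd_lockptr(struct ptpage *table)
{
        return &table->ptl;
}

static pthread_mutex_t *huge_pmd_lock(struct ptpage *table)
{
        pthread_mutex_t *ptl = huge_pmd_lockptr(table);

        pthread_mutex_lock(ptl);
        return ptl;             /* caller unlocks whatever it was handed */
}

int main(void)
{
        struct ptpage a = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct ptpage b = { PTHREAD_MUTEX_INITIALIZER, NULL };

        /* Faults on different PMD tables take different locks. */
        pthread_mutex_t *pa = huge_pmd_lock(&a);
        pthread_mutex_t *pb = huge_pmd_lock(&b);

        printf("locks differ: %d\n", pa != pb);
        pthread_mutex_unlock(pb);
        pthread_mutex_unlock(pa);
        return 0;
}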

2013-09-13 13:36:40

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 8/9] mm: implement split page table lock for PMD level

On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote:
> +#if USE_SPLIT_PMD_PTLOCKS
> +
> +static inline void pgtable_pmd_page_ctor(struct page *page)
> +{
> + spin_lock_init(&page->ptl);
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + page->pmd_huge_pte = NULL;
> +#endif
> +}
> +
> +static inline void pgtable_pmd_page_dtor(struct page *page)
> +{
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + VM_BUG_ON(page->pmd_huge_pte);
> +#endif
> +}
> +
> +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte)
> +
> +#else

So on -rt we have the problem that spinlock_t is rather huge (it's an
rtmutex), so instead of blowing up the page frame like that we treat
page->ptl as a pointer and allocate the spinlock.

Since allocations can fail, the above ctor path gets 'interesting'.

It would be good if new code assumed the ctor can fail, so we don't
have to replicate that horror show.


---
From: Peter Zijlstra <[email protected]>
Date: Fri, 3 Jul 2009 08:44:54 -0500
Subject: mm: shrink the page frame to !-rt size

Below is a boot-tested hack to shrink the page frame size back to
normal.

Should be a net win, since there should be many fewer PTE pages than
page frames.

Signed-off-by: Peter Zijlstra <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
include/linux/mm.h | 46 +++++++++++++++++++++++++++++++++++++++-------
include/linux/mm_types.h | 4 ++++
mm/memory.c | 32 ++++++++++++++++++++++++++++++++
3 files changed, 75 insertions(+), 7 deletions(-)

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1241,27 +1241,59 @@ static inline pmd_t *pmd_alloc(struct mm
* overflow into the next struct page (as it might with DEBUG_SPINLOCK).
* When freeing, reset page->mapping so free_pages_check won't complain.
*/
+#ifndef CONFIG_PREEMPT_RT_FULL
+
#define __pte_lockptr(page) &((page)->ptl)
-#define pte_lock_init(_page) do { \
- spin_lock_init(__pte_lockptr(_page)); \
-} while (0)
+
+static inline struct page *pte_lock_init(struct page *page)
+{
+ spin_lock_init(__pte_lockptr(page));
+ return page;
+}
+
#define pte_lock_deinit(page) ((page)->mapping = NULL)
+
+#else /* !PREEMPT_RT_FULL */
+
+/*
+ * On PREEMPT_RT_FULL the spinlock_t's are too large to embed in the
+ * page frame, hence it only has a pointer and we need to dynamically
+ * allocate the lock when we allocate PTE-pages.
+ *
+ * This is an overall win, since only a small fraction of the pages
+ * will be PTE pages under normal circumstances.
+ */
+
+#define __pte_lockptr(page) ((page)->ptl)
+
+extern struct page *pte_lock_init(struct page *page);
+extern void pte_lock_deinit(struct page *page);
+
+#endif /* PREEMPT_RT_FULL */
+
#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
#else /* !USE_SPLIT_PTLOCKS */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
*/
-#define pte_lock_init(page) do {} while (0)
+static inline struct page *pte_lock_init(struct page *page) { return page; }
#define pte_lock_deinit(page) do {} while (0)
#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
#endif /* USE_SPLIT_PTLOCKS */

-static inline void pgtable_page_ctor(struct page *page)
+static inline struct page *__pgtable_page_ctor(struct page *page)
{
- pte_lock_init(page);
- inc_zone_page_state(page, NR_PAGETABLE);
+ page = pte_lock_init(page);
+ if (page)
+ inc_zone_page_state(page, NR_PAGETABLE);
+ return page;
}

+#define pgtable_page_ctor(page) \
+do { \
+ page = __pgtable_page_ctor(page); \
+} while (0)
+
static inline void pgtable_page_dtor(struct page *page)
{
pte_lock_deinit(page);
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -142,7 +142,11 @@ struct page {
* system if PG_buddy is set.
*/
#if USE_SPLIT_PTLOCKS
+# ifndef CONFIG_PREEMPT_RT_FULL
spinlock_t ptl;
+# else
+ spinlock_t *ptl;
+# endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
struct page *first_page; /* Compound tail pages */
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4328,3 +4328,35 @@ void copy_user_huge_page(struct page *ds
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
+
+#if defined(CONFIG_PREEMPT_RT_FULL) && (USE_SPLIT_PTLOCKS > 0)
+/*
+ * Heinous hack, relies on the caller doing something like:
+ *
+ * pte = alloc_pages(PGALLOC_GFP, 0);
+ * if (pte)
+ * pgtable_page_ctor(pte);
+ * return pte;
+ *
+ * This ensures we release the page and return NULL when the
+ * lock allocation fails.
+ */
+struct page *pte_lock_init(struct page *page)
+{
+ page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+ if (page->ptl) {
+ spin_lock_init(__pte_lockptr(page));
+ } else {
+ __free_page(page);
+ page = NULL;
+ }
+ return page;
+}
+
+void pte_lock_deinit(struct page *page)
+{
+ kfree(page->ptl);
+ page->mapping = NULL;
+}
+
+#endif

2013-09-13 13:39:17

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 8/9] mm: implement split page table lock for PMD level

On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote:
> The basic idea is the same as with PTE level: the lock is embedded into
> struct page of table's page.
>
> Split pmd page table lock only makes sense on big machines.
> Let's say >= 32 CPUs for now.

Why is this? Couldn't I generate the same amount of contention on PMD
level as I can on PTE level in the THP case?

2013-09-13 13:46:22

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/9] mm: introduce api for split page table lock for PMD level

On Fri, Sep 13, 2013 at 04:06:10PM +0300, Kirill A. Shutemov wrote:
> Basic api, backed by mm->page_table_lock for now. Actual implementation
> will be added later.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
> include/linux/mm.h | 13 +++++++++++++
> 1 file changed, 13 insertions(+)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 6cf8ddb..d4361e7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1294,6 +1294,19 @@ static inline void pgtable_page_dtor(struct page *page)
> ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
> NULL: pte_offset_kernel(pmd, address))
>
> +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +{
> + return &mm->page_table_lock;
> +}
> +
> +
> +static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
> +{
> + spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
> + spin_lock(ptl);
> + return ptl;
> +}

Why not call the thing pmd_lock()? The pmd bit differentiates it from
pte_lock() enough, IMO.

2013-09-13 13:06:52

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 7/9] mm: convert the rest to new page table lock api

Only trivial cases are left. Let's convert them all at once.

hugetlbfs is not covered for now.

Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
mm/huge_memory.c | 108 ++++++++++++++++++++++++++++-----------------------
mm/memory.c | 17 ++++----
mm/migrate.c | 7 ++--
mm/mprotect.c | 4 +-
mm/pgtable-generic.c | 4 +-
5 files changed, 77 insertions(+), 63 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4b58a01..e728d74 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -709,6 +709,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct page *page)
{
pgtable_t pgtable;
+ spinlock_t *ptl;

VM_BUG_ON(!PageCompound(page));
pgtable = pte_alloc_one(mm, haddr);
@@ -723,9 +724,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
__SetPageUptodate(page);

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
if (unlikely(!pmd_none(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(page);
put_page(page);
pte_free(mm, pgtable);
@@ -738,7 +739,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
set_pmd_at(mm, haddr, pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_inc(&mm->nr_ptes);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}

return 0;
@@ -766,6 +767,7 @@ static inline struct page *alloc_hugepage(int defrag)
}
#endif

+/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -797,6 +799,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE) &&
transparent_hugepage_use_zero_page()) {
+ spinlock_t *ptl;
pgtable_t pgtable;
struct page *zero_page;
bool set;
@@ -809,10 +812,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
zero_page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
@@ -845,6 +848,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
{
+ spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
pmd_t pmd;
pgtable_t pgtable;
@@ -855,8 +859,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;

- spin_lock(&dst_mm->page_table_lock);
- spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+ dst_ptl = huge_pmd_lock(dst_mm, dst_pmd);
+ src_ptl = huge_pmd_lockptr(src_mm, src_pmd);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

ret = -EAGAIN;
pmd = *src_pmd;
@@ -865,7 +870,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
/*
- * mm->page_table_lock is enough to be sure that huge zero pmd is not
+ * When page table lock is held, the huge zero pmd should not be
* under splitting since we don't split the page itself, only pmd to
* a page table.
*/
@@ -886,8 +891,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
pte_free(dst_mm, pgtable);

wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
@@ -907,8 +912,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,

ret = 0;
out_unlock:
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
out:
return ret;
}
@@ -919,10 +924,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
pmd_t *pmd, pmd_t orig_pmd,
int dirty)
{
+ spinlock_t *ptl;
pmd_t entry;
unsigned long haddr;

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto unlock;

@@ -932,13 +938,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
update_mmu_cache_pmd(vma, address, pmd);

unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}

static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
{
+ spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
struct page *page;
@@ -965,7 +972,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_page;

@@ -992,7 +999,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
inc_mm_counter(mm, MM_ANONPAGES);

@@ -1002,7 +1009,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
out:
return ret;
out_free_page:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_page(page);
put_page(page);
@@ -1016,6 +1023,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct page *page,
unsigned long haddr)
{
+ spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
int ret = 0, i;
@@ -1062,7 +1070,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
@@ -1088,7 +1096,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

@@ -1099,7 +1107,7 @@ out:
return ret;

out_free_pages:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1114,17 +1122,19 @@ out_free_pages:
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
+ spinlock_t *ptl;
int ret = 0;
struct page *page = NULL, *new_page;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */

+ ptl = huge_pmd_lockptr(mm, pmd);
VM_BUG_ON(!vma->anon_vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;

@@ -1140,7 +1150,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_unlock;
}
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
@@ -1187,11 +1197,11 @@ alloc:
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (page)
put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
goto out_mn;
@@ -1213,13 +1223,13 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
return ret;
}

@@ -1231,7 +1241,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;

- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmd));

if (flags & FOLL_WRITE && !pmd_write(*pmd))
goto out;
@@ -1278,13 +1288,14 @@ out:
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
+ spinlock_t *ptl;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
int target_nid;
int current_nid = -1;
bool migrated;

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmdp);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;

@@ -1302,17 +1313,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
}

/* Acquire the page lock to serialise THP migrations */
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
lock_page(page);

/* Confirm the PTE did not while locked */
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
goto out_unlock;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

/* Migrate the THP to the requested node */
migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1324,7 +1335,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;

check_same:
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
clear_pmdnuma:
@@ -1333,7 +1344,7 @@ clear_pmdnuma:
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (current_nid != -1)
task_numa_fault(current_nid, HPAGE_PMD_NR, false);
return 0;
@@ -2282,7 +2293,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pte_t *pte;
pgtable_t pgtable;
struct page *new_page;
- spinlock_t *ptl;
+ spinlock_t *pmd_ptl, *pte_ptl;
int isolated;
unsigned long hstart, hend;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2325,12 +2336,12 @@ static void collapse_huge_page(struct mm_struct *mm,
anon_vma_lock_write(vma->anon_vma);

pte = pte_offset_map(pmd, address);
- ptl = pte_lockptr(mm, pmd);
+ pte_ptl = pte_lockptr(mm, pmd);

mmun_start = address;
mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock); /* probably unnecessary */
+ pmd_ptl = huge_pmd_lock(mm, pmd); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
* any huge TLB entry from the CPU so we won't allow
@@ -2338,16 +2349,16 @@ static void collapse_huge_page(struct mm_struct *mm,
* to avoid the risk of CPU bugs in that area.
*/
_pmd = pmdp_clear_flush(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

- spin_lock(ptl);
+ spin_lock(pte_ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
- spin_unlock(ptl);
+ spin_unlock(pte_ptl);

if (unlikely(!isolated)) {
pte_unmap(pte);
- spin_lock(&mm->page_table_lock);
+ spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
@@ -2355,7 +2366,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* points to regular pagetables. Use pmd_populate for that
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
goto out;
}
@@ -2366,7 +2377,7 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);

- __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
@@ -2381,13 +2392,13 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
smp_wmb();

- spin_lock(&mm->page_table_lock);
+ spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(pmd_ptl);

*hpage = NULL;

@@ -2712,6 +2723,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd)
{
+ spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -2723,22 +2735,22 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

split_huge_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index 1046396..a0ed1d5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -552,6 +552,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long address)
{
+ spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
int wait_split_huge_page;
if (!new)
@@ -572,7 +573,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
atomic_inc(&mm->nr_ptes);
@@ -580,7 +581,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
new = NULL;
} else if (unlikely(pmd_trans_splitting(*pmd)))
wait_split_huge_page = 1;
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (new)
pte_free(mm, new);
if (wait_split_huge_page)
@@ -1516,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
wait_split_huge_page(vma->anon_vma, pmd);
} else {
page = follow_trans_huge_pmd(vma, address,
pmd, flags);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*page_mask = HPAGE_PMD_NR - 1;
goto out;
}
} else
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
/* fall through */
}
split_fallthrough:
@@ -3602,13 +3603,13 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
bool numa = false;
int local_nid = numa_node_id();

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
pmd = *pmdp;
if (pmd_numa(pmd)) {
set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
numa = true;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

if (!numa)
return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index b7ded7e..32eff0c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1653,6 +1653,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
unsigned long address,
struct page *page, int node)
{
+ spinlock_t *ptl;
unsigned long haddr = address & HPAGE_PMD_MASK;
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
@@ -1699,9 +1700,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
WARN_ON(PageLRU(new_page));

/* Recheck the target PMD */
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, entry))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -1746,7 +1747,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
* before it's fully transferred to the new page.
*/
mem_cgroup_end_migration(memcg, page, new_page, true);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

unlock_page(new_page);
unlock_page(page);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4..885cd78 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -116,9 +116,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
pmd_t *pmd)
{
- spin_lock(&mm->page_table_lock);
+ spinlock_t *ptl = huge_pmd_lock(mm, pmd);
set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
#else
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 41fee3e..80587a5 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -151,7 +151,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));

/* FIFO */
if (!pmd_huge_pte(mm, pmdp))
@@ -170,7 +170,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
pgtable_t pgtable;

- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));

/* FIFO */
pgtable = pmd_huge_pte(mm, pmdp);
--
1.8.4.rc3

2013-09-13 14:07:06

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 5/9] mm, thp: move ptl taking inside page_check_address_pmd()

With a split page table lock we can't know which lock we need to take
before we find the relevant pmd.

Let's move the lock taking inside the function.

Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
include/linux/huge_mm.h | 3 ++-
mm/huge_memory.c | 43 +++++++++++++++++++++++++++----------------
mm/rmap.c | 13 +++++--------
3 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4aca0d8..91672e2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -54,7 +54,8 @@ enum page_check_address_pmd_flag {
extern pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag);
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl);

#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index acf5b4d..4b58a01 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1500,23 +1500,33 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
return 0;
}

+/*
+ * This function returns whether a given @page is mapped onto the @address
+ * in the virtual space of @mm.
+ *
+ * When it's true, this function returns *pmd with holding the page table lock
+ * and passing it back to the caller via @ptl.
+ * If it's false, returns NULL without holding the page table lock.
+ */
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag)
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl)
{
- pmd_t *pmd, *ret = NULL;
+ pmd_t *pmd;

if (address & ~HPAGE_PMD_MASK)
- goto out;
+ return NULL;

pmd = mm_find_pmd(mm, address);
if (!pmd)
- goto out;
+ return NULL;
+ *ptl = huge_pmd_lock(mm, pmd);
if (pmd_none(*pmd))
- goto out;
+ goto unlock;
if (pmd_page(*pmd) != page)
- goto out;
+ goto unlock;
/*
* split_vma() may create temporary aliased mappings. There is
* no risk as long as all huge pmd are found and have their
@@ -1526,14 +1536,15 @@ pmd_t *page_check_address_pmd(struct page *page,
*/
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
pmd_trans_splitting(*pmd))
- goto out;
+ goto unlock;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
- ret = pmd;
+ return pmd;
}
-out:
- return ret;
+unlock:
+ spin_unlock(*ptl);
+ return NULL;
}

static int __split_huge_page_splitting(struct page *page,
@@ -1541,6 +1552,7 @@ static int __split_huge_page_splitting(struct page *page,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
pmd_t *pmd;
int ret = 0;
/* For mmu_notifiers */
@@ -1548,9 +1560,8 @@ static int __split_huge_page_splitting(struct page *page,
const unsigned long mmun_end = address + HPAGE_PMD_SIZE;

mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
if (pmd) {
/*
* We can't temporarily set the pmd to null in order
@@ -1561,8 +1572,8 @@ static int __split_huge_page_splitting(struct page *page,
*/
pmdp_splitting_flush(vma, address, pmd);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

return ret;
@@ -1693,14 +1704,14 @@ static int __split_huge_page_map(struct page *page,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
pmd_t *pmd, _pmd;
int ret = 0, i;
pgtable_t pgtable;
unsigned long haddr;

- spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
if (pmd) {
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -1755,8 +1766,8 @@ static int __split_huge_page_map(struct page *page,
pmdp_invalidate(vma, address, pmd);
pmd_populate(mm, pmd, pgtable);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);

return ret;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index fd3ee7a..b59d741 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -665,25 +665,23 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
int referenced = 0;

if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;

- spin_lock(&mm->page_table_lock);
/*
* rmap might return false positives; we must filter
* these out using page_check_address_pmd().
*/
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG);
- if (!pmd) {
- spin_unlock(&mm->page_table_lock);
+ PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+ if (!pmd)
goto out;
- }

if (vma->vm_flags & VM_LOCKED) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*mapcount = 0; /* break early from loop */
*vm_flags |= VM_LOCKED;
goto out;
@@ -692,10 +690,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
} else {
pte_t *pte;
- spinlock_t *ptl;

/*
* rmap might return false positives; we must filter
--
1.8.4.rc3
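
The calling convention the patch settles on can be sketched in
userspace C (struct table and find_entry() are made-up stand-ins, not
kernel API): the lookup picks the right lock itself, returns with it
held and hands it back through a lock-pointer out-parameter on success,
and returns NULL with the lock already dropped on failure; that is how
the rmap caller above uses page_check_address_pmd().

#include <pthread.h>
#include <stdio.h>

struct table {
        pthread_mutex_t lock;
        int entry_present;
};

static int *find_entry(struct table *t, pthread_mutex_t **ptl)
{
        *ptl = &t->lock;
        pthread_mutex_lock(*ptl);
        if (!t->entry_present) {
                pthread_mutex_unlock(*ptl);     /* failure: return unlocked */
                return NULL;
        }
        return &t->entry_present;               /* success: *ptl still held */
}

int main(void)
{
        struct table t = { PTHREAD_MUTEX_INITIALIZER, 1 };
        pthread_mutex_t *ptl;
        int *e = find_entry(&t, &ptl);

        if (e) {
                printf("entry found, lock held\n");
                pthread_mutex_unlock(ptl);      /* caller must unlock */
        }
        return 0;
}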

2013-09-13 14:07:04

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock

With a split ptlock it's important to know which lock
pmd_trans_huge_lock() took. This patch adds one more parameter to the
function to return the lock.

In most places the migration to the new api is trivial. The exception
is move_huge_pmd(): we need to take two locks if the pmd tables are
different.

Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
fs/proc/task_mmu.c | 13 +++++++------
include/linux/huge_mm.h | 14 +++++++-------
mm/huge_memory.c | 40 +++++++++++++++++++++++++++-------------
mm/memcontrol.c | 10 +++++-----
4 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index d45d423..bbf7420 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -505,9 +505,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;

- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
@@ -993,13 +993,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
{
struct vm_area_struct *vma;
struct pagemapread *pm = walk->private;
+ spinlock_t *ptl;
pte_t *pte;
int err = 0;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));

/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
- if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
int pmd_flags2;

if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1017,7 +1018,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (err)
break;
}
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return err;
}

@@ -1319,7 +1320,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,

md = walk->private;

- if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;

@@ -1327,7 +1328,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 3935428..4aca0d8 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -129,15 +129,15 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next);
-extern int __pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma);
+extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl);
/* mmap_sem must be held on entry */
-static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
if (pmd_trans_huge(*pmd))
- return __pmd_trans_huge_lock(pmd, vma);
+ return __pmd_trans_huge_lock(pmd, vma, ptl);
else
return 0;
}
@@ -215,8 +215,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
long adjust_next)
{
}
-static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bbd41a2..acf5b4d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1342,9 +1342,10 @@ out_unlock:
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
+ spinlock_t *ptl;
int ret = 0;

- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
@@ -1359,7 +1360,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
atomic_dec(&tlb->mm->nr_ptes);
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
@@ -1368,7 +1369,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
atomic_dec(&tlb->mm->nr_ptes);
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
tlb_remove_page(tlb, page);
}
pte_free(tlb->mm, pgtable);
@@ -1381,14 +1382,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec)
{
+ spinlock_t *ptl;
int ret = 0;

- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
ret = 1;
}
@@ -1401,6 +1403,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
+ spinlock_t *old_ptl, *new_ptl;
int ret = 0;
pmd_t pmd;

@@ -1421,12 +1424,21 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}

- ret = __pmd_trans_huge_lock(old_pmd, vma);
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_sem prevents deadlock.
+ */
+ ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
if (ret == 1) {
+ new_ptl = huge_pmd_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- spin_unlock(&mm->page_table_lock);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
}
out:
return ret;
@@ -1436,9 +1448,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
int ret = 0;

- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
if (!prot_numa) {
@@ -1454,7 +1467,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
}
set_pmd_at(mm, addr, pmd, entry);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
ret = 1;
}

@@ -1468,12 +1481,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
- spin_lock(&vma->vm_mm->page_table_lock);
+ *ptl = huge_pmd_lock(vma->vm_mm, pmd);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
} else {
@@ -1482,7 +1496,7 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
return 1;
}
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
return 0;
}

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce..5f35b2a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6376,10 +6376,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;

- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}

@@ -6568,9 +6568,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6587,7 +6587,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}

--
1.8.4.rc3
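
The move_huge_pmd() case mentioned above reduces to a small locking
shape, sketched here in userspace C (struct ptable and move_entry()
are illustrative stand-ins, not kernel API): take the source table's
lock, take the destination's only when it is a different lock, and
release in reverse order. With exclusive mmap_sem held there is no
lock-ordering concern, which is what the comment added in the patch
points out.

#include <pthread.h>
#include <stdio.h>

struct ptable {
        pthread_mutex_t ptl;
        int slot;
};

static void move_entry(struct ptable *src, struct ptable *dst)
{
        pthread_mutex_t *old_ptl = &src->ptl;
        pthread_mutex_t *new_ptl = &dst->ptl;

        pthread_mutex_lock(old_ptl);
        if (new_ptl != old_ptl)         /* same table means same lock */
                pthread_mutex_lock(new_ptl);

        dst->slot = src->slot;          /* cf. pmdp_get_and_clear + set_pmd_at */
        src->slot = 0;

        if (new_ptl != old_ptl)
                pthread_mutex_unlock(new_ptl);
        pthread_mutex_unlock(old_ptl);
}

int main(void)
{
        struct ptable a = { PTHREAD_MUTEX_INITIALIZER, 42 };
        struct ptable b = { PTHREAD_MUTEX_INITIALIZER, 0 };

        move_entry(&a, &b);
        printf("moved: %d\n", b.slot);
        return 0;
}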

2013-09-13 13:06:42

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 9/9] x86, mm: enable split page table lock for PMD level

Enable PMD split page table lock for X86_64 and PAE.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/Kconfig | 4 ++++
arch/x86/include/asm/pgalloc.h | 8 +++++++-
2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 30c40f0..6a5cf6a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1880,6 +1880,10 @@ config USE_PERCPU_NUMA_NODE_ID
def_bool y
depends on NUMA

+config ARCH_ENABLE_SPLIT_PMD_PTLOCK
+ def_bool y
+ depends on X86_64 || X86_PAE
+
menu "Power management and ACPI options"

config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index b4389a4..f2daea1 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -80,12 +80,18 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
#if PAGETABLE_LEVELS > 2
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+ struct page *page;
+ page = alloc_pages(GFP_KERNEL | __GFP_REPEAT| __GFP_ZERO, 0);
+ if (!page)
+ return NULL;
+ pgtable_pmd_page_ctor(page);
+ return (pmd_t *)page_address(page);
}

static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ pgtable_pmd_page_dtor(virt_to_page(pmd));
free_page((unsigned long)pmd);
}

--
1.8.4.rc3

2013-09-13 13:06:33

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS

We're going to introduce a split page table lock for the PMD level.
Let's rename the existing split ptlock for the PTE level to avoid
confusion.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/arm/mm/fault-armv.c | 6 +++---
arch/um/defconfig | 2 +-
arch/x86/xen/mmu.c | 6 +++---
arch/xtensa/configs/iss_defconfig | 2 +-
arch/xtensa/configs/s6105_defconfig | 2 +-
include/linux/mm.h | 6 +++---
include/linux/mm_types.h | 8 ++++----
mm/Kconfig | 2 +-
8 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 2a5907b..ff379ac 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -65,7 +65,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address,
return ret;
}

-#if USE_SPLIT_PTLOCKS
+#if USE_SPLIT_PTE_PTLOCKS
/*
* If we are using split PTE locks, then we need to take the page
* lock here. Otherwise we are using shared mm->page_table_lock
@@ -84,10 +84,10 @@ static inline void do_pte_unlock(spinlock_t *ptl)
{
spin_unlock(ptl);
}
-#else /* !USE_SPLIT_PTLOCKS */
+#else /* !USE_SPLIT_PTE_PTLOCKS */
static inline void do_pte_lock(spinlock_t *ptl) {}
static inline void do_pte_unlock(spinlock_t *ptl) {}
-#endif /* USE_SPLIT_PTLOCKS */
+#endif /* USE_SPLIT_PTE_PTLOCKS */

static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
unsigned long pfn)
diff --git a/arch/um/defconfig b/arch/um/defconfig
index 08107a7..6b0a10f 100644
--- a/arch/um/defconfig
+++ b/arch/um/defconfig
@@ -82,7 +82,7 @@ CONFIG_FLATMEM_MANUAL=y
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_PAGEFLAGS_EXTENDED=y
-CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_SPLIT_PTE_PTLOCK_CPUS=4
# CONFIG_COMPACTION is not set
# CONFIG_PHYS_ADDR_T_64BIT is not set
CONFIG_ZONE_DMA_FLAG=0
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index fdc3ba2..455c873 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -796,7 +796,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
spinlock_t *ptl = NULL;

-#if USE_SPLIT_PTLOCKS
+#if USE_SPLIT_PTE_PTLOCKS
ptl = __pte_lockptr(page);
spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
@@ -1637,7 +1637,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,

__set_pfn_prot(pfn, PAGE_KERNEL_RO);

- if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);

xen_mc_issue(PARAVIRT_LAZY_MMU);
@@ -1671,7 +1671,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
if (!PageHighMem(page)) {
xen_mc_batch();

- if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);

__set_pfn_prot(pfn, PAGE_KERNEL);
diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig
index 77c52f8..54cc946 100644
--- a/arch/xtensa/configs/iss_defconfig
+++ b/arch/xtensa/configs/iss_defconfig
@@ -174,7 +174,7 @@ CONFIG_FLATMEM_MANUAL=y
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_PAGEFLAGS_EXTENDED=y
-CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_SPLIT_PTE_PTLOCK_CPUS=4
# CONFIG_PHYS_ADDR_T_64BIT is not set
CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig
index 4799c6a..d802f11 100644
--- a/arch/xtensa/configs/s6105_defconfig
+++ b/arch/xtensa/configs/s6105_defconfig
@@ -138,7 +138,7 @@ CONFIG_FLATMEM_MANUAL=y
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_PAGEFLAGS_EXTENDED=y
-CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_SPLIT_PTE_PTLOCK_CPUS=4
# CONFIG_PHYS_ADDR_T_64BIT is not set
CONFIG_ZONE_DMA_FLAG=1
CONFIG_VIRT_TO_BUS=y
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b6e55e..6cf8ddb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1232,7 +1232,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
}
#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */

-#if USE_SPLIT_PTLOCKS
+#if USE_SPLIT_PTE_PTLOCKS
/*
* We tuck a spinlock to guard each pagetable page into its struct page,
* at page->private, with BUILD_BUG_ON to make sure that this will not
@@ -1245,14 +1245,14 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
} while (0)
#define pte_lock_deinit(page) ((page)->mapping = NULL)
#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
-#else /* !USE_SPLIT_PTLOCKS */
+#else /* !USE_SPLIT_PTE_PTLOCKS */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
*/
#define pte_lock_init(page) do {} while (0)
#define pte_lock_deinit(page) do {} while (0)
#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
-#endif /* USE_SPLIT_PTLOCKS */
+#endif /* USE_SPLIT_PTE_PTLOCKS */

static inline void pgtable_page_ctor(struct page *page)
{
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index faf4b7c..fe0a4bb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -23,7 +23,7 @@

struct address_space;

-#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
+#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTE_PTLOCK_CPUS)

/*
* Each physical page in the system has a struct page associated with
@@ -141,7 +141,7 @@ struct page {
* indicates order in the buddy
* system if PG_buddy is set.
*/
-#if USE_SPLIT_PTLOCKS
+#if USE_SPLIT_PTE_PTLOCKS
spinlock_t ptl;
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
@@ -309,14 +309,14 @@ enum {
NR_MM_COUNTERS
};

-#if USE_SPLIT_PTLOCKS && defined(CONFIG_MMU)
+#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
#define SPLIT_RSS_COUNTING
/* per-thread cached information, */
struct task_rss_stat {
int events; /* for synchronization threshold */
int count[NR_MM_COUNTERS];
};
-#endif /* USE_SPLIT_PTLOCKS */
+#endif /* USE_SPLIT_PTE_PTLOCKS */

struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
diff --git a/mm/Kconfig b/mm/Kconfig
index 026771a..1977a33 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -207,7 +207,7 @@ config PAGEFLAGS_EXTENDED
# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
#
-config SPLIT_PTLOCK_CPUS
+config SPLIT_PTE_PTLOCK_CPUS
int
default "999999" if ARM && !CPU_CACHE_VIPT
default "999999" if PARISC && !PA20
--
1.8.4.rc3

2013-09-13 14:09:30

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 3/9] mm: introduce api for split page table lock for PMD level

Basic api, backed by mm->page_table_lock for now. Actual implementation
will be added later.
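
For illustration, a minimal sketch of how a caller is expected to use these
helpers (the function below is hypothetical; the actual conversions of the THP
code come later in the series):

static void example_touch_huge_pmd(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;

	ptl = huge_pmd_lock(mm, pmd);	/* for now this is mm->page_table_lock */
	/* ... inspect or update the huge PMD entry under the lock ... */
	spin_unlock(ptl);
}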

Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
include/linux/mm.h | 13 +++++++++++++
1 file changed, 13 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6cf8ddb..d4361e7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1294,6 +1294,19 @@ static inline void pgtable_page_dtor(struct page *page)
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
NULL: pte_offset_kernel(pmd, address))

+static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
+{
+ return &mm->page_table_lock;
+}
+
+
+static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
+{
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
+ return ptl;
+}
+
extern void free_area_init(unsigned long * zones_size);
extern void free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
--
1.8.4.rc3

2013-09-13 14:09:32

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 0/9] split page table lock for PMD tables

Alex Thorlton noticed that some massively threaded workloads perform poorly
if THP is enabled. This patchset fixes this by introducing a split page table
lock for PMD tables. hugetlbfs is not covered yet.

This patchset is based on work by Naoya Horiguchi.
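
In rough sketch form, the change is to move the PMD-level lock from the single
per-mm lock into the struct page of the page table page itself (the second
form uses the helpers introduced later in the series and only applies once
split PMD locks are enabled):

	/* before: every huge-PMD operation serializes on one per-mm lock */
	spin_lock(&mm->page_table_lock);

	/* after: one lock per PMD page table */
	spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);	/* &virt_to_page(pmd)->ptl */
	spin_lock(ptl);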

Benchmark (from Alex): ftp://shell.sgi.com/collect/appsx_test/pthread_test.tar.gz

THP off:
--------

Performance counter stats for './thp_pthread -C 0 -m 0 -c 80 -b 100g' (5 runs):

1738259.808012 task-clock # 47.571 CPUs utilized ( +- 9.49% )
147,359 context-switches # 0.085 K/sec ( +- 9.67% )
14 cpu-migrations # 0.000 K/sec ( +- 13.25% )
24,410,139 page-faults # 0.014 M/sec ( +- 0.00% )
4,149,037,526,252 cycles # 2.387 GHz ( +- 9.50% )
3,649,839,735,027 stalled-cycles-frontend # 87.97% frontend cycles idle ( +- 6.60% )
2,455,558,969,567 stalled-cycles-backend # 59.18% backend cycles idle ( +- 22.92% )
1,434,961,518,604 instructions # 0.35 insns per cycle
# 2.54 stalled cycles per insn ( +- 92.86% )
241,472,020,951 branches # 138.916 M/sec ( +- 91.72% )
84,022,172 branch-misses # 0.03% of all branches ( +- 3.16% )

36.540185552 seconds time elapsed ( +- 18.36% )

THP on, no patchset:
--------------------
Performance counter stats for './thp_pthread -C 0 -m 0 -c 80 -b 100g' (5 runs):

2528378.966949 task-clock # 50.715 CPUs utilized ( +- 11.86% )
214,063 context-switches # 0.085 K/sec ( +- 11.94% )
19 cpu-migrations # 0.000 K/sec ( +- 22.72% )
49,226 page-faults # 0.019 K/sec ( +- 0.33% )
6,034,640,598,498 cycles # 2.387 GHz ( +- 11.91% )
5,685,933,794,081 stalled-cycles-frontend # 94.22% frontend cycles idle ( +- 7.67% )
4,414,381,393,353 stalled-cycles-backend # 73.15% backend cycles idle ( +- 2.09% )
952,086,804,776 instructions # 0.16 insns per cycle
# 5.97 stalled cycles per insn ( +- 89.59% )
166,191,211,974 branches # 65.730 M/sec ( +- 85.52% )
33,341,022 branch-misses # 0.02% of all branches ( +- 3.90% )

49.854741504 seconds time elapsed ( +- 14.76% )

THP on, with patchset:
----------------------

echo always > /sys/kernel/mm/transparent_hugepage/enabled
Performance counter stats for './thp_pthread -C 0 -m 0 -c 80 -b 100g' (5 runs):

1538763.343568 task-clock # 45.386 CPUs utilized ( +- 7.21% )
130,469 context-switches # 0.085 K/sec ( +- 7.32% )
14 cpu-migrations # 0.000 K/sec ( +- 23.58% )
49,299 page-faults # 0.032 K/sec ( +- 0.15% )
3,666,748,502,650 cycles # 2.383 GHz ( +- 7.25% )
3,330,488,035,212 stalled-cycles-frontend # 90.83% frontend cycles idle ( +- 4.70% )
2,383,357,073,990 stalled-cycles-backend # 65.00% backend cycles idle ( +- 16.06% )
935,504,610,528 instructions # 0.26 insns per cycle
# 3.56 stalled cycles per insn ( +- 91.16% )
161,466,689,532 branches # 104.933 M/sec ( +- 87.67% )
22,602,225 branch-misses # 0.01% of all branches ( +- 6.43% )

33.903917543 seconds time elapsed ( +- 12.57% )

Kirill A. Shutemov (9):
mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS
mm: convert mm->nr_ptes to atomic_t
mm: introduce api for split page table lock for PMD level
mm, thp: change pmd_trans_huge_lock() to return taken lock
mm, thp: move ptl taking inside page_check_address_pmd()
mm, thp: do not access mm->pmd_huge_pte directly
mm: convert the rest to new page table lock api
mm: implement split page table lock for PMD level
x86, mm: enable split page table lock for PMD level

arch/arm/mm/fault-armv.c | 6 +-
arch/s390/mm/pgtable.c | 12 +--
arch/sparc/mm/tlb.c | 12 +--
arch/um/defconfig | 2 +-
arch/x86/Kconfig | 4 +
arch/x86/include/asm/pgalloc.h | 8 +-
arch/x86/xen/mmu.c | 6 +-
arch/xtensa/configs/iss_defconfig | 2 +-
arch/xtensa/configs/s6105_defconfig | 2 +-
fs/proc/task_mmu.c | 15 +--
include/linux/huge_mm.h | 17 +--
include/linux/mm.h | 51 ++++++++-
include/linux/mm_types.h | 15 ++-
kernel/fork.c | 6 +-
mm/Kconfig | 12 ++-
mm/huge_memory.c | 201 +++++++++++++++++++++---------------
mm/memcontrol.c | 10 +-
mm/memory.c | 21 ++--
mm/migrate.c | 7 +-
mm/mmap.c | 3 +-
mm/mprotect.c | 4 +-
mm/oom_kill.c | 6 +-
mm/pgtable-generic.c | 16 +--
mm/rmap.c | 13 +--
24 files changed, 280 insertions(+), 171 deletions(-)

--
1.8.4.rc3

2013-09-13 14:23:13

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH 3/9] mm: introduce api for split page table lock for PMD level

Peter Zijlstra wrote:
> On Fri, Sep 13, 2013 at 04:06:10PM +0300, Kirill A. Shutemov wrote:
> > Basic api, backed by mm->page_table_lock for now. Actual implementation
> > will be added later.
> >
> > Signed-off-by: Naoya Horiguchi <[email protected]>
> > Signed-off-by: Kirill A. Shutemov <[email protected]>
> > ---
> > include/linux/mm.h | 13 +++++++++++++
> > 1 file changed, 13 insertions(+)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 6cf8ddb..d4361e7 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1294,6 +1294,19 @@ static inline void pgtable_page_dtor(struct page *page)
> > ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
> > NULL: pte_offset_kernel(pmd, address))
> >
> > +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
> > +{
> > + return &mm->page_table_lock;
> > +}
> > +
> > +
> > +static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
> > +{
> > + spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
> > + spin_lock(ptl);
> > + return ptl;
> > +}
>
> Why not call the thing pmd_lock()? The pmd bit differentiates it from
> pte_lock() enough IMO.

Okay, will rename it.

--
Kirill A. Shutemov

2013-09-13 14:25:25

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH 8/9] mm: implement split page table lock for PMD level

Peter Zijlstra wrote:
> On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote:
> > The basic idea is the same as with PTE level: the lock is embedded into
> > struct page of table's page.
> >
> > Split pmd page table lock only makes sense on big machines.
> > Let's say >= 32 CPUs for now.
>
> Why is this? Couldn't I generate the same amount of contention on PMD
> level as I can on PTE level in the THP case?

Hm. You are right. You just need more memory for that.
Do you want it to be "4" too?

--
Kirill A. Shutemov

2013-09-13 14:25:53

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH 8/9] mm: implement split page table lock for PMD level

Peter Zijlstra wrote:
> On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote:
> > +#if USE_SPLIT_PMD_PTLOCKS
> > +
> > +static inline void pgtable_pmd_page_ctor(struct page *page)
> > +{
> > + spin_lock_init(&page->ptl);
> > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > + page->pmd_huge_pte = NULL;
> > +#endif
> > +}
> > +
> > +static inline void pgtable_pmd_page_dtor(struct page *page)
> > +{
> > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > + VM_BUG_ON(page->pmd_huge_pte);
> > +#endif
> > +}
> > +
> > +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte)
> > +
> > +#else
>
> So on -rt we have the problem that spinlock_t is rather huge (it's an
> rtmutex), so instead of blowing up the page frame like that we treat
> page->ptl as a pointer and allocate the spinlock.
>
> Since allocations could fail, the above ctor path gets 'interesting'.
>
> It would be good if new code could assume the ctor can fail, so we
> don't have to replicate that horror show.

Okay, I'll rework this.
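
One possible shape for that rework, purely as a sketch: let the ctor report
failure and allocate the lock on configurations where spinlock_t is too big
to embed (ptlock_init()/ptlock_free() are hypothetical helpers here, not part
of this series):

static inline bool pgtable_pmd_page_ctor(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	page->pmd_huge_pte = NULL;
#endif
	/* may allocate the lock (e.g. on -rt) and therefore may fail */
	return ptlock_init(page);
}

static inline void pgtable_pmd_page_dtor(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	VM_BUG_ON(page->pmd_huge_pte);
#endif
	ptlock_free(page);
}

Callers like pmd_alloc_one() would then check the ctor's return value and free
the freshly allocated page table page on failure.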

--
Kirill A. Shutemov

2013-09-13 14:52:27

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 8/9] mm: implement split page table lock for PMD level

On Fri, Sep 13, 2013 at 05:25:13PM +0300, Kirill A. Shutemov wrote:
> Peter Zijlstra wrote:
> > On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote:
> > > The basic idea is the same as with PTE level: the lock is embedded into
> > > struct page of table's page.
> > >
> > > Split pmd page table lock only makes sense on big machines.
> > > Let's say >= 32 CPUs for now.
> >
> > Why is this? Couldn't I generate the same amount of contention on PMD
> > level as I can on PTE level in the THP case?
>
> Hm. You are right. You just need more memory for that.
> Do you want it to be "4" too?

Well, I would drop your patch-1 and use the same config var.

2013-09-13 15:20:42

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS

On 09/13/2013 06:06 AM, Kirill A. Shutemov wrote:
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -207,7 +207,7 @@ config PAGEFLAGS_EXTENDED
> # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
> # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
> #
> -config SPLIT_PTLOCK_CPUS
> +config SPLIT_PTE_PTLOCK_CPUS
> int
> default "999999" if ARM && !CPU_CACHE_VIPT
> default "999999" if PARISC && !PA20

If someone has a config where this is set to a non-default value,
won't changing the name cause it to revert to the defaults?

I don't know how big of a deal it is to other folks, but you can always
do this:

config SPLIT_PTE_PTLOCK_CPUS
int
default SPLIT_PTLOCK_CPUS if SPLIT_PTLOCK_CPUS

2013-09-13 15:46:34

by Naoya Horiguchi

[permalink] [raw]
Subject: Re: [PATCH 8/9] mm: implement split page table lock for PMD level

On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote:
> The basic idea is the same as with PTE level: the lock is embedded into
> struct page of table's page.
>
> Split pmd page table lock only makes sense on big machines.
> Let's say >= 32 CPUs for now.
>
> We can't use mm->pmd_huge_pte to store pgtables for THP, since we don't
> take mm->page_table_lock anymore. Let's reuse page->lru of table's page
> for that.

Looks nice.

> hugetlbfs hasn't converted to split locking: disable split locking if
> hugetlbfs enabled.

I don't think we have to disable it when hugetlbfs is enabled,
because the hugetlbfs code doesn't use huge_pmd_lockptr() or huge_pmd_lock().

> Signed-off-by: Naoya Horiguchi <[email protected]>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
> include/linux/mm.h | 31 +++++++++++++++++++++++++++++++
> include/linux/mm_types.h | 5 +++++
> kernel/fork.c | 4 ++--
> mm/Kconfig | 10 ++++++++++
> 4 files changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index d2f8a50..5b3922d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1294,13 +1294,44 @@ static inline void pgtable_page_dtor(struct page *page)
> ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
> NULL: pte_offset_kernel(pmd, address))
>
> +#if USE_SPLIT_PMD_PTLOCKS
> +
> +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +{
> + return &virt_to_page(pmd)->ptl;
> +}
> +
> +static inline void pgtable_pmd_page_ctor(struct page *page)
> +{
> + spin_lock_init(&page->ptl);
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + page->pmd_huge_pte = NULL;
> +#endif
> +}
> +
> +static inline void pgtable_pmd_page_dtor(struct page *page)
> +{
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + VM_BUG_ON(page->pmd_huge_pte);
> +#endif
> +}
> +
> +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte)
>
> +
> +#else
> +
> static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
> {
> return &mm->page_table_lock;
> }
>
> +static inline void pgtable_pmd_page_ctor(struct page *page) {}
> +static inline void pgtable_pmd_page_dtor(struct page *page) {}
> +
> #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
>
> +#endif
> +
> static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
> {
> spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 1c64730..5706ddf 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -24,6 +24,8 @@
> struct address_space;
>
> #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTE_PTLOCK_CPUS)
> +#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
> + NR_CPUS >= CONFIG_SPLIT_PMD_PTLOCK_CPUS)
>
> /*
> * Each physical page in the system has a struct page associated with
> @@ -130,6 +132,9 @@ struct page {
>
> struct list_head list; /* slobs list of pages */
> struct slab *slab_page; /* slab fields */
> +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
> + pgtable_t pmd_huge_pte; /* protected by page->ptl */
> +#endif
> };
>
> /* Remainder is not double word aligned */

Can we remove pmd_huge_pte from mm_struct when USE_SPLIT_PMD_PTLOCKS is true?
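
Something like the following, as a sketch of that suggestion: guard the
mm_struct field on the complementary condition, so only one copy of the
deposited-pgtable pointer exists.

	/* in struct mm_struct (include/linux/mm_types.h), illustrative only */
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	pgtable_t pmd_huge_pte;		/* protected by mm->page_table_lock */
#endif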

Thanks,
Naoya Horiguchi

2013-09-13 19:58:15

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 8/9] mm: implement split page table lock for PMD level

On 09/13/2013 06:06 AM, Kirill A. Shutemov wrote:
> +config ARCH_ENABLE_SPLIT_PMD_PTLOCK
> + boolean
> +
> +config SPLIT_PMD_PTLOCK_CPUS
> + int
> + # hugetlb hasn't converted to split locking yet
> + default "999999" if HUGETLB_PAGE
> + default "32" if ARCH_ENABLE_SPLIT_PMD_PTLOCK
> + default "999999"

Is there a reason we should have a separate config knob for this, distinct
from SPLIT_PTLOCK_CPUS? Seems a bit silly.