2013-08-02 20:57:18

by Alex Thorlton

[permalink] [raw]
Subject: [PATCHv2] Add per-process flag to control thp

This patch implements functionality to allow processes to disable the use of
transparent hugepages through the prctl syscall.

We've determined that some jobs perform significantly better with thp disabled,
and we needed a way to control thp on a per-process basis, without relying on
madvise.

v2 - tweaked thp_disabled flag to be a single bit instead of an int

---
include/linux/huge_mm.h | 14 +++++++++++++-
include/linux/init_task.h | 8 ++++++++
include/linux/sched.h | 4 ++++
include/uapi/linux/prctl.h | 3 +++
kernel/fork.c | 4 ++++
kernel/sys.c | 31 +++++++++++++++++++++++++++++++
6 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b60de92..53af3ca 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,6 +1,8 @@
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H

+#include <linux/sched.h>
+
extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
@@ -66,7 +68,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,

extern bool is_vma_temporary_stack(struct vm_area_struct *vma);

-#define transparent_hugepage_enabled(__vma) \
+#define _transparent_hugepage_enabled(__vma) \
((transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_FLAG) || \
(transparent_hugepage_flags & \
@@ -177,6 +179,11 @@ static inline struct page *compound_trans_head(struct page *page)
return page;
}

+static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ return !current->thp_disabled & _transparent_hugepage_enabled(vma);
+}
+
extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp);

@@ -230,6 +237,11 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str
return 0;
}

+static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ return _transparent_hugepage_enabled(vma);
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5cd0f09..aae74fd 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -152,6 +152,13 @@ extern struct task_group root_task_group;
# define INIT_VTIME(tsk)
#endif

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+# define INIT_THP_DISABLED \
+ .thp_disabled = 0,
+#else
+# define INIT_THP_DISABLED
+#endif
+
#define INIT_TASK_COMM "swapper"

/*
@@ -222,6 +229,7 @@ extern struct task_group root_task_group;
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_CPUSET_SEQ \
INIT_VTIME(tsk) \
+ INIT_THP_DISABLED \
}


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 50d04b9..c14cf47 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,6 +1117,10 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ unsigned thp_disabled:1;
+#endif
+
pid_t pid;
pid_t tgid;

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 289760f..f69780d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -149,4 +149,7 @@

#define PR_GET_TID_ADDRESS 40

+#define PR_SET_THP_DISABLED 41
+#define PR_GET_THP_DISABLED 42
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 403d2bb..0b4afb5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1311,6 +1311,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->sequential_io_avg = 0;
#endif

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ p->thp_disabled = current->thp_disabled;
+#endif
+
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);

diff --git a/kernel/sys.c b/kernel/sys.c
index 771129b..416c8a6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1836,6 +1836,31 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int prctl_set_thp_disabled(struct task_struct *me)
+{
+ me->thp_disabled = 1;
+ return 0;
+}
+
+static int prctl_get_thp_disabled(struct task_struct *me,
+ int __user *thp_disabled)
+{
+ return put_user(me->thp_disabled, thp_disabled);
+}
+#else
+static int prctl_set_thp_disabled(struct task_struct *me)
+{
+ return -EINVAL;
+}
+
+static int prctl_get_thp_disabled(struct task_struct *me,
+ int __user *thp_disabled)
+{
+ return -EINVAL;
+}
+#endif
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -1999,6 +2024,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
return current->no_new_privs ? 1 : 0;
+ case PR_SET_THP_DISABLED:
+ error = prctl_set_thp_disabled(me);
+ break;
+ case PR_GET_THP_DISABLED:
+ error = prctl_get_thp_disabled(me, (int __user *) arg2);
+ break;
default:
error = -EINVAL;
break;
--
1.7.12.4


2013-08-03 17:07:12

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

On 08/02, Alex Thorlton wrote:
>
> This patch implements functionality to allow processes to disable the use of
> transparent hugepages through the prctl syscall.
>
> We've determined that some jobs perform significantly better with thp disabled,
> and we needed a way to control thp on a per-process basis, without relying on
> madvise.

Well, I think the changelog should explain why madvise() is bad.

> @@ -1311,6 +1311,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> p->sequential_io_avg = 0;
> #endif
>
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + p->thp_disabled = current->thp_disabled;
> +#endif

Unneeded. It will be copied by dup_task_struct() automagically.

But I simply can't understand why this flag is per-thread. It should be
mm flag, no?

Oleg

2013-08-05 14:46:30

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

On Sat, Aug 03, 2013 at 07:01:02PM +0200, Oleg Nesterov wrote:
> On 08/02, Alex Thorlton wrote:
> >
> > This patch implements functionality to allow processes to disable the use of
> > transparent hugepages through the prctl syscall.
> >
> > We've determined that some jobs perform significantly better with thp disabled,
> > and we needed a way to control thp on a per-process basis, without relying on
> > madvise.
>
> Well, I think the changelog should explain why madvise() is bad.

It would be good to describe which jobs and why thp cannot be fixed for
these.

2013-08-05 15:05:57

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

think the changelog should explain why madvise() is bad.

No problem. I wanted to keep it simple for the original submission, but
that's probably something that should be included.

> But I simply can't understand why this flag is per-thread. It should
> be
> mm flag, no?

This is something that we (Robin and I) had discussed a while back, and,
upon review, I'm beginning to agree that this might be the more sensible
route to take.

I'm going to try and gather a bit more data to see if we can get some
more exact answers as to why THP is performing so poorly under certain
conditions before trying to push this particular patch any further.

Thanks for the input!

- Alex

2013-08-05 15:10:42

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

On 08/03/2013 01:01 PM, Oleg Nesterov wrote:

>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> + p->thp_disabled = current->thp_disabled;
>> +#endif
>
> Unneeded. It will be copied by dup_task_struct() automagically.
>
> But I simply can't understand why this flag is per-thread. It should be
> mm flag, no?

It has to be per-mm for another reason, too.

Think about what were to happen if one process were ptracing
a process with the thp_disabled flag, and ended up causing a
new anonymous page to be faulted in.

With the thp_disabled flag in the task, get_user_pages would
end up not seeing the flag, and the task could get a transparent
huge page.

With the thp_disabled flag in the mm, it would be possible for
get_user_pages to easily find the flag, and do the right thing.

--
All rights reversed

2013-08-05 15:12:27

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

> It would be good to describe which jobs and why thp cannot be fixed for
> these.

I'm going to see if I can get the benchmarks that are causing trouble up
and running on my own to get some more information (problem was
originally identified by our benchmarking team).

Thanks for the input!

- Alex

2013-08-05 23:03:11

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

On Fri, 2 Aug 2013 15:57:35 -0500 Alex Thorlton <[email protected]> wrote:

> This patch implements functionality to allow processes to disable the use of
> transparent hugepages through the prctl syscall.
>
> We've determined that some jobs perform significantly better with thp disabled,
> and we needed a way to control thp on a per-process basis, without relying on
> madvise.

What everyone else said, plus...

I worry about the inherit-across-fork thing. A scenario we should
think about is where the user doesn't build his own executables. So he
writes a little wrapper which runs prctl(PR_SET_THP_DISABLED, 0) then
execs the third-party-app. But what happens if that app execs children
who *do* want THP? He has to hunt down and wrap those as well?

It all seems a bit unwieldy. I wonder if controlling it via the ELF
header would be more practical.

2013-08-06 15:08:13

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

> What everyone else said, plus...
>
> I worry about the inherit-across-fork thing. A scenario we should
> think about is where the user doesn't build his own executables. So he
> writes a little wrapper which runs prctl(PR_SET_THP_DISABLED, 0) then
> execs the third-party-app. But what happens if that app execs children
> who *do* want THP? He has to hunt down and wrap those as well?
>
> It all seems a bit unwieldy. I wonder if controlling it via the ELF
> header would be more practical.

Thanks, Andrew. I'm doing some more testing and looking into using a
different method for controlling this. At this point, I think it's fair
to say that we don't want to control this using the method that I've
proposed here, no matter how we look at it.

I've gotten my hands on some of the benchmarks/code that were used to
originally uncover the performance issues we're seeing. I'm currently
trying to separate out the performance issues that are being caused by
the kernel code from issues involving hardware - the cost of remote
memory accesses is a bit higher on our systems with node controllers vs.
glueless QPI/Hypertransport-based systems.

At this point, it's difficult to say whether or not the issue can be
solved by "fixing the performance issues with thp," as several have
suggested. Don't get me wrong I like the idea of that solution as well;
we're just not sure, right now, if that's going to solve all of our
problems.

I'll be back when I've dug up a bit more info on the issue. Thanks for
the input, everyone!

2013-08-06 16:06:00

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

On 08/06/2013 11:08 AM, Alex Thorlton wrote:

> Thanks, Andrew. I'm doing some more testing and looking into using a
> different method for controlling this. At this point, I think it's fair
> to say that we don't want to control this using the method that I've
> proposed here, no matter how we look at it.

Considering that we have both mlock and mlockall, it
is not totally crazy to also have both madvise and
madviseall.

Just saying... :)

--
All rights reversed

2013-08-08 03:49:45

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

On Tue, 6 Aug 2013, Alex Thorlton wrote:

> I've gotten my hands on some of the benchmarks/code that were used to
> originally uncover the performance issues we're seeing. I'm currently
> trying to separate out the performance issues that are being caused by
> the kernel code from issues involving hardware - the cost of remote
> memory accesses is a bit higher on our systems with node controllers vs.
> glueless QPI/Hypertransport-based systems.
>

We've seen some issues where accessing remote hugepages causes performance
degradations over accessing local pages that affects some workloads, but
that seems like a mempolicy issue rather than madvise. You probably want
local hugepages but not fallback to remote hugepages before allocating
local pages? It would be interesting to see /proc/pid/numa_maps and
/proc/pid/smaps for workloads that don't like thp.

2013-08-11 16:56:32

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCHv2] Add per-process flag to control thp

Alex Thorlton <[email protected]> writes:

> This patch implements functionality to allow processes to disable the use of
> transparent hugepages through the prctl syscall.
>
> We've determined that some jobs perform significantly better with thp disabled,
> and we needed a way to control thp on a per-process basis, without relying on
> madvise.
>

Is that because of hugepage alloc on fault ? I understand, that could be
a real issue when applications touch one byte per hugepage. In that case
we would endup allocating more memory than the application footprint. I have
observed that system almost hang and start hitting OOM when run with THP
enabled. Now we could possibly handle that by not allowing alloc on fault and
always depend on coallpse hugepage to instantiate a hugepage ?

-aneesh