As per the problem/design document Documentation/scheduler/numa-problem.txt
implement 3ac & 4.
( A pure 3a was found too unstable, I did briefly try 3bc
but found no significant improvement. )
Implement a per-task memory placement scheme relying on a regular
PROT_NONE 'migration' fault to scan the memory space of the process
and use a two-stage migration scheme to reduce the influence of
unlikely usage relations.
It relies on the assumption that the compute part is tied to a
particular task and builds a task<->page relation set to model the
compute<->data relation.
In the previous patch we made memory migrate towards where the task
is running, here we select the node on which most memory is located
as the preferred node to run on.
This creates a feed-back control loop between trying to schedule a
task on a node and migrating memory towards the node the task is
scheduled on.
Suggested-by: Andrea Arcangeli <[email protected]>
Suggested-by: Rik van Riel <[email protected]>
Fixes-by: David Rientjes <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/mm_types.h | 4 +
include/linux/sched.h | 35 +++++++--
kernel/sched/core.c | 16 ++++
kernel/sched/fair.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/features.h | 1
kernel/sched/sched.h | 31 +++++---
kernel/sysctl.c | 31 +++++++-
mm/huge_memory.c | 7 +
mm/memory.c | 4 -
9 files changed, 282 insertions(+), 22 deletions(-)
Index: tip/include/linux/mm_types.h
===================================================================
--- tip.orig/include/linux/mm_types.h
+++ tip/include/linux/mm_types.h
@@ -403,6 +403,10 @@ struct mm_struct {
#ifdef CONFIG_CPUMASK_OFFSTACK
struct cpumask cpumask_allocation;
#endif
+#ifdef CONFIG_SCHED_NUMA
+ unsigned long numa_next_scan;
+ int numa_scan_seq;
+#endif
struct uprobes_state uprobes_state;
};
Index: tip/include/linux/sched.h
===================================================================
--- tip.orig/include/linux/sched.h
+++ tip/include/linux/sched.h
@@ -1481,9 +1481,16 @@ struct task_struct {
short pref_node_fork;
#endif
#ifdef CONFIG_SCHED_NUMA
- int node;
+ int node; /* task home node */
+ int numa_scan_seq;
+ int numa_migrate_seq;
+ unsigned int numa_scan_period;
+ u64 node_stamp; /* migration stamp */
unsigned long numa_contrib;
-#endif
+ unsigned long *numa_faults;
+ struct callback_head numa_work;
+#endif /* CONFIG_SCHED_NUMA */
+
struct rcu_head rcu;
/*
@@ -1558,15 +1565,24 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+#ifdef CONFIG_SCHED_NUMA
static inline int tsk_home_node(struct task_struct *p)
{
-#ifdef CONFIG_SCHED_NUMA
return p->node;
+}
+
+extern void task_numa_fault(int node, int pages);
#else
+static inline int tsk_home_node(struct task_struct *p)
+{
return -1;
-#endif
}
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif /* CONFIG_SCHED_NUMA */
+
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -2004,6 +2020,10 @@ enum sched_tunable_scaling {
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_sched_numa_scan_period_min;
+extern unsigned int sysctl_sched_numa_scan_period_max;
+extern unsigned int sysctl_sched_numa_settle_count;
+
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
@@ -2014,18 +2034,17 @@ extern unsigned int sysctl_sched_shares_
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos);
-#endif
-#ifdef CONFIG_SCHED_DEBUG
+
static inline unsigned int get_sysctl_timer_migration(void)
{
return sysctl_timer_migration;
}
-#else
+#else /* CONFIG_SCHED_DEBUG */
static inline unsigned int get_sysctl_timer_migration(void)
{
return 1;
}
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
Index: tip/kernel/sched/core.c
===================================================================
--- tip.orig/kernel/sched/core.c
+++ tip/kernel/sched/core.c
@@ -1533,6 +1533,21 @@ static void __sched_fork(struct task_str
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+
+#ifdef CONFIG_SCHED_NUMA
+ if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ p->mm->numa_next_scan = jiffies;
+ p->mm->numa_scan_seq = 0;
+ }
+
+ p->node = -1;
+ p->node_stamp = 0ULL;
+ p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ p->numa_faults = NULL;
+ p->numa_scan_period = sysctl_sched_numa_scan_period_min;
+ p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_SCHED_NUMA */
}
/*
@@ -1774,6 +1789,7 @@ static void finish_task_switch(struct rq
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
+ task_numa_free(prev);
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
Index: tip/kernel/sched/fair.c
===================================================================
--- tip.orig/kernel/sched/fair.c
+++ tip/kernel/sched/fair.c
@@ -27,6 +27,8 @@
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/random.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
#include <trace/events/sched.h>
@@ -775,6 +777,21 @@ update_stats_curr_start(struct cfs_rq *c
/**************************************************
* Scheduling class numa methods.
+ *
+ * The purpose of the NUMA bits is to maintain compute (task) and data
+ * (memory) locality. We try and achieve this by making tasks stick to
+ * a particular node (their home node) but if fairness mandates they run
+ * elsewhere for long enough, we let the memory follow them.
+ *
+ * Tasks start out with their home-node unset (-1); this effectively means
+ * they act !NUMA until we've established the task is busy enough to bother
+ * with placement.
+ *
+ * We keep a home-node per task and use periodic fault scans to try and
+ * establish a task<->page relation. This assumes the task<->page relation is a
+ * compute<->data relation, which is false for things like virt. and n:m
+ * threading solutions but it's the best we can do given the information we
+ * have.
*/
#ifdef CONFIG_SMP
@@ -805,6 +822,157 @@ static void account_numa_dequeue(struct
} else
rq->onnode_running--;
}
+
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_sched_numa_scan_period_min = 5000;
+unsigned int sysctl_sched_numa_scan_period_max = 5000*16;
+
+/*
+ * Wait for the 2-sample stuff to settle before migrating again
+ */
+unsigned int sysctl_sched_numa_settle_count = 2;
+
+static void task_numa_placement(struct task_struct *p)
+{
+ unsigned long faults, max_faults = 0;
+ int node, max_node = -1;
+ int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+ if (p->numa_scan_seq == seq)
+ return;
+
+ p->numa_scan_seq = seq;
+
+ for (node = 0; node < nr_node_ids; node++) {
+ faults = p->numa_faults[node];
+
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_node = node;
+ }
+
+ p->numa_faults[node] /= 2;
+ }
+
+ if (max_node == -1)
+ return;
+
+ if (p->node != max_node) {
+ p->numa_scan_period = sysctl_sched_numa_scan_period_min;
+ if (sched_feat(NUMA_SETTLE) &&
+ (seq - p->numa_migrate_seq) <= (int)sysctl_sched_numa_settle_count)
+ return;
+ p->numa_migrate_seq = seq;
+ sched_setnode(p, max_node);
+ } else {
+ p->numa_scan_period = min(sysctl_sched_numa_scan_period_max,
+ p->numa_scan_period * 2);
+ }
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+ struct task_struct *p = current;
+
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(unsigned long) * nr_node_ids;
+
+ p->numa_faults = kzalloc(size, GFP_KERNEL);
+ if (!p->numa_faults)
+ return;
+ }
+
+ task_numa_placement(p);
+
+ p->numa_faults[node] += pages;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+ unsigned long migrate, next_scan, now = jiffies;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+ work->next = work; /* protect against double add */
+ /*
+ * Who cares about NUMA placement when they're dying.
+ *
+ * NOTE: make sure not to dereference p->mm before this check,
+ * exit_task_work() happens _after_ exit_mm() so we could be called
+ * without p->mm even though we still had it when we enqueued this
+ * work.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Enforce maximal scan/migration frequency..
+ */
+ migrate = mm->numa_next_scan;
+ if (time_before(now, migrate))
+ return;
+
+ next_scan = now + 2*msecs_to_jiffies(sysctl_sched_numa_scan_period_min);
+ if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ return;
+
+ ACCESS_ONCE(mm->numa_scan_seq)++;
+ {
+ struct vm_area_struct *vma;
+
+ down_write(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!vma_migratable(vma))
+ continue;
+ change_protection(vma, vma->vm_start, vma->vm_end, vma_prot_none(vma), 0);
+ }
+ up_write(&mm->mmap_sem);
+ }
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_work;
+ u64 period, now;
+
+ /*
+ * We don't care about NUMA placement if we don't have memory.
+ */
+ if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ return;
+
+ /*
+ * Using runtime rather than walltime has the dual advantage that
+ * we (mostly) drive the selection from busy threads and that the
+ * task needs to have done some actual work before we bother with
+ * NUMA placement.
+ */
+ now = curr->se.sum_exec_runtime;
+ period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+ if (now - curr->node_stamp > period) {
+ curr->node_stamp = now;
+
+ if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+ init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ task_work_add(curr, work, true);
+ }
+ }
+}
#else
#ifdef CONFIG_SMP
static struct list_head *account_numa_enqueue(struct rq *rq, struct task_struct *p)
@@ -816,6 +984,10 @@ static struct list_head *account_numa_en
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
+
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
#endif /* CONFIG_SCHED_NUMA */
/**************************************************
@@ -5265,6 +5437,9 @@ static void task_tick_fair(struct rq *rq
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ if (sched_feat_numa(NUMA))
+ task_tick_numa(rq, curr);
}
/*
Index: tip/kernel/sched/features.h
===================================================================
--- tip.orig/kernel/sched/features.h
+++ tip/kernel/sched/features.h
@@ -69,5 +69,6 @@ SCHED_FEAT(NUMA_TTWU_BIAS, false)
SCHED_FEAT(NUMA_TTWU_TO, false)
SCHED_FEAT(NUMA_PULL, true)
SCHED_FEAT(NUMA_PULL_BIAS, true)
+SCHED_FEAT(NUMA_SETTLE, true)
#endif
Index: tip/kernel/sched/sched.h
===================================================================
--- tip.orig/kernel/sched/sched.h
+++ tip/kernel/sched/sched.h
@@ -3,6 +3,7 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/slab.h>
#include "cpupri.h"
@@ -476,15 +477,6 @@ struct rq {
#endif
};
-static inline struct list_head *offnode_tasks(struct rq *rq)
-{
-#ifdef CONFIG_SCHED_NUMA
- return &rq->offnode_tasks;
-#else
- return NULL;
-#endif
-}
-
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -502,6 +494,27 @@ DECLARE_PER_CPU(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))
+#ifdef CONFIG_SCHED_NUMA
+static inline struct list_head *offnode_tasks(struct rq *rq)
+{
+ return &rq->offnode_tasks;
+}
+
+static inline void task_numa_free(struct task_struct *p)
+{
+ kfree(p->numa_faults);
+}
+#else /* CONFIG_SCHED_NUMA */
+static inline struct list_head *offnode_tasks(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void task_numa_free(struct task_struct *p)
+{
+}
+#endif /* CONFIG_SCHED_NUMA */
+
#ifdef CONFIG_SMP
#define rcu_dereference_check_sched_domain(p) \
Index: tip/kernel/sysctl.c
===================================================================
--- tip.orig/kernel/sysctl.c
+++ tip/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 10
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
+#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
@@ -347,7 +350,31 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_NUMA
+ {
+ .procname = "sched_numa_scan_period_min_ms",
+ .data = &sysctl_sched_numa_scan_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_numa_scan_period_max_ms",
+ .data = &sysctl_sched_numa_scan_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_numa_settle_count",
+ .data = &sysctl_sched_numa_settle_count,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_SCHED_NUMA */
+#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
Index: tip/mm/huge_memory.c
===================================================================
--- tip.orig/mm/huge_memory.c
+++ tip/mm/huge_memory.c
@@ -774,9 +774,10 @@ fixup:
unlock:
spin_unlock(&mm->page_table_lock);
- if (page)
+ if (page) {
+ task_numa_fault(page_to_nid(page), HPAGE_PMD_NR);
put_page(page);
-
+ }
return;
migrate:
@@ -845,6 +846,8 @@ migrate:
put_page(page); /* Drop the rmap reference */
+ task_numa_fault(node, HPAGE_PMD_NR);
+
if (lru)
put_page(page); /* drop the LRU isolation reference */
Index: tip/mm/memory.c
===================================================================
--- tip.orig/mm/memory.c
+++ tip/mm/memory.c
@@ -3512,8 +3512,10 @@ out_pte_upgrade_unlock:
out_unlock:
pte_unmap_unlock(ptep, ptl);
out:
- if (page)
+ if (page) {
+ task_numa_fault(page_nid, 1);
put_page(page);
+ }
return 0;
On Thu, Oct 25, 2012 at 5:16 AM, Peter Zijlstra <[email protected]> wrote:
> + /*
> + * Using runtime rather than walltime has the dual advantage that
> + * we (mostly) drive the selection from busy threads and that the
> + * task needs to have done some actual work before we bother with
> + * NUMA placement.
> + */
That explanation makes sense..
> + now = curr->se.sum_exec_runtime;
> + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
> +
> + if (now - curr->node_stamp > period) {
> + curr->node_stamp = now;
> +
> + if (!time_before(jiffies, curr->mm->numa_next_scan)) {
.. but then the whole "numa_next_scan" thing ends up being about
real-time anyway?
So 'numa_scan_period' is in CPU time (msec, converted to nsec at
runtime rather than when setting it), but 'numa_next_scan' is in
wallclock time (jiffies)?
But *both* of them are based on the same 'numa_scan_period' thing that
the user sets in ms.
So numa_scan_period is interpreted as both wallclock *and* as runtime?
Maybe this works, but it doesn't really make much sense. And what is
the impact of this on machines that run lots of loads with delays
(whether due to IO or timers)?
Linus
* Linus Torvalds <[email protected]> wrote:
> On Thu, Oct 25, 2012 at 5:16 AM, Peter Zijlstra <[email protected]> wrote:
> > + /*
> > + * Using runtime rather than walltime has the dual advantage that
> > + * we (mostly) drive the selection from busy threads and that the
> > + * task needs to have done some actual work before we bother with
> > + * NUMA placement.
> > + */
>
> That explanation makes sense..
>
> > + now = curr->se.sum_exec_runtime;
> > + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
> > +
> > + if (now - curr->node_stamp > period) {
> > + curr->node_stamp = now;
> > +
> > + if (!time_before(jiffies, curr->mm->numa_next_scan)) {
>
> .. but then the whole "numa_next_scan" thing ends up being
> about real-time anyway?
>
> So 'numa_scan_period' is in CPU time (msec, converted to nsec
> at runtime rather than when setting it), but 'numa_next_scan'
> is in wallclock time (jiffies)?
>
> But *both* of them are based on the same 'numa_scan_period'
> thing that the user sets in ms.
>
> So numa_scan_period is interpreted as both wallclock *and* as
> runtime?
>
> Maybe this works, but it doesn't really make much sense.
So, the relationship between wall clock time and execution
runtime is that in the limit they run at the same speed: when
there's a single task running. In any other case execution
runtime can only run slower than wall time.
So the bit you found weird:
> > + if (!time_before(jiffies, curr->mm->numa_next_scan)) {
together with the task_numa_work() frequency limit:
/*
* Enforce maximal scan/migration frequency..
*/
migrate = mm->numa_next_scan;
if (time_before(now, migrate))
return;
next_scan = now + 2*msecs_to_jiffies(sysctl_sched_numa_scan_period_min);
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;
puts an upper limit on the per mm scanning frequency.
This keeps us from over-sampling if there are many threads: if
all threads happen to come in at the same time we don't create a
spike in overhead.
We also avoid multiple threads scanning at once in parallel to
each other. Faults are nicely parallel, especially with all the
preparatory patches in place, so the distributed nature of the
faults itself is not a problem.
So we have two conflicting goals here: on one hand we have a
quality of sampling goal which asks for per task runtime
proportional scanning on all threads, but we also have a
performance goal and don't actually want all threads running at
the same time. This frequency limit avoids the over-sampling
scenario while still fulfilling the per task sampling property,
statistically on average.
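Schematically, the two gates look like this (just an illustration
pieced together from the patch, not new code):
	/* per task, driven by execution time - task_tick_numa(): */
	now = curr->se.sum_exec_runtime;
	if (now - curr->node_stamp > period) {
		curr->node_stamp = now;
		/* ... queue task_numa_work() ... */
	}
	/* per mm, driven by wall time, shared by all threads - task_numa_work(): */
	next_scan = jiffies + 2*msecs_to_jiffies(sysctl_sched_numa_scan_period_min);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return; /* another thread already claimed this scan window */
So every thread gets considered in proportion to how much it runs, but
at most one of them per window ends up walking the address space.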
If you agree that we should do it like that and if the
implementation is correct and optimal, I will put a better
explanation into the code.
[
task_numa_work() performance side note:
We are also *very* close to being able to use down_read() instead
of down_write() in the sampling-unmap code in
task_numa_work(), as it should be safe in theory to call
change_protection(PROT_NONE) in parallel - but there's one
regression that disagrees with this theory so we use
down_write() at the moment.
Maybe you could help us there: can you see a reason why the
change_prot_none()->change_protection() call in
task_numa_work() can not occur in parallel to a page fault in
another thread on another CPU? It should be safe - yet if we
change it I can see occasional corruption of user-space state:
segfaults and register corruption.
]
> [...] And what is the impact of this on machines that run lots
> of loads with delays (whether due to IO or timers)?
I've done sysbench OLTP measurements which showed no apparent
regressions:
#
# Comparing { res-schednuma-NO_NUMA.txt } to { res-schednuma-+NUMA.txt }:
#
# threads improvement % SysBench OLTP transactions/second
#-------------------------------------------------------------------
2: 2.11 % # 2160.20 vs. 2205.80
4: -5.52 % # 4202.04 vs. 3969.97
8: 0.01 % # 6894.45 vs. 6895.45
16: -0.31 % # 11840.77 vs. 11804.30
24: -0.56 % # 15053.98 vs. 14969.14
30: 0.56 % # 17043.23 vs. 17138.21
32: -1.08 % # 17797.04 vs. 17604.67
34: 1.04 % # 18158.10 vs. 18347.22
36: -0.16 % # 18125.42 vs. 18096.68
40: 0.45 % # 18218.73 vs. 18300.59
48: -0.39 % # 18266.91 vs. 18195.26
56: -0.11 % # 18285.56 vs. 18265.74
64: 0.23 % # 18304.74 vs. 18347.51
96: 0.18 % # 18268.44 vs. 18302.04
128: 0.22 % # 18058.92 vs. 18099.34
256: 1.63 % # 17068.55 vs. 17347.14
512: 6.86 % # 13452.18 vs. 14375.08
No regression is the best we can hope for I think, given that
OLTP typically has huge global caches and global serialization,
so any NUMA-conscious placement will at most be a nuisance.
We've also done kbuild measurements - which too is a pretty
sleepy workload that is too fast for any migration techniques to
help.
But even sysbench isn't doing very long delays, so I will do
more IO delay targeted measurements.
So I've been actively looking for and checking the worst-case
loads for this feature. The feature obviously helps long-run,
CPU-intense workloads, but those aren't the challenging ones
really IMO: I spent 70% of the time analyzing workloads that are
not expected to be friends with this feature.
We are also keeping CONFIG_SCHED_NUMA off by default for good
measure.
Thanks,
Ingo
* Ingo Molnar <[email protected]> wrote:
> [
> task_numa_work() performance side note:
>
> We are also *very* close to being able to use down_read() instead
> of down_write() in the sampling-unmap code in
> task_numa_work(), as it should be safe in theory to call
> change_protection(PROT_NONE) in parallel - but there's one
> regression that disagrees with this theory so we use
> down_write() at the moment.
>
> Maybe you could help us there: can you see a reason why the
> change_prot_none()->change_protection() call in
> task_numa_work() can not occur in parallel to a page fault in
> another thread on another CPU? It should be safe - yet if we
> change it I can see occasional corruption of user-space state:
> segfaults and register corruption.
> ]
Oh, just found the reason:
the ptep_modify_prot_start()/modify()/commit() sequence is
SMP-unsafe - it has to be done under the mmap_sem write-locked.
It is safe against *hardware* updates to the PTE, but not safe
against itself.
This is apparently a hidden cost of paravirt, it is forcing that
weird sequence and thus the down_write() ...
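For reference, the sequence in question, roughly as change_pte_range()
uses it (from memory, modulo exact signatures):
	ptent = ptep_modify_prot_start(mm, addr, pte);	/* get-and-clear the pte */
	ptent = pte_modify(ptent, newprot);
	ptep_modify_prot_commit(mm, addr, pte, ptent);	/* install the new pte */
The get-and-clear avoids losing hardware A/D updates in the middle, but
it also means the pte is transiently cleared while this runs.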
Thanks,
Ingo
On Fri, 2012-10-26 at 15:50 +0200, Ingo Molnar wrote:
>
> Oh, just found the reason:
>
> the ptep_modify_prot_start()/modify()/commit() sequence is
> SMP-unsafe - it has to be done under the mmap_sem write-locked.
>
> It is safe against *hardware* updates to the PTE, but not safe
> against itself.
Shouldn't the pte_lock serialize all that still? All sites that modify
PTE contents should hold the pte_lock (and do afaict).
* Peter Zijlstra <[email protected]> wrote:
> On Fri, 2012-10-26 at 15:50 +0200, Ingo Molnar wrote:
> >
> > Oh, just found the reason:
> >
> > the ptep_modify_prot_start()/modify()/commit() sequence is
> > SMP-unsafe - it has to be done under the mmap_sem
> > write-locked.
> >
> > It is safe against *hardware* updates to the PTE, but not
> > safe against itself.
>
> Shouldn't the pte_lock serialize all that still? All sites
> that modify PTE contents should hold the pte_lock (and do
> afaict).
Hm, indeed.
Is there no code under down_read() (in the page fault path) that
modifies the pte via just pure atomics?
Thanks,
Ingo
On Fri, Oct 26, 2012 at 7:14 AM, Ingo Molnar <[email protected]> wrote:
>
> * Peter Zijlstra <[email protected]> wrote:
>>
>> Shouldn't the pte_lock serialize all that still? All sites
>> that modify PTE contents should hold the pte_lock (and do
>> afaict).
>
> Hm, indeed.
>
> Is there no code under down_read() (in the page fault path) that
> modifies the pte via just pure atomics?
Well, the ptep_set_access_flags() thing modifies the pte under
down_read(). Not using atomics, though. If it races with itself or
with a hardware page walk, that's fine, but if it races with something
changing other bits than A/D, that would be horribly horribly bad - it
could undo any other bit changes exactly because it's an unlocked
read-do-other-things-write sequence.
But it's always run under the page table lock - as should all other SW
page table modifications - so it *should* be fine. The down_read() is
for protecting other VM data structures (notably the vma lists etc),
not the page table bit-twiddling.
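Schematically, the lock both paths actually rely on is the pte lock,
not the mmap_sem (a sketch, not exact code):
	/* fault side, mmap_sem held for read: */
	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	/* ... read-modify-write the pte ... */
	pte_unmap_unlock(ptep, ptl);
	/* change_protection() side, however mmap_sem is held: */
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	/* ... ptep_modify_prot_start()/pte_modify()/commit() loop ... */
	pte_unmap_unlock(pte - 1, ptl);
Both take the same per-page-table spinlock, so they serialize against
each other regardless of the mmap_sem mode.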
In fact, the whole SW page table modification scheme *depends* on the
page table lock, because the ptep_modify_prot_start/commit thing does
a "atomically clear the page table pointer to protect against hardware
walkers". And if another software walker were to see that cleared
state, it would do bad things (the exception, as usual, is the GUP
code, which does the optimistic unlocked accesses and conceptually
emulates a hardware page table walk)
So I really think that the mmap_sem should be entirely a non-issue for
this kind of code.
Linus
On 10/25/2012 08:16 AM, Peter Zijlstra wrote:
> +/*
> + * Drive the periodic memory faults..
> + */
> +void task_tick_numa(struct rq *rq, struct task_struct *curr)
> +{
> + struct callback_head *work = &curr->numa_work;
> + u64 period, now;
> +
> + /*
> + * We don't care about NUMA placement if we don't have memory.
> + */
> + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
> + return;
We should probably skip the whole unmap-and-refault
business if we are running on a system that is not
NUMA. Ie. a system with just one node...
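Something like this at the top of task_tick_numa() should do it
(untested sketch):
	/* No point driving placement faults on a single-node system. */
	if (num_online_nodes() == 1)
		return;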
> + /*
> + * Using runtime rather than walltime has the dual advantage that
> + * we (mostly) drive the selection from busy threads and that the
> + * task needs to have done some actual work before we bother with
> + * NUMA placement.
> + */
> + now = curr->se.sum_exec_runtime;
> + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
> +
On Thu, Oct 25, 2012 at 02:16:43PM +0200, Peter Zijlstra wrote:
> As per the problem/design document Documentation/scheduler/numa-problem.txt
> implement 3ac & 4.
>
> ( A pure 3a was found too unstable, I did briefly try 3bc
> but found no significant improvement. )
>
> Implement a per-task memory placement scheme relying on a regular
> PROT_NONE 'migration' fault to scan the memory space of the process
> and use a two-stage migration scheme to reduce the influence of
> unlikely usage relations.
>
> It relies on the assumption that the compute part is tied to a
> particular task and builds a task<->page relation set to model the
> compute<->data relation.
>
> In the previous patch we made memory migrate towards where the task
> is running, here we select the node on which most memory is located
> as the preferred node to run on.
>
> This creates a feed-back control loop between trying to schedule a
> task on a node and migrating memory towards the node the task is
> scheduled on.
>
Ok.
> Suggested-by: Andrea Arcangeli <[email protected]>
> Suggested-by: Rik van Riel <[email protected]>
> Fixes-by: David Rientjes <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> include/linux/mm_types.h | 4 +
> include/linux/sched.h | 35 +++++++--
> kernel/sched/core.c | 16 ++++
> kernel/sched/fair.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/features.h | 1
> kernel/sched/sched.h | 31 +++++---
> kernel/sysctl.c | 31 +++++++-
> mm/huge_memory.c | 7 +
> mm/memory.c | 4 -
> 9 files changed, 282 insertions(+), 22 deletions(-)
> Index: tip/include/linux/mm_types.h
> ===================================================================
> --- tip.orig/include/linux/mm_types.h
> +++ tip/include/linux/mm_types.h
> @@ -403,6 +403,10 @@ struct mm_struct {
> #ifdef CONFIG_CPUMASK_OFFSTACK
> struct cpumask cpumask_allocation;
> #endif
> +#ifdef CONFIG_SCHED_NUMA
> + unsigned long numa_next_scan;
comment.
> + int numa_scan_seq;
comment! at least the other one is easy to guess. This thing looks like
it's preventing multiple threads in a process space scanning and
updating PTEs at the same time. Effectively it's a type of barrier but
without a comment I'm not sure if what it's doing is what you expect it
to be doing or something else entirely.
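Something along these lines perhaps (wording is mine, only a suggestion
for what the comments could say):
+#ifdef CONFIG_SCHED_NUMA
+	unsigned long numa_next_scan;	/* jiffies: no new PTE scan pass before this */
+	int numa_scan_seq;		/* scan pass generation; tasks compare it against
+					   their own copy so each pass is folded into
+					   placement stats exactly once */
+#endif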
> +#endif
> struct uprobes_state uprobes_state;
> };
>
> Index: tip/include/linux/sched.h
> ===================================================================
> --- tip.orig/include/linux/sched.h
> +++ tip/include/linux/sched.h
> @@ -1481,9 +1481,16 @@ struct task_struct {
> short pref_node_fork;
> #endif
> #ifdef CONFIG_SCHED_NUMA
> - int node;
> + int node; /* task home node */
> + int numa_scan_seq;
> + int numa_migrate_seq;
> + unsigned int numa_scan_period;
> + u64 node_stamp; /* migration stamp */
> unsigned long numa_contrib;
> -#endif
> + unsigned long *numa_faults;
> + struct callback_head numa_work;
> +#endif /* CONFIG_SCHED_NUMA */
> +
> struct rcu_head rcu;
>
> /*
> @@ -1558,15 +1565,24 @@ struct task_struct {
> /* Future-safe accessor for struct task_struct's cpus_allowed. */
> #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>
> +#ifdef CONFIG_SCHED_NUMA
> static inline int tsk_home_node(struct task_struct *p)
> {
> -#ifdef CONFIG_SCHED_NUMA
> return p->node;
> +}
> +
> +extern void task_numa_fault(int node, int pages);
> #else
> +static inline int tsk_home_node(struct task_struct *p)
> +{
> return -1;
> -#endif
> }
>
> +static inline void task_numa_fault(int node, int pages)
> +{
> +}
> +#endif /* CONFIG_SCHED_NUMA */
> +
> /*
> * Priority of a process goes from 0..MAX_PRIO-1, valid RT
> * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
> @@ -2004,6 +2020,10 @@ enum sched_tunable_scaling {
> };
> extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
>
> +extern unsigned int sysctl_sched_numa_scan_period_min;
> +extern unsigned int sysctl_sched_numa_scan_period_max;
> +extern unsigned int sysctl_sched_numa_settle_count;
> +
> #ifdef CONFIG_SCHED_DEBUG
> extern unsigned int sysctl_sched_migration_cost;
> extern unsigned int sysctl_sched_nr_migrate;
> @@ -2014,18 +2034,17 @@ extern unsigned int sysctl_sched_shares_
> int sched_proc_update_handler(struct ctl_table *table, int write,
> void __user *buffer, size_t *length,
> loff_t *ppos);
> -#endif
> -#ifdef CONFIG_SCHED_DEBUG
> +
> static inline unsigned int get_sysctl_timer_migration(void)
> {
> return sysctl_timer_migration;
> }
> -#else
> +#else /* CONFIG_SCHED_DEBUG */
> static inline unsigned int get_sysctl_timer_migration(void)
> {
> return 1;
> }
> -#endif
> +#endif /* CONFIG_SCHED_DEBUG */
> extern unsigned int sysctl_sched_rt_period;
> extern int sysctl_sched_rt_runtime;
>
> Index: tip/kernel/sched/core.c
> ===================================================================
> --- tip.orig/kernel/sched/core.c
> +++ tip/kernel/sched/core.c
> @@ -1533,6 +1533,21 @@ static void __sched_fork(struct task_str
> #ifdef CONFIG_PREEMPT_NOTIFIERS
> INIT_HLIST_HEAD(&p->preempt_notifiers);
> #endif
> +
> +#ifdef CONFIG_SCHED_NUMA
> + if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
> + p->mm->numa_next_scan = jiffies;
> + p->mm->numa_scan_seq = 0;
> + }
> +
> + p->node = -1;
> + p->node_stamp = 0ULL;
> + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
> + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
> + p->numa_faults = NULL;
> + p->numa_scan_period = sysctl_sched_numa_scan_period_min;
> + p->numa_work.next = &p->numa_work;
> +#endif /* CONFIG_SCHED_NUMA */
> }
>
> /*
> @@ -1774,6 +1789,7 @@ static void finish_task_switch(struct rq
> if (mm)
> mmdrop(mm);
> if (unlikely(prev_state == TASK_DEAD)) {
> + task_numa_free(prev);
> /*
> * Remove function-return probe instances associated with this
> * task and put them back on the free list.
> Index: tip/kernel/sched/fair.c
> ===================================================================
> --- tip.orig/kernel/sched/fair.c
> +++ tip/kernel/sched/fair.c
> @@ -27,6 +27,8 @@
> #include <linux/profile.h>
> #include <linux/interrupt.h>
> #include <linux/random.h>
> +#include <linux/mempolicy.h>
> +#include <linux/task_work.h>
>
> #include <trace/events/sched.h>
>
> @@ -775,6 +777,21 @@ update_stats_curr_start(struct cfs_rq *c
>
> /**************************************************
> * Scheduling class numa methods.
> + *
> + * The purpose of the NUMA bits is to maintain compute (task) and data
> + * (memory) locality. We try and achieve this by making tasks stick to
> + * a particular node (their home node) but if fairness mandates they run
> + * elsewhere for long enough, we let the memory follow them.
> + *
> + * Tasks start out with their home-node unset (-1); this effectively means
> + * they act !NUMA until we've established the task is busy enough to bother
> + * with placement.
> + *
> + * We keep a home-node per task and use periodic fault scans to try and
> + * establish a task<->page relation. This assumes the task<->page relation is a
> + * compute<->data relation, which is false for things like virt. and n:m
> + * threading solutions but it's the best we can do given the information we
> + * have.
> */
>
> #ifdef CONFIG_SMP
> @@ -805,6 +822,157 @@ static void account_numa_dequeue(struct
> } else
> rq->onnode_running--;
> }
> +
> +/*
> + * numa task sample period in ms: 5s
> + */
> +unsigned int sysctl_sched_numa_scan_period_min = 5000;
> +unsigned int sysctl_sched_numa_scan_period_max = 5000*16;
> +
> +/*
> + * Wait for the 2-sample stuff to settle before migrating again
> + */
> +unsigned int sysctl_sched_numa_settle_count = 2;
> +
> +static void task_numa_placement(struct task_struct *p)
> +{
> + unsigned long faults, max_faults = 0;
> + int node, max_node = -1;
> + int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
> +
> + if (p->numa_scan_seq == seq)
> + return;
> +
> + p->numa_scan_seq = seq;
> +
> + for (node = 0; node < nr_node_ids; node++) {
> + faults = p->numa_faults[node];
> +
> + if (faults > max_faults) {
> + max_faults = faults;
> + max_node = node;
> + }
> +
> + p->numa_faults[node] /= 2;
> + }
No comments explaining the logic behind the decaying average. It can be
inferred if someone reads Documentation/scheduler/numa-problem.txt and
point 3c carefully enough. At the very least point them at it.
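(For what it's worth, the halving makes numa_faults[node] an
exponentially decaying sum of the per-pass fault counts: the value used
at pass n is roughly f_n + f_{n-1}/2 + f_{n-2}/4 + ... . For example,
per-pass counts of 8, 4 and 16 on a node give 8, 8 and 20 over three
passes, so recent passes dominate.)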
> +
> + if (max_node == -1)
> + return;
> +
> + if (p->node != max_node) {
> + p->numa_scan_period = sysctl_sched_numa_scan_period_min;
> + if (sched_feat(NUMA_SETTLE) &&
> + (seq - p->numa_migrate_seq) <= (int)sysctl_sched_numa_settle_count)
> + return;
> + p->numa_migrate_seq = seq;
> + sched_setnode(p, max_node);
Ok, so at a guess even if we do ping-pong it will only take effect every
10 seconds which could be far worse.
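(The 10 seconds presumably being the wall-clock gap task_numa_work()
enforces between scan passes, 2 * sysctl_sched_numa_scan_period_min =
2 * 5000ms, so a home-node change can happen at most once per ~10s even
before NUMA_SETTLE makes the task sit out further passes.)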
> + } else {
> + p->numa_scan_period = min(sysctl_sched_numa_scan_period_max,
> + p->numa_scan_period * 2);
> + }
> +}
> +
> +/*
> + * Got a PROT_NONE fault for a page on @node.
> + */
> +void task_numa_fault(int node, int pages)
> +{
> + struct task_struct *p = current;
> +
> + if (unlikely(!p->numa_faults)) {
> + int size = sizeof(unsigned long) * nr_node_ids;
> +
> + p->numa_faults = kzalloc(size, GFP_KERNEL);
> + if (!p->numa_faults)
> + return;
> + }
> +
On a maximally configured machine this will be an order-4 allocation and
you need at least 512 nodes before it's an order-1 allocation. As unlikely
as it is, should this be GFP_NOWARN?
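i.e. something like (just a suggestion):
	p->numa_faults = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);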
> + task_numa_placement(p);
> +
> + p->numa_faults[node] += pages;
> +}
> +
> +/*
> + * The expensive part of numa migration is done from task_work context.
> + * Triggered from task_tick_numa().
> + */
> +void task_numa_work(struct callback_head *work)
> +{
> + unsigned long migrate, next_scan, now = jiffies;
> + struct task_struct *p = current;
> + struct mm_struct *mm = p->mm;
> +
> + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
> +
> + work->next = work; /* protect against double add */
> + /*
> + * Who cares about NUMA placement when they're dying.
> + *
> + * NOTE: make sure not to dereference p->mm before this check,
> + * exit_task_work() happens _after_ exit_mm() so we could be called
> + * without p->mm even though we still had it when we enqueued this
> + * work.
> + */
> + if (p->flags & PF_EXITING)
> + return;
> +
> + /*
> + * Enforce maximal scan/migration frequency..
> + */
> + migrate = mm->numa_next_scan;
> + if (time_before(now, migrate))
> + return;
> +
> + next_scan = now + 2*msecs_to_jiffies(sysctl_sched_numa_scan_period_min);
> + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
> + return;
> +
> + ACCESS_ONCE(mm->numa_scan_seq)++;
> + {
> + struct vm_area_struct *vma;
> +
> + down_write(&mm->mmap_sem);
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + if (!vma_migratable(vma))
> + continue;
> + change_protection(vma, vma->vm_start, vma->vm_end, vma_prot_none(vma), 0);
> + }
> + up_write(&mm->mmap_sem);
> + }
> +}
Ok, I like the idea of the scanning cost being incurred by the process.
I was going to complain though that for very large processes the length of
time it takes to complete this scan could be considerable. However,
a quick glance forward indicates that you cope with this problem later by
limiting how much is scanned each time.
> +
> +/*
> + * Drive the periodic memory faults..
> + */
> +void task_tick_numa(struct rq *rq, struct task_struct *curr)
> +{
> + struct callback_head *work = &curr->numa_work;
> + u64 period, now;
> +
> + /*
> + * We don't care about NUMA placement if we don't have memory.
> + */
> + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
> + return;
> +
> + /*
> + * Using runtime rather than walltime has the dual advantage that
> + * we (mostly) drive the selection from busy threads and that the
> + * task needs to have done some actual work before we bother with
> + * NUMA placement.
> + */
Makes sense.
> + now = curr->se.sum_exec_runtime;
> + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
> +
> + if (now - curr->node_stamp > period) {
> + curr->node_stamp = now;
> +
> + if (!time_before(jiffies, curr->mm->numa_next_scan)) {
> + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
> + task_work_add(curr, work, true);
> + }
> + }
> +}
> #else
> #ifdef CONFIG_SMP
> static struct list_head *account_numa_enqueue(struct rq *rq, struct task_struct *p)
> @@ -816,6 +984,10 @@ static struct list_head *account_numa_en
> static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
> {
> }
> +
> +static void task_tick_numa(struct rq *rq, struct task_struct *curr)
> +{
> +}
> #endif /* CONFIG_SCHED_NUMA */
>
> /**************************************************
> @@ -5265,6 +5437,9 @@ static void task_tick_fair(struct rq *rq
> cfs_rq = cfs_rq_of(se);
> entity_tick(cfs_rq, se, queued);
> }
> +
> + if (sched_feat_numa(NUMA))
> + task_tick_numa(rq, curr);
> }
>
> /*
> Index: tip/kernel/sched/features.h
> ===================================================================
> --- tip.orig/kernel/sched/features.h
> +++ tip/kernel/sched/features.h
> @@ -69,5 +69,6 @@ SCHED_FEAT(NUMA_TTWU_BIAS, false)
> SCHED_FEAT(NUMA_TTWU_TO, false)
> SCHED_FEAT(NUMA_PULL, true)
> SCHED_FEAT(NUMA_PULL_BIAS, true)
> +SCHED_FEAT(NUMA_SETTLE, true)
> #endif
>
> Index: tip/kernel/sched/sched.h
> ===================================================================
> --- tip.orig/kernel/sched/sched.h
> +++ tip/kernel/sched/sched.h
> @@ -3,6 +3,7 @@
> #include <linux/mutex.h>
> #include <linux/spinlock.h>
> #include <linux/stop_machine.h>
> +#include <linux/slab.h>
>
> #include "cpupri.h"
>
> @@ -476,15 +477,6 @@ struct rq {
> #endif
> };
>
> -static inline struct list_head *offnode_tasks(struct rq *rq)
> -{
> -#ifdef CONFIG_SCHED_NUMA
> - return &rq->offnode_tasks;
> -#else
> - return NULL;
> -#endif
> -}
> -
> static inline int cpu_of(struct rq *rq)
> {
> #ifdef CONFIG_SMP
> @@ -502,6 +494,27 @@ DECLARE_PER_CPU(struct rq, runqueues);
> #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
> #define raw_rq() (&__raw_get_cpu_var(runqueues))
>
> +#ifdef CONFIG_SCHED_NUMA
> +static inline struct list_head *offnode_tasks(struct rq *rq)
> +{
> + return &rq->offnode_tasks;
> +}
> +
> +static inline void task_numa_free(struct task_struct *p)
> +{
> + kfree(p->numa_faults);
> +}
> +#else /* CONFIG_SCHED_NUMA */
> +static inline struct list_head *offnode_tasks(struct rq *rq)
> +{
> + return NULL;
> +}
> +
> +static inline void task_numa_free(struct task_struct *p)
> +{
> +}
> +#endif /* CONFIG_SCHED_NUMA */
> +
> #ifdef CONFIG_SMP
>
> #define rcu_dereference_check_sched_domain(p) \
> Index: tip/kernel/sysctl.c
> ===================================================================
> --- tip.orig/kernel/sysctl.c
> +++ tip/kernel/sysctl.c
> @@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 10
> static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
> static int min_wakeup_granularity_ns; /* 0 usecs */
> static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
> +#ifdef CONFIG_SMP
> static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
> static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
> -#endif
> +#endif /* CONFIG_SMP */
> +#endif /* CONFIG_SCHED_DEBUG */
>
> #ifdef CONFIG_COMPACTION
> static int min_extfrag_threshold;
> @@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
> .extra1 = &min_wakeup_granularity_ns,
> .extra2 = &max_wakeup_granularity_ns,
> },
> +#ifdef CONFIG_SMP
> {
> .procname = "sched_tunable_scaling",
> .data = &sysctl_sched_tunable_scaling,
> @@ -347,7 +350,31 @@ static struct ctl_table kern_table[] = {
> .extra1 = &zero,
> .extra2 = &one,
> },
> -#endif
> +#endif /* CONFIG_SMP */
> +#ifdef CONFIG_SCHED_NUMA
> + {
> + .procname = "sched_numa_scan_period_min_ms",
> + .data = &sysctl_sched_numa_scan_period_min,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> + {
> + .procname = "sched_numa_scan_period_max_ms",
> + .data = &sysctl_sched_numa_scan_period_max,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> + {
> + .procname = "sched_numa_settle_count",
> + .data = &sysctl_sched_numa_settle_count,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> +#endif /* CONFIG_SCHED_NUMA */
> +#endif /* CONFIG_SCHED_DEBUG */
> {
> .procname = "sched_rt_period_us",
> .data = &sysctl_sched_rt_period,
> Index: tip/mm/huge_memory.c
> ===================================================================
> --- tip.orig/mm/huge_memory.c
> +++ tip/mm/huge_memory.c
> @@ -774,9 +774,10 @@ fixup:
>
> unlock:
> spin_unlock(&mm->page_table_lock);
> - if (page)
> + if (page) {
> + task_numa_fault(page_to_nid(page), HPAGE_PMD_NR);
> put_page(page);
> -
> + }
> return;
>
> migrate:
> @@ -845,6 +846,8 @@ migrate:
>
> put_page(page); /* Drop the rmap reference */
>
> + task_numa_fault(node, HPAGE_PMD_NR);
> +
> if (lru)
> put_page(page); /* drop the LRU isolation reference */
>
> Index: tip/mm/memory.c
> ===================================================================
> --- tip.orig/mm/memory.c
> +++ tip/mm/memory.c
> @@ -3512,8 +3512,10 @@ out_pte_upgrade_unlock:
> out_unlock:
> pte_unmap_unlock(ptep, ptl);
> out:
> - if (page)
> + if (page) {
> + task_numa_fault(page_nid, 1);
> put_page(page);
> + }
>
> return 0;
>
>
>
--
Mel Gorman
SUSE Labs