2003-05-27 08:16:15

by Erich Focht

Subject: Node affine NUMA scheduler extension

This patch is an adaptation of the earlier work on the node affine
NUMA scheduler to the NUMA features meanwhile integrated into
2.5. Compared to the patch posted for 2.5.39 this one is much simpler
and easier to understand.

The main idea is (still) that tasks are assigned a homenode to which
they are preferentially scheduled. They not only stick as much as
possible to one node (as in the current 2.5 NUMA scheduler) but are
also attracted back to their homenode if they had to be scheduled
away. Therefore the tasks can be called "affine" to their homenode.

The implementation is straightforward (a condensed sketch follows the list):
- Tasks have an additional element in their task structure (node).
- The scheduler keeps track of the homenodes of the tasks running in
each node and on each runqueue.
- At cross-node load balance time, nodes/runqueues which run tasks
originating from the stealer node are preferred: they get a weight
bonus for each task whose homenode is the stealer.
- When stealing from a remote node, one tries to take back one's own
tasks (if any) or tasks from other nodes (if any). This way tasks are
kept on their homenode as long as possible.
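
In condensed form (extracted from the patch below, not compilable on
its own), the per-node bookkeeping and the resulting load bonus are:

/* For every node, count how many tasks of each homenode run on it. */
static atomic_t node_nr_homenode[MAX_NUMNODES][MAX_NUMNODES];

/* find_busiest_node(): a remote node looks heavier -- and is therefore
 * preferred as a steal target -- for every task it runs whose homenode
 * is this_node. */
load = (this_rq()->prev_node_load[i] >> 1)
        + atomic_read(&node_nr_running[i])
        + atomic_read(&node_nr_homenode[i][this_node]);

/* load_balance() additionally calls numa_should_migrate(), which only
 * lets a task cross nodes if its homenode is the stealing node, if the
 * source runqueue runs nothing but its own node's tasks, or if the
 * task itself is a remote one on that runqueue. */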

The selection of the homenode is currently done at initial load
balancing, i.e. at exec(). A smarter selection method might be needed
to improve the situation for multithreaded processes. One option is
the dynamic_homenode patch I posted for 2.5.39, or some other scheme
based on an RSS/node measure. But that's another story...

A node affine NUMA scheduler based on these principles has been running
very successfully in production on NEC TX7 machines for a year. The
current patch was tested on a 32-CPU Itanium 2 TX7 machine.

Curious about comments...
Erich

-----------------8<----- node_affine_sched-2.5.70-23.diff ---------
diff -urN a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h 2003-05-27 03:00:23.000000000 +0200
+++ b/include/linux/sched.h 2003-05-27 09:42:05.000000000 +0200
@@ -340,6 +340,9 @@
unsigned long policy;
unsigned long cpus_allowed;
unsigned int time_slice, first_time_slice;
+#ifdef CONFIG_NUMA
+ int node; /* homenode: task will be preferentially scheduled on it */
+#endif

struct list_head tasks;
struct list_head ptrace_children;
@@ -489,9 +492,11 @@
#ifdef CONFIG_NUMA
extern void sched_balance_exec(void);
extern void node_nr_running_init(void);
+extern void set_task_node(task_t *p, int node);
#else
#define sched_balance_exec() {}
#define node_nr_running_init() {}
+#define set_task_node(p,n) {}
#endif

extern void set_user_nice(task_t *p, long nice);
diff -urN a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c 2003-05-27 03:00:46.000000000 +0200
+++ b/kernel/sched.c 2003-05-27 09:42:05.000000000 +0200
@@ -164,6 +164,8 @@
#ifdef CONFIG_NUMA
atomic_t *node_nr_running;
int prev_node_load[MAX_NUMNODES];
+ int nr_homenode[MAX_NUMNODES]; /* nr of tasks per homenode in this rq */
+ atomic_t *node_nremote; /* tasks per homenode stats for this node */
#endif
task_t *migration_thread;
struct list_head migration_queue;
@@ -197,36 +199,79 @@
static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
{[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};

+/*
+ * Keep track of tasks from other homenodes.
+ */
+static atomic_t node_nr_homenode[MAX_NUMNODES][MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
+ {[0 ... MAX_NUMNODES-1][0 ... MAX_NUMNODES-1] = ATOMIC_INIT(0)};
+
+
static inline void nr_running_init(struct runqueue *rq)
{
rq->node_nr_running = &node_nr_running[0];
+ rq->node_nremote = &node_nr_homenode[0][0];
}

-static inline void nr_running_inc(runqueue_t *rq)
+static inline void nr_running_inc(runqueue_t *rq, int node)
{
atomic_inc(rq->node_nr_running);
rq->nr_running++;
+ rq->nr_homenode[node]++;
+ atomic_inc(&rq->node_nremote[node]);
}

-static inline void nr_running_dec(runqueue_t *rq)
+static inline void nr_running_dec(runqueue_t *rq, int node)
{
atomic_dec(rq->node_nr_running);
rq->nr_running--;
+ rq->nr_homenode[node]--;
+ atomic_dec(&rq->node_nremote[node]);
}

__init void node_nr_running_init(void)
{
int i;

- for (i = 0; i < NR_CPUS; i++)
+ for (i = 0; i < NR_CPUS; i++) {
cpu_rq(i)->node_nr_running = &node_nr_running[cpu_to_node(i)];
+ cpu_rq(i)->node_nremote = &node_nr_homenode[cpu_to_node(i)][0];
+ }
+}
+
+#define homenode(p) ((p)->node)
+#define homenode_set(p,n) (p)->node = (n)
+
+/*
+ * Allow stealing a task from another CPU if:
+ * - the CPU is in the same node or
+ * - the task has its homenode on this_node
+ * - CPU runs only own tasks
+ * - the CPU has remote tasks and the task is from another node.
+ * Tasks will tend to return to their homenode and a runqueue will keep
+ * tasks belonging to its node as long as it has remote tasks running. [EF]
+ */
+static inline int numa_should_migrate(task_t *p, runqueue_t *rq, int this_cpu)
+{
+ int src_cpu = rq - &runqueues[0];
+ int this_node = cpu_to_node(this_cpu);
+ int src_node = cpu_to_node(src_cpu);
+
+ if ((src_node == this_node) || /* same node */
+ (homenode(p) == this_node) || /* task is from this_node */
+ (rq->nr_running == rq->nr_homenode[src_node]) ||
+ ((rq->nr_running > rq->nr_homenode[src_node]) &&
+ (homenode(p) != src_node)))
+ return 1;
+ return 0;
}

#else /* !CONFIG_NUMA */

# define nr_running_init(rq) do { } while (0)
-# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0)
-# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0)
+# define nr_running_inc(rq,n) do { (rq)->nr_running++; } while (0)
+# define nr_running_dec(rq,n) do { (rq)->nr_running--; } while (0)
+# define homenode_set(p,n) do { } while (0)
+# define numa_should_migrate(p,q,c) (1)

#endif /* CONFIG_NUMA */

@@ -331,7 +376,7 @@
static inline void __activate_task(task_t *p, runqueue_t *rq)
{
enqueue_task(p, rq->active);
- nr_running_inc(rq);
+ nr_running_inc(rq, homenode(p));
}

/*
@@ -378,7 +423,7 @@
*/
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
- nr_running_dec(rq);
+ nr_running_dec(rq, homenode(p));
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
dequeue_task(p, p->array);
@@ -557,7 +602,7 @@
list_add_tail(&p->run_list, &current->run_list);
p->array = current->array;
p->array->nr_active++;
- nr_running_inc(rq);
+ nr_running_inc(rq, homenode(p));
}
task_rq_unlock(rq, &flags);
}
@@ -765,6 +810,21 @@
set_cpus_allowed(p, old_mask);
}

+void set_task_node(task_t *p, int node)
+{
+ runqueue_t *rq;
+ unsigned long flags;
+
+ if (node < 0 || node >= numnodes) return;
+ rq = task_rq_lock(p, &flags);
+ if (p->array) {
+ nr_running_dec(rq, homenode(p));
+ nr_running_inc(rq, node);
+ }
+ homenode_set(p,node);
+ task_rq_unlock(rq, &flags);
+}
+
/*
* Find the least loaded CPU. Slightly favor the current CPU by
* setting its runqueue length as the minimum to start.
@@ -806,8 +866,11 @@

if (numnodes > 1) {
new_cpu = sched_best_cpu(current);
- if (new_cpu != smp_processor_id())
+ if (new_cpu != smp_processor_id()) {
+ if (cpu_to_node(new_cpu) != homenode(current))
+ set_task_node(current, cpu_to_node(new_cpu));
sched_migrate_task(current, new_cpu);
+ }
}
}

@@ -816,6 +879,9 @@
* geometrically deccaying weight to the load measure:
* load_{t} = load_{t-1}/2 + nr_node_running_{t}
* This way sudden load peaks are flattened out a bit.
+ * Load measure of remote nodes is increased if they run tasks from this_node,
+ * thus such nodes are preferred as steal candidates and own tasks have more
+ * chances to return "home". [EF]
*/
static int find_busiest_node(int this_node)
{
@@ -828,7 +894,8 @@
if (i == this_node)
continue;
load = (this_rq()->prev_node_load[i] >> 1)
- + atomic_read(&node_nr_running[i]);
+ + atomic_read(&node_nr_running[i])
+ + atomic_read(&node_nr_homenode[i][this_node]);
this_rq()->prev_node_load[i] = load;
if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
maxload = load;
@@ -872,7 +939,7 @@
*/
static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, unsigned long cpumask)
{
- int nr_running, load, max_load, i;
+ int nr_running, load, max_load, i, this_node;
runqueue_t *busiest, *rq_src;

/*
@@ -903,6 +970,7 @@
nr_running = this_rq->prev_cpu_load[this_cpu];

busiest = NULL;
+ this_node = cpu_to_node(this_cpu);
max_load = 1;
for (i = 0; i < NR_CPUS; i++) {
if (!((1UL << i) & cpumask))
@@ -913,6 +981,14 @@
load = rq_src->nr_running;
else
load = this_rq->prev_cpu_load[i];
+#ifdef CONFIG_NUMA
+ /*
+ * Add relative load bonus if CPUs in remote node run tasks
+ * with homenode == this_node such that these CPUs are preferred.
+ */
+ if (this_node != cpu_to_node(i))
+ load += rq_src->nr_homenode[this_node];
+#endif
this_rq->prev_cpu_load[i] = rq_src->nr_running;

if ((load > max_load) && (rq_src != this_rq)) {
@@ -952,9 +1028,9 @@
static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
{
dequeue_task(p, src_array);
- nr_running_dec(src_rq);
+ nr_running_dec(src_rq, homenode(p));
set_task_cpu(p, this_cpu);
- nr_running_inc(this_rq);
+ nr_running_inc(this_rq,homenode(p));
enqueue_task(p, this_rq->active);
/*
* Note that idle threads have a prio of MAX_PRIO, for this test
@@ -1035,7 +1111,8 @@

curr = curr->prev;

- if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+ if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)
+ || !numa_should_migrate(tmp, busiest, this_cpu)) {
if (curr != head)
goto skip_queue;
idx++;
@@ -2225,6 +2302,7 @@
idle->prio = MAX_PRIO;
idle->state = TASK_RUNNING;
set_task_cpu(idle, cpu);
+ homenode_set(idle, cpu_to_node(cpu));
double_rq_unlock(idle_rq, rq);
set_tsk_need_resched(idle);
local_irq_restore(flags);



2003-05-27 08:57:52

by Andi Kleen

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

On Tue, May 27, 2003 at 10:31:55AM +0200, Erich Focht wrote:
> This patch is an adaptation of the earlier work on the node affine
> NUMA scheduler to the NUMA features meanwhile integrated into
> 2.5. Compared to the patch posted for 2.5.39 this one is much simpler
> and easier to understand.

Yes, it is also much simpler than my implementation (I did a similar homenode
scheduler for a 2.4 kernel). The basic principles are the same.

But the main problem I have is that the tuning for threads is very
difficult. On AMD64, where node equals CPU, it is important to
home-node balance threads too. After some experiments I settled on
homenode assignment on the first load balance (called "lazy homenode").
When a thread clones, it initially executes on the CPU of the parent,
but there is a window until the first load balance tick in which it can
allocate memory on the wrong node. I found a lot of code runs very
badly until the cache decay parameter is set to 0 (no special cache
affinity) to allow quick initial migration. Migration directly on
fork/clone requires a lot of changes and also breaks down on some
benchmarks.

-Andi

2003-05-27 09:39:10

by Erich Focht

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

On Tuesday 27 May 2003 11:11, Andi Kleen wrote:
> On Tue, May 27, 2003 at 10:31:55AM +0200, Erich Focht wrote:
> > This patch is an adaptation of the earlier work on the node affine
> > NUMA scheduler to the NUMA features meanwhile integrated into
> > 2.5. Compared to the patch posted for 2.5.39 this one is much simpler
> > and easier to understand.
>
> Yes, it is also much simpler than my implementation (I did a similar
> homenode scheduler for a 2.4 kernel). The basic principles are the same.

:-)

> But the main problem I have is that the tuning for threads is very
> difficult. On AMD64 where Node equals CPU it is important
> to home node balance threads too. After some experiments I settled on
> homenode assignment on the first load balance (called "lazy homenode")
> When a thread clones it initially executes on the CPU of the parent, but
> there is a window until the first load balance tick where it can allocate
> memory on the wrong node. I found a lot of code runs very badly until the
> cache decay parameter is set to 0 (no special cache affinity) to allow
> quick initial migration.

Interesting observation, I didn't make it when I tried the lazy
homenode (quite a while ago). But I was focusing on MPI jobs. So what
if we add a condition to CAN_MIGRATE which disables the cache affinity
before the first load balance?

> Migration directly on fork/clone requires a lot
> of changes and also breaks down on some benchmarks.

Hmmm, I wouldn't allow this for every task/child, only for special
ones. Under 2.4 I currently use a sched_balance_fork() function
similar to sched_balance_exec(). Tasks have a default initial load
balancing policy of being migrated (and selecting the homenode) at
exec(). This can be changed (with prctl) to fork(). The ilb policy is
inheritable. Works fine for OpenMP jobs.
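
Roughly, the plumbing looks like this (sketch only: the ilb_policy
field, the ILB_* values and the prctl option are made-up names for
illustration, they are not in the posted patch):

#define ILB_EXEC        0       /* choose the homenode at exec() (default) */
#define ILB_FORK        1       /* choose the homenode already at fork()   */

/* task_struct grows an "int ilb_policy" which is simply copied in
 * do_fork(), so children inherit the policy of the parent. */

/* in sys_prctl(), a hypothetical option to switch the policy: */
        case PR_SET_ILB_POLICY:
                if (arg2 != ILB_EXEC && arg2 != ILB_FORK)
                        return -EINVAL;
                current->ilb_policy = arg2;
                break;

/* in do_fork(): tasks with ILB_FORK go through sched_balance_fork(),
 * which picks the homenode the same way sched_balance_exec() does. */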

Regards,
Erich


2003-05-27 09:48:37

by Andi Kleen

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

On Tue, May 27, 2003 at 11:54:52AM +0200, Erich Focht wrote:
> > But the main problem I have is that the tuning for threads is very
> > difficult. On AMD64 where Node equals CPU it is important
> > to home node balance threads too. After some experiments I settled on
> > homenode assignment on the first load balance (called "lazy homenode")
> > When a thread clones it initially executes on the CPU of the parent, but
> > there is a window until the first load balance tick where it can allocate
> > memory on the wrong node. I found a lot of code runs very badly until the
> > cache decay parameter is set to 0 (no special cache affinity) to allow
> > quick initial migration.
>
> Interesting observation, I didn't make it when I tried the lazy
> homenode (quite a while ago). But I was focusing on MPI jobs. So what
> if we add a condition to CAN_MIGRATE which disables the cache affinity
> before the first load balance?

What I currently have is two cache decay variables: one is used if the
homenode is not assigned, the other one otherwise. Both are sysctls, too.
It obviously only works with the lazy homenode, but the state is the same.
I'm still not completely happy with it, though.
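
Something like the following, as a sketch (the HOMENODE_UNSET() test
and the two sysctl names are assumptions, not code from either patch):

static inline int task_cache_hot(task_t *p)
{
        /* freshly cloned tasks without a homenode get a (much) shorter
         * decay period, so they can be stolen -- and assigned a
         * homenode -- almost immediately */
        unsigned long decay = HOMENODE_UNSET(p) ? sysctl_decay_no_homenode
                                                : sysctl_decay_homenode;

        return jiffies - p->last_run < decay;
}

CAN_MIGRATE_TASK() would then test !task_cache_hot(p) instead of
comparing against the single cache_decay_ticks.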

Why exactly did you give up on the lazy homenode?

>
> > Migration directly on fork/clone requires a lot
> > of changes and also breaks down on some benchmarks.
>
> Hmmm, I wouldn't allow this to any task/child, only to special
> ones. Under 2.4 I currently use a sched_balance_fork() function

Yes, I agree.

> similar to sched_balance_exec(). Tasks have a default initial load
> balancing policy of being migrated (and selecting the homenode) at
> exec(). This can be changed (with prctl) to fork(). The ilb policy is
> inheritable. Works fine for OpenMP jobs.

Hmm, I should try that I guess. Where do you call it? At the end of do_fork?
I tried to hack up wake_up_forked_process() to do it, but it required
large scale changes all over the scheduler so I eventually gave up.

-Andi

2003-05-27 11:23:45

by Erich Focht

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

On Tuesday 27 May 2003 12:01, Andi Kleen wrote:
> > > allocate memory on the wrong node. I found a lot of code runs very
> > > badly until the cache decay parameter is set to 0 (no special cache
> > > affinity) to allow quick initial migration.
> >
> > Interesting observation, I didn't make it when I tried the lazy
> > homenode (quite a while ago). But I was focusing on MPI jobs. So what
> > if we add a condition to CAN_MIGRATE which disables the cache affinity
> > before the first load balance?
>
> What I currently have is two cache decay variables: one is used if the
> homenode is not assigned, the other otherwise. Both are sysctls too.
> But it obviously only works with lazy homenode, but the state is the same.
> I'm still not completely happy with it though.
>
> Why exactly did you give up on the lazy homenode?

I let the load balancer decide where the homenode should be, and
the decision wasn't good enough (it led to unbalanced nodes; that's
certainly not the case on Opteron :-). So it made sense to have a
specialized homenode chooser which considered the node loads instead
of changing the normal load balancer.
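
For illustration, such a chooser can be as simple as picking the least
loaded node from the per-node counters (a sketch only, not code from
the posted patch):

static int sched_best_node(void)
{
        int i, best = 0;
        int minload = atomic_read(&node_nr_running[0]);

        /* pick the node currently running the fewest tasks */
        for (i = 1; i < numnodes; i++) {
                int load = atomic_read(&node_nr_running[i]);
                if (load < minload) {
                        minload = load;
                        best = i;
                }
        }
        return best;
}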

BTW: do you assign the homenode at first load balancing or at first
cross-node balancing?

> > similar to sched_balance_exec(). Tasks have a default initial load
> > balancing policy of being migrated (and selecting the homenode) at
> > exec(). This can be changed (with prctl) to fork(). The ilb policy is
> > inheritable. Works fine for OpenMP jobs.
>
> Hmm, I should try that I guess. Where do you call it? At the end of
> do_fork? I tried to hack up wake_up_forked_process() to do it, but it
> required large scale changes all over the scheduler so I eventually gave
> up.

For the processes with ilb_policy=FORK I choose the homenode and set
the cpu before the task is woken up. It is done in do_fork right after
the task structure has been cloned. wake_up_forked_process was changed
to lock the runqueue of the child (and not the one of current) and to
not set the cpu any more. I don't think there was more to it...
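
In rough pseudo-kernel code (a sketch of what is described above;
ilb_policy/ILB_FORK are invented names, sched_best_cpu() is the
existing least-loaded-CPU chooser):

        /* in do_fork(), right after the task structure has been copied */
        if (p->ilb_policy == ILB_FORK) {
                int new_cpu = sched_best_cpu(p);
                homenode_set(p, cpu_to_node(new_cpu));
                set_task_cpu(p, new_cpu);
        }
        /* ... later, wake_up_forked_process(p) locks the child's
         * runqueue (not current's) and no longer touches the CPU
         * assignment */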

Regards,
Erich


2003-05-27 11:27:33

by Andi Kleen

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

> I left the loadbalancer to decide on where the homenode should be. And
> the decision wasn't good enough (it led to unbalanced nodes, that's
> certainly not the case on Opteron :-). So it made sense to have a
> specialized homenode chooser which considered the node loads instead
> of changing the normal load balancer.

I have that, but only on exec (similar to what 2.5 mainline does with the
NUMA scheduler)

> BTW: do you assign the homenode at first load balancing or at first
> cross-node balancing?

It's the same on Opteron: each CPU is its own node. The lazy homenode for
fork/clone is chosen on the first load balance of the new thread.


-Andi

2003-05-27 11:34:27

by Erich Focht

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

On Tuesday 27 May 2003 13:40, Andi Kleen wrote:
> > I left the loadbalancer to decide on where the homenode should be. And
> > the decision wasn't good enough (it led to unbalanced nodes, that's
> > certainly not the case on Opteron :-). So it made sense to have a
> > specialized homenode chooser which considered the node loads instead
> > of changing the normal load balancer.
>
> I have that, but only on exec (similar to what 2.5 mainline does with the
> NUMA scheduler)

Oh, that's fine, then. It's better than just on exec().

> > BTW: do you assign the homenode at first load balancing or at first
> > cross-node balancing?
>
> It's the same on Opteron: each CPU is its own node. The lazy homenode for
> fork/clone is chosen on the first load balance of the new thread.

Ah, you're right :-)

Erich


2003-05-27 15:26:20

by Martin J. Bligh

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

> Interesting observation, I didn't make it when I tried the lazy
> homenode (quite a while ago). But I was focusing on MPI jobs. So what
> if we add a condition to CAN_MIGRATE which disables the cache affinity
> before the first load balance?
>
>> Migration directly on fork/clone requires a lot
>> of changes and also breaks down on some benchmarks.
>
> Hmmm, I wouldn't allow this to any task/child, only to special
> ones. Under 2.4 I currently use a sched_balance_fork() function
> similar to sched_balance_exec(). Tasks have a default initial load
> balancing policy of being migrated (and selecting the homenode) at
> exec(). This can be changed (with prctl) to fork(). The ilb policy is
> inheritable. Works fine for OpenMP jobs.

It'd be nice not to require user intervention here ... is it OK to
set CAN_MIGRATE for all clone operations?

M.

2003-05-27 21:13:45

by Erich Focht

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

On Tuesday 27 May 2003 17:38, Martin J. Bligh wrote:
> > Interesting observation, I didn't make it when I tried the lazy
> > homenode (quite a while ago). But I was focusing on MPI jobs. So what
> > if we add a condition to CAN_MIGRATE which disables the cache affinity
> > before the first load balance?
...
>
> It'd be nice not to require user intervention here ... is it OK to
> set CAN_MIGRATE for all clone operations?

Do you think of something like:

#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
(HOMENODE_UNSET(p) && \ //<--
(jiffies - (p)->last_run > cache_decay_ticks) && \
!task_running(rq, p) && \
((p)->cpus_allowed & (1UL << (this_cpu))))

curr = curr->prev;

if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)
|| !numa_should_migrate(tmp, busiest, this_cpu)) {
if (curr != head)
goto skip_queue;
idx++;
goto skip_bitmap;
}
if (HOMENODE_UNSET(tmp)) //<--
set_task_node(tmp,cpu_to_node(this_cpu)); //<--
pull_task(busiest, array, tmp, this_rq, this_cpu);
if (!idle && --imbalance) {
...

?
Guess this would help a bit for multithreaded jobs. Choosing the
homenode more carefully here would be pretty expensive.

Regards,
Erich


2003-05-27 21:39:13

by Martin J. Bligh

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

--Erich Focht <[email protected]> wrote (on Tuesday, May 27, 2003 23:28:27 +0200):

> On Tuesday 27 May 2003 17:38, Martin J. Bligh wrote:
>> > Interesting observation, I didn't make it when I tried the lazy
>> > homenode (quite a while ago). But I was focusing on MPI jobs. So what
>> > if we add a condition to CAN_MIGRATE which disables the cache affinity
>> > before the first load balance?
> ...
>>
>> It'd be nice not to require user intervention here ... is it OK to
>> set CAN_MIGRATE for all clone operations?
>
> Do you think of something like:
>
># define CAN_MIGRATE_TASK(p,rq,this_cpu) \
> (HOMENODE_UNSET(p) && \ //<--
> (jiffies - (p)->last_run > cache_decay_ticks) && \
> !task_running(rq, p) && \
> ((p)->cpus_allowed & (1UL << (this_cpu))))
>
> curr = curr->prev;
>
> if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)
> || !numa_should_migrate(tmp, busiest, this_cpu)) {
> if (curr != head)
> goto skip_queue;
> idx++;
> goto skip_bitmap;
> }
> if (HOMENODE_UNSET(tmp)) //<--
> set_task_node(tmp,cpu_to_node(this_cpu)); //<--
> pull_task(busiest, array, tmp, this_rq, this_cpu);
> if (!idle && --imbalance) {
> ...
>
> ?
> Guess this would help a bit for multithreaded jobs. Choosing the
> homenode more carefully here would be pretty expensive.

My first instinct is that #define'ing CAN_MIGRATE_TASK in the midst
of a function needs to die a horrible death ;-) But you didn't start
that one, so other than that ...

My instinct would tell me the first expression should be ||, not &&
but I'm not 100% sure. And is this restricted to just clones? Doesn't
seem to be, unless that's implicit in homenode_unset?

M.



2003-05-28 16:46:52

by Erich Focht

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

On Tuesday 27 May 2003 23:51, Martin J. Bligh wrote:
> > Do you think of something like:
> >
> ># define CAN_MIGRATE_TASK(p,rq,this_cpu) \
> > (HOMENODE_UNSET(p) && \ //<--
> > (jiffies - (p)->last_run > cache_decay_ticks) && \
> > !task_running(rq, p) && \
> > ((p)->cpus_allowed & (1UL << (this_cpu))))
> >
...
>
> My instinct would tell me the first expression should be ||, not &&
> but I'm not 100% sure.

You're right.

> And is this restricted to just clones? Doesn't
> seem to be, unless that's implicit in homenode_unset?

Hmmm... That's actually the difficult issue with fork vs. clone and
migrating or not. When you clone a small task, you'd usually like to
keep it on the same node. If it's a big task that faults in a lot of
memory, it could make sense to let it migrate to another node. For
forked tasks it's the same: you might have some more COW memory
movement, but after that you'll usually be happy if the child went away
from the parent's node, provided it is a long-running child. Not so for
a short-running child. The current compromise:
- start children (no matter whether forked or cloned) on the same CPU
- allow other CPUs to steal freshly forked children by reducing the
cache affinity until the first steal/migration
- ok with children who exec soon: they might exec before anyone can
steal them, drop all the pages and find a new life on the optimal node

This way exec'd tasks get properly balanced, and forked/cloned ones get a
better start than in the current scheduler. I'll post a patch on top
of the node affine scheduler doing this in 1-2 days.

Regards,
Erich


2003-05-28 17:00:29

by Martin J. Bligh

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

>> And is this restricted to just clones? Doesn't
>> seem to be, unless that's implicit in homenode_unset?
>
> Hmmm... That's actually the difficult issue with fork vs. clone and
> migrating or not. When you clone a small task, you'd usually like to
> keep it on the same node. If it's a big task and fault-in a lot of
> memory, it could make sense to let it migrate to another node. For
> forked tasks it's the same, you might have some more COW memory
> movement but after that you'll usually be happy if the child went away
> from the parent's node if it is a long-running child. Not so for a
> short running child.

Can you define what you mean by big vs small? I presume you mean RSS?
There are several factors that come into play here, at least:

1. RSS (and which bits of this lie on which node)
2. CPU utilisation of the task
3. Task duration
4. Cache warmth
5. the current balance situation.

I'm not convinced there's anything we can do about 3, since it's mostly
unpredictable, and I don't have crystal_ball.o ... we can kind of predict
2, at least for running tasks (not sure it makes much sense over fork).
For 1, we can kind of use that over a fork, though as we have no idea
whether it'll exec or not, I'm not sure it's much use *unless* we're
cloning.

> The current compromise:
> - start children (no matter whether forked or cloned) on the same CPU
> - allow other CPUs to steal freshly forked children by reducing the
> cache affinity until the first steal/migration
> - ok with children who exec soon, they might exec before anyone can
> steal them, drop all the pages and find a new life on the optimal node
>
> This way exec'd tasks get properly balanced, forked/cloned ones get a
> better start than in the current scheduler. I'll post a patch on top
> of the node affine scheduler doing this in 1-2 days.

Turning on child-runs-first would instinctively be good as well, though
I have no benchmarks to prove it ... would be nice to get the load
balance situation fixed asap by running the exec. That's not on right
now, IIRC.

M.

2003-05-28 18:01:53

by Rick Lindsley

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

Can you define what you mean by big vs small? I presume you mean RSS?
There are several factors that come into play here, at least:

1. RSS (and which bits of this lie on which node)
2. CPU utilisation of the task
3. Task duration
4. Cache warmth
5. the current balance situation.

Along the same lines, would it make sense to *permit* imbalances for some
classes of tasks? It may be worth it, for example, to let three threads
that share a lot of data saturate one cpu, because what they lose to
their self-competition they win back from the extremely warm cache.

So you leave cpu0 at 7 tasks even though cpu1 only has 4, because the 7 are
"related" and the 4 are "dissimilar"? The equation changes dramatically,
perhaps, once there is an idle cpu, but if everything is busy, does it make
sense to weight the items in the runqueues in any way?

Rick

2003-05-28 19:19:17

by Martin J. Bligh

Subject: Re: [Lse-tech] Node affine NUMA scheduler extension

> Can you define what you mean by big vs small? I presume you mean RSS?
> There are several factors that come into play here, at least:
>
> 1. RSS (and which bits of this lie on which node)
> 2. CPU utilisation of the task
> 3. Task duration
> 4. Cache warmth
> 5. the current balance situation.
>
> Along the same lines, would it make sense to *permit* imbalances for some
> classes of tasks? It may be worth it, for example, to let three threads
> sharing a lot of data to saturate one cpu because what they lose from
> their self-competition is saved from the extremely warm cache.
>
> So you leave cpu0 at 7 tasks even though cpu1 only has 4, because the 7 are
> "related" and the 4 are "dissimilar"? The equation changes dramatically,
> perhaps, once there is an idle cpu, but if everything is busy does it make
> sense to weight the items in the runqueues in any way?

It'd make sense ... but I think it would be a bitch to implement ;-)
How do you know when it's worth it?

M.