Date: 2023-12-16 00:19:40
From: Chris Hyser

Subject: [RFC/POC 1/2] sched/numa: Add ability to override a task's numa_preferred_nid.

EXPERIMENTAL - NOT INTENDED FOR SUBMISSION

This "patch" is a proof of concept for over-riding a tasks "Preferred Node
Affinity". It is not intended for submission, but simply to show the code
used in generating the included results.
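
For context, the companion interface patch (2/2) is expected to install the
override along the lines of the sketch below. The helper name and the
node_online() validation are illustrative assumptions made here, not part of
this diff; the only point being shown is that numa_preferred_nid_force is set
once by an external interface and then merely consulted (never cleared) by
the balancing paths, so writing NUMA_NO_NODE restores the normal fault-driven
behaviour.

/*
 * Illustrative sketch only (assumed helper, kernel/sched context): install
 * or clear the override and push it into the scheduler right away through
 * the existing sched_setnuma() path, which this patch already hooks.
 */
static int task_set_preferred_nid_force(struct task_struct *p, int nid)
{
	if (nid != NUMA_NO_NODE && !node_online(nid))
		return -EINVAL;

	p->numa_preferred_nid_force = nid;

	/* sched_setnuma() takes the task's rq lock itself. */
	sched_setnuma(p, nid != NUMA_NO_NODE ? nid : p->numa_preferred_nid);
	return 0;
}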

Signed-off-by: Chris Hyser <[email protected]>
---
 include/linux/sched.h |  1 +
 init/init_task.c      |  1 +
 kernel/sched/core.c   |  5 ++++-
 kernel/sched/debug.c  |  1 +
 kernel/sched/fair.c   | 17 ++++++++++++++---
 5 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d258162deb0..6e7290468fa5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1279,6 +1279,7 @@ struct task_struct {
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
+ int numa_preferred_nid_force;
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
diff --git a/init/init_task.c b/init/init_task.c
index 5727d42149c3..a1797037af7e 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -174,6 +174,7 @@ struct task_struct init_task
.vtime.state = VTIME_SYS,
#endif
#ifdef CONFIG_NUMA_BALANCING
+ .numa_preferred_nid_force = NUMA_NO_NODE,
.numa_preferred_nid = NUMA_NO_NODE,
.numa_group = NULL,
.numa_faults = NULL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index db4be4921e7f..1444dd0207aa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9380,7 +9380,10 @@ void sched_setnuma(struct task_struct *p, int nid)
if (running)
put_prev_task(rq, p);

- p->numa_preferred_nid = nid;
+ if (p->numa_preferred_nid_force != NUMA_NO_NODE)
+ p->numa_preferred_nid = p->numa_preferred_nid_force;
+ else
+ p->numa_preferred_nid = nid;

if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 168eecc209b4..fecf529c9dc7 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -978,6 +978,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
P(mm->numa_scan_seq);

P(numa_pages_migrated);
+ P(numa_preferred_nid_force);
P(numa_preferred_nid);
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcea3d55d95d..988b3285f40c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2541,9 +2541,14 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;

/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE))
return;

+ if (p->numa_preferred_nid_force == NUMA_NO_NODE) {
+ if (unlikely(!p->numa_faults))
+ return;
+ }
+
/* Periodically retry migrating the task to the preferred node */
interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
p->numa_migrate_retry = jiffies + interval;
@@ -3462,7 +3467,10 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)

/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = NUMA_NO_NODE;
+ if (p->numa_preferred_nid_force == NUMA_NO_NODE)
+ p->numa_preferred_nid = NUMA_NO_NODE;
+ else
+ p->numa_preferred_nid = p->numa_preferred_nid_force;
return;
}

@@ -8828,7 +8836,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (!static_branch_likely(&sched_numa_balancing))
return -1;

- if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+ if (p->numa_preferred_nid_force == NUMA_NO_NODE && !p->numa_faults)
+ return -1;
+
+ if (!(env->sd->flags & SD_NUMA))
return -1;

src_nid = cpu_to_node(env->src_cpu);
--
2.39.3