This patch adds a new API, PR_NUMA_BALANCING, in prctl.
A large number of page faults will cause performance loss when numa balancing
is performing. Thus those processes which care about worst-case performance
need numa balancing disabled. Others, on the contrary, allow a temporary
performance loss in exchange for higher average performance, so enabling numa
balancing is better for them.
Numa balancing can only be controlled globally by /proc/sys/kernel/numa_balancing.
Due to the above case, we want to disable/enable numa_balancing per-process
instead.
Add numa_balancing under mm_struct. Then use it in task_tick_numa.
Disable/enable per-process numa balancing:
prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1);
Get numa_balancing state:
prctl(PR_NUMA_BALANCING, PR_GET_NUMA_BALANCING, &ret);
cat /proc/<pid>/status | grep NumaBalancing_enabled
mm->numa_balancing only works when global numa_balancing is enabled.
When the global numa_balancing is disabled, mm->numa_balancing will not
change, but reading the per-process numa_balancing state will always
return 0, and the kernel will return an error when you try to set it
via prctl.
Signed-off-by: Gang Li <[email protected]>
---
Documentation/filesystems/proc.rst | 2 ++
fs/proc/task_mmu.c | 16 ++++++++++++
include/linux/mm_types.h | 3 +++
include/uapi/linux/prctl.h | 5 ++++
kernel/fork.c | 3 +++
kernel/sched/fair.c | 3 +++
kernel/sys.c | 39 ++++++++++++++++++++++++++++++
7 files changed, 71 insertions(+)
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 8d7f141c6fc7..b90f43ed0668 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -192,6 +192,7 @@ read the file /proc/PID/status::
VmLib: 1412 kB
VmPTE: 20 kb
VmSwap: 0 kB
+ NumaBalancing_enabled: 1
HugetlbPages: 0 kB
CoreDumping: 0
THP_enabled: 1
@@ -273,6 +274,7 @@ It's slow but very precise.
VmPTE size of page table entries
VmSwap amount of swap used by anonymous private data
(shmem swap usage is not included)
+ NumaBalancing_enabled numa balancing state, use prctl(PR_NUMA_BALANCING, ...)
HugetlbPages size of hugetlb memory portions
CoreDumping process's memory is currently being dumped
(killing the process may lead to a corrupted core)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ad667dbc96f5..161295e027d2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,7 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/sched/numa_balancing.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -27,14 +28,23 @@
#define SEQ_PUT_DEC(str, val) \
seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
+
+DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
+
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
unsigned long text, lib, swap, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+#ifdef CONFIG_NUMA_BALANCING
+ int numa_balancing;
+#endif
anon = get_mm_counter(mm, MM_ANONPAGES);
file = get_mm_counter(mm, MM_FILEPAGES);
shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+#ifdef CONFIG_NUMA_BALANCING
+ numa_balancing = READ_ONCE(mm->numa_balancing);
+#endif
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -75,6 +85,12 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
" kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
seq_puts(m, " kB\n");
+#ifdef CONFIG_NUMA_BALANCING
+ if (!static_branch_unlikely(&sched_numa_balancing))
+ numa_balancing = 0;
+
+ seq_printf(m, "NumaBalancing_enabled:\t%d\n", numa_balancing);
+#endif
hugetlb_report_usage(m, mm);
}
#undef SEQ_PUT_DEC
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bb8c6f5f19bc..feeb6f639f87 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -612,6 +612,9 @@ struct mm_struct {
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
+
+ /* numa_balancing control the numa balancing of this mm */
+ int numa_balancing;
#endif
/*
* An operation with batched TLB flushing is going on. Anything
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b2e4dc1449b9..2235b75efd30 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -272,4 +272,9 @@ struct prctl_mm_map {
# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1
# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2
+/* Set/get enabled per-process numa_balancing */
+#define PR_NUMA_BALANCING 63
+# define PR_SET_NUMA_BALANCING 0
+# define PR_GET_NUMA_BALANCING 1
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 2079f1ebfe63..39e9d5daf00a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1110,6 +1110,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ mm->numa_balancing = 1;
#endif
mm_init_uprobes_state(mm);
hugetlb_count_init(mm);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 87db481e8a56..1325253e3613 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2866,6 +2866,9 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
+ if (!READ_ONCE(curr->mm->numa_balancing))
+ return;
+
/*
* Using runtime rather than walltime has the dual advantage that
* we (mostly) drive the selection from busy threads and that the
diff --git a/kernel/sys.c b/kernel/sys.c
index 8fdac0d90504..64aee3d63ea8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -154,6 +154,8 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);
+DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
+
/*
* Returns true if current's euid is same as p's uid or euid,
* or has CAP_SYS_NICE to p's user_ns.
@@ -2081,6 +2083,28 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
return 0;
}
+#ifdef CONFIG_NUMA_BALANCING
+static int prctl_pid_numa_balancing_write(int numa_balancing)
+{
+ if (!static_branch_unlikely(&sched_numa_balancing))
+ return -EPERM;
+
+ if (numa_balancing != 0 && numa_balancing != 1)
+ return -EINVAL;
+
+ WRITE_ONCE(current->mm->numa_balancing, numa_balancing);
+ return 0;
+}
+
+static int prctl_pid_numa_balancing_read(void)
+{
+ if (!static_branch_unlikely(&sched_numa_balancing))
+ return 0;
+ else
+ return READ_ONCE(current->mm->numa_balancing);
+}
+#endif
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
@@ -2525,6 +2549,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = set_syscall_user_dispatch(arg2, arg3, arg4,
(char __user *) arg5);
break;
+#ifdef CONFIG_NUMA_BALANCING
+ case PR_NUMA_BALANCING:
+ switch (arg2) {
+ case PR_SET_NUMA_BALANCING:
+ error = prctl_pid_numa_balancing_write((int)arg3);
+ break;
+ case PR_GET_NUMA_BALANCING:
+ put_user(prctl_pid_numa_balancing_read(), (int __user *)arg3);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+ break;
+#endif
#ifdef CONFIG_SCHED_CORE
case PR_SCHED_CORE:
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
--
2.20.1
On Wed, Oct 27, 2021 at 09:26:32PM +0800, Gang Li wrote:
> This patch add a new api PR_NUMA_BALANCING in prctl.
>
> A large number of page faults will cause performance loss when numa balancing
> is performing. Thus those processes which care about worst-case performance
> need numa balancing disabled. Others, on the contrary, allow a temporary
> performance loss in exchange for higher average performance, so enable numa
> balancing is better for them.
>
> Numa balancing can only be controlled globally by /proc/sys/kernel/numa_balancing.
> Due to the above case, we want to disable/enable numa_balancing per-process
> instead.
>
> Add numa_balancing under mm_struct. Then use it in task_tick_numa.
>
> Disable/enable per-process numa balancing:
> prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1);
> Get numa_balancing state:
> prctl(PR_NUMA_BALANCING, PR_GET_NUMA_BALANCING, &ret);
> cat /proc/<pid>/status | grep NumaBalancing_enabled
>
> mm->numa_balancing only works when global numa_balancing is enabled.
> When the global numa_balancing is diabled, mm->numa_blancing will not
> change, but you will always get 0 while you want to get process
> numa_balancing state and kernel will return err when you use prctl set
> it.
>
This would also need a prctl(2) patch.
That aside though, the configuration space could be better. It's possible
to selectively disable NUMA balance but not selectively enable because
prctl is disabled if global NUMA balancing is disabled. That could be
somewhat achieved by having a default value for mm->numa_balancing based on
whether the global numa balancing is disabled via command line or sysctl
and enabling the static branch if prctl is used with an informational
message. This is not the only potential solution but as it stands,
there are odd semantic corner cases. For example, explicit enabling
of NUMA balancing by prctl gets silently revoked if numa balancing is
disabled via sysctl and prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING,
1) means nothing.
> Signed-off-by: Gang Li <[email protected]>
> ---
> Documentation/filesystems/proc.rst | 2 ++
> fs/proc/task_mmu.c | 16 ++++++++++++
> include/linux/mm_types.h | 3 +++
> include/uapi/linux/prctl.h | 5 ++++
> kernel/fork.c | 3 +++
> kernel/sched/fair.c | 3 +++
> kernel/sys.c | 39 ++++++++++++++++++++++++++++++
> 7 files changed, 71 insertions(+)
>
> diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
> index 8d7f141c6fc7..b90f43ed0668 100644
> --- a/Documentation/filesystems/proc.rst
> +++ b/Documentation/filesystems/proc.rst
> @@ -192,6 +192,7 @@ read the file /proc/PID/status::
> VmLib: 1412 kB
> VmPTE: 20 kb
> VmSwap: 0 kB
> + NumaBalancing_enabled: 1
Bit verbose
NumaB_enabled:
> HugetlbPages: 0 kB
> CoreDumping: 0
> THP_enabled: 1
> @@ -273,6 +274,7 @@ It's slow but very precise.
> VmPTE size of page table entries
> VmSwap amount of swap used by anonymous private data
> (shmem swap usage is not included)
> + NumaBalancing_enabled numa balancing state, use prctl(PR_NUMA_BALANCING, ...)
s/use/set by/
> HugetlbPages size of hugetlb memory portions
> CoreDumping process's memory is currently being dumped
> (killing the process may lead to a corrupted core)
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index ad667dbc96f5..161295e027d2 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -19,6 +19,7 @@
> #include <linux/shmem_fs.h>
> #include <linux/uaccess.h>
> #include <linux/pkeys.h>
> +#include <linux/sched/numa_balancing.h>
>
> #include <asm/elf.h>
> #include <asm/tlb.h>
> @@ -27,14 +28,23 @@
>
> #define SEQ_PUT_DEC(str, val) \
> seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
> +
> +DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
> +
Declare in a header.
> void task_mem(struct seq_file *m, struct mm_struct *mm)
> {
> unsigned long text, lib, swap, anon, file, shmem;
> unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
> +#ifdef CONFIG_NUMA_BALANCING
> + int numa_balancing;
> +#endif
>
rename to numab_enabled, the name as-is gives little hint as to what
it means. If the prctl works even if numab is globally disabled by
default then the variable can go away.
> anon = get_mm_counter(mm, MM_ANONPAGES);
> file = get_mm_counter(mm, MM_FILEPAGES);
> shmem = get_mm_counter(mm, MM_SHMEMPAGES);
> +#ifdef CONFIG_NUMA_BALANCING
> + numa_balancing = READ_ONCE(mm->numa_balancing);
> +#endif
>
The READ_ONCE/WRITE_ONCE may be overkill given that the value is set at
fork time and doesn't change. I guess an application could be actively
calling prctl() but it would be inherently race-prone.
> /*
> * Note: to minimize their overhead, mm maintains hiwater_vm and
> @@ -75,6 +85,12 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
> " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
> SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
> seq_puts(m, " kB\n");
> +#ifdef CONFIG_NUMA_BALANCING
> + if (!static_branch_unlikely(&sched_numa_balancing))
> + numa_balancing = 0;
> +
> + seq_printf(m, "NumaBalancing_enabled:\t%d\n", numa_balancing);
> +#endif
> hugetlb_report_usage(m, mm);
> }
> #undef SEQ_PUT_DEC
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index bb8c6f5f19bc..feeb6f639f87 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -612,6 +612,9 @@ struct mm_struct {
>
> /* numa_scan_seq prevents two threads setting pte_numa */
> int numa_scan_seq;
> +
> + /* numa_balancing control the numa balancing of this mm */
> + int numa_balancing;
> #endif
> /*
> * An operation with batched TLB flushing is going on. Anything
Rename to numab_enabled. The comment is not that helpful
/* Controls whether NUMA balancing is active for this mm. */
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index b2e4dc1449b9..2235b75efd30 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -272,4 +272,9 @@ struct prctl_mm_map {
> # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1
> # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2
>
> +/* Set/get enabled per-process numa_balancing */
> +#define PR_NUMA_BALANCING 63
> +# define PR_SET_NUMA_BALANCING 0
> +# define PR_GET_NUMA_BALANCING 1
> +
> #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 2079f1ebfe63..39e9d5daf00a 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1110,6 +1110,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> init_tlb_flush_pending(mm);
> #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
> mm->pmd_huge_pte = NULL;
> +#endif
> +#ifdef CONFIG_NUMA_BALANCING
> + mm->numa_balancing = 1;
> #endif
> mm_init_uprobes_state(mm);
> hugetlb_count_init(mm);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 87db481e8a56..1325253e3613 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2866,6 +2866,9 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
> if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
> return;
>
> + if (!READ_ONCE(curr->mm->numa_balancing))
> + return;
> +
> /*
> * Using runtime rather than walltime has the dual advantage that
> * we (mostly) drive the selection from busy threads and that the
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 8fdac0d90504..64aee3d63ea8 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -154,6 +154,8 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
> EXPORT_SYMBOL(fs_overflowuid);
> EXPORT_SYMBOL(fs_overflowgid);
>
> +DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
> +
Header.
> /*
> * Returns true if current's euid is same as p's uid or euid,
> * or has CAP_SYS_NICE to p's user_ns.
> @@ -2081,6 +2083,28 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
> return 0;
> }
>
> +#ifdef CONFIG_NUMA_BALANCING
> +static int prctl_pid_numa_balancing_write(int numa_balancing)
> +{
> + if (!static_branch_unlikely(&sched_numa_balancing))
> + return -EPERM;
> +
Obviously this would change again if prctl still has an effect if numa
balancing is disabled by default.
> + if (numa_balancing != 0 && numa_balancing != 1)
> + return -EINVAL;
> +
> + WRITE_ONCE(current->mm->numa_balancing, numa_balancing);
> + return 0;
> +}
> +
> +static int prctl_pid_numa_balancing_read(void)
> +{
> + if (!static_branch_unlikely(&sched_numa_balancing))
> + return 0;
> + else
> + return READ_ONCE(current->mm->numa_balancing);
> +}
> +#endif
> +
> static int prctl_set_mm(int opt, unsigned long addr,
> unsigned long arg4, unsigned long arg5)
> {
> @@ -2525,6 +2549,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> error = set_syscall_user_dispatch(arg2, arg3, arg4,
> (char __user *) arg5);
> break;
> +#ifdef CONFIG_NUMA_BALANCING
> + case PR_NUMA_BALANCING:
> + switch (arg2) {
> + case PR_SET_NUMA_BALANCING:
> + error = prctl_pid_numa_balancing_write((int)arg3);
> + break;
> + case PR_GET_NUMA_BALANCING:
> + put_user(prctl_pid_numa_balancing_read(), (int __user *)arg3);
> + break;
> + default:
> + error = -EINVAL;
> + break;
> + }
> + break;
> +#endif
> #ifdef CONFIG_SCHED_CORE
> case PR_SCHED_CORE:
> error = sched_core_share_pid(arg2, arg3, arg4, arg5);
> --
> 2.20.1
>
--
Mel Gorman
SUSE Labs
On 10/28/21 11:30 PM, Mel Gorman wrote:
>
> That aside though, the configuration space could be better. It's possible
> to selectively disable NUMA balance but not selectively enable because
> prctl is disabled if global NUMA balancing is disabled. That could be
> somewhat achieved by having a default value for mm->numa_balancing based on
> whether the global numa balancing is disabled via command line or sysctl
> and enabling the static branch if prctl is used with an informational
> message. This is not the only potential solution but as it stands,
> there are odd semantic corner cases. For example, explicit enabling
> of NUMA balancing by prctl gets silently revoked if numa balancing is
> disabled via sysctl and prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING,
> 1) means nothing.
>
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int
queued)
{
...
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
...
}
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
...
if (!READ_ONCE(curr->mm->numa_balancing))
return;
...
}
When global numa_balancing is disabled, mm->numa_balancing is useless.
So I think prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1) should
return an error instead of modifying mm->numa_balancing.
Is it reasonable that prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,0/1)
can still change the value of mm->numa_balancing when global
numa_balancing is disabled?
On Thu, Oct 28, 2021 at 11:30 PM Mel Gorman <[email protected]> wrote:
>
> That aside though, the configuration space could be better. It's possible
> to selectively disable NUMA balance but not selectively enable because
> prctl is disabled if global NUMA balancing is disabled. That could be
> somewhat achieved by having a default value for mm->numa_balancing based on
> whether the global numa balancing is disabled via command line or sysctl
> and enabling the static branch if prctl is used with an informational
> message. This is not the only potential solution but as it stands,
> there are odd semantic corner cases. For example, explicit enabling
> of NUMA balancing by prctl gets silently revoked if numa balancing is
> disabled via sysctl and prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING,
> 1) means nothing.
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
...
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
...
}
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
...
if (!READ_ONCE(curr->mm->numa_balancing))
return;
...
}
When global numa_balancing is disabled, mm->numa_balancing is useless.
So I think
prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING,0/1) should return an
error instead of modifying mm->numa_balancing.
Is it reasonable that prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,0/1)
can still change the value of mm->numa_balancing when global numa_balancing is
disabled?
On Fri, Oct 29, 2021 at 02:12:28PM +0800, Gang Li wrote:
> On 10/28/21 11:30 PM, Mel Gorman wrote:
> >
> > That aside though, the configuration space could be better. It's possible
> > to selectively disable NUMA balance but not selectively enable because
> > prctl is disabled if global NUMA balancing is disabled. That could be
> > somewhat achieved by having a default value for mm->numa_balancing based on
> > whether the global numa balancing is disabled via command line or sysctl
> > and enabling the static branch if prctl is used with an informational
> > message. This is not the only potential solution but as it stands,
> > there are odd semantic corner cases. For example, explicit enabling
> > of NUMA balancing by prctl gets silently revoked if numa balancing is
> > disabled via sysctl and prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING,
> > 1) means nothing.
> >
> static void task_tick_fair(struct rq *rq, struct task_struct *curr, int
> queued)
> {
> ...
> if (static_branch_unlikely(&sched_numa_balancing))
> task_tick_numa(rq, curr);
> ...
> }
>
> static void task_tick_numa(struct rq *rq, struct task_struct *curr)
> {
> ...
> if (!READ_ONCE(curr->mm->numa_balancing))
> return;
> ...
> }
>
> When global numa_balancing is disabled, mm->numa_balancing is useless.
I'm aware that this is the behaviour of the patch as-is.
> So I
> think prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING,0/1) should return
> error instead of modify mm->numa_balancing.
>
> Is it reasonable that prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,0/1)
> can still change the value of mm->numa_balancing when global numa_balancing
> is disabled?
>
My point is that as it stands,
prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) either does nothing or
fails. If per-process numa balancing is to be introduced, it should have
meaning with the global tuning affecting default behaviour and the prctl
affecting specific behaviour.
--
Mel Gorman
SUSE Labs
Hi, sorry for the late reply.
On Fri, Oct 29, 2021 at 4:37 PM Mel Gorman <[email protected]> wrote:
>
> My point is that as it stands,
> prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) either does nothing or
> fails. If per-process numa balancing is to be introduced, it should have
> meaning with the global tuning affecting default behaviour and the prctl
> affecting specific behaviour.
>
If the global tuning affects default behaviour and the prctl
affects specific behaviour. Then when prctl specifies
numa_balancing for a process, there is no way for the
global tuning to affect that process. In other words, global tuning
become a default value, not a switch for global numa_balancing.
My idea is that the global numa_balancning still has absolute control, and prctl
can only optionally turn off numa_balancing for process when the global is on.
After all, It is more common to enable global numa_balancing and disable it in
several processes than to disable global numa_balancing and enable it in
several processes.
This is my personal opinion, what do you think.
:-)
Do we need the global to be a switch, or a default value?
On Tue, Nov 09, 2021 at 04:28:28PM +0800, Gang Li wrote:
> Hi, sorry for the late reply.
>
> On Fri, Oct 29, 2021 at 4:37 PM Mel Gorman <[email protected]> wrote:
> >
> > My point is that as it stands,
> > prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) either does nothing or
> > fails. If per-process numa balancing is to be introduced, it should have
> > meaning with the global tuning affecting default behaviour and the prctl
> > affecting specific behaviour.
> >
>
> If the global tuning affects default behaviour and the prctl
> affects specific behaviour. Then when prctl specifies
> numa_balancing for a process, there is no way for the
> global tuning to affect that process.
Yes.
> In other words, global tuning
> become a default value, not a switch for global numa_balancing.
>
Also yes. The global tuning becomes "all processes default to using NUMA
balancing unless overridden by prctl".
The main difficulty is that one task using prctl to enable NUMA balancing
needs to enable the static branch so there is a small global hit.
> My idea is that the global numa_balancning still has absolute control, and prctl
> can only optionally turn off numa_balancing for process when the global is on.
> After all, It is more common to enable global numa_balancing and disable it in
> several processes than to disable global numa_balancing and enable it in
> several processes.
Then this comment would still apply
My point is that as it stands,
prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) either does nothing
or fails.
While I think it's very likely that the common case will be to disable
NUMA balancing for specific processes,
prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) should still be
meaningful.
--
Mel Gorman
SUSE Labs
On 11/9/21 5:19 PM, Mel Gorman wrote:
> On Tue, Nov 09, 2021 at 04:28:28PM +0800, Gang Li wrote:
>> If the global tuning affects default behaviour and the prctl
>> affects specific behaviour. Then when prctl specifies
>> numa_balancing for a process, there is no way for the
>> global tuning to affect that process.
>
> While I think it's very likely that the common case will be to disable
> NUMA balancing for specific processes,
> prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) should still be
> meaningful.
>
I'm still a bit confused.
If we really want to enable/disable numa_balancing for all processes,
but some of them override the global numa_balancing using prctl, what
should we do?
Do we iterate through these processes to enable/disable them individually?
On 11/9/21 8:12 PM, Mel Gorman wrote:
>
> That would be a policy decision on how existing tasks should be tuned
> if NUMA balancing is enabled at runtime after being disabled at boot
> (or some arbitrary time in the past). Introducing the prctl does mean
> that there is a semantic change for the runtime enabling/disabling
> of NUMA balancing because previously, enabling global balancing affects
> existing tasks and with prctl, it affects only future tasks. It could
> be handled in the sysctl to some exist
>
> 0. Disable for all but prctl specifications
> 1. Enable for all tasks unless disabled by prctl
> 2. Ignore all existing tasks, enable for future tasks
>
> While this is more legwork, it makes more sense as an interface than
> prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) failing if global
> NUMA balancing is disabled.
>
Why must prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) work while
global numa_balancing is disabled? No offense, but I think that is a bit
redundant, and it's complicated to implement.
It's hard for me to understand the whole vision of your idea. I'm very
sorry. Can you explain your full thoughts more specifically?
----------------------------------------------------
Also in case of misunderstanding, let me re-explain my patch using
circuit diagram.
Before my patch, there is only one switch to control numa_balancing.
______process1_
...____/ __|______process2_|__...
|______process3_|
|
global numa_balancing
After my patch, we can selectively disable numa_balancing for processes.
And global switch has a high priority.
__/ __process1_
...____/ __|__/ __process2_|__...
|__/ __process3_|
| |
global per-process
Why global numa_balancing has high priority? There are two reasons:
1. numa_balancing is useful to most processes, so there is no need to
consider how to enable numa_balancing for a few processes while
disabling it globally.
2. It is easy to implement. The more we think, the more complex the code
becomes.
On Tue, Nov 09, 2021 at 09:58:34PM +0800, Gang Li wrote:
> On 11/9/21 8:12 PM, Mel Gorman wrote:
> >
> > That would be a policy decision on how existing tasks should be tuned
> > if NUMA balancing is enabled at runtime after being disabled at boot
> > (or some arbitrary time in the past). Introducing the prctl does mean
> > that there is a semantic change for the runtime enabling/disabling
> > of NUMA balancing because previously, enabling global balancing affects
> > existing tasks and with prctl, it affects only future tasks. It could
> > be handled in the sysctl to some exist
> >
> > 0. Disable for all but prctl specifications
> > 1. Enable for all tasks unless disabled by prctl
> > 2. Ignore all existing tasks, enable for future tasks
> >
> > While this is more legwork, it makes more sense as an interface than
> > prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) failing if global
> > NUMA balancing is disabled.
> >
>
> Why prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) must work while global
> numa_balancing is disabled? No offense, I think that is a bit redundant.
For symmetry and consistency of the tuning. Either there is per-process
control or there is not. Right now, there is only the ability to turn
off NUMA balancing via prctl if globally enabled. There is no option to
turn NUMA balancing on for a single task if globally disabled.
> And
> it's complicated to implement.
>
That is true.
> It's hard for me to understand the whole vision of your idea. I'm very
> sorry. Can you explain your full thoughts more specifically?
>
I'm not sure how I can be more clear.
> ----------------------------------------------------
>
> Also in case of misunderstanding, let me re-explain my patch using circuit
> diagram.
>
I understood what you are proposing. In your case, global disabling
is an absolute -- it's disabled regardless of prctl therefore
prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) has no meaning and it
either does nothing at all or fails so why does the option even exist?
> Why global numa_balancing has high priority? There are two reasons:
> 1. numa_balancing is useful to most processes, so there is no need to
> consider how to enable numa_balancing for a few processes while disabling it
> globally.
> 2. It is easy to implement. The more we think, the more complex the code
> becomes.
Of those two, I agree with the second one, it would be tricky to implement
but the first one is less clear. This is based on an assumption. If prctl
exists to enable/disable NUMA baalancing, it's possible that someone
else would want to control NUMA balancing on a cgroup basis instead of
globally which would run into the same type of concerns -- different
semantics depending on the global tunable.
--
Mel Gorman
SUSE Labs
On Tue, Nov 09, 2021 at 06:40:43PM +0800, Gang Li wrote:
> On 11/9/21 5:19 PM, Mel Gorman wrote:
> > On Tue, Nov 09, 2021 at 04:28:28PM +0800, Gang Li wrote:
> > > If the global tuning affects default behaviour and the prctl
> > > affects specific behaviour. Then when prctl specifies
> > > numa_balancing for a process, there is no way for the
> > > global tuning to affect that process.
> >
> > While I think it's very likely that the common case will be to disable
> > NUMA balancing for specific processes,
> > prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) should still be
> > meaningful.
> >
>
> I'm still a bit confused.
>
> If we really want to enable/disable numa_balancing for all processes, but
> some of them override the global numa_balancing using prctl, what should we
> do?
>
> Do we iterate through these processes to enable/disable them individually?
>
That would be a policy decision on how existing tasks should be tuned
if NUMA balancing is enabled at runtime after being disabled at boot
(or some arbitrary time in the past). Introducing the prctl does mean
that there is a semantic change for the runtime enabling/disabling
of NUMA balancing because previously, enabling global balancing affects
existing tasks and with prctl, it affects only future tasks. It could
be handled in the sysctl to some extent:
0. Disable for all but prctl specifications
1. Enable for all tasks unless disabled by prctl
2. Ignore all existing tasks, enable for future tasks
While this is more legwork, it makes more sense as an interface than
prctl(PR_NUMA_BALANCING,PR_SET_NUMA_BALANCING,1) failing if global
NUMA balancing is disabled.
--
Mel Gorman
SUSE Labs
On 11/10/21 12:26 AM, Mel Gorman wrote:
>
> Of those two, I agree with the second one, it would be tricky to implement
> but the first one is less clear. This is based on an assumption. If prctl
> exists to enable/disable NUMA baalancing, it's possible that someone
> else would want to control NUMA balancing on a cgroup basis instead of
> globally which would run into the same type of concerns -- different
> semantics depending on the global tunable.
>
Hi!
You talk about the "semantics" of NUMA balancing between global, cgroup
and process. While reading the kernel doc "NUMA Memory Policy", it occurred
to me that we may want a "NUMA Balancing Policy".
Since you are the reviewer of CONFIG_NUMA_BALANCING, I would like to
discuss the need for introducing "NUMA Balancing Policy" with you. Is
this worth doing?
On Wed, Nov 17, 2021 at 03:07:43PM +0800, Gang Li wrote:
> On 11/10/21 12:26 AM, Mel Gorman wrote:
> >
> > Of those two, I agree with the second one, it would be tricky to implement
> > but the first one is less clear. This is based on an assumption. If prctl
> > exists to enable/disable NUMA balancing, it's possible that someone
> > else would want to control NUMA balancing on a cgroup basis instead of
> > globally which would run into the same type of concerns -- different
> > semantics depending on the global tunable.
> >
>
> Hi!
>
> You talk about the "semantics" of NUMA balancing between global, cgroup and
> process. While reading the kernel doc "NUMA Memory Policy", it occurred to me
> that we may have a "NUMA Balancing Policy".
>
> Since you are the reviewer of CONFIG_NUMA_BALANCING. I would like to discuss
> the need for introducing "NUMA Balancing Policy" with you. Is this worth
> doing?
>
It's a bit vague but if you wanted to put together the outline, I'd read
over it. Note that this was all in the context of trying to introduce an
API like
Disable/enable per-process numa balancing:
prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1);
i.e. one that controlled both enabling and disabling. You also have
the option of introducing the NUMAB equivalent of PR_SET_THP_DISABLE --
an API that is explicitly about disabling *only*.
--
Mel Gorman
SUSE Labs
On 11/17/21 4:29 PM, Mel Gorman wrote:
>
> It's a bit vague but if you wanted to put together the outline, I'd read
> over it. Note that this was all in the context of trying to introduce an
Sorry, maybe I shouldn't propose a new feature in this context.
> API like
>
> Disable/enable per-process numa balancing:
> prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1);
>
> i.e. one that controlled both enabling and disabling. You also have
> the option of introducing the NUMAB equivalent of PR_SET_THP_DISABLE --
> an API that is explicitly about disabling *only*.
>
If those APIs are ok with you, I will send v2 soon.
1. prctl(PR_NUMA_BALANCING, PR_SET_THP_DISABLE);
2. prctl(PR_NUMA_BALANCING, PR_SET_THP_ENABLE);
3. prctl(PR_NUMA_BALANCING, PR_GET_THP);
IIUC, "THP" means "this process" or "the process".
On Wed, Nov 17, 2021 at 05:38:28PM +0800, Gang Li wrote:
> On 11/17/21 4:29 PM, Mel Gorman wrote:
> >
> > It's a bit vague but if you wanted to put together the outline, I'd read
> > over it. Note that this was all in the context of trying to introduce an
>
> Sorry, maybe I shouldn't propose a new feature in this context.
>
> > API like
> >
> > Disable/enable per-process numa balancing:
> > prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1);
> >
> > i.e. one that controlled both enabling and disabling. You also have
> > the option of introducing the NUMAB equivalent of PR_SET_THP_DISABLE --
> > an API that is explicitly about disabling *only*.
> >
>
> If those APIs are ok with you, I will send v2 soon.
>
> 1. prctl(PR_NUMA_BALANCING, PR_SET_THP_DISABLE);
It would be (PR_SET_NUMAB_DISABLE, 1)
> 2. prctl(PR_NUMA_BALANCING, PR_SET_THP_ENABLE);
An enable prctl will have the same problems as
prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1) -- it should still
have meaning even if the numa_balancing sysctl is disabled.
> 3. prctl(PR_NUMA_BALANCING, PR_GET_THP);
>
PR_GET_NUMAB_DISABLE
--
Mel Gorman
SUSE Labs
On 11/17/21 6:10 PM, Mel Gorman wrote:
> On Wed, Nov 17, 2021 at 05:38:28PM +0800, Gang Li wrote:
>> If those APIs are ok with you, I will send v2 soon.
>>
>> 1. prctl(PR_NUMA_BALANCING, PR_SET_THP_DISABLE);
>
> It would be (PR_SET_NUMAB_DISABLE, 1)
>
>> 2. prctl(PR_NUMA_BALANCING, PR_SET_THP_ENABLE);
>
> An enable prctl will have the same problems as
> prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1) -- it should have
> meaning if the numa_balancing sysctl is disabled.
>
>> 3. prctl(PR_NUMA_BALANCING, PR_GET_THP);
>>
>
> PR_GET_NUMAB_DISABLE
>
How about this:
1. prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DEFAULT); //follow global
2. prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLE); //disable
3. prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLE); //enable
4. prctl(PR_NUMA_BALANCING, PR_GET_NUMAB);
PR_SET_NUMAB_DISABLE/ENABLE can always have meaning, whether the
numa_balancing sysctl is enabled or not.
--
Thanks,
Gang Li
On Thu, Nov 18, 2021 at 11:26:30AM +0800, Gang Li wrote:
> On 11/17/21 6:10 PM, Mel Gorman wrote:
> > On Wed, Nov 17, 2021 at 05:38:28PM +0800, Gang Li wrote:
> > > If those APIs are ok with you, I will send v2 soon.
> > >
> > > 1. prctl(PR_NUMA_BALANCING, PR_SET_THP_DISABLE);
> >
> > It would be (PR_SET_NUMAB_DISABLE, 1)
> >
> > > 2. prctl(PR_NUMA_BALANCING, PR_SET_THP_ENABLE);
> >
> > An enable prctl will have the same problems as
> > prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1) -- it should have
> > meaning if the numa_balancing sysctl is disabled.
> >
> > > 3. prctl(PR_NUMA_BALANCING, PR_GET_THP);
> > >
> >
> > PR_GET_NUMAB_DISABLE
> >
>
> How about this:
>
> 1. prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DEFAULT); //follow global
> 2. prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLE); //disable
> 3. prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLE); //enable
If PR_SET_NUMAB_ENABLE enables numa balancing for a task when
kernel.numa_balancing == 0 instead of returning an error then sure.
> 4. prctl(PR_NUMA_BALANCING, PR_GET_NUMAB);
>
> PR_SET_NUMAB_DISABLE/ENABLE can always have meaning whether the
> numa_balancing sysctl is disabled or not,
>
> --
> Thanks,
> Gang Li
>
--
Mel Gorman
SUSE Labs
On 11/18/21 4:58 PM, Mel Gorman wrote:
> On Thu, Nov 18, 2021 at 11:26:30AM +0800, Gang Li wrote:
>> 3. prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLE); //enable
>
> If PR_SET_NUMAB_ENABLE enables numa balancing for a task when
> kernel.numa_balancing == 0 instead of returning an error then sure.
Of course.
I'll send patch v2 soon.
--
Thanks,
Gang Li
On 2021/10/28 23:30, Mel Gorman wrote:
>
> This would also need a prctl(2) patch.
>
Hi!
Should prctl(2) and this patch be combined into one series?
--
Thanks
Gang Li