2022-09-02 06:56:13

by Zhongkun He

[permalink] [raw]
Subject: [PATCH] cgroup/cpuset: Add a new isolated mems.policy type.

From: Zhongkun He <[email protected]>

Mempolicy is difficult to use because it is set in-process
via a system call. We want to make it easier to use mempolicy
in cpuset, and we can control low-priority cgroups to
allocate memory in specified nodes. So this patch
adds a mempolicy interface to cpuset.

The mempolicy priority of cpuset is lower than the task.
The order of getting the policy is:
1) vma mempolicy
2) task->mempolicy
3) cpuset->mempolicy
4) default policy.

A cpuset's policy is owned by itself, but descendants will
inherit the default mempolicy from their parent.

How to use the mempolicy interface:
echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy
echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy
echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy
Show the policy:
cat /sys/fs/cgroup/zz/cpuset.mems.policy
prefer:2
cat /sys/fs/cgroup/zz/cpuset.mems.policy
bind:1-3
cat /sys/fs/cgroup/zz/cpuset.mems.policy
interleave:0-3
Clear the policy:
echo default > /sys/fs/cgroup/zz/cpuset.mems.policy

Signed-off-by: Zhongkun He <[email protected]>
---
include/linux/mempolicy.h | 4 +
include/linux/sched.h | 2 +
kernel/cgroup/cpuset.c | 154 +++++++++++++++++++++++++++++++++++++-
kernel/fork.c | 1 +
mm/mempolicy.c | 28 +++++--
5 files changed, 180 insertions(+), 9 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 668389b4b53d..5e54dd8576e4 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -142,6 +142,7 @@ extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask);

extern int huge_node(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
@@ -252,6 +253,9 @@ static inline void mpol_rebind_task(struct task_struct *tsk,
static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}
+static inline void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
+{
+}

static inline int huge_node(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a5c711..c79fc82ac9fe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1200,6 +1200,8 @@ struct task_struct {
u64 acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
+ /* cpuset memory policy */
+ struct mempolicy *cs_mpol;
/* Protected by ->alloc_lock: */
nodemask_t mems_allowed;
/* Sequence number to catch updates: */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1f3a55297f39..d624393a0d7e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -118,6 +118,9 @@ struct cpuset {
cpumask_var_t effective_cpus;
nodemask_t effective_mems;

+ /*cpuset mem policy */
+ struct mempolicy *mempolicy;
+
/*
* CPUs allocated to child sub-partitions (default hierarchy only)
* - CPUs granted by the parent = effective_cpus U subparts_cpus
@@ -378,6 +381,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+static void cpuset_change_task_cs_mpol(struct task_struct *tsk,
+ struct mempolicy *mpol);

static inline void check_insane_mems_config(nodemask_t *nodes)
{
@@ -570,7 +575,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;

+ mpol_get(trial->mempolicy);
+
if (alloc_cpumasks(trial, NULL)) {
+ mpol_put(trial->mempolicy);
kfree(trial);
return NULL;
}
@@ -587,6 +595,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
static inline void free_cpuset(struct cpuset *cs)
{
free_cpumasks(cs, NULL);
+
+ if (cs->mempolicy)
+ mpol_put(cs->mempolicy);
+
kfree(cs);
}

@@ -1823,6 +1835,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
+ nodemask_t cs_allowed;

rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
@@ -1848,6 +1861,11 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
rcu_read_unlock();

spin_lock_irq(&callback_lock);
+
+ if (cp->mempolicy)
+ nodes_and(cs_allowed, *new_mems,
+ cp->mempolicy->w.user_nodemask);
+ mpol_rebind_policy(cp->mempolicy, &cs_allowed);
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);

@@ -2304,7 +2322,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* fail. TODO: have a better way to handle failure here
*/
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
+ /*update the cpuset mempolicy to task*/
+ cpuset_change_task_cs_mpol(task, cs->mempolicy);
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, task);
}
@@ -2441,6 +2460,112 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
return retval;
}

+/*update cpuset policy for task*/
+static void cpuset_change_task_cs_mpol(struct task_struct *tsk,
+ struct mempolicy *mpol)
+{
+ struct mempolicy *old = NULL;
+
+ task_lock(tsk);
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
+
+ old = tsk->cs_mpol;
+ tsk->cs_mpol = mpol;
+ mpol_get(mpol);
+ tsk->il_prev = 0;
+
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
+ task_unlock(tsk);
+ mpol_put(old);
+}
+
+static void update_tasks_cs_mpol(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, 0, &it);
+
+ while ((task = css_task_iter_next(&it)))
+ cpuset_change_task_cs_mpol(task, cs->mempolicy);
+ css_task_iter_end(&it);
+}
+
+/* change cpuset mempolicy */
+static ssize_t cpuset_mpol_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mempolicy *mpol, *old = NULL;
+ struct cpuset *cs = css_cs(of_css(of));
+ nodemask_t cs_allowed;
+ int err = -ENODEV;
+
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
+ percpu_down_write(&cpuset_rwsem);
+
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ buf = strstrip(buf);
+ err = mpol_parse_str(buf, &mpol);
+
+ if (err) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ spin_lock_irq(&callback_lock);
+ old = cs->mempolicy;
+
+ if (mpol) {
+ nodes_and(cs_allowed, cs->effective_mems, mpol->w.user_nodemask);
+ mpol_rebind_policy(mpol, &cs_allowed);
+ cs->mempolicy = mpol;
+ }
+ spin_unlock_irq(&callback_lock);
+
+ update_tasks_cs_mpol(cs);
+
+out_unlock:
+ percpu_up_write(&cpuset_rwsem);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
+
+ if (old) {
+ /*Wait for outstanding programs to complete.*/
+ synchronize_rcu();
+ mpol_put(old);
+ }
+ return err ?: nbytes;
+}
+
+/*show cpuset mempolicy*/
+static int cpuset_mpol_show(struct seq_file *seq, void *v)
+{
+ char buffer[64];
+ int ret = 0;
+ struct mempolicy *mpol;
+ struct cpuset *cs = css_cs(seq_css(seq));
+
+ memset(buffer, 0, sizeof(buffer));
+ spin_lock_irq(&callback_lock);
+ mpol = cs->mempolicy;
+
+ if (!mpol || mpol->mode == MPOL_DEFAULT)
+ goto out_unlock;
+
+ mpol_to_str(buffer, sizeof(buffer), mpol);
+ seq_printf(seq, buffer);
+ seq_putc(seq, '\n');
+
+out_unlock:
+ spin_unlock_irq(&callback_lock);
+ return ret;
+}
+
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
@@ -2679,6 +2804,13 @@ static struct cftype legacy_files[] = {
.private = FILE_EFFECTIVE_MEMLIST,
},

+ {
+ .name = "mems_policy",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpuset_mpol_show,
+ .write = cpuset_mpol_write,
+ },
+
{
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
@@ -2787,6 +2919,13 @@ static struct cftype dfl_files[] = {
.private = FILE_EFFECTIVE_MEMLIST,
},

+ {
+ .name = "mems.policy",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpuset_mpol_show,
+ .write = cpuset_mpol_write,
+ },
+
{
.name = "cpus.partition",
.seq_show = sched_partition_show,
@@ -2815,7 +2954,8 @@ static struct cftype dfl_files[] = {
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct cpuset *cs;
+ struct cpuset *cs, *pcs = css_cs(parent_css);
+ struct mempolicy *new;

if (!parent_css)
return &top_cpuset.css;
@@ -2835,6 +2975,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;

+ /*Inherit mempolicy from parent.*/
+ spin_lock_irq(&callback_lock);
+ new = mpol_dup(pcs->mempolicy);
+
+ if (IS_ERR(new))
+ new = NULL;
+
+ cs->mempolicy = new;
+ spin_unlock_irq(&callback_lock);
+
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
diff --git a/kernel/fork.c b/kernel/fork.c
index 90c85b17bf69..3f695449e2a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2190,6 +2190,7 @@ static __latent_entropy struct task_struct *copy_process(
p->mempolicy = NULL;
goto bad_fork_cleanup_delayacct;
}
+ mpol_get(p->cs_mpol); /*ref cpuset mempolicy*/
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b73d3248d976..4cf54cf60244 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -158,9 +158,15 @@ int numa_map_to_online_node(int node)
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

+static inline struct mempolicy *task_or_cs_mpol(struct task_struct *p)
+{
+ return p->mempolicy ?
+ p->mempolicy : p->cs_mpol;
+}
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
- struct mempolicy *pol = p->mempolicy;
+ struct mempolicy *pol = task_or_cs_mpol(p);
int node;

if (pol)
@@ -349,7 +355,7 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
* policies are protected by task->mems_allowed_seq to prevent a premature
* OOM/allocation failure due to parallel nodemask modification.
*/
-static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol || pol->mode == MPOL_LOCAL)
return;
@@ -1878,9 +1884,11 @@ static unsigned interleave_nodes(struct mempolicy *policy)
struct task_struct *me = current;

next = next_node_in(me->il_prev, policy->nodes);
- if (next < MAX_NUMNODES)
+ if (next < MAX_NUMNODES) {
me->il_prev = next;
- return next;
+ return next;
+ } else
+ return numa_node_id();
}

/*
@@ -1895,7 +1903,7 @@ unsigned int mempolicy_slab_node(void)
if (!in_task())
return node;

- policy = current->mempolicy;
+ policy = task_or_cs_mpol(current);
if (!policy)
return node;

@@ -2043,7 +2051,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
return false;

task_lock(current);
- mempolicy = current->mempolicy;
+ mempolicy = task_or_cs_mpol(current);
switch (mempolicy->mode) {
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
@@ -2633,13 +2641,16 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
*/
void mpol_put_task_policy(struct task_struct *task)
{
- struct mempolicy *pol;
+ struct mempolicy *pol, *cs_pol;

task_lock(task);
pol = task->mempolicy;
+ cs_pol = task->cs_mpol;
+ task->cs_mpol = NULL;
task->mempolicy = NULL;
task_unlock(task);
mpol_put(pol);
+ mpol_put(cs_pol);
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
@@ -3054,6 +3065,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
mode_flags |= MPOL_F_RELATIVE_NODES;
else
goto out;
+ } else {
+ /*use static mode_flags in default*/
+ mode_flags |= MPOL_F_STATIC_NODES;
}

new = mpol_new(mode, mode_flags, &nodes);
--
2.25.1


2022-09-02 09:28:49

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH] cgroup/cpuset: Add a new isolated mems.policy type.

Hi hezhongkun,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on tip/sched/core]
[also build test WARNING on linus/master v6.0-rc3 next-20220901]
[cannot apply to tj-cgroup/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220902-143512
base: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 53aa930dc4bae6aa269951bd37103083145d6691
config: mips-allyesconfig
compiler: mips-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/7b7fbf5ae59ebc703a8d545fabd305563c0f42f6
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220902-143512
git checkout 7b7fbf5ae59ebc703a8d545fabd305563c0f42f6
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash kernel/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

In file included from include/linux/sched.h:22,
from include/linux/ratelimit.h:6,
from include/linux/dev_printk.h:16,
from include/linux/device.h:15,
from include/linux/node.h:18,
from include/linux/cpu.h:17,
from kernel/cgroup/cpuset.c:25:
kernel/cgroup/cpuset.c: In function 'update_nodemasks_hier':
kernel/cgroup/cpuset.c:1867:54: error: 'struct mempolicy' has no member named 'w'
1867 | cp->mempolicy->w.user_nodemask);
| ^~
include/linux/nodemask.h:163:56: note: in definition of macro 'nodes_and'
163 | __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
| ^~~~
kernel/cgroup/cpuset.c: In function 'cpuset_change_task_cs_mpol':
kernel/cgroup/cpuset.c:2477:12: error: 'struct task_struct' has no member named 'il_prev'
2477 | tsk->il_prev = 0;
| ^~
kernel/cgroup/cpuset.c: In function 'cpuset_mpol_write':
kernel/cgroup/cpuset.c:2525:63: error: 'struct mempolicy' has no member named 'w'
2525 | nodes_and(cs_allowed, cs->effective_mems, mpol->w.user_nodemask);
| ^~
include/linux/nodemask.h:163:56: note: in definition of macro 'nodes_and'
163 | __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
| ^~~~
kernel/cgroup/cpuset.c: In function 'cpuset_mpol_show':
kernel/cgroup/cpuset.c:2558:26: error: 'struct mempolicy' has no member named 'mode'
2558 | if (!mpol || mpol->mode == MPOL_DEFAULT)
| ^~
kernel/cgroup/cpuset.c:2561:9: error: implicit declaration of function 'mpol_to_str' [-Werror=implicit-function-declaration]
2561 | mpol_to_str(buffer, sizeof(buffer), mpol);
| ^~~~~~~~~~~
kernel/cgroup/cpuset.c: In function 'cpuset_css_alloc':
kernel/cgroup/cpuset.c:2981:15: error: implicit declaration of function 'mpol_dup'; did you mean 'mpol_put'? [-Werror=implicit-function-declaration]
2981 | new = mpol_dup(pcs->mempolicy);
| ^~~~~~~~
| mpol_put
>> kernel/cgroup/cpuset.c:2981:13: warning: assignment to 'struct mempolicy *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
2981 | new = mpol_dup(pcs->mempolicy);
| ^
cc1: some warnings being treated as errors


vim +2981 kernel/cgroup/cpuset.c

2948
2949
2950 /*
2951 * cpuset_css_alloc - allocate a cpuset css
2952 * cgrp: control group that the new cpuset will be part of
2953 */
2954
2955 static struct cgroup_subsys_state *
2956 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
2957 {
2958 struct cpuset *cs, *pcs = css_cs(parent_css);
2959 struct mempolicy *new;
2960
2961 if (!parent_css)
2962 return &top_cpuset.css;
2963
2964 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
2965 if (!cs)
2966 return ERR_PTR(-ENOMEM);
2967
2968 if (alloc_cpumasks(cs, NULL)) {
2969 kfree(cs);
2970 return ERR_PTR(-ENOMEM);
2971 }
2972
2973 __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
2974 nodes_clear(cs->mems_allowed);
2975 nodes_clear(cs->effective_mems);
2976 fmeter_init(&cs->fmeter);
2977 cs->relax_domain_level = -1;
2978
2979 /*Inherit mempolicy from parent.*/
2980 spin_lock_irq(&callback_lock);
> 2981 new = mpol_dup(pcs->mempolicy);
2982
2983 if (IS_ERR(new))
2984 new = NULL;
2985
2986 cs->mempolicy = new;
2987 spin_unlock_irq(&callback_lock);
2988
2989 /* Set CS_MEMORY_MIGRATE for default hierarchy */
2990 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2991 __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
2992
2993 return &cs->css;
2994 }
2995

--
0-DAY CI Kernel Test Service
https://01.org/lkp


Attachments:
(No filename) (5.78 kB)
config (330.19 kB)
Download all attachments

2022-09-02 10:11:46

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH] cgroup/cpuset: Add a new isolated mems.policy type.

Hi hezhongkun,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on linus/master v6.0-rc3 next-20220901]
[cannot apply to tj-cgroup/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220902-143512
base: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 53aa930dc4bae6aa269951bd37103083145d6691
config: mips-allyesconfig (https://download.01.org/0day-ci/archive/20220902/[email protected]/config)
compiler: mips-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/7b7fbf5ae59ebc703a8d545fabd305563c0f42f6
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220902-143512
git checkout 7b7fbf5ae59ebc703a8d545fabd305563c0f42f6
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash kernel/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

In file included from include/linux/sched.h:22,
from include/linux/ratelimit.h:6,
from include/linux/dev_printk.h:16,
from include/linux/device.h:15,
from include/linux/node.h:18,
from include/linux/cpu.h:17,
from kernel/cgroup/cpuset.c:25:
kernel/cgroup/cpuset.c: In function 'update_nodemasks_hier':
>> kernel/cgroup/cpuset.c:1867:54: error: 'struct mempolicy' has no member named 'w'
1867 | cp->mempolicy->w.user_nodemask);
| ^~
include/linux/nodemask.h:163:56: note: in definition of macro 'nodes_and'
163 | __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
| ^~~~
kernel/cgroup/cpuset.c: In function 'cpuset_change_task_cs_mpol':
>> kernel/cgroup/cpuset.c:2477:12: error: 'struct task_struct' has no member named 'il_prev'
2477 | tsk->il_prev = 0;
| ^~
kernel/cgroup/cpuset.c: In function 'cpuset_mpol_write':
kernel/cgroup/cpuset.c:2525:63: error: 'struct mempolicy' has no member named 'w'
2525 | nodes_and(cs_allowed, cs->effective_mems, mpol->w.user_nodemask);
| ^~
include/linux/nodemask.h:163:56: note: in definition of macro 'nodes_and'
163 | __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
| ^~~~
kernel/cgroup/cpuset.c: In function 'cpuset_mpol_show':
>> kernel/cgroup/cpuset.c:2558:26: error: 'struct mempolicy' has no member named 'mode'
2558 | if (!mpol || mpol->mode == MPOL_DEFAULT)
| ^~
>> kernel/cgroup/cpuset.c:2561:9: error: implicit declaration of function 'mpol_to_str' [-Werror=implicit-function-declaration]
2561 | mpol_to_str(buffer, sizeof(buffer), mpol);
| ^~~~~~~~~~~
kernel/cgroup/cpuset.c: In function 'cpuset_css_alloc':
>> kernel/cgroup/cpuset.c:2981:15: error: implicit declaration of function 'mpol_dup'; did you mean 'mpol_put'? [-Werror=implicit-function-declaration]
2981 | new = mpol_dup(pcs->mempolicy);
| ^~~~~~~~
| mpol_put
kernel/cgroup/cpuset.c:2981:13: warning: assignment to 'struct mempolicy *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
2981 | new = mpol_dup(pcs->mempolicy);
| ^
cc1: some warnings being treated as errors


vim +1867 kernel/cgroup/cpuset.c

1821
1822 /*
1823 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1824 * @cs: the cpuset to consider
1825 * @new_mems: a temp variable for calculating new effective_mems
1826 *
1827 * When configured nodemask is changed, the effective nodemasks of this cpuset
1828 * and all its descendants need to be updated.
1829 *
1830 * On legacy hierarchy, effective_mems will be the same with mems_allowed.
1831 *
1832 * Called with cpuset_rwsem held
1833 */
1834 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1835 {
1836 struct cpuset *cp;
1837 struct cgroup_subsys_state *pos_css;
1838 nodemask_t cs_allowed;
1839
1840 rcu_read_lock();
1841 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1842 struct cpuset *parent = parent_cs(cp);
1843
1844 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1845
1846 /*
1847 * If it becomes empty, inherit the effective mask of the
1848 * parent, which is guaranteed to have some MEMs.
1849 */
1850 if (is_in_v2_mode() && nodes_empty(*new_mems))
1851 *new_mems = parent->effective_mems;
1852
1853 /* Skip the whole subtree if the nodemask remains the same. */
1854 if (nodes_equal(*new_mems, cp->effective_mems)) {
1855 pos_css = css_rightmost_descendant(pos_css);
1856 continue;
1857 }
1858
1859 if (!css_tryget_online(&cp->css))
1860 continue;
1861 rcu_read_unlock();
1862
1863 spin_lock_irq(&callback_lock);
1864
1865 if (cp->mempolicy)
1866 nodes_and(cs_allowed, *new_mems,
> 1867 cp->mempolicy->w.user_nodemask);
1868 mpol_rebind_policy(cp->mempolicy, &cs_allowed);
1869 cp->effective_mems = *new_mems;
1870 spin_unlock_irq(&callback_lock);
1871
1872 WARN_ON(!is_in_v2_mode() &&
1873 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1874
1875 update_tasks_nodemask(cp);
1876
1877 rcu_read_lock();
1878 css_put(&cp->css);
1879 }
1880 rcu_read_unlock();
1881 }
1882

--
0-DAY CI Kernel Test Service
https://01.org/lkp

2022-09-04 20:20:37

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH] cgroup/cpuset: Add a new isolated mems.policy type.

Hello,

On Fri, Sep 02, 2022 at 02:33:03PM +0800, hezhongkun wrote:
> From: Zhongkun He <[email protected]>
>
> Mempolicy is difficult to use because it is set in-process
> via a system call. We want to make it easier to use mempolicy
> in cpuset, and we can control low-priority cgroups to
> allocate memory in specified nodes. So this patch want to
> adds the mempolicy interface in cpuset.
>
> The mempolicy priority of cpuset is lower than the task.
> The order of getting the policy is:
> 1) vma mempolicy
> 2) task->mempolicy
> 3) cpuset->mempolicy
> 4) default policy.
>
> cpuset's policy is owned by itself, but descendants will
> get the default mempolicy from parent.
>
> How to use the mempolicy interface:
> echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy
> echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy
> echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy
> Show the policy:
> cat /sys/fs/cgroup/zz/cpuset.mems.policy
> prefer:2
> cat /sys/fs/cgroup/zz/cpuset.mems.policy
> bind:1-3
> cat /sys/fs/cgroup/zz/cpuset.mems.policy
> interleave:0-3
> Clear the policy:
> echo default > /sys/fs/cgroup/zz/cpuset.mems.policy

So, I'm a fan of adding cgroup functionalities which don't enforce anything
resource related. What you're proposing can easily be achieved with userland
tooling, right?

Thanks.

--
tejun

2022-09-05 11:26:54

by Zhongkun He

[permalink] [raw]
Subject: Re: [Phishing Risk] [External] Re: [PATCH] cgroup/cpuset: Add a new isolated mems.policy type.

Hi Tejun, thanks for your reply.

We usually use numactl to set the memory policy, but it cannot be
changed dynamically. In addition, the mempolicy of cpuset can provide a
more convenient interface for management and control.

Sorry, I don't quite understand the meaning of "don't enforce anything
resource related". Does it mean that a mempolicy, such as "prefer:2", must
specify a node? Or that "cpuset.mems.policy" needs to specify a default value?
(cpuset.mems.policy does not require a default value.)

Thanks.


> Hello,
>
> On Fri, Sep 02, 2022 at 02:33:03PM +0800, hezhongkun wrote:
>> From: Zhongkun He <[email protected]>
>>
>> Mempolicy is difficult to use because it is set in-process
>> via a system call. We want to make it easier to use mempolicy
>> in cpuset, and we can control low-priority cgroups to
>> allocate memory in specified nodes. So this patch want to
>> adds the mempolicy interface in cpuset.
>>
>> The mempolicy priority of cpuset is lower than the task.
>> The order of getting the policy is:
>> 1) vma mempolicy
>> 2) task->mempolicy
>> 3) cpuset->mempolicy
>> 4) default policy.
>>
>> cpuset's policy is owned by itself, but descendants will
>> get the default mempolicy from parent.
>>
>> How to use the mempolicy interface:
>> echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy
>> echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy
>> echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy
>> Show the policy:
>> cat /sys/fs/cgroup/zz/cpuset.mems.policy
>> prefer:2
>> cat /sys/fs/cgroup/zz/cpuset.mems.policy
>> bind:1-3
>> cat /sys/fs/cgroup/zz/cpuset.mems.policy
>> interleave:0-3
>> Clear the policy:
>> echo default > /sys/fs/cgroup/zz/cpuset.mems.policy
>
> So, I'm a fan of adding cgroup functionalities which don't enforce anything
> resource related. What you're proposing can easily be achieved with userland
> tooling, right?
>
> Thanks.
>

2022-09-06 17:32:22

by Tejun Heo

[permalink] [raw]
Subject: Re: [Phishing Risk] [External] Re: [PATCH] cgroup/cpuset: Add a new isolated mems.policy type.

Hello,

On Mon, Sep 05, 2022 at 06:30:38PM +0800, Zhongkun He wrote:
> We usually use numactl to set the memory policy, but it cannot be changed
> dynamically. In addition, the mempolicy of cpuset can provide a more
> convenient interface for management and control panel.

But you can write a better tool easily in userspace to do whatever you wanna
do, right? If you're worried about racing against forks, you can freeze the
cgroup, iterate all pids applying whatever new policy and then unfreeze. We
can probably improve the freezer interface so that multiple users don't
conflict with each other but that shouldn't be too difficult to do and is
gonna be useful generically.

I don't see much point in adding something which can be almost trivially
implemented in userspace as a built-in kernel feature.

> Sorry,I don't quite understand the meaning of "don't enforce anything
> resource related". Does it mean mempolicy, such as "prefer:2" must specify
> node? Or "cpuset.mems.policy" need to specify a default value?
> (cpuset.mems.policy does not require a default value.)

In that there's no real resource being distributed hierarchically like cpu
cycles or memory capacities. All it's doing is changing attributes for a
group of processes, which can be done from userspace all the same.

Thanks.

--
tejun

2022-09-07 12:53:37

by Zhongkun He

[permalink] [raw]
Subject: Re: [Phishing Risk] Re: [Phishing Risk] [External] Re: [PATCH] cgroup/cpuset: Add a new isolated mems.policy type.

> Hello,
>
> On Mon, Sep 05, 2022 at 06:30:38PM +0800, Zhongkun He wrote:
>> We usually use numactl to set the memory policy, but it cannot be changed
>> dynamically. In addition, the mempolicy of cpuset can provide a more
>> convenient interface for management and control panel.
>
> But you can write a better tool easily in userspace to do whatever you wanna
> do, right? If you're worried about racing against forks, you can freeze the
> cgroup, iterate all pids applying whatever new policy and then unfreeze. We
> can probably improve the freezer interface so that multiple users don't
> conflict with each other but that shouldn't be too difficult to do and is
> gonna be useful generically.
>
> I don't see much point in adding something which can be almost trivially
> implemented in userspace as a built-in kernel feature.
>
>> Sorry,I don't quite understand the meaning of "don't enforce anything
>> resource related". Does it mean mempolicy, such as "prefer:2" must specify
>> node? Or "cpuset.mems.policy" need to specify a default value?
>> (cpuset.mems.policy does not require a default value.)
>
> In that there's no real resource being distributed hierarchically like cpu
> cycles or memory capacities. All it's doing is changing attributes for a
> group of processes, which can be done from userspace all the same.
>
> Thanks.
>
Hi Tejun, thanks for your reply.

It would be better if one process had a way to dynamically modify the
mempolicy of another process. But unfortunately there is no interface or
system call to do that in userspace.

In our use case, we hope to combine memory policy with cgroup for
better use of resources. The current implementation may not be suitable,
so I'll keep trying other approaches.

Thanks again.

2022-09-07 16:00:14

by Tejun Heo

[permalink] [raw]
Subject: Re: [Phishing Risk] Re: [Phishing Risk] [External] Re: [PATCH] cgroup/cpuset: Add a new isolated mems.policy type.

Hello,

On Wed, Sep 07, 2022 at 08:06:30PM +0800, Zhongkun He wrote:
> It would be better if one process had a way to dynamically modify the
> mempolicy of another process. But unfortunately there is no interface or
> system call to do that in userspace.

If you need to change the properties dynamically, I suggest adding this as a
dynamic per-process interface first. That's more generic and useful for more
cases.

Thanks.

--
tejun

2022-09-09 03:05:46

by Zhongkun He

[permalink] [raw]
Subject: Re: [Phishing Risk] Re: [Phishing Risk] Re: [Phishing Risk] [External] Re: [PATCH] cgroup/cpuset: Add a new isolated mems.policy type.

Hi Tejun,
> Hello,
>
> On Wed, Sep 07, 2022 at 08:06:30PM +0800, Zhongkun He wrote:
>> It would be better if one process had a way to dynamically modify the
>> mempolicy of another process. But unfortunately there is no interface or
>> system call to do that in userspace.
>
> If you need to change the properties dynamically, I suggest adding this as a
> dynamic per-process interface first. That's more generic and useful for more
> cases.
>
> Thanks.

Got it, thanks for your suggestion and reply.