2021-03-25 03:28:21

by Joel Fernandes

Subject: [PATCH resend 5/8] sched: cgroup cookie API for core scheduling

From: Josh Don <[email protected]>

This adds the API to set/get the cookie for a given cgroup. This
interface lives at cgroup/cpu.core_tag.

The cgroup interface can be used to toggle a unique cookie value for all
descendant tasks, preventing these tasks from sharing a core with any others.
See Documentation/admin-guide/hw-vuln/core-scheduling.rst for a full
rundown of both this and the per-task API.
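A minimal userspace sketch of toggling the tag (the cgroup mount point and
group name below are illustrative assumptions, not part of this patch; it
assumes cgroup v2 with the cpu controller enabled for the group):

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/sys/fs/cgroup/trusted/cpu.core_tag", O_WRONLY);

          if (fd < 0) {
                  perror("open cpu.core_tag");
                  return 1;
          }

          /* "1" gives all descendant tasks a unique cookie; "0" clears it. */
          if (write(fd, "1", 1) != 1)
                  perror("write cpu.core_tag");

          close(fd);
          return 0;
  }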

Signed-off-by: Josh Don <[email protected]>
---
kernel/sched/core.c | 61 ++++++++++++++--
kernel/sched/coretag.c | 156 ++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 25 +++++++
3 files changed, 235 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3093cb3414c3..a733891dfe7d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9328,6 +9328,8 @@ struct task_group *sched_create_group(struct task_group *parent)

alloc_uclamp_sched_group(tg, parent);

+ alloc_sched_core_sched_group(tg);
+
return tg;

err:
@@ -9391,6 +9393,11 @@ static void sched_change_group(struct task_struct *tsk, int type)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
+
+#ifdef CONFIG_SCHED_CORE
+ sched_core_change_group(tsk, tg);
+#endif
+
tsk->sched_task_group = tg;

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9443,11 +9450,6 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, tsk, &rf);
}

-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
-{
- return css ? container_of(css, struct task_group, css) : NULL;
-}
-
static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -9483,6 +9485,18 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
return 0;
}

+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_SCHED_CORE
+ struct task_group *tg = css_tg(css);
+
+ if (tg->core_tagged) {
+ sched_core_put();
+ tg->core_tagged = 0;
+ }
+#endif
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -9517,6 +9531,25 @@ static void cpu_cgroup_fork(struct task_struct *task)
task_rq_unlock(rq, task, &rf);
}

+static void cpu_cgroup_exit(struct task_struct *task)
+{
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * This is possible if task exit races with core sched being
+ * disabled due to the task's cgroup no longer being tagged, since
+ * cpu_core_tag_write_u64() will miss dying tasks.
+ */
+ if (unlikely(sched_core_enqueued(task))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(task, &rf);
+ sched_core_dequeue(rq, task);
+ task_rq_unlock(rq, task, &rf);
+ }
+#endif
+}
+
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
@@ -10084,6 +10117,14 @@ static struct cftype cpu_legacy_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+#ifdef CONFIG_SCHED_CORE
+ {
+ .name = "core_tag",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_core_tag_read_u64,
+ .write_u64 = cpu_core_tag_write_u64,
+ },
+#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
.name = "uclamp.min",
@@ -10257,6 +10298,14 @@ static struct cftype cpu_files[] = {
.write_s64 = cpu_weight_nice_write_s64,
},
#endif
+#ifdef CONFIG_SCHED_CORE
+ {
+ .name = "core_tag",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_core_tag_read_u64,
+ .write_u64 = cpu_core_tag_write_u64,
+ },
+#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
.name = "max",
@@ -10285,10 +10334,12 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
.fork = cpu_cgroup_fork,
+ .exit = cpu_cgroup_exit,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_legacy_files,
diff --git a/kernel/sched/coretag.c b/kernel/sched/coretag.c
index 550f4975eea2..1498790bc76c 100644
--- a/kernel/sched/coretag.c
+++ b/kernel/sched/coretag.c
@@ -96,9 +96,19 @@ static void __sched_core_set_task_cookie(struct sched_core_cookie *cookie,
static void __sched_core_set_group_cookie(struct sched_core_cookie *cookie,
unsigned long val)
{
+ struct task_group *tg = (struct task_group *)val;
+ u64 group_cookie_id; /* only uses lower 32 bits */
+
cookie->group_cookie = val;

- // XXX incorporate group_cookie into userspace id
+ if (tg)
+ group_cookie_id = tg->sched_core_id;
+ else
+ group_cookie_id = 0;
+
+ /* group cookie userspace id is the lower 32 bits */
+ cookie->userspace_id &= 0xffffffff00000000;
+ cookie->userspace_id |= group_cookie_id;
}
#endif

@@ -394,6 +404,142 @@ int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type)
return err;
}

+/* CGroup core-scheduling interface support. */
+#ifdef CONFIG_CGROUP_SCHED
+/*
+ * Helper to get the group cookie in a hierarchy. Any ancestor can have a
+ * cookie.
+ *
+ * Can race with an update to tg->core_tagged if sched_core_group_mutex is
+ * not held.
+ */
+static unsigned long cpu_core_get_group_cookie(struct task_group *tg)
+{
+ for (; tg; tg = tg->parent) {
+ if (READ_ONCE(tg->core_tagged))
+ return (unsigned long)tg;
+ }
+
+ return 0;
+}
+
+/* Determine if any group in @tg's children are tagged. */
+static bool cpu_core_check_descendants(struct task_group *tg, bool check_tag)
+{
+ struct task_group *child;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ if ((child->core_tagged && check_tag)) {
+ rcu_read_unlock();
+ return true;
+ }
+
+ rcu_read_unlock();
+ return cpu_core_check_descendants(child, check_tag);
+ }
+
+ rcu_read_unlock();
+ return false;
+}
+
+u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return !!css_tg(css)->core_tagged;
+}
+
+int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
+{
+ static DEFINE_MUTEX(sched_core_group_mutex);
+ struct task_group *tg = css_tg(css);
+ struct cgroup_subsys_state *css_tmp;
+ struct task_struct *p;
+ unsigned long group_cookie;
+ int ret = 0;
+
+ if (val > 1)
+ return -ERANGE;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -EINVAL;
+
+ mutex_lock(&sched_core_group_mutex);
+
+ if (!tg->core_tagged && val) {
+ /* Tag is being set. Check ancestors and descendants. */
+ if (cpu_core_get_group_cookie(tg) ||
+ cpu_core_check_descendants(tg, true /* tag */)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ } else if (tg->core_tagged && !val) {
+ /* Tag is being reset. Check descendants. */
+ if (cpu_core_check_descendants(tg, true /* tag */)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ } else {
+ goto out_unlock;
+ }
+
+ if (val)
+ sched_core_get();
+
+ tg->core_tagged = val;
+ group_cookie = cpu_core_get_group_cookie(tg);
+
+ rcu_read_lock();
+ css_for_each_descendant_pre(css_tmp, css) {
+ struct css_task_iter it;
+
+ css_task_iter_start(css_tmp, 0, &it);
+ /*
+ * Note: css_task_iter_next will skip dying tasks.
+ * There could still be dying tasks left in the core queue
+ * after the loop below finishes, if the cgroup tag is being cleared.
+ * We will handle this in cpu_cgroup_exit().
+ */
+ while ((p = css_task_iter_next(&it))) {
+ sched_core_update_cookie(p, group_cookie,
+ sched_core_group_cookie_type);
+ }
+
+ css_task_iter_end(&it);
+ }
+ rcu_read_unlock();
+
+ if (!val)
+ sched_core_put();
+
+out_unlock:
+ mutex_unlock(&sched_core_group_mutex);
+ return ret;
+}
+
+void sched_core_change_group(struct task_struct *p, struct task_group *new_tg)
+{
+ lockdep_assert_held(rq_lockp(task_rq(p)));
+
+ /*
+ * Reading the group cookie can race, but since the task is already
+ * visible in the group, a concurrent group_cookie update will also
+ * update this task.
+ */
+ __sched_core_set_group_cookie(&p->core_cookie,
+ cpu_core_get_group_cookie(new_tg));
+}
+
+void alloc_sched_core_sched_group(struct task_group *tg)
+{
+ static u32 next_id = 1;
+
+ tg->sched_core_id = next_id++;
+ WARN_ON_ONCE(next_id == 0);
+}
+#endif
+
int sched_core_exec(void)
{
int ret = 0;
@@ -408,7 +554,13 @@ int sched_core_exec(void)
return ret;
}

-/* Called from sched_fork() */
+/*
+ * Called from sched_fork().
+ *
+ * NOTE: This might race with a concurrent cgroup cookie update. That's
+ * ok; sched_core_change_group() will handle this post-fork, once the
+ * task is visible.
+ */
int sched_core_fork(struct task_struct *p, unsigned long clone_flags)
{
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1be86d9cc58f..c3435469ea24 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -385,6 +385,11 @@ struct cfs_bandwidth {
struct task_group {
struct cgroup_subsys_state css;

+#ifdef CONFIG_SCHED_CORE
+ int core_tagged;
+ u32 sched_core_id;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each CPU */
struct sched_entity **se;
@@ -433,6 +438,11 @@ struct task_group {

};

+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD

@@ -1170,6 +1180,7 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
return &rq->__lock;
}

+void sched_core_change_group(struct task_struct *p, struct task_group *new_tg);
int sched_core_fork(struct task_struct *p, unsigned long clone_flags);

static inline bool sched_core_enqueued(struct task_struct *task)
@@ -1186,6 +1197,16 @@ void sched_core_put(void);

int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type);

+#ifdef CONFIG_CGROUP_SCHED
+u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft);
+
+int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val);
+
+void alloc_sched_core_sched_group(struct task_group *tg);
+#endif
+
bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);

int sched_core_cookie_cmp(const struct sched_core_cookie *a,
@@ -1305,6 +1326,10 @@ static inline bool sched_core_enqueued(struct task_struct *task)
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }

+#ifdef CONFIG_CGROUP_SCHED
+static inline void alloc_sched_core_sched_group(struct task_group *tg) {}
+#endif
+
#endif /* CONFIG_SCHED_CORE */

static inline void lockdep_assert_rq_held(struct rq *rq)
--
2.31.0.291.g576ba9dcdaf-goog


2021-03-30 09:31:14

by Peter Zijlstra

Subject: Re: [PATCH resend 5/8] sched: cgroup cookie API for core scheduling

On Wed, Mar 24, 2021 at 05:40:17PM -0400, Joel Fernandes (Google) wrote:
> From: Josh Don <[email protected]>
>
> This adds the API to set/get the cookie for a given cgroup. This
> interface lives at cgroup/cpu.core_tag.
>
> The cgroup interface can be used to toggle a unique cookie value for all
> descendant tasks, preventing these tasks from sharing a core with any others.
> See Documentation/admin-guide/hw-vuln/core-scheduling.rst for a full
> rundown of both this and the per-task API.

I refuse to read RST. Life's too short for that.

> +u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css,
> + struct cftype *cft)
> +{
> + return !!css_tg(css)->core_tagged;
> +}
> +
> +int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
> + u64 val)
> +{
> + static DEFINE_MUTEX(sched_core_group_mutex);
> + struct task_group *tg = css_tg(css);
> + struct cgroup_subsys_state *css_tmp;
> + struct task_struct *p;
> + unsigned long group_cookie;
> + int ret = 0;
> +
> + if (val > 1)
> + return -ERANGE;
> +
> + if (!static_branch_likely(&sched_smt_present))
> + return -EINVAL;
> +
> + mutex_lock(&sched_core_group_mutex);
> +
> + if (!tg->core_tagged && val) {
> + /* Tag is being set. Check ancestors and descendants. */
> + if (cpu_core_get_group_cookie(tg) ||
> + cpu_core_check_descendants(tg, true /* tag */)) {
> + ret = -EBUSY;
> + goto out_unlock;
> + }

So the desired semantics is to only allow a single tag on any upwards
path? Isn't that in conflict with the cgroup requirements?

TJ?

> + } else if (tg->core_tagged && !val) {
> + /* Tag is being reset. Check descendants. */
> + if (cpu_core_check_descendants(tg, true /* tag */)) {

I'm struggling to understand this. If, per the above, you cannot set
when either a parent is already set or a child is set, then how can a
child be set to refuse clearing?

> + ret = -EBUSY;
> + goto out_unlock;
> + }
> + } else {
> + goto out_unlock;
> + }


2021-03-30 09:32:55

by Peter Zijlstra

Subject: Re: [PATCH resend 5/8] sched: cgroup cookie API for core scheduling


*sigh*, +tj

On Tue, Mar 30, 2021 at 11:23:10AM +0200, Peter Zijlstra wrote:
> On Wed, Mar 24, 2021 at 05:40:17PM -0400, Joel Fernandes (Google) wrote:
> > From: Josh Don <[email protected]>
> >
> > This adds the API to set/get the cookie for a given cgroup. This
> > interface lives at cgroup/cpu.core_tag.
> >
> > The cgroup interface can be used to toggle a unique cookie value for all
> > descendant tasks, preventing these tasks from sharing a core with any others.
> > See Documentation/admin-guide/hw-vuln/core-scheduling.rst for a full
> > rundown of both this and the per-task API.
>
> I refuse to read RST. Life's too short for that.
>
> > +u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css,
> > + struct cftype *cft)
> > +{
> > + return !!css_tg(css)->core_tagged;
> > +}
> > +
> > +int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
> > + u64 val)
> > +{
> > + static DEFINE_MUTEX(sched_core_group_mutex);
> > + struct task_group *tg = css_tg(css);
> > + struct cgroup_subsys_state *css_tmp;
> > + struct task_struct *p;
> > + unsigned long group_cookie;
> > + int ret = 0;
> > +
> > + if (val > 1)
> > + return -ERANGE;
> > +
> > + if (!static_branch_likely(&sched_smt_present))
> > + return -EINVAL;
> > +
> > + mutex_lock(&sched_core_group_mutex);
> > +
> > + if (!tg->core_tagged && val) {
> > + /* Tag is being set. Check ancestors and descendants. */
> > + if (cpu_core_get_group_cookie(tg) ||
> > + cpu_core_check_descendants(tg, true /* tag */)) {
> > + ret = -EBUSY;
> > + goto out_unlock;
> > + }
>
> So the desired semantics is to only allow a single tag on any upwards
> path? Isn't that in conflict with the cgroup requirements?
>
> TJ?
>
> > + } else if (tg->core_tagged && !val) {
> > + /* Tag is being reset. Check descendants. */
> > + if (cpu_core_check_descendants(tg, true /* tag */)) {
>
> I'm struggling to understand this. If, per the above, you cannot set
> when either a parent is already set or a child is set, then how can a
> child be set to refuse clearing?
>
> > + ret = -EBUSY;
> > + goto out_unlock;
> > + }
> > + } else {
> > + goto out_unlock;
> > + }
>
>

2021-03-30 21:24:09

by Josh Don

Subject: Re: [PATCH resend 5/8] sched: cgroup cookie API for core scheduling

On Tue, Mar 30, 2021 at 2:29 AM Peter Zijlstra <[email protected]> wrote:
> > On Wed, Mar 24, 2021 at 05:40:17PM -0400, Joel Fernandes (Google) wrote:
> > > +
> > > + if (!tg->core_tagged && val) {
> > > + /* Tag is being set. Check ancestors and descendants. */
> > > + if (cpu_core_get_group_cookie(tg) ||
> > > + cpu_core_check_descendants(tg, true /* tag */)) {
> > > + ret = -EBUSY;
> > > + goto out_unlock;
> > > + }
> >
> > So the desired semantics is to only allow a single tag on any upwards
> > path? Isn't that in conflict with the cgroup requirements?
> >
> > TJ?

I carried this requirement over from the previous iteration, but I
don't see a reason why we can't just dump this and have each task use
the group cookie of its closest tagged ancestor. Joel, is there any
context here I'm missing?
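
To sketch that alternative (hypothetical hierarchy, not taken from this patch):

	root (untagged)
	`-- A  cpu.core_tag=1   -> tasks in A share cookie(A)
	    |-- B  cpu.core_tag=1   -> tasks in B share cookie(B)
	    `-- C  cpu.core_tag=0   -> tasks in C fall back to cookie(A)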

FWIW I also just realized that cpu_core_check_descendants() is busted
as it recurses only on one child.
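Something like the below would cover the whole subtree (untested sketch,
reusing css_for_each_descendant_pre() the same way the write path already
does):

static bool cpu_core_check_descendants(struct task_group *tg, bool check_tag)
{
	struct cgroup_subsys_state *css;
	bool ret = false;

	rcu_read_lock();
	css_for_each_descendant_pre(css, &tg->css) {
		struct task_group *child = css_tg(css);

		/* The pre-order walk visits @tg itself first; skip it. */
		if (child == tg)
			continue;

		if (child->core_tagged && check_tag) {
			ret = true;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}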

> > > + } else if (tg->core_tagged && !val) {
> > > + /* Tag is being reset. Check descendants. */
> > > + if (cpu_core_check_descendants(tg, true /* tag */)) {
> >
> > I'm struggling to understand this. If, per the above, you cannot set
> > when either a parent is already set or a child is set, then how can a
> > child be set to refuse clearing?

Yes this is superfluous with the above semantics.