2020-10-20 17:14:38

by Joel Fernandes

Subject: [PATCH v8 -tip 17/26] sched: Split the cookie and setup per-task cookie on fork

In order to prevent interference and clearly support both per-task and CGroup
APIs, split the cookie into two and allow it to be set from either the per-task
or the CGroup API. The final cookie is the combined value of both and is
computed when stop-machine executes during a change of cookie.
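
A minimal userspace sketch of the combination (illustrative values, not kernel
code), mirroring what sched_core_tag_requeue() below computes:

#include <stdio.h>

/* The per-task cookie occupies the upper half of the bits and the group
 * cookie the lower half; the shift truncates the task cookie to its
 * lower half before placing it. */
int main(void)
{
	unsigned long task_cookie  = 0xabcdUL;	/* made-up value */
	unsigned long group_cookie = 0x1234UL;	/* made-up value */
	unsigned long core_cookie  =
		(task_cookie << (sizeof(unsigned long) * 4)) + group_cookie;

	printf("core_cookie = %#lx\n", core_cookie); /* 0xabcd00001234 on 64-bit */
	return 0;
}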

Also, for the per-task cookie, it will get weird if we use pointers to
ephemeral objects. For this reason, introduce a refcounted object whose sole
purpose is to assign a unique cookie value by way of the object's pointer.
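
A userspace sketch of the problem with ephemeral pointers (illustrative, not
kernel code): once such an object is freed, the allocator may recycle its
address, silently handing an unrelated user the same cookie. The refcounted
object is freed only when the last reference is dropped, so its address stays
unique for the cookie's whole lifetime:

#include <stdio.h>
#include <stdlib.h>

struct dummy { char buf[64]; };

int main(void)
{
	struct dummy *a = malloc(sizeof(*a));
	unsigned long cookie_a = (unsigned long)a;

	free(a);				/* the ephemeral object dies... */
	struct dummy *b = malloc(sizeof(*b));	/* ...and its slot may be reused */
	unsigned long cookie_b = (unsigned long)b;

	/* With most allocators this prints a collision: two unrelated
	 * objects, one cookie value. */
	printf("cookie_a=%#lx cookie_b=%#lx%s\n", cookie_a, cookie_b,
	       cookie_a == cookie_b ? " (collision)" : "");
	free(b);
	return 0;
}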

While at it, refactor the CGroup code a bit. Future patches will introduce more
APIs and support.

Tested-by: Julien Desfossez <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
include/linux/sched.h | 2 +
kernel/sched/core.c | 241 ++++++++++++++++++++++++++++++++++++++++--
kernel/sched/debug.c | 4 +
3 files changed, 236 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe6f225bfbf9..c6034c00846a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -688,6 +688,8 @@ struct task_struct {
#ifdef CONFIG_SCHED_CORE
struct rb_node core_node;
unsigned long core_cookie;
+ unsigned long core_task_cookie;
+ unsigned long core_group_cookie;
unsigned int core_occupation;
#endif

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bab4ea2f5cd8..30a9e4cb5ce1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -346,11 +346,14 @@ void sched_core_put(void)
mutex_unlock(&sched_core_mutex);
}

+static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2);
+
#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
static bool sched_core_enqueued(struct task_struct *task) { return false; }
+static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2) { return 0; }

#endif /* CONFIG_SCHED_CORE */

@@ -3583,6 +3586,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
#ifdef CONFIG_SCHED_CORE
RB_CLEAR_NODE(&p->core_node);
+
+ /*
+ * Tag child via per-task cookie only if parent is tagged via per-task
+ * cookie. This is independent of, but can be additive to the CGroup tagging.
+ */
+ if (current->core_task_cookie) {
+
+ /* If it is not CLONE_THREAD fork, assign a unique per-task tag. */
+ if (!(clone_flags & CLONE_THREAD)) {
+ return sched_core_share_tasks(p, p);
+ }
+ /* Otherwise share the parent's per-task tag. */
+ return sched_core_share_tasks(p, current);
+ }
#endif
return 0;
}
@@ -9177,6 +9194,217 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SCHED_CORE
+/*
+ * A simple wrapper around refcount. An allocated sched_core_cookie's
+ * address is used to compute the cookie of the task.
+ */
+struct sched_core_cookie {
+ refcount_t refcnt;
+};
+
+/*
+ * sched_core_tag_requeue - Common helper for all interfaces to set a cookie.
+ * @p: The task to assign a cookie to.
+ * @cookie: The cookie to assign.
+ * @group: is it a group interface or a per-task interface.
+ *
+ * This function is typically called from a stop-machine handler.
+ */
+void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, bool group)
+{
+ if (!p)
+ return;
+
+ if (group)
+ p->core_group_cookie = cookie;
+ else
+ p->core_task_cookie = cookie;
+
+ /* Use the upper half of the cookie's bits for the task cookie and the lower half for the group cookie. */
+ p->core_cookie = (p->core_task_cookie <<
+ (sizeof(unsigned long) * 4)) + p->core_group_cookie;
+
+ if (sched_core_enqueued(p)) {
+ sched_core_dequeue(task_rq(p), p);
+ if (!p->core_task_cookie)
+ return;
+ }
+
+ if (sched_core_enabled(task_rq(p)) &&
+ p->core_cookie && task_on_rq_queued(p))
+ sched_core_enqueue(task_rq(p), p);
+}
+
+/* Per-task interface */
+static unsigned long sched_core_alloc_task_cookie(void)
+{
+ struct sched_core_cookie *ptr =
+ kmalloc(sizeof(struct sched_core_cookie), GFP_KERNEL);
+
+ if (!ptr)
+ return 0;
+ refcount_set(&ptr->refcnt, 1);
+
+ /*
+ * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
+ * is done after the stopper runs.
+ */
+ sched_core_get();
+ return (unsigned long)ptr;
+}
+
+static bool sched_core_get_task_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
+
+ /*
+ * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
+ * is done after the stopper runs.
+ */
+ sched_core_get();
+ return refcount_inc_not_zero(&ptr->refcnt);
+}
+
+static void sched_core_put_task_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
+
+ if (refcount_dec_and_test(&ptr->refcnt))
+ kfree(ptr);
+}
+
+struct sched_core_task_write_tag {
+ struct task_struct *tasks[2];
+ unsigned long cookies[2];
+};
+
+/*
+ * Ensure that the task has been requeued. The stopper ensures that the task cannot
+ * be migrated to a different CPU while its core scheduler queue state is being updated.
+ * It also makes sure to requeue a task if it was running actively on another CPU.
+ */
+static int sched_core_task_join_stopper(void *data)
+{
+ struct sched_core_task_write_tag *tag = (struct sched_core_task_write_tag *)data;
+ int i;
+
+ for (i = 0; i < 2; i++)
+ sched_core_tag_requeue(tag->tasks[i], tag->cookies[i], false /* !group */);
+
+ return 0;
+}
+
+static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
+{
+ struct sched_core_task_write_tag wr = {}; /* for stop machine. */
+ bool sched_core_put_after_stopper = false;
+ unsigned long cookie;
+ int ret = -ENOMEM;
+
+ mutex_lock(&sched_core_mutex);
+
+ /*
+ * NOTE: sched_core_get() is done by sched_core_alloc_task_cookie() or
+ * sched_core_get_task_cookie(). However, sched_core_put() is done
+ * by this function *after* the stopper removes the tasks from the
+ * core queue, and not before. This is just to play it safe.
+ */
+ if (t2 == NULL) {
+ if (t1->core_task_cookie) {
+ sched_core_put_task_cookie(t1->core_task_cookie);
+ sched_core_put_after_stopper = true;
+ wr.tasks[0] = t1; /* Keep wr.cookies[0] reset for t1. */
+ }
+ } else if (t1 == t2) {
+ /* Assign a unique per-task cookie solely for t1. */
+
+ cookie = sched_core_alloc_task_cookie();
+ if (!cookie)
+ goto out_unlock;
+
+ if (t1->core_task_cookie) {
+ sched_core_put_task_cookie(t1->core_task_cookie);
+ sched_core_put_after_stopper = true;
+ }
+ wr.tasks[0] = t1;
+ wr.cookies[0] = cookie;
+ } else
+ /*
+ * t1 joining t2
+ * CASE 1:
+ * before 0 0
+ * after new cookie new cookie
+ *
+ * CASE 2:
+ * before X (non-zero) 0
+ * after 0 0
+ *
+ * CASE 3:
+ * before 0 X (non-zero)
+ * after X X
+ *
+ * CASE 4:
+ * before Y (non-zero) X (non-zero)
+ * after X X
+ */
+ if (!t1->core_task_cookie && !t2->core_task_cookie) {
+ /* CASE 1. */
+ cookie = sched_core_alloc_task_cookie();
+ if (!cookie)
+ goto out_unlock;
+
+ /* Add another reference for the other task. */
+ if (!sched_core_get_task_cookie(cookie)) {
+ return -EINVAL;
+ goto out_unlock;
+ }
+
+ wr.tasks[0] = t1;
+ wr.tasks[1] = t2;
+ wr.cookies[0] = wr.cookies[1] = cookie;
+
+ } else if (t1->core_task_cookie && !t2->core_task_cookie) {
+ /* CASE 2. */
+ sched_core_put_task_cookie(t1->core_task_cookie);
+ sched_core_put_after_stopper = true;
+
+ wr.tasks[0] = t1; /* Reset cookie for t1. */
+
+ } else if (!t1->core_task_cookie && t2->core_task_cookie) {
+ /* CASE 3. */
+ if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ wr.tasks[0] = t1;
+ wr.cookies[0] = t2->core_task_cookie;
+
+ } else {
+ /* CASE 4. */
+ if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ sched_core_put_task_cookie(t1->core_task_cookie);
+ sched_core_put_after_stopper = true;
+
+ wr.tasks[0] = t1;
+ wr.cookies[0] = t2->core_task_cookie;
+ }
+
+ stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);
+
+ if (sched_core_put_after_stopper)
+ sched_core_put();
+
+ ret = 0;
+out_unlock:
+ mutex_unlock(&sched_core_mutex);
+ return ret;
+}
+
+/* CGroup interface */
static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct task_group *tg = css_tg(css);
@@ -9207,18 +9435,9 @@ static int __sched_write_tag(void *data)
* when we set cgroup tag to 0 when the loop is done below.
*/
while ((p = css_task_iter_next(&it))) {
- p->core_cookie = !!val ? (unsigned long)tg : 0UL;
-
- if (sched_core_enqueued(p)) {
- sched_core_dequeue(task_rq(p), p);
- if (!p->core_cookie)
- continue;
- }
-
- if (sched_core_enabled(task_rq(p)) &&
- p->core_cookie && task_on_rq_queued(p))
- sched_core_enqueue(task_rq(p), p);
+ unsigned long cookie = !!val ? (unsigned long)tg : 0UL;

+ sched_core_tag_requeue(p, cookie, true /* group */);
}
css_task_iter_end(&it);

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c8fee8d9dfd4..88bf45267672 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1024,6 +1024,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PS("clock-delta", t1-t0);
}

+#ifdef CONFIG_SCHED_CORE
+ __PS("core_cookie", p->core_cookie);
+#endif
+
sched_show_numa(p, m);
}

--
2.29.0.rc1.297.gfa9743e501-goog


2020-11-04 22:37:36

by Chris Hyser

Subject: Re: [PATCH v8 -tip 17/26] sched: Split the cookie and setup per-task cookie on fork

On 10/19/20 9:43 PM, Joel Fernandes (Google) wrote:
> In order to prevent interference and clearly support both per-task and CGroup
> APIs, split the cookie into two and allow it to be set from either the per-task
> or the CGroup API. The final cookie is the combined value of both and is
> computed when stop-machine executes during a change of cookie.
>
> Also, for the per-task cookie, it will get weird if we use pointers to
> ephemeral objects. For this reason, introduce a refcounted object whose sole
> purpose is to assign a unique cookie value by way of the object's pointer.
>
> While at it, refactor the CGroup code a bit. Future patches will introduce more
> APIs and support.
>
> Tested-by: Julien Desfossez <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> include/linux/sched.h | 2 +
> kernel/sched/core.c | 241 ++++++++++++++++++++++++++++++++++++++++--
> kernel/sched/debug.c | 4 +
> 3 files changed, 236 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index fe6f225bfbf9..c6034c00846a 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -688,6 +688,8 @@ struct task_struct {
> #ifdef CONFIG_SCHED_CORE
> struct rb_node core_node;
> unsigned long core_cookie;
> + unsigned long core_task_cookie;
> + unsigned long core_group_cookie;
> unsigned int core_occupation;
> #endif
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index bab4ea2f5cd8..30a9e4cb5ce1 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -346,11 +346,14 @@ void sched_core_put(void)
> mutex_unlock(&sched_core_mutex);
> }
>
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2);
> +
> #else /* !CONFIG_SCHED_CORE */
>
> static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
> static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
> static bool sched_core_enqueued(struct task_struct *task) { return false; }
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2) { return 0; }
>
> #endif /* CONFIG_SCHED_CORE */
>
> @@ -3583,6 +3586,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> #endif
> #ifdef CONFIG_SCHED_CORE
> RB_CLEAR_NODE(&p->core_node);
> +
> + /*
> + * Tag child via per-task cookie only if parent is tagged via per-task
> + * cookie. This is independent of, but can be additive to the CGroup tagging.
> + */
> + if (current->core_task_cookie) {
> +
> + /* If it is not CLONE_THREAD fork, assign a unique per-task tag. */
> + if (!(clone_flags & CLONE_THREAD)) {
> + return sched_core_share_tasks(p, p);
> + }
> + /* Otherwise share the parent's per-task tag. */
> + return sched_core_share_tasks(p, current);
> + }
> #endif
> return 0;
> }
> @@ -9177,6 +9194,217 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
> #endif /* CONFIG_RT_GROUP_SCHED */
>
> #ifdef CONFIG_SCHED_CORE
> +/*
> + * A simple wrapper around refcount. An allocated sched_core_cookie's
> + * address is used to compute the cookie of the task.
> + */
> +struct sched_core_cookie {
> + refcount_t refcnt;
> +};
> +
> +/*
> + * sched_core_tag_requeue - Common helper for all interfaces to set a cookie.
> + * @p: The task to assign a cookie to.
> + * @cookie: The cookie to assign.
> + * @group: is it a group interface or a per-task interface.
> + *
> + * This function is typically called from a stop-machine handler.
> + */
> +void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, bool group)
> +{
> + if (!p)
> + return;
> +
> + if (group)
> + p->core_group_cookie = cookie;
> + else
> + p->core_task_cookie = cookie;
> +
> + /* Use the upper half of the cookie's bits for the task cookie and the lower half for the group cookie. */
> + p->core_cookie = (p->core_task_cookie <<
> + (sizeof(unsigned long) * 4)) + p->core_group_cookie;
> +
> + if (sched_core_enqueued(p)) {
> + sched_core_dequeue(task_rq(p), p);
> + if (!p->core_task_cookie)
> + return;
> + }
> +
> + if (sched_core_enabled(task_rq(p)) &&
> + p->core_cookie && task_on_rq_queued(p))
> + sched_core_enqueue(task_rq(p), p);
> +}
> +
> +/* Per-task interface */
> +static unsigned long sched_core_alloc_task_cookie(void)
> +{
> + struct sched_core_cookie *ptr =
> + kmalloc(sizeof(struct sched_core_cookie), GFP_KERNEL);
> +
> + if (!ptr)
> + return 0;
> + refcount_set(&ptr->refcnt, 1);
> +
> + /*
> + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> + * is done after the stopper runs.
> + */
> + sched_core_get();
> + return (unsigned long)ptr;
> +}
> +
> +static bool sched_core_get_task_cookie(unsigned long cookie)
> +{
> + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> +
> + /*
> + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> + * is done after the stopper runs.
> + */
> + sched_core_get();
> + return refcount_inc_not_zero(&ptr->refcnt);
> +}
> +
> +static void sched_core_put_task_cookie(unsigned long cookie)
> +{
> + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> +
> + if (refcount_dec_and_test(&ptr->refcnt))
> + kfree(ptr);
> +}
> +
> +struct sched_core_task_write_tag {
> + struct task_struct *tasks[2];
> + unsigned long cookies[2];
> +};
> +
> +/*
> + * Ensure that the task has been requeued. The stopper ensures that the task cannot
> + * be migrated to a different CPU while its core scheduler queue state is being updated.
> + * It also makes sure to requeue a task if it was running actively on another CPU.
> + */
> +static int sched_core_task_join_stopper(void *data)
> +{
> + struct sched_core_task_write_tag *tag = (struct sched_core_task_write_tag *)data;
> + int i;
> +
> + for (i = 0; i < 2; i++)
> + sched_core_tag_requeue(tag->tasks[i], tag->cookies[i], false /* !group */);
> +
> + return 0;
> +}
> +
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
> +{
> + struct sched_core_task_write_tag wr = {}; /* for stop machine. */
> + bool sched_core_put_after_stopper = false;
> + unsigned long cookie;
> + int ret = -ENOMEM;
> +
> + mutex_lock(&sched_core_mutex);
> +
> + /*
> + * NOTE: sched_core_get() is done by sched_core_alloc_task_cookie() or
> + * sched_core_get_task_cookie(). However, sched_core_put() is done
> + * by this function *after* the stopper removes the tasks from the
> + * core queue, and not before. This is just to play it safe.
> + */
> + if (t2 == NULL) {
> + if (t1->core_task_cookie) {
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> + wr.tasks[0] = t1; /* Keep wr.cookies[0] reset for t1. */
> + }
> + } else if (t1 == t2) {
> + /* Assign a unique per-task cookie solely for t1. */
> +
> + cookie = sched_core_alloc_task_cookie();
> + if (!cookie)
> + goto out_unlock;
> +
> + if (t1->core_task_cookie) {
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> + }
> + wr.tasks[0] = t1;
> + wr.cookies[0] = cookie;
> + } else
> + /*
> + * t1 joining t2
> + * CASE 1:
> + * before 0 0
> + * after new cookie new cookie
> + *
> + * CASE 2:
> + * before X (non-zero) 0
> + * after 0 0
> + *
> + * CASE 3:
> + * before 0 X (non-zero)
> + * after X X
> + *
> + * CASE 4:
> + * before Y (non-zero) X (non-zero)
> + * after X X
> + */
> + if (!t1->core_task_cookie && !t2->core_task_cookie) {
> + /* CASE 1. */
> + cookie = sched_core_alloc_task_cookie();
> + if (!cookie)
> + goto out_unlock;
> +
> + /* Add another reference for the other task. */
> + if (!sched_core_get_task_cookie(cookie)) {
> + return -EINVAL;

should be: ret = -EINVAL;

> + goto out_unlock;
> + }
> +
> + wr.tasks[0] = t1;
> + wr.tasks[1] = t2;
> + wr.cookies[0] = wr.cookies[1] = cookie;
> +
> + } else if (t1->core_task_cookie && !t2->core_task_cookie) {
> + /* CASE 2. */
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> +
> + wr.tasks[0] = t1; /* Reset cookie for t1. */
> +
> + } else if (!t1->core_task_cookie && t2->core_task_cookie) {
> + /* CASE 3. */
> + if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + wr.tasks[0] = t1;
> + wr.cookies[0] = t2->core_task_cookie;
> +
> + } else {
> + /* CASE 4. */
> + if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> +
> + wr.tasks[0] = t1;
> + wr.cookies[0] = t2->core_task_cookie;
> + }
> +
> + stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);
> +
> + if (sched_core_put_after_stopper)
> + sched_core_put();
> +
> + ret = 0;
> +out_unlock:
> + mutex_unlock(&sched_core_mutex);
> + return ret;
> +}
> +
> +/* CGroup interface */
> static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
> {
> struct task_group *tg = css_tg(css);
> @@ -9207,18 +9435,9 @@ static int __sched_write_tag(void *data)
> * when we set cgroup tag to 0 when the loop is done below.
> */
> while ((p = css_task_iter_next(&it))) {
> - p->core_cookie = !!val ? (unsigned long)tg : 0UL;
> -
> - if (sched_core_enqueued(p)) {
> - sched_core_dequeue(task_rq(p), p);
> - if (!p->core_cookie)
> - continue;
> - }
> -
> - if (sched_core_enabled(task_rq(p)) &&
> - p->core_cookie && task_on_rq_queued(p))
> - sched_core_enqueue(task_rq(p), p);
> + unsigned long cookie = !!val ? (unsigned long)tg : 0UL;
>
> + sched_core_tag_requeue(p, cookie, true /* group */);
> }
> css_task_iter_end(&it);
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index c8fee8d9dfd4..88bf45267672 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -1024,6 +1024,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
> __PS("clock-delta", t1-t0);
> }
>
> +#ifdef CONFIG_SCHED_CORE
> + __PS("core_cookie", p->core_cookie);
> +#endif
> +
> sched_show_numa(p, m);
> }
>
>

-chrish

2020-11-05 14:51:55

by Joel Fernandes

Subject: Re: [PATCH v8 -tip 17/26] sched: Split the cookie and setup per-task cookie on fork

On Wed, Nov 04, 2020 at 05:30:24PM -0500, chris hyser wrote:
[..]
> > + wr.cookies[0] = cookie;
> > + } else
> > + /*
> > + * t1 joining t2
> > + * CASE 1:
> > + * before 0 0
> > + * after new cookie new cookie
> > + *
> > + * CASE 2:
> > + * before X (non-zero) 0
> > + * after 0 0
> > + *
> > + * CASE 3:
> > + * before 0 X (non-zero)
> > + * after X X
> > + *
> > + * CASE 4:
> > + * before Y (non-zero) X (non-zero)
> > + * after X X
> > + */
> > + if (!t1->core_task_cookie && !t2->core_task_cookie) {
> > + /* CASE 1. */
> > + cookie = sched_core_alloc_task_cookie();
> > + if (!cookie)
> > + goto out_unlock;
> > +
> > + /* Add another reference for the other task. */
> > + if (!sched_core_get_task_cookie(cookie)) {
> > + return -EINVAL;
>
> should be: ret = -EINVAL;

Fixed now, thanks.

thanks,

- Joel


>
> > + goto out_unlock;
> > + }
> > +
> > + wr.tasks[0] = t1;
> > + wr.tasks[1] = t2;
> > + wr.cookies[0] = wr.cookies[1] = cookie;
> > +
> > + } else if (t1->core_task_cookie && !t2->core_task_cookie) {
> > + /* CASE 2. */
> > + sched_core_put_task_cookie(t1->core_task_cookie);
> > + sched_core_put_after_stopper = true;
> > +
> > + wr.tasks[0] = t1; /* Reset cookie for t1. */
> > +
> > + } else if (!t1->core_task_cookie && t2->core_task_cookie) {
> > + /* CASE 3. */
> > + if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
> > + ret = -EINVAL;
> > + goto out_unlock;
> > + }
> > +
> > + wr.tasks[0] = t1;
> > + wr.cookies[0] = t2->core_task_cookie;
> > +
> > + } else {
> > + /* CASE 4. */
> > + if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
> > + ret = -EINVAL;
> > + goto out_unlock;
> > + }
> > + sched_core_put_task_cookie(t1->core_task_cookie);
> > + sched_core_put_after_stopper = true;
> > +
> > + wr.tasks[0] = t1;
> > + wr.cookies[0] = t2->core_task_cookie;
> > + }
> > +
> > + stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);
> > +
> > + if (sched_core_put_after_stopper)
> > + sched_core_put();
> > +
> > + ret = 0;
> > +out_unlock:
> > + mutex_unlock(&sched_core_mutex);
> > + return ret;
> > +}
> > +
> > +/* CGroup interface */
> > static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
> > {
> > struct task_group *tg = css_tg(css);
> > @@ -9207,18 +9435,9 @@ static int __sched_write_tag(void *data)
> > * when we set cgroup tag to 0 when the loop is done below.
> > */
> > while ((p = css_task_iter_next(&it))) {
> > - p->core_cookie = !!val ? (unsigned long)tg : 0UL;
> > -
> > - if (sched_core_enqueued(p)) {
> > - sched_core_dequeue(task_rq(p), p);
> > - if (!p->core_cookie)
> > - continue;
> > - }
> > -
> > - if (sched_core_enabled(task_rq(p)) &&
> > - p->core_cookie && task_on_rq_queued(p))
> > - sched_core_enqueue(task_rq(p), p);
> > + unsigned long cookie = !!val ? (unsigned long)tg : 0UL;
> > + sched_core_tag_requeue(p, cookie, true /* group */);
> > }
> > css_task_iter_end(&it);
> > diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> > index c8fee8d9dfd4..88bf45267672 100644
> > --- a/kernel/sched/debug.c
> > +++ b/kernel/sched/debug.c
> > @@ -1024,6 +1024,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
> > __PS("clock-delta", t1-t0);
> > }
> > +#ifdef CONFIG_SCHED_CORE
> > + __PS("core_cookie", p->core_cookie);
> > +#endif
> > +
> > sched_show_numa(p, m);
> > }
> >
>
> -chrish

2020-11-09 23:27:36

by Chris Hyser

Subject: Re: [PATCH v8 -tip 17/26] sched: Split the cookie and setup per-task cookie on fork

> On 10/19/20 9:43 PM, Joel Fernandes (Google) wrote:
>> In order to prevent interference and clearly support both per-task and CGroup
>> APIs, split the cookie into two and allow it to be set from either the per-task
>> or the CGroup API. The final cookie is the combined value of both and is
>> computed when stop-machine executes during a change of cookie.
>>
>> Also, for the per-task cookie, it will get weird if we use pointers to
>> ephemeral objects. For this reason, introduce a refcounted object whose sole
>> purpose is to assign a unique cookie value by way of the object's pointer.
>>
>> While at it, refactor the CGroup code a bit. Future patches will introduce more
>> APIs and support.
>>
>> Tested-by: Julien Desfossez <[email protected]>
>> Signed-off-by: Joel Fernandes (Google) <[email protected]>
>> ---
>> include/linux/sched.h | 2 +
>> kernel/sched/core.c | 241 ++++++++++++++++++++++++++++++++++++++++--
>> kernel/sched/debug.c | 4 +
>> 3 files changed, 236 insertions(+), 11 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index fe6f225bfbf9..c6034c00846a 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -688,6 +688,8 @@ struct task_struct {
>> #ifdef CONFIG_SCHED_CORE
>> struct rb_node core_node;
>> unsigned long core_cookie;
>> + unsigned long core_task_cookie;
>> + unsigned long core_group_cookie;
>> unsigned int core_occupation;
>> #endif
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index bab4ea2f5cd8..30a9e4cb5ce1 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -346,11 +346,14 @@ void sched_core_put(void)
>> mutex_unlock(&sched_core_mutex);
>> }
>> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2);
>> +
>> #else /* !CONFIG_SCHED_CORE */
>> static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
>> static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
>> static bool sched_core_enqueued(struct task_struct *task) { return false; }
>> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2) { return 0; }
>> #endif /* CONFIG_SCHED_CORE */
>> @@ -3583,6 +3586,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
>> #endif
>> #ifdef CONFIG_SCHED_CORE
>> RB_CLEAR_NODE(&p->core_node);
>> +
>> + /*
>> + * Tag child via per-task cookie only if parent is tagged via per-task
>> + * cookie. This is independent of, but can be additive to the CGroup tagging.
>> + */
>> + if (current->core_task_cookie) {
>> +
>> + /* If it is not CLONE_THREAD fork, assign a unique per-task tag. */
>> + if (!(clone_flags & CLONE_THREAD)) {
>> + return sched_core_share_tasks(p, p);
>> + }
>> + /* Otherwise share the parent's per-task tag. */
>> + return sched_core_share_tasks(p, current);
>> + }
>> #endif
>> return 0;
>> }

sched_core_share_tasks() looks at the value of the new task's core_task_cookie, which is non-zero on a
process or thread clone and causes an underflow of both the core-scheduling enable count and the cookie refcounts.

So just zero it in __sched_fork().
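
A minimal userspace model of the underflow (names are illustrative, not the
kernel's): the clone starts with a byte-copy of the parent's cookie field, so
the child's subsequent put releases a reference the child never took:

#include <assert.h>
#include <stdio.h>

struct cookie { int refcnt; };

static void put_cookie(struct cookie *c)
{
	c->refcnt--;
	printf("refcnt = %d\n", c->refcnt);
	assert(c->refcnt >= 0);	/* fires on the unbalanced put */
}

int main(void)
{
	struct cookie parent_cookie = { .refcnt = 1 };	/* parent's reference */
	struct cookie *inherited = &parent_cookie;	/* child copies the value, takes no reference */

	put_cookie(inherited);		/* child's put: 1 -> 0, parent's ref is gone */
	put_cookie(&parent_cookie);	/* parent's own put: 0 -> -1, underflow */
	return 0;
}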

-chris


[PATCH] sched: zero out the core scheduling cookie on clone

From: chris hyser <[email protected]>

As the cookie is reference counted, zero it on clone even when inherited, and
allow explicit sharing.

Signed-off-by: Chris Hyser <[email protected]>
---
kernel/sched/core.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fd3cc03..2af0ea6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3378,6 +3378,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SCHED_CORE
+ p->core_task_cookie = 0;
+#endif
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
#endif
--
1.8.3.1

2020-11-09 23:38:53

by Chris Hyser

Subject: Re: [PATCH v8 -tip 17/26] sched: Split the cookie and setup per-task cookie on fork

On 11/4/20 5:30 PM, chris hyser wrote:
> On 10/19/20 9:43 PM, Joel Fernandes (Google) wrote:
>> In order to prevent interference and clearly support both per-task and CGroup
>> APIs, split the cookie into two and allow it to be set from either the per-task
>> or the CGroup API. The final cookie is the combined value of both and is
>> computed when stop-machine executes during a change of cookie.
>>
>> Also, for the per-task cookie, it will get weird if we use pointers to
>> ephemeral objects. For this reason, introduce a refcounted object whose sole
>> purpose is to assign a unique cookie value by way of the object's pointer.
>>
>> While at it, refactor the CGroup code a bit. Future patches will introduce more
>> APIs and support.
>>
>> Tested-by: Julien Desfossez <[email protected]>
>> Signed-off-by: Joel Fernandes (Google) <[email protected]>
>> ---
>>   include/linux/sched.h |   2 +
>>   kernel/sched/core.c   | 241 ++++++++++++++++++++++++++++++++++++++++--
>>   kernel/sched/debug.c  |   4 +
>>   3 files changed, 236 insertions(+), 11 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index fe6f225bfbf9..c6034c00846a 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -688,6 +688,8 @@ struct task_struct {
>>   #ifdef CONFIG_SCHED_CORE
>>       struct rb_node            core_node;
>>       unsigned long            core_cookie;
>> +    unsigned long            core_task_cookie;
>> +    unsigned long            core_group_cookie;
>>       unsigned int            core_occupation;
>>   #endif
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index bab4ea2f5cd8..30a9e4cb5ce1 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -346,11 +346,14 @@ void sched_core_put(void)
>>       mutex_unlock(&sched_core_mutex);
>>   }
>> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2);
>> +
>>   #else /* !CONFIG_SCHED_CORE */
>>   static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
>>   static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
>>   static bool sched_core_enqueued(struct task_struct *task) { return false; }
>> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2) { return 0; }
>>   #endif /* CONFIG_SCHED_CORE */
>> @@ -3583,6 +3586,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
>>   #endif
>>   #ifdef CONFIG_SCHED_CORE
>>       RB_CLEAR_NODE(&p->core_node);
>> +
>> +    /*
>> +     * Tag child via per-task cookie only if parent is tagged via per-task
>> +     * cookie. This is independent of, but can be additive to the CGroup tagging.
>> +     */
>> +    if (current->core_task_cookie) {
>> +
>> +        /* If it is not CLONE_THREAD fork, assign a unique per-task tag. */
>> +        if (!(clone_flags & CLONE_THREAD)) {
>> +            return sched_core_share_tasks(p, p);
>> +        }
>> +        /* Otherwise share the parent's per-task tag. */
>> +        return sched_core_share_tasks(p, current);
>> +    }
>>   #endif
>>       return 0;
>>   }

sched_core_share_tasks() looks at the value of the new task's core_task_cookie, which is non-zero on a
process or thread clone and causes an underflow of both the core-scheduling enable count and the cookie refcounts.

So just zero it in __sched_fork().

-chrish

------

sched: zero out the core scheduling cookie on clone

As the cookie is reference counted, zero it on clone even when inherited, and
allow explicit sharing.

Signed-off-by: Chris Hyser <[email protected]>
---
kernel/sched/core.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fd3cc03..2af0ea6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3378,6 +3378,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SCHED_CORE
+ p->core_task_cookie = 0;
+#endif
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
#endif

2020-11-10 14:20:11

by Joel Fernandes

Subject: Re: [PATCH v8 -tip 17/26] sched: Split the cookie and setup per-task cookie on fork

On Mon, Nov 09, 2020 at 06:23:13PM -0500, chris hyser wrote:
[..]
> >
> > > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > > index fe6f225bfbf9..c6034c00846a 100644
> > > --- a/include/linux/sched.h
> > > +++ b/include/linux/sched.h
> > > @@ -688,6 +688,8 @@ struct task_struct {
> > > #ifdef CONFIG_SCHED_CORE
> > > struct rb_node core_node;
> > > unsigned long core_cookie;
> > > + unsigned long core_task_cookie;
> > > + unsigned long core_group_cookie;
> > > unsigned int core_occupation;
> > > #endif
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index bab4ea2f5cd8..30a9e4cb5ce1 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -346,11 +346,14 @@ void sched_core_put(void)
> > > mutex_unlock(&sched_core_mutex);
> > > }
> > > +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2);
> > > +
> > > #else /* !CONFIG_SCHED_CORE */
> > > static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
> > > static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
> > > static bool sched_core_enqueued(struct task_struct *task) { return false; }
> > > +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2) { return 0; }
> > > #endif /* CONFIG_SCHED_CORE */
> > > @@ -3583,6 +3586,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > > #endif
> > > #ifdef CONFIG_SCHED_CORE
> > > RB_CLEAR_NODE(&p->core_node);
> > > +
> > > + /*
> > > + * Tag child via per-task cookie only if parent is tagged via per-task
> > > + * cookie. This is independent of, but can be additive to the CGroup tagging.
> > > + */
> > > + if (current->core_task_cookie) {
> > > +
> > > + /* If it is not CLONE_THREAD fork, assign a unique per-task tag. */
> > > + if (!(clone_flags & CLONE_THREAD)) {
> > > + return sched_core_share_tasks(p, p);
> > > + }
> > > + /* Otherwise share the parent's per-task tag. */
> > > + return sched_core_share_tasks(p, current);
> > > + }
> > > #endif
> > > return 0;
> > > }
>
> sched_core_share_tasks() looks at the value of the new task's core_task_cookie, which is non-zero on a
> process or thread clone and causes an underflow of both the core-scheduling enable count and the cookie refcounts.
>
> So just zero it in __sched_fork().

Nice catch, applying to my v9 staging tree. Thanks!

thanks,

- Joel


>
> -chris
>
>
> [PATCH] sched: zero out the core scheduling cookie on clone
>
> From: chris hyser <[email protected]>
>
> As the cookie is reference counted, zero it on clone even when inherited, and
> allow explicit sharing.
>
> Signed-off-by: Chris Hyser <[email protected]>
> ---
> kernel/sched/core.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index fd3cc03..2af0ea6 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3378,6 +3378,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->capture_control = NULL;
> #endif
> init_numa_balancing(clone_flags, p);
> +#ifdef CONFIG_SCHED_CORE
> + p->core_task_cookie = 0;
> +#endif
> #ifdef CONFIG_SMP
> p->wake_entry.u_flags = CSD_TYPE_TTWU;
> #endif
> --
> 1.8.3.1
>

2020-11-17 00:47:56

by Joel Fernandes

Subject: Re: [PATCH v8 -tip 17/26] sched: Split the cookie and setup per-task cookie on fork

Hi Chris,

On Mon, Nov 09, 2020 at 06:23:13PM -0500, chris hyser wrote:
> > On 10/19/20 9:43 PM, Joel Fernandes (Google) wrote:
> > > In order to prevent interference and clearly support both per-task and CGroup
> > > APIs, split the cookie into two and allow it to be set from either the per-task
> > > or the CGroup API. The final cookie is the combined value of both and is
> > > computed when stop-machine executes during a change of cookie.
> > >
> > > Also, for the per-task cookie, it will get weird if we use pointers to
> > > ephemeral objects. For this reason, introduce a refcounted object whose sole
> > > purpose is to assign a unique cookie value by way of the object's pointer.
> > >
> > > While at it, refactor the CGroup code a bit. Future patches will introduce more
> > > APIs and support.
> > >
> > > Tested-by: Julien Desfossez <[email protected]>
> > > Signed-off-by: Joel Fernandes (Google) <[email protected]>
> > > ---
> > > include/linux/sched.h | 2 +
> > > kernel/sched/core.c | 241 ++++++++++++++++++++++++++++++++++++++++--
> > > kernel/sched/debug.c | 4 +
> > > 3 files changed, 236 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > > index fe6f225bfbf9..c6034c00846a 100644
> > > --- a/include/linux/sched.h
> > > +++ b/include/linux/sched.h
> > > @@ -688,6 +688,8 @@ struct task_struct {
> > > #ifdef CONFIG_SCHED_CORE
> > > struct rb_node core_node;
> > > unsigned long core_cookie;
> > > + unsigned long core_task_cookie;
> > > + unsigned long core_group_cookie;
> > > unsigned int core_occupation;
> > > #endif
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index bab4ea2f5cd8..30a9e4cb5ce1 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -346,11 +346,14 @@ void sched_core_put(void)
> > > mutex_unlock(&sched_core_mutex);
> > > }
> > > +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2);
> > > +
> > > #else /* !CONFIG_SCHED_CORE */
> > > static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
> > > static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
> > > static bool sched_core_enqueued(struct task_struct *task) { return false; }
> > > +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2) { return 0; }
> > > #endif /* CONFIG_SCHED_CORE */
> > > @@ -3583,6 +3586,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > > #endif
> > > #ifdef CONFIG_SCHED_CORE
> > > RB_CLEAR_NODE(&p->core_node);
> > > +
> > > + /*
> > > + * Tag child via per-task cookie only if parent is tagged via per-task
> > > + * cookie. This is independent of, but can be additive to the CGroup tagging.
> > > + */
> > > + if (current->core_task_cookie) {
> > > +
> > > + /* If it is not CLONE_THREAD fork, assign a unique per-task tag. */
> > > + if (!(clone_flags & CLONE_THREAD)) {
> > > + return sched_core_share_tasks(p, p);
> > > + }
> > > + /* Otherwise share the parent's per-task tag. */
> > > + return sched_core_share_tasks(p, current);
> > > + }
> > > #endif
> > > return 0;
> > > }
>
> sched_core_share_tasks() looks at the value of the new task's core_task_cookie, which is non-zero on a
> process or thread clone and causes an underflow of both the core-scheduling enable count and the cookie refcounts.
>
> So just zero it in __sched_fork().

Since I am going to split this into a new patch, could you please send me a
detailed commit message explaining the issue? I feel the one-liner below is
unclear / insufficient as a commit message.

thanks,

- Joel


> -chris
>
>
> [PATCH] sched: zero out the core scheduling cookie on clone
>
> From: chris hyser <[email protected]>
>
> As the cookie is reference counted, zero it on clone even when inherited, and
> allow explicit sharing.
>
> Signed-off-by: Chris Hyser <[email protected]>
> ---
> kernel/sched/core.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index fd3cc03..2af0ea6 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3378,6 +3378,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->capture_control = NULL;
> #endif
> init_numa_balancing(clone_flags, p);
> +#ifdef CONFIG_SCHED_CORE
> + p->core_task_cookie = 0;
> +#endif
> #ifdef CONFIG_SMP
> p->wake_entry.u_flags = CSD_TYPE_TTWU;
> #endif
> --
> 1.8.3.1
>