2020-11-17 23:24:28

by Joel Fernandes

[permalink] [raw]
Subject: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

In order to prevent interference and clearly support both per-task and CGroup
APIs, split the cookie into 2 and allow it to be set from either per-task, or
CGroup API. The final cookie is the combined value of both and is computed when
the stop-machine executes during a change of cookie.

Also, for the per-task cookie, it will get weird if we use pointers to any
ephemeral objects. For this reason, introduce a refcounted object whose sole
purpose is to assign a unique cookie value by way of the object's pointer.

While at it, refactor the CGroup code a bit. Future patches will introduce more
APIs and support.

Reviewed-by: Josh Don <[email protected]>
Tested-by: Julien Desfossez <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
include/linux/sched.h | 2 +
kernel/sched/core.c | 241 ++++++++++++++++++++++++++++++++++++++++--
kernel/sched/debug.c | 4 +
3 files changed, 236 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a60868165590..c6a3b0fa952b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -688,6 +688,8 @@ struct task_struct {
#ifdef CONFIG_SCHED_CORE
struct rb_node core_node;
unsigned long core_cookie;
+ unsigned long core_task_cookie;
+ unsigned long core_group_cookie;
unsigned int core_occupation;
#endif

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b99a7493d590..7ccca355623a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -346,11 +346,14 @@ void sched_core_put(void)
mutex_unlock(&sched_core_mutex);
}

+static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2);
+
#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
static bool sched_core_enqueued(struct task_struct *task) { return false; }
+static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2) { }

#endif /* CONFIG_SCHED_CORE */

@@ -4032,6 +4035,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
#ifdef CONFIG_SCHED_CORE
RB_CLEAR_NODE(&p->core_node);
+
+ /*
+ * Tag child via per-task cookie only if parent is tagged via per-task
+ * cookie. This is independent of, but can be additive to the CGroup tagging.
+ */
+ if (current->core_task_cookie) {
+
+ /* If it is not CLONE_THREAD fork, assign a unique per-task tag. */
+ if (!(clone_flags & CLONE_THREAD)) {
+ return sched_core_share_tasks(p, p);
+ }
+ /* Otherwise share the parent's per-task tag. */
+ return sched_core_share_tasks(p, current);
+ }
#endif
return 0;
}
@@ -9731,6 +9748,217 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SCHED_CORE
+/*
+ * A simple wrapper around refcount. An allocated sched_core_cookie's
+ * address is used to compute the cookie of the task.
+ */
+struct sched_core_cookie {
+ refcount_t refcnt;
+};
+
+/*
+ * sched_core_tag_requeue - Common helper for all interfaces to set a cookie.
+ * @p: The task to assign a cookie to.
+ * @cookie: The cookie to assign.
+ * @group: is it a group interface or a per-task interface.
+ *
+ * This function is typically called from a stop-machine handler.
+ */
+void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, bool group)
+{
+ if (!p)
+ return;
+
+ if (group)
+ p->core_group_cookie = cookie;
+ else
+ p->core_task_cookie = cookie;
+
+ /* Use up half of the cookie's bits for task cookie and remaining for group cookie. */
+ p->core_cookie = (p->core_task_cookie <<
+ (sizeof(unsigned long) * 4)) + p->core_group_cookie;
+
+ if (sched_core_enqueued(p)) {
+ sched_core_dequeue(task_rq(p), p);
+ if (!p->core_task_cookie)
+ return;
+ }
+
+ if (sched_core_enabled(task_rq(p)) &&
+ p->core_cookie && task_on_rq_queued(p))
+ sched_core_enqueue(task_rq(p), p);
+}
+
+/* Per-task interface */
+static unsigned long sched_core_alloc_task_cookie(void)
+{
+ struct sched_core_cookie *ptr =
+ kmalloc(sizeof(struct sched_core_cookie), GFP_KERNEL);
+
+ if (!ptr)
+ return 0;
+ refcount_set(&ptr->refcnt, 1);
+
+ /*
+ * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
+ * is done after the stopper runs.
+ */
+ sched_core_get();
+ return (unsigned long)ptr;
+}
+
+static bool sched_core_get_task_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
+
+ /*
+ * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
+ * is done after the stopper runs.
+ */
+ sched_core_get();
+ return refcount_inc_not_zero(&ptr->refcnt);
+}
+
+static void sched_core_put_task_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
+
+ if (refcount_dec_and_test(&ptr->refcnt))
+ kfree(ptr);
+}
+
+struct sched_core_task_write_tag {
+ struct task_struct *tasks[2];
+ unsigned long cookies[2];
+};
+
+/*
+ * Ensure that the task has been requeued. The stopper ensures that the task cannot
+ * be migrated to a different CPU while its core scheduler queue state is being updated.
+ * It also makes sure to requeue a task if it was running actively on another CPU.
+ */
+static int sched_core_task_join_stopper(void *data)
+{
+ struct sched_core_task_write_tag *tag = (struct sched_core_task_write_tag *)data;
+ int i;
+
+ for (i = 0; i < 2; i++)
+ sched_core_tag_requeue(tag->tasks[i], tag->cookies[i], false /* !group */);
+
+ return 0;
+}
+
+static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
+{
+ struct sched_core_task_write_tag wr = {}; /* for stop machine. */
+ bool sched_core_put_after_stopper = false;
+ unsigned long cookie;
+ int ret = -ENOMEM;
+
+ mutex_lock(&sched_core_mutex);
+
+ /*
+ * NOTE: sched_core_get() is done by sched_core_alloc_task_cookie() or
+ * sched_core_put_task_cookie(). However, sched_core_put() is done
+ * by this function *after* the stopper removes the tasks from the
+ * core queue, and not before. This is just to play it safe.
+ */
+ if (t2 == NULL) {
+ if (t1->core_task_cookie) {
+ sched_core_put_task_cookie(t1->core_task_cookie);
+ sched_core_put_after_stopper = true;
+ wr.tasks[0] = t1; /* Keep wr.cookies[0] reset for t1. */
+ }
+ } else if (t1 == t2) {
+ /* Assign a unique per-task cookie solely for t1. */
+
+ cookie = sched_core_alloc_task_cookie();
+ if (!cookie)
+ goto out_unlock;
+
+ if (t1->core_task_cookie) {
+ sched_core_put_task_cookie(t1->core_task_cookie);
+ sched_core_put_after_stopper = true;
+ }
+ wr.tasks[0] = t1;
+ wr.cookies[0] = cookie;
+ } else
+ /*
+ * t1 joining t2
+ * CASE 1:
+ * before 0 0
+ * after new cookie new cookie
+ *
+ * CASE 2:
+ * before X (non-zero) 0
+ * after 0 0
+ *
+ * CASE 3:
+ * before 0 X (non-zero)
+ * after X X
+ *
+ * CASE 4:
+ * before Y (non-zero) X (non-zero)
+ * after X X
+ */
+ if (!t1->core_task_cookie && !t2->core_task_cookie) {
+ /* CASE 1. */
+ cookie = sched_core_alloc_task_cookie();
+ if (!cookie)
+ goto out_unlock;
+
+ /* Add another reference for the other task. */
+ if (!sched_core_get_task_cookie(cookie)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ wr.tasks[0] = t1;
+ wr.tasks[1] = t2;
+ wr.cookies[0] = wr.cookies[1] = cookie;
+
+ } else if (t1->core_task_cookie && !t2->core_task_cookie) {
+ /* CASE 2. */
+ sched_core_put_task_cookie(t1->core_task_cookie);
+ sched_core_put_after_stopper = true;
+
+ wr.tasks[0] = t1; /* Reset cookie for t1. */
+
+ } else if (!t1->core_task_cookie && t2->core_task_cookie) {
+ /* CASE 3. */
+ if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ wr.tasks[0] = t1;
+ wr.cookies[0] = t2->core_task_cookie;
+
+ } else {
+ /* CASE 4. */
+ if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ sched_core_put_task_cookie(t1->core_task_cookie);
+ sched_core_put_after_stopper = true;
+
+ wr.tasks[0] = t1;
+ wr.cookies[0] = t2->core_task_cookie;
+ }
+
+ stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);
+
+ if (sched_core_put_after_stopper)
+ sched_core_put();
+
+ ret = 0;
+out_unlock:
+ mutex_unlock(&sched_core_mutex);
+ return ret;
+}
+
+/* CGroup interface */
static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct task_group *tg = css_tg(css);
@@ -9761,18 +9989,9 @@ static int __sched_write_tag(void *data)
* when we set cgroup tag to 0 when the loop is done below.
*/
while ((p = css_task_iter_next(&it))) {
- p->core_cookie = !!val ? (unsigned long)tg : 0UL;
-
- if (sched_core_enqueued(p)) {
- sched_core_dequeue(task_rq(p), p);
- if (!p->core_cookie)
- continue;
- }
-
- if (sched_core_enabled(task_rq(p)) &&
- p->core_cookie && task_on_rq_queued(p))
- sched_core_enqueue(task_rq(p), p);
+ unsigned long cookie = !!val ? (unsigned long)tg : 0UL;

+ sched_core_tag_requeue(p, cookie, true /* group */);
}
css_task_iter_end(&it);

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 60a922d3f46f..8c452b8010ad 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1024,6 +1024,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PS("clock-delta", t1-t0);
}

+#ifdef CONFIG_SCHED_CORE
+ __PS("core_cookie", p->core_cookie);
+#endif
+
sched_show_numa(p, m);
}

--
2.29.2.299.gdc1121823c-goog


2020-11-25 11:10:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> Also, for the per-task cookie, it will get weird if we use pointers of any
> emphemeral objects. For this reason, introduce a refcounted object who's sole
> purpose is to assign unique cookie value by way of the object's pointer.

Might be useful to explain why exactly none of the many pid_t's are
good enough.

2020-11-25 11:13:28

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> +void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, bool group)
> +{
> + if (!p)
> + return;
> +
> + if (group)
> + p->core_group_cookie = cookie;
> + else
> + p->core_task_cookie = cookie;
> +
> + /* Use up half of the cookie's bits for task cookie and remaining for group cookie. */
> + p->core_cookie = (p->core_task_cookie <<
> + (sizeof(unsigned long) * 4)) + p->core_group_cookie;

This seems dangerous; afaict there is nothing that prevents cookie
collision.

2020-11-25 11:15:47

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:

> + * sched_core_tag_requeue - Common helper for all interfaces to set a cookie.

sched_core_set_cookie() would be a saner name, given that description,
don't you think?

2020-11-25 11:19:07

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:

> +/*
> + * Ensure that the task has been requeued. The stopper ensures that the task cannot
> + * be migrated to a different CPU while its core scheduler queue state is being updated.
> + * It also makes sure to requeue a task if it was running actively on another CPU.
> + */
> +static int sched_core_task_join_stopper(void *data)
> +{
> + struct sched_core_task_write_tag *tag = (struct sched_core_task_write_tag *)data;
> + int i;
> +
> + for (i = 0; i < 2; i++)
> + sched_core_tag_requeue(tag->tasks[i], tag->cookies[i], false /* !group */);
> +
> + return 0;
> +}
> +
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
> +{

> + stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);

> +}

This is *REALLY* terrible...

2020-11-25 13:00:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> +/* Per-task interface */
> +static unsigned long sched_core_alloc_task_cookie(void)
> +{
> + struct sched_core_cookie *ptr =
> + kmalloc(sizeof(struct sched_core_cookie), GFP_KERNEL);
> +
> + if (!ptr)
> + return 0;
> + refcount_set(&ptr->refcnt, 1);
> +
> + /*
> + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> + * is done after the stopper runs.
> + */
> + sched_core_get();
> + return (unsigned long)ptr;
> +}
> +
> +static bool sched_core_get_task_cookie(unsigned long cookie)
> +{
> + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> +
> + /*
> + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> + * is done after the stopper runs.
> + */
> + sched_core_get();
> + return refcount_inc_not_zero(&ptr->refcnt);
> +}
> +
> +static void sched_core_put_task_cookie(unsigned long cookie)
> +{
> + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> +
> + if (refcount_dec_and_test(&ptr->refcnt))
> + kfree(ptr);
> +}

> + /*
> + * NOTE: sched_core_get() is done by sched_core_alloc_task_cookie() or
> + * sched_core_put_task_cookie(). However, sched_core_put() is done
> + * by this function *after* the stopper removes the tasks from the
> + * core queue, and not before. This is just to play it safe.
> + */

So for no reason what so ever you've made the code more difficult?

2020-11-25 13:07:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> +static bool sched_core_get_task_cookie(unsigned long cookie)
> +{
> + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> +
> + /*
> + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> + * is done after the stopper runs.
> + */
> + sched_core_get();
> + return refcount_inc_not_zero(&ptr->refcnt);

See below, but afaict this should be refcount_inc().

> +}

> + /*
> + * t1 joining t2
> + * CASE 1:
> + * before 0 0
> + * after new cookie new cookie
> + *
> + * CASE 2:
> + * before X (non-zero) 0
> + * after 0 0
> + *
> + * CASE 3:
> + * before 0 X (non-zero)
> + * after X X
> + *
> + * CASE 4:
> + * before Y (non-zero) X (non-zero)
> + * after X X
> + */
> + if (!t1->core_task_cookie && !t2->core_task_cookie) {
> + /* CASE 1. */
> + cookie = sched_core_alloc_task_cookie();
> + if (!cookie)
> + goto out_unlock;
> +
> + /* Add another reference for the other task. */
> + if (!sched_core_get_task_cookie(cookie)) {

afaict this should be refcount_inc(), as this can never fail and if it
does, it's an error.

> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + wr.tasks[0] = t1;
> + wr.tasks[1] = t2;
> + wr.cookies[0] = wr.cookies[1] = cookie;
> +
> + } else if (t1->core_task_cookie && !t2->core_task_cookie) {
> + /* CASE 2. */
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> +
> + wr.tasks[0] = t1; /* Reset cookie for t1. */
> +
> + } else if (!t1->core_task_cookie && t2->core_task_cookie) {
> + /* CASE 3. */
> + if (!sched_core_get_task_cookie(t2->core_task_cookie)) {

afaict this can never fail either, because you're calling in here with a
reference on t2

> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + wr.tasks[0] = t1;
> + wr.cookies[0] = t2->core_task_cookie;
> +
> + } else {
> + /* CASE 4. */
> + if (!sched_core_get_task_cookie(t2->core_task_cookie)) {

Same.

> + ret = -EINVAL;
> + goto out_unlock;
> + }
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> +
> + wr.tasks[0] = t1;
> + wr.cookies[0] = t2->core_task_cookie;
> + }

2020-12-01 02:41:13

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> In order to prevent interference and clearly support both per-task and CGroup
> APIs, split the cookie into 2 and allow it to be set from either per-task, or
> CGroup API. The final cookie is the combined value of both and is computed when
> the stop-machine executes during a change of cookie.
>
> Also, for the per-task cookie, it will get weird if we use pointers of any
> emphemeral objects. For this reason, introduce a refcounted object who's sole
> purpose is to assign unique cookie value by way of the object's pointer.
>
> While at it, refactor the CGroup code a bit. Future patches will introduce more
> APIs and support.
>
> Reviewed-by: Josh Don <[email protected]>
> Tested-by: Julien Desfossez <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> include/linux/sched.h | 2 +
> kernel/sched/core.c | 241 ++++++++++++++++++++++++++++++++++++++++--
> kernel/sched/debug.c | 4 +
> 3 files changed, 236 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a60868165590..c6a3b0fa952b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -688,6 +688,8 @@ struct task_struct {
> #ifdef CONFIG_SCHED_CORE
> struct rb_node core_node;
> unsigned long core_cookie;
> + unsigned long core_task_cookie;
> + unsigned long core_group_cookie;
> unsigned int core_occupation;
> #endif
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b99a7493d590..7ccca355623a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -346,11 +346,14 @@ void sched_core_put(void)
> mutex_unlock(&sched_core_mutex);
> }
>
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2);
> +
> #else /* !CONFIG_SCHED_CORE */
>
> static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
> static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
> static bool sched_core_enqueued(struct task_struct *task) { return false; }
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2) { }
>
> #endif /* CONFIG_SCHED_CORE */
>
> @@ -4032,6 +4035,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> #endif
> #ifdef CONFIG_SCHED_CORE
> RB_CLEAR_NODE(&p->core_node);
> +
> + /*
> + * Tag child via per-task cookie only if parent is tagged via per-task
> + * cookie. This is independent of, but can be additive to the CGroup tagging.
> + */
> + if (current->core_task_cookie) {
> +
> + /* If it is not CLONE_THREAD fork, assign a unique per-task tag. */
> + if (!(clone_flags & CLONE_THREAD)) {
> + return sched_core_share_tasks(p, p);
> + }
> + /* Otherwise share the parent's per-task tag. */
> + return sched_core_share_tasks(p, current);
> + }
> #endif
> return 0;
> }
> @@ -9731,6 +9748,217 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
> #endif /* CONFIG_RT_GROUP_SCHED */
>
> #ifdef CONFIG_SCHED_CORE
> +/*
> + * A simple wrapper around refcount. An allocated sched_core_cookie's
> + * address is used to compute the cookie of the task.
> + */
> +struct sched_core_cookie {
> + refcount_t refcnt;
> +};
> +
> +/*
> + * sched_core_tag_requeue - Common helper for all interfaces to set a cookie.
> + * @p: The task to assign a cookie to.
> + * @cookie: The cookie to assign.
> + * @group: is it a group interface or a per-task interface.
> + *
> + * This function is typically called from a stop-machine handler.

Can you clarify if it is typically or always, what are the implications for
locking?

> + */
> +void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, bool group)
> +{
> + if (!p)
> + return;
> +
> + if (group)
> + p->core_group_cookie = cookie;
> + else
> + p->core_task_cookie = cookie;
> +
> + /* Use up half of the cookie's bits for task cookie and remaining for group cookie. */
> + p->core_cookie = (p->core_task_cookie <<
> + (sizeof(unsigned long) * 4)) + p->core_group_cookie;
> +

Always use masks to ensure this fits in the space we have, should we be concerned about
overflows and the potential for collision of cookie values?

> + if (sched_core_enqueued(p)) {
> + sched_core_dequeue(task_rq(p), p);
> + if (!p->core_task_cookie)
> + return;
> + }
> +
> + if (sched_core_enabled(task_rq(p)) &&
> + p->core_cookie && task_on_rq_queued(p))
> + sched_core_enqueue(task_rq(p), p);
> +}
> +
> +/* Per-task interface */
> +static unsigned long sched_core_alloc_task_cookie(void)
> +{
> + struct sched_core_cookie *ptr =
> + kmalloc(sizeof(struct sched_core_cookie), GFP_KERNEL);
> +
> + if (!ptr)
> + return 0;
> + refcount_set(&ptr->refcnt, 1);
> +
> + /*
> + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> + * is done after the stopper runs.
> + */
> + sched_core_get();
> + return (unsigned long)ptr;
> +}
> +
> +static bool sched_core_get_task_cookie(unsigned long cookie)
> +{
> + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> +
> + /*
> + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> + * is done after the stopper runs.
> + */
> + sched_core_get();
> + return refcount_inc_not_zero(&ptr->refcnt);
> +}
> +
> +static void sched_core_put_task_cookie(unsigned long cookie)
> +{
> + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> +
> + if (refcount_dec_and_test(&ptr->refcnt))
> + kfree(ptr);
> +}
> +
> +struct sched_core_task_write_tag {
> + struct task_struct *tasks[2];
> + unsigned long cookies[2];
> +};

Use a better name instead of 2?

> +
> +/*
> + * Ensure that the task has been requeued. The stopper ensures that the task cannot
> + * be migrated to a different CPU while its core scheduler queue state is being updated.
> + * It also makes sure to requeue a task if it was running actively on another CPU.
> + */
> +static int sched_core_task_join_stopper(void *data)
> +{
> + struct sched_core_task_write_tag *tag = (struct sched_core_task_write_tag *)data;
> + int i;
> +
> + for (i = 0; i < 2; i++)

Use ARRAY_SIZE(cookies) if you have to or ARRAY_SIZE(tasks)

> + sched_core_tag_requeue(tag->tasks[i], tag->cookies[i], false /* !group */);
> +
> + return 0;
> +}
> +
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
> +{

Can you please explain how t1 and t2 are related, there is a table below, but
I don't understand case#2, where the cookies get reset, is t2 the core leader
and t2 leads what t1 and t2 collectively get?

May be just called t2 as parent?

> + struct sched_core_task_write_tag wr = {}; /* for stop machine. */
> + bool sched_core_put_after_stopper = false;
> + unsigned long cookie;
> + int ret = -ENOMEM;
> +
> + mutex_lock(&sched_core_mutex);
> +
> + /*
> + * NOTE: sched_core_get() is done by sched_core_alloc_task_cookie() or
> + * sched_core_put_task_cookie(). However, sched_core_put() is done
> + * by this function *after* the stopper removes the tasks from the
> + * core queue, and not before. This is just to play it safe.
> + */
> + if (t2 == NULL) {
> + if (t1->core_task_cookie) {
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> + wr.tasks[0] = t1; /* Keep wr.cookies[0] reset for t1. */
> + }
> + } else if (t1 == t2) {
> + /* Assign a unique per-task cookie solely for t1. */
> +
> + cookie = sched_core_alloc_task_cookie();
> + if (!cookie)
> + goto out_unlock;
> +
> + if (t1->core_task_cookie) {
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> + }
> + wr.tasks[0] = t1;
> + wr.cookies[0] = cookie;
> + } else
> + /*
> + * t1 joining t2
> + * CASE 1:
> + * before 0 0
> + * after new cookie new cookie
> + *
> + * CASE 2:
> + * before X (non-zero) 0
> + * after 0 0
> + *
> + * CASE 3:
> + * before 0 X (non-zero)
> + * after X X
> + *
> + * CASE 4:
> + * before Y (non-zero) X (non-zero)
> + * after X X
> + */
> + if (!t1->core_task_cookie && !t2->core_task_cookie) {
> + /* CASE 1. */
> + cookie = sched_core_alloc_task_cookie();
> + if (!cookie)
> + goto out_unlock;
> +
> + /* Add another reference for the other task. */
> + if (!sched_core_get_task_cookie(cookie)) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + wr.tasks[0] = t1;
> + wr.tasks[1] = t2;
> + wr.cookies[0] = wr.cookies[1] = cookie;
> +
> + } else if (t1->core_task_cookie && !t2->core_task_cookie) {
> + /* CASE 2. */
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> +
> + wr.tasks[0] = t1; /* Reset cookie for t1. */
> +
> + } else if (!t1->core_task_cookie && t2->core_task_cookie) {
> + /* CASE 3. */
> + if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + wr.tasks[0] = t1;
> + wr.cookies[0] = t2->core_task_cookie;
> +
> + } else {
> + /* CASE 4. */
> + if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> + sched_core_put_task_cookie(t1->core_task_cookie);
> + sched_core_put_after_stopper = true;
> +
> + wr.tasks[0] = t1;
> + wr.cookies[0] = t2->core_task_cookie;
> + }
> +
> + stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);
> +
> + if (sched_core_put_after_stopper)
> + sched_core_put();
> +
> + ret = 0;
> +out_unlock:
> + mutex_unlock(&sched_core_mutex);
> + return ret;
> +}
> +
> +/* CGroup interface */
> static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
> {
> struct task_group *tg = css_tg(css);
> @@ -9761,18 +9989,9 @@ static int __sched_write_tag(void *data)
> * when we set cgroup tag to 0 when the loop is done below.
> */
> while ((p = css_task_iter_next(&it))) {
> - p->core_cookie = !!val ? (unsigned long)tg : 0UL;
> -
> - if (sched_core_enqueued(p)) {
> - sched_core_dequeue(task_rq(p), p);
> - if (!p->core_cookie)
> - continue;
> - }
> -
> - if (sched_core_enabled(task_rq(p)) &&
> - p->core_cookie && task_on_rq_queued(p))
> - sched_core_enqueue(task_rq(p), p);
> + unsigned long cookie = !!val ? (unsigned long)tg : 0UL;
>
> + sched_core_tag_requeue(p, cookie, true /* group */);
> }
> css_task_iter_end(&it);
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 60a922d3f46f..8c452b8010ad 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -1024,6 +1024,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
> __PS("clock-delta", t1-t0);
> }
>
> +#ifdef CONFIG_SCHED_CORE
> + __PS("core_cookie", p->core_cookie);
> +#endif
> +
> sched_show_numa(p, m);
> }
>

Balbir Singh.

2020-12-01 18:42:14

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Wed, Nov 25, 2020 at 01:54:47PM +0100, Peter Zijlstra wrote:
> On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> > +/* Per-task interface */
> > +static unsigned long sched_core_alloc_task_cookie(void)
> > +{
> > + struct sched_core_cookie *ptr =
> > + kmalloc(sizeof(struct sched_core_cookie), GFP_KERNEL);
> > +
> > + if (!ptr)
> > + return 0;
> > + refcount_set(&ptr->refcnt, 1);
> > +
> > + /*
> > + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> > + * is done after the stopper runs.
> > + */
> > + sched_core_get();
> > + return (unsigned long)ptr;
> > +}
> > +
> > +static bool sched_core_get_task_cookie(unsigned long cookie)
> > +{
> > + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> > +
> > + /*
> > + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> > + * is done after the stopper runs.
> > + */
> > + sched_core_get();
> > + return refcount_inc_not_zero(&ptr->refcnt);
> > +}
> > +
> > +static void sched_core_put_task_cookie(unsigned long cookie)
> > +{
> > + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> > +
> > + if (refcount_dec_and_test(&ptr->refcnt))
> > + kfree(ptr);
> > +}
>
> > + /*
> > + * NOTE: sched_core_get() is done by sched_core_alloc_task_cookie() or
> > + * sched_core_put_task_cookie(). However, sched_core_put() is done
> > + * by this function *after* the stopper removes the tasks from the
> > + * core queue, and not before. This is just to play it safe.
> > + */
>
> So for no reason what so ever you've made the code more difficult?

You're right, I could just do sched_core_get() in the caller. I changed it as in
the diff below:

---8<-----------------------

diff --git a/kernel/sched/coretag.c b/kernel/sched/coretag.c
index 800c0f8bacfc..75e2edb53a48 100644
--- a/kernel/sched/coretag.c
+++ b/kernel/sched/coretag.c
@@ -274,6 +274,7 @@ void sched_core_change_group(struct task_struct *p, struct task_group *new_tg)
/* Per-task interface: Used by fork(2) and prctl(2). */
static void sched_core_put_cookie_work(struct work_struct *ws);

+/* Caller has to call sched_core_get() if non-zero value is returned. */
static unsigned long sched_core_alloc_task_cookie(void)
{
struct sched_core_task_cookie *ck =
@@ -284,11 +285,6 @@ static unsigned long sched_core_alloc_task_cookie(void)
refcount_set(&ck->refcnt, 1);
INIT_WORK(&ck->work, sched_core_put_cookie_work);

- /*
- * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
- * is done after the stopper runs.
- */
- sched_core_get();
return (unsigned long)ck;
}

@@ -354,12 +350,6 @@ int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)

mutex_lock(&sched_core_tasks_mutex);

- /*
- * NOTE: sched_core_get() is done by sched_core_alloc_task_cookie() or
- * sched_core_put_task_cookie(). However, sched_core_put() is done
- * by this function *after* the stopper removes the tasks from the
- * core queue, and not before. This is just to play it safe.
- */
if (!t2) {
if (t1->core_task_cookie) {
sched_core_put_task_cookie(t1->core_task_cookie);
@@ -370,7 +360,9 @@ int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
/* Assign a unique per-task cookie solely for t1. */

cookie = sched_core_alloc_task_cookie();
- if (!cookie)
+ if (cookie)
+ sched_core_get();
+ else
goto out_unlock;

if (t1->core_task_cookie) {
@@ -401,7 +393,9 @@ int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)

/* CASE 1. */
cookie = sched_core_alloc_task_cookie();
- if (!cookie)
+ if (cookie)
+ sched_core_get();
+ else
goto out_unlock;

/* Add another reference for the other task. */

2020-12-01 18:55:49

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Wed, Nov 25, 2020 at 02:03:22PM +0100, Peter Zijlstra wrote:
> On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> > +static bool sched_core_get_task_cookie(unsigned long cookie)
> > +{
> > + struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> > +
> > + /*
> > + * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> > + * is done after the stopper runs.
> > + */
> > + sched_core_get();
> > + return refcount_inc_not_zero(&ptr->refcnt);
>
> See below, but afaict this should be refcount_inc().

Fully agreed with all these. Updated with diff as below. Will test further
and post next version soon. Thanks!

---8<-----------------------

diff --git a/kernel/sched/coretag.c b/kernel/sched/coretag.c
index 2fb5544a4a18..8fce3f4b7cae 100644
--- a/kernel/sched/coretag.c
+++ b/kernel/sched/coretag.c
@@ -288,12 +288,12 @@ static unsigned long sched_core_alloc_task_cookie(void)
return (unsigned long)ck;
}

-static bool sched_core_get_task_cookie(unsigned long cookie)
+static void sched_core_get_task_cookie(unsigned long cookie)
{
struct sched_core_task_cookie *ptr =
(struct sched_core_task_cookie *)cookie;

- return refcount_inc_not_zero(&ptr->refcnt);
+ refcount_inc(&ptr->refcnt);
}

static void sched_core_put_task_cookie(unsigned long cookie)
@@ -392,10 +392,7 @@ int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
sched_core_get(); /* For the alloc. */

/* Add another reference for the other task. */
- if (!sched_core_get_task_cookie(cookie)) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ sched_core_get_task_cookie(cookie);
sched_core_get(); /* For the other task. */

wr.tasks[0] = t1;
@@ -411,10 +408,7 @@ int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)

} else if (!t1->core_task_cookie && t2->core_task_cookie) {
/* CASE 3. */
- if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ sched_core_get_task_cookie(t2->core_task_cookie);
sched_core_get();

wr.tasks[0] = t1;
@@ -422,10 +416,7 @@ int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)

} else {
/* CASE 4. */
- if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ sched_core_get_task_cookie(t2->core_task_cookie);
sched_core_get();

sched_core_put_task_cookie(t1->core_task_cookie);
--
2.29.2.454.gaff20da3a2-goog

2020-12-01 19:00:34

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Wed, Nov 25, 2020 at 12:07:09PM +0100, Peter Zijlstra wrote:
> On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> > Also, for the per-task cookie, it will get weird if we use pointers of any
> > ephemeral objects. For this reason, introduce a refcounted object whose sole
> > purpose is to assign a unique cookie value by way of the object's pointer.
>
> Might be useful to explain why exactly none of the many pid_t's are
> good enough.

I thought about this already and it does not seem a good fit. When 2
processes share, it is possible that more processes are added to that logical
group. Then the original processes that share die, but if we hold on to the
pid_t or task_struct, that would be awkward. It seemed introducing a new
refcounted struct was the right way to go. I can add these details to the
change log.

thanks!

- Joel

2020-12-01 19:17:22

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Wed, Nov 25, 2020 at 12:15:41PM +0100, Peter Zijlstra wrote:
> On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
>
> > +/*
> > + * Ensure that the task has been requeued. The stopper ensures that the task cannot
> > + * be migrated to a different CPU while its core scheduler queue state is being updated.
> > + * It also makes sure to requeue a task if it was running actively on another CPU.
> > + */
> > +static int sched_core_task_join_stopper(void *data)
> > +{
> > + struct sched_core_task_write_tag *tag = (struct sched_core_task_write_tag *)data;
> > + int i;
> > +
> > + for (i = 0; i < 2; i++)
> > + sched_core_tag_requeue(tag->tasks[i], tag->cookies[i], false /* !group */);
> > +
> > + return 0;
> > +}
> > +
> > +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
> > +{
>
> > + stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);
>
> > +}
>
> This is *REALLY* terrible...

I pulled this bit from your original patch. Are you concerned about the
stop_machine? Sharing a core is a slow path for our usecases (and as far as I
know, for everyone else's). We can probably do something different if that
requirement changes.

2020-12-01 19:20:46

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Wed, Nov 25, 2020 at 12:11:28PM +0100, Peter Zijlstra wrote:
> On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
>
> > + * sched_core_tag_requeue - Common helper for all interfaces to set a cookie.
>
> sched_core_set_cookie() would be a saner name, given that description,
> don't you think?

Yeah, Josh is better than me at naming so he changed it to
sched_core_update_cookie() already :-). Hopefully that's Ok too with you.

thanks,

- Joel

2020-12-01 19:24:55

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Wed, Nov 25, 2020 at 12:10:14PM +0100, Peter Zijlstra wrote:
> On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> > +void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, bool group)
> > +{
> > + if (!p)
> > + return;
> > +
> > + if (group)
> > + p->core_group_cookie = cookie;
> > + else
> > + p->core_task_cookie = cookie;
> > +
> > + /* Use up half of the cookie's bits for task cookie and remaining for group cookie. */
> > + p->core_cookie = (p->core_task_cookie <<
> > + (sizeof(unsigned long) * 4)) + p->core_group_cookie;
>
> This seems dangerous; afaict there is nothing that prevents cookie
> collision.

This is fixed in a later patch by Josh "sched: Refactor core cookie into
struct" where we are having independent fields for each type of cookie.

I'll squash it next time I post to prevent confusion. Thanks,

- Joel

2020-12-01 19:26:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Dec 01, 2020 at 02:11:33PM -0500, Joel Fernandes wrote:
> On Wed, Nov 25, 2020 at 12:15:41PM +0100, Peter Zijlstra wrote:
> > On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> >
> > > +/*
> > > + * Ensure that the task has been requeued. The stopper ensures that the task cannot
> > > + * be migrated to a different CPU while its core scheduler queue state is being updated.
> > > + * It also makes sure to requeue a task if it was running actively on another CPU.
> > > + */
> > > +static int sched_core_task_join_stopper(void *data)
> > > +{
> > > + struct sched_core_task_write_tag *tag = (struct sched_core_task_write_tag *)data;
> > > + int i;
> > > +
> > > + for (i = 0; i < 2; i++)
> > > + sched_core_tag_requeue(tag->tasks[i], tag->cookies[i], false /* !group */);
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
> > > +{
> >
> > > + stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);
> >
> > > +}
> >
> > This is *REALLY* terrible...
>
> I pulled this bit from your original patch. Are you concerned about the
> stop_machine? Sharing a core is a slow path for our usecases (and as far as I
> know, for everyone else's). We can probably do something different if that
> requirement changes.
>

Yeah.. so I can (and was planning on) remove stop_machine() from
sched_core_{dis,en}able() before merging it.

(there's two options, one uses stop_cpus() with the SMT mask, the other
RCU)

This though is exposing stop_machine() to joe user. Everybody is allowed
to prctl() its own task and set a cookie on himself. This means you
just made a giant unpriv DoS vector.

stop_machine is bad, really bad.

2020-12-01 19:38:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Dec 01, 2020 at 02:20:28PM -0500, Joel Fernandes wrote:
> On Wed, Nov 25, 2020 at 12:10:14PM +0100, Peter Zijlstra wrote:
> > On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> > > +void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, bool group)
> > > +{
> > > + if (!p)
> > > + return;
> > > +
> > > + if (group)
> > > + p->core_group_cookie = cookie;
> > > + else
> > > + p->core_task_cookie = cookie;
> > > +
> > > + /* Use up half of the cookie's bits for task cookie and remaining for group cookie. */
> > > + p->core_cookie = (p->core_task_cookie <<
> > > + (sizeof(unsigned long) * 4)) + p->core_group_cookie;
> >
> > This seems dangerous; afaict there is nothing that prevents cookie
> > collision.
>
> This is fixed in a later patch by Josh "sched: Refactor core cookie into
> struct" where we are having independent fields for each type of cookie.

So I don't think that later patch is right... That is, it works, but
afaict it's massive overkill.

COOKIE_CMP_RETURN(task_cookie);
COOKIE_CMP_RETURN(group_cookie);
COOKIE_CMP_RETURN(color);

So if task_cookie matches, we consider group_cookie, if that matches we
consider color.

Now, afaict that's semantically exactly the same as just using the
narrowest cookie. That is, use the task cookie if there is, and then,
walking the cgroup hierarchy (up) pick the first cgroup cookie.

(I don't understand the color thing, but lets have that discussion in
that subthread)

Which means you only need a single active cookie field.

IOW, you're just making things complicated and expensive.


2020-12-02 06:38:53

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Dec 1, 2020 at 11:35 AM Peter Zijlstra <[email protected]> wrote:
>
> So I don't think that later patch is right... That is, it works, but
> afaict it's massive overkill.
>
> COOKIE_CMP_RETURN(task_cookie);
> COOKIE_CMP_RETURN(group_cookie);
> COOKIE_CMP_RETURN(color);
>
> So if task_cookie matches, we consider group_cookie, if that matches we
> consider color.
>
> Now, afaict that's semantically exactly the same as just using the
> narrowest cookie. That is, use the task cookie if there is, and then,
> walking the cgroup hierarchy (up) pick the first cgroup cookie.
>
> (I don't understand the color thing, but lets have that discussion in
> that subthread)
>
> Which means you only need a single active cookie field.
>
> IOW, you're just making things complicated and expensive.
>

For the per-task interface, I believe we still want to prevent two
tasks that share a task cookie from sharing an overall cookie if they
are in two separately tagged groups (Joel please correct me if I'm
mistaken there). That's why in Joel's older patch, the overall cookie
was a combination of the task and group cookies. My concern about
that was the potential cookie collision.

I followed up on the 'color' portion in the other thread.

Thanks,
Josh

2020-12-02 07:58:52

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Dec 01, 2020 at 10:36:18PM -0800, Josh Don wrote:
> On Tue, Dec 1, 2020 at 11:35 AM Peter Zijlstra <[email protected]> wrote:
> >
> > So I don't think that later patch is right... That is, it works, but
> > afaict it's massive overkill.
> >
> > COOKIE_CMP_RETURN(task_cookie);
> > COOKIE_CMP_RETURN(group_cookie);
> > COOKIE_CMP_RETURN(color);
> >
> > So if task_cookie matches, we consider group_cookie, if that matches we
> > consider color.
> >
> > Now, afaict that's semantically exactly the same as just using the
> > narrowest cookie. That is, use the task cookie if there is, and then,
> > walking the cgroup hierarchy (up) pick the first cgroup cookie.
> >
> > (I don't understand the color thing, but lets have that discussion in
> > that subthread)
> >
> > Which means you only need a single active cookie field.
> >
> > IOW, you're just making things complicated and expensive.
> >
>
> For the per-task interface, I believe we still want to prevent two
> tasks that share a task cookie from sharing an overall cookie if they
> are in two separately tagged groups (Joel please correct me if I'm
> mistaken there). That's why in Joel's older patch, the overall cookie
> was a combination of the task and group cookies. My concern about
> that was the potential cookie collision.

Then disallow sharing a task cookie when the tasks are in different
cgroups or disallow cgroup movement when they share a cookie.

2020-12-04 00:25:44

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Dec 1, 2020 at 11:55 PM Peter Zijlstra <[email protected]> wrote:
>
> Then disallow sharing a task cookie when the tasks are in different
> cgroups or disallow cgroup movement when they share a cookie.

Yes, we could restrict task cookie sharing to tasks that are in the
same cgroup. Then the cookie easily just becomes a single value;
either the task cookie or group cookie.

The advantage of the approach with the cookie struct is that it is
easily extensible, and allows for trust models that don't conform
exactly to the cgroup hierarchy (ie. our discussion on cookie color).
The overhead of the approach seems tolerable, given that updates to a
task's cookie are not in fast paths (ie. prctl, setting cgroup cookie,
sched_move_task). Are you more concerned with the added complexity of
maintaining the RB tree, refcounts, etc?

2020-12-06 17:52:54

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

Hi Peter,

On Tue, Dec 01, 2020 at 08:34:51PM +0100, Peter Zijlstra wrote:
> On Tue, Dec 01, 2020 at 02:20:28PM -0500, Joel Fernandes wrote:
> > On Wed, Nov 25, 2020 at 12:10:14PM +0100, Peter Zijlstra wrote:
> > > On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> > > > +void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, bool group)
> > > > +{
> > > > + if (!p)
> > > > + return;
> > > > +
> > > > + if (group)
> > > > + p->core_group_cookie = cookie;
> > > > + else
> > > > + p->core_task_cookie = cookie;
> > > > +
> > > > + /* Use up half of the cookie's bits for task cookie and remaining for group cookie. */
> > > > + p->core_cookie = (p->core_task_cookie <<
> > > > + (sizeof(unsigned long) * 4)) + p->core_group_cookie;
> > >
> > > This seems dangerous; afaict there is nothing that prevents cookie
> > > collision.
> >
> > This is fixed in a later patch by Josh "sched: Refactor core cookie into
> > struct" where we are having independent fields for each type of cookie.
>
> So I don't think that later patch is right... That is, it works, but
> afaict it's massive overkill.
>
> COOKIE_CMP_RETURN(task_cookie);
> COOKIE_CMP_RETURN(group_cookie);
> COOKIE_CMP_RETURN(color);
>
> So if task_cookie matches, we consider group_cookie, if that matches we
> consider color.

Yes, the cookie is a compound one. A cookie structure is created here which
has all 3 components. The final cookie value is a pointer to this compound
structure.

>
> Now, afaict that's semantically exactly the same as just using the
> narrowest cookie. That is, use the task cookie if there is, and then,
> walking the cgroup hierarchy (up) pick the first cgroup cookie.
[..]
> Which means you only need a single active cookie field.

Its not the same. The "compounded" cookie is needed to enforce the CGroup
delegation model. This is exactly how both uclamp and cpusets work. For
uclamp, if we take uclamp.min as an example, if the CGroup sets the
uclamp.min of a group to 0, then even if you use the per-task interface
(sched_setattr) for setting the task's clamp - the "effective uclamp.min" of
the task as seen in /proc/pid/sched will still be 0.

Similar thing here, if 2 tasks belong to 2 different CGroups and each group is
tagged with its own tag, then if you use the per-task interface to make the 2
tasks share a core, the "effective" sharing is still such that the tasks will
not share a core -- because the CGroup decided to make it so.

I would like to really maintain this model. Doing it any other way is
confusing - we have already discussed doing it this way before. With that way
you end up failing one interface if another one was already used. Been
there, done that. It sucks a lot.

> IOW, you're just making things complicated and expensive.

The cost of the additional comparisons you mentioned is only in the slow path
(i.e. when someone joins or leaves a group). Once the task_struct's cookie
field is set, the cost is not any more than what it was before.

Any other thoughts?

thanks,

- Joel

2020-12-06 18:19:37

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH -tip 22/32] sched: Split the cookie and setup per-task cookie on fork

On Tue, Dec 01, 2020 at 08:20:50PM +0100, Peter Zijlstra wrote:
> On Tue, Dec 01, 2020 at 02:11:33PM -0500, Joel Fernandes wrote:
> > On Wed, Nov 25, 2020 at 12:15:41PM +0100, Peter Zijlstra wrote:
> > > On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> > >
> > > > +/*
> > > > + * Ensure that the task has been requeued. The stopper ensures that the task cannot
> > > > + * be migrated to a different CPU while its core scheduler queue state is being updated.
> > > > + * It also makes sure to requeue a task if it was running actively on another CPU.
> > > > + */
> > > > +static int sched_core_task_join_stopper(void *data)
> > > > +{
> > > > + struct sched_core_task_write_tag *tag = (struct sched_core_task_write_tag *)data;
> > > > + int i;
> > > > +
> > > > + for (i = 0; i < 2; i++)
> > > > + sched_core_tag_requeue(tag->tasks[i], tag->cookies[i], false /* !group */);
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct *t2)
> > > > +{
> > >
> > > > + stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);
> > >
> > > > +}
> > >
> > > This is *REALLY* terrible...
> >
> > I pulled this bit from your original patch. Are you concerned about the
> > stop_machine? Sharing a core is a slow path for our usecases (and as far as I
> > know, for everyone else's). We can probably do something different if that
> > requirement changes.
> >
>
> Yeah.. so I can (and was planning on) remove stop_machine() from
> sched_core_{dis,en}able() before merging it.
>
> (there's two options, one uses stop_cpus() with the SMT mask, the other
> RCU)

Ok. What about changing the cookie of task T while holding the rq/pi locks, and
then doing a resched_curr(rq) for that RQ?

Holding the lock ensures no migration of the task happens, and resched_curr()
ensures that task T's rq will enter the scheduler to consider task T's new
cookie for scheduling. I believe this is analogous to what
__sched_setscheduler() does when you switch a task from CFS to RT.

> This though is exposing stop_machine() to joe user. Everybody is allowed
> to prctl() its own task and set a cookie on himself. This means you
> just made a giant unpriv DoS vector.
>
> stop_machine is bad, really bad.

Agreed.

thanks,

- Joel