From: Paul Turner <[email protected]>
Currently, when updating the affinity of tasks via either cpusets.cpus
or sched_setaffinity(), tasks not currently running within the newly
specified CPU mask will be arbitrarily assigned to the first CPU within
the mask.
This (particularly in the case that we are restricting masks) can
result in many tasks being assigned to the first CPUs of their new
masks.
This:
1) Can induce scheduling delays while the load-balancer has a chance to
spread them between their new CPUs.
2) Can antagonize a poor load-balancer behavior where it has a
difficult time recognizing that a cross-socket imbalance has been
forced by an affinity mask.
With this change, tasks are distributed ~evenly across the new mask. We
may intentionally move tasks already running on a CPU within the mask to
avoid edge cases in which a CPU is already overloaded (or would be
chosen as a destination more times than is desired).
We specifically apply this behavior to the following cases:
- modifying cpuset.cpus
- when tasks join a cpuset
- when modifying a task's affinity via sched_setaffinity(2)
Signed-off-by: Paul Turner <[email protected]>
Co-developed-by: Josh Don <[email protected]>
Signed-off-by: Josh Don <[email protected]>
---
include/linux/sched.h | 8 +++++
kernel/cgroup/cpuset.c | 5 +--
kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++-----
3 files changed, 83 insertions(+), 11 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 04278493bf15..a2aab6a8a794 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1587,6 +1587,8 @@ extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_
#ifdef CONFIG_SMP
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
+extern int set_cpus_allowed_ptr_distribute(struct task_struct *p,
+ const struct cpumask *new_mask);
#else
static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
@@ -1597,6 +1599,12 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
return -EINVAL;
return 0;
}
+
+static inline int set_cpus_allowed_ptr_distribute(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
#endif
extern int yield_to(struct task_struct *p, bool preempt);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58f5073acff7..69960cae92e2 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1035,7 +1035,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cs->effective_cpus);
+ set_cpus_allowed_ptr_distribute(task, cs->effective_cpus);
css_task_iter_end(&it);
}
@@ -2185,7 +2185,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
*/
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+ WARN_ON_ONCE(
+ set_cpus_allowed_ptr_distribute(task, cpus_attach));
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, task);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1a9983da4408..2336d6d66016 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1612,6 +1612,32 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
set_next_task(rq, p);
}
+static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
+
+/*
+ * Returns an arbitrary cpu within *src1p & *src2p
+ *
+ * Iterated calls using the same src1p and src2p, passing the previous cpu each
+ * time, will be distributed within their intersection.
+ */
+static int distribute_to_new_cpumask(const struct cpumask *src1p,
+ const struct cpumask *src2p)
+{
+ int next, prev;
+
+ /* NOTE: our first selection will skip 0. */
+ prev = __this_cpu_read(distribute_cpu_mask_prev);
+
+ next = cpumask_next_and(prev, src1p, src2p);
+ if (next >= nr_cpu_ids)
+ next = cpumask_first_and(src1p, src2p);
+
+ if (next < nr_cpu_ids)
+ __this_cpu_write(distribute_cpu_mask_prev, next);
+
+ return next;
+}
+
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
@@ -1621,11 +1647,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-static int __set_cpus_allowed_ptr(struct task_struct *p,
+static int __set_cpus_allowed_ptr(struct task_struct *p, bool distribute_cpus,
const struct cpumask *new_mask, bool check)
{
const struct cpumask *cpu_valid_mask = cpu_active_mask;
- unsigned int dest_cpu;
+ unsigned int dest_cpu, prev_cpu;
struct rq_flags rf;
struct rq *rq;
int ret = 0;
@@ -1652,8 +1678,33 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
- if (dest_cpu >= nr_cpu_ids) {
+ if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prev_cpu = task_cpu(p);
+ if (distribute_cpus) {
+ dest_cpu = distribute_to_new_cpumask(new_mask,
+ cpu_valid_mask);
+ } else {
+ /*
+ * Can the task run on the task's current CPU? If so, we're
+ * done.
+ *
+ * We only enable this short-circuit in the case that we're
+ * not trying to distribute tasks. As we may otherwise not
+ * distribute away from a loaded CPU, or make duplicate
+ * assignments to it.
+ */
+ if (cpumask_test_cpu(prev_cpu, new_mask))
+ dest_cpu = prev_cpu;
+ else
+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ }
+
+ /* May have raced with cpu_down */
+ if (unlikely(dest_cpu >= nr_cpu_ids)) {
ret = -EINVAL;
goto out;
}
@@ -1670,8 +1721,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
p->nr_cpus_allowed != 1);
}
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
+ if (dest_cpu == prev_cpu)
goto out;
if (task_running(rq, p) || p->state == TASK_WAKING) {
@@ -1695,10 +1745,21 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- return __set_cpus_allowed_ptr(p, new_mask, false);
+ return __set_cpus_allowed_ptr(p, false, new_mask, false);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+/*
+ * Each repeated call will attempt to distribute tasks to a new cpu. As a
+ * result, singular calls will also use a more "random" cpu than the first
+ * allowed in the mask.
+ */
+int set_cpus_allowed_ptr_distribute(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, true, new_mask, false);
+}
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -2160,7 +2221,9 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
#else
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
+ bool distribute_cpus,
+ const struct cpumask *new_mask,
+ bool check)
{
return set_cpus_allowed_ptr(p, new_mask);
}
@@ -5456,7 +5519,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
+ retval = __set_cpus_allowed_ptr(p, true, new_mask, true);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
--
2.25.1.481.gfbce0eb801-goog
On Thu, Feb 27, 2020 at 05:01:34PM -0800, Josh Don wrote:
> From: Paul Turner <[email protected]>
>
> Currently, when updating the affinity of tasks via either cpusets.cpus,
> or, sched_setaffinity(); tasks not currently running within the newly
> specified CPU will be arbitrarily assigned to the first CPU within the
> mask.
>
> This (particularly in the case that we are restricting masks) can
> result in many tasks being assigned to the first CPUs of their new
> masks.
>
> This:
> 1) Can induce scheduling delays while the load-balancer has a chance to
> spread them between their new CPUs.
> 2) Can antogonize a poor load-balancer behavior where it has a
> difficult time recognizing that a cross-socket imbalance has been
> forced by an affinity mask.
>
> With this change, tasks are distributed ~evenly across the new mask. We
> may intentionally move tasks already running on a CPU within the mask to
> avoid edge cases in which a CPU is already overloaded (or would be
> assigned to more times than is desired).
>
> We specifically apply this behavior to the following cases:
> - modifying cpuset.cpus
> - when tasks join a cpuset
> - when modifying a task's affinity via sched_setaffinity(2)
Looks fine to me. Peter, what do you think?
Thanks.
--
tejun
On Thu, Feb 27, 2020 at 05:01:34PM -0800, Josh Don wrote:
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 04278493bf15..a2aab6a8a794 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1587,6 +1587,8 @@ extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_
> #ifdef CONFIG_SMP
> extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
> extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
> +extern int set_cpus_allowed_ptr_distribute(struct task_struct *p,
> + const struct cpumask *new_mask);
Why? Changelog doesn't seem to give a reason for adding another
interface.
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 1a9983da4408..2336d6d66016 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1612,6 +1612,32 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
> set_next_task(rq, p);
> }
>
> +static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
> +
> +/*
> + * Returns an arbitrary cpu within *srcp1 & srcp2
> + *
> + * Iterated calls using the same srcp1 and srcp2, passing the previous cpu each
> + * time, will be distributed within their intersection.
> + */
> +static int distribute_to_new_cpumask(const struct cpumask *src1p,
> + const struct cpumask *src2p)
> +{
> + int next, prev;
> +
> + /* NOTE: our first selection will skip 0. */
> + prev = __this_cpu_read(distribute_cpu_mask_prev);
> +
> + next = cpumask_next_and(prev, src1p, src2p);
> + if (next >= nr_cpu_ids)
> + next = cpumask_first_and(src1p, src2p);
> +
> + if (next < nr_cpu_ids)
> + __this_cpu_write(distribute_cpu_mask_prev, next);
> +
> + return next;
> +}
That's a valid implementation of cpumask_any_and(), it just has a really
weird name.
> /*
> * Change a given task's CPU affinity. Migrate the thread to a
> * proper CPU and schedule it away if the CPU it's executing on
> @@ -1621,11 +1647,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
> * task must not exit() & deallocate itself prematurely. The
> * call is not atomic; no spinlocks may be held.
> */
> -static int __set_cpus_allowed_ptr(struct task_struct *p,
> +static int __set_cpus_allowed_ptr(struct task_struct *p, bool distribute_cpus,
> const struct cpumask *new_mask, bool check)
> {
> const struct cpumask *cpu_valid_mask = cpu_active_mask;
> - unsigned int dest_cpu;
> + unsigned int dest_cpu, prev_cpu;
> struct rq_flags rf;
> struct rq *rq;
> int ret = 0;
> @@ -1652,8 +1678,33 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
> if (cpumask_equal(p->cpus_ptr, new_mask))
> goto out;
>
> - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
> - if (dest_cpu >= nr_cpu_ids) {
> + if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + prev_cpu = task_cpu(p);
> + if (distribute_cpus) {
> + dest_cpu = distribute_to_new_cpumask(new_mask,
> + cpu_valid_mask);
> + } else {
> + /*
> + * Can the task run on the task's current CPU? If so, we're
> + * done.
> + *
> + * We only enable this short-circuit in the case that we're
> + * not trying to distribute tasks. As we may otherwise not
> + * distribute away from a loaded CPU, or make duplicate
> + * assignments to it.
> + */
> + if (cpumask_test_cpu(prev_cpu, new_mask))
> + dest_cpu = prev_cpu;
> + else
> + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
> + }
That all seems overly complicated; what is wrong with just this:
dest_cpu = cpumask_any_and_fancy(cpu_valid_mask, new_mask);
I don't really buy the argument why that shortcut is problematic; it's
all averages anyway, and keeping a task on a CPU where it's already
running seems like a win.
> + /* May have raced with cpu_down */
> + if (unlikely(dest_cpu >= nr_cpu_ids)) {
> ret = -EINVAL;
> goto out;
> }
Thanks for the comments, I discussed further with Paul. We agree with
simplifying this into a new cpumask macro, and thus cutting down the
changes needed directly in core.c.
> Why? Changelog doesn't seem to give a reason for adding another
> interface.
The new interface was to selectively enable distribution for specific
paths. For some paths, we wanted to run on prev_cpu if in mask.
However, as described below, this concern is no longer relevant, I'll
remove the extra interface.
> I don't really buy the argument why that shortcut is problematic; it's
> all averages anyway, and keeping a task on a CPU where it's already
> running seems like a win.
It made sense for our use case at the time, but in general we agree
that it is fine to leave things where they are if possible.
On Wed, Mar 4, 2020 at 10:21 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Feb 27, 2020 at 05:01:34PM -0800, Josh Don wrote:
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 04278493bf15..a2aab6a8a794 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1587,6 +1587,8 @@ extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_
> > #ifdef CONFIG_SMP
> > extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
> > extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
> > +extern int set_cpus_allowed_ptr_distribute(struct task_struct *p,
> > + const struct cpumask *new_mask);
>
> Why? Changelog doesn't seem to give a reason for adding another
> interface.
>
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 1a9983da4408..2336d6d66016 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1612,6 +1612,32 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
> > set_next_task(rq, p);
> > }
> >
> > +static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
> > +
> > +/*
> > + * Returns an arbitrary cpu within *srcp1 & srcp2
> > + *
> > + * Iterated calls using the same srcp1 and srcp2, passing the previous cpu each
> > + * time, will be distributed within their intersection.
> > + */
> > +static int distribute_to_new_cpumask(const struct cpumask *src1p,
> > + const struct cpumask *src2p)
> > +{
> > + int next, prev;
> > +
> > + /* NOTE: our first selection will skip 0. */
> > + prev = __this_cpu_read(distribute_cpu_mask_prev);
> > +
> > + next = cpumask_next_and(prev, src1p, src2p);
> > + if (next >= nr_cpu_ids)
> > + next = cpumask_first_and(src1p, src2p);
> > +
> > + if (next < nr_cpu_ids)
> > + __this_cpu_write(distribute_cpu_mask_prev, next);
> > +
> > + return next;
> > +}
>
> That's a valid implementation of cpumask_any_and(), it just has a really
> weird name.
>
> > /*
> > * Change a given task's CPU affinity. Migrate the thread to a
> > * proper CPU and schedule it away if the CPU it's executing on
> > @@ -1621,11 +1647,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
> > * task must not exit() & deallocate itself prematurely. The
> > * call is not atomic; no spinlocks may be held.
> > */
> > -static int __set_cpus_allowed_ptr(struct task_struct *p,
> > +static int __set_cpus_allowed_ptr(struct task_struct *p, bool distribute_cpus,
> > const struct cpumask *new_mask, bool check)
> > {
> > const struct cpumask *cpu_valid_mask = cpu_active_mask;
> > - unsigned int dest_cpu;
> > + unsigned int dest_cpu, prev_cpu;
> > struct rq_flags rf;
> > struct rq *rq;
> > int ret = 0;
> > @@ -1652,8 +1678,33 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
> > if (cpumask_equal(p->cpus_ptr, new_mask))
> > goto out;
> >
> > - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
> > - if (dest_cpu >= nr_cpu_ids) {
> > + if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
> > + ret = -EINVAL;
> > + goto out;
> > + }
> > +
> > + prev_cpu = task_cpu(p);
> > + if (distribute_cpus) {
> > + dest_cpu = distribute_to_new_cpumask(new_mask,
> > + cpu_valid_mask);
> > + } else {
> > + /*
> > + * Can the task run on the task's current CPU? If so, we're
> > + * done.
> > + *
> > + * We only enable this short-circuit in the case that we're
> > + * not trying to distribute tasks. As we may otherwise not
> > + * distribute away from a loaded CPU, or make duplicate
> > + * assignments to it.
> > + */
> > + if (cpumask_test_cpu(prev_cpu, new_mask))
> > + dest_cpu = prev_cpu;
> > + else
> > + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
> > + }
>
> That all seems overly complicated; what is wrong with just this:
>
> dest_cpu = cpumask_any_and_fancy(cpu_valid_mask, new_mask);
>
> I don't really buy the argument why that shortcut is problematic; it's
> all averages anyway, and keeping a task on a CPU where it's already
> running seems like a win.
>
> > + /* May have raced with cpu_down */
> > + if (unlikely(dest_cpu >= nr_cpu_ids)) {
> > ret = -EINVAL;
> > goto out;
> > }