With the changes to migrate disabling, ->set_cpus_allowed() no longer
gets deferred until migrate_enable(). To avoid releasing the bandwidth
while the task may still be executing on the old CPU, move the subtraction
to ->migrate_task_rq().
Signed-off-by: Scott Wood <[email protected]>
---
kernel/sched/deadline.c | 67 +++++++++++++++++++++++--------------------------
1 file changed, 31 insertions(+), 36 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c18be51f7608..2f18d0cf1b56 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1606,14 +1606,42 @@ static void yield_task_dl(struct rq *rq)
return cpu;
}
+static void free_old_cpuset_bw_dl(struct rq *rq, struct task_struct *p)
+{
+ struct root_domain *src_rd = rq->rd;
+
+ /*
+ * Migrating a SCHED_DEADLINE task between exclusive
+ * cpusets (different root_domains) entails a bandwidth
+ * update. We already made space for us in the destination
+ * domain (see cpuset_can_attach()).
+ */
+ if (!cpumask_intersects(src_rd->span, p->cpus_ptr)) {
+ struct dl_bw *src_dl_b;
+
+ src_dl_b = dl_bw_of(cpu_of(rq));
+ /*
+ * We now free resources of the root_domain we are migrating
+ * off. In the worst case, sched_setattr() may temporary fail
+ * until we complete the update.
+ */
+ raw_spin_lock(&src_dl_b->lock);
+ __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&src_dl_b->lock);
+ }
+}
+
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
struct rq *rq;
- if (p->state != TASK_WAKING)
+ rq = task_rq(p);
+
+ if (p->state != TASK_WAKING) {
+ free_old_cpuset_bw_dl(rq, p);
return;
+ }
- rq = task_rq(p);
/*
* Since p->state == TASK_WAKING, set_task_cpu() has been called
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
@@ -2220,39 +2248,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
}
}
-static void set_cpus_allowed_dl(struct task_struct *p,
- const struct cpumask *new_mask)
-{
- struct root_domain *src_rd;
- struct rq *rq;
-
- BUG_ON(!dl_task(p));
-
- rq = task_rq(p);
- src_rd = rq->rd;
- /*
- * Migrating a SCHED_DEADLINE task between exclusive
- * cpusets (different root_domains) entails a bandwidth
- * update. We already made space for us in the destination
- * domain (see cpuset_can_attach()).
- */
- if (!cpumask_intersects(src_rd->span, new_mask)) {
- struct dl_bw *src_dl_b;
-
- src_dl_b = dl_bw_of(cpu_of(rq));
- /*
- * We now free resources of the root_domain we are migrating
- * off. In the worst case, sched_setattr() may temporary fail
- * until we complete the update.
- */
- raw_spin_lock(&src_dl_b->lock);
- __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
- raw_spin_unlock(&src_dl_b->lock);
- }
-
- set_cpus_allowed_common(p, new_mask);
-}
-
/* Assumes rq->lock is held */
static void rq_online_dl(struct rq *rq)
{
@@ -2407,7 +2402,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
- .set_cpus_allowed = set_cpus_allowed_dl,
+ .set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
--
1.8.3.1
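
For background on where the new hook runs: ->migrate_task_rq() is invoked
from set_task_cpu() once the scheduler actually moves the task, which is why
the subtraction is deferred to that point. A condensed sketch of the call
site (simplified from kernel/sched/core.c; lockdep asserts, perf and rseq
hooks omitted, not part of this patch):

	void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
	{
		if (task_cpu(p) != new_cpu) {
			/* migrate_task_rq_dl() for SCHED_DEADLINE tasks */
			if (p->sched_class->migrate_task_rq)
				p->sched_class->migrate_task_rq(p, new_cpu);
			p->se.nr_migrations++;
		}

		__set_task_cpu(p, new_cpu);
	}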
On 2019-07-27 00:56:35 [-0500], Scott Wood wrote:
> With the changes to migrate disabling, ->set_cpus_allowed() no longer
> gets deferred until migrate_enable(). To avoid releasing the bandwidth
> while the task may still be executing on the old CPU, move the subtraction
> to ->migrate_task_rq().
I assume this is still valid for the !RT case.
Could someone from the scheduler/DL department please comment on this
one?
> Signed-off-by: Scott Wood <[email protected]>
> ---
> kernel/sched/deadline.c | 67 +++++++++++++++++++++++--------------------------
> 1 file changed, 31 insertions(+), 36 deletions(-)
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index c18be51f7608..2f18d0cf1b56 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1606,14 +1606,42 @@ static void yield_task_dl(struct rq *rq)
> return cpu;
> }
>
> +static void free_old_cpuset_bw_dl(struct rq *rq, struct task_struct *p)
> +{
> + struct root_domain *src_rd = rq->rd;
> +
> + /*
> + * Migrating a SCHED_DEADLINE task between exclusive
> + * cpusets (different root_domains) entails a bandwidth
> + * update. We already made space for us in the destination
> + * domain (see cpuset_can_attach()).
> + */
> + if (!cpumask_intersects(src_rd->span, p->cpus_ptr)) {
> + struct dl_bw *src_dl_b;
> +
> + src_dl_b = dl_bw_of(cpu_of(rq));
> + /*
> + * We now free resources of the root_domain we are migrating
> + * off. In the worst case, sched_setattr() may temporary fail
> + * until we complete the update.
> + */
> + raw_spin_lock(&src_dl_b->lock);
> + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
> + raw_spin_unlock(&src_dl_b->lock);
> + }
> +}
> +
> static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
> {
> struct rq *rq;
>
> - if (p->state != TASK_WAKING)
> + rq = task_rq(p);
> +
> + if (p->state != TASK_WAKING) {
> + free_old_cpuset_bw_dl(rq, p);
> return;
> + }
>
> - rq = task_rq(p);
> /*
> * Since p->state == TASK_WAKING, set_task_cpu() has been called
> * from try_to_wake_up(). Hence, p->pi_lock is locked, but
> @@ -2220,39 +2248,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
> }
> }
>
> -static void set_cpus_allowed_dl(struct task_struct *p,
> - const struct cpumask *new_mask)
> -{
> - struct root_domain *src_rd;
> - struct rq *rq;
> -
> - BUG_ON(!dl_task(p));
> -
> - rq = task_rq(p);
> - src_rd = rq->rd;
> - /*
> - * Migrating a SCHED_DEADLINE task between exclusive
> - * cpusets (different root_domains) entails a bandwidth
> - * update. We already made space for us in the destination
> - * domain (see cpuset_can_attach()).
> - */
> - if (!cpumask_intersects(src_rd->span, new_mask)) {
> - struct dl_bw *src_dl_b;
> -
> - src_dl_b = dl_bw_of(cpu_of(rq));
> - /*
> - * We now free resources of the root_domain we are migrating
> - * off. In the worst case, sched_setattr() may temporary fail
> - * until we complete the update.
> - */
> - raw_spin_lock(&src_dl_b->lock);
> - __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
> - raw_spin_unlock(&src_dl_b->lock);
> - }
> -
> - set_cpus_allowed_common(p, new_mask);
> -}
> -
> /* Assumes rq->lock is held */
> static void rq_online_dl(struct rq *rq)
> {
> @@ -2407,7 +2402,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
> #ifdef CONFIG_SMP
> .select_task_rq = select_task_rq_dl,
> .migrate_task_rq = migrate_task_rq_dl,
> - .set_cpus_allowed = set_cpus_allowed_dl,
> + .set_cpus_allowed = set_cpus_allowed_common,
> .rq_online = rq_online_dl,
> .rq_offline = rq_offline_dl,
> .task_woken = task_woken_dl,
> --
> 1.8.3.1
>
Sebastian
Hi Scott,
On 27/07/19 00:56, Scott Wood wrote:
> With the changes to migrate disabling, ->set_cpus_allowed() no longer
> gets deferred until migrate_enable(). To avoid releasing the bandwidth
> while the task may still be executing on the old CPU, move the subtraction
> to ->migrate_task_rq().
>
> Signed-off-by: Scott Wood <[email protected]>
> ---
> kernel/sched/deadline.c | 67 +++++++++++++++++++++++--------------------------
> 1 file changed, 31 insertions(+), 36 deletions(-)
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index c18be51f7608..2f18d0cf1b56 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1606,14 +1606,42 @@ static void yield_task_dl(struct rq *rq)
> return cpu;
> }
>
> +static void free_old_cpuset_bw_dl(struct rq *rq, struct task_struct *p)
> +{
> + struct root_domain *src_rd = rq->rd;
> +
> + /*
> + * Migrating a SCHED_DEADLINE task between exclusive
> + * cpusets (different root_domains) entails a bandwidth
> + * update. We already made space for us in the destination
> + * domain (see cpuset_can_attach()).
> + */
> + if (!cpumask_intersects(src_rd->span, p->cpus_ptr)) {
> + struct dl_bw *src_dl_b;
> +
> + src_dl_b = dl_bw_of(cpu_of(rq));
> + /*
> + * We now free resources of the root_domain we are migrating
> + * off. In the worst case, sched_setattr() may temporary fail
> + * until we complete the update.
> + */
> + raw_spin_lock(&src_dl_b->lock);
> + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
> + raw_spin_unlock(&src_dl_b->lock);
> + }
> +}
> +
> static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
> {
> struct rq *rq;
>
> - if (p->state != TASK_WAKING)
> + rq = task_rq(p);
> +
> + if (p->state != TASK_WAKING) {
> + free_old_cpuset_bw_dl(rq, p);
What happens if a DEADLINE task is moved between cpusets while it is
sleeping? Don't we miss removing its bandwidth from the old cpuset if
the task gets migrated on wakeup?
Thanks,
Juri
On Fri, 2019-09-27 at 10:11 +0200, Juri Lelli wrote:
> Hi Scott,
>
> On 27/07/19 00:56, Scott Wood wrote:
> > With the changes to migrate disabling, ->set_cpus_allowed() no longer
> > gets deferred until migrate_enable(). To avoid releasing the bandwidth
> > while the task may still be executing on the old CPU, move the
> > subtraction
> > to ->migrate_task_rq().
> >
> > Signed-off-by: Scott Wood <[email protected]>
> > ---
> > kernel/sched/deadline.c | 67 +++++++++++++++++++++++-------------------
> > -------
> > 1 file changed, 31 insertions(+), 36 deletions(-)
> >
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index c18be51f7608..2f18d0cf1b56 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -1606,14 +1606,42 @@ static void yield_task_dl(struct rq *rq)
> > return cpu;
> > }
> >
> > +static void free_old_cpuset_bw_dl(struct rq *rq, struct task_struct *p)
> > +{
> > + struct root_domain *src_rd = rq->rd;
> > +
> > + /*
> > + * Migrating a SCHED_DEADLINE task between exclusive
> > + * cpusets (different root_domains) entails a bandwidth
> > + * update. We already made space for us in the destination
> > + * domain (see cpuset_can_attach()).
> > + */
> > + if (!cpumask_intersects(src_rd->span, p->cpus_ptr)) {
> > + struct dl_bw *src_dl_b;
> > +
> > + src_dl_b = dl_bw_of(cpu_of(rq));
> > + /*
> > + * We now free resources of the root_domain we are migrating
> > + * off. In the worst case, sched_setattr() may temporary
> > fail
> > + * until we complete the update.
> > + */
> > + raw_spin_lock(&src_dl_b->lock);
> > + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
> > + raw_spin_unlock(&src_dl_b->lock);
> > + }
> > +}
> > +
> > static void migrate_task_rq_dl(struct task_struct *p, int new_cpu
> > __maybe_unused)
> > {
> > struct rq *rq;
> >
> > - if (p->state != TASK_WAKING)
> > + rq = task_rq(p);
> > +
> > + if (p->state != TASK_WAKING) {
> > + free_old_cpuset_bw_dl(rq, p);
>
> What happens if a DEADLINE task is moved between cpusets while it was
> sleeping? Don't we miss removing from the old cpuset if the task gets
> migrated on wakeup?
In that case set_task_cpu() is called by ttwu after setting state to
TASK_WAKING. I guess it could be annoying if the task doesn't wake up for a
long time and therefore doesn't release the bandwidth until then.
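For reference, the relevant ordering in try_to_wake_up() looks roughly like
this (heavily condensed from kernel/sched/core.c):

	p->state = TASK_WAKING;
	...
	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		/* -> migrate_task_rq_dl(), with p->state == TASK_WAKING */
		set_task_cpu(p, cpu);
	}

so a wakeup-time migration does reach migrate_task_rq_dl(), just not until
the wakeup itself happens.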
-Scott
On 27/09/19 11:40, Scott Wood wrote:
> On Fri, 2019-09-27 at 10:11 +0200, Juri Lelli wrote:
> > Hi Scott,
> >
> > On 27/07/19 00:56, Scott Wood wrote:
> > > With the changes to migrate disabling, ->set_cpus_allowed() no longer
> > > gets deferred until migrate_enable(). To avoid releasing the bandwidth
> > > while the task may still be executing on the old CPU, move the
> > > subtraction
> > > to ->migrate_task_rq().
> > >
> > > Signed-off-by: Scott Wood <[email protected]>
> > > ---
> > > kernel/sched/deadline.c | 67 +++++++++++++++++++++++-------------------
> > > -------
> > > 1 file changed, 31 insertions(+), 36 deletions(-)
> > >
> > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > > index c18be51f7608..2f18d0cf1b56 100644
> > > --- a/kernel/sched/deadline.c
> > > +++ b/kernel/sched/deadline.c
> > > @@ -1606,14 +1606,42 @@ static void yield_task_dl(struct rq *rq)
> > > return cpu;
> > > }
> > >
> > > +static void free_old_cpuset_bw_dl(struct rq *rq, struct task_struct *p)
> > > +{
> > > + struct root_domain *src_rd = rq->rd;
> > > +
> > > + /*
> > > + * Migrating a SCHED_DEADLINE task between exclusive
> > > + * cpusets (different root_domains) entails a bandwidth
> > > + * update. We already made space for us in the destination
> > > + * domain (see cpuset_can_attach()).
> > > + */
> > > + if (!cpumask_intersects(src_rd->span, p->cpus_ptr)) {
> > > + struct dl_bw *src_dl_b;
> > > +
> > > + src_dl_b = dl_bw_of(cpu_of(rq));
> > > + /*
> > > + * We now free resources of the root_domain we are migrating
> > > + * off. In the worst case, sched_setattr() may temporary
> > > fail
> > > + * until we complete the update.
> > > + */
> > > + raw_spin_lock(&src_dl_b->lock);
> > > + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
> > > + raw_spin_unlock(&src_dl_b->lock);
> > > + }
> > > +}
> > > +
> > > static void migrate_task_rq_dl(struct task_struct *p, int new_cpu
> > > __maybe_unused)
> > > {
> > > struct rq *rq;
> > >
> > > - if (p->state != TASK_WAKING)
> > > + rq = task_rq(p);
> > > +
> > > + if (p->state != TASK_WAKING) {
> > > + free_old_cpuset_bw_dl(rq, p);
> >
> > What happens if a DEADLINE task is moved between cpusets while it was
> > sleeping? Don't we miss removing from the old cpuset if the task gets
> > migrated on wakeup?
>
> In that case set_task_cpu() is called by ttwu after setting state to
> TASK_WAKING.
Right.
> I guess it could be annoying if the task doesn't wake up for a
> long time and therefore doesn't release the bandwidth until then.
Hummm, I was actually more worried about the fact that we call
free_old_cpuset_bw_dl() only if p->state != TASK_WAKING.
On Mon, 2019-09-30 at 09:12 +0200, Juri Lelli wrote:
> On 27/09/19 11:40, Scott Wood wrote:
> > On Fri, 2019-09-27 at 10:11 +0200, Juri Lelli wrote:
> > > Hi Scott,
> > >
> > > On 27/07/19 00:56, Scott Wood wrote:
> > > > With the changes to migrate disabling, ->set_cpus_allowed() no
> > > > longer
> > > > gets deferred until migrate_enable(). To avoid releasing the
> > > > bandwidth
> > > > while the task may still be executing on the old CPU, move the
> > > > subtraction
> > > > to ->migrate_task_rq().
> > > >
> > > > Signed-off-by: Scott Wood <[email protected]>
> > > > ---
> > > > kernel/sched/deadline.c | 67 +++++++++++++++++++++++---------------
> > > > ----
> > > > -------
> > > > 1 file changed, 31 insertions(+), 36 deletions(-)
> > > >
> > > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > > > index c18be51f7608..2f18d0cf1b56 100644
> > > > --- a/kernel/sched/deadline.c
> > > > +++ b/kernel/sched/deadline.c
> > > > @@ -1606,14 +1606,42 @@ static void yield_task_dl(struct rq *rq)
> > > > return cpu;
> > > > }
> > > >
> > > > +static void free_old_cpuset_bw_dl(struct rq *rq, struct task_struct
> > > > *p)
> > > > +{
> > > > + struct root_domain *src_rd = rq->rd;
> > > > +
> > > > + /*
> > > > + * Migrating a SCHED_DEADLINE task between exclusive
> > > > + * cpusets (different root_domains) entails a bandwidth
> > > > + * update. We already made space for us in the destination
> > > > + * domain (see cpuset_can_attach()).
> > > > + */
> > > > + if (!cpumask_intersects(src_rd->span, p->cpus_ptr)) {
> > > > + struct dl_bw *src_dl_b;
> > > > +
> > > > + src_dl_b = dl_bw_of(cpu_of(rq));
> > > > + /*
> > > > + * We now free resources of the root_domain we are
> > > > migrating
> > > > + * off. In the worst case, sched_setattr() may
> > > > temporary
> > > > fail
> > > > + * until we complete the update.
> > > > + */
> > > > + raw_spin_lock(&src_dl_b->lock);
> > > > + __dl_sub(src_dl_b, p->dl.dl_bw,
> > > > dl_bw_cpus(task_cpu(p)));
> > > > + raw_spin_unlock(&src_dl_b->lock);
> > > > + }
> > > > +}
> > > > +
> > > > static void migrate_task_rq_dl(struct task_struct *p, int new_cpu
> > > > __maybe_unused)
> > > > {
> > > > struct rq *rq;
> > > >
> > > > - if (p->state != TASK_WAKING)
> > > > + rq = task_rq(p);
> > > > +
> > > > + if (p->state != TASK_WAKING) {
> > > > + free_old_cpuset_bw_dl(rq, p);
> > >
> > > What happens if a DEADLINE task is moved between cpusets while it was
> > > sleeping? Don't we miss removing from the old cpuset if the task gets
> > > migrated on wakeup?
> >
> > In that case set_task_cpu() is called by ttwu after setting state to
> > TASK_WAKING.
>
> Right.
>
> > I guess it could be annoying if the task doesn't wake up for a
> > long time and therefore doesn't release the bandwidth until then.
>
> Hummm, I was actually more worried about the fact that we call free_old_
> cpuset_bw_dl() only if p->state != TASK_WAKING.
Oh, right. :-P Not sure what I had in mind there; we want to call it
regardless.
I assume we need rq->lock in free_old_cpuset_bw_dl()? So something like
this:
	if (p->state == TASK_WAKING)
		raw_spin_lock(&rq->lock);
	free_old_cpuset_bw_dl(rq, p);
	if (p->state != TASK_WAKING)
		return;

	if (p->dl.dl_non_contending) {
		....
BTW, is the full cpumask_intersects() necessary or would it suffice to see
that the new cpu is not in the old span?
-Scott
On 30/09/19 11:24, Scott Wood wrote:
> On Mon, 2019-09-30 at 09:12 +0200, Juri Lelli wrote:
[...]
> > Hummm, I was actually more worried about the fact that we call free_old_
> > cpuset_bw_dl() only if p->state != TASK_WAKING.
>
> Oh, right. :-P Not sure what I had in mind there; we want to call it
> regardless.
>
> I assume we need rq->lock in free_old_cpuset_bw_dl()? So something like
I think we can do with rcu_read_lock_sched() (see dl_task_can_attach()).
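The pattern there is roughly the following (condensed; dest_cpu is picked
with cpumask_any_and(cpu_active_mask, cs_cpus_allowed)):

	rcu_read_lock_sched();
	dl_b = dl_bw_of(dest_cpu);
	raw_spin_lock_irqsave(&dl_b->lock, flags);
	cpus = dl_bw_cpus(dest_cpu);
	overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
	if (!overflow)
		__dl_add(dl_b, p->dl.dl_bw, cpus);
	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
	rcu_read_unlock_sched();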
> this:
>
> 	if (p->state == TASK_WAKING)
> 		raw_spin_lock(&rq->lock);
> 	free_old_cpuset_bw_dl(rq, p);
> 	if (p->state != TASK_WAKING)
> 		return;
>
> 	if (p->dl.dl_non_contending) {
> 		....
>
> BTW, is the full cpumask_intersects() necessary or would it suffice to see
> that the new cpu is not in the old span?
Checking new cpu only should be OK.
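Something like this is what I'd imagine (just a sketch on top of this patch,
untested, assuming new_cpu gets plumbed through from migrate_task_rq_dl()):

	static void free_old_cpuset_bw_dl(struct rq *rq, struct task_struct *p,
					  int new_cpu)
	{
		struct root_domain *src_rd = rq->rd;
		struct dl_bw *src_dl_b;

		/* Still inside the old root_domain: nothing to free. */
		if (cpumask_test_cpu(new_cpu, src_rd->span))
			return;

		src_dl_b = dl_bw_of(cpu_of(rq));
		raw_spin_lock(&src_dl_b->lock);
		__dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
		raw_spin_unlock(&src_dl_b->lock);
	}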
Thanks,
Juri
On Tue, 2019-10-01 at 10:52 +0200, Juri Lelli wrote:
> On 30/09/19 11:24, Scott Wood wrote:
> > On Mon, 2019-09-30 at 09:12 +0200, Juri Lelli wrote:
>
> [...]
>
> > > Hummm, I was actually more worried about the fact that we call
> > > free_old_
> > > cpuset_bw_dl() only if p->state != TASK_WAKING.
> >
> > Oh, right. :-P Not sure what I had in mind there; we want to call it
> > regardless.
> >
> > I assume we need rq->lock in free_old_cpuset_bw_dl()? So something like
>
> I think we can do with rcu_read_lock_sched() (see dl_task_can_attach()).
RCU will keep dl_bw from being freed under us (we're implicitly in an RCU
sched read section due to atomic context). It won't stop rq->rd from
changing, but that could have happened before we took rq->lock. If the cpu
the task was running on was removed from the cpuset, and that raced with the
task being moved to a different cpuset, couldn't we end up erroneously
subtracting from the cpu's new root domain (or failing to subtract at all if
the old cpu's new cpuset happens to be the task's new cpuset)? I don't see
anything that forces tasks off of the cpu when a cpu is removed from a
cpuset (though maybe I'm not looking in the right place), so the race window
could be quite large. In any case, that's an existing problem that's not
going to get solved in this patchset.
-Scott
On 09/10/19 01:25, Scott Wood wrote:
> On Tue, 2019-10-01 at 10:52 +0200, Juri Lelli wrote:
> > On 30/09/19 11:24, Scott Wood wrote:
> > > On Mon, 2019-09-30 at 09:12 +0200, Juri Lelli wrote:
> >
> > [...]
> >
> > > > Hummm, I was actually more worried about the fact that we call
> > > > free_old_
> > > > cpuset_bw_dl() only if p->state != TASK_WAKING.
> > >
> > > Oh, right. :-P Not sure what I had in mind there; we want to call it
> > > regardless.
> > >
> > > I assume we need rq->lock in free_old_cpuset_bw_dl()? So something like
> >
> > I think we can do with rcu_read_lock_sched() (see dl_task_can_attach()).
>
> RCU will keep dl_bw from being freed under us (we're implicitly in an RCU
> sched read section due to atomic context). It won't stop rq->rd from
> changing, but that could have happened before we took rq->lock. If the cpu
> the task was running on was removed from the cpuset, and that raced with the
> task being moved to a different cpuset, couldn't we end up erroneously
> subtracting from the cpu's new root domain (or failing to subtract at all if
> the old cpu's new cpuset happens to be the task's new cpuset)? I don't see
> anything that forces tasks off of the cpu when a cpu is removed from a
> cpuset (though maybe I'm not looking in the right place), so the race window
> could be quite large. In any case, that's an existing problem that's not
> going to get solved in this patchset.
OK. So, mainline has got cpuset_read_lock() which should be enough to
guard against changes to rd(s).
I agree that rq->lock is needed here.
Thanks,
Juri
On Wed, 2019-10-09 at 09:27 +0200, Juri Lelli wrote:
> On 09/10/19 01:25, Scott Wood wrote:
> > On Tue, 2019-10-01 at 10:52 +0200, Juri Lelli wrote:
> > > On 30/09/19 11:24, Scott Wood wrote:
> > > > On Mon, 2019-09-30 at 09:12 +0200, Juri Lelli wrote:
> > >
> > > [...]
> > >
> > > > > Hummm, I was actually more worried about the fact that we call
> > > > > free_old_
> > > > > cpuset_bw_dl() only if p->state != TASK_WAKING.
> > > >
> > > > Oh, right. :-P Not sure what I had in mind there; we want to call
> > > > it
> > > > regardless.
> > > >
> > > > I assume we need rq->lock in free_old_cpuset_bw_dl()? So something
> > > > like
> > >
> > > I think we can do with rcu_read_lock_sched() (see
> > > dl_task_can_attach()).
> >
> > RCU will keep dl_bw from being freed under us (we're implicitly in an
> > RCU
> > sched read section due to atomic context). It won't stop rq->rd from
> > changing, but that could have happened before we took rq->lock. If the
> > cpu
> > the task was running on was removed from the cpuset, and that raced with
> > the
> > task being moved to a different cpuset, couldn't we end up erroneously
> > subtracting from the cpu's new root domain (or failing to subtract at
> > all if
> > the old cpu's new cpuset happens to be the task's new cpuset)? I don't
> > see
> > anything that forces tasks off of the cpu when a cpu is removed from a
> > cpuset (though maybe I'm not looking in the right place), so the race
> > window
> > could be quite large. In any case, that's an existing problem that's
> > not
> > going to get solved in this patchset.
>
> OK. So, mainline has got cpuset_read_lock() which should be enough to
> guard against changes to rd(s).
>
> I agree that rq->lock is needed here.
My point was that rq->lock isn't actually helping, because rq->rd could have
changed before rq->lock is acquired (and it's still the old rd that needs
the bandwidth subtraction). cpuset_mutex/cpuset_rwsem helps somewhat,
though there's still a problem due to the subtraction not happening until
the task is woken up (by which time cpuset_mutex may have been released and
further reconfiguration could have happened). This would be an issue even
without lazy migrate, since in that case ->set_cpus_allowed() can get
deferred, but this patch makes the window much bigger.
The right solution is probably to explicitly track the rd for which a task
has a pending bandwidth subtraction (if any), and to continue doing it from
set_cpus_allowed() if the task is not migrate-disabled. In the meantime, I
think we should drop this patch from the patchset -- without it, lazy
migrate disable doesn't really make the race situation any worse than it
already was.
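To make the "track the pending rd" idea a bit more concrete, something along
these lines (all names below are invented for illustration; a real version
would also need to hold a reference on the rd so it cannot be freed while the
subtraction is still pending):

	/* Hypothetical field in struct sched_dl_entity: */
	struct root_domain	*dl_pending_sub_rd;	/* rd still owed a __dl_sub() */

	/* Run from set_cpus_allowed()/migrate_enable() once the task can move: */
	static void dl_flush_pending_sub(struct task_struct *p)
	{
		struct root_domain *rd = p->dl.dl_pending_sub_rd;
		struct dl_bw *dl_b;

		if (!rd)
			return;

		dl_b = &rd->dl_bw;
		raw_spin_lock(&dl_b->lock);
		/* cpumask_weight(rd->span) stands in for dl_bw_cpus() on the old rd */
		__dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rd->span));
		raw_spin_unlock(&dl_b->lock);

		p->dl.dl_pending_sub_rd = NULL;
	}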
BTW, what happens to the bw addition in dl_task_can_attach() if a subsequent
can_attach fails and the whole operation is cancelled?
-Scott
On 09/10/19 14:12, Scott Wood wrote:
> On Wed, 2019-10-09 at 09:27 +0200, Juri Lelli wrote:
> > On 09/10/19 01:25, Scott Wood wrote:
> > > On Tue, 2019-10-01 at 10:52 +0200, Juri Lelli wrote:
> > > > On 30/09/19 11:24, Scott Wood wrote:
> > > > > On Mon, 2019-09-30 at 09:12 +0200, Juri Lelli wrote:
> > > >
> > > > [...]
> > > >
> > > > > > Hummm, I was actually more worried about the fact that we call
> > > > > > free_old_
> > > > > > cpuset_bw_dl() only if p->state != TASK_WAKING.
> > > > >
> > > > > Oh, right. :-P Not sure what I had in mind there; we want to call
> > > > > it
> > > > > regardless.
> > > > >
> > > > > I assume we need rq->lock in free_old_cpuset_bw_dl()? So something
> > > > > like
> > > >
> > > > I think we can do with rcu_read_lock_sched() (see
> > > > dl_task_can_attach()).
> > >
> > > RCU will keep dl_bw from being freed under us (we're implicitly in an
> > > RCU
> > > sched read section due to atomic context). It won't stop rq->rd from
> > > changing, but that could have happened before we took rq->lock. If the
> > > cpu
> > > the task was running on was removed from the cpuset, and that raced with
> > > the
> > > task being moved to a different cpuset, couldn't we end up erroneously
> > > subtracting from the cpu's new root domain (or failing to subtract at
> > > all if
> > > the old cpu's new cpuset happens to be the task's new cpuset)? I don't
> > > see
> > > anything that forces tasks off of the cpu when a cpu is removed from a
> > > cpuset (though maybe I'm not looking in the right place), so the race
> > > window
> > > could be quite large. In any case, that's an existing problem that's
> > > not
> > > going to get solved in this patchset.
> >
> > OK. So, mainline has got cpuset_read_lock() which should be enough to
> > guard against changes to rd(s).
> >
> > I agree that rq->lock is needed here.
>
> My point was that rq->lock isn't actually helping, because rq->rd could have
> changed before rq->lock is acquired (and it's still the old rd that needs
> the bandwidth subtraction). cpuset_mutex/cpuset_rwsem helps somewhat,
> though there's still a problem due to the subtraction not happening until
> the task is woken up (by which time cpuset_mutex may have been released and
> further reconfiguration could have happened). This would be an issue even
> without lazy migrate, since in that case ->set_cpus_allowed() can get
> deferred, but this patch makes the window much bigger.
>
> The right solution is probably to explicitly track the rd for which a task
> has a pending bandwidth subtraction (if any), and to continue doing it from
> set_cpus_allowed() if the task is not migrate-disabled. In the meantime, I
> think we should drop this patch from the patchset -- without it, lazy
> migrate disable doesn't really make the race situation any worse than it
> already was.
I'm OK with dropping it for now (as we also have other possible issues
as you point out below), but I really wonder what a solution would look
like here. The problem is that if a domain(s) reconfiguration happens
while the task is migrate disabled, and we let the reconf destroy/rebuild
domains, the old rd could be gone by the time the task gets migrate
enabled again, and the task could continue running in a new rd without
its bandwidth being accounted for from the migrate enable instant, no?
:-/
> BTW, what happens to the bw addition in dl_task_can_attach() if a subsequent
> can_attach fails and the whole operation is cancelled?
Oh, yeah, that doesn't look good. :-(
Maybe we can use cancel_attach() to fix things up?
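Roughly what I'd imagine (cpuset already has a cancel_attach hook for its own
attach_in_progress bookkeeping, so the DL fix-up could slot in there;
dl_bw_cancel_attach() below is an invented helper, not existing API):

	static void cpuset_cancel_attach(struct cgroup_taskset *tset)
	{
		struct cgroup_subsys_state *css;
		struct task_struct *task;

		cgroup_taskset_first(tset, &css);

		cgroup_taskset_for_each(task, css, tset) {
			if (dl_task(task))
				/* undo the __dl_add() done in dl_task_can_attach() */
				dl_bw_cancel_attach(task, css_cs(css));
		}
	}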
Thanks,
Juri