2015-08-13 10:49:54

by Byungchul Park

Subject: [PATCH 1/2] sched: sync a se with its cfs_rq when attaching and detaching

From: Byungchul Park <[email protected]>

The current code gets cfs_rq's average loads wrong when moving a task
to another cgroup (i.e. another cfs_rq). Testing with "echo pid > cgroup"
showed that e.g. cfs_rq->avg.load_avg grew larger and larger every time
the task was moved from one cgroup to another. We have to sync the se's
average load with both the *prev* cfs_rq and the next cfs_rq when
changing its group. We also need to sync it with its cfs_rq when the
task returns to the fair class after having left it.

In addition, I introduced two helpers for attaching/detaching a se's
load to/from its cfs_rq, converted the existing open-coded sites to use
them, and placed the calls where a se is actually attached to/detached
from its cfs_rq.
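
To illustrate the failure mode, here is a minimal user-space sketch (not
kernel code; struct fake_avg, attach()/detach() and the numbers are made
up for illustration only) of what happens when a cgroup move only adds
the se's load to the destination cfs_rq and never removes it from the
previous one:

	#include <stdio.h>

	struct fake_avg {
		long load_avg;
	};

	static void attach(struct fake_avg *cfs_rq, const struct fake_avg *se)
	{
		cfs_rq->load_avg += se->load_avg;
	}

	static void detach(struct fake_avg *cfs_rq, const struct fake_avg *se)
	{
		cfs_rq->load_avg -= se->load_avg;
		if (cfs_rq->load_avg < 0)
			cfs_rq->load_avg = 0;
	}

	int main(void)
	{
		struct fake_avg rq_a = { 100 };	/* task's load already attached to A */
		struct fake_avg rq_b = { 0 };
		struct fake_avg se = { 100 };
		int i;

		/* Buggy move: attach to the destination, never detach from the source. */
		for (i = 0; i < 3; i++) {
			attach(&rq_b, &se);	/* move A -> B */
			attach(&rq_a, &se);	/* move B -> A */
		}
		printf("without detach: A=%ld B=%ld\n", rq_a.load_avg, rq_b.load_avg);

		/* Fixed move: detach from the previous cfs_rq, then attach to the next. */
		rq_a.load_avg = 100;
		rq_b.load_avg = 0;
		for (i = 0; i < 3; i++) {
			detach(&rq_a, &se);
			attach(&rq_b, &se);	/* move A -> B */
			detach(&rq_b, &se);
			attach(&rq_a, &se);	/* move B -> A */
		}
		printf("with detach:    A=%ld B=%ld\n", rq_a.load_avg, rq_b.load_avg);

		return 0;
	}

With the missing detach, cfs_rq A ends up at 400 and B at 300 for a
single task of load 100; with the paired detach/attach both stay
consistent.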

Signed-off-by: Byungchul Park <[email protected]>
---
kernel/sched/fair.c | 82 ++++++++++++++++++++++++++++++---------------------
1 file changed, 48 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7f0040c..d91e4da 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2709,6 +2709,31 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
update_tg_load_avg(cfs_rq, 0);
}

+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ se->avg.last_update_time = cfs_rq->avg.last_update_time;
+ cfs_rq->avg.load_avg += se->avg.load_avg;
+ cfs_rq->avg.load_sum += se->avg.load_sum;
+ cfs_rq->avg.util_avg += se->avg.util_avg;
+ cfs_rq->avg.util_sum += se->avg.util_sum;
+}
+
+static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
+
+ cfs_rq->avg.load_avg =
+ max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+ cfs_rq->avg.load_sum =
+ max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+ cfs_rq->avg.util_avg =
+ max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+ cfs_rq->avg.util_sum =
+ max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+}
+
/* Add the load generated by se into cfs_rq's load average */
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2717,27 +2742,20 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
u64 now = cfs_rq_clock_task(cfs_rq);
int migrated = 0, decayed;

- if (sa->last_update_time == 0) {
- sa->last_update_time = now;
+ if (sa->last_update_time == 0)
migrated = 1;
- }
- else {
+ else
__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);

decayed = update_cfs_rq_load_avg(now, cfs_rq);

cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;

- if (migrated) {
- cfs_rq->avg.load_avg += sa->load_avg;
- cfs_rq->avg.load_sum += sa->load_sum;
- cfs_rq->avg.util_avg += sa->util_avg;
- cfs_rq->avg.util_sum += sa->util_sum;
- }
+ if (migrated)
+ attach_entity_load_avg(cfs_rq, se);

if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
@@ -7911,17 +7929,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)

#ifdef CONFIG_SMP
/* Catch up with the cfs_rq and remove our load when we leave */
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg,
- se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
-
- cfs_rq->avg.load_avg =
- max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
- cfs_rq->avg.load_sum =
- max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
- cfs_rq->avg.util_avg =
- max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
- cfs_rq->avg.util_sum =
- max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+ detach_entity_load_avg(cfs_rq, se);
#endif
}

@@ -7940,6 +7948,11 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif

+#ifdef CONFIG_SMP
+ /* synchronize task with its cfs_rq */
+ attach_entity_load_avg(cfs_rq_of(&p->se), &p->se);
+#endif
+
if (!task_on_rq_queued(p)) {

/*
@@ -8032,23 +8045,24 @@ static void task_move_group_fair(struct task_struct *p, int queued)
if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
queued = 1;

+ cfs_rq = cfs_rq_of(se);
if (!queued)
- se->vruntime -= cfs_rq_of(se)->min_vruntime;
+ se->vruntime -= cfs_rq->min_vruntime;
+
+#ifdef CONFIG_SMP
+ /* synchronize task with its prev cfs_rq */
+ detach_entity_load_avg(cfs_rq, se);
+#endif
set_task_rq(p, task_cpu(p));
se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!queued) {
- cfs_rq = cfs_rq_of(se);
+ cfs_rq = cfs_rq_of(se);
+ if (!queued)
se->vruntime += cfs_rq->min_vruntime;

#ifdef CONFIG_SMP
- /* Virtually synchronize task with its new cfs_rq */
- p->se.avg.last_update_time = cfs_rq->avg.last_update_time;
- cfs_rq->avg.load_avg += p->se.avg.load_avg;
- cfs_rq->avg.load_sum += p->se.avg.load_sum;
- cfs_rq->avg.util_avg += p->se.avg.util_avg;
- cfs_rq->avg.util_sum += p->se.avg.util_sum;
+ /* Virtually synchronize task with its new cfs_rq */
+ attach_entity_load_avg(cfs_rq, se);
#endif
- }
}

void free_fair_sched_group(struct task_group *tg)
--
1.7.9.5


2015-08-13 10:51:11

by Byungchul Park

Subject: [PATCH 2/2] sched: make task_move_group_fair simple by using switched_to(from)_fair

From: Byungchul Park <[email protected]>

I introduced need_vruntime_adjust() to check whether vruntime needs to
be adjusted when attaching/detaching a se to/from its cfs_rq, and used
it in the class-switch paths.

task_move_group_fair() now reuses switched_from_fair() and
switched_to_fair() for the detach and attach operations, which makes it
much simpler.
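
As background, here is a small user-space sketch (made-up values, not
kernel code) of why the adjustment has to be skipped when the vruntime
is already normalized: re-applying the "-= prev min_vruntime /
+= next min_vruntime" conversion to an already-relative vruntime leaks
the delta between the two cfs_rqs' min_vruntime into the task's
position.

	#include <stdio.h>

	int main(void)
	{
		/* min_vruntime of the previous and the next cfs_rq (made up). */
		unsigned long long prev_min = 1000000ULL;
		unsigned long long next_min = 5000000ULL;

		/* Already-normalized (relative) vruntime, e.g. a TASK_WAKING task. */
		unsigned long long vruntime = 2000ULL;

		/* Correct: leave it alone; it is re-based at enqueue time. */
		unsigned long long kept = vruntime;

		/*
		 * Wrong: treat it as absolute and convert anyway. The difference
		 * between the two min_vruntime values becomes an unearned boost
		 * or penalty in the new cfs_rq.
		 */
		unsigned long long converted = vruntime - prev_min + next_min;

		printf("kept: %llu, converted: %llu, skew: %llu\n",
		       kept, converted, converted - kept);
		return 0;
	}

The skew printed equals next_min - prev_min, which is exactly the boost
or penalty the removed comment in task_move_group_fair() warned about.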

Signed-off-by: Byungchul Park <[email protected]>
---
kernel/sched/fair.c | 80 +++++++++++++++++++--------------------------------
1 file changed, 29 insertions(+), 51 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d91e4da..5ec846f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7904,6 +7904,31 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
check_preempt_curr(rq, p, 0);
}

+static inline int need_vruntime_adjust(struct task_struct *p, int queued)
+{
+ struct sched_entity *se = &p->se;
+
+ /*
+ * When !queued, vruntime of the task has usually NOT been normalized.
+ * But there are some cases where it has already been normalized:
+ *
+ * - Moving a forked child which is waiting for being woken up by
+ * wake_up_new_task().
+ * - Moving a task which has been woken up by try_to_wake_up() and
+ * waiting for actually being woken up by sched_ttwu_pending().
+ */
+ if (queued)
+ return 0;
+
+ if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+ return 0;
+
+ if (p->state != TASK_RUNNING)
+ return 1;
+
+ return 0;
+}
+
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -7918,7 +7943,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
* have normalized the vruntime, if it's !queued, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
+ if (need_vruntime_adjust(p, task_on_rq_queued(p))) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -7939,7 +7964,6 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Since the real-depth could have been changed (only FAIR
@@ -7964,7 +7988,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* has non-normalized vruntime, if it's !queued, then it still has
* normalized vruntime.
*/
- if (p->state != TASK_RUNNING)
+ if (need_vruntime_adjust(p, task_on_rq_queued(p)))
se->vruntime += cfs_rq_of(se)->min_vruntime;
return;
}
@@ -8014,55 +8038,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int queued)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq;
-
- /*
- * If the task was not on the rq at the time of this cgroup movement
- * it must have been asleep, sleeping tasks keep their ->vruntime
- * absolute on their old rq until wakeup (needed for the fair sleeper
- * bonus in place_entity()).
- *
- * If it was on the rq, we've just 'preempted' it, which does convert
- * ->vruntime to a relative base.
- *
- * Make sure both cases convert their relative position when migrating
- * to another cgroup's rq. This does somewhat interfere with the
- * fair sleeper stuff for the first placement, but who cares.
- */
- /*
- * When !queued, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - Moving a forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - Moving a task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- *
- * To prevent boost or penalty in the new cfs_rq caused by delta
- * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
- */
- if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- queued = 1;
-
- cfs_rq = cfs_rq_of(se);
- if (!queued)
- se->vruntime -= cfs_rq->min_vruntime;
-
-#ifdef CONFIG_SMP
- /* synchronize task with its prev cfs_rq */
- detach_entity_load_avg(cfs_rq, se);
-#endif
+ switched_from_fair(task_rq(p), p);
set_task_rq(p, task_cpu(p));
- se->depth = se->parent ? se->parent->depth + 1 : 0;
- cfs_rq = cfs_rq_of(se);
- if (!queued)
- se->vruntime += cfs_rq->min_vruntime;
-
-#ifdef CONFIG_SMP
- /* Virtually synchronize task with its new cfs_rq */
- attach_entity_load_avg(cfs_rq, se);
-#endif
+ switched_to_fair(task_rq(p), p);
}

void free_fair_sched_group(struct task_group *tg)
--
1.7.9.5

2015-08-14 14:55:57

by T. Zhou

Subject: Re: [PATCH 2/2] sched: make task_move_group_fair simple by using switched_to(from)_fair

Hi,

On Thu, Aug 13, 2015 at 07:49:19PM +0900, [email protected] wrote:
> +static inline int need_vruntime_adjust(struct task_struct *p, int queued)
> +{
> + struct sched_entity *se = &p->se;
> +
> + /*
> + * When !queued, vruntime of the task has usually NOT been normalized.
> + * But there are some cases where it has already been normalized:
> + *
> + * - Moving a forked child which is waiting for being woken up by
> + * wake_up_new_task().
> + * - Moving a task which has been woken up by try_to_wake_up() and
> + * waiting for actually being woken up by sched_ttwu_pending().
> + */
> + if (queued)
> + return 0;
> +
> + if (!se->sum_exec_runtime || p->state == TASK_WAKING)
> + return 0;
> +
> + if (p->state != TASK_RUNNING)
> + return 1;
> +
> + return 0;
> +}

The original switched_from/to_fair() do not include the following check:

if (!se->sum_exec_runtime || p->state == TASK_WAKING)
	return 0;

I guess you are sure this check will always fail in the
switched_from/to_fair() case?

Thanks
> static void switched_from_fair(struct rq *rq, struct task_struct *p)
> {
> struct sched_entity *se = &p->se;
> @@ -7918,7 +7943,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
> * have normalized the vruntime, if it's !queued, then only when
> * the task is sleeping will it still have non-normalized vruntime.
> */
> - if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
> + if (need_vruntime_adjust(p, task_on_rq_queued(p))) {
> /*
> * Fix up our vruntime so that the current sleep doesn't
> * cause 'unlimited' sleep bonus.
> @@ -7939,7 +7964,6 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
> static void switched_to_fair(struct rq *rq, struct task_struct *p)
> {
> struct sched_entity *se = &p->se;
> -
> #ifdef CONFIG_FAIR_GROUP_SCHED
> /*
> * Since the real-depth could have been changed (only FAIR
> @@ -7964,7 +7988,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
> * has non-normalized vruntime, if it's !queued, then it still has
> * normalized vruntime.
> */
> - if (p->state != TASK_RUNNING)
> + if (need_vruntime_adjust(p, task_on_rq_queued(p)))
> se->vruntime += cfs_rq_of(se)->min_vruntime;
> return;
> }
> @@ -8014,55 +8038,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
> #ifdef CONFIG_FAIR_GROUP_SCHED
> static void task_move_group_fair(struct task_struct *p, int queued)
> {
> - struct sched_entity *se = &p->se;
> - struct cfs_rq *cfs_rq;
> -
> - /*
> - * If the task was not on the rq at the time of this cgroup movement
> - * it must have been asleep, sleeping tasks keep their ->vruntime
> - * absolute on their old rq until wakeup (needed for the fair sleeper
> - * bonus in place_entity()).
> - *
> - * If it was on the rq, we've just 'preempted' it, which does convert
> - * ->vruntime to a relative base.
> - *
> - * Make sure both cases convert their relative position when migrating
> - * to another cgroup's rq. This does somewhat interfere with the
> - * fair sleeper stuff for the first placement, but who cares.
> - */
> - /*
> - * When !queued, vruntime of the task has usually NOT been normalized.
> - * But there are some cases where it has already been normalized:
> - *
> - * - Moving a forked child which is waiting for being woken up by
> - * wake_up_new_task().
> - * - Moving a task which has been woken up by try_to_wake_up() and
> - * waiting for actually being woken up by sched_ttwu_pending().
> - *
> - * To prevent boost or penalty in the new cfs_rq caused by delta
> - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
> - */
> - if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
> - queued = 1;
> -
> - cfs_rq = cfs_rq_of(se);
> - if (!queued)
> - se->vruntime -= cfs_rq->min_vruntime;
> -
> -#ifdef CONFIG_SMP
> - /* synchronize task with its prev cfs_rq */
> - detach_entity_load_avg(cfs_rq, se);
> -#endif
> + switched_from_fair(task_rq(p), p);
> set_task_rq(p, task_cpu(p));
> - se->depth = se->parent ? se->parent->depth + 1 : 0;
> - cfs_rq = cfs_rq_of(se);
> - if (!queued)
> - se->vruntime += cfs_rq->min_vruntime;
> -
> -#ifdef CONFIG_SMP
> - /* Virtually synchronize task with its new cfs_rq */
> - attach_entity_load_avg(cfs_rq, se);
> -#endif
> + switched_to_fair(task_rq(p), p);
> }
>
> void free_fair_sched_group(struct task_group *tg)

--
Tao

2015-08-15 08:21:43

by Byungchul Park

Subject: Re: [PATCH 2/2] sched: make task_move_group_fair simple by using switched_to(from)_fair

On Fri, Aug 14, 2015 at 10:50:34PM +0800, T. Zhou wrote:
> Hi,
>
> On Thu, Aug 13, 2015 at 07:49:19PM +0900, [email protected] wrote:
> > +static inline int need_vruntime_adjust(struct task_struct *p, int queued)
> > +{
> > + struct sched_entity *se = &p->se;
> > +
> > + /*
> > + * When !queued, vruntime of the task has usually NOT been normalized.
> > + * But there are some cases where it has already been normalized:
> > + *
> > + * - Moving a forked child which is waiting for being woken up by
> > + * wake_up_new_task().
> > + * - Moving a task which has been woken up by try_to_wake_up() and
> > + * waiting for actually being woken up by sched_ttwu_pending().
> > + */
> > + if (queued)
> > + return 0;
> > +
> > + if (!se->sum_exec_runtime || p->state == TASK_WAKING)
> > + return 0;
> > +
> > + if (p->state != TASK_RUNNING)
> > + return 1;
> > +
> > + return 0;
> > +}
>
> Original switched_from/to_fair() do not include the following check.
>
> if (!se->sum_exec_runtime || p->state == TASK_WAKING)
> return 0;
>
> I guess you're sure this check will always fail in switched_from/to_fair() case

Hello,

I wanted to introduce a general function for checking whether a
vruntime adjustment is necessary, which may also be useful elsewhere in
the future. :)

Thanks,
Byungchul

>
> Thanks
> > static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > {
> > struct sched_entity *se = &p->se;
> > @@ -7918,7 +7943,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > * have normalized the vruntime, if it's !queued, then only when
> > * the task is sleeping will it still have non-normalized vruntime.
> > */
> > - if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
> > + if (need_vruntime_adjust(p, task_on_rq_queued(p))) {
> > /*
> > * Fix up our vruntime so that the current sleep doesn't
> > * cause 'unlimited' sleep bonus.
> > @@ -7939,7 +7964,6 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > static void switched_to_fair(struct rq *rq, struct task_struct *p)
> > {
> > struct sched_entity *se = &p->se;
> > -
> > #ifdef CONFIG_FAIR_GROUP_SCHED
> > /*
> > * Since the real-depth could have been changed (only FAIR
> > @@ -7964,7 +7988,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
> > * has non-normalized vruntime, if it's !queued, then it still has
> > * normalized vruntime.
> > */
> > - if (p->state != TASK_RUNNING)
> > + if (need_vruntime_adjust(p, task_on_rq_queued(p)))
> > se->vruntime += cfs_rq_of(se)->min_vruntime;
> > return;
> > }
> > @@ -8014,55 +8038,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
> > #ifdef CONFIG_FAIR_GROUP_SCHED
> > static void task_move_group_fair(struct task_struct *p, int queued)
> > {
> > - struct sched_entity *se = &p->se;
> > - struct cfs_rq *cfs_rq;
> > -
> > - /*
> > - * If the task was not on the rq at the time of this cgroup movement
> > - * it must have been asleep, sleeping tasks keep their ->vruntime
> > - * absolute on their old rq until wakeup (needed for the fair sleeper
> > - * bonus in place_entity()).
> > - *
> > - * If it was on the rq, we've just 'preempted' it, which does convert
> > - * ->vruntime to a relative base.
> > - *
> > - * Make sure both cases convert their relative position when migrating
> > - * to another cgroup's rq. This does somewhat interfere with the
> > - * fair sleeper stuff for the first placement, but who cares.
> > - */
> > - /*
> > - * When !queued, vruntime of the task has usually NOT been normalized.
> > - * But there are some cases where it has already been normalized:
> > - *
> > - * - Moving a forked child which is waiting for being woken up by
> > - * wake_up_new_task().
> > - * - Moving a task which has been woken up by try_to_wake_up() and
> > - * waiting for actually being woken up by sched_ttwu_pending().
> > - *
> > - * To prevent boost or penalty in the new cfs_rq caused by delta
> > - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
> > - */
> > - if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
> > - queued = 1;
> > -
> > - cfs_rq = cfs_rq_of(se);
> > - if (!queued)
> > - se->vruntime -= cfs_rq->min_vruntime;
> > -
> > -#ifdef CONFIG_SMP
> > - /* synchronize task with its prev cfs_rq */
> > - detach_entity_load_avg(cfs_rq, se);
> > -#endif
> > + switched_from_fair(task_rq(p), p);
> > set_task_rq(p, task_cpu(p));
> > - se->depth = se->parent ? se->parent->depth + 1 : 0;
> > - cfs_rq = cfs_rq_of(se);
> > - if (!queued)
> > - se->vruntime += cfs_rq->min_vruntime;
> > -
> > -#ifdef CONFIG_SMP
> > - /* Virtually synchronize task with its new cfs_rq */
> > - attach_entity_load_avg(cfs_rq, se);
> > -#endif
> > + switched_to_fair(task_rq(p), p);
> > }
> >
> > void free_fair_sched_group(struct task_group *tg)
>
> --
> Tao