From: Joonwoo Park <joonwoop@codeaurora.org>
To: Peter Zijlstra
Cc: Joonwoo Park, Ingo Molnar, linux-kernel@vger.kernel.org
Subject: [PATCH] sched/fair: fix fairness problems among the tasks in different cgroups
Date: Tue, 18 Oct 2016 14:26:43 -0700
Message-Id: <1476826003-32522-1-git-send-email-joonwoop@codeaurora.org>

When a new cgroup is created, the scheduler attaches the child cgroup to
its parent and also increases the parent's task_group load_avg to account
for the added load, via the following path:

  sched_create_group()
    alloc_fair_sched_group()
  sched_online_group()
    online_fair_sched_group()
      for_each_possible_cpu()
        post_init_entity_util_avg()
          update_tg_load_avg()

However, the parent's load_avg is shared by all CPUs, so it ends up being
increased once per possible CPU.  For example, with 8 possible CPUs (and
even when only one of them remains online after hotplug, since the loop is
over possible CPUs), creating the empty cgroups /grp1 and /grp1/grp11
leaves their task_group load_avg at 8092 and 1024 respectively, whereas
the desired load_avg for both cgroups is 1024, which today we only get
when booting with a single possible CPU.

Such incorrect load_avg accounting causes severe unfairness between tasks
that sit in different cgroups.  Consider a scenario with online CPUs = 1
and possible CPUs = 4, where two CPU-bound tasks are running, one in the
parent (root) cgroup and one in the child cgroup:

  # echo 0 > /sys/devices/system/cpu/cpu1/online
  # echo 0 > /sys/devices/system/cpu/cpu2/online
  # echo 0 > /sys/devices/system/cpu/cpu3/online
  # cat /sys/devices/system/cpu/online
  0
  # mkdir /sys/fs/cgroup/grp1
  # dd if=/dev/zero of=/dev/null &
  # echo $! > /sys/fs/cgroup/tasks
  # dd if=/dev/zero of=/dev/null &
  # echo $! > /sys/fs/cgroup/grp1/tasks

After 3 seconds, the task in the root cgroup has received 4 times as much
execution time as the task in the child cgroup, because the number of
possible CPUs is 4 and the scheduler therefore thinks the root cgroup
carries 4 times the load of the child cgroup:

  dd (2029, #threads: 1)
    se.exec_start         :  562900.460656
    se.sum_exec_runtime   :    2573.175002
  dd (2032, #threads: 1)
    se.exec_start         :  562900.037152
    se.sum_exec_runtime   :     655.439360

Whereas booting the same system with maxcpus=1 makes both tasks run
evenly:

  dd (1952, #threads: 1)
    se.exec_start         :   75660.457449
    se.sum_exec_runtime   :    1754.045078
  dd (1955, #threads: 1)
    se.exec_start         :   75680.029689
    se.sum_exec_runtime   :    1768.195390

Fix these fairness problems by updating the parent task group's load_avg
only once when a new child cgroup is created.
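To illustrate the accounting, here is a stand-alone user-space sketch (not
kernel code): it only models the behaviour described above.  The model_*
names and INIT_LOAD constant are made up for this illustration; 1024
mirrors the per-entity initial load contribution from the example.

  /*
   * User-space model of the shared tg->load_avg accounting: before the
   * fix the parent's load_avg is bumped on every possible-CPU iteration
   * of online_fair_sched_group(); with the fix it is bumped only once.
   */
  #include <stdio.h>

  #define INIT_LOAD 1024          /* initial load contribution per entity */

  static long tg_load_avg;        /* models the shared parent tg->load_avg */

  static void model_online_group(int nr_possible_cpus, int update_once)
  {
          int cpu;

          tg_load_avg = 0;
          for (cpu = 0; cpu < nr_possible_cpus; cpu++) {
                  /* update the shared sum either every iteration or once */
                  if (!update_once || cpu == 0)
                          tg_load_avg += INIT_LOAD;
          }
          printf("%s: parent tg load_avg = %ld\n",
                 update_once ? "fixed " : "before", tg_load_avg);
  }

  int main(void)
  {
          model_online_group(8, 0);       /* before: 8 * 1024 = 8192 */
          model_online_group(8, 1);       /* fixed:  1024 */
          return 0;
  }

Run with 8 possible CPUs, the first call prints a parent load_avg inflated
by the number of possible CPUs, while the second prints the single expected
contribution, which is the behaviour the patch below restores.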
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
 kernel/sched/core.c  | 2 +-
 kernel/sched/fair.c  | 9 ++++++---
 kernel/sched/sched.h | 3 ++-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1..2cf46aa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2577,7 +2577,7 @@ void wake_up_new_task(struct task_struct *p)
         __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
         rq = __task_rq_lock(p, &rf);
-        post_init_entity_util_avg(&p->se);
+        post_init_entity_util_avg(&p->se, true);
 
         activate_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_QUEUED;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 502e95a..71c08a8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -730,7 +730,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
  * if util_avg > util_avg_cap.
  */
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct sched_entity *se, bool update_tg_load)
 {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         struct sched_avg *sa = &se->avg;
@@ -770,7 +770,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
 
         update_cfs_rq_load_avg(now, cfs_rq, false);
         attach_entity_load_avg(cfs_rq, se);
-        update_tg_load_avg(cfs_rq, false);
+        if (update_tg_load)
+                update_tg_load_avg(cfs_rq, false);
 }
 
 #else /* !CONFIG_SMP */
@@ -8872,15 +8873,17 @@ void online_fair_sched_group(struct task_group *tg)
         struct sched_entity *se;
         struct rq *rq;
         int i;
+        bool update_tg_load = true;
 
         for_each_possible_cpu(i) {
                 rq = cpu_rq(i);
                 se = tg->se[i];
 
                 raw_spin_lock_irq(&rq->lock);
-                post_init_entity_util_avg(se);
+                post_init_entity_util_avg(se, update_tg_load);
                 sync_throttle(tg, i);
                 raw_spin_unlock_irq(&rq->lock);
+                update_tg_load = false;
         }
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 055f935..6ab89af 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1356,7 +1356,8 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
-extern void post_init_entity_util_avg(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se,
+                                      bool update_tg_load);
 
 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(struct rq *rq);
-- 
2.9.3