Date: Thu, 20 Aug 2015 03:17:21 +0200
From: Peter Zijlstra
To: byungchul.park@lge.com
Cc: mingo@kernel.org, linux-kernel@vger.kernel.org, yuyang.du@intel.com
Subject: Re: [PATCH v3 0/5] sync a se with its cfs_rq when att(det)aching it
Message-ID: <20150820011721.GA3138@worktop.event.rightround.com>
References: <1439966836-11266-1-git-send-email-byungchul.park@lge.com>
In-Reply-To: <1439966836-11266-1-git-send-email-byungchul.park@lge.com>

I did something like this on top.. please have a look at the XXX and integrate.

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2664,8 +2664,8 @@ static inline u64 cfs_rq_clock_task(stru
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
-        int decayed;
         struct sched_avg *sa = &cfs_rq->avg;
+        int decayed;
 
         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                 long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2695,15 +2695,16 @@ static inline int update_cfs_rq_load_avg
 static inline void update_load_avg(struct sched_entity *se, int update_tg)
 {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
-        int cpu = cpu_of(rq_of(cfs_rq));
         u64 now = cfs_rq_clock_task(cfs_rq);
+        int cpu = cpu_of(rq_of(cfs_rq));
 
         /*
          * Track task load average for carrying it to new CPU after migrated, and
          * track group sched_entity load average for task_h_load calc in migration
          */
         __update_load_avg(now, cpu, &se->avg,
-                se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
+                          se->on_rq * scale_load_down(se->load.weight),
+                          cfs_rq->curr == se, NULL);
 
         if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
                 update_tg_load_avg(cfs_rq, 0);
@@ -2718,9 +2719,10 @@ static void attach_entity_load_avg(struc
          * update here. we have to update it with prev cfs_rq just before changing
          * se's cfs_rq, and get here soon.
          */
-        if (se->avg.last_update_time)
+        if (se->avg.last_update_time) {
                 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                        &se->avg, 0, 0, NULL);
+                                  &se->avg, 0, 0, NULL);
+        }
 
         se->avg.last_update_time = cfs_rq->avg.last_update_time;
         cfs_rq->avg.load_avg += se->avg.load_avg;
@@ -2732,17 +2734,13 @@ static void attach_entity_load_avg(struc
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                        &se->avg, se->on_rq * scale_load_down(se->load.weight),
-                        cfs_rq->curr == se, NULL);
+                          &se->avg, se->on_rq * scale_load_down(se->load.weight),
+                          cfs_rq->curr == se, NULL);
 
-        cfs_rq->avg.load_avg =
-                max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-        cfs_rq->avg.load_sum =
-                max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-        cfs_rq->avg.util_avg =
-                max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-        cfs_rq->avg.util_sum =
-                max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+        cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+        cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+        cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+        cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
@@ -2751,14 +2749,14 @@ enqueue_entity_load_avg(struct cfs_rq *c
 {
         struct sched_avg *sa = &se->avg;
         u64 now = cfs_rq_clock_task(cfs_rq);
-        int migrated = 0, decayed;
+        int migrated, decayed;
 
-        if (sa->last_update_time == 0)
-                migrated = 1;
-        else
+        migrated = !sa->last_update_time;
+        if (!migrated) {
                 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
                         se->on_rq * scale_load_down(se->load.weight),
                         cfs_rq->curr == se, NULL);
+        }
 
         decayed = update_cfs_rq_load_avg(now, cfs_rq);
 
@@ -2781,7 +2779,7 @@ dequeue_entity_load_avg(struct cfs_rq *c
         cfs_rq->runnable_load_avg =
                 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
         cfs_rq->runnable_load_sum =
-                max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+                max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
 }
 
 /*
@@ -2849,6 +2847,11 @@ static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
+static inline void
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+
 static inline int idle_balance(struct rq *rq)
 {
         return 0;
@@ -7915,29 +7918,50 @@ prio_changed_fair(struct rq *rq, struct
         check_preempt_curr(rq, p, 0);
 }
 
-static void detach_task_cfs_rq(struct task_struct *p, int queued)
+static inline bool vruntime_normalized(struct task_struct *p)
 {
+        int queued = task_on_rq_queued(p);
         struct sched_entity *se = &p->se;
-        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
         /*
-         * Ensure the task's vruntime is normalized, so that after attaching
-         * back it to a cfs_rq the enqueue_entity() will do the right thing.
-         *
          * If it's queued, then the dequeue_entity(.flags=0) will already
-         * have normalized the vruntime, if it's !queued, then only when
-         * the task is sleeping it still has a non-normalized vruntime.
-         *
+         * have normalized the vruntime.
+         */
+        if (queued)
+                return true;
+
+        /*
          * When !queued, vruntime of the task has usually NOT been normalized.
          * But there are some cases where it has already been normalized:
-         *
-         * - Moving a forked child which is waiting for being woken up by
+         */
+
+        /*
+         * - a forked child which is waiting for being woken up by
          *   wake_up_new_task().
-         * - Moving a task which has been woken up by try_to_wake_up() and
+         */
+        if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+                return true;
+
+        /*
+         * - a task which has been woken up by try_to_wake_up() and
          *   waiting for actually being woken up by sched_ttwu_pending().
+         *
+         * XXX this appears wrong!! check history,
+         * we appear to always set queued and RUNNING under the same lock instance
+         * might be from before TASK_WAKING ?
          */
-        if (!queued && p->state != TASK_RUNNING &&
-            se->sum_exec_runtime && p->state != TASK_WAKING) {
+        if (p->state == TASK_RUNNING)
+                return true;
+
+        return false;
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+        struct sched_entity *se = &p->se;
+        struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+        if (!vruntime_normalized(p)) {
                 /*
                  * Fix up our vruntime so that the current sleep doesn't
                  * cause 'unlimited' sleep bonus.
@@ -7946,10 +7970,8 @@ static void detach_task_cfs_rq(struct ta
                 se->vruntime -= cfs_rq->min_vruntime;
         }
 
-#ifdef CONFIG_SMP
         /* Catch up with the cfs_rq and remove our load when we leave */
         detach_entity_load_avg(cfs_rq, se);
-#endif
 }
 
 static void attach_task_cfs_rq(struct task_struct *p, int queued)
@@ -7965,49 +7987,23 @@ static void attach_task_cfs_rq(struct ta
         se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
 
-#ifdef CONFIG_SMP
         /* synchronize task with its cfs_rq */
         attach_entity_load_avg(cfs_rq, se);
-#endif
 
-        /*
-         * Ensure the task has a non-normalized vruntime when attaching it
-         * to a cfs_rq, so that enqueue_entity() at wake-up time will do
-         * the right thing.
-         *
-         * If it's queued, then the enqueue_entity(.flags=0) makes the task
-         * has non-normalized vruntime, if it's !queued, then it still has
-         * a normalized vruntime.
-         *
-         * When !queued, a task usually should have a non-normalized vruntime
-         * after being attached to a cfs_rq. But there are some cases where
-         * we should keep it normalized:
-         *
-         * - Moving a forked child which is waiting for being woken up by
-         *   wake_up_new_task().
-         * - Moving a task which has been woken up by try_to_wake_up() and
-         *   waiting for actually being woken up by sched_ttwu_pending().
-         */
-        if (!queued && p->state != TASK_RUNNING &&
-            se->sum_exec_runtime && p->state != TASK_WAKING)
+        if (!vruntime_normalized(p))
                 se->vruntime += cfs_rq->min_vruntime;
 }
 
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
-        detach_task_cfs_rq(p, task_on_rq_queued(p));
+        detach_task_cfs_rq(p);
 }
 
-/*
- * We switched to the sched_fair class.
- */
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
-        int queued = task_on_rq_queued(p);
-
-        attach_task_cfs_rq(p, queued);
+        attach_task_cfs_rq(p);
 
-        if (queued) {
+        if (task_on_rq_queued(p)) {
                 /*
                  * We were most likely switched from sched_rt, so
                  * kick off the schedule if running, otherwise just see
@@ -8056,8 +8052,7 @@ static void task_move_group_fair(struct
 {
         detach_task_cfs_rq(p, queued);
         set_task_rq(p, task_cpu(p));
-
-        /* tell se's cfs_rq has been changed */
+        /* tell se's cfs_rq has been changed -- migrated */
         p->se.avg.last_update_time = 0;
         attach_task_cfs_rq(p, queued);
 }
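
For reference, a standalone sketch of the decision table that the new vruntime_normalized() above encodes. This is a userspace illustration only; the struct, the state constants and the sample values are simplified stand-ins (hypothetical), not the kernel's types or the actual patch:

/*
 * Userspace illustration of the vruntime_normalized() predicate from the
 * diff above. All names, types and state constants here are stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

enum fake_state { FAKE_RUNNING, FAKE_WAKING, FAKE_SLEEPING };

struct fake_task {
        bool queued;                         /* stand-in for task_on_rq_queued(p) */
        unsigned long long sum_exec_runtime; /* 0 => freshly forked, never ran    */
        enum fake_state state;
};

static bool vruntime_normalized(const struct fake_task *p)
{
        /* queued: dequeue_entity(.flags=0) already normalized the vruntime */
        if (p->queued)
                return true;

        /* fresh fork waiting for wake_up_new_task(), or in the middle of a wakeup */
        if (!p->sum_exec_runtime || p->state == FAKE_WAKING)
                return true;

        /* the TASK_RUNNING case -- the one flagged with an XXX in the diff above */
        if (p->state == FAKE_RUNNING)
                return true;

        /* !queued sleeper that has run: vruntime is absolute, not normalized */
        return false;
}

int main(void)
{
        struct fake_task sleeper = { false, 1000, FAKE_SLEEPING };
        struct fake_task forked  = { false, 0,    FAKE_SLEEPING };

        printf("sleeper normalized: %d\n", vruntime_normalized(&sleeper)); /* 0 */
        printf("forked  normalized: %d\n", vruntime_normalized(&forked));  /* 1 */
        return 0;
}

Only the !queued, has-run, sleeping case falls through to false, which is exactly the case where detach_task_cfs_rq() subtracts min_vruntime and attach_task_cfs_rq() adds it back.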