Date: Mon, 1 Jul 2019 16:34:08 -0400
From: Josef Bacik
To: Rik van Riel
Cc: linux-kernel@vger.kernel.org, kernel-team@fb.com, pjt@google.com,
    dietmar.eggemann@arm.com, peterz@infradead.org, mingo@redhat.com,
    morten.rasmussen@arm.com, tglx@linutronix.de, mgorman@techsingularity.net,
    vincent.guittot@linaro.org
Subject: Re: [PATCH 10/10] sched,fair: flatten hierarchical runqueues
Message-ID: <20190701203407.4fmsavivebyrocmw@macbook-pro-91.dhcp.thefacebook.com>
References: <20190628204913.10287-1-riel@surriel.com>
 <20190628204913.10287-11-riel@surriel.com>
In-Reply-To: <20190628204913.10287-11-riel@surriel.com>

On Fri, Jun 28, 2019 at 04:49:13PM -0400, Rik van Riel wrote:
> Flatten the hierarchical runqueues into just the per CPU rq.cfs runqueue.
> 
> Iteration of the sched_entity hierarchy is rate limited to once per jiffy
> per sched_entity, which is a smaller change than it seems, because load
> average adjustments were already rate limited to once per jiffy before this
> patch series.
> 
> This patch breaks CONFIG_CFS_BANDWIDTH. The plan for that is to park tasks
> from throttled cgroups onto their cgroup runqueues, and slowly (using the
> GENTLE_FAIR_SLEEPERS) wake them back up, in vruntime order, once the cgroup
> gets unthrottled, to prevent thundering herd issues.
> 
> Signed-off-by: Rik van Riel
> ---
>  include/linux/sched.h |   2 +
>  kernel/sched/fair.c   | 452 +++++++++++++++---------------------------
>  kernel/sched/pelt.c   |   6 +-
>  kernel/sched/pelt.h   |   2 +-
>  kernel/sched/sched.h  |   2 +-
>  5 files changed, 171 insertions(+), 293 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 84a6cc6f5c47..901c710363e7 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -453,6 +453,8 @@ struct sched_entity {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>          int                     depth;
>          unsigned long           enqueued_h_load;
> +        unsigned long           enqueued_h_weight;
> +        struct load_weight      h_load;
>          struct sched_entity     *parent;
>          /* rq on which this entity is (to be) queued: */
>          struct cfs_rq           *cfs_rq;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6fea8849cc12..c31d3da081fb 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -450,6 +450,19 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
>          }
>  }
>  
> +/* Add the cgroup cfs_rqs to the list, for update_blocked_averages */
> +static void enqueue_entity_cfs_rqs(struct sched_entity *se)
> +{
> +        SCHED_WARN_ON(!entity_is_task(se));
> +
> +        for_each_sched_entity(se) {
> +                struct cfs_rq *cfs_rq = group_cfs_rq_of_parent(se);
> +
> +                if (list_add_leaf_cfs_rq(cfs_rq))
> +                        break;
> +        }
> +}
> +
>  #else /* !CONFIG_FAIR_GROUP_SCHED */
>  
>  static inline struct task_struct *task_of(struct sched_entity *se)
> @@ -672,8 +685,14 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
>   */
>  static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
>  {
> -        if (unlikely(se->load.weight != NICE_0_LOAD))
> +        if (task_se_in_cgroup(se)) {
> +                unsigned long h_weight = task_se_h_weight(se);
> +                if (h_weight != se->h_load.weight)
> +                        update_load_set(&se->h_load, h_weight);
> +                delta = __calc_delta(delta, NICE_0_LOAD, &se->h_load);
> +        } else if (unlikely(se->load.weight != NICE_0_LOAD)) {
>                  delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
> +        }
>  
>          return delta;
>  }
> @@ -687,22 +706,16 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
>  static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>          u64 slice = sysctl_sched_latency;
> +        struct load_weight *load = &cfs_rq->load;
> +        struct load_weight lw;
>  
> -        for_each_sched_entity(se) {
> -                struct load_weight *load;
> -                struct load_weight lw;
> -
> -                cfs_rq = cfs_rq_of(se);
> -                load = &cfs_rq->load;
> +        if (unlikely(!se->on_rq)) {
> +                lw = cfs_rq->load;
>  
> -                if (unlikely(!se->on_rq)) {
> -                        lw = cfs_rq->load;
> -
> -                        update_load_add(&lw, se->load.weight);
> -                        load = &lw;
> -                }
> -                slice = __calc_delta(slice, se->load.weight, load);
> +                update_load_add(&lw, task_se_h_weight(se));
> +                load = &lw;
>          }
> +        slice = __calc_delta(slice, task_se_h_weight(se), load);
>  
>          /*
>           * To avoid cache thrashing, run at least sysctl_sched_min_granularity.
> @@ -2703,16 +2716,28 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
>  static void
>  account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -        update_load_add(&cfs_rq->load, se->load.weight);
> -        if (!parent_entity(se))
> +        struct rq *rq;
> +
> +        if (task_se_in_cgroup(se)) {
> +                struct cfs_rq *cgroup_rq = group_cfs_rq_of_parent(se);
> +                unsigned long h_weight;
> +
> +                update_load_add(&cgroup_rq->load, se->load.weight);
> +                cgroup_rq->nr_running++;
> +
> +                /* Add the hierarchical weight to the CPU rq */
> +                h_weight = task_se_h_weight(se);
> +                se->enqueued_h_weight = h_weight;
> +                update_load_add(&rq_of(cfs_rq)->load, h_weight);

Ok I think this is where we need to handle the newly enqueued se's potential
weight.  Right now task_se_h_weight() is based on its weight _and_ its load,
so it's really task_se_h_weighted_load_avg, and not really task_se_h_weight,
right?  Instead we should separate these two things out: one that gives us
our hierarchical load, and one that gives us our hierarchical weight based
purely on the ratio of (our weight) / (total se weight on parent).  Then we
can take this number, max it with the hierarchical load avg, and use that
with the update_load_add.  Does that make sense?  Thanks,

Josef
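
To make the suggested split a bit more concrete, below is a minimal,
standalone userspace sketch (not kernel code: the fake_* types, the helper
names, and the example numbers are all invented for illustration) of a
hierarchical weight computed purely from weight ratios, a separately tracked
hierarchical load, and an enqueue contribution taken as the max of the two:

/*
 * Standalone sketch of the split suggested above: a "pure" hierarchical
 * weight computed only from weight ratios, kept separate from a
 * hierarchical load, with the enqueue contribution taken as the max of
 * the two.  Plain userspace C; compile with: gcc -Wall sketch.c
 */
#include <stdio.h>

#define NICE_0_LOAD 1024UL

struct fake_cfs_rq {
        unsigned long weight_sum;       /* total weight of entities queued here */
};

struct fake_se {
        unsigned long weight;           /* this entity's own weight */
        unsigned long load_avg;         /* this entity's own load average */
        struct fake_cfs_rq *cfs_rq;     /* runqueue this entity is queued on */
        struct fake_se *parent;         /* group entity one level up, NULL at root */
};

/* Hierarchical weight: NICE_0_LOAD scaled by the weight ratio at each level. */
static unsigned long fake_se_h_weight(struct fake_se *se)
{
        unsigned long w = NICE_0_LOAD;

        for (; se; se = se->parent)
                w = w * se->weight / se->cfs_rq->weight_sum;

        return w;
}

/*
 * Hierarchical load: the task's load scaled by the ancestors' weight ratios
 * (a simplification, but it keeps the "load" number separate from the
 * "weight" number, which is the point of the split).
 */
static unsigned long fake_se_h_load(struct fake_se *se)
{
        unsigned long l = se->load_avg;

        for (se = se->parent; se; se = se->parent)
                l = l * se->weight / se->cfs_rq->weight_sum;

        return l;
}

int main(void)
{
        /*
         * Root rq holds the group entity plus another weight-1024 entity;
         * the group holds two weight-1024 tasks.
         */
        struct fake_cfs_rq root = { .weight_sum = 2048 };
        struct fake_cfs_rq grp  = { .weight_sum = 2048 };
        struct fake_se grp_se   = { .weight = 1024, .load_avg = 300,
                                    .cfs_rq = &root };
        struct fake_se task_se  = { .weight = 1024, .load_avg = 100,
                                    .cfs_rq = &grp, .parent = &grp_se };

        unsigned long h_weight = fake_se_h_weight(&task_se);
        unsigned long h_load   = fake_se_h_load(&task_se);
        /* Enqueue contribution per the suggestion above: max of the two. */
        unsigned long contrib  = h_weight > h_load ? h_weight : h_load;

        printf("h_weight=%lu h_load=%lu enqueue contribution=%lu\n",
               h_weight, h_load, contrib);
        return 0;
}

With two weight-1024 tasks queued in a group that itself owns half of the
CPU runqueue's weight, the sketch prints h_weight=256 and h_load=50, so the
max-based enqueue contribution would be the weight-ratio bound (256) rather
than the much smaller instantaneous load.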