Date: Mon, 28 Mar 2016 14:33:33 +0530
From: Viresh Kumar <viresh.kumar@linaro.org>
To: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Linux PM list <linux-pm@vger.kernel.org>, Juri Lelli <juri.lelli@arm.com>,
        Steve Muckle <steve.muckle@linaro.org>,
        ACPI Devel Maling List <linux-acpi@vger.kernel.org>,
        Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
        Peter Zijlstra <peterz@infradead.org>,
        Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>,
        Vincent Guittot <vincent.guittot@linaro.org>,
        Michael Turquette <mturquette@baylibre.com>,
        Ingo Molnar <mingo@kernel.org>
Subject: Re: [PATCH v6 7/7][Resend] cpufreq: schedutil: New governor based on
 scheduler utilization data
Message-ID: <20160328090333.GK32495@vireshk-i7>
References: <7262976.zPkLj56ATU@vostro.rjw.lan>
 <6666532.7ULg06hQ7e@vostro.rjw.lan>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <6666532.7ULg06hQ7e@vostro.rjw.lan>
User-Agent: Mutt/1.5.24 (2015-08-30)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 17468
Lines: 605

On 22-03-16, 02:54, Rafael J. Wysocki wrote:
> Index: linux-pm/kernel/sched/cpufreq_schedutil.c
> ===================================================================
> --- /dev/null
> +++ linux-pm/kernel/sched/cpufreq_schedutil.c
> @@ -0,0 +1,528 @@
> +/*
> + * CPUFreq governor based on scheduler-provided CPU utilization data.
> + *
> + * Copyright (C) 2016, Intel Corporation
> + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/cpufreq.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <trace/events/power.h>
> +
> +#include "sched.h"
> +
> +struct sugov_tunables {
> +	struct gov_attr_set attr_set;
> +	unsigned int rate_limit_us;
> +};
> +
> +struct sugov_policy {
> +	struct cpufreq_policy *policy;
> +
> +	struct sugov_tunables *tunables;
> +	struct list_head tunables_hook;
> +
> +	raw_spinlock_t update_lock;  /* For shared policies */
> +	u64 last_freq_update_time;
> +	s64 freq_update_delay_ns;

And why isn't it part of sugov_tunables? Its gonna be same for all policies
sharing tunables ..

> +	unsigned int next_freq;
> +
> +	/* The next fields are only needed if fast switch cannot be used. */
> +	struct irq_work irq_work;
> +	struct work_struct work;
> +	struct mutex work_lock;
> +	bool work_in_progress;
> +
> +	bool need_freq_update;
> +};
> +
> +struct sugov_cpu {
> +	struct update_util_data update_util;
> +	struct sugov_policy *sg_policy;
> +
> +	/* The fields below are only needed when sharing a policy. */
> +	unsigned long util;
> +	unsigned long max;
> +	u64 last_update;
> +};
> +
> +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
> +
> +/************************ Governor internals ***********************/
> +
> +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)

To make its purpose clear, maybe name it as: sugov_should_reevaluate_freq(),
because we aren't updating the freq just yet, but deciding if we need to
reevaluate again or not.

As its going to be called from hotpath, maybe mark it as inline and let compiler
decide ?

> +{
> +	u64 delta_ns;
> +
> +	if (sg_policy->work_in_progress)
> +		return false;
> +
> +	if (unlikely(sg_policy->need_freq_update)) {
> +		sg_policy->need_freq_update = false;
> +		return true;
> +	}
> +
> +	delta_ns = time - sg_policy->last_freq_update_time;
> +	return (s64)delta_ns >= sg_policy->freq_update_delay_ns;
> +}
> +
> +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,

Maybe sugov_update_freq() ?

> +				unsigned int next_freq)
> +{
> +	struct cpufreq_policy *policy = sg_policy->policy;
> +
> +	sg_policy->last_freq_update_time = time;
> +
> +	if (policy->fast_switch_enabled) {
> +		if (next_freq > policy->max)
> +			next_freq = policy->max;
> +		else if (next_freq < policy->min)
> +			next_freq = policy->min;
> +
> +		if (sg_policy->next_freq == next_freq) {
> +			trace_cpu_frequency(policy->cur, smp_processor_id());
> +			return;
> +		}
> +		sg_policy->next_freq = next_freq;

Why not do all of above stuff as part of else block as well and move it before
the if {} block ?

> +		next_freq = cpufreq_driver_fast_switch(policy, next_freq);
> +		if (next_freq == CPUFREQ_ENTRY_INVALID)
> +			return;
> +
> +		policy->cur = next_freq;
> +		trace_cpu_frequency(next_freq, smp_processor_id());
> +	} else if (sg_policy->next_freq != next_freq) {
> +		sg_policy->next_freq = next_freq;
> +		sg_policy->work_in_progress = true;
> +		irq_work_queue(&sg_policy->irq_work);
> +	}
> +}
> +
> +/**
> + * get_next_freq - Compute a new frequency for a given cpufreq policy.
> + * @policy: cpufreq policy object to compute the new frequency for.
> + * @util: Current CPU utilization.
> + * @max: CPU capacity.
> + *
> + * If the utilization is frequency-invariant, choose the new frequency to be
> + * proportional to it, that is
> + *
> + * next_freq = C * max_freq * util / max
> + *
> + * Otherwise, approximate the would-be frequency-invariant utilization by
> + * util_raw * (curr_freq / max_freq) which leads to
> + *
> + * next_freq = C * curr_freq * util_raw / max
> + *
> + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
> + */
> +static unsigned int get_next_freq(struct cpufreq_policy *policy,
> +				  unsigned long util, unsigned long max)
> +{
> +	unsigned int freq = arch_scale_freq_invariant() ?
> +				policy->cpuinfo.max_freq : policy->cur;
> +
> +	return (freq + (freq >> 2)) * util / max;
> +}
> +
> +static void sugov_update_single(struct update_util_data *hook, u64 time,
> +				unsigned long util, unsigned long max)
> +{
> +	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
> +	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
> +	struct cpufreq_policy *policy = sg_policy->policy;
> +	unsigned int next_f;
> +
> +	if (!sugov_should_update_freq(sg_policy, time))
> +		return;
> +
> +	next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
> +			get_next_freq(policy, util, max);
> +	sugov_update_commit(sg_policy, time, next_f);
> +}
> +
> +static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
> +					   unsigned long util, unsigned long max)
> +{
> +	struct cpufreq_policy *policy = sg_policy->policy;
> +	unsigned int max_f = policy->cpuinfo.max_freq;
> +	u64 last_freq_update_time = sg_policy->last_freq_update_time;
> +	unsigned int j;
> +
> +	if (util == ULONG_MAX)
> +		return max_f;
> +
> +	for_each_cpu(j, policy->cpus) {
> +		struct sugov_cpu *j_sg_cpu;
> +		unsigned long j_util, j_max;
> +		u64 delta_ns;
> +
> +		if (j == smp_processor_id())
> +			continue;

Why skip local CPU completely ? And if we really want to do that, what about
something like for_each_cpu_and_not to kill the unnecessary if {} statement ?

> +
> +		j_sg_cpu = &per_cpu(sugov_cpu, j);
> +		/*
> +		 * If the CPU utilization was last updated before the previous
> +		 * frequency update and the time elapsed between the last update
> +		 * of the CPU utilization and the last frequency update is long
> +		 * enough, don't take the CPU into account as it probably is
> +		 * idle now.
> +		 */
> +		delta_ns = last_freq_update_time - j_sg_cpu->last_update;
> +		if ((s64)delta_ns > TICK_NSEC)
> +			continue;
> +
> +		j_util = j_sg_cpu->util;
> +		if (j_util == ULONG_MAX)
> +			return max_f;
> +
> +		j_max = j_sg_cpu->max;
> +		if (j_util * max > j_max * util) {
> +			util = j_util;
> +			max = j_max;
> +		}
> +	}
> +
> +	return get_next_freq(policy, util, max);
> +}
> +
> +static void sugov_update_shared(struct update_util_data *hook, u64 time,
> +				unsigned long util, unsigned long max)
> +{
> +	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
> +	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
> +	unsigned int next_f;
> +
> +	raw_spin_lock(&sg_policy->update_lock);
> +
> +	sg_cpu->util = util;
> +	sg_cpu->max = max;
> +	sg_cpu->last_update = time;
> +
> +	if (sugov_should_update_freq(sg_policy, time)) {
> +		next_f = sugov_next_freq_shared(sg_policy, util, max);
> +		sugov_update_commit(sg_policy, time, next_f);
> +	}
> +
> +	raw_spin_unlock(&sg_policy->update_lock);
> +}
> +
> +static void sugov_work(struct work_struct *work)
> +{
> +	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
> +
> +	mutex_lock(&sg_policy->work_lock);
> +	__cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
> +				CPUFREQ_RELATION_L);
> +	mutex_unlock(&sg_policy->work_lock);
> +
> +	sg_policy->work_in_progress = false;
> +}
> +
> +static void sugov_irq_work(struct irq_work *irq_work)
> +{
> +	struct sugov_policy *sg_policy;
> +
> +	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
> +	schedule_work_on(smp_processor_id(), &sg_policy->work);
> +}
> +
> +/************************** sysfs interface ************************/
> +
> +static struct sugov_tunables *global_tunables;
> +static DEFINE_MUTEX(global_tunables_lock);
> +
> +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
> +{
> +	return container_of(attr_set, struct sugov_tunables, attr_set);
> +}
> +
> +static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
> +{
> +	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
> +
> +	return sprintf(buf, "%u\n", tunables->rate_limit_us);
> +}
> +
> +static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
> +				   size_t count)
> +{
> +	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
> +	struct sugov_policy *sg_policy;
> +	unsigned int rate_limit_us;
> +	int ret;
> +
> +	ret = sscanf(buf, "%u", &rate_limit_us);

checkpatch warns for this, we should be using kstrtou32 here ..

> +	if (ret != 1)
> +		return -EINVAL;
> +
> +	tunables->rate_limit_us = rate_limit_us;
> +
> +	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
> +		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
> +
> +	return count;
> +}
> +
> +static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

Why not reuse gov_attr_rw() ?

> +
> +static struct attribute *sugov_attributes[] = {
> +	&rate_limit_us.attr,
> +	NULL
> +};
> +
> +static struct kobj_type sugov_tunables_ktype = {
> +	.default_attrs = sugov_attributes,
> +	.sysfs_ops = &governor_sysfs_ops,
> +};
> +
> +/********************** cpufreq governor interface *********************/
> +
> +static struct cpufreq_governor schedutil_gov;
> +
> +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
> +{
> +	struct sugov_policy *sg_policy;
> +
> +	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
> +	if (!sg_policy)
> +		return NULL;
> +
> +	sg_policy->policy = policy;
> +	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
> +	INIT_WORK(&sg_policy->work, sugov_work);
> +	mutex_init(&sg_policy->work_lock);
> +	raw_spin_lock_init(&sg_policy->update_lock);
> +	return sg_policy;
> +}
> +
> +static void sugov_policy_free(struct sugov_policy *sg_policy)
> +{
> +	mutex_destroy(&sg_policy->work_lock);
> +	kfree(sg_policy);
> +}
> +
> +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
> +{
> +	struct sugov_tunables *tunables;
> +
> +	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
> +	if (tunables)
> +		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
> +
> +	return tunables;
> +}
> +
> +static void sugov_tunables_free(struct sugov_tunables *tunables)
> +{
> +	if (!have_governor_per_policy())
> +		global_tunables = NULL;
> +
> +	kfree(tunables);
> +}
> +
> +static int sugov_init(struct cpufreq_policy *policy)
> +{
> +	struct sugov_policy *sg_policy;
> +	struct sugov_tunables *tunables;
> +	unsigned int lat;
> +	int ret = 0;
> +
> +	/* State should be equivalent to EXIT */
> +	if (policy->governor_data)
> +		return -EBUSY;
> +
> +	sg_policy = sugov_policy_alloc(policy);
> +	if (!sg_policy)
> +		return -ENOMEM;
> +
> +	mutex_lock(&global_tunables_lock);
> +
> +	if (global_tunables) {
> +		if (WARN_ON(have_governor_per_policy())) {
> +			ret = -EINVAL;
> +			goto free_sg_policy;
> +		}
> +		policy->governor_data = sg_policy;
> +		sg_policy->tunables = global_tunables;
> +
> +		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
> +		goto out;
> +	}
> +
> +	tunables = sugov_tunables_alloc(sg_policy);
> +	if (!tunables) {
> +		ret = -ENOMEM;
> +		goto free_sg_policy;
> +	}
> +
> +	tunables->rate_limit_us = LATENCY_MULTIPLIER;
> +	lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
> +	if (lat)
> +		tunables->rate_limit_us *= lat;
> +
> +	if (!have_governor_per_policy())
> +		global_tunables = tunables;

To make sugov_tunables_alloc/free() symmetric to each other, should we move
above into sugov_tunables_alloc() ?

> +
> +	policy->governor_data = sg_policy;
> +	sg_policy->tunables = tunables;
> +
> +	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
> +				   get_governor_parent_kobj(policy), "%s",
> +				   schedutil_gov.name);
> +	if (!ret)
> +		goto out;
> +
> +	/* Failure, so roll back. */
> +	policy->governor_data = NULL;
> +	sugov_tunables_free(tunables);
> +
> + free_sg_policy:
> +	pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret);
> +	sugov_policy_free(sg_policy);

I didn't like the way we have mixed success and failure path here, just to save
a single line of code (unlock).

Over that it does things, that aren't symmetric anymore. For example, we have
called sugov_policy_alloc() without locks and are freeing it from within locks.

> +
> + out:
> +	mutex_unlock(&global_tunables_lock);
> +	return ret;
> +}
> +
> +static int sugov_exit(struct cpufreq_policy *policy)
> +{
> +	struct sugov_policy *sg_policy = policy->governor_data;
> +	struct sugov_tunables *tunables = sg_policy->tunables;
> +	unsigned int count;
> +
> +	mutex_lock(&global_tunables_lock);
> +
> +	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
> +	policy->governor_data = NULL;
> +	if (!count)
> +		sugov_tunables_free(tunables);
> +
> +	mutex_unlock(&global_tunables_lock);
> +
> +	sugov_policy_free(sg_policy);
> +	return 0;
> +}
> +
> +static int sugov_start(struct cpufreq_policy *policy)
> +{
> +	struct sugov_policy *sg_policy = policy->governor_data;
> +	unsigned int cpu;
> +
> +	cpufreq_enable_fast_switch(policy);

Why should we be doing this from START, which gets called a lot compared to
INIT/EXIT? This is something which should be moved to INIT IMHO.

> +	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
> +	sg_policy->last_freq_update_time = 0;
> +	sg_policy->next_freq = UINT_MAX;
> +	sg_policy->work_in_progress = false;
> +	sg_policy->need_freq_update = false;
> +
> +	for_each_cpu(cpu, policy->cpus) {
> +		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
> +
> +		sg_cpu->sg_policy = sg_policy;
> +		if (policy_is_shared(policy)) {
> +			sg_cpu->util = ULONG_MAX;
> +			sg_cpu->max = 0;
> +			sg_cpu->last_update = 0;
> +			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
> +						     sugov_update_shared);
> +		} else {
> +			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
> +						     sugov_update_single);
> +		}
> +	}
> +	return 0;
> +}
> +
> +static int sugov_stop(struct cpufreq_policy *policy)
> +{
> +	struct sugov_policy *sg_policy = policy->governor_data;
> +	unsigned int cpu;
> +
> +	for_each_cpu(cpu, policy->cpus)
> +		cpufreq_remove_update_util_hook(cpu);
> +
> +	synchronize_sched();
> +
> +	irq_work_sync(&sg_policy->irq_work);
> +	cancel_work_sync(&sg_policy->work);

And again, we should have a disable-fast-switch as well..

> +	return 0;
> +}
> +
> +static int sugov_limits(struct cpufreq_policy *policy)
> +{
> +	struct sugov_policy *sg_policy = policy->governor_data;
> +
> +	if (!policy->fast_switch_enabled) {
> +		mutex_lock(&sg_policy->work_lock);
> +
> +		if (policy->max < policy->cur)
> +			__cpufreq_driver_target(policy, policy->max,
> +						CPUFREQ_RELATION_H);
> +		else if (policy->min > policy->cur)
> +			__cpufreq_driver_target(policy, policy->min,
> +						CPUFREQ_RELATION_L);
> +
> +		mutex_unlock(&sg_policy->work_lock);

Maybe we can try to take lock only if we are going to switch the freq, i.e. only
if sugov_limits is called for policy->min/max update?

i.e.

void __sugov_limits(policy, freq, relation)
{
		mutex_lock(&sg_policy->work_lock);
                __cpufreq_driver_target(policy, freq, relation);
		mutex_unlock(&sg_policy->work_lock);
}

static int sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		if (policy->max < policy->cur)
                        __sugov_limits(policy, policy->max, CPUFREQ_RELATION_H);
		else if (policy->min > policy->cur)
                        __sugov_limits(policy, policy->min, CPUFREQ_RELATION_L);
	}

	sg_policy->need_freq_update = true;
	return 0;
}

??

And maybe the same for current governors? (ofcourse in a separate patch, I can
do that if you want).


Also, why not just always do 'sg_policy->need_freq_update = true' from this
routine and remove everything else? It will be taken care of on next evaluation.

> +
> +int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
> +{
> +	if (event == CPUFREQ_GOV_POLICY_INIT) {
> +		return sugov_init(policy);
> +	} else if (policy->governor_data) {
> +		switch (event) {
> +		case CPUFREQ_GOV_POLICY_EXIT:
> +			return sugov_exit(policy);
> +		case CPUFREQ_GOV_START:
> +			return sugov_start(policy);
> +		case CPUFREQ_GOV_STOP:
> +			return sugov_stop(policy);
> +		case CPUFREQ_GOV_LIMITS:
> +			return sugov_limits(policy);
> +		}
> +	}
> +	return -EINVAL;
> +}
> +
> +static struct cpufreq_governor schedutil_gov = {
> +	.name = "schedutil",
> +	.governor = sugov_governor,
> +	.owner = THIS_MODULE,
> +};
> +
> +static int __init sugov_module_init(void)
> +{
> +	return cpufreq_register_governor(&schedutil_gov);
> +}
> +
> +static void __exit sugov_module_exit(void)
> +{
> +	cpufreq_unregister_governor(&schedutil_gov);
> +}
> +
> +MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
> +MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
> +MODULE_LICENSE("GPL");

Maybe a MODULE_ALIAS as well ?

-- 
viresh