2016-04-20 02:39:33

by Steve Muckle

Subject: [RFC PATCH 1/4] cpufreq: governor: support scheduler cpufreq callbacks on remote CPUs

In preparation for the scheduler cpufreq callback happening
on remote CPUs, add support for this in the dbs governors.
The dbs governors make assumptions about the callback occurring
on the CPU being updated.

Signed-off-by: Steve Muckle <[email protected]>
---
drivers/cpufreq/cpufreq_governor.c | 21 +++++++++++++++++++--
drivers/cpufreq/cpufreq_governor.h | 1 +
2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 20f0a4e114d1..429d3a5b9866 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -248,6 +248,20 @@ static void dbs_irq_work(struct irq_work *irq_work)
schedule_work_on(smp_processor_id(), &policy_dbs->work);
}

+#ifdef CONFIG_SMP
+static inline void dbs_irq_work_queue(struct policy_dbs_info *policy_dbs,
+ int cpu)
+{
+ irq_work_queue_on(&policy_dbs->irq_work, cpu);
+}
+#else
+static inline void dbs_irq_work_queue(struct policy_dbs_info *policy_dbs,
+ int cpu)
+{
+ irq_work_queue(&policy_dbs->irq_work);
+}
+#endif
+
static void dbs_update_util_handler(struct update_util_data *data, u64 time,
unsigned long util, unsigned long max)
{
@@ -295,7 +309,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time,

policy_dbs->last_sample_time = time;
policy_dbs->work_in_progress = true;
- irq_work_queue(&policy_dbs->irq_work);
+ dbs_irq_work_queue(policy_dbs, cdbs->cpu);
}

static void gov_set_update_util(struct policy_dbs_info *policy_dbs,
@@ -384,7 +398,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy)
struct dbs_data *dbs_data;
struct policy_dbs_info *policy_dbs;
unsigned int latency;
- int ret = 0;
+ int j, ret = 0;

/* State should be equivalent to EXIT */
if (policy->governor_data)
@@ -394,6 +408,9 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy)
if (!policy_dbs)
return -ENOMEM;

+ for_each_cpu(j, policy->cpus)
+ per_cpu(cpu_dbs, j).cpu = j;
+
/* Protect gov->gdbs_data against concurrent updates. */
mutex_lock(&gov_dbs_data_mutex);

diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 3e0eb7c54903..1d5f4857ff80 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -122,6 +122,7 @@ struct cpu_dbs_info {
unsigned int prev_load;
struct update_util_data update_util;
struct policy_dbs_info *policy_dbs;
+ int cpu;
};

/* Common Governor data across policies */
--
2.4.10


2016-04-20 02:39:39

by Steve Muckle

Subject: [RFC PATCH 2/4] cpufreq: schedutil: support scheduler cpufreq callbacks on remote CPUs

In preparation for the scheduler cpufreq callback happening on remote
CPUs, add support for this in schedutil. Schedutil requires that the
callback occur on the CPU being updated in order to support fast
frequency switches.

Signed-off-by: Steve Muckle <[email protected]>
---
kernel/sched/cpufreq_schedutil.c | 90 ++++++++++++++++++++++++++++++----------
1 file changed, 68 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 154ae3a51e86..6e7cf90d4ea7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -49,6 +49,8 @@ struct sugov_cpu {
unsigned long util;
unsigned long max;
u64 last_update;
+
+ int cpu;
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -76,27 +78,59 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
return delta_ns >= sg_policy->freq_update_delay_ns;
}

-static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
+static void sugov_fast_switch(struct sugov_policy *sg_policy,
+ unsigned int next_freq)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+
+ next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+ if (next_freq == CPUFREQ_ENTRY_INVALID)
+ return;
+
+ policy->cur = next_freq;
+ trace_cpu_frequency(next_freq, smp_processor_id());
+}
+
+#ifdef CONFIG_SMP
+static inline bool sugov_queue_remote_callback(struct sugov_policy *sg_policy,
+ int cpu)
+{
+ if (cpu != smp_processor_id()) {
+ sg_policy->work_in_progress = true;
+ irq_work_queue_on(&sg_policy->irq_work, cpu);
+ return true;
+ }
+
+ return false;
+}
+#else
+static inline bool sugov_queue_remote_callback(struct sugov_policy *sg_policy,
+ int cpu)
+{
+ return false;
+}
+#endif
+
+static void sugov_update_commit(struct sugov_cpu *sg_cpu, u64 time,
unsigned int next_freq)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;

sg_policy->last_freq_update_time = time;

+ if (sg_policy->next_freq == next_freq) {
+ trace_cpu_frequency(policy->cur, sg_cpu->cpu);
+ return;
+ }
+ sg_policy->next_freq = next_freq;
+
+ if (sugov_queue_remote_callback(sg_policy, sg_cpu->cpu))
+ return;
+
if (policy->fast_switch_enabled) {
- if (sg_policy->next_freq == next_freq) {
- trace_cpu_frequency(policy->cur, smp_processor_id());
- return;
- }
- sg_policy->next_freq = next_freq;
- next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (next_freq == CPUFREQ_ENTRY_INVALID)
- return;
-
- policy->cur = next_freq;
- trace_cpu_frequency(next_freq, smp_processor_id());
- } else if (sg_policy->next_freq != next_freq) {
- sg_policy->next_freq = next_freq;
+ sugov_fast_switch(sg_policy, next_freq);
+ } else {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -142,12 +176,13 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,

next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
get_next_freq(policy, util, max);
- sugov_update_commit(sg_policy, time, next_f);
+ sugov_update_commit(sg_cpu, time, next_f);
}

-static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
unsigned long util, unsigned long max)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int max_f = policy->cpuinfo.max_freq;
u64 last_freq_update_time = sg_policy->last_freq_update_time;
@@ -161,10 +196,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
unsigned long j_util, j_max;
s64 delta_ns;

- if (j == smp_processor_id())
+ j_sg_cpu = &per_cpu(sugov_cpu, j);
+ if (j_sg_cpu == sg_cpu)
continue;

- j_sg_cpu = &per_cpu(sugov_cpu, j);
/*
* If the CPU utilization was last updated before the previous
* frequency update and the time elapsed between the last update
@@ -204,8 +239,8 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
sg_cpu->last_update = time;

if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_policy, util, max);
- sugov_update_commit(sg_policy, time, next_f);
+ next_f = sugov_next_freq_shared(sg_cpu, util, max);
+ sugov_update_commit(sg_cpu, time, next_f);
}

raw_spin_unlock(&sg_policy->update_lock);
@@ -226,9 +261,17 @@ static void sugov_work(struct work_struct *work)
static void sugov_irq_work(struct irq_work *irq_work)
{
struct sugov_policy *sg_policy;
+ struct cpufreq_policy *policy;

sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- schedule_work_on(smp_processor_id(), &sg_policy->work);
+ policy = sg_policy->policy;
+
+ if (policy->fast_switch_enabled) {
+ sugov_fast_switch(sg_policy, sg_policy->next_freq);
+ sg_policy->work_in_progress = false;
+ } else {
+ schedule_work_on(smp_processor_id(), &sg_policy->work);
+ }
}

/************************** sysfs interface ************************/
@@ -330,7 +373,7 @@ static int sugov_init(struct cpufreq_policy *policy)
struct sugov_policy *sg_policy;
struct sugov_tunables *tunables;
unsigned int lat;
- int ret = 0;
+ int cpu, ret = 0;

/* State should be equivalent to EXIT */
if (policy->governor_data)
@@ -340,6 +383,9 @@ static int sugov_init(struct cpufreq_policy *policy)
if (!sg_policy)
return -ENOMEM;

+ for_each_cpu(cpu, policy->cpus)
+ per_cpu(sugov_cpu, cpu).cpu = cpu;
+
mutex_lock(&global_tunables_lock);

if (global_tunables) {
--
2.4.10

2016-04-20 02:39:37

by Steve Muckle

Subject: [RFC PATCH 3/4] intel_pstate: support scheduler cpufreq callbacks on remote CPUs

In preparation for the scheduler cpufreq callback happening on remote
CPUs, add support for this in intel_pstate, which requires that the
callback run on the CPU being updated in order to change that CPU's
frequency.

Signed-off-by: Steve Muckle <[email protected]>
---
drivers/cpufreq/intel_pstate.c | 88 +++++++++++++++++++++++++++++++++++++++---
1 file changed, 83 insertions(+), 5 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 6c7cff13f0ed..fa49d3944aa5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -162,6 +162,9 @@ struct _pid {
* struct cpudata - Per CPU instance data storage
* @cpu: CPU number for this instance data
* @update_util: CPUFreq utility callback information
+ * @irq_work: Data for passing remote callbacks to the target CPU
+ * @time: Timestamp of CPUFreq callback
+ * @ipi_in_progress: Whether a remote callback IPI is outstanding
* @pstate: Stores P state limits for this CPU
* @vid: Stores VID limits for this CPU
* @pid: Stores PID parameters for this CPU
@@ -179,6 +182,9 @@ struct cpudata {
int cpu;

struct update_util_data update_util;
+ struct irq_work irq_work;
+ u64 time;
+ bool ipi_in_progress;

struct pstate_data pstate;
struct vid_data vid;
@@ -1173,20 +1179,88 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
get_avg_frequency(cpu));
}

+static void _intel_pstate_update_util(struct cpudata *cpu, u64 time)
+{
+ bool sample_taken = intel_pstate_sample(cpu, time);
+
+ if (sample_taken && !hwp_active)
+ intel_pstate_adjust_busy_pstate(cpu);
+}
+
+#ifdef CONFIG_SMP
+static void intel_pstate_update_util_remote(struct irq_work *irq_work)
+{
+ struct cpudata *cpu = container_of(irq_work, struct cpudata, irq_work);
+ s64 delta_ns = cpu->time - cpu->sample.time;
+
+ /*
+ * A local update may have happened while the ipi
+ * was in progress so re-check the time.
+ */
+ if (delta_ns < pid_params.sample_rate_ns)
+ return;
+
+ _intel_pstate_update_util(cpu, cpu->time);
+
+ cpu->ipi_in_progress = false;
+}
+
static void intel_pstate_update_util(struct update_util_data *data, u64 time,
unsigned long util, unsigned long max)
{
struct cpudata *cpu = container_of(data, struct cpudata, update_util);
- u64 delta_ns = time - cpu->sample.time;
+ s64 delta_ns = time - cpu->sample.time;

- if ((s64)delta_ns >= pid_params.sample_rate_ns) {
- bool sample_taken = intel_pstate_sample(cpu, time);
+ if (delta_ns < pid_params.sample_rate_ns)
+ return;

- if (sample_taken && !hwp_active)
- intel_pstate_adjust_busy_pstate(cpu);
+ if (cpu->cpu == smp_processor_id()) {
+ _intel_pstate_update_util(cpu, time);
+ } else {
+ /* The target CPU's rq lock is held. */
+ if (cpu->ipi_in_progress)
+ return;
+
+ /* Re-check sample_time which may have advanced. */
+ smp_rmb();
+ delta_ns = time - READ_ONCE(cpu->sample.time);
+ if (delta_ns < pid_params.sample_rate_ns)
+ return;
+
+ cpu->ipi_in_progress = true;
+ cpu->time = time;
+ irq_work_queue_on(&cpu->irq_work, cpu->cpu);
}
}

+static inline void intel_pstate_irq_work_sync(unsigned int cpu)
+{
+ irq_work_sync(&all_cpu_data[cpu]->irq_work);
+}
+
+static inline void intel_pstate_init_irq_work(struct cpudata *cpu)
+{
+ init_irq_work(&cpu->irq_work, intel_pstate_update_util_remote);
+}
+#else /* !CONFIG_SMP */
+static inline void intel_pstate_irq_work_sync(unsigned int cpu) {}
+static inline void intel_pstate_init_irq_work(struct cpudata *cpu) {}
+
+static void intel_pstate_update_util(struct update_util_data *data, u64 time,
+ unsigned long util, unsigned long max)
+{
+ struct cpudata *cpu = container_of(data, struct cpudata, update_util);
+ s64 delta_ns = time - cpu->sample.time;
+
+ if (delta_ns < pid_params.sample_rate_ns)
+ return;
+
+ _intel_pstate_update_util(cpu, time);
+}
+#endif
+
+
+
#define ICPU(model, policy) \
{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
(unsigned long)&policy }
@@ -1273,6 +1347,7 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu)
{
cpufreq_remove_update_util_hook(cpu);
synchronize_sched();
+ intel_pstate_irq_work_sync(cpu);
}

static void intel_pstate_set_performance_limits(struct perf_limits *limits)
@@ -1379,6 +1454,9 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)

cpu = all_cpu_data[policy->cpu];

+ intel_pstate_init_irq_work(cpu);
+
+
if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
policy->policy = CPUFREQ_POLICY_PERFORMANCE;
else
--
2.4.10

2016-04-20 02:40:06

by Steve Muckle

Subject: [RFC PATCH 4/4] sched/fair: call cpufreq hook for remote wakeups

Without calling the cpufreq hook for a remote wakeup it is possible
for such a wakeup to go unnoticed by cpufreq on the target CPU for up
to a full tick. This can occur if the target CPU is running a
CPU-bound task.

Signed-off-by: Steve Muckle <[email protected]>
---
kernel/sched/fair.c | 8 +++-----
kernel/sched/sched.h | 17 ++++++++++-------
2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b06c1e938cb9..d21a80a44b6e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2826,15 +2826,13 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);

- if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+ if (&rq->cfs == cfs_rq) {
unsigned long max = rq->cpu_capacity_orig;

/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
+ * a real problem.
*
* It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization
@@ -2845,7 +2843,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* See cpu_util().
*/
- cpufreq_update_util(rq_clock(rq),
+ cpufreq_update_util(cpu, rq_clock(rq),
min(cfs_rq->avg.util_avg, max), max);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 921d6e5d33b7..a8a1eb603263 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1799,6 +1799,7 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);

/**
* cpufreq_update_util - Take a note about CPU utilization changes.
+ * @cpu: Target CPU.
* @time: Current time.
* @util: Current utilization.
* @max: Utilization ceiling.
@@ -1808,13 +1809,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
*
* It can only be called from RCU-sched read-side critical sections.
*/
-static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
+static inline void cpufreq_update_util(int cpu, u64 time, unsigned long util,
+ unsigned long max)
{
- struct update_util_data *data;
+ struct update_util_data *data;

- data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
- if (data)
- data->func(data, time, util, max);
+ data = rcu_dereference_sched(per_cpu(cpufreq_update_util_data, cpu));
+ if (data)
+ data->func(data, time, util, max);
}

/**
@@ -1835,10 +1837,11 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo
*/
static inline void cpufreq_trigger_update(u64 time)
{
- cpufreq_update_util(time, ULONG_MAX, 0);
+ cpufreq_update_util(smp_processor_id(), time, ULONG_MAX, 0);
}
#else
-static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
+static inline void cpufreq_update_util(int cpu, u64 time, unsigned long util,
+ unsigned long max) {}
static inline void cpufreq_trigger_update(u64 time) {}
#endif /* CONFIG_CPU_FREQ */

--
2.4.10

2016-04-20 12:23:18

by Rafael J. Wysocki

Subject: Re: [RFC PATCH 1/4] cpufreq: governor: support scheduler cpufreq callbacks on remote CPUs

On Tuesday, April 19, 2016 07:39:26 PM Steve Muckle wrote:

You could have added a cover [0/4] message which would have made it easier
to reply to the entire series in general. Let me do it here.

Doing it the way it is done in this series would be fine by me in general
(up to a few more or less minor comments), but it is still unclear to me
how much of a difference these changes would make in terms of improved
response times etc.

> In preparation for the scheduler cpufreq callback happening
> on remote CPUs, add support for this in the dbs governors.
> The dbs governors make assumptions about the callback occurring
> on the CPU being updated.

While the above is generally correct, it would be nice to say more about what
happens in the patch. Like:

"To that end, add a CPU number field to struct cpu_dbs_info and modify
dbs_update_util_handler() to schedule IRQ works on target CPUs rather than on
the local one only."

> Signed-off-by: Steve Muckle <[email protected]>
> ---
> drivers/cpufreq/cpufreq_governor.c | 21 +++++++++++++++++++--
> drivers/cpufreq/cpufreq_governor.h | 1 +
> 2 files changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 20f0a4e114d1..429d3a5b9866 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -248,6 +248,20 @@ static void dbs_irq_work(struct irq_work *irq_work)
> schedule_work_on(smp_processor_id(), &policy_dbs->work);
> }
>
> +#ifdef CONFIG_SMP
> +static inline void dbs_irq_work_queue(struct policy_dbs_info *policy_dbs,
> + int cpu)
> +{
> + irq_work_queue_on(&policy_dbs->irq_work, cpu);
> +}
> +#else
> +static inline void dbs_irq_work_queue(struct policy_dbs_info *policy_dbs,
> + int cpu)
> +{
> + irq_work_queue(&policy_dbs->irq_work);
> +}
> +#endif
> +
> static void dbs_update_util_handler(struct update_util_data *data, u64 time,
> unsigned long util, unsigned long max)
> {
> @@ -295,7 +309,7 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time,
>
> policy_dbs->last_sample_time = time;
> policy_dbs->work_in_progress = true;
> - irq_work_queue(&policy_dbs->irq_work);
> + dbs_irq_work_queue(policy_dbs, cdbs->cpu);
> }
>
> static void gov_set_update_util(struct policy_dbs_info *policy_dbs,
> @@ -384,7 +398,7 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy)
> struct dbs_data *dbs_data;
> struct policy_dbs_info *policy_dbs;
> unsigned int latency;
> - int ret = 0;
> + int j, ret = 0;
>
> /* State should be equivalent to EXIT */
> if (policy->governor_data)
> @@ -394,6 +408,9 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy)
> if (!policy_dbs)
> return -ENOMEM;
>
> + for_each_cpu(j, policy->cpus)
> + per_cpu(cpu_dbs, j).cpu = j;
> +
> /* Protect gov->gdbs_data against concurrent updates. */
> mutex_lock(&gov_dbs_data_mutex);
>
> diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
> index 3e0eb7c54903..1d5f4857ff80 100644
> --- a/drivers/cpufreq/cpufreq_governor.h
> +++ b/drivers/cpufreq/cpufreq_governor.h
> @@ -122,6 +122,7 @@ struct cpu_dbs_info {
> unsigned int prev_load;
> struct update_util_data update_util;
> struct policy_dbs_info *policy_dbs;
> + int cpu;
> };

Wouldn't it be better to add the cpu field to struct update_util_data and set
it from cpufreq_add_update_util_hook()?

That would allow you to avoid adding the cpu field to struct sugov_cpu in the
second patch at least.
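
That is, something like this (just an untested sketch against the current
cpufreq_add_update_util_hook(); exact context lines may differ):

struct update_util_data {
	void (*func)(struct update_util_data *data, u64 time,
		     unsigned long util, unsigned long max);
	int cpu;	/* CPU this hook is registered for */
};

void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
			void (*func)(struct update_util_data *data, u64 time,
				     unsigned long util, unsigned long max))
{
	if (WARN_ON(!data || !func))
		return;

	data->func = func;
	data->cpu = cpu;	/* new: remember the target CPU */
	rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
}

Then the governors could read the cpu field of the hook in their callbacks
instead of carrying their own copies of it.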

Thanks,
Rafael

2016-04-20 12:34:29

by Rafael J. Wysocki

Subject: Re: [RFC PATCH 3/4] intel_pstate: support scheduler cpufreq callbacks on remote CPUs

On Tuesday, April 19, 2016 07:39:28 PM Steve Muckle wrote:
> In preparation for the scheduler cpufreq callback happening on remote
> CPUs, add support for this in intel_pstate, which requires that the
> callback run on the CPU being updated in order to change that CPU's
> frequency.
>
> Signed-off-by: Steve Muckle <[email protected]>
> ---
> drivers/cpufreq/intel_pstate.c | 88 +++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 83 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 6c7cff13f0ed..fa49d3944aa5 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -162,6 +162,9 @@ struct _pid {
> * struct cpudata - Per CPU instance data storage
> * @cpu: CPU number for this instance data
> * @update_util: CPUFreq utility callback information
> + * @irq_work: Data for passing remote callbacks to the target CPU
> + * @time: Timestamp of CPUFreq callback
> + * @ipi_in_progress: Whether a remote callback IPI is outstanding
> * @pstate: Stores P state limits for this CPU
> * @vid: Stores VID limits for this CPU
> * @pid: Stores PID parameters for this CPU
> @@ -179,6 +182,9 @@ struct cpudata {
> int cpu;
>
> struct update_util_data update_util;
> + struct irq_work irq_work;
> + u64 time;
> + bool ipi_in_progress;
>
> struct pstate_data pstate;
> struct vid_data vid;
> @@ -1173,20 +1179,88 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
> get_avg_frequency(cpu));
> }
>
> +static void _intel_pstate_update_util(struct cpudata *cpu, u64 time)

What about calling this intel_pstate_update_cpu()?

> +{
> + bool sample_taken = intel_pstate_sample(cpu, time);
> +
> + if (sample_taken && !hwp_active)
> + intel_pstate_adjust_busy_pstate(cpu);
> +}
> +
> +#ifdef CONFIG_SMP
> +static void intel_pstate_update_util_remote(struct irq_work *irq_work)
> +{
> + struct cpudata *cpu = container_of(irq_work, struct cpudata, irq_work);
> + s64 delta_ns = cpu->time - cpu->sample.time;
> +
> + /*
> + * A local update may have happened while the ipi
> + * was in progress so re-check the time.
> + */
> + if (delta_ns < pid_params.sample_rate_ns)
> + return;
> +
> + _intel_pstate_update_util(cpu, cpu->time);
> +
> + cpu->ipi_in_progress = false;
> +}
> +
> static void intel_pstate_update_util(struct update_util_data *data, u64 time,
> unsigned long util, unsigned long max)
> {
> struct cpudata *cpu = container_of(data, struct cpudata, update_util);
> - u64 delta_ns = time - cpu->sample.time;
> + s64 delta_ns = time - cpu->sample.time;
>
> - if ((s64)delta_ns >= pid_params.sample_rate_ns) {
> - bool sample_taken = intel_pstate_sample(cpu, time);
> + if (delta_ns < pid_params.sample_rate_ns)

Why don't you check cpu->ipi_in_progress here too and bail out if it is set?

That would allow you to avoid checking the time again below, wouldn't it?

> + return;
>
> - if (sample_taken && !hwp_active)
> - intel_pstate_adjust_busy_pstate(cpu);
> + if (cpu->cpu == smp_processor_id()) {
> + _intel_pstate_update_util(cpu, time);
> + } else {
> + /* The target CPU's rq lock is held. */
> + if (cpu->ipi_in_progress)
> + return;
> +
> + /* Re-check sample_time which may have advanced. */
> + smp_rmb();
> + delta_ns = time - READ_ONCE(cpu->sample.time);
> + if (delta_ns < pid_params.sample_rate_ns)
> + return;
> +
> + cpu->ipi_in_progress = true;
> + cpu->time = time;
> + irq_work_queue_on(&cpu->irq_work, cpu->cpu);
> }
> }
>
> +static inline void intel_pstate_irq_work_sync(unsigned int cpu)
> +{
> + irq_work_sync(&all_cpu_data[cpu]->irq_work);
> +}
> +
> +static inline void intel_pstate_init_irq_work(struct cpudata *cpu)
> +{
> + init_irq_work(&cpu->irq_work, intel_pstate_update_util_remote);
> +}
> +#else /* !CONFIG_SMP */
> +static inline void intel_pstate_irq_work_sync(unsigned int cpu) {}
> +static inline void intel_pstate_init_irq_work(struct cpudata *cpu) {}
> +
> +static void intel_pstate_update_util(struct update_util_data *data, u64 time,
> + unsigned long util, unsigned long max)
> +{
> + struct cpudata *cpu = container_of(data, struct cpudata, update_util);
> + s64 delta_ns = time - cpu->sample.time;
> +
> + if (delta_ns < pid_params.sample_rate_ns)
> + return;
> +
> + _intel_pstate_update_util(cpu, time);
> +}
> +#endif
> +
> +
> +

The additional two empty lines are not necessary.

> #define ICPU(model, policy) \
> { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
> (unsigned long)&policy }
> @@ -1273,6 +1347,7 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu)
> {
> cpufreq_remove_update_util_hook(cpu);
> synchronize_sched();
> + intel_pstate_irq_work_sync(cpu);
> }
>
> static void intel_pstate_set_performance_limits(struct perf_limits *limits)
> @@ -1379,6 +1454,9 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
>
> cpu = all_cpu_data[policy->cpu];
>
> + intel_pstate_init_irq_work(cpu);
> +
> +

One additional empty line should be sufficient here.

> if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
> policy->policy = CPUFREQ_POLICY_PERFORMANCE;
> else
>

Thanks,
Rafael

2016-04-21 02:20:22

by Steve Muckle

Subject: Re: [RFC PATCH 3/4] intel_pstate: support scheduler cpufreq callbacks on remote CPUs

On Wed, Apr 20, 2016 at 02:37:18PM +0200, Rafael J. Wysocki wrote:
...
> > @@ -1173,20 +1179,88 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
> > get_avg_frequency(cpu));
> > }
> >
> > +static void _intel_pstate_update_util(struct cpudata *cpu, u64 time)
>
> What about calling this intel_pstate_update_cpu()?

Sure will change.

...
> > static void intel_pstate_update_util(struct update_util_data *data, u64 time,
> > unsigned long util, unsigned long max)
> > {
> > struct cpudata *cpu = container_of(data, struct cpudata, update_util);
> > - u64 delta_ns = time - cpu->sample.time;
> > + s64 delta_ns = time - cpu->sample.time;
> >
> > - if ((s64)delta_ns >= pid_params.sample_rate_ns) {
> > - bool sample_taken = intel_pstate_sample(cpu, time);
> > + if (delta_ns < pid_params.sample_rate_ns)
>
> Why don't you check cpu->ipi_in_progress here too and bail out if it is set?
>
> That would allow you to avoid checking the time again below, wouldn't it?

Yeah I think that should work. I can't recall why I thought I needed
to check the time first, then ipi_in_progress, then the time. As long
as ipi_in_progress is checked prior to the time, it should be fine.
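
I.e. something along these lines (untested sketch):

static void intel_pstate_update_util(struct update_util_data *data, u64 time,
				     unsigned long util, unsigned long max)
{
	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
	s64 delta_ns;

	/* Bail out early if a remote update is already queued. */
	if (cpu->ipi_in_progress)
		return;

	delta_ns = time - cpu->sample.time;
	if (delta_ns < pid_params.sample_rate_ns)
		return;

	if (cpu->cpu == smp_processor_id()) {
		_intel_pstate_update_util(cpu, time);
	} else {
		cpu->ipi_in_progress = true;
		cpu->time = time;
		irq_work_queue_on(&cpu->irq_work, cpu->cpu);
	}
}

with intel_pstate_update_util_remote() still re-checking cpu->time against
cpu->sample.time before taking the sample.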

>
> > + return;
> >
> > - if (sample_taken && !hwp_active)
> > - intel_pstate_adjust_busy_pstate(cpu);
> > + if (cpu->cpu == smp_processor_id()) {
> > + _intel_pstate_update_util(cpu, time);
> > + } else {
> > + /* The target CPU's rq lock is held. */
> > + if (cpu->ipi_in_progress)
> > + return;
> > +
> > + /* Re-check sample_time which may have advanced. */
> > + smp_rmb();
> > + delta_ns = time - READ_ONCE(cpu->sample.time);
> > + if (delta_ns < pid_params.sample_rate_ns)
> > + return;
> > +
> > + cpu->ipi_in_progress = true;
> > + cpu->time = time;
> > + irq_work_queue_on(&cpu->irq_work, cpu->cpu);
> > }
> > }
> >
> > +static inline void intel_pstate_irq_work_sync(unsigned int cpu)
> > +{
> > + irq_work_sync(&all_cpu_data[cpu]->irq_work);
> > +}
> > +
> > +static inline void intel_pstate_init_irq_work(struct cpudata *cpu)
> > +{
> > + init_irq_work(&cpu->irq_work, intel_pstate_update_util_remote);
> > +}
> > +#else /* !CONFIG_SMP */
> > +static inline void intel_pstate_irq_work_sync(unsigned int cpu) {}
> > +static inline void intel_pstate_init_irq_work(struct cpudata *cpu) {}
> > +
> > +static void intel_pstate_update_util(struct update_util_data *data, u64 time,
> > + unsigned long util, unsigned long max)
> > +{
> > + struct cpudata *cpu = container_of(data, struct cpudata, update_util);
> > + s64 delta_ns = time - cpu->sample.time;
> > +
> > + if (delta_ns < pid_params.sample_rate_ns)
> > + return;
> > +
> > + _intel_pstate_update_util(cpu, time);
> > +}
> > +#endif
> > +
> > +
> > +
>
> The additional two empty lines are not necessary.
>

Sorry yeah these were unintentional, will remove these and the ones below.

Thanks for the review.

thanks,
Steve

2016-04-25 19:18:03

by Steve Muckle

Subject: Re: [RFC PATCH 1/4] cpufreq: governor: support scheduler cpufreq callbacks on remote CPUs

On Wed, Apr 20, 2016 at 02:26:06PM +0200, Rafael J. Wysocki wrote:
> You could have added a cover [0/4] message which would have made it easier
> to reply to the entire series in general. Let me do it here.

Will add that next time.

> Doing it the way it is done in this series would be fine by me in general
> (up to a few more or less minor comments), but it is still unclear to me
> how much of a difference these changes would make in terms of improved
> response times etc.

I spent some time last week constructing a test case where the
benefits could be seen. A task which was previously low utilization
wakes on CPU0 and becomes CPU bound. Just after that, a new task is
spawned on CPU0. The new task's initial utilization is high, so ideally we
would like to see the frequency immediately rise, but in my test it
does not occur until the next tick. There is 7ms of delay in the trace
I've saved.
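
For reference, the test is roughly equivalent to this (an illustrative
sketch only, not the actual harness):

#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>

int main(void)
{
	cpu_set_t set;
	int i;

	/* Pin ourselves (and any children) to CPU0. */
	CPU_ZERO(&set);
	CPU_SET(0, &set);
	sched_setaffinity(0, sizeof(set), &set);

	/* Phase 1: mostly sleep, so utilization stays low. */
	for (i = 0; i < 500; i++)
		usleep(10000);

	/*
	 * Phase 2: become CPU-bound and immediately spawn a new task,
	 * whose initial utilization is high.
	 */
	if (fork() == 0)
		for (;;)
			;	/* child spins on CPU0 */
	for (;;)
		;		/* parent is now CPU-bound */
}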

Unfortunately these patches alone will not address it. There are a
couple other issues which get in the way (which is why I didn't
respond here right away). Let me spend some more time on those and see
how it goes.

> > In preparation for the scheduler cpufreq callback happening
> > on remote CPUs, add support for this in the dbs governors.
> > The dbs governors make assumptions about the callback occurring
> > on the CPU being updated.
>
> While the above is generally correct, it would be nice to say more about what
> happens in the patch. Like:
>
> "To that end, add a CPU number field to struct cpu_dbs_info and modify
> dbs_update_util_handler() to schedule IRQ works on target CPUs rather than on
> the local one only."

I'm happy to do that if it is what you'd like to see, but just
curious, isn't it really just restating the patch contents?

...
> > diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
> > index 3e0eb7c54903..1d5f4857ff80 100644
> > --- a/drivers/cpufreq/cpufreq_governor.h
> > +++ b/drivers/cpufreq/cpufreq_governor.h
> > @@ -122,6 +122,7 @@ struct cpu_dbs_info {
> > unsigned int prev_load;
> > struct update_util_data update_util;
> > struct policy_dbs_info *policy_dbs;
> > + int cpu;
> > };
>
> Wouldn't it be better to add the cpu field to struct update_util_data and set
> it from cpufreq_add_update_util_hook()?
>
> That would allow you to avoid adding the cpu field to struct sugov_cpu in the
> second patch at least.

Sure, will do.

thanks,
Steve

2016-04-25 21:28:07

by Rafael J. Wysocki

Subject: Re: [RFC PATCH 1/4] cpufreq: governor: support scheduler cpufreq callbacks on remote CPUs

On Mon, Apr 25, 2016 at 9:17 PM, Steve Muckle <[email protected]> wrote:
> On Wed, Apr 20, 2016 at 02:26:06PM +0200, Rafael J. Wysocki wrote:
>> You could have added a cover [0/4] message which would have made it easier
>> to reply to the entire series in general. Let me do it here.
>
> Will add that next time.
>
>> Doing it the way it is done in this series would be fine by me in general
>> (up to a few more or less minor comments), but it is still unclear to me
>> how much of a difference these changes would make in terms of improved
>> response times etc.
>
> I spent some time last week constructing a test case where the
> benefits could be seen. A task which was previously low utilization
> wakes on CPU0 and becomes CPU bound. Just after that, a new task is
> spawned on CPU0. The initial task utilization is high so ideally we
> would like to see the frequency immediately rise, but in my test it
> does not occur until the next tick. There is 7ms of delay in the trace
> I've saved.
>
> Unfortunately these patches alone will not address it. There are a
> couple other issues which get in the way (which is why I didn't
> respond here right away). Let me spend some more time on those and see
> how it goes.

I see.

>> > In preparation for the scheduler cpufreq callback happening
>> > on remote CPUs, add support for this in the dbs governors.
>> > The dbs governors make assumptions about the callback occurring
>> > on the CPU being updated.
>>
>> While the above is generally correct, it would be nice to say more about what
>> happens in the patch. Like:
>>
>> "To that end, add a CPU number field to struct cpu_dbs_info and modify
>> dbs_update_util_handler() to schedule IRQ works on target CPUs rather than on
>> the local one only."
>
> I'm happy to do that if it is what you'd like to see, but just
> curious, isn't it really just restating the patch contents?

It is somewhat, but that's for the benefit of whoever reads the git
history without necessarily looking at the changes themselves
upfront.

> ...
>> > diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
>> > index 3e0eb7c54903..1d5f4857ff80 100644
>> > --- a/drivers/cpufreq/cpufreq_governor.h
>> > +++ b/drivers/cpufreq/cpufreq_governor.h
>> > @@ -122,6 +122,7 @@ struct cpu_dbs_info {
>> > unsigned int prev_load;
>> > struct update_util_data update_util;
>> > struct policy_dbs_info *policy_dbs;
>> > + int cpu;
>> > };
>>
>> Wouldn't it be better to add the cpu field to struct update_util_data and set
>> it from cpufreq_add_update_util_hook()?
>>
>> That would allow you to avoid adding the cpu field to struct sugov_cpu in the
>> second patch at least.
>
> Sure, will do.

Thanks!

2016-04-25 21:34:56

by Rafael J. Wysocki

Subject: Re: [RFC PATCH 3/4] intel_pstate: support scheduler cpufreq callbacks on remote CPUs

On Thu, Apr 21, 2016 at 4:20 AM, Steve Muckle <[email protected]> wrote:
> On Wed, Apr 20, 2016 at 02:37:18PM +0200, Rafael J. Wysocki wrote:
> ...
>> > @@ -1173,20 +1179,88 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
>> > get_avg_frequency(cpu));
>> > }
>> >
>> > +static void _intel_pstate_update_util(struct cpudata *cpu, u64 time)
>>
>> What about calling this intel_pstate_update_cpu()?
>
> Sure will change.
>
> ...
>> > static void intel_pstate_update_util(struct update_util_data *data, u64 time,
>> > unsigned long util, unsigned long max)
>> > {
>> > struct cpudata *cpu = container_of(data, struct cpudata, update_util);
>> > - u64 delta_ns = time - cpu->sample.time;
>> > + s64 delta_ns = time - cpu->sample.time;
>> >
>> > - if ((s64)delta_ns >= pid_params.sample_rate_ns) {
>> > - bool sample_taken = intel_pstate_sample(cpu, time);
>> > + if (delta_ns < pid_params.sample_rate_ns)
>>
>> Why don't you check cpu->ipi_in_progress here too and bail out if it is set?
>>
>> That would allow you to avoid checking the time again below, wouldn't it?
>
> Yeah I think that should work. I can't recall why I thought I needed
> to check the time first, then ipi_in_progress, then the time. As long
> as ipi_in_progress is checked prior to the time, it should be fine.

I actually think that we can just skip all cross-CPU updates in
intel_pstate instead of adding complexity to it.

The governor algorithm here uses feedback registers to estimate
utilization, and I don't think it will react to the cross-CPU updates
the way you want; plus, it is likely to skip them anyway due to the rate
limit.
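
Something like this, that is (sketch):

static void intel_pstate_update_util(struct update_util_data *data, u64 time,
				     unsigned long util, unsigned long max)
{
	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
	s64 delta_ns;

	/* Ignore cross-CPU updates entirely. */
	if (cpu->cpu != smp_processor_id())
		return;

	delta_ns = time - cpu->sample.time;
	if (delta_ns >= pid_params.sample_rate_ns)
		_intel_pstate_update_util(cpu, time);
}

which would also make the irq_work and ipi_in_progress machinery
unnecessary.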

2016-04-29 10:38:23

by Viresh Kumar

Subject: Re: [RFC PATCH 1/4] cpufreq: governor: support scheduler cpufreq callbacks on remote CPUs

On 19-04-16, 19:39, Steve Muckle wrote:
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 20f0a4e114d1..429d3a5b9866 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -248,6 +248,20 @@ static void dbs_irq_work(struct irq_work *irq_work)
> schedule_work_on(smp_processor_id(), &policy_dbs->work);
> }
>
> +#ifdef CONFIG_SMP
> +static inline void dbs_irq_work_queue(struct policy_dbs_info *policy_dbs,
> + int cpu)
> +{
> + irq_work_queue_on(&policy_dbs->irq_work, cpu);
> +}
> +#else
> +static inline void dbs_irq_work_queue(struct policy_dbs_info *policy_dbs,
> + int cpu)
> +{
> + irq_work_queue(&policy_dbs->irq_work);
> +}
> +#endif

Any clue why we don't have a non-SMP version of irq_work_queue_on(), which can
simply call irq_work_queue()?

--
viresh

2016-04-29 11:18:23

by Rafael J. Wysocki

Subject: Re: [RFC PATCH 1/4] cpufreq: governor: support scheduler cpufreq callbacks on remote CPUs

On Friday, April 29, 2016 04:08:16 PM Viresh Kumar wrote:
> On 19-04-16, 19:39, Steve Muckle wrote:
> > diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> > index 20f0a4e114d1..429d3a5b9866 100644
> > --- a/drivers/cpufreq/cpufreq_governor.c
> > +++ b/drivers/cpufreq/cpufreq_governor.c
> > @@ -248,6 +248,20 @@ static void dbs_irq_work(struct irq_work *irq_work)
> > schedule_work_on(smp_processor_id(), &policy_dbs->work);
> > }
> >
> > +#ifdef CONFIG_SMP
> > +static inline void dbs_irq_work_queue(struct policy_dbs_info *policy_dbs,
> > + int cpu)
> > +{
> > + irq_work_queue_on(&policy_dbs->irq_work, cpu);
> > +}
> > +#else
> > +static inline void dbs_irq_work_queue(struct policy_dbs_info *policy_dbs,
> > + int cpu)
> > +{
> > + irq_work_queue(&policy_dbs->irq_work);
> > +}
> > +#endif
>
> Any clue why we don't have a non-SMP version of irq_work_queue_on(), which can
> simply call irq_work_queue()?

Because nobody else needs it?

But I agree that it would be nicer to add the stub to irq_work.h.
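
E.g. (untested):

/* include/linux/irq_work.h */
#ifdef CONFIG_SMP
bool irq_work_queue_on(struct irq_work *work, int cpu);
#else
static inline bool irq_work_queue_on(struct irq_work *work, int cpu)
{
	/* On UP the target can only be the local CPU anyway. */
	return irq_work_queue(work);
}
#endif

Then callers like the governor above could use irq_work_queue_on()
unconditionally and drop their local #ifdefs.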