Currently, amd_pstate only uses `adjust_perf` and `target` callback
functions to get the frequency/effective utilization data from scaling
governors. But the governor can switch back to fast_switch callback when
frequency invariance is disabled. In that case, if the driver didn't set
fast_switch function pointer, null pointer dereference can happen and
kernel will crash.
Currently, frequency invariance is turned off when any anomaly is
detected with aperf/mperf readings. Which triggers this problem.
Fix this by adding `fast_switch` function in the amd_pstate driver. The
same problem could arise with intel_pstate driver when the driver is in
passive mode and the governor is schedutil.
Additionally, display a warning message if the scaling driver sets
fast_swich_possible flag but not fast_switch function pointer.
Changelog:
v2 -> v3
- Add fast_switch callback in amd_pstate and warn if any driver
sets fast_swich_possible flag but not fast_switch function pointer.
v1 -> v2
- Remove fast_switch_enabled flag dependency for adjust_perf callback
v1: https://lore.kernel.org/linux-pm/[email protected]/
v2: https://lore.kernel.org/linux-pm/[email protected]/
Gautham R. Shenoy (1):
amd_pstate: Add ->fast_switch() callback
Wyes Karny (1):
cpufreq: Warn if fast_switch is not set
drivers/cpufreq/amd-pstate.c | 35 +++++++++++++++++++++++++++++------
drivers/cpufreq/cpufreq.c | 18 ++++++++++++++++++
include/linux/cpufreq.h | 5 ++++-
3 files changed, 51 insertions(+), 7 deletions(-)
--
2.34.1
From: "Gautham R. Shenoy" <[email protected]>
Schedutil normally calls the adjust_perf callback for drivers with
adjust_perf callback available and fast_switch_possible flag set.
However, when frequency invariance is disabled, schedutil tries to
invoke fast_switch. So, there is a chance of kernel crash if this
function pointer is not set. To protect against this scenario add
fast_switch callback in amd_pstate driver.
Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State")
Signed-off-by: Wyes Karny <[email protected]>
Signed-off-by: Gautham R. Shenoy <[email protected]>
---
drivers/cpufreq/amd-pstate.c | 35 +++++++++++++++++++++++++++++------
1 file changed, 29 insertions(+), 6 deletions(-)
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 5a3d4aa0f45a..6cf875d94c41 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -444,9 +444,8 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy)
return 0;
}
-static int amd_pstate_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
+static int amd_pstate_update_freq(struct cpufreq_policy *policy,
+ unsigned int target_freq, bool fast_switch)
{
struct cpufreq_freqs freqs;
struct amd_cpudata *cpudata = policy->driver_data;
@@ -465,14 +464,37 @@ static int amd_pstate_target(struct cpufreq_policy *policy,
des_perf = DIV_ROUND_CLOSEST(target_freq * cap_perf,
cpudata->max_freq);
- cpufreq_freq_transition_begin(policy, &freqs);
+ WARN_ON(fast_switch && !policy->fast_switch_enabled);
+ /*
+ * If fast_switch is desired, then there aren't any registered
+ * transition notifiers. See comment for
+ * cpufreq_enable_fast_switch().
+ */
+ if (!fast_switch)
+ cpufreq_freq_transition_begin(policy, &freqs);
+
amd_pstate_update(cpudata, min_perf, des_perf,
- max_perf, false, policy->governor->flags);
- cpufreq_freq_transition_end(policy, &freqs, false);
+ max_perf, fast_switch, policy->governor->flags);
+
+ if (!fast_switch)
+ cpufreq_freq_transition_end(policy, &freqs, false);
return 0;
}
+static int amd_pstate_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation)
+{
+ return amd_pstate_update_freq(policy, target_freq, false);
+}
+
+static unsigned int amd_pstate_fast_switch(struct cpufreq_policy *policy,
+ unsigned int target_freq)
+{
+ return amd_pstate_update_freq(policy, target_freq, true);
+}
+
static void amd_pstate_adjust_perf(unsigned int cpu,
unsigned long _min_perf,
unsigned long target_perf,
@@ -1309,6 +1331,7 @@ static struct cpufreq_driver amd_pstate_driver = {
.flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
.verify = amd_pstate_verify,
.target = amd_pstate_target,
+ .fast_switch = amd_pstate_fast_switch,
.init = amd_pstate_cpu_init,
.exit = amd_pstate_cpu_exit,
.suspend = amd_pstate_cpu_suspend,
--
2.34.1
If fast_switch_possible flag is set by the scaling driver, the governor
is free to select fast_switch function even if adjust_perf is set. When
the frequency invariance is disabled due to some reason governor
fallbacks to fast_switch if fast_switch_possible is set. This could
crash the kernel if the driver didn't set the fast_switch function
pointer.
This issue becomes apparent when aperf/mperf overflow occurs with
amd_pstate (passive) + schedutil. When this happens, kernel disables
frequency invariance calculation which causes schedutil to fallback to
sugov_update_single_freq which currently relies on the fast_switch
callback.
Normal flow:
sugov_update_single_perf
cpufreq_driver_adjust_perf
cpufreq_driver->adjust_perf
Error case flow:
sugov_update_single_perf
sugov_update_single_freq <-- This is chosen because the freq invariant is disabled due to aperf/mperf overflow
cpufreq_driver_fast_switch
cpufreq_driver->fast_switch <-- Here NULL pointer dereference is happening, because fast_switch is not set
Put up a warning message if the driver sets fast_switch_possible flag
but not fast_switch.
Signed-off-by: Wyes Karny <[email protected]>
---
drivers/cpufreq/cpufreq.c | 18 ++++++++++++++++++
include/linux/cpufreq.h | 5 ++++-
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 6b52ebe5a890..180be9235b08 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -501,6 +501,13 @@ void cpufreq_enable_fast_switch(struct cpufreq_policy *policy)
if (!policy->fast_switch_possible)
return;
+ /**
+ * It's not expected driver's fast_switch callback is not set
+ * even fast_switch_possible is true.
+ */
+ if (!cpufreq_driver_has_fast_switch())
+ pr_alert_once("fast_switch callback is not set\n");
+
mutex_lock(&cpufreq_fast_switch_lock);
if (cpufreq_fast_switch_count >= 0) {
cpufreq_fast_switch_count++;
@@ -2143,6 +2150,17 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
}
EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
+/**
+ * cpufreq_driver_has_fast_switch - Check "fast switch" callback.
+ *
+ * Return 'true' if the ->fast_switch callback is present for the
+ * current driver or 'false' otherwise.
+ */
+bool cpufreq_driver_has_fast_switch(void)
+{
+ return !!cpufreq_driver->fast_switch;
+}
+
/**
* cpufreq_driver_adjust_perf - Adjust CPU performance level in one go.
* @cpu: Target CPU.
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 26e2eb399484..4277d6f8e50c 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -340,7 +340,9 @@ struct cpufreq_driver {
/*
* ->fast_switch() replacement for drivers that use an internal
* representation of performance levels and can pass hints other than
- * the target performance level to the hardware.
+ * the target performance level to the hardware. If driver is setting this,
+ * then it needs to set fast_switch also. Because in certain scenario scale
+ * invariance could be disabled and governor can switch back to fast_switch.
*/
void (*adjust_perf)(unsigned int cpu,
unsigned long min_perf,
@@ -604,6 +606,7 @@ struct cpufreq_governor {
/* Pass a target to the cpufreq driver */
unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
unsigned int target_freq);
+bool cpufreq_driver_has_fast_switch(void);
void cpufreq_driver_adjust_perf(unsigned int cpu,
unsigned long min_perf,
unsigned long target_perf,
--
2.34.1
On 15-05-23, 11:34, Wyes Karny wrote:
> If fast_switch_possible flag is set by the scaling driver, the governor
> is free to select fast_switch function even if adjust_perf is set. When
> the frequency invariance is disabled due to some reason governor
> fallbacks to fast_switch if fast_switch_possible is set. This could
> crash the kernel if the driver didn't set the fast_switch function
> pointer.
>
> This issue becomes apparent when aperf/mperf overflow occurs with
> amd_pstate (passive) + schedutil. When this happens, kernel disables
> frequency invariance calculation which causes schedutil to fallback to
> sugov_update_single_freq which currently relies on the fast_switch
> callback.
>
> Normal flow:
> sugov_update_single_perf
> cpufreq_driver_adjust_perf
> cpufreq_driver->adjust_perf
>
> Error case flow:
> sugov_update_single_perf
> sugov_update_single_freq <-- This is chosen because the freq invariant is disabled due to aperf/mperf overflow
> cpufreq_driver_fast_switch
> cpufreq_driver->fast_switch <-- Here NULL pointer dereference is happening, because fast_switch is not set
Not sure if all these details are required for this patch or not. It
is logically incorrect to set fast_switch_possible, while fast_switch
isn't set. That's a reason enough.
> Put up a warning message if the driver sets fast_switch_possible flag
> but not fast_switch.
>
> Signed-off-by: Wyes Karny <[email protected]>
> ---
> drivers/cpufreq/cpufreq.c | 18 ++++++++++++++++++
> include/linux/cpufreq.h | 5 ++++-
> 2 files changed, 22 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 6b52ebe5a890..180be9235b08 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -501,6 +501,13 @@ void cpufreq_enable_fast_switch(struct cpufreq_policy *policy)
> if (!policy->fast_switch_possible)
> return;
>
> + /**
Doc style comments aren't required here I guess.
> + * It's not expected driver's fast_switch callback is not set
> + * even fast_switch_possible is true.
> + */
> + if (!cpufreq_driver_has_fast_switch())
> + pr_alert_once("fast_switch callback is not set\n");
> +
> mutex_lock(&cpufreq_fast_switch_lock);
> if (cpufreq_fast_switch_count >= 0) {
> cpufreq_fast_switch_count++;
> @@ -2143,6 +2150,17 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
> }
> EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
>
> +/**
> + * cpufreq_driver_has_fast_switch - Check "fast switch" callback.
> + *
> + * Return 'true' if the ->fast_switch callback is present for the
> + * current driver or 'false' otherwise.
> + */
> +bool cpufreq_driver_has_fast_switch(void)
Why create a routine for this, when no one else is going to use it ?
> +{
> + return !!cpufreq_driver->fast_switch;
> +}
I think you should add the required check in cpufreq_online(), after
cpufreq_driver->init() is called, and return failure if fast_switch
isn't set and fast_switch_possible is.
--
viresh
On 16-05-23, 11:19, Wyes Karny wrote:
> Sure, I'll do that. But one caution if we return failure, the drivers which
> has the above conditions won't load.
They shouldn't :)
--
viresh
Hi Viresh,
Thanks for looking into this patch.
On 16 May 06:40, Viresh Kumar wrote:
> On 15-05-23, 11:34, Wyes Karny wrote:
> > If fast_switch_possible flag is set by the scaling driver, the governor
> > is free to select fast_switch function even if adjust_perf is set. When
> > the frequency invariance is disabled due to some reason governor
> > fallbacks to fast_switch if fast_switch_possible is set. This could
> > crash the kernel if the driver didn't set the fast_switch function
> > pointer.
> >
> > This issue becomes apparent when aperf/mperf overflow occurs with
> > amd_pstate (passive) + schedutil. When this happens, kernel disables
> > frequency invariance calculation which causes schedutil to fallback to
> > sugov_update_single_freq which currently relies on the fast_switch
> > callback.
> >
> > Normal flow:
> > sugov_update_single_perf
> > cpufreq_driver_adjust_perf
> > cpufreq_driver->adjust_perf
> >
> > Error case flow:
> > sugov_update_single_perf
> > sugov_update_single_freq <-- This is chosen because the freq invariant is disabled due to aperf/mperf overflow
> > cpufreq_driver_fast_switch
> > cpufreq_driver->fast_switch <-- Here NULL pointer dereference is happening, because fast_switch is not set
>
> Not sure if all these details are required for this patch or not. It
> is logically incorrect to set fast_switch_possible, while fast_switch
> isn't set. That's a reason enough.
Sure, I'll remove this if it's unnecessary.
>
> > Put up a warning message if the driver sets fast_switch_possible flag
> > but not fast_switch.
> >
> > Signed-off-by: Wyes Karny <[email protected]>
> > ---
> > drivers/cpufreq/cpufreq.c | 18 ++++++++++++++++++
> > include/linux/cpufreq.h | 5 ++++-
> > 2 files changed, 22 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 6b52ebe5a890..180be9235b08 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -501,6 +501,13 @@ void cpufreq_enable_fast_switch(struct cpufreq_policy *policy)
> > if (!policy->fast_switch_possible)
> > return;
> >
> > + /**
>
> Doc style comments aren't required here I guess.
Okay.
>
> > + * It's not expected driver's fast_switch callback is not set
> > + * even fast_switch_possible is true.
> > + */
> > + if (!cpufreq_driver_has_fast_switch())
> > + pr_alert_once("fast_switch callback is not set\n");
> > +
> > mutex_lock(&cpufreq_fast_switch_lock);
> > if (cpufreq_fast_switch_count >= 0) {
> > cpufreq_fast_switch_count++;
> > @@ -2143,6 +2150,17 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
> > }
> > EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
> >
> > +/**
> > + * cpufreq_driver_has_fast_switch - Check "fast switch" callback.
> > + *
> > + * Return 'true' if the ->fast_switch callback is present for the
> > + * current driver or 'false' otherwise.
> > + */
> > +bool cpufreq_driver_has_fast_switch(void)
>
> Why create a routine for this, when no one else is going to use it ?
>
> > +{
> > + return !!cpufreq_driver->fast_switch;
> > +}
>
> I think you should add the required check in cpufreq_online(), after
> cpufreq_driver->init() is called, and return failure if fast_switch
> isn't set and fast_switch_possible is.
Sure, I'll do that. But one caution if we return failure, the drivers which
has the above conditions won't load.
Thanks,
Wyes
>
> --
> viresh