2021-09-28 16:44:40

by Srinivas Pandruvada

[permalink] [raw]
Subject: [PATCH] cpufreq: intel_pstate: Process HWP Guaranteed change notification

It is possible that HWP guaranteed ratio is changed in response to
change in power and thermal limits. For example when Intel Speed Select
performance profile is changed or there is change in TDP, hardware can
send notifications. It is possible that the guaranteed ratio is
increased. This creates an issue when turbo is disabled, as the old
limits set in MSR_HWP_REQUEST are still lower and hardware will clip
to older limits.

This change enables HWP interrupt and process HWP interrupts. When
guaranteed is changed, calls cpufreq_update_policy() so that driver
callbacks are called to update to new HWP limits. This callback
is called from a delayed workqueue of 10ms to avoid frequent updates.

Although the scope of IA32_HWP_INTERRUPT is per logical cpu, on some
plaforms interrupt is generated on all CPUs. This is particularly a
problem during initialization, when the driver didn't allocated
data for other CPUs. So this change uses a cpumask of enabled CPUs and
process interrupts on those CPUs only.

When the cpufreq offline() or suspend()callback is called, HWP interrupt
is disabled on those CPUs and also cancels any pending work item.

Spin lock is used to protect data and processing shared with interrupt
handler. Here READ_ONCE(), WRITE_ONCE() macros are used to designate
shared data, even though spin lock act as an optmization barrier here.

Signed-off-by: Srinivas Pandruvada <[email protected]>
---
This patch is a replacement from the patch submitted to 5.15 and later
reverted.

drivers/cpufreq/intel_pstate.c | 117 +++++++++++++++++++++++++++++++--
1 file changed, 111 insertions(+), 6 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 8c176b7dae41..facc56dd58dd 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -32,6 +32,7 @@
#include <asm/cpu_device_id.h>
#include <asm/cpufeature.h>
#include <asm/intel-family.h>
+#include "../drivers/thermal/intel/thermal_interrupt.h"

#define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)

@@ -219,6 +220,7 @@ struct global_params {
* @sched_flags: Store scheduler flags for possible cross CPU update
* @hwp_boost_min: Last HWP boosted min performance
* @suspended: Whether or not the driver has been suspended.
+ * @hwp_notify_work: workqueue for HWP notifications.
*
* This structure stores per CPU instance data for all CPUs.
*/
@@ -257,6 +259,7 @@ struct cpudata {
unsigned int sched_flags;
u32 hwp_boost_min;
bool suspended;
+ struct delayed_work hwp_notify_work;
};

static struct cpudata **all_cpu_data;
@@ -985,11 +988,15 @@ static void intel_pstate_hwp_set(unsigned int cpu)
wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
}

+static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata);
+
static void intel_pstate_hwp_offline(struct cpudata *cpu)
{
u64 value = READ_ONCE(cpu->hwp_req_cached);
int min_perf;

+ intel_pstate_disable_hwp_interrupt(cpu);
+
if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
/*
* In case the EPP has been set to "performance" by the
@@ -1053,6 +1060,9 @@ static int intel_pstate_suspend(struct cpufreq_policy *policy)

cpu->suspended = true;

+ /* disable HWP interrupt and cancel any pending work */
+ intel_pstate_disable_hwp_interrupt(cpu);
+
return 0;
}

@@ -1546,15 +1556,105 @@ static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void)

/************************** sysfs end ************************/

+static void intel_pstate_notify_work(struct work_struct *work)
+{
+ struct cpudata *cpudata =
+ container_of(to_delayed_work(work), struct cpudata, hwp_notify_work);
+
+ cpufreq_update_policy(cpudata->cpu);
+ wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
+}
+
+static DEFINE_SPINLOCK(hwp_notify_lock);
+static cpumask_t hwp_intr_enable_mask;
+
+void notify_hwp_interrupt(void)
+{
+ unsigned int this_cpu = smp_processor_id();
+ struct cpudata *cpudata;
+ unsigned long flags;
+ u64 value;
+
+ if (!READ_ONCE(hwp_active) || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
+ return;
+
+ rdmsrl_safe(MSR_HWP_STATUS, &value);
+ if (!(value & 0x01))
+ return;
+
+ spin_lock_irqsave(&hwp_notify_lock, flags);
+
+ if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask))
+ goto ack_intr;
+
+ /*
+ * Currently we never free all_cpu_data. And we can't reach here
+ * without this allocated. But for safety for future changes, added
+ * check.
+ */
+ if (unlikely(!READ_ONCE(all_cpu_data)))
+ goto ack_intr;
+
+ /*
+ * The free is done during cleanup, when cpufreq registry is failed.
+ * We wouldn't be here if it fails on init or switch status. But for
+ * future changes, added check.
+ */
+ cpudata = READ_ONCE(all_cpu_data[this_cpu]);
+ if (unlikely(!cpudata))
+ goto ack_intr;
+
+ schedule_delayed_work(&cpudata->hwp_notify_work, msecs_to_jiffies(10));
+
+ spin_unlock_irqrestore(&hwp_notify_lock, flags);
+
+ return;
+
+ack_intr:
+ wrmsrl_safe(MSR_HWP_STATUS, 0);
+ spin_unlock_irqrestore(&hwp_notify_lock, flags);
+}
+
+static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata)
+{
+ unsigned long flags;
+
+ /* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */
+ wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
+
+ spin_lock_irqsave(&hwp_notify_lock, flags);
+ if (cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask))
+ cancel_delayed_work(&cpudata->hwp_notify_work);
+ spin_unlock_irqrestore(&hwp_notify_lock, flags);
+}
+
+static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata)
+{
+ /* Enable HWP notification interrupt for guaranteed performance change */
+ if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&hwp_notify_lock, flags);
+ INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work);
+ cpumask_set_cpu(cpudata->cpu, &hwp_intr_enable_mask);
+ spin_unlock_irqrestore(&hwp_notify_lock, flags);
+
+ /* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */
+ wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01);
+ }
+}
+
static void intel_pstate_hwp_enable(struct cpudata *cpudata)
{
- /* First disable HWP notification interrupt as we don't process them */
+ /* First disable HWP notification interrupt till we activate again */
if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);

wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
if (cpudata->epp_default == -EINVAL)
cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
+
+ intel_pstate_enable_hwp_interrupt(cpudata);
}

static int atom_get_min_pstate(void)
@@ -2266,7 +2366,7 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
if (!cpu)
return -ENOMEM;

- all_cpu_data[cpunum] = cpu;
+ WRITE_ONCE(all_cpu_data[cpunum], cpu);

cpu->cpu = cpunum;

@@ -2929,8 +3029,10 @@ static void intel_pstate_driver_cleanup(void)
if (intel_pstate_driver == &intel_pstate)
intel_pstate_clear_update_util_hook(cpu);

+ spin_lock(&hwp_notify_lock);
kfree(all_cpu_data[cpu]);
- all_cpu_data[cpu] = NULL;
+ WRITE_ONCE(all_cpu_data[cpu], NULL);
+ spin_unlock(&hwp_notify_lock);
}
}
cpus_read_unlock();
@@ -3199,6 +3301,7 @@ static bool intel_pstate_hwp_is_enabled(void)

static int __init intel_pstate_init(void)
{
+ static struct cpudata **_all_cpu_data;
const struct x86_cpu_id *id;
int rc;

@@ -3224,7 +3327,7 @@ static int __init intel_pstate_init(void)
* deal with it.
*/
if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || hwp_forced) {
- hwp_active++;
+ WRITE_ONCE(hwp_active, 1);
hwp_mode_bdw = id->driver_data;
intel_pstate.attr = hwp_cpufreq_attrs;
intel_cpufreq.attr = hwp_cpufreq_attrs;
@@ -3275,10 +3378,12 @@ static int __init intel_pstate_init(void)

pr_info("Intel P-state driver initializing\n");

- all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
- if (!all_cpu_data)
+ _all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
+ if (!_all_cpu_data)
return -ENOMEM;

+ WRITE_ONCE(all_cpu_data, _all_cpu_data);
+
intel_pstate_request_control_from_smm();

intel_pstate_sysfs_expose_params();
--
2.31.1


2021-10-01 19:14:31

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH] cpufreq: intel_pstate: Process HWP Guaranteed change notification

On Tue, Sep 28, 2021 at 6:42 PM Srinivas Pandruvada
<[email protected]> wrote:
>
> It is possible that HWP guaranteed ratio is changed in response to
> change in power and thermal limits. For example when Intel Speed Select
> performance profile is changed or there is change in TDP, hardware can
> send notifications. It is possible that the guaranteed ratio is
> increased. This creates an issue when turbo is disabled, as the old
> limits set in MSR_HWP_REQUEST are still lower and hardware will clip
> to older limits.
>
> This change enables HWP interrupt and process HWP interrupts. When
> guaranteed is changed, calls cpufreq_update_policy() so that driver
> callbacks are called to update to new HWP limits. This callback
> is called from a delayed workqueue of 10ms to avoid frequent updates.
>
> Although the scope of IA32_HWP_INTERRUPT is per logical cpu, on some
> plaforms interrupt is generated on all CPUs. This is particularly a
> problem during initialization, when the driver didn't allocated
> data for other CPUs. So this change uses a cpumask of enabled CPUs and
> process interrupts on those CPUs only.
>
> When the cpufreq offline() or suspend()callback is called, HWP interrupt
> is disabled on those CPUs and also cancels any pending work item.
>
> Spin lock is used to protect data and processing shared with interrupt
> handler. Here READ_ONCE(), WRITE_ONCE() macros are used to designate
> shared data, even though spin lock act as an optmization barrier here.
>
> Signed-off-by: Srinivas Pandruvada <[email protected]>
> ---
> This patch is a replacement from the patch submitted to 5.15 and later
> reverted.

Applied as 5.16 material, thanks!

> drivers/cpufreq/intel_pstate.c | 117 +++++++++++++++++++++++++++++++--
> 1 file changed, 111 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 8c176b7dae41..facc56dd58dd 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -32,6 +32,7 @@
> #include <asm/cpu_device_id.h>
> #include <asm/cpufeature.h>
> #include <asm/intel-family.h>
> +#include "../drivers/thermal/intel/thermal_interrupt.h"
>
> #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)
>
> @@ -219,6 +220,7 @@ struct global_params {
> * @sched_flags: Store scheduler flags for possible cross CPU update
> * @hwp_boost_min: Last HWP boosted min performance
> * @suspended: Whether or not the driver has been suspended.
> + * @hwp_notify_work: workqueue for HWP notifications.
> *
> * This structure stores per CPU instance data for all CPUs.
> */
> @@ -257,6 +259,7 @@ struct cpudata {
> unsigned int sched_flags;
> u32 hwp_boost_min;
> bool suspended;
> + struct delayed_work hwp_notify_work;
> };
>
> static struct cpudata **all_cpu_data;
> @@ -985,11 +988,15 @@ static void intel_pstate_hwp_set(unsigned int cpu)
> wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
> }
>
> +static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata);
> +
> static void intel_pstate_hwp_offline(struct cpudata *cpu)
> {
> u64 value = READ_ONCE(cpu->hwp_req_cached);
> int min_perf;
>
> + intel_pstate_disable_hwp_interrupt(cpu);
> +
> if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
> /*
> * In case the EPP has been set to "performance" by the
> @@ -1053,6 +1060,9 @@ static int intel_pstate_suspend(struct cpufreq_policy *policy)
>
> cpu->suspended = true;
>
> + /* disable HWP interrupt and cancel any pending work */
> + intel_pstate_disable_hwp_interrupt(cpu);
> +
> return 0;
> }
>
> @@ -1546,15 +1556,105 @@ static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void)
>
> /************************** sysfs end ************************/
>
> +static void intel_pstate_notify_work(struct work_struct *work)
> +{
> + struct cpudata *cpudata =
> + container_of(to_delayed_work(work), struct cpudata, hwp_notify_work);
> +
> + cpufreq_update_policy(cpudata->cpu);
> + wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
> +}
> +
> +static DEFINE_SPINLOCK(hwp_notify_lock);
> +static cpumask_t hwp_intr_enable_mask;
> +
> +void notify_hwp_interrupt(void)
> +{
> + unsigned int this_cpu = smp_processor_id();
> + struct cpudata *cpudata;
> + unsigned long flags;
> + u64 value;
> +
> + if (!READ_ONCE(hwp_active) || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
> + return;
> +
> + rdmsrl_safe(MSR_HWP_STATUS, &value);
> + if (!(value & 0x01))
> + return;
> +
> + spin_lock_irqsave(&hwp_notify_lock, flags);
> +
> + if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask))
> + goto ack_intr;
> +
> + /*
> + * Currently we never free all_cpu_data. And we can't reach here
> + * without this allocated. But for safety for future changes, added
> + * check.
> + */
> + if (unlikely(!READ_ONCE(all_cpu_data)))
> + goto ack_intr;
> +
> + /*
> + * The free is done during cleanup, when cpufreq registry is failed.
> + * We wouldn't be here if it fails on init or switch status. But for
> + * future changes, added check.
> + */
> + cpudata = READ_ONCE(all_cpu_data[this_cpu]);
> + if (unlikely(!cpudata))
> + goto ack_intr;
> +
> + schedule_delayed_work(&cpudata->hwp_notify_work, msecs_to_jiffies(10));
> +
> + spin_unlock_irqrestore(&hwp_notify_lock, flags);
> +
> + return;
> +
> +ack_intr:
> + wrmsrl_safe(MSR_HWP_STATUS, 0);
> + spin_unlock_irqrestore(&hwp_notify_lock, flags);
> +}
> +
> +static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata)
> +{
> + unsigned long flags;
> +
> + /* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */
> + wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
> +
> + spin_lock_irqsave(&hwp_notify_lock, flags);
> + if (cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask))
> + cancel_delayed_work(&cpudata->hwp_notify_work);
> + spin_unlock_irqrestore(&hwp_notify_lock, flags);
> +}
> +
> +static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata)
> +{
> + /* Enable HWP notification interrupt for guaranteed performance change */
> + if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) {
> + unsigned long flags;
> +
> + spin_lock_irqsave(&hwp_notify_lock, flags);
> + INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work);
> + cpumask_set_cpu(cpudata->cpu, &hwp_intr_enable_mask);
> + spin_unlock_irqrestore(&hwp_notify_lock, flags);
> +
> + /* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */
> + wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01);
> + }
> +}
> +
> static void intel_pstate_hwp_enable(struct cpudata *cpudata)
> {
> - /* First disable HWP notification interrupt as we don't process them */
> + /* First disable HWP notification interrupt till we activate again */
> if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
> wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
>
> wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
> if (cpudata->epp_default == -EINVAL)
> cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
> +
> + intel_pstate_enable_hwp_interrupt(cpudata);
> }
>
> static int atom_get_min_pstate(void)
> @@ -2266,7 +2366,7 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
> if (!cpu)
> return -ENOMEM;
>
> - all_cpu_data[cpunum] = cpu;
> + WRITE_ONCE(all_cpu_data[cpunum], cpu);
>
> cpu->cpu = cpunum;
>
> @@ -2929,8 +3029,10 @@ static void intel_pstate_driver_cleanup(void)
> if (intel_pstate_driver == &intel_pstate)
> intel_pstate_clear_update_util_hook(cpu);
>
> + spin_lock(&hwp_notify_lock);
> kfree(all_cpu_data[cpu]);
> - all_cpu_data[cpu] = NULL;
> + WRITE_ONCE(all_cpu_data[cpu], NULL);
> + spin_unlock(&hwp_notify_lock);
> }
> }
> cpus_read_unlock();
> @@ -3199,6 +3301,7 @@ static bool intel_pstate_hwp_is_enabled(void)
>
> static int __init intel_pstate_init(void)
> {
> + static struct cpudata **_all_cpu_data;
> const struct x86_cpu_id *id;
> int rc;
>
> @@ -3224,7 +3327,7 @@ static int __init intel_pstate_init(void)
> * deal with it.
> */
> if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || hwp_forced) {
> - hwp_active++;
> + WRITE_ONCE(hwp_active, 1);
> hwp_mode_bdw = id->driver_data;
> intel_pstate.attr = hwp_cpufreq_attrs;
> intel_cpufreq.attr = hwp_cpufreq_attrs;
> @@ -3275,10 +3378,12 @@ static int __init intel_pstate_init(void)
>
> pr_info("Intel P-state driver initializing\n");
>
> - all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
> - if (!all_cpu_data)
> + _all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
> + if (!_all_cpu_data)
> return -ENOMEM;
>
> + WRITE_ONCE(all_cpu_data, _all_cpu_data);
> +
> intel_pstate_request_control_from_smm();
>
> intel_pstate_sysfs_expose_params();
> --
> 2.31.1
>

2021-10-01 20:03:38

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH] cpufreq: intel_pstate: Process HWP Guaranteed change notification

On Fri, Oct 1, 2021 at 9:01 PM Srinivas Pandruvada
<[email protected]> wrote:
>
> On Fri, 2021-10-01 at 20:52 +0200, Rafael J. Wysocki wrote:
> > On Tue, Sep 28, 2021 at 6:42 PM Srinivas Pandruvada
> > <[email protected]> wrote:
> > >
> > > It is possible that HWP guaranteed ratio is changed in response to
> > > change in power and thermal limits. For example when Intel Speed
> > > Select
> > > performance profile is changed or there is change in TDP, hardware
> > > can
> > > send notifications. It is possible that the guaranteed ratio is
> > > increased. This creates an issue when turbo is disabled, as the old
> > > limits set in MSR_HWP_REQUEST are still lower and hardware will
> > > clip
> > > to older limits.
> > >
> > > This change enables HWP interrupt and process HWP interrupts. When
> > > guaranteed is changed, calls cpufreq_update_policy() so that driver
> > > callbacks are called to update to new HWP limits. This callback
> > > is called from a delayed workqueue of 10ms to avoid frequent
> > > updates.
> > >
> > > Although the scope of IA32_HWP_INTERRUPT is per logical cpu, on
> > > some
> > > plaforms interrupt is generated on all CPUs. This is particularly a
> > > problem during initialization, when the driver didn't allocated
> > > data for other CPUs. So this change uses a cpumask of enabled CPUs
> > > and
> > > process interrupts on those CPUs only.
> > >
> > > When the cpufreq offline() or suspend()callback is called, HWP
> > > interrupt
> > > is disabled on those CPUs and also cancels any pending work item.
> > >
> > > Spin lock is used to protect data and processing shared with
> > > interrupt
> > > handler. Here READ_ONCE(), WRITE_ONCE() macros are used to
> > > designate
> > > shared data, even though spin lock act as an optmization barrier
> > > here.
> > >
> > > Signed-off-by: Srinivas Pandruvada <
> > > [email protected]>
> > > ---
> > > This patch is a replacement from the patch submitted to 5.15 and
> > > later
> > > reverted.
> >
> > Applied as 5.16 material, thanks!
> I got email that pablo ([email protected]) tested this. So he can added
> Tested-by also pointed out typo in "optmization".
> Shall I send a patch with this change?

No need, I will update the commit in the tree.

2021-10-01 20:58:02

by Srinivas Pandruvada

[permalink] [raw]
Subject: Re: [PATCH] cpufreq: intel_pstate: Process HWP Guaranteed change notification

On Fri, 2021-10-01 at 20:52 +0200, Rafael J. Wysocki wrote:
> On Tue, Sep 28, 2021 at 6:42 PM Srinivas Pandruvada
> <[email protected]> wrote:
> >
> > It is possible that HWP guaranteed ratio is changed in response to
> > change in power and thermal limits. For example when Intel Speed
> > Select
> > performance profile is changed or there is change in TDP, hardware
> > can
> > send notifications. It is possible that the guaranteed ratio is
> > increased. This creates an issue when turbo is disabled, as the old
> > limits set in MSR_HWP_REQUEST are still lower and hardware will
> > clip
> > to older limits.
> >
> > This change enables HWP interrupt and process HWP interrupts. When
> > guaranteed is changed, calls cpufreq_update_policy() so that driver
> > callbacks are called to update to new HWP limits. This callback
> > is called from a delayed workqueue of 10ms to avoid frequent
> > updates.
> >
> > Although the scope of IA32_HWP_INTERRUPT is per logical cpu, on
> > some
> > plaforms interrupt is generated on all CPUs. This is particularly a
> > problem during initialization, when the driver didn't allocated
> > data for other CPUs. So this change uses a cpumask of enabled CPUs
> > and
> > process interrupts on those CPUs only.
> >
> > When the cpufreq offline() or suspend()callback is called, HWP
> > interrupt
> > is disabled on those CPUs and also cancels any pending work item.
> >
> > Spin lock is used to protect data and processing shared with
> > interrupt
> > handler. Here READ_ONCE(), WRITE_ONCE() macros are used to
> > designate
> > shared data, even though spin lock act as an optmization barrier
> > here.
> >
> > Signed-off-by: Srinivas Pandruvada
> > <[email protected]>
> > ---
> > This patch is a replacement from the patch submitted to 5.15 and
> > later
> > reverted.
>
> Applied as 5.16 material, thanks!
>
> >  drivers/cpufreq/intel_pstate.c | 117
> > +++++++++++++++++++++++++++++++--
> >  1 file changed, 111 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/cpufreq/intel_pstate.c
> > b/drivers/cpufreq/intel_pstate.c
> > index 8c176b7dae41..facc56dd58dd 100644
> > --- a/drivers/cpufreq/intel_pstate.c
> > +++ b/drivers/cpufreq/intel_pstate.c
> > @@ -32,6 +32,7 @@
> >  #include <asm/cpu_device_id.h>
> >  #include <asm/cpufeature.h>
> >  #include <asm/intel-family.h>
> > +#include "../drivers/thermal/intel/thermal_interrupt.h"
> >
> >  #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)
> >
> > @@ -219,6 +220,7 @@ struct global_params {
> >   * @sched_flags:       Store scheduler flags for possible cross
> > CPU update
> >   * @hwp_boost_min:     Last HWP boosted min performance
> >   * @suspended:         Whether or not the driver has been
> > suspended.
> > + * @hwp_notify_work:   workqueue for HWP notifications.
> >   *
> >   * This structure stores per CPU instance data for all CPUs.
> >   */
> > @@ -257,6 +259,7 @@ struct cpudata {
> >         unsigned int sched_flags;
> >         u32 hwp_boost_min;
> >         bool suspended;
> > +       struct delayed_work hwp_notify_work;
> >  };
> >
> >  static struct cpudata **all_cpu_data;
> > @@ -985,11 +988,15 @@ static void intel_pstate_hwp_set(unsigned int
> > cpu)
> >         wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
> >  }
> >
> > +static void intel_pstate_disable_hwp_interrupt(struct cpudata
> > *cpudata);
> > +
> >  static void intel_pstate_hwp_offline(struct cpudata *cpu)
> >  {
> >         u64 value = READ_ONCE(cpu->hwp_req_cached);
> >         int min_perf;
> >
> > +       intel_pstate_disable_hwp_interrupt(cpu);
> > +
> >         if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
> >                 /*
> >                  * In case the EPP has been set to "performance" by
> > the
> > @@ -1053,6 +1060,9 @@ static int intel_pstate_suspend(struct
> > cpufreq_policy *policy)
> >
> >         cpu->suspended = true;
> >
> > +       /* disable HWP interrupt and cancel any pending work */
> > +       intel_pstate_disable_hwp_interrupt(cpu);
> > +
> >         return 0;
> >  }
> >
> > @@ -1546,15 +1556,105 @@ static void
> > intel_pstate_sysfs_hide_hwp_dynamic_boost(void)
> >
> >  /************************** sysfs end ************************/
> >
> > +static void intel_pstate_notify_work(struct work_struct *work)
> > +{
> > +       struct cpudata *cpudata =
> > +               container_of(to_delayed_work(work), struct cpudata,
> > hwp_notify_work);
> > +
> > +       cpufreq_update_policy(cpudata->cpu);
> > +       wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
> > +}
> > +
> > +static DEFINE_SPINLOCK(hwp_notify_lock);
> > +static cpumask_t hwp_intr_enable_mask;
> > +
> > +void notify_hwp_interrupt(void)
> > +{
> > +       unsigned int this_cpu = smp_processor_id();
> > +       struct cpudata *cpudata;
> > +       unsigned long flags;
> > +       u64 value;
> > +
> > +       if (!READ_ONCE(hwp_active) ||
> > !boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
> > +               return;
> > +
> > +       rdmsrl_safe(MSR_HWP_STATUS, &value);
> > +       if (!(value & 0x01))
> > +               return;
> > +
> > +       spin_lock_irqsave(&hwp_notify_lock, flags);
> > +
> > +       if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask))
> > +               goto ack_intr;
> > +
> > +       /*
> > +        * Currently we never free all_cpu_data. And we can't reach
> > here
> > +        * without this allocated. But for safety for future
> > changes, added
> > +        * check.
> > +        */
> > +       if (unlikely(!READ_ONCE(all_cpu_data)))
> > +               goto ack_intr;
> > +
> > +       /*
> > +        * The free is done during cleanup, when cpufreq registry
> > is failed.
> > +        * We wouldn't be here if it fails on init or switch
> > status. But for
> > +        * future changes, added check.
> > +        */
> > +       cpudata = READ_ONCE(all_cpu_data[this_cpu]);
> > +       if (unlikely(!cpudata))
> > +               goto ack_intr;
> > +
> > +       schedule_delayed_work(&cpudata->hwp_notify_work,
> > msecs_to_jiffies(10));
> > +
> > +       spin_unlock_irqrestore(&hwp_notify_lock, flags);
> > +
> > +       return;
> > +
> > +ack_intr:
> > +       wrmsrl_safe(MSR_HWP_STATUS, 0);
> > +       spin_unlock_irqrestore(&hwp_notify_lock, flags);
> > +}
> > +
> > +static void intel_pstate_disable_hwp_interrupt(struct cpudata
> > *cpudata)
> > +{
> > +       unsigned long flags;
> > +
> > +       /* wrmsrl_on_cpu has to be outside spinlock as this can
> > result in IPC */
> > +       wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
> > +
> > +       spin_lock_irqsave(&hwp_notify_lock, flags);
> > +       if (cpumask_test_and_clear_cpu(cpudata->cpu,
> > &hwp_intr_enable_mask))
> > +               cancel_delayed_work(&cpudata->hwp_notify_work);
> > +       spin_unlock_irqrestore(&hwp_notify_lock, flags);
> > +}
> > +
> > +static void intel_pstate_enable_hwp_interrupt(struct cpudata
> > *cpudata)
> > +{
> > +       /* Enable HWP notification interrupt for guaranteed
> > performance change */
> > +       if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) {
> > +               unsigned long flags;
> > +
> > +               spin_lock_irqsave(&hwp_notify_lock, flags);
> > +               INIT_DELAYED_WORK(&cpudata->hwp_notify_work,
> > intel_pstate_notify_work);
> > +               cpumask_set_cpu(cpudata->cpu,
> > &hwp_intr_enable_mask);
> > +               spin_unlock_irqrestore(&hwp_notify_lock, flags);
> > +
> > +               /* wrmsrl_on_cpu has to be outside spinlock as this
> > can result in IPC */
> > +               wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT,
> > 0x01);
> > +       }
> > +}
> > +
> >  static void intel_pstate_hwp_enable(struct cpudata *cpudata)
> >  {
> > -       /* First disable HWP notification interrupt as we don't
> > process them */
> > +       /* First disable HWP notification interrupt till we
> > activate again */
> >         if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
> >                 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT,
> > 0x00);
> >
> >         wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
> >         if (cpudata->epp_default == -EINVAL)
> >                 cpudata->epp_default =
> > intel_pstate_get_epp(cpudata, 0);
> > +
> > +       intel_pstate_enable_hwp_interrupt(cpudata);
> >  }
> >
> >  static int atom_get_min_pstate(void)
> > @@ -2266,7 +2366,7 @@ static int intel_pstate_init_cpu(unsigned int
> > cpunum)
> >                 if (!cpu)
> >                         return -ENOMEM;
> >
> > -               all_cpu_data[cpunum] = cpu;
> > +               WRITE_ONCE(all_cpu_data[cpunum], cpu);
> >
> >                 cpu->cpu = cpunum;
> >
> > @@ -2929,8 +3029,10 @@ static void
> > intel_pstate_driver_cleanup(void)
> >                         if (intel_pstate_driver == &intel_pstate)
> >                                
> > intel_pstate_clear_update_util_hook(cpu);
> >
> > +                       spin_lock(&hwp_notify_lock);
> >                         kfree(all_cpu_data[cpu]);
> > -                       all_cpu_data[cpu] = NULL;
> > +                       WRITE_ONCE(all_cpu_data[cpu], NULL);
> > +                       spin_unlock(&hwp_notify_lock);
> >                 }
> >         }
> >         cpus_read_unlock();
> > @@ -3199,6 +3301,7 @@ static bool intel_pstate_hwp_is_enabled(void)
> >
> >  static int __init intel_pstate_init(void)
> >  {
> > +       static struct cpudata **_all_cpu_data;
> >         const struct x86_cpu_id *id;
> >         int rc;
> >
> > @@ -3224,7 +3327,7 @@ static int __init intel_pstate_init(void)
> >                  * deal with it.
> >                  */
> >                 if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP))
> > || hwp_forced) {
> > -                       hwp_active++;
> > +                       WRITE_ONCE(hwp_active, 1);
> >                         hwp_mode_bdw = id->driver_data;
> >                         intel_pstate.attr = hwp_cpufreq_attrs;
> >                         intel_cpufreq.attr = hwp_cpufreq_attrs;
> > @@ -3275,10 +3378,12 @@ static int __init intel_pstate_init(void)
> >
> >         pr_info("Intel P-state driver initializing\n");
> >
> > -       all_cpu_data = vzalloc(array_size(sizeof(void *),
> > num_possible_cpus()));
> > -       if (!all_cpu_data)
> > +       _all_cpu_data = vzalloc(array_size(sizeof(void *),
> > num_possible_cpus()));
> > +       if (!_all_cpu_data)
> >                 return -ENOMEM;
> >
> > +       WRITE_ONCE(all_cpu_data, _all_cpu_data);
> > +
> >         intel_pstate_request_control_from_smm();
> >
> >         intel_pstate_sysfs_expose_params();
> > --
> > 2.31.1
> >


2021-10-01 20:59:18

by Srinivas Pandruvada

[permalink] [raw]
Subject: Re: [PATCH] cpufreq: intel_pstate: Process HWP Guaranteed change notification

On Fri, 2021-10-01 at 20:52 +0200, Rafael J. Wysocki wrote:
> On Tue, Sep 28, 2021 at 6:42 PM Srinivas Pandruvada
> <[email protected]> wrote:
> >
> > It is possible that HWP guaranteed ratio is changed in response to
> > change in power and thermal limits. For example when Intel Speed
> > Select
> > performance profile is changed or there is change in TDP, hardware
> > can
> > send notifications. It is possible that the guaranteed ratio is
> > increased. This creates an issue when turbo is disabled, as the old
> > limits set in MSR_HWP_REQUEST are still lower and hardware will
> > clip
> > to older limits.
> >
> > This change enables HWP interrupt and process HWP interrupts. When
> > guaranteed is changed, calls cpufreq_update_policy() so that driver
> > callbacks are called to update to new HWP limits. This callback
> > is called from a delayed workqueue of 10ms to avoid frequent
> > updates.
> >
> > Although the scope of IA32_HWP_INTERRUPT is per logical cpu, on
> > some
> > plaforms interrupt is generated on all CPUs. This is particularly a
> > problem during initialization, when the driver didn't allocated
> > data for other CPUs. So this change uses a cpumask of enabled CPUs
> > and
> > process interrupts on those CPUs only.
> >
> > When the cpufreq offline() or suspend()callback is called, HWP
> > interrupt
> > is disabled on those CPUs and also cancels any pending work item.
> >
> > Spin lock is used to protect data and processing shared with
> > interrupt
> > handler. Here READ_ONCE(), WRITE_ONCE() macros are used to
> > designate
> > shared data, even though spin lock act as an optmization barrier
> > here.
> >
> > Signed-off-by: Srinivas Pandruvada <
> > [email protected]>
> > ---
> > This patch is a replacement from the patch submitted to 5.15 and
> > later
> > reverted.
>
> Applied as 5.16 material, thanks!
I got email that pablo ([email protected]) tested this. So he can added
Tested-by also pointed out typo in "optmization".
Shall I send a patch with this change?

Thanks,
Srinivas


>
> >  drivers/cpufreq/intel_pstate.c | 117
> > +++++++++++++++++++++++++++++++--
> >  1 file changed, 111 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/cpufreq/intel_pstate.c
> > b/drivers/cpufreq/intel_pstate.c
> > index 8c176b7dae41..facc56dd58dd 100644
> > --- a/drivers/cpufreq/intel_pstate.c
> > +++ b/drivers/cpufreq/intel_pstate.c
> > @@ -32,6 +32,7 @@
> >  #include <asm/cpu_device_id.h>
> >  #include <asm/cpufeature.h>
> >  #include <asm/intel-family.h>
> > +#include "../drivers/thermal/intel/thermal_interrupt.h"
> >
> >  #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)
> >
> > @@ -219,6 +220,7 @@ struct global_params {
> >   * @sched_flags:       Store scheduler flags for possible cross CPU
> > update
> >   * @hwp_boost_min:     Last HWP boosted min performance
> >   * @suspended:         Whether or not the driver has been suspended.
> > + * @hwp_notify_work:   workqueue for HWP notifications.
> >   *
> >   * This structure stores per CPU instance data for all CPUs.
> >   */
> > @@ -257,6 +259,7 @@ struct cpudata {
> >         unsigned int sched_flags;
> >         u32 hwp_boost_min;
> >         bool suspended;
> > +       struct delayed_work hwp_notify_work;
> >  };
> >
> >  static struct cpudata **all_cpu_data;
> > @@ -985,11 +988,15 @@ static void intel_pstate_hwp_set(unsigned int
> > cpu)
> >         wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
> >  }
> >
> > +static void intel_pstate_disable_hwp_interrupt(struct cpudata
> > *cpudata);
> > +
> >  static void intel_pstate_hwp_offline(struct cpudata *cpu)
> >  {
> >         u64 value = READ_ONCE(cpu->hwp_req_cached);
> >         int min_perf;
> >
> > +       intel_pstate_disable_hwp_interrupt(cpu);
> > +
> >         if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
> >                 /*
> >                  * In case the EPP has been set to "performance" by
> > the
> > @@ -1053,6 +1060,9 @@ static int intel_pstate_suspend(struct
> > cpufreq_policy *policy)
> >
> >         cpu->suspended = true;
> >
> > +       /* disable HWP interrupt and cancel any pending work */
> > +       intel_pstate_disable_hwp_interrupt(cpu);
> > +
> >         return 0;
> >  }
> >
> > @@ -1546,15 +1556,105 @@ static void
> > intel_pstate_sysfs_hide_hwp_dynamic_boost(void)
> >
> >  /************************** sysfs end ************************/
> >
> > +static void intel_pstate_notify_work(struct work_struct *work)
> > +{
> > +       struct cpudata *cpudata =
> > +               container_of(to_delayed_work(work), struct cpudata,
> > hwp_notify_work);
> > +
> > +       cpufreq_update_policy(cpudata->cpu);
> > +       wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
> > +}
> > +
> > +static DEFINE_SPINLOCK(hwp_notify_lock);
> > +static cpumask_t hwp_intr_enable_mask;
> > +
> > +void notify_hwp_interrupt(void)
> > +{
> > +       unsigned int this_cpu = smp_processor_id();
> > +       struct cpudata *cpudata;
> > +       unsigned long flags;
> > +       u64 value;
> > +
> > +       if (!READ_ONCE(hwp_active) ||
> > !boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
> > +               return;
> > +
> > +       rdmsrl_safe(MSR_HWP_STATUS, &value);
> > +       if (!(value & 0x01))
> > +               return;
> > +
> > +       spin_lock_irqsave(&hwp_notify_lock, flags);
> > +
> > +       if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask))
> > +               goto ack_intr;
> > +
> > +       /*
> > +        * Currently we never free all_cpu_data. And we can't reach
> > here
> > +        * without this allocated. But for safety for future changes,
> > added
> > +        * check.
> > +        */
> > +       if (unlikely(!READ_ONCE(all_cpu_data)))
> > +               goto ack_intr;
> > +
> > +       /*
> > +        * The free is done during cleanup, when cpufreq registry is
> > failed.
> > +        * We wouldn't be here if it fails on init or switch status.
> > But for
> > +        * future changes, added check.
> > +        */
> > +       cpudata = READ_ONCE(all_cpu_data[this_cpu]);
> > +       if (unlikely(!cpudata))
> > +               goto ack_intr;
> > +
> > +       schedule_delayed_work(&cpudata->hwp_notify_work,
> > msecs_to_jiffies(10));
> > +
> > +       spin_unlock_irqrestore(&hwp_notify_lock, flags);
> > +
> > +       return;
> > +
> > +ack_intr:
> > +       wrmsrl_safe(MSR_HWP_STATUS, 0);
> > +       spin_unlock_irqrestore(&hwp_notify_lock, flags);
> > +}
> > +
> > +static void intel_pstate_disable_hwp_interrupt(struct cpudata
> > *cpudata)
> > +{
> > +       unsigned long flags;
> > +
> > +       /* wrmsrl_on_cpu has to be outside spinlock as this can
> > result in IPC */
> > +       wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
> > +
> > +       spin_lock_irqsave(&hwp_notify_lock, flags);
> > +       if (cpumask_test_and_clear_cpu(cpudata->cpu,
> > &hwp_intr_enable_mask))
> > +               cancel_delayed_work(&cpudata->hwp_notify_work);
> > +       spin_unlock_irqrestore(&hwp_notify_lock, flags);
> > +}
> > +
> > +static void intel_pstate_enable_hwp_interrupt(struct cpudata
> > *cpudata)
> > +{
> > +       /* Enable HWP notification interrupt for guaranteed
> > performance change */
> > +       if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) {
> > +               unsigned long flags;
> > +
> > +               spin_lock_irqsave(&hwp_notify_lock, flags);
> > +               INIT_DELAYED_WORK(&cpudata->hwp_notify_work,
> > intel_pstate_notify_work);
> > +               cpumask_set_cpu(cpudata->cpu, &hwp_intr_enable_mask);
> > +               spin_unlock_irqrestore(&hwp_notify_lock, flags);
> > +
> > +               /* wrmsrl_on_cpu has to be outside spinlock as this
> > can result in IPC */
> > +               wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01);
> > +       }
> > +}
> > +
> >  static void intel_pstate_hwp_enable(struct cpudata *cpudata)
> >  {
> > -       /* First disable HWP notification interrupt as we don't
> > process them */
> > +       /* First disable HWP notification interrupt till we activate
> > again */
> >         if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
> >                 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
> >
> >         wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
> >         if (cpudata->epp_default == -EINVAL)
> >                 cpudata->epp_default = intel_pstate_get_epp(cpudata,
> > 0);
> > +
> > +       intel_pstate_enable_hwp_interrupt(cpudata);
> >  }
> >
> >  static int atom_get_min_pstate(void)
> > @@ -2266,7 +2366,7 @@ static int intel_pstate_init_cpu(unsigned int
> > cpunum)
> >                 if (!cpu)
> >                         return -ENOMEM;
> >
> > -               all_cpu_data[cpunum] = cpu;
> > +               WRITE_ONCE(all_cpu_data[cpunum], cpu);
> >
> >                 cpu->cpu = cpunum;
> >
> > @@ -2929,8 +3029,10 @@ static void intel_pstate_driver_cleanup(void)
> >                         if (intel_pstate_driver == &intel_pstate)
> >                                
> > intel_pstate_clear_update_util_hook(cpu);
> >
> > +                       spin_lock(&hwp_notify_lock);
> >                         kfree(all_cpu_data[cpu]);
> > -                       all_cpu_data[cpu] = NULL;
> > +                       WRITE_ONCE(all_cpu_data[cpu], NULL);
> > +                       spin_unlock(&hwp_notify_lock);
> >                 }
> >         }
> >         cpus_read_unlock();
> > @@ -3199,6 +3301,7 @@ static bool intel_pstate_hwp_is_enabled(void)
> >
> >  static int __init intel_pstate_init(void)
> >  {
> > +       static struct cpudata **_all_cpu_data;
> >         const struct x86_cpu_id *id;
> >         int rc;
> >
> > @@ -3224,7 +3327,7 @@ static int __init intel_pstate_init(void)
> >                  * deal with it.
> >                  */
> >                 if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) ||
> > hwp_forced) {
> > -                       hwp_active++;
> > +                       WRITE_ONCE(hwp_active, 1);
> >                         hwp_mode_bdw = id->driver_data;
> >                         intel_pstate.attr = hwp_cpufreq_attrs;
> >                         intel_cpufreq.attr = hwp_cpufreq_attrs;
> > @@ -3275,10 +3378,12 @@ static int __init intel_pstate_init(void)
> >
> >         pr_info("Intel P-state driver initializing\n");
> >
> > -       all_cpu_data = vzalloc(array_size(sizeof(void *),
> > num_possible_cpus()));
> > -       if (!all_cpu_data)
> > +       _all_cpu_data = vzalloc(array_size(sizeof(void *),
> > num_possible_cpus()));
> > +       if (!_all_cpu_data)
> >                 return -ENOMEM;
> >
> > +       WRITE_ONCE(all_cpu_data, _all_cpu_data);
> > +
> >         intel_pstate_request_control_from_smm();
> >
> >         intel_pstate_sysfs_expose_params();
> > --
> > 2.31.1
> >