2022-06-10 10:26:03

by Lukasz Luba

[permalink] [raw]
Subject: [PATCH 1/3] thermal: cpufreq_cooling: Use a copy of local ops for each cooling device

It is very unlikely that one CPU cluster would have the EM and some other
won't have it (because EM registration failed or DT lacks needed entry).
Although, we should avoid modifying global variable with callbacks anyway.
Redesign this and add safety for such situation.

Signed-off-by: Lukasz Luba <[email protected]>
---
drivers/thermal/cpufreq_cooling.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index b8151d95a806..e33183785fac 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -554,7 +554,12 @@ __cpufreq_cooling_register(struct device_node *np,
/* max_level is an index, not a counter */
cpufreq_cdev->max_level = i - 1;

- cooling_ops = &cpufreq_cooling_ops;
+ cooling_ops = kmemdup(&cpufreq_cooling_ops, sizeof(*cooling_ops),
+ GFP_KERNEL);
+ if (!cooling_ops) {
+ cdev = ERR_PTR(-ENOMEM);
+ goto free_idle_time;
+ }

#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
if (em_is_sane(cpufreq_cdev, em)) {
@@ -568,7 +573,7 @@ __cpufreq_cooling_register(struct device_node *np,
pr_err("%s: unsorted frequency tables are not supported\n",
__func__);
cdev = ERR_PTR(-EINVAL);
- goto free_idle_time;
+ goto free_cooling_ops;
}

ret = freq_qos_add_request(&policy->constraints,
@@ -578,7 +583,7 @@ __cpufreq_cooling_register(struct device_node *np,
pr_err("%s: Failed to add freq constraint (%d)\n", __func__,
ret);
cdev = ERR_PTR(ret);
- goto free_idle_time;
+ goto free_cooling_ops;
}

cdev = ERR_PTR(-ENOMEM);
@@ -597,6 +602,8 @@ __cpufreq_cooling_register(struct device_node *np,

remove_qos_req:
freq_qos_remove_request(&cpufreq_cdev->qos_req);
+free_cooling_ops:
+ kfree(cooling_ops);
free_idle_time:
free_idle_time(cpufreq_cdev);
free_cdev:
@@ -677,16 +684,19 @@ EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
*/
void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
{
+ const struct thermal_cooling_device_ops *ops;
struct cpufreq_cooling_device *cpufreq_cdev;

if (!cdev)
return;

cpufreq_cdev = cdev->devdata;
+ ops = cdev->ops;

thermal_cooling_device_unregister(cdev);
freq_qos_remove_request(&cpufreq_cdev->qos_req);
free_idle_time(cpufreq_cdev);
kfree(cpufreq_cdev);
+ kfree(ops);
}
EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
--
2.17.1


2022-06-10 10:31:25

by Lukasz Luba

[permalink] [raw]
Subject: [PATCH 2/3] thermal: cpufreq_cooling : Refactor thermal_power_cpu_get_power

Simplify the thermal_power_cpu_get_power trace event by removing
complicated cpumask and variable length array. Now the tools parsing trace
output don't have to hassle to get this power data. The simplified format
version uses 'policy->cpu'. Remove also the 'load' information completely
since there is very little value of it in this trace event. To get the
CPUs' load (or utilization) there are other dedicated trace hooks in the
kernel. This patch also simplifies and speeds-up the main cooling code
when that trace event is enabled.

Rename the trace event to avoid confusion of tools which parse the trace
file.

Signed-off-by: Lukasz Luba <[email protected]>
---
drivers/thermal/cpufreq_cooling.c | 18 +-----------------
include/trace/events/thermal.h | 28 ++++++++--------------------
2 files changed, 9 insertions(+), 37 deletions(-)

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index e33183785fac..2ef1dc2b96a3 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -214,16 +214,9 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
u32 total_load = 0;
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
struct cpufreq_policy *policy = cpufreq_cdev->policy;
- u32 *load_cpu = NULL;

freq = cpufreq_quick_get(policy->cpu);

- if (trace_thermal_power_cpu_get_power_enabled()) {
- u32 ncpus = cpumask_weight(policy->related_cpus);
-
- load_cpu = kcalloc(ncpus, sizeof(*load_cpu), GFP_KERNEL);
- }
-
for_each_cpu(cpu, policy->related_cpus) {
u32 load;

@@ -233,22 +226,13 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
load = 0;

total_load += load;
- if (load_cpu)
- load_cpu[i] = load;
-
- i++;
}

cpufreq_cdev->last_load = total_load;

*power = get_dynamic_power(cpufreq_cdev, freq);

- if (load_cpu) {
- trace_thermal_power_cpu_get_power(policy->related_cpus, freq,
- load_cpu, i, *power);
-
- kfree(load_cpu);
- }
+ trace_thermal_power_cpu_get_power_simple(policy->cpu, *power);

return 0;
}
diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h
index 8a5f04888abd..e58bf3072f32 100644
--- a/include/trace/events/thermal.h
+++ b/include/trace/events/thermal.h
@@ -92,34 +92,22 @@ TRACE_EVENT(thermal_zone_trip,
);

#ifdef CONFIG_CPU_THERMAL
-TRACE_EVENT(thermal_power_cpu_get_power,
- TP_PROTO(const struct cpumask *cpus, unsigned long freq, u32 *load,
- size_t load_len, u32 dynamic_power),
+TRACE_EVENT(thermal_power_cpu_get_power_simple,
+ TP_PROTO(int cpu, u32 power),

- TP_ARGS(cpus, freq, load, load_len, dynamic_power),
+ TP_ARGS(cpu, power),

TP_STRUCT__entry(
- __bitmask(cpumask, num_possible_cpus())
- __field(unsigned long, freq )
- __dynamic_array(u32, load, load_len)
- __field(size_t, load_len )
- __field(u32, dynamic_power )
+ __field(int, cpu)
+ __field(u32, power)
),

TP_fast_assign(
- __assign_bitmask(cpumask, cpumask_bits(cpus),
- num_possible_cpus());
- __entry->freq = freq;
- memcpy(__get_dynamic_array(load), load,
- load_len * sizeof(*load));
- __entry->load_len = load_len;
- __entry->dynamic_power = dynamic_power;
+ __entry->cpu = cpu;
+ __entry->power = power;
),

- TP_printk("cpus=%s freq=%lu load={%s} dynamic_power=%d",
- __get_bitmask(cpumask), __entry->freq,
- __print_array(__get_dynamic_array(load), __entry->load_len, 4),
- __entry->dynamic_power)
+ TP_printk("cpu=%d power=%u", __entry->cpu, __entry->power)
);

TRACE_EVENT(thermal_power_cpu_limit,
--
2.17.1

2022-06-10 10:32:06

by Lukasz Luba

[permalink] [raw]
Subject: [PATCH 3/3] thermal: cpufreq_cooling: Update outdated comments

The code has moved and left some comments stale. Update them where
there is a need.

Signed-off-by: Lukasz Luba <[email protected]>
---
drivers/thermal/cpufreq_cooling.c | 44 +++++++++++++------------------
1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index 2ef1dc2b96a3..e72ff85b9d7e 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -204,7 +204,7 @@ static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev,
* complex code may be needed if experiments show that it's not
* accurate enough.
*
- * Return: 0 on success, -E* if getting the static power failed.
+ * Return: 0 on success, this function doesn't fail.
*/
static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
u32 *power)
@@ -247,9 +247,8 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
* milliwatts assuming 100% load. Store the calculated power in
* @power.
*
- * Return: 0 on success, -EINVAL if the cooling device state could not
- * be converted into a frequency or other -E* if there was an error
- * when calculating the static power.
+ * Return: 0 on success, -EINVAL if the cooling device state is bigger
+ * than maximum allowed.
*/
static int cpufreq_state2power(struct thermal_cooling_device *cdev,
unsigned long state, u32 *power)
@@ -279,15 +278,11 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
* Calculate a cooling device state for the cpus described by @cdev
* that would allow them to consume at most @power mW and store it in
* @state. Note that this calculation depends on external factors
- * such as the cpu load or the current static power. Calling this
- * function with the same power as input can yield different cooling
- * device states depending on those external factors.
- *
- * Return: 0 on success, -ENODEV if no cpus are online or -EINVAL if
- * the calculated frequency could not be converted to a valid state.
- * The latter should not happen unless the frequencies available to
- * cpufreq have changed since the initialization of the cpu cooling
- * device.
+ * such as the CPUs load. Calling this function with the same power
+ * as input can yield different cooling device states depending on those
+ * external factors.
+ *
+ * Return: 0 on success, this function doesn't fail.
*/
static int cpufreq_power2state(struct thermal_cooling_device *cdev,
u32 power, unsigned long *state)
@@ -399,7 +394,7 @@ static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
* Callback for the thermal cooling device to return the cpufreq
* max cooling state.
*
- * Return: 0 on success, an error code otherwise.
+ * Return: 0 on success, this function doesn't fail.
*/
static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
unsigned long *state)
@@ -418,7 +413,7 @@ static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
* Callback for the thermal cooling device to return the cpufreq
* current cooling state.
*
- * Return: 0 on success, an error code otherwise.
+ * Return: 0 on success, this function doesn't fail.
*/
static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
unsigned long *state)
@@ -485,7 +480,7 @@ static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
* @em: Energy Model of the cpufreq policy
*
* This interface function registers the cpufreq cooling device with the name
- * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
+ * "cpufreq-%s". This API can support multiple instances of cpufreq
* cooling devices. It also gives the opportunity to link the cooling device
* with a device tree node, in order to bind it via the thermal DT code.
*
@@ -600,8 +595,8 @@ __cpufreq_cooling_register(struct device_node *np,
* @policy: cpufreq policy
*
* This interface function registers the cpufreq cooling device with the name
- * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
- * cooling devices.
+ * "cpufreq-%s". This API can support multiple instances of cpufreq cooling
+ * devices.
*
* Return: a valid struct thermal_cooling_device pointer on success,
* on failure, it returns a corresponding ERR_PTR().
@@ -618,17 +613,14 @@ EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
* @policy: cpufreq policy
*
* This interface function registers the cpufreq cooling device with the name
- * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
- * cooling devices. Using this API, the cpufreq cooling device will be
- * linked to the device tree node provided.
+ * "cpufreq-%s". This API can support multiple instances of cpufreq cooling
+ * devices. Using this API, the cpufreq cooling device will be linked to the
+ * device tree node provided.
*
* Using this function, the cooling device will implement the power
- * extensions by using a simple cpu power model. The cpus must have
+ * extensions by using the Energy Model (if present). The cpus must have
* registered their OPPs using the OPP library.
*
- * It also takes into account, if property present in policy CPU node, the
- * static power consumed by the cpu.
- *
* Return: a valid struct thermal_cooling_device pointer on success,
* and NULL on failure.
*/
@@ -664,7 +656,7 @@ EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
* cpufreq_cooling_unregister - function to remove cpufreq cooling device.
* @cdev: thermal cooling device pointer.
*
- * This interface function unregisters the "thermal-cpufreq-%x" cooling device.
+ * This interface function unregisters the "cpufreq-%x" cooling device.
*/
void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
{
--
2.17.1

2022-06-13 09:25:59

by Viresh Kumar

[permalink] [raw]
Subject: Re: [PATCH 1/3] thermal: cpufreq_cooling: Use a copy of local ops for each cooling device

On 10-06-22, 11:03, Lukasz Luba wrote:
> It is very unlikely that one CPU cluster would have the EM and some other
> won't have it (because EM registration failed or DT lacks needed entry).
> Although, we should avoid modifying global variable with callbacks anyway.
> Redesign this and add safety for such situation.
>
> Signed-off-by: Lukasz Luba <[email protected]>
> ---
> drivers/thermal/cpufreq_cooling.c | 16 +++++++++++++---
> 1 file changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
> index b8151d95a806..e33183785fac 100644
> --- a/drivers/thermal/cpufreq_cooling.c
> +++ b/drivers/thermal/cpufreq_cooling.c
> @@ -554,7 +554,12 @@ __cpufreq_cooling_register(struct device_node *np,
> /* max_level is an index, not a counter */
> cpufreq_cdev->max_level = i - 1;
>
> - cooling_ops = &cpufreq_cooling_ops;
> + cooling_ops = kmemdup(&cpufreq_cooling_ops, sizeof(*cooling_ops),
> + GFP_KERNEL);

I don't like the way we are duplicating the ops here. Instead of this it would
be better to add the OPs field in the cooling device structure and fill its
fields from here. The ops structure will be allocated with the cooling device
itself.

--
viresh

2022-06-13 09:48:23

by Viresh Kumar

[permalink] [raw]
Subject: Re: [PATCH 3/3] thermal: cpufreq_cooling: Update outdated comments

On 10-06-22, 11:03, Lukasz Luba wrote:
> The code has moved and left some comments stale. Update them where
> there is a need.
>
> Signed-off-by: Lukasz Luba <[email protected]>
> ---
> drivers/thermal/cpufreq_cooling.c | 44 +++++++++++++------------------
> 1 file changed, 18 insertions(+), 26 deletions(-)

Acked-by: Viresh Kumar <[email protected]>

--
viresh

2022-06-13 10:03:22

by Viresh Kumar

[permalink] [raw]
Subject: Re: [PATCH 2/3] thermal: cpufreq_cooling : Refactor thermal_power_cpu_get_power

On 10-06-22, 11:03, Lukasz Luba wrote:
> Simplify the thermal_power_cpu_get_power trace event by removing
> complicated cpumask and variable length array. Now the tools parsing trace
> output don't have to hassle to get this power data. The simplified format
> version uses 'policy->cpu'. Remove also the 'load' information completely
> since there is very little value of it in this trace event. To get the
> CPUs' load (or utilization) there are other dedicated trace hooks in the
> kernel. This patch also simplifies and speeds-up the main cooling code
> when that trace event is enabled.
>
> Rename the trace event to avoid confusion of tools which parse the trace
> file.
>
> Signed-off-by: Lukasz Luba <[email protected]>
> ---
> drivers/thermal/cpufreq_cooling.c | 18 +-----------------
> include/trace/events/thermal.h | 28 ++++++++--------------------
> 2 files changed, 9 insertions(+), 37 deletions(-)

Acked-by: Viresh Kumar <[email protected]>

--
viresh

2022-06-13 12:53:08

by Lukasz Luba

[permalink] [raw]
Subject: Re: [PATCH 1/3] thermal: cpufreq_cooling: Use a copy of local ops for each cooling device

Hi Viresh,

Thank you for the ACKs in the other patches and suggestion in this one.

On 6/13/22 10:16, Viresh Kumar wrote:
> On 10-06-22, 11:03, Lukasz Luba wrote:
>> It is very unlikely that one CPU cluster would have the EM and some other
>> won't have it (because EM registration failed or DT lacks needed entry).
>> Although, we should avoid modifying global variable with callbacks anyway.
>> Redesign this and add safety for such situation.
>>
>> Signed-off-by: Lukasz Luba <[email protected]>
>> ---
>> drivers/thermal/cpufreq_cooling.c | 16 +++++++++++++---
>> 1 file changed, 13 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
>> index b8151d95a806..e33183785fac 100644
>> --- a/drivers/thermal/cpufreq_cooling.c
>> +++ b/drivers/thermal/cpufreq_cooling.c
>> @@ -554,7 +554,12 @@ __cpufreq_cooling_register(struct device_node *np,
>> /* max_level is an index, not a counter */
>> cpufreq_cdev->max_level = i - 1;
>>
>> - cooling_ops = &cpufreq_cooling_ops;
>> + cooling_ops = kmemdup(&cpufreq_cooling_ops, sizeof(*cooling_ops),
>> + GFP_KERNEL);
>
> I don't like the way we are duplicating the ops here. Instead of this it would
> be better to add the OPs field in the cooling device structure and fill its
> fields from here. The ops structure will be allocated with the cooling device
> itself.
>

I think I know what you mean. Make sense. There are quite a few
different cooling types of devices which are using the API
thermal_of_cooling_device_register() with the custom 'ops'. We
probably don't want to disturb that well working drivers and ecosystem.

Here, I've tried to align this code with the devfreq_cooling, but I
might actually apply your suggestion into the devfreq cooling (so they
will be still aligned). In that case both struct devfreq_cooling_device
and struct cpufreq_cooling_device would get a new field:
struct thermal_cooling_device_ops cooling_ops;
We could then remove the 'global' variables:
devfreq_cooling_ops and cpufreq_cooling_ops from where we copy.
Then we would do the needed assignment to the priv 'cooling_ops' in the
setup code and just use the old API
thermal_of_cooling_device_register() to set the needed 'ops' pointer in
the struct thermal_cooling_device.

Does this sound OK?

Regards,
Lukasz

2022-06-13 12:57:56

by Viresh Kumar

[permalink] [raw]
Subject: Re: [PATCH 1/3] thermal: cpufreq_cooling: Use a copy of local ops for each cooling device

On 13-06-22, 11:37, Lukasz Luba wrote:
> Hi Viresh,
>
> Thank you for the ACKs in the other patches and suggestion in this one.
>
> On 6/13/22 10:16, Viresh Kumar wrote:
> > On 10-06-22, 11:03, Lukasz Luba wrote:
> > > It is very unlikely that one CPU cluster would have the EM and some other
> > > won't have it (because EM registration failed or DT lacks needed entry).
> > > Although, we should avoid modifying global variable with callbacks anyway.
> > > Redesign this and add safety for such situation.
> > >
> > > Signed-off-by: Lukasz Luba <[email protected]>
> > > ---
> > > drivers/thermal/cpufreq_cooling.c | 16 +++++++++++++---
> > > 1 file changed, 13 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
> > > index b8151d95a806..e33183785fac 100644
> > > --- a/drivers/thermal/cpufreq_cooling.c
> > > +++ b/drivers/thermal/cpufreq_cooling.c
> > > @@ -554,7 +554,12 @@ __cpufreq_cooling_register(struct device_node *np,
> > > /* max_level is an index, not a counter */
> > > cpufreq_cdev->max_level = i - 1;
> > > - cooling_ops = &cpufreq_cooling_ops;
> > > + cooling_ops = kmemdup(&cpufreq_cooling_ops, sizeof(*cooling_ops),
> > > + GFP_KERNEL);
> >
> > I don't like the way we are duplicating the ops here. Instead of this it would
> > be better to add the OPs field in the cooling device structure and fill its
> > fields from here. The ops structure will be allocated with the cooling device
> > itself.
> >
>
> I think I know what you mean. Make sense. There are quite a few
> different cooling types of devices which are using the API
> thermal_of_cooling_device_register() with the custom 'ops'. We
> probably don't want to disturb that well working drivers and ecosystem.

I was just suggesting to update "struct cpufreq_cooling_device" :)

This is what I was, wrongly, referring to as cooling device.

I should have written the exact structure name instead, my bad.

--
viresh

2022-06-13 15:41:08

by Lukasz Luba

[permalink] [raw]
Subject: Re: [PATCH 1/3] thermal: cpufreq_cooling: Use a copy of local ops for each cooling device



On 6/13/22 11:53, Viresh Kumar wrote:
> On 13-06-22, 11:37, Lukasz Luba wrote:
>> Hi Viresh,
>>
>> Thank you for the ACKs in the other patches and suggestion in this one.
>>
>> On 6/13/22 10:16, Viresh Kumar wrote:
>>> On 10-06-22, 11:03, Lukasz Luba wrote:
>>>> It is very unlikely that one CPU cluster would have the EM and some other
>>>> won't have it (because EM registration failed or DT lacks needed entry).
>>>> Although, we should avoid modifying global variable with callbacks anyway.
>>>> Redesign this and add safety for such situation.
>>>>
>>>> Signed-off-by: Lukasz Luba <[email protected]>
>>>> ---
>>>> drivers/thermal/cpufreq_cooling.c | 16 +++++++++++++---
>>>> 1 file changed, 13 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
>>>> index b8151d95a806..e33183785fac 100644
>>>> --- a/drivers/thermal/cpufreq_cooling.c
>>>> +++ b/drivers/thermal/cpufreq_cooling.c
>>>> @@ -554,7 +554,12 @@ __cpufreq_cooling_register(struct device_node *np,
>>>> /* max_level is an index, not a counter */
>>>> cpufreq_cdev->max_level = i - 1;
>>>> - cooling_ops = &cpufreq_cooling_ops;
>>>> + cooling_ops = kmemdup(&cpufreq_cooling_ops, sizeof(*cooling_ops),
>>>> + GFP_KERNEL);
>>>
>>> I don't like the way we are duplicating the ops here. Instead of this it would
>>> be better to add the OPs field in the cooling device structure and fill its
>>> fields from here. The ops structure will be allocated with the cooling device
>>> itself.
>>>
>>
>> I think I know what you mean. Make sense. There are quite a few
>> different cooling types of devices which are using the API
>> thermal_of_cooling_device_register() with the custom 'ops'. We
>> probably don't want to disturb that well working drivers and ecosystem.
>
> I was just suggesting to update "struct cpufreq_cooling_device" :)
>
> This is what I was, wrongly, referring to as cooling device.
>
> I should have written the exact structure name instead, my bad.
>

No worries. Thanks, I'll send a v2 with these changes.