2023-05-19 03:28:45

by Eduardo Valentin

[permalink] [raw]
Subject: [PATCH 2/7] thermal: stats: track number of change requests due to tz

From: Eduardo Valentin <[email protected]>

This patch improves the current cooling device
statistics by adding a new file under
cdev/stats/requests_of_thermal_zone

to represent the number of times each thermal zone
requested the cooling device to effectively change.
If the request associated was not serviced because
another thermal zone asked for a higher cooling level,
this counter does not increase.

The file format is:
thermal_zone: <type> <count>

Samples:
$ cat cdev0/stats/requests_of_thermal_zone
thermal_zone: amb0 2

In this example, it means the thermal zone 'amb0' has requested
2 times for cdev0 to change state.

Cc: "Rafael J. Wysocki" <[email protected]> (supporter:THERMAL)
Cc: Daniel Lezcano <[email protected]> (supporter:THERMAL)
Cc: Amit Kucheria <[email protected]> (reviewer:THERMAL)
Cc: Zhang Rui <[email protected]> (reviewer:THERMAL)
Cc: Jonathan Corbet <[email protected]> (maintainer:DOCUMENTATION)
Cc: [email protected] (open list:THERMAL)
Cc: [email protected] (open list:DOCUMENTATION)
Cc: [email protected] (open list)

Signed-off-by: Eduardo Valentin <[email protected]>
---
.../driver-api/thermal/sysfs-api.rst | 2 +
drivers/thermal/thermal_core.h | 1 +
drivers/thermal/thermal_sysfs.c | 52 +++++++++++++++++++
3 files changed, 55 insertions(+)

diff --git a/Documentation/driver-api/thermal/sysfs-api.rst b/Documentation/driver-api/thermal/sysfs-api.rst
index caa50d61a5bc..75309a51d9b3 100644
--- a/Documentation/driver-api/thermal/sysfs-api.rst
+++ b/Documentation/driver-api/thermal/sysfs-api.rst
@@ -369,6 +369,8 @@ Thermal cooling device sys I/F, created once it's registered::
|---stats/trans_table: Cooling state transition table
|---stats/time_in_thermal_zone_ms: Time that this cooling device was driven
each associated thermal zone.
+ |---stats/requests_of_thermal_zone: Total number of times this cooling device
+ changed due to each associated thermal zone.


Then next two dynamic attributes are created/removed in pairs. They represent
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 3cce60c6e065..ed6511c3b794 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -103,6 +103,7 @@ struct thermal_instance {
unsigned int weight; /* The weight of the cooling device */
bool upper_no_limit;
#if IS_ENABLED(CONFIG_THERMAL_STATISTICS)
+ unsigned long total_requests;
ktime_t time_in; /* time spent in this instance */
#endif
};
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index a3b71f03db75..0bce1415f7e8 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -723,6 +723,7 @@ void thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev,
stats->trans_table[stats->state * (cdev->max_state + 1) + new_state]++;
stats->state = new_state;
stats->total_trans++;
+ stats->curr_instance->total_requests++;

unlock:
spin_unlock(&stats->lock);
@@ -867,6 +868,54 @@ time_in_thermal_zone_ms_show(struct device *dev, struct device_attribute *attr,
return ret < 0 ? ret : len;
}

+static ssize_t
+requests_of_thermal_zone_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ LIST_HEAD(cdev_thermal_zone_list);
+ struct thermal_cooling_device *cdev = to_cooling_device(dev);
+ struct cooling_dev_stats *stats = cdev->stats;
+ struct cdev_thermal_zone_residency *res, *next;
+ struct thermal_instance *instance;
+ ssize_t len = 0, ret = 0;
+
+ mutex_lock(&cdev->lock);
+
+ spin_lock(&stats->lock);
+ update_time_in_state(stats, stats->curr_instance);
+ spin_unlock(&stats->lock);
+
+ build_cdev_thermal_zone_residency(&cdev_thermal_zone_list, cdev);
+
+ list_for_each_entry(instance, &cdev->thermal_instances, cdev_node)
+ list_for_each_entry(res, &cdev_thermal_zone_list, node)
+ if (strncmp(res->thermal_zone, instance->tz->type,
+ THERMAL_NAME_LENGTH) == 0)
+ res->counter += instance->total_requests;
+
+ mutex_unlock(&cdev->lock);
+
+ list_for_each_entry_safe(res, next, &cdev_thermal_zone_list, node) {
+ ret = sprintf(buf + len, "thermal_zone: %s\t%lu\n",
+ res->thermal_zone, res->counter);
+
+ if (ret == 0)
+ ret = -EOVERFLOW;
+
+ if (ret < 0)
+ break;
+
+ len += ret;
+ }
+
+ list_for_each_entry_safe(res, next, &cdev_thermal_zone_list, node) {
+ list_del(&res->node);
+ kfree(res);
+ }
+
+ return ret < 0 ? ret : len;
+}
+
static ssize_t
reset_store(struct device *dev, struct device_attribute *attr, const char *buf,
size_t count)
@@ -897,6 +946,7 @@ reset_store(struct device *dev, struct device_attribute *attr, const char *buf,

/* Make sure we reset all counters per instance */
list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) {
+ instance->total_requests = 0;
instance->time_in = ktime_set(0, 0);
}

@@ -971,6 +1021,7 @@ static ssize_t trans_table_show(struct device *dev,
static DEVICE_ATTR_RO(total_trans);
static DEVICE_ATTR_RO(time_in_state_ms);
static DEVICE_ATTR_RO(time_in_thermal_zone_ms);
+static DEVICE_ATTR_RO(requests_of_thermal_zone);
static DEVICE_ATTR_WO(reset);
static DEVICE_ATTR_RO(trans_table);

@@ -978,6 +1029,7 @@ static struct attribute *cooling_device_stats_attrs[] = {
&dev_attr_total_trans.attr,
&dev_attr_time_in_state_ms.attr,
&dev_attr_time_in_thermal_zone_ms.attr,
+ &dev_attr_requests_of_thermal_zone.attr,
&dev_attr_reset.attr,
&dev_attr_trans_table.attr,
NULL
--
2.34.1



2023-06-20 18:03:56

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH 2/7] thermal: stats: track number of change requests due to tz

On Fri, May 19, 2023 at 5:27 AM Eduardo Valentin <[email protected]> wrote:
>
> From: Eduardo Valentin <[email protected]>
>
> This patch improves the current cooling device
> statistics by adding a new file under
> cdev/stats/requests_of_thermal_zone
>
> to represent the number of times each thermal zone
> requested the cooling device to effectively change.
> If the request associated was not serviced because
> another thermal zone asked for a higher cooling level,
> this counter does not increase.

What if the cdev is associated with two thermal zones asking for the
same state of it?

> The file format is:
> thermal_zone: <type> <count>
>
> Samples:
> $ cat cdev0/stats/requests_of_thermal_zone
> thermal_zone: amb0 2

The "one value per attribute" sysfs rule violation.

>
> In this example, it means the thermal zone 'amb0' has requested
> 2 times for cdev0 to change state.

Like in the previous patch, it would be good to explain the use case.

> Cc: "Rafael J. Wysocki" <[email protected]> (supporter:THERMAL)
> Cc: Daniel Lezcano <[email protected]> (supporter:THERMAL)
> Cc: Amit Kucheria <[email protected]> (reviewer:THERMAL)
> Cc: Zhang Rui <[email protected]> (reviewer:THERMAL)
> Cc: Jonathan Corbet <[email protected]> (maintainer:DOCUMENTATION)
> Cc: [email protected] (open list:THERMAL)
> Cc: [email protected] (open list:DOCUMENTATION)
> Cc: [email protected] (open list)
>
> Signed-off-by: Eduardo Valentin <[email protected]>
> ---
> .../driver-api/thermal/sysfs-api.rst | 2 +
> drivers/thermal/thermal_core.h | 1 +
> drivers/thermal/thermal_sysfs.c | 52 +++++++++++++++++++
> 3 files changed, 55 insertions(+)
>
> diff --git a/Documentation/driver-api/thermal/sysfs-api.rst b/Documentation/driver-api/thermal/sysfs-api.rst
> index caa50d61a5bc..75309a51d9b3 100644
> --- a/Documentation/driver-api/thermal/sysfs-api.rst
> +++ b/Documentation/driver-api/thermal/sysfs-api.rst
> @@ -369,6 +369,8 @@ Thermal cooling device sys I/F, created once it's registered::
> |---stats/trans_table: Cooling state transition table
> |---stats/time_in_thermal_zone_ms: Time that this cooling device was driven
> each associated thermal zone.
> + |---stats/requests_of_thermal_zone: Total number of times this cooling device
> + changed due to each associated thermal zone.

The meaning of the above description is not clear to me.

>
>
> Then next two dynamic attributes are created/removed in pairs. They represent
> diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
> index 3cce60c6e065..ed6511c3b794 100644
> --- a/drivers/thermal/thermal_core.h
> +++ b/drivers/thermal/thermal_core.h
> @@ -103,6 +103,7 @@ struct thermal_instance {
> unsigned int weight; /* The weight of the cooling device */
> bool upper_no_limit;
> #if IS_ENABLED(CONFIG_THERMAL_STATISTICS)
> + unsigned long total_requests;
> ktime_t time_in; /* time spent in this instance */
> #endif
> };
> diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
> index a3b71f03db75..0bce1415f7e8 100644
> --- a/drivers/thermal/thermal_sysfs.c
> +++ b/drivers/thermal/thermal_sysfs.c
> @@ -723,6 +723,7 @@ void thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev,
> stats->trans_table[stats->state * (cdev->max_state + 1) + new_state]++;
> stats->state = new_state;
> stats->total_trans++;
> + stats->curr_instance->total_requests++;
>
> unlock:
> spin_unlock(&stats->lock);
> @@ -867,6 +868,54 @@ time_in_thermal_zone_ms_show(struct device *dev, struct device_attribute *attr,
> return ret < 0 ? ret : len;
> }
>
> +static ssize_t
> +requests_of_thermal_zone_show(struct device *dev, struct device_attribute *attr,
> + char *buf)
> +{
> + LIST_HEAD(cdev_thermal_zone_list);
> + struct thermal_cooling_device *cdev = to_cooling_device(dev);
> + struct cooling_dev_stats *stats = cdev->stats;
> + struct cdev_thermal_zone_residency *res, *next;
> + struct thermal_instance *instance;
> + ssize_t len = 0, ret = 0;
> +
> + mutex_lock(&cdev->lock);
> +
> + spin_lock(&stats->lock);
> + update_time_in_state(stats, stats->curr_instance);
> + spin_unlock(&stats->lock);
> +
> + build_cdev_thermal_zone_residency(&cdev_thermal_zone_list, cdev);
> +
> + list_for_each_entry(instance, &cdev->thermal_instances, cdev_node)
> + list_for_each_entry(res, &cdev_thermal_zone_list, node)
> + if (strncmp(res->thermal_zone, instance->tz->type,
> + THERMAL_NAME_LENGTH) == 0)
> + res->counter += instance->total_requests;
> +
> + mutex_unlock(&cdev->lock);
> +
> + list_for_each_entry_safe(res, next, &cdev_thermal_zone_list, node) {

Why is the _safe variant needed here?

> + ret = sprintf(buf + len, "thermal_zone: %s\t%lu\n",
> + res->thermal_zone, res->counter);
> +
> + if (ret == 0)
> + ret = -EOVERFLOW;
> +
> + if (ret < 0)
> + break;
> +
> + len += ret;
> + }
> +
> + list_for_each_entry_safe(res, next, &cdev_thermal_zone_list, node) {
> + list_del(&res->node);
> + kfree(res);
> + }
> +
> + return ret < 0 ? ret : len;

I would prefer

if (ret < 0)
return ret;

return len;

> +}
> +
> static ssize_t
> reset_store(struct device *dev, struct device_attribute *attr, const char *buf,
> size_t count)
> @@ -897,6 +946,7 @@ reset_store(struct device *dev, struct device_attribute *attr, const char *buf,
>
> /* Make sure we reset all counters per instance */
> list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) {
> + instance->total_requests = 0;
> instance->time_in = ktime_set(0, 0);
> }
>
> @@ -971,6 +1021,7 @@ static ssize_t trans_table_show(struct device *dev,
> static DEVICE_ATTR_RO(total_trans);
> static DEVICE_ATTR_RO(time_in_state_ms);
> static DEVICE_ATTR_RO(time_in_thermal_zone_ms);
> +static DEVICE_ATTR_RO(requests_of_thermal_zone);
> static DEVICE_ATTR_WO(reset);
> static DEVICE_ATTR_RO(trans_table);
>
> @@ -978,6 +1029,7 @@ static struct attribute *cooling_device_stats_attrs[] = {
> &dev_attr_total_trans.attr,
> &dev_attr_time_in_state_ms.attr,
> &dev_attr_time_in_thermal_zone_ms.attr,
> + &dev_attr_requests_of_thermal_zone.attr,
> &dev_attr_reset.attr,
> &dev_attr_trans_table.attr,
> NULL
> --

2023-06-21 05:47:46

by Eduardo Valentin

[permalink] [raw]
Subject: [PATCH 2/7] thermal: stats: track number of change requests due to tz

On Tue, Jun 20, 2023 at 07:12:52PM +0200, Rafael J. Wysocki wrote:
>
>
>
> On Fri, May 19, 2023 at 5:27 AM Eduardo Valentin <[email protected]> wrote:
> >
> > From: Eduardo Valentin <[email protected]>
> >
> > This patch improves the current cooling device
> > statistics by adding a new file under
> > cdev/stats/requests_of_thermal_zone
> >
> > to represent the number of times each thermal zone
> > requested the cooling device to effectively change.
> > If the request associated was not serviced because
> > another thermal zone asked for a higher cooling level,
> > this counter does not increase.
>
> What if the cdev is associated with two thermal zones asking for the
> same state of it?

same as explained before, there will be always one thermal instance
that is picked. This patch considers that.

>
> > The file format is:
> > thermal_zone: <type> <count>
> >
> > Samples:
> > $ cat cdev0/stats/requests_of_thermal_zone
> > thermal_zone: amb0 2
>
> The "one value per attribute" sysfs rule violation.
>
> >
> > In this example, it means the thermal zone 'amb0' has requested
> > 2 times for cdev0 to change state.
>
> Like in the previous patch, it would be good to explain the use case.
>
> > Cc: "Rafael J. Wysocki" <[email protected]> (supporter:THERMAL)
> > Cc: Daniel Lezcano <[email protected]> (supporter:THERMAL)
> > Cc: Amit Kucheria <[email protected]> (reviewer:THERMAL)
> > Cc: Zhang Rui <[email protected]> (reviewer:THERMAL)
> > Cc: Jonathan Corbet <[email protected]> (maintainer:DOCUMENTATION)
> > Cc: [email protected] (open list:THERMAL)
> > Cc: [email protected] (open list:DOCUMENTATION)
> > Cc: [email protected] (open list)
> >
> > Signed-off-by: Eduardo Valentin <[email protected]>
> > ---
> > .../driver-api/thermal/sysfs-api.rst | 2 +
> > drivers/thermal/thermal_core.h | 1 +
> > drivers/thermal/thermal_sysfs.c | 52 +++++++++++++++++++
> > 3 files changed, 55 insertions(+)
> >
> > diff --git a/Documentation/driver-api/thermal/sysfs-api.rst b/Documentation/driver-api/thermal/sysfs-api.rst
> > index caa50d61a5bc..75309a51d9b3 100644
> > --- a/Documentation/driver-api/thermal/sysfs-api.rst
> > +++ b/Documentation/driver-api/thermal/sysfs-api.rst
> > @@ -369,6 +369,8 @@ Thermal cooling device sys I/F, created once it's registered::
> > |---stats/trans_table: Cooling state transition table
> > |---stats/time_in_thermal_zone_ms: Time that this cooling device was driven
> > each associated thermal zone.
> > + |---stats/requests_of_thermal_zone: Total number of times this cooling device
> > + changed due to each associated thermal zone.
>
> The meaning of the above description is not clear to me.
>
> >
> >
> > Then next two dynamic attributes are created/removed in pairs. They represent
> > diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
> > index 3cce60c6e065..ed6511c3b794 100644
> > --- a/drivers/thermal/thermal_core.h
> > +++ b/drivers/thermal/thermal_core.h
> > @@ -103,6 +103,7 @@ struct thermal_instance {
> > unsigned int weight; /* The weight of the cooling device */
> > bool upper_no_limit;
> > #if IS_ENABLED(CONFIG_THERMAL_STATISTICS)
> > + unsigned long total_requests;
> > ktime_t time_in; /* time spent in this instance */
> > #endif
> > };
> > diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
> > index a3b71f03db75..0bce1415f7e8 100644
> > --- a/drivers/thermal/thermal_sysfs.c
> > +++ b/drivers/thermal/thermal_sysfs.c
> > @@ -723,6 +723,7 @@ void thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev,
> > stats->trans_table[stats->state * (cdev->max_state + 1) + new_state]++;
> > stats->state = new_state;
> > stats->total_trans++;
> > + stats->curr_instance->total_requests++;
> >
> > unlock:
> > spin_unlock(&stats->lock);
> > @@ -867,6 +868,54 @@ time_in_thermal_zone_ms_show(struct device *dev, struct device_attribute *attr,
> > return ret < 0 ? ret : len;
> > }
> >
> > +static ssize_t
> > +requests_of_thermal_zone_show(struct device *dev, struct device_attribute *attr,
> > + char *buf)
> > +{
> > + LIST_HEAD(cdev_thermal_zone_list);
> > + struct thermal_cooling_device *cdev = to_cooling_device(dev);
> > + struct cooling_dev_stats *stats = cdev->stats;
> > + struct cdev_thermal_zone_residency *res, *next;
> > + struct thermal_instance *instance;
> > + ssize_t len = 0, ret = 0;
> > +
> > + mutex_lock(&cdev->lock);
> > +
> > + spin_lock(&stats->lock);
> > + update_time_in_state(stats, stats->curr_instance);
> > + spin_unlock(&stats->lock);
> > +
> > + build_cdev_thermal_zone_residency(&cdev_thermal_zone_list, cdev);
> > +
> > + list_for_each_entry(instance, &cdev->thermal_instances, cdev_node)
> > + list_for_each_entry(res, &cdev_thermal_zone_list, node)
> > + if (strncmp(res->thermal_zone, instance->tz->type,
> > + THERMAL_NAME_LENGTH) == 0)
> > + res->counter += instance->total_requests;
> > +
> > + mutex_unlock(&cdev->lock);
> > +
> > + list_for_each_entry_safe(res, next, &cdev_thermal_zone_list, node) {
>
> Why is the _safe variant needed here?
>
> > + ret = sprintf(buf + len, "thermal_zone: %s\t%lu\n",
> > + res->thermal_zone, res->counter);
> > +
> > + if (ret == 0)
> > + ret = -EOVERFLOW;
> > +
> > + if (ret < 0)
> > + break;
> > +
> > + len += ret;
> > + }
> > +
> > + list_for_each_entry_safe(res, next, &cdev_thermal_zone_list, node) {
> > + list_del(&res->node);
> > + kfree(res);
> > + }
> > +
> > + return ret < 0 ? ret : len;
>
> I would prefer
>
> if (ret < 0)
> return ret;
>
> return len;

OK. I will fix this and enhance the explanations.

And the same comments and replies of the previous patch applies here.

>
> > +}
> > +
> > static ssize_t
> > reset_store(struct device *dev, struct device_attribute *attr, const char *buf,
> > size_t count)
> > @@ -897,6 +946,7 @@ reset_store(struct device *dev, struct device_attribute *attr, const char *buf,
> >
> > /* Make sure we reset all counters per instance */
> > list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) {
> > + instance->total_requests = 0;
> > instance->time_in = ktime_set(0, 0);
> > }
> >
> > @@ -971,6 +1021,7 @@ static ssize_t trans_table_show(struct device *dev,
> > static DEVICE_ATTR_RO(total_trans);
> > static DEVICE_ATTR_RO(time_in_state_ms);
> > static DEVICE_ATTR_RO(time_in_thermal_zone_ms);
> > +static DEVICE_ATTR_RO(requests_of_thermal_zone);
> > static DEVICE_ATTR_WO(reset);
> > static DEVICE_ATTR_RO(trans_table);
> >
> > @@ -978,6 +1029,7 @@ static struct attribute *cooling_device_stats_attrs[] = {
> > &dev_attr_total_trans.attr,
> > &dev_attr_time_in_state_ms.attr,
> > &dev_attr_time_in_thermal_zone_ms.attr,
> > + &dev_attr_requests_of_thermal_zone.attr,
> > &dev_attr_reset.attr,
> > &dev_attr_trans_table.attr,
> > NULL
> > --

--
All the best,
Eduardo Valentin