The routine where the trip point crossed is detected is a strategic
place where different processing will happen. Encapsulate the code in
a function, so all specific actions related with a trip point crossed
can be grouped.
Reviewed-by: Lukasz Luba <[email protected]>
Signed-off-by: Daniel Lezcano <[email protected]>
---
drivers/thermal/thermal_core.c | 30 ++++++++++++++++++++----------
1 file changed, 20 insertions(+), 10 deletions(-)
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index cdc0552e8c42..d9f771b15ed8 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -358,6 +358,25 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
tz->ops->critical(tz);
}
+static void handle_thermal_trip_crossed(struct thermal_zone_device *tz, int trip,
+ int trip_temp, int trip_hyst, enum thermal_trip_type trip_type)
+{
+ if (tz->last_temperature == THERMAL_TEMP_INVALID)
+ return;
+
+ if (tz->last_temperature < trip_temp &&
+ tz->temperature >= trip_temp) {
+ thermal_notify_tz_trip_up(tz->id, trip,
+ tz->temperature);
+ }
+
+ if (tz->last_temperature >= trip_temp &&
+ tz->temperature < (trip_temp - trip_hyst)) {
+ thermal_notify_tz_trip_down(tz->id, trip,
+ tz->temperature);
+ }
+}
+
static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
{
enum thermal_trip_type type;
@@ -372,16 +391,7 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
if (tz->ops->get_trip_hyst)
tz->ops->get_trip_hyst(tz, trip, &hyst);
- if (tz->last_temperature != THERMAL_TEMP_INVALID) {
- if (tz->last_temperature < trip_temp &&
- tz->temperature >= trip_temp)
- thermal_notify_tz_trip_up(tz->id, trip,
- tz->temperature);
- if (tz->last_temperature >= trip_temp &&
- tz->temperature < (trip_temp - hyst))
- thermal_notify_tz_trip_down(tz->id, trip,
- tz->temperature);
- }
+ handle_thermal_trip_crossed(tz, trip, trip_temp, hyst, type);
if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)
handle_critical_trips(tz, trip, type);
--
2.25.1
By convention the trips points are declared in the ascending
temperature order. However, no specification for the device tree, ACPI
or documentation tells the trip points must be ordered this way.
In the other hand, we need those to be ordered to browse them at the
thermal events. But if we assume they are ordered and change the code
based on this assumption, any platform with shuffled trip points
description will be broken (if they exist).
Instead of taking the risk of breaking the existing platforms, use an
array of temperature ordered trip identifiers and make it available
for the code needing to browse the trip points in an ordered way.
Signed-off-by: Daniel Lezcano <[email protected]>
---
drivers/thermal/thermal_core.c | 62 +++++++++++++++++++++++++++-------
include/linux/thermal.h | 2 ++
2 files changed, 52 insertions(+), 12 deletions(-)
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index f66036b3daae..f02f38b66445 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -355,7 +355,8 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
}
static void handle_thermal_trip_crossed(struct thermal_zone_device *tz, int trip,
- int trip_temp, int trip_hyst, enum thermal_trip_type trip_type)
+ int trip_temp, int trip_hyst,
+ enum thermal_trip_type trip_type)
{
if (tz->last_temperature == THERMAL_TEMP_INVALID)
return;
@@ -1165,6 +1166,46 @@ static void bind_tz(struct thermal_zone_device *tz)
mutex_unlock(&thermal_list_lock);
}
+static void sort_trips_indexes(struct thermal_zone_device *tz)
+{
+ int i, j;
+
+ for (i = 0; i < tz->trips; i++)
+ tz->trips_indexes[i] = i;
+
+ for (i = 0; i < tz->trips; i++) {
+ for (j = i + 1; j < tz->trips; j++) {
+ int t1, t2;
+
+ tz->ops->get_trip_temp(tz, tz->trips_indexes[i], &t1);
+ tz->ops->get_trip_temp(tz, tz->trips_indexes[j], &t2);
+
+ if (t1 > t2)
+ swap(tz->trips_indexes[i], tz->trips_indexes[j]);
+ }
+ }
+}
+
+static int thermal_zone_device_trip_init(struct thermal_zone_device *tz)
+{
+ enum thermal_trip_type trip_type;
+ int trip_temp, i;
+
+ tz->trips_indexes = kzalloc(tz->trips * sizeof(int), GFP_KERNEL);
+ if (!tz->trips_indexes)
+ return -ENOMEM;
+
+ for (i = 0; i < tz->trips; i++) {
+ if (tz->ops->get_trip_type(tz, i, &trip_type) ||
+ tz->ops->get_trip_temp(tz, i, &trip_temp) || !trip_temp)
+ set_bit(i, &tz->trips_disabled);
+ }
+
+ sort_trips_indexes(tz);
+
+ return 0;
+}
+
/**
* thermal_zone_device_register() - register a new thermal zone device
* @type: the thermal zone device type
@@ -1196,11 +1237,8 @@ thermal_zone_device_register(const char *type, int trips, int mask,
int polling_delay)
{
struct thermal_zone_device *tz;
- enum thermal_trip_type trip_type;
- int trip_temp;
int id;
int result;
- int count;
struct thermal_governor *governor;
if (!type || strlen(type) == 0) {
@@ -1272,12 +1310,9 @@ thermal_zone_device_register(const char *type, int trips, int mask,
if (result)
goto release_device;
- for (count = 0; count < trips; count++) {
- if (tz->ops->get_trip_type(tz, count, &trip_type) ||
- tz->ops->get_trip_temp(tz, count, &trip_temp) ||
- !trip_temp)
- set_bit(count, &tz->trips_disabled);
- }
+ result = thermal_zone_device_trip_init(tz);
+ if (result)
+ goto unregister;
/* Update 'this' zone's governor information */
mutex_lock(&thermal_governor_lock);
@@ -1290,7 +1325,7 @@ thermal_zone_device_register(const char *type, int trips, int mask,
result = thermal_set_governor(tz, governor);
if (result) {
mutex_unlock(&thermal_governor_lock);
- goto unregister;
+ goto kfree_indexes;
}
mutex_unlock(&thermal_governor_lock);
@@ -1298,7 +1333,7 @@ thermal_zone_device_register(const char *type, int trips, int mask,
if (!tz->tzp || !tz->tzp->no_hwmon) {
result = thermal_add_hwmon_sysfs(tz);
if (result)
- goto unregister;
+ goto kfree_indexes;
}
mutex_lock(&thermal_list_lock);
@@ -1319,6 +1354,8 @@ thermal_zone_device_register(const char *type, int trips, int mask,
return tz;
+kfree_indexes:
+ kfree(tz->trips_indexes);
unregister:
device_del(&tz->device);
release_device:
@@ -1387,6 +1424,7 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)
thermal_remove_hwmon_sysfs(tz);
ida_simple_remove(&thermal_tz_ida, tz->id);
ida_destroy(&tz->ida);
+ kfree(tz->trips_indexes);
mutex_destroy(&tz->lock);
device_unregister(&tz->device);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 231bac2768fb..4c3b72536772 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -112,6 +112,7 @@ struct thermal_cooling_device {
* @mode: current mode of this thermal zone
* @devdata: private pointer for device private data
* @trips: number of trip points the thermal zone supports
+ * @trips_indexes: an array of sorted trip points indexes
* @trips_disabled; bitmap for disabled trips
* @passive_delay_jiffies: number of jiffies to wait between polls when
* performing passive cooling.
@@ -152,6 +153,7 @@ struct thermal_zone_device {
enum thermal_device_mode mode;
void *devdata;
int trips;
+ int *trips_indexes;
unsigned long trips_disabled; /* bitmap for disabled trips */
unsigned long passive_delay_jiffies;
unsigned long polling_delay_jiffies;
--
2.25.1
The routine doing trip point crossing the way up or down is actually
wrong.
A trip point is composed with a trip temperature and a hysteresis.
The trip temperature is used to detect when the trip point is crossed
the way up.
The trip temperature minus the hysteresis is used to detect when the
trip point is crossed the way down.
|-----------low--------high------------|
|<--------->|
| hyst |
| |
| -|--> crossed the way up
|
<---|-- crossed the way down
For that, there is a two point comparison: the current temperature and
the previous temperature.
The actual code assumes if the current temperature is greater than the
trip temperature and the previous temperature was lesser, then the
trip point is crossed the way up. That is true only if we crossed the
way down the low temperature boundary from the previous temperature or
if the hysteresis is zero. The temperature can decrease between the
low and high, so the trip point is not crossed the way down and then
increase again and cross the high temperature raising a new trip point
crossed detection which is incorrect. The same scenario happens when
crossing the way down.
The trip point crossing the way up and down must act as parenthesis, a
trip point down must close a trip point up. Today we have multiple
trip point up without the corresponding trip point down.
In order to fix that, we store the previous trip point which gives the
information about the previous trip and we change the trip point
browsing order depending on the temperature trend: in the ascending
order when the temperature trend is raising, otherwise in the
descending order.
As a sidenote, the thermal_zone_device structure has already the
prev_trip_low and prev_trip_high information which are used by the
thermal_zone_set_trips() function. This one can be changed to be
triggered by the trip temperature crossing function, which makes more
sense, and the two fields will disappear.
Tested on a rk3399-rock960 with thermal stress and 4 trip points. Also
tested with temperature emulation to create a temperature jump
directly to the second trip point.
Signed-off-by: Daniel Lezcano <[email protected]>
---
V3:
- Use the ordered indexes introduced in the previous patch as the
trip could be not ordered
V2:
- As spotted by Zhang Rui, the trip cross notification does not
work if the temperature drops and crosses two trip points in the
same update interval. In order to fix that, we browse the trip point
in the ascending order when the temperature trend is raising,
otherwise in the descending order.
Signed-off-by: Daniel Lezcano <[email protected]>
---
drivers/thermal/thermal_core.c | 54 ++++++++++++++++++++++++----------
include/linux/thermal.h | 2 ++
2 files changed, 41 insertions(+), 15 deletions(-)
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index f02f38b66445..a5c5f6f4e42b 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -354,30 +354,48 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
tz->ops->critical(tz);
}
-static void handle_thermal_trip_crossed(struct thermal_zone_device *tz, int trip,
+static void handle_thermal_trip_crossed(struct thermal_zone_device *tz, int index,
int trip_temp, int trip_hyst,
enum thermal_trip_type trip_type)
{
+ int trip_low_temp = trip_temp - trip_hyst;
+ int trip = tz->trips_indexes[index];
+
if (tz->last_temperature == THERMAL_TEMP_INVALID)
return;
- if (tz->last_temperature < trip_temp &&
- tz->temperature >= trip_temp) {
- thermal_notify_tz_trip_up(tz->id, trip,
- tz->temperature);
- }
-
- if (tz->last_temperature >= trip_temp &&
- tz->temperature < (trip_temp - trip_hyst)) {
- thermal_notify_tz_trip_down(tz->id, trip,
- tz->temperature);
+ /*
+ * Due to the hysteresis, a third information is needed to
+ * detect when the temperature is wavering between the
+ * trip_low_temp and the trip_temp. A trip point is crossed
+ * the way up only if the temperature is above it while the
+ * previous temperature was below *and* we crossed the
+ * trip_temp_low before. The previous trip point give us the
+ * previous trip point transition. The similar problem exists
+ * when crossing the way down.
+ *
+ * Note the mechanism works only if the caller of the function
+ * invoke the function with the trip point ascending or
+ * descending regarding the temperature trend. A temperature
+ * drop trend will browse the trip point in the descending
+ * order
+ */
+ if (tz->last_temperature < trip_temp && tz->temperature >= trip_temp &&
+ index != tz->prev_index) {
+ thermal_notify_tz_trip_up(tz->id, trip, tz->temperature);
+ tz->prev_index = index;
+ } else if (tz->last_temperature >= trip_low_temp && tz->temperature < trip_low_temp &&
+ index == tz->prev_index) {
+ thermal_notify_tz_trip_down(tz->id, trip, tz->temperature);
+ tz->prev_index--;
}
}
-static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
+static void handle_thermal_trip(struct thermal_zone_device *tz, int index)
{
enum thermal_trip_type type;
int trip_temp, hyst = 0;
+ int trip = tz->trips_indexes[index];
/* Ignore disabled trip points */
if (test_bit(trip, &tz->trips_disabled))
@@ -388,7 +406,7 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
if (tz->ops->get_trip_hyst)
tz->ops->get_trip_hyst(tz, trip, &hyst);
- handle_thermal_trip_crossed(tz, trip, trip_temp, hyst, type);
+ handle_thermal_trip_crossed(tz, index, trip_temp, hyst, type);
if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)
handle_critical_trips(tz, trip, trip_temp, type);
@@ -428,6 +446,7 @@ static void thermal_zone_device_init(struct thermal_zone_device *tz)
{
struct thermal_instance *pos;
tz->temperature = THERMAL_TEMP_INVALID;
+ tz->prev_index = -1;
tz->prev_low_trip = -INT_MAX;
tz->prev_high_trip = INT_MAX;
list_for_each_entry(pos, &tz->thermal_instances, tz_node)
@@ -512,8 +531,13 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
tz->notify_event = event;
- for (count = 0; count < tz->trips; count++)
- handle_thermal_trip(tz, count);
+ if (tz->last_temperature <= tz->temperature) {
+ for (count = 0; count < tz->trips; count++)
+ handle_thermal_trip(tz, count);
+ } else {
+ for (count = tz->trips; count >= 0; count--)
+ handle_thermal_trip(tz, count);
+ }
}
EXPORT_SYMBOL_GPL(thermal_zone_device_update);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 4c3b72536772..d512f21561f1 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -125,6 +125,7 @@ struct thermal_cooling_device {
* @last_temperature: previous temperature read
* @emul_temperature: emulated temperature when using CONFIG_THERMAL_EMULATION
* @passive: 1 if you've crossed a passive trip point, 0 otherwise.
+ * @prev_index: previous index pointing to the trip point the thermal zone was
* @prev_low_trip: the low current temperature if you've crossed a passive
trip point.
* @prev_high_trip: the above current temperature if you've crossed a
@@ -161,6 +162,7 @@ struct thermal_zone_device {
int last_temperature;
int emul_temperature;
int passive;
+ int prev_index;
int prev_low_trip;
int prev_high_trip;
atomic_t need_update;
--
2.25.1
As the trip temperature is already available when calling the function
handle_critical_trips(), pass it as a parameter instead of having this
function calling the ops again to retrieve the same data.
Reviewed-by: Lukasz Luba <[email protected]>
Signed-off-by: Daniel Lezcano <[email protected]>
---
v3:
- Massaged the patch title and the description
---
drivers/thermal/thermal_core.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index d9f771b15ed8..f66036b3daae 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -340,12 +340,8 @@ void thermal_zone_device_critical(struct thermal_zone_device *tz)
EXPORT_SYMBOL(thermal_zone_device_critical);
static void handle_critical_trips(struct thermal_zone_device *tz,
- int trip, enum thermal_trip_type trip_type)
+ int trip, int trip_temp, enum thermal_trip_type trip_type)
{
- int trip_temp;
-
- tz->ops->get_trip_temp(tz, trip, &trip_temp);
-
/* If we have not crossed the trip_temp, we do not care. */
if (trip_temp <= 0 || tz->temperature < trip_temp)
return;
@@ -394,7 +390,7 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
handle_thermal_trip_crossed(tz, trip, trip_temp, hyst, type);
if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)
- handle_critical_trips(tz, trip, type);
+ handle_critical_trips(tz, trip, trip_temp, type);
else
handle_non_critical_trips(tz, trip);
/*
--
2.25.1
On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> The routine where the trip point crossed is detected is a strategic
> place where different processing will happen. Encapsulate the code in
> a function, so all specific actions related with a trip point crossed
> can be grouped.
>
> Reviewed-by: Lukasz Luba <[email protected]>
> Signed-off-by: Daniel Lezcano <[email protected]>
Reviewed by: Zhang Rui <[email protected]>
> ---
> drivers/thermal/thermal_core.c | 30 ++++++++++++++++++++----------
> 1 file changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/thermal/thermal_core.c
> b/drivers/thermal/thermal_core.c
> index cdc0552e8c42..d9f771b15ed8 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -358,6 +358,25 @@ static void handle_critical_trips(struct
> thermal_zone_device *tz,
> tz->ops->critical(tz);
> }
>
> +static void handle_thermal_trip_crossed(struct thermal_zone_device
> *tz, int trip,
> + int trip_temp, int trip_hyst,
> enum thermal_trip_type trip_type)
> +{
> + if (tz->last_temperature == THERMAL_TEMP_INVALID)
> + return;
> +
> + if (tz->last_temperature < trip_temp &&
> + tz->temperature >= trip_temp) {
> + thermal_notify_tz_trip_up(tz->id, trip,
> + tz->temperature);
> + }
> +
> + if (tz->last_temperature >= trip_temp &&
> + tz->temperature < (trip_temp - trip_hyst)) {
> + thermal_notify_tz_trip_down(tz->id, trip,
> + tz->temperature);
> + }
> +}
> +
> static void handle_thermal_trip(struct thermal_zone_device *tz, int
> trip)
> {
> enum thermal_trip_type type;
> @@ -372,16 +391,7 @@ static void handle_thermal_trip(struct
> thermal_zone_device *tz, int trip)
> if (tz->ops->get_trip_hyst)
> tz->ops->get_trip_hyst(tz, trip, &hyst);
>
> - if (tz->last_temperature != THERMAL_TEMP_INVALID) {
> - if (tz->last_temperature < trip_temp &&
> - tz->temperature >= trip_temp)
> - thermal_notify_tz_trip_up(tz->id, trip,
> - tz->temperature);
> - if (tz->last_temperature >= trip_temp &&
> - tz->temperature < (trip_temp - hyst))
> - thermal_notify_tz_trip_down(tz->id, trip,
> - tz->temperature);
> - }
> + handle_thermal_trip_crossed(tz, trip, trip_temp, hyst, type);
>
> if (type == THERMAL_TRIP_CRITICAL || type ==
> THERMAL_TRIP_HOT)
> handle_critical_trips(tz, trip, type);
On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> As the trip temperature is already available when calling the
> function
> handle_critical_trips(), pass it as a parameter instead of having
> this
> function calling the ops again to retrieve the same data.
>
> Reviewed-by: Lukasz Luba <[email protected]>
> Signed-off-by: Daniel Lezcano <[email protected]>
> ---
> v3:
> - Massaged the patch title and the description
> ---
> drivers/thermal/thermal_core.c | 8 ++------
> 1 file changed, 2 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/thermal/thermal_core.c
> b/drivers/thermal/thermal_core.c
> index d9f771b15ed8..f66036b3daae 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -340,12 +340,8 @@ void thermal_zone_device_critical(struct
> thermal_zone_device *tz)
> EXPORT_SYMBOL(thermal_zone_device_critical);
>
> static void handle_critical_trips(struct thermal_zone_device *tz,
> - int trip, enum thermal_trip_type
> trip_type)
> + int trip, int trip_temp, enum
> thermal_trip_type trip_type)
This indent cleanup belongs to patch 1/4.
Other than that,
Reviewed-by: Zhang Rui <[email protected]>
> {
> - int trip_temp;
> -
> - tz->ops->get_trip_temp(tz, trip, &trip_temp);
> -
> /* If we have not crossed the trip_temp, we do not care. */
> if (trip_temp <= 0 || tz->temperature < trip_temp)
> return;
> @@ -394,7 +390,7 @@ static void handle_thermal_trip(struct
> thermal_zone_device *tz, int trip)
> handle_thermal_trip_crossed(tz, trip, trip_temp, hyst, type);
>
> if (type == THERMAL_TRIP_CRITICAL || type ==
> THERMAL_TRIP_HOT)
> - handle_critical_trips(tz, trip, type);
> + handle_critical_trips(tz, trip, trip_temp, type);
> else
> handle_non_critical_trips(tz, trip);
> /*
On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> By convention the trips points are declared in the ascending
> temperature order. However, no specification for the device tree,
> ACPI
> or documentation tells the trip points must be ordered this way.
>
> In the other hand, we need those to be ordered to browse them at the
> thermal events. But if we assume they are ordered and change the code
> based on this assumption, any platform with shuffled trip points
> description will be broken (if they exist).
>
> Instead of taking the risk of breaking the existing platforms, use an
> array of temperature ordered trip identifiers and make it available
> for the code needing to browse the trip points in an ordered way.
>
> Signed-off-by: Daniel Lezcano <[email protected]>
> ---
> drivers/thermal/thermal_core.c | 62 +++++++++++++++++++++++++++-----
> --
> include/linux/thermal.h | 2 ++
> 2 files changed, 52 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/thermal/thermal_core.c
> b/drivers/thermal/thermal_core.c
> index f66036b3daae..f02f38b66445 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -355,7 +355,8 @@ static void handle_critical_trips(struct
> thermal_zone_device *tz,
> }
>
> static void handle_thermal_trip_crossed(struct thermal_zone_device
> *tz, int trip,
> - int trip_temp, int trip_hyst,
> enum thermal_trip_type trip_type)
> + int trip_temp, int trip_hyst,
> + enum thermal_trip_type
> trip_type)
> {
> if (tz->last_temperature == THERMAL_TEMP_INVALID)
> return;
> @@ -1165,6 +1166,46 @@ static void bind_tz(struct thermal_zone_device
> *tz)
> mutex_unlock(&thermal_list_lock);
> }
>
> +static void sort_trips_indexes(struct thermal_zone_device *tz)
> +{
> + int i, j;
> +
> + for (i = 0; i < tz->trips; i++)
> + tz->trips_indexes[i] = i;
> +
> + for (i = 0; i < tz->trips; i++) {
> + for (j = i + 1; j < tz->trips; j++) {
> + int t1, t2;
> +
> + tz->ops->get_trip_temp(tz, tz-
> >trips_indexes[i], &t1);
This line can be moved to the upper loop.
> + tz->ops->get_trip_temp(tz, tz-
> >trips_indexes[j], &t2);
> +
what about the disabled trip points?
we should ignore those trip points and check the return value to make
sure we're comparing the valid trip_temp values.
thanks,
rui
> + if (t1 > t2)
> + swap(tz->trips_indexes[i], tz-
> >trips_indexes[j]);
> + }
> + }
> +}
> +
> +static int thermal_zone_device_trip_init(struct thermal_zone_device
> *tz)
> +{
> + enum thermal_trip_type trip_type;
> + int trip_temp, i;
> +
> + tz->trips_indexes = kzalloc(tz->trips * sizeof(int),
> GFP_KERNEL);
> + if (!tz->trips_indexes)
> + return -ENOMEM;
> +
> + for (i = 0; i < tz->trips; i++) {
> + if (tz->ops->get_trip_type(tz, i, &trip_type) ||
> + tz->ops->get_trip_temp(tz, i, &trip_temp) ||
> !trip_temp)
> + set_bit(i, &tz->trips_disabled);
> + }
> +
> + sort_trips_indexes(tz);
> +
> + return 0;
> +}
> +
> /**
> * thermal_zone_device_register() - register a new thermal zone
> device
> * @type: the thermal zone device type
> @@ -1196,11 +1237,8 @@ thermal_zone_device_register(const char *type,
> int trips, int mask,
> int polling_delay)
> {
> struct thermal_zone_device *tz;
> - enum thermal_trip_type trip_type;
> - int trip_temp;
> int id;
> int result;
> - int count;
> struct thermal_governor *governor;
>
> if (!type || strlen(type) == 0) {
> @@ -1272,12 +1310,9 @@ thermal_zone_device_register(const char *type,
> int trips, int mask,
> if (result)
> goto release_device;
>
> - for (count = 0; count < trips; count++) {
> - if (tz->ops->get_trip_type(tz, count, &trip_type) ||
> - tz->ops->get_trip_temp(tz, count, &trip_temp) ||
> - !trip_temp)
> - set_bit(count, &tz->trips_disabled);
> - }
> + result = thermal_zone_device_trip_init(tz);
> + if (result)
> + goto unregister;
>
> /* Update 'this' zone's governor information */
> mutex_lock(&thermal_governor_lock);
> @@ -1290,7 +1325,7 @@ thermal_zone_device_register(const char *type,
> int trips, int mask,
> result = thermal_set_governor(tz, governor);
> if (result) {
> mutex_unlock(&thermal_governor_lock);
> - goto unregister;
> + goto kfree_indexes;
> }
>
> mutex_unlock(&thermal_governor_lock);
> @@ -1298,7 +1333,7 @@ thermal_zone_device_register(const char *type,
> int trips, int mask,
> if (!tz->tzp || !tz->tzp->no_hwmon) {
> result = thermal_add_hwmon_sysfs(tz);
> if (result)
> - goto unregister;
> + goto kfree_indexes;
> }
>
> mutex_lock(&thermal_list_lock);
> @@ -1319,6 +1354,8 @@ thermal_zone_device_register(const char *type,
> int trips, int mask,
>
> return tz;
>
> +kfree_indexes:
> + kfree(tz->trips_indexes);
> unregister:
> device_del(&tz->device);
> release_device:
> @@ -1387,6 +1424,7 @@ void thermal_zone_device_unregister(struct
> thermal_zone_device *tz)
> thermal_remove_hwmon_sysfs(tz);
> ida_simple_remove(&thermal_tz_ida, tz->id);
> ida_destroy(&tz->ida);
> + kfree(tz->trips_indexes);
> mutex_destroy(&tz->lock);
> device_unregister(&tz->device);
>
> diff --git a/include/linux/thermal.h b/include/linux/thermal.h
> index 231bac2768fb..4c3b72536772 100644
> --- a/include/linux/thermal.h
> +++ b/include/linux/thermal.h
> @@ -112,6 +112,7 @@ struct thermal_cooling_device {
> * @mode: current mode of this thermal zone
> * @devdata: private pointer for device private data
> * @trips: number of trip points the thermal zone supports
> + * @trips_indexes: an array of sorted trip points indexes
> * @trips_disabled; bitmap for disabled trips
> * @passive_delay_jiffies: number of jiffies to wait between polls
> when
> * performing passive cooling.
> @@ -152,6 +153,7 @@ struct thermal_zone_device {
> enum thermal_device_mode mode;
> void *devdata;
> int trips;
> + int *trips_indexes;
> unsigned long trips_disabled; /* bitmap for disabled trips
> */
> unsigned long passive_delay_jiffies;
> unsigned long polling_delay_jiffies;
On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> The routine doing trip point crossing the way up or down is actually
> wrong.
>
> A trip point is composed with a trip temperature and a hysteresis.
>
> The trip temperature is used to detect when the trip point is crossed
> the way up.
>
> The trip temperature minus the hysteresis is used to detect when the
> trip point is crossed the way down.
>
> > -----------low--------high------------|
> |<--------->|
> | hyst |
> | |
> | -|--> crossed the way up
> |
> <---|-- crossed the way down
>
> For that, there is a two point comparison: the current temperature
> and
> the previous temperature.
>
> The actual code assumes if the current temperature is greater than
> the
> trip temperature and the previous temperature was lesser, then the
> trip point is crossed the way up. That is true only if we crossed the
> way down the low temperature boundary from the previous temperature
> or
> if the hysteresis is zero. The temperature can decrease between the
> low and high, so the trip point is not crossed the way down and then
> increase again and cross the high temperature raising a new trip
> point
> crossed detection which is incorrect. The same scenario happens when
> crossing the way down.
>
> The trip point crossing the way up and down must act as parenthesis,
> a
> trip point down must close a trip point up. Today we have multiple
> trip point up without the corresponding trip point down.
>
> In order to fix that, we store the previous trip point which gives
> the
> information about the previous trip and we change the trip point
> browsing order depending on the temperature trend: in the ascending
> order when the temperature trend is raising, otherwise in the
> descending order.
>
> As a sidenote, the thermal_zone_device structure has already the
> prev_trip_low and prev_trip_high information which are used by the
> thermal_zone_set_trips() function. This one can be changed to be
> triggered by the trip temperature crossing function, which makes more
> sense, and the two fields will disappear.
>
> Tested on a rk3399-rock960 with thermal stress and 4 trip points.
> Also
> tested with temperature emulation to create a temperature jump
> directly to the second trip point.
>
> Signed-off-by: Daniel Lezcano <[email protected]>
> ---
> V3:
>
> - Use the ordered indexes introduced in the previous patch as the
> trip could be not ordered
>
> V2:
> - As spotted by Zhang Rui, the trip cross notification does not
> work if the temperature drops and crosses two trip points in the
> same update interval. In order to fix that, we browse the trip
> point
> in the ascending order when the temperature trend is raising,
> otherwise in the descending order.
> Signed-off-by: Daniel Lezcano <[email protected]>
Reviewed-by: Zhang Rui <[email protected]>
> ---
> drivers/thermal/thermal_core.c | 54 ++++++++++++++++++++++++--------
> --
> include/linux/thermal.h | 2 ++
> 2 files changed, 41 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/thermal/thermal_core.c
> b/drivers/thermal/thermal_core.c
> index f02f38b66445..a5c5f6f4e42b 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -354,30 +354,48 @@ static void handle_critical_trips(struct
> thermal_zone_device *tz,
> tz->ops->critical(tz);
> }
>
> -static void handle_thermal_trip_crossed(struct thermal_zone_device
> *tz, int trip,
> +static void handle_thermal_trip_crossed(struct thermal_zone_device
> *tz, int index,
> int trip_temp, int trip_hyst,
> enum thermal_trip_type
> trip_type)
> {
> + int trip_low_temp = trip_temp - trip_hyst;
> + int trip = tz->trips_indexes[index];
> +
> if (tz->last_temperature == THERMAL_TEMP_INVALID)
> return;
>
> - if (tz->last_temperature < trip_temp &&
> - tz->temperature >= trip_temp) {
> - thermal_notify_tz_trip_up(tz->id, trip,
> - tz->temperature);
> - }
> -
> - if (tz->last_temperature >= trip_temp &&
> - tz->temperature < (trip_temp - trip_hyst)) {
> - thermal_notify_tz_trip_down(tz->id, trip,
> - tz->temperature);
> + /*
> + * Due to the hysteresis, a third information is needed to
> + * detect when the temperature is wavering between the
> + * trip_low_temp and the trip_temp. A trip point is crossed
> + * the way up only if the temperature is above it while the
> + * previous temperature was below *and* we crossed the
> + * trip_temp_low before. The previous trip point give us the
> + * previous trip point transition. The similar problem exists
> + * when crossing the way down.
> + *
> + * Note the mechanism works only if the caller of the
> function
> + * invoke the function with the trip point ascending or
> + * descending regarding the temperature trend. A temperature
> + * drop trend will browse the trip point in the descending
> + * order
> + */
> + if (tz->last_temperature < trip_temp && tz->temperature >=
> trip_temp &&
> + index != tz->prev_index) {
> + thermal_notify_tz_trip_up(tz->id, trip, tz-
> >temperature);
> + tz->prev_index = index;
> + } else if (tz->last_temperature >= trip_low_temp && tz-
> >temperature < trip_low_temp &&
> + index == tz->prev_index) {
> + thermal_notify_tz_trip_down(tz->id, trip, tz-
> >temperature);
> + tz->prev_index--;
> }
> }
>
> -static void handle_thermal_trip(struct thermal_zone_device *tz, int
> trip)
> +static void handle_thermal_trip(struct thermal_zone_device *tz, int
> index)
> {
> enum thermal_trip_type type;
> int trip_temp, hyst = 0;
> + int trip = tz->trips_indexes[index];
>
> /* Ignore disabled trip points */
> if (test_bit(trip, &tz->trips_disabled))
> @@ -388,7 +406,7 @@ static void handle_thermal_trip(struct
> thermal_zone_device *tz, int trip)
> if (tz->ops->get_trip_hyst)
> tz->ops->get_trip_hyst(tz, trip, &hyst);
>
> - handle_thermal_trip_crossed(tz, trip, trip_temp, hyst, type);
> + handle_thermal_trip_crossed(tz, index, trip_temp, hyst,
> type);
>
> if (type == THERMAL_TRIP_CRITICAL || type ==
> THERMAL_TRIP_HOT)
> handle_critical_trips(tz, trip, trip_temp, type);
> @@ -428,6 +446,7 @@ static void thermal_zone_device_init(struct
> thermal_zone_device *tz)
> {
> struct thermal_instance *pos;
> tz->temperature = THERMAL_TEMP_INVALID;
> + tz->prev_index = -1;
> tz->prev_low_trip = -INT_MAX;
> tz->prev_high_trip = INT_MAX;
> list_for_each_entry(pos, &tz->thermal_instances, tz_node)
> @@ -512,8 +531,13 @@ void thermal_zone_device_update(struct
> thermal_zone_device *tz,
>
> tz->notify_event = event;
>
> - for (count = 0; count < tz->trips; count++)
> - handle_thermal_trip(tz, count);
> + if (tz->last_temperature <= tz->temperature) {
> + for (count = 0; count < tz->trips; count++)
> + handle_thermal_trip(tz, count);
> + } else {
> + for (count = tz->trips; count >= 0; count--)
> + handle_thermal_trip(tz, count);
> + }
> }
> EXPORT_SYMBOL_GPL(thermal_zone_device_update);
>
> diff --git a/include/linux/thermal.h b/include/linux/thermal.h
> index 4c3b72536772..d512f21561f1 100644
> --- a/include/linux/thermal.h
> +++ b/include/linux/thermal.h
> @@ -125,6 +125,7 @@ struct thermal_cooling_device {
> * @last_temperature: previous temperature read
> * @emul_temperature: emulated temperature when using
> CONFIG_THERMAL_EMULATION
> * @passive: 1 if you've crossed a passive trip point, 0
> otherwise.
> + * @prev_index: previous index pointing to the trip
> point the thermal zone was
> * @prev_low_trip: the low current temperature if you've crossed
> a passive
> trip point.
> * @prev_high_trip: the above current temperature if you've
> crossed a
> @@ -161,6 +162,7 @@ struct thermal_zone_device {
> int last_temperature;
> int emul_temperature;
> int passive;
> + int prev_index;
> int prev_low_trip;
> int prev_high_trip;
> atomic_t need_update;
Hi Zhang,
thanks for the review
On 18/07/2022 07:28, Zhang Rui wrote:
> On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
[ ... ]
>> Instead of taking the risk of breaking the existing platforms, use an
>> array of temperature ordered trip identifiers and make it available
>> for the code needing to browse the trip points in an ordered way.
>>
>> Signed-off-by: Daniel Lezcano <[email protected]>
>> ---
[ ... ]
>> +static void sort_trips_indexes(struct thermal_zone_device *tz)
>> +{
>> + int i, j;
>> +
>> + for (i = 0; i < tz->trips; i++)
>> + tz->trips_indexes[i] = i;
>> +
>> + for (i = 0; i < tz->trips; i++) {
>> + for (j = i + 1; j < tz->trips; j++) {
>> + int t1, t2;
>> +
>> + tz->ops->get_trip_temp(tz, tz-
>>> trips_indexes[i], &t1);
>
> This line can be moved to the upper loop.
Right, thanks!
>> + tz->ops->get_trip_temp(tz, tz-
>>> trips_indexes[j], &t2);
>> +
>
> what about the disabled trip points?
>
> we should ignore those trip points and check the return value to make
> sure we're comparing the valid trip_temp values.
We don't have to care about, whatever the position, the corresponding
trip id will be disabled by the trip init function before calling this
one and ignored in the handle_thermal_trip() function
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
On 18/07/2022 06:59, Zhang Rui wrote:
> On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
>> As the trip temperature is already available when calling the
>> function
>> handle_critical_trips(), pass it as a parameter instead of having
>> this
>> function calling the ops again to retrieve the same data.
>>
>> Reviewed-by: Lukasz Luba <[email protected]>
>> Signed-off-by: Daniel Lezcano <[email protected]>
>> ---
>> v3:
>> - Massaged the patch title and the description
>> ---
>> drivers/thermal/thermal_core.c | 8 ++------
>> 1 file changed, 2 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/thermal/thermal_core.c
>> b/drivers/thermal/thermal_core.c
>> index d9f771b15ed8..f66036b3daae 100644
>> --- a/drivers/thermal/thermal_core.c
>> +++ b/drivers/thermal/thermal_core.c
>> @@ -340,12 +340,8 @@ void thermal_zone_device_critical(struct
>> thermal_zone_device *tz)
>> EXPORT_SYMBOL(thermal_zone_device_critical);
>>
>> static void handle_critical_trips(struct thermal_zone_device *tz,
>> - int trip, enum thermal_trip_type
>> trip_type)
>> + int trip, int trip_temp, enum
>> thermal_trip_type trip_type)
>
> This indent cleanup belongs to patch 1/4.
It is not an indent cleanup, the 'int trip_temp' is added in the parameters.
> Other than that,
>
> Reviewed-by: Zhang Rui <[email protected]>
[ ... ]
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
On 18/07/2022 07:28, Zhang Rui wrote:
> On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
>> By convention the trips points are declared in the ascending
>> temperature order. However, no specification for the device tree,
>> ACPI
>> or documentation tells the trip points must be ordered this way.
>>
>> In the other hand, we need those to be ordered to browse them at the
>> thermal events. But if we assume they are ordered and change the code
>> based on this assumption, any platform with shuffled trip points
>> description will be broken (if they exist).
>>
>> Instead of taking the risk of breaking the existing platforms, use an
>> array of temperature ordered trip identifiers and make it available
>> for the code needing to browse the trip points in an ordered way.
>>
>> Signed-off-by: Daniel Lezcano <[email protected]>
[ ... ]
>> +static void sort_trips_indexes(struct thermal_zone_device *tz)
>> +{
>> + int i, j;
>> +
>> + for (i = 0; i < tz->trips; i++)
>> + tz->trips_indexes[i] = i;
>> +
>> + for (i = 0; i < tz->trips; i++) {
>> + for (j = i + 1; j < tz->trips; j++) {
>> + int t1, t2;
>> +
>> + tz->ops->get_trip_temp(tz, tz-
>>> trips_indexes[i], &t1);
>
> This line can be moved to the upper loop.
>
>> + tz->ops->get_trip_temp(tz, tz-
>>> trips_indexes[j], &t2);
Actually, we can not move the line up because of the swap below
>> + if (t1 > t2)
>> + swap(tz->trips_indexes[i], tz-
>>> trips_indexes[j]);
>> + }
>> + }
>> +}
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
On Mon, 2022-07-18 at 16:04 +0200, Daniel Lezcano wrote:
> On 18/07/2022 06:59, Zhang Rui wrote:
> > On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> > > As the trip temperature is already available when calling the
> > > function
> > > handle_critical_trips(), pass it as a parameter instead of having
> > > this
> > > function calling the ops again to retrieve the same data.
> > >
> > > Reviewed-by: Lukasz Luba <[email protected]>
> > > Signed-off-by: Daniel Lezcano <[email protected]>
> > > ---
> > > v3:
> > > - Massaged the patch title and the description
> > > ---
> > > drivers/thermal/thermal_core.c | 8 ++------
> > > 1 file changed, 2 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/drivers/thermal/thermal_core.c
> > > b/drivers/thermal/thermal_core.c
> > > index d9f771b15ed8..f66036b3daae 100644
> > > --- a/drivers/thermal/thermal_core.c
> > > +++ b/drivers/thermal/thermal_core.c
> > > @@ -340,12 +340,8 @@ void thermal_zone_device_critical(struct
> > > thermal_zone_device *tz)
> > > EXPORT_SYMBOL(thermal_zone_device_critical);
> > >
> > > static void handle_critical_trips(struct thermal_zone_device
> > > *tz,
> > > - int trip, enum
> > > thermal_trip_type
> > > trip_type)
> > > + int trip, int trip_temp, enum
> > > thermal_trip_type trip_type)
> >
> > This indent cleanup belongs to patch 1/4.
>
> It is not an indent cleanup, the 'int trip_temp' is added in the
> parameters.
Sorry, I meant the indent cleanup in Patch 3/4 can be moved to 2/4.
thanks,
rui
>
> > Other than that,
> >
> > Reviewed-by: Zhang Rui <[email protected]>
>
> [ ... ]
>
>
On Mon, 2022-07-18 at 16:32 +0200, Daniel Lezcano wrote:
> On 18/07/2022 07:28, Zhang Rui wrote:
> > On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> > > By convention the trips points are declared in the ascending
> > > temperature order. However, no specification for the device tree,
> > > ACPI
> > > or documentation tells the trip points must be ordered this way.
> > >
> > > In the other hand, we need those to be ordered to browse them at
> > > the
> > > thermal events. But if we assume they are ordered and change the
> > > code
> > > based on this assumption, any platform with shuffled trip points
> > > description will be broken (if they exist).
> > >
> > > Instead of taking the risk of breaking the existing platforms,
> > > use an
> > > array of temperature ordered trip identifiers and make it
> > > available
> > > for the code needing to browse the trip points in an ordered way.
> > >
> > > Signed-off-by: Daniel Lezcano <[email protected]>
>
> [ ... ]
>
> > > +static void sort_trips_indexes(struct thermal_zone_device *tz)
> > > +{
> > > + int i, j;
> > > +
> > > + for (i = 0; i < tz->trips; i++)
> > > + tz->trips_indexes[i] = i;
> > > +
> > > + for (i = 0; i < tz->trips; i++) {
> > > + for (j = i + 1; j < tz->trips; j++) {
> > > + int t1, t2;
> > > +
> > > + tz->ops->get_trip_temp(tz, tz-
> > > > trips_indexes[i], &t1);
> >
> > This line can be moved to the upper loop.
> >
> > > + tz->ops->get_trip_temp(tz, tz-
> > > > trips_indexes[j], &t2);
>
>
> Actually, we can not move the line up because of the swap below
Oh, right.
But I still think that we should check the disabled trips as well as
the .get_trip_temp() return value here, or else, we may comparing some
random trip_temp value here.
thanks,
rui
>
> > > + if (t1 > t2)
> > > + swap(tz->trips_indexes[i], tz-
> > > > trips_indexes[j]);
> > > + }
> > > + }
> > > +}
>
>
>
>
On Mon, 2022-07-18 at 15:21 +0200, Daniel Lezcano wrote:
>
> Hi Zhang,
>
> thanks for the review
>
> On 18/07/2022 07:28, Zhang Rui wrote:
> > On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
>
> [ ... ]
>
> > > Instead of taking the risk of breaking the existing platforms,
> > > use an
> > > array of temperature ordered trip identifiers and make it
> > > available
> > > for the code needing to browse the trip points in an ordered way.
> > >
> > > Signed-off-by: Daniel Lezcano <[email protected]>
> > > ---
>
> [ ... ]
>
> > > +static void sort_trips_indexes(struct thermal_zone_device *tz)
> > > +{
> > > + int i, j;
> > > +
> > > + for (i = 0; i < tz->trips; i++)
> > > + tz->trips_indexes[i] = i;
> > > +
> > > + for (i = 0; i < tz->trips; i++) {
> > > + for (j = i + 1; j < tz->trips; j++) {
> > > + int t1, t2;
> > > +
> > > + tz->ops->get_trip_temp(tz, tz-
> > > > trips_indexes[i], &t1);
> >
> > This line can be moved to the upper loop.
>
> Right, thanks!
>
> > > + tz->ops->get_trip_temp(tz, tz-
> > > > trips_indexes[j], &t2);
> > > +
> >
> > what about the disabled trip points?
> >
> > we should ignore those trip points and check the return value to
> > make
> > sure we're comparing the valid trip_temp values.
>
> We don't have to care about, whatever the position, the corresponding
> trip id will be disabled by the trip init function before calling
> this
> one and ignored in the handle_thermal_trip() function
hah, I missed this one and replied to your latest reply directly.
The thing I'm concerning is that if we don't check the return value,
for a disabled trip point, the trip_temp (t1/t2) returned is some
random value, it all depends on the previous value set by last
successful .get_trip_temp(), and this may screw up the sorting.
thanks,
rui
>
>
On Tue, 2022-07-19 at 09:14 +0800, Zhang Rui wrote:
> On Mon, 2022-07-18 at 15:21 +0200, Daniel Lezcano wrote:
> >
> > Hi Zhang,
> >
> > thanks for the review
> >
> > On 18/07/2022 07:28, Zhang Rui wrote:
> > > On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> >
> > [ ... ]
> >
> > > > Instead of taking the risk of breaking the existing platforms,
> > > > use an
> > > > array of temperature ordered trip identifiers and make it
> > > > available
> > > > for the code needing to browse the trip points in an ordered
> > > > way.
> > > >
> > > > Signed-off-by: Daniel Lezcano <[email protected]>
> > > > ---
> >
> > [ ... ]
> >
> > > > +static void sort_trips_indexes(struct thermal_zone_device *tz)
> > > > +{
> > > > + int i, j;
> > > > +
> > > > + for (i = 0; i < tz->trips; i++)
> > > > + tz->trips_indexes[i] = i;
> > > > +
> > > > + for (i = 0; i < tz->trips; i++) {
> > > > + for (j = i + 1; j < tz->trips; j++) {
> > > > + int t1, t2;
> > > > +
> > > > + tz->ops->get_trip_temp(tz, tz-
> > > > > trips_indexes[i], &t1);
> > >
> > > This line can be moved to the upper loop.
> >
> > Right, thanks!
> >
> > > > + tz->ops->get_trip_temp(tz, tz-
> > > > > trips_indexes[j], &t2);
> > > > +
> > >
> > > what about the disabled trip points?
> > >
> > > we should ignore those trip points and check the return value to
> > > make
> > > sure we're comparing the valid trip_temp values.
> >
> > We don't have to care about, whatever the position, the
> > corresponding
> > trip id will be disabled by the trip init function before calling
> > this
> > one and ignored in the handle_thermal_trip() function
>
> hah, I missed this one and replied to your latest reply directly.
>
> The thing I'm concerning is that if we don't check the return value,
> for a disabled trip point, the trip_temp (t1/t2) returned is some
> random value, it all depends on the previous value set by last
> successful .get_trip_temp(),
or uninitialized value from the stack, but still random value every
time we invoke .get_trip_temp() for the same trip point.
-rui
> and this may screw up the sorting.
>
> thanks,
> rui
> >
> >
>
On 19/07/2022 03:14, Zhang Rui wrote:
> On Mon, 2022-07-18 at 15:21 +0200, Daniel Lezcano wrote:
>>
>> Hi Zhang,
>>
>> thanks for the review
>>
>> On 18/07/2022 07:28, Zhang Rui wrote:
>>> On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
>>
>> [ ... ]
>>
>>>> Instead of taking the risk of breaking the existing platforms,
>>>> use an
>>>> array of temperature ordered trip identifiers and make it
>>>> available
>>>> for the code needing to browse the trip points in an ordered way.
>>>>
>>>> Signed-off-by: Daniel Lezcano <[email protected]>
>>>> ---
>>
>> [ ... ]
>>
>>>> +static void sort_trips_indexes(struct thermal_zone_device *tz)
>>>> +{
>>>> + int i, j;
>>>> +
>>>> + for (i = 0; i < tz->trips; i++)
>>>> + tz->trips_indexes[i] = i;
>>>> +
>>>> + for (i = 0; i < tz->trips; i++) {
>>>> + for (j = i + 1; j < tz->trips; j++) {
>>>> + int t1, t2;
>>>> +
>>>> + tz->ops->get_trip_temp(tz, tz-
>>>>> trips_indexes[i], &t1);
>>>
>>> This line can be moved to the upper loop.
>>
>> Right, thanks!
>>
>>>> + tz->ops->get_trip_temp(tz, tz-
>>>>> trips_indexes[j], &t2);
>>>> +
>>>
>>> what about the disabled trip points?
>>>
>>> we should ignore those trip points and check the return value to
>>> make
>>> sure we're comparing the valid trip_temp values.
>>
>> We don't have to care about, whatever the position, the corresponding
>> trip id will be disabled by the trip init function before calling
>> this
>> one and ignored in the handle_thermal_trip() function
>
> hah, I missed this one and replied to your latest reply directly.
>
> The thing I'm concerning is that if we don't check the return value,
> for a disabled trip point, the trip_temp (t1/t2) returned is some
> random value, it all depends on the previous value set by last
> successful .get_trip_temp(), and this may screw up the sorting.
The indexes array is the same size as the trip array, that makes the
code much less prone to errors.
To have the same number of trip points, the index of the disabled trip
must be inserted also in the array. We don't care about its position in
the indexes array because it is discarded in the handle_trip_point()
function anyway. For this reason, the random temperature of the disabled
trip point and the resulting position in the sorting is harmless.
It is made on purpose to ignore the return value, so we have a simpler code.
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
On Tue, 2022-07-19 at 09:22 +0200, Daniel Lezcano wrote:
> On 19/07/2022 03:14, Zhang Rui wrote:
> > On Mon, 2022-07-18 at 15:21 +0200, Daniel Lezcano wrote:
> > >
> > > Hi Zhang,
> > >
> > > thanks for the review
> > >
> > > On 18/07/2022 07:28, Zhang Rui wrote:
> > > > On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> > >
> > > [ ... ]
> > >
> > > > > Instead of taking the risk of breaking the existing
> > > > > platforms,
> > > > > use an
> > > > > array of temperature ordered trip identifiers and make it
> > > > > available
> > > > > for the code needing to browse the trip points in an ordered
> > > > > way.
> > > > >
> > > > > Signed-off-by: Daniel Lezcano <[email protected]>
> > > > > ---
> > >
> > > [ ... ]
> > >
> > > > > +static void sort_trips_indexes(struct thermal_zone_device
> > > > > *tz)
> > > > > +{
> > > > > + int i, j;
> > > > > +
> > > > > + for (i = 0; i < tz->trips; i++)
> > > > > + tz->trips_indexes[i] = i;
> > > > > +
> > > > > + for (i = 0; i < tz->trips; i++) {
> > > > > + for (j = i + 1; j < tz->trips; j++) {
> > > > > + int t1, t2;
> > > > > +
> > > > > + tz->ops->get_trip_temp(tz, tz-
> > > > > > trips_indexes[i], &t1);
> > > >
> > > > This line can be moved to the upper loop.
> > >
> > > Right, thanks!
> > >
> > > > > + tz->ops->get_trip_temp(tz, tz-
> > > > > > trips_indexes[j], &t2);
> > > > > +
> > > >
> > > > what about the disabled trip points?
> > > >
> > > > we should ignore those trip points and check the return value
> > > > to
> > > > make
> > > > sure we're comparing the valid trip_temp values.
> > >
> > > We don't have to care about, whatever the position, the
> > > corresponding
> > > trip id will be disabled by the trip init function before calling
> > > this
> > > one and ignored in the handle_thermal_trip() function
> >
> > hah, I missed this one and replied to your latest reply directly.
> >
> > The thing I'm concerning is that if we don't check the return
> > value,
> > for a disabled trip point, the trip_temp (t1/t2) returned is some
> > random value, it all depends on the previous value set by last
> > successful .get_trip_temp(), and this may screw up the sorting.
>
> The indexes array is the same size as the trip array, that makes the
> code much less prone to errors.
>
> To have the same number of trip points, the index of the disabled
> trip
> must be inserted also in the array. We don't care about its position
> in
> the indexes array because it is discarded in the handle_trip_point()
> function anyway. For this reason, the random temperature of the
> disabled
> trip point and the resulting position in the sorting is harmless.
>
> It is made on purpose to ignore the return value, so we have a
> simpler code.
>
Let's take below case for example,
say, we have three trip points 0, 1, 2, and trip point 1 is broken and
disabled.
trip temp for trip point 0 is 10 and for trip point 2 is 20.
.get_trip_temp(tz, 1, &t) fails, and t is an uninitialized random value
Initial:
trip_indexes[0]=0,trip_indexes[1]=1,trip_indexes[2]=2
step1:
i=0,j=1
get trip temp for trip point trip_indexes[0]=0 and trip_indexes[1]=1
trip point 1 returns trip temp 5, and it swaps with trip point 0
so
trip_indexes[0]=1,trip_indexes[1]=0,trip_indexes[2]=2
step2:
i=0,j=2
get trip temp for trip point trip_indexes[0]=1 and trip_indexes[2]=2
trip point 1 returns trip temp 25, and it swaps with trip point 2
so
trip_indexes[0]=2,trip_indexes[1]=0,trip_indexes[2]=1
And the sorting is broken now.
please correct me if I'm missing anything.
thanks,
rui
On 19/07/2022 16:17, Zhang Rui wrote:
> On Tue, 2022-07-19 at 09:22 +0200, Daniel Lezcano wrote:
>> On 19/07/2022 03:14, Zhang Rui wrote:
>>> On Mon, 2022-07-18 at 15:21 +0200, Daniel Lezcano wrote:
>>>>
>>>> Hi Zhang,
>>>>
>>>> thanks for the review
>>>>
>>>> On 18/07/2022 07:28, Zhang Rui wrote:
>>>>> On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
>>>>
>>>> [ ... ]
>>>>
>>>>>> Instead of taking the risk of breaking the existing
>>>>>> platforms,
>>>>>> use an
>>>>>> array of temperature ordered trip identifiers and make it
>>>>>> available
>>>>>> for the code needing to browse the trip points in an ordered
>>>>>> way.
>>>>>>
>>>>>> Signed-off-by: Daniel Lezcano <[email protected]>
>>>>>> ---
>>>>
>>>> [ ... ]
>>>>
>>>>>> +static void sort_trips_indexes(struct thermal_zone_device
>>>>>> *tz)
>>>>>> +{
>>>>>> + int i, j;
>>>>>> +
>>>>>> + for (i = 0; i < tz->trips; i++)
>>>>>> + tz->trips_indexes[i] = i;
>>>>>> +
>>>>>> + for (i = 0; i < tz->trips; i++) {
>>>>>> + for (j = i + 1; j < tz->trips; j++) {
>>>>>> + int t1, t2;
>>>>>> +
>>>>>> + tz->ops->get_trip_temp(tz, tz-
>>>>>>> trips_indexes[i], &t1);
>>>>>
>>>>> This line can be moved to the upper loop.
>>>>
>>>> Right, thanks!
>>>>
>>>>>> + tz->ops->get_trip_temp(tz, tz-
>>>>>>> trips_indexes[j], &t2);
>>>>>> +
>>>>>
>>>>> what about the disabled trip points?
>>>>>
>>>>> we should ignore those trip points and check the return value
>>>>> to
>>>>> make
>>>>> sure we're comparing the valid trip_temp values.
>>>>
>>>> We don't have to care about, whatever the position, the
>>>> corresponding
>>>> trip id will be disabled by the trip init function before calling
>>>> this
>>>> one and ignored in the handle_thermal_trip() function
>>>
>>> hah, I missed this one and replied to your latest reply directly.
>>>
>>> The thing I'm concerning is that if we don't check the return
>>> value,
>>> for a disabled trip point, the trip_temp (t1/t2) returned is some
>>> random value, it all depends on the previous value set by last
>>> successful .get_trip_temp(), and this may screw up the sorting.
>>
>> The indexes array is the same size as the trip array, that makes the
>> code much less prone to errors.
>>
>> To have the same number of trip points, the index of the disabled
>> trip
>> must be inserted also in the array. We don't care about its position
>> in
>> the indexes array because it is discarded in the handle_trip_point()
>> function anyway. For this reason, the random temperature of the
>> disabled
>> trip point and the resulting position in the sorting is harmless.
>>
>> It is made on purpose to ignore the return value, so we have a
>> simpler code.
>>
> Let's take below case for example,
> say, we have three trip points 0, 1, 2, and trip point 1 is broken and
> disabled.
>
> trip temp for trip point 0 is 10 and for trip point 2 is 20.
> .get_trip_temp(tz, 1, &t) fails, and t is an uninitialized random value
>
>
> Initial:
> trip_indexes[0]=0,trip_indexes[1]=1,trip_indexes[2]=2
> step1:
> i=0,j=1
> get trip temp for trip point trip_indexes[0]=0 and trip_indexes[1]=1
> trip point 1 returns trip temp 5, and it swaps with trip point 0
> so
> trip_indexes[0]=1,trip_indexes[1]=0,trip_indexes[2]=2
> step2:
> i=0,j=2
> get trip temp for trip point trip_indexes[0]=1 and trip_indexes[2]=2
> trip point 1 returns trip temp 25, and it swaps with trip point 2
> so
> trip_indexes[0]=2,trip_indexes[1]=0,trip_indexes[2]=1
>
> And the sorting is broken now.
>
> please correct me if I'm missing anything.
Oh, nice! Thanks for the detailed explanation.
We can initialize t1 and t2 to INT_MAX, so if the get_trip_temp() fails,
they will be set to the maximum temperature and it will be at the end of
the array.
Alternatively, we check the disabled bit and set the temperature to INT_MAX.
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
> -----Original Message-----
> From: Daniel Lezcano <[email protected]>
> Sent: Thursday, July 21, 2022 5:35 PM
> To: Zhang, Rui <[email protected]>; [email protected]
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]
> Subject: Re: [PATCH v3 3/4] thermal/core: Build ascending ordered indexes
> for the trip points
> Importance: High
>
> On 19/07/2022 16:17, Zhang Rui wrote:
> > On Tue, 2022-07-19 at 09:22 +0200, Daniel Lezcano wrote:
> >> On 19/07/2022 03:14, Zhang Rui wrote:
> >>> On Mon, 2022-07-18 at 15:21 +0200, Daniel Lezcano wrote:
> >>>>
> >>>> Hi Zhang,
> >>>>
> >>>> thanks for the review
> >>>>
> >>>> On 18/07/2022 07:28, Zhang Rui wrote:
> >>>>> On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> >>>>
> >>>> [ ... ]
> >>>>
> >>>>>> Instead of taking the risk of breaking the existing platforms,
> >>>>>> use an array of temperature ordered trip identifiers and make it
> >>>>>> available for the code needing to browse the trip points in an
> >>>>>> ordered way.
> >>>>>>
> >>>>>> Signed-off-by: Daniel Lezcano <[email protected]>
> >>>>>> ---
> >>>>
> >>>> [ ... ]
> >>>>
> >>>>>> +static void sort_trips_indexes(struct thermal_zone_device
> >>>>>> *tz)
> >>>>>> +{
> >>>>>> + int i, j;
> >>>>>> +
> >>>>>> + for (i = 0; i < tz->trips; i++)
> >>>>>> + tz->trips_indexes[i] = i;
> >>>>>> +
> >>>>>> + for (i = 0; i < tz->trips; i++) {
> >>>>>> + for (j = i + 1; j < tz->trips; j++) {
> >>>>>> + int t1, t2;
> >>>>>> +
> >>>>>> + tz->ops->get_trip_temp(tz, tz-
> >>>>>>> trips_indexes[i], &t1);
> >>>>>
> >>>>> This line can be moved to the upper loop.
> >>>>
> >>>> Right, thanks!
> >>>>
> >>>>>> + tz->ops->get_trip_temp(tz, tz-
> >>>>>>> trips_indexes[j], &t2);
> >>>>>> +
> >>>>>
> >>>>> what about the disabled trip points?
> >>>>>
> >>>>> we should ignore those trip points and check the return value to
> >>>>> make sure we're comparing the valid trip_temp values.
> >>>>
> >>>> We don't have to care about, whatever the position, the
> >>>> corresponding trip id will be disabled by the trip init function
> >>>> before calling this one and ignored in the handle_thermal_trip()
> >>>> function
> >>>
> >>> hah, I missed this one and replied to your latest reply directly.
> >>>
> >>> The thing I'm concerning is that if we don't check the return value,
> >>> for a disabled trip point, the trip_temp (t1/t2) returned is some
> >>> random value, it all depends on the previous value set by last
> >>> successful .get_trip_temp(), and this may screw up the sorting.
> >>
> >> The indexes array is the same size as the trip array, that makes the
> >> code much less prone to errors.
> >>
> >> To have the same number of trip points, the index of the disabled
> >> trip must be inserted also in the array. We don't care about its
> >> position in the indexes array because it is discarded in the
> >> handle_trip_point() function anyway. For this reason, the random
> >> temperature of the disabled trip point and the resulting position in
> >> the sorting is harmless.
> >>
> >> It is made on purpose to ignore the return value, so we have a
> >> simpler code.
> >>
> > Let's take below case for example,
> > say, we have three trip points 0, 1, 2, and trip point 1 is broken and
> > disabled.
> >
> > trip temp for trip point 0 is 10 and for trip point 2 is 20.
> > .get_trip_temp(tz, 1, &t) fails, and t is an uninitialized random
> > value
> >
> >
> > Initial:
> > trip_indexes[0]=0,trip_indexes[1]=1,trip_indexes[2]=2
> > step1:
> > i=0,j=1
> > get trip temp for trip point trip_indexes[0]=0 and trip_indexes[1]=1
> > trip point 1 returns trip temp 5, and it swaps with trip point 0
> > so
> > trip_indexes[0]=1,trip_indexes[1]=0,trip_indexes[2]=2
> > step2:
> > i=0,j=2
> > get trip temp for trip point trip_indexes[0]=1 and trip_indexes[2]=2
> > trip point 1 returns trip temp 25, and it swaps with trip point 2
> > so
> > trip_indexes[0]=2,trip_indexes[1]=0,trip_indexes[2]=1
> >
> > And the sorting is broken now.
> >
> > please correct me if I'm missing anything.
>
> Oh, nice! Thanks for the detailed explanation.
>
> We can initialize t1 and t2 to INT_MAX, so if the get_trip_temp() fails, they
> will be set to the maximum temperature and it will be at the end of the array.
>
> Alternatively, we check the disabled bit and set the temperature to INT_MAX.
IMO, we can
1. get the trip temp for each trip point and cache them
2. set the trips_disabled bit
3. do the sorting using the cached trip temp values
in thermal_zone_device_trip_init() altogether.
Thanks,
rui
>
>
>
>
>
> --
> <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>
> Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
> <http://twitter.com/#!/linaroorg> Twitter | <http://www.linaro.org/linaro-
> blog/> Blog
On Fri, Jul 22, 2022 at 9:16 AM Zhang, Rui <[email protected]> wrote:
>
>
>
> > -----Original Message-----
> > From: Daniel Lezcano <[email protected]>
> > Sent: Thursday, July 21, 2022 5:35 PM
> > To: Zhang, Rui <[email protected]>; [email protected]
> > Cc: [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]
> > Subject: Re: [PATCH v3 3/4] thermal/core: Build ascending ordered indexes
> > for the trip points
> > Importance: High
> >
> > On 19/07/2022 16:17, Zhang Rui wrote:
> > > On Tue, 2022-07-19 at 09:22 +0200, Daniel Lezcano wrote:
> > >> On 19/07/2022 03:14, Zhang Rui wrote:
> > >>> On Mon, 2022-07-18 at 15:21 +0200, Daniel Lezcano wrote:
> > >>>>
> > >>>> Hi Zhang,
> > >>>>
> > >>>> thanks for the review
> > >>>>
> > >>>> On 18/07/2022 07:28, Zhang Rui wrote:
> > >>>>> On Fri, 2022-07-15 at 23:09 +0200, Daniel Lezcano wrote:
> > >>>>
> > >>>> [ ... ]
> > >>>>
> > >>>>>> Instead of taking the risk of breaking the existing platforms,
> > >>>>>> use an array of temperature ordered trip identifiers and make it
> > >>>>>> available for the code needing to browse the trip points in an
> > >>>>>> ordered way.
> > >>>>>>
> > >>>>>> Signed-off-by: Daniel Lezcano <[email protected]>
> > >>>>>> ---
> > >>>>
> > >>>> [ ... ]
> > >>>>
> > >>>>>> +static void sort_trips_indexes(struct thermal_zone_device
> > >>>>>> *tz)
> > >>>>>> +{
> > >>>>>> + int i, j;
> > >>>>>> +
> > >>>>>> + for (i = 0; i < tz->trips; i++)
> > >>>>>> + tz->trips_indexes[i] = i;
> > >>>>>> +
> > >>>>>> + for (i = 0; i < tz->trips; i++) {
> > >>>>>> + for (j = i + 1; j < tz->trips; j++) {
> > >>>>>> + int t1, t2;
> > >>>>>> +
> > >>>>>> + tz->ops->get_trip_temp(tz, tz-
> > >>>>>>> trips_indexes[i], &t1);
> > >>>>>
> > >>>>> This line can be moved to the upper loop.
> > >>>>
> > >>>> Right, thanks!
> > >>>>
> > >>>>>> + tz->ops->get_trip_temp(tz, tz-
> > >>>>>>> trips_indexes[j], &t2);
> > >>>>>> +
> > >>>>>
> > >>>>> what about the disabled trip points?
> > >>>>>
> > >>>>> we should ignore those trip points and check the return value to
> > >>>>> make sure we're comparing the valid trip_temp values.
> > >>>>
> > >>>> We don't have to care about, whatever the position, the
> > >>>> corresponding trip id will be disabled by the trip init function
> > >>>> before calling this one and ignored in the handle_thermal_trip()
> > >>>> function
> > >>>
> > >>> hah, I missed this one and replied to your latest reply directly.
> > >>>
> > >>> The thing I'm concerning is that if we don't check the return value,
> > >>> for a disabled trip point, the trip_temp (t1/t2) returned is some
> > >>> random value, it all depends on the previous value set by last
> > >>> successful .get_trip_temp(), and this may screw up the sorting.
> > >>
> > >> The indexes array is the same size as the trip array, that makes the
> > >> code much less prone to errors.
> > >>
> > >> To have the same number of trip points, the index of the disabled
> > >> trip must be inserted also in the array. We don't care about its
> > >> position in the indexes array because it is discarded in the
> > >> handle_trip_point() function anyway. For this reason, the random
> > >> temperature of the disabled trip point and the resulting position in
> > >> the sorting is harmless.
> > >>
> > >> It is made on purpose to ignore the return value, so we have a
> > >> simpler code.
> > >>
> > > Let's take below case for example,
> > > say, we have three trip points 0, 1, 2, and trip point 1 is broken and
> > > disabled.
> > >
> > > trip temp for trip point 0 is 10 and for trip point 2 is 20.
> > > .get_trip_temp(tz, 1, &t) fails, and t is an uninitialized random
> > > value
> > >
> > >
> > > Initial:
> > > trip_indexes[0]=0,trip_indexes[1]=1,trip_indexes[2]=2
> > > step1:
> > > i=0,j=1
> > > get trip temp for trip point trip_indexes[0]=0 and trip_indexes[1]=1
> > > trip point 1 returns trip temp 5, and it swaps with trip point 0
> > > so
> > > trip_indexes[0]=1,trip_indexes[1]=0,trip_indexes[2]=2
> > > step2:
> > > i=0,j=2
> > > get trip temp for trip point trip_indexes[0]=1 and trip_indexes[2]=2
> > > trip point 1 returns trip temp 25, and it swaps with trip point 2
> > > so
> > > trip_indexes[0]=2,trip_indexes[1]=0,trip_indexes[2]=1
> > >
> > > And the sorting is broken now.
> > >
> > > please correct me if I'm missing anything.
> >
> > Oh, nice! Thanks for the detailed explanation.
> >
> > We can initialize t1 and t2 to INT_MAX, so if the get_trip_temp() fails, they
> > will be set to the maximum temperature and it will be at the end of the array.
> >
> > Alternatively, we check the disabled bit and set the temperature to INT_MAX.
>
> IMO, we can
> 1. get the trip temp for each trip point and cache them
> 2. set the trips_disabled bit
> 3. do the sorting using the cached trip temp values
> in thermal_zone_device_trip_init() altogether.
What about the case when the trip temperature can be set from user
space? I guess we replace the cached value then, but it may be
necessary to sort them again in theory?
Greeting,
FYI, we noticed the following commit (built with gcc-11):
commit: 3c3e786e2b4fa279eb3a36088ea0df4fe4452e8d ("[PATCH v3 4/4] thermal/core: Fix thermal trip cross point")
url: https://github.com/intel-lab-lkp/linux/commits/Daniel-Lezcano/thermal-core-Encapsulate-the-trip-point-crossed-function/20220716-224303
base: https://git.kernel.org/cgit/linux/kernel/git/rafael/linux-pm.git thermal
patch link: https://lore.kernel.org/linux-pm/[email protected]
in testcase: nvml
version: nvml-x86_64-3de7d358f-1_20211217
with following parameters:
group: out
test: none
ucode: 0xec
on test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz with 32G memory
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>
[ 116.868361][ T216] BUG: KASAN: slab-out-of-bounds in handle_thermal_trip (drivers/thermal/thermal_core.c:398)
[ 116.876213][ T216] Read of size 4 at addr ffff888160684198 by task kworker/0:2/216
[ 116.883890][ T216]
[ 116.886087][ T216] CPU: 0 PID: 216 Comm: kworker/0:2 Tainted: G I 5.18.0-02060-g3c3e786e2b4f #1
[ 116.896194][ T216] Hardware name: Dell Inc. OptiPlex 7050/062KRH, BIOS 1.2.0 12/22/2016
[ 116.904302][ T216] Workqueue: events pkg_temp_thermal_threshold_work_fn [x86_pkg_temp_thermal]
[ 116.913026][ T216] Call Trace:
[ 116.916180][ T216] <TASK>
[ 116.918985][ T216] ? handle_thermal_trip (drivers/thermal/thermal_core.c:398)
[ 116.924144][ T216] dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1))
[ 116.928520][ T216] print_address_description+0x1f/0x200
[ 116.934982][ T216] ? handle_thermal_trip (drivers/thermal/thermal_core.c:398)
[ 116.940138][ T216] print_report.cold (mm/kasan/report.c:430)
[ 116.944860][ T216] ? _raw_spin_lock_irqsave (arch/x86/include/asm/atomic.h:202 include/linux/atomic/atomic-instrumented.h:543 include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:185 include/linux/spinlock_api_smp.h:111 kernel/locking/spinlock.c:162)
[ 116.950190][ T216] kasan_report (mm/kasan/report.c:162 mm/kasan/report.c:493)
[ 116.954477][ T216] ? handle_thermal_trip (drivers/thermal/thermal_core.c:398)
[ 116.959632][ T216] handle_thermal_trip (drivers/thermal/thermal_core.c:398)
[ 116.964615][ T216] ? perf_trace_cdev_update (drivers/thermal/thermal_core.c:395)
[ 116.970033][ T216] ? mutex_unlock (arch/x86/include/asm/atomic64_64.h:190 include/linux/atomic/atomic-long.h:449 include/linux/atomic/atomic-instrumented.h:1790 kernel/locking/mutex.c:181 kernel/locking/mutex.c:540)
[ 116.974496][ T216] ? __mutex_unlock_slowpath+0x2c0/0x2c0
[ 116.981046][ T216] thermal_zone_device_update (drivers/thermal/thermal_core.c:538 drivers/thermal/thermal_core.c:513)
[ 116.986639][ T216] ? handle_thermal_trip (drivers/thermal/thermal_core.c:515)
[ 116.991797][ T216] ? _raw_spin_lock_irq (arch/x86/include/asm/atomic.h:202 include/linux/atomic/atomic-instrumented.h:543 include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:185 include/linux/spinlock_api_smp.h:120 kernel/locking/spinlock.c:170)
[ 116.996690][ T216] ? _raw_spin_lock_bh (kernel/locking/spinlock.c:169)
[ 117.001675][ T216] pkg_temp_thermal_threshold_work_fn (drivers/thermal/intel/x86_pkg_temp_thermal.c:300) x86_pkg_temp_thermal
[ 117.009961][ T216] process_one_work (arch/x86/include/asm/jump_label.h:27 include/linux/jump_label.h:207 include/trace/events/workqueue.h:108 kernel/workqueue.c:2294)
[ 117.014771][ T216] worker_thread (include/linux/list.h:292 kernel/workqueue.c:2437)
[ 117.019233][ T216] ? __kthread_parkme (arch/x86/include/asm/bitops.h:207 (discriminator 4) include/asm-generic/bitops/instrumented-non-atomic.h:135 (discriminator 4) kernel/kthread.c:270 (discriminator 4))
[ 117.024049][ T216] ? schedule (arch/x86/include/asm/bitops.h:207 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:135 (discriminator 1) include/linux/thread_info.h:118 (discriminator 1) include/linux/sched.h:2198 (discriminator 1) kernel/sched/core.c:6465 (discriminator 1))
[ 117.028169][ T216] ? process_one_work (kernel/workqueue.c:2379)
[ 117.033246][ T216] ? process_one_work (kernel/workqueue.c:2379)
[ 117.038326][ T216] kthread (kernel/kthread.c:376)
[ 117.042270][ T216] ? kthread_complete_and_exit (kernel/kthread.c:331)
[ 117.047785][ T216] ret_from_fork (arch/x86/entry/entry_64.S:308)
[ 117.052075][ T216] </TASK>
[ 117.054966][ T216]
[ 117.057159][ T216] Allocated by task 18:
[ 117.061179][ T216] kasan_save_stack (mm/kasan/common.c:39)
[ 117.065723][ T216] __kasan_kmalloc (mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:515 mm/kasan/common.c:524)
[ 117.070180][ T216] thermal_zone_device_trip_init (include/linux/slab.h:586 include/linux/slab.h:714 drivers/thermal/thermal_core.c:1218)
[ 117.075944][ T216] thermal_zone_device_register (drivers/thermal/thermal_core.c:1337)
[ 117.081707][ T216] pkg_temp_thermal_device_add (drivers/thermal/intel/x86_pkg_temp_thermal.c:359) x86_pkg_temp_thermal
[ 117.089387][ T216] cpuhp_invoke_callback (kernel/cpu.c:192)
[ 117.094545][ T216] cpuhp_thread_fun (kernel/cpu.c:785)
[ 117.099268][ T216] smpboot_thread_fn (kernel/smpboot.c:164 (discriminator 4))
[ 117.104078][ T216] kthread (kernel/kthread.c:376)
[ 117.108018][ T216] ret_from_fork (arch/x86/entry/entry_64.S:308)
[ 117.112305][ T216]
[ 117.114501][ T216] The buggy address belongs to the object at ffff888160684190
[ 117.114501][ T216] which belongs to the cache kmalloc-8 of size 8
[ 117.128083][ T216] The buggy address is located 0 bytes to the right of
[ 117.128083][ T216] 8-byte region [ffff888160684190, ffff888160684198)
[ 117.141405][ T216]
[ 117.143601][ T216] The buggy address belongs to the physical page:
[ 117.149880][ T216] page:00000000a8af6427 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888160684000 pfn:0x160684
[ 117.161302][ T216] flags: 0x17ffffc0000200(slab|node=0|zone=2|lastcpupid=0x1fffff)
[ 117.168978][ T216] raw: 0017ffffc0000200 ffffea002014f188 ffffea0020070c08 ffff888100042280
[ 117.177434][ T216] raw: ffff888160684000 0000000000660023 00000001ffffffff 0000000000000000
[ 117.185890][ T216] page dumped because: kasan: bad access detected
[ 117.192167][ T216]
[ 117.194358][ T216] Memory state around the buggy address:
[ 117.199856][ T216] ffff888160684080: fc fc fc fc 00 fc fc fc fc fb fc fc fc fc fb fc
[ 117.207790][ T216] ffff888160684100: fc fc fc fb fc fc fc fc fb fc fc fc fc fb fc fc
[ 117.215724][ T216] >ffff888160684180: fc fc 00 fc fc fc fc 00 fc fc fc fc fb fc fc fc
[ 117.223656][ T216] ^
[ 117.228376][ T216] ffff888160684200: fc fb fc fc fc fc fb fc fc fc fc fb fc fc fc fc
[ 117.236311][ T216] ffff888160684280: 00 fc fc fc fc fb fc fc fc fc 00 fc fc fc fc fb
[ 117.244244][ T216] ==================================================================
[ 117.252268][ T216] Disabling lock debugging due to kernel taint
[ 121.513688][ T391] clang -MD -c -o ../nondebug/librpmem/alloc.o -std=gnu99 -Wall -Werror -Wmissing-prototypes -Wpointer-arith -Wsign-conversion -Wsign-compare -Wunused-parameter -Wconversion -Wunused-macros -Wmissing-field-initializers -Wunreachable-code-return -Wmissing-variable-declarations -Wfloat-equal -Wswitch-default -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -std=gnu99 -fno-common -pthread -DSRCVERSION="1.11.0+git148.gfe27e1033" -fno-lto -DSDS_ENABLED -DNDCTL_ENABLED=1 -DPAGE_SIZE=4096 -DUSE_VALGRIND -Wno-error -I/usr/local/include -I. -I../rpmem_common -DRPMEMC_LOG_RPMEM -I../include -I../common/ -I../core/ -fPIC ../../src/../src/core/alloc.c
[ 121.513705][ T391]
[ 140.327169][ T391] clang -MD -c -o ../nondebug/common/ctl.o -std=gnu99 -Wall -Werror -Wmissing-prototypes -Wpointer-arith -Wsign-conversion -Wsign-compare -Wunused-parameter -Wconversion -Wunused-macros -Wmissing-field-initializers -Wunreachable-code-return -Wmissing-variable-declarations -Wfloat-equal -Wswitch-default -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -std=gnu99 -fno-common -pthread -DSRCVERSION="1.11.0+git148.gfe27e1033" -fno-lto -DSDS_ENABLED -DNDCTL_ENABLED=1 -DPAGE_SIZE=4096 -DUSE_VALGRIND -Wno-error -I/usr/local/include -DUSE_LIBDL -I../include -I../common/ -I../core/ -fPIC ctl.c
[ 140.327186][ T391]
[ 146.103295][ T391] ../../src/../utils/check-os.sh ../../src/../utils/os-banned ../nondebug/core/alloc.o alloc.c
[ 146.103309][ T391]
[ 146.435828][ T391] ../../src/../utils/check-os.sh ../../src/../utils/os-banned ../nondebug/core/fs_posix.o fs_posix.c
[ 146.435840][ T391]
[ 146.752768][ T391] ../../src/../utils/check-os.sh ../../src/../utils/os-banned ../nondebug/common/bad_blocks.o bad_blocks.c
[ 146.752783][ T391]
[ 147.317604][ T391] ../../src/../utils/check-os.sh ../../src/../utils/os-banned ../nondebug/librpmem/alloc.o ../../src/../src/core/alloc.c
[ 147.317614][ T391]
[ 147.335464][ T391] ../../src/../utils/check-os.sh ../../src/../utils/os-banned ../nondebug/common/set_badblocks.o set_badblocks.c
[ 147.335484][ T391]
[ 155.193789][ T393] ../../src/../src/libpmem2/deep_flush_other.c:38:33: warning: unused parameter 'region_id' [-Wunused-parameter]
[ 155.193798][ T393]
[ 155.208897][ T393] pmem2_deep_flush_write(unsigned region_id)
[ 155.208904][ T393]
[ 155.217849][ T393] ^
[ 155.217856][ T393]
[ 155.225694][ T393] 1 warning generated.
[ 155.225701][ T393]
[ 155.234634][ T393] ../../src/../src/libpmem2/pmem2_utils_other.c:38:50: warning: unused parameter 'src' [-Wunused-parameter]
[ 155.234642][ T393]
[ 155.249984][ T393] pmem2_device_dax_size(const struct pmem2_source *src, size_t *size)
[ 155.249992][ T393]
[ 155.261566][ T393] ^
[ 155.261573][ T393]
[ 155.273080][ T393] ../../src/../src/libpmem2/pmem2_utils_other.c:38:63: warning: unused parameter 'size' [-Wunused-parameter]
[ 155.273086][ T393]
[ 155.288567][ T393] pmem2_device_dax_size(const struct pmem2_source *src, size_t *size)
[ 155.288575][ T393]
To reproduce:
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
sudo bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
sudo bin/lkp run generated-yaml-file
# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.
--
0-DAY CI Kernel Test Service
https://01.org/lkp
On Fri, Jul 15, 2022 at 11:09 PM Daniel Lezcano
<[email protected]> wrote:
>
> The routine doing trip point crossing the way up or down is actually
> wrong.
>
> A trip point is composed with a trip temperature and a hysteresis.
>
> The trip temperature is used to detect when the trip point is crossed
> the way up.
>
> The trip temperature minus the hysteresis is used to detect when the
> trip point is crossed the way down.
>
> |-----------low--------high------------|
> |<--------->|
> | hyst |
> | |
> | -|--> crossed the way up
> |
> <---|-- crossed the way down
>
> For that, there is a two point comparison: the current temperature and
> the previous temperature.
>
> The actual code assumes if the current temperature is greater than the
> trip temperature and the previous temperature was lesser, then the
> trip point is crossed the way up. That is true only if we crossed the
> way down the low temperature boundary from the previous temperature or
> if the hysteresis is zero. The temperature can decrease between the
> low and high, so the trip point is not crossed the way down and then
> increase again and cross the high temperature raising a new trip point
> crossed detection which is incorrect. The same scenario happens when
> crossing the way down.
>
> The trip point crossing the way up and down must act as parenthesis, a
> trip point down must close a trip point up. Today we have multiple
> trip point up without the corresponding trip point down.
>
> In order to fix that, we store the previous trip point which gives the
> information about the previous trip and we change the trip point
> browsing order depending on the temperature trend: in the ascending
> order when the temperature trend is raising, otherwise in the
> descending order.
There is an alternative way of addressing this problem which doesn't
require using information regarding the previous trip point and so it
would work even if the trips were not sorted.
Namely, for each trip there can be an effective threshold equal to
either its temperature. or its temperature minus its hysteresis (low
temperature). If the initial zone temperature is below the trip's
temperature, the initial value of its threshold is equal to its
temperature. Otherwise, the initial value of the trip's threshold is
its low temperature.
Then, if the zone temperature crosses the threshold (either up or
down), the trip crossing triggers and the threshold value is flipped
(that is, if it was equal to the trip's temperature, it becomes its
low temperature or the other way around). [Note that if the threshold
value is equal to the trip temperature, it can only be crossed on the
way up, because it means that the zone temperature was below it at one
point and has not grown above it since then. Conversely, if the
threshold value is equal to the low temperature of the trip, it can
only be crossed on the way down, because it means that the zone
temperature was above the trip temperature at one point and it has not
fallen below the trip's low temperature since then.]
This should not be hard to implement AFAICS and it should also work in
the cases when one trip is located in the hysteresis range of another
one.