2020-08-12 20:54:17

by Jeremy Cline

[permalink] [raw]
Subject: [PATCH] drm/nouveau: Add fine-grain temperature reporting

Commit d32656373857 ("drm/nouveau/therm/gp100: initial implementation of
new gp1xx temperature sensor") added support for reading finer-grain
temperatures, but continued to report temperatures in 1 degree Celsius
increments via nvkm_therm_temp_get().

Rather than altering nvkm_therm_temp_get() to report finer-grain
temperatures, which would be inconvenient for other users of the
function, a second interface has been added to line up with hwmon's
native unit of temperature.

Signed-off-by: Jeremy Cline <[email protected]>
---
.../drm/nouveau/include/nvkm/subdev/therm.h | 18 +++++++++++++
drivers/gpu/drm/nouveau/nouveau_hwmon.c | 4 +--
.../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 16 ++++++++++++
.../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 25 +++++++++++++++++--
.../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 1 +
5 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
index 62c34f98c930..7b9928dd001c 100644
--- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
+++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
@@ -100,6 +100,24 @@ struct nvkm_therm {
};

int nvkm_therm_temp_get(struct nvkm_therm *);
+
+/**
+ * nvkm_therm_temp_millidegree_get() - get the temperature in millidegrees
+ * @therm: The thermal device to read from.
+ *
+ * This interface reports temperatures in units of millidegree Celsius to
+ * align with the hwmon API. Some cards may only be capable of reporting in
+ * units of Celsius, and those that report finer grain temperatures may not be
+ * capable of millidegree Celsius accuracy,
+ *
+ * For cases where millidegree temperature is too fine-grain, the
+ * nvkm_therm_temp_get() interface reports temperatures in one degree Celsius
+ * increments.
+ *
+ * Return: The temperature in millidegrees Celsius, or -ENODEV if temperature
+ * reporting is not supported.
+ */
+int nvkm_therm_temp_millidegree_get(struct nvkm_therm *therm);
int nvkm_therm_fan_sense(struct nvkm_therm *);
int nvkm_therm_cstate(struct nvkm_therm *, int, int);
void nvkm_therm_clkgate_init(struct nvkm_therm *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
index 1c3104d20571..e96355f93ce5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
+++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
@@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
case hwmon_temp_input:
if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
return -EINVAL;
- ret = nvkm_therm_temp_get(therm);
- *val = ret < 0 ? ret : (ret * 1000);
+ ret = nvkm_therm_temp_millidegree_get(therm);
+ *val = ret;
break;
case hwmon_temp_max:
*val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
index 4a4d1e224126..e655b32c78b8 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
@@ -34,6 +34,22 @@ nvkm_therm_temp_get(struct nvkm_therm *therm)
return -ENODEV;
}

+int
+nvkm_therm_temp_millidegree_get(struct nvkm_therm *therm)
+{
+ int ret = -ENODEV;
+
+ if (therm->func->temp_millidegree_get)
+ return therm->func->temp_millidegree_get(therm);
+
+ if (therm->func->temp_get) {
+ ret = therm->func->temp_get(therm);
+ if (ret > 0)
+ ret *= 1000;
+ }
+ return ret;
+}
+
static int
nvkm_therm_update_trip(struct nvkm_therm *therm)
{
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
index 9f0dea3f61dc..4c3c2895a3cb 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
@@ -24,7 +24,7 @@
#include "priv.h"

static int
-gp100_temp_get(struct nvkm_therm *therm)
+gp100_temp_get_raw(struct nvkm_therm *therm)
{
struct nvkm_device *device = therm->subdev.device;
struct nvkm_subdev *subdev = &therm->subdev;
@@ -37,14 +37,35 @@ gp100_temp_get(struct nvkm_therm *therm)

/* device valid */
if (tsensor & 0x20000000)
- return (inttemp >> 8);
+ return inttemp;
else
return -ENODEV;
}

+static int
+gp100_temp_millidegree_get(struct nvkm_therm *therm)
+{
+ int raw_temp = gp100_temp_get_raw(therm);
+
+ if (raw_temp < 0)
+ return raw_temp;
+ return raw_temp * 1000 >> 8;
+}
+
+static int
+gp100_temp_get(struct nvkm_therm *therm)
+{
+ int raw_temp = gp100_temp_get_raw(therm);
+
+ if (raw_temp < 0)
+ return raw_temp;
+ return raw_temp >> 8;
+}
+
static const struct nvkm_therm_func
gp100_therm = {
.temp_get = gp100_temp_get,
+ .temp_millidegree_get = gp100_temp_millidegree_get,
.program_alarms = nvkm_therm_program_alarms_polling,
};

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
index 21659daf1864..a53068b4f0b9 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
@@ -92,6 +92,7 @@ struct nvkm_therm_func {
int (*pwm_clock)(struct nvkm_therm *, int line);

int (*temp_get)(struct nvkm_therm *);
+ int (*temp_millidegree_get)(struct nvkm_therm *therm);

int (*fan_sense)(struct nvkm_therm *);

--
2.26.2


2020-08-13 12:45:09

by Karol Herbst

[permalink] [raw]
Subject: Re: [PATCH] drm/nouveau: Add fine-grain temperature reporting

Reviewed-by: Karol Herbst <[email protected]>

On Wed, Aug 12, 2020 at 10:50 PM Jeremy Cline <[email protected]> wrote:
>
> Commit d32656373857 ("drm/nouveau/therm/gp100: initial implementation of
> new gp1xx temperature sensor") added support for reading finer-grain
> temperatures, but continued to report temperatures in 1 degree Celsius
> increments via nvkm_therm_temp_get().
>
> Rather than altering nvkm_therm_temp_get() to report finer-grain
> temperatures, which would be inconvenient for other users of the
> function, a second interface has been added to line up with hwmon's
> native unit of temperature.
>
> Signed-off-by: Jeremy Cline <[email protected]>
> ---
> .../drm/nouveau/include/nvkm/subdev/therm.h | 18 +++++++++++++
> drivers/gpu/drm/nouveau/nouveau_hwmon.c | 4 +--
> .../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 16 ++++++++++++
> .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 25 +++++++++++++++++--
> .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 1 +
> 5 files changed, 60 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> index 62c34f98c930..7b9928dd001c 100644
> --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> @@ -100,6 +100,24 @@ struct nvkm_therm {
> };
>
> int nvkm_therm_temp_get(struct nvkm_therm *);
> +
> +/**
> + * nvkm_therm_temp_millidegree_get() - get the temperature in millidegrees
> + * @therm: The thermal device to read from.
> + *
> + * This interface reports temperatures in units of millidegree Celsius to
> + * align with the hwmon API. Some cards may only be capable of reporting in
> + * units of Celsius, and those that report finer grain temperatures may not be
> + * capable of millidegree Celsius accuracy,
> + *
> + * For cases where millidegree temperature is too fine-grain, the
> + * nvkm_therm_temp_get() interface reports temperatures in one degree Celsius
> + * increments.
> + *
> + * Return: The temperature in millidegrees Celsius, or -ENODEV if temperature
> + * reporting is not supported.
> + */
> +int nvkm_therm_temp_millidegree_get(struct nvkm_therm *therm);
> int nvkm_therm_fan_sense(struct nvkm_therm *);
> int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> void nvkm_therm_clkgate_init(struct nvkm_therm *,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> index 1c3104d20571..e96355f93ce5 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> case hwmon_temp_input:
> if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> return -EINVAL;
> - ret = nvkm_therm_temp_get(therm);
> - *val = ret < 0 ? ret : (ret * 1000);
> + ret = nvkm_therm_temp_millidegree_get(therm);
> + *val = ret;
> break;
> case hwmon_temp_max:
> *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> index 4a4d1e224126..e655b32c78b8 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> @@ -34,6 +34,22 @@ nvkm_therm_temp_get(struct nvkm_therm *therm)
> return -ENODEV;
> }
>
> +int
> +nvkm_therm_temp_millidegree_get(struct nvkm_therm *therm)
> +{
> + int ret = -ENODEV;
> +
> + if (therm->func->temp_millidegree_get)
> + return therm->func->temp_millidegree_get(therm);
> +
> + if (therm->func->temp_get) {
> + ret = therm->func->temp_get(therm);
> + if (ret > 0)
> + ret *= 1000;
> + }
> + return ret;
> +}
> +
> static int
> nvkm_therm_update_trip(struct nvkm_therm *therm)
> {
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> index 9f0dea3f61dc..4c3c2895a3cb 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> @@ -24,7 +24,7 @@
> #include "priv.h"
>
> static int
> -gp100_temp_get(struct nvkm_therm *therm)
> +gp100_temp_get_raw(struct nvkm_therm *therm)
> {
> struct nvkm_device *device = therm->subdev.device;
> struct nvkm_subdev *subdev = &therm->subdev;
> @@ -37,14 +37,35 @@ gp100_temp_get(struct nvkm_therm *therm)
>
> /* device valid */
> if (tsensor & 0x20000000)
> - return (inttemp >> 8);
> + return inttemp;
> else
> return -ENODEV;
> }
>
> +static int
> +gp100_temp_millidegree_get(struct nvkm_therm *therm)
> +{
> + int raw_temp = gp100_temp_get_raw(therm);
> +
> + if (raw_temp < 0)
> + return raw_temp;
> + return raw_temp * 1000 >> 8;
> +}
> +
> +static int
> +gp100_temp_get(struct nvkm_therm *therm)
> +{
> + int raw_temp = gp100_temp_get_raw(therm);
> +
> + if (raw_temp < 0)
> + return raw_temp;
> + return raw_temp >> 8;
> +}
> +
> static const struct nvkm_therm_func
> gp100_therm = {
> .temp_get = gp100_temp_get,
> + .temp_millidegree_get = gp100_temp_millidegree_get,
> .program_alarms = nvkm_therm_program_alarms_polling,
> };
>
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> index 21659daf1864..a53068b4f0b9 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> @@ -92,6 +92,7 @@ struct nvkm_therm_func {
> int (*pwm_clock)(struct nvkm_therm *, int line);
>
> int (*temp_get)(struct nvkm_therm *);
> + int (*temp_millidegree_get)(struct nvkm_therm *therm);
>
> int (*fan_sense)(struct nvkm_therm *);
>
> --
> 2.26.2
>

2020-09-09 04:09:45

by Ben Skeggs

[permalink] [raw]
Subject: Re: [Nouveau] [PATCH] drm/nouveau: Add fine-grain temperature reporting

On Thu, 13 Aug 2020 at 06:50, Jeremy Cline <[email protected]> wrote:
>
> Commit d32656373857 ("drm/nouveau/therm/gp100: initial implementation of
> new gp1xx temperature sensor") added support for reading finer-grain
> temperatures, but continued to report temperatures in 1 degree Celsius
> increments via nvkm_therm_temp_get().
>
> Rather than altering nvkm_therm_temp_get() to report finer-grain
> temperatures, which would be inconvenient for other users of the
> function, a second interface has been added to line up with hwmon's
> native unit of temperature.
Hey Jeremy,

Sorry this slipped past me until now. I'm OK with adding support for
millidegree temperature reporting, but don't think we need to keep
both interfaces around and would rather see the existing code
converted to return millidegrees (even on GPUs that don't support it)
instead of degrees.

Thanks!
Ben.

>
> Signed-off-by: Jeremy Cline <[email protected]>
> ---
> .../drm/nouveau/include/nvkm/subdev/therm.h | 18 +++++++++++++
> drivers/gpu/drm/nouveau/nouveau_hwmon.c | 4 +--
> .../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 16 ++++++++++++
> .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 25 +++++++++++++++++--
> .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 1 +
> 5 files changed, 60 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> index 62c34f98c930..7b9928dd001c 100644
> --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> @@ -100,6 +100,24 @@ struct nvkm_therm {
> };
>
> int nvkm_therm_temp_get(struct nvkm_therm *);
> +
> +/**
> + * nvkm_therm_temp_millidegree_get() - get the temperature in millidegrees
> + * @therm: The thermal device to read from.
> + *
> + * This interface reports temperatures in units of millidegree Celsius to
> + * align with the hwmon API. Some cards may only be capable of reporting in
> + * units of Celsius, and those that report finer grain temperatures may not be
> + * capable of millidegree Celsius accuracy,
> + *
> + * For cases where millidegree temperature is too fine-grain, the
> + * nvkm_therm_temp_get() interface reports temperatures in one degree Celsius
> + * increments.
> + *
> + * Return: The temperature in millidegrees Celsius, or -ENODEV if temperature
> + * reporting is not supported.
> + */
> +int nvkm_therm_temp_millidegree_get(struct nvkm_therm *therm);
> int nvkm_therm_fan_sense(struct nvkm_therm *);
> int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> void nvkm_therm_clkgate_init(struct nvkm_therm *,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> index 1c3104d20571..e96355f93ce5 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> case hwmon_temp_input:
> if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> return -EINVAL;
> - ret = nvkm_therm_temp_get(therm);
> - *val = ret < 0 ? ret : (ret * 1000);
> + ret = nvkm_therm_temp_millidegree_get(therm);
> + *val = ret;
> break;
> case hwmon_temp_max:
> *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> index 4a4d1e224126..e655b32c78b8 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> @@ -34,6 +34,22 @@ nvkm_therm_temp_get(struct nvkm_therm *therm)
> return -ENODEV;
> }
>
> +int
> +nvkm_therm_temp_millidegree_get(struct nvkm_therm *therm)
> +{
> + int ret = -ENODEV;
> +
> + if (therm->func->temp_millidegree_get)
> + return therm->func->temp_millidegree_get(therm);
> +
> + if (therm->func->temp_get) {
> + ret = therm->func->temp_get(therm);
> + if (ret > 0)
> + ret *= 1000;
> + }
> + return ret;
> +}
> +
> static int
> nvkm_therm_update_trip(struct nvkm_therm *therm)
> {
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> index 9f0dea3f61dc..4c3c2895a3cb 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> @@ -24,7 +24,7 @@
> #include "priv.h"
>
> static int
> -gp100_temp_get(struct nvkm_therm *therm)
> +gp100_temp_get_raw(struct nvkm_therm *therm)
> {
> struct nvkm_device *device = therm->subdev.device;
> struct nvkm_subdev *subdev = &therm->subdev;
> @@ -37,14 +37,35 @@ gp100_temp_get(struct nvkm_therm *therm)
>
> /* device valid */
> if (tsensor & 0x20000000)
> - return (inttemp >> 8);
> + return inttemp;
> else
> return -ENODEV;
> }
>
> +static int
> +gp100_temp_millidegree_get(struct nvkm_therm *therm)
> +{
> + int raw_temp = gp100_temp_get_raw(therm);
> +
> + if (raw_temp < 0)
> + return raw_temp;
> + return raw_temp * 1000 >> 8;
> +}
> +
> +static int
> +gp100_temp_get(struct nvkm_therm *therm)
> +{
> + int raw_temp = gp100_temp_get_raw(therm);
> +
> + if (raw_temp < 0)
> + return raw_temp;
> + return raw_temp >> 8;
> +}
> +
> static const struct nvkm_therm_func
> gp100_therm = {
> .temp_get = gp100_temp_get,
> + .temp_millidegree_get = gp100_temp_millidegree_get,
> .program_alarms = nvkm_therm_program_alarms_polling,
> };
>
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> index 21659daf1864..a53068b4f0b9 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> @@ -92,6 +92,7 @@ struct nvkm_therm_func {
> int (*pwm_clock)(struct nvkm_therm *, int line);
>
> int (*temp_get)(struct nvkm_therm *);
> + int (*temp_millidegree_get)(struct nvkm_therm *therm);
>
> int (*fan_sense)(struct nvkm_therm *);
>
> --
> 2.26.2
>
> _______________________________________________
> Nouveau mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/nouveau

2020-09-09 08:25:42

by Karol Herbst

[permalink] [raw]
Subject: Re: [Nouveau] [PATCH] drm/nouveau: Add fine-grain temperature reporting

On Wed, Sep 9, 2020 at 6:06 AM Ben Skeggs <[email protected]> wrote:
>
> On Thu, 13 Aug 2020 at 06:50, Jeremy Cline <[email protected]> wrote:
> >
> > Commit d32656373857 ("drm/nouveau/therm/gp100: initial implementation of
> > new gp1xx temperature sensor") added support for reading finer-grain
> > temperatures, but continued to report temperatures in 1 degree Celsius
> > increments via nvkm_therm_temp_get().
> >
> > Rather than altering nvkm_therm_temp_get() to report finer-grain
> > temperatures, which would be inconvenient for other users of the
> > function, a second interface has been added to line up with hwmon's
> > native unit of temperature.
> Hey Jeremy,
>
> Sorry this slipped past me until now. I'm OK with adding support for
> millidegree temperature reporting, but don't think we need to keep
> both interfaces around and would rather see the existing code
> converted to return millidegrees (even on GPUs that don't support it)
> instead of degrees.
>
> Thanks!
> Ben.
>

just a note as I was trying something like that in the past: we have a
lot of code which fetches the temperature and there are a lot of
places where we would then do a divide by 1000, so having a wrapper
function at least makes sense. If we want to keep the func pointers?
well.. I guess the increased CPU overhead wouldn't be as bad if all
sub classes do this static * 1000 as well either. I just think we
should have both interfaces in subdev/therm.h so we can just keep most
of the current code as is.

> >
> > Signed-off-by: Jeremy Cline <[email protected]>
> > ---
> > .../drm/nouveau/include/nvkm/subdev/therm.h | 18 +++++++++++++
> > drivers/gpu/drm/nouveau/nouveau_hwmon.c | 4 +--
> > .../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 16 ++++++++++++
> > .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 25 +++++++++++++++++--
> > .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 1 +
> > 5 files changed, 60 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > index 62c34f98c930..7b9928dd001c 100644
> > --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > @@ -100,6 +100,24 @@ struct nvkm_therm {
> > };
> >
> > int nvkm_therm_temp_get(struct nvkm_therm *);
> > +
> > +/**
> > + * nvkm_therm_temp_millidegree_get() - get the temperature in millidegrees
> > + * @therm: The thermal device to read from.
> > + *
> > + * This interface reports temperatures in units of millidegree Celsius to
> > + * align with the hwmon API. Some cards may only be capable of reporting in
> > + * units of Celsius, and those that report finer grain temperatures may not be
> > + * capable of millidegree Celsius accuracy,
> > + *
> > + * For cases where millidegree temperature is too fine-grain, the
> > + * nvkm_therm_temp_get() interface reports temperatures in one degree Celsius
> > + * increments.
> > + *
> > + * Return: The temperature in millidegrees Celsius, or -ENODEV if temperature
> > + * reporting is not supported.
> > + */
> > +int nvkm_therm_temp_millidegree_get(struct nvkm_therm *therm);
> > int nvkm_therm_fan_sense(struct nvkm_therm *);
> > int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> > void nvkm_therm_clkgate_init(struct nvkm_therm *,
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > index 1c3104d20571..e96355f93ce5 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > case hwmon_temp_input:
> > if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> > return -EINVAL;
> > - ret = nvkm_therm_temp_get(therm);
> > - *val = ret < 0 ? ret : (ret * 1000);
> > + ret = nvkm_therm_temp_millidegree_get(therm);
> > + *val = ret;
> > break;
> > case hwmon_temp_max:
> > *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > index 4a4d1e224126..e655b32c78b8 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > @@ -34,6 +34,22 @@ nvkm_therm_temp_get(struct nvkm_therm *therm)
> > return -ENODEV;
> > }
> >
> > +int
> > +nvkm_therm_temp_millidegree_get(struct nvkm_therm *therm)
> > +{
> > + int ret = -ENODEV;
> > +
> > + if (therm->func->temp_millidegree_get)
> > + return therm->func->temp_millidegree_get(therm);
> > +
> > + if (therm->func->temp_get) {
> > + ret = therm->func->temp_get(therm);
> > + if (ret > 0)
> > + ret *= 1000;
> > + }
> > + return ret;
> > +}
> > +
> > static int
> > nvkm_therm_update_trip(struct nvkm_therm *therm)
> > {
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > index 9f0dea3f61dc..4c3c2895a3cb 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > @@ -24,7 +24,7 @@
> > #include "priv.h"
> >
> > static int
> > -gp100_temp_get(struct nvkm_therm *therm)
> > +gp100_temp_get_raw(struct nvkm_therm *therm)
> > {
> > struct nvkm_device *device = therm->subdev.device;
> > struct nvkm_subdev *subdev = &therm->subdev;
> > @@ -37,14 +37,35 @@ gp100_temp_get(struct nvkm_therm *therm)
> >
> > /* device valid */
> > if (tsensor & 0x20000000)
> > - return (inttemp >> 8);
> > + return inttemp;
> > else
> > return -ENODEV;
> > }
> >
> > +static int
> > +gp100_temp_millidegree_get(struct nvkm_therm *therm)
> > +{
> > + int raw_temp = gp100_temp_get_raw(therm);
> > +
> > + if (raw_temp < 0)
> > + return raw_temp;
> > + return raw_temp * 1000 >> 8;
> > +}
> > +
> > +static int
> > +gp100_temp_get(struct nvkm_therm *therm)
> > +{
> > + int raw_temp = gp100_temp_get_raw(therm);
> > +
> > + if (raw_temp < 0)
> > + return raw_temp;
> > + return raw_temp >> 8;
> > +}
> > +
> > static const struct nvkm_therm_func
> > gp100_therm = {
> > .temp_get = gp100_temp_get,
> > + .temp_millidegree_get = gp100_temp_millidegree_get,
> > .program_alarms = nvkm_therm_program_alarms_polling,
> > };
> >
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > index 21659daf1864..a53068b4f0b9 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > @@ -92,6 +92,7 @@ struct nvkm_therm_func {
> > int (*pwm_clock)(struct nvkm_therm *, int line);
> >
> > int (*temp_get)(struct nvkm_therm *);
> > + int (*temp_millidegree_get)(struct nvkm_therm *therm);
> >
> > int (*fan_sense)(struct nvkm_therm *);
> >
> > --
> > 2.26.2
> >
> > _______________________________________________
> > Nouveau mailing list
> > [email protected]
> > https://lists.freedesktop.org/mailman/listinfo/nouveau
> _______________________________________________
> Nouveau mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/nouveau
>

2020-09-09 17:13:59

by Jeremy Cline

[permalink] [raw]
Subject: Re: [Nouveau] [PATCH] drm/nouveau: Add fine-grain temperature reporting

On Wed, Sep 09, 2020 at 10:22:14AM +0200, Karol Herbst wrote:
> On Wed, Sep 9, 2020 at 6:06 AM Ben Skeggs <[email protected]> wrote:
> >
> > On Thu, 13 Aug 2020 at 06:50, Jeremy Cline <[email protected]> wrote:
> > >
> > > Commit d32656373857 ("drm/nouveau/therm/gp100: initial implementation of
> > > new gp1xx temperature sensor") added support for reading finer-grain
> > > temperatures, but continued to report temperatures in 1 degree Celsius
> > > increments via nvkm_therm_temp_get().
> > >
> > > Rather than altering nvkm_therm_temp_get() to report finer-grain
> > > temperatures, which would be inconvenient for other users of the
> > > function, a second interface has been added to line up with hwmon's
> > > native unit of temperature.
> > Hey Jeremy,
> >
> > Sorry this slipped past me until now. I'm OK with adding support for
> > millidegree temperature reporting, but don't think we need to keep
> > both interfaces around and would rather see the existing code
> > converted to return millidegrees (even on GPUs that don't support it)
> > instead of degrees.
> >
> > Thanks!
> > Ben.
> >
>
> just a note as I was trying something like that in the past: we have a
> lot of code which fetches the temperature and there are a lot of
> places where we would then do a divide by 1000, so having a wrapper
> function at least makes sense. If we want to keep the func pointers?
> well.. I guess the increased CPU overhead wouldn't be as bad if all
> sub classes do this static * 1000 as well either. I just think we
> should have both interfaces in subdev/therm.h so we can just keep most
> of the current code as is.
>

Indeed. My initial plan was to change the meaning of temp_get() and
adjust all the users, but there's around a dozen of them and based on my
understanding of the users none of them cared much about such accuracy.

However, I do find having one way to do things appealing, so if there's
a strong preference for it, I'm happy to produce a somewhat more
invasive patch where all users expect millidegrees.

- Jeremy

2020-09-10 04:25:31

by Ben Skeggs

[permalink] [raw]
Subject: Re: [Nouveau] [PATCH] drm/nouveau: Add fine-grain temperature reporting

On Thu, 10 Sep 2020 at 00:07, Jeremy Cline <[email protected]> wrote:
>
> On Wed, Sep 09, 2020 at 10:22:14AM +0200, Karol Herbst wrote:
> > On Wed, Sep 9, 2020 at 6:06 AM Ben Skeggs <[email protected]> wrote:
> > >
> > > On Thu, 13 Aug 2020 at 06:50, Jeremy Cline <[email protected]> wrote:
> > > >
> > > > Commit d32656373857 ("drm/nouveau/therm/gp100: initial implementation of
> > > > new gp1xx temperature sensor") added support for reading finer-grain
> > > > temperatures, but continued to report temperatures in 1 degree Celsius
> > > > increments via nvkm_therm_temp_get().
> > > >
> > > > Rather than altering nvkm_therm_temp_get() to report finer-grain
> > > > temperatures, which would be inconvenient for other users of the
> > > > function, a second interface has been added to line up with hwmon's
> > > > native unit of temperature.
> > > Hey Jeremy,
> > >
> > > Sorry this slipped past me until now. I'm OK with adding support for
> > > millidegree temperature reporting, but don't think we need to keep
> > > both interfaces around and would rather see the existing code
> > > converted to return millidegrees (even on GPUs that don't support it)
> > > instead of degrees.
> > >
> > > Thanks!
> > > Ben.
> > >
> >
> > just a note as I was trying something like that in the past: we have a
> > lot of code which fetches the temperature and there are a lot of
> > places where we would then do a divide by 1000, so having a wrapper
> > function at least makes sense. If we want to keep the func pointers?
> > well.. I guess the increased CPU overhead wouldn't be as bad if all
> > sub classes do this static * 1000 as well either. I just think we
> > should have both interfaces in subdev/therm.h so we can just keep most
> > of the current code as is.
> >
>
> Indeed. My initial plan was to change the meaning of temp_get() and
> adjust all the users, but there's around a dozen of them and based on my
> understanding of the users none of them cared much about such accuracy.
>
> However, I do find having one way to do things appealing, so if there's
> a strong preference for it, I'm happy to produce a somewhat more
> invasive patch where all users expect millidegrees.
Yeah, I do have a strong preference for not having to keep multiple
interfaces around unnecessarily. It'd be somewhat different if we had
external interactions, but this is all stuff within the module and we
should be able to make these kinds of changes without too much pain.

Ben.

>
> - Jeremy
>

2020-09-16 19:51:17

by Jeremy Cline

[permalink] [raw]
Subject: [PATCH v2 0/2] Add fine-grain temperature reporting

Hi folks,

This series adjusts the temp_get() interface in the thermal functions to
report milli-degrees, and additionally alters the way errors are
reported. As I went through all the users and implementations I realized
that Pascal's temp_get() implementation didn't include turning negative
temperatures to 0 like other implementations, so I separated error
reporting from the temperature in the API.

Couple of things I'm on the fence about here:

* I allowed the nvkm_therm_temp_get() functions to accept NULL as a way
to check if temperature reading is available rather than adding a
separate helper, but was torn about whether this is clearer than a
separate helper function.

* I added a WARN_ON in places that previously called
nvkm_therm_temp_get() and didn't check the return value for an error.
This may not be a reasonable error handling method.

Jeremy Cline (2):
drm/nouveau: return temperatures in temp_get() via parameter
drm/nouveau: Add fine-grain temperature reporting

.../nouveau/include/nvkm/subdev/bios/therm.h | 13 +++++++++
.../drm/nouveau/include/nvkm/subdev/therm.h | 28 ++++++++++++++++++-
drivers/gpu/drm/nouveau/nouveau_hwmon.c | 12 ++++----
.../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 28 +++++++++++++++----
.../gpu/drm/nouveau/nvkm/subdev/therm/g84.c | 11 ++++----
.../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++----
.../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c | 9 ++----
.../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c | 9 ++----
.../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 17 +++++++++--
.../gpu/drm/nouveau/nvkm/subdev/therm/temp.c | 12 +++++---
10 files changed, 110 insertions(+), 40 deletions(-)

--
2.26.2

2020-09-16 19:52:29

by Jeremy Cline

[permalink] [raw]
Subject: [PATCH v2 1/2] drm/nouveau: return temperatures in temp_get() via parameter

The temp_get() function currently returns negative error numbers or a
temperature. However, the thermal sensors can (in theory) measure
negative temperatures. Some implementations of temp_get() correctly
clamp negative temperature readings to 0 so that users do not mistake
them for errors, but some, like gp100_temp_get(), do not.

Rather than relying on implementations remembering to clamp values,
dedicate the function return value to error codes and accept a pointer
to an integer where the temperature reading should be stored.

Signed-off-by: Jeremy Cline <[email protected]>
---
.../drm/nouveau/include/nvkm/subdev/therm.h | 2 +-
drivers/gpu/drm/nouveau/nouveau_hwmon.c | 12 ++++++------
.../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 19 ++++++++++++++-----
.../gpu/drm/nouveau/nvkm/subdev/therm/g84.c | 11 ++++++-----
.../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
.../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c | 9 +++------
.../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c | 9 +++------
.../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 17 +++++++++++++++--
.../gpu/drm/nouveau/nvkm/subdev/therm/temp.c | 12 ++++++++----
9 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
index 62c34f98c930..bfe9779216fc 100644
--- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
+++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
@@ -99,7 +99,7 @@ struct nvkm_therm {
bool clkgating_enabled;
};

-int nvkm_therm_temp_get(struct nvkm_therm *);
+int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
int nvkm_therm_fan_sense(struct nvkm_therm *);
int nvkm_therm_cstate(struct nvkm_therm *, int, int);
void nvkm_therm_clkgate_init(struct nvkm_therm *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
index 1c3104d20571..aff6aa296678 100644
--- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
+++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
@@ -325,7 +325,7 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
struct nvkm_therm *therm = nvxx_therm(&drm->client.device);

- if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
+ if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
return 0;

switch (attr) {
@@ -419,7 +419,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
struct drm_device *drm_dev = dev_get_drvdata(dev);
struct nouveau_drm *drm = nouveau_drm(drm_dev);
struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
- int ret;
+ int ret = 0, temp;

if (!therm || !therm->attr_get)
return -EOPNOTSUPP;
@@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
case hwmon_temp_input:
if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
return -EINVAL;
- ret = nvkm_therm_temp_get(therm);
- *val = ret < 0 ? ret : (ret * 1000);
+ ret = nvkm_therm_temp_get(therm, &temp);
+ *val = temp * 1000;
break;
case hwmon_temp_max:
*val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
@@ -459,7 +459,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
return -EOPNOTSUPP;
}

- return 0;
+ return ret;
}

static int
@@ -735,7 +735,7 @@ nouveau_hwmon_init(struct drm_device *dev)
hwmon->dev = dev;

if (therm && therm->attr_get && therm->attr_set) {
- if (nvkm_therm_temp_get(therm) >= 0)
+ if (nvkm_therm_temp_get(therm, NULL) >= 0)
special_groups[i++] = &temp1_auto_point_sensor_group;
if (therm->fan_get && therm->fan_get(therm) >= 0)
special_groups[i++] = &pwm_fan_sensor_group;
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
index 4a4d1e224126..e7ffea1512e6 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
@@ -27,10 +27,15 @@
#include <subdev/pmu.h>

int
-nvkm_therm_temp_get(struct nvkm_therm *therm)
+nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
{
+ int ignored_reading;
+
+ if (temp == NULL)
+ temp = &ignored_reading;
+
if (therm->func->temp_get)
- return therm->func->temp_get(therm);
+ return therm->func->temp_get(therm, temp);
return -ENODEV;
}

@@ -40,9 +45,11 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
*cur_trip = NULL,
*last_trip = therm->last_trip;
- u8 temp = therm->func->temp_get(therm);
+ int temp;
u16 duty, i;

+ WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
+
/* look for the trip point corresponding to the current temperature */
cur_trip = NULL;
for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
@@ -70,9 +77,11 @@ static int
nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
u8 linear_max_temp)
{
- u8 temp = therm->func->temp_get(therm);
+ int temp;
u16 duty;

+ WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
+
/* handle the non-linear part first */
if (temp < linear_min_temp)
return therm->fan->bios.min_duty;
@@ -200,7 +209,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
/* do not allow automatic fan management if the thermal sensor is
* not available */
if (mode == NVKM_THERM_CTRL_AUTO &&
- therm->func->temp_get(therm) < 0)
+ nvkm_therm_temp_get(therm, NULL) < 0)
return -EINVAL;

if (therm->mode == mode)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
index 96f8da40ac82..e2f891d5c7ba 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
@@ -27,14 +27,15 @@
#include <subdev/fuse.h>

int
-g84_temp_get(struct nvkm_therm *therm)
+g84_temp_get(struct nvkm_therm *therm, int *temp)
{
struct nvkm_device *device = therm->subdev.device;

- if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
- return nvkm_rd32(device, 0x20400);
- else
+ if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
return -ENODEV;
+
+ *temp = nvkm_rd32(device, 0x20400);
+ return 0;
}

void
@@ -115,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
}

/* fix the state (in case someone reprogrammed the alarms) */
- cur = therm->func->temp_get(therm);
+ WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
new_state = NVKM_THERM_THRS_HIGHER;
else if (new_state == NVKM_THERM_THRS_HIGHER &&
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
index 9f0dea3f61dc..4c32e4f21bec 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
@@ -24,7 +24,7 @@
#include "priv.h"

static int
-gp100_temp_get(struct nvkm_therm *therm)
+gp100_temp_get(struct nvkm_therm *therm, int *temp)
{
struct nvkm_device *device = therm->subdev.device;
struct nvkm_subdev *subdev = &therm->subdev;
@@ -35,11 +35,12 @@ gp100_temp_get(struct nvkm_therm *therm)
if (tsensor & 0x40000000)
nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");

- /* device valid */
- if (tsensor & 0x20000000)
- return (inttemp >> 8);
- else
+ /* device invalid */
+ if (!(tsensor & 0x20000000))
return -ENODEV;
+
+ *temp = inttemp >> 8;
+ return 0;
}

static const struct nvkm_therm_func
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
index 2c92ffb5f9d0..9753ad4bee4a 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
@@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
}

static int
-nv40_temp_get(struct nvkm_therm *therm)
+nv40_temp_get(struct nvkm_therm *therm, int *temp)
{
struct nvkm_device *device = therm->subdev.device;
struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
@@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
core_temp = core_temp + sensor->offset_num / sensor->offset_den;
core_temp = core_temp + sensor->offset_constant - 8;

- /* reserve negative temperatures for errors */
- if (core_temp < 0)
- core_temp = 0;
-
- return core_temp;
+ *temp = core_temp;
+ return 0;
}

static int
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
index 9b57b433d4cf..38fa6777c129 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
@@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
}

static int
-nv50_temp_get(struct nvkm_therm *therm)
+nv50_temp_get(struct nvkm_therm *therm, int *temp)
{
struct nvkm_device *device = therm->subdev.device;
struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
@@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
core_temp = core_temp + sensor->offset_num / sensor->offset_den;
core_temp = core_temp + sensor->offset_constant - 8;

- /* reserve negative temperatures for errors */
- if (core_temp < 0)
- core_temp = 0;
-
- return core_temp;
+ *temp = core_temp;
+ return 0;
}

static void
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
index 21659daf1864..04607d8b1755 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
@@ -91,7 +91,15 @@ struct nvkm_therm_func {
int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
int (*pwm_clock)(struct nvkm_therm *, int line);

- int (*temp_get)(struct nvkm_therm *);
+ /**
+ * @temp_get: Get the temperature reading from a thermal device
+ *
+ * @therm: The thermal device instance.
+ * @temp: A pointer to write the temperature reading to.
+ *
+ * Returns: Zero on success, or a negative error code on failure.
+ */
+ int (*temp_get)(struct nvkm_therm *therm, int *temp);

int (*fan_sense)(struct nvkm_therm *);

@@ -110,7 +118,12 @@ int nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
int nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
int nv50_fan_pwm_clock(struct nvkm_therm *, int);

-int g84_temp_get(struct nvkm_therm *);
+/**
+ * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
+ * @therm: The thermal device instance.
+ * @temp: A pointer to write the temperature reading to.
+ */
+int g84_temp_get(struct nvkm_therm *therm, int *temp);
void g84_sensor_setup(struct nvkm_therm *);
void g84_therm_fini(struct nvkm_therm *);

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
index ddb2b2c600ca..1e8803901abc 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
@@ -86,7 +86,9 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
static const char * const thresholds[] = {
"fanboost", "downclock", "critical", "shutdown"
};
- int temperature = therm->func->temp_get(therm);
+ int temperature;
+
+ WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);

if (thrs < 0 || thrs > 3)
return;
@@ -140,7 +142,9 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
{
enum nvkm_therm_thrs_direction direction;
enum nvkm_therm_thrs_state prev_state, new_state;
- int temp = therm->func->temp_get(therm);
+ int temp;
+
+ WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);

prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);

@@ -185,7 +189,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);

/* schedule the next poll in one second */
- if (therm->func->temp_get(therm) >= 0)
+ if (nvkm_therm_temp_get(therm, NULL) >= 0)
nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
}

@@ -229,7 +233,7 @@ nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
{
const char *sensor_avail = "yes";

- if (therm->func->temp_get(therm) < 0)
+ if (nvkm_therm_temp_get(therm, NULL) < 0)
sensor_avail = "no";

nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
--
2.26.2

2020-09-16 19:52:55

by Jeremy Cline

[permalink] [raw]
Subject: [PATCH v2 2/2] drm/nouveau: Add fine-grain temperature reporting

Commit d32656373857 ("drm/nouveau/therm/gp100: initial implementation of
new gp1xx temperature sensor") added support for reading finer-grain
temperatures, but continued to report temperatures in 1 degree Celsius
increments via nvkm_therm_temp_get().

This alters the temp_get() API to return millidegrees rather than
degrees, adjusts all implementations, and changes users that need integer
Celsius to use the new nvkm_therm_temp_get_c() helper function. Since
there are now two units of measurement floating around, structs that
store temperature now include documentation to make it clear what the
unit is.

Signed-off-by: Jeremy Cline <[email protected]>
---
.../nouveau/include/nvkm/subdev/bios/therm.h | 13 ++++++++++
.../drm/nouveau/include/nvkm/subdev/therm.h | 26 +++++++++++++++++++
drivers/gpu/drm/nouveau/nouveau_hwmon.c | 2 +-
.../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 14 ++++++++--
.../gpu/drm/nouveau/nvkm/subdev/therm/g84.c | 4 +--
.../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 2 +-
.../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c | 2 +-
.../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c | 2 +-
.../gpu/drm/nouveau/nvkm/subdev/therm/temp.c | 4 +--
9 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/bios/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/bios/therm.h
index 0fb8a3480871..fdf23214cc69 100644
--- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/bios/therm.h
+++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/bios/therm.h
@@ -1,6 +1,12 @@
/* SPDX-License-Identifier: MIT */
#ifndef __NVBIOS_THERM_H__
#define __NVBIOS_THERM_H__
+
+/**
+ * struct nvbios_therm_threshold - The threshold of a thermal event.
+ * @temp: The temperature, in degrees Celsius, of the thermal threshold.
+ * @hysteresis: The hysteresis of this threshold, in degrees Celsius.
+ */
struct nvbios_therm_threshold {
u8 temp;
u8 hysteresis;
@@ -29,6 +35,13 @@ enum nvbios_therm_fan_type {

/* no vbios have more than 6 */
#define NVKM_TEMP_FAN_TRIP_MAX 10
+
+/**
+ * struct nvbios_therm_trip_point - Represents a thermal trip point.
+ * @fan_duty: The fan's duty cycle as a percentage. Valid values are 0-100.
+ * @temp: The temperature of this trip point, in degrees Celsius.
+ * @hysteresis: the hysteresis to use at this trip point, in degrees Celsius.
+ */
struct nvbios_therm_trip_point {
int fan_duty;
int temp;
diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
index bfe9779216fc..e817bb9c9505 100644
--- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
+++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
@@ -99,7 +99,33 @@ struct nvkm_therm {
bool clkgating_enabled;
};

+/**
+ * nvkm_therm_temp_get() - get the temperature in millidegrees Celsius
+ * @therm: The thermal device to read from.
+ * @temp: A pointer to write the temperature reading to. Passing NULL is
+ * allowed, useful if you only need to check that it's possible to
+ * read a temperature.
+ *
+ * Note that because some cards are only capable of reporting temperatures in
+ * integer degrees Celsius or, if they support fractional degrees, not support
+ * millidegree accuracy, the accuracy of this interface is hardware dependent.
+ *
+ * Returns: Zero on success, or a negative error code on failure.
+ */
int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
+
+/**
+ * nvkm_therm_temp_get_c() - get the temperature in degrees Celsius
+ * @therm: The thermal device to read from.
+ * @temp: A pointer to write the temperature reading to.
+ *
+ * A convenient wrapper for nvkm_therm_temp_get() that converts to degrees
+ * Celsius.
+ *
+ * Returns: Zero on success, or a negative error code on failure.
+ */
+int nvkm_therm_temp_get_c(struct nvkm_therm *therm, int *temp);
+
int nvkm_therm_fan_sense(struct nvkm_therm *);
int nvkm_therm_cstate(struct nvkm_therm *, int, int);
void nvkm_therm_clkgate_init(struct nvkm_therm *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
index aff6aa296678..1363068f44c1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
+++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
@@ -429,7 +429,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
return -EINVAL;
ret = nvkm_therm_temp_get(therm, &temp);
- *val = temp * 1000;
+ *val = temp;
break;
case hwmon_temp_max:
*val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
index e7ffea1512e6..a7bb1e6b6169 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
@@ -39,6 +39,16 @@ nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
return -ENODEV;
}

+int
+nvkm_therm_temp_get_c(struct nvkm_therm *therm, int *temp)
+{
+ int ret = nvkm_therm_temp_get(therm, temp);
+
+ if (temp != NULL)
+ *temp /= 1000;
+ return ret;
+}
+
static int
nvkm_therm_update_trip(struct nvkm_therm *therm)
{
@@ -48,7 +58,7 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
int temp;
u16 duty, i;

- WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
+ WARN_ON(nvkm_therm_temp_get_c(therm, &temp) < 0);

/* look for the trip point corresponding to the current temperature */
cur_trip = NULL;
@@ -80,7 +90,7 @@ nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
int temp;
u16 duty;

- WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
+ WARN_ON(nvkm_therm_temp_get_c(therm, &temp) < 0);

/* handle the non-linear part first */
if (temp < linear_min_temp)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
index e2f891d5c7ba..2e563b379820 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
@@ -34,7 +34,7 @@ g84_temp_get(struct nvkm_therm *therm, int *temp)
if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
return -ENODEV;

- *temp = nvkm_rd32(device, 0x20400);
+ *temp = nvkm_rd32(device, 0x20400) * 1000;
return 0;
}

@@ -116,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
}

/* fix the state (in case someone reprogrammed the alarms) */
- WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
+ WARN_ON(nvkm_therm_temp_get_c(therm, &cur) < 0);
if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
new_state = NVKM_THERM_THRS_HIGHER;
else if (new_state == NVKM_THERM_THRS_HIGHER &&
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
index 4c32e4f21bec..e7733a3eb36b 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
@@ -39,7 +39,7 @@ gp100_temp_get(struct nvkm_therm *therm, int *temp)
if (!(tsensor & 0x20000000))
return -ENODEV;

- *temp = inttemp >> 8;
+ *temp = inttemp * 1000 >> 8;
return 0;
}

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
index 9753ad4bee4a..500d7e2ef884 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
@@ -95,7 +95,7 @@ nv40_temp_get(struct nvkm_therm *therm, int *temp)
core_temp = core_temp + sensor->offset_num / sensor->offset_den;
core_temp = core_temp + sensor->offset_constant - 8;

- *temp = core_temp;
+ *temp = core_temp * 1000;
return 0;
}

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
index 38fa6777c129..3b203e8ee406 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
@@ -143,7 +143,7 @@ nv50_temp_get(struct nvkm_therm *therm, int *temp)
core_temp = core_temp + sensor->offset_num / sensor->offset_den;
core_temp = core_temp + sensor->offset_constant - 8;

- *temp = core_temp;
+ *temp = core_temp * 1000;
return 0;
}

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
index 1e8803901abc..17d1dcb1c09c 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
@@ -88,7 +88,7 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
};
int temperature;

- WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
+ WARN_ON(nvkm_therm_temp_get_c(therm, &temperature) < 0);

if (thrs < 0 || thrs > 3)
return;
@@ -144,7 +144,7 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
enum nvkm_therm_thrs_state prev_state, new_state;
int temp;

- WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
+ WARN_ON(nvkm_therm_temp_get_c(therm, &temp) < 0);

prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);

--
2.26.2

2020-09-16 20:08:02

by Karol Herbst

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] drm/nouveau: return temperatures in temp_get() via parameter

On Wed, Sep 16, 2020 at 9:47 PM Jeremy Cline <[email protected]> wrote:
>
> The temp_get() function currently returns negative error numbers or a
> temperature. However, the thermal sensors can (in theory) measure
> negative temperatures. Some implementations of temp_get() correctly
> clamp negative temperature readings to 0 so that users do not mistake
> them for errors, but some, like gp100_temp_get(), do not.
>
> Rather than relying on implementations remembering to clamp values,
> dedicate the function return value to error codes and accept a pointer
> to an integer where the temperature reading should be stored.
>
> Signed-off-by: Jeremy Cline <[email protected]>
> ---
> .../drm/nouveau/include/nvkm/subdev/therm.h | 2 +-
> drivers/gpu/drm/nouveau/nouveau_hwmon.c | 12 ++++++------
> .../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 19 ++++++++++++++-----
> .../gpu/drm/nouveau/nvkm/subdev/therm/g84.c | 11 ++++++-----
> .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
> .../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c | 9 +++------
> .../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c | 9 +++------
> .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 17 +++++++++++++++--
> .../gpu/drm/nouveau/nvkm/subdev/therm/temp.c | 12 ++++++++----
> 9 files changed, 62 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> index 62c34f98c930..bfe9779216fc 100644
> --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> @@ -99,7 +99,7 @@ struct nvkm_therm {
> bool clkgating_enabled;
> };
>
> -int nvkm_therm_temp_get(struct nvkm_therm *);
> +int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
> int nvkm_therm_fan_sense(struct nvkm_therm *);
> int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> void nvkm_therm_clkgate_init(struct nvkm_therm *,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> index 1c3104d20571..aff6aa296678 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> @@ -325,7 +325,7 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
> struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
> struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
>
> - if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
> + if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
> return 0;
>
> switch (attr) {
> @@ -419,7 +419,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> struct drm_device *drm_dev = dev_get_drvdata(dev);
> struct nouveau_drm *drm = nouveau_drm(drm_dev);
> struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> - int ret;
> + int ret = 0, temp;
>
> if (!therm || !therm->attr_get)
> return -EOPNOTSUPP;
> @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> case hwmon_temp_input:
> if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> return -EINVAL;
> - ret = nvkm_therm_temp_get(therm);
> - *val = ret < 0 ? ret : (ret * 1000);
> + ret = nvkm_therm_temp_get(therm, &temp);
> + *val = temp * 1000;
> break;
> case hwmon_temp_max:
> *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> @@ -459,7 +459,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> return -EOPNOTSUPP;
> }
>
> - return 0;
> + return ret;
> }
>
> static int
> @@ -735,7 +735,7 @@ nouveau_hwmon_init(struct drm_device *dev)
> hwmon->dev = dev;
>
> if (therm && therm->attr_get && therm->attr_set) {
> - if (nvkm_therm_temp_get(therm) >= 0)
> + if (nvkm_therm_temp_get(therm, NULL) >= 0)
> special_groups[i++] = &temp1_auto_point_sensor_group;
> if (therm->fan_get && therm->fan_get(therm) >= 0)
> special_groups[i++] = &pwm_fan_sensor_group;
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> index 4a4d1e224126..e7ffea1512e6 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> @@ -27,10 +27,15 @@
> #include <subdev/pmu.h>
>
> int
> -nvkm_therm_temp_get(struct nvkm_therm *therm)
> +nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
> {
> + int ignored_reading;
> +
> + if (temp == NULL)
> + temp = &ignored_reading;
> +
> if (therm->func->temp_get)
> - return therm->func->temp_get(therm);
> + return therm->func->temp_get(therm, temp);
> return -ENODEV;
> }
>
> @@ -40,9 +45,11 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
> struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
> *cur_trip = NULL,
> *last_trip = therm->last_trip;
> - u8 temp = therm->func->temp_get(therm);
> + int temp;
> u16 duty, i;
>
> + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> +

I think adding WARN_ONs like this is fine, I am just not sure how much
our code is actually protected against the GPU being runtime
suspended... so on laptops this could trigger some warnings.. might
even be just because nouveau_pmops_runtime_suspend and the hwmon API
race here, even though it's quite unlikely that the hwmon thread
stalls enough so it survives the entire suspend path. But in theory
that could be causing random errors getting reported.

> /* look for the trip point corresponding to the current temperature */
> cur_trip = NULL;
> for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
> @@ -70,9 +77,11 @@ static int
> nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
> u8 linear_max_temp)
> {
> - u8 temp = therm->func->temp_get(therm);
> + int temp;
> u16 duty;
>
> + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> +
> /* handle the non-linear part first */
> if (temp < linear_min_temp)
> return therm->fan->bios.min_duty;
> @@ -200,7 +209,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
> /* do not allow automatic fan management if the thermal sensor is
> * not available */
> if (mode == NVKM_THERM_CTRL_AUTO &&
> - therm->func->temp_get(therm) < 0)
> + nvkm_therm_temp_get(therm, NULL) < 0)
> return -EINVAL;
>
> if (therm->mode == mode)
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> index 96f8da40ac82..e2f891d5c7ba 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> @@ -27,14 +27,15 @@
> #include <subdev/fuse.h>
>
> int
> -g84_temp_get(struct nvkm_therm *therm)
> +g84_temp_get(struct nvkm_therm *therm, int *temp)
> {
> struct nvkm_device *device = therm->subdev.device;
>
> - if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
> - return nvkm_rd32(device, 0x20400);
> - else
> + if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
> return -ENODEV;
> +
> + *temp = nvkm_rd32(device, 0x20400);
> + return 0;
> }
>
> void
> @@ -115,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
> }
>
> /* fix the state (in case someone reprogrammed the alarms) */
> - cur = therm->func->temp_get(therm);
> + WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
> if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
> new_state = NVKM_THERM_THRS_HIGHER;
> else if (new_state == NVKM_THERM_THRS_HIGHER &&
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> index 9f0dea3f61dc..4c32e4f21bec 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> @@ -24,7 +24,7 @@
> #include "priv.h"
>
> static int
> -gp100_temp_get(struct nvkm_therm *therm)
> +gp100_temp_get(struct nvkm_therm *therm, int *temp)
> {
> struct nvkm_device *device = therm->subdev.device;
> struct nvkm_subdev *subdev = &therm->subdev;
> @@ -35,11 +35,12 @@ gp100_temp_get(struct nvkm_therm *therm)
> if (tsensor & 0x40000000)
> nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
>
> - /* device valid */
> - if (tsensor & 0x20000000)
> - return (inttemp >> 8);
> - else
> + /* device invalid */
> + if (!(tsensor & 0x20000000))
> return -ENODEV;
> +
> + *temp = inttemp >> 8;
> + return 0;
> }
>
> static const struct nvkm_therm_func
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> index 2c92ffb5f9d0..9753ad4bee4a 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> @@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
> }
>
> static int
> -nv40_temp_get(struct nvkm_therm *therm)
> +nv40_temp_get(struct nvkm_therm *therm, int *temp)
> {
> struct nvkm_device *device = therm->subdev.device;
> struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> @@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
> core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> core_temp = core_temp + sensor->offset_constant - 8;
>
> - /* reserve negative temperatures for errors */
> - if (core_temp < 0)
> - core_temp = 0;
> -
> - return core_temp;
> + *temp = core_temp;
> + return 0;
> }
>
> static int
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> index 9b57b433d4cf..38fa6777c129 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> @@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
> }
>
> static int
> -nv50_temp_get(struct nvkm_therm *therm)
> +nv50_temp_get(struct nvkm_therm *therm, int *temp)
> {
> struct nvkm_device *device = therm->subdev.device;
> struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> @@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
> core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> core_temp = core_temp + sensor->offset_constant - 8;
>
> - /* reserve negative temperatures for errors */
> - if (core_temp < 0)
> - core_temp = 0;
> -
> - return core_temp;
> + *temp = core_temp;
> + return 0;
> }
>
> static void
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> index 21659daf1864..04607d8b1755 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> @@ -91,7 +91,15 @@ struct nvkm_therm_func {
> int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
> int (*pwm_clock)(struct nvkm_therm *, int line);
>
> - int (*temp_get)(struct nvkm_therm *);
> + /**
> + * @temp_get: Get the temperature reading from a thermal device
> + *
> + * @therm: The thermal device instance.
> + * @temp: A pointer to write the temperature reading to.
> + *
> + * Returns: Zero on success, or a negative error code on failure.
> + */
> + int (*temp_get)(struct nvkm_therm *therm, int *temp);
>
> int (*fan_sense)(struct nvkm_therm *);
>
> @@ -110,7 +118,12 @@ int nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
> int nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
> int nv50_fan_pwm_clock(struct nvkm_therm *, int);
>
> -int g84_temp_get(struct nvkm_therm *);
> +/**
> + * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
> + * @therm: The thermal device instance.
> + * @temp: A pointer to write the temperature reading to.
> + */
> +int g84_temp_get(struct nvkm_therm *therm, int *temp);
> void g84_sensor_setup(struct nvkm_therm *);
> void g84_therm_fini(struct nvkm_therm *);
>
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> index ddb2b2c600ca..1e8803901abc 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> @@ -86,7 +86,9 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
> static const char * const thresholds[] = {
> "fanboost", "downclock", "critical", "shutdown"
> };
> - int temperature = therm->func->temp_get(therm);
> + int temperature;
> +
> + WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
>
> if (thrs < 0 || thrs > 3)
> return;
> @@ -140,7 +142,9 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
> {
> enum nvkm_therm_thrs_direction direction;
> enum nvkm_therm_thrs_state prev_state, new_state;
> - int temp = therm->func->temp_get(therm);
> + int temp;
> +
> + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
>
> prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
>
> @@ -185,7 +189,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
> spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
>
> /* schedule the next poll in one second */
> - if (therm->func->temp_get(therm) >= 0)
> + if (nvkm_therm_temp_get(therm, NULL) >= 0)
> nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
> }
>
> @@ -229,7 +233,7 @@ nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
> {
> const char *sensor_avail = "yes";
>
> - if (therm->func->temp_get(therm) < 0)
> + if (nvkm_therm_temp_get(therm, NULL) < 0)
> sensor_avail = "no";
>
> nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
> --
> 2.26.2
>

2020-09-16 20:11:12

by Karol Herbst

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] drm/nouveau: return temperatures in temp_get() via parameter

On Wed, Sep 16, 2020 at 10:01 PM Karol Herbst <[email protected]> wrote:
>
> On Wed, Sep 16, 2020 at 9:47 PM Jeremy Cline <[email protected]> wrote:
> >
> > The temp_get() function currently returns negative error numbers or a
> > temperature. However, the thermal sensors can (in theory) measure
> > negative temperatures. Some implementations of temp_get() correctly
> > clamp negative temperature readings to 0 so that users do not mistake
> > them for errors, but some, like gp100_temp_get(), do not.
> >
> > Rather than relying on implementations remembering to clamp values,
> > dedicate the function return value to error codes and accept a pointer
> > to an integer where the temperature reading should be stored.
> >
> > Signed-off-by: Jeremy Cline <[email protected]>
> > ---
> > .../drm/nouveau/include/nvkm/subdev/therm.h | 2 +-
> > drivers/gpu/drm/nouveau/nouveau_hwmon.c | 12 ++++++------
> > .../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 19 ++++++++++++++-----
> > .../gpu/drm/nouveau/nvkm/subdev/therm/g84.c | 11 ++++++-----
> > .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
> > .../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c | 9 +++------
> > .../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c | 9 +++------
> > .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 17 +++++++++++++++--
> > .../gpu/drm/nouveau/nvkm/subdev/therm/temp.c | 12 ++++++++----
> > 9 files changed, 62 insertions(+), 40 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > index 62c34f98c930..bfe9779216fc 100644
> > --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > @@ -99,7 +99,7 @@ struct nvkm_therm {
> > bool clkgating_enabled;
> > };
> >
> > -int nvkm_therm_temp_get(struct nvkm_therm *);
> > +int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
> > int nvkm_therm_fan_sense(struct nvkm_therm *);
> > int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> > void nvkm_therm_clkgate_init(struct nvkm_therm *,
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > index 1c3104d20571..aff6aa296678 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > @@ -325,7 +325,7 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
> > struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
> > struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> >
> > - if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
> > + if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
> > return 0;
> >
> > switch (attr) {
> > @@ -419,7 +419,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > struct drm_device *drm_dev = dev_get_drvdata(dev);
> > struct nouveau_drm *drm = nouveau_drm(drm_dev);
> > struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > - int ret;
> > + int ret = 0, temp;
> >
> > if (!therm || !therm->attr_get)
> > return -EOPNOTSUPP;
> > @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > case hwmon_temp_input:
> > if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> > return -EINVAL;
> > - ret = nvkm_therm_temp_get(therm);
> > - *val = ret < 0 ? ret : (ret * 1000);
> > + ret = nvkm_therm_temp_get(therm, &temp);
> > + *val = temp * 1000;
> > break;
> > case hwmon_temp_max:
> > *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> > @@ -459,7 +459,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > return -EOPNOTSUPP;
> > }
> >
> > - return 0;
> > + return ret;
> > }
> >
> > static int
> > @@ -735,7 +735,7 @@ nouveau_hwmon_init(struct drm_device *dev)
> > hwmon->dev = dev;
> >
> > if (therm && therm->attr_get && therm->attr_set) {
> > - if (nvkm_therm_temp_get(therm) >= 0)
> > + if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > special_groups[i++] = &temp1_auto_point_sensor_group;
> > if (therm->fan_get && therm->fan_get(therm) >= 0)
> > special_groups[i++] = &pwm_fan_sensor_group;
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > index 4a4d1e224126..e7ffea1512e6 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > @@ -27,10 +27,15 @@
> > #include <subdev/pmu.h>
> >
> > int
> > -nvkm_therm_temp_get(struct nvkm_therm *therm)
> > +nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
> > {
> > + int ignored_reading;
> > +
> > + if (temp == NULL)
> > + temp = &ignored_reading;
> > +
> > if (therm->func->temp_get)
> > - return therm->func->temp_get(therm);
> > + return therm->func->temp_get(therm, temp);
> > return -ENODEV;
> > }
> >
> > @@ -40,9 +45,11 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
> > struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
> > *cur_trip = NULL,
> > *last_trip = therm->last_trip;
> > - u8 temp = therm->func->temp_get(therm);
> > + int temp;
> > u16 duty, i;
> >
> > + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > +
>
> I think adding WARN_ONs like this is fine, I am just not sure how much
> our code is actually protected against the GPU being runtime
> suspended... so on laptops this could trigger some warnings.. might
> even be just because nouveau_pmops_runtime_suspend and the hwmon API
> race here, even though it's quite unlikely that the hwmon thread
> stalls enough so it survives the entire suspend path. But in theory
> that could be causing random errors getting reported.
>

Actually, we never set the status to DRM_SWITCH_POWER_CHANGING, so I
guess this race is indeed real. I thought we did that, but maybe
that's because I never send out the patch to use them.

> > /* look for the trip point corresponding to the current temperature */
> > cur_trip = NULL;
> > for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
> > @@ -70,9 +77,11 @@ static int
> > nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
> > u8 linear_max_temp)
> > {
> > - u8 temp = therm->func->temp_get(therm);
> > + int temp;
> > u16 duty;
> >
> > + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > +
> > /* handle the non-linear part first */
> > if (temp < linear_min_temp)
> > return therm->fan->bios.min_duty;
> > @@ -200,7 +209,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
> > /* do not allow automatic fan management if the thermal sensor is
> > * not available */
> > if (mode == NVKM_THERM_CTRL_AUTO &&
> > - therm->func->temp_get(therm) < 0)
> > + nvkm_therm_temp_get(therm, NULL) < 0)
> > return -EINVAL;
> >
> > if (therm->mode == mode)
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > index 96f8da40ac82..e2f891d5c7ba 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > @@ -27,14 +27,15 @@
> > #include <subdev/fuse.h>
> >
> > int
> > -g84_temp_get(struct nvkm_therm *therm)
> > +g84_temp_get(struct nvkm_therm *therm, int *temp)
> > {
> > struct nvkm_device *device = therm->subdev.device;
> >
> > - if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
> > - return nvkm_rd32(device, 0x20400);
> > - else
> > + if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
> > return -ENODEV;
> > +
> > + *temp = nvkm_rd32(device, 0x20400);
> > + return 0;
> > }
> >
> > void
> > @@ -115,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
> > }
> >
> > /* fix the state (in case someone reprogrammed the alarms) */
> > - cur = therm->func->temp_get(therm);
> > + WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
> > if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
> > new_state = NVKM_THERM_THRS_HIGHER;
> > else if (new_state == NVKM_THERM_THRS_HIGHER &&
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > index 9f0dea3f61dc..4c32e4f21bec 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > @@ -24,7 +24,7 @@
> > #include "priv.h"
> >
> > static int
> > -gp100_temp_get(struct nvkm_therm *therm)
> > +gp100_temp_get(struct nvkm_therm *therm, int *temp)
> > {
> > struct nvkm_device *device = therm->subdev.device;
> > struct nvkm_subdev *subdev = &therm->subdev;
> > @@ -35,11 +35,12 @@ gp100_temp_get(struct nvkm_therm *therm)
> > if (tsensor & 0x40000000)
> > nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
> >
> > - /* device valid */
> > - if (tsensor & 0x20000000)
> > - return (inttemp >> 8);
> > - else
> > + /* device invalid */
> > + if (!(tsensor & 0x20000000))
> > return -ENODEV;
> > +
> > + *temp = inttemp >> 8;
> > + return 0;
> > }
> >
> > static const struct nvkm_therm_func
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > index 2c92ffb5f9d0..9753ad4bee4a 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > @@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
> > }
> >
> > static int
> > -nv40_temp_get(struct nvkm_therm *therm)
> > +nv40_temp_get(struct nvkm_therm *therm, int *temp)
> > {
> > struct nvkm_device *device = therm->subdev.device;
> > struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > @@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
> > core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > core_temp = core_temp + sensor->offset_constant - 8;
> >
> > - /* reserve negative temperatures for errors */
> > - if (core_temp < 0)
> > - core_temp = 0;
> > -
> > - return core_temp;
> > + *temp = core_temp;
> > + return 0;
> > }
> >
> > static int
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > index 9b57b433d4cf..38fa6777c129 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > @@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
> > }
> >
> > static int
> > -nv50_temp_get(struct nvkm_therm *therm)
> > +nv50_temp_get(struct nvkm_therm *therm, int *temp)
> > {
> > struct nvkm_device *device = therm->subdev.device;
> > struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > @@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
> > core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > core_temp = core_temp + sensor->offset_constant - 8;
> >
> > - /* reserve negative temperatures for errors */
> > - if (core_temp < 0)
> > - core_temp = 0;
> > -
> > - return core_temp;
> > + *temp = core_temp;
> > + return 0;
> > }
> >
> > static void
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > index 21659daf1864..04607d8b1755 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > @@ -91,7 +91,15 @@ struct nvkm_therm_func {
> > int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
> > int (*pwm_clock)(struct nvkm_therm *, int line);
> >
> > - int (*temp_get)(struct nvkm_therm *);
> > + /**
> > + * @temp_get: Get the temperature reading from a thermal device
> > + *
> > + * @therm: The thermal device instance.
> > + * @temp: A pointer to write the temperature reading to.
> > + *
> > + * Returns: Zero on success, or a negative error code on failure.
> > + */
> > + int (*temp_get)(struct nvkm_therm *therm, int *temp);
> >
> > int (*fan_sense)(struct nvkm_therm *);
> >
> > @@ -110,7 +118,12 @@ int nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
> > int nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
> > int nv50_fan_pwm_clock(struct nvkm_therm *, int);
> >
> > -int g84_temp_get(struct nvkm_therm *);
> > +/**
> > + * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
> > + * @therm: The thermal device instance.
> > + * @temp: A pointer to write the temperature reading to.
> > + */
> > +int g84_temp_get(struct nvkm_therm *therm, int *temp);
> > void g84_sensor_setup(struct nvkm_therm *);
> > void g84_therm_fini(struct nvkm_therm *);
> >
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > index ddb2b2c600ca..1e8803901abc 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > @@ -86,7 +86,9 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
> > static const char * const thresholds[] = {
> > "fanboost", "downclock", "critical", "shutdown"
> > };
> > - int temperature = therm->func->temp_get(therm);
> > + int temperature;
> > +
> > + WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
> >
> > if (thrs < 0 || thrs > 3)
> > return;
> > @@ -140,7 +142,9 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
> > {
> > enum nvkm_therm_thrs_direction direction;
> > enum nvkm_therm_thrs_state prev_state, new_state;
> > - int temp = therm->func->temp_get(therm);
> > + int temp;
> > +
> > + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> >
> > prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
> >
> > @@ -185,7 +189,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
> > spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
> >
> > /* schedule the next poll in one second */
> > - if (therm->func->temp_get(therm) >= 0)
> > + if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
> > }
> >
> > @@ -229,7 +233,7 @@ nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
> > {
> > const char *sensor_avail = "yes";
> >
> > - if (therm->func->temp_get(therm) < 0)
> > + if (nvkm_therm_temp_get(therm, NULL) < 0)
> > sensor_avail = "no";
> >
> > nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
> > --
> > 2.26.2
> >

2020-09-17 14:23:19

by Jeremy Cline

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] drm/nouveau: return temperatures in temp_get() via parameter

On Wed, Sep 16, 2020 at 10:03:22PM +0200, Karol Herbst wrote:
> On Wed, Sep 16, 2020 at 10:01 PM Karol Herbst <[email protected]> wrote:
> >
> > On Wed, Sep 16, 2020 at 9:47 PM Jeremy Cline <[email protected]> wrote:
> > >
> > > The temp_get() function currently returns negative error numbers or a
> > > temperature. However, the thermal sensors can (in theory) measure
> > > negative temperatures. Some implementations of temp_get() correctly
> > > clamp negative temperature readings to 0 so that users do not mistake
> > > them for errors, but some, like gp100_temp_get(), do not.
> > >
> > > Rather than relying on implementations remembering to clamp values,
> > > dedicate the function return value to error codes and accept a pointer
> > > to an integer where the temperature reading should be stored.
> > >
> > > Signed-off-by: Jeremy Cline <[email protected]>
> > > ---
> > > .../drm/nouveau/include/nvkm/subdev/therm.h | 2 +-
> > > drivers/gpu/drm/nouveau/nouveau_hwmon.c | 12 ++++++------
> > > .../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 19 ++++++++++++++-----
> > > .../gpu/drm/nouveau/nvkm/subdev/therm/g84.c | 11 ++++++-----
> > > .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
> > > .../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c | 9 +++------
> > > .../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c | 9 +++------
> > > .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 17 +++++++++++++++--
> > > .../gpu/drm/nouveau/nvkm/subdev/therm/temp.c | 12 ++++++++----
> > > 9 files changed, 62 insertions(+), 40 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > index 62c34f98c930..bfe9779216fc 100644
> > > --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > @@ -99,7 +99,7 @@ struct nvkm_therm {
> > > bool clkgating_enabled;
> > > };
> > >
> > > -int nvkm_therm_temp_get(struct nvkm_therm *);
> > > +int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
> > > int nvkm_therm_fan_sense(struct nvkm_therm *);
> > > int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> > > void nvkm_therm_clkgate_init(struct nvkm_therm *,
> > > diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > index 1c3104d20571..aff6aa296678 100644
> > > --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > @@ -325,7 +325,7 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
> > > struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
> > > struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > >
> > > - if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
> > > + if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
> > > return 0;
> > >
> > > switch (attr) {
> > > @@ -419,7 +419,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > > struct drm_device *drm_dev = dev_get_drvdata(dev);
> > > struct nouveau_drm *drm = nouveau_drm(drm_dev);
> > > struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > > - int ret;
> > > + int ret = 0, temp;
> > >
> > > if (!therm || !therm->attr_get)
> > > return -EOPNOTSUPP;
> > > @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > > case hwmon_temp_input:
> > > if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> > > return -EINVAL;
> > > - ret = nvkm_therm_temp_get(therm);
> > > - *val = ret < 0 ? ret : (ret * 1000);
> > > + ret = nvkm_therm_temp_get(therm, &temp);
> > > + *val = temp * 1000;
> > > break;
> > > case hwmon_temp_max:
> > > *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> > > @@ -459,7 +459,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > > return -EOPNOTSUPP;
> > > }
> > >
> > > - return 0;
> > > + return ret;
> > > }
> > >
> > > static int
> > > @@ -735,7 +735,7 @@ nouveau_hwmon_init(struct drm_device *dev)
> > > hwmon->dev = dev;
> > >
> > > if (therm && therm->attr_get && therm->attr_set) {
> > > - if (nvkm_therm_temp_get(therm) >= 0)
> > > + if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > > special_groups[i++] = &temp1_auto_point_sensor_group;
> > > if (therm->fan_get && therm->fan_get(therm) >= 0)
> > > special_groups[i++] = &pwm_fan_sensor_group;
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > index 4a4d1e224126..e7ffea1512e6 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > @@ -27,10 +27,15 @@
> > > #include <subdev/pmu.h>
> > >
> > > int
> > > -nvkm_therm_temp_get(struct nvkm_therm *therm)
> > > +nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
> > > {
> > > + int ignored_reading;
> > > +
> > > + if (temp == NULL)
> > > + temp = &ignored_reading;
> > > +
> > > if (therm->func->temp_get)
> > > - return therm->func->temp_get(therm);
> > > + return therm->func->temp_get(therm, temp);
> > > return -ENODEV;
> > > }
> > >
> > > @@ -40,9 +45,11 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
> > > struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
> > > *cur_trip = NULL,
> > > *last_trip = therm->last_trip;
> > > - u8 temp = therm->func->temp_get(therm);
> > > + int temp;
> > > u16 duty, i;
> > >
> > > + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > +
> >
> > I think adding WARN_ONs like this is fine, I am just not sure how much
> > our code is actually protected against the GPU being runtime
> > suspended... so on laptops this could trigger some warnings.. might
> > even be just because nouveau_pmops_runtime_suspend and the hwmon API
> > race here, even though it's quite unlikely that the hwmon thread
> > stalls enough so it survives the entire suspend path. But in theory
> > that could be causing random errors getting reported.
> >
>
> Actually, we never set the status to DRM_SWITCH_POWER_CHANGING, so I
> guess this race is indeed real. I thought we did that, but maybe
> that's because I never send out the patch to use them.
>

Yeah, so my thinking was that since before there was no error handling
it would show whether or not it's a real problem, but it sounds like it
is so I think it'd be better to handle that.

I see nvkm_therm_update() is checking to make sure the duty is positive.
Should it always be safe to skip setting the fan in cases where the
temperature isn't readable? If so we can just return the error here.

> > > /* look for the trip point corresponding to the current temperature */
> > > cur_trip = NULL;
> > > for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
> > > @@ -70,9 +77,11 @@ static int
> > > nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
> > > u8 linear_max_temp)
> > > {
> > > - u8 temp = therm->func->temp_get(therm);
> > > + int temp;
> > > u16 duty;
> > >
> > > + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > +
> > > /* handle the non-linear part first */
> > > if (temp < linear_min_temp)
> > > return therm->fan->bios.min_duty;
> > > @@ -200,7 +209,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
> > > /* do not allow automatic fan management if the thermal sensor is
> > > * not available */
> > > if (mode == NVKM_THERM_CTRL_AUTO &&
> > > - therm->func->temp_get(therm) < 0)
> > > + nvkm_therm_temp_get(therm, NULL) < 0)
> > > return -EINVAL;
> > >
> > > if (therm->mode == mode)
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > index 96f8da40ac82..e2f891d5c7ba 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > @@ -27,14 +27,15 @@
> > > #include <subdev/fuse.h>
> > >
> > > int
> > > -g84_temp_get(struct nvkm_therm *therm)
> > > +g84_temp_get(struct nvkm_therm *therm, int *temp)
> > > {
> > > struct nvkm_device *device = therm->subdev.device;
> > >
> > > - if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
> > > - return nvkm_rd32(device, 0x20400);
> > > - else
> > > + if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
> > > return -ENODEV;
> > > +
> > > + *temp = nvkm_rd32(device, 0x20400);
> > > + return 0;
> > > }
> > >
> > > void
> > > @@ -115,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
> > > }
> > >
> > > /* fix the state (in case someone reprogrammed the alarms) */
> > > - cur = therm->func->temp_get(therm);
> > > + WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
> > > if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
> > > new_state = NVKM_THERM_THRS_HIGHER;
> > > else if (new_state == NVKM_THERM_THRS_HIGHER &&
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > index 9f0dea3f61dc..4c32e4f21bec 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > @@ -24,7 +24,7 @@
> > > #include "priv.h"
> > >
> > > static int
> > > -gp100_temp_get(struct nvkm_therm *therm)
> > > +gp100_temp_get(struct nvkm_therm *therm, int *temp)
> > > {
> > > struct nvkm_device *device = therm->subdev.device;
> > > struct nvkm_subdev *subdev = &therm->subdev;
> > > @@ -35,11 +35,12 @@ gp100_temp_get(struct nvkm_therm *therm)
> > > if (tsensor & 0x40000000)
> > > nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
> > >
> > > - /* device valid */
> > > - if (tsensor & 0x20000000)
> > > - return (inttemp >> 8);
> > > - else
> > > + /* device invalid */
> > > + if (!(tsensor & 0x20000000))
> > > return -ENODEV;
> > > +
> > > + *temp = inttemp >> 8;
> > > + return 0;
> > > }
> > >
> > > static const struct nvkm_therm_func
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > index 2c92ffb5f9d0..9753ad4bee4a 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > @@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
> > > }
> > >
> > > static int
> > > -nv40_temp_get(struct nvkm_therm *therm)
> > > +nv40_temp_get(struct nvkm_therm *therm, int *temp)
> > > {
> > > struct nvkm_device *device = therm->subdev.device;
> > > struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > > @@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
> > > core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > > core_temp = core_temp + sensor->offset_constant - 8;
> > >
> > > - /* reserve negative temperatures for errors */
> > > - if (core_temp < 0)
> > > - core_temp = 0;
> > > -
> > > - return core_temp;
> > > + *temp = core_temp;
> > > + return 0;
> > > }
> > >
> > > static int
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > index 9b57b433d4cf..38fa6777c129 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > @@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
> > > }
> > >
> > > static int
> > > -nv50_temp_get(struct nvkm_therm *therm)
> > > +nv50_temp_get(struct nvkm_therm *therm, int *temp)
> > > {
> > > struct nvkm_device *device = therm->subdev.device;
> > > struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > > @@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
> > > core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > > core_temp = core_temp + sensor->offset_constant - 8;
> > >
> > > - /* reserve negative temperatures for errors */
> > > - if (core_temp < 0)
> > > - core_temp = 0;
> > > -
> > > - return core_temp;
> > > + *temp = core_temp;
> > > + return 0;
> > > }
> > >
> > > static void
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > index 21659daf1864..04607d8b1755 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > @@ -91,7 +91,15 @@ struct nvkm_therm_func {
> > > int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
> > > int (*pwm_clock)(struct nvkm_therm *, int line);
> > >
> > > - int (*temp_get)(struct nvkm_therm *);
> > > + /**
> > > + * @temp_get: Get the temperature reading from a thermal device
> > > + *
> > > + * @therm: The thermal device instance.
> > > + * @temp: A pointer to write the temperature reading to.
> > > + *
> > > + * Returns: Zero on success, or a negative error code on failure.
> > > + */
> > > + int (*temp_get)(struct nvkm_therm *therm, int *temp);
> > >
> > > int (*fan_sense)(struct nvkm_therm *);
> > >
> > > @@ -110,7 +118,12 @@ int nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
> > > int nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
> > > int nv50_fan_pwm_clock(struct nvkm_therm *, int);
> > >
> > > -int g84_temp_get(struct nvkm_therm *);
> > > +/**
> > > + * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
> > > + * @therm: The thermal device instance.
> > > + * @temp: A pointer to write the temperature reading to.
> > > + */
> > > +int g84_temp_get(struct nvkm_therm *therm, int *temp);
> > > void g84_sensor_setup(struct nvkm_therm *);
> > > void g84_therm_fini(struct nvkm_therm *);
> > >
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > index ddb2b2c600ca..1e8803901abc 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > @@ -86,7 +86,9 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
> > > static const char * const thresholds[] = {
> > > "fanboost", "downclock", "critical", "shutdown"
> > > };
> > > - int temperature = therm->func->temp_get(therm);
> > > + int temperature;
> > > +
> > > + WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
> > >
> > > if (thrs < 0 || thrs > 3)
> > > return;
> > > @@ -140,7 +142,9 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
> > > {
> > > enum nvkm_therm_thrs_direction direction;
> > > enum nvkm_therm_thrs_state prev_state, new_state;
> > > - int temp = therm->func->temp_get(therm);
> > > + int temp;
> > > +
> > > + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > >
> > > prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
> > >
> > > @@ -185,7 +189,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
> > > spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
> > >
> > > /* schedule the next poll in one second */
> > > - if (therm->func->temp_get(therm) >= 0)
> > > + if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > > nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
> > > }
> > >
> > > @@ -229,7 +233,7 @@ nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
> > > {
> > > const char *sensor_avail = "yes";
> > >
> > > - if (therm->func->temp_get(therm) < 0)
> > > + if (nvkm_therm_temp_get(therm, NULL) < 0)
> > > sensor_avail = "no";
> > >
> > > nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
> > > --
> > > 2.26.2
> > >
>

2020-09-17 14:27:39

by Karol Herbst

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] drm/nouveau: return temperatures in temp_get() via parameter

On Thu, Sep 17, 2020 at 4:11 PM Jeremy Cline <[email protected]> wrote:
>
> On Wed, Sep 16, 2020 at 10:03:22PM +0200, Karol Herbst wrote:
> > On Wed, Sep 16, 2020 at 10:01 PM Karol Herbst <[email protected]> wrote:
> > >
> > > On Wed, Sep 16, 2020 at 9:47 PM Jeremy Cline <[email protected]> wrote:
> > > >
> > > > The temp_get() function currently returns negative error numbers or a
> > > > temperature. However, the thermal sensors can (in theory) measure
> > > > negative temperatures. Some implementations of temp_get() correctly
> > > > clamp negative temperature readings to 0 so that users do not mistake
> > > > them for errors, but some, like gp100_temp_get(), do not.
> > > >
> > > > Rather than relying on implementations remembering to clamp values,
> > > > dedicate the function return value to error codes and accept a pointer
> > > > to an integer where the temperature reading should be stored.
> > > >
> > > > Signed-off-by: Jeremy Cline <[email protected]>
> > > > ---
> > > > .../drm/nouveau/include/nvkm/subdev/therm.h | 2 +-
> > > > drivers/gpu/drm/nouveau/nouveau_hwmon.c | 12 ++++++------
> > > > .../gpu/drm/nouveau/nvkm/subdev/therm/base.c | 19 ++++++++++++++-----
> > > > .../gpu/drm/nouveau/nvkm/subdev/therm/g84.c | 11 ++++++-----
> > > > .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
> > > > .../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c | 9 +++------
> > > > .../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c | 9 +++------
> > > > .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h | 17 +++++++++++++++--
> > > > .../gpu/drm/nouveau/nvkm/subdev/therm/temp.c | 12 ++++++++----
> > > > 9 files changed, 62 insertions(+), 40 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > > index 62c34f98c930..bfe9779216fc 100644
> > > > --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > > +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > > @@ -99,7 +99,7 @@ struct nvkm_therm {
> > > > bool clkgating_enabled;
> > > > };
> > > >
> > > > -int nvkm_therm_temp_get(struct nvkm_therm *);
> > > > +int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
> > > > int nvkm_therm_fan_sense(struct nvkm_therm *);
> > > > int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> > > > void nvkm_therm_clkgate_init(struct nvkm_therm *,
> > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > > index 1c3104d20571..aff6aa296678 100644
> > > > --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > > +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > > @@ -325,7 +325,7 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
> > > > struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
> > > > struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > > >
> > > > - if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
> > > > + if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
> > > > return 0;
> > > >
> > > > switch (attr) {
> > > > @@ -419,7 +419,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > > > struct drm_device *drm_dev = dev_get_drvdata(dev);
> > > > struct nouveau_drm *drm = nouveau_drm(drm_dev);
> > > > struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > > > - int ret;
> > > > + int ret = 0, temp;
> > > >
> > > > if (!therm || !therm->attr_get)
> > > > return -EOPNOTSUPP;
> > > > @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > > > case hwmon_temp_input:
> > > > if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> > > > return -EINVAL;
> > > > - ret = nvkm_therm_temp_get(therm);
> > > > - *val = ret < 0 ? ret : (ret * 1000);
> > > > + ret = nvkm_therm_temp_get(therm, &temp);
> > > > + *val = temp * 1000;
> > > > break;
> > > > case hwmon_temp_max:
> > > > *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> > > > @@ -459,7 +459,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > > > return -EOPNOTSUPP;
> > > > }
> > > >
> > > > - return 0;
> > > > + return ret;
> > > > }
> > > >
> > > > static int
> > > > @@ -735,7 +735,7 @@ nouveau_hwmon_init(struct drm_device *dev)
> > > > hwmon->dev = dev;
> > > >
> > > > if (therm && therm->attr_get && therm->attr_set) {
> > > > - if (nvkm_therm_temp_get(therm) >= 0)
> > > > + if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > > > special_groups[i++] = &temp1_auto_point_sensor_group;
> > > > if (therm->fan_get && therm->fan_get(therm) >= 0)
> > > > special_groups[i++] = &pwm_fan_sensor_group;
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > > index 4a4d1e224126..e7ffea1512e6 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > > @@ -27,10 +27,15 @@
> > > > #include <subdev/pmu.h>
> > > >
> > > > int
> > > > -nvkm_therm_temp_get(struct nvkm_therm *therm)
> > > > +nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
> > > > {
> > > > + int ignored_reading;
> > > > +
> > > > + if (temp == NULL)
> > > > + temp = &ignored_reading;
> > > > +
> > > > if (therm->func->temp_get)
> > > > - return therm->func->temp_get(therm);
> > > > + return therm->func->temp_get(therm, temp);
> > > > return -ENODEV;
> > > > }
> > > >
> > > > @@ -40,9 +45,11 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
> > > > struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
> > > > *cur_trip = NULL,
> > > > *last_trip = therm->last_trip;
> > > > - u8 temp = therm->func->temp_get(therm);
> > > > + int temp;
> > > > u16 duty, i;
> > > >
> > > > + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > > +
> > >
> > > I think adding WARN_ONs like this is fine, I am just not sure how much
> > > our code is actually protected against the GPU being runtime
> > > suspended... so on laptops this could trigger some warnings.. might
> > > even be just because nouveau_pmops_runtime_suspend and the hwmon API
> > > race here, even though it's quite unlikely that the hwmon thread
> > > stalls enough so it survives the entire suspend path. But in theory
> > > that could be causing random errors getting reported.
> > >
> >
> > Actually, we never set the status to DRM_SWITCH_POWER_CHANGING, so I
> > guess this race is indeed real. I thought we did that, but maybe
> > that's because I never send out the patch to use them.
> >
>
> Yeah, so my thinking was that since before there was no error handling
> it would show whether or not it's a real problem, but it sounds like it
> is so I think it'd be better to handle that.
>

yeah.. I guess that issue existed before already although I think we
treated negative values special enough so it never showed? mhh..
anyway, I think we want to set the power state to _CHANGING anyway and
I can send out a patch for that. The other part is to make sure that
the suspend paths are stopping all the relevant kworkers which I'd
assume they do anyway.

> I see nvkm_therm_update() is checking to make sure the duty is positive.
> Should it always be safe to skip setting the fan in cases where the
> temperature isn't readable? If so we can just return the error here.
>

I think the fan workers should be stopped once the GPU runtime
suspends, but I am not 100% on this, so we might want to check up on
that.

> > > > /* look for the trip point corresponding to the current temperature */
> > > > cur_trip = NULL;
> > > > for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
> > > > @@ -70,9 +77,11 @@ static int
> > > > nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
> > > > u8 linear_max_temp)
> > > > {
> > > > - u8 temp = therm->func->temp_get(therm);
> > > > + int temp;
> > > > u16 duty;
> > > >
> > > > + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > > +
> > > > /* handle the non-linear part first */
> > > > if (temp < linear_min_temp)
> > > > return therm->fan->bios.min_duty;
> > > > @@ -200,7 +209,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
> > > > /* do not allow automatic fan management if the thermal sensor is
> > > > * not available */
> > > > if (mode == NVKM_THERM_CTRL_AUTO &&
> > > > - therm->func->temp_get(therm) < 0)
> > > > + nvkm_therm_temp_get(therm, NULL) < 0)
> > > > return -EINVAL;
> > > >
> > > > if (therm->mode == mode)
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > > index 96f8da40ac82..e2f891d5c7ba 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > > @@ -27,14 +27,15 @@
> > > > #include <subdev/fuse.h>
> > > >
> > > > int
> > > > -g84_temp_get(struct nvkm_therm *therm)
> > > > +g84_temp_get(struct nvkm_therm *therm, int *temp)
> > > > {
> > > > struct nvkm_device *device = therm->subdev.device;
> > > >
> > > > - if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
> > > > - return nvkm_rd32(device, 0x20400);
> > > > - else
> > > > + if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
> > > > return -ENODEV;
> > > > +
> > > > + *temp = nvkm_rd32(device, 0x20400);
> > > > + return 0;
> > > > }
> > > >
> > > > void
> > > > @@ -115,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
> > > > }
> > > >
> > > > /* fix the state (in case someone reprogrammed the alarms) */
> > > > - cur = therm->func->temp_get(therm);
> > > > + WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
> > > > if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
> > > > new_state = NVKM_THERM_THRS_HIGHER;
> > > > else if (new_state == NVKM_THERM_THRS_HIGHER &&
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > > index 9f0dea3f61dc..4c32e4f21bec 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > > @@ -24,7 +24,7 @@
> > > > #include "priv.h"
> > > >
> > > > static int
> > > > -gp100_temp_get(struct nvkm_therm *therm)
> > > > +gp100_temp_get(struct nvkm_therm *therm, int *temp)
> > > > {
> > > > struct nvkm_device *device = therm->subdev.device;
> > > > struct nvkm_subdev *subdev = &therm->subdev;
> > > > @@ -35,11 +35,12 @@ gp100_temp_get(struct nvkm_therm *therm)
> > > > if (tsensor & 0x40000000)
> > > > nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
> > > >
> > > > - /* device valid */
> > > > - if (tsensor & 0x20000000)
> > > > - return (inttemp >> 8);
> > > > - else
> > > > + /* device invalid */
> > > > + if (!(tsensor & 0x20000000))
> > > > return -ENODEV;
> > > > +
> > > > + *temp = inttemp >> 8;
> > > > + return 0;
> > > > }
> > > >
> > > > static const struct nvkm_therm_func
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > > index 2c92ffb5f9d0..9753ad4bee4a 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > > @@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
> > > > }
> > > >
> > > > static int
> > > > -nv40_temp_get(struct nvkm_therm *therm)
> > > > +nv40_temp_get(struct nvkm_therm *therm, int *temp)
> > > > {
> > > > struct nvkm_device *device = therm->subdev.device;
> > > > struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > > > @@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
> > > > core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > > > core_temp = core_temp + sensor->offset_constant - 8;
> > > >
> > > > - /* reserve negative temperatures for errors */
> > > > - if (core_temp < 0)
> > > > - core_temp = 0;
> > > > -
> > > > - return core_temp;
> > > > + *temp = core_temp;
> > > > + return 0;
> > > > }
> > > >
> > > > static int
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > > index 9b57b433d4cf..38fa6777c129 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > > @@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
> > > > }
> > > >
> > > > static int
> > > > -nv50_temp_get(struct nvkm_therm *therm)
> > > > +nv50_temp_get(struct nvkm_therm *therm, int *temp)
> > > > {
> > > > struct nvkm_device *device = therm->subdev.device;
> > > > struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > > > @@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
> > > > core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > > > core_temp = core_temp + sensor->offset_constant - 8;
> > > >
> > > > - /* reserve negative temperatures for errors */
> > > > - if (core_temp < 0)
> > > > - core_temp = 0;
> > > > -
> > > > - return core_temp;
> > > > + *temp = core_temp;
> > > > + return 0;
> > > > }
> > > >
> > > > static void
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > > index 21659daf1864..04607d8b1755 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > > @@ -91,7 +91,15 @@ struct nvkm_therm_func {
> > > > int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
> > > > int (*pwm_clock)(struct nvkm_therm *, int line);
> > > >
> > > > - int (*temp_get)(struct nvkm_therm *);
> > > > + /**
> > > > + * @temp_get: Get the temperature reading from a thermal device
> > > > + *
> > > > + * @therm: The thermal device instance.
> > > > + * @temp: A pointer to write the temperature reading to.
> > > > + *
> > > > + * Returns: Zero on success, or a negative error code on failure.
> > > > + */
> > > > + int (*temp_get)(struct nvkm_therm *therm, int *temp);
> > > >
> > > > int (*fan_sense)(struct nvkm_therm *);
> > > >
> > > > @@ -110,7 +118,12 @@ int nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
> > > > int nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
> > > > int nv50_fan_pwm_clock(struct nvkm_therm *, int);
> > > >
> > > > -int g84_temp_get(struct nvkm_therm *);
> > > > +/**
> > > > + * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
> > > > + * @therm: The thermal device instance.
> > > > + * @temp: A pointer to write the temperature reading to.
> > > > + */
> > > > +int g84_temp_get(struct nvkm_therm *therm, int *temp);
> > > > void g84_sensor_setup(struct nvkm_therm *);
> > > > void g84_therm_fini(struct nvkm_therm *);
> > > >
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > > index ddb2b2c600ca..1e8803901abc 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > > @@ -86,7 +86,9 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
> > > > static const char * const thresholds[] = {
> > > > "fanboost", "downclock", "critical", "shutdown"
> > > > };
> > > > - int temperature = therm->func->temp_get(therm);
> > > > + int temperature;
> > > > +
> > > > + WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
> > > >
> > > > if (thrs < 0 || thrs > 3)
> > > > return;
> > > > @@ -140,7 +142,9 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
> > > > {
> > > > enum nvkm_therm_thrs_direction direction;
> > > > enum nvkm_therm_thrs_state prev_state, new_state;
> > > > - int temp = therm->func->temp_get(therm);
> > > > + int temp;
> > > > +
> > > > + WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > >
> > > > prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
> > > >
> > > > @@ -185,7 +189,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
> > > > spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
> > > >
> > > > /* schedule the next poll in one second */
> > > > - if (therm->func->temp_get(therm) >= 0)
> > > > + if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > > > nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
> > > > }
> > > >
> > > > @@ -229,7 +233,7 @@ nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
> > > > {
> > > > const char *sensor_avail = "yes";
> > > >
> > > > - if (therm->func->temp_get(therm) < 0)
> > > > + if (nvkm_therm_temp_get(therm, NULL) < 0)
> > > > sensor_avail = "no";
> > > >
> > > > nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
> > > > --
> > > > 2.26.2
> > > >
> >
>