2018-10-22 20:34:36

by Ross Zwisler

[permalink] [raw]
Subject: [PATCH 1/2] thermal: Add notifier call chain for hot/critical events

From: Duncan Laurie <[email protected]>

This will allow drivers to register a callback for important
thermal events and log critical thresholds that cause the system
to shut down.

There are other places this might work, but after consideration
I think it makes sense to have the chain at this level:

The ACPI thermal driver has an existing notify function that is
eventually called into, but that would limit the notifier to only
working on systems that use ACPI.

The cpufreq driver is already getting a notify callback executed
in this path (tz->ops->notify) but the threshold info is not passed
to the cpu_cooling notifier chain so it is not useful for logging.

Signed-off-by: Duncan Laurie <[email protected]>
Reviewed-by: Olof Johansson <[email protected]>
Reviewed-by: Vincent Palatin <[email protected]>
[ rez: updated changelog for upstream ]
Signed-off-by: Ross Zwisler <[email protected]>
---
drivers/thermal/thermal_core.c | 38 ++++++++++++++++++++++++++++++++++
include/linux/thermal.h | 4 ++++
2 files changed, 42 insertions(+)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 6ab982309e6a0..e1f8764b3d9f9 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/kdev_t.h>
#include <linux/idr.h>
+#include <linux/notifier.h>
#include <linux/thermal.h>
#include <linux/reboot.h>
#include <linux/string.h>
@@ -313,12 +314,46 @@ static void monitor_thermal_zone(struct thermal_zone_device *tz)
mutex_unlock(&tz->lock);
}

+static BLOCKING_NOTIFIER_HEAD(thermal_notifier_list);
+
+/**
+ * register_thermal_notifier - Register function to be called for
+ * critical thermal events.
+ *
+ * @nb: Info about notifier function to be called
+ *
+ * Currently always returns zero, as blocking_notifier_chain_register()
+ * always returns zero.
+ */
+int register_thermal_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&thermal_notifier_list, nb);
+}
+EXPORT_SYMBOL(register_thermal_notifier);
+
+/**
+ * unregister_thermal_notifier - Unregister thermal notifier
+ *
+ * @nb: Hook to be unregistered
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_thermal_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&thermal_notifier_list, nb);
+}
+EXPORT_SYMBOL(unregister_thermal_notifier);
+
+
static void handle_non_critical_trips(struct thermal_zone_device *tz,
int trip,
enum thermal_trip_type trip_type)
{
tz->governor ? tz->governor->throttle(tz, trip) :
def_governor->throttle(tz, trip);
+
+ blocking_notifier_call_chain(&thermal_notifier_list,
+ trip_type, NULL);
}

/**
@@ -385,6 +420,9 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
if (tz->ops->notify)
tz->ops->notify(tz, trip, trip_type);

+ blocking_notifier_call_chain(&thermal_notifier_list,
+ trip_type, NULL);
+
if (trip_type == THERMAL_TRIP_CRITICAL) {
dev_emerg(&tz->device,
"critical temperature reached (%d C), shutting down\n",
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 5f4705f46c2f9..b948344d55cab 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -13,6 +13,7 @@
#include <linux/of.h>
#include <linux/idr.h>
#include <linux/device.h>
+#include <linux/notifier.h>
#include <linux/sysfs.h>
#include <linux/workqueue.h>
#include <uapi/linux/thermal.h>
@@ -542,4 +543,7 @@ static inline int thermal_generate_netlink_event(struct thermal_zone_device *tz,
}
#endif

+extern int register_thermal_notifier(struct notifier_block *);
+extern int unregister_thermal_notifier(struct notifier_block *);
+
#endif /* __THERMAL_H__ */
--
2.19.1.568.g152ad8e336-goog



2018-10-22 21:08:10

by Ross Zwisler

[permalink] [raw]
Subject: [PATCH 2/2] gsmi: Log event for critical thermal thresholds

From: Duncan Laurie <[email protected]>

This registers a notifier for the new thermal notifier
introduced in a previous commit and logs a kernel shutdown
event when the notifier is called for crossing the
THERMAL_TRIP_CRITICAL threshold.

To test:
1) Modify critical shutdown threshold in the BIOS
2) Generate a load on the system to increase temperature
3) Wait until system powers off
4) Power back on and read the event log:

4 | 2012-07-18 10:47:02 | Kernel Event | Critical Thermal Threshold
5 | 2012-07-18 10:47:06 | Kernel Event | Clean Shutdown
6 | 2012-07-18 10:47:06 | ACPI Enter | S5

Signed-off-by: Duncan Laurie <[email protected]>
Reviewed-by: Vincent Palatin <[email protected]>
Reviewed-by: Olof Johansson <[email protected]>
[ rez: updated changelog for upstream ]
Signed-off-by: Ross Zwisler <[email protected]>
---
drivers/firmware/google/gsmi.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c
index c8f169bf2e27d..ee0c611b83e99 100644
--- a/drivers/firmware/google/gsmi.c
+++ b/drivers/firmware/google/gsmi.c
@@ -29,6 +29,7 @@
#include <linux/efi.h>
#include <linux/module.h>
#include <linux/ucs2_string.h>
+#include <linux/thermal.h>

#define GSMI_SHUTDOWN_CLEAN 0 /* Clean Shutdown */
/* TODO([email protected]): Tie in HARDLOCKUP_DETECTOR with NMIWDT */
@@ -40,6 +41,7 @@
#define GSMI_SHUTDOWN_SOFTWDT 6 /* Software Watchdog */
#define GSMI_SHUTDOWN_MBE 7 /* Uncorrected ECC */
#define GSMI_SHUTDOWN_TRIPLE 8 /* Triple Fault */
+#define GSMI_SHUTDOWN_THERMAL 9 /* Critical Thermal Threshold */

#define DRIVER_VERSION "1.0"
#define GSMI_GUID_SIZE 16
@@ -670,6 +672,18 @@ static struct notifier_block gsmi_panic_notifier = {
.notifier_call = gsmi_panic_callback,
};

+static int gsmi_thermal_callback(struct notifier_block *nb,
+ unsigned long reason, void *arg)
+{
+ if (reason == THERMAL_TRIP_CRITICAL)
+ gsmi_shutdown_reason(GSMI_SHUTDOWN_THERMAL);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block gsmi_thermal_notifier = {
+ .notifier_call = gsmi_thermal_callback
+};
+
/*
* This hash function was blatantly copied from include/linux/hash.h.
* It is used by this driver to obfuscate a board name that requires a
@@ -892,6 +906,7 @@ static __init int gsmi_init(void)
goto out_remove_sysfs_files;
}

+ register_thermal_notifier(&gsmi_thermal_notifier);
register_reboot_notifier(&gsmi_reboot_notifier);
register_die_notifier(&gsmi_die_notifier);
atomic_notifier_chain_register(&panic_notifier_list,
@@ -918,6 +933,7 @@ static __init int gsmi_init(void)

static void __exit gsmi_exit(void)
{
+ unregister_thermal_notifier(&gsmi_thermal_notifier);
unregister_reboot_notifier(&gsmi_reboot_notifier);
unregister_die_notifier(&gsmi_die_notifier);
atomic_notifier_chain_unregister(&panic_notifier_list,
--
2.19.1.568.g152ad8e336-goog


2018-10-22 23:02:48

by Tom Psyborg

[permalink] [raw]
Subject: Re: [PATCH 1/2] thermal: Add notifier call chain for hot/critical events

On 22/10/2018, Ross Zwisler <[email protected]> wrote:
> From: Duncan Laurie <[email protected]>
>
> This will allow drivers to register a callback for important
> thermal events and log critical thresholds that cause the system
> to shut down.

when you have proper implementation of thermal support the cores
should throttle to 80, 60 and 40% once reached passive trip point so
they never even reach critical point that causes shutdown

2018-10-23 08:19:43

by Daniel Lezcano

[permalink] [raw]
Subject: Re: [PATCH 1/2] thermal: Add notifier call chain for hot/critical events

On 22/10/2018 22:32, Ross Zwisler wrote:
> From: Duncan Laurie <[email protected]>
>
> This will allow drivers to register a callback for important
> thermal events and log critical thresholds that cause the system
> to shut down.
>
> There are other places this might work, but after consideration
> I think it makes sense to have the chain at this level:
>
> The ACPI thermal driver has an existing notify function that is
> eventually called into, but that would limit the notifier to only
> working on systems that use ACPI.
>
> The cpufreq driver is already getting a notify callback executed
> in this path (tz->ops->notify) but the threshold info is not passed
> to the cpu_cooling notifier chain so it is not useful for logging.
>
> Signed-off-by: Duncan Laurie <[email protected]>
> Reviewed-by: Olof Johansson <[email protected]>
> Reviewed-by: Vincent Palatin <[email protected]>
> [ rez: updated changelog for upstream ]
> Signed-off-by: Ross Zwisler <[email protected]>
> ---

At the critical code path, the thermal framework gives a chance to
orderly_poweroff() to complete but the execution is bounded by a delay
which in turn forces a kernel power off.

The usage of a notifier call chain is inadequate in this code path. When
the system is reaching a critical temperature it has to *urgently* take
action. By adding this notifier that allows anyone to hook this path and
add an overhead/delays.



> drivers/thermal/thermal_core.c | 38 ++++++++++++++++++++++++++++++++++
> include/linux/thermal.h | 4 ++++
> 2 files changed, 42 insertions(+)
>
> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
> index 6ab982309e6a0..e1f8764b3d9f9 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -15,6 +15,7 @@
> #include <linux/slab.h>
> #include <linux/kdev_t.h>
> #include <linux/idr.h>
> +#include <linux/notifier.h>
> #include <linux/thermal.h>
> #include <linux/reboot.h>
> #include <linux/string.h>
> @@ -313,12 +314,46 @@ static void monitor_thermal_zone(struct thermal_zone_device *tz)
> mutex_unlock(&tz->lock);
> }
>
> +static BLOCKING_NOTIFIER_HEAD(thermal_notifier_list);
> +
> +/**
> + * register_thermal_notifier - Register function to be called for
> + * critical thermal events.
> + *
> + * @nb: Info about notifier function to be called
> + *
> + * Currently always returns zero, as blocking_notifier_chain_register()
> + * always returns zero.
> + */
> +int register_thermal_notifier(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_register(&thermal_notifier_list, nb);
> +}
> +EXPORT_SYMBOL(register_thermal_notifier);
> +
> +/**
> + * unregister_thermal_notifier - Unregister thermal notifier
> + *
> + * @nb: Hook to be unregistered
> + *
> + * Returns zero on success, or %-ENOENT on failure.
> + */
> +int unregister_thermal_notifier(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_unregister(&thermal_notifier_list, nb);
> +}
> +EXPORT_SYMBOL(unregister_thermal_notifier);
> +
> +
> static void handle_non_critical_trips(struct thermal_zone_device *tz,
> int trip,
> enum thermal_trip_type trip_type)
> {
> tz->governor ? tz->governor->throttle(tz, trip) :
> def_governor->throttle(tz, trip);
> +
> + blocking_notifier_call_chain(&thermal_notifier_list,
> + trip_type, NULL);
> }
>
> /**
> @@ -385,6 +420,9 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
> if (tz->ops->notify)
> tz->ops->notify(tz, trip, trip_type);
>
> + blocking_notifier_call_chain(&thermal_notifier_list,
> + trip_type, NULL);
> +
> if (trip_type == THERMAL_TRIP_CRITICAL) {
> dev_emerg(&tz->device,
> "critical temperature reached (%d C), shutting down\n",
> diff --git a/include/linux/thermal.h b/include/linux/thermal.h
> index 5f4705f46c2f9..b948344d55cab 100644
> --- a/include/linux/thermal.h
> +++ b/include/linux/thermal.h
> @@ -13,6 +13,7 @@
> #include <linux/of.h>
> #include <linux/idr.h>
> #include <linux/device.h>
> +#include <linux/notifier.h>
> #include <linux/sysfs.h>
> #include <linux/workqueue.h>
> #include <uapi/linux/thermal.h>
> @@ -542,4 +543,7 @@ static inline int thermal_generate_netlink_event(struct thermal_zone_device *tz,
> }
> #endif
>
> +extern int register_thermal_notifier(struct notifier_block *);
> +extern int unregister_thermal_notifier(struct notifier_block *);
> +
> #endif /* __THERMAL_H__ */
>


--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog


2018-10-30 03:17:03

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH 2/2] gsmi: Log event for critical thermal thresholds

Hi Duncan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on soc-thermal/next]
[also build test ERROR on v4.19]
[cannot apply to next-20181029]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url: https://github.com/0day-ci/linux/commits/Ross-Zwisler/thermal-Add-notifier-call-chain-for-hot-critical-events/20181023-043806
base: https://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal.git next
config: i386-randconfig-k0-10291547 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386

All errors (new ones prefixed by >>):

drivers/firmware/google/gsmi.o: In function `gsmi_exit':
>> drivers/firmware/google/gsmi.c:936: undefined reference to `unregister_thermal_notifier'
drivers/firmware/google/gsmi.o: In function `gsmi_init':
>> drivers/firmware/google/gsmi.c:909: undefined reference to `register_thermal_notifier'

vim +936 drivers/firmware/google/gsmi.c

787
788 static __init int gsmi_init(void)
789 {
790 unsigned long flags;
791 int ret;
792
793 ret = gsmi_system_valid();
794 if (ret)
795 return ret;
796
797 gsmi_dev.smi_cmd = acpi_gbl_FADT.smi_command;
798
799 /* register device */
800 gsmi_dev.pdev = platform_device_register_full(&gsmi_dev_info);
801 if (IS_ERR(gsmi_dev.pdev)) {
802 printk(KERN_ERR "gsmi: unable to register platform device\n");
803 return PTR_ERR(gsmi_dev.pdev);
804 }
805
806 /* SMI access needs to be serialized */
807 spin_lock_init(&gsmi_dev.lock);
808
809 ret = -ENOMEM;
810 gsmi_dev.dma_pool = dma_pool_create("gsmi", &gsmi_dev.pdev->dev,
811 GSMI_BUF_SIZE, GSMI_BUF_ALIGN, 0);
812 if (!gsmi_dev.dma_pool)
813 goto out_err;
814
815 /*
816 * pre-allocate buffers because sometimes we are called when
817 * this is not feasible: oops, panic, die, mce, etc
818 */
819 gsmi_dev.name_buf = gsmi_buf_alloc();
820 if (!gsmi_dev.name_buf) {
821 printk(KERN_ERR "gsmi: failed to allocate name buffer\n");
822 goto out_err;
823 }
824
825 gsmi_dev.data_buf = gsmi_buf_alloc();
826 if (!gsmi_dev.data_buf) {
827 printk(KERN_ERR "gsmi: failed to allocate data buffer\n");
828 goto out_err;
829 }
830
831 gsmi_dev.param_buf = gsmi_buf_alloc();
832 if (!gsmi_dev.param_buf) {
833 printk(KERN_ERR "gsmi: failed to allocate param buffer\n");
834 goto out_err;
835 }
836
837 /*
838 * Determine type of handshake used to serialize the SMI
839 * entry. See also gsmi_exec().
840 *
841 * There's a "behavior" present on some chipsets where writing the
842 * SMI trigger register in the southbridge doesn't result in an
843 * immediate SMI. Rather, the processor can execute "a few" more
844 * instructions before the SMI takes effect. To ensure synchronous
845 * behavior, implement a handshake between the kernel driver and the
846 * firmware handler to spin until released. This ioctl determines
847 * the type of handshake.
848 *
849 * NONE: The firmware handler does not implement any
850 * handshake. Either it doesn't need to, or it's legacy firmware
851 * that doesn't know it needs to and never will.
852 *
853 * CF: The firmware handler will clear the CF in the saved
854 * state before returning. The driver may set the CF and test for
855 * it to clear before proceeding.
856 *
857 * SPIN: The firmware handler does not implement any handshake
858 * but the driver should spin for a hundred or so microseconds
859 * to ensure the SMI has triggered.
860 *
861 * Finally, the handler will return -ENOSYS if
862 * GSMI_CMD_HANDSHAKE_TYPE is unimplemented, which implies
863 * HANDSHAKE_NONE.
864 */
865 spin_lock_irqsave(&gsmi_dev.lock, flags);
866 gsmi_dev.handshake_type = GSMI_HANDSHAKE_SPIN;
867 gsmi_dev.handshake_type =
868 gsmi_exec(GSMI_CALLBACK, GSMI_CMD_HANDSHAKE_TYPE);
869 if (gsmi_dev.handshake_type == -ENOSYS)
870 gsmi_dev.handshake_type = GSMI_HANDSHAKE_NONE;
871 spin_unlock_irqrestore(&gsmi_dev.lock, flags);
872
873 /* Remove and clean up gsmi if the handshake could not complete. */
874 if (gsmi_dev.handshake_type == -ENXIO) {
875 printk(KERN_INFO "gsmi version " DRIVER_VERSION
876 " failed to load\n");
877 ret = -ENODEV;
878 goto out_err;
879 }
880
881 /* Register in the firmware directory */
882 ret = -ENOMEM;
883 gsmi_kobj = kobject_create_and_add("gsmi", firmware_kobj);
884 if (!gsmi_kobj) {
885 printk(KERN_INFO "gsmi: Failed to create firmware kobj\n");
886 goto out_err;
887 }
888
889 /* Setup eventlog access */
890 ret = sysfs_create_bin_file(gsmi_kobj, &eventlog_bin_attr);
891 if (ret) {
892 printk(KERN_INFO "gsmi: Failed to setup eventlog");
893 goto out_err;
894 }
895
896 /* Other attributes */
897 ret = sysfs_create_files(gsmi_kobj, gsmi_attrs);
898 if (ret) {
899 printk(KERN_INFO "gsmi: Failed to add attrs");
900 goto out_remove_bin_file;
901 }
902
903 ret = efivars_register(&efivars, &efivar_ops, gsmi_kobj);
904 if (ret) {
905 printk(KERN_INFO "gsmi: Failed to register efivars\n");
906 goto out_remove_sysfs_files;
907 }
908
> 909 register_thermal_notifier(&gsmi_thermal_notifier);
910 register_reboot_notifier(&gsmi_reboot_notifier);
911 register_die_notifier(&gsmi_die_notifier);
912 atomic_notifier_chain_register(&panic_notifier_list,
913 &gsmi_panic_notifier);
914
915 printk(KERN_INFO "gsmi version " DRIVER_VERSION " loaded\n");
916
917 return 0;
918
919 out_remove_sysfs_files:
920 sysfs_remove_files(gsmi_kobj, gsmi_attrs);
921 out_remove_bin_file:
922 sysfs_remove_bin_file(gsmi_kobj, &eventlog_bin_attr);
923 out_err:
924 kobject_put(gsmi_kobj);
925 gsmi_buf_free(gsmi_dev.param_buf);
926 gsmi_buf_free(gsmi_dev.data_buf);
927 gsmi_buf_free(gsmi_dev.name_buf);
928 dma_pool_destroy(gsmi_dev.dma_pool);
929 platform_device_unregister(gsmi_dev.pdev);
930 pr_info("gsmi: failed to load: %d\n", ret);
931 return ret;
932 }
933
934 static void __exit gsmi_exit(void)
935 {
> 936 unregister_thermal_notifier(&gsmi_thermal_notifier);
937 unregister_reboot_notifier(&gsmi_reboot_notifier);
938 unregister_die_notifier(&gsmi_die_notifier);
939 atomic_notifier_chain_unregister(&panic_notifier_list,
940 &gsmi_panic_notifier);
941 efivars_unregister(&efivars);
942
943 sysfs_remove_files(gsmi_kobj, gsmi_attrs);
944 sysfs_remove_bin_file(gsmi_kobj, &eventlog_bin_attr);
945 kobject_put(gsmi_kobj);
946 gsmi_buf_free(gsmi_dev.param_buf);
947 gsmi_buf_free(gsmi_dev.data_buf);
948 gsmi_buf_free(gsmi_dev.name_buf);
949 dma_pool_destroy(gsmi_dev.dma_pool);
950 platform_device_unregister(gsmi_dev.pdev);
951 }
952

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation


Attachments:
(No filename) (7.42 kB)
.config.gz (28.86 kB)
Download all attachments