2010-07-28 19:09:22

by Fenghua Yu

[permalink] [raw]
Subject: [PATH V2 0/5] Package Level Thermal Control and Power Limit Notification

From: Fenghua Yu <[email protected]>

The new MSRs PACKAGE_THERM_STATUS and PACKAGE_THERM_INTERRUPT and power limit
notification are published in Intel 64 and IA32 SDM and first implemented on
Intel Sandy Bridge platform. The following patch set supports the new features
in Linux kernel.

Fenghua Yu (5):
Package Level Thermal Control and Power Limit Notification: enable
features
Package Level Thermal Control and Power Limit Notification: pkgtemp
hwmon driver
Package Level Thermal Control and Power Limit Notification: thermal
throttling handler
Package Level Thermal Control and Power Limit Notification: power
limit
Package Level Thermal Control and Power Limit Notification: pkgtemp
doc

Documentation/hwmon/pkgtemp | 36 +++
arch/x86/configs/i386_defconfig | 1 +
arch/x86/configs/x86_64_defconfig | 1 +
arch/x86/include/asm/cpufeature.h | 2 +
arch/x86/include/asm/msr-index.h | 17 +-
arch/x86/kernel/cpu/addon_cpuid_features.c | 2 +
arch/x86/kernel/cpu/mcheck/therm_throt.c | 209 +++++++++++---
drivers/hwmon/Kconfig | 7 +
drivers/hwmon/Makefile | 1 +
drivers/hwmon/pkgtemp.c | 456 ++++++++++++++++++++++++++++
10 files changed, 691 insertions(+), 41 deletions(-)
create mode 100644 Documentation/hwmon/pkgtemp
create mode 100644 drivers/hwmon/pkgtemp.c

--
1.7.2


2010-07-28 19:09:45

by Fenghua Yu

[permalink] [raw]
Subject: [PATH V2 4/5] Package Level Thermal Control and Power Limit Notification: power limit

From: Fenghua Yu <[email protected]>

Power limit notification feature is published in Intel 64 and IA-32
Architectures SDMV Vol 3A 14.5.6 Power Limit Notification.

It is implemented first on Intel Sandy Bridge platform.

The patch handles notification interrupt. Interrupt handler dumps power limit
information in log_buf, logs the event in mce log, and increases the event
counters (core_power_limit and package_power_limit). Upper level applications
could use the data to detect system health or diagnose functionality/performance
issues.

In the future, the event could be handled in a more fancy way.

Signed-off-by: Fenghua Yu <[email protected]>
Reviewed-by: Len Brown <[email protected]>
---
arch/x86/include/asm/msr-index.h | 4 +
arch/x86/kernel/cpu/mcheck/therm_throt.c | 186 +++++++++++++++++++++---------
2 files changed, 136 insertions(+), 54 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f1c1b82..f3c8a7a 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -226,10 +226,12 @@

#define THERM_INT_HIGH_ENABLE (1 << 0)
#define THERM_INT_LOW_ENABLE (1 << 1)
+#define THERM_INT_PLN_ENABLE (1 << 24)

#define MSR_IA32_THERM_STATUS 0x0000019c

#define THERM_STATUS_PROCHOT (1 << 0)
+#define THERM_STATUS_POWER_LIMIT (1 << 10)

#define MSR_THERM2_CTL 0x0000019d

@@ -242,11 +244,13 @@
#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1

#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
+#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)

#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2

#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
+#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)

/* MISC_ENABLE bits: architectural */
#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d307f9f..0c3d1e0 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,20 +34,25 @@
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)

+#define THERMAL_THROTTLING_EVENT 0
+#define POWER_LIMIT_EVENT 1
+
/*
- * Current thermal throttling state:
+ * Current thermal event state:
*/
struct _thermal_state {
- bool is_throttled;
-
+ bool new_event;
+ int event;
u64 next_check;
- unsigned long throttle_count;
- unsigned long last_throttle_count;
+ unsigned long count;
+ unsigned long last_count;
};

struct thermal_state {
- struct _thermal_state core;
- struct _thermal_state package;
+ struct _thermal_state core_throttle;
+ struct _thermal_state core_power_limit;
+ struct _thermal_state package_throttle;
+ struct _thermal_state package_power_limit;
};

static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -62,9 +67,9 @@ static u32 lvtthmr_init __read_mostly;
therm_throt_sysdev_show_##_name, \
NULL) \

-#define define_therm_throt_sysdev_show_func(level, name) \
+#define define_therm_throt_sysdev_show_func(event, name) \
\
-static ssize_t therm_throt_sysdev_show_##level##_##name( \
+static ssize_t therm_throt_sysdev_show_##event##_##name( \
struct sys_device *dev, \
struct sysdev_attribute *attr, \
char *buf) \
@@ -75,7 +80,7 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \
preempt_disable(); /* CPU hotplug */ \
if (cpu_online(cpu)) { \
ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).level.name); \
+ per_cpu(thermal_state, cpu).event.name); \
} else \
ret = 0; \
preempt_enable(); \
@@ -83,23 +88,32 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \
return ret; \
}

-define_therm_throt_sysdev_show_func(core, throttle_count);
+define_therm_throt_sysdev_show_func(core_throttle, count);
define_therm_throt_sysdev_one_ro(core_throttle_count);

-define_therm_throt_sysdev_show_func(package, throttle_count);
+define_therm_throt_sysdev_show_func(core_power_limit, count);
+define_therm_throt_sysdev_one_ro(core_power_limit_count);
+
+define_therm_throt_sysdev_show_func(package_throttle, count);
define_therm_throt_sysdev_one_ro(package_throttle_count);

+define_therm_throt_sysdev_show_func(package_power_limit, count);
+define_therm_throt_sysdev_one_ro(package_power_limit_count);
+
static struct attribute *thermal_throttle_attrs[] = {
&attr_core_throttle_count.attr,
NULL
};

-static struct attribute_group thermal_throttle_attr_group = {
+static struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */

+#define CORE_LEVEL 0
+#define PACKAGE_LEVEL 1
+
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@@ -116,49 +130,73 @@ static struct attribute_group thermal_throttle_attr_group = {
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
-#define CORE_LEVEL 0
-#define PACKAGE_LEVEL 1
-static int therm_throt_process(bool is_throttled, int level)
+static int therm_throt_process(bool new_event, int event, int level)
{
struct _thermal_state *state;
- unsigned int this_cpu;
- bool was_throttled;
+ unsigned int this_cpu = smp_processor_id();
+ bool old_event;
u64 now;
+ struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+
+ if (!new_event)
+ return 0;

- this_cpu = smp_processor_id();
now = get_jiffies_64();
- if (level == CORE_LEVEL)
- state = &per_cpu(thermal_state, this_cpu).core;
- else
- state = &per_cpu(thermal_state, this_cpu).package;
+ if (level == CORE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->core_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->core_power_limit;
+ else
+ return 0;
+ } else if (level == PACKAGE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->package_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->package_power_limit;
+ else
+ return 0;
+ } else
+ return 0;

- was_throttled = state->is_throttled;
- state->is_throttled = is_throttled;
+ old_event = state->new_event;
+ state->new_event = new_event;

- if (is_throttled)
- state->throttle_count++;
+ if (new_event)
+ state->count++;

if (time_before64(now, state->next_check) &&
- state->throttle_count != state->last_throttle_count)
+ state->count != state->last_count)
return 0;

state->next_check = now + CHECK_INTERVAL;
- state->last_throttle_count = state->throttle_count;
+ state->last_count = state->count;

/* if we just entered the thermal event */
- if (is_throttled) {
- printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package",
- state->throttle_count);
+ if (new_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ else
+ printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);

add_taint(TAINT_MACHINE_CHECK);
return 1;
}
- if (was_throttled) {
- printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package");
+ if (old_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
+ else
+ printk(KERN_INFO "CPU%d: %s power limit normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
return 1;
}

@@ -172,21 +210,29 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
int err;
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());

- err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
if (err)
return err;

+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_core_power_limit_count.attr,
+ thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PTS))
err = sysfs_add_file_to_group(&sys_dev->kobj,
&attr_package_throttle_count.attr,
- thermal_throttle_attr_group.name);
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_power_limit_count.attr,
+ thermal_attr_group.name);

return err;
}

static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
{
- sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
}

/* Mutex protecting device creation against CPU hotplug: */
@@ -257,6 +303,17 @@ device_initcall(thermal_throttle_init_device);

#endif /* CONFIG_SYSFS */

+/*
+ * Set up the most two significant bit to notify mce log that this thermal
+ * event type.
+ * This is a temp solution. May be changed in the future with mce log
+ * infrasture.
+ */
+#define CORE_THROTTLED (0)
+#define CORE_POWER_LIMIT ((__u64)1 << 62)
+#define PACKAGE_THROTTLED ((__u64)2 << 62)
+#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
+
/* Thermal transition interrupt handler */
static void intel_thermal_interrupt(void)
{
@@ -264,21 +321,31 @@ static void intel_thermal_interrupt(void)
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());

rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+
if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
CORE_LEVEL) != 0)
- mce_log_therm_throt_event(msr_val);
+ mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ CORE_LEVEL) != 0)
+ mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);

if (cpu_has(c, X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
PACKAGE_LEVEL) != 0)
- /*
- * Set up the most significant bit to notify mce log
- * that this thermal event is a package level event.
- * This is a temp solution. May be changed in the future
- * with mce log infrasture.
- */
- mce_log_therm_throt_event(((__u64)1 << 63) | msr_val);
+ mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val &
+ PACKAGE_THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ PACKAGE_LEVEL) != 0)
+ mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
+ | msr_val);
}
}

@@ -381,14 +448,25 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
apic_write(APIC_LVTTHMR, h);

rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE
+ | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);

if (cpu_has(c, X86_FEATURE_PTS)) {
rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE
+ | PACKAGE_THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE), h);
}

smp_thermal_vector = intel_thermal_interrupt;
--
1.7.2

2010-07-28 19:09:24

by Fenghua Yu

[permalink] [raw]
Subject: [PATH V2 1/5] Package Level Thermal Control and Power Limit Notification: enable features

From: Fenghua Yu <[email protected]>

Add package level thermal and power limit feature support in the kernel.

The two MSRs and features are new starting with Intel's Sandy Bridge processor.

Please check Intel 64 and IA-32 Architectures SDMV Vol 3A 14.5.6 Power Limit
Notification and 14.6 Package Level Thermal Management.

Signed-off-by: Fenghua Yu <[email protected]>
Reviewed-by: Len Brown <[email protected]>
---
arch/x86/include/asm/cpufeature.h | 2 ++
arch/x86/include/asm/msr-index.h | 3 +++
arch/x86/kernel/cpu/addon_cpuid_features.c | 2 ++
3 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4681459..79517b5 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -162,6 +162,8 @@
#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_PLN (7*32+ 3) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS (7*32+ 4) /* Intel Package Thermal Status */

/* Virtualization flags: Linux defined */
#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8c7ae43..5eaad6f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -239,6 +239,9 @@

#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2

+#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
+#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
+
/* MISC_ENABLE bits: architectural */
#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 10fa568..c1f4b98 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+ { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006 },
+ { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
{ X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
--
1.7.2

2010-07-28 19:09:41

by Fenghua Yu

[permalink] [raw]
Subject: [PATH V2 5/5] Package Level Thermal Control and Power Limit Notification: pkgtemp doc

From: Fenghua Yu <[email protected]>

Document for package level thermal hwmon driver.

Signed-off-by: Fenghua Yu <[email protected]>
Reviewed-by: Len Brown <[email protected]>
Reviewed-by: Guenter Roeck <[email protected]>
---
Documentation/hwmon/pkgtemp | 36 ++++++++++++++++++++++++++++++++++++
1 files changed, 36 insertions(+), 0 deletions(-)
create mode 100644 Documentation/hwmon/pkgtemp

diff --git a/Documentation/hwmon/pkgtemp b/Documentation/hwmon/pkgtemp
new file mode 100644
index 0000000..c8e1fb0
--- /dev/null
+++ b/Documentation/hwmon/pkgtemp
@@ -0,0 +1,36 @@
+Kernel driver pkgtemp
+======================
+
+Supported chips:
+ * Intel family
+ Prefix: 'pkgtemp'
+ CPUID:
+ Datasheet: Intel 64 and IA-32 Architectures Software Developer's Manual
+ Volume 3A: System Programming Guide
+
+Author: Fenghua Yu
+
+Description
+-----------
+
+This driver permits reading package level temperature sensor embedded inside
+Intel CPU package. The sensors can be in core, uncore, memory controller, or
+other components in a package. The feature is first implemented in Intel Sandy
+Bridge platform.
+
+Temperature is measured in degrees Celsius and measurement resolution is
+1 degree C. Valid temperatures are from 0 to TjMax degrees C, because the actual
+value of temperature register is in fact a delta from TjMax.
+
+Temperature known as TjMax is the maximum junction temperature of package.
+We get this from MSR_IA32_TEMPERATURE_TARGET. If the MSR is not accessible,
+we define TjMax as 100 degrees Celsius. At this temperature, protection
+mechanism will perform actions to forcibly cool down the package. Alarm
+may be raised, if the temperature grows enough (more than TjMax) to trigger
+the Out-Of-Spec bit. Following table summarizes the exported sysfs files:
+
+temp1_input - Package temperature (in millidegrees Celsius).
+temp1_max - All cooling devices should be turned on.
+temp1_crit - Maximum junction temperature (in millidegrees Celsius).
+temp1_crit_alarm - Set when Out-of-spec bit is set, never clears.
+ Correct CPU operation is no longer guaranteed.
--
1.7.2

2010-07-28 19:10:04

by Fenghua Yu

[permalink] [raw]
Subject: [PATH V2 3/5] Package Level Thermal Control and Power Limit Notification: thermal throttling handler

From: Fenghua Yu <[email protected]>

Add package level thermal throttle interrupt support. The interrupt handler
increases package level thermal throttle count. It also logs the event in MCE
log.

The package level thermal throttle interrupt happens across threads in a
package. Each thread handles the interrupt individually. User level application
is supposed to retrieve correct event count and log based on package/thread
topology. This is the same situation for core level interrupt handler.

core_throttle_count and package_throttle_count are used for user interface.
Previously only throttle_count is used for core throttle count. If you think
new core_throttle_count name breaks user interface, I can change this part.

This patch also fixes a bug which defines reverse THERM_INT_LOW_ENABLE bit and
THERM_INT_HIGH_ENABLE bit.

Signed-off-by: Fenghua Yu <[email protected]>
Reviewed-by: Len Brown <[email protected]>
---
arch/x86/include/asm/msr-index.h | 10 +++-
arch/x86/kernel/cpu/mcheck/therm_throt.c | 89 ++++++++++++++++++++++++------
2 files changed, 79 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5eaad6f..f1c1b82 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -224,8 +224,8 @@
#define MSR_IA32_THERM_CONTROL 0x0000019a
#define MSR_IA32_THERM_INTERRUPT 0x0000019b

-#define THERM_INT_LOW_ENABLE (1 << 0)
-#define THERM_INT_HIGH_ENABLE (1 << 1)
+#define THERM_INT_HIGH_ENABLE (1 << 0)
+#define THERM_INT_LOW_ENABLE (1 << 1)

#define MSR_IA32_THERM_STATUS 0x0000019c

@@ -240,8 +240,14 @@
#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2

#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
+
+#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
+
#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2

+#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
+#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
+
/* MISC_ENABLE bits: architectural */
#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3b..d307f9f 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -37,7 +37,7 @@
/*
* Current thermal throttling state:
*/
-struct thermal_state {
+struct _thermal_state {
bool is_throttled;

u64 next_check;
@@ -45,6 +45,11 @@ struct thermal_state {
unsigned long last_throttle_count;
};

+struct thermal_state {
+ struct _thermal_state core;
+ struct _thermal_state package;
+};
+
static DEFINE_PER_CPU(struct thermal_state, thermal_state);

static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -53,11 +58,13 @@ static u32 lvtthmr_init __read_mostly;

#ifdef CONFIG_SYSFS
#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+ static SYSDEV_ATTR(_name, 0444, \
+ therm_throt_sysdev_show_##_name, \
+ NULL) \

-#define define_therm_throt_sysdev_show_func(name) \
+#define define_therm_throt_sysdev_show_func(level, name) \
\
-static ssize_t therm_throt_sysdev_show_##name( \
+static ssize_t therm_throt_sysdev_show_##level##_##name( \
struct sys_device *dev, \
struct sysdev_attribute *attr, \
char *buf) \
@@ -66,21 +73,24 @@ static ssize_t therm_throt_sysdev_show_##name( \
ssize_t ret; \
\
preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) \
+ if (cpu_online(cpu)) { \
ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).name); \
- else \
+ per_cpu(thermal_state, cpu).level.name); \
+ } else \
ret = 0; \
preempt_enable(); \
\
return ret; \
}

-define_therm_throt_sysdev_show_func(throttle_count);
-define_therm_throt_sysdev_one_ro(throttle_count);
+define_therm_throt_sysdev_show_func(core, throttle_count);
+define_therm_throt_sysdev_one_ro(core_throttle_count);
+
+define_therm_throt_sysdev_show_func(package, throttle_count);
+define_therm_throt_sysdev_one_ro(package_throttle_count);

static struct attribute *thermal_throttle_attrs[] = {
- &attr_throttle_count.attr,
+ &attr_core_throttle_count.attr,
NULL
};

@@ -106,16 +116,21 @@ static struct attribute_group thermal_throttle_attr_group = {
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
-static int therm_throt_process(bool is_throttled)
+#define CORE_LEVEL 0
+#define PACKAGE_LEVEL 1
+static int therm_throt_process(bool is_throttled, int level)
{
- struct thermal_state *state;
+ struct _thermal_state *state;
unsigned int this_cpu;
bool was_throttled;
u64 now;

this_cpu = smp_processor_id();
now = get_jiffies_64();
- state = &per_cpu(thermal_state, this_cpu);
+ if (level == CORE_LEVEL)
+ state = &per_cpu(thermal_state, this_cpu).core;
+ else
+ state = &per_cpu(thermal_state, this_cpu).package;

was_throttled = state->is_throttled;
state->is_throttled = is_throttled;
@@ -132,13 +147,18 @@ static int therm_throt_process(bool is_throttled)

/* if we just entered the thermal event */
if (is_throttled) {
- printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
+ printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->throttle_count);

add_taint(TAINT_MACHINE_CHECK);
return 1;
}
if (was_throttled) {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
+ printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
return 1;
}

@@ -149,8 +169,19 @@ static int therm_throt_process(bool is_throttled)
/* Add/Remove thermal_throttle interface for CPU device: */
static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
{
- return sysfs_create_group(&sys_dev->kobj,
- &thermal_throttle_attr_group);
+ int err;
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ if (err)
+ return err;
+
+ if (cpu_has(c, X86_FEATURE_PTS))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_throttle_count.attr,
+ thermal_throttle_attr_group.name);
+
+ return err;
}

static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -230,10 +261,25 @@ device_initcall(thermal_throttle_init_device);
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());

rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
+ if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+ CORE_LEVEL) != 0)
mce_log_therm_throt_event(msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+ if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ PACKAGE_LEVEL) != 0)
+ /*
+ * Set up the most significant bit to notify mce log
+ * that this thermal event is a package level event.
+ * This is a temp solution. May be changed in the future
+ * with mce log infrasture.
+ */
+ mce_log_therm_throt_event(((__u64)1 << 63) | msr_val);
+ }
}

static void unexpected_thermal_interrupt(void)
@@ -338,6 +384,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
wrmsr(MSR_IA32_THERM_INTERRUPT,
l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);

+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+ }
+
smp_thermal_vector = intel_thermal_interrupt;

rdmsr(MSR_IA32_MISC_ENABLE, l, h);
--
1.7.2

2010-07-28 19:10:07

by Fenghua Yu

[permalink] [raw]
Subject: [PATH V2 2/5] Package Level Thermal Control and Power Limit Notification: pkgtemp hwmon driver

From: Fenghua Yu <[email protected]>

This patch adds a hwmon driver for package level thermal control. The driver
dumps package level thermal information through sysfs interface so that upper
level application (e.g. lm_sensor) can retrive the information.

Instead of having the package level hwmon code in coretemp, I write a seperate
driver pkgtemp because:

First, package level thermal sensors include not only sensors for each core,
but also sensors for uncore, memory controller or other components in the
package. Logically it will be clear to have a seperate hwmon driver for package
level hwmon to monitor wider range of sensors in a package. Merging package
thermal driver into core thermal driver doesn't make sense and may mislead.

Secondly, merging the two drivers together may cause coding mess. It's easier
to include various package level sensors info if more sensor information is
implemented. Coretemp code needs to consider a lot of legacy machine cases.
Pkgtemp code only considers platform starting from Sandy Bridge.

On a 1Sx4Cx2T Sandy Bridge platform, lm-sensors dumps the pkgtemp and coretemp:

pkgtemp-isa-0000
Adapter: ISA adapter
physical id 0: +33.0°C (high = +79.0°C, crit = +99.0°C)

coretemp-isa-0000
Adapter: ISA adapter
Core 0: +32.0°C (high = +79.0°C, crit = +99.0°C)

coretemp-isa-0001
Adapter: ISA adapter
Core 1: +32.0°C (high = +79.0°C, crit = +99.0°C)

coretemp-isa-0002
Adapter: ISA adapter
Core 2: +32.0°C (high = +79.0°C, crit = +99.0°C)

coretemp-isa-0003
Adapter: ISA adapter
Core 3: +32.0°C (high = +79.0°C, crit = +99.0°C)

Signed-off-by: Fenghua Yu <[email protected]>
Reviewed-by: Len Brown <[email protected]>
Reviewed-by: Guenter Roeck <[email protected]>
---
arch/x86/configs/i386_defconfig | 1 +
arch/x86/configs/x86_64_defconfig | 1 +
drivers/hwmon/Kconfig | 7 +
drivers/hwmon/Makefile | 1 +
drivers/hwmon/pkgtemp.c | 456 +++++++++++++++++++++++++++++++++++++
5 files changed, 466 insertions(+), 0 deletions(-)
create mode 100644 drivers/hwmon/pkgtemp.c

diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index d28fad1..e3a3243 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1471,6 +1471,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
# CONFIG_SENSORS_CORETEMP is not set
+# CONFIG_SENSORS_PKGTEMP is not set
# CONFIG_SENSORS_IT87 is not set
# CONFIG_SENSORS_LM63 is not set
# CONFIG_SENSORS_LM75 is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6c86acd..4251f83 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1456,6 +1456,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
# CONFIG_SENSORS_CORETEMP is not set
+# CONFIG_SENSORS_PKGTEMP is not set
# CONFIG_SENSORS_IT87 is not set
# CONFIG_SENSORS_LM63 is not set
# CONFIG_SENSORS_LM75 is not set
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index e19cf8e..3a858e8 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -407,6 +407,13 @@ config SENSORS_CORETEMP
sensor inside your CPU. Most of the family 6 CPUs
are supported. Check documentation/driver for details.

+config SENSORS_PKGTEMP
+ tristate "Intel processor package temperature sensor"
+ depends on X86 && PCI && EXPERIMENTAL
+ help
+ If you say yes here you get support for the package level temperature
+ sensor inside your CPU. Check documentation/driver for details.
+
config SENSORS_IBMAEM
tristate "IBM Active Energy Manager temperature/power sensors and control"
select IPMI_SI
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 2138ceb..879814e 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_SENSORS_AMS) += ams/
obj-$(CONFIG_SENSORS_ASC7621) += asc7621.o
obj-$(CONFIG_SENSORS_ATXP1) += atxp1.o
obj-$(CONFIG_SENSORS_CORETEMP) += coretemp.o
+obj-$(CONFIG_SENSORS_PKGTEMP) += pkgtemp.o
obj-$(CONFIG_SENSORS_DME1737) += dme1737.o
obj-$(CONFIG_SENSORS_DS1621) += ds1621.o
obj-$(CONFIG_SENSORS_EMC1403) += emc1403.o
diff --git a/drivers/hwmon/pkgtemp.c b/drivers/hwmon/pkgtemp.c
new file mode 100644
index 0000000..8dc8bb7
--- /dev/null
+++ b/drivers/hwmon/pkgtemp.c
@@ -0,0 +1,456 @@
+/*
+ * pkgtemp.c - Linux kernel module for processor package hardware monitoring
+ *
+ * Copyright (C) 2010 Fenghua Yu <[email protected]>
+ *
+ * Inspired from many hwmon drivers especially coretemp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA.
+ */
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/hwmon.h>
+#include <linux/sysfs.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/platform_device.h>
+#include <linux/cpu.h>
+#include <linux/pci.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+
+#define DRVNAME "pkgtemp"
+
+enum { SHOW_TEMP, SHOW_TJMAX, SHOW_TTARGET, SHOW_LABEL, SHOW_NAME } SHOW;
+
+/*
+ * Functions declaration
+ */
+
+static struct pkgtemp_data *pkgtemp_update_device(struct device *dev);
+
+struct pkgtemp_data {
+ struct device *hwmon_dev;
+ struct mutex update_lock;
+ const char *name;
+ u32 id;
+ u16 phys_proc_id;
+ char valid; /* zero until following fields are valid */
+ unsigned long last_updated; /* in jiffies */
+ int temp;
+ int tjmax;
+ int ttarget;
+ u8 alarm;
+};
+
+/*
+ * Sysfs stuff
+ */
+
+static ssize_t show_name(struct device *dev, struct device_attribute
+ *devattr, char *buf)
+{
+ int ret;
+ struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
+ struct pkgtemp_data *data = dev_get_drvdata(dev);
+
+ if (attr->index == SHOW_NAME)
+ ret = sprintf(buf, "%s\n", data->name);
+ else /* show label */
+ ret = sprintf(buf, "physical id %d\n",
+ data->phys_proc_id);
+ return ret;
+}
+
+static ssize_t show_alarm(struct device *dev, struct device_attribute
+ *devattr, char *buf)
+{
+ struct pkgtemp_data *data = pkgtemp_update_device(dev);
+ /* read the Out-of-spec log, never clear */
+ return sprintf(buf, "%d\n", data->alarm);
+}
+
+static ssize_t show_temp(struct device *dev,
+ struct device_attribute *devattr, char *buf)
+{
+ struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
+ struct pkgtemp_data *data = pkgtemp_update_device(dev);
+ int err = 0;
+
+ if (attr->index == SHOW_TEMP)
+ err = data->valid ? sprintf(buf, "%d\n", data->temp) : -EAGAIN;
+ else if (attr->index == SHOW_TJMAX)
+ err = sprintf(buf, "%d\n", data->tjmax);
+ else
+ err = sprintf(buf, "%d\n", data->ttarget);
+ return err;
+}
+
+static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, show_temp, NULL, SHOW_TEMP);
+static SENSOR_DEVICE_ATTR(temp1_crit, S_IRUGO, show_temp, NULL, SHOW_TJMAX);
+static SENSOR_DEVICE_ATTR(temp1_max, S_IRUGO, show_temp, NULL, SHOW_TTARGET);
+static DEVICE_ATTR(temp1_crit_alarm, S_IRUGO, show_alarm, NULL);
+static SENSOR_DEVICE_ATTR(temp1_label, S_IRUGO, show_name, NULL, SHOW_LABEL);
+static SENSOR_DEVICE_ATTR(name, S_IRUGO, show_name, NULL, SHOW_NAME);
+
+static struct attribute *pkgtemp_attributes[] = {
+ &sensor_dev_attr_name.dev_attr.attr,
+ &sensor_dev_attr_temp1_label.dev_attr.attr,
+ &dev_attr_temp1_crit_alarm.attr,
+ &sensor_dev_attr_temp1_input.dev_attr.attr,
+ &sensor_dev_attr_temp1_crit.dev_attr.attr,
+ NULL
+};
+
+static const struct attribute_group pkgtemp_group = {
+ .attrs = pkgtemp_attributes,
+};
+
+static struct pkgtemp_data *pkgtemp_update_device(struct device *dev)
+{
+ struct pkgtemp_data *data = dev_get_drvdata(dev);
+ unsigned int cpu;
+ int err;
+
+ mutex_lock(&data->update_lock);
+
+ if (!data->valid || time_after(jiffies, data->last_updated + HZ)) {
+ u32 eax, edx;
+
+ data->valid = 0;
+ cpu = data->id;
+ err = rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_STATUS,
+ &eax, &edx);
+ if (!err) {
+ data->alarm = (eax >> 5) & 1;
+ data->temp = data->tjmax - (((eax >> 16)
+ & 0x7f) * 1000);
+ data->valid = 1;
+ } else
+ dev_dbg(dev, "Temperature data invalid (0x%x)\n", eax);
+
+ data->last_updated = jiffies;
+ }
+
+ mutex_unlock(&data->update_lock);
+ return data;
+}
+
+static int get_tjmax(int cpu, struct device *dev)
+{
+ int default_tjmax = 100000;
+ int err;
+ u32 eax, edx;
+ u32 val;
+
+ /* IA32_TEMPERATURE_TARGET contains the TjMax value */
+ err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
+ if (!err) {
+ val = (eax >> 16) & 0xff;
+ if ((val > 80) && (val < 120)) {
+ dev_info(dev, "TjMax is %d C.\n", val);
+ return val * 1000;
+ }
+ }
+ dev_warn(dev, "Unable to read TjMax from CPU.\n");
+ return default_tjmax;
+}
+
+static int __devinit pkgtemp_probe(struct platform_device *pdev)
+{
+ struct pkgtemp_data *data;
+ int err;
+ u32 eax, edx;
+#ifdef CONFIG_SMP
+ struct cpuinfo_x86 *c = &cpu_data(pdev->id);
+#endif
+
+ data = kzalloc(sizeof(struct pkgtemp_data), GFP_KERNEL);
+ if (!data) {
+ err = -ENOMEM;
+ dev_err(&pdev->dev, "Out of memory\n");
+ goto exit;
+ }
+
+ data->id = pdev->id;
+#ifdef CONFIG_SMP
+ data->phys_proc_id = c->phys_proc_id;
+#endif
+ data->name = "pkgtemp";
+ mutex_init(&data->update_lock);
+
+ /* test if we can access the THERM_STATUS MSR */
+ err = rdmsr_safe_on_cpu(data->id, MSR_IA32_PACKAGE_THERM_STATUS,
+ &eax, &edx);
+ if (err) {
+ dev_err(&pdev->dev,
+ "Unable to access THERM_STATUS MSR, giving up\n");
+ goto exit_free;
+ }
+
+ data->tjmax = get_tjmax(data->id, &pdev->dev);
+ platform_set_drvdata(pdev, data);
+
+ err = rdmsr_safe_on_cpu(data->id, MSR_IA32_TEMPERATURE_TARGET,
+ &eax, &edx);
+ if (err) {
+ dev_warn(&pdev->dev, "Unable to read"
+ " IA32_TEMPERATURE_TARGET MSR\n");
+ } else {
+ data->ttarget = data->tjmax - (((eax >> 8) & 0xff) * 1000);
+ err = device_create_file(&pdev->dev,
+ &sensor_dev_attr_temp1_max.dev_attr);
+ if (err)
+ goto exit_free;
+ }
+
+ err = sysfs_create_group(&pdev->dev.kobj, &pkgtemp_group);
+ if (err)
+ goto exit_free;
+
+ data->hwmon_dev = hwmon_device_register(&pdev->dev);
+ if (IS_ERR(data->hwmon_dev)) {
+ err = PTR_ERR(data->hwmon_dev);
+ dev_err(&pdev->dev, "Class registration failed (%d)\n",
+ err);
+ goto exit_class;
+ }
+
+ return 0;
+
+exit_class:
+ sysfs_remove_group(&pdev->dev.kobj, &pkgtemp_group);
+exit_free:
+ kfree(data);
+exit:
+ return err;
+}
+
+static int __devexit pkgtemp_remove(struct platform_device *pdev)
+{
+ struct pkgtemp_data *data = platform_get_drvdata(pdev);
+
+ hwmon_device_unregister(data->hwmon_dev);
+ sysfs_remove_group(&pdev->dev.kobj, &pkgtemp_group);
+ platform_set_drvdata(pdev, NULL);
+ kfree(data);
+ return 0;
+}
+
+static struct platform_driver pkgtemp_driver = {
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = DRVNAME,
+ },
+ .probe = pkgtemp_probe,
+ .remove = __devexit_p(pkgtemp_remove),
+};
+
+struct pdev_entry {
+ struct list_head list;
+ struct platform_device *pdev;
+ unsigned int cpu;
+#ifdef CONFIG_SMP
+ u16 phys_proc_id;
+#endif
+};
+
+static LIST_HEAD(pdev_list);
+static DEFINE_MUTEX(pdev_list_mutex);
+
+static int __cpuinit pkgtemp_device_add(unsigned int cpu)
+{
+ int err;
+ struct platform_device *pdev;
+ struct pdev_entry *pdev_entry;
+#ifdef CONFIG_SMP
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+#endif
+
+ mutex_lock(&pdev_list_mutex);
+
+#ifdef CONFIG_SMP
+ /* Only keep the first entry in each package */
+ list_for_each_entry(pdev_entry, &pdev_list, list) {
+ if (c->phys_proc_id == pdev_entry->phys_proc_id) {
+ err = 0; /* Not an error */
+ goto exit;
+ }
+ }
+#endif
+
+ pdev = platform_device_alloc(DRVNAME, cpu);
+ if (!pdev) {
+ err = -ENOMEM;
+ printk(KERN_ERR DRVNAME ": Device allocation failed\n");
+ goto exit;
+ }
+
+ pdev_entry = kzalloc(sizeof(struct pdev_entry), GFP_KERNEL);
+ if (!pdev_entry) {
+ err = -ENOMEM;
+ goto exit_device_put;
+ }
+
+ err = platform_device_add(pdev);
+ if (err) {
+ printk(KERN_ERR DRVNAME ": Device addition failed (%d)\n",
+ err);
+ goto exit_device_free;
+ }
+
+#ifdef CONFIG_SMP
+ pdev_entry->phys_proc_id = c->phys_proc_id;
+#endif
+ pdev_entry->pdev = pdev;
+ pdev_entry->cpu = cpu;
+ list_add_tail(&pdev_entry->list, &pdev_list);
+ mutex_unlock(&pdev_list_mutex);
+
+ return 0;
+
+exit_device_free:
+ kfree(pdev_entry);
+exit_device_put:
+ platform_device_put(pdev);
+exit:
+ mutex_unlock(&pdev_list_mutex);
+ return err;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void pkgtemp_device_remove(unsigned int cpu)
+{
+ struct pdev_entry *p, *n;
+ unsigned int i;
+ int err;
+
+ mutex_lock(&pdev_list_mutex);
+ list_for_each_entry_safe(p, n, &pdev_list, list) {
+ if (p->cpu != cpu)
+ continue;
+
+ platform_device_unregister(p->pdev);
+ list_del(&p->list);
+ kfree(p);
+ for_each_cpu(i, cpu_core_mask(cpu)) {
+ if (i != cpu) {
+ err = pkgtemp_device_add(i);
+ if (!err)
+ break;
+ }
+ }
+ break;
+ }
+ mutex_unlock(&pdev_list_mutex);
+}
+
+static int __cpuinit pkgtemp_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long) hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ pkgtemp_device_add(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ pkgtemp_device_remove(cpu);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block pkgtemp_cpu_notifier __refdata = {
+ .notifier_call = pkgtemp_cpu_callback,
+};
+#endif /* !CONFIG_HOTPLUG_CPU */
+
+static int __init pkgtemp_init(void)
+{
+ int i, err = -ENODEV;
+ struct pdev_entry *p, *n;
+
+ /* quick check if we run Intel */
+ if (cpu_data(0).x86_vendor != X86_VENDOR_INTEL)
+ goto exit;
+
+ err = platform_driver_register(&pkgtemp_driver);
+ if (err)
+ goto exit;
+
+ for_each_online_cpu(i) {
+ struct cpuinfo_x86 *c = &cpu_data(i);
+
+ if (!cpu_has(c, X86_FEATURE_PTS))
+ continue;
+
+ err = pkgtemp_device_add(i);
+ if (err)
+ goto exit_devices_unreg;
+ }
+ if (list_empty(&pdev_list)) {
+ err = -ENODEV;
+ goto exit_driver_unreg;
+ }
+
+#ifdef CONFIG_HOTPLUG_CPU
+ register_hotcpu_notifier(&pkgtemp_cpu_notifier);
+#endif
+ return 0;
+
+exit_devices_unreg:
+ mutex_lock(&pdev_list_mutex);
+ list_for_each_entry_safe(p, n, &pdev_list, list) {
+ platform_device_unregister(p->pdev);
+ list_del(&p->list);
+ kfree(p);
+ }
+ mutex_unlock(&pdev_list_mutex);
+exit_driver_unreg:
+ platform_driver_unregister(&pkgtemp_driver);
+exit:
+ return err;
+}
+
+static void __exit pkgtemp_exit(void)
+{
+ struct pdev_entry *p, *n;
+#ifdef CONFIG_HOTPLUG_CPU
+ unregister_hotcpu_notifier(&pkgtemp_cpu_notifier);
+#endif
+ mutex_lock(&pdev_list_mutex);
+ list_for_each_entry_safe(p, n, &pdev_list, list) {
+ platform_device_unregister(p->pdev);
+ list_del(&p->list);
+ kfree(p);
+ }
+ mutex_unlock(&pdev_list_mutex);
+ platform_driver_unregister(&pkgtemp_driver);
+}
+
+MODULE_AUTHOR("Fenghua Yu <[email protected]>");
+MODULE_DESCRIPTION("Intel processor package temperature monitor");
+MODULE_LICENSE("GPL");
+
+module_init(pkgtemp_init)
+module_exit(pkgtemp_exit)
--
1.7.2

2010-07-28 19:32:57

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATH V2 5/5] Package Level Thermal Control and Power Limit Notification: pkgtemp doc

On Wed, 2010-07-28 at 15:00 -0400, Fenghua Yu wrote:
> From: Fenghua Yu <[email protected]>
>
> Document for package level thermal hwmon driver.
>
> Signed-off-by: Fenghua Yu <[email protected]>
> Reviewed-by: Len Brown <[email protected]>
> Reviewed-by: Guenter Roeck <[email protected]>

That I provided feedback does not mean that I endorsed a "Reviewed-by"
signature. It is inappropriate to add such a signature without my
approval.

Guenter

2010-07-28 19:33:33

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATH V2 2/5] Package Level Thermal Control and Power Limit Notification: pkgtemp hwmon driver

On Wed, 2010-07-28 at 15:00 -0400, Fenghua Yu wrote:
> From: Fenghua Yu <[email protected]>
>
> This patch adds a hwmon driver for package level thermal control. The driver
> dumps package level thermal information through sysfs interface so that upper
> level application (e.g. lm_sensor) can retrive the information.
>
> Instead of having the package level hwmon code in coretemp, I write a seperate
> driver pkgtemp because:
>
> First, package level thermal sensors include not only sensors for each core,
> but also sensors for uncore, memory controller or other components in the
> package. Logically it will be clear to have a seperate hwmon driver for package
> level hwmon to monitor wider range of sensors in a package. Merging package
> thermal driver into core thermal driver doesn't make sense and may mislead.
>
> Secondly, merging the two drivers together may cause coding mess. It's easier
> to include various package level sensors info if more sensor information is
> implemented. Coretemp code needs to consider a lot of legacy machine cases.
> Pkgtemp code only considers platform starting from Sandy Bridge.
>
> On a 1Sx4Cx2T Sandy Bridge platform, lm-sensors dumps the pkgtemp and coretemp:
>
> pkgtemp-isa-0000
> Adapter: ISA adapter
> physical id 0: +33.0°C (high = +79.0°C, crit = +99.0°C)
>
> coretemp-isa-0000
> Adapter: ISA adapter
> Core 0: +32.0°C (high = +79.0°C, crit = +99.0°C)
>
> coretemp-isa-0001
> Adapter: ISA adapter
> Core 1: +32.0°C (high = +79.0°C, crit = +99.0°C)
>
> coretemp-isa-0002
> Adapter: ISA adapter
> Core 2: +32.0°C (high = +79.0°C, crit = +99.0°C)
>
> coretemp-isa-0003
> Adapter: ISA adapter
> Core 3: +32.0°C (high = +79.0°C, crit = +99.0°C)
>
> Signed-off-by: Fenghua Yu <[email protected]>
> Reviewed-by: Len Brown <[email protected]>
> Reviewed-by: Guenter Roeck <[email protected]>

That I provided feedback does not mean that I endorsed a "Reviewed-by"
signature. It is inappropriate to add such a signature without my
approval.

Guenter

2010-07-28 21:21:16

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATH V2 4/5] Package Level Thermal Control and Power Limit Notification: power limit

On Wed, 2010-07-28 at 15:00 -0400, Fenghua Yu wrote:
> From: Fenghua Yu <[email protected]>
>
> Power limit notification feature is published in Intel 64 and IA-32
> Architectures SDMV Vol 3A 14.5.6 Power Limit Notification.
>
> It is implemented first on Intel Sandy Bridge platform.
>
> The patch handles notification interrupt. Interrupt handler dumps power limit
> information in log_buf, logs the event in mce log, and increases the event
> counters (core_power_limit and package_power_limit). Upper level applications
> could use the data to detect system health or diagnose functionality/performance
> issues.
>
> In the future, the event could be handled in a more fancy way.
>
> Signed-off-by: Fenghua Yu <[email protected]>
> Reviewed-by: Len Brown <[email protected]>
> ---
> arch/x86/include/asm/msr-index.h | 4 +
> arch/x86/kernel/cpu/mcheck/therm_throt.c | 186 +++++++++++++++++++++---------
> 2 files changed, 136 insertions(+), 54 deletions(-)
>
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index f1c1b82..f3c8a7a 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -226,10 +226,12 @@
>
> #define THERM_INT_HIGH_ENABLE (1 << 0)
> #define THERM_INT_LOW_ENABLE (1 << 1)
> +#define THERM_INT_PLN_ENABLE (1 << 24)
>
> #define MSR_IA32_THERM_STATUS 0x0000019c
>
> #define THERM_STATUS_PROCHOT (1 << 0)
> +#define THERM_STATUS_POWER_LIMIT (1 << 10)
>
> #define MSR_THERM2_CTL 0x0000019d
>
> @@ -242,11 +244,13 @@
> #define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
>
> #define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
> +#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
>
> #define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
>
> #define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
> #define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
> +#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
>
> /* MISC_ENABLE bits: architectural */
> #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
> diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
> index d307f9f..0c3d1e0 100644
> --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
> +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
> @@ -34,20 +34,25 @@
> /* How long to wait between reporting thermal events */
> #define CHECK_INTERVAL (300 * HZ)
>
> +#define THERMAL_THROTTLING_EVENT 0
> +#define POWER_LIMIT_EVENT 1
> +
> /*
> - * Current thermal throttling state:
> + * Current thermal event state:
> */
> struct _thermal_state {
> - bool is_throttled;
> -
> + bool new_event;
> + int event;
> u64 next_check;
> - unsigned long throttle_count;
> - unsigned long last_throttle_count;
> + unsigned long count;
> + unsigned long last_count;
> };
>
> struct thermal_state {
> - struct _thermal_state core;
> - struct _thermal_state package;
> + struct _thermal_state core_throttle;
> + struct _thermal_state core_power_limit;
> + struct _thermal_state package_throttle;
> + struct _thermal_state package_power_limit;
> };
>
> static DEFINE_PER_CPU(struct thermal_state, thermal_state);
> @@ -62,9 +67,9 @@ static u32 lvtthmr_init __read_mostly;
> therm_throt_sysdev_show_##_name, \
> NULL) \
>
> -#define define_therm_throt_sysdev_show_func(level, name) \
> +#define define_therm_throt_sysdev_show_func(event, name) \
> \
> -static ssize_t therm_throt_sysdev_show_##level##_##name( \
> +static ssize_t therm_throt_sysdev_show_##event##_##name( \
> struct sys_device *dev, \
> struct sysdev_attribute *attr, \
> char *buf) \
> @@ -75,7 +80,7 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \
> preempt_disable(); /* CPU hotplug */ \
> if (cpu_online(cpu)) { \
> ret = sprintf(buf, "%lu\n", \
> - per_cpu(thermal_state, cpu).level.name); \
> + per_cpu(thermal_state, cpu).event.name); \
> } else \
> ret = 0; \
> preempt_enable(); \
> @@ -83,23 +88,32 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \
> return ret; \
> }
>
> -define_therm_throt_sysdev_show_func(core, throttle_count);
> +define_therm_throt_sysdev_show_func(core_throttle, count);
> define_therm_throt_sysdev_one_ro(core_throttle_count);
>
> -define_therm_throt_sysdev_show_func(package, throttle_count);
> +define_therm_throt_sysdev_show_func(core_power_limit, count);
> +define_therm_throt_sysdev_one_ro(core_power_limit_count);
> +
> +define_therm_throt_sysdev_show_func(package_throttle, count);
> define_therm_throt_sysdev_one_ro(package_throttle_count);
>
> +define_therm_throt_sysdev_show_func(package_power_limit, count);
> +define_therm_throt_sysdev_one_ro(package_power_limit_count);
> +
> static struct attribute *thermal_throttle_attrs[] = {
> &attr_core_throttle_count.attr,
> NULL
> };
>
> -static struct attribute_group thermal_throttle_attr_group = {
> +static struct attribute_group thermal_attr_group = {
> .attrs = thermal_throttle_attrs,
> .name = "thermal_throttle"
> };
> #endif /* CONFIG_SYSFS */
>
> +#define CORE_LEVEL 0
> +#define PACKAGE_LEVEL 1
> +
> /***
> * therm_throt_process - Process thermal throttling event from interrupt
> * @curr: Whether the condition is current or not (boolean), since the
> @@ -116,49 +130,73 @@ static struct attribute_group thermal_throttle_attr_group = {
> * 1 : Event should be logged further, and a message has been
> * printed to the syslog.
> */
> -#define CORE_LEVEL 0
> -#define PACKAGE_LEVEL 1
> -static int therm_throt_process(bool is_throttled, int level)
> +static int therm_throt_process(bool new_event, int event, int level)
> {
> struct _thermal_state *state;
> - unsigned int this_cpu;
> - bool was_throttled;
> + unsigned int this_cpu = smp_processor_id();
> + bool old_event;
> u64 now;
> + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
> +
> + if (!new_event)
> + return 0;
>
As a result of this check and return, events will never be reset.

> - this_cpu = smp_processor_id();
> now = get_jiffies_64();
> - if (level == CORE_LEVEL)
> - state = &per_cpu(thermal_state, this_cpu).core;
> - else
> - state = &per_cpu(thermal_state, this_cpu).package;
> + if (level == CORE_LEVEL) {
> + if (event == THERMAL_THROTTLING_EVENT)
> + state = &pstate->core_throttle;
> + else if (event == POWER_LIMIT_EVENT)
> + state = &pstate->core_power_limit;
> + else
> + return 0;
> + } else if (level == PACKAGE_LEVEL) {
> + if (event == THERMAL_THROTTLING_EVENT)
> + state = &pstate->package_throttle;
> + else if (event == POWER_LIMIT_EVENT)
> + state = &pstate->package_power_limit;
> + else
> + return 0;
> + } else
> + return 0;
>
> - was_throttled = state->is_throttled;
> - state->is_throttled = is_throttled;
> + old_event = state->new_event;
> + state->new_event = new_event;
>
> - if (is_throttled)
> - state->throttle_count++;
> + if (new_event)
> + state->count++;
>
> if (time_before64(now, state->next_check) &&
> - state->throttle_count != state->last_throttle_count)
> + state->count != state->last_count)
> return 0;
>
> state->next_check = now + CHECK_INTERVAL;
> - state->last_throttle_count = state->throttle_count;
> + state->last_count = state->count;
>
> /* if we just entered the thermal event */
> - if (is_throttled) {
> - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
> - this_cpu,
> - level == CORE_LEVEL ? "Core" : "Package",
> - state->throttle_count);
> + if (new_event) {
> + if (event == THERMAL_THROTTLING_EVENT)
> + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
> + this_cpu,
> + level == CORE_LEVEL ? "Core" : "Package",
> + state->count);
> + else
> + printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
> + this_cpu,
> + level == CORE_LEVEL ? "Core" : "Package",
> + state->count);
>
> add_taint(TAINT_MACHINE_CHECK);
> return 1;
> }

Since new_event is always true, code below will never be executed.
That doesn't make sense to me.

> - if (was_throttled) {
> - printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
> - this_cpu,
> - level == CORE_LEVEL ? "Core" : "Package");
> + if (old_event) {
> + if (event == THERMAL_THROTTLING_EVENT)
> + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
> + this_cpu,
> + level == CORE_LEVEL ? "Core" : "Package");
> + else
> + printk(KERN_INFO "CPU%d: %s power limit normal\n",
> + this_cpu,
> + level == CORE_LEVEL ? "Core" : "Package");
> return 1;
> }
>
> @@ -172,21 +210,29 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
> int err;
> struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
>
> - err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
> + err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
> if (err)
> return err;
>
> + if (cpu_has(c, X86_FEATURE_PLN))
> + err = sysfs_add_file_to_group(&sys_dev->kobj,
> + &attr_core_power_limit_count.attr,
> + thermal_attr_group.name);
> if (cpu_has(c, X86_FEATURE_PTS))
> err = sysfs_add_file_to_group(&sys_dev->kobj,
> &attr_package_throttle_count.attr,
> - thermal_throttle_attr_group.name);
> + thermal_attr_group.name);
> + if (cpu_has(c, X86_FEATURE_PLN))
> + err = sysfs_add_file_to_group(&sys_dev->kobj,
> + &attr_package_power_limit_count.attr,
> + thermal_attr_group.name);
>
> return err;
> }
>
> static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
> {
> - sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
> + sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
> }
>
> /* Mutex protecting device creation against CPU hotplug: */
> @@ -257,6 +303,17 @@ device_initcall(thermal_throttle_init_device);
>
> #endif /* CONFIG_SYSFS */
>
> +/*
> + * Set up the most two significant bit to notify mce log that this thermal
> + * event type.
> + * This is a temp solution. May be changed in the future with mce log
> + * infrasture.
> + */
> +#define CORE_THROTTLED (0)
> +#define CORE_POWER_LIMIT ((__u64)1 << 62)
> +#define PACKAGE_THROTTLED ((__u64)2 << 62)
> +#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
> +
> /* Thermal transition interrupt handler */
> static void intel_thermal_interrupt(void)
> {
> @@ -264,21 +321,31 @@ static void intel_thermal_interrupt(void)
> struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
>
> rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
> +
> if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
> + THERMAL_THROTTLING_EVENT,
> CORE_LEVEL) != 0)
> - mce_log_therm_throt_event(msr_val);
> + mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
> +
> + if (cpu_has(c, X86_FEATURE_PLN))
> + if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
> + POWER_LIMIT_EVENT,
> + CORE_LEVEL) != 0)
> + mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
>
> if (cpu_has(c, X86_FEATURE_PTS)) {
> rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
> if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
> + THERMAL_THROTTLING_EVENT,
> PACKAGE_LEVEL) != 0)
> - /*
> - * Set up the most significant bit to notify mce log
> - * that this thermal event is a package level event.
> - * This is a temp solution. May be changed in the future
> - * with mce log infrasture.
> - */
> - mce_log_therm_throt_event(((__u64)1 << 63) | msr_val);
> + mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
> + if (cpu_has(c, X86_FEATURE_PLN))
> + if (therm_throt_process(msr_val &
> + PACKAGE_THERM_STATUS_POWER_LIMIT,
> + POWER_LIMIT_EVENT,
> + PACKAGE_LEVEL) != 0)
> + mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
> + | msr_val);
> }
> }
>
> @@ -381,14 +448,25 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
> apic_write(APIC_LVTTHMR, h);
>
> rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
> - wrmsr(MSR_IA32_THERM_INTERRUPT,
> - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
> + if (cpu_has(c, X86_FEATURE_PLN))
> + wrmsr(MSR_IA32_THERM_INTERRUPT,
> + l | (THERM_INT_LOW_ENABLE
> + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
> + else
> + wrmsr(MSR_IA32_THERM_INTERRUPT,
> + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
>
> if (cpu_has(c, X86_FEATURE_PTS)) {
> rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
> - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
> - l | (PACKAGE_THERM_INT_LOW_ENABLE
> - | PACKAGE_THERM_INT_HIGH_ENABLE), h);
> + if (cpu_has(c, X86_FEATURE_PLN))
> + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
> + l | (PACKAGE_THERM_INT_LOW_ENABLE
> + | PACKAGE_THERM_INT_HIGH_ENABLE
> + | PACKAGE_THERM_INT_PLN_ENABLE), h);
> + else
> + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
> + l | (PACKAGE_THERM_INT_LOW_ENABLE
> + | PACKAGE_THERM_INT_HIGH_ENABLE), h);
> }
>
> smp_thermal_vector = intel_thermal_interrupt;
> --
> 1.7.2
>

2010-07-28 23:23:36

by Fenghua Yu

[permalink] [raw]
Subject: Re: [PATH V2 4/5] Package Level Thermal Control and Power Limit Notification: power limit

On Wed, Jul 28, 2010 at 02:17:40PM -0700, Guenter Roeck wrote:
> On Wed, 2010-07-28 at 15:00 -0400, Fenghua Yu wrote:
> > From: Fenghua Yu <[email protected]>
> >
> > -static int therm_throt_process(bool is_throttled, int level)
> > +static int therm_throt_process(bool new_event, int event, int level)
> > {
> > struct _thermal_state *state;
> > - unsigned int this_cpu;
> > - bool was_throttled;
> > + unsigned int this_cpu = smp_processor_id();
> > + bool old_event;
> > u64 now;
> > + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
> > +
> > + if (!new_event)
> > + return 0;
> >
> As a result of this check and return, events will never be reset.

With the check, throttling returning to normal will not be reflected (i.e. only
throttling event is shown). I'll remove this check.
>
> > - this_cpu = smp_processor_id();
> > now = get_jiffies_64();
> > - if (level == CORE_LEVEL)
> > - state = &per_cpu(thermal_state, this_cpu).core;
> > - else
> > - state = &per_cpu(thermal_state, this_cpu).package;
> > + if (level == CORE_LEVEL) {
> > + if (event == THERMAL_THROTTLING_EVENT)
> > + state = &pstate->core_throttle;
> > + else if (event == POWER_LIMIT_EVENT)
> > + state = &pstate->core_power_limit;
> > + else
> > + return 0;
> > + } else if (level == PACKAGE_LEVEL) {
> > + if (event == THERMAL_THROTTLING_EVENT)
> > + state = &pstate->package_throttle;
> > + else if (event == POWER_LIMIT_EVENT)
> > + state = &pstate->package_power_limit;
> > + else
> > + return 0;
> > + } else
> > + return 0;
> >
> > - was_throttled = state->is_throttled;
> > - state->is_throttled = is_throttled;
> > + old_event = state->new_event;
> > + state->new_event = new_event;
> >
> > - if (is_throttled)
> > - state->throttle_count++;
> > + if (new_event)
> > + state->count++;
> >
> > if (time_before64(now, state->next_check) &&
> > - state->throttle_count != state->last_throttle_count)
> > + state->count != state->last_count)
> > return 0;
> >
> > state->next_check = now + CHECK_INTERVAL;
> > - state->last_throttle_count = state->throttle_count;
> > + state->last_count = state->count;
> >
> > /* if we just entered the thermal event */
> > - if (is_throttled) {
> > - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
> > - this_cpu,
> > - level == CORE_LEVEL ? "Core" : "Package",
> > - state->throttle_count);
> > + if (new_event) {
> > + if (event == THERMAL_THROTTLING_EVENT)
> > + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
> > + this_cpu,
> > + level == CORE_LEVEL ? "Core" : "Package",
> > + state->count);
> > + else
> > + printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
> > + this_cpu,
> > + level == CORE_LEVEL ? "Core" : "Package",
> > + state->count);
> >
> > add_taint(TAINT_MACHINE_CHECK);
> > return 1;
> > }
>
> Since new_event is always true, code below will never be executed.
> That doesn't make sense to me.
After removing the previous new_event check, this code should be fine.

Thanks.

-Fenghua