Hi All,
The RFC of this does not seem to have attracted much attention, so here goes
the first non-RFC revision.
The purpose of this set of patches is to allow the intel_idle driver to use
C-states information from ACPI _CST on systems where the processor is not
recognized by it.
The first five patches are preparatory (please look into the changelogs for
details) and are not expected to make any functional difference.
Patch [06/10] adds ACPI _CST support to intel_idle so that _CST is used when
the driver does not have a dedicated list of C-states for the given processor.
Patch [07/10] is an update of https://patchwork.kernel.org/patch/11256815/.
Patch [08/10] changes intel_idle to also use ACPI _CST in specific cases when
there is a table of C-states for the given processor in the driver (it will
then use the _CST information to decide which C-states to enable by default).
Patch [09/10] adds a module parameter called "no_acpi" that can be used to
prevent intel_idle from using ACPI _CST via the kernel command line.
Finally, the last patch makes intel_idle use ACPI _CST, if available, on all
server systems supported by it.
This has been lightly tested on a Dell XPS13 9360 (with an additional patch to
set use_acpi for Kaby Lake). The difference between using the idle states list
from _CST and the built-in one generally appears to be that in the latter case
the processor spends more time in package C-states when the system is idle.
If there are any concerns about this series, please let me know.
For easier access, the patches are available from the intel_idle+acpi branch
in the linux-pm.git tree.
Thanks,
Rafael
From: "Rafael J. Wysocki" <[email protected]>
In order to separate the ACPI _CST evaluation from checks
specific to the ACPI processor driver, move the majority of
the acpi_processor_get_power_info_cst() function body to a new
function, acpi_processor_evaluate_cst(), that will extract
the C-states information from _CST output, and redefine
acpi_processor_get_power_info_cst() as a wrapper around it.
No intentional functional impact.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
No changes from the RFC version.
---
drivers/acpi/processor_idle.c | 52 ++++++++++++++++++++++++++-----------------
1 file changed, 32 insertions(+), 20 deletions(-)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index dd737d836c03..e92d0e6d4cd1 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -297,21 +297,17 @@ static int acpi_processor_get_power_info_default(struct acpi_processor *pr)
return 0;
}
-static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
+static int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu,
+ struct acpi_processor_power *info)
{
+ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+ union acpi_object *cst;
acpi_status status;
u64 count;
- int current_count;
+ int current_count = 0;
int i, ret = 0;
- struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
- union acpi_object *cst;
-
- if (nocst)
- return -ENODEV;
- current_count = 0;
-
- status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer);
+ status = acpi_evaluate_object(handle, "_CST", NULL, &buffer);
if (ACPI_FAILURE(status)) {
ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n"));
return -ENODEV;
@@ -335,9 +331,6 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
goto end;
}
- /* Tell driver that at least _CST is supported. */
- pr->flags.has_cst = 1;
-
for (i = 1; i <= count; i++) {
union acpi_object *element;
union acpi_object *obj;
@@ -383,7 +376,7 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
cx.entry_method = ACPI_CSTATE_SYSTEMIO;
if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
if (acpi_processor_ffh_cstate_probe
- (pr->id, &cx, reg) == 0) {
+ (cpu, &cx, reg) == 0) {
cx.entry_method = ACPI_CSTATE_FFH;
} else if (cx.type == ACPI_STATE_C1) {
/*
@@ -432,7 +425,7 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
continue;
current_count++;
- memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
+ memcpy(&info->states[current_count], &cx, sizeof(cx));
/*
* We support total ACPI_PROCESSOR_MAX_POWER - 1
@@ -446,12 +439,9 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
}
}
- ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n",
- current_count));
+ acpi_handle_info(handle, "Found %d idle states\n", current_count);
- /* Validate number of power states discovered */
- if (current_count < 2)
- ret = -EFAULT;
+ info->count = current_count;
end:
kfree(buffer.pointer);
@@ -459,6 +449,28 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
return ret;
}
+static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
+{
+ int ret;
+
+ if (nocst)
+ return -ENODEV;
+
+ ret = acpi_processor_evaluate_cst(pr->handle, pr->id, &pr->power);
+ if (ret)
+ return ret;
+
+ /*
+ * It is expected that there will be at least 2 states, C1 and
+ * something else (C2 or C3), so fail if that is not the case.
+ */
+ if (pr->power.count < 2)
+ return -EFAULT;
+
+ pr->flags.has_cst = 1;
+ return 0;
+}
+
static void acpi_processor_power_verify_c3(struct acpi_processor *pr,
struct acpi_processor_cx *cx)
{
--
2.16.4
From: "Rafael J. Wysocki" <[email protected]>
The intel_idle driver will be modified to use ACPI _CST subsequently
and it will need to notify the platform firmware of that if
acpi_gbl_FADT.cst_control is set, so add a routine for this purpose,
acpi_processor_claim_cst_control(), to acpi_processor.c (so that it
is always present, which is required by intel_idle) and export it
to allow the ACPI processor driver (which is modular) to call it.
No intentional functional impact.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
No changes from the RFC version.
---
drivers/acpi/acpi_processor.c | 25 +++++++++++++++++++++++++
drivers/acpi/processor_idle.c | 12 ++++--------
include/linux/acpi.h | 6 ++++++
3 files changed, 35 insertions(+), 8 deletions(-)
diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 2c4dda0787e8..8a53f3c5b70e 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -705,3 +705,28 @@ void __init acpi_processor_init(void)
acpi_scan_add_handler_with_hotplug(&processor_handler, "processor");
acpi_scan_add_handler(&processor_container_handler);
}
+
+#ifdef CONFIG_ACPI_PROCESSOR_CSTATE
+/**
+ * acpi_processor_claim_cst_control - Request _CST control from the platform.
+ */
+bool acpi_processor_claim_cst_control(void)
+{
+ static bool cst_control_claimed;
+ acpi_status status;
+
+ if (!acpi_gbl_FADT.cst_control || cst_control_claimed)
+ return true;
+
+ status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
+ acpi_gbl_FADT.cst_control, 8);
+ if (ACPI_FAILURE(status)) {
+ pr_warn("ACPI: Failed to claim processor _CST control\n");
+ return false;
+ }
+
+ cst_control_claimed = true;
+ return true;
+}
+EXPORT_SYMBOL_GPL(acpi_processor_claim_cst_control);
+#endif /* CONFIG_ACPI_PROCESSOR_CSTATE */
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 2ae95df2e74f..dd737d836c03 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -909,7 +909,6 @@ static int acpi_processor_setup_cstates(struct acpi_processor *pr)
static inline void acpi_processor_cstate_first_run_checks(void)
{
- acpi_status status;
static int first_run;
if (first_run)
@@ -921,13 +920,10 @@ static inline void acpi_processor_cstate_first_run_checks(void)
max_cstate);
first_run++;
- if (acpi_gbl_FADT.cst_control && !nocst) {
- status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
- acpi_gbl_FADT.cst_control, 8);
- if (ACPI_FAILURE(status))
- ACPI_EXCEPTION((AE_INFO, status,
- "Notifying BIOS of _CST ability failed"));
- }
+ if (nocst)
+ return;
+
+ acpi_processor_claim_cst_control();
}
#else
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 0f37a7d5fa77..ee39b05e7f76 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -279,6 +279,12 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id)
/* Validate the processor object's proc_id */
bool acpi_duplicate_processor_id(int proc_id);
+/* Processor _CST control */
+#ifdef CONFIG_ACPI_PROCESSOR_CSTATE
+bool acpi_processor_claim_cst_control(void);
+#else
+static inline bool acpi_processor_claim_cst_control(void) { return false; }
+#endif
#ifdef CONFIG_ACPI_HOTPLUG_CPU
/* Arch dependent functions for cpu hotplug support */
--
2.16.4
From: "Rafael J. Wysocki" <[email protected]>
In many cases, especially on server systems, it is desirable to avoid
enabling C-states that have been disabled in the platform firmware
(BIOS) setup, except for C1E.
As a rule, the C-states disabled this way are not listed by ACPI
_CST, so if that is used by intel_idle along with the specific
table of C-states that it has for the given processor, the C-states
disabled through the platform firmware will not be enabled by default
by intel_idle.
Accordingly, set the use_acpi flag (introduced previously) in all
server processor profiles defined in intel_idle (so as to make it use
ACPI _CST to decide which C-states to enable by default) and set
the CPUIDLE_FLAG_ALWAYS_ENABLE flag (also introduced previously)
for C1E in all C-states tables in intel_idle that contain C1 too
(so that C1E is enabled regardless of whether or not it is listed
by ACPI _CST).
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
New patch, not present in the RFC.
---
drivers/idle/intel_idle.c | 70 +++++++++++++++++++++++++++++++++--------------
1 file changed, 50 insertions(+), 20 deletions(-)
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 26fe383bb921..1467490adfc3 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -131,7 +131,7 @@ static struct cpuidle_state nehalem_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
@@ -168,7 +168,7 @@ static struct cpuidle_state snb_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
@@ -303,7 +303,7 @@ static struct cpuidle_state ivb_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
@@ -348,7 +348,7 @@ static struct cpuidle_state ivt_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 80,
.enter = &intel_idle,
@@ -385,7 +385,7 @@ static struct cpuidle_state ivt_cstates_4s[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 250,
.enter = &intel_idle,
@@ -422,7 +422,7 @@ static struct cpuidle_state ivt_cstates_8s[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 500,
.enter = &intel_idle,
@@ -459,7 +459,7 @@ static struct cpuidle_state hsw_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
@@ -527,7 +527,7 @@ static struct cpuidle_state bdw_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
@@ -596,7 +596,7 @@ static struct cpuidle_state skl_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
@@ -665,7 +665,7 @@ static struct cpuidle_state skx_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
@@ -815,7 +815,7 @@ static struct cpuidle_state bxt_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
@@ -876,7 +876,7 @@ static struct cpuidle_state dnv_cstates[] = {
{
.name = "C1E",
.desc = "MWAIT 0x01",
- .flags = MWAIT2flg(0x01),
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
@@ -998,6 +998,13 @@ static const struct idle_cpu idle_cpu_nehalem = {
.disable_promotion_to_c1e = true,
};
+static const struct idle_cpu idle_cpu_nhx = {
+ .state_table = nehalem_cstates,
+ .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
+ .disable_promotion_to_c1e = true,
+ .use_acpi = true,
+};
+
static const struct idle_cpu idle_cpu_atom = {
.state_table = atom_cstates,
};
@@ -1016,6 +1023,12 @@ static const struct idle_cpu idle_cpu_snb = {
.disable_promotion_to_c1e = true,
};
+static const struct idle_cpu idle_cpu_snx = {
+ .state_table = snb_cstates,
+ .disable_promotion_to_c1e = true,
+ .use_acpi = true,
+};
+
static const struct idle_cpu idle_cpu_byt = {
.state_table = byt_cstates,
.disable_promotion_to_c1e = true,
@@ -1036,6 +1049,7 @@ static const struct idle_cpu idle_cpu_ivb = {
static const struct idle_cpu idle_cpu_ivt = {
.state_table = ivt_cstates,
.disable_promotion_to_c1e = true,
+ .use_acpi = true,
};
static const struct idle_cpu idle_cpu_hsw = {
@@ -1043,11 +1057,23 @@ static const struct idle_cpu idle_cpu_hsw = {
.disable_promotion_to_c1e = true,
};
+static const struct idle_cpu idle_cpu_hsx = {
+ .state_table = hsw_cstates,
+ .disable_promotion_to_c1e = true,
+ .use_acpi = true,
+};
+
static const struct idle_cpu idle_cpu_bdw = {
.state_table = bdw_cstates,
.disable_promotion_to_c1e = true,
};
+static const struct idle_cpu idle_cpu_bdx = {
+ .state_table = bdw_cstates,
+ .disable_promotion_to_c1e = true,
+ .use_acpi = true,
+};
+
static const struct idle_cpu idle_cpu_skl = {
.state_table = skl_cstates,
.disable_promotion_to_c1e = true,
@@ -1056,15 +1082,18 @@ static const struct idle_cpu idle_cpu_skl = {
static const struct idle_cpu idle_cpu_skx = {
.state_table = skx_cstates,
.disable_promotion_to_c1e = true,
+ .use_acpi = true,
};
static const struct idle_cpu idle_cpu_avn = {
.state_table = avn_cstates,
.disable_promotion_to_c1e = true,
+ .use_acpi = true,
};
static const struct idle_cpu idle_cpu_knl = {
.state_table = knl_cstates,
+ .use_acpi = true,
};
static const struct idle_cpu idle_cpu_bxt = {
@@ -1075,20 +1104,21 @@ static const struct idle_cpu idle_cpu_bxt = {
static const struct idle_cpu idle_cpu_dnv = {
.state_table = dnv_cstates,
.disable_promotion_to_c1e = true,
+ .use_acpi = true,
};
static const struct x86_cpu_id intel_idle_ids[] __initconst = {
- INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nehalem),
+ INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nhx),
INTEL_CPU_FAM6(NEHALEM, idle_cpu_nehalem),
INTEL_CPU_FAM6(NEHALEM_G, idle_cpu_nehalem),
INTEL_CPU_FAM6(WESTMERE, idle_cpu_nehalem),
- INTEL_CPU_FAM6(WESTMERE_EP, idle_cpu_nehalem),
- INTEL_CPU_FAM6(NEHALEM_EX, idle_cpu_nehalem),
+ INTEL_CPU_FAM6(WESTMERE_EP, idle_cpu_nhx),
+ INTEL_CPU_FAM6(NEHALEM_EX, idle_cpu_nhx),
INTEL_CPU_FAM6(ATOM_BONNELL, idle_cpu_atom),
INTEL_CPU_FAM6(ATOM_BONNELL_MID, idle_cpu_lincroft),
- INTEL_CPU_FAM6(WESTMERE_EX, idle_cpu_nehalem),
+ INTEL_CPU_FAM6(WESTMERE_EX, idle_cpu_nhx),
INTEL_CPU_FAM6(SANDYBRIDGE, idle_cpu_snb),
- INTEL_CPU_FAM6(SANDYBRIDGE_X, idle_cpu_snb),
+ INTEL_CPU_FAM6(SANDYBRIDGE_X, idle_cpu_snx),
INTEL_CPU_FAM6(ATOM_SALTWELL, idle_cpu_atom),
INTEL_CPU_FAM6(ATOM_SILVERMONT, idle_cpu_byt),
INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, idle_cpu_tangier),
@@ -1096,14 +1126,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
INTEL_CPU_FAM6(IVYBRIDGE, idle_cpu_ivb),
INTEL_CPU_FAM6(IVYBRIDGE_X, idle_cpu_ivt),
INTEL_CPU_FAM6(HASWELL, idle_cpu_hsw),
- INTEL_CPU_FAM6(HASWELL_X, idle_cpu_hsw),
+ INTEL_CPU_FAM6(HASWELL_X, idle_cpu_hsx),
INTEL_CPU_FAM6(HASWELL_L, idle_cpu_hsw),
INTEL_CPU_FAM6(HASWELL_G, idle_cpu_hsw),
INTEL_CPU_FAM6(ATOM_SILVERMONT_D, idle_cpu_avn),
INTEL_CPU_FAM6(BROADWELL, idle_cpu_bdw),
INTEL_CPU_FAM6(BROADWELL_G, idle_cpu_bdw),
- INTEL_CPU_FAM6(BROADWELL_X, idle_cpu_bdw),
- INTEL_CPU_FAM6(BROADWELL_D, idle_cpu_bdw),
+ INTEL_CPU_FAM6(BROADWELL_X, idle_cpu_bdx),
+ INTEL_CPU_FAM6(BROADWELL_D, idle_cpu_bdx),
INTEL_CPU_FAM6(SKYLAKE_L, idle_cpu_skl),
INTEL_CPU_FAM6(SKYLAKE, idle_cpu_skl),
INTEL_CPU_FAM6(KABYLAKE_L, idle_cpu_skl),
--
2.16.4
From: "Rafael J. Wysocki" <[email protected]>
In certain situations it may be useful to prevent some idle states
from being used by default while allowing user space to enable them
later on.
For this purpose, introduce a new state flag, CPUIDLE_FLAG_OFF, to
mark idle states that should be disabled by default, make the core
set CPUIDLE_STATE_DISABLED_BY_USER for those states at the
initialization time and add a new state attribute in sysfs,
"default_status", to inform user space of the initial status of
the given idle state ("disabled" if CPUIDLE_FLAG_OFF is set for it,
"enabled" otherwise).
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
Changes from the RFC version:
- Call the new sysfs attribute "default_status" (instead of "initial_status").
---
Documentation/ABI/testing/sysfs-devices-system-cpu | 6 ++++++
Documentation/admin-guide/pm/cpuidle.rst | 3 +++
drivers/cpuidle/cpuidle.c | 6 +++++-
drivers/cpuidle/sysfs.c | 10 ++++++++++
include/linux/cpuidle.h | 1 +
5 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index fc20cde63d1e..2e0e3b45d02a 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -196,6 +196,12 @@ Description:
does not reflect it. Likewise, if one enables a deep state but a
lighter state still is disabled, then this has no effect.
+What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/default_status
+Date: December 2019
+KernelVersion: v5.6
+Contact: Linux power management list <[email protected]>
+Description:
+ (RO) The default status of this state, "enabled" or "disabled".
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency
Date: March 2014
diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst
index e70b365dbc60..311cd7cc2b75 100644
--- a/Documentation/admin-guide/pm/cpuidle.rst
+++ b/Documentation/admin-guide/pm/cpuidle.rst
@@ -506,6 +506,9 @@ object corresponding to it, as follows:
``disable``
Whether or not this idle state is disabled.
+``default_status``
+ The default status of this state, "enabled" or "disabled".
+
``latency``
Exit latency of the idle state in microseconds.
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 0005be5ea2b4..24eaa4c8138b 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -571,10 +571,14 @@ static int __cpuidle_register_device(struct cpuidle_device *dev)
if (!try_module_get(drv->owner))
return -EINVAL;
- for (i = 0; i < drv->state_count; i++)
+ for (i = 0; i < drv->state_count; i++) {
if (drv->states[i].flags & CPUIDLE_FLAG_UNUSABLE)
dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER;
+ if (drv->states[i].flags & CPUIDLE_FLAG_OFF)
+ dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_USER;
+ }
+
per_cpu(cpuidle_devices, dev->cpu) = dev;
list_add(&dev->device_list, &cpuidle_detected_devices);
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 38ef770be90d..254d1560dc19 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -327,6 +327,14 @@ static ssize_t store_state_disable(struct cpuidle_state *state,
return size;
}
+static ssize_t show_state_default_status(struct cpuidle_state *state,
+ struct cpuidle_state_usage *state_usage,
+ char *buf)
+{
+ return sprintf(buf, "%s\n",
+ state->flags & CPUIDLE_FLAG_OFF ? "disabled" : "enabled");
+}
+
define_one_state_ro(name, show_state_name);
define_one_state_ro(desc, show_state_desc);
define_one_state_ro(latency, show_state_exit_latency);
@@ -337,6 +345,7 @@ define_one_state_ro(time, show_state_time);
define_one_state_rw(disable, show_state_disable, store_state_disable);
define_one_state_ro(above, show_state_above);
define_one_state_ro(below, show_state_below);
+define_one_state_ro(default_status, show_state_default_status);
static struct attribute *cpuidle_state_default_attrs[] = {
&attr_name.attr,
@@ -349,6 +358,7 @@ static struct attribute *cpuidle_state_default_attrs[] = {
&attr_disable.attr,
&attr_above.attr,
&attr_below.attr,
+ &attr_default_status.attr,
NULL
};
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 1dabe36bd011..ebfb52b3ffbf 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -77,6 +77,7 @@ struct cpuidle_state {
#define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */
#define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */
#define CPUIDLE_FLAG_UNUSABLE BIT(3) /* avoid using this state */
+#define CPUIDLE_FLAG_OFF BIT(4) /* disable this state by default */
struct cpuidle_device_kobj;
struct cpuidle_state_kobj;
--
2.16.4
From: "Rafael J. Wysocki" <[email protected]>
Update the intel_idle driver to get the C-states information from ACPI
_CST in some cases in which the processor is known to the driver, as long as
that information is available and the new use_acpi flag is set in the
profile of the processor in question.
In the cases when there is a specific table of C-states for the given
processor in the driver, that table is used as the primary source of
information on the available C-states, but if ACPI _CST is present,
the C-states that are not listed by it will not be enabled by default
(they still can be enabled later by user space via sysfs, though).
The new CPUIDLE_FLAG_ALWAYS_ENABLE flag can be used for marking
C-states that should be enabled by default even if they are not
listed by ACPI _CST.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
Changes from the RFC version:
- Subject and changelog update.
- Call the new state flag CPUIDLE_FLAG_ALWAYS_ENABLE (instead of _IGNORE_ACPI).
---
drivers/idle/intel_idle.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 42 insertions(+), 3 deletions(-)
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 28812d93d59a..a072b84d9595 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -80,6 +80,7 @@ struct idle_cpu {
unsigned long auto_demotion_disable_flags;
bool byt_auto_demotion_disable_flag;
bool disable_promotion_to_c1e;
+ bool use_acpi;
};
static const struct idle_cpu *icpu;
@@ -90,6 +91,11 @@ static void intel_idle_s2idle(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index);
static struct cpuidle_state *cpuidle_state_table;
+/*
+ * Enable this state by default even if the ACPI _CST does not list it.
+ */
+#define CPUIDLE_FLAG_ALWAYS_ENABLE BIT(15)
+
/*
* Set this flag for states where the HW flushes the TLB for us
* and so we don't need cross-calls to keep it consistent.
@@ -1230,9 +1236,33 @@ static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv)
state->enter_s2idle = intel_idle_s2idle;
}
}
+
+static bool intel_idle_off_by_default(u32 mwait_hint)
+{
+ int cstate, limit;
+
+ /*
+ * If there are no _CST C-states, do not disable any C-states by
+ * default.
+ */
+ if (!acpi_state_table.count)
+ return false;
+
+ limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count);
+ /*
+ * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of
+ * the interesting states are ACPI_CSTATE_FFH.
+ */
+ for (cstate = 1; cstate < limit; cstate++) {
+ if (acpi_state_table.states[cstate].address == mwait_hint)
+ return false;
+ }
+ return true;
+}
#else /* !CONFIG_ACPI_PROCESSOR_CSTATE */
static inline bool intel_idle_acpi_cst_extract(void) { return false; }
static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { }
+static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; }
#endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */
/*
@@ -1273,10 +1303,13 @@ static int __init intel_idle_probe(void)
pr_debug("MWAIT substates: 0x%x\n", mwait_substates);
icpu = (const struct idle_cpu *)id->driver_data;
- if (icpu)
+ if (icpu) {
cpuidle_state_table = icpu->state_table;
- else if (!intel_idle_acpi_cst_extract())
+ if (icpu->use_acpi)
+ intel_idle_acpi_cst_extract();
+ } else if (!intel_idle_acpi_cst_extract()) {
return -ENODEV;
+ }
pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
boot_cpu_data.x86_model);
@@ -1484,7 +1517,13 @@ static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
continue;
/* Structure copy. */
- drv->states[drv->state_count++] = cpuidle_state_table[cstate];
+ drv->states[drv->state_count] = cpuidle_state_table[cstate];
+
+ if (icpu->use_acpi && intel_idle_off_by_default(mwait_hint) &&
+ !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE))
+ drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF;
+
+ drv->state_count++;
}
if (icpu->byt_auto_demotion_disable_flag) {
--
2.16.4
From: "Rafael J. Wysocki" <[email protected]>
Modify the intel_idle driver to get the C-states information from ACPI
_CST if the processor model is not recognized by it.
The processor is still required to support MWAIT and the information
from ACPI _CST will only be used if all of the C-states listed by
_CST are of the ACPI_CSTATE_FFH type (which means that they are
expected to be entered via MWAIT).
Moreover, the driver assumes that the _CST information is the same
for all CPUs in the system, so it is sufficient to evaluate _CST for
one of them and extract the common list of C-states from there.
Also, _CST is evaluated only once, at system initialization time, and
the driver does not respond to _CST change notifications (that can
be changed in the future).
The main functional difference between intel_idle with this change
and the ACPI processor driver is that the former sets the target
residency to be equal to the exit latency (provided by _CST) for
C1-type C-states and to 3 times the exit latency value for the other
C-state types, whereas the latter obtains the target residency by
multiplying the exit latency by the same number (2 by default) for
all C-state types. Therefore it is expected that in general using
the former instead of the latter on the same system will lead to
improved energy-efficiency.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
Changes from the RFC version:
- Subject and changelog update.
- If C-state information from _CST is used, use target residency equal to
3 times the exit latency for non C1-type C-states (was 4 times).
- Add a comment explaining the target residency generation rules.
---
drivers/idle/intel_idle.c | 190 +++++++++++++++++++++++++++++++++++++++-------
1 file changed, 162 insertions(+), 28 deletions(-)
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 47255d3cf51f..28812d93d59a 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -41,6 +41,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/tick.h>
@@ -1111,6 +1112,129 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
{}
};
+#define INTEL_CPU_FAM6_MWAIT \
+ { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_MWAIT, 0 }
+
+static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
+ INTEL_CPU_FAM6_MWAIT,
+ {}
+};
+
+static bool intel_idle_max_cstate_reached(int cstate)
+{
+ if (cstate + 1 > max_cstate) {
+ pr_info("max_cstate %d reached\n", max_cstate);
+ return true;
+ }
+ return false;
+}
+
+#ifdef CONFIG_ACPI_PROCESSOR_CSTATE
+#include <acpi/processor.h>
+
+static struct acpi_processor_power acpi_state_table;
+
+/**
+ * intel_idle_cst_usable - Check if the _CST information can be used.
+ *
+ * Check if all of the C-states listed by _CST in the max_cstate range are
+ * ACPI_CSTATE_FFH, which means that they should be entered via MWAIT.
+ */
+static bool intel_idle_cst_usable(void)
+{
+ int cstate, limit;
+
+ limit = min_t(int, min_t(int, CPUIDLE_STATE_MAX, max_cstate + 1),
+ acpi_state_table.count);
+
+ for (cstate = 1; cstate < limit; cstate++) {
+ struct acpi_processor_cx *cx = &acpi_state_table.states[cstate];
+
+ if (cx->entry_method != ACPI_CSTATE_FFH)
+ return false;
+ }
+
+ return true;
+}
+
+static bool intel_idle_acpi_cst_extract(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct acpi_processor *pr = per_cpu(processors, cpu);
+
+ if (!pr)
+ continue;
+
+ if (acpi_processor_evaluate_cst(pr->handle, cpu, &acpi_state_table))
+ continue;
+
+ acpi_state_table.count++;
+
+ if (!intel_idle_cst_usable())
+ continue;
+
+ if (!acpi_processor_claim_cst_control()) {
+ acpi_state_table.count = 0;
+ return false;
+ }
+
+ return true;
+ }
+
+ pr_debug("ACPI _CST not found or not usable\n");
+ return false;
+}
+
+static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv)
+{
+ int cstate, limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count);
+
+ /*
+ * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of
+ * the interesting states are ACPI_CSTATE_FFH.
+ */
+ for (cstate = 1; cstate < limit; cstate++) {
+ struct acpi_processor_cx *cx;
+ struct cpuidle_state *state;
+
+ if (intel_idle_max_cstate_reached(cstate))
+ break;
+
+ cx = &acpi_state_table.states[cstate];
+
+ state = &drv->states[drv->state_count++];
+
+ snprintf(state->name, CPUIDLE_NAME_LEN, "C%d_ACPI", cstate);
+ strlcpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
+ state->exit_latency = cx->latency;
+ /*
+ * For C1-type C-states use the same number for both the exit
+ * latency and target residency, because that is the case for
+ * C1 in the majority of the static C-states tables above.
+ * For the other types of C-states, however, set the target
+ * residency to 3 times the exit latency which should lead to
+ * a reasonable balance between energy-efficiency and
+ * performance in the majority of interesting cases.
+ */
+ state->target_residency = cx->latency;
+ if (cx->type > ACPI_STATE_C1)
+ state->target_residency *= 3;
+
+ state->flags = MWAIT2flg(cx->address);
+ if (cx->type > ACPI_STATE_C2)
+ state->flags |= CPUIDLE_FLAG_TLB_FLUSHED;
+
+ state->enter = intel_idle;
+ state->enter_s2idle = intel_idle_s2idle;
+ }
+}
+#else /* !CONFIG_ACPI_PROCESSOR_CSTATE */
+static inline bool intel_idle_acpi_cst_extract(void) { return false; }
+static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { }
+#endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */
+
/*
* intel_idle_probe()
*/
@@ -1125,17 +1249,15 @@ static int __init intel_idle_probe(void)
}
id = x86_match_cpu(intel_idle_ids);
- if (!id) {
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6)
- pr_debug("does not run on family %d model %d\n",
- boot_cpu_data.x86, boot_cpu_data.x86_model);
- return -ENODEV;
- }
-
- if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
- pr_debug("Please enable MWAIT in BIOS SETUP\n");
- return -ENODEV;
+ if (id) {
+ if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
+ pr_debug("Please enable MWAIT in BIOS SETUP\n");
+ return -ENODEV;
+ }
+ } else {
+ id = x86_match_cpu(intel_mwait_ids);
+ if (!id)
+ return -ENODEV;
}
if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
@@ -1151,7 +1273,10 @@ static int __init intel_idle_probe(void)
pr_debug("MWAIT substates: 0x%x\n", mwait_substates);
icpu = (const struct idle_cpu *)id->driver_data;
- cpuidle_state_table = icpu->state_table;
+ if (icpu)
+ cpuidle_state_table = icpu->state_table;
+ else if (!intel_idle_acpi_cst_extract())
+ return -ENODEV;
pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
boot_cpu_data.x86_model);
@@ -1333,31 +1458,19 @@ static void intel_idle_state_table_update(void)
}
}
-/*
- * intel_idle_cpuidle_driver_init()
- * allocate, initialize cpuidle_states
- */
-static void __init intel_idle_cpuidle_driver_init(void)
+static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
{
int cstate;
- struct cpuidle_driver *drv = &intel_idle_driver;
-
- intel_idle_state_table_update();
-
- cpuidle_poll_state_init(drv);
- drv->state_count = 1;
for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
unsigned int mwait_hint;
- if (!cpuidle_state_table[cstate].enter &&
- !cpuidle_state_table[cstate].enter_s2idle)
+ if (intel_idle_max_cstate_reached(cstate))
break;
- if (cstate + 1 > max_cstate) {
- pr_info("max_cstate %d reached\n", max_cstate);
+ if (!cpuidle_state_table[cstate].enter &&
+ !cpuidle_state_table[cstate].enter_s2idle)
break;
- }
/* If marked as unusable, skip this state. */
if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) {
@@ -1380,6 +1493,24 @@ static void __init intel_idle_cpuidle_driver_init(void)
}
}
+/*
+ * intel_idle_cpuidle_driver_init()
+ * allocate, initialize cpuidle_states
+ */
+static void __init intel_idle_cpuidle_driver_init(void)
+{
+ struct cpuidle_driver *drv = &intel_idle_driver;
+
+ intel_idle_state_table_update();
+
+ cpuidle_poll_state_init(drv);
+ drv->state_count = 1;
+
+ if (icpu)
+ intel_idle_init_cstates_icpu(drv);
+ else
+ intel_idle_init_cstates_acpi(drv);
+}
/*
* intel_idle_cpu_init()
@@ -1398,6 +1529,9 @@ static int intel_idle_cpu_init(unsigned int cpu)
return -EIO;
}
+ if (!icpu)
+ return 0;
+
if (icpu->auto_demotion_disable_flags)
auto_demotion_disable();
--
2.16.4
From: "Rafael J. Wysocki" <[email protected]>
Move the C-state verification and checks from
intel_idle_cpuidle_driver_init() to a separate function,
intel_idle_verify_cstate(), and make the former call it after
checking the CPUIDLE_FLAG_UNUSABLE state flag.
Also fold the increment of drv->state_count into the drv->states[]
assignment.
No intentional functional impact.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
No changes from the RFC version.
---
drivers/idle/intel_idle.c | 49 ++++++++++++++++++++++++-----------------------
1 file changed, 25 insertions(+), 24 deletions(-)
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 75fd2a7b0842..47255d3cf51f 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -944,6 +944,22 @@ static void intel_idle_s2idle(struct cpuidle_device *dev,
mwait_idle_with_hints(eax, ecx);
}
+static bool intel_idle_verify_cstate(unsigned int mwait_hint)
+{
+ unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1;
+ unsigned int num_substates = (mwait_substates >> mwait_cstate * 4) &
+ MWAIT_SUBSTATE_MASK;
+
+ /* Ignore the C-state if there are NO sub-states in CPUID for it. */
+ if (num_substates == 0)
+ return false;
+
+ if (mwait_cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ mark_tsc_unstable("TSC halts in idle states deeper than C2");
+
+ return true;
+}
+
static void __setup_broadcast_timer(bool on)
{
if (on)
@@ -1332,10 +1348,10 @@ static void __init intel_idle_cpuidle_driver_init(void)
drv->state_count = 1;
for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
- int num_substates, mwait_hint, mwait_cstate;
+ unsigned int mwait_hint;
- if ((cpuidle_state_table[cstate].enter == NULL) &&
- (cpuidle_state_table[cstate].enter_s2idle == NULL))
+ if (!cpuidle_state_table[cstate].enter &&
+ !cpuidle_state_table[cstate].enter_s2idle)
break;
if (cstate + 1 > max_cstate) {
@@ -1343,34 +1359,19 @@ static void __init intel_idle_cpuidle_driver_init(void)
break;
}
- mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
- mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint);
-
- /* number of sub-states for this state in CPUID.MWAIT */
- num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4))
- & MWAIT_SUBSTATE_MASK;
-
- /* if NO sub-states for this state in CPUID, skip it */
- if (num_substates == 0)
- continue;
-
- /* if state marked as disabled, skip it */
+ /* If marked as unusable, skip this state. */
if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) {
pr_debug("state %s is disabled\n",
cpuidle_state_table[cstate].name);
continue;
}
+ mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
+ if (!intel_idle_verify_cstate(mwait_hint))
+ continue;
- if (((mwait_cstate + 1) > 2) &&
- !boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
- mark_tsc_unstable("TSC halts in idle"
- " states deeper than C2");
-
- drv->states[drv->state_count] = /* structure copy */
- cpuidle_state_table[cstate];
-
- drv->state_count += 1;
+ /* Structure copy. */
+ drv->states[drv->state_count++] = cpuidle_state_table[cstate];
}
if (icpu->byt_auto_demotion_disable_flag) {
--
2.16.4
From: "Rafael J. Wysocki" <[email protected]>
Add a new module parameter called "no_acpi" to the intel_idle driver,
so that the driver can be prevented from using ACPI _CST via the
kernel command line.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
No changes from the RFC version.
---
drivers/idle/intel_idle.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index a072b84d9595..26fe383bb921 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1138,6 +1138,10 @@ static bool intel_idle_max_cstate_reached(int cstate)
#ifdef CONFIG_ACPI_PROCESSOR_CSTATE
#include <acpi/processor.h>
+static bool no_acpi __read_mostly;
+module_param(no_acpi, bool, 0444);
+MODULE_PARM_DESC(no_acpi, "Do not use ACPI _CST for building the idle states list");
+
static struct acpi_processor_power acpi_state_table;
/**
@@ -1167,6 +1171,11 @@ static bool intel_idle_acpi_cst_extract(void)
{
unsigned int cpu;
+ if (no_acpi) {
+ pr_debug("Not allowed to use ACPI _CST\n");
+ return false;
+ }
+
for_each_possible_cpu(cpu) {
struct acpi_processor *pr = per_cpu(processors, cpu);
--
2.16.4