2023-07-21 16:01:08

by Lukasz Luba

[permalink] [raw]
Subject: [PATCH v3 06/12] PM: EM: Refactor struct em_perf_domain and add default_table

The Energy Model is going to support runtime modifications. Refactor old
implementation which accessed struct em_perf_state and introduce
em_perf_domain::default_table to clean up the design. This new field
will help to better distinguish 2 performance state tables.

Update all drivers or frameworks which used the old field:
em_perf_domain::table and now should use em_perf_domain::default_table.

Signed-off-by: Lukasz Luba <[email protected]>
---
drivers/powercap/dtpm_cpu.c | 27 +++++++++++++++++++--------
drivers/powercap/dtpm_devfreq.c | 23 ++++++++++++++++-------
drivers/thermal/cpufreq_cooling.c | 23 +++++++++++++++--------
drivers/thermal/devfreq_cooling.c | 23 +++++++++++++++++------
include/linux/energy_model.h | 14 ++++++++++++--
kernel/power/energy_model.c | 22 ++++++++++++++++++----
6 files changed, 97 insertions(+), 35 deletions(-)

diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c
index 2ff7717530bf..743a0ac8ecdf 100644
--- a/drivers/powercap/dtpm_cpu.c
+++ b/drivers/powercap/dtpm_cpu.c
@@ -43,6 +43,7 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
{
struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm);
struct em_perf_domain *pd = em_cpu_get(dtpm_cpu->cpu);
+ struct em_perf_state *table;
struct cpumask cpus;
unsigned long freq;
u64 power;
@@ -51,19 +52,21 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus));
nr_cpus = cpumask_weight(&cpus);

+ table = pd->default_table->state;
+
for (i = 0; i < pd->nr_perf_states; i++) {

- power = pd->table[i].power * nr_cpus;
+ power = table[i].power * nr_cpus;

if (power > power_limit)
break;
}

- freq = pd->table[i - 1].frequency;
+ freq = table[i - 1].frequency;

freq_qos_update_request(&dtpm_cpu->qos_req, freq);

- power_limit = pd->table[i - 1].power * nr_cpus;
+ power_limit = table[i - 1].power * nr_cpus;

return power_limit;
}
@@ -88,12 +91,14 @@ static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power)
static u64 get_pd_power_uw(struct dtpm *dtpm)
{
struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm);
+ struct em_perf_state *table;
struct em_perf_domain *pd;
struct cpumask *pd_mask;
unsigned long freq;
int i;

pd = em_cpu_get(dtpm_cpu->cpu);
+ table = pd->default_table->state;

pd_mask = em_span_cpus(pd);

@@ -101,10 +106,10 @@ static u64 get_pd_power_uw(struct dtpm *dtpm)

for (i = 0; i < pd->nr_perf_states; i++) {

- if (pd->table[i].frequency < freq)
+ if (table[i].frequency < freq)
continue;

- return scale_pd_power_uw(pd_mask, pd->table[i].power *
+ return scale_pd_power_uw(pd_mask, table[i].power *
MICROWATT_PER_MILLIWATT);
}

@@ -115,17 +120,20 @@ static int update_pd_power_uw(struct dtpm *dtpm)
{
struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm);
struct em_perf_domain *em = em_cpu_get(dtpm_cpu->cpu);
+ struct em_perf_state *table;
struct cpumask cpus;
int nr_cpus;

cpumask_and(&cpus, cpu_online_mask, to_cpumask(em->cpus));
nr_cpus = cpumask_weight(&cpus);

- dtpm->power_min = em->table[0].power;
+ table = em->default_table->state;
+
+ dtpm->power_min = table[0].power;
dtpm->power_min *= MICROWATT_PER_MILLIWATT;
dtpm->power_min *= nr_cpus;

- dtpm->power_max = em->table[em->nr_perf_states - 1].power;
+ dtpm->power_max = table[em->nr_perf_states - 1].power;
dtpm->power_max *= MICROWATT_PER_MILLIWATT;
dtpm->power_max *= nr_cpus;

@@ -182,6 +190,7 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent)
{
struct dtpm_cpu *dtpm_cpu;
struct cpufreq_policy *policy;
+ struct em_perf_state *table;
struct em_perf_domain *pd;
char name[CPUFREQ_NAME_LEN];
int ret = -ENOMEM;
@@ -198,6 +207,8 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent)
if (!pd || em_is_artificial(pd))
return -EINVAL;

+ table = pd->default_table->state;
+
dtpm_cpu = kzalloc(sizeof(*dtpm_cpu), GFP_KERNEL);
if (!dtpm_cpu)
return -ENOMEM;
@@ -216,7 +227,7 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent)

ret = freq_qos_add_request(&policy->constraints,
&dtpm_cpu->qos_req, FREQ_QOS_MAX,
- pd->table[pd->nr_perf_states - 1].frequency);
+ table[pd->nr_perf_states - 1].frequency);
if (ret)
goto out_dtpm_unregister;

diff --git a/drivers/powercap/dtpm_devfreq.c b/drivers/powercap/dtpm_devfreq.c
index 91276761a31d..6ef0f2b4a683 100644
--- a/drivers/powercap/dtpm_devfreq.c
+++ b/drivers/powercap/dtpm_devfreq.c
@@ -37,11 +37,14 @@ static int update_pd_power_uw(struct dtpm *dtpm)
struct devfreq *devfreq = dtpm_devfreq->devfreq;
struct device *dev = devfreq->dev.parent;
struct em_perf_domain *pd = em_pd_get(dev);
+ struct em_perf_state *table;

- dtpm->power_min = pd->table[0].power;
+ table = pd->default_table->state;
+
+ dtpm->power_min = table[0].power;
dtpm->power_min *= MICROWATT_PER_MILLIWATT;

- dtpm->power_max = pd->table[pd->nr_perf_states - 1].power;
+ dtpm->power_max = table[pd->nr_perf_states - 1].power;
dtpm->power_max *= MICROWATT_PER_MILLIWATT;

return 0;
@@ -53,22 +56,25 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
struct devfreq *devfreq = dtpm_devfreq->devfreq;
struct device *dev = devfreq->dev.parent;
struct em_perf_domain *pd = em_pd_get(dev);
+ struct em_perf_state *table;
unsigned long freq;
u64 power;
int i;

+ table = pd->default_table->state;
+
for (i = 0; i < pd->nr_perf_states; i++) {

- power = pd->table[i].power * MICROWATT_PER_MILLIWATT;
+ power = table[i].power * MICROWATT_PER_MILLIWATT;
if (power > power_limit)
break;
}

- freq = pd->table[i - 1].frequency;
+ freq = table[i - 1].frequency;

dev_pm_qos_update_request(&dtpm_devfreq->qos_req, freq);

- power_limit = pd->table[i - 1].power * MICROWATT_PER_MILLIWATT;
+ power_limit = table[i - 1].power * MICROWATT_PER_MILLIWATT;

return power_limit;
}
@@ -94,6 +100,7 @@ static u64 get_pd_power_uw(struct dtpm *dtpm)
struct device *dev = devfreq->dev.parent;
struct em_perf_domain *pd = em_pd_get(dev);
struct devfreq_dev_status status;
+ struct em_perf_state *table;
unsigned long freq;
u64 power;
int i;
@@ -102,15 +109,17 @@ static u64 get_pd_power_uw(struct dtpm *dtpm)
status = devfreq->last_status;
mutex_unlock(&devfreq->lock);

+ table = pd->default_table->state;
+
freq = DIV_ROUND_UP(status.current_frequency, HZ_PER_KHZ);
_normalize_load(&status);

for (i = 0; i < pd->nr_perf_states; i++) {

- if (pd->table[i].frequency < freq)
+ if (table[i].frequency < freq)
continue;

- power = pd->table[i].power * MICROWATT_PER_MILLIWATT;
+ power = table[i].power * MICROWATT_PER_MILLIWATT;
power *= status.busy_time;
power >>= 10;

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index e2cc7bd30862..1d979c5e05ed 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -91,10 +91,11 @@ struct cpufreq_cooling_device {
static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
unsigned int freq)
{
+ struct em_perf_state *table = cpufreq_cdev->em->default_table->state;
int i;

for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
- if (freq > cpufreq_cdev->em->table[i].frequency)
+ if (freq > table[i].frequency)
break;
}

@@ -104,15 +105,16 @@ static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
u32 freq)
{
+ struct em_perf_state *table = cpufreq_cdev->em->default_table->state;
unsigned long power_mw;
int i;

for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
- if (freq > cpufreq_cdev->em->table[i].frequency)
+ if (freq > table[i].frequency)
break;
}

- power_mw = cpufreq_cdev->em->table[i + 1].power;
+ power_mw = table[i + 1].power;
power_mw /= MICROWATT_PER_MILLIWATT;

return power_mw;
@@ -121,18 +123,19 @@ static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
u32 power)
{
+ struct em_perf_state *table = cpufreq_cdev->em->default_table->state;
unsigned long em_power_mw;
int i;

for (i = cpufreq_cdev->max_level; i > 0; i--) {
/* Convert EM power to milli-Watts to make safe comparison */
- em_power_mw = cpufreq_cdev->em->table[i].power;
+ em_power_mw = table[i].power;
em_power_mw /= MICROWATT_PER_MILLIWATT;
if (power >= em_power_mw)
break;
}

- return cpufreq_cdev->em->table[i].frequency;
+ return table[i].frequency;
}

/**
@@ -262,8 +265,9 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
static int cpufreq_state2power(struct thermal_cooling_device *cdev,
unsigned long state, u32 *power)
{
- unsigned int freq, num_cpus, idx;
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
+ unsigned int freq, num_cpus, idx;
+ struct em_perf_state *table;

/* Request state should be less than max_level */
if (state > cpufreq_cdev->max_level)
@@ -271,8 +275,9 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,

num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus);

+ table = cpufreq_cdev->em->default_table->state;
idx = cpufreq_cdev->max_level - state;
- freq = cpufreq_cdev->em->table[idx].frequency;
+ freq = table[idx].frequency;
*power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus;

return 0;
@@ -378,8 +383,10 @@ static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
/* Use the Energy Model table if available */
if (cpufreq_cdev->em) {
+ struct em_perf_state *table;
+ table = cpufreq_cdev->em->default_table->state;
idx = cpufreq_cdev->max_level - state;
- return cpufreq_cdev->em->table[idx].frequency;
+ return table[idx].frequency;
}
#endif

diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index 262e62ab6cf2..4207ef850582 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -87,6 +87,7 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
struct devfreq_cooling_device *dfc = cdev->devdata;
struct devfreq *df = dfc->devfreq;
struct device *dev = df->dev.parent;
+ struct em_perf_state *table;
unsigned long freq;
int perf_idx;

@@ -99,8 +100,9 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
return -EINVAL;

if (dfc->em_pd) {
+ table = dfc->em_pd->default_table->state;
perf_idx = dfc->max_state - state;
- freq = dfc->em_pd->table[perf_idx].frequency * 1000;
+ freq = table[perf_idx].frequency * 1000;
} else {
freq = dfc->freq_table[state];
}
@@ -123,10 +125,11 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
*/
static int get_perf_idx(struct em_perf_domain *em_pd, unsigned long freq)
{
+ struct em_perf_state *table = em_pd->default_table->state;
int i;

for (i = 0; i < em_pd->nr_perf_states; i++) {
- if (em_pd->table[i].frequency == freq)
+ if (table[i].frequency == freq)
return i;
}

@@ -181,6 +184,7 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
struct devfreq_cooling_device *dfc = cdev->devdata;
struct devfreq *df = dfc->devfreq;
struct devfreq_dev_status status;
+ struct em_perf_state *table;
unsigned long state;
unsigned long freq;
unsigned long voltage;
@@ -192,6 +196,8 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd

freq = status.current_frequency;

+ table = dfc->em_pd->default_table->state;
+
if (dfc->power_ops && dfc->power_ops->get_real_power) {
voltage = get_voltage(df, freq);
if (voltage == 0) {
@@ -204,7 +210,7 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
state = dfc->capped_state;

/* Convert EM power into milli-Watts first */
- dfc->res_util = dfc->em_pd->table[state].power;
+ dfc->res_util = table[state].power;
dfc->res_util /= MICROWATT_PER_MILLIWATT;

dfc->res_util *= SCALE_ERROR_MITIGATION;
@@ -225,7 +231,7 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
_normalize_load(&status);

/* Convert EM power into milli-Watts first */
- *power = dfc->em_pd->table[perf_idx].power;
+ *power = table[perf_idx].power;
*power /= MICROWATT_PER_MILLIWATT;
/* Scale power for utilization */
*power *= status.busy_time;
@@ -245,13 +251,15 @@ static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev,
unsigned long state, u32 *power)
{
struct devfreq_cooling_device *dfc = cdev->devdata;
+ struct em_perf_state *table;
int perf_idx;

if (state > dfc->max_state)
return -EINVAL;

+ table = dfc->em_pd->default_table->state;
perf_idx = dfc->max_state - state;
- *power = dfc->em_pd->table[perf_idx].power;
+ *power = table[perf_idx].power;
*power /= MICROWATT_PER_MILLIWATT;

return 0;
@@ -264,6 +272,7 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
struct devfreq *df = dfc->devfreq;
struct devfreq_dev_status status;
unsigned long freq, em_power_mw;
+ struct em_perf_state *table;
s32 est_power;
int i;

@@ -273,6 +282,8 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,

freq = status.current_frequency;

+ table = dfc->em_pd->default_table->state;
+
if (dfc->power_ops && dfc->power_ops->get_real_power) {
/* Scale for resource utilization */
est_power = power * dfc->res_util;
@@ -290,7 +301,7 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
*/
for (i = dfc->max_state; i > 0; i--) {
/* Convert EM power to milli-Watts to make safe comparison */
- em_power_mw = dfc->em_pd->table[i].power;
+ em_power_mw = table[i].power;
em_power_mw /= MICROWATT_PER_MILLIWATT;
if (est_power >= em_power_mw)
break;
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 8069f526c9d8..90c0822b664b 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -36,9 +36,19 @@ struct em_perf_state {
*/
#define EM_PERF_STATE_INEFFICIENT BIT(0)

+/**
+ * struct em_perf_table - Performance states table
+ * @state: List of performance states, in ascending order
+ * @rcu: RCU used for safe access and destruction
+ */
+struct em_perf_table {
+ struct em_perf_state *state;
+ struct rcu_head rcu;
+};
+
/**
* struct em_perf_domain - Performance domain
- * @table: List of performance states, in ascending order
+ * @default_table: Pointer to the default em_perf_table
* @nr_perf_states: Number of performance states
* @flags: See "em_perf_domain flags"
* @cpus: Cpumask covering the CPUs of the domain. It's here
@@ -53,7 +63,7 @@ struct em_perf_state {
* field is unused.
*/
struct em_perf_domain {
- struct em_perf_state *table;
+ struct em_perf_table *default_table;
int nr_perf_states;
unsigned long flags;
unsigned long cpus[];
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 5ecb73b36995..6cd94f92701d 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -66,6 +66,7 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_flags);

static void em_debug_create_pd(struct device *dev)
{
+ struct em_perf_table *table = dev->em_pd->default_table;
struct dentry *d;
int i;

@@ -81,7 +82,7 @@ static void em_debug_create_pd(struct device *dev)

/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
- em_debug_create_ps(&dev->em_pd->table[i], d);
+ em_debug_create_ps(&table->state[i], d);

}

@@ -196,7 +197,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
if (ret)
goto free_ps_table;

- pd->table = table;
+ pd->default_table->state = table;
pd->nr_perf_states = nr_states;

return 0;
@@ -210,6 +211,7 @@ static int em_create_pd(struct device *dev, int nr_states,
struct em_data_callback *cb, cpumask_t *cpus,
unsigned long flags)
{
+ struct em_perf_table *default_table;
struct em_perf_domain *pd;
struct device *cpu_dev;
int cpu, ret, num_cpus;
@@ -234,8 +236,17 @@ static int em_create_pd(struct device *dev, int nr_states,
return -ENOMEM;
}

+ default_table = kzalloc(sizeof(*default_table), GFP_KERNEL);
+ if (!default_table) {
+ kfree(pd);
+ return -ENOMEM;
+ }
+
+ pd->default_table = default_table;
+
ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
if (ret) {
+ kfree(default_table);
kfree(pd);
return ret;
}
@@ -358,6 +369,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
bool microwatts)
{
unsigned long cap, prev_cap = 0;
+ struct em_perf_state *table;
unsigned long flags = 0;
int cpu, ret;

@@ -416,7 +428,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,

dev->em_pd->flags |= flags;

- em_cpufreq_update_efficiencies(dev, dev->em_pd->table);
+ table = dev->em_pd->default_table->state;
+ em_cpufreq_update_efficiencies(dev, table);

em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
@@ -449,7 +462,8 @@ void em_dev_unregister_perf_domain(struct device *dev)
mutex_lock(&em_pd_mutex);
em_debug_remove_pd(dev);

- kfree(dev->em_pd->table);
+ kfree(pd->default_table->state);
+ kfree(pd->default_table);
kfree(dev->em_pd);
dev->em_pd = NULL;
mutex_unlock(&em_pd_mutex);
--
2.25.1



2023-08-16 20:16:26

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [PATCH v3 06/12] PM: EM: Refactor struct em_perf_domain and add default_table

On 21/07/2023 17:50, Lukasz Luba wrote:
> The Energy Model is going to support runtime modifications. Refactor old
> implementation which accessed struct em_perf_state and introduce
> em_perf_domain::default_table to clean up the design. This new field
> will help to better distinguish 2 performance state tables.
>
> Update all drivers or frameworks which used the old field:
> em_perf_domain::table and now should use em_perf_domain::default_table.
>

This doesn't compile:

SYNC include/config/auto.conf.cmd
UPD include/config/kernel.release
UPD include/generated/utsrelease.h
CC arch/arm64/kernel/asm-offsets.s
In file included from ./include/linux/device.h:16,
from ./include/linux/acpi.h:14,
from ./include/acpi/apei.h:9,
from ./include/acpi/ghes.h:5,
from ./include/linux/arm_sdei.h:8,
from arch/arm64/kernel/asm-offsets.c:10:
./include/linux/energy_model.h: In function ‘em_cpu_energy’:
./include/linux/energy_model.h:256:10: error: ‘struct em_perf_domain’ has no member named ‘table’
256 | ps = &pd->table[pd->nr_perf_states - 1];
| ^~
./include/linux/energy_model.h:266:34: error: ‘struct em_perf_domain’ has no member named ‘table’
266 | i = em_pd_get_efficient_state(pd->table, pd->nr_perf_states, freq,
| ^~
./include/linux/energy_model.h:268:10: error: ‘struct em_perf_domain’ has no member named ‘table’
268 | ps = &pd->table[i];
| ^~
make[2]: *** [scripts/Makefile.build:116: arch/arm64/kernel/asm-offsets.s] Error 1
make[1]: *** [/opt/git/kernel_org/Makefile:1275: prepare0] Error 2
make: *** [Makefile:234: __sub-make] Error 2

In v2 this patch was much later in the set.