Extrapolating on the existing framework to track rt/dl utilization using
pelt signals, add a similar mechanism to track thermal pressure. The
difference here from rt/dl utilization tracking is that, instead of
tracking time spent by a cpu running a rt/dl task through util_avg, the
average thermal pressure is tracked through load_avg. This is because
thermal pressure signal is weighted time "delta" capacity unlike util_avg
which is binary. "delta capacity" here means delta between the actual
capacity of a cpu and the decreased capacity a cpu due to a thermal event.
In order to track average thermal pressure, a new sched_avg variable
avg_thermal is introduced. Function update_thermal_load_avg can be called
to do the periodic bookkeeping (accumulate, decay and average) of the
thermal pressure.
Signed-off-by: Thara Gopinath <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
---
v6->v7:
- Added CONFIG_HAVE_SCHED_THERMAL_PRESSURE to stub out
update_thermal_load_avg in unsupported architectures as per
review comments from Peter, Dietmar and Quentin.
- Updated comment for update_thermal_load_avg as per review
comments from Peter and Dietmar.
v7->v8:
- Fixed typo in defining update_thermal_load_avg which was
causing build errors (reported by kbuild test report)
v8->v9:
- Defined thermal_load_avg to read rq->avg_thermal.load_avg and
avoid cacheline miss in unsupported cases as per Peter's
suggestion.
v9->v10:
- Fixed typos in comments as per Amit Kucheria's review comments.
include/trace/events/sched.h | 4 ++++
init/Kconfig | 4 ++++
kernel/sched/pelt.c | 31 +++++++++++++++++++++++++++++++
kernel/sched/pelt.h | 31 +++++++++++++++++++++++++++++++
kernel/sched/sched.h | 3 +++
5 files changed, 73 insertions(+)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 420e80e56e55..a8fb667c669e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -613,6 +613,10 @@ DECLARE_TRACE(pelt_dl_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
+DECLARE_TRACE(pelt_thermal_tp,
+ TP_PROTO(struct rq *rq),
+ TP_ARGS(rq));
+
DECLARE_TRACE(pelt_irq_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
diff --git a/init/Kconfig b/init/Kconfig
index 2a25c769eaaa..8d56902efa70 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -464,6 +464,10 @@ config HAVE_SCHED_AVG_IRQ
depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
depends on SMP
+config HAVE_SCHED_THERMAL_PRESSURE
+ bool "Enable periodic averaging of thermal pressure"
+ depends on SMP
+
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
depends on MULTIUSER
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index bd006b79b360..1fdacbf6fb44 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -367,6 +367,37 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
+#ifdef CONFIG_HAVE_SCHED_THERMAL_PRESSURE
+/*
+ * thermal:
+ *
+ * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
+ *
+ * util_avg and runnable_load_avg are not supported and meaningless.
+ *
+ * Unlike rt/dl utilization tracking that track time spent by a cpu
+ * running a rt/dl task through util_avg, the average thermal pressure is
+ * tracked through load_avg. This is because thermal pressure signal is
+ * time weighted "delta" capacity unlike util_avg which is binary.
+ * "delta capacity" = actual capacity -
+ * capped capacity a cpu due to a thermal event.
+ */
+
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ if (___update_load_sum(now, &rq->avg_thermal,
+ capacity,
+ capacity,
+ capacity)) {
+ ___update_load_avg(&rq->avg_thermal, 1, 1);
+ trace_pelt_thermal_tp(rq);
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
* irq:
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index afff644da065..916979a54782 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+#ifdef CONFIG_HAVE_SCHED_THERMAL_PRESSURE
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_thermal.load_avg);
+}
+#else
+static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
#else
@@ -158,6 +178,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
+static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return 0;
+}
+
static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 12bf82d86156..211411ac0efa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -943,6 +943,9 @@ struct rq {
struct sched_avg avg_dl;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
+#endif
+#ifdef CONFIG_HAVE_SCHED_THERMAL_PRESSURE
+ struct sched_avg avg_thermal;
#endif
u64 idle_stamp;
u64 avg_idle;
--
2.20.1
On 2/21/20 4:52 PM, Thara Gopinath wrote:
> diff --git a/init/Kconfig b/init/Kconfig
> index 2a25c769eaaa..8d56902efa70 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -464,6 +464,10 @@ config HAVE_SCHED_AVG_IRQ
> depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
> depends on SMP
>
> +config HAVE_SCHED_THERMAL_PRESSURE
> + bool "Enable periodic averaging of thermal pressure"
This prompt string makes this symbol user-configurable, but
I don't think that's what you want here.
> + depends on SMP
> +
> config BSD_PROCESS_ACCT
> bool "BSD Process Accounting"
> depends on MULTIUSER
--
~Randy
On 2/21/20 7:59 PM, Randy Dunlap wrote:
> On 2/21/20 4:52 PM, Thara Gopinath wrote:
>> diff --git a/init/Kconfig b/init/Kconfig
>> index 2a25c769eaaa..8d56902efa70 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -464,6 +464,10 @@ config HAVE_SCHED_AVG_IRQ
>> depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
>> depends on SMP
>>
>> +config HAVE_SCHED_THERMAL_PRESSURE
>> + bool "Enable periodic averaging of thermal pressure"
>
> This prompt string makes this symbol user-configurable, but
> I don't think that's what you want here.
Hi Randy,
Thank you for the review.
Actually I thought being user-configurable is a good idea as it will
allow users to easily enable it and see if the benefits their systems.
(I used menuconfig while developing, to enable it).
Do you see a reason why this should not be so?
>
>> + depends on SMP
>> +
>> config BSD_PROCESS_ACCT
>> bool "BSD Process Accounting"
>> depends on MULTIUSER
>
>
--
Warm Regards
Thara
On 2/22/20 10:27 AM, Thara Gopinath wrote:
>
>
> On 2/21/20 7:59 PM, Randy Dunlap wrote:
>> On 2/21/20 4:52 PM, Thara Gopinath wrote:
>>> diff --git a/init/Kconfig b/init/Kconfig
>>> index 2a25c769eaaa..8d56902efa70 100644
>>> --- a/init/Kconfig
>>> +++ b/init/Kconfig
>>> @@ -464,6 +464,10 @@ config HAVE_SCHED_AVG_IRQ
>>> depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
>>> depends on SMP
>>> +config HAVE_SCHED_THERMAL_PRESSURE
>>> + bool "Enable periodic averaging of thermal pressure"
>>
>> This prompt string makes this symbol user-configurable, but
>> I don't think that's what you want here.
>
> Hi Randy,
> Thank you for the review.
> Actually I thought being user-configurable is a good idea as it will allow users to easily enable it and see if the benefits their systems. (I used menuconfig while developing, to enable it).
> Do you see a reason why this should not be so?
>
>>
>>> + depends on SMP
>>> +
>>> config BSD_PROCESS_ACCT
>>> bool "BSD Process Accounting"
>>> depends on MULTIUSER
Hi Thara,
Is there some other way that HAVE_SCHED_THERMAL_PRESSURE can become
set/enabled? for example, is it selected by any other options?
The Kconfig symbols that begin with HAVE_ are usually something that
are platform-specific and are usually set (selected) by other options,
or they are "default y".
In init/Kconfig, I see 15 other HAVE_ Kconfig symbols,
and none of them have user prompt strings. They are either selected
elsewhere or set inside their Kconfig block.
Maybe you just want to rename the Kconfig symbol so that it does not
being with HAVE_.
--
~Randy
On 2/22/20 1:50 PM, Randy Dunlap wrote:
> On 2/22/20 10:27 AM, Thara Gopinath wrote:
>>
>>
>> On 2/21/20 7:59 PM, Randy Dunlap wrote:
>>> On 2/21/20 4:52 PM, Thara Gopinath wrote:
>>>> diff --git a/init/Kconfig b/init/Kconfig
>>>> index 2a25c769eaaa..8d56902efa70 100644
>>>> --- a/init/Kconfig
>>>> +++ b/init/Kconfig
>>>> @@ -464,6 +464,10 @@ config HAVE_SCHED_AVG_IRQ
>>>> depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
>>>> depends on SMP
>>>> +config HAVE_SCHED_THERMAL_PRESSURE
>>>> + bool "Enable periodic averaging of thermal pressure"
>>>
>>> This prompt string makes this symbol user-configurable, but
>>> I don't think that's what you want here.
>>
>> Hi Randy,
>> Thank you for the review.
>> Actually I thought being user-configurable is a good idea as it will allow users to easily enable it and see if the benefits their systems. (I used menuconfig while developing, to enable it).
>> Do you see a reason why this should not be so?
>>
>>>
>>>> + depends on SMP
>>>> +
>>>> config BSD_PROCESS_ACCT
>>>> bool "BSD Process Accounting"
>>>> depends on MULTIUSER
>
> Hi Thara,
> Is there some other way that HAVE_SCHED_THERMAL_PRESSURE can become
> set/enabled? for example, is it selected by any other options?
>
> The Kconfig symbols that begin with HAVE_ are usually something that
> are platform-specific and are usually set (selected) by other options,
> or they are "default y".
>
> In init/Kconfig, I see 15 other HAVE_ Kconfig symbols,
> and none of them have user prompt strings. They are either selected
> elsewhere or set inside their Kconfig block.
>
> Maybe you just want to rename the Kconfig symbol so that it does not
> being with HAVE_.
Hi Randy,
I see what you mean. I will send an update to this patch with HAVE_
removed. It is not selected by any other options. Best is for user to
select it or platform/SoC configs to enable it.
>
>
--
Warm Regards
Thara
On Mon, Feb 24, 2020 at 09:33:22AM -0500, Thara Gopinath wrote:
> I see what you mean. I will send an update to this patch with HAVE_ removed.
> It is not selected by any other options. Best is for user to select it or
> platform/SoC configs to enable it.
Done that for you ;-)
On 2/25/20 10:47 AM, Peter Zijlstra wrote:
> On Mon, Feb 24, 2020 at 09:33:22AM -0500, Thara Gopinath wrote:
>> I see what you mean. I will send an update to this patch with HAVE_ removed.
>> It is not selected by any other options. Best is for user to select it or
>> platform/SoC configs to enable it.
>
> Done that for you ;-)
Thanks!
>
--
Warm Regards
Thara
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 765047932f153265db6ef15be208d6cbfc03dc62
Gitweb: https://git.kernel.org/tip/765047932f153265db6ef15be208d6cbfc03dc62
Author: Thara Gopinath <[email protected]>
AuthorDate: Fri, 21 Feb 2020 19:52:05 -05:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Fri, 06 Mar 2020 12:57:17 +01:00
sched/pelt: Add support to track thermal pressure
Extrapolating on the existing framework to track rt/dl utilization using
pelt signals, add a similar mechanism to track thermal pressure. The
difference here from rt/dl utilization tracking is that, instead of
tracking time spent by a CPU running a RT/DL task through util_avg, the
average thermal pressure is tracked through load_avg. This is because
thermal pressure signal is weighted time "delta" capacity unlike util_avg
which is binary. "delta capacity" here means delta between the actual
capacity of a CPU and the decreased capacity a CPU due to a thermal event.
In order to track average thermal pressure, a new sched_avg variable
avg_thermal is introduced. Function update_thermal_load_avg can be called
to do the periodic bookkeeping (accumulate, decay and average) of the
thermal pressure.
Reviewed-by: Vincent Guittot <[email protected]>
Signed-off-by: Thara Gopinath <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/trace/events/sched.h | 4 ++++
init/Kconfig | 4 ++++
kernel/sched/pelt.c | 31 +++++++++++++++++++++++++++++++
kernel/sched/pelt.h | 31 +++++++++++++++++++++++++++++++
kernel/sched/sched.h | 3 +++
5 files changed, 73 insertions(+)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9c3ebb7..ed168b0 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -618,6 +618,10 @@ DECLARE_TRACE(pelt_dl_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
+DECLARE_TRACE(pelt_thermal_tp,
+ TP_PROTO(struct rq *rq),
+ TP_ARGS(rq));
+
DECLARE_TRACE(pelt_irq_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
diff --git a/init/Kconfig b/init/Kconfig
index 20a6ac3..275c848 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -451,6 +451,10 @@ config HAVE_SCHED_AVG_IRQ
depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
depends on SMP
+config SCHED_THERMAL_PRESSURE
+ bool "Enable periodic averaging of thermal pressure"
+ depends on SMP
+
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
depends on MULTIUSER
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index c40d57a..b647d04 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -368,6 +368,37 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+/*
+ * thermal:
+ *
+ * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
+ *
+ * util_avg and runnable_load_avg are not supported and meaningless.
+ *
+ * Unlike rt/dl utilization tracking that track time spent by a cpu
+ * running a rt/dl task through util_avg, the average thermal pressure is
+ * tracked through load_avg. This is because thermal pressure signal is
+ * time weighted "delta" capacity unlike util_avg which is binary.
+ * "delta capacity" = actual capacity -
+ * capped capacity a cpu due to a thermal event.
+ */
+
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ if (___update_load_sum(now, &rq->avg_thermal,
+ capacity,
+ capacity,
+ capacity)) {
+ ___update_load_avg(&rq->avg_thermal, 1);
+ trace_pelt_thermal_tp(rq);
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
* irq:
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index afff644..eb034d9 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_thermal.load_avg);
+}
+#else
+static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
#else
@@ -159,6 +179,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
}
static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return 0;
+}
+
+static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2a0caf3..6c839f8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -961,6 +961,9 @@ struct rq {
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+ struct sched_avg avg_thermal;
+#endif
u64 idle_stamp;
u64 avg_idle;