2022-04-16 00:47:05

by Thomas Gleixner

[permalink] [raw]
Subject: [patch 09/10] x86/aperfmperf: Replace aperfmperf_get_khz()

The frequency invariance infrastructure provides the APERF/MPERF samples
already. Utilize them for the cpu frequency display in /proc/cpuinfo.

The sample is considered valid for 20ms. So for idle or isolated NOHZ full
CPUs the function returns 0, which is matching the previous behaviour.

This gets rid of the mass IPIs and a delay of 20ms for stabilizing observed
by Eric when reading /proc/cpuinfo.

Reported-by: Eric Dumazet <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/cpu/aperfmperf.c | 77 +++++++++++++++++----------------------
fs/proc/cpuinfo.c | 6 ---
include/linux/cpufreq.h | 1
3 files changed, 35 insertions(+), 49 deletions(-)

--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -100,49 +100,6 @@ static bool aperfmperf_snapshot_cpu(int
return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
}

-unsigned int aperfmperf_get_khz(int cpu)
-{
- if (!cpu_khz)
- return 0;
-
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
-
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- return 0;
-
- if (rcu_is_idle_cpu(cpu))
- return 0; /* Idle CPUs are completely uninteresting. */
-
- aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
- return per_cpu(samples.khz, cpu);
-}
-
-void arch_freq_prepare_all(void)
-{
- ktime_t now = ktime_get();
- bool wait = false;
- int cpu;
-
- if (!cpu_khz)
- return;
-
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return;
-
- for_each_online_cpu(cpu) {
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- continue;
- if (rcu_is_idle_cpu(cpu))
- continue; /* Idle CPUs are completely uninteresting. */
- if (!aperfmperf_snapshot_cpu(cpu, now, false))
- wait = true;
- }
-
- if (wait)
- msleep(APERFMPERF_REFRESH_DELAY_MS);
-}
-
unsigned int arch_freq_get_on_cpu(int cpu)
{
struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
@@ -529,6 +486,40 @@ void arch_scale_freq_tick(void)
scale_freq_tick(acnt, mcnt);
}

+/*
+ * Discard samples older than the defined maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
+unsigned int aperfmperf_get_khz(int cpu)
+{
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned long last;
+ unsigned int seq;
+ u64 acnt, mcnt;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
+
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ return 0;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+}
+
static int __init bp_init_aperfmperf(void)
{
if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -5,14 +5,10 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

-__weak void arch_freq_prepare_all(void)
-{
-}
-
extern const struct seq_operations cpuinfo_op;
+
static int cpuinfo_open(struct inode *inode, struct file *file)
{
- arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}

--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1199,7 +1199,6 @@ static inline void sched_cpufreq_governo
struct cpufreq_governor *old_gov) { }
#endif

-extern void arch_freq_prepare_all(void);
extern unsigned int arch_freq_get_on_cpu(int cpu);

#ifndef arch_set_freq_scale


2022-04-22 20:48:25

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [patch 09/10] x86/aperfmperf: Replace aperfmperf_get_khz()

On Fri, Apr 15, 2022 at 9:20 PM Thomas Gleixner <[email protected]> wrote:
>
> The frequency invariance infrastructure provides the APERF/MPERF samples
> already. Utilize them for the cpu frequency display in /proc/cpuinfo.
>
> The sample is considered valid for 20ms. So for idle or isolated NOHZ full
> CPUs the function returns 0, which is matching the previous behaviour.
>
> This gets rid of the mass IPIs and a delay of 20ms for stabilizing observed
> by Eric when reading /proc/cpuinfo.
>
> Reported-by: Eric Dumazet <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>

All fine IMV, one minor nit below.


Reviewed-by: Rafael J. Wysocki <[email protected]>

> ---
> arch/x86/kernel/cpu/aperfmperf.c | 77 +++++++++++++++++----------------------
> fs/proc/cpuinfo.c | 6 ---
> include/linux/cpufreq.h | 1
> 3 files changed, 35 insertions(+), 49 deletions(-)
>
> --- a/arch/x86/kernel/cpu/aperfmperf.c
> +++ b/arch/x86/kernel/cpu/aperfmperf.c
> @@ -100,49 +100,6 @@ static bool aperfmperf_snapshot_cpu(int
> return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
> }
>
> -unsigned int aperfmperf_get_khz(int cpu)
> -{
> - if (!cpu_khz)
> - return 0;
> -
> - if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
> - return 0;
> -
> - if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
> - return 0;
> -
> - if (rcu_is_idle_cpu(cpu))
> - return 0; /* Idle CPUs are completely uninteresting. */
> -
> - aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
> - return per_cpu(samples.khz, cpu);
> -}
> -
> -void arch_freq_prepare_all(void)
> -{
> - ktime_t now = ktime_get();
> - bool wait = false;
> - int cpu;
> -
> - if (!cpu_khz)
> - return;
> -
> - if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
> - return;
> -
> - for_each_online_cpu(cpu) {
> - if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
> - continue;
> - if (rcu_is_idle_cpu(cpu))
> - continue; /* Idle CPUs are completely uninteresting. */
> - if (!aperfmperf_snapshot_cpu(cpu, now, false))
> - wait = true;
> - }
> -
> - if (wait)
> - msleep(APERFMPERF_REFRESH_DELAY_MS);
> -}
> -
> unsigned int arch_freq_get_on_cpu(int cpu)
> {
> struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
> @@ -529,6 +486,40 @@ void arch_scale_freq_tick(void)
> scale_freq_tick(acnt, mcnt);
> }
>
> +/*
> + * Discard samples older than the defined maximum sample age of 20ms. There
> + * is no point in sending IPIs in such a case. If the scheduler tick was
> + * not running then the CPU is either idle or isolated.
> + */
> +#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
> +
> +unsigned int aperfmperf_get_khz(int cpu)
> +{
> + struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
> + unsigned long last;
> + unsigned int seq;
> + u64 acnt, mcnt;
> +
> + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
> + return 0;
> +
> + do {
> + seq = raw_read_seqcount_begin(&s->seq);
> + last = s->last_update;
> + acnt = s->acnt;
> + mcnt = s->mcnt;
> + } while (read_seqcount_retry(&s->seq, seq));
> +
> + /*
> + * Bail on invalid count and when the last update was too long ago,
> + * which covers idle and NOHZ full CPUs.
> + */
> + if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)

The inner parens are not needed here.

> + return 0;
> +
> + return div64_u64((cpu_khz * acnt), mcnt);
> +}
> +
> static int __init bp_init_aperfmperf(void)
> {
> if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
> --- a/fs/proc/cpuinfo.c
> +++ b/fs/proc/cpuinfo.c
> @@ -5,14 +5,10 @@
> #include <linux/proc_fs.h>
> #include <linux/seq_file.h>
>
> -__weak void arch_freq_prepare_all(void)
> -{
> -}
> -
> extern const struct seq_operations cpuinfo_op;
> +
> static int cpuinfo_open(struct inode *inode, struct file *file)
> {
> - arch_freq_prepare_all();
> return seq_open(file, &cpuinfo_op);
> }
>
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -1199,7 +1199,6 @@ static inline void sched_cpufreq_governo
> struct cpufreq_governor *old_gov) { }
> #endif
>
> -extern void arch_freq_prepare_all(void);
> extern unsigned int arch_freq_get_on_cpu(int cpu);
>
> #ifndef arch_set_freq_scale
>

Subject: [tip: x86/cleanups] x86/aperfmperf: Replace aperfmperf_get_khz()

The following commit has been merged into the x86/cleanups branch of tip:

Commit-ID: 61551f094837f77952eba2fdf8b913bb5b191ced
Gitweb: https://git.kernel.org/tip/61551f094837f77952eba2fdf8b913bb5b191ced
Author: Thomas Gleixner <[email protected]>
AuthorDate: Fri, 15 Apr 2022 21:20:02 +02:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 27 Apr 2022 15:51:08 +02:00

x86/aperfmperf: Replace aperfmperf_get_khz()

The frequency invariance infrastructure provides the APERF/MPERF samples
already. Utilize them for the cpu frequency display in /proc/cpuinfo.

The sample is considered valid for 20ms. So for idle or isolated NOHZ full
CPUs the function returns 0, which is matching the previous behaviour.

This gets rid of the mass IPIs and a delay of 20ms for stabilizing observed
by Eric when reading /proc/cpuinfo.

Reported-by: Eric Dumazet <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: Eric Dumazet <[email protected]>
Reviewed-by: Rafael J. Wysocki <[email protected]>
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Paul E. McKenney <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
arch/x86/kernel/cpu/aperfmperf.c | 77 +++++++++++++------------------
fs/proc/cpuinfo.c | 6 +--
include/linux/cpufreq.h | 1 +-
3 files changed, 35 insertions(+), 49 deletions(-)

diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 963c069..e9d2da7 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -101,49 +101,6 @@ static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
}

-unsigned int aperfmperf_get_khz(int cpu)
-{
- if (!cpu_khz)
- return 0;
-
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
-
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- return 0;
-
- if (rcu_is_idle_cpu(cpu))
- return 0; /* Idle CPUs are completely uninteresting. */
-
- aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
- return per_cpu(samples.khz, cpu);
-}
-
-void arch_freq_prepare_all(void)
-{
- ktime_t now = ktime_get();
- bool wait = false;
- int cpu;
-
- if (!cpu_khz)
- return;
-
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return;
-
- for_each_online_cpu(cpu) {
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- continue;
- if (rcu_is_idle_cpu(cpu))
- continue; /* Idle CPUs are completely uninteresting. */
- if (!aperfmperf_snapshot_cpu(cpu, now, false))
- wait = true;
- }
-
- if (wait)
- msleep(APERFMPERF_REFRESH_DELAY_MS);
-}
-
unsigned int arch_freq_get_on_cpu(int cpu)
{
struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
@@ -530,6 +487,40 @@ void arch_scale_freq_tick(void)
scale_freq_tick(acnt, mcnt);
}

+/*
+ * Discard samples older than the defined maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
+unsigned int aperfmperf_get_khz(int cpu)
+{
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned long last;
+ unsigned int seq;
+ u64 acnt, mcnt;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
+
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ return 0;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+}
+
static int __init bp_init_aperfmperf(void)
{
if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 419760f..f38bda5 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -5,14 +5,10 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

-__weak void arch_freq_prepare_all(void)
-{
-}
-
extern const struct seq_operations cpuinfo_op;
+
static int cpuinfo_open(struct inode *inode, struct file *file)
{
- arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 35c7d6d..d5595d5 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1199,7 +1199,6 @@ static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
struct cpufreq_governor *old_gov) { }
#endif

-extern void arch_freq_prepare_all(void);
extern unsigned int arch_freq_get_on_cpu(int cpu);

#ifndef arch_set_freq_scale

Subject: [tip: x86/cleanups] x86/aperfmperf: Replace aperfmperf_get_khz()

The following commit has been merged into the x86/cleanups branch of tip:

Commit-ID: 7d84c1ebf9ddafca27b481e6da7d24a023dacaa2
Gitweb: https://git.kernel.org/tip/7d84c1ebf9ddafca27b481e6da7d24a023dacaa2
Author: Thomas Gleixner <[email protected]>
AuthorDate: Fri, 15 Apr 2022 21:20:02 +02:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 27 Apr 2022 20:22:19 +02:00

x86/aperfmperf: Replace aperfmperf_get_khz()

The frequency invariance infrastructure provides the APERF/MPERF samples
already. Utilize them for the cpu frequency display in /proc/cpuinfo.

The sample is considered valid for 20ms. So for idle or isolated NOHZ full
CPUs the function returns 0, which is matching the previous behaviour.

This gets rid of the mass IPIs and a delay of 20ms for stabilizing observed
by Eric when reading /proc/cpuinfo.

Reported-by: Eric Dumazet <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: Eric Dumazet <[email protected]>
Reviewed-by: Rafael J. Wysocki <[email protected]>
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Paul E. McKenney <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
arch/x86/kernel/cpu/aperfmperf.c | 77 +++++++++++++------------------
fs/proc/cpuinfo.c | 6 +--
include/linux/cpufreq.h | 1 +-
3 files changed, 35 insertions(+), 49 deletions(-)

diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 963c069..e9d2da7 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -101,49 +101,6 @@ static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
}

-unsigned int aperfmperf_get_khz(int cpu)
-{
- if (!cpu_khz)
- return 0;
-
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
-
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- return 0;
-
- if (rcu_is_idle_cpu(cpu))
- return 0; /* Idle CPUs are completely uninteresting. */
-
- aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
- return per_cpu(samples.khz, cpu);
-}
-
-void arch_freq_prepare_all(void)
-{
- ktime_t now = ktime_get();
- bool wait = false;
- int cpu;
-
- if (!cpu_khz)
- return;
-
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return;
-
- for_each_online_cpu(cpu) {
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- continue;
- if (rcu_is_idle_cpu(cpu))
- continue; /* Idle CPUs are completely uninteresting. */
- if (!aperfmperf_snapshot_cpu(cpu, now, false))
- wait = true;
- }
-
- if (wait)
- msleep(APERFMPERF_REFRESH_DELAY_MS);
-}
-
unsigned int arch_freq_get_on_cpu(int cpu)
{
struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
@@ -530,6 +487,40 @@ void arch_scale_freq_tick(void)
scale_freq_tick(acnt, mcnt);
}

+/*
+ * Discard samples older than the defined maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
+unsigned int aperfmperf_get_khz(int cpu)
+{
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned long last;
+ unsigned int seq;
+ u64 acnt, mcnt;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
+
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ return 0;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+}
+
static int __init bp_init_aperfmperf(void)
{
if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 419760f..f38bda5 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -5,14 +5,10 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

-__weak void arch_freq_prepare_all(void)
-{
-}
-
extern const struct seq_operations cpuinfo_op;
+
static int cpuinfo_open(struct inode *inode, struct file *file)
{
- arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 35c7d6d..d5595d5 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1199,7 +1199,6 @@ static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
struct cpufreq_governor *old_gov) { }
#endif

-extern void arch_freq_prepare_all(void);
extern unsigned int arch_freq_get_on_cpu(int cpu);

#ifndef arch_set_freq_scale