2021-08-12 20:49:25

by Josh Don

[permalink] [raw]
Subject: [PATCH] fs/proc/uptime.c: fix idle time reporting in /proc/uptime

/proc/uptime reports idle time by reading the CPUTIME_IDLE field from
the per-cpu kcpustats. However, on NO_HZ systems, idle time is not
continually updated on idle cpus, leading this value to appear
incorrectly small.

/proc/stat performs an accounting update when reading idle time; we can
use the same approach for uptime.

With this patch, /proc/stat and /proc/uptime now agree on idle time.
Additionally, the following shows idle time tick up consistently on an
idle machine:
(while true; do cat /proc/uptime; sleep 1; done) | awk '{print $2-prev; prev=$2}'

Reported-by: Luigi Rizzo <[email protected]>
Signed-off-by: Josh Don <[email protected]>
---
fs/proc/stat.c | 26 --------------------------
fs/proc/uptime.c | 13 ++++++++-----
include/linux/kernel_stat.h | 1 +
kernel/sched/cputime.c | 28 ++++++++++++++++++++++++++++
4 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6561a06ef905..99796a8a5223 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -24,16 +24,6 @@

#ifdef arch_idle_time

-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 idle;
-
- idle = kcs->cpustat[CPUTIME_IDLE];
- if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
- idle += arch_idle_time(cpu);
- return idle;
-}
-
static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait;
@@ -46,22 +36,6 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)

#else

-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 idle, idle_usecs = -1ULL;
-
- if (cpu_online(cpu))
- idle_usecs = get_cpu_idle_time_us(cpu, NULL);
-
- if (idle_usecs == -1ULL)
- /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcs->cpustat[CPUTIME_IDLE];
- else
- idle = idle_usecs * NSEC_PER_USEC;
-
- return idle;
-}
-
static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait, iowait_usecs = -1ULL;
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 5a1b228964fb..c900f354ef93 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -12,18 +12,21 @@ static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec64 uptime;
struct timespec64 idle;
- u64 nsec;
+ const struct kernel_cpustat *kcs;
+ u64 idle_nsec;
u32 rem;
int i;

- nsec = 0;
- for_each_possible_cpu(i)
- nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
+ idle_nsec = 0;
+ for_each_possible_cpu(i) {
+ kcs = &kcpustat_cpu(i);
+ idle_nsec += get_idle_time(kcs, i);
+ }

ktime_get_boottime_ts64(&uptime);
timens_add_boottime(&uptime);

- idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 44ae1a7eb9e3..9a5f5c6239c7 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -102,6 +102,7 @@ extern void account_system_index_time(struct task_struct *, u64,
enum cpu_usage_stat);
extern void account_steal_time(u64);
extern void account_idle_time(u64);
+extern u64 get_idle_time(const struct kernel_cpustat *kcs, int cpu);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline void account_process_tick(struct task_struct *tsk, int user)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 872e481d5098..9d7629e21164 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -227,6 +227,34 @@ void account_idle_time(u64 cputime)
cpustat[CPUTIME_IDLE] += cputime;
}

+/*
+ * Returns the total idle time for the given cpu.
+ * @kcs: The kernel_cpustat for the desired cpu.
+ * @cpu: The desired cpu.
+ */
+u64 get_idle_time(const struct kernel_cpustat *kcs, int cpu)
+{
+ u64 idle;
+ u64 __maybe_unused idle_usecs = -1ULL;
+
+#ifdef arch_idle_time
+ idle = kcs->cpustat[CPUTIME_IDLE];
+ if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
+ idle += arch_idle_time(cpu);
+#else
+ if (cpu_online(cpu))
+ idle_usecs = get_cpu_idle_time_us(cpu, NULL);
+
+ if (idle_usecs == -1ULL)
+ /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
+ idle = kcs->cpustat[CPUTIME_IDLE];
+ else
+ idle = idle_usecs * NSEC_PER_USEC;
+#endif
+
+ return idle;
+}
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
--
2.33.0.rc1.237.g0d66db33f3-goog


2021-08-12 20:55:08

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] fs/proc/uptime.c: fix idle time reporting in /proc/uptime

On Thu, Aug 12, 2021 at 10:31 PM Josh Don <[email protected]> wrote:
>
> /proc/uptime reports idle time by reading the CPUTIME_IDLE field from
> the per-cpu kcpustats. However, on NO_HZ systems, idle time is not
> continually updated on idle cpus, leading this value to appear
> incorrectly small.
>
> /proc/stat performs an accounting update when reading idle time; we can
> use the same approach for uptime.
>
> With this patch, /proc/stat and /proc/uptime now agree on idle time.
> Additionally, the following shows idle time tick up consistently on an
> idle machine:
> (while true; do cat /proc/uptime; sleep 1; done) | awk '{print $2-prev; prev=$2}'
>
> Reported-by: Luigi Rizzo <[email protected]>
> Signed-off-by: Josh Don <[email protected]>
> ---
> fs/proc/stat.c | 26 --------------------------
> fs/proc/uptime.c | 13 ++++++++-----
> include/linux/kernel_stat.h | 1 +
> kernel/sched/cputime.c | 28 ++++++++++++++++++++++++++++
> 4 files changed, 37 insertions(+), 31 deletions(-)
>
> diff --git a/fs/proc/stat.c b/fs/proc/stat.c
> index 6561a06ef905..99796a8a5223 100644
> --- a/fs/proc/stat.c
> +++ b/fs/proc/stat.c
> @@ -24,16 +24,6 @@
>
> #ifdef arch_idle_time
>
> -static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
> -{
> - u64 idle;
> -
> - idle = kcs->cpustat[CPUTIME_IDLE];
> - if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
> - idle += arch_idle_time(cpu);
> - return idle;
> -}
> -
> static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
> {
> u64 iowait;
> @@ -46,22 +36,6 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
>
> #else
>
> -static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
> -{
> - u64 idle, idle_usecs = -1ULL;
> -
> - if (cpu_online(cpu))
> - idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> -
> - if (idle_usecs == -1ULL)
> - /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
> - idle = kcs->cpustat[CPUTIME_IDLE];
> - else
> - idle = idle_usecs * NSEC_PER_USEC;
> -
> - return idle;
> -}
> -
> static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
> {
> u64 iowait, iowait_usecs = -1ULL;

...

> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index 872e481d5098..9d7629e21164 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -227,6 +227,34 @@ void account_idle_time(u64 cputime)
> cpustat[CPUTIME_IDLE] += cputime;
> }
>
> +/*
> + * Returns the total idle time for the given cpu.
> + * @kcs: The kernel_cpustat for the desired cpu.
> + * @cpu: The desired cpu.
> + */
> +u64 get_idle_time(const struct kernel_cpustat *kcs, int cpu)
> +{
> + u64 idle;
> + u64 __maybe_unused idle_usecs = -1ULL;
> +
> +#ifdef arch_idle_time
> + idle = kcs->cpustat[CPUTIME_IDLE];
> + if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
> + idle += arch_idle_time(cpu);
> +#else
> + if (cpu_online(cpu))
> + idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> +
> + if (idle_usecs == -1ULL)
> + /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
> + idle = kcs->cpustat[CPUTIME_IDLE];
> + else
> + idle = idle_usecs * NSEC_PER_USEC;
> +#endif
> +
> + return idle;
> +}
> +
>

Not sure why you moved get_idle_time() into kernel/sched/cputime.c

For builds where CONFIG_PROC_FS is not set, this function is not used/needed.

2021-08-12 21:23:24

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH] fs/proc/uptime.c: fix idle time reporting in /proc/uptime

On Thu, Aug 12, 2021 at 1:42 PM Eric Dumazet <[email protected]> wrote:
>
> Not sure why you moved get_idle_time() into kernel/sched/cputime.c
>
> For builds where CONFIG_PROC_FS is not set, this function is not used/needed.

The goal was to consolidate the kernel idle accounting code; it seemed a
little strange to have this implemented in fs/proc/stat.c, especially if
it also needed to be used elsewhere.

Does a __maybe_unused/#ifdef CONFIG_PROC_FS seem reasonable, or would
you advocate leaving it in fs/proc?