A customer reported that when a cpu goes offline, the iowait and idle
times reported in /proc/stat will sometimes spike. This is being
caused by a different data source being used for these values when a
cpu is offline.
Prior to this patch:
put the system under heavy load so that there is little idle time
user nice system idle iowait
cpu 109515 17 32111 220686 607
take cpu1 offline
user nice system idle iowait
cpu 113742 17 32721 220724 612
bring cpu1 back online
user nice system idle iowait
cpu 118332 17 33430 220687 607
To prevent this, let's use the same data source whether a cpu is
online or not.
With this patch:
put the system under heavy load so that there is little idle time
user nice system idle iowait
cpu 14096 16 4646 157687 426
take cpu1 offline
user nice system idle iowait
cpu 21614 16 7179 157687 426
bring cpu1 back online
user nice system idle iowait
cpu 27362 16 9555 157688 426
Signed-off-by: Tom Hromatka <[email protected]>
---
fs/proc/stat.c | 24 ++++++------------------
1 file changed, 6 insertions(+), 18 deletions(-)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 46b3293015fe..35b92539e711 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -47,32 +47,20 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
- u64 idle, idle_usecs = -1ULL;
+ u64 idle, idle_usecs;
- if (cpu_online(cpu))
- idle_usecs = get_cpu_idle_time_us(cpu, NULL);
-
- if (idle_usecs == -1ULL)
- /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcs->cpustat[CPUTIME_IDLE];
- else
- idle = idle_usecs * NSEC_PER_USEC;
+ idle_usecs = get_cpu_idle_time_us(cpu, NULL);
+ idle = idle_usecs * NSEC_PER_USEC;
return idle;
}
static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
- u64 iowait, iowait_usecs = -1ULL;
-
- if (cpu_online(cpu))
- iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
+ u64 iowait, iowait_usecs;
- if (iowait_usecs == -1ULL)
- /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcs->cpustat[CPUTIME_IOWAIT];
- else
- iowait = iowait_usecs * NSEC_PER_USEC;
+ iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
+ iowait = iowait_usecs * NSEC_PER_USEC;
return iowait;
}
--
2.25.4
On Wed, Sep 09, 2020 at 08:41:22AM -0600, Tom Hromatka wrote:
> static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
> {
> - u64 idle, idle_usecs = -1ULL;
> + u64 idle, idle_usecs;
>
> - if (cpu_online(cpu))
> - idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> -
> - if (idle_usecs == -1ULL)
> - /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
> - idle = kcs->cpustat[CPUTIME_IDLE];
> - else
> - idle = idle_usecs * NSEC_PER_USEC;
> + idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> + idle = idle_usecs * NSEC_PER_USEC;
>
> return idle;
> }
>
> static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
> {
> - u64 iowait, iowait_usecs = -1ULL;
> -
> - if (cpu_online(cpu))
> - iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
> + u64 iowait, iowait_usecs;
>
> - if (iowait_usecs == -1ULL)
> - /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
> - iowait = kcs->cpustat[CPUTIME_IOWAIT];
> - else
> - iowait = iowait_usecs * NSEC_PER_USEC;
> + iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
> + iowait = iowait_usecs * NSEC_PER_USEC;
You can drop the local variables in both cases:
return get_cpu_iowait_time_us(cpu, NULL) * NSEC_PER_USEC;
On Wed, Sep 09 2020 at 08:41, Tom Hromatka wrote:
> A customer reported that when a cpu goes offline, the iowait and idle
> times reported in /proc/stat will sometimes spike. This is being
> caused by a different data source being used for these values when a
> cpu is offline.
>
> Prior to this patch:
>
> put the system under heavy load so that there is little idle time
>
> user nice system idle iowait
> cpu 109515 17 32111 220686 607
>
> take cpu1 offline
>
> user nice system idle iowait
> cpu 113742 17 32721 220724 612
>
> bring cpu1 back online
>
> user nice system idle iowait
> cpu 118332 17 33430 220687 607
>
> To prevent this, let's use the same data source whether a cpu is
> online or not.
Let's use? Your patch makes it use the same data source.
And again, neither the customer story nor the numbers are helpful to
understand the underlying problem. Also this lacks a reference to the
previous change which preserves the times across a CPU offline/online
sequence.
> diff --git a/fs/proc/stat.c b/fs/proc/stat.c
> index 46b3293015fe..35b92539e711 100644
> --- a/fs/proc/stat.c
> +++ b/fs/proc/stat.c
> @@ -47,32 +47,20 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
>
> static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
> {
> - u64 idle, idle_usecs = -1ULL;
> + u64 idle, idle_usecs;
>
> - if (cpu_online(cpu))
> - idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> -
> - if (idle_usecs == -1ULL)
> - /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
> - idle = kcs->cpustat[CPUTIME_IDLE];
> - else
> - idle = idle_usecs * NSEC_PER_USEC;
> + idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> + idle = idle_usecs * NSEC_PER_USEC;
>
> return idle;
return get_cpu_idle_time_us(cpu, NULL) * NSEC_PER_USEC;
perhaps?
Thanks,
tglx