Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751595AbdDMEbQ (ORCPT ); Thu, 13 Apr 2017 00:31:16 -0400 Received: from mail-oi0-f65.google.com ([209.85.218.65]:34115 "EHLO mail-oi0-f65.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751309AbdDMEbN (ORCPT ); Thu, 13 Apr 2017 00:31:13 -0400 MIME-Version: 1.0 In-Reply-To: References: <20170323165512.60945ac6@redhat.com> <1490636129.8850.76.camel@redhat.com> <20170328132406.7d23579c@redhat.com> <20170329131656.1d6cb743@redhat.com> <1490818125.28917.11.camel@redhat.com> <1490848051.4167.57.camel@gmx.de> <20170412131818.GB21309@lerouge> From: Wanpeng Li Date: Thu, 13 Apr 2017 12:31:12 +0800 Message-ID: Subject: Re: [BUG nohz]: wrong user and system time accounting To: Thomas Gleixner Cc: Frederic Weisbecker , Mike Galbraith , Rik van Riel , Luiz Capitulino , "linux-kernel@vger.kernel.org" , Peter Zijlstra Content-Type: text/plain; charset=UTF-8 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 7310 Lines: 214 2017-04-12 22:57 GMT+08:00 Thomas Gleixner : > On Wed, 12 Apr 2017, Frederic Weisbecker wrote: >> On Tue, Apr 11, 2017 at 04:22:48PM +0200, Thomas Gleixner wrote: >> > It's not different from the current jiffies based stuff at all. Same >> > failure mode. >> >> Yes you're right, I got confused again. So to fix this we could do our snapshots >> at a frequency lower than HZ but still high enough to avoid overhead. >> >> Something like TICK_NSEC / 2 ? > > If you are using TSC anyway then you can do proper accumulation for both > system and user and only account the data when the accumulation is more > than a jiffie. So I implement it as below: - HZ=1000. 
1) two cpu hogs on cpu in nohz_full mode, 100% user time 2) Luiz's testcase, ~95% user time, ~5% idle time (as we expected) - HZ=250 1) two cpu hogs on cpu in nohz_full mode, 100% user time 2) Luiz's testcase, 100% idle So the code below still does not work correctly for HZ=250 — any suggestions? [NOTE(review): in the stime branch of the new vtime_delta() below, the threshold check tests tsk->vtime_acct_utime rather than tsk->vtime_acct_stime — likely a copy-paste error; also note vtime_delta() now has side effects (it accumulates into vtime_acct_*) yet is called twice in a row and from the read-only task_cputime() path — worth confirming.] -------------------------------------->8----------------------------------------------------- diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee8..6a11771 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -668,6 +668,8 @@ struct task_struct { #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_t vtime_seqcount; unsigned long long vtime_snap; + u64 vtime_acct_utime; + u64 vtime_acct_stime; enum { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f3778e2b..f8e54ba 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -674,20 +674,41 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static u64 vtime_delta(struct task_struct *tsk) +static u64 vtime_delta(struct task_struct *tsk, bool user) { - unsigned long now = READ_ONCE(jiffies); + u64 delta, ret = 0; - if (time_before(now, (unsigned long)tsk->vtime_snap)) - return 0; + delta = sched_clock() - tsk->vtime_snap; - return jiffies_to_nsecs(now - tsk->vtime_snap); + if (is_idle_task(tsk)) { + if (delta >= TICK_NSEC) + ret = delta; + } else { + if (user) { + tsk->vtime_acct_utime += delta; + if (tsk->vtime_acct_utime >= TICK_NSEC) + ret = tsk->vtime_acct_utime; + } else { + tsk->vtime_acct_stime += delta; + if (tsk->vtime_acct_utime >= TICK_NSEC) + ret = tsk->vtime_acct_stime; + } + } + + return ret; } -static u64 get_vtime_delta(struct task_struct *tsk) +static u64 get_vtime_delta(struct task_struct *tsk, bool user) { - unsigned long now = READ_ONCE(jiffies); - u64 delta, 
other; + u64 delta = vtime_delta(tsk, user); + u64 other; + + if (!is_idle_task(tsk)) { + if (user) + tsk->vtime_acct_utime = 0; + else + tsk->vtime_acct_stime = 0; + } /* * Unlike tick based timing, vtime based timing never has lost @@ -696,22 +717,21 @@ static u64 get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - tsk->vtime_snap); other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap = now; + tsk->vtime_snap += delta; return delta - other; } static void __vtime_account_system(struct task_struct *tsk) { - account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); + account_system_time(tsk, irq_count(), get_vtime_delta(tsk, false)); } void vtime_account_system(struct task_struct *tsk) { - if (!vtime_delta(tsk)) + if (!vtime_delta(tsk, false)) return; write_seqcount_begin(&tsk->vtime_seqcount); @@ -723,15 +743,15 @@ void vtime_account_user(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); tsk->vtime_snap_whence = VTIME_SYS; - if (vtime_delta(tsk)) - account_user_time(tsk, get_vtime_delta(tsk)); + if (vtime_delta(tsk, true)) + account_user_time(tsk, get_vtime_delta(tsk, true)); write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) + if (vtime_delta(tsk, false)) __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); @@ -747,7 +767,7 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. 
*/ write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) + if (vtime_delta(tsk, false)) __vtime_account_system(tsk); current->flags |= PF_VCPU; write_seqcount_end(&tsk->vtime_seqcount); @@ -765,7 +785,7 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { - account_idle_time(get_vtime_delta(tsk)); + account_idle_time(get_vtime_delta(tsk, false)); } void arch_vtime_task_switch(struct task_struct *prev) @@ -776,7 +796,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(¤t->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = jiffies; + current->vtime_snap = sched_clock_cpu(smp_processor_id()); write_seqcount_end(¤t->vtime_seqcount); } @@ -787,7 +807,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = jiffies; + t->vtime_snap = sched_clock_cpu(cpu); write_seqcount_end(&t->vtime_seqcount); local_irq_restore(flags); } @@ -805,7 +825,7 @@ u64 task_gtime(struct task_struct *t) gtime = t->gtime; if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) - gtime += vtime_delta(t); + gtime += vtime_delta(t, false); } while (read_seqcount_retry(&t->vtime_seqcount, seq)); @@ -819,7 +839,6 @@ u64 task_gtime(struct task_struct *t) */ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { - u64 delta; unsigned int seq; if (!vtime_accounting_enabled()) { @@ -838,16 +857,14 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; - delta = vtime_delta(t); - /* * Task runs either in user or kernel space, add pending nohz time to * the right place. 
*/ if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) - *utime += delta; + *utime += vtime_delta(t, true); else if (t->vtime_snap_whence == VTIME_SYS) - *stime += delta; + *stime += vtime_delta(t, false); } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */