Date: Wed, 20 Jun 2018 01:52:10 +0200 (CEST)
From: Thomas Gleixner
To: Pavel Tatashin
cc: steven.sistare@oracle.com, daniel.m.jordan@oracle.com,
    linux@armlinux.org.uk, schwidefsky@de.ibm.com, heiko.carstens@de.ibm.com,
    john.stultz@linaro.org, sboyd@codeaurora.org, x86@kernel.org,
    linux-kernel@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
    douly.fnst@cn.fujitsu.com, peterz@infradead.org, prarit@redhat.com,
    feng.tang@intel.com, pmladek@suse.com, gnomes@lxorguk.ukuu.org.uk
Subject: Re: [PATCH v10 7/7] x86/tsc: use tsc early
In-Reply-To: <20180615174204.30581-8-pasha.tatashin@oracle.com>
References: <20180615174204.30581-1-pasha.tatashin@oracle.com>
 <20180615174204.30581-8-pasha.tatashin@oracle.com>
User-Agent: Alpine 2.21 (DEB 202 2017-01-01)

On Fri, 15 Jun 2018, Pavel Tatashin wrote:
> tsc_early_init():
> Determines offset, shift and multiplier for the early clock based on the
> TSC frequency.
>
> tsc_early_fini()
> Implement the finish part of early tsc feature, prints message about the
> offset, which can be useful to find out how much time was spent in post and
> boot manager (if TSC starts from 0 during boot)
>
> sched_clock_early():
> TSC based implementation of early clock and is called from sched_clock().
>
> start_early_clock():
> Calls tsc_early_init(), and makes sched_clock() to use early boot clock
>
> set_final_clock():
> Sets the final clock which is either platform specific or
> native_sched_clock(). Also calls tsc_early_fini() if early clock was
> previously initialized.
>
> Call start_early_clock() to start using early boot time stamps
> functionality on the supported x86 platforms, and call set_final_clock() to
> finish this feature and switch back to the default clock. The supported x86
> systems are those where TSC frequency is determined early in boot.

Lots of functions for dubious value.

> +static struct cyc2ns_data cyc2ns_early;
> +
> +static u64 sched_clock_early(void)
> +{
> +        u64 ns = mul_u64_u32_shr(rdtsc(), cyc2ns_early.cyc2ns_mul,
> +                                 cyc2ns_early.cyc2ns_shift);
> +        return ns + cyc2ns_early.cyc2ns_offset;
> +}
> +
> +/*
> + * Initialize clock for early time stamps
> + */
> +static void __init tsc_early_init(unsigned int khz)
> +{
> +        clocks_calc_mult_shift(&cyc2ns_early.cyc2ns_mul,
> +                               &cyc2ns_early.cyc2ns_shift,
> +                               khz, NSEC_PER_MSEC, 0);
> +        cyc2ns_early.cyc2ns_offset = -sched_clock_early();
> +}
> +
> +/*
> + * Finish clock for early time stamps, and hand over to permanent clock by
> + * setting __sched_clock_offset appropriately for continued time keeping.
> + */
> +static void __init tsc_early_fini(void)
> +{
> +        unsigned long long t;
> +        unsigned long r;
> +
> +        t = -cyc2ns_early.cyc2ns_offset;
> +        r = do_div(t, NSEC_PER_SEC);
> +
> +        __sched_clock_offset = sched_clock_early() - sched_clock();
> +        pr_info("early sched clock is finished, offset [%lld.%09lds]\n", t, r);
> +}
> +
> +#ifdef CONFIG_PARAVIRT
> +static inline void __init start_early_clock(void)
> +{
> +        tsc_early_init(tsc_khz);
> +        pv_time_ops.active_sched_clock = sched_clock_early;
> +}
> +
> +static inline void __init set_final_clock(void)
> +{
> +        pv_time_ops.active_sched_clock = pv_time_ops.sched_clock;
> +
> +        /* We did not have early sched clock if multiplier is 0 */
> +        if (cyc2ns_early.cyc2ns_mul)
> +                tsc_early_fini();
> +}
> +#else /* CONFIG_PARAVIRT */
> +/*
> + * For native clock we use two switches static and dynamic, the static switch is
> + * initially true, so we check the dynamic switch, which is initially false.
> + * Later when early clock is disabled, we can alter the static switch in order
> + * to avoid branch check on every sched_clock() call.
> + */
> +static bool __tsc_early;
> +static DEFINE_STATIC_KEY_TRUE(__tsc_early_static);
> +
> +static inline void __init start_early_clock(void)
> +{
> +        tsc_early_init(tsc_khz);
> +        __tsc_early = true;
> +}
> +
> +static inline void __init set_final_clock(void)
> +{
> +        __tsc_early = false;
> +        static_branch_disable(&__tsc_early_static);
> +
> +        /* We did not have early sched clock if multiplier is 0 */
> +        if (cyc2ns_early.cyc2ns_mul)
> +                tsc_early_fini();
> +}
> +#endif /* CONFIG_PARAVIRT */
> +
>  /*
>   * Scheduler clock - returns current time in nanosec units.
>   */
> @@ -194,6 +272,13 @@ u64 native_sched_clock(void)
>                  return cycles_2_ns(tsc_now);
>          }
>
> +#ifndef CONFIG_PARAVIRT
> +        if (static_branch_unlikely(&__tsc_early_static)) {
> +                if (__tsc_early)
> +                        return sched_clock_early();
> +        }
> +#endif /* !CONFIG_PARAVIRT */
> +

This whole function maze plus the ifdeffery which comes with it is really
horrible and not required. What's wrong with reusing the existing
functionality?

The patch below (uncompiled and untested) should achieve the same thing
without all the paravirt muck (which can be easily added w/o all the
ifdeffery if really required) by just reusing the existing conversion and
initialization functions.

If I'm not completely mistaken then the second invocation of
set_cyc2ns_scale() from tsc_init() will also take care of the smooth
sched_clock() transition from early to final w/o touching the core
__sched_clock_offset at all. Though my tired brain might trick me.

It might not work as is, but it should not be rocket science to make it
do so.

Thanks,

        tglx

8<----------------------

 tsc.c |   59 ++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 15 deletions(-)

--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -39,6 +39,9 @@ EXPORT_SYMBOL(tsc_khz);
 static int __read_mostly tsc_unstable;
 
 static DEFINE_STATIC_KEY_FALSE(__use_tsc);
+static DEFINE_STATIC_KEY_TRUE(tsc_early_enabled);
+
+static bool tsc_early_sched_clock;
 
 int tsc_clocksource_reliable;
 
@@ -133,18 +136,12 @@ static inline unsigned long long cycles_
 	return ns;
 }
 
-static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+static void __set_cyc2ns_scale(unsigned long khz, int cpu,
+			       unsigned long long tsc_now)
 {
 	unsigned long long ns_now;
 	struct cyc2ns_data data;
 	struct cyc2ns *c2n;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	sched_clock_idle_sleep_event();
-
-	if (!khz)
-		goto done;
 
 	ns_now = cycles_2_ns(tsc_now);
 
@@ -176,22 +173,46 @@ static void set_cyc2ns_scale(unsigned lo
 	c2n->data[0] = data;
 	raw_write_seqcount_latch(&c2n->seq);
 	c2n->data[1] = data;
+}
+
+static void set_cyc2ns_scale(unsigned long khz, int cpu,
+			     unsigned long long tsc_now)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	sched_clock_idle_sleep_event();
+
+	if (khz)
+		__set_cyc2ns_scale(khz, cpu, tsc_now);
 
-done:
 	sched_clock_idle_wakeup_event();
 	local_irq_restore(flags);
 }
 
+static void __init sched_clock_early_init(unsigned int khz)
+{
+	cyc2ns_init(smp_processor_id());
+	__set_cyc2ns_scale(khz, smp_processor_id(), rdtsc());
+	tsc_early_sched_clock = true;
+}
+
+static void __init sched_clock_early_exit(void)
+{
+	static_branch_disable(&tsc_early_enabled);
+}
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
 u64 native_sched_clock(void)
 {
-	if (static_branch_likely(&__use_tsc)) {
-		u64 tsc_now = rdtsc();
+	if (static_branch_likely(&__use_tsc))
+		return cycles_2_ns(rdtsc());
 
-		/* return the value in ns */
-		return cycles_2_ns(tsc_now);
+	if (static_branch_unlikely(&tsc_early_enabled)) {
+		if (tsc_early_sched_clock)
+			return cycles_2_ns(rdtsc());
 	}
 
 	/*
@@ -1332,9 +1353,10 @@ void __init tsc_early_delay_calibrate(vo
 	lpj = tsc_khz * 1000;
 	do_div(lpj, HZ);
 	loops_per_jiffy = lpj;
+	sched_clock_early_init(tsc_khz);
 }
 
-void __init tsc_init(void)
+static void __init __tsc_init(void)
 {
 	u64 lpj, cyc;
 	int cpu;
@@ -1384,7 +1406,8 @@ void __init tsc_init(void)
 	 */
 	cyc = rdtsc();
 	for_each_possible_cpu(cpu) {
-		cyc2ns_init(cpu);
+		if (!tsc_early_sched_clock || cpu != smp_processor_id())
+			cyc2ns_init(cpu);
 		set_cyc2ns_scale(tsc_khz, cpu, cyc);
 	}
 
@@ -1411,6 +1434,12 @@ void __init tsc_init(void)
 	detect_art();
 }
 
+void __init tsc_init(void)
+{
+	__tsc_init();
+	sched_clock_early_exit();
+}
+
 #ifdef CONFIG_SMP
 /*
  * If we have a constant TSC and are using the TSC for the delay loop,
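
For illustration only: the conversion both variants build on is simply
ns = (cycles * mult) >> shift, with mult/shift derived from the TSC
frequency. Below is a minimal stand-alone user space sketch of that
arithmetic, not kernel code; the 2500000 kHz frequency and the fixed shift
of 26 are made-up values for the example, whereas the kernel picks the
shift via clocks_calc_mult_shift() and does the widened multiply with
mul_u64_u32_shr().

/*
 * Illustration only (not kernel code): derive a mult/shift pair from a
 * TSC frequency given in kHz and convert a cycle count to nanoseconds
 * with ns = (cycles * mult) >> shift.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC	1000000ULL

/* widened 64x32 multiply, roughly what the kernel's mul_u64_u32_shr() does */
static uint64_t mul_u64_u32_shr(uint64_t cycles, uint32_t mult, unsigned int shift)
{
	return (uint64_t)(((unsigned __int128)cycles * mult) >> shift);
}

int main(void)
{
	uint64_t tsc_khz = 2500000;	/* assumed 2.5 GHz TSC */
	unsigned int shift = 26;	/* assumed; the kernel computes this */
	/* khz cycles elapse per millisecond, so mult = (ns per cycle) << shift */
	uint32_t mult = (uint32_t)((NSEC_PER_MSEC << shift) / tsc_khz);
	uint64_t cycles = 2500000000ULL;	/* one second worth of cycles */

	printf("mult=%u shift=%u -> %llu ns\n", mult, shift,
	       (unsigned long long)mul_u64_u32_shr(cycles, mult, shift));
	return 0;
}

Running it prints roughly 999999977 ns for the one-second cycle count, i.e.
the small rounding error inherent in the fixed-point mult/shift scheme.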