Currently printk timestamp mostly come from the sched_clock which
depends on the clock setup, so there are many kernel logs started
with "[ 0.000000] " before the clock is calibrated.
This patch will provide an debug option for specific platform to
provide a early boot time clock, so that we can have time info in
kernel log much earlier, which can show the time info for the early
kernel boot, and make boottime tuning/optimization easier (boot time
is critical for phone/tablet and embedded devices).
Capable platform only need to setup the "boot_printk_clock_fn"
which could return time in nano seconds.
Together with a TSC patch on x86 system, we have easily captured
some early boottime killer like unwind_init() which takes about
300ms in boot phase.
Signed-off-by: Feng Tang <[email protected]>
---
include/linux/printk.h | 3 +++
kernel/printk/printk.c | 15 +++++++++++++--
2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 6d7e800..7c13d4a 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -188,6 +188,9 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
extern int printk_delay_msec;
extern int dmesg_restrict;
+/* Return boot timestamp in nano seconds */
+extern u64 (*boot_printk_clock_fn)(void);
+
extern int
devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf,
size_t *lenp, loff_t *ppos);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2f4af21..7feffd8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -190,6 +190,17 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
return 0;
}
+u64 (*boot_printk_clock_fn)(void) __read_mostly;
+
+static u64 printk_clock(void)
+{
+ /* If platform provides early boot printk clock, then use it */
+ if (unlikely(system_state == SYSTEM_BOOTING && boot_printk_clock_fn))
+ return boot_printk_clock_fn();
+ else
+ return local_clock();
+}
+
/*
* Number of registered extended console drivers.
*
@@ -628,7 +639,7 @@ static int log_store(int facility, int level,
if (ts_nsec > 0)
msg->ts_nsec = ts_nsec;
else
- msg->ts_nsec = local_clock();
+ msg->ts_nsec = printk_clock();
memset(log_dict(msg) + dict_len, 0, pad_len);
msg->len = size;
@@ -1775,7 +1786,7 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
cont.facility = facility;
cont.level = level;
cont.owner = current;
- cont.ts_nsec = local_clock();
+ cont.ts_nsec = printk_clock();
cont.flags = flags;
}
--
2.7.4
To show time info in kernel log earlier and help optimizing kernel
boot time, printk adds a debug hook "boot_printk_clock_fn()" for
capable platform which has accurate clock in early boot phase.
This patch will add early param setup option, so that user can
chose to provide a tsc based early printk clock simply by adding
in command line: "boot_tsc=xxxxM" (xxxxM is the stable TSC freq).
Signed-off-by: Feng Tang <[email protected]>
---
arch/x86/kernel/tsc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 74392d9..d99eb70 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -281,6 +281,57 @@ static int __init tsc_setup(char *str)
__setup("tsc=", tsc_setup);
+
+/*
+ * This is to provide a not-so-accurate clock for printk timestamp in
+ * early boot phase (before sched_clock is setup).It could be used for
+ * analyzing/optimising kernel boot time and tools like bootchart.
+ *
+ * User can use it by simply append something like
+ * "boot_tsc=1881M"
+ * to cmdline for a platform with a stable 1881MHz TSC.
+ */
+static u64 boot_tsc_mhz;
+static u64 boot_tsc_offset;
+
+static u64 boot_tsc_clock(void)
+{
+ u64 cur_tsc, cur_ns;
+
+ cur_tsc = rdtsc();
+ cur_tsc -= boot_tsc_offset;
+
+ /* return value in ns */
+ cur_ns = div64_u64(cur_tsc * 1000, boot_tsc_mhz);
+ return cur_ns;
+}
+
+static int __init boot_tsc_setup(char *p)
+{
+ u64 tsc_hz;
+
+ if (!p)
+ return -EINVAL;
+
+ boot_tsc_offset = rdtsc();
+
+ tsc_hz = memparse(p, &p);
+ boot_tsc_mhz = div64_u64(tsc_hz, 1024 * 1024);
+ if (boot_tsc_mhz == 0)
+ return -EINVAL;
+
+ pr_info("TSC has run for %lld us\n",
+ div64_u64(boot_tsc_offset, boot_tsc_mhz));
+
+ /* Setup the early printk clock */
+ boot_printk_clock_fn = boot_tsc_clock;
+ pr_info("TSC: Setup early printk timestamp with %lldM TSC.",
+ boot_tsc_mhz);
+
+ return 0;
+}
+early_param("boot_tsc", boot_tsc_setup);
+
#define MAX_RETRIES 5
#define SMI_TRESHOLD 50000
--
2.7.4
On Wed, May 30, 2018 at 12:20 PM, Feng Tang <[email protected]> wrote:
> To show time info in kernel log earlier and help optimizing kernel
> boot time, printk adds a debug hook "boot_printk_clock_fn()" for
> capable platform which has accurate clock in early boot phase.
>
> This patch will add early param setup option, so that user can
> chose to provide a tsc based early printk clock simply by adding
> in command line: "boot_tsc=xxxxM" (xxxxM is the stable TSC freq).
> +
This line is not needed.
> + cur_tsc = rdtsc();
> + cur_tsc -= boot_tsc_offset;
In one expression?
> + if (!p)
> + return -EINVAL;
> +
I'm not sure it's needed at all.
> + boot_tsc_mhz = div64_u64(tsc_hz, 1024 * 1024);
Hmm... 1024*1024 != 1000 * 1000. So, hz -> mhz here (as by suffixes)
looks weird.
> + if (boot_tsc_mhz == 0)
> + return -EINVAL;
> + pr_info("TSC has run for %lld us\n",
> + div64_u64(boot_tsc_offset, boot_tsc_mhz));
> +
> + /* Setup the early printk clock */
> + boot_printk_clock_fn = boot_tsc_clock;
> + pr_info("TSC: Setup early printk timestamp with %lldM TSC.",
> + boot_tsc_mhz);
Perhaps remove period (above doesn't have it) and move this to one line?
--
With Best Regards,
Andy Shevchenko
Hi Andy,
Thanks for the review.
On Wed, May 30, 2018 at 04:25:07PM +0300, Andy Shevchenko wrote:
> On Wed, May 30, 2018 at 12:20 PM, Feng Tang <[email protected]> wrote:
> > To show time info in kernel log earlier and help optimizing kernel
> > boot time, printk adds a debug hook "boot_printk_clock_fn()" for
> > capable platform which has accurate clock in early boot phase.
> >
> > This patch will add early param setup option, so that user can
> > chose to provide a tsc based early printk clock simply by adding
> > in command line: "boot_tsc=xxxxM" (xxxxM is the stable TSC freq).
>
> > +
>
> This line is not needed.
Ok, will remove it.
>
> > + cur_tsc = rdtsc();
> > + cur_tsc -= boot_tsc_offset;
>
> In one expression?
Agree
>
> > + if (!p)
> > + return -EINVAL;
> > +
>
> I'm not sure it's needed at all.
>
> > + boot_tsc_mhz = div64_u64(tsc_hz, 1024 * 1024);
>
> Hmm... 1024*1024 != 1000 * 1000. So, hz -> mhz here (as by suffixes)
> looks weird.
The memparse is mostly for memory stuff which takes 1M as 1024*1024,
but for TSC frequency 1M means 1000*1000, and the code is trying to
do some awkward translation.
But you are right, this code will break if user doesn't use expression
with 'm".
>
> > + if (boot_tsc_mhz == 0)
> > + return -EINVAL;
>
> > + pr_info("TSC has run for %lld us\n",
> > + div64_u64(boot_tsc_offset, boot_tsc_mhz));
> > +
> > + /* Setup the early printk clock */
> > + boot_printk_clock_fn = boot_tsc_clock;
>
> > + pr_info("TSC: Setup early printk timestamp with %lldM TSC.",
> > + boot_tsc_mhz);
>
> Perhaps remove period (above doesn't have it) and move this to one line?
You mean merge the two pr_info line? yes, we can do that.
Thanks,
Feng
On Wed, May 30, 2018 at 12:20 PM, Feng Tang <[email protected]> wrote:
> Currently printk timestamp mostly come from the sched_clock which
> depends on the clock setup, so there are many kernel logs started
> with "[ 0.000000] " before the clock is calibrated.
>
> This patch will provide an debug option for specific platform to
> provide a early boot time clock, so that we can have time info in
> kernel log much earlier, which can show the time info for the early
> kernel boot, and make boottime tuning/optimization easier (boot time
> is critical for phone/tablet and embedded devices).
>
> Capable platform only need to setup the "boot_printk_clock_fn"
> which could return time in nano seconds.
>
> Together with a TSC patch on x86 system, we have easily captured
> some early boottime killer like unwind_init() which takes about
> 300ms in boot phase.
> +static u64 printk_clock(void)
> +{
> + /* If platform provides early boot printk clock, then use it */
> + if (unlikely(system_state == SYSTEM_BOOTING && boot_printk_clock_fn))
> + return boot_printk_clock_fn();
> + else
> + return local_clock();
'else' is redundant.
> +}
--
With Best Regards,
Andy Shevchenko
On Wed 2018-05-30 17:20:59, Feng Tang wrote:
> To show time info in kernel log earlier and help optimizing kernel
> boot time, printk adds a debug hook "boot_printk_clock_fn()" for
> capable platform which has accurate clock in early boot phase.
>
> This patch will add early param setup option, so that user can
> chose to provide a tsc based early printk clock simply by adding
> in command line: "boot_tsc=xxxxM" (xxxxM is the stable TSC freq).
>
> Signed-off-by: Feng Tang <[email protected]>
> ---
> arch/x86/kernel/tsc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 51 insertions(+)
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 74392d9..d99eb70 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -281,6 +281,57 @@ static int __init tsc_setup(char *str)
>
> __setup("tsc=", tsc_setup);
>
> +
> +/*
> + * This is to provide a not-so-accurate clock for printk timestamp in
> + * early boot phase (before sched_clock is setup).It could be used for
> + * analyzing/optimising kernel boot time and tools like bootchart.
> + *
> + * User can use it by simply append something like
> + * "boot_tsc=1881M"
> + * to cmdline for a platform with a stable 1881MHz TSC.
> + */
> +static u64 boot_tsc_mhz;
> +static u64 boot_tsc_offset;
> +
> +static u64 boot_tsc_clock(void)
> +{
> + u64 cur_tsc, cur_ns;
> +
> + cur_tsc = rdtsc();
> + cur_tsc -= boot_tsc_offset;
> +
> + /* return value in ns */
> + cur_ns = div64_u64(cur_tsc * 1000, boot_tsc_mhz);
> + return cur_ns;
> +}
> +
> +static int __init boot_tsc_setup(char *p)
> +{
> + u64 tsc_hz;
> +
> + if (!p)
> + return -EINVAL;
> +
> + boot_tsc_offset = rdtsc();
> +
> + tsc_hz = memparse(p, &p);
> + boot_tsc_mhz = div64_u64(tsc_hz, 1024 * 1024);
> + if (boot_tsc_mhz == 0)
> + return -EINVAL;
> +
> + pr_info("TSC has run for %lld us\n",
> + div64_u64(boot_tsc_offset, boot_tsc_mhz));
> +
> + /* Setup the early printk clock */
> + boot_printk_clock_fn = boot_tsc_clock;
I wonder if we could get some cleaner integration into the timer and
printk code.
I wonder if we could hide this under the existing timer interface.
For example, local_clock() could provide values from this non-accurate
timer and then transparently continue with the proper counting.
The proper counting will not start from zero but from the offset
reached by boot_tsc().
Best Regards,
Petr
> + pr_info("TSC: Setup early printk timestamp with %lldM TSC.",
> + boot_tsc_mhz);
> +
> + return 0;
> +}
> +early_param("boot_tsc", boot_tsc_setup);
> +
> #define MAX_RETRIES 5
> #define SMI_TRESHOLD 50000
>
> --
> 2.7.4
>
On Thu, May 31, 2018 at 03:55:42PM +0200, Petr Mladek wrote:
> I wonder if we could get some cleaner integration into the timer and
> printk code.
Yes, these patches are particularly horrific..
There were some earlier patches by Pavel Tatashin, which attempted do
get things running earlier.
http://lkml.kernel.org/r/[email protected]
I'm not entirely happy with that, but I never did get around to
reviewing that last version :-( In particuarly, now that you made me
look, I dislike his patch 6 almost as much as these patches.
The idea was to get regular sched_clock() running earlier, not to botch
some early_sched_clock() into it.
Basically run calibrate_tsc() earlier (like _waaay_ earlier, it doesn't
rely on anything other than CPUID) and if you have a recent part (with
exception of SKX) you'll get a usable tsc rate (and TSC_RELIABLE) and
things will work.
If you have a dodgy part (sorry SKX), you'll just have to live with
sched_clock starting late(r).
Do not cobble things on the side, try and get the normal things running
earlier.
On 05/31/2018 12:18 AM, Andy Shevchenko wrote:
> On Wed, May 30, 2018 at 12:20 PM, Feng Tang <[email protected]> wrote:
>> Currently printk timestamp mostly come from the sched_clock which
>> depends on the clock setup, so there are many kernel logs started
>> with "[ 0.000000] " before the clock is calibrated.
>>
>> This patch will provide an debug option for specific platform to
>> provide a early boot time clock, so that we can have time info in
>> kernel log much earlier, which can show the time info for the early
>> kernel boot, and make boottime tuning/optimization easier (boot time
>> is critical for phone/tablet and embedded devices).
>>
>> Capable platform only need to setup the "boot_printk_clock_fn"
>> which could return time in nano seconds.
>>
>> Together with a TSC patch on x86 system, we have easily captured
>> some early boottime killer like unwind_init() which takes about
>> 300ms in boot phase.
>
>> +static u64 printk_clock(void)
>> +{
>> + /* If platform provides early boot printk clock, then use it */
>> + if (unlikely(system_state == SYSTEM_BOOTING && boot_printk_clock_fn))
>> + return boot_printk_clock_fn();
>> + else
>> + return local_clock();
>
> 'else' is redundant.
So it is. Is this a style comment?
This shouldn't matter to a smart compiler, should it?
>> +}
thanks,
--
~Randy
On Thu, May 31, 2018 at 8:22 PM, Randy Dunlap <[email protected]> wrote:
> On 05/31/2018 12:18 AM, Andy Shevchenko wrote:
>> On Wed, May 30, 2018 at 12:20 PM, Feng Tang <[email protected]> wrote:
>>> +static u64 printk_clock(void)
>>> +{
>>> + /* If platform provides early boot printk clock, then use it */
>>> + if (unlikely(system_state == SYSTEM_BOOTING && boot_printk_clock_fn))
>>> + return boot_printk_clock_fn();
>>> + else
>>> + return local_clock();
>>
>> 'else' is redundant.
>
> So it is. Is this a style comment?
> This shouldn't matter to a smart compiler, should it?
In this particular case it doesn't matter, in more complex cases this
will increase an indentation level which becomes a hardly readable
code.
>>> +}
--
With Best Regards,
Andy Shevchenko
Hi Peter and Petr,
Thanks for your suggestions, will try to find a cleaner and less hacky way,
and it may take some time as dealing with all kinds of TSC is tricky :)
- Feng
On Thu, May 31, 2018 at 05:52:10PM +0200, Peter Zijlstra wrote:
> On Thu, May 31, 2018 at 03:55:42PM +0200, Petr Mladek wrote:
> > I wonder if we could get some cleaner integration into the timer and
> > printk code.
>
> Yes, these patches are particularly horrific..
>
> There were some earlier patches by Pavel Tatashin, which attempted do
> get things running earlier.
>
> http://lkml.kernel.org/r/[email protected]
>
> I'm not entirely happy with that, but I never did get around to
> reviewing that last version :-( In particuarly, now that you made me
> look, I dislike his patch 6 almost as much as these patches.
>
> The idea was to get regular sched_clock() running earlier, not to botch
> some early_sched_clock() into it.
>
> Basically run calibrate_tsc() earlier (like _waaay_ earlier, it doesn't
> rely on anything other than CPUID) and if you have a recent part (with
> exception of SKX) you'll get a usable tsc rate (and TSC_RELIABLE) and
> things will work.
>
> If you have a dodgy part (sorry SKX), you'll just have to live with
> sched_clock starting late(r).
>
> Do not cobble things on the side, try and get the normal things running
> earlier.
Hi Peter,
How about taking patches 1-4 and 7 ? I can work on patches 5 & 6
separetly to provide a better integration for x86, this can be decided
separetly but the other patches will enable early clock functionality
on arches where sched_clock() is available early.
Pavel
On Thu, May 31, 2018 at 11:52 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, May 31, 2018 at 03:55:42PM +0200, Petr Mladek wrote:
> > I wonder if we could get some cleaner integration into the timer and
> > printk code.
>
> Yes, these patches are particularly horrific..
>
> There were some earlier patches by Pavel Tatashin, which attempted do
> get things running earlier.
>
> http://lkml.kernel.org/r/[email protected]
>
> I'm not entirely happy with that, but I never did get around to
> reviewing that last version :-( In particuarly, now that you made me
> look, I dislike his patch 6 almost as much as these patches.
>
> The idea was to get regular sched_clock() running earlier, not to botch
> some early_sched_clock() into it.
>
> Basically run calibrate_tsc() earlier (like _waaay_ earlier, it doesn't
> rely on anything other than CPUID) and if you have a recent part (with
> exception of SKX) you'll get a usable tsc rate (and TSC_RELIABLE) and
> things will work.
>
> If you have a dodgy part (sorry SKX), you'll just have to live with
> sched_clock starting late(r).
>
> Do not cobble things on the side, try and get the normal things running
> earlier.
On Sat, Jun 02, 2018 at 12:12:13AM +0800, Feng Tang wrote:
Hi Peter and all,
> Hi Peter and Petr,
>
> Thanks for your suggestions, will try to find a cleaner and less hacky way,
> and it may take some time as dealing with all kinds of TSC is tricky :)
>
> - Feng
>
> On Thu, May 31, 2018 at 05:52:10PM +0200, Peter Zijlstra wrote:
> > On Thu, May 31, 2018 at 03:55:42PM +0200, Petr Mladek wrote:
> > > I wonder if we could get some cleaner integration into the timer and
> > > printk code.
> >
> > Yes, these patches are particularly horrific..
> >
> > There were some earlier patches by Pavel Tatashin, which attempted do
> > get things running earlier.
> >
> > http://lkml.kernel.org/r/[email protected]
> >
> > I'm not entirely happy with that, but I never did get around to
> > reviewing that last version :-( In particuarly, now that you made me
> > look, I dislike his patch 6 almost as much as these patches.
> >
> > The idea was to get regular sched_clock() running earlier, not to botch
> > some early_sched_clock() into it.
> >
> > Basically run calibrate_tsc() earlier (like _waaay_ earlier, it doesn't
> > rely on anything other than CPUID) and if you have a recent part (with
> > exception of SKX) you'll get a usable tsc rate (and TSC_RELIABLE) and
> > things will work.
I just did a hacky experiment by moving the tsc_init()earlier into
setup_arch() and remove the tsc_early_delay_calibrate(). The printk stamp
does start working much earlier!
But the __use_tsc and __sched_clock_stable are relying on jump_label,
which can't be used so early (I tried to call the jump_label_init() before
tsc_init(), but kernel crashs, and I worked around it for now).
Please review the debug patch, thanks!
---
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5c623df..b636888 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1201,7 +1201,8 @@ void __init setup_arch(char **cmdline_p)
kvmclock_init();
#endif
- tsc_early_delay_calibrate();
+ tsc_init();
+
if (!early_xdbc_setup_hardware())
early_xdbc_register_console();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 4008dd6..8288f39 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(cpu_khz);
unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);
+int tsc_inited;
/*
* TSC can be unstable due to cpufreq or due to unsynced TSCs
*/
@@ -192,7 +193,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_
*/
u64 native_sched_clock(void)
{
- if (static_branch_likely(&__use_tsc)) {
+ if (static_branch_likely(&__use_tsc) || tsc_inited) {
u64 tsc_now = rdtsc();
/* return the value in ns */
@@ -1387,30 +1391,16 @@ static int __init init_tsc_clocksource(void)
*/
device_initcall(init_tsc_clocksource);
-void __init tsc_early_delay_calibrate(void)
-{
- unsigned long lpj;
-
- if (!boot_cpu_has(X86_FEATURE_TSC))
- return;
-
- cpu_khz = x86_platform.calibrate_cpu();
- tsc_khz = x86_platform.calibrate_tsc();
-
- tsc_khz = tsc_khz ? : cpu_khz;
- if (!tsc_khz)
- return;
-
- lpj = tsc_khz * 1000;
- do_div(lpj, HZ);
- loops_per_jiffy = lpj;
-}
-
void __init tsc_init(void)
{
u64 lpj, cyc;
int cpu;
+ if (tsc_inited)
+ return;
+
+ tsc_inited = 1;
+
if (!boot_cpu_has(X86_FEATURE_TSC)) {
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
@@ -1474,11 +1464,15 @@ void __init tsc_init(void)
lpj = ((u64)tsc_khz * 1000);
do_div(lpj, HZ);
lpj_fine = lpj;
+ loops_per_jiffy = lpj;
use_tsc_delay();
check_system_tsc_reliable();
+ extern void early_set_sched_clock_stable(u64 sched_clock_offset);
+ early_set_sched_clock_stable(div64_u64(rdtsc() * 1000, tsc_khz));
+
if (unsynchronized_tsc()) {
mark_tsc_unstable("TSCs unsynchronized");
return;
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 10c83e7..6c5c22d 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -119,6 +119,13 @@ static void __scd_stamp(struct sched_clock_data *scd)
scd->tick_raw = sched_clock();
}
+
+void early_set_sched_clock_stable(u64 sched_clock_offset)
+{
+ __sched_clock_offset = sched_clock_offset;
+ static_branch_enable(&__sched_clock_stable);
+}
+
static void __set_sched_clock_stable(void)
{
struct sched_clock_data *scd;
@@ -342,12 +349,14 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
*
* See cpu_clock().
*/
+
+extern int tsc_inited;
u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd;
u64 clock;
- if (sched_clock_stable())
+ if (sched_clock_stable() || tsc_inited)
return sched_clock() + __sched_clock_offset;
if (unlikely(!sched_clock_running))
> >
> > If you have a dodgy part (sorry SKX), you'll just have to live with
> > sched_clock starting late(r).
> >
> > Do not cobble things on the side, try and get the normal things running
> > earlier.
Hi Feng,
Using a global variable for this is not going to work, because you are adding a conditional branch and a load to a very hot path for the live of the system, not only for the duration of the boot.
Pavel
>
> +int tsc_inited;
> /*
> * TSC can be unstable due to cpufreq or due to unsynced TSCs
> */
> @@ -192,7 +193,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_
> */
> u64 native_sched_clock(void)
> {
> - if (static_branch_likely(&__use_tsc)) {
> + if (static_branch_likely(&__use_tsc) || tsc_inited) {
> u64 tsc_now = rdtsc();
>
> /* return the value in ns */
> @@ -1387,30 +1391,16 @@ static int __init init_tsc_clocksource(void)
> */
> device_initcall(init_tsc_clocksource);
>
> -void __init tsc_early_delay_calibrate(void)
> -{
> - unsigned long lpj;
> -
> - if (!boot_cpu_has(X86_FEATURE_TSC))
> - return;
> -
> - cpu_khz = x86_platform.calibrate_cpu();
> - tsc_khz = x86_platform.calibrate_tsc();
> -
> - tsc_khz = tsc_khz ? : cpu_khz;
> - if (!tsc_khz)
> - return;
> -
> - lpj = tsc_khz * 1000;
> - do_div(lpj, HZ);
> - loops_per_jiffy = lpj;
> -}
> -
> void __init tsc_init(void)
> {
> u64 lpj, cyc;
> int cpu;
>
> + if (tsc_inited)
> + return;
> +
> + tsc_inited = 1;
> +
> if (!boot_cpu_has(X86_FEATURE_TSC)) {
> setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
> return;
> @@ -1474,11 +1464,15 @@ void __init tsc_init(void)
> lpj = ((u64)tsc_khz * 1000);
> do_div(lpj, HZ);
> lpj_fine = lpj;
> + loops_per_jiffy = lpj;
>
> use_tsc_delay();
>
> check_system_tsc_reliable();
>
> + extern void early_set_sched_clock_stable(u64 sched_clock_offset);
> + early_set_sched_clock_stable(div64_u64(rdtsc() * 1000, tsc_khz));
> +
> if (unsynchronized_tsc()) {
> mark_tsc_unstable("TSCs unsynchronized");
> return;
> diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
> index 10c83e7..6c5c22d 100644
> --- a/kernel/sched/clock.c
> +++ b/kernel/sched/clock.c
> @@ -119,6 +119,13 @@ static void __scd_stamp(struct sched_clock_data *scd)
> scd->tick_raw = sched_clock();
> }
>
> +
> +void early_set_sched_clock_stable(u64 sched_clock_offset)
> +{
> + __sched_clock_offset = sched_clock_offset;
> + static_branch_enable(&__sched_clock_stable);
> +}
> +
> static void __set_sched_clock_stable(void)
> {
> struct sched_clock_data *scd;
> @@ -342,12 +349,14 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
> *
> * See cpu_clock().
> */
> +
> +extern int tsc_inited;
> u64 sched_clock_cpu(int cpu)
> {
> struct sched_clock_data *scd;
> u64 clock;
>
> - if (sched_clock_stable())
> + if (sched_clock_stable() || tsc_inited)
> return sched_clock() + __sched_clock_offset;
>
> if (unlikely(!sched_clock_running))
>
>
>
>
>>>
>>> If you have a dodgy part (sorry SKX), you'll just have to live with
>>> sched_clock starting late(r).
>>>
>>> Do not cobble things on the side, try and get the normal things running
>>> earlier.
Hi Pavel,
Thanks for the revew.
On Wed, Jun 06, 2018 at 11:25:22AM -0400, Pavel Tatashin wrote:
> Hi Feng,
>
> Using a global variable for this is not going to work, because you are adding a conditional branch and a load to a very hot path for the live of the system, not only for the duration of the boot.
Exactly. As I explained, I wanted to use the "__use_tsc" and
"__sched_clock_stable" without creating new gloabl variables, but
the problem is jump_label_init() can't be called that early,
so I used "tsc_inited" temply just to show tsc_init() could be call
early, and the printk timestamp could work much earlier.
Thanks,
Feng
>
> Pavel
>
> >
> > +int tsc_inited;
> > /*
> > * TSC can be unstable due to cpufreq or due to unsynced TSCs
> > */
> > @@ -192,7 +193,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_
> > */
> > u64 native_sched_clock(void)
> > {
> > - if (static_branch_likely(&__use_tsc)) {
> > + if (static_branch_likely(&__use_tsc) || tsc_inited) {
> > u64 tsc_now = rdtsc();
> >
> > /* return the value in ns */
> > @@ -1387,30 +1391,16 @@ static int __init init_tsc_clocksource(void)
> > */
> > device_initcall(init_tsc_clocksource);
> >
> > -void __init tsc_early_delay_calibrate(void)
> > -{
> > - unsigned long lpj;
> > -
> > - if (!boot_cpu_has(X86_FEATURE_TSC))
> > - return;
> > -
> > - cpu_khz = x86_platform.calibrate_cpu();
> > - tsc_khz = x86_platform.calibrate_tsc();
> > -
> > - tsc_khz = tsc_khz ? : cpu_khz;
> > - if (!tsc_khz)
> > - return;
> > -
> > - lpj = tsc_khz * 1000;
> > - do_div(lpj, HZ);
> > - loops_per_jiffy = lpj;
> > -}
> > -
> > void __init tsc_init(void)
> > {
> > u64 lpj, cyc;
> > int cpu;
> >
> > + if (tsc_inited)
> > + return;
> > +
> > + tsc_inited = 1;
> > +
> > if (!boot_cpu_has(X86_FEATURE_TSC)) {
> > setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
> > return;
> > @@ -1474,11 +1464,15 @@ void __init tsc_init(void)
> > lpj = ((u64)tsc_khz * 1000);
> > do_div(lpj, HZ);
> > lpj_fine = lpj;
> > + loops_per_jiffy = lpj;
> >
> > use_tsc_delay();
> >
> > check_system_tsc_reliable();
> >
> > + extern void early_set_sched_clock_stable(u64 sched_clock_offset);
> > + early_set_sched_clock_stable(div64_u64(rdtsc() * 1000, tsc_khz));
> > +
> > if (unsynchronized_tsc()) {
> > mark_tsc_unstable("TSCs unsynchronized");
> > return;
> > diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
> > index 10c83e7..6c5c22d 100644
> > --- a/kernel/sched/clock.c
> > +++ b/kernel/sched/clock.c
> > @@ -119,6 +119,13 @@ static void __scd_stamp(struct sched_clock_data *scd)
> > scd->tick_raw = sched_clock();
> > }
> >
> > +
> > +void early_set_sched_clock_stable(u64 sched_clock_offset)
> > +{
> > + __sched_clock_offset = sched_clock_offset;
> > + static_branch_enable(&__sched_clock_stable);
> > +}
> > +
> > static void __set_sched_clock_stable(void)
> > {
> > struct sched_clock_data *scd;
> > @@ -342,12 +349,14 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
> > *
> > * See cpu_clock().
> > */
> > +
> > +extern int tsc_inited;
> > u64 sched_clock_cpu(int cpu)
> > {
> > struct sched_clock_data *scd;
> > u64 clock;
> >
> > - if (sched_clock_stable())
> > + if (sched_clock_stable() || tsc_inited)
> > return sched_clock() + __sched_clock_offset;
> >
> > if (unlikely(!sched_clock_running))
> >
> >
> >
> >
> >>>
> >>> If you have a dodgy part (sorry SKX), you'll just have to live with
> >>> sched_clock starting late(r).
> >>>
> >>> Do not cobble things on the side, try and get the normal things running
> >>> earlier.
On Wed, Jun 06, 2018 at 05:38:33PM +0800, Feng Tang wrote:
> On Sat, Jun 02, 2018 at 12:12:13AM +0800, Feng Tang wrote:
>
> Hi Peter and all,
>
>
> > Hi Peter and Petr,
> >
> > Thanks for your suggestions, will try to find a cleaner and less hacky way,
> > and it may take some time as dealing with all kinds of TSC is tricky :)
> >
> > - Feng
> >
> > On Thu, May 31, 2018 at 05:52:10PM +0200, Peter Zijlstra wrote:
> > > On Thu, May 31, 2018 at 03:55:42PM +0200, Petr Mladek wrote:
> > > > I wonder if we could get some cleaner integration into the timer and
> > > > printk code.
> > >
> > > Yes, these patches are particularly horrific..
> > >
> > > There were some earlier patches by Pavel Tatashin, which attempted do
> > > get things running earlier.
> > >
> > > http://lkml.kernel.org/r/[email protected]
> > >
> > > I'm not entirely happy with that, but I never did get around to
> > > reviewing that last version :-( In particuarly, now that you made me
> > > look, I dislike his patch 6 almost as much as these patches.
> > >
> > > The idea was to get regular sched_clock() running earlier, not to botch
> > > some early_sched_clock() into it.
> > >
> > > Basically run calibrate_tsc() earlier (like _waaay_ earlier, it doesn't
> > > rely on anything other than CPUID) and if you have a recent part (with
> > > exception of SKX) you'll get a usable tsc rate (and TSC_RELIABLE) and
> > > things will work.
>
>
> I just did a hacky experiment by moving the tsc_init()earlier into
> setup_arch() and remove the tsc_early_delay_calibrate(). The printk stamp
> does start working much earlier!
>
>
> But the __use_tsc and __sched_clock_stable are relying on jump_label,
> which can't be used so early (I tried to call the jump_label_init() before
> tsc_init(), but kernel crashs, and I worked around it for now).
Just figured out the kernel crash when taking jump_label_init() earlier
into setup_arch(), the tsc_init() will enable static_key __use_tsc
static_key_enable
__jump_label_update
arch_jump_label_transform
__jump_label_transform
text_poke_bp
text_poke
text_poke() will involve page , but paging is not initialized so
early yet, so it triggers a panic.
Beside this __use_tsc, the sched_clock also has one static key
__sched_clock_stable
Thanks,
Feng
>
> Please review the debug patch, thanks!
>
> ---
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 5c623df..b636888 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -1201,7 +1201,8 @@ void __init setup_arch(char **cmdline_p)
> kvmclock_init();
> #endif
>
> - tsc_early_delay_calibrate();
> + tsc_init();
> +
> if (!early_xdbc_setup_hardware())
> early_xdbc_register_console();
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 4008dd6..8288f39 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -33,6 +33,7 @@ EXPORT_SYMBOL(cpu_khz);
> unsigned int __read_mostly tsc_khz;
> EXPORT_SYMBOL(tsc_khz);
>
> +int tsc_inited;
> /*
> * TSC can be unstable due to cpufreq or due to unsynced TSCs
> */
> @@ -192,7 +193,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_
> */
> u64 native_sched_clock(void)
> {
> - if (static_branch_likely(&__use_tsc)) {
> + if (static_branch_likely(&__use_tsc) || tsc_inited) {
> u64 tsc_now = rdtsc();
>
> /* return the value in ns */
> @@ -1387,30 +1391,16 @@ static int __init init_tsc_clocksource(void)
> */
> device_initcall(init_tsc_clocksource);
>
> -void __init tsc_early_delay_calibrate(void)
> -{
> - unsigned long lpj;
> -
> - if (!boot_cpu_has(X86_FEATURE_TSC))
> - return;
> -
> - cpu_khz = x86_platform.calibrate_cpu();
> - tsc_khz = x86_platform.calibrate_tsc();
> -
> - tsc_khz = tsc_khz ? : cpu_khz;
> - if (!tsc_khz)
> - return;
> -
> - lpj = tsc_khz * 1000;
> - do_div(lpj, HZ);
> - loops_per_jiffy = lpj;
> -}
> -
> void __init tsc_init(void)
> {
> u64 lpj, cyc;
> int cpu;
>
> + if (tsc_inited)
> + return;
> +
> + tsc_inited = 1;
> +
> if (!boot_cpu_has(X86_FEATURE_TSC)) {
> setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
> return;
> @@ -1474,11 +1464,15 @@ void __init tsc_init(void)
> lpj = ((u64)tsc_khz * 1000);
> do_div(lpj, HZ);
> lpj_fine = lpj;
> + loops_per_jiffy = lpj;
>
> use_tsc_delay();
>
> check_system_tsc_reliable();
>
> + extern void early_set_sched_clock_stable(u64 sched_clock_offset);
> + early_set_sched_clock_stable(div64_u64(rdtsc() * 1000, tsc_khz));
> +
> if (unsynchronized_tsc()) {
> mark_tsc_unstable("TSCs unsynchronized");
> return;
> diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
> index 10c83e7..6c5c22d 100644
> --- a/kernel/sched/clock.c
> +++ b/kernel/sched/clock.c
> @@ -119,6 +119,13 @@ static void __scd_stamp(struct sched_clock_data *scd)
> scd->tick_raw = sched_clock();
> }
>
> +
> +void early_set_sched_clock_stable(u64 sched_clock_offset)
> +{
> + __sched_clock_offset = sched_clock_offset;
> + static_branch_enable(&__sched_clock_stable);
> +}
> +
> static void __set_sched_clock_stable(void)
> {
> struct sched_clock_data *scd;
> @@ -342,12 +349,14 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
> *
> * See cpu_clock().
> */
> +
> +extern int tsc_inited;
> u64 sched_clock_cpu(int cpu)
> {
> struct sched_clock_data *scd;
> u64 clock;
>
> - if (sched_clock_stable())
> + if (sched_clock_stable() || tsc_inited)
> return sched_clock() + __sched_clock_offset;
>
> if (unlikely(!sched_clock_running))
>
>
>
>
> > >
> > > If you have a dodgy part (sorry SKX), you'll just have to live with
> > > sched_clock starting late(r).
> > >
> > > Do not cobble things on the side, try and get the normal things running
> > > earlier.
On Wed, May 30, 2018 at 05:20:58PM +0800, Feng Tang wrote:
> Currently printk timestamp mostly come from the sched_clock which
> depends on the clock setup, so there are many kernel logs started
> with "[ 0.000000] " before the clock is calibrated.
>
> This patch will provide an debug option for specific platform to
> provide a early boot time clock, so that we can have time info in
> kernel log much earlier, which can show the time info for the early
> kernel boot, and make boottime tuning/optimization easier (boot time
> is critical for phone/tablet and embedded devices).
>
> Capable platform only need to setup the "boot_printk_clock_fn"
> which could return time in nano seconds.
>
> Together with a TSC patch on x86 system, we have easily captured
> some early boottime killer like unwind_init() which takes about
> 300ms in boot phase.
Hi Petr and all,
As the 2/2 tsc related patch is still under review/discussion, can
we consider taking this first? As this may benefit other archs.
For example, Intel Curie platform has an always-on 32KHz osc clock,
which is accurate but low frequency, and it could be used as
early printk timestamp until the high-resolution timer is initialized
and used as sched_clock. Don't know if ARM or other platforms
have similar use case.
Thanks,
Feng
On Thu, 14 Jun 2018, Feng Tang wrote:
> On Wed, May 30, 2018 at 05:20:58PM +0800, Feng Tang wrote:
> > Currently printk timestamp mostly come from the sched_clock which
> > depends on the clock setup, so there are many kernel logs started
> > with "[ 0.000000] " before the clock is calibrated.
> >
> > This patch will provide an debug option for specific platform to
> > provide a early boot time clock, so that we can have time info in
> > kernel log much earlier, which can show the time info for the early
> > kernel boot, and make boottime tuning/optimization easier (boot time
> > is critical for phone/tablet and embedded devices).
> >
> > Capable platform only need to setup the "boot_printk_clock_fn"
> > which could return time in nano seconds.
> >
> > Together with a TSC patch on x86 system, we have easily captured
> > some early boottime killer like unwind_init() which takes about
> > 300ms in boot phase.
>
> Hi Petr and all,
>
> As the 2/2 tsc related patch is still under review/discussion, can
> we consider taking this first? As this may benefit other archs.
> For example, Intel Curie platform has an always-on 32KHz osc clock,
> which is accurate but low frequency, and it could be used as
> early printk timestamp until the high-resolution timer is initialized
> and used as sched_clock. Don't know if ARM or other platforms
> have similar use case.
Can we please _NOT_ add half sorted core stuff in a hurry?
Thanks,
tglx
Hi Thomas,
On Thu, Jun 14, 2018 at 11:08:04AM +0200, Thomas Gleixner wrote:
> On Thu, 14 Jun 2018, Feng Tang wrote:
> > On Wed, May 30, 2018 at 05:20:58PM +0800, Feng Tang wrote:
> > > Currently printk timestamp mostly come from the sched_clock which
> > > depends on the clock setup, so there are many kernel logs started
> > > with "[ 0.000000] " before the clock is calibrated.
> > >
> > > This patch will provide an debug option for specific platform to
> > > provide a early boot time clock, so that we can have time info in
> > > kernel log much earlier, which can show the time info for the early
> > > kernel boot, and make boottime tuning/optimization easier (boot time
> > > is critical for phone/tablet and embedded devices).
> > >
> > > Capable platform only need to setup the "boot_printk_clock_fn"
> > > which could return time in nano seconds.
> > >
> > > Together with a TSC patch on x86 system, we have easily captured
> > > some early boottime killer like unwind_init() which takes about
> > > 300ms in boot phase.
> >
> > Hi Petr and all,
> >
> > As the 2/2 tsc related patch is still under review/discussion, can
> > we consider taking this first? As this may benefit other archs.
> > For example, Intel Curie platform has an always-on 32KHz osc clock,
> > which is accurate but low frequency, and it could be used as
> > early printk timestamp until the high-resolution timer is initialized
> > and used as sched_clock. Don't know if ARM or other platforms
> > have similar use case.
>
> Can we please _NOT_ add half sorted core stuff in a hurry?
Got it.
The status for the 2/2 tsc patch is Peter suggested to bring the tsc_init()
earlier, with that this 1/2 is not needed for x86. I followed Peter's
suggestion and worked out a hack version with which the tsc inits fine
and printk timestamp works much earlier (detail is in the 2/2 thread).
But there is one blocker, the native_sched_clock()(tsc.c) use one staic key
__use_tsc and sched_clock_cpu()(clock.c) use another "__sched_clock_stable",
which cannot be used so early, one problem is the static_branch_enable()
will use pageing related code while the paging is not setup ready yet.
I haven't got idea to solve this first problem regarding static key.
Thanks,
Feng