From boot to boot, the TSC calibration may vary by quite a large
amount. While normal variance of 50-100 ppm can easily be seen, the
quick calibration code only requires 500 ppm accuracy, which is the
limit of what NTP can correct for.
This can cause problems for systems being used as NTP servers: every
time they reboot, it can take hours for them to calculate the new
drift error introduced by the calibration.
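(For scale, back-of-the-envelope rather than anything measured here:
a 500 ppm frequency error accumulates to roughly 43 seconds of drift
per day, while 0.4 ppm works out to about 35 ms per day.)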
The classic trade-off here is calibration accuracy vs. boot time, as
nothing else can run during the calibration.
This patch uses a delayed workqueue to calibrate the TSC over the
period of a second. This allows very accurate calibration (in my
tests varying by only 1 kHz, or 0.4 ppm, boot to boot). Additionally,
this refined calibration step does not block the boot process, and
only delays the TSC clocksource registration by a few seconds in
early boot.
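When the refinement succeeds it is visible in dmesg via the printk
added below; an illustrative line (frequency value made up) would be:

  Refined TSC clocksource calibration: 2793.105 MHz.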
Credit to Andi Kleen, who suggested using a timer quite a while back,
but I dismissed it thinking the timer calibration would be done after
the clocksource was registered (which would break things). Forgive
me for my short-sightedness.
This patch has worked very well in my testing, but TSC hardware is
quite varied, so it would probably be good to get some extended
testing, possibly pushing inclusion out to 2.6.39.
Signed-off-by: John Stultz <[email protected]>
CC: Thomas Gleixner <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Martin Schwidefsky <[email protected]>
CC: Clark Williams <[email protected]>
CC: Andi Kleen <[email protected]>
---
arch/x86/kernel/tsc.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 0c40d8b..e88b71e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -885,7 +885,69 @@ __cpuinit int unsynchronized_tsc(void)
return tsc_unstable;
}
-static void __init init_tsc_clocksource(void)
+
+static void calibrate_tsc_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, calibrate_tsc_work);
+
+static void calibrate_tsc_work(struct work_struct *work)
+{
+ static u64 tsc_start = -1, ref_start;
+ static int hpet;
+ u64 tsc_stop, ref_stop, delta;
+ unsigned long freq;
+
+ /* Don't bother refining TSC on unstable systems */
+ if (check_tsc_unstable())
+ goto out;
+
+ /*
+ * Since the timer is started early in boot, we may be
+ * delayed the first time we expire. So set the timer
+ * again once we know timers are working.
+ */
+ if (tsc_start == -1) {
+ /*
+ * Only set hpet once, to avoid mixing hardware
+ * if the hpet becomes enabled later.
+ */
+ hpet = is_hpet_enabled();
+ schedule_delayed_work(&tsc_irqwork, HZ);
+ tsc_start = tsc_read_refs(&ref_start, hpet);
+ return;
+ }
+
+ tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+ /* hpet or pmtimer available ? */
+ if (!hpet && !ref_start && !ref_stop)
+ goto out;
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+ goto out;
+
+ delta = tsc_stop - tsc_start;
+ delta *= 1000000LL;
+ if (hpet)
+ freq = calc_hpet_ref(delta, ref_start, ref_stop);
+ else
+ freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+ /* Make sure we're within 1% */
+ if (abs(tsc_khz - freq) > tsc_khz/100)
+ goto out;
+
+ tsc_khz = freq;
+ printk(KERN_INFO "Refined TSC clocksource calibration: "
+ "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+
+out:
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+}
+
+
+static int __init init_tsc_clocksource(void)
{
if (tsc_clocksource_reliable)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
@@ -894,8 +956,10 @@ static void __init init_tsc_clocksource(void)
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
- clocksource_register_khz(&clocksource_tsc, tsc_khz);
+ schedule_delayed_work(&tsc_irqwork, 0);
+ return 0;
}
+core_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
@@ -949,6 +1013,5 @@ void __init tsc_init(void)
mark_tsc_unstable("TSCs unsynchronized");
check_system_tsc_reliable();
- init_tsc_clocksource();
}
--
1.7.3.2.146.gca209
Hi John,
> + /*
> + * Since the timer is started early in boot, we may be
> + * delayed the first time we expire. So set the timer
> + * again once we know timers are working.
> + */
> + if (tsc_start == -1) {
> + /*
> + * Only set hpet once, to avoid mixing hardware
> + * if the hpet becomes enabled later.
> + */
> + hpet = is_hpet_enabled();
> + schedule_delayed_work(&tsc_irqwork, HZ);
> + tsc_start = tsc_read_refs(&ref_start, hpet);
> + return;
> + }
> +
> + tsc_stop = tsc_read_refs(&ref_stop, hpet);
The HPET init code stops and restarts the HPET. I think you need
some way to protect against that here, e.g. a variable, and rearming
the timer if it's set.
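Roughly, the guard could look like this in the work function (sketch
only; hpet_was_restarted is an invented flag the HPET init code would
have to set whenever it stops/restarts the hardware):

	static bool hpet_was_restarted; /* set from HPET init code */

	if (hpet_was_restarted) {
		/* our sample spans an HPET stop/restart: discard it */
		hpet_was_restarted = false;
		tsc_start = -1;
		schedule_delayed_work(&tsc_irqwork, HZ);
		return;
	}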
Another issue may be races against suspend, but that may be too
obscure.
I also worry a bit about NMIs etc. running later during this
and messing up the measurement, but I guess the longer period
makes up for it.
The rest of the patch looks ok to me.
-Andi
On Sun, 2010-11-07 at 21:41 +0100, Andi Kleen wrote:
> Hi John,
>
> > + /*
> > + * Since the timer is started early in boot, we may be
> > + * delayed the first time we expire. So set the timer
> > + * again once we know timers are working.
> > + */
> > + if (tsc_start == -1) {
> > + /*
> > + * Only set hpet once, to avoid mixing hardware
> > + * if the hpet becomes enabled later.
> > + */
> > + hpet = is_hpet_enabled();
> > + schedule_delayed_work(&tsc_irqwork, HZ);
> > + tsc_start = tsc_read_refs(&ref_start, hpet);
> > + return;
> > + }
> > +
> > + tsc_stop = tsc_read_refs(&ref_stop, hpet);
>
> The HPET init code stops and restarts the HPET. I think you need
> some way to protect against that here, e.g. a variable, and rearming
> the timer if it's set.
Interesting. Thanks for pointing that out! Couldn't I just start the
calibration after fs_initcall (when hpet_late_init runs) to avoid
this as well?
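E.g. something like (sketch; this just assumes the usual initcall
ordering, where device_initcall runs after fs_initcall):

	/*
	 * device_initcall runs after fs_initcall, so hpet_late_init
	 * has already done its stop/restart by the time this fires.
	 */
	device_initcall(init_tsc_clocksource);

in place of the core_initcall in the patch.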
> Another issue may be races against suspend, but that may be too
> obscure.
Yeah, that seems fairly obscure. Basically you'd have to suspend
within the first second of the system coming up. In that case the
code will throw out any calibration refinement that's over 1% off of
the initial boot calibration, so I think this is an ok trade-off.
> I also worry a bit about NMIs etc. running later during this
> and messing up the measurement, but I guess the longer period
> makes up for it.
Yeah, the one-second period should help minimize any disturbance,
and again, this is just a refinement over the existing calibration,
so if it's more than 1% off of the boot-time fast calibration, we'll
throw it out.
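(Back-of-the-envelope: a one-off 10 us disturbance across the one
second sampling window skews the measured frequency by only ~10 ppm,
whereas the same hit inside a hypothetical 20 ms fast-calibration
window would be ~500 ppm. Anything larger trips the 1% cutoff and is
thrown out.)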
> The rest of the patch looks ok to me.
Thanks for the review!
-john
> Interesting. Thanks for pointing that out! Couldn't I just start the
> calibration after fs_initcall (when hpet_late_init runs) to avoid
> this as well?
Yes, that probably would work. Or use the barrier infrastructure
in workqueue.c.
>
> > Another issue may be races against suspend, but that may be too
> > obscure.
>
> Yeah, that seems fairly obscure. Basically you'd have to suspend
> within the first second of the system coming up. In that case the
> code will throw out any calibration refinement that's over 1% off of
> the initial boot calibration, so I think this is an ok trade-off.
It may happen with opportunistic suspend if the system boots very fast.
-Andi
--
[email protected] -- Speaking for myself only.
On Tue, 2010-11-09 at 14:43 +0100, Andi Kleen wrote:
> > Interesting. Thanks for pointing that out! Couldn't I just start the
> > calibration after fs_initcall (when hpet_late_init runs) to avoid
> > this as well?
>
> Yes, that probably would work. Or use the barrier infrastructure
> in workqueue.c.
>
> >
> > > Another issue may be races against suspend, but that may be too
> > > obscure.
> >
> > Yeah, that seems fairly obscure. Basically you'd have to suspend
> > within the first second of the system coming up. In that case the
> > code will throw out any calibration refinement that's over 1% off of
> > the initial boot calibration, so I think this is an ok trade-off.
>
> It may happen with opportunistic suspend if the system boots very fast.
Right, but a system using opportunistic suspend will have a hard
enough time keeping close NTP sync on its own, given the frequent
switching between the fine-grained, NTP-adjusted clocksource at run
time and the coarse, non-adjusted RTC/persistent_clock while
suspended.
So I think such a system would be fine if it falls back to using just
the boot calibration for the TSC freq rather than the refined
calibration freq calculated by this patch (which will happen
automatically if the refined calibration is off by more than 1%).
Does that seem like a reasonable trade-off?
thanks
-john
On Tue, Nov 09, 2010 at 01:41:40PM -0800, john stultz wrote:
> > It may happen with opportunistic suspend if the system boots very fast.
>
> Right, but a system using opportunistic suspend will have a hard
> enough time keeping close NTP sync on its own, given the frequent
> switching between the fine-grained, NTP-adjusted clocksource at run
> time and the coarse, non-adjusted RTC/persistent_clock while
> suspended.
>
> So I think such a system would be fine if it falls back to using just
> the boot calibration for the TSC freq rather than the refined
> calibration freq calculated by this patch (which will happen
> automatically if the refined calibration is off by more than 1%).
>
> Does that seem like a reasonable trade-off?
Yes, it sounds good to me. Some inaccuracy in this case is fine, I
guess; just major inaccuracy, or a crash or hang, wouldn't be good.
-Andi
--
[email protected] -- Speaking for myself only.
Commit-ID: 08ec0c58fb8a05d3191d5cb6f5d6f81adb419798
Gitweb: http://git.kernel.org/tip/08ec0c58fb8a05d3191d5cb6f5d6f81adb419798
Author: John Stultz <[email protected]>
AuthorDate: Tue, 27 Jul 2010 17:00:00 -0700
Committer: John Stultz <[email protected]>
CommitDate: Thu, 2 Dec 2010 16:48:37 -0800
x86: Improve TSC calibration using a delayed workqueue
From boot to boot, the TSC calibration may vary by quite a large
amount. While normal variance of 50-100 ppm can easily be seen, the
quick calibration code only requires 500 ppm accuracy, which is the
limit of what NTP can correct for.
This can cause problems for systems being used as NTP servers: every
time they reboot, it can take hours for them to calculate the new
drift error introduced by the calibration.
The classic trade-off here is calibration accuracy vs. boot time, as
nothing else can run during the calibration.
This patch uses a delayed workqueue to calibrate the TSC over the
period of a second. This allows very accurate calibration (in my
tests varying by only 1 kHz, or 0.4 ppm, boot to boot). Additionally,
this refined calibration step does not block the boot process, and
only delays the TSC clocksource registration by a few seconds in
early boot.
If the refined calibration strays more than 1% from the early boot
calibration value, the system will fall back to the already-calculated
early boot calibration.
Credit to Andi Kleen, who suggested using a timer quite a while back,
but I dismissed it thinking the timer calibration would be done after
the clocksource was registered (which would break things). Forgive
me for my short-sightedness.
This patch has worked very well in my testing, but TSC hardware is
quite varied, so it would probably be good to get some extended
testing, possibly pushing inclusion out to 2.6.39.
Signed-off-by: John Stultz <[email protected]>
LKML-Reference: <[email protected]>
Reviewed-by: Thomas Gleixner <[email protected]>
CC: Thomas Gleixner <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Martin Schwidefsky <[email protected]>
CC: Clark Williams <[email protected]>
CC: Andi Kleen <[email protected]>
---
arch/x86/kernel/tsc.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 83 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index bb64beb..dc1393e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -888,7 +888,82 @@ __cpuinit int unsynchronized_tsc(void)
return 0;
}
-static void __init init_tsc_clocksource(void)
+
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work: ignored.
+ *
+ * This function uses delayed work over a period of a
+ * second to further refine the TSC freq value. Since this is
+ * timer based, instead of loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomalies (too many SMIs, etc.),
+ * or the refined calibration is off by more than 1% from the fast
+ * early calibration, we throw out the new calibration and use the
+ * early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
+{
+ static u64 tsc_start = -1, ref_start;
+ static int hpet;
+ u64 tsc_stop, ref_stop, delta;
+ unsigned long freq;
+
+ /* Don't bother refining TSC on unstable systems */
+ if (check_tsc_unstable())
+ goto out;
+
+ /*
+ * Since the work is started early in boot, we may be
+ * delayed the first time we expire. So set the workqueue
+ * again once we know timers are working.
+ */
+ if (tsc_start == -1) {
+ /*
+ * Only set hpet once, to avoid mixing hardware
+ * if the hpet becomes enabled later.
+ */
+ hpet = is_hpet_enabled();
+ schedule_delayed_work(&tsc_irqwork, HZ);
+ tsc_start = tsc_read_refs(&ref_start, hpet);
+ return;
+ }
+
+ tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+ /* hpet or pmtimer available ? */
+ if (!hpet && !ref_start && !ref_stop)
+ goto out;
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+ goto out;
+
+ delta = tsc_stop - tsc_start;
+ delta *= 1000000LL;
+ if (hpet)
+ freq = calc_hpet_ref(delta, ref_start, ref_stop);
+ else
+ freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+ /* Make sure we're within 1% */
+ if (abs(tsc_khz - freq) > tsc_khz/100)
+ goto out;
+
+ tsc_khz = freq;
+ printk(KERN_INFO "Refined TSC clocksource calibration: "
+ "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+
+out:
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+}
+
+
+static int __init init_tsc_clocksource(void)
{
if (tsc_clocksource_reliable)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
@@ -897,8 +972,14 @@ static void __init init_tsc_clocksource(void)
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
- clocksource_register_khz(&clocksource_tsc, tsc_khz);
+ schedule_delayed_work(&tsc_irqwork, 0);
+ return 0;
}
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
@@ -952,6 +1033,5 @@ void __init tsc_init(void)
mark_tsc_unstable("TSCs unsynchronized");
check_system_tsc_reliable();
- init_tsc_clocksource();
}