If it takes longer than 12us to read the PIT counter lsb/msb,
then the error margin will never fall below 500ppm within 50ms,
and Fast TSC calibration will always fail.
This patch detects when that will happen and switches to using
a slightly different algorithm that takes advantage of the PIT's
latch command.
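For reference, the 12us figure follows from the existing 500 ppm threshold (the delta >> 11 test) and the 50 ms budget; here is a tiny standalone check, illustrative only and not part of the patch:
	#include <stdio.h>
	/* Derive the read-latency limit from the 500 ppm (>> 11) threshold
	 * and the 50 ms (MAX_QUICK_PIT_MS) calibration budget. */
	int main(void)
	{
		double budget_us = 50.0 * 1000.0;	/* 50 ms in microseconds */
		double limit_us = budget_us / 2048.0;	/* bound on d1 + d2 */
		printf("max d1 + d2: %.1f us\n", limit_us);		/* ~24.4 us */
		printf("per endpoint read: %.1f us\n", limit_us / 2.0);	/* ~12.2 us */
		return 0;
	}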
Signed-off-by: Adrian Hunter <[email protected]>
---
arch/x86/kernel/tsc.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 132 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5054497..0035682 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -553,6 +553,40 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
}
/*
+ * High latency version of pit_expect_msb(). Instead of using the read latency
+ * as the error margin, latch the counter and use that latency. The counter
+ * latches at the current value which means there is an additional error margin
+ * of 1 timer tick which is not accounted for here.
+ */
+static inline int pit_expect_msb_hl(unsigned char val, u64 *tscp,
+ unsigned long *deltap, u16 *cnt)
+{
+ u64 tsc0, tsc1;
+ int count;
+ u8 lsb, msb;
+
+ for (count = 0; count < 50000; count++) {
+ tsc0 = get_cycles();
+ /* Latch counter 2 */
+ outb(0x80, 0x43);
+ tsc1 = get_cycles();
+ lsb = inb(0x42);
+ msb = inb(0x42);
+ if (msb != val)
+ break;
+ }
+ *deltap = tsc1 - tsc0;
+ *tscp = tsc0;
+ *cnt = lsb | ((u16)msb << 8);
+
+ /*
+ * We require _some_ success, but the quality control
+ * will be based on the error terms on the TSC values.
+ */
+ return count > 5;
+}
+
+/*
* How many MSB values do we want to see? We aim for
* a maximum error rate of 500ppm (in practice the
* real error is much smaller), but refuse to spend
@@ -561,6 +595,94 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
#define MAX_QUICK_PIT_MS 50
#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+/*
+ * High latency version of quick_pit_calibrate() that works even if there is a
+ * high latency reading the PIT lsb/msb. This is needed if it takes longer than
+ * 12us to read the lsb/msb because then the error margin will never fall below
+ * 500ppm within 50ms.
+ */
+static unsigned long quick_pit_calibrate_hl(void)
+{
+ int i;
+ u64 tsc, delta;
+ unsigned long d1, d2;
+ u16 cnt0, cnt1, dc;
+
+ /* Re-start at 0xffff */
+ outb(0xff, 0x42);
+ outb(0xff, 0x42);
+
+ /*
+ * The PIT starts counting at the next edge, so we
+ * need to delay for a microsecond. The easiest way
+ * to do that is to just read back the 16-bit counter
+ * once from the PIT.
+ */
+ pit_verify_msb(0);
+
+ /*
+ * Iterate until the error is less than 500 ppm. The error margin due to
+ * the time to latch the counter is d1 + d2. The counter latches the
+ * current count which introduces a one tick error at the start and end.
+ * So the total error is d1 + d2 + 2 timer ticks. A timer tick is
+ * approximately the TSC delta divided by the timer delta. So the error
+ * margin is too high while:
+ * d1 + d2 + 2 * (delta / dc) >= delta >> 11
+ * => d1 + d2 >= (delta >> 11) - 2 * (delta / dc)
+ * => d1 + d2 >= (delta - 4096 * (delta / dc)) / 2048
+ * => (d1 + d2) * dc >= (dc * delta - 4096 * delta) / 2048
+ * => (d1 + d2) * dc >= (dc - 4096) * delta >> 11
+ */
+ if (pit_expect_msb_hl(0xff, &tsc, &d1, &cnt0)) {
+ for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
+ if (!pit_expect_msb_hl(0xff - i, &delta, &d2, &cnt1))
+ break;
+
+ dc = cnt0 - cnt1;
+ if (dc < 4096)
+ continue;
+
+ delta -= tsc;
+
+ if ((d1 + d2) * (u64)dc >= (dc - 4096) * delta >> 11)
+ continue;
+
+ /*
+ * Check the PIT one more time to verify that
+ * all TSC reads were stable wrt the PIT.
+ *
+ * This also guarantees serialization of the
+ * last cycle read ('d2') in pit_expect_msb.
+ */
+ if (!pit_verify_msb(0xfe - i))
+ break;
+ goto success;
+ }
+ }
+ pr_info("Fast TSC calibration (with latch) failed\n");
+ return 0;
+
+success:
+ /*
+ * Ok, if we get here, then we've seen the
+ * MSB of the PIT decrement 'i' times, and the
+ * error has shrunk to less than 500 ppm.
+ *
+ * As a result, we can depend on there not being
+ * any odd delays anywhere, and the TSC reads are
+ * reliable (within the error).
+ *
+ * kHz = ticks / time-in-seconds / 1000;
+ * kHz = ticks / (PIT count / PIT_TICK_RATE) / 1000
+ * kHz = delta / (dc / PIT_TICK_RATE) / 1000
+ * kHz = (delta * PIT_TICK_RATE) / (dc * 1000)
+ */
+ delta *= PIT_TICK_RATE;
+ do_div(delta, dc * 1000);
+ pr_info("Fast TSC calibration (with latch) using PIT\n");
+ return delta;
+}
+
static unsigned long quick_pit_calibrate(void)
{
int i;
@@ -598,10 +720,19 @@ static unsigned long quick_pit_calibrate(void)
if (!pit_expect_msb(0xff-i, &delta, &d2))
break;
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and switch to high-latency
+ * algorithm if the error will never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return quick_pit_calibrate_hl();
+
/*
* Iterate until the error is less than 500 ppm
*/
- delta -= tsc;
if (d1+d2 >= delta >> 11)
continue;
--
1.9.1
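As a rough illustration of the i == 1 extrapolation check added above: after the first MSB wrap, delta holds the TSC cycles for 256 PIT ticks, so delta * MAX_QUICK_PIT_ITERATIONS approximates the cycles available in the whole 50 ms window, and if d1 + d2 already exceeds 1/2048 of that, the 500 ppm bound can never be met. A standalone sketch with made-up numbers (a ~2 GHz TSC sees roughly 429,000 cycles per MSB wrap; the function name is invented for illustration):
	#include <stdio.h>
	/* Illustration only; the numbers below are hypothetical. */
	static int will_never_converge(unsigned long long d1, unsigned long long d2,
				       unsigned long long delta_one_wrap,
				       unsigned long long max_iterations)
	{
		/* Best case the error shrinks to (d1 + d2) / (delta * iterations);
		 * if that is still >= 1/2048 (~500 ppm), give up early. */
		return d1 + d2 >= (delta_one_wrap * max_iterations) >> 11;
	}
	int main(void)
	{
		unsigned long long max_iter = 50 * 1193182 / 1000 / 256;	/* 233 */
		unsigned long long delta = 429000;	/* one MSB wrap at ~2 GHz */
		/* ~10 us per endpoint read (20,000 cycles each): can still converge */
		printf("10us reads fail fast: %d\n",
		       will_never_converge(20000, 20000, delta, max_iter));
		/* ~13 us per endpoint read (26,000 cycles each): will never converge */
		printf("13us reads fail fast: %d\n",
		       will_never_converge(26000, 26000, delta, max_iter));
		return 0;
	}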
On 21/05/15 10:55, Adrian Hunter wrote:
> If it takes longer than 12us to read the PIT counter lsb/msb,
> then the error margin will never fall below 500ppm within 50ms,
> and Fast TSC calibration will always fail.
Hi
This does happen, so it would be nice to have this fix.
Are there any comments?
Regards
Adrian
>
> This patch detects when that will happen and switches to using
> a slightly different algorithm that takes advantage of the PIT's
> latch command.
>
> Signed-off-by: Adrian Hunter <[email protected]>
> ---
> arch/x86/kernel/tsc.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 132 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 5054497..0035682 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -553,6 +553,40 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
> }
>
> /*
> + * High latency version of pit_expect_msb(). Instead of using the read latency
> + * as the error margin, latch the counter and use that latency. The counter
> + * latches at the current value which means there is an additional error margin
> + * of 1 timer tick which is not accounted for here.
> + */
> +static inline int pit_expect_msb_hl(unsigned char val, u64 *tscp,
> + unsigned long *deltap, u16 *cnt)
> +{
> + u64 tsc0, tsc1;
> + int count;
> + u8 lsb, msb;
> +
> + for (count = 0; count < 50000; count++) {
> + tsc0 = get_cycles();
> + /* Latch counter 2 */
> + outb(0x80, 0x43);
> + tsc1 = get_cycles();
> + lsb = inb(0x42);
> + msb = inb(0x42);
> + if (msb != val)
> + break;
> + }
> + *deltap = tsc1 - tsc0;
> + *tscp = tsc0;
> + *cnt = lsb | ((u16)msb << 8);
> +
> + /*
> + * We require _some_ success, but the quality control
> + * will be based on the error terms on the TSC values.
> + */
> + return count > 5;
> +}
> +
> +/*
> * How many MSB values do we want to see? We aim for
> * a maximum error rate of 500ppm (in practice the
> * real error is much smaller), but refuse to spend
> @@ -561,6 +595,94 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
> #define MAX_QUICK_PIT_MS 50
> #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
>
> +/*
> + * High latency version of quick_pit_calibrate() that works even if there is a
> + * high latency reading the PIT lsb/msb. This is needed if it takes longer than
> + * 12us to read the lsb/msb because then the error margin will never fall below
> + * 500ppm within 50ms.
> + */
> +static unsigned long quick_pit_calibrate_hl(void)
> +{
> + int i;
> + u64 tsc, delta;
> + unsigned long d1, d2;
> + u16 cnt0, cnt1, dc;
> +
> + /* Re-start at 0xffff */
> + outb(0xff, 0x42);
> + outb(0xff, 0x42);
> +
> + /*
> + * The PIT starts counting at the next edge, so we
> + * need to delay for a microsecond. The easiest way
> + * to do that is to just read back the 16-bit counter
> + * once from the PIT.
> + */
> + pit_verify_msb(0);
> +
> + /*
> + * Iterate until the error is less than 500 ppm. The error margin due to
> + * the time to latch the counter is d1 + d2. The counter latches the
> + * current count which introduces a one tick error at the start and end.
> + * So the total error is d1 + d2 + 2 timer ticks. A timer tick is
> + * approximately the TSC delta divided by the timer delta. So the error
> + * margin is too high while:
> + * d1 + d2 + 2 * (delta / dc) >= delta >> 11
> + * => d1 + d2 >= (delta >> 11) - 2 * (delta / dc)
> + * => d1 + d2 >= (delta - 4096 * (delta / dc)) / 2048
> + * => (d1 + d2) * dc >= (dc * delta - 4096 * delta) / 2048
> + * => (d1 + d2) * dc >= (dc - 4096) * delta >> 11
> + */
> + if (pit_expect_msb_hl(0xff, &tsc, &d1, &cnt0)) {
> + for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
> + if (!pit_expect_msb_hl(0xff - i, &delta, &d2, &cnt1))
> + break;
> +
> + dc = cnt0 - cnt1;
> + if (dc < 4096)
> + continue;
> +
> + delta -= tsc;
> +
> + if ((d1 + d2) * (u64)dc >= (dc - 4096) * delta >> 11)
> + continue;
> +
> + /*
> + * Check the PIT one more time to verify that
> + * all TSC reads were stable wrt the PIT.
> + *
> + * This also guarantees serialization of the
> + * last cycle read ('d2') in pit_expect_msb.
> + */
> + if (!pit_verify_msb(0xfe - i))
> + break;
> + goto success;
> + }
> + }
> + pr_info("Fast TSC calibration (with latch) failed\n");
> + return 0;
> +
> +success:
> + /*
> + * Ok, if we get here, then we've seen the
> + * MSB of the PIT decrement 'i' times, and the
> + * error has shrunk to less than 500 ppm.
> + *
> + * As a result, we can depend on there not being
> + * any odd delays anywhere, and the TSC reads are
> + * reliable (within the error).
> + *
> + * kHz = ticks / time-in-seconds / 1000;
> + * kHz = ticks / (PIT count / PIT_TICK_RATE) / 1000
> + * kHz = delta / (dc / PIT_TICK_RATE) / 1000
> + * kHz = (delta * PIT_TICK_RATE) / (dc * 1000)
> + */
> + delta *= PIT_TICK_RATE;
> + do_div(delta, dc * 1000);
> + pr_info("Fast TSC calibration (with latch) using PIT\n");
> + return delta;
> +}
> +
> static unsigned long quick_pit_calibrate(void)
> {
> int i;
> @@ -598,10 +720,19 @@ static unsigned long quick_pit_calibrate(void)
> if (!pit_expect_msb(0xff-i, &delta, &d2))
> break;
>
> + delta -= tsc;
> +
> + /*
> + * Extrapolate the error and switch to high-latency
> + * algorithm if the error will never be below 500 ppm.
> + */
> + if (i == 1 &&
> + d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
> + return quick_pit_calibrate_hl();
> +
> /*
> * Iterate until the error is less than 500 ppm
> */
> - delta -= tsc;
> if (d1+d2 >= delta >> 11)
> continue;
>
>
On Mon, 1 Jun 2015, Adrian Hunter wrote:
> On 21/05/15 10:55, Adrian Hunter wrote:
> > If it takes longer than 12us to read the PIT counter lsb/msb,
> > then the error margin will never fall below 500ppm within 50ms,
> > and Fast TSC calibration will always fail.
>
> Hi
>
> This does happen, so it would be nice to have this fix.
> Are there any comments?
Yes. It's on my review list...
Thanks,
tglx
On Thu, 21 May 2015, Adrian Hunter wrote:
> If it takes longer than 12us to read the PIT counter lsb/msb,
> then the error margin will never fall below 500ppm within 50ms,
> and Fast TSC calibration will always fail.
So finally the legacy PIT emulation takes longer than the 30 years old
hardware implementation. Progress!
> This patch detects when that will happen and switches to using
> a slightly different algorithm that takes advantage of the PIT's
> latch command.
Is there really no smarter way to figure out the TSC frequency on
modern systems?
Thanks,
tglx
On Tue, Jun 2, 2015 at 12:33 PM, Thomas Gleixner <[email protected]> wrote:
> On Thu, 21 May 2015, Adrian Hunter wrote:
>
>> If it takes longer than 12us to read the PIT counter lsb/msb,
>> then the error margin will never fall below 500ppm within 50ms,
>> and Fast TSC calibration will always fail.
>
> So finally the legacy PIT emulation takes longer than the 30 years old
> hardware implementation. Progress!
>
>> This patch detects when that will happen and switches to using
>> a slightly different algorithm that takes advantage of the PIT's
>> latch command.
>
> Is there really no smarter way to figure out the TSC frequency on
> modern systems?
I just asked Len this question yesterday. intel_pstate can do it,
although the algorithm is a bit gross.
--Andy
>
> Thanks,
>
> tglx
--
Andy Lutomirski
AMA Capital Management, LLC
On Tue, Jun 02, 2015 at 12:41:27PM -0700, Andy Lutomirski wrote:
> On Tue, Jun 2, 2015 at 12:33 PM, Thomas Gleixner <[email protected]> wrote:
> > On Thu, 21 May 2015, Adrian Hunter wrote:
> >
> >> If it takes longer than 12us to read the PIT counter lsb/msb,
> >> then the error margin will never fall below 500ppm within 50ms,
> >> and Fast TSC calibration will always fail.
> >
> > So finally the legacy PIT emulation takes longer than the 30 years old
> > hardware implementation. Progress!
> >
> >> This patch detects when that will happen and switches to using
> >> a slightly different algorithm that takes advantage of the PIT's
> >> latch command.
> >
> > Is there really no smarter way to figure out the TSC frequency on
> > modern systems?
>
> I just asked Len this question yesterday. intel_pstate can do it,
> although the algorithm is a bit gross.
intel_pstate needs to know the model number. If you know the
model number sure you can do a lot better (e.g. using the
ref-clock fixed counter or some other methods)
But if you don't you need something else. And at some point
the only thing left over is the PIT.
-Andi
On Tue, 2 Jun 2015, Andi Kleen wrote:
> On Tue, Jun 02, 2015 at 12:41:27PM -0700, Andy Lutomirski wrote:
> > On Tue, Jun 2, 2015 at 12:33 PM, Thomas Gleixner <[email protected]> wrote:
> > > On Thu, 21 May 2015, Adrian Hunter wrote:
> > >
> > >> If it takes longer than 12us to read the PIT counter lsb/msb,
> > >> then the error margin will never fall below 500ppm within 50ms,
> > >> and Fast TSC calibration will always fail.
> > >
> > > So finally the legacy PIT emulation takes longer than the 30 years old
> > > hardware implementation. Progress!
> > >
> > >> This patch detects when that will happen and switches to using
> > >> a slightly different algorithm that takes advantage of the PIT's
> > >> latch command.
> > >
> > > Is there really no smarter way to figure out the TSC frequency on
> > > modern systems?
> >
> > I just asked Len this question yesterday. intel_pstate can do it,
> > although the algorithm is a bit gross.
Well, the algorithm for that slow PIT emulation thing is definitely
not of the straightforward kind either.
> intel_pstate needs to know the model number. If you know the
> model number sure you can do a lot better (e.g. using the
> ref-clock fixed counter or some other methods)
>
> But if you don't you need something else. And at some point
> the only thing left over is the PIT.
Right, and if we don't have a way to read out the stuff because the
kernel does not yet have support for that particular model it falls
back to PIT slow path in the worst case.
That's better than having another weird calibration routine which
might be outdated with the next generation of PIT emulation. We've
been there with the HPET deferred writes already. No need to have more
of this nonsense.
It's about time that Intel gets its act together and provides a proper
architected way to figure that out. Though I fear that will require
another 15 years of yelling (IIRC that was the time it took to get a
constant frequency TSC).
Thanks,
tglx
On Tue, Jun 2, 2015 at 12:58 PM, Thomas Gleixner <[email protected]> wrote:
> On Tue, 2 Jun 2015, Andi Kleen wrote:
>> On Tue, Jun 02, 2015 at 12:41:27PM -0700, Andy Lutomirski wrote:
>> > On Tue, Jun 2, 2015 at 12:33 PM, Thomas Gleixner <[email protected]> wrote:
>> > > On Thu, 21 May 2015, Adrian Hunter wrote:
>> > >
>> > >> If it takes longer than 12us to read the PIT counter lsb/msb,
>> > >> then the error margin will never fall below 500ppm within 50ms,
>> > >> and Fast TSC calibration will always fail.
>> > >
>> > > So finally the legacy PIT emulation takes longer than the 30 years old
>> > > hardware implementation. Progress!
>> > >
>> > >> This patch detects when that will happen and switches to using
>> > >> a slightly different algorithm that takes advantage of the PIT's
>> > >> latch command.
>> > >
>> > > Is there really no smarter way to figure out the TSC frequency on
>> > > modern systems?
>> >
>> > I just asked Len this question yesterday. intel_pstate can do it,
>> > although the algorithm is a bit gross.
>
> Well, the algorithm for that slow PIT emulation thing is definitely
> not of the straightforward kind either.
>
>> intel_pstate needs to know the model number. If you know the
>> model number sure you can do a lot better (e.g. using the
>> ref-clock fixed counter or some other methods)
>>
>> But if you don't you need something else. And at some point
>> the only thing left over is the PIT.
>
> Right, and if we don't have a way to read out the stuff because the
> kernel does not yet have support for that particular model it falls
> back to PIT slow path in the worst case.
>
> That's better than having another weird calibration routine which
> might be outdated with the next generation of PIT emulation. We've
> been there with the HPET deferred writes already. No need to have more
> of this nonsense.
>
> It's about time that Intel gets its act together and provides a proper
> architected way to figure that out. Though I fear that will require
> another 15 years of yelling (IIRC that was the time it took to get a
> constant frequency TSC).
There's the code in tsc_msr.c. It should be relatively
straightforward to extend it to cover everything that intel_pstate
supports.
But yes, Intel should add an MSR that gives the frequency in Hz.
--Andy
> There's the code in tsc_msr.c. It should be relatively
> straightforward to extend it to cover everything that intel_pstate
> supports.
That's a good idea, but we still need an always working fallback when the
model number is not available. So Adrian's patch is needed in any
case.
-Andi
--
[email protected] -- Speaking for myself only
On Tue, 2 Jun 2015, Andi Kleen wrote:
> > There's the code in tsc_msr.c. It should be relatively
> > straightforward to extend it to cover everything that intel_pstate
> > supports.
>
> That's a good idea, but we still need an always working fallback when the
> model number is not available. So Adrian's patch is needed in any
> case.
Nonsense. The slow calibration is already a working fallback.
Thanks,
tglx
On Tue, Jun 02, 2015 at 11:03:26PM +0200, Thomas Gleixner wrote:
>
>
> On Tue, 2 Jun 2015, Andi Kleen wrote:
>
> > > There's the code in tsc_msr.c. It should be relatively
> > > straightforward to extend it to cover everything that intel_pstate
> > > supports.
> >
> > That's a good idea, but we still need an always working fallback when the
> > model number is not available. So Adrian's patch is needed in any
> > case.
>
> Nonsense. The slow calibration is already a working fallback.
Please read Adrian's description again. It's not working when the PIT read is
too slow. That is when the new algorithm is needed.
-Andi
On Tue, Jun 2, 2015 at 4:38 PM, Andi Kleen <[email protected]> wrote:
> On Tue, Jun 02, 2015 at 11:03:26PM +0200, Thomas Gleixner wrote:
>>
>>
>> On Tue, 2 Jun 2015, Andi Kleen wrote:
>>
>> > > There's the code in tsc_msr.c. It should be relatively
>> > > straightforward to extend it to cover everything that intel_pstate
>> > > supports.
>> >
>> > That's a good idea, but we still need an always working fallback when the
>> > model number is not available. So Adrian's patch is needed in any
>> > case.
>>
>> Nonsense. The slow calibration is already a working fallback.
>
> Please read Adrian's description again. It's not working when the PIT read is
> too slow. That is when the new algorithm is needed.
>
tglx's suggestion was to use slow calibration as a fallback.
--Andy
On Tue, Jun 02, 2015 at 05:21:27PM -0700, Andy Lutomirski wrote:
> On Tue, Jun 2, 2015 at 4:38 PM, Andi Kleen <[email protected]> wrote:
> > On Tue, Jun 02, 2015 at 11:03:26PM +0200, Thomas Gleixner wrote:
> >>
> >>
> >> On Tue, 2 Jun 2015, Andi Kleen wrote:
> >>
> >> > > There's the code in tsc_msr.c. It should be relatively
> >> > > straightforward to extend it to cover everything that intel_pstate
> >> > > supports.
> >> >
> >> > That's a good idea, but we still need an always working fallback when the
> >> > model number is not available. So Adrian's patch is needed in any
> >> > case.
> >>
> >> Nonsense. The slow calibration is already a working fallback.
> >
> > Please read Adrian's description again. It's not working when the PIT read is
> > too slow. That is when the new algorithm is needed.
> >
>
> tglx's suggestion was to use slow calibration as a fallback.
You mean the last fallback we have today?
That one doesn't work if the PIT read is too slow.
And Adrian's patch is fixing that.
-Andi
--
[email protected] -- Speaking for myself only
On Tue, Jun 2, 2015 at 5:39 PM, Andi Kleen <[email protected]> wrote:
> On Tue, Jun 02, 2015 at 05:21:27PM -0700, Andy Lutomirski wrote:
>> On Tue, Jun 2, 2015 at 4:38 PM, Andi Kleen <[email protected]> wrote:
>> > On Tue, Jun 02, 2015 at 11:03:26PM +0200, Thomas Gleixner wrote:
>> >>
>> >>
>> >> On Tue, 2 Jun 2015, Andi Kleen wrote:
>> >>
>> >> > > There's the code in tsc_msr.c. It should be relatively
>> >> > > straightforward to extend it to cover everything that intel_pstate
>> >> > > supports.
>> >> >
>> >> > That's a good idea, but we still need an always working fallback when the
>> >> > model number is not available. So Adrian's patch is needed in any
>> >> > case.
>> >>
>> >> Nonsense. The slow calibration is already a working fallback.
>> >
>> > Please read Adrian's description again. It's not working when the PIT read is
>> > too slow. That is when the new algorithm is needed.
>> >
>>
>> tglx's suggestion was to use slow calibration as a fallback.
>
> You mean the last fallback we have today?
>
> That one doesn't work if the PIT read is too slow.
>
> And Adrian's patch is fixing that.
Then the changelog should say that I think. The current text says
"Fast TSC calibration will always fail", which, to me, suggests that
either the slow calibration will work or that the changelog message
should be changed.
--Andy
> Then the changelog should say that I think. The current text says
> "Fast TSC calibration will always fail", which, to me, suggests that
> either the slow calibration will work or that the changelog message
> should be changed.
Ok. No, the slow calibration works I believe.
-Andi
On Tue, Jun 2, 2015 at 12:33 PM, Thomas Gleixner <[email protected]> wrote:
>
> Is there really no smarter way to figure out the TSC frequency on
> modern systems?
Sadly, if there is, we have yet to find it.
I don't think the mentioned intel_pstate thing gets it right either.
Yes, it gets some "theoretical frequency", given the pstate and the
scaling factor, but that assumes the base clock (BCLK) is something
trustworthy. That's a big assumption, and I can pretty much guarantee
it's not a valid assumption.
I'm sure that's the *common* case, but how much do you want to bet
that some motherboard makers noticed that they get good press from
good benchmark numbers, and that they can tweak BCLK a bit higher than
the competition? "Oooh, motherboard from Xyz is really good, it's 1%
faster than the competition".
Or all those overclockers? Yes, some of them buy unlocked CPU's and
leave BCLK alone. Or maybe they do both. Or maybe they just buy locked
CPU's and tweak BCLK.
The only *well-defined* clock in a modern PC seems to still remain the
PIT. Yes, it's very sad. But all the other clocks tend to be
untrustworthy for various reasons, like "frequency is stated in the
ACPI tables", which we know is basically just a fantasy that may be
right *most* of the time, but I can almost guarantee that some BIOS
guys decided that they can get slightly better random benchmark
numbers by "tweaking" the truth a bit. It's the "BIOS version" of
overclocking by increasing BCLK - lie about the PM frequency, so that
any benchmark that times things that way will give better numbers..
But pretty much nobody messes up the PIT. That will change, just
because it's clearly getting to the point where people are just
emulating it, and it's probably not going to be any more trustworthy
than anything else.
So rather than get a new and truly trustworthy reference clock, the
patch makes me suspect we're losing the old one...
Linus
* Linus Torvalds <[email protected]> wrote:
> On Tue, Jun 2, 2015 at 12:33 PM, Thomas Gleixner <[email protected]> wrote:
> >
> > Is there really no smarter way to figure out the TSC frequency on
> > modern systems?
>
> Sadly, if there is, we have yet to find it.
>
> I don't think the mentioned intel_pstate thing gets it right either. Yes, it
> gets some "theoretical frequency", given the pstate and the scaling factor, but
> that assumes the base clock (BCLK) is something trustworthy. That's a big
> assumption, and I can pretty much guarantee it's not a valid assumption.
>
> I'm sure that's the *common* case, but how much do you want to bet that some
> motherboard makers noticed that they get good press from good benchmark numbers,
> and that they can tweak BCLK a bit higher than the competition? "Oooh,
> motherboard from Xyz is really good, it's 1% faster than the competition".
>
> Or all those overclockers? Yes, some of them buy unlocked CPU's and leave BCLK
> alone. Or maybe they do both. Or maybe they just buy locked CPU's and tweak
> BCLK.
>
> The only *well-defined* clock in a modern PC seems to still remain the PIT.
> [...]
and the HPET, which is pretty good as well, when available. In fact given that
it's required to have a frequency of at least 10 MHz, and (unlike the PIT) has a
pretty wide counter, it could be used for pretty accurate calibration as well that
runs a lot shorter than PIT calibration.
HPETs have some quirks, and are sometimes emulated (on early hardware, when Windows
did not require HPET and the value of HPET was yet unclear?), but I'd say on
modern x86 systems it ought to be a pretty good clock for calibration as well.
> [...] Yes, it's very sad. But all the other clocks tend to be untrustworthy for
> various reasons, like "frequency is stated in the ACPI tables", which we know is
> basically just a fantasy that may be right *most* of the time, but I can almost
> guarantee that some BIOS guys decided that they can get slightly better random
> benchmark numbers by "tweaking" the truth a bit. It's the "BIOS version" of
> overclocking by increasing BCLK - lie about the PM frequency, so that any
> benchmark that times things that way will give better numbers..
So the HPET frequency comes straight from a hardware register, via:
hpet_period = hpet_readl(HPET_PERIOD);
now you are right that the 'hardware' is in some cases implemented via SMM
virtualization - but so is the PIT I suspect. Given that Windows relies on the
HPET for timekeeping, it might get more attention than the PIT?
> But pretty much nobody messes up the PIT. That will change, just because it's
> clearly getting to the point where people are just emulating it, and it's
> probably not going to be any more trustworthy than anything else.
So I think the reason why these tend to be real hardware most of the time is that
both the PIT and the HPET are supposed to generate interrupts - which is not that
easy to 'emulate' via SMM: you have to have a real irq-generating clock in the
hardware _somewhere_, and that comes with counter and all, so not much left to
emulate after that point.
What I think might happen is to emulate the PIT via HPET, the lower frequency
clock with a higher frequency clock.
Thanks,
Ingo
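For what it's worth, a minimal sketch of what an HPET-based quick calibration could look like, assuming the kernel's hpet_readl()/HPET_COUNTER/HPET_PERIOD helpers plus get_cycles() and do_div(), and glossing over counter wrap, bad-period filtering and SMI noise; the function name is made up and this is just to show the shape of it, not a proposal:
	/* HPET_PERIOD holds the tick period in femtoseconds. */
	static unsigned long hpet_quick_calibrate(void)
	{
		u64 fs_per_tick = hpet_readl(HPET_PERIOD);
		u32 hpet1, hpet2;
		u64 tsc1, tsc2, elapsed_ns, khz;
		hpet1 = hpet_readl(HPET_COUNTER);
		tsc1 = get_cycles();
		/* Spin for roughly 5 ms worth of HPET ticks (nominally ~14.3 MHz) */
		while ((u32)(hpet_readl(HPET_COUNTER) - hpet1) < 5 * 14318)
			cpu_relax();
		tsc2 = get_cycles();
		hpet2 = hpet_readl(HPET_COUNTER);
		elapsed_ns = (u64)(hpet2 - hpet1) * fs_per_tick;
		do_div(elapsed_ns, 1000000);		/* fs -> ns */
		if (!elapsed_ns)
			return 0;
		/* kHz = cycles / (elapsed_ns * 1e-9) / 1000 = cycles * 1e6 / elapsed_ns */
		khz = (tsc2 - tsc1) * 1000000ULL;
		do_div(khz, (u32)elapsed_ns);
		return (unsigned long)khz;
	}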
Linus wrote:
> The only *well-defined* clock in a modern PC seems to still remain the
> PIT. Yes, it's very sad. But all the other clocks tend to be
> untrustworthy for various reasons
Actually, there is one more: the CMOS RTC clock is quite reliably 32768 Hz.
The reas process is very similar, although you only have a single PE bit
rather than a count for sanity checking. You can program a rate between
2 Hz and 8192 Hz at which the PE bit (register C, 0x06) will be set.
A rate of 4096 Hz would work similarly to the current PIT-based
1193182/256 = 4661 Hz code.
Then you just poll until you capture the transition (it's cleared
automatically by read) and do similar filtering.
(It would also be very nifty to use some of the values collected by
this calibration to seed boot-time entropy.)
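A kernel-style sketch of that idea follows; the register and bit values are from the MC146818 datasheet, the macro and function names are made up, and a real version would take the CMOS lock and leave the periodic interrupt itself disabled:
	#define RTC_IDX_PORT	0x70	/* CMOS index port */
	#define RTC_DATA_PORT	0x71	/* CMOS data port */
	#define RTC_REG_A_IDX	0x0a
	#define RTC_REG_C_IDX	0x0c
	#define RTC_DIV_32KHZ	0x20	/* normal 32.768 kHz divider chain */
	#define RTC_RATE_4096HZ	0x04	/* rate select: 32768 >> (4 - 1) = 4096 Hz */
	#define RTC_PF_BIT	0x40	/* periodic flag, cleared by reading reg C */
	static void rtc_set_periodic_rate(void)
	{
		outb(RTC_REG_A_IDX, RTC_IDX_PORT);
		outb(RTC_DIV_32KHZ | RTC_RATE_4096HZ, RTC_DATA_PORT);
	}
	static int rtc_wait_periodic_edge(void)
	{
		int i;
		outb(RTC_REG_C_IDX, RTC_IDX_PORT);	/* select once, then poll */
		for (i = 0; i < 1000000; i++) {
			if (inb(RTC_DATA_PORT) & RTC_PF_BIT)
				return 1;		/* caught a 4096 Hz tick */
		}
		return 0;
	}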
On 03/06/15 06:30, Andi Kleen wrote:
>> Then the changelog should say that I think. The current text says
>> "Fast TSC calibration will always fail", which, to me, suggests that
>> either the slow calibration will work or that the changelog message
>> should be changed.
>
> Ok. No, the slow calibration works I believe.
Yeah, so the (only?) downside is the 50ms wasted on Fast TSC
calibration. What about this?
From: Adrian Hunter <[email protected]>
Date: Wed, 3 Jun 2015 10:39:46 +0300
Subject: [PATCH] x86, tsc: Let high latency PIT fail fast in
quick_pit_calibrate()
If it takes longer than 12us to read the PIT counter lsb/msb,
then the error margin will never fall below 500ppm within 50ms,
and Fast TSC calibration will always fail.
This patch detects when that will happen and fails fast. Note
the failure message is not printed in that case because:
1. it will always happen on that class of hardware
2. the absence of the message is more informative than its
presence
Signed-off-by: Adrian Hunter <[email protected]>
---
arch/x86/kernel/tsc.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7114c86af35e..673c4f25021f 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -598,10 +598,19 @@ static unsigned long quick_pit_calibrate(void)
if (!pit_expect_msb(0xff-i, &delta, &d2))
break;
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and fail fast if the error will
+ * never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return 0;
+
/*
* Iterate until the error is less than 500 ppm
*/
- delta -= tsc;
if (d1+d2 >= delta >> 11)
continue;
--
1.9.1
On Tue, Jun 2, 2015 at 11:20 PM, Ingo Molnar <[email protected]> wrote:
>
> and the HPET, which is pretty good as well, when available. In fact given that
> it's required to have a frequency of at least 10 MHz, and (unlike the PIT) has a
> pretty wide counter, it could be used for pretty accurate calibration as well that
> runs a lot shorter than PIT calibration.
Yeah, you're probably right that we should look into at least having
the option to use the HPET.
That said, the fact that we can read the period from HPET_PERIOD does
*not* make me trust it all that much. I suspect that register is just
another "filled in by firmware" piece of data.
> Given that Windows relies on the
> HPET for timekeeping, it might get more attention than the PIT?
Does Windows actually do that? Because if so, that's just about the
strongest argument for HPET there is - it's likely to work, and the
frequency is likely to be correct.
We've had issues with HPET, but for calibration it might very well be
the right thing to do.
Does anybody know what the base oscillator for HPET tends to be? Also,
some googling shows a vmware paper that is not that impressed with the
HPET.
The good thing about the PIT is that it's just *specified* to be
driven off a real crystal running at a very fixed frequency. There's
no gray areas there. Sure, virtualization can screw it up (but will
likely screw up other higher-resolution clocks even more), and hw
designers can cause problems, but it's been pretty reliable.
(Yeah, the CMOS RTC clock should be too, as George Spelvin points out.
That might be worth looking at too).
Linus
On Wed, Jun 3, 2015 at 1:13 AM, Adrian Hunter <[email protected]> wrote:
>
> Yeah, so the (only?) downside is the 50ms wasted on Fast TSC
> calibration. What about this?
That's certainly simpler.
What platform is this? Do you have access to hw people you can shake
down and ask what clock we can really _trust_?
Linus
On Wed, 3 Jun 2015, Adrian Hunter wrote:
> On 03/06/15 06:30, Andi Kleen wrote:
> >> Then the changelog should say that I think. The current text says
> >> "Fast TSC calibration will always fail", which, to me, suggests that
> >> either the slow calibration will work or that the changelog message
> >> should be changed.
> >
> > Ok. No, the slow calibration works I believe.
>
> Yeah, so the (only?) downside is the 50ms wasted on Fast TSC
> calibration. What about this?
I'm certainly happy to apply this one.
Thanks,
tglx
On Wed, 3 Jun 2015, Linus Torvalds wrote:
> On Tue, Jun 2, 2015 at 11:20 PM, Ingo Molnar <[email protected]> wrote:
> > Given that Windows relies on the
> > HPET for timekeeping, it might get more attention than the PIT?
>
> Does Windows actually do that? Because if so, that's just about the
> strongest argument for HPET there is - it's likely to work, and the
> frequency is likely to be correct.
At least it used to. Not sure if it still does.
> We've had issues with HPET, but for calibration it might very well be
> the right thing to do.
Right. The issues we had were on the clock events side caused by the
match register delayed writes. I've never seen a bug report about the
frequency being completely wrong, except for crap values which we
filter out. Though, HPET period can be off by more than 500ppm, but I
don't think that matters anymore for timekeeping as we switch to TSC
only after the long term calibration. The quick calibration is just
for getting the TSC frequency roughly correct for udelay.
> Does anybody know what the base oscillator for HPET tends to be? Also,
The most common frequency is 14.318180 MHz.
> some googling shows a vmware paper that is not that impressed with the
> HPET.
Yeah, they complain about period being off by 600ppm and therefore
being useless for timekeeping, but that's not what we want to use it
for.
> The good thing about the PIT is that it's just *specified* to be
> driven off a real crystal running at a very fixed frequency. There's
> no gray areas there. Sure, virtualization can screw it up (but will
> likely screw up other higher-resolution clocks even more), and hw
> designers can cause problems, but it's been pretty reliable.
The other known frequency clock is pmtimer. We use hpet and pmtimer in
the slow calibration fallback already.
> (Yeah, the CMOS RTC clock should be too, as George Spelvin points out.
> That might be worth looking at too).
Indeed.
Thanks,
tglx
On Wed, Jun 3, 2015 at 9:47 AM, Thomas Gleixner <[email protected]> wrote:
>
>> Does anybody know what the base oscillator for HPET tends to be? Also,
>
> The most common frequency is 14.318180 MHz.
I was more thinking along the lines of "which actually crystal is
driving it" than the frequency.
That said, that particular frequency does give a clue. That's exactly
12x the PIT frequency. So they presumably share the same basic
crystal, just different PLL's.
Which is a good indication that we should get very similar clock
stability with the HPET as with the PIT.
We might even use that kind of knowledge to decide "ok, let's use the
HPET as the reference clock". IOW, only use the HPET if we find it
listed, _and_ we get the expected frequency from the HPET_PERIOD
register.
Anybody who gives odd HPET_PERIOD values is likely doing something
different and possibly questionable.
Linus
* Thomas Gleixner <[email protected]> wrote:
> On Wed, 3 Jun 2015, Linus Torvalds wrote:
> > On Tue, Jun 2, 2015 at 11:20 PM, Ingo Molnar <[email protected]> wrote:
> > >
> > > Given that Windows relies on the HPET for timekeeping, it might get more
> > > attention than the PIT?
> >
> > Does Windows actually do that? Becuase if so, that's just about the strongest
> > argument for HPET there is - it's likely to work, and the frequency is likely
> > to be correct.
>
> At least it used to. Not sure if it still does.
So I googled around a bit, and it's not as clear-cut as I thought initially: it
appears that if the BIOS enables the HPET then modern Windows (Windows 7 and
later) will use it, but there are problems and Windows XP (the largest installed
base) used the TSC+acpipm (two other lovely clocks).
> > We've had issues with HPET, but for calibration it might very well be the
> > right thing to do.
>
> Right. The issues we had were on the clock events side caused by the match
> register delayed writes. I've never seen a bug report about the frequency being
> completely wrong, except for crap values which we filter out. Though, HPET
> period can be off by more than 500ppm, but I don't think that matters anymore
> for timekeeping as we switch to TSC only after the long term calibration. The
> quick calibration is just for getting the TSC frequency roughly correct for
> udelay.
Yes - so the frequency being off due to production variances (or sheer firmware
incompetence) isn't a big deal in itself: having boot-to-boot jitter during fast
calibration would be, and that's the big question.
The nice thing about the PIT calibration is that it usually provides very little
jitter, so that driver delays/etc. are as boot-to-boot identical as possible.
Anyway, the HPET might be worth an attempt - but I'd not be surprised if we
discovered another rotten apple ...
Thanks,
Ingo
On 06/03/2015 10:04 AM, Linus Torvalds wrote:
> On Wed, Jun 3, 2015 at 9:47 AM, Thomas Gleixner <[email protected]> wrote:
>>
>>> Does anybody know what the base oscillator for HPET tends to be? Also,
>>
>> The most common frequency is 14.318180 MHz.
>
> I was more thinking along the lines of "which actually crystal is
> driving it" than the frequency.
>
> That said, that particular frequency does give a clue. That's exactly
> 12x the PIT frequency. So they presumably share the same basic
> crystal, just different PLL's.
Here are the gory details as far as I know them:
This is the default time keeping crystal -- it is 4x the NTSC color
burst frequency, which has been used since the original PC. It is
actually supposed to be 157.5/11 MHz = 14.31818... MHz ±40 Hz; there is
no trailing zero even though one is often seen in docs.
However, the HPET isn't guaranteed to run at this frequency. The HPET
frequency is advertised via a control register in HPET space (at offset
0x04, 32-bit COUNTER_CLK_PERIOD in fs). This register is documented as
read-only.
The RTC is probably the most reliable reference clock, in part because
32 kHz crystals are generally calibrated and extremely stable. However,
to get more than 1 Hz frequency out of it you have to enable interrupts
(which gets you to 8192 Hz). I have heard claims that this is actually
what Windows uses for calibration, but I don't know.
The ACPI PM_TMR is also fed from the 14.31818 MHz clock with a
divisor of 4 (putting it exactly at the NTSC color burst frequency of
3.579545... MHz ± 10 Hz). The PM_TMR, if present, is required to run at
this frequency -- there are no provisions in ACPI for specifying the
frequency.
> Which is a good indication that we should get very similar clock
> stability with the HPET as with the PIT.
>
> We might even use that kind of knowledge to decide "ok, let's use the
> HPET as the reference clock". IOW, only use the HPET if we find it
> listed, _and_ we get the expected frequency from the HPET_PERIOD
> register.
>
> Anybody who gives odd HPET_PERIOD values is likely doing somehing
> different and possibly questionable.
It would definitely be fishy if PM_TMR is also supported, since PM_TMR
has to run at a fixed frequency.
The HPET frequency register would be 0x429b176 if run off the 14 MHz clock.
However, if someone does something different the PIT/PM_TMR frequency
might very well be synthesized (or worse, approximated!) and the HPET
might be run off the real BCLK. This is a legitimate design.
PIT/PM_TMR being synthesized is fine (might introduce some jitter, but
shouldn't affect calibration), approximated is not...
-hpa
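A quick arithmetic check of that register value, just confirming the number from the 157.5/11 MHz figure above with a tiny standalone program:
	#include <stdio.h>
	/* Expected HPET COUNTER_CLK_PERIOD for a 157.5/11 MHz clock,
	 * in femtoseconds: 1e15 / (157.5e6 / 11). */
	int main(void)
	{
		double freq_hz = 157.5e6 / 11.0;	/* 14318181.81... Hz */
		double period_fs = 1e15 / freq_hz;	/* 69841269.84... fs */
		printf("period = %.2f fs = 0x%llx\n",
		       period_fs, (unsigned long long)(period_fs + 0.5));
		return 0;
	}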
H. Peter Anvin wrote:
> The RTC is probably the most reliable reference clock, in part because
> 32 kHz crystals are generally calibrated and extremely stable. However,
> to get more than 1 Hz frequency out of it you have to enable interrupts
> (which gets you to 8192 Hz).
Indeed, it's the only one which is guaranteed a separate crystal.
Many low-cost chipsets generate ALL other frequencies from one crystal
with PLLs.
(This is why I'm keen on using the RTC interrupt for an entropy
source.)
But as I mentioned earlier, you *can* get higher frequencies with
interrupts *or* polling. When you program the periodic event frequency
(from 2 to 8192 Hz), it does three things at that rate:
1) Periodic interrupts (if enabled),
2) Square wave output (if enabled, and relevant to discrete chips only), and
3) Sets the PE bit (register C, bit 6), which is auto-cleared on read.
So if you're willing to poll the device (which the TSC calibration does
already), you can get high resolution tick edges without interrupts.
Because it's only one read (port 0x71), it's slightly faster than the PIT.
(I also wish we could use all those TSC reads for initial entropy seeding
somehow.)
On 06/03/2015 11:29 AM, George Spelvin wrote:
>
> Indeed, it's the only one which is guaranteed a separate crystal.
> Many low-cost chipsets generate ALL other frequencies from one crystal
> with PLLs.
>
Not guaranteed either, and I know for a fact there are platforms out
there which synthesize the RTC clock.
> But as I mentioned earlier, you *can* get higher frequencies with
> interrupts *or* polling. When you program the periodic event frequency
> (from 2 to 8192 Hz), it does three things at that rate:
>
> 1) Periodic interrupts (if enabled),
> 2) Square wave output (if enabled, and relevant to discrete chips only), and
> 3) Sets the PE bit (register C, bit 6), which is auto-cleared on read.
Ah, I wasn't aware of the PF (not PE) bit. That suddenly makes it a lot
more interesting. So polling for the PF bit suddenly makes sense, and
is probably the single best option for calibration.
> So if you're willing to poll the device (which the TSC calibration does
> already), you can get high resolution tick edges without interrupts.
>
> Because it's only one read (port 0x71), it's slightly faster than the PIT.
>
> (I also wish we could use all those TSC reads for initial entropy seeding
> somehow.)
Well, on x86 hopefully the entropy problem should soon be history...
-hpa
> Not guaranteed either, and I know for a fact there are platforms out
> there which synthesize the RTC clock.
Interesting! And surprising. Doesn't that take more battery power?
> Ah, I wasn't aware of the PF (not PE) bit. That suddenly makes it a lot
> more interesting. So polling for the PF bit suddenly makes sense, and
> is probably the single best option for calibration.
I had forgotten about it, too. But adapting the current calibration
loop to it is trivial. You lose the ability to detect drastically
delayed reads, but the current code barely uses that.
> Well, on x86 hopefully the entropy problem should soon be history...
And the installed base will be replaced when? You may recall a discussion
in December 2012 to *not* drop 486 support because people were still
using embedded clones of it with modern kernels.
On 03/06/15 16:45, Linus Torvalds wrote:
> On Wed, Jun 3, 2015 at 1:13 AM, Adrian Hunter <[email protected]> wrote:
>>
>> Yeah, so the (only?) downside is the 50ms wasted on Fast TSC
>> calibration. What about this?
>
> That's certainly simpler.
>
> What platform is this?
It's a prototype.
* H. Peter Anvin <[email protected]> wrote:
> [...]
>
> The RTC is probably the most reliable reference clock, in part because 32 kHz
> crystals are generally calibrated and extremely stable. However, to get more
> than 1 Hz frequency out of it you have to enable interrupts (which gets you to
> 8192 Hz). I have heard claims that this is actually what Windows uses for
> calibration, but I don't know.
So I ran a few tests to determine how reliable the RTC clock is at 8192 Hz, and
running it during early bootup gives this type of jitter:
[ 0.000000] tsc: RTC IRQ 0, at 30566853736, delta: 30, jitter: 30
[ 0.000000] tsc: RTC IRQ 1, at 30567333663, delta: 479927, jitter: 479897
[ 0.000000] tsc: RTC IRQ 2, at 30567578651, delta: 244988, jitter: -234939
[ 0.000000] tsc: RTC IRQ 3, at 30567825147, delta: 246496, jitter: 1508
[ 0.000000] tsc: RTC IRQ 4, at 30568070180, delta: 245033, jitter: -1463
[ 0.000000] tsc: RTC IRQ 5, at 30568315220, delta: 245040, jitter: 7
[ 0.000000] tsc: RTC IRQ 6, at 30568561747, delta: 246527, jitter: 1487
[ 0.000000] tsc: RTC IRQ 7, at 30568806794, delta: 245047, jitter: -1480
[ 0.000000] tsc: RTC IRQ 8, at 30569051789, delta: 244995, jitter: -52
[ 0.000000] tsc: RTC IRQ 9, at 30569296824, delta: 245035, jitter: 40
[ 0.000000] tsc: RTC IRQ 10, at 30569543304, delta: 246480, jitter: 1445
[ 0.000000] tsc: RTC IRQ 11, at 30569788346, delta: 245042, jitter: -1438
[ 0.000000] tsc: RTC IRQ 12, at 30570033393, delta: 245047, jitter: 5
[ 0.000000] tsc: RTC IRQ 13, at 30570278433, delta: 245040, jitter: -7
[ 0.000000] tsc: RTC IRQ 14, at 30570524894, delta: 246461, jitter: 1421
[ 0.000000] tsc: RTC IRQ 15, at 30570769950, delta: 245056, jitter: -1405
[ 0.000000] tsc: RTC IRQ 16, at 30571014993, delta: 245043, jitter: -13
[ 0.000000] tsc: RTC IRQ 17, at 30571260024, delta: 245031, jitter: -12
[ 0.000000] tsc: RTC IRQ 18, at 30571506504, delta: 246480, jitter: 1449
[ 0.000000] tsc: RTC IRQ 19, at 30571751560, delta: 245056, jitter: -1424
[ 0.000000] tsc: RTC IRQ 20, at 30571996577, delta: 245017, jitter: -39
[ 0.000000] tsc: RTC IRQ 21, at 30572243072, delta: 246495, jitter: 1478
[ 0.000000] tsc: RTC IRQ 22, at 30572488121, delta: 245049, jitter: -1446
[ 0.000000] tsc: RTC IRQ 23, at 30572733125, delta: 245004, jitter: -45
[ 0.000000] tsc: RTC IRQ 24, at 30572978181, delta: 245056, jitter: 52
[ 0.000000] tsc: RTC IRQ 25, at 30573224673, delta: 246492, jitter: 1436
[ 0.000000] tsc: RTC IRQ 26, at 30573469712, delta: 245039, jitter: -1453
[ 0.000000] tsc: RTC IRQ 27, at 30573714760, delta: 245048, jitter: 9
[ 0.000000] tsc: RTC IRQ 28, at 30573959800, delta: 245040, jitter: -8
[ 0.000000] tsc: RTC IRQ 29, at 30574206272, delta: 246472, jitter: 1432
[ 0.000000] tsc: RTC IRQ 30, at 30574451320, delta: 245048, jitter: -1424
[ 0.000000] tsc: RTC IRQ 31, at 30574696335, delta: 245015, jitter: -33
[ 0.000000] tsc: RTC IRQ 32, at 30574941390, delta: 245055, jitter: 40
[ 0.000000] tsc: RTC IRQ 33, at 30575187872, delta: 246482, jitter: 1427
[ 0.000000] tsc: RTC IRQ 34, at 30575432905, delta: 245033, jitter: -1449
[ 0.000000] tsc: RTC IRQ 35, at 30575677944, delta: 245039, jitter: 6
[ 0.000000] tsc: RTC IRQ 36, at 30575924434, delta: 246490, jitter: 1451
[ 0.000000] tsc: RTC IRQ 37, at 30576169465, delta: 245031, jitter: -1459
[ 0.000000] tsc: RTC IRQ 38, at 30576414504, delta: 245039, jitter: 8
[ 0.000000] tsc: RTC IRQ 39, at 30576659542, delta: 245038, jitter: -1
[ 0.000000] tsc: RTC IRQ 40, at 30576906030, delta: 246488, jitter: 1450
[ 0.000000] tsc: RTC IRQ 41, at 30577151072, delta: 245042, jitter: -1446
[ 0.000000] tsc: RTC IRQ 42, at 30577396105, delta: 245033, jitter: -9
[ 0.000000] tsc: RTC IRQ 43, at 30577641144, delta: 245039, jitter: 6
[ 0.000000] tsc: RTC IRQ 44, at 30577887632, delta: 246488, jitter: 1449
[ 0.000000] tsc: RTC IRQ 45, at 30578132665, delta: 245033, jitter: -1455
[ 0.000000] tsc: RTC IRQ 46, at 30578377704, delta: 245039, jitter: 6
[ 0.000000] tsc: RTC IRQ 47, at 30578622749, delta: 245045, jitter: 6
[ 0.000000] tsc: RTC IRQ 48, at 30578869230, delta: 246481, jitter: 1436
[ 0.000000] tsc: RTC IRQ 49, at 30579114272, delta: 245042, jitter: -1439
[ 0.000000] tsc: RTC IRQ 50, at 30581814501, delta: 2700229, jitter: 2455187
[ 0.000000] tsc: RTC IRQ 51, at 30582059549, delta: 245048, jitter: -2455181
[ 0.000000] tsc: RTC IRQ 52, at 30582304584, delta: 245035, jitter: -13
[ 0.000000] tsc: RTC IRQ 53, at 30582549631, delta: 245047, jitter: 12
[ 0.000000] tsc: RTC IRQ 54, at 30582796117, delta: 246486, jitter: 1439
[ 0.000000] tsc: RTC IRQ 55, at 30583041136, delta: 245019, jitter: -1467
[ 0.000000] tsc: RTC IRQ 56, at 30583286184, delta: 245048, jitter: 29
[ 0.000000] tsc: RTC IRQ 57, at 30583532682, delta: 246498, jitter: 1450
[ 0.000000] tsc: RTC IRQ 58, at 30583777720, delta: 245038, jitter: -1460
[ 0.000000] tsc: RTC IRQ 59, at 30584022745, delta: 245025, jitter: -13
[ 0.000000] tsc: RTC IRQ 60, at 30584267784, delta: 245039, jitter: 14
[ 0.000000] tsc: RTC IRQ 61, at 30584514273, delta: 246489, jitter: 1450
[ 0.000000] tsc: RTC IRQ 62, at 30584759311, delta: 245038, jitter: -1451
[ 0.000000] tsc: RTC IRQ 63, at 30585004357, delta: 245046, jitter: 8
[ 0.000000] tsc: RTC IRQ 64, at 30585249385, delta: 245028, jitter: -18
[ 0.000000] tsc: RTC IRQ 65, at 30585495880, delta: 246495, jitter: 1467
[ 0.000000] tsc: RTC IRQ 66, at 30585740908, delta: 245028, jitter: -1467
[ 0.000000] tsc: RTC IRQ 67, at 30585985953, delta: 245045, jitter: 17
[ 0.000000] tsc: RTC IRQ 68, at 30586232425, delta: 246472, jitter: 1427
[ 0.000000] tsc: RTC IRQ 69, at 30586477464, delta: 245039, jitter: -1433
[ 0.000000] tsc: RTC IRQ 70, at 30586722493, delta: 245029, jitter: -10
[ 0.000000] tsc: RTC IRQ 71, at 30586967516, delta: 245023, jitter: -6
[ 0.000000] tsc: RTC IRQ 72, at 30587213989, delta: 246473, jitter: 1450
[ 0.000000] tsc: RTC IRQ 73, at 30587459029, delta: 245040, jitter: -1433
[ 0.000000] tsc: RTC IRQ 74, at 30587704077, delta: 245048, jitter: 8
[ 0.000000] tsc: RTC IRQ 75, at 30587949125, delta: 245048, jitter: 0
[ 0.000000] tsc: RTC IRQ 76, at 30588195597, delta: 246472, jitter: 1424
[ 0.000000] tsc: RTC IRQ 77, at 30588440629, delta: 245032, jitter: -1440
[ 0.000000] tsc: RTC IRQ 78, at 30588685660, delta: 245031, jitter: -1
[ 0.000000] tsc: RTC IRQ 79, at 30588930716, delta: 245056, jitter: 25
[ 0.000000] tsc: RTC IRQ 80, at 30589177189, delta: 246473, jitter: 1417
[ 0.000000] tsc: RTC IRQ 81, at 30589422229, delta: 245040, jitter: -1433
[ 0.000000] tsc: RTC IRQ 82, at 30589667285, delta: 245056, jitter: 16
[ 0.000000] tsc: RTC IRQ 83, at 30589913760, delta: 246475, jitter: 1419
[ 0.000000] tsc: RTC IRQ 84, at 30590158798, delta: 245038, jitter: -1437
[ 0.000000] tsc: RTC IRQ 85, at 30590403837, delta: 245039, jitter: 1
[ 0.000000] tsc: RTC IRQ 86, at 30590648866, delta: 245029, jitter: -10
[ 0.000000] tsc: RTC IRQ 87, at 30590895346, delta: 246480, jitter: 1451
[ 0.000000] tsc: RTC IRQ 88, at 30591140397, delta: 245051, jitter: -1429
[ 0.000000] tsc: RTC IRQ 89, at 30591385446, delta: 245049, jitter: -2
[ 0.000000] tsc: RTC IRQ 90, at 30591630477, delta: 245031, jitter: -18
[ 0.000000] tsc: RTC IRQ 91, at 30591876949, delta: 246472, jitter: 1441
[ 0.000000] tsc: RTC IRQ 92, at 30592121998, delta: 245049, jitter: -1423
[ 0.000000] tsc: RTC IRQ 93, at 30592367029, delta: 245031, jitter: -18
[ 0.000000] tsc: RTC IRQ 94, at 30592612075, delta: 245046, jitter: 15
[ 0.000000] tsc: RTC IRQ 95, at 30592858555, delta: 246480, jitter: 1434
[ 0.000000] tsc: RTC IRQ 96, at 30593103597, delta: 245042, jitter: -1438
[ 0.000000] tsc: RTC IRQ 97, at 30593348630, delta: 245033, jitter: -9
[ 0.000000] tsc: RTC IRQ 98, at 30593595101, delta: 246471, jitter: 1438
[ 0.000000] tsc: RTC IRQ 99, at 30593840157, delta: 245056, jitter: -1415
[ 0.000000] tsc: RTC IRQ 100, at 30594085197, delta: 245040, jitter: -16
The time measurements are in cycles; the TSC on this box runs at 2.010 GHz.
What you can see is that the jitter is pretty stable, with some harmonics in it.
There are outliers that occur at about 80 Hz frequency:
[ 0.000000] tsc: RTC IRQ 50, at 30581814501, delta: 2700229, jitter: 2455187
[ 0.000000] tsc: RTC IRQ 51, at 30582059549, delta: 245048, jitter: -2455181
That's too frequent to be the RTC update delay - but its length, 1.3 msecs, comes
close to that delay - so maybe that is it. We could filter out these outliers
during calibration.
With that I think we could get down to a jitter of around 10 ppm with just a few
dozen samples - i.e. collect it all in just 2-3 milliseconds.
So that seems desirable and would bring our calibration accuracy to the same
level as our delayed-work based 'refined TSC' calibration method.
The big practical problem I found is implementational. Boot dependencies are
horrible:
- one problem is that the IO-APIC is not enabled yet when we calibrate these
values. With vile hacks I was able to enable the RTC during early boot by
forcing an early init of the IO-APIC, but I found no robust method.
- there's also a catch-22 with time init: IO-APIC init relies on time init. I
wasn't successful at untangling it.
- I also tried to activate the RTC IRQ in the i8259 PIC during early boot, so
that we can just use it to sample the RTC and then forget about that state, but
didn't succeed.
All in all, receiving RTC IRQs so early in the bootup was a pretty fragile affair.
- Alternatively, I also tried a different method: to set up the RTC periodic IRQ
during early boot, but not have an IRQ handler, polling RTC_PF in the
rtc_cmos_read(RTC_INTR_FLAGS) IRQ status byte.
Unfortunately when I do this then PIO based RTC accesses can take tens of
thousands of cycles, and the resulting jitter is pretty bad and hard to filter:
[ 0.000000] tsc: RTC edge 57 from 0 to 64, at 29694678517, delta: 246360, jitter: 2456, loops: 7, 35194 cycles/loop
[ 0.000000] tsc: RTC edge 58 from 0 to 64, at 29695169485, delta: 490968, jitter: 244608, loops: 118, 4160 cycles/loop
[ 0.000000] tsc: RTC edge 59 from 0 to 64, at 29695413981, delta: 244496, jitter: -246472, loops: 6, 40749 cycles/loop
[ 0.000000] tsc: RTC edge 60 from 0 to 64, at 29695660661, delta: 246680, jitter: 2184, loops: 7, 35240 cycles/loop
[ 0.000000] tsc: RTC edge 61 from 0 to 64, at 29695904853, delta: 244192, jitter: -2488, loops: 6, 40698 cycles/loop
[ 0.000000] tsc: RTC edge 62 from 0 to 64, at 29696151141, delta: 246288, jitter: 2096, loops: 7, 35184 cycles/loop
[ 0.000000] tsc: RTC edge 63 from 0 to 64, at 29696396445, delta: 245304, jitter: -984, loops: 6, 40884 cycles/loop
[ 0.000000] tsc: RTC edge 64 from 0 to 64, at 29696642669, delta: 246224, jitter: 920, loops: 7, 35174 cycles/loop
[ 0.000000] tsc: RTC edge 65 from 0 to 64, at 29696887245, delta: 244576, jitter: -1648, loops: 6, 40762 cycles/loop
[ 0.000000] tsc: RTC edge 66 from 0 to 64, at 29697377909, delta: 490664, jitter: 246088, loops: 117, 4193 cycles/loop
[ 0.000000] tsc: RTC edge 67 from 0 to 64, at 29697622701, delta: 244792, jitter: -245872, loops: 6, 40798 cycles/loop
[ 0.000000] tsc: RTC edge 68 from 0 to 64, at 29697868773, delta: 246072, jitter: 1280, loops: 7, 35153 cycles/loop
[ 0.000000] tsc: RTC edge 69 from 0 to 64, at 29700569301, delta: 2700528, jitter: 2454456, loops: 13, 207732 cycles/loop
[ 0.000000] tsc: RTC edge 70 from 0 to 64, at 29700813805, delta: 244504, jitter: -2456024, loops: 6, 40750 cycles/loop
[ 0.000000] tsc: RTC edge 71 from 0 to 64, at 29701060125, delta: 246320, jitter: 1816, loops: 7, 35188 cycles/loop
[ 0.000000] tsc: RTC edge 72 from 0 to 64, at 29701550189, delta: 490064, jitter: 243744, loops: 117, 4188 cycles/loop
[ 0.000000] tsc: RTC edge 73 from 0 to 64, at 29701796677, delta: 246488, jitter: -243576, loops: 7, 35212 cycles/loop
[ 0.000000] tsc: RTC edge 74 from 0 to 64, at 29702040829, delta: 244152, jitter: -2336, loops: 6, 40692 cycles/loop
[ 0.000000] tsc: RTC edge 75 from 0 to 64, at 29702287597, delta: 246768, jitter: 2616, loops: 7, 35252 cycles/loop
[ 0.000000] tsc: RTC edge 76 from 0 to 64, at 29702531741, delta: 244144, jitter: -2624, loops: 6, 40690 cycles/loop
[ 0.000000] tsc: RTC edge 77 from 0 to 64, at 29702778341, delta: 246600, jitter: 2456, loops: 7, 35228 cycles/loop
[ 0.000000] tsc: RTC edge 78 from 0 to 64, at 29703022661, delta: 244320, jitter: -2280, loops: 6, 40720 cycles/loop
[ 0.000000] tsc: RTC edge 79 from 0 to 64, at 29703514245, delta: 491584, jitter: 247264, loops: 118, 4165 cycles/loop
[ 0.000000] tsc: RTC edge 80 from 0 to 64, at 29703759165, delta: 244920, jitter: -246664, loops: 6, 40820 cycles/loop
[ 0.000000] tsc: RTC edge 81 from 0 to 64, at 29704005397, delta: 246232, jitter: 1312, loops: 7, 35176 cycles/loop
[ 0.000000] tsc: RTC edge 82 from 0 to 64, at 29704249589, delta: 244192, jitter: -2040, loops: 6, 40698 cycles/loop
and this was on a pretty standard whitebox PC; I'm pessimistic about how this
would work on more exotic machines.
So I think we should stay with the PIT fast-calibration method, it's very fast in
most cases.
Thanks,
Ingo
Ingo Molnar wrote:
> - Alternatively, I also tried a different method: to set up the RTC
> periodic IRQ during early boot, but not have an IRQ handler, polling
> RTC_PF in the rtc_cmos_read(RTC_INTR_FLAGS) IRQ status byte.
>
> Unfortunately when I do this then PIO based RTC accesses can take
> tens of thousands of cycles, and the resulting jitter is pretty bad
> and hard to filter:
Did you use rtc_cmos_read()? Because that's a rather complex function with
lots of locking, to allow NMI access to CMOS. (Basically, it keeps
a shadow copy of the address register, which the NMI puts back after
it does its nested access.)
You want to do the locking and selection of the register to read
just once, and have just the raw inb() in the loop. The resultant
code looks like this:
lock_cmos_prefix(RTC_REG_C);
outb(RTC_REG_C, RTC_PORT(0));
for (lots of iterations) {
flag = inb(RTC_PORT(1)) & 0x40;
}
lock_cmos_suffix(RTC_REG_C);
That should be a *lot* better.
Also, you don't have to enable interrupts in the RTC to get the PF bit
set periodically. You only program the interval (register A lsbits),
not the IRQ (register B bit 6). In fact, disabling the interrupt is
probably safer.
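For example, something along these lines (a rough sketch only, not tested;
'a', 'b' and 'flag' are just local temporaries, the register/bit names are
the ones from the mc146818rtc.h headers, and the CMOS locking is omitted):
	/* Select an 8192 Hz periodic rate without enabling the IRQ */
	outb(RTC_REG_B, RTC_PORT(0));
	b = inb(RTC_PORT(1));
	outb(b & ~RTC_PIE, RTC_PORT(1));	/* periodic IRQ stays off */
	outb(RTC_REG_A, RTC_PORT(0));
	a = inb(RTC_PORT(1));
	outb((a & 0xf0) | 3, RTC_PORT(1));	/* rate select 3 = 8192 Hz */
	/* RTC_PF in register C still gets set every ~122 us */
	outb(RTC_REG_C, RTC_PORT(0));
	flag = inb(RTC_PORT(1)) & RTC_PF;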
On Thu, Jun 4, 2015 at 9:38 AM, George Spelvin <[email protected]> wrote:
>
> Also, you don't have to enable interrupts in the RTC to get the PF bit
> set periodically. You only program the interval (register A lsbits),
> not the IRQ (register B bit 6). In fact, disabling the interrupt is
> probably safer.
Also, I don't know what Ingo's test-code looked like, but it is
probably best to set the RTC to 8kHz, and then time a few iterations
of "count cycles between PF gets set" rather than "wait for bit to get
set after programming". With the usual "have timestamp both before the
read that shows the bit set, and after the read" so that you can
estimate how big the error window is.
Maybe that's what Ingo's jitter numbers already are; without seeing
what the test-code was, it's hard to guess.
And you definitely want to disable interrupts and make sure nobody
else reads that register, since reading it will clear it. Although I
guess that during early boot there shouldn't be any other RTC
activity..
Linus
On Thu, 4 Jun 2015 at 09:52:34, Linus Torvalds wrote:
> With the usual "have timestamp both before the read that shows the bit
> set, and after the read" so that you can estimate how big the error
> window is.
Actually, the current code uses three timestamps: one before the last
read of the unwanted value, one in the middle, and one after the
read of the target value (bit set in this case).
The delta between the outer two is used for error estimation, and the
middle timestamp is used as the guess for when the clock ticked.
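Unrolled, the shape is roughly this (read_port() stands in for whichever
port read is being polled):
	t_before = get_cycles();	/* before the read that still shows the old value */
	(void)read_port();		/* returns the unwanted value */
	t_mid = get_cycles();		/* the guess for when the edge happened */
	(void)read_port();		/* returns the target value */
	t_after = get_cycles();
	/* error estimate: t_after - t_before; edge timestamp: t_mid */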
Because, if you properly factor out the common code, an RTC read
is just one inb(), as opposed to two for the PIT, I would hope
it could do better.
> And you definitely want to disable interrupts and make sure nobody
> else reads that register, since reading it will clear it. Although I
> guess that during early boot there shouldn't be any other RTC
> activity..
That's the other reason to factor out the CMOS locking.
There's code in e.g.: arch/x86/kernel/rtc.c:mach_get_cmos_time() or
drivers/rtc/rtc-cmos.c:cmos_nvram_read() that would also benefit from
not acquiring and releasing the lock around every single one-byte read.
On Thu, Jun 4, 2015 at 10:54 AM, George Spelvin <[email protected]> wrote:
>
> Actually, the current code uses three timestamps: one before the last
> read of the unwanted value, one in the middle, and one after the
> read of the target value (bit set in this case).
Yes, right you are. All are valid and interesting.
Linus
FWIW, I wrote my own test routine, with some interesting results.
It's a rude bodge and obviously not kernel-quality, but included if
anyone wants to mess with it.
My machine is an X79 motherboard with a common ITE IT8728F SuperIO
chip providing both RTC and PIT.
The interesting bit is that I can double the speed of the PIT code, and
the really interesting part is that the RTC code is 18% faster still
(85% of the time).
It's running at 3.4 GHz, so I expect 729478 ticks per 256 PIT
counts, and 415039 ticks per 8192 Hz RTC tick.
Anyway, here are the results using the current pit_expect_msb().
The values printed are the returned tsc, the delta to the previous one,
and the uncertainty range, which is the time for two reads
(4 inb() operations).
PIT edge at 99066193034, delta 0, range 18372
PIT edge at 99066918775, delta 725741, range 18372
PIT edge at 99067644199, delta 725424, range 18372
PIT edge at 99068379191, delta 734992, range 18372
PIT edge at 99069104615, delta 725424, range 18372
PIT edge at 99069839127, delta 734512, range 18372
PIT edge at 99070564551, delta 725424, range 18372
PIT edge at 99071299583, delta 735032, range 18348
PIT edge at 99072025530, delta 725947, range 18372
PIT edge at 99072750839, delta 725309, range 18372
PIT edge at 99073485447, delta 734608, range 18372
PIT edge at 99074210778, delta 725331, range 18372
PIT edge at 99074945471, delta 734693, range 18372
PIT edge at 99075670807, delta 725336, range 18372
PIT edge at 99076406543, delta 735736, range 18372
PIT edge at 99077132874, delta 726331, range 18372
PIT edge at 99077858095, delta 725221, range 18372
PIT edge at 99078593719, delta 735624, range 18372
PIT edge at 99079319255, delta 725536, range 18372
PIT edge at 99080053767, delta 734512, range 18372
PIT edge at 99080779079, delta 725312, range 18372
PIT edge at 99081504322, delta 725243, range 18372
PIT edge at 99082239311, delta 734989, range 18372
PIT edge at 99082964554, delta 725243, range 18372
PIT edge at 99083699543, delta 734989, range 18372
PIT edge at 99084425602, delta 726059, range 18372
PIT edge at 99085160095, delta 734493, range 18372
PIT edge at 99085885311, delta 725216, range 18372
PIT edge at 99086610535, delta 725224, range 18372
PIT edge at 99087345751, delta 735216, range 18372
PIT edge at 99088071399, delta 725648, range 18372
PIT edge at 99088805911, delta 734512, range 18372
PIT edge at 99089531519, delta 725608, range 18372
PIT edge at 99090266327, delta 734808, range 18372
PIT edge at 99090991567, delta 725240, range 18372
PIT edge at 99091716767, delta 725200, range 18372
PIT edge at 99092451279, delta 734512, range 18372
PIT edge at 99093176615, delta 725336, range 18487
PIT edge at 99093911423, delta 734808, range 18372
PIT edge at 99094636847, delta 725424, range 18372
PIT edge at 99095371447, delta 734600, range 18372
PIT edge at 99096096671, delta 725224, range 18372
PIT edge at 99096831703, delta 735032, range 18372
PIT edge at 99097557535, delta 725832, range 18372
PIT edge at 99098282959, delta 725424, range 18372
PIT edge at 99099018063, delta 735104, range 18372
PIT edge at 99099743303, delta 725240, range 18372
PIT edge at 99100477703, delta 734400, range 18372
PIT edge at 99101203015, delta 725312, range 18372
PIT edge at 99101937415, delta 734400, range 18372
Here's the same for an optimized PIT routine, which places the PIT in
msbyte-only mode, so it needs only one read to poll the PIT.
It also prints the number of iterations inside the PIT spin loop.
Note that it's exactly twice the speed, but the variance
is much higher.
PIT edge at 99131203367, delta 0, range 9215, iter 158
PIT edge at 99131929383, delta 726016, range 9172, iter 157
PIT edge at 99132659519, delta 730136, range 9215, iter 158
PIT edge at 99133389546, delta 730027, range 9172, iter 158
PIT edge at 99134120047, delta 730501, range 9188, iter 158
PIT edge at 99134850095, delta 730048, range 9508, iter 158
PIT edge at 99135580303, delta 730208, range 9188, iter 158
PIT edge at 99136310623, delta 730320, range 9188, iter 158
PIT edge at 99137035935, delta 725312, range 9193, iter 157
PIT edge at 99137765754, delta 729819, range 9172, iter 158
PIT edge at 99138495666, delta 729912, range 9172, iter 158
PIT edge at 99139225578, delta 729912, range 9172, iter 158
PIT edge at 99139955511, delta 729933, range 9172, iter 158
PIT edge at 99140685311, delta 729800, range 9212, iter 158
PIT edge at 99141415743, delta 730432, range 9215, iter 158
PIT edge at 99142146247, delta 730504, range 9169, iter 158
PIT edge at 99142872303, delta 726056, range 9215, iter 157
PIT edge at 99143603031, delta 730728, range 9215, iter 158
PIT edge at 99144333559, delta 730528, range 9169, iter 158
PIT edge at 99145063879, delta 730320, range 9193, iter 158
PIT edge at 99145793791, delta 729912, range 9193, iter 158
PIT edge at 99146519823, delta 726032, range 9191, iter 157
PIT edge at 99147249735, delta 729912, range 9191, iter 158
PIT edge at 99147979559, delta 729824, range 9212, iter 158
PIT edge at 99148709674, delta 730115, range 9172, iter 158
PIT edge at 99149440311, delta 730637, range 9212, iter 158
PIT edge at 99150170743, delta 730432, range 9172, iter 158
PIT edge at 99150896055, delta 725312, range 9212, iter 157
PIT edge at 99151626487, delta 730432, range 9215, iter 158
PIT edge at 99152357103, delta 730616, range 9212, iter 158
PIT edge at 99153087242, delta 730139, range 9172, iter 158
PIT edge at 99153817154, delta 729912, range 9172, iter 158
PIT edge at 99154547271, delta 730117, range 9188, iter 158
PIT edge at 99155272671, delta 725400, range 9188, iter 157
PIT edge at 99156003103, delta 730432, range 9191, iter 158
PIT edge at 99156733423, delta 730320, range 9191, iter 158
PIT edge at 99157464063, delta 730640, range 9188, iter 158
PIT edge at 99158194087, delta 730024, range 9191, iter 158
PIT edge at 99158924498, delta 730411, range 9172, iter 158
PIT edge at 99159650239, delta 725741, range 9191, iter 157
PIT edge at 99160380039, delta 729800, range 9212, iter 158
PIT edge at 99161110583, delta 730544, range 9172, iter 158
PIT edge at 99161840495, delta 729912, range 9215, iter 158
PIT edge at 99162570815, delta 730320, range 9215, iter 158
PIT edge at 99163300911, delta 730096, range 9169, iter 158
PIT edge at 99164030823, delta 729912, range 9169, iter 158
PIT edge at 99164756767, delta 725944, range 9188, iter 157
PIT edge at 99165486703, delta 729936, range 9188, iter 158
PIT edge at 99166216818, delta 730115, range 9196, iter 158
PIT edge at 99166946730, delta 729912, range 9196, iter 158
Now here's the RTC version. Because the events being timed come more
often, the delta is expected to be 415039. But what's interesting
is that the read time is significantly lower and (with the
exception of one outlier) more stable.
Is the simpler metastability handling (we're only reading
one bit, rather than an entire counter) causing the difference?
(PIT reads are 1353 ns each, while RTC reads are 1142 ns.)
RTC edge at 99172986783, delta 0, range 7764, iter 7
RTC edge at 99173401719, delta 414936, range 7764, iter 106
RTC edge at 99173816543, delta 414824, range 7764, iter 106
RTC edge at 99174231391, delta 414848, range 7764, iter 106
RTC edge at 99174646119, delta 414728, range 7740, iter 106
RTC edge at 99175061351, delta 415232, range 7764, iter 106
RTC edge at 99175476399, delta 415048, range 7764, iter 106
RTC edge at 99175891223, delta 414824, range 7764, iter 106
RTC edge at 99176306071, delta 414848, range 7764, iter 106
RTC edge at 99176721415, delta 415344, range 7764, iter 106
RTC edge at 99177136551, delta 415136, range 7764, iter 106
RTC edge at 99177552010, delta 415459, range 7764, iter 106
RTC edge at 99177966831, delta 414821, range 7764, iter 106
RTC edge at 99178381567, delta 414736, range 7764, iter 106
RTC edge at 99178797226, delta 415659, range 7764, iter 106
RTC edge at 99179211959, delta 414733, range 7764, iter 106
RTC edge at 99179627007, delta 415048, range 7764, iter 106
RTC edge at 99180041943, delta 414936, range 7764, iter 106
RTC edge at 99180457378, delta 415435, range 7764, iter 106
RTC edge at 99180872335, delta 414957, range 7764, iter 106
RTC edge at 99181287271, delta 414936, range 7764, iter 106
RTC edge at 99181702095, delta 414824, range 7764, iter 106
RTC edge at 99182120815, delta 418720, range 7764, iter 107
RTC edge at 99182535935, delta 415120, range 7764, iter 106
RTC edge at 99182951095, delta 415160, range 7764, iter 106
RTC edge at 99183366215, delta 415120, range 7764, iter 106
RTC edge at 99183781082, delta 414867, range 7764, iter 106
RTC edge at 99184192639, delta 411557, range 7764, iter 105
RTC edge at 99184611431, delta 418792, range 7764, iter 107
RTC edge at 99185026391, delta 414960, range 7764, iter 106
RTC edge at 99185441527, delta 415136, range 7764, iter 106
RTC edge at 99185853135, delta 411608, range 8172, iter 105
RTC edge at 99186268658, delta 415523, range 7764, iter 106
RTC edge at 99186683687, delta 415029, range 7764, iter 106
RTC edge at 99187098415, delta 414728, range 7764, iter 106
RTC edge at 99187513351, delta 414936, range 7764, iter 106
RTC edge at 99187928287, delta 414936, range 7764, iter 106
RTC edge at 99188346895, delta 418608, range 7764, iter 107
RTC edge at 99188761946, delta 415051, range 7764, iter 106
RTC edge at 99189176679, delta 414733, range 7764, iter 106
RTC edge at 99189591818, delta 415139, range 7764, iter 106
RTC edge at 99190006959, delta 415141, range 7764, iter 106
RTC edge at 99190422303, delta 415344, range 7764, iter 106
RTC edge at 99190837463, delta 415160, range 7764, iter 106
RTC edge at 99191249135, delta 411672, range 7764, iter 105
RTC edge at 99191664071, delta 414936, range 7764, iter 106
RTC edge at 99192082679, delta 418608, range 7764, iter 107
RTC edge at 99192494127, delta 411448, range 7764, iter 105
RTC edge at 99192912735, delta 418608, range 7764, iter 107
RTC edge at 99193327602, delta 414867, range 7764, iter 106
tsc: Fast TSC calibration using PIT
Using user defined TSC freq: 3399.965 MHz
tsc: Detected 3399.965 MHz processor
Calibrating delay loop (skipped), value calculated using timer frequency.. 6802.26 BogoMIPS (lpj=11333216)
Here's my ugly code. The "skanky stuff" is a race condition that will
break if an NMI comes along, but I'm not sure that's possible this
early in the boot.
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a00f35be..7399d4c6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -22,6 +22,8 @@
#include <asm/nmi.h>
#include <asm/x86_init.h>
+#include <asm/mc146818rtc.h>
+
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -533,15 +535,15 @@ static inline int pit_verify_msb(unsigned char val)
static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
- int count;
- u64 tsc = 0, prev_tsc = 0;
+ int count = 0;
+ u64 prev_tsc, tsc = 0;
- for (count = 0; count < 50000; count++) {
- if (!pit_verify_msb(val))
- break;
+ do {
+ if (++count > 50000)
+ return 0;
prev_tsc = tsc;
tsc = get_cycles();
- }
+ } while (pit_verify_msb(val));
*deltap = get_cycles() - prev_tsc;
*tscp = tsc;
@@ -552,6 +554,158 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
return count > 5;
}
+/* Similar, but only a single read. And returns the number of reads. */
+static inline unsigned
+pit_expect_msb1(unsigned char val, u64 *tscp, unsigned long *deltap)
+{
+ int count = 0;
+ u64 prev_tsc, tsc = 0;
+
+ do {
+ if (++count > 50000)
+ return 0;
+ prev_tsc = tsc;
+ tsc = get_cycles();
+ } while (inb(0x42) == val);
+ *deltap = get_cycles() - prev_tsc;
+ *tscp = tsc;
+
+ return count;
+}
+
+static inline unsigned
+rtc_wait_bit(u64 *tscp, unsigned long *deltap)
+{
+ int count = 0;
+ u64 prev_tsc, tsc = 0;
+
+ do {
+ if (++count > 5000)
+ return 0;
+ prev_tsc = tsc;
+ tsc = get_cycles();
+ } while (~inb(RTC_PORT(1)) & RTC_PF); /* Wait for bit 6 to be set */
+ *deltap = get_cycles() - prev_tsc;
+ *tscp = tsc;
+
+ /*
+ * We require _some_ success, but the quality control
+ * will be based on the error terms on the TSC values.
+ */
+ return count;
+}
+
+#define SAMPLES 50
+static void noinline_for_stack
+pit_test(void)
+{
+ u64 tsc[SAMPLES];
+ unsigned long delta[SAMPLES];
+ unsigned count[SAMPLES];
+ int i;
+ unsigned char saved_a, saved_b;
+ unsigned long flags;
+ extern spinlock_t rtc_lock;
+
+ outb(0xb0, 0x43);
+
+ /* Start at 0xffff */
+ outb(0xff, 0x42);
+ outb(0xff, 0x42);
+
+ pit_verify_msb(0);
+
+ if (pit_expect_msb(0xff, tsc+0, delta+0)) {
+ int j;
+ u64 prev;
+ for (i = 1; i < SAMPLES; i++) {
+ if (!pit_expect_msb(0xff-i, tsc+i, delta+i))
+ break;
+ if (!pit_verify_msb(0xfe - i))
+ break;
+ }
+ prev = tsc[0];
+ for (j = 0; j < i; j++) {
+ printk("PIT edge at %12llu, delta %7llu, range %6lu\n",
+ tsc[j], tsc[j] - prev, delta[j]);
+ prev = tsc[j];
+ }
+ }
+
+ /* Try again, with one-byte reads */
+ outb(0xa0, 0x43);
+ outb(0xff, 0x42); /* Start at 0xff00 */
+
+ /* Wait until we reach 0xfe (should be very fast) */
+ pit_verify_msb(0);
+ for (i = 0; i < 1000; i++)
+ if (inb(0x42) == 0xfe)
+ break;
+
+ if (i < 1000) {
+ int j;
+ u64 prev;
+
+ for (i = 0; i < SAMPLES; i++) {
+ count[i] = pit_expect_msb1(0xfe -i, tsc+i, delta+i);
+ if (count[i] <= 5)
+ break;
+ if (inb(0x42) != 0xfd - i)
+ break;
+ }
+ prev = tsc[0];
+ for (j = 0; j < i; j++) {
+ printk("PIT edge at %12llu, delta %7llu, range %6lu, iter %u\n",
+ tsc[j], tsc[j] - prev, delta[j], count[j]);
+ prev = tsc[j];
+ }
+ }
+
+ /* Once more, with the RTC */
+ spin_lock_irqsave(&rtc_lock, flags);
+
+ lock_cmos_prefix(RTC_REG_C);
+/* This is skanky stuff that requires rewritten RTC locking to do properly */
+ outb(RTC_REG_B, RTC_PORT(0));
+ saved_b = inb(RTC_PORT(1));
+ outb(saved_b & 7, RTC_PORT(1)); /* Clear interrupt enables */
+
+ outb(RTC_REG_A, RTC_PORT(0));
+ saved_a = inb(RTC_PORT(1));
+ outb((saved_a & 0xf0) | 3, RTC_PORT(1)); /* Set 8 kHz rate */
+/* End of skanky stuff */
+
+ outb(RTC_REG_C, RTC_PORT(0));
+ inb(RTC_PORT(1)); /* Clear any pending */
+
+ for (i = 0; i < SAMPLES; i++) {
+ count[i] = rtc_wait_bit(tsc+i, delta+i);
+ if (count[i] <= 3)
+ break;
+ if (inb(RTC_PORT(1)) & RTC_PF)
+ break;
+ }
+
+ outb(RTC_REG_A, RTC_PORT(0));
+ outb(saved_a, RTC_PORT(1));
+ outb(RTC_REG_B, RTC_PORT(0));
+ outb(saved_b, RTC_PORT(1));
+
+ lock_cmos_suffix(RTC_REG_C);
+
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ {
+ int j;
+ u64 prev = tsc[0];
+ for (j = 0; j < i; j++) {
+ printk("RTC edge at %12llu, delta %7llu, range %6lu, iter %u\n",
+ tsc[j], tsc[j] - prev, delta[j], count[j]);
+ prev = tsc[j];
+ }
+ }
+}
+
/*
* How many MSB values do we want to see? We aim for
* a maximum error rate of 500ppm (in practice the
@@ -570,6 +724,8 @@ static unsigned long quick_pit_calibrate(void)
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+pit_test();
+
/*
* Counter 2, mode 0 (one-shot), binary count
*
* George Spelvin <[email protected]> wrote:
> Ingo Molnar wrote:
> > - Alternatively, I also tried a different method: to set up the RTC
> > periodic IRQ during early boot, but not have an IRQ handler, polling
> > RTC_PF in the rtc_cmos_read(RTC_INTR_FLAGS) IRQ status byte.
> >
> > Unfortunately when I do this then PIO based RTC accesses can take
> > tens of thousands of cycles, and the resulting jitter is pretty bad
> > and hard to filter:
>
> Did you use rtc_cmos_read()? [...]
Yeah, so initially I did, but then after I noticed the overhead I introduced:
+unsigned char rtc_cmos_read_again(void)
+{
+ return inb(RTC_PORT(1));
+}
+
which compiles to a single INB instruction.
This didn't change the delay/cost behavior.
The numbers I cited, with tens of thousands of cycles per iteration, were from
such an optimized poll loop already.
Thanks,
Ingo
* George Spelvin <[email protected]> wrote:
> It's running at 3.4 GHz, so I expect 729478 ticks per 256 PIT counts, and 415039
> ticks per 8192 Hz RTC tick.
> (PIT reads are 1353 ns each, while RTC reads are 1142 ns.)
>
> RTC edge at 99172986783, delta 0, range 7764, iter 7
> RTC edge at 99173401719, delta 414936, range 7764, iter 106
> RTC edge at 99173816543, delta 414824, range 7764, iter 106
> RTC edge at 99174231391, delta 414848, range 7764, iter 106
> RTC edge at 99174646119, delta 414728, range 7740, iter 106
> +static inline unsigned
> +rtc_wait_bit(u64 *tscp, unsigned long *deltap)
> +{
> + int count = 0;
> + u64 prev_tsc, tsc = 0;
> +
> + do {
> + if (++count > 5000)
> + return 0;
> + prev_tsc = tsc;
> + tsc = get_cycles();
> + } while (~inb(RTC_PORT(1)) & RTC_PF); /* Wait for bit 6 to be set */
> + *deltap = get_cycles() - prev_tsc;
> + *tscp = tsc;
> +/* This is skanky stuff that requries rewritten RTC locking to do properly */
[ Note that no RTC locking is needed so early during bootup: this is the boot CPU
only, with only a single task running, guaranteed. ]
So your code is very close to how I did the RTC sampling, except that I got:
[ 0.000000] tsc: RTC edge 57 from 0 to 64, at 29694678517, delta: 246360, jitter: 2456, loops: 7, 35194 cycles/loop
[ 0.000000] tsc: RTC edge 58 from 0 to 64, at 29695169485, delta: 490968, jitter: 244608, loops: 118, 4160 cycles/loop
[ 0.000000] tsc: RTC edge 59 from 0 to 64, at 29695413981, delta: 244496, jitter: -246472, loops: 6, 40749 cycles/loop
[ 0.000000] tsc: RTC edge 60 from 0 to 64, at 29695660661, delta: 246680, jitter: 2184, loops: 7, 35240 cycles/loop
[ 0.000000] tsc: RTC edge 61 from 0 to 64, at 29695904853, delta: 244192, jitter: -2488, loops: 6, 40698 cycles/loop
[ 0.000000] tsc: RTC edge 62 from 0 to 64, at 29696151141, delta: 246288, jitter: 2096, loops: 7, 35184 cycles/loop
[ 0.000000] tsc: RTC edge 63 from 0 to 64, at 29696396445, delta: 245304, jitter: -984, loops: 6, 40884 cycles/loop
[ 0.000000] tsc: RTC edge 64 from 0 to 64, at 29696642669, delta: 246224, jitter: 920, loops: 7, 35174 cycles/loop
[ 0.000000] tsc: RTC edge 65 from 0 to 64, at 29696887245, delta: 244576, jitter: -1648, loops: 6, 40762 cycles/loop
[ 0.000000] tsc: RTC edge 66 from 0 to 64, at 29697377909, delta: 490664, jitter: 246088, loops: 117, 4193 cycles/loop
[ 0.000000] tsc: RTC edge 67 from 0 to 64, at 29697622701, delta: 244792, jitter: -245872, loops: 6, 40798 cycles/loop
[ 0.000000] tsc: RTC edge 68 from 0 to 64, at 29697868773, delta: 246072, jitter: 1280, loops: 7, 35153 cycles/loop
[ 0.000000] tsc: RTC edge 69 from 0 to 64, at 29700569301, delta: 2700528, jitter: 2454456, loops: 13, 207732 cycles/loop
[ 0.000000] tsc: RTC edge 70 from 0 to 64, at 29700813805, delta: 244504, jitter: -2456024, loops: 6, 40750 cycles/loop
[ 0.000000] tsc: RTC edge 71 from 0 to 64, at 29701060125, delta: 246320, jitter: 1816, loops: 7, 35188 cycles/loop
[ 0.000000] tsc: RTC edge 72 from 0 to 64, at 29701550189, delta: 490064, jitter: 243744, loops: 117, 4188 cycles/loop
[ 0.000000] tsc: RTC edge 73 from 0 to 64, at 29701796677, delta: 246488, jitter: -243576, loops: 7, 35212 cycles/loop
[ 0.000000] tsc: RTC edge 74 from 0 to 64, at 29702040829, delta: 244152, jitter: -2336, loops: 6, 40692 cycles/loop
[ 0.000000] tsc: RTC edge 75 from 0 to 64, at 29702287597, delta: 246768, jitter: 2616, loops: 7, 35252 cycles/loop
[ 0.000000] tsc: RTC edge 76 from 0 to 64, at 29702531741, delta: 244144, jitter: -2624, loops: 6, 40690 cycles/loop
[ 0.000000] tsc: RTC edge 77 from 0 to 64, at 29702778341, delta: 246600, jitter: 2456, loops: 7, 35228 cycles/loop
[ 0.000000] tsc: RTC edge 78 from 0 to 64, at 29703022661, delta: 244320, jitter: -2280, loops: 6, 40720 cycles/loop
[ 0.000000] tsc: RTC edge 79 from 0 to 64, at 29703514245, delta: 491584, jitter: 247264, loops: 118, 4165 cycles/loop
[ 0.000000] tsc: RTC edge 80 from 0 to 64, at 29703759165, delta: 244920, jitter: -246664, loops: 6, 40820 cycles/loop
[ 0.000000] tsc: RTC edge 81 from 0 to 64, at 29704005397, delta: 246232, jitter: 1312, loops: 7, 35176 cycles/loop
[ 0.000000] tsc: RTC edge 82 from 0 to 64, at 29704249589, delta: 244192, jitter: -2040, loops: 6, 40698 cycles/loop
note the 'loops' column. When it's around 117, then the read cost corresponds
roughly to the cheap-ish INB cost you have measured: 4188 cycles/loop.
But note the frequent 30-40k cycles/loop outliers. They dominate the measurement
so filtering might not help.
And this is on a 'boring' 10 years old PC (Nvidia CK804 southbridge), with no HPET
and nothing particularly fancy that I'm aware of. I tried this system first
because I expected it to work and expected problems (with RTCs being emulated via
the HPET) on more modern systems.
If the RTC polling method is not reliable here, it might be doubly problematic on
other systems.
IRQ based RTC calibration worked a lot better, but is problematic from the boot
dependency POV. It's not unsolvable, but not trivial either - and I'd rather
prefer trivial (polling) methods so early during the bootup!
Thanks,
Ingo
> Ingo Molnar wrote:
>* George Spelvin <[email protected]> wrote:
>> Did you use rtc_cmos_read()? [...]
> Yeah, so initially I did, but then after I noticed the overhead I introduced:
> which compiles to a single INB instruction.
>
> This didn't change the delay/cost behavior.
>
> The numbers I cited, with tens of thousands of cycles per iteration,
> were from such an optimized poll loop already.
Apologies for doubting you!
>> /* This is skanky stuff that requries rewritten RTC locking to do properly */
> [ Note that no RTC locking is needed so early during bootup: this is
> the boot CPU only, with only a single task running, guaranteed. ]
Yes, I guessed I could get away with it, but I hadn't traced the code
far enough to be sure. But obviously I should either completely omit the
locking, or do it right. Half-assed is all-wrong.
> note the 'loops' column. When it's around 117, then the read cost corresponds
> roughly to the cheap-ish INB cost you have measured: 4188 cycles/loop.
>
> But note the frequent 30-40k cycles/loop outliers. They dominate the measurement
> so filtering might not help.
I don't quite understand how the numbers are derived. Why does 200K
cycles/loop give 13 loops, while 35K cycles/loop gives 7? Is cycles/loop
a maximum?
> And this is on a 'boring' 10 years old PC (Nvidia CK804 southbridge), with no HPET
> and nothing particularly fancy that I'm aware of. I tried this system first
> because I expected it to work and expected problems (with RTCs being emulated via
> the HPET) on more modern systems.
>
> If the RTC polling method is not reliable here, it might be doubly problematic on
> other systems.
This is definitely an "if it ain't broke, don't fix it" area.
Trying things is interesting; actually changing the kernel is not
to be undertaken lightly.
* George Spelvin <[email protected]> wrote:
> > Ingo Molnar wrote:
> >* George Spelvin <[email protected]> wrote:
> >> Did you use rtc_cmos_read()? [...]
>
> > Yeah, so initially I did, but then after I noticed the overhead I introduced:
> > which compiles to a single INB instruction.
> >
> > This didn't change the delay/cost behavior.
> >
> > The numbers I cited, with tens of thousands of cycles per iteration,
> > were from such an optimized poll loop already.
>
> Apologies for doubting you!
No apologies needed: I should really have posted my code, but the boot
dependencies hackery I had to perform was way too embarrassing to post ...
> > note the 'loops' column. When it's around 117, then the read cost corresponds
> > roughly to the cheap-ish INB cost you have measured: 4188 cycles/loop.
> >
> > But note the frequent 30-40k cycles/loop outliers. They dominate the
> > measurement so filtering might not help.
>
> I don't quite understand how the numbers are derived. Why does 200K
> cycles/loop give 13 loops, while 35K cycles/loop gives 7? Is cycles/loop
> a maximum?
it's delta/loops. So the 200K line:
[ 0.000000] tsc: RTC edge 69 from 0 to 64, at 29700569301, delta: 2700528, jitter: 2454456, loops: 13, 207732 cycles/loop
had a very big 'delta' outlier, ~1.3 msecs when we did not manage to detect any
RTC edge.
I'll run your code as well, to make sure it's not something bad in my code.
Thanks,
Ingo
> I'll run your code as well, to make sure it's not something bad in my code.
Here's a modified version that uses less stack space (no need to store
all 64 bits of a timestamp), and captures a window around an RTC periodic
flag edge to explore WTF is going on there.
commit 769eba0b589141edca3541cfb1e30e01b806e5cb
Author: George Spelvin <[email protected]>
Date: Thu Jun 4 22:04:19 2015 -0400
x86, tsc: Add test code.
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a00f35be..00ff0359 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -22,6 +22,8 @@
#include <asm/nmi.h>
#include <asm/x86_init.h>
+#include <asm/mc146818rtc.h>
+
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -533,15 +535,15 @@ static inline int pit_verify_msb(unsigned char val)
static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
- int count;
- u64 tsc = 0, prev_tsc = 0;
+ int count = 0;
+ u64 prev_tsc, tsc = 0;
- for (count = 0; count < 50000; count++) {
- if (!pit_verify_msb(val))
- break;
+ do {
+ if (++count > 50000)
+ return 0;
prev_tsc = tsc;
tsc = get_cycles();
- }
+ } while (pit_verify_msb(val));
*deltap = get_cycles() - prev_tsc;
*tscp = tsc;
@@ -552,6 +554,177 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
return count > 5;
}
+/* Similar, but only a single read. And returns the number of reads. */
+static inline unsigned
+pit_expect_msb1(unsigned char val, unsigned *tscp, unsigned *deltap)
+{
+ int count = 0;
+ unsigned prev_tsc, tsc = 0;
+
+ do {
+ if (++count > 50000)
+ return 0;
+ prev_tsc = tsc;
+ tsc = (unsigned)get_cycles();
+ } while (inb(0x42) == val);
+ *deltap = (unsigned)get_cycles() - prev_tsc;
+ *tscp = tsc;
+
+ return count;
+}
+
+static inline unsigned
+rtc_wait_bit(unsigned *tscp, unsigned *deltap)
+{
+ int count = 0;
+ unsigned prev_tsc, tsc = 0;
+
+ do {
+ if (++count > 5000)
+ return 0;
+ prev_tsc = tsc;
+ tsc = (unsigned)get_cycles();
+ } while (~inb(RTC_PORT(1)) & RTC_PF); /* Wait for bit 6 to be set */
+ *deltap = (unsigned)get_cycles() - prev_tsc;
+ *tscp = tsc;
+
+ /*
+ * We require _some_ success, but the quality control
+ * will be based on the error terms on the TSC values.
+ */
+ return count;
+}
+
+#define SAMPLES 64
+
+static void noinline_for_stack
+pit_test(void)
+{
+ unsigned tsc[SAMPLES+1]; /* TSC at each edge (low 32 bits) */
+ unsigned range[SAMPLES+1]; /* Range of uncertainty */
+ unsigned iter[SAMPLES]; /* Number of iterations for capture */
+ int i, j;
+ unsigned char saved_a, saved_b;
+ unsigned long flags;
+ extern spinlock_t rtc_lock;
+
+ outb(0xb0, 0x43);
+
+ /* Start at 0xffff */
+ outb(0xff, 0x42);
+ outb(0xff, 0x42);
+
+ pit_verify_msb(0);
+
+ /*
+ * Among the evil non-portable hacks this code does, it exploits
+ * the fact that x86 is little-endian and allows unaligned stores
+ * to store 64-bit values into an array of 32-bit values, where
+ * each one overwrites the high half of the one before.
+ */
+ if (pit_expect_msb(0xff, (u64 *)tsc, (unsigned long *)range)) {
+ for (i = 1; i < SAMPLES; i++) {
+ if (!pit_expect_msb(0xff - i, (u64 *)(tsc+i), (unsigned long *)(range+i)))
+ break;
+ if (!pit_verify_msb(0xfe - i))
+ break;
+ }
+ printk("** 2-byte PIT timing\n");
+ for (j = 1; j < i; j++)
+ printk("PIT edge delta %7u, range %6u\n",
+ tsc[j] - tsc[j-1], range[j]);
+ }
+
+ /* Try again, with one-byte reads */
+ outb(0xa0, 0x43);
+ outb(0xff, 0x42); /* Start at 0xff00 */
+
+ /* Wait until we reach 0xfe (should be very fast) */
+ pit_verify_msb(0);
+ for (i = 0; i < 1000; i++)
+ if (inb(0x42) == 0xfe)
+ break;
+
+ if (i < 1000) {
+ for (i = 0; i < SAMPLES; i++) {
+ iter[i] = pit_expect_msb1(0xfe - i, tsc+i, range+i);
+ if (iter[i] <= 5)
+ break;
+ if (inb(0x42) != 0xfd - i)
+ break;
+ }
+ printk("** 1-byte PIT timing\n");
+ for (j = 1; j < i; j++)
+ printk("PIT edge delta %7u, range %6u, iter %u\n",
+ tsc[j] - tsc[j-1], range[j], iter[j]);
+ }
+
+ /* Once more, with the RTC */
+ spin_lock_irqsave(&rtc_lock, flags);
+
+ lock_cmos_prefix(RTC_REG_C);
+/* This is skanky stuff that requires rewritten RTC locking to do properly */
+ outb(RTC_REG_B, RTC_PORT(0));
+ saved_b = inb(RTC_PORT(1));
+ outb(saved_b & 7, RTC_PORT(1)); /* Clear interrupt enables */
+
+ outb(RTC_REG_A, RTC_PORT(0));
+ saved_a = inb(RTC_PORT(1));
+ outb((saved_a & 0xf0) | 3, RTC_PORT(1)); /* Set 8 kHz rate */
+/* End of skanky stuff */
+
+ outb(RTC_REG_C, RTC_PORT(0));
+ inb(RTC_PORT(1)); /* Clear any pending */
+
+ for (i = 0; i < SAMPLES; i++) {
+ iter[i] = rtc_wait_bit(tsc+i, range+i);
+ if (iter[i] <= 3)
+ break;
+ if (inb(RTC_PORT(1)) & RTC_PF)
+ break;
+ }
+
+ lock_cmos_suffix(RTC_REG_C);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ printk("** RTC timing\n");
+ for (j = 1; j < i; j++) {
+ printk("RTC edge delta %7u, range %6u, iter %u\n",
+ tsc[j] - tsc[j-1], range[j], iter[j]);
+ }
+
+ /* Collect different statistics: per-read timing */
+ spin_lock_irqsave(&rtc_lock, flags);
+ lock_cmos_prefix(RTC_REG_C);
+ outb(RTC_REG_C, RTC_PORT(0));
+ inb(RTC_PORT(1)); /* Clear any pending */
+
+ /* Capture a series of timestamps straddling a bit change */
+ j = 10000;
+ for (i = 0; i < j; i++) {
+ tsc[i % (unsigned)SAMPLES] = (unsigned)get_cycles();
+ if (inb(RTC_PORT(1)) & RTC_PF && i >= SAMPLES/2 && j == 10000)
+ j = i + SAMPLES/2;
+ }
+
+ /* Restore the RTC state */
+ outb(RTC_REG_A, RTC_PORT(0));
+ outb(saved_a, RTC_PORT(1));
+ outb(RTC_REG_B, RTC_PORT(0));
+ outb(saved_b, RTC_PORT(1));
+
+ lock_cmos_suffix(RTC_REG_C);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ printk("** RTC timing details\n");
+ for (j = 1; j < SAMPLES; j++) {
+ unsigned k = i + j - SAMPLES;
+ printk("RTC sample %3d: %7u%s\n", k,
+ tsc[k % SAMPLES] - tsc[(k-1) % SAMPLES],
+ j == SAMPLES/2 ? " *" : "");
+ }
+}
+
/*
* How many MSB values do we want to see? We aim for
* a maximum error rate of 500ppm (in practice the
@@ -570,6 +743,8 @@ static unsigned long quick_pit_calibrate(void)
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+pit_test();
+
/*
* Counter 2, mode 0 (one-shot), binary count
*
And, for interest's sake, I went to a somewhat older machine: a 2008
MSI K9A2 platinum with DDR2, a Socket 939 AMD processor, SB600 south
bridge and (most importantly for this) a Fintek F71882FG super-IO
chip providing the PIT and RTC.
Anyway, the results are similar, again with the RTC being faster to
access than the PIT (by 14% this time).
BUT! Two significant things failed, and I need to dig into why.
First, although my main RTC loop saw the PF edges at the expected
intervals of 2.5 GHz/8192 Hz = 305176, the second loop didn't; it
hit the safety timeout of 10,000 samples.
Second, fast RTC calibration failed for some reason.
I need to dig deeper into why.
Anyway, here's the kernel log, with "MEAN" and "Std.Dev" lines
added at the end of each group.
It's interesting that the standard deviation for two PIT reads is lower
than that for one. If the times were independent, you'd expect it to
be a factor of sqrt(2) higher.
hpet clockevent registered
** 2-byte PIT timing
PIT edge delta 538765, range 12453
PIT edge delta 535720, range 12493
PIT edge delta 535605, range 12418
PIT edge delta 535640, range 12493
PIT edge delta 535760, range 12413
PIT edge delta 535520, range 12493
PIT edge delta 535680, range 12453
PIT edge delta 535715, range 12413
PIT edge delta 541920, range 12493
PIT edge delta 535640, range 12533
PIT edge delta 535565, range 12373
PIT edge delta 535680, range 12458
PIT edge delta 535600, range 12413
PIT edge delta 535760, range 12373
PIT edge delta 535640, range 12493
PIT edge delta 535675, range 12413
PIT edge delta 535840, range 12413
PIT edge delta 541725, range 12373
PIT edge delta 535635, range 12413
PIT edge delta 535565, range 12453
PIT edge delta 535795, range 12453
PIT edge delta 535645, range 12493
PIT edge delta 535635, range 12573
PIT edge delta 535720, range 12493
PIT edge delta 535560, range 12533
PIT edge delta 542000, range 12493
PIT edge delta 535600, range 12493
PIT edge delta 535645, range 12493
PIT edge delta 535715, range 12533
PIT edge delta 535475, range 12493
PIT edge delta 535805, range 12493
PIT edge delta 535600, range 12493
PIT edge delta 535605, range 12453
PIT edge delta 541995, range 12493
PIT edge delta 535675, range 12488
PIT edge delta 535650, range 12493
PIT edge delta 535635, range 12458
PIT edge delta 535680, range 12453
PIT edge delta 535605, range 12413
PIT edge delta 535675, range 12373
PIT edge delta 535725, range 12493
PIT edge delta 541835, range 12413
PIT edge delta 535685, range 12493
PIT edge delta 535635, range 12493
PIT edge delta 535680, range 12413
MEAN: 536420.55 12462.11
Std.Dev: 2012.8 47.95
** 1-byte PIT timing
PIT edge delta 535757, range 6295, iter 166
PIT edge delta 536680, range 6295, iter 169
PIT edge delta 536810, range 6295, iter 169
PIT edge delta 536715, range 6455, iter 169
PIT edge delta 536675, range 6335, iter 169
PIT edge delta 536085, range 6295, iter 169
PIT edge delta 536760, range 6415, iter 169
PIT edge delta 536520, range 6415, iter 169
PIT edge delta 536520, range 6292, iter 169
PIT edge delta 537235, range 6300, iter 169
PIT edge delta 536640, range 6255, iter 169
PIT edge delta 536440, range 6375, iter 169
PIT edge delta 536560, range 6295, iter 169
PIT edge delta 534398, range 6375, iter 168
PIT edge delta 536800, range 6337, iter 169
PIT edge delta 536325, range 6335, iter 169
PIT edge delta 537075, range 6375, iter 169
PIT edge delta 536680, range 6370, iter 169
PIT edge delta 536725, range 6330, iter 169
PIT edge delta 534042, range 6375, iter 168
PIT edge delta 536990, range 6540, iter 169
PIT edge delta 536570, range 6335, iter 169
PIT edge delta 536910, range 6255, iter 169
PIT edge delta 536605, range 6255, iter 169
PIT edge delta 536645, range 6335, iter 169
PIT edge delta 536995, range 6260, iter 169
PIT edge delta 536480, range 6255, iter 169
PIT edge delta 536485, range 6255, iter 169
PIT edge delta 533908, range 6295, iter 168
PIT edge delta 536610, range 6377, iter 169
PIT edge delta 536802, range 6335, iter 169
PIT edge delta 536668, range 6375, iter 169
PIT edge delta 536605, range 6295, iter 169
PIT edge delta 536800, range 6415, iter 169
PIT edge delta 536525, range 6410, iter 169
PIT edge delta 536920, range 6416, iter 169
PIT edge delta 536435, range 6413, iter 169
PIT edge delta 537000, range 6255, iter 169
PIT edge delta 536525, range 6250, iter 169
MEAN: 536459.49 6336.92 168.85
Std.Dev: 735.73 66.94 0.54
** RTC timing
RTC edge delta 303365, range 5535, iter 109
RTC edge delta 306037, range 5735, iter 110
RTC edge delta 306438, range 5615, iter 110
RTC edge delta 303685, range 5535, iter 109
RTC edge delta 306047, range 5535, iter 110
RTC edge delta 609673, range 5495, iter 220
RTC edge delta 306209, range 5535, iter 110
RTC edge delta 303708, range 5535, iter 109
RTC edge delta 305838, range 5535, iter 110
RTC edge delta 306202, range 5535, iter 110
RTC edge delta 609643, range 5495, iter 220
RTC edge delta 306522, range 5535, iter 110
RTC edge delta 303280, range 5535, iter 109
RTC edge delta 306273, range 5537, iter 110
RTC edge delta 303640, range 5535, iter 109
RTC edge delta 306442, range 5490, iter 110
RTC edge delta 306438, range 5535, iter 110
RTC edge delta 303640, range 5535, iter 109
RTC edge delta 306322, range 5535, iter 110
RTC edge delta 303125, range 5455, iter 109
RTC edge delta 306513, range 5535, iter 110
RTC edge delta 306527, range 5575, iter 110
RTC edge delta 303435, range 5535, iter 109
RTC edge delta 306243, range 5535, iter 110
RTC edge delta 303680, range 5695, iter 109
RTC edge delta 306202, range 5535, iter 110
RTC edge delta 306153, range 5540, iter 110
RTC edge delta 303485, range 5535, iter 109
RTC edge delta 306197, range 5497, iter 110
RTC edge delta 303690, range 5535, iter 109
RTC edge delta 306273, range 5535, iter 110
RTC edge delta 306282, range 5535, iter 110
RTC edge delta 303035, range 5535, iter 109
RTC edge delta 306283, range 5495, iter 110
RTC edge delta 306277, range 5535, iter 110
RTC edge delta 303560, range 5535, iter 109
RTC edge delta 305918, range 5530, iter 110
RTC edge delta 306247, range 5535, iter 110
RTC edge delta 303870, range 5535, iter 109
RTC edge delta 306448, range 5535, iter 110
RTC edge delta 303205, range 5695, iter 109
RTC edge delta 306432, range 5735, iter 110
RTC edge delta 303600, range 5535, iter 109
RTC edge delta 306233, range 5455, iter 110
RTC edge delta 306327, range 5535, iter 110
RTC edge delta 303440, range 5535, iter 109
RTC edge delta 306523, range 5535, iter 110
RTC edge delta 303710, range 5535, iter 109
RTC edge delta 306172, range 5495, iter 110
RTC edge delta 306078, range 5535, iter 110
RTC edge delta 303395, range 5535, iter 109
RTC edge delta 306244, range 5655, iter 110
RTC edge delta 306316, range 5535, iter 110
RTC edge delta 303360, range 5575, iter 109
RTC edge delta 306442, range 5655, iter 110
RTC edge delta 305758, range 5535, iter 110
RTC edge delta 303725, range 5540, iter 109
RTC edge delta 306437, range 5495, iter 110
RTC edge delta 303645, range 5535, iter 109
RTC edge delta 306473, range 5535, iter 110
RTC edge delta 303645, range 5535, iter 109
RTC edge delta 306397, range 5495, iter 110
RTC edge delta 306003, range 5655, iter 110
MEAN: 314895.32 5547.13 113.13
Std.Dev: 53818.41 56.89 19.51
** RTC timing details
RTC sample 9937: 2759
RTC sample 9938: 2761
RTC sample 9939: 2759
RTC sample 9940: 2881
RTC sample 9941: 2759
RTC sample 9942: 2756
RTC sample 9943: 2759
RTC sample 9944: 2761
RTC sample 9945: 2759
RTC sample 9946: 2761
RTC sample 9947: 2759
RTC sample 9948: 2761
RTC sample 9949: 2759
RTC sample 9950: 2761
RTC sample 9951: 2759
RTC sample 9952: 2721
RTC sample 9953: 2719
RTC sample 9954: 2761
RTC sample 9955: 2759
RTC sample 9956: 2761
RTC sample 9957: 2799
RTC sample 9958: 2761
RTC sample 9959: 2759
RTC sample 9960: 2761
RTC sample 9961: 2719
RTC sample 9962: 2721
RTC sample 9963: 2759
RTC sample 9964: 2761
RTC sample 9965: 2759
RTC sample 9966: 2761
RTC sample 9967: 2879
RTC sample 9968: 2761 *
RTC sample 9969: 2759
RTC sample 9970: 2761
RTC sample 9971: 2759
RTC sample 9972: 2761
RTC sample 9973: 2759
RTC sample 9974: 2761
RTC sample 9975: 2759
RTC sample 9976: 2761
RTC sample 9977: 2759
RTC sample 9978: 2761
RTC sample 9979: 2759
RTC sample 9980: 2761
RTC sample 9981: 2759
RTC sample 9982: 2761
RTC sample 9983: 2719
RTC sample 9984: 2721
RTC sample 9985: 2764
RTC sample 9986: 2876
RTC sample 9987: 2764
RTC sample 9988: 2801
RTC sample 9989: 2759
RTC sample 9990: 2761
RTC sample 9991: 2759
RTC sample 9992: 2761
RTC sample 9993: 2759
RTC sample 9994: 2761
RTC sample 9995: 2754
RTC sample 9996: 2761
RTC sample 9997: 2759
RTC sample 9998: 2761
RTC sample 9999: 2759
MEAN: 2763.08
Std.Dev: 29.68
tsc: Fast TSC calibration failed
tsc: Unable to calibrate against PIT
tsc: using HPET reference calibration
Using user defined TSC freq: 2500.210 MHz
tsc: Detected 2500.210 MHz processor
Calibrating delay loop (skipped), value calculated using timer frequency.. 5000.42 BogoMIPS (lpj=25002100)
It's fundamentally the same, but more robust to occasional long delays
when accessing the PIT.
In particular, the old code was susceptible to failing if the initial
PIT read was slow. This revised code will move the timing start if
it's a sufficient improvement.
Another small change that simplified the code was to give up after the
55 ms PIT timer wraparound, rather than 50 ms.
I have a test machine where the old code fails reliably and this
code works.
I've gone wild with the comments, but I don't think the code is much
more complex.
Comments solicited.
Signed-off-by: George Spelvin <[email protected]>
---
arch/x86/kernel/tsc.c | 201 ++++++++++++++++++++++++++++++++------------------
1 file changed, 130 insertions(+), 71 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a00f35be..fa8e73b5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -524,48 +524,38 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
* use the TSC value at the transitions to calculate a pretty
* good value for the TSC frequencty.
*/
-static inline int pit_verify_msb(unsigned char val)
-{
- /* Ignore LSB */
- inb(0x42);
- return inb(0x42) == val;
-}
-
-static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
-{
- int count;
- u64 tsc = 0, prev_tsc = 0;
-
- for (count = 0; count < 50000; count++) {
- if (!pit_verify_msb(val))
- break;
- prev_tsc = tsc;
- tsc = get_cycles();
- }
- *deltap = get_cycles() - prev_tsc;
- *tscp = tsc;
-
- /*
- * We require _some_ success, but the quality control
- * will be based on the error terms on the TSC values.
- */
- return count > 5;
-}
/*
- * How many MSB values do we want to see? We aim for
- * a maximum error rate of 500ppm (in practice the
- * real error is much smaller), but refuse to spend
- * more than 50ms on it.
+ * Return the TSC tick rate in kHz, or 0.
+ *
+ * This tries to get within 500 ppm of reality as fast as possible,
+ * by polling the PIT and looking for clock edges.
+ *
+ * We abort and go to a slower technique if it isn't working before
+ * the PIT wraps, which happens every 55 ms. (88 * 3 * 65536 / 315e6 =
+ * 0.0549254095238 s, if you want to be specific.)
+ *
+ * Each PIT access takes approximately 1 us, so we should not ever do
+ * more than 64K. Include an anti-livelock limit of 2x128K.
+ *
+ * The big annoyance is that sometimes there are large delays accessing
+ * the PIT (hardware, SMM, emulator, whatever), which add a lot of
+ * jitter to the collected timestamps. So this is all about measuring
+ * the time it takes to read the PIT and bounding the uncertainty.
*/
-#define MAX_QUICK_PIT_MS 50
-#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+#define ONE_BYTE_PIT 0
+
+#define NREPS 3 /* Number of repeat reads required to believe it */
+#define BUFMASK 7 /* Must be power of 2 minus 1, and >= NREPS+3 */
static unsigned long quick_pit_calibrate(void)
{
- int i;
- u64 tsc, delta;
- unsigned long d1, d2;
+ unsigned char msb1 = 0, msb2 = 0xff; /* PIT msbytes */
+ u64 tsc1 = tsc1, tsc2; /* When PIT rolled over to msb1/2 */
+ unsigned d1 = 1, d2; /* How long it took to read the PIT */
+ int reps = 0; /* Number of times we've read msb2 */
+ int limit = 1<<17; /* Maximum number of PIT reads */
+ u64 tsc[BUFMASK+1]; /* Circular buffer of captured TSC values */
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
@@ -584,60 +574,129 @@ static unsigned long quick_pit_calibrate(void)
/* Start at 0xffff */
outb(0xff, 0x42);
outb(0xff, 0x42);
-
/*
* The PIT starts counting at the next edge, so we
* need to delay for a microsecond. The easiest way
* to do that is to just read back the 16-bit counter
* once from the PIT.
*/
- pit_verify_msb(0);
+ (void)inb(0x42);
+ (void)inb(0x42);
- if (pit_expect_msb(0xff, &tsc, &d1)) {
- for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
- if (!pit_expect_msb(0xff-i, &delta, &d2))
- break;
- /*
- * Iterate until the error is less than 500 ppm
- */
- delta -= tsc;
- if (d1+d2 >= delta >> 11)
- continue;
+#if ONE_BYTE_PIT
+ outb(0xa0, 0x43); /* Set to MSB-only mode */
+#endif
+ tsc[(limit + 1) & BUFMASK] = get_cycles();
+ do {
+ unsigned char msb;
+
+#if !ONE_BYTE_PIT
+ inb(0x42);
+#endif
+ msb = inb(0x42);
+ tsc[limit & BUFMASK] = get_cycles();
+ /* Now, figure out what to do with what we just collected */
+ if (msb != msb2) {
/*
- * Check the PIT one more time to verify that
- * all TSC reads were stable wrt the PIT.
- *
- * This also guarantees serialization of the
- * last cycle read ('d2') in pit_expect_msb.
+ * This is the boring case. We've seen the PIT
+ * tick over, but we don't trust it until seeing
+ * the value NREPS additional times. Also deal
+ * with the PIT wrapping (by failing), and with it
+ * decrementing more than once (by ignoring this
+ * edge).
*/
- if (!pit_verify_msb(0xfe - i))
- break;
+ if (msb > msb2)
+ break; /* PIT wrapped, fail */
+ reps = (msb == msb2 - 1) ? 0 : NREPS;
+ msb2 = msb;
+ continue;
+ }
+ if (++reps != NREPS)
+ continue;
+
+ /*
+ * Okay, the interesting case. This is the slow path,
+ * which is only executed just after the PIT rolls over,
+ * so a few extra cycles don't matter.
+ */
+
+ /* The PIT rolled over sometime in this window */
+ tsc2 = tsc[(limit + NREPS) & BUFMASK] -
+ tsc[(limit + NREPS + 2) & BUFMASK];
+ /*
+ * At 1 us per PIT access, and two accesses per read, delta
+ * should be below 5 us. Assuming the TSC frequency is
+ * less than 10 GHz, that means that the delta should be
+ * less than 50,000. If we read it more than a million,
+ * just ignore it.
+ *
+ * One million is magic because if d1 and d2 are both
+ * less than 2^20, then (d1 + d2) << 11 is guaranteed to fit
+ * into 32 bits.
+ */
+ if (tsc2 >= 0x100000)
+ continue;
+ d2 = (unsigned)tsc2;
+ tsc2 = tsc[(limit + NREPS + 1) & BUFMASK];
+
+ /*
+ * Should we use this as a new starting point? There are
+ * msb2 more opportunities to read the PIT, so if
+ * d2/msb2 is less than the current d1/msb1, it's
+ * better.
+ *
+ * By initializing d1 (to non-zero) and msb1 (to zero)
+ * correctly, this also makes the initial d1/msb1
+ * "infinitely bad" and will be replaced ASAP.
+ */
+ if (d2 * msb1 <= d1 * msb2) {
+ d1 = d2;
+ msb1 = msb2;
+ tsc1 = tsc2;
+ continue;
+ }
+ /*
+ * Now, is the solution found so far good enough?
+ * The uncertainty is d1+d2, and the time difference
+ * is the difference between the timestamps.
+ *
+ * Note that the shift up by 11 just barely fits into
+ * 32 bits; if you want to increase the precision,
+ * widen the sum or shift delta down instead.
+ *
+ * Also note that the use of <= in the above condition
+ * makes it impossible to reach this point without d1,
+ * msb1 and tsc1 set up, although GCC doesn't know it.
+ */
+ tsc2 -= tsc1;
+ if ((d1+d2) << 11 < tsc2)
goto success;
- }
- }
- pr_info("Fast TSC calibration failed\n");
+ /*
+ * Finally, we impose an upper limit on the number of
+ * times through the loop to ensure the loop eventually
+ * terminates.
+ */
+
+ } while (--limit);
+ pr_info("Fast TSC calibration failed (limit %u msb1 %d msb2 %d)\n",
+ limit, msb1, msb2);
return 0;
success:
/*
- * Ok, if we get here, then we've seen the
- * MSB of the PIT decrement 'i' times, and the
- * error has shrunk to less than 500 ppm.
- *
- * As a result, we can depend on there not being
- * any odd delays anywhere, and the TSC reads are
- * reliable (within the error).
+ * The PIT has ticked 256*(msb1 - msb2) times, at a rate of
+ * PIT_TICK_RATE, in tsc2 TSC ticks.
*
* kHz = ticks / time-in-seconds / 1000;
- * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
- * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
+ * kHz = tsc2 / ((msb1 - msb2) * 256 / PIT_TICK_RATE) / 1000
+ * kHz = (tsc2 * PIT_TICK_RATE) / ((msb1 - msb2) * 256 * 1000)
*/
- delta *= PIT_TICK_RATE;
- do_div(delta, i*256*1000);
- pr_info("Fast TSC calibration using PIT\n");
- return delta;
+ tsc2 = tsc2 * PIT_TICK_RATE / ((msb1 - msb2) * 128 * 1000);
+ pr_info("Fast TSC calibration using PIT: %d(%u)..%d(%u)\n",
+ msb1, d1, msb2, d2);
+ return (tsc2 + 1) >> 1; /* Round to nearest */
}
/**
--
2.1.4
On 09/06/15 09:54, George Spelvin wrote:
> It's fundamentally the same, but more robust to occasional long delays
> when accessing the PIT.
>
> In particular, the old code was susceptible to failing if the initial
> PIT read was slow. This revised code will move the timing start if
> it's a sufficient improvement.
>
> Another small change that simplified the code was to give up after the
> 55 ms PIT timer wraparound, rather than 50 ms.
>
> I have a test machine where the old code fails reliably and this
> code works.
>
> I've gone wild with the comments, but I don't think the code is much
> more complex.
>
> Comments solicited.
Hi
I am not really sure what problem you are trying to solve.
I tried this patch on my problem hardware but it failed both with
ONE_BYTE_PIT set to 0 and set to 1. I am not sure it addresses the
'really-long-latency to read the counter' problem that I have.
A bigger issue for my case is that "slow" calibration is not that slow,
taking only 10ms anyway which is much better than the 50ms max for so-called
"quick" calibration.
So I much prefer the second patch that I posted, which just skips out of
quick_pit_calibrate() if the read latency is too long to succeed.
Regards
Adrian
Adrian Hunter wrote:
> I am not really sure what problem you are trying to solve.
I'm solving the problem I ran into testing variants on your patch!
Some fairly normal hardware that I have fails reliably with the
old quick code:
-> tsc: Fast TSC calibration failed: code=0 i=30 j=0 d=12453,103342771544
-> tsc: Unable to calibrate against PIT
-> tsc: using HPET reference calibration
-> Using user defined TSC freq: 2500.210 MHz
-> tsc: Detected 2500.210 MHz processor
(The "code=0" is some debugging stuff I added.)
And works with this:
-> tsc: Fast TSC calibration using PIT: 254(12315)..159(12480)
-> Using user defined TSC freq: 2500.210 MHz
-> tsc: Detected 2500.210 MHz processor
> I tried this patch on my problem hardware but it failed both with
> ONE_BYTE_PIT set to 0 and set to 1. I am not sure it addresses the
> 'really-long-latency to read the counter' problem that I have.
I'm sure it *doesn't* address your problem; it's solving something else.
It works if the read latency is only *sometimes* slow. If it's *always*
slow (your problem), your solution or some variant would be required.
> A bigger issue for my case is that "slow" calibration is not that slow,
> taking only 10ms anyway which is much better than the 50ms max for so-called
> "quick" calibration.
I need to study how that part works and what it does.
Adrian Hunter wrote:
> A bigger issue for my case is that "slow" calibration is not that slow,
> taking only 10ms anyway which is much better than the 50ms max for so-called
> "quick" calibration.
I read the code, and after figuring out which comments are wrong,
this is absolutely right. The "quick" code takes 20 ms if it works,
or 50 ms if it has problems. The fallback code can finish in 10 ms.
When bit rot has set in to this degree, something needs to be done.
Given that people would like to boot VMs in less than 1 second, 20 ms is
2% of that budget, and something that is worth trying to improve.
* The current situation
While it would be lovely if there was a reliable hardware register
with the TSC frequency, there isn't.
And as long as there's a battle between Intel's sales people (who love
locking processor frequencies and charging more for faster ones) and
motherboard makers and overclockers (who will go to great lengths to
get around such limits), I doubt there will be a trustworthy way for the
CPU to determine its own clock rate without using off-chip peripherals.
** quick_pit_calibrate()
The first thing to explain is why quick_pit_calibrate() takes 20 ms
in the best case.
Quick_pit_calibrate() is polling the PIT and looking for a carry from
the lsbyte to the msbyte. When it sees the msbyte change, it assigns
an uncertainty window, which includes the TSC value when the carry
happened. It starts before the last read of the old value, and ends
after the first read of the new value. (This is returned in *deltap
from pit_expect_msb().)
This includes four inb() operations and two rdtsc. Given that inb()
takes a bit over a microsecond on normal, working hardware, that's a
total of 5 us of uncertainty. (5.4 us on the machine on my desk.)
Add the uncertainty at both ends of the time interval, and you have 10 us.
Now, quick_pit_calibrate() loops until the uncertainty is less than 1/2048
of the time interval, i.e. the TSC rate is estimated to within 500 ppm. Meaning
that it will stop after 20 ms. (22 ms for the machine on my desk.)
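To make that arithmetic explicit (a throwaway user-space check, using the
~1.35 us per-inb() cost measured earlier as an assumption and ignoring the
rdtsc cost):
	#include <stdio.h>
	int main(void)
	{
		double inb_us  = 1.35;		/* assumed cost of one inb() */
		double edge_us = 4 * inb_us;	/* 4 inb() in each uncertainty window */
		double both_us = 2 * edge_us;	/* windows at both ends of the interval */
		/* 500 ppm (1/2048) target: interval must exceed 2048x the uncertainty */
		printf("minimum interval: %.1f ms\n", both_us * 2048 / 1000.0);
		return 0;
	}
which prints about 22 ms, i.e. the figure above.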
** native_pit_calibrate()
The "slow" calibration, on the other hand, tries to do it in 10 ms
(CAL_MS = 10; it falls back to CAL2_MS = 50 if 10 doesn't seem to
be enough).
The "slow" code uses the PIT and one extra timer if available: the
HPET if possible, the ACPI PM timer (and all its fun hardware bugs;
see https://lkml.org/lkml/2008/8/10/77 for details) if not.
It tries a number of things, all of which can fail, and
repeats until it gets something it feels it can go with.
The core is pit_calibrate_tsc(), which programs the PIT for a 10 ms
timeout, and alternates get_cycles() with watching inb(0x61)
for the timer output bit to go high.
During this process, it measures the duration (in TSC cycles) of
each inb(), and if things look wonky (the maximum is more than
10x the minimum), fails (returns ULONG_MAX).
Otherwise, it computes the TSC rate as kHz = (tsc2 - tsc1) / 10 ms.
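The body of that core looks roughly like this (a simplified sketch of the
shape just described, not the actual kernel source; latch and ms are the
values the caller passes in):
	u64 t1, t2, now, prev, d;
	u64 tscmin = ~0ULL, tscmax = 0;
	/* Counter 2, mode 0, binary, and the requested timeout */
	outb(0xb0, 0x43);
	outb(latch & 0xff, 0x42);
	outb(latch >> 8, 0x42);
	t1 = prev = now = get_cycles();
	while ((inb(0x61) & 0x20) == 0) {	/* counter 2 output still low */
		now = get_cycles();
		d = now - prev;
		if (d < tscmin)
			tscmin = d;
		if (d > tscmax)
			tscmax = d;
		prev = now;
	}
	t2 = now;
	if (tscmax > 10 * tscmin)		/* reads looked wonky: give up */
		return ULONG_MAX;
	return (t2 - t1) / ms;			/* TSC kHz, since ms = 10 */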
The code around that core reads the extra timer (capturing the TSC
before and after), does pit_calibrate_tsc() to waste some time,
and reads the extra timer again.
There are two error cases in reading the extra timer:
- It might not exist.
- Trying 5 times may fail to read it in less than 50K clock cycles,
indicating that there's some SMI shit going on.
If reading the extra timer works *and* reading the PIT timer worked,
*and* the two agree within 10%, then return the extra timer result
immediately. This is the case that works for Adrian.
If things are questionable, the whole process is repeated up to 3x.
Throughout those, the minimum computed clock rate is kept for
each of the calibrations. Delays will cause unexpectedly high
TSC deltas, which will read as overestimates of the clock rate,
so this makes sense. (But we can do better.)
On the third repetition, if pit_calibrate_tsc() has failed twice, the delay
is increased to 50 ms. pit_calibrate_tsc() will still fail, but the
HPET/PM_TIMER will have an extra-long time interval.
If the loop ends (i.e. things weren't perfect after three retries), then
we'll have two results:
- tsc_pit_min, the PIT calibration result (or ULONG_MAX if it failed)
- tsc_ref_min, the extra timer calibration result (or ULONG_MAX)
The following cases apply:
- If both results indicate failure, disable the TSC and return failure.
- If the pit failed, return tsc_ref_min (with a warning)
- If the extra timer failed, return the PIT result
- If both succeeded, they must disagree by more than 10%.
Warn and use the PIT figure.
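In code, those cases come out to roughly this (a sketch using the names
above, not the exact kernel source):

	if (tsc_pit_min == ULONG_MAX && tsc_ref_min == ULONG_MAX)
		return 0;		/* both failed: mark the TSC unusable */
	if (tsc_pit_min == ULONG_MAX)
		return tsc_ref_min;	/* warn; trust the HPET/PM timer alone */
	if (tsc_ref_min == ULONG_MAX)
		return tsc_pit_min;	/* no usable extra timer: use the PIT */
	/* Both worked but (having got this far) disagree by >10%: warn, use PIT. */
	return tsc_pit_min;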
* The problem with the above
The regular calibration is full of ad-hoc constants and limits,
and doesn't know how accurate a result it's producing. The
quick_pit_calibrate at least has a well-defined goal (500 ppm) and
a systematic way to achieve it.
The big problem is that polling a port looking for a transition
fundamentally leaves +/- one port read (i.e. +/-1.3 us) of uncertainty
as to when the transition actually happened.
* How to do better
If, instead, we did
tsc1 = get_cycles();
lsb = inb(0x42);
tsc2 = get_cycles();
msb = inb(0x42);
delta = tsc2 - tsc1;
min_delta = min(min_delta, delta);
uncertainty = delta - min_delta;
and actually use the captured msb/lsb pair, we could do a lot better.
First of all, it's only one read, not two. The 8254 timer latches the
msbyte when the lsbyte is read and returns the latched value on the
next read, so there's no need to include the second inb() in the
uncertainty window.
That gets you down to about +/-0.65 us. But we can do better!
The idea comes from NTP. The point is that, as we're only comparing
timer *rates*, we don't care about when in the tsc1..tsc2 window the
PIT result was latched as long as it's consistent.
The way to do that is to keep track, over all of the hundreds of
iterations, the *minimum* TSC delta across the inb(). This can
be assumed to be the "speed of light", with a perfectly consistent
time offset.
If a read takes longer than that, there is a delay. And we don't
know whether the delay happened before the PIT latched its result
or after. So we have to add the delay to the uncertainty estimate
of the read.
But the bulk of the 1.3 us of delay can be simply ignored. It doesn't
matter if it's a slow emulated access via a VM or SMI trap; as long as
the time taken is *consistent*, we can do an accurate calibration.
This reduces the read uncertainty to +/-0.5 of a PIT tick, or +/-0.42 us.
Plus the extra uncertainty due to the read time, but we can repeat the
read until we get a fast one with low uncertainty.
With a total of 0.84 us of read uncertainty (1/12 of quick_pit_calibrate
currently), we can get within 500 ppm within 1.75 us. Or do better
within 5 or 10.
And the same technique can be applied to reading the HPET or PM timer.
The PM timer's PIIX hardware bugs make things trickier, but with some
careful coding, it's possible to capture the TSC around just the critical
read, and not include the surrounding validation reads. So we can get
the full benefit of the 280 ns clock period.
** How I'd like to replace the existing code
The loop I'd write would start the PIC (and the RTC, if we want to)
and then go round-robin reading all the time sources and associated
TSC values.
This gives us lots of reads to measure the minimum access time.
After a minimum number of iterations (so the minimum access time is
stable), when the ratio of elapsed / (start uncertainty+end uncertainty)
is satisfactory (e.g. 2048 for current 500 ppm spec), then various
measures are sanity-checked against each other and the best one returned.
Rather than program the PIT for 10 ms, I'd program it for 55 ms (0xffff),
and run until I got something good enough, or it ran out.
inb(0x61) & 0x20 is set when the PIT runs out and never implicitly cleared,
so it's an excellent way to detect even extreme delays. As long as
that bit is clear, the PIT value is a reliable absolute time in the
calibration process.
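Pulling that together, the whole loop might look something like this sketch
(all names are mine, not kernel code; it reuses the plain lsb/msb read from
the fragment above and omits latching the counter):

	static unsigned long calibrate_sketch(void)
	{
		u64 tsc0, tsc1, delta, unc, min_delta = ~0ULL;
		u64 tsc_start = 0, start_unc = 0;
		u16 pit, pit_start = 0, ticks = 0;
		int i;

		for (i = 0; ; i++) {
			/* Bit 5 of port 0x61: channel 2 has run out (~55 ms gone). */
			if (inb(0x61) & 0x20)
				return 0;

			tsc0 = get_cycles();
			pit  = inb(0x42);		/* PIT channel 2 count, lsbyte */
			tsc1 = get_cycles();
			pit |= (u16)inb(0x42) << 8;	/* msbyte */

			delta     = tsc1 - tsc0;
			min_delta = min(min_delta, delta);
			unc       = delta - min_delta;	/* excess over the fastest read */

			if (i == 0) {
				tsc_start = tsc0;
				pit_start = pit;
				start_unc = unc;	/* a real version would refine this */
				continue;
			}
			ticks = pit_start - pit;	/* the PIT counts down */
			if (i < 64 || !ticks)		/* let min_delta settle first */
				continue;
			/*
			 * Error bound: excess read time at both ends, plus roughly
			 * one PIT tick of quantization (+/-0.5 tick per end).
			 */
			unc += start_unc + div64_u64(tsc0 - tsc_start, ticks);
			if (tsc0 - tsc_start > 2048 * unc)
				break;			/* better than ~500 ppm */
		}
		/* kHz = TSC cycles * PIT_TICK_RATE / (elapsed PIT ticks * 1000) */
		return div64_u64((tsc0 - tsc_start) * PIT_TICK_RATE,
				 (u64)ticks * 1000);
	}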
* Request for comments
So, as I go about turning these ideas into code... any comments?
I realize this is a far bigger overhaul than Adrian proposed, but do
other people agree that some decruftification is warranted?
I'd also like to cut the time required to do the calibration. Being able
to compute the quality of the approximation so far is a huge help.
Any suggestions for a reasonable time/quality tradeoff? 500 ppm
ASAP? Best I can do in 10 ms? Wait until the PIT is 500 ppm and
then use the better result from a higher-resolution timer if
available?
It's some pretty low-level mucking about, but is a nice simple
single-threaded single-processor piece of code.
* George Spelvin <[email protected]> wrote:
> Adrian Hunter wrote:
>
> > A bigger issue for my case is that "slow" calibration is not that slow, taking
> > only 10ms anyway which is much better than the 50ms max for so-called "quick"
> > calibration.
>
> I read the code, and after figuring out which comments are wrong, this is
> absolutely right. The "quick" code takes 20 ms if it works, or 50 ms if it has
> problems. The fallback code can finish in 10 ms.
>
> When bit rot has set in to this degree, something needs to be done.
>
> Given that people would like to boot VMs in less than 1 second, 20 ms is 2% of
> that budget, and something that is worth trying to improve.
As a side note: so VMs often want to skip the whole calibration business, because
they are running on a well-calibrated host.
1,000 msecs is also an eternity: consider for example the KVM + tools/kvm based
"Clear Containers" feature from Arjan:
https://lwn.net/Articles/644675/
... which boots up a generic Linux kernel to generic Linux user-space in 32
milliseconds, i.e. it boots in 0.03 seconds (!).
> ** quick_pit_calibrate()
> ** native_pit_calibrate()
Nice analysis.
> * The problem with the above
>
> The regular calibration is full of ad-hoc constants and limits,
> and doesn't know how accurate a result it's producing. The
> quick_pit_calibrate at least has a well-defined goal (500 ppm) and
> a systematic way to achieve it.
>
> The big problem is that polling a port looking for a transition
> fundamentally leaves +/- one port read (i.e. +/-1.3 us) of uncertainty
> as to when the transition actually happened.
>
>
> * How to do better
>
> If, instead, we did
> tsc1 = get_cycles();
> lsb = inb(0x42);
> tsc2 = get_cycles();
> msb = inb(0x42);
> delta = tsc2 - tsc1;
> min_delta = min(min_delta, delta);
> uncertainty = delta - min_delta;
>
> and actually use the captured msb/lsb pair, we could do a lot better.
>
> First of all, it's only one read, not two. The 8254 timer latches the
> msbyte when the lsbyte is read and returns the latched value on the
> next read, so there's no need to include the second inb() in the
> uncertainty window.
>
> That gets you down to about +/-0.65 us. But we can do better!
>
> The idea comes from NTP. The point is that, as we're only comparing
> timer *rates*, we don't care about when in the tsc1..tsc2 window the
> PIT result was latched as long as it's consistent.
>
> The way to do that is to keep track, over all of the hundreds of
> iterations, the *minimum* TSC delta across the inb(). This can
> be assumed to be the "speed of light", with a perfectly consistent
> time offset.
>
> If a read takes longer than that, there is a delay. And we don't
> know whether the delay happened before the PIT latched its result
> or after. So we have to add the delay to the uncertainty estimate
> of the read.
>
> But the bulk of the 1.3 us of delay can be simply ignored. It doesn't
> matter if it's a slow emulated access via a VM or SMI trap; as long as
> the time taken is *consistent*, we can do an accurate calibration.
>
> This reduces the read uncertainty to +/-0.5 of a PIT tick, or +/-0.42 us.
> Plus the extra uncertainty due to the read time, but we can repeat the
> read until we get a fast one with low uncertainty.
>
> With a total of 0.84 us of read uncertainty (1/12 of quick_pit_calibrate
> currently), we can get within 500 ppm within 1.75 us. Or do better
> within 5 or 10.
(msec you mean I suspect?)
Fully agreed otherwise: if you look at the sawtooth pattern visible in the TSC
deltas there's a very strong, constant underlying frequency that can be recovered
statistically with just a few dozen iterations.
> And the same technique can be applied to reading the HPET or PM timer.
>
> The PM timer's PIIX hardware bugs make things trickier, but with some
> careful coding, it's possible to capture the TSC around just the critical
> read, and not include the surrounding validation reads. So we can get
> the full benefit of the 280 ns clock period.
Cool.
> ** How I'd like to replace the existing code
>
> The loop I'd write would start the PIC (and the RTC, if we want to)
> and then go round-robin reading all the time sources and associated
> TSC values.
I'd just start with the PIT to have as few balls in flight as possible.
> This gives us lots of reads to measure the minimum access time.
>
> After a minimum number of iterations (so the minimum access time is
> stable), when the ratio of elapsed / (start uncertainty+end uncertainty)
> is satisfactory (e.g. 2048 for current 500 ppm spec), then various
> measures are sanity-checked against each other and the best one returned.
>
>
> Rather than program the PIT for 10 ms, I'd program it for 55 ms (0xffff),
> and run until I got something good enough, or it ran out.
>
> inb(0x61) & 0x20 is set when the PIT runs out and never implicitly cleared,
> so it's an excellent way to detect even extreme delays. As long as
> that bit is clear, the PIT value is a reliable absolute time in the
> calibration process.
Agreed.
> * Request for comments
>
> So, as I go about turning these ideas into code... any comments?
Could you please structure it the following way:
- first a patch that fixes bogus comments about the current code. It has
bitrotten and if we change it significantly we better have a well
documented starting point that is easier to compare against.
- then a patch that introduces your more accurate calibration method and
uses it as the first method to calibrate. If it fails (and it should have a
notion of failing) then it should fall back to the other two methods.
- possibly add a boot option to skip your new calibration method - i.e. to make
the kernel behave in the old way. This would be useful for tracking down any
regressions in this.
- then maybe add a patch for the RTC method, but as a .config driven opt-in
initially.
Please also add calibration tracing code (.config driven and default-off), so that
the statistical properties of calibration can be debugged and validated without
patching the kernel.
> I realize this is a far bigger overhaul than Adrian proposed, but do other
> people agree that some decruftification is warranted?
Absolutely!
> I'd also like to cut the time required to do the calibration. Being able to
> compute the quality of the approximation so far is a huge help.
Agreed. Btw., I'd suggest to lower the ppm target in your new code, to make sure
we use high quality results and fall back to the current (bitrotten but working)
code.
> Any suggestions for a reasonable time/quality tradeoff? 500 ppm ASAP? Best I
> can do in 10 ms? Wait until the PIT is 500 ppm and then use the better result
> from a higher-resolution timer if available?
So I'd suggest a minimum polling interval (at least 1 msecs?) plus a ppm target.
Would 100ppm be too aggressive?
> It's some pretty low-level mucking about, but is a nice simple single-threaded
> single-processor piece of code.
:-)
Thanks,
Ingo
Adrian Hunter wrote:
> A bigger issue for my case is that "slow" calibration is not that slow,
> taking only 10ms anyway which is much better than the 50ms max for so-called
> "quick" calibration.
I read the code, and after figuring out that the comments are wrong,
this is absolutely right. The "quick" code takes 20 ms if it works,
or 50 ms if it has problems. The fallback code can finish in 10 ms.
When bit rot has set in to this degree (the slow code's comments still
say 50 ms!), I think something needs to be done.
Given that people would like to boot VMs in less than 1 second, 20 ms is
2% of that budget, and something that is worth trying to improve.
* The current situation
While it would be lovely if there was a reliable hardware register
with the TSC frequency, there isn't.
And as long as there's a battle between Intel's sales people (who love
locking processor frequencies and charging more for faster ones) and
motherboard makers and overclockers (who will go to great lengths to
get around such limits), I doubt there will be a trustworthy way for the
CPU to determine its own clock rate without using off-chip peripherals.
** quick_pit_calibrate()
The first thing to explain is why quick_pit_calibrate() takes 20 ms
in the best case.
Quick_pit_calibrate() is polling the PIT and looking for a carry
from the lsbyte to the msbyte. When it sees the msbyte change, it
assigns an uncertainty window, which starts before the last
read of the old value, and ends after the first read of the new
value. (This is returned in *deltap from pit_expect_msb().)
This includes four inb() operations and two rdtsc. Given that inb()
takes a bit over a microsecond on normal, working hardware, that's a
total of 5 us of uncertainty. (5.4 us on the machine on my desk.)
Add the uncertainty at both ends of the time interval, and you have 10 us.
Now, quick_pit_calibrate() loops until the uncertainty is less than 1/2048
of the time interval. Meaning that it will stop after 20 ms. (22 ms
for the machine on my desk.)
** native_pit_calibrate()
The "slow" calibration, on the other hand, tries to do it in 10 ms
(CAL_MS = 10; it falls back to CAL2_MS = 50 if 10 doesn't seem to
be enough).
The "slow" code uses the PIT and one extra timer if available: the
HPET if possible, the ACPI PM timer (and all its fun hardware bugs;
see https://lkml.org/lkml/2008/8/10/77 for details) if not.
It tries a number of things, all of which can fail, and
repeats until it gets something it feels it can go with.
The core is pit_calibrate_tsc(), which programs the PIT for a 10 ms
timeout, and alternates get_cycles() with watching inb(0x61)
for the timer output bit to go high.
During this process, it measures the duration (in TSC cycles) of
each inb(), and if things look wonky (the maximum is more than
10x the minimum), fails (returns ULONG_MAX).
Otherwise, it computes the TSC rate as kHz = (tsc2 - tsc1) / 10 ms.
The code around that core reads the extra timer (capturing the TSC
before and after), does pit_calibrate_tsc() to waste some time,
and reads the extra timer again.
There are two error cases in reading the extra timer:
- It might not exist.
- Trying 5 times may fail to read it in less than 50K clock cycles,
indicating that there's some SMI shit going on.
If reading the extra timer works *and* reading the PIT timer worked,
*and* the two agree within 10%, then return the extra timer result
immediately. This is the case that works for Adrian.
If things are questionable, the whole process is repeated up to 3x.
Throughout those, the minimum computed clock rate is kept for
each of the calibrations. Delays will cause unexpectedly high
TSC deltas, which will read as overestimates of the clock rate,
so this makes sense. (But we can do better.)
On the third repetition, if pit_calibrate_tsc() has failed twice, the delay
is increased to 50 ms. pit_calibrate_tsc() will still fail, but the
HPET/PM_TIMER will have an extra-long time interval.
If the loop ends (i.e. things weren't perfect after three retries), then
we'll have two results:
- tsc_pit_min, the PIT calibration result (or ULONG_MAX if it failed)
- tsc_ref_min, the extra timer calibration result (or ULONG_MAX)
The following cases apply:
- If both results indicate failure, disable the TSC and return failure.
- If the pit failed, return tsc_ref_min (with a warning)
- If the extra timer failed, return the PIT result
- If both succeeded, they must disagree by more than 10%.
Warn and use the PIT figure.
* How to do better
Now, polling a port looking for a transition fundamentally leaves
+/- one port read (i.e. +/-1.3 us) of uncertainty as to when the
transition actually happened.
If, instead, we did
tsc1 = get_cycles();
lsb = inb(0x42);
tsc2 = get_cycles();
msb = inb(0x42);
delta = tsc2 - tsc1;
min_delta = min(min_delta, delta);
uncertainty = delta - min_delta;
and actually use the captured msb/lsb pair, we can do a lot better.
First of all, it's only one read, not two. The 8254 timer latches the
msbyte when the lsbyte is read and returns the latched value on the
next read, so there's no need to include the second inb() in the
uncertainty window.
That gets you down to about +/-0.65 us. But we can do better!
The idea comes from NTP. The point is that, as we're only comparing
timer rates, we don't care about when in the tsc1..tsc2 window the
PIT result was latched as long as it's consistent.
The way to do that is to keep track, over all of the hundreds of
iterations, the *minimum* TSC delta across the inb(). This can
be assumed to be the "speed of light", with a perfectly consistent
time offset.
If a read takes longer than that, there is a delay. And we don't
know whether the delay happened before the PIT latched its result
or after. So we have to add the delay to the uncertainty estimate
of the read.
But the bulk of the delay can be simply ignored. It doesn't matter if
it's a slow emulated access via a VM or SMI trap; as long as the time
taken is *consistent*, we can use it for accurate calibration.
This reduces the read uncertainty to +/- 0.5 of a PIT tick, or
+/- 0.42 us. Plus the extra uncertainty due to the read time,
but we can repeat the read until we get a fast one with low
uncertainty.
With a total of 0.838 us of read uncertainty, we can get within 500 ppm
within 1.8 us. Or do better within 5 or 10.
And the same technique can be applied to reading the HPET or PM timer.
The PM timer's PIIX hardware bugs make things trickier, but with some
careful coding, it's possible to capture the TSC around just the critical
read, and not include the surrounding validation reads. So we can get
the full benefit of the 280 ns clock period.
** How I'd like to replace the existing code
The loop I'd write would start the PIC (and the RTC, if we want to)
and then go round-robin reading all the time sources and associated
TSC values.
This gives us lots of reads to measure the minimum access time.
After a minimum number of iterations (so the minimum access time is
stable), when the ratio of elapsed / (start uncertainty+end uncertainty)
is satisfactory (e.g. 2048 for current 500 ppm spec), then various
measures are sanity-checked against each other and the best one returned.
* Request for comments
So, as I go about turning these ideas into code... any comments?
I realize this is a much bigger overhaul than Adrian proposed, but do
other people agree that some decruftification is warranted?
I'd also like to cut the time required to do the calibration.
Being able to compute the quality of the approximation
so far is a huge help.
It's some pretty low-level mucking about, but is a nice simple
single-threaded single-processor piece of code.
On 10/06/15 10:08, George Spelvin wrote:
> The 8254 timer latches the msbyte when the lsbyte is read
> and returns the latched value on the next read
Are you sure about that? The docs I've read don't seem to say that.
Ingo Molnar wrote:
>* George Spelvin <[email protected]> wrote:
> As a side note: so VMs often want to skip the whole calibration business,
> because they are running on a well-calibrated host.
> 1,000 msecs is also an eternity: consider for example the KVM + tools/kvm
> based "Clear Containers" feature from Arjan:
> ... which boots up a generic Linux kernel to generic Linux user-space in 32
> milliseconds, i.e. it boots in 0.03 seconds (!).
Agreed, if you're paravirtualized, you can just pass this stuff in from
the host. But there's plenty of hardware virtualization that boots
a generic Linux.
I pulled generous numbers out of my ass because I didn't want to over-reach
in the argument that it's taking too long. The shorter the boot
time, the stronger the point.
>> With a total of 0.84 us of read uncertainty (1/12 of quick_pit_calibrate
>> currently), we can get within 500 ppm within 1.75 us. Or do better
>> within 5 or 10.
> (msec you mean I suspect?)
Yes, typo; that should be 1.75 ms.
>> The loop I'd write would start the PIC (and the RTC, if we want to)
>> and then go round-robin reading all the time sources and associated
>> TSC values.
> I'd just start with the PIT to have as few balls in flight as possible.
Once I get the loop structured properly, additional timers really
aren't a problem. The biggest PITA is the PM_TMR and all its
brokenness (do I have a PIIX machine in the closet somewhere?),
but the quick_pit_calibrate patch I already posted to LKML shows
how to handle that. I set up a small circular buffer of captured
values, and when I'm (say) three captures past the "interesting"
one, go back and see if the reads look good.
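For illustration, that bookkeeping could look something like this (a sketch
only: the ring size, names, and sanity bound are mine, and pm_read() stands
in for however the PM timer is actually read):

	#define NCAP 8
	static struct { u64 tsc; u32 pm; } cap[NCAP];
	static unsigned int cap_head;

	static void pm_capture(void)
	{
		cap[cap_head % NCAP].tsc = get_cycles();
		cap[cap_head % NCAP].pm  = pm_read();
		cap_head++;
	}

	/* A few captures later, check that entry idx looked sane. */
	static bool pm_capture_ok(unsigned int idx)
	{
		/* The ACPI PM timer is 24 bits wide and counts up. */
		u32 d1 = (cap[(idx + 1) % NCAP].pm - cap[idx % NCAP].pm) & 0xffffff;
		u32 d2 = (cap[(idx + 2) % NCAP].pm - cap[(idx + 1) % NCAP].pm) & 0xffffff;

		/* Reject wild jumps such as the PIIX read bug produces. */
		return d1 < 0x100000 && d2 < 0x100000;
	}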
> Could you please structure it the following way:
>
> - first a patch that fixes bogus comments about the current code. It has
> bitrotten and if we change it significantly we better have a well
> documented starting point that is easier to compare against.
>
> - then a patch that introduces your more accurate calibration method and
> uses it as the first method to calibrate. If it fails (and it should have a
> notion of failing) then it should fall back to the other two methods.
>
> - possibly add a boot option to skip your new calibration method -
> i.e. to make the kernel behave in the old way. This would be useful
> for tracking down any regressions in this.
>
> - then maybe add a patch for the RTC method, but as a .config driven opt-in
> initially.
Sounds good, but when do we get to the decruftification? I'd prefer to
prepare the final patch (if nothing else, so Linus will be reassured by
the diffstat), although I can see holding it back for a few releases.
> Please also add calibration tracing code (.config driven and default-off),
> so that the statistical properties of calibration can be debugged and
> validated without patching the kernel.
Definitely desired, but I have to be careful here. Obviously I can't
print during the timing loop, so it will take either a lot of memory,
or add significant computation to the loop.
I also don't want to flood the kernel log before syslog is
started.
Do you have any specific suggestions? Should I just capture everything
into a permanently-allocated buffer and export it via debugfs?
>> I realize this is a far bigger overhaul than Adrian proposed, but do other
>> people agree that some decruftification is warranted?
> Absolutely!
Thanks for the encouragement!
>> Any suggestions for a reasonable time/quality tradeoff? 500 ppm ASAP?
>> Best I can do in 10 ms? Wait until the PIT is 500 ppm and then use
>> the better result from a higher-resolution timer if available?
> So I'd suggest a minimum polling interval (at least 1 msecs?) plus a
> ppm target. Would 100ppm be too aggressive?
How about 122 ppm (1/8192) because I'm lazy? :-)
What I imagine is this:
- The code will loop until it reaches 122 ppm or 55 ms, whichever comes
first. (There's also a minimum, before which 122 ppm isn't checked.)
- Initially, failure to reach 122 ppm will print a message and fall back.
- In the final cleanup patch, I'll accept anything up to 500 ppm
and only fail (and disable TSC) if I can't reach that.
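In code form, the stopping test would be something like this sketch (the
names are mine):

	/* 1/8192 is ~122 ppm. */
	if (elapsed_tsc > 8192 * total_uncertainty)
		return tsc_khz;		/* target reached */
	/* 0xffff PIT ticks is ~55 ms; the output bit says we ran out. */
	if (inb(0x61) & 0x20)
		return 0;		/* print a message and fall back */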
Adrian Hunter wrote:
> On 10/06/15 10:08, George Spelvin wrote:
>> The 8254 timer latches the msbyte when the lsbyte is read
>> and returns the latched value on the next read
> Are you sure about that? The docs I've read don't seem to say that.
You're right; I must have been thinking about the write side which does
this, or some other hardware. Anyway, the counter latch command exists
and can be used for the purpose.
So the idea still works.
* Adrian Hunter <[email protected]> wrote:
> On 10/06/15 10:08, George Spelvin wrote:
>
> > The 8254 timer latches the msbyte when the lsbyte is read and returns the
> > latched value on the next read
>
> Are you sure about that? The docs I've read don't seem to say that.
Btw., even if docs claim that, the code should gracefully handle the case where
that's not the case or where there's an occasional quirk in the numbers.
Because real OSs mostly only care about the interrupts generated by the PIT.
That we can read the count is just a bonus that might or might not work
reliably, depending on the hardware.
Especially any 'measure the minimum time' approach measuring more than a single
PIT tick would be sensitive to false positives.
Thanks,
Ingo
* George Spelvin <[email protected]> wrote:
> > Could you please structure it the following way:
> >
> > - first a patch that fixes bogus comments about the current code. It has
> > bitrotten and if we change it significantly we better have a well
> > documented starting point that is easier to compare against.
Btw., we should make the patch fixing Adrian's system patch 0, as it looks simple
and obvious enough, agreed?
> > - then a patch that introduces your more accurate calibration method and
> > uses it as the first method to calibrate. If it fails (and it should have a
> > notion of failing) then it should fall back to the other two methods.
> >
> > - possibly add a boot option to skip your new calibration method -
> > i.e. to make the kernel behave in the old way. This would be useful
> > for tracking down any regressions in this.
> >
> > - then maybe add a patch for the RTC method, but as a .config driven opt-in
> > initially.
>
> Sounds good, but when do we get to the decruftification? I'd prefer to prepare
> the final patch (if nothing else, so Linus will be reassured by the diffstat),
> although I can see holding it back for a few releases.
what do you call 'decruftification'? So I'd suggest besides obvious bug (and
comment) fixes we should not change the current calibration code but add your new
logic as the first step, and preserve those other methods, because we know that it
works reasonably well on a wide range of hardware.
Because it all has to be kept in perspective: the benefits of any changes here are
a small boot speedup plus more stable calibration at best (and a warm fuzzy
feeling from having nicely structured, well working code), while the drawbacks are
the potential for totally miscalibrated systems that were working fine before.
Once your bits work fine will there be any need for any of the two other PIT based
calibration methods? We can simply remove them at a suitable point in time and
have a single (and by definition non-crufty) piece of PIT calibration code.
(and RTC if that can be managed.)
> > Please also add calibration tracing code (.config driven and default-off), so
> > that the statistical properties of calibration can be debugged and validated
> > without patching the kernel.
>
> Definitely desired, but I have to be careful here. Obviously I can't print
> during the timing loop, so it will take either a lot of memory, or add
> significant computation to the loop.
So in theory you can use a tracepoint and enable boot tracing. Not sure it's
initialized that early in the bootup, has to be explored ...
> I also don't want to flood the kernel log before syslog is started.
By default it should not print any trace.
So I don't think the details really matter: this is a boot and .config option for
debugging, not for production kernels. You can do a big memory array and printk
the result or use the generic tracing facilities if they are available. People can
increase the kmsg buffer if it does not fit.
> >> Any suggestions for a reasonable time/quality tradeoff? 500 ppm ASAP?
> >> Best I can do in 10 ms? Wait until the PIT is 500 ppm and then use
> >> the better result from a higher-resolution timer if available?
>
> > So I'd suggest a minimum polling interval (at least 1 msecs?) plus a
> > ppm target. Would 100ppm be too aggressive?
>
> How about 122 ppm (1/8192) because I'm lazy? :-)
;-)
> What I imagine is this:
>
> - The code will loop until it reaches 122 ppm or 55 ms, whichever comes
> first. (There's also a minimum, before which 122 ppm isn't checked.)
> - Initially, failure to reach 122 ppm will print a message and fall back.
> - In the final cleanup patch, I'll accept anything up to 500 ppm
> and only fail (and disable TSC) if I can't reach that.
Sounds good to me in principle.
Thanks,
Ingo
Ingo Molnar <[email protected]> wrote:
>* George Spelvin <[email protected]> wrote:
> Btw., we should make the patch fixing Adrian's system patch 0, as it
> looks simple and obvious enough, agreed?
Agreed. The one that just fails the quick calibration if it has
no chance of succeeding.
>> Sounds good, but when do we get to the decruftification? I'd prefer to
>> prepare the final patch (if nothing else, so Linus will be reassured
>> by the diffstat), although I can see holding it back for a few releases.
> what do you call 'decruftification'? So I'd suggest besides obvious bug
> (and comment) fixes we should not change the current calibration code but
> add your new logic as the first step, and preserve those other methods,
> because we know that it works reasonably well on a wide range of hardware.
I mean getting rid of the current code, which works but (as I explained
in detail) is seriously sub-optimal.
I absolutely agree with being cautious, but I also don't want to produce a
software volcano, adding another layer of lava on top of what's already
there because I'm scared to disturb the mess.
I've seen some rants before castigating people for working around problems
in a driver, rather than fixing the offending core kernel code.
My hope in this discussion is to come to an agreement that an overhaul
is justified. (Subject to revision based on experience, of course;
It may turn out my attempted fix is a complete shambles.)
> Once your bits work fine will there be any need for any of the two other
> PIT based calibration methods? We can simply remove them at a suitable
> point in time and have a single (and by definition non-crufty) piece of
> PIT calibration code.
If my plan survives contact with reality, it should work better 100%
of the time and obsolete the old code.
You said it should fail back to the old code, but there's not a lot of
difference between failures I can detect and failures I can work around.
I know it's a fatal error to underestimate the perversity of PC hardware
(virtualized hardware even more so) but I'm trying to mitigate that by
really deeply understanding what the current code does and all the error
conditions it can handle.
> (and RTC if that can be managed.)
Adding a new hardware dependency is the riskiest part, but it would be
nice to have a sanity check for Atom. (Which has an RTC, but no PIT.)
>> Definitely desired, but I have to be careful here. Obviously I can't
>> print during the timing loop, so it will take either a lot of memory,
>> or add significant computation to the loop.
> So in theory you can use a tracepoint and enable boot tracing. Not sure it's
> initialized that early in the bootup, has to be explored ...
I definitely appreciate whatever you can suggest here, but I'm not sure
that's quite the solution. Tracepoints are designed to be extremely
lightweight when off, but I need lightweight when *on* so I don't distort
the timing and create heisenbugs.
And boot tracing is all about timing, which doesn't start working until
TSC calibration completes (I've been snipping the timestamps off the
debug logs I print because they're all "[ 0.000000]", e.g.:
} [ 0.000000] RTC sample 106: 3875
} [ 0.000000] RTC sample 107: 3877
} [ 0.000000] RTC sample 108: 3875
} [ 0.000000] tsc: Fast TSC calibration using PIT: 254(18360)..150(18360)
} [ 0.000000] Using user defined TSC freq: 3399.965 MHz
} [ 0.000000] tsc: Detected 3399.965 MHz processor
} [ 0.000020] Calibrating delay loop (skipped), value calculated using timer frequency.. 6802.26 BogoMIPS (lpj=11333216)
I just had the flash of insight to collect all the timestamps always, and
*either* preserve them for debugging *or* use them as boot-time entropy
seed material and deallocate. It's not like there's any shortage of RAM.
But even better would be to find a useful summary, rather than just
up-ending the bit bucket over the victim's head.
> If my plan survives contact with reality, it should work better 100%
> of the time and obsolete the old code.
>
> You said it should fail back to the old code, but there's not a lot of
> difference between failures I can detect and failures I can work around.
>
> I know it's a fatal error to underestimate the perversity of PC hardware
> (virtualized hardware even more so) but I'm trying to mitigate that by
> really deeply understanding what the current code does and all the error
> conditions it can handle.
>
>> (and RTC if that can be managed.)
>
btw one thing to think about; if you calibrate VERY quickly, you need
to consider spread spectrum clocking.
various systems have different clocks on different SSC domains, and
while on a bit larger timewindow you can completely ignore SSC,
once you go into very short time windows you can't.
Now... you can do a quick calibration and then a longer calibration in
the background
(we do that already after 1 second or so.. but that's 900 msec after
userspace finished booting so probably too late if the first
calibration is too coarse)
also note that normally people kind of expect around 100ppm to not
have their clocks deviate too much during the day.
(but this can be the result of the longer term calibration)
> Especially any 'measure the minimum time' approach measuring more than
> a single PIT tick would be sensitive to false positives.
Just to be clear, I'm not measuring the minimum of any timer ticks;
it's the timer *access* that's being measured. It's literally the
duration of a single inb() or outb(). Even the *value* is unimportant.
It's just a way to figure out the skew between the TSC and the peripheral.
Think of the peripheral access like a network ping. I'm asking
a remote computer for its time, and measuring *my* time at the start and
end of the ping.
There's a minimum possible ping time, achieved where queueing delays
are zero. If I could achieve that minimum time reliably, synchronizing
clock rates would be trivial.
To measure offset, I have to make some arbitrary assumption about
the duration of the two legs of the round trip. Typically I assume that
they're equal, so that the remote time corresponds to the point
halfway between my two time measurements.
(For timer rate measurements, it turns out that this assumption
doesn't affect the result at all, so it can be made arbitrarily.)
If a ping takes more than the minimum possible time, then what's
actually most likely is that *one* of the two legs was delayed, and
the correct time is not in the middle of the ping time range, but
hard up against one edge.
I.e.
t1 = get_cycles();
byte = inb(peripheral);
t2 = get_cycles();
interval = t2 - t1;
tmin = min(interval, tmin);
The instant of the inb() is most likely at t1 + tmin/2 or at t2 - tmin/2;
it might be elsewhere in that interval, but it is definitely not (if tmin
has been measured accurately) outside that interval.
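Continuing the fragment above with the same names (the two new ones are
mine), the usable numbers come out as:

	/* Anything beyond the fastest observed round trip is uncertainty. */
	uncertainty = interval - tmin;
	/* The read happened somewhere in this window, most likely at an edge. */
	t_earliest = t1 + tmin / 2;
	t_latest   = t2 - tmin / 2;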
Arjan van de Ven <[email protected]> wrote:
> btw one thing to think about; if you calibrate VERY quickly, you need
> to consider spread spectrum clocking.
Excellent point! But spread spectrum clock modulation rates are typically
about 30 kHz, so 1 ms will average over 30 cycles, which should be enough.
The faster the modulation rate, the more the EMI peak is flattened.
But too fast produces unacceptable cycle-to-cycle timing variations,
also known as clock jitter.
George Spelvin wrote:
> spread spectrum clock modulation rates are typically about 30 kHz
Actually I found a source:
http://download.intel.com/design/Pentium4/guides/24920601.pdf
CK00 Clock Synthesizer/Driver Design Guidelines
Page 45 says
"8. The modulation frequency of SSC is required to be in the range of
30-33 KHz to avoid audio band demodulation and to minimize system
timing skew."
> The faster the modulation rate, the more the EMI peak is flattened.
> But too fast produces unacceptable cycle-to-cycle timing variations,
> also known as clock jitter.
This is actually wrong. It's the modulation *depth* that determines the
peak reduction. (The above Intel spec says 0.5% typ, 0.6% max.)
The only requirement is that the rate is well below the clock rate,
and well above the averaging time of your frequency analyzer used to
measure the noise.
But basically, they want the modulation rate above the audio band, but
as low as possible to make it easy for downstream PLLs to track.
To quantify the error, consider a triangular modulation spectrum with a
total of 0.6% of range. Intel requires <= 0.6% = 6000 ppm of down-modulation
relative to nominal, but I'll consider it to be +/-3000 ppm from an
average.
The total phase error accumulated during a frequency excursion is the
integral, which is the area under the peak.
(The Lexmark/Hershey's kiss modulation waveform, which provides a slightly
flatter peak, has a slightly smaller area due to the curved edges, so
triangular is a safe overestimate.)
At 30 kHz, during one peak (1/60 ms = 16.66 us), the oscillator phase
advances by 16.66 us * 1500 ppm = 25 ns. (At the more typical 0.5%
value, that's 21 ns.)
25 ns is 100 ppm of 0.25 ms, so it should be okay if I go use a measurement
interval of 1 ms or more.
Some BIOSes have offered 1% spread:
http://www.techarp.com/showFreeBOG.aspx?bogno=266
which doubles that figure, but I don't think there's anything more.
On Wed, Jun 10, 2015 at 11:38 AM, George Spelvin <[email protected]> wrote:
> 25 ns is 100 ppm of 0.25 ms, so it should be okay if I go use a measurement
> interval of 1 ms or more.
>
ok so we're good, but I suspect we want to put this in a comment
somewhere to prevent someone a year from now thinking they can do 100
usec safely ;-)
> ok so we're good, but I suspect we want to put this in a comment
> somewhere to prevent someone a year from now thinking they can do 100
> usec safely ;-)
I'll actually enshrine it in the code, as a factor in the uncertainty
computation.
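Presumably as something like this sketch (the constant and names are mine,
derived from the 30 kHz / 0.6% numbers above):

	/*
	 * Worst-case spread-spectrum phase error: ~25 ns per modulation
	 * half-period at 30-33 kHz and 0.6% depth (see the calculation above).
	 */
	#define SSC_PHASE_ERROR_NS	25

	total_unc_ns = read_unc_ns + SSC_PHASE_ERROR_NS;
	if (elapsed_ns > 8192ULL * total_unc_ns)
		return tsc_khz;			/* 122 ppm target met */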
On 03/06/15 19:23, Thomas Gleixner wrote:
> On Wed, 3 Jun 2015, Adrian Hunter wrote:
>
>> On 03/06/15 06:30, Andi Kleen wrote:
>>>> Then the changelog should say that I think. The current text says
>>>> "Fast TSC calibration will always fail", which, to me, suggests that
>>>> either the slow calibration will work or that the changelog message
>>>> should be changed.
>>>
>>> Ok. No, the slow calibration works I believe.
>>
>> Yeah, so the (only?) downside is the 50ms wasted on Fast TSC
>> calibration. What about this?
>
> I'm certainly happy to apply this one.
George Spelvin began investigating improving quick_pit_calibrate() but Ingo
anyway suggested this patch as the first, so can this be applied?
On Mon, 22 Jun 2015, Adrian Hunter wrote:
> On 03/06/15 19:23, Thomas Gleixner wrote:
> > On Wed, 3 Jun 2015, Adrian Hunter wrote:
> >
> >> On 03/06/15 06:30, Andi Kleen wrote:
> >>>> Then the changelog should say that I think. The current text says
> >>>> "Fast TSC calibration will always fail", which, to me, suggests that
> >>>> either the slow calibration will work or that the changelog message
> >>>> should be changed.
> >>>
> >>> Ok. No, the slow calibration works I believe.
> >>
> >> Yeah, so the (only?) downside is the 50ms wasted on Fast TSC
> >> calibration. What about this?
> >
> > I'm certainly happy to apply this one.
>
> George Spelvin began investigating improving quick_pit_calibrate() but Ingo
> anyway suggested this patch as the first, so can this be applied?
Oh, yes. I simply forgot to pick it up. Thanks for the reminder.
tglx
Adrian Hunter <[email protected]> wrote:
> On 03/06/15 19:23, Thomas Gleixner wrote:
>> I'm certainly happy to apply this one.
>
> George Spelvin began investigating improving quick_pit_calibrate() but Ingo
> anyway suggested this patch as the first, so can this be applied?
Acked-by: George Spelvin <[email protected]>
It's currently the first patch in the series I'm developing, but
if someone wants to beat me to it, no objection.
On 22/06/15 16:14, Thomas Gleixner wrote:
> On Mon, 22 Jun 2015, Adrian Hunter wrote:
>> On 03/06/15 19:23, Thomas Gleixner wrote:
>>> On Wed, 3 Jun 2015, Adrian Hunter wrote:
>>>
>>>> On 03/06/15 06:30, Andi Kleen wrote:
>>>>>> Then the changelog should say that I think. The current text says
>>>>>> "Fast TSC calibration will always fail", which, to me, suggests that
>>>>>> either the slow calibration will work or that the changelog message
>>>>>> should be changed.
>>>>>
>>>>> Ok. No, the slow calibration works I believe.
>>>>
>>>> Yeah, so the (only?) downside is the 50ms wasted on Fast TSC
>>>> calibration. What about this?
>>>
>>> I'm certainly happy to apply this one.
>>
>> George Spelvin began investigating improving quick_pit_calibrate() but Ingo
>> anyway suggested this patch as the first, so can this be applied?
>
> Oh, yes. I simply forgot to pick it up. Thanks for the reminder.
Sorry to bother you, but I still don't see the patch anywhere?
On Mon, 6 Jul 2015, Adrian Hunter wrote:
> On 22/06/15 16:14, Thomas Gleixner wrote:
> > On Mon, 22 Jun 2015, Adrian Hunter wrote:
> >> On 03/06/15 19:23, Thomas Gleixner wrote:
> >>> On Wed, 3 Jun 2015, Adrian Hunter wrote:
> >>>
> >>>> On 03/06/15 06:30, Andi Kleen wrote:
> >>>>>> Then the changelog should say that I think. The current text says
> >>>>>> "Fast TSC calibration will always fail", which, to me, suggests that
> >>>>>> either the slow calibration will work or that the changelog message
> >>>>>> should be changed.
> >>>>>
> >>>>> Ok. No, the slow calibration works I believe.
> >>>>
> >>>> Yeah, so the (only?) downside is the 50ms wasted on Fast TSC
> >>>> calibration. What about this?
> >>>
> >>> I'm certainly happy to apply this one.
> >>
> >> George Spelvin began investigating improving quick_pit_calibrate() but Ingo
> >> anyway suggested this patch as the first, so can this be applied?
> >
> > Oh, yes. I simply forgot to pick it up. Thanks for the reminder.
>
> Sorry to bother you, but I still don't see the patch anywhere?
That patch seems to be highly merge resistant. It's queued now.
Thanks,
tglx
Commit-ID: 5aac644a9944bea93b4f05ced1883a902a2535f6
Gitweb: http://git.kernel.org/tip/5aac644a9944bea93b4f05ced1883a902a2535f6
Author: Adrian Hunter <[email protected]>
AuthorDate: Wed, 3 Jun 2015 10:39:46 +0300
Committer: Thomas Gleixner <[email protected]>
CommitDate: Mon, 6 Jul 2015 09:41:00 +0200
x86/tsc: Let high latency PIT fail fast in quick_pit_calibrate()
If it takes longer than 12us to read the PIT counter lsb/msb,
then the error margin will never fall below 500ppm within 50ms,
and Fast TSC calibration will always fail.
This patch detects when that will happen and fails fast. Note
the failure message is not printed in that case because:
1. it will always happen on that class of hardware
2. the absence of the message is more informative than its
presence
Signed-off-by: Adrian Hunter <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Len Brown <[email protected]>
Cc: Andi Kleen <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/tsc.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5054497..7437b41 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -598,10 +598,19 @@ static unsigned long quick_pit_calibrate(void)
if (!pit_expect_msb(0xff-i, &delta, &d2))
break;
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and fail fast if the error will
+ * never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return 0;
+
/*
* Iterate until the error is less than 500 ppm
*/
- delta -= tsc;
if (d1+d2 >= delta >> 11)
continue;