There were bugs reported on 8-socket x86 machines where TSC was wrongly
disabled when the system was under heavy workload.
[ 818.380354] clocksource: timekeeping watchdog on CPU336: hpet wd-wd read-back delay of 1203520ns
[ 818.436160] clocksource: wd-tsc-wd read-back delay of 181880ns, clock-skew test skipped!
[ 819.402962] clocksource: timekeeping watchdog on CPU338: hpet wd-wd read-back delay of 324000ns
[ 819.448036] clocksource: wd-tsc-wd read-back delay of 337240ns, clock-skew test skipped!
[ 819.880863] clocksource: timekeeping watchdog on CPU339: hpet read-back delay of 150280ns, attempt 3, marking unstable
[ 819.936243] tsc: Marking TSC unstable due to clocksource watchdog
[ 820.068173] TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
[ 820.092382] sched_clock: Marking unstable (818769414384, 1195404998)
[ 820.643627] clocksource: Checking clocksource tsc synchronization from CPU 267 to CPUs 0,4,25,70,126,430,557,564.
[ 821.067990] clocksource: Switched to clocksource hpet
This can be reproduced when the system is running the memory intensive
'stream' test, or some stress-ng subcases such as 'ioport'.
The reason is that when the system is under heavy load, the read latency
of the clocksource can be very high; it can be seen even with a
lightweight TSC read, and is much worse for MMIO or IO port read based
external clocksources, causing the watchdog check to be inaccurate.
As the clocksource watchdog is a lifetime check running at a frequency
of twice per second, there is no need to rush it when the system is
under heavy load and the clocksource read latency is very high;
suspend the watchdog timer for 5 minutes instead.
Signed-off-by: Feng Tang <[email protected]>
---
kernel/time/clocksource.c | 45 ++++++++++++++++++++++++++++-----------
1 file changed, 32 insertions(+), 13 deletions(-)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9cf32ccda715..8cd74b89d577 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -384,6 +384,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+
static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
@@ -391,6 +400,7 @@ static void clocksource_watchdog(struct timer_list *unused)
int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
+ unsigned long extra_wait = 0;
u32 md;
spin_lock(&watchdog_lock);
@@ -410,13 +420,30 @@ static void clocksource_watchdog(struct timer_list *unused)
read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
- if (read_ret != WD_READ_SUCCESS) {
- if (read_ret == WD_READ_UNSTABLE)
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
+ if (read_ret == WD_READ_UNSTABLE) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
continue;
}
+ /*
+ * When WD_READ_SKIP is returned, it means the system is likely
+ * under very heavy load, where the latency of reading
+ * watchdog/clocksource is very big, and affect the accuracy of
+ * watchdog check. So give system some space and suspend the
+ * watchdog check for 5 minutes.
+ */
+ if (read_ret == WD_READ_SKIP) {
+ /*
+ * As the watchdog timer will be suspended, and
+ * cs->last could keep unchanged for 5 minutes, reset
+ * the counters.
+ */
+ clocksource_reset_watchdog();
+ extra_wait = HZ * 300;
+ break;
+ }
+
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
atomic_read(&watchdog_reset_pending)) {
@@ -512,7 +539,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL;
+ watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
out:
@@ -537,14 +564,6 @@ static inline void clocksource_stop_watchdog(void)
watchdog_running = 0;
}
-static inline void clocksource_reset_watchdog(void)
-{
- struct clocksource *cs;
-
- list_for_each_entry(cs, &watchdog_list, wd_list)
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
-
static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);
--
2.34.1
On 12/20/22 03:25, Feng Tang wrote:
> There were bug reported on 8 sockets x86 machines that TSC was wrongly
> disabled when system is under heavy workload.
>
> [ 818.380354] clocksource: timekeeping watchdog on CPU336: hpet wd-wd read-back delay of 1203520ns
> [ 818.436160] clocksource: wd-tsc-wd read-back delay of 181880ns, clock-skew test skipped!
> [ 819.402962] clocksource: timekeeping watchdog on CPU338: hpet wd-wd read-back delay of 324000ns
> [ 819.448036] clocksource: wd-tsc-wd read-back delay of 337240ns, clock-skew test skipped!
> [ 819.880863] clocksource: timekeeping watchdog on CPU339: hpet read-back delay of 150280ns, attempt 3, marking unstable
> [ 819.936243] tsc: Marking TSC unstable due to clocksource watchdog
> [ 820.068173] TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
> [ 820.092382] sched_clock: Marking unstable (818769414384, 1195404998)
> [ 820.643627] clocksource: Checking clocksource tsc synchronization from CPU 267 to CPUs 0,4,25,70,126,430,557,564.
> [ 821.067990] clocksource: Switched to clocksource hpet
>
> This can be reproduced when system is running memory intensive 'stream'
> test, or some stress-ng subcases like 'ioport'.
>
> The reason is when system is under heavy load, the read latency of
> clocksource can be very high, it can be seen even with lightweight
> TSC read, and is much worse on MMIO or IO port read based external
> clocksource. Causing the watchdog check to be inaccurate.
>
> As the clocksource watchdog is a lifetime check with frequency of
> twice a second, there is no need to rush doing it when the system
> is under heavy load and the clocksource read latency is very high,
> suspend the watchdog timer for 5 minutes.
>
> Signed-off-by: Feng Tang <[email protected]>
> ---
> kernel/time/clocksource.c | 45 ++++++++++++++++++++++++++++-----------
> 1 file changed, 32 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> index 9cf32ccda715..8cd74b89d577 100644
> --- a/kernel/time/clocksource.c
> +++ b/kernel/time/clocksource.c
> @@ -384,6 +384,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
> }
> EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
>
> +static inline void clocksource_reset_watchdog(void)
> +{
> + struct clocksource *cs;
> +
> + list_for_each_entry(cs, &watchdog_list, wd_list)
> + cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
> +}
> +
> +
> static void clocksource_watchdog(struct timer_list *unused)
> {
> u64 csnow, wdnow, cslast, wdlast, delta;
> @@ -391,6 +400,7 @@ static void clocksource_watchdog(struct timer_list *unused)
> int64_t wd_nsec, cs_nsec;
> struct clocksource *cs;
> enum wd_read_status read_ret;
> + unsigned long extra_wait = 0;
> u32 md;
>
> spin_lock(&watchdog_lock);
> @@ -410,13 +420,30 @@ static void clocksource_watchdog(struct timer_list *unused)
>
> read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
>
> - if (read_ret != WD_READ_SUCCESS) {
> - if (read_ret == WD_READ_UNSTABLE)
> - /* Clock readout unreliable, so give it up. */
> - __clocksource_unstable(cs);
> + if (read_ret == WD_READ_UNSTABLE) {
> + /* Clock readout unreliable, so give it up. */
> + __clocksource_unstable(cs);
> continue;
> }
>
> + /*
> + * When WD_READ_SKIP is returned, it means the system is likely
> + * under very heavy load, where the latency of reading
> + * watchdog/clocksource is very big, and affect the accuracy of
> + * watchdog check. So give system some space and suspend the
> + * watchdog check for 5 minutes.
> + */
> + if (read_ret == WD_READ_SKIP) {
> + /*
> + * As the watchdog timer will be suspended, and
> + * cs->last could keep unchanged for 5 minutes, reset
> + * the counters.
> + */
> + clocksource_reset_watchdog();
> + extra_wait = HZ * 300;
> + break;
> + }
> +
> /* Clocksource initialized ? */
> if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
> atomic_read(&watchdog_reset_pending)) {
> @@ -512,7 +539,7 @@ static void clocksource_watchdog(struct timer_list *unused)
> * pair clocksource_stop_watchdog() clocksource_start_watchdog().
> */
> if (!timer_pending(&watchdog_timer)) {
> - watchdog_timer.expires += WATCHDOG_INTERVAL;
> + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
> add_timer_on(&watchdog_timer, next_cpu);
> }
> out:
> @@ -537,14 +564,6 @@ static inline void clocksource_stop_watchdog(void)
> watchdog_running = 0;
> }
>
> -static inline void clocksource_reset_watchdog(void)
> -{
> - struct clocksource *cs;
> -
> - list_for_each_entry(cs, &watchdog_list, wd_list)
> - cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
> -}
> -
> static void clocksource_resume_watchdog(void)
> {
> atomic_inc(&watchdog_reset_pending);
It looks reasonable to me. Thanks for the patch.
Acked-by: Waiman Long <[email protected]>
On Tue, Dec 20, 2022 at 11:11:08AM -0500, Waiman Long wrote:
> On 12/20/22 03:25, Feng Tang wrote:
> > There were bug reported on 8 sockets x86 machines that TSC was wrongly
> > disabled when system is under heavy workload.
> >
> > [ 818.380354] clocksource: timekeeping watchdog on CPU336: hpet wd-wd read-back delay of 1203520ns
> > [ 818.436160] clocksource: wd-tsc-wd read-back delay of 181880ns, clock-skew test skipped!
> > [ 819.402962] clocksource: timekeeping watchdog on CPU338: hpet wd-wd read-back delay of 324000ns
> > [ 819.448036] clocksource: wd-tsc-wd read-back delay of 337240ns, clock-skew test skipped!
> > [ 819.880863] clocksource: timekeeping watchdog on CPU339: hpet read-back delay of 150280ns, attempt 3, marking unstable
> > [ 819.936243] tsc: Marking TSC unstable due to clocksource watchdog
> > [ 820.068173] TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
> > [ 820.092382] sched_clock: Marking unstable (818769414384, 1195404998)
> > [ 820.643627] clocksource: Checking clocksource tsc synchronization from CPU 267 to CPUs 0,4,25,70,126,430,557,564.
> > [ 821.067990] clocksource: Switched to clocksource hpet
> >
> > This can be reproduced when system is running memory intensive 'stream'
> > test, or some stress-ng subcases like 'ioport'.
> >
> > The reason is when system is under heavy load, the read latency of
> > clocksource can be very high, it can be seen even with lightweight
> > TSC read, and is much worse on MMIO or IO port read based external
> > clocksource. Causing the watchdog check to be inaccurate.
> >
> > As the clocksource watchdog is a lifetime check with frequency of
> > twice a second, there is no need to rush doing it when the system
> > is under heavy load and the clocksource read latency is very high,
> > suspend the watchdog timer for 5 minutes.
> >
> > Signed-off-by: Feng Tang <[email protected]>
> > ---
> > kernel/time/clocksource.c | 45 ++++++++++++++++++++++++++++-----------
> > 1 file changed, 32 insertions(+), 13 deletions(-)
> >
> > diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> > index 9cf32ccda715..8cd74b89d577 100644
> > --- a/kernel/time/clocksource.c
> > +++ b/kernel/time/clocksource.c
> > @@ -384,6 +384,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
> > }
> > EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
> > +static inline void clocksource_reset_watchdog(void)
> > +{
> > + struct clocksource *cs;
> > +
> > + list_for_each_entry(cs, &watchdog_list, wd_list)
> > + cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
> > +}
> > +
> > +
> > static void clocksource_watchdog(struct timer_list *unused)
> > {
> > u64 csnow, wdnow, cslast, wdlast, delta;
> > @@ -391,6 +400,7 @@ static void clocksource_watchdog(struct timer_list *unused)
> > int64_t wd_nsec, cs_nsec;
> > struct clocksource *cs;
> > enum wd_read_status read_ret;
> > + unsigned long extra_wait = 0;
> > u32 md;
> > spin_lock(&watchdog_lock);
> > @@ -410,13 +420,30 @@ static void clocksource_watchdog(struct timer_list *unused)
> > read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
> > - if (read_ret != WD_READ_SUCCESS) {
> > - if (read_ret == WD_READ_UNSTABLE)
> > - /* Clock readout unreliable, so give it up. */
> > - __clocksource_unstable(cs);
> > + if (read_ret == WD_READ_UNSTABLE) {
> > + /* Clock readout unreliable, so give it up. */
> > + __clocksource_unstable(cs);
> > continue;
> > }
> > + /*
> > + * When WD_READ_SKIP is returned, it means the system is likely
> > + * under very heavy load, where the latency of reading
> > + * watchdog/clocksource is very big, and affect the accuracy of
> > + * watchdog check. So give system some space and suspend the
> > + * watchdog check for 5 minutes.
> > + */
> > + if (read_ret == WD_READ_SKIP) {
> > + /*
> > + * As the watchdog timer will be suspended, and
> > + * cs->last could keep unchanged for 5 minutes, reset
> > + * the counters.
> > + */
> > + clocksource_reset_watchdog();
> > + extra_wait = HZ * 300;
> > + break;
> > + }
> > +
> > /* Clocksource initialized ? */
> > if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
> > atomic_read(&watchdog_reset_pending)) {
> > @@ -512,7 +539,7 @@ static void clocksource_watchdog(struct timer_list *unused)
> > * pair clocksource_stop_watchdog() clocksource_start_watchdog().
> > */
> > if (!timer_pending(&watchdog_timer)) {
> > - watchdog_timer.expires += WATCHDOG_INTERVAL;
> > + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
> > add_timer_on(&watchdog_timer, next_cpu);
> > }
> > out:
> > @@ -537,14 +564,6 @@ static inline void clocksource_stop_watchdog(void)
> > watchdog_running = 0;
> > }
> > -static inline void clocksource_reset_watchdog(void)
> > -{
> > - struct clocksource *cs;
> > -
> > - list_for_each_entry(cs, &watchdog_list, wd_list)
> > - cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
> > -}
> > -
> > static void clocksource_resume_watchdog(void)
> > {
> > atomic_inc(&watchdog_reset_pending);
>
> It looks reasonable to me. Thanks for the patch.
>
> Acked-by: Waiman Long <[email protected]>
Queued, thank you both!
If you would like this to go in some other way:
Acked-by: Paul E. McKenney <[email protected]>
And while I am remembering it... Any objections to reversing the role of
TSC and the other timers on systems where TSC is believed to be accurate?
So that if there is clocksource skew, HPET is marked unstable rather than
TSC? This would preserve the diagnostics without hammering performance
when skew is detected. (Switching from TSC to HPET hammers performance
enough that our automation usually notices and reboots the system.)
Thanx, Paul
Using correct email address of John Stultz.
On Tue, Dec 20, 2022 at 10:34:00AM -0800, Paul E. McKenney wrote:
> On Tue, Dec 20, 2022 at 11:11:08AM -0500, Waiman Long wrote:
> > On 12/20/22 03:25, Feng Tang wrote:
> > > There were bug reported on 8 sockets x86 machines that TSC was wrongly
> > > disabled when system is under heavy workload.
> > >
> > > [ 818.380354] clocksource: timekeeping watchdog on CPU336: hpet wd-wd read-back delay of 1203520ns
> > > [ 818.436160] clocksource: wd-tsc-wd read-back delay of 181880ns, clock-skew test skipped!
> > > [ 819.402962] clocksource: timekeeping watchdog on CPU338: hpet wd-wd read-back delay of 324000ns
> > > [ 819.448036] clocksource: wd-tsc-wd read-back delay of 337240ns, clock-skew test skipped!
> > > [ 819.880863] clocksource: timekeeping watchdog on CPU339: hpet read-back delay of 150280ns, attempt 3, marking unstable
> > > [ 819.936243] tsc: Marking TSC unstable due to clocksource watchdog
> > > [ 820.068173] TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
> > > [ 820.092382] sched_clock: Marking unstable (818769414384, 1195404998)
> > > [ 820.643627] clocksource: Checking clocksource tsc synchronization from CPU 267 to CPUs 0,4,25,70,126,430,557,564.
> > > [ 821.067990] clocksource: Switched to clocksource hpet
> > >
> > > This can be reproduced when system is running memory intensive 'stream'
> > > test, or some stress-ng subcases like 'ioport'.
> > >
> > > The reason is when system is under heavy load, the read latency of
> > > clocksource can be very high, it can be seen even with lightweight
> > > TSC read, and is much worse on MMIO or IO port read based external
> > > clocksource. Causing the watchdog check to be inaccurate.
> > >
> > > As the clocksource watchdog is a lifetime check with frequency of
> > > twice a second, there is no need to rush doing it when the system
> > > is under heavy load and the clocksource read latency is very high,
> > > suspend the watchdog timer for 5 minutes.
> > >
> > > Signed-off-by: Feng Tang <[email protected]>
> > > ---
> > > kernel/time/clocksource.c | 45 ++++++++++++++++++++++++++++-----------
> > > 1 file changed, 32 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> > > index 9cf32ccda715..8cd74b89d577 100644
> > > --- a/kernel/time/clocksource.c
> > > +++ b/kernel/time/clocksource.c
> > > @@ -384,6 +384,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
> > > }
> > > EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
> > > +static inline void clocksource_reset_watchdog(void)
> > > +{
> > > + struct clocksource *cs;
> > > +
> > > + list_for_each_entry(cs, &watchdog_list, wd_list)
> > > + cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
> > > +}
> > > +
> > > +
> > > static void clocksource_watchdog(struct timer_list *unused)
> > > {
> > > u64 csnow, wdnow, cslast, wdlast, delta;
> > > @@ -391,6 +400,7 @@ static void clocksource_watchdog(struct timer_list *unused)
> > > int64_t wd_nsec, cs_nsec;
> > > struct clocksource *cs;
> > > enum wd_read_status read_ret;
> > > + unsigned long extra_wait = 0;
> > > u32 md;
> > > spin_lock(&watchdog_lock);
> > > @@ -410,13 +420,30 @@ static void clocksource_watchdog(struct timer_list *unused)
> > > read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
> > > - if (read_ret != WD_READ_SUCCESS) {
> > > - if (read_ret == WD_READ_UNSTABLE)
> > > - /* Clock readout unreliable, so give it up. */
> > > - __clocksource_unstable(cs);
> > > + if (read_ret == WD_READ_UNSTABLE) {
> > > + /* Clock readout unreliable, so give it up. */
> > > + __clocksource_unstable(cs);
> > > continue;
> > > }
> > > + /*
> > > + * When WD_READ_SKIP is returned, it means the system is likely
> > > + * under very heavy load, where the latency of reading
> > > + * watchdog/clocksource is very big, and affect the accuracy of
> > > + * watchdog check. So give system some space and suspend the
> > > + * watchdog check for 5 minutes.
> > > + */
> > > + if (read_ret == WD_READ_SKIP) {
> > > + /*
> > > + * As the watchdog timer will be suspended, and
> > > + * cs->last could keep unchanged for 5 minutes, reset
> > > + * the counters.
> > > + */
> > > + clocksource_reset_watchdog();
> > > + extra_wait = HZ * 300;
> > > + break;
> > > + }
> > > +
> > > /* Clocksource initialized ? */
> > > if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
> > > atomic_read(&watchdog_reset_pending)) {
> > > @@ -512,7 +539,7 @@ static void clocksource_watchdog(struct timer_list *unused)
> > > * pair clocksource_stop_watchdog() clocksource_start_watchdog().
> > > */
> > > if (!timer_pending(&watchdog_timer)) {
> > > - watchdog_timer.expires += WATCHDOG_INTERVAL;
> > > + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
> > > add_timer_on(&watchdog_timer, next_cpu);
> > > }
> > > out:
> > > @@ -537,14 +564,6 @@ static inline void clocksource_stop_watchdog(void)
> > > watchdog_running = 0;
> > > }
> > > -static inline void clocksource_reset_watchdog(void)
> > > -{
> > > - struct clocksource *cs;
> > > -
> > > - list_for_each_entry(cs, &watchdog_list, wd_list)
> > > - cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
> > > -}
> > > -
> > > static void clocksource_resume_watchdog(void)
> > > {
> > > atomic_inc(&watchdog_reset_pending);
> >
> > It looks reasonable to me. Thanks for the patch.
> >
> > Acked-by: Waiman Long <[email protected]>
>
> Queued, thank you both!
Thanks for reviewing and queueing!
> If you would like this to go in some other way:
>
> Acked-by: Paul E. McKenney <[email protected]>
>
> And while I am remembering it... Any objections to reversing the role of
> TSC and the other timers on systems where TSC is believed to be accurate?
> So that if there is clocksource skew, HPET is marked unstable rather than
> TSC?
For the bug in commit log, I think it's the 8 sockets system with
hundreds of CPUs causing the big latency, while the HPET itself may
not be broken, and if we switched to ACPI PM_TIMER as watchdog, we
could see similar big latency.
I used to only see this issue with stress tools like stress-ng, but
it seems that with larger and larger systems, even a memory intensive
load can easily trigger this.
> This would preserve the diagnostics without hammering performance
> when skew is detected. (Switching from TSC to HPET hammers performance
> enough that our automation usually notices and reboots the system.)
Yes, switching to HPET is a disaster for performance, we've seen
from 30% to 90% drop in different benchmarks.
Thanks,
Feng
> Thanx, Paul
On 12/20/22 20:01, Feng Tang wrote:
> Using correct email address of John Stultz.
>
> On Tue, Dec 20, 2022 at 10:34:00AM -0800, Paul E. McKenney wrote:
>> On Tue, Dec 20, 2022 at 11:11:08AM -0500, Waiman Long wrote:
>>> On 12/20/22 03:25, Feng Tang wrote:
>>>> There were bug reported on 8 sockets x86 machines that TSC was wrongly
>>>> disabled when system is under heavy workload.
>>>>
>>>> [ 818.380354] clocksource: timekeeping watchdog on CPU336: hpet wd-wd read-back delay of 1203520ns
>>>> [ 818.436160] clocksource: wd-tsc-wd read-back delay of 181880ns, clock-skew test skipped!
>>>> [ 819.402962] clocksource: timekeeping watchdog on CPU338: hpet wd-wd read-back delay of 324000ns
>>>> [ 819.448036] clocksource: wd-tsc-wd read-back delay of 337240ns, clock-skew test skipped!
>>>> [ 819.880863] clocksource: timekeeping watchdog on CPU339: hpet read-back delay of 150280ns, attempt 3, marking unstable
>>>> [ 819.936243] tsc: Marking TSC unstable due to clocksource watchdog
>>>> [ 820.068173] TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
>>>> [ 820.092382] sched_clock: Marking unstable (818769414384, 1195404998)
>>>> [ 820.643627] clocksource: Checking clocksource tsc synchronization from CPU 267 to CPUs 0,4,25,70,126,430,557,564.
>>>> [ 821.067990] clocksource: Switched to clocksource hpet
>>>>
>>>> This can be reproduced when system is running memory intensive 'stream'
>>>> test, or some stress-ng subcases like 'ioport'.
>>>>
>>>> The reason is when system is under heavy load, the read latency of
>>>> clocksource can be very high, it can be seen even with lightweight
>>>> TSC read, and is much worse on MMIO or IO port read based external
>>>> clocksource. Causing the watchdog check to be inaccurate.
>>>>
>>>> As the clocksource watchdog is a lifetime check with frequency of
>>>> twice a second, there is no need to rush doing it when the system
>>>> is under heavy load and the clocksource read latency is very high,
>>>> suspend the watchdog timer for 5 minutes.
>>>>
>>>> Signed-off-by: Feng Tang <[email protected]>
>>>> ---
>>>> kernel/time/clocksource.c | 45 ++++++++++++++++++++++++++++-----------
>>>> 1 file changed, 32 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
>>>> index 9cf32ccda715..8cd74b89d577 100644
>>>> --- a/kernel/time/clocksource.c
>>>> +++ b/kernel/time/clocksource.c
>>>> @@ -384,6 +384,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
>>>> }
>>>> EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
>>>> +static inline void clocksource_reset_watchdog(void)
>>>> +{
>>>> + struct clocksource *cs;
>>>> +
>>>> + list_for_each_entry(cs, &watchdog_list, wd_list)
>>>> + cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
>>>> +}
>>>> +
>>>> +
>>>> static void clocksource_watchdog(struct timer_list *unused)
>>>> {
>>>> u64 csnow, wdnow, cslast, wdlast, delta;
>>>> @@ -391,6 +400,7 @@ static void clocksource_watchdog(struct timer_list *unused)
>>>> int64_t wd_nsec, cs_nsec;
>>>> struct clocksource *cs;
>>>> enum wd_read_status read_ret;
>>>> + unsigned long extra_wait = 0;
>>>> u32 md;
>>>> spin_lock(&watchdog_lock);
>>>> @@ -410,13 +420,30 @@ static void clocksource_watchdog(struct timer_list *unused)
>>>> read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
>>>> - if (read_ret != WD_READ_SUCCESS) {
>>>> - if (read_ret == WD_READ_UNSTABLE)
>>>> - /* Clock readout unreliable, so give it up. */
>>>> - __clocksource_unstable(cs);
>>>> + if (read_ret == WD_READ_UNSTABLE) {
>>>> + /* Clock readout unreliable, so give it up. */
>>>> + __clocksource_unstable(cs);
>>>> continue;
>>>> }
>>>> + /*
>>>> + * When WD_READ_SKIP is returned, it means the system is likely
>>>> + * under very heavy load, where the latency of reading
>>>> + * watchdog/clocksource is very big, and affect the accuracy of
>>>> + * watchdog check. So give system some space and suspend the
>>>> + * watchdog check for 5 minutes.
>>>> + */
>>>> + if (read_ret == WD_READ_SKIP) {
>>>> + /*
>>>> + * As the watchdog timer will be suspended, and
>>>> + * cs->last could keep unchanged for 5 minutes, reset
>>>> + * the counters.
>>>> + */
>>>> + clocksource_reset_watchdog();
>>>> + extra_wait = HZ * 300;
>>>> + break;
>>>> + }
>>>> +
>>>> /* Clocksource initialized ? */
>>>> if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
>>>> atomic_read(&watchdog_reset_pending)) {
>>>> @@ -512,7 +539,7 @@ static void clocksource_watchdog(struct timer_list *unused)
>>>> * pair clocksource_stop_watchdog() clocksource_start_watchdog().
>>>> */
>>>> if (!timer_pending(&watchdog_timer)) {
>>>> - watchdog_timer.expires += WATCHDOG_INTERVAL;
>>>> + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
>>>> add_timer_on(&watchdog_timer, next_cpu);
>>>> }
>>>> out:
>>>> @@ -537,14 +564,6 @@ static inline void clocksource_stop_watchdog(void)
>>>> watchdog_running = 0;
>>>> }
>>>> -static inline void clocksource_reset_watchdog(void)
>>>> -{
>>>> - struct clocksource *cs;
>>>> -
>>>> - list_for_each_entry(cs, &watchdog_list, wd_list)
>>>> - cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
>>>> -}
>>>> -
>>>> static void clocksource_resume_watchdog(void)
>>>> {
>>>> atomic_inc(&watchdog_reset_pending);
>>> It looks reasonable to me. Thanks for the patch.
>>>
>>> Acked-by: Waiman Long <[email protected]>
>> Queued, thank you both!
> Thanks for reviewing and queueing!
>
>> If you would like this to go in some other way:
>>
>> Acked-by: Paul E. McKenney <[email protected]>
>>
>> And while I am remembering it... Any objections to reversing the role of
>> TSC and the other timers on systems where TSC is believed to be accurate?
>> So that if there is clocksource skew, HPET is marked unstable rather than
>> TSC?
> For the bug in commit log, I think it's the 8 sockets system with
> hundreds of CPUs causing the big latency, while the HPET itself may
> not be broken, and if we switched to ACPI PM_TIMER as watchdog, we
> could see similar big latency.
>
> I used to only see this issue with stress tool like stress-ng, but
> seems with larger and larger system, even the momory intensive load
> can easily trigger this.
>
>> This would preserve the diagnostics without hammering performance
>> when skew is detected. (Switching from TSC to HPET hammers performance
>> enough that our automation usually notices and reboots the system.)
> Yes, switching to HPET is a disaster for performance, we've seen
> from 30% to 90% drop in different benchmarks.
Switching to hpet is very bad for performance. That is the main reason
why I posted clocksource patches in the past to avoid this as much as
possible. I think your patch is also a good countermeasure to avoid this.
Thanks,
Longman
On Tue, Dec 20, 2022 at 10:26:15PM -0500, Waiman Long wrote:
>
> On 12/20/22 20:01, Feng Tang wrote:
> > Using correct email address of John Stultz.
> >
> > On Tue, Dec 20, 2022 at 10:34:00AM -0800, Paul E. McKenney wrote:
> > > On Tue, Dec 20, 2022 at 11:11:08AM -0500, Waiman Long wrote:
> > > > On 12/20/22 03:25, Feng Tang wrote:
> > > > > There were bug reported on 8 sockets x86 machines that TSC was wrongly
> > > > > disabled when system is under heavy workload.
> > > > >
> > > > > [ 818.380354] clocksource: timekeeping watchdog on CPU336: hpet wd-wd read-back delay of 1203520ns
> > > > > [ 818.436160] clocksource: wd-tsc-wd read-back delay of 181880ns, clock-skew test skipped!
> > > > > [ 819.402962] clocksource: timekeeping watchdog on CPU338: hpet wd-wd read-back delay of 324000ns
> > > > > [ 819.448036] clocksource: wd-tsc-wd read-back delay of 337240ns, clock-skew test skipped!
> > > > > [ 819.880863] clocksource: timekeeping watchdog on CPU339: hpet read-back delay of 150280ns, attempt 3, marking unstable
> > > > > [ 819.936243] tsc: Marking TSC unstable due to clocksource watchdog
> > > > > [ 820.068173] TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
> > > > > [ 820.092382] sched_clock: Marking unstable (818769414384, 1195404998)
> > > > > [ 820.643627] clocksource: Checking clocksource tsc synchronization from CPU 267 to CPUs 0,4,25,70,126,430,557,564.
> > > > > [ 821.067990] clocksource: Switched to clocksource hpet
> > > > >
> > > > > This can be reproduced when system is running memory intensive 'stream'
> > > > > test, or some stress-ng subcases like 'ioport'.
> > > > >
> > > > > The reason is when system is under heavy load, the read latency of
> > > > > clocksource can be very high, it can be seen even with lightweight
> > > > > TSC read, and is much worse on MMIO or IO port read based external
> > > > > clocksource. Causing the watchdog check to be inaccurate.
> > > > >
> > > > > As the clocksource watchdog is a lifetime check with frequency of
> > > > > twice a second, there is no need to rush doing it when the system
> > > > > is under heavy load and the clocksource read latency is very high,
> > > > > suspend the watchdog timer for 5 minutes.
> > > > >
> > > > > Signed-off-by: Feng Tang <[email protected]>
> > > > > ---
> > > > > kernel/time/clocksource.c | 45 ++++++++++++++++++++++++++++-----------
> > > > > 1 file changed, 32 insertions(+), 13 deletions(-)
> > > > >
> > > > > diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> > > > > index 9cf32ccda715..8cd74b89d577 100644
> > > > > --- a/kernel/time/clocksource.c
> > > > > +++ b/kernel/time/clocksource.c
> > > > > @@ -384,6 +384,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
> > > > > }
> > > > > EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
> > > > > +static inline void clocksource_reset_watchdog(void)
> > > > > +{
> > > > > + struct clocksource *cs;
> > > > > +
> > > > > + list_for_each_entry(cs, &watchdog_list, wd_list)
> > > > > + cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
> > > > > +}
> > > > > +
> > > > > +
> > > > > static void clocksource_watchdog(struct timer_list *unused)
> > > > > {
> > > > > u64 csnow, wdnow, cslast, wdlast, delta;
> > > > > @@ -391,6 +400,7 @@ static void clocksource_watchdog(struct timer_list *unused)
> > > > > int64_t wd_nsec, cs_nsec;
> > > > > struct clocksource *cs;
> > > > > enum wd_read_status read_ret;
> > > > > + unsigned long extra_wait = 0;
> > > > > u32 md;
> > > > > spin_lock(&watchdog_lock);
> > > > > @@ -410,13 +420,30 @@ static void clocksource_watchdog(struct timer_list *unused)
> > > > > read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
> > > > > - if (read_ret != WD_READ_SUCCESS) {
> > > > > - if (read_ret == WD_READ_UNSTABLE)
> > > > > - /* Clock readout unreliable, so give it up. */
> > > > > - __clocksource_unstable(cs);
> > > > > + if (read_ret == WD_READ_UNSTABLE) {
> > > > > + /* Clock readout unreliable, so give it up. */
> > > > > + __clocksource_unstable(cs);
> > > > > continue;
> > > > > }
> > > > > + /*
> > > > > + * When WD_READ_SKIP is returned, it means the system is likely
> > > > > + * under very heavy load, where the latency of reading
> > > > > + * watchdog/clocksource is very big, and affect the accuracy of
> > > > > + * watchdog check. So give system some space and suspend the
> > > > > + * watchdog check for 5 minutes.
> > > > > + */
> > > > > + if (read_ret == WD_READ_SKIP) {
> > > > > + /*
> > > > > + * As the watchdog timer will be suspended, and
> > > > > + * cs->last could keep unchanged for 5 minutes, reset
> > > > > + * the counters.
> > > > > + */
> > > > > + clocksource_reset_watchdog();
> > > > > + extra_wait = HZ * 300;
> > > > > + break;
> > > > > + }
> > > > > +
> > > > > /* Clocksource initialized ? */
> > > > > if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
> > > > > atomic_read(&watchdog_reset_pending)) {
> > > > > @@ -512,7 +539,7 @@ static void clocksource_watchdog(struct timer_list *unused)
> > > > > * pair clocksource_stop_watchdog() clocksource_start_watchdog().
> > > > > */
> > > > > if (!timer_pending(&watchdog_timer)) {
> > > > > - watchdog_timer.expires += WATCHDOG_INTERVAL;
> > > > > + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
> > > > > add_timer_on(&watchdog_timer, next_cpu);
> > > > > }
> > > > > out:
> > > > > @@ -537,14 +564,6 @@ static inline void clocksource_stop_watchdog(void)
> > > > > watchdog_running = 0;
> > > > > }
> > > > > -static inline void clocksource_reset_watchdog(void)
> > > > > -{
> > > > > - struct clocksource *cs;
> > > > > -
> > > > > - list_for_each_entry(cs, &watchdog_list, wd_list)
> > > > > - cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
> > > > > -}
> > > > > -
> > > > > static void clocksource_resume_watchdog(void)
> > > > > {
> > > > > atomic_inc(&watchdog_reset_pending);
> > > > It looks reasonable to me. Thanks for the patch.
> > > >
> > > > Acked-by: Waiman Long <[email protected]>
> > > Queued, thank you both!
> > Thanks for reviewing and queueing!
> >
> > > If you would like this to go in some other way:
> > >
> > > Acked-by: Paul E. McKenney <[email protected]>
> > >
> > > And while I am remembering it... Any objections to reversing the role of
> > > TSC and the other timers on systems where TSC is believed to be accurate?
> > > So that if there is clocksource skew, HPET is marked unstable rather than
> > > TSC?
> > For the bug in commit log, I think it's the 8 sockets system with
> > hundreds of CPUs causing the big latency, while the HPET itself may
> > not be broken, and if we switched to ACPI PM_TIMER as watchdog, we
> > could see similar big latency.
> >
> > I used to only see this issue with stress tool like stress-ng, but
> > seems with larger and larger system, even the memory intensive load
> > can easily trigger this.
> >
> > > This would preserve the diagnostics without hammering performance
> > > when skew is detected. (Switching from TSC to HPET hammers performance
> > > enough that our automation usually notices and reboots the system.)
> > Yes, switching to HPET is a disaster for performance, we've seen
> > from 30% to 90% drop in different benchmarks.
>
> Switching to hpet is very bad for performance. That is the main reason why I
> posted clocksource patches in the past to avoid this as much as possible. I
> think your patch is also a good countermeasure to avoid this.
So for the smaller systems, how about something like the following?
The background is a number of two-socket systems showing TSC clock skew
despite having CONSTANT_TSC, NONSTOP_TSC, and TSC_ADJUST all set on
the boot CPU. According to NTP, the skew was 40,000 parts per million.
HPET and ACPI PM timer were within a few parts per million, but are both
a no-go for the workloads in question, as you say. NTP actually keeps
userspace aligned with atomic-clock time to within about ten parts per
million, but that kind of skew between Kernel Standard Time and Userspace
Standard Time cannot be a good thing.
Thoughts?
Thanx, Paul
-------------------------------------------------------------------------
commit 199dfa2ba23dd0d650b1482a091e2e15457698b7
Author: Paul E. McKenney <[email protected]>
Date: Wed Dec 21 16:20:25 2022 -0800
clocksource: Verify HPET and PMTMR when TSC unverified
On systems with two or fewer sockets, when the boot CPU has CONSTANT_TSC,
NONSTOP_TSC, and TSC_ADJUST, clocksource watchdog verification of the
TSC is disabled. This works well much of the time, but there is the
occasional system that meets all of these criteria, but which still
has a TSC that skews significantly from atomic-clock time. This is
usually attributed to a firmware or hardware fault. Yes, the various
NTP daemons do express their opinions of userspace-to-atomic-clock time
skew, but they put them in various places, depending on the daemon and
distro in question. It would therefore be good for the kernel to have
some clue that there is a problem.
The old behavior of marking the TSC unstable is a non-starter because a
great many workloads simply cannot tolerate the overheads and latencies
of the various non-TSC clocksources. In addition, NTP-corrected systems
often seem to be able to tolerate significant kernel-space time skew as
long as the userspace time sources are within epsilon of atomic-clock
time.
Therefore, when watchdog verification of TSC is disabled, enable it for
HPET and PMTMR (AKA ACPI PM timer). This provides the needed in-kernel
time-skew diagnostic without degrading the system's performance.
Signed-off-by: Paul E. McKenney <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Feng Tang <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: <[email protected]>
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 8ac563abb567b..a53961c64a567 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -8,6 +8,7 @@
extern void hpet_time_init(void);
extern void time_init(void);
extern bool pit_timer_init(void);
+extern bool tsc_clocksource_watchdog_disabled(void);
extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 71f336425e58a..c8eb1ac5125ab 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1091,6 +1091,8 @@ int __init hpet_enable(void)
if (!hpet_counting())
goto out_nohpet;
+ if (tsc_clocksource_watchdog_disabled())
+ clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
if (id & HPET_ID_LEGSUP) {
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cafacb2e58cce..924e877b95f31 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1186,6 +1186,11 @@ static void __init tsc_disable_clocksource_watchdog(void)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
+bool tsc_clocksource_watchdog_disabled(void)
+{
+ return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY);
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index 279ddff81ab49..180003132bc1a 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -23,6 +23,7 @@
#include <linux/pci.h>
#include <linux/delay.h>
#include <asm/io.h>
+#include <asm/time.h>
/*
* The I/O port the PMTMR resides at.
@@ -210,6 +211,8 @@ static int __init init_acpi_pm_clocksource(void)
return -ENODEV;
}
+ if (tsc_clocksource_watchdog_disabled())
+ clocksource_acpi_pm.flags |= CLOCK_SOURCE_MUST_VERIFY;
return clocksource_register_hz(&clocksource_acpi_pm,
PMTMR_TICKS_PER_SEC);
}
On 12/21/22 19:40, Paul E. McKenney wrote:
> commit 199dfa2ba23dd0d650b1482a091e2e15457698b7
> Author: Paul E. McKenney<[email protected]>
> Date: Wed Dec 21 16:20:25 2022 -0800
>
> clocksource: Verify HPET and PMTMR when TSC unverified
>
> On systems with two or fewer sockets, when the boot CPU has CONSTANT_TSC,
> NONSTOP_TSC, and TSC_ADJUST, clocksource watchdog verification of the
> TSC is disabled. This works well much of the time, but there is the
> occasional system that meets all of these criteria, but which still
> has a TSC that skews significantly from atomic-clock time. This is
> usually attributed to a firmware or hardware fault. Yes, the various
> NTP daemons do express their opinions of userspace-to-atomic-clock time
> skew, but they put them in various places, depending on the daemon and
> distro in question. It would therefore be good for the kernel to have
> some clue that there is a problem.
>
> The old behavior of marking the TSC unstable is a non-starter because a
> great many workloads simply cannot tolerate the overheads and latencies
> of the various non-TSC clocksources. In addition, NTP-corrected systems
> often seem to be able to tolerate significant kernel-space time skew as
> long as the userspace time sources are within epsilon of atomic-clock
> time.
>
> Therefore, when watchdog verification of TSC is disabled, enable it for
> HPET and PMTMR (AKA ACPI PM timer). This provides the needed in-kernel
> time-skew diagnostic without degrading the system's performance.
>
> Signed-off-by: Paul E. McKenney<[email protected]>
> Cc: Thomas Gleixner<[email protected]>
> Cc: Ingo Molnar<[email protected]>
> Cc: Borislav Petkov<[email protected]>
> Cc: Dave Hansen<[email protected]>
> Cc: "H. Peter Anvin"<[email protected]>
> Cc: Daniel Lezcano<[email protected]>
> Cc: Feng Tang<[email protected]>
> Cc: Waiman Long <[email protected]
> Cc:<[email protected]>
As I currently understand, you are trying to use TSC as a watchdog to
check against HPET and PMTMR. I do have 2 questions about this patch.
First of all, why you need to use both HPET and PMTMR? Can you just use
one of those that are available. Secondly, is it possible to enable this
time-skew diagnostic for a limited amount of time instead of running
indefinitely? The running of the clocksource watchdog itself will still
consume a tiny amount of CPU cycles.
Cheers,
Longman
On Wed, Dec 21, 2022 at 09:55:15PM -0800, Paul E. McKenney wrote:
> On Wed, Dec 21, 2022 at 10:39:53PM -0500, Waiman Long wrote:
> > On 12/21/22 19:40, Paul E. McKenney wrote:
> > > commit 199dfa2ba23dd0d650b1482a091e2e15457698b7
> > > Author: Paul E. McKenney<[email protected]>
> > > Date: Wed Dec 21 16:20:25 2022 -0800
> > >
> > > clocksource: Verify HPET and PMTMR when TSC unverified
> > > On systems with two or fewer sockets, when the boot CPU has CONSTANT_TSC,
> > > NONSTOP_TSC, and TSC_ADJUST, clocksource watchdog verification of the
> > > TSC is disabled. This works well much of the time, but there is the
> > > occasional system that meets all of these criteria, but which still
> > > has a TSC that skews significantly from atomic-clock time. This is
> > > usually attributed to a firmware or hardware fault. Yes, the various
> > > NTP daemons do express their opinions of userspace-to-atomic-clock time
> > > skew, but they put them in various places, depending on the daemon and
> > > distro in question. It would therefore be good for the kernel to have
> > > some clue that there is a problem.
> > > The old behavior of marking the TSC unstable is a non-starter because a
> > > great many workloads simply cannot tolerate the overheads and latencies
> > > of the various non-TSC clocksources. In addition, NTP-corrected systems
> > > often seem to be able to tolerate significant kernel-space time skew as
> > > long as the userspace time sources are within epsilon of atomic-clock
> > > time.
> > > Therefore, when watchdog verification of TSC is disabled, enable it for
> > > HPET and PMTMR (AKA ACPI PM timer). This provides the needed in-kernel
> > > time-skew diagnostic without degrading the system's performance.
> > > Signed-off-by: Paul E. McKenney<[email protected]>
> > > Cc: Thomas Gleixner<[email protected]>
> > > Cc: Ingo Molnar<[email protected]>
> > > Cc: Borislav Petkov<[email protected]>
> > > Cc: Dave Hansen<[email protected]>
> > > Cc: "H. Peter Anvin"<[email protected]>
> > > Cc: Daniel Lezcano<[email protected]>
> > > Cc: Feng Tang<[email protected]>
> > > Cc: Waiman Long <[email protected]
> > > Cc:<[email protected]>
> >
> > As I currently understand, you are trying to use TSC as a watchdog to check
> > against HPET and PMTMR. I do have 2 questions about this patch.
> >
> > First of all, why you need to use both HPET and PMTMR? Can you just use one
> > of those that are available. Secondly, is it possible to enable this
> > time-skew diagnostic for a limit amount of time instead running
> > indefinitely? The running of the clocksource watchdog itself will still
> > consume a tiny amount of CPU cycles.
>
> I could certainly do something so that only the first of HPET and PMTMR
> is checked. Could you give me a quick run-through of the advantages of
> using only one? I would need to explain that in the commit log.
>
> Would it make sense to have a kernel boot variable giving the number of
> minutes for which the watchdog was to run, with a default of zero
> meaning "indefinitely"?
We've discussed about the "os noise", which customer may really care.
IIUC, this patch intends to test if HPET/PMTIMER HW is broken, so how
about making it run for a number of minutes the default behavior.
Also I've run the patch on a Alderlake system, with a fine acpi pm_timer
and a fake broken pm_timer, and they both work without errors.
Thanks,
Feng
> Thanx, Paul
On Wed, Dec 21, 2022 at 10:39:53PM -0500, Waiman Long wrote:
> On 12/21/22 19:40, Paul E. McKenney wrote:
> > commit 199dfa2ba23dd0d650b1482a091e2e15457698b7
> > Author: Paul E. McKenney<[email protected]>
> > Date: Wed Dec 21 16:20:25 2022 -0800
> >
> > clocksource: Verify HPET and PMTMR when TSC unverified
> > On systems with two or fewer sockets, when the boot CPU has CONSTANT_TSC,
> > NONSTOP_TSC, and TSC_ADJUST, clocksource watchdog verification of the
> > TSC is disabled. This works well much of the time, but there is the
> > occasional system that meets all of these criteria, but which still
> > has a TSC that skews significantly from atomic-clock time. This is
> > usually attributed to a firmware or hardware fault. Yes, the various
> > NTP daemons do express their opinions of userspace-to-atomic-clock time
> > skew, but they put them in various places, depending on the daemon and
> > distro in question. It would therefore be good for the kernel to have
> > some clue that there is a problem.
> > The old behavior of marking the TSC unstable is a non-starter because a
> > great many workloads simply cannot tolerate the overheads and latencies
> > of the various non-TSC clocksources. In addition, NTP-corrected systems
> > often seem to be able to tolerate significant kernel-space time skew as
> > long as the userspace time sources are within epsilon of atomic-clock
> > time.
> > Therefore, when watchdog verification of TSC is disabled, enable it for
> > HPET and PMTMR (AKA ACPI PM timer). This provides the needed in-kernel
> > time-skew diagnostic without degrading the system's performance.
> > Signed-off-by: Paul E. McKenney<[email protected]>
> > Cc: Thomas Gleixner<[email protected]>
> > Cc: Ingo Molnar<[email protected]>
> > Cc: Borislav Petkov<[email protected]>
> > Cc: Dave Hansen<[email protected]>
> > Cc: "H. Peter Anvin"<[email protected]>
> > Cc: Daniel Lezcano<[email protected]>
> > Cc: Feng Tang<[email protected]>
> > Cc: Waiman Long <[email protected]
> > Cc:<[email protected]>
>
> As I currently understand, you are trying to use TSC as a watchdog to check
> against HPET and PMTMR. I do have 2 questions about this patch.
>
> First of all, why you need to use both HPET and PMTMR? Can you just use one
> of those that are available. Secondly, is it possible to enable this
> time-skew diagnostic for a limit amount of time instead running
> indefinitely? The running of the clocksource watchdog itself will still
> consume a tiny amount of CPU cycles.
I could certainly do something so that only the first of HPET and PMTMR
is checked. Could you give me a quick run-through of the advantages of
using only one? I would need to explain that in the commit log.
Would it make sense to have a kernel boot variable giving the number of
minutes for which the watchdog was to run, with a default of zero
meaning "indefinitely"?
Thanx, Paul
On Thu, Dec 22, 2022 at 02:00:42PM +0800, Feng Tang wrote:
> On Wed, Dec 21, 2022 at 09:55:15PM -0800, Paul E. McKenney wrote:
> > On Wed, Dec 21, 2022 at 10:39:53PM -0500, Waiman Long wrote:
> > > On 12/21/22 19:40, Paul E. McKenney wrote:
> > > > commit 199dfa2ba23dd0d650b1482a091e2e15457698b7
> > > > Author: Paul E. McKenney<[email protected]>
> > > > Date: Wed Dec 21 16:20:25 2022 -0800
> > > >
> > > > clocksource: Verify HPET and PMTMR when TSC unverified
> > > > On systems with two or fewer sockets, when the boot CPU has CONSTANT_TSC,
> > > > NONSTOP_TSC, and TSC_ADJUST, clocksource watchdog verification of the
> > > > TSC is disabled. This works well much of the time, but there is the
> > > > occasional system that meets all of these criteria, but which still
> > > > has a TSC that skews significantly from atomic-clock time. This is
> > > > usually attributed to a firmware or hardware fault. Yes, the various
> > > > NTP daemons do express their opinions of userspace-to-atomic-clock time
> > > > skew, but they put them in various places, depending on the daemon and
> > > > distro in question. It would therefore be good for the kernel to have
> > > > some clue that there is a problem.
> > > > The old behavior of marking the TSC unstable is a non-starter because a
> > > > great many workloads simply cannot tolerate the overheads and latencies
> > > > of the various non-TSC clocksources. In addition, NTP-corrected systems
> > > > often seem to be able to tolerate significant kernel-space time skew as
> > > > long as the userspace time sources are within epsilon of atomic-clock
> > > > time.
> > > > Therefore, when watchdog verification of TSC is disabled, enable it for
> > > > HPET and PMTMR (AKA ACPI PM timer). This provides the needed in-kernel
> > > > time-skew diagnostic without degrading the system's performance.
> > > > Signed-off-by: Paul E. McKenney<[email protected]>
> > > > Cc: Thomas Gleixner<[email protected]>
> > > > Cc: Ingo Molnar<[email protected]>
> > > > Cc: Borislav Petkov<[email protected]>
> > > > Cc: Dave Hansen<[email protected]>
> > > > Cc: "H. Peter Anvin"<[email protected]>
> > > > Cc: Daniel Lezcano<[email protected]>
> > > > Cc: Feng Tang<[email protected]>
> > > > Cc: Waiman Long <[email protected]
> > > > Cc:<[email protected]>
> > >
> > > As I currently understand, you are trying to use TSC as a watchdog to check
> > > against HPET and PMTMR. I do have 2 questions about this patch.
> > >
> > > First of all, why you need to use both HPET and PMTMR? Can you just use one
> > > of those that are available. Secondly, is it possible to enable this
> > > time-skew diagnostic for a limit amount of time instead running
> > > indefinitely? The running of the clocksource watchdog itself will still
> > > consume a tiny amount of CPU cycles.
> >
> > I could certainly do something so that only the first of HPET and PMTMR
> > is checked. Could you give me a quick run-through of the advantages of
> > using only one? I would need to explain that in the commit log.
> >
> > Would it make sense to have a kernel boot variable giving the number of
> > minutes for which the watchdog was to run, with a default of zero
> > meaning "indefinitely"?
>
> We've discussed about the "os noise", which customer may really care.
> IIUC, this patch intends to test if HPET/PMTIMER HW is broken, so how
> about making it run for a number of minutes the default behavior.
It is also intended to determine if TSC is broken, with NTP drift rates
used to determine which timer is at fault.
OK, how about a Kconfig option for the number of minutes, set to whatever
you guys tell me? (Three minutes? Five minutes? Something else?)
People wanting to run it continuously could then build their kernels
with that Kconfig option set to zero.
> Also I've run the patch on a Alderlake system, with a fine acpi pm_timer
> and a fake broken pm_timer, and they both works without errors.
Thank you! Did it correctly identify the fake broken pm_timer as being
broken? If so, may I have your Tested-by?
Thanx, Paul
On Wed, Dec 21, 2022 at 10:14:29PM -0800, Paul E. McKenney wrote:
> On Thu, Dec 22, 2022 at 02:00:42PM +0800, Feng Tang wrote:
> > On Wed, Dec 21, 2022 at 09:55:15PM -0800, Paul E. McKenney wrote:
> > > On Wed, Dec 21, 2022 at 10:39:53PM -0500, Waiman Long wrote:
> > > > On 12/21/22 19:40, Paul E. McKenney wrote:
> > > > > commit 199dfa2ba23dd0d650b1482a091e2e15457698b7
> > > > > Author: Paul E. McKenney<[email protected]>
> > > > > Date: Wed Dec 21 16:20:25 2022 -0800
> > > > >
> > > > > clocksource: Verify HPET and PMTMR when TSC unverified
> > > > > On systems with two or fewer sockets, when the boot CPU has CONSTANT_TSC,
> > > > > NONSTOP_TSC, and TSC_ADJUST, clocksource watchdog verification of the
> > > > > TSC is disabled. This works well much of the time, but there is the
> > > > > occasional system that meets all of these criteria, but which still
> > > > > has a TSC that skews significantly from atomic-clock time. This is
> > > > > usually attributed to a firmware or hardware fault. Yes, the various
> > > > > NTP daemons do express their opinions of userspace-to-atomic-clock time
> > > > > skew, but they put them in various places, depending on the daemon and
> > > > > distro in question. It would therefore be good for the kernel to have
> > > > > some clue that there is a problem.
> > > > > The old behavior of marking the TSC unstable is a non-starter because a
> > > > > great many workloads simply cannot tolerate the overheads and latencies
> > > > > of the various non-TSC clocksources. In addition, NTP-corrected systems
> > > > > often seem to be able to tolerate significant kernel-space time skew as
> > > > > long as the userspace time sources are within epsilon of atomic-clock
> > > > > time.
> > > > > Therefore, when watchdog verification of TSC is disabled, enable it for
> > > > > HPET and PMTMR (AKA ACPI PM timer). This provides the needed in-kernel
> > > > > time-skew diagnostic without degrading the system's performance.
> > > > > Signed-off-by: Paul E. McKenney<[email protected]>
> > > > > Cc: Thomas Gleixner<[email protected]>
> > > > > Cc: Ingo Molnar<[email protected]>
> > > > > Cc: Borislav Petkov<[email protected]>
> > > > > Cc: Dave Hansen<[email protected]>
> > > > > Cc: "H. Peter Anvin"<[email protected]>
> > > > > Cc: Daniel Lezcano<[email protected]>
> > > > > Cc: Feng Tang<[email protected]>
> > > > > Cc: Waiman Long <[email protected]
> > > > > Cc:<[email protected]>
> > > >
> > > > As I currently understand, you are trying to use TSC as a watchdog to check
> > > > against HPET and PMTMR. I do have 2 questions about this patch.
> > > >
> > > > First of all, why you need to use both HPET and PMTMR? Can you just use one
> > > > of those that are available. Secondly, is it possible to enable this
> > > > time-skew diagnostic for a limit amount of time instead running
> > > > indefinitely? The running of the clocksource watchdog itself will still
> > > > consume a tiny amount of CPU cycles.
> > >
> > > I could certainly do something so that only the first of HPET and PMTMR
> > > is checked. Could you give me a quick run-through of the advantages of
> > > using only one? I would need to explain that in the commit log.
> > >
> > > Would it make sense to have a kernel boot variable giving the number of
> > > minutes for which the watchdog was to run, with a default of zero
> > > meaning "indefinitely"?
> >
> > We've discussed about the "os noise", which customer may really care.
> > IIUC, this patch intends to test if HPET/PMTIMER HW is broken, so how
> > about making it run for a number of minutes the default behavior.
>
> It is also intended to determine if TSC is broken, with NTP drift rates
> used to determine which timer is at fault.
>
> OK, how about a Kconfig option for the number of minutes, set to whatever
> you guys tell me? (Three minutes? Five minutes? Something else?)
> People wanting to run it continuously could then build their kernels
> with that Kconfig option set to zero.
I don't have specific preference for 5 or 10 minutes, as long as it
is a one time deal :)
> > Also I've run the patch on a Alderlake system, with a fine acpi pm_timer
> > and a fake broken pm_timer, and they both works without errors.
>
> Thank you! Did it correctly identify the fake broken pm_timer as being
> broken? If so, may I have your Tested-by?
On that Alderlake system, HPET will be disabled by kernel, and I
manually increased the tsc frequency by about 1/256 to make pm_timer
appear to have a 1/256 deviation. And got dmesg like:
[ 2.738554] clocksource: timekeeping watchdog on CPU3: Marking clocksource 'acpi_pm' as unstable because the skew is too large:
[ 2.738558] clocksource: 'tsc' wd_nsec: 275956624 wd_now: 13aab38d0d wd_last: 1382c1144d mask: ffffffffffffffff
[ 2.738564] clocksource: 'acpi_pm' cs_nsec: 277034651 cs_now: 731575 cs_last: 63f3cb mask: ffffff
[ 2.738568] clocksource: 'tsc' (not 'acpi_pm') is current clocksource.
The deviation is indeed about 1/256. And pm_timer won't be shown in /sys/:
/sys/devices/system/clocksource/clocksource0/available_clocksource:tsc
/sys/devices/system/clocksource/clocksource0/current_clocksource:tsc
So feel free to add:
Tested-by: Feng Tang <[email protected]>
Thanks,
Feng
>
> Thanx, Paul
On Thu, Dec 22, 2022 at 02:37:24PM +0800, Feng Tang wrote:
> On Wed, Dec 21, 2022 at 10:14:29PM -0800, Paul E. McKenney wrote:
> > On Thu, Dec 22, 2022 at 02:00:42PM +0800, Feng Tang wrote:
> > > On Wed, Dec 21, 2022 at 09:55:15PM -0800, Paul E. McKenney wrote:
> > > > On Wed, Dec 21, 2022 at 10:39:53PM -0500, Waiman Long wrote:
> > > > > On 12/21/22 19:40, Paul E. McKenney wrote:
> > > > > > commit 199dfa2ba23dd0d650b1482a091e2e15457698b7
> > > > > > Author: Paul E. McKenney<[email protected]>
> > > > > > Date: Wed Dec 21 16:20:25 2022 -0800
> > > > > >
> > > > > > clocksource: Verify HPET and PMTMR when TSC unverified
> > > > > > On systems with two or fewer sockets, when the boot CPU has CONSTANT_TSC,
> > > > > > NONSTOP_TSC, and TSC_ADJUST, clocksource watchdog verification of the
> > > > > > TSC is disabled. This works well much of the time, but there is the
> > > > > > occasional system that meets all of these criteria, but which still
> > > > > > has a TSC that skews significantly from atomic-clock time. This is
> > > > > > usually attributed to a firmware or hardware fault. Yes, the various
> > > > > > NTP daemons do express their opinions of userspace-to-atomic-clock time
> > > > > > skew, but they put them in various places, depending on the daemon and
> > > > > > distro in question. It would therefore be good for the kernel to have
> > > > > > some clue that there is a problem.
> > > > > > The old behavior of marking the TSC unstable is a non-starter because a
> > > > > > great many workloads simply cannot tolerate the overheads and latencies
> > > > > > of the various non-TSC clocksources. In addition, NTP-corrected systems
> > > > > > often seem to be able to tolerate significant kernel-space time skew as
> > > > > > long as the userspace time sources are within epsilon of atomic-clock
> > > > > > time.
> > > > > > Therefore, when watchdog verification of TSC is disabled, enable it for
> > > > > > HPET and PMTMR (AKA ACPI PM timer). This provides the needed in-kernel
> > > > > > time-skew diagnostic without degrading the system's performance.
> > > > > > Signed-off-by: Paul E. McKenney<[email protected]>
> > > > > > Cc: Thomas Gleixner<[email protected]>
> > > > > > Cc: Ingo Molnar<[email protected]>
> > > > > > Cc: Borislav Petkov<[email protected]>
> > > > > > Cc: Dave Hansen<[email protected]>
> > > > > > Cc: "H. Peter Anvin"<[email protected]>
> > > > > > Cc: Daniel Lezcano<[email protected]>
> > > > > > Cc: Feng Tang<[email protected]>
> > > > > > Cc: Waiman Long <[email protected]
> > > > > > Cc:<[email protected]>
> > > > >
> > > > > As I currently understand, you are trying to use TSC as a watchdog to check
> > > > > against HPET and PMTMR. I do have 2 questions about this patch.
> > > > >
> > > > > First of all, why you need to use both HPET and PMTMR? Can you just use one
> > > > > of those that are available. Secondly, is it possible to enable this
> > > > > time-skew diagnostic for a limit amount of time instead running
> > > > > indefinitely? The running of the clocksource watchdog itself will still
> > > > > consume a tiny amount of CPU cycles.
> > > >
> > > > I could certainly do something so that only the first of HPET and PMTMR
> > > > is checked. Could you give me a quick run-through of the advantages of
> > > > using only one? I would need to explain that in the commit log.
> > > >
> > > > Would it make sense to have a kernel boot variable giving the number of
> > > > minutes for which the watchdog was to run, with a default of zero
> > > > meaning "indefinitely"?
> > >
> > > We've discussed about the "os noise", which customer may really care.
> > > IIUC, this patch intends to test if HPET/PMTIMER HW is broken, so how
> > > about making it run for a number of minutes the default behavior.
> >
> > It is also intended to determine if TSC is broken, with NTP drift rates
> > used to determine which timer is at fault.
> >
> > OK, how about a Kconfig option for the number of minutes, set to whatever
> > you guys tell me? (Three minutes? Five minutes? Something else?)
> > People wanting to run it continuously could then build their kernels
> > with that Kconfig option set to zero.
>
> I don't have specific preference for 5 or 10 minutes, as long as it
> is a one time deal :)
>
> > > Also I've run the patch on a Alderlake system, with a fine acpi pm_timer
> > > and a fake broken pm_timer, and they both works without errors.
> >
> > Thank you! Did it correctly identify the fake broken pm_timer as being
> > broken? If so, may I have your Tested-by?
>
> On that Alderlake system, HPET will be disabled by kernel, and I
> manually increased the tsc frequency about 1/256 to make pm_timer
> look to have 1/256 deviation. And got dmesg like:
>
> [ 2.738554] clocksource: timekeeping watchdog on CPU3: Marking clocksource 'acpi_pm' as unstable because the skew is too large:
> [ 2.738558] clocksource: 'tsc' wd_nsec: 275956624 wd_now: 13aab38d0d wd_last: 1382c1144d mask: ffffffffffffffff
> [ 2.738564] clocksource: 'acpi_pm' cs_nsec: 277034651 cs_now: 731575 cs_last: 63f3cb mask: ffffff
> [ 2.738568] clocksource: 'tsc' (not 'acpi_pm') is current clocksource.
>
> The deviation is indeed about 1/256. And pm_timer won't be shown in /sys/:
>
> /sys/devices/system/clocksource/clocksource0/available_clocksource:tsc
> /sys/devices/system/clocksource/clocksource0/current_clocksource:tsc
>
> So feel free to add:
>
> Tested-by: Feng Tang <[email protected]>
Thank you very much! I will apply this on my next rebase.
Thanx, Paul
On Thu, Dec 22, 2022 at 10:24:46AM -0800, Paul E. McKenney wrote:
> On Thu, Dec 22, 2022 at 02:37:24PM +0800, Feng Tang wrote:
> > On Wed, Dec 21, 2022 at 10:14:29PM -0800, Paul E. McKenney wrote:
> > > On Thu, Dec 22, 2022 at 02:00:42PM +0800, Feng Tang wrote:
> > > > On Wed, Dec 21, 2022 at 09:55:15PM -0800, Paul E. McKenney wrote:
> > > > > On Wed, Dec 21, 2022 at 10:39:53PM -0500, Waiman Long wrote:
[ . . . ]
> > > > > > As I currently understand, you are trying to use TSC as a watchdog to check
> > > > > > against HPET and PMTMR. I do have 2 questions about this patch.
> > > > > >
> > > > > > First of all, why you need to use both HPET and PMTMR? Can you just use one
> > > > > > of those that are available. Secondly, is it possible to enable this
> > > > > > time-skew diagnostic for a limited amount of time instead of running
> > > > > > indefinitely? The running of the clocksource watchdog itself will still
> > > > > > consume a tiny amount of CPU cycles.
> > > > >
> > > > > I could certainly do something so that only the first of HPET and PMTMR
> > > > > is checked. Could you give me a quick run-through of the advantages of
> > > > > using only one? I would need to explain that in the commit log.
> > > > >
> > > > > Would it make sense to have a kernel boot variable giving the number of
> > > > > minutes for which the watchdog was to run, with a default of zero
> > > > > meaning "indefinitely"?
> > > >
> > > > We've discussed the "os noise", which customers may really care about.
> > > > IIUC, this patch intends to test if HPET/PMTIMER HW is broken, so how
> > > > about making it run for a number of minutes the default behavior.
> > >
> > > It is also intended to determine if TSC is broken, with NTP drift rates
> > > used to determine which timer is at fault.
> > >
> > > OK, how about a Kconfig option for the number of minutes, set to whatever
> > > you guys tell me? (Three minutes? Five minutes? Something else?)
> > > People wanting to run it continuously could then build their kernels
> > > with that Kconfig option set to zero.
> >
> > I don't have specific preference for 5 or 10 minutes, as long as it
> > is a one time deal :)
> >
> > > > Also I've run the patch on a Alderlake system, with a fine acpi pm_timer
> > > > and a fake broken pm_timer, and they both works without errors.
> > >
> > > Thank you! Did it correctly identify the fake broken pm_timer as being
> > > broken? If so, may I have your Tested-by?
> >
> > On that Alderlake system, HPET will be disabled by kernel, and I
> > manually increased the tsc frequency about 1/256 to make pm_timer
> > look to have 1/256 deviation. And got dmesg like:
> >
> > [ 2.738554] clocksource: timekeeping watchdog on CPU3: Marking clocksource 'acpi_pm' as unstable because the skew is too large:
> > [ 2.738558] clocksource: 'tsc' wd_nsec: 275956624 wd_now: 13aab38d0d wd_last: 1382c1144d mask: ffffffffffffffff
> > [ 2.738564] clocksource: 'acpi_pm' cs_nsec: 277034651 cs_now: 731575 cs_last: 63f3cb mask: ffffff
> > [ 2.738568] clocksource: 'tsc' (not 'acpi_pm') is current clocksource.
> >
> > The deviation is indeed about 1/256. And pm_timer won't be shown in /sys/:
> >
> > /sys/devices/system/clocksource/clocksource0/available_clocksource:tsc
> > /sys/devices/system/clocksource/clocksource0/current_clocksource:tsc
> >
> > So feel free to add:
> >
> > Tested-by: Feng Tang <[email protected]>
>
> Thank you very much! I will apply this on my next rebase.
But first, here is a prototype of the limited-time clocksource watchdog.
Thoughts?
Thanx, Paul
------------------------------------------------------------------------
commit dfbf67806c4c7f2bdd79cdefe86a2bea6e7afcab
Author: Paul E. McKenney <[email protected]>
Date: Thu Dec 22 13:21:47 2022 -0800
clocksource: Permit limited-duration clocksource watchdogging
Some systems want regular clocksource checking, but their workloads
cannot tolerate the overhead of full-time clocksource checking.
Therefore, add a CLOCKSOURCE_WATCHDOG_DURATION Kconfig option and a
clocksource.watchdog_duration kernel-boot parameter that limits the
clocksource watchdog to the specified number of minutes past boot.
Values of zero disable checking, and a value of -1 restores the
traditional behavior of always checking.
This does change behavior compared to older kernels, but recent kernels
disable the clocksource watchdog completely in the common case where the
TSC is judged to be trustworthy. This change in behavior is therefore
not a real issue.
Link: https://lore.kernel.org/all/[email protected]/
Reported-by: Waiman Long <[email protected]>
Reported-by: Feng Tang <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index bae8f11070bef..2676e011673d5 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -209,5 +209,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
per million. If the clocksource is good enough for NTP,
it is good enough for the clocksource watchdog!
+config CLOCKSOURCE_WATCHDOG_DURATION
+ int "Default time to run clocksource watchdog (in minutes)"
+ depends on CLOCKSOURCE_WATCHDOG
+ range -1 1440
+ default 10
+ help
+ Specify the default number of minutes that the clocksource
+ watchdog should run after being started. Specify -1 to run
+ indefinitely or zero to not run at all. This value may be
+ overridden using the clocksource.watchdog_duration kernel
+ boot parameter.
+
endmenu
endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b153fce8c0e4b..c920c6c22e0fb 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -213,6 +213,9 @@ module_param(max_cswd_read_retries, ulong, 0644);
EXPORT_SYMBOL_GPL(max_cswd_read_retries);
static int verify_n_cpus = 8;
module_param(verify_n_cpus, int, 0644);
+static int watchdog_duration = CONFIG_CLOCKSOURCE_WATCHDOG_DURATION;
+module_param(watchdog_duration, int, 0444);
+static unsigned long watchdog_end_jiffies;
enum wd_read_status {
WD_READ_SUCCESS,
@@ -549,7 +552,9 @@ static void clocksource_watchdog(struct timer_list *unused)
* Arm timer if not already pending: could race with concurrent
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
- if (!timer_pending(&watchdog_timer)) {
+ if (!timer_pending(&watchdog_timer) &&
+ (watchdog_duration < 0 ||
+ (watchdog_duration >= 0 && time_before(jiffies, watchdog_end_jiffies)))) {
watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
@@ -559,8 +564,10 @@ static void clocksource_watchdog(struct timer_list *unused)
static inline void clocksource_start_watchdog(void)
{
- if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+ if (watchdog_running || !watchdog || list_empty(&watchdog_list) || !watchdog_duration)
return;
+ if (watchdog_duration > 0)
+ watchdog_end_jiffies = jiffies + watchdog_duration * 60 * HZ;
timer_setup(&watchdog_timer, clocksource_watchdog, 0);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
On Thu, Dec 22, 2022 at 01:42:19PM -0800, Paul E. McKenney wrote:
> On Thu, Dec 22, 2022 at 10:24:46AM -0800, Paul E. McKenney wrote:
> > On Thu, Dec 22, 2022 at 02:37:24PM +0800, Feng Tang wrote:
> > > On Wed, Dec 21, 2022 at 10:14:29PM -0800, Paul E. McKenney wrote:
> > > > On Thu, Dec 22, 2022 at 02:00:42PM +0800, Feng Tang wrote:
> > > > > On Wed, Dec 21, 2022 at 09:55:15PM -0800, Paul E. McKenney wrote:
> > > > > > On Wed, Dec 21, 2022 at 10:39:53PM -0500, Waiman Long wrote:
>
> [ . . . ]
>
> > > > > > > As I currently understand, you are trying to use TSC as a watchdog to check
> > > > > > > against HPET and PMTMR. I do have 2 questions about this patch.
> > > > > > >
> > > > > > > First of all, why you need to use both HPET and PMTMR? Can you just use one
> > > > > > > of those that are available. Secondly, is it possible to enable this
> > > > > > > time-skew diagnostic for a limit amount of time instead running
> > > > > > > indefinitely? The running of the clocksource watchdog itself will still
> > > > > > > consume a tiny amount of CPU cycles.
> > > > > >
> > > > > > I could certainly do something so that only the first of HPET and PMTMR
> > > > > > is checked. Could you give me a quick run-through of the advantages of
> > > > > > using only one? I would need to explain that in the commit log.
> > > > > >
> > > > > > Would it make sense to have a kernel boot variable giving the number of
> > > > > > minutes for which the watchdog was to run, with a default of zero
> > > > > > meaning "indefinitely"?
> > > > >
> > > > > We've discussed about the "os noise", which customer may really care.
> > > > > IIUC, this patch intends to test if HPET/PMTIMER HW is broken, so how
> > > > > about making it run for a number of minutes the default behavior.
> > > >
> > > > It is also intended to determine if TSC is broken, with NTP drift rates
> > > > used to determine which timer is at fault.
> > > >
> > > > OK, how about a Kconfig option for the number of minutes, set to whatever
> > > > you guys tell me? (Three minutes? Five minutes? Something else?)
> > > > People wanting to run it continuously could then build their kernels
> > > > with that Kconfig option set to zero.
> > >
> > > I don't have specific preference for 5 or 10 minutes, as long as it
> > > is a one time deal :)
> > >
> > > > > Also I've run the patch on a Alderlake system, with a fine acpi pm_timer
> > > > > and a fake broken pm_timer, and they both works without errors.
> > > >
> > > > Thank you! Did it correctly identify the fake broken pm_timer as being
> > > > broken? If so, may I have your Tested-by?
> > >
> > > On that Alderlake system, HPET will be disabled by kernel, and I
> > > manually increased the tsc frequency about 1/256 to make pm_timer
> > > look to have 1/256 deviation. And got dmesg like:
> > >
> > > [ 2.738554] clocksource: timekeeping watchdog on CPU3: Marking clocksource 'acpi_pm' as unstable because the skew is too large:
> > > [ 2.738558] clocksource: 'tsc' wd_nsec: 275956624 wd_now: 13aab38d0d wd_last: 1382c1144d mask: ffffffffffffffff
> > > [ 2.738564] clocksource: 'acpi_pm' cs_nsec: 277034651 cs_now: 731575 cs_last: 63f3cb mask: ffffff
> > > [ 2.738568] clocksource: 'tsc' (not 'acpi_pm') is current clocksource.
> > >
> > > The deviation is indeed about 1/256. And pm_timer won't be shown in /sys/:
> > >
> > > /sys/devices/system/clocksource/clocksource0/available_clocksource:tsc
> > > /sys/devices/system/clocksource/clocksource0/current_clocksource:tsc
> > >
> > > So feel free to add:
> > >
> > > Tested-by: Feng Tang <[email protected]>
> >
> > Thank you very much! I will apply this on my next rebase.
>
> But first, here is a prototype of the limited-time clocksource watchdog.
And here is the limited-watchdogging patch.
Thoughts?
Thanx, Paul
------------------------------------------------------------------------
commit 375e65d3055f9b379e5fbd449e69752cb69b4e19
Author: Paul E. McKenney <[email protected]>
Date: Thu Dec 22 15:20:37 2022 -0800
clocksource: Limit the number of watchdogged clocksources
When TSC is deemed trustworthy, the clocksource watchdog will verify
other clocksources against it. @@@ Why limit it? Needed from Waiman.
Maybe overhead and disturbance of additional checks? @@@
Therefore, supply a new tsc_watchdogged kernel boot parameter that
limits the number of clocksources that will be verified against TSC.
This parameter defaults to INT_MAX. A value of zero prevents any
verification.
Link: https://lore.kernel.org/all/[email protected]/
Reported-by: Waiman Long <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Feng Tang <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: <[email protected]>
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 68c597e5909a4..0e304e40c21fa 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6345,6 +6345,12 @@
with CPUID.16h support and partial CPUID.15h support.
Format: <unsigned int>
+ tsc_watchdogged= [X86]
+ Specify the limit on the number of clocksources
+ that will be verified against TSC in cases where
+ the TSC is deemed trustworthy. Defaults to
+ INT_MAX. Specify zero to avoid verification.
+
tsx= [X86] Control Transactional Synchronization
Extensions (TSX) feature in Intel processors that
support TSX control.
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index a53961c64a567..0fa15c4819082 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -8,7 +8,7 @@
extern void hpet_time_init(void);
extern void time_init(void);
extern bool pit_timer_init(void);
-extern bool tsc_clocksource_watchdog_disabled(void);
+extern void tsc_clocksource_watchdog_disabled(struct clocksource *csp);
extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index c8eb1ac5125ab..cf28b0abc06bd 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1091,8 +1091,7 @@ int __init hpet_enable(void)
if (!hpet_counting())
goto out_nohpet;
- if (tsc_clocksource_watchdog_disabled())
- clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
+ tsc_clocksource_watchdog_disabled(&clocksource_hpet);
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
if (id & HPET_ID_LEGSUP) {
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 924e877b95f31..6a1def7c02a6e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -53,6 +53,9 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
+static int max_tsc_watchdogged = INT_MAX;
+static atomic_t cur_tsc_watchdogged;
+
struct cyc2ns {
struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
seqcount_latch_t seq; /* 32 + 4 = 36 */
@@ -308,6 +311,14 @@ static int __init tsc_setup(char *str)
__setup("tsc=", tsc_setup);
+static int __init tsc_watchdogged_setup(char *str)
+{
+ max_tsc_watchdogged = simple_strtol(str, NULL, 0);
+ return 1;
+}
+
+__setup("tsc_watchdogged=", tsc_watchdogged_setup);
+
#define MAX_RETRIES 5
#define TSC_DEFAULT_THRESHOLD 0x20000
@@ -1186,9 +1197,18 @@ static void __init tsc_disable_clocksource_watchdog(void)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
-bool tsc_clocksource_watchdog_disabled(void)
+/*
+ * If the TSC is judged trustworthy and the limit on the number of
+ * to-be-watchdogged clocksources has not been exceeded, place the specified
+ * clocksource into must-verify state.
+ */
+void tsc_clocksource_watchdog_disabled(struct clocksource *csp)
{
- return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY);
+ if (clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY ||
+ atomic_inc_return(&cur_tsc_watchdogged) > max_tsc_watchdogged)
+ return;
+ pr_info("clocksource: '%s' will be checked by clocksource watchdog.\n", csp->name);
+ csp->flags |= CLOCK_SOURCE_MUST_VERIFY;
}
static void __init check_system_tsc_reliable(void)
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index 82338773602ca..8562f59ac27e9 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -211,8 +211,7 @@ static int __init init_acpi_pm_clocksource(void)
return -ENODEV;
}
- if (tsc_clocksource_watchdog_disabled())
- clocksource_acpi_pm.flags |= CLOCK_SOURCE_MUST_VERIFY;
+ tsc_clocksource_watchdog_disabled(&clocksource_acpi_pm);
return clocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC);
}
On Thu, Dec 22, 2022 at 01:42:19PM -0800, Paul E. McKenney wrote:
[...]
> > >
> > > > > Also I've run the patch on a Alderlake system, with a fine acpi pm_timer
> > > > > and a fake broken pm_timer, and they both works without errors.
> > > >
> > > > Thank you! Did it correctly identify the fake broken pm_timer as being
> > > > broken? If so, may I have your Tested-by?
> > >
> > > On that Alderlake system, HPET will be disabled by kernel, and I
> > > manually increased the tsc frequency about 1/256 to make pm_timer
> > > look to have 1/256 deviation. And got dmesg like:
> > >
> > > [ 2.738554] clocksource: timekeeping watchdog on CPU3: Marking clocksource 'acpi_pm' as unstable because the skew is too large:
> > > [ 2.738558] clocksource: 'tsc' wd_nsec: 275956624 wd_now: 13aab38d0d wd_last: 1382c1144d mask: ffffffffffffffff
> > > [ 2.738564] clocksource: 'acpi_pm' cs_nsec: 277034651 cs_now: 731575 cs_last: 63f3cb mask: ffffff
> > > [ 2.738568] clocksource: 'tsc' (not 'acpi_pm') is current clocksource.
> > >
> > > The deviation is indeed about 1/256. And pm_timer won't be shown in /sys/:
> > >
> > > /sys/devices/system/clocksource/clocksource0/available_clocksource:tsc
> > > /sys/devices/system/clocksource/clocksource0/current_clocksource:tsc
> > >
> > > So feel free to add:
> > >
> > > Tested-by: Feng Tang <[email protected]>
> >
> > Thank you very much! I will apply this on my next rebase.
>
> But first, here is a prototype of the limited-time clocksource watchdog.
> Thoughts?
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit dfbf67806c4c7f2bdd79cdefe86a2bea6e7afcab
> Author: Paul E. McKenney <[email protected]>
> Date: Thu Dec 22 13:21:47 2022 -0800
>
> clocksource: Permit limited-duration clocksource watchdogging
>
> Some systems want regular clocksource checking, but their workloads
> cannot tolerate the overhead of full-time clocksource checking.
> Therefore, add a CLOCKSOURCE_WATCHDOG_DURATION Kconfig option and a
> clocksource.watchdog_duration kernel-boot parameter that limits the
> clocksource watchdog to the specified number of minutes past boot.
> Values of zero disable checking, and a value of -1 restores the
> traditional behavior of always checking.
>
> This does change behavior compared to older kernels, but recent kernels
> disable the clocksource watchdog completely in the common case where the
> TSC is judged to be trustworthy. This change in behavior is therefore
> not a real issue.
Yes, this changes the general semantics. Last year, I've posted a
patch to limit the watchdog to run for 10 minutes, and at that time
Thomas mentioned one of his machines may show a tsc issue after running
for one day depending on work load [1].
As the intention is to validate HPET/PMTIMER, which are not as
delicate as TSC, maybe we can add a per-clocksource verify-period
field, and only set it for HPET/PMTIMER?
[1]. https://lore.kernel.org/lkml/[email protected]/
Thanks,
Feng
> Link: https://lore.kernel.org/all/[email protected]/
> Reported-by: Waiman Long <[email protected]>
> Reported-by: Feng Tang <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
>
> diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
> index bae8f11070bef..2676e011673d5 100644
> --- a/kernel/time/Kconfig
> +++ b/kernel/time/Kconfig
> @@ -209,5 +209,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
> per million. If the clocksource is good enough for NTP,
> it is good enough for the clocksource watchdog!
>
> +config CLOCKSOURCE_WATCHDOG_DURATION
> + int "Default time to run clocksource watchdog (in minutes)"
> + depends on CLOCKSOURCE_WATCHDOG
> + range -1 1440
> + default 10
> + help
> + Specify the default number of minutes that the clocksource
> + watchdog should run after being started. Specify -1 to run
> + indefinitely or zero to not run at all. This value may be
> + overridden using the clocksource.watchdog_duration kernel
> + boot parameter.
> +
> endmenu
> endif
> diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> index b153fce8c0e4b..c920c6c22e0fb 100644
> --- a/kernel/time/clocksource.c
> +++ b/kernel/time/clocksource.c
> @@ -213,6 +213,9 @@ module_param(max_cswd_read_retries, ulong, 0644);
> EXPORT_SYMBOL_GPL(max_cswd_read_retries);
> static int verify_n_cpus = 8;
> module_param(verify_n_cpus, int, 0644);
> +static int watchdog_duration = CONFIG_CLOCKSOURCE_WATCHDOG_DURATION;
> +module_param(watchdog_duration, int, 0444);
> +static unsigned long watchdog_end_jiffies;
>
> enum wd_read_status {
> WD_READ_SUCCESS,
> @@ -549,7 +552,9 @@ static void clocksource_watchdog(struct timer_list *unused)
> * Arm timer if not already pending: could race with concurrent
> * pair clocksource_stop_watchdog() clocksource_start_watchdog().
> */
> - if (!timer_pending(&watchdog_timer)) {
> + if (!timer_pending(&watchdog_timer) &&
> + (watchdog_duration < 0 ||
> + (watchdog_duration >= 0 && time_before(jiffies, watchdog_end_jiffies)))) {
> watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
> add_timer_on(&watchdog_timer, next_cpu);
> }
> @@ -559,8 +564,10 @@ static void clocksource_watchdog(struct timer_list *unused)
>
> static inline void clocksource_start_watchdog(void)
> {
> - if (watchdog_running || !watchdog || list_empty(&watchdog_list))
> + if (watchdog_running || !watchdog || list_empty(&watchdog_list) || !watchdog_duration)
> return;
> + if (watchdog_duration > 0)
> + watchdog_end_jiffies = jiffies + watchdog_duration * 60 * HZ;
> timer_setup(&watchdog_timer, clocksource_watchdog, 0);
> watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
> add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
On Fri, Dec 23, 2022 at 10:09:04AM +0800, Feng Tang wrote:
> On Thu, Dec 22, 2022 at 01:42:19PM -0800, Paul E. McKenney wrote:
> [...]
> > > >
> > > > > > Also I've run the patch on a Alderlake system, with a fine acpi pm_timer
> > > > > > and a fake broken pm_timer, and they both works without errors.
> > > > >
> > > > > Thank you! Did it correctly identify the fake broken pm_timer as being
> > > > > broken? If so, may I have your Tested-by?
> > > >
> > > > On that Alderlake system, HPET will be disabled by kernel, and I
> > > > manually increased the tsc frequency about 1/256 to make pm_timer
> > > > look to have 1/256 deviation. And got dmesg like:
> > > >
> > > > [ 2.738554] clocksource: timekeeping watchdog on CPU3: Marking clocksource 'acpi_pm' as unstable because the skew is too large:
> > > > [ 2.738558] clocksource: 'tsc' wd_nsec: 275956624 wd_now: 13aab38d0d wd_last: 1382c1144d mask: ffffffffffffffff
> > > > [ 2.738564] clocksource: 'acpi_pm' cs_nsec: 277034651 cs_now: 731575 cs_last: 63f3cb mask: ffffff
> > > > [ 2.738568] clocksource: 'tsc' (not 'acpi_pm') is current clocksource.
> > > >
> > > > The deviation is indeed about 1/256. And pm_timer won't be shown in /sys/:
> > > >
> > > > /sys/devices/system/clocksource/clocksource0/available_clocksource:tsc
> > > > /sys/devices/system/clocksource/clocksource0/current_clocksource:tsc
> > > >
> > > > So feel free to add:
> > > >
> > > > Tested-by: Feng Tang <[email protected]>
> > >
> > > Thank you very much! I will apply this on my next rebase.
> >
> > But first, here is a prototype of the limited-time clocksource watchdog.
> > Thoughts?
> >
> > Thanx, Paul
> >
> > ------------------------------------------------------------------------
> >
> > commit dfbf67806c4c7f2bdd79cdefe86a2bea6e7afcab
> > Author: Paul E. McKenney <[email protected]>
> > Date: Thu Dec 22 13:21:47 2022 -0800
> >
> > clocksource: Permit limited-duration clocksource watchdogging
> >
> > Some systems want regular clocksource checking, but their workloads
> > cannot tolerate the overhead of full-time clocksource checking.
> > Therefore, add a CLOCKSOURCE_WATCHDOG_DURATION Kconfig option and a
> > clocksource.watchdog_duration kernel-boot parameter that limits the
> > clocksource watchdog to the specified number of minutes past boot.
> > Values of zero disable checking, and a value of -1 restores the
> > traditional behavior of always checking.
> >
> > This does change behavior compared to older kernels, but recent kernels
> > disable the clocksource watchdog completely in the common case where the
> > TSC is judged to be trustworthy. This change in behavior is therefore
> > not a real issue.
>
> Yes, this changes the general semantics. Last year, I've posted a
> patch to limit the watchdog to run for 10 minutes, and at that time
> Thomas mentioned one of his machine may show tsc issue after running
> for one day depending on work load [1].
>
> As the intention is to validate HPET/PMTIMER, which are not as
> delicate as TSC, maybe we can add a per-clocksource verify-period
> field, and only set it for HPET/PMTIMER?
>
> [1]. https://lore.kernel.org/lkml/[email protected]/
Got it.
The workloads I am closest to are OK with the clocksource watchdog
running indefinitely, but thus far the skew is visible very early.
But broken hardware can do whatever it wants whenever it wants. I could
meet Thomas's request by making the default be indefinite, and allowing
whoever cares to make it finite. Or maybe the fact that the TSC is not
marked unstable makes a difference.
Thoughts?
Thanx, Paul
> Thanks,
> Feng
>
> > Link: https://lore.kernel.org/all/[email protected]/
> > Reported-by: Waiman Long <[email protected]>
> > Reported-by: Feng Tang <[email protected]>
> > Signed-off-by: Paul E. McKenney <[email protected]>
> >
> > diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
> > index bae8f11070bef..2676e011673d5 100644
> > --- a/kernel/time/Kconfig
> > +++ b/kernel/time/Kconfig
> > @@ -209,5 +209,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
> > per million. If the clocksource is good enough for NTP,
> > it is good enough for the clocksource watchdog!
> >
> > +config CLOCKSOURCE_WATCHDOG_DURATION
> > + int "Default time to run clocksource watchdog (in minutes)"
> > + depends on CLOCKSOURCE_WATCHDOG
> > + range -1 1440
> > + default 10
> > + help
> > + Specify the default number of minutes that the clocksource
> > + watchdog should run after being started. Specify -1 to run
> > + indefinitely or zero to not run at all. This value may be
> > + overridden using the clocksource.watchdog_duration kernel
> > + boot parameter.
> > +
> > endmenu
> > endif
> > diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> > index b153fce8c0e4b..c920c6c22e0fb 100644
> > --- a/kernel/time/clocksource.c
> > +++ b/kernel/time/clocksource.c
> > @@ -213,6 +213,9 @@ module_param(max_cswd_read_retries, ulong, 0644);
> > EXPORT_SYMBOL_GPL(max_cswd_read_retries);
> > static int verify_n_cpus = 8;
> > module_param(verify_n_cpus, int, 0644);
> > +static int watchdog_duration = CONFIG_CLOCKSOURCE_WATCHDOG_DURATION;
> > +module_param(watchdog_duration, int, 0444);
> > +static unsigned long watchdog_end_jiffies;
> >
> > enum wd_read_status {
> > WD_READ_SUCCESS,
> > @@ -549,7 +552,9 @@ static void clocksource_watchdog(struct timer_list *unused)
> > * Arm timer if not already pending: could race with concurrent
> > * pair clocksource_stop_watchdog() clocksource_start_watchdog().
> > */
> > - if (!timer_pending(&watchdog_timer)) {
> > + if (!timer_pending(&watchdog_timer) &&
> > + (watchdog_duration < 0 ||
> > + (watchdog_duration >= 0 && time_before(jiffies, watchdog_end_jiffies)))) {
> > watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
> > add_timer_on(&watchdog_timer, next_cpu);
> > }
> > @@ -559,8 +564,10 @@ static void clocksource_watchdog(struct timer_list *unused)
> >
> > static inline void clocksource_start_watchdog(void)
> > {
> > - if (watchdog_running || !watchdog || list_empty(&watchdog_list))
> > + if (watchdog_running || !watchdog || list_empty(&watchdog_list) || !watchdog_duration)
> > return;
> > + if (watchdog_duration > 0)
> > + watchdog_end_jiffies = jiffies + watchdog_duration * 60 * HZ;
> > timer_setup(&watchdog_timer, clocksource_watchdog, 0);
> > watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
> > add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
On Thu, Dec 22, 2022 at 10:49:23PM -0500, Waiman Long wrote:
> On 12/22/22 22:37, Paul E. McKenney wrote:
> > > > commit dfbf67806c4c7f2bdd79cdefe86a2bea6e7afcab
> > > > Author: Paul E. McKenney<[email protected]>
> > > > Date: Thu Dec 22 13:21:47 2022 -0800
> > > >
> > > > clocksource: Permit limited-duration clocksource watchdogging
> > > > Some systems want regular clocksource checking, but their workloads
> > > > cannot tolerate the overhead of full-time clocksource checking.
> > > > Therefore, add a CLOCKSOURCE_WATCHDOG_DURATION Kconfig option and a
> > > > clocksource.watchdog_duration kernel-boot parameter that limits the
> > > > clocksource watchdog to the specified number of minutes past boot.
> > > > Values of zero disable checking, and a value of -1 restores the
> > > > traditional behavior of always checking.
> > > > This does change behavior compared to older kernels, but recent kernels
> > > > disable the clocksource watchdog completely in the common case where the
> > > > TSC is judged to be trustworthy. This change in behavior is therefore
> > > > not a real issue.
> > > Yes, this changes the general semantics. Last year, I've posted a
> > > patch to limit the watchdog to run for 10 minutes, and at that time
> > > Thomas mentioned one of his machine may show tsc issue after running
> > > for one day depending on work load [1].
> > >
> > > As the intention is to validate HPET/PMTIMER, which are not as
> > > delicate as TSC, maybe we can add a per-clocksource verify-period
> > > field, and only set it for HPET/PMTIMER?
> > >
> > > [1].https://lore.kernel.org/lkml/[email protected]/
> > Got it.
> >
> > The workloads I am closest to are OK with the clocksource watchdog
> > running indefinitely, but thus far the skew is visible very early.
> > But broken hardware can do whatever it wants whenever it wants. I could
> > meet Thomas's request by making the default be indefinite, and allowing
> > whoever cares to make it finite. Or maybe the fact that the TSC is not
> > marked unstable makes a difference.
> >
> > Thoughts?
>
> Sorry for the late reply.
>
> Maybe the default should be an auto mode where, if TSC is marked stable and
> doesn't need to be verified, we run the watchdog for HPET and PMTMR for 10 mins.
> Otherwise, run it indefinitely to not change existing behavior.
Yes, sounds reasonable to me.
btw, what I suggested in last mail is some code (untested) like this:
---
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index c8eb1ac5125a..db20aac5d14d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -862,6 +862,7 @@ static struct clocksource clocksource_hpet = {
.mask = HPET_MASK,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
+ .wd_limited = true,
};
/*
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 1d42d4b17327..2b6278f69516 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -125,6 +125,8 @@ struct clocksource {
struct list_head wd_list;
u64 cs_last;
u64 wd_last;
+ bool wd_limited;
+ u64 wd_iters;
#endif
struct module *owner;
};
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 777a5eba68fd..eb2d9adf06b0 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -425,6 +425,8 @@ static void clocksource_watchdog(struct timer_list *unused)
cs->flags |= CLOCK_SOURCE_WATCHDOG;
cs->wd_last = wdnow;
cs->cs_last = csnow;
+ if (cs->wd_limited)
+ cs->wd_iters = 1200;
continue;
}
@@ -492,6 +494,9 @@ static void clocksource_watchdog(struct timer_list *unused)
tick_clock_notify();
}
}
+
+ if (cs->wd_limited && !(cs->wd_iters--))
+ list_del_init(&cs->wd_list);
}
/*
Thanks,
Feng
> Given such a default, I don't think we need your second patch to determine
> if both HPET and PMTMR need to be checked.
>
> Cheers,
> Longman
On Fri, Dec 23, 2022 at 12:14:57PM +0800, Feng Tang wrote:
> On Thu, Dec 22, 2022 at 10:49:23PM -0500, Waiman Long wrote:
> > On 12/22/22 22:37, Paul E. McKenney wrote:
> > > > > commit dfbf67806c4c7f2bdd79cdefe86a2bea6e7afcab
> > > > > Author: Paul E. McKenney<[email protected]>
> > > > > Date: Thu Dec 22 13:21:47 2022 -0800
> > > > >
> > > > > clocksource: Permit limited-duration clocksource watchdogging
> > > > > Some systems want regular clocksource checking, but their workloads
> > > > > cannot tolerate the overhead of full-time clocksource checking.
> > > > > Therefore, add a CLOCKSOURCE_WATCHDOG_DURATION Kconfig option and a
> > > > > clocksource.watchdog_duration kernel-boot parameter that limits the
> > > > > clocksource watchdog to the specified number of minutes past boot.
> > > > > Values of zero disable checking, and a value of -1 restores the
> > > > > traditional behavior of always checking.
> > > > > This does change behavior compared to older kernels, but recent kernels
> > > > > disable the clocksource watchdog completely in the common case where the
> > > > > TSC is judged to be trustworthy. This change in behavior is therefore
> > > > > not a real issue.
> > > > Yes, this changes the general semantics. Last year, I've posted a
> > > > patch to limit the watchdog to run for 10 minutes, and at that time
> > > > Thomas mentioned one of his machine may show tsc issue after running
> > > > for one day depending on work load [1].
> > > >
> > > > As the intention is to validate HPET/PMTIMER, which are not as
> > > > delicate as TSC, maybe we can add a per-clocksource verify-period
> > > > field, and only set it for HPET/PMTIMER?
> > > >
> > > > [1].https://lore.kernel.org/lkml/[email protected]/
> > > Got it.
> > >
> > > The workloads I am closest to are OK with the clocksource watchdog
> > > running indefinitely, but thus far the skew is visible very early.
> > > But broken hardware can do whatever it wants whenever it wants. I could
> > > meet Thomas's request by making the default be indefinite, and allowing
> > > whoever cares to make it finite. Or maybe the fact that the TSC is not
> > > marked unstable makes a difference.
> > >
> > > Thoughts?
> >
> > Sorry for the late reply.
> >
> > Maybe the default should be an auto mode where, if the TSC is marked stable
> > and doesn't need to be verified, we can run the watchdog for HPET and PMTMR
> > for 10 mins. Otherwise, run it indefinitely to not change existing behavior.
>
> Yes, sounds reasonable to me.
>
> btw, what I suggested in my last mail is some (untested) code like this:
>
> ---
> diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
> index c8eb1ac5125a..db20aac5d14d 100644
> --- a/arch/x86/kernel/hpet.c
> +++ b/arch/x86/kernel/hpet.c
> @@ -862,6 +862,7 @@ static struct clocksource clocksource_hpet = {
> .mask = HPET_MASK,
> .flags = CLOCK_SOURCE_IS_CONTINUOUS,
> .resume = hpet_resume_counter,
> + .wd_limited = true,
> };
>
> /*
> diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
> index 1d42d4b17327..2b6278f69516 100644
> --- a/include/linux/clocksource.h
> +++ b/include/linux/clocksource.h
> @@ -125,6 +125,8 @@ struct clocksource {
> struct list_head wd_list;
> u64 cs_last;
> u64 wd_last;
> + bool wd_limited;
> + u64 wd_iters;
> #endif
> struct module *owner;
> };
> diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> index 777a5eba68fd..eb2d9adf06b0 100644
> --- a/kernel/time/clocksource.c
> +++ b/kernel/time/clocksource.c
> @@ -425,6 +425,8 @@ static void clocksource_watchdog(struct timer_list *unused)
> cs->flags |= CLOCK_SOURCE_WATCHDOG;
> cs->wd_last = wdnow;
> cs->cs_last = csnow;
> + if (cs->wd_limited)
> + cs->wd_iters = 1200;
> continue;
> }
>
> @@ -492,6 +494,9 @@ static void clocksource_watchdog(struct timer_list *unused)
> tick_clock_notify();
> }
> }
> +
> + if (cs->wd_limited && !(cs->wd_iters--))
> + list_del_init(&cs->wd_list);
> }
>
> /*
If this works for Waiman, and given a proper patch, I am happy to replace
my b90f0a5cfc0e ("clocksource: Permit limited-duration clocksource
watchdogging") with that proper patch.
> Thanks,
> Feng
>
> > Given such a default, I don't think we need your second patch to determine
> > if both HPET and PMTMR need to be checked.
And indeed, this one is off to the side, not on track for mainline:
375e65d3055f ("clocksource: Limit the number of watchdogged clocksources")
But if I am confused about which patch you mean, please let me know!
Thanx, Paul