From: Bjorn Helgaas <[email protected]>
62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
platform") implemented force_disable_hpet() as a special early quirk.
These run before the PCI core is initialized and depend on the
x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.
But force_disable_hpet() doesn't need to be one of these special early
quirks. It merely sets "boot_hpet_disable", which is tested by
is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
hpet_enable() is an fs_initcall(), so it runs after the PCI core is
initialized.
Convert force_disable_hpet() to the standard PCI quirk mechanism.
Signed-off-by: Bjorn Helgaas <[email protected]>
Cc: Feng Tang <[email protected]>
Cc: Kai-Heng Feng <[email protected]>
---
arch/x86/kernel/early-quirks.c | 24 ------------------------
arch/x86/pci/fixup.c | 25 +++++++++++++++++++++++++
2 files changed, 25 insertions(+), 24 deletions(-)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a4b5af03dcc1..674967fc1071 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -604,14 +604,6 @@ static void __init intel_graphics_quirks(int num, int slot, int func)
}
}
-static void __init force_disable_hpet(int num, int slot, int func)
-{
-#ifdef CONFIG_HPET_TIMER
- boot_hpet_disable = true;
- pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
-#endif
-}
-
#define BCM4331_MMIO_SIZE 16384
#define BCM4331_PM_CAP 0x40
#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
@@ -701,22 +693,6 @@ static struct chipset early_qrk[] __initdata = {
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
QFLAG_APPLY_ONCE, intel_graphics_quirks },
- /*
- * HPET on the current version of the Baytrail platform has accuracy
- * problems: it will halt in deep idle state - so we disable it.
- *
- * More details can be found in section 18.10.1.3 of the datasheet:
- *
- * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
- */
- { PCI_VENDOR_ID_INTEL, 0x0f00,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3e20,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3ec4,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x8a12,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
{ PCI_VENDOR_ID_BROADCOM, 0x4331,
PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 0a0e168be1cb..865bc3c5188b 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -780,3 +780,28 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
#endif
+
+/*
+ * HPET on the current version of the Baytrail platform has accuracy
+ * problems: it will halt in deep idle state - so we disable it.
+ *
+ * More details can be found in section 18.10.1.3 of the datasheet
+ * (Intel Document Number 332065-003, March 2016):
+ *
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
+ */
+static void force_disable_hpet(struct pci_dev *dev)
+{
+#ifdef CONFIG_HPET_TIMER
+ boot_hpet_disable = true;
+ pci_info(dev, "x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
+#endif
+}
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x0f00,
+ PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x3e20,
+ PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x3ec4,
+ PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x8a12,
+ PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
--
2.25.1
On Thu, Nov 19, 2020 at 12:19:04PM -0600, Bjorn Helgaas wrote:
> From: Bjorn Helgaas <[email protected]>
>
> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
> platform") implemented force_disable_hpet() as a special early quirk.
> These run before the PCI core is initialized and depend on the
> x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.
>
> But force_disable_hpet() doesn't need to be one of these special early
> quirks. It merely sets "boot_hpet_disable", which is tested by
> is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
> hpet_enable() is an fs_initcall(), so it runs after the PCI core is
> initialized.
>
> Convert force_disable_hpet() to the standard PCI quirk mechanism.
>
> Signed-off-by: Bjorn Helgaas <[email protected]>
> Cc: Feng Tang <[email protected]>
> Cc: Kai-Heng Feng <[email protected]>
I applied this to pci/misc for v5.11. Let me know if you see any
issues.
> ---
> arch/x86/kernel/early-quirks.c | 24 ------------------------
> arch/x86/pci/fixup.c | 25 +++++++++++++++++++++++++
> 2 files changed, 25 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
> index a4b5af03dcc1..674967fc1071 100644
> --- a/arch/x86/kernel/early-quirks.c
> +++ b/arch/x86/kernel/early-quirks.c
> @@ -604,14 +604,6 @@ static void __init intel_graphics_quirks(int num, int slot, int func)
> }
> }
>
> -static void __init force_disable_hpet(int num, int slot, int func)
> -{
> -#ifdef CONFIG_HPET_TIMER
> - boot_hpet_disable = true;
> - pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
> -#endif
> -}
> -
> #define BCM4331_MMIO_SIZE 16384
> #define BCM4331_PM_CAP 0x40
> #define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
> @@ -701,22 +693,6 @@ static struct chipset early_qrk[] __initdata = {
> PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
> { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
> QFLAG_APPLY_ONCE, intel_graphics_quirks },
> - /*
> - * HPET on the current version of the Baytrail platform has accuracy
> - * problems: it will halt in deep idle state - so we disable it.
> - *
> - * More details can be found in section 18.10.1.3 of the datasheet:
> - *
> - * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
> - */
> - { PCI_VENDOR_ID_INTEL, 0x0f00,
> - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> - { PCI_VENDOR_ID_INTEL, 0x3e20,
> - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> - { PCI_VENDOR_ID_INTEL, 0x3ec4,
> - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> - { PCI_VENDOR_ID_INTEL, 0x8a12,
> - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> { PCI_VENDOR_ID_BROADCOM, 0x4331,
> PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
> {}
> diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
> index 0a0e168be1cb..865bc3c5188b 100644
> --- a/arch/x86/pci/fixup.c
> +++ b/arch/x86/pci/fixup.c
> @@ -780,3 +780,28 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
> DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
>
> #endif
> +
> +/*
> + * HPET on the current version of the Baytrail platform has accuracy
> + * problems: it will halt in deep idle state - so we disable it.
> + *
> + * More details can be found in section 18.10.1.3 of the datasheet
> + * (Intel Document Number 332065-003, March 2016):
> + *
> + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
> + */
> +static void force_disable_hpet(struct pci_dev *dev)
> +{
> +#ifdef CONFIG_HPET_TIMER
> + boot_hpet_disable = true;
> + pci_info(dev, "x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
> +#endif
> +}
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x0f00,
> + PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x3e20,
> + PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x3ec4,
> + PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x8a12,
> + PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
> --
> 2.25.1
>
On Thu, Nov 19 2020 at 12:19, Bjorn Helgaas wrote:
> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
> platform") implemented force_disable_hpet() as a special early quirk.
> These run before the PCI core is initialized and depend on the
> x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.
>
> But force_disable_hpet() doesn't need to be one of these special early
> quirks. It merely sets "boot_hpet_disable", which is tested by
> is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
> hpet_enable() is an fs_initcall(), so it runs after the PCI core is
> initialized.
hpet_enable() is not an fs_initcall(). hpet_late_init() is and that
invokes hpet_enable() only for the case that ACPI did not advertise it
and the force_hpet quirk provided a base address.
But hpet_enable() is also invoked via:
start_kernel()
late_time_init()
x86_late_time_init()
hpet_time_init()
which is way before the PCI core is available and we really don't want
to set it up there if it's known to be broken :)
Now the more interesting question is why this needs to be a PCI quirk in
the first place. Can't we just disable the HPET based on family/model
quirks?
e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")
62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")
I might be missing something here, but in general on anything modern
HPET is mostly useless.
Thanks,
tglx
On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
> On Thu, Nov 19 2020 at 12:19, Bjorn Helgaas wrote:
> > 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
> > platform") implemented force_disable_hpet() as a special early quirk.
> > These run before the PCI core is initialized and depend on the
> > x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.
> >
> > But force_disable_hpet() doesn't need to be one of these special early
> > quirks. It merely sets "boot_hpet_disable", which is tested by
> > is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
> > hpet_enable() is an fs_initcall(), so it runs after the PCI core is
> > initialized.
>
> hpet_enable() is not an fs_initcall(). hpet_late_init() is and that
> invokes hpet_enable() only for the case that ACPI did not advertise it
> and the force_hpet quirk provided a base address.
>
> But hpet_enable() is also invoked via:
>
> start_kernel()
> late_time_init()
> x86_late_time_init()
> hpet_time_init()
>
> which is way before the PCI core is available and we really don't want
> to set it up there if it's known to be broken :)
Wow, I really blew that, don't know how I missed that path. Thanks
for catching this! I'll drop this patch.
> Now the more interesting question is why this needs to be a PCI quirk in
> the first place. Can't we just disable the HPET based on family/model
> quirks?
You mean like a CPUID check or something? I'm all in favor of doing
something that doesn't depend on PCI.
> e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
> f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
> fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")
> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")
>
> I might be missing something here, but in general on anything modern
> HPET is mostly useless.
>
> Thanks,
>
> tglx
>
On Wed, Nov 25 2020 at 13:13, Bjorn Helgaas wrote:
> On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
>> Now the more interesting question is why this needs to be a PCI quirk in
>> the first place. Can't we just disable the HPET based on family/model
>> quirks?
>
> You mean like a CPUID check or something? I'm all in favor of doing
> something that doesn't depend on PCI.
The reason why these are PCI quirks is that the HPET is not part of the
CPU core. It's usually part of 00:00:0 (aka. host bridge) and the legacy
mess which resides there.
OTOH that thing is integrated into the actual chip nowadays and these
quirks are platform quirks, so the CPUID and the PCI vendor/device codes
should be the same for a particular model.
But what do I know. This whole platform/cpuid mess is impenetrable even
for people who have access to the Intel internal documentation...
But, the point is that HPET does not provide any value on contemporary
CPUs assumed that Intel finally decided that TSC and the TSC deadline
timer are crucial system components along with the ability to get the
correct frequency of these beasts from firmware/cpuid.
So if parts of a particular chip generation, which we can determine by
CPUID (family, model) has issues with HPET, then there is no value in
preserving HPET for the N out of M submodels which are not affected even
if we can differentiate them by the host bridge device id.
But as I said above, what do I know. The Intel wizards which might have
better insight into this should speak up and come forth with objections.
Otherwise I just go and disable HPET support for the CPU models which
are covered by these PCI quirks today.
Ideally we can just disable it for anything more modern as well, but of
course that requires that future devices have proper frequency
enumeration and Intel prevents the OEMs to screw that up with their
"value add" BIOS/SMM machinery. Hope dies last...
Thanks,
tglx
Hi Thomas,
On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
> On Thu, Nov 19 2020 at 12:19, Bjorn Helgaas wrote:
> > 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
> > platform") implemented force_disable_hpet() as a special early quirk.
> > These run before the PCI core is initialized and depend on the
> > x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.
> >
> > But force_disable_hpet() doesn't need to be one of these special early
> > quirks. It merely sets "boot_hpet_disable", which is tested by
> > is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
> > hpet_enable() is an fs_initcall(), so it runs after the PCI core is
> > initialized.
>
> hpet_enable() is not an fs_initcall(). hpet_late_init() is and that
> invokes hpet_enable() only for the case that ACPI did not advertise it
> and the force_hpet quirk provided a base address.
>
> But hpet_enable() is also invoked via:
>
> start_kernel()
> late_time_init()
> x86_late_time_init()
> hpet_time_init()
>
> which is way before the PCI core is available and we really don't want
> to set it up there if it's known to be broken :)
>
> Now the more interesting question is why this needs to be a PCI quirk in
> the first place. Can't we just disable the HPET based on family/model
> quirks?
>
> e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
> f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
> fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")
> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")
I added this commit, and I can explain some for Baytrail. There was
some discussion about the way to disable it:
https://lore.kernel.org/lkml/20140328073718.GA12762@feng-snb/t/
It uses PCI ID early quirk in the hope that later Baytrail stepping
doesn't have the problem. And later on, there was official document
(section 18.10.1.3 http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf)
stating Baytrail's HPET halts in deep idle. So I think your way of
using CPUID to disable Baytrail HPET makes more sense.
> I might be missing something here, but in general on anything modern
> HPET is mostly useless.
IIUC, nowadays HPET's main use is as a clocksource watchdog monitor.
And in one debug case, we found it still useful. The debug platform has
early serial console which prints many messages in early boot phase,
when the HPET is disabled, the software 'jiffies' clocksource will
be used as the monitor. Early printk will disable interrupts while
printing messages, and this could take quite long for a slow 115200
device, causing the periodic HW timer interrupt to get missed, and
making the 'jiffies' clocksource inaccurate, which will in turn
judge the TSC clocksource inaccurate, and disable it. (Adding Rui,
Len for more details)
Thanks,
Feng
> Thanks,
>
> tglx
Feng,
On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
>> Now the more interesting question is why this needs to be a PCI quirk in
>> the first place. Can't we just disable the HPET based on family/model
>> quirks?
>>
>> e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
>> f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
>> fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")
>> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")
> I added this commit, and I can explain some for Baytrail. There was
> some discussion about the way to disable it:
> https://lore.kernel.org/lkml/20140328073718.GA12762@feng-snb/t/
>
> It uses PCI ID early quirk in the hope that later Baytrail stepping
> doesn't have the problem. And later on, there was official document
> (section 18.10.1.3 http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf)
> stating Baytrail's HPET halts in deep idle. So I think your way of
> using CPUID to disable Baytrail HPET makes more sense.
Correct.
>> I might be missing something here, but in general on anything modern
>> HPET is mostly useless.
>
> IIUC, nowdays HPET's main use is as a clocksource watchdog monitor.
Plus for the TSC refined calibration, where it is really better than
anything else we have _if_ it is functional.
> And in one debug case, we found it still useful. The debug platform has
> early serial console which prints many messages in early boot phase,
> when the HPET is disabled, the software 'jiffies' clocksource will
> be used as the monitor. Early printk will disable interrupt will
> printing message, and this could be quite long for a slow 115200
> device, and cause the periodic HW timer interrupt get missed, and
> make the 'jiffies' clocksource not accurate, which will in turn
> judge the TSC clocksrouce inaccurate, and disablt it. (Adding Rui,
> Len for more details)
Yes, that can happen. But OTOH, we should start to think about the
requirements for using the TSC watchdog.
I'm inclined to lift that requirement when the CPU has:
1) X86_FEATURE_CONSTANT_TSC
2) X86_FEATURE_NONSTOP_TSC
3) X86_FEATURE_NONSTOP_TSC_S3
4) X86_FEATURE_TSC_ADJUST
5) At max. 4 sockets
After two decades of horrors we're finally at a point where TSC seems to
be halfways reliable and less abused by BIOS tinkerers. TSC_ADJUST was
really key as we can now detect even small modifications reliably and
the important point is that we can cure them as well (not pretty but
better than all other options).
The only nasty one in the list above is #5 because AFAIK there is still
no architectural guarantee for TSCs being synchronized on machines with
more than 4 sockets. I might be wrong, but then nobody told me.
The only reason I hate to disable HPET upfront at least during boot is
that HPET is the best mechanism for the refined TSC calibration. PMTIMER
sucks because it's slow and wraps around pretty quick.
So we could do the following even on platforms where HPET stops in some
magic PC? state:
- Register it during early boot as clocksource
- Prevent the enablement as clockevent and the chardev hpet timer muck
- Prevent the magic PC? state up to the point where the refined
TSC calibration is finished.
- Unregister it once the TSC has taken over as system clocksource and
enable the magic PC? state in which HPET gets dysfunctional.
Hmm?
Thanks,
tglx
Hi Thomas,
On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
> Feng,
>
> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> > On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
> >> Now the more interesting question is why this needs to be a PCI quirk in
> >> the first place. Can't we just disable the HPET based on family/model
> >> quirks?
> >>
> >> e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
> >> f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
> >> fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")
> >> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")
>
> > I added this commit, and I can explain some for Baytrail. There was
> > some discussion about the way to disable it:
> > https://lore.kernel.org/lkml/20140328073718.GA12762@feng-snb/t/
> >
> > It uses PCI ID early quirk in the hope that later Baytrail stepping
> > doesn't have the problem. And later on, there was official document
> > (section 18.10.1.3 http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf)
> > stating Baytrail's HPET halts in deep idle. So I think your way of
> > using CPUID to disable Baytrail HPET makes more sense.
>
> Correct.
>
> >> I might be missing something here, but in general on anything modern
> >> HPET is mostly useless.
> >
> > IIUC, nowdays HPET's main use is as a clocksource watchdog monitor.
>
> Plus for the TSC refined calibration, where it is really better than
> anything else we have _if_ it is functional.
>
> > And in one debug case, we found it still useful. The debug platform has
> > early serial console which prints many messages in early boot phase,
> > when the HPET is disabled, the software 'jiffies' clocksource will
> > be used as the monitor. Early printk will disable interrupt will
> > printing message, and this could be quite long for a slow 115200
> > device, and cause the periodic HW timer interrupt get missed, and
> > make the 'jiffies' clocksource not accurate, which will in turn
> > judge the TSC clocksrouce inaccurate, and disablt it. (Adding Rui,
> > Len for more details)
>
> Yes, that can happen. But OTOH, we should start to think about the
> requirements for using the TSC watchdog.
>
> I'm inclined to lift that requirement when the CPU has:
>
> 1) X86_FEATURE_CONSTANT_TSC
> 2) X86_FEATURE_NONSTOP_TSC
> 3) X86_FEATURE_NONSTOP_TSC_S3
IIUC, this feature exists for several generations of Atom platforms,
and it is always coupled with 1) and 2), so it could be skipped for
the checking.
> 4) X86_FEATURE_TSC_ADJUST
>
> 5) At max. 4 sockets
>
> After two decades of horrors we're finally at a point where TSC seems to
> be halfways reliable and less abused by BIOS tinkerers. TSC_ADJUST was
> really key as we can now detect even small modifications reliably and
> the important point is that we can cure them as well (not pretty but
> better than all other options).
>
> The only nasty one in the list above is #5 because AFAIK there is still
> no architecural guarantee for TSCs being synchronized on machines with
> more than 4 sockets. I might be wrong, but then nobody told me.
>
> The only reason I hate to disable HPET upfront at least during boot is
> that HPET is the best mechanism for the refined TSC calibration. PMTIMER
> sucks because it's slow and wraps around pretty quick.
>
> So we could do the following even on platforms where HPET stops in some
> magic PC? state:
>
> - Register it during early boot as clocksource
>
> - Prevent the enablement as clockevent and the chardev hpet timer muck
>
> - Prevent the magic PC? state up to the point where the refined
> TSC calibration is finished.
>
> - Unregister it once the TSC has taken over as system clocksource and
> enable the magic PC? state in which HPET gets disfunctional.
This looks reasonable to me.
I have thought about lowering the hpet rating to lower than PMTIMER, so it
still contributes in early boot phase, and fades out after PMTIMER is
initialised.
Thanks,
Feng
> Hmm?
>
> Thanks,
>
> tglx
>
>
Feng,
On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
> On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
>> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
>> Yes, that can happen. But OTOH, we should start to think about the
>> requirements for using the TSC watchdog.
>>
>> I'm inclined to lift that requirement when the CPU has:
>>
>> 1) X86_FEATURE_CONSTANT_TSC
>> 2) X86_FEATURE_NONSTOP_TSC
>
>> 3) X86_FEATURE_NONSTOP_TSC_S3
> IIUC, this feature exists for several generations of Atom platforms,
> and it is always coupled with 1) and 2), so it could be skipped for
> the checking.
Yes, we can ignore that bit as it's not widely available and not
required to solve the problem.
>> 4) X86_FEATURE_TSC_ADJUST
>>
>> 5) At max. 4 sockets
>>
>> The only reason I hate to disable HPET upfront at least during boot is
>> that HPET is the best mechanism for the refined TSC calibration. PMTIMER
>> sucks because it's slow and wraps around pretty quick.
>>
>> So we could do the following even on platforms where HPET stops in some
>> magic PC? state:
>>
>> - Register it during early boot as clocksource
>>
>> - Prevent the enablement as clockevent and the chardev hpet timer muck
>>
>> - Prevent the magic PC? state up to the point where the refined
>> TSC calibration is finished.
>>
>> - Unregister it once the TSC has taken over as system clocksource and
>> enable the magic PC? state in which HPET gets disfunctional.
>
> This looks reasonable to me.
>
> I have thought about lowering the hpet rating to lower than PMTIMER, so it
> still contributes in early boot phase, and fades out after PMTIMER is
> initialised.
Not a good idea. pm_timer is initialized before the refined calibration
finishes.
Thanks,
tglx
Hi Thomas,
On Mon, Nov 30, 2020 at 08:21:03PM +0100, Thomas Gleixner wrote:
> Feng,
>
> On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
> > On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
> >> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> >> Yes, that can happen. But OTOH, we should start to think about the
> >> requirements for using the TSC watchdog.
> >>
> >> I'm inclined to lift that requirement when the CPU has:
> >>
> >> 1) X86_FEATURE_CONSTANT_TSC
> >> 2) X86_FEATURE_NONSTOP_TSC
> >
> >> 3) X86_FEATURE_NONSTOP_TSC_S3
> > IIUC, this feature exists for several generations of Atom platforms,
> > and it is always coupled with 1) and 2), so it could be skipped for
> > the checking.
>
> Yes, we can ignore that bit as it's not widely available and not
> required to solve the problem.
>
> >> 4) X86_FEATURE_TSC_ADJUST
> >>
> >> 5) At max. 4 sockets
> >>
> >> The only reason I hate to disable HPET upfront at least during boot is
> >> that HPET is the best mechanism for the refined TSC calibration. PMTIMER
> >> sucks because it's slow and wraps around pretty quick.
> >>
> >> So we could do the following even on platforms where HPET stops in some
> >> magic PC? state:
> >>
> >> - Register it during early boot as clocksource
> >>
> >> - Prevent the enablement as clockevent and the chardev hpet timer muck
> >>
> >> - Prevent the magic PC? state up to the point where the refined
> >> TSC calibration is finished.
> >>
> >> - Unregister it once the TSC has taken over as system clocksource and
> >> enable the magic PC? state in which HPET gets disfunctional.
> >
> > This looks reasonable to me.
> >
> > I have thought about lowering the hpet rating to lower than PMTIMER, so it
> > still contributes in early boot phase, and fades out after PMTIMER is
> > initialised.
>
> Not a good idea. pm_timer is initialized before the refined calibration
> finishes.
Yes, you are right. I missed the part.
I dug some old notes, and found another old case (kernel 3.4) where a
broken PMTIMER as the watchdog clocksource wrongly judged the TSC to be
unstable and disabled it, which makes me agree more with the idea of
"lift that requirement when the CPU has ..."
If the TSC has those bits to guarantee its reliability, then there is
no need to use a less reliable clocksource to monitor it.
Thanks,
Feng
On Mon, 2020-11-30 at 20:21 +0100, Thomas Gleixner wrote:
> Feng,
>
> On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
> > On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
> > > On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> > > Yes, that can happen. But OTOH, we should start to think about
> > > the
> > > requirements for using the TSC watchdog.
My original proposal is to disable jiffies and refined-jiffies as the
clocksource watchdog, because they are not reliable and it's better to
use clocksource that has a hardware counter as watchdog, like the patch
below, which I didn't sent out for upstream.
From cf9ce0ecab8851a3745edcad92e072022af3dbd9 Mon Sep 17 00:00:00 2001
From: Zhang Rui <[email protected]>
Date: Fri, 19 Jun 2020 22:03:23 +0800
Subject: [RFC PATCH] time/clocksource: do not use refined-jiffies as watchdog
On IA platforms, if HPET is disabled, either via x86 early-quirks, or
via kernel commandline, refined-jiffies will be used as clocksource
watchdog in early boot phase, before acpi_pm timer registered.
This is not a problem if jiffies are accurate.
But in some cases, for example, when serial console is enabled, it may
take several milliseconds to write to the console, with irq disabled,
frequently. Thus many ticks may become longer than it should be.
Using refined-jiffies as watchdog in this case breaks the system because
a) duration calculated by refined-jiffies watchdog is always consistent
with the watchdog timeout issued using add_timer(), say, around 500ms.
b) duration calculated by the running clocksource, usually TSC on IA
platforms, reflects the real time cost, which may be much larger.
This results in the running clocksource being disabled erroneously.
This is reproduced on ICL because HPET is disabled in x86 early-quirks,
and also reproduced on a KBL and a WHL platform when HPET is disabled
via command line.
BTW, commit fd329f276eca
("x86/mtrr: Skip cache flushes on CPUs with cache self-snooping") is
another example that refined-jiffies causes the same problem when ticks
become slow for some other reason.
IMO, the right solution is to only use hardware clocksource as watchdog.
Then even if ticks are slow, both the running clocksource and the watchdog
returns real time cost, and they still match.
Signed-off-by: Zhang Rui <[email protected]>
---
kernel/time/clocksource.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 02441ead3c3b..e7e703858fa6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -364,6 +364,10 @@ static void clocksource_select_watchdog(bool fallback)
watchdog = NULL;
list_for_each_entry(cs, &clocksource_list, list) {
+ /* Do not use refined-jiffies as clocksource watchdog */
+ if (cs->rating <= 2)
+ continue;
+
/* cs is a clocksource to be watched. */
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
continue;
--
2.17.1
> > >
> > > I'm inclined to lift that requirement when the CPU has:
> > >
> > > 1) X86_FEATURE_CONSTANT_TSC
> > > 2) X86_FEATURE_NONSTOP_TSC
> > > 3) X86_FEATURE_NONSTOP_TSC_S3
> >
> > IIUC, this feature exists for several generations of Atom
> > platforms,
> > and it is always coupled with 1) and 2), so it could be skipped for
> > the checking.
>
> Yes, we can ignore that bit as it's not widely available and not
> required to solve the problem.
>
> > > 4) X86_FEATURE_TSC_ADJUST
> > >
> > > 5) At max. 4 sockets
> > >
Should we consider some other corner cases, like when TSC is not used as
the clocksource? The refined-jiffies watchdog can break any other hardware
clocksource when it becomes inaccurate.
> > > The only reason I hate to disable HPET upfront at least during
> > > boot is
> > > that HPET is the best mechanism for the refined TSC calibration.
> > > PMTIMER
> > > sucks because it's slow and wraps around pretty quick.
> > >
> > > So we could do the following even on platforms where HPET stops
> > > in some
> > > magic PC? state:
> > >
> > > - Register it during early boot as clocksource
> > >
> > > - Prevent the enablement as clockevent and the chardev hpet
> > > timer muck
> > >
> > > - Prevent the magic PC? state up to the point where the refined
> > > TSC calibration is finished.
> > >
> > > - Unregister it once the TSC has taken over as system
> > > clocksource and
> > > enable the magic PC? state in which HPET gets disfunctional.
> >
On the other hand, for ICL, the HPET problem is observed on early
samples, and I didn't reproduce the problem on new ones I have.
But I need to confirm internally if it is safe to re-enable it first.
thanks,
rui
> > This looks reasonable to me.
> >
> > I have thought about lowering the hpet rating to lower than
> > PMTIMER, so it
> > still contributes in early boot phase, and fades out after PMTIMER
> > is
> > initialised.
>
> Not a good idea. pm_timer is initialized before the refined
> calibration
> finishes.
>
> Thanks,
>
> tglx
On 2020/12/2 15:28, Zhang Rui wrote:
> On Mon, 2020-11-30 at 20:21 +0100, Thomas Gleixner wrote:
>> Feng,
>>
>> On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
>>> On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
>>>> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
>>>> Yes, that can happen. But OTOH, we should start to think about
>>>> the
>>>> requirements for using the TSC watchdog.
>
> My original proposal is to disable jiffies and refined-jiffies as the
> clocksource watchdog, because they are not reliable and it's better to
> use clocksource that has a hardware counter as watchdog, like the patch
> below, which I didn't sent out for upstream.
>
>>From cf9ce0ecab8851a3745edcad92e072022af3dbd9 Mon Sep 17 00:00:00 2001
> From: Zhang Rui <[email protected]>
> Date: Fri, 19 Jun 2020 22:03:23 +0800
> Subject: [RFC PATCH] time/clocksource: do not use refined-jiffies as watchdog
>
> On IA platforms, if HPET is disabled, either via x86 early-quirks, or
> via kernel commandline, refined-jiffies will be used as clocksource
> watchdog in early boot phase, before acpi_pm timer registered.
>
> This is not a problem if jiffies are accurate.
> But in some cases, for example, when serial console is enabled, it may
> take several milliseconds to write to the console, with irq disabled,
> frequently. Thus many ticks may become longer than it should be.
>
> Using refined-jiffies as watchdog in this case breaks the system because
> a) duration calculated by refined-jiffies watchdog is always consistent
> with the watchdog timeout issued using add_timer(), say, around 500ms.
> b) duration calculated by the running clocksource, usually TSC on IA
> platforms, reflects the real time cost, which may be much larger.
> This results in the running clocksource being disabled erroneously.
>
> This is reproduced on ICL because HPET is disabled in x86 early-quirks,
> and also reproduced on a KBL and a WHL platform when HPET is disabled
> via command line.
>
> BTW, commit fd329f276eca
> ("x86/mtrr: Skip cache flushes on CPUs with cache self-snooping") is
> another example that refined-jiffies causes the same problem when ticks
> become slow for some other reason.
Hi, Zhang Rui, we have met the same problem as you mentioned above. I have
tested the following modification. It can solve the problem. Do you have a plan
to push it upstream?
Thanks,
Liao Yu
>
> IMO, the right solution is to only use hardware clocksource as watchdog.
> Then even if ticks are slow, both the running clocksource and the watchdog
> returns real time cost, and they still match.
>
> Signed-off-by: Zhang Rui <[email protected]>
> ---
> kernel/time/clocksource.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> index 02441ead3c3b..e7e703858fa6 100644
> --- a/kernel/time/clocksource.c
> +++ b/kernel/time/clocksource.c
> @@ -364,6 +364,10 @@ static void clocksource_select_watchdog(bool fallback)
> watchdog = NULL;
>
> list_for_each_entry(cs, &clocksource_list, list) {
> + /* Do not use refined-jiffies as clocksource watchdog */
> + if (cs->rating <= 2)
> + continue;
> +
> /* cs is a clocksource to be watched. */
> if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
> continue;
On Fri, Sep 30, 2022 at 09:05:24AM +0800, Xiongfeng Wang wrote:
>
>
> On 2022/9/30 8:38, Feng Tang wrote:
> > On Thu, Sep 29, 2022 at 11:52:28PM +0800, Yu Liao wrote:
> >> On 2020/12/2 15:28, Zhang Rui wrote:
> >>> On Mon, 2020-11-30 at 20:21 +0100, Thomas Gleixner wrote:
> >>>> Feng,
> >>>>
> >>>> On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
> >>>>> On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
> >>>>>> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> >>>>>> Yes, that can happen. But OTOH, we should start to think about
> >>>>>> the
> >>>>>> requirements for using the TSC watchdog.
> >>>
> >>> My original proposal is to disable jiffies and refined-jiffies as the
> >>> clocksource watchdog, because they are not reliable and it's better to
> >>> use clocksource that has a hardware counter as watchdog, like the patch
> >>> below, which I didn't sent out for upstream.
> >>>
> >>> >From cf9ce0ecab8851a3745edcad92e072022af3dbd9 Mon Sep 17 00:00:00 2001
> >>> From: Zhang Rui <[email protected]>
> >>> Date: Fri, 19 Jun 2020 22:03:23 +0800
> >>> Subject: [RFC PATCH] time/clocksource: do not use refined-jiffies as watchdog
> >>>
> >>> On IA platforms, if HPET is disabled, either via x86 early-quirks, or
> >>> via kernel commandline, refined-jiffies will be used as clocksource
> >>> watchdog in early boot phase, before acpi_pm timer registered.
> >>>
> >>> This is not a problem if jiffies are accurate.
> >>> But in some cases, for example, when serial console is enabled, it may
> >>> take several milliseconds to write to the console, with irq disabled,
> >>> frequently. Thus many ticks may become longer than it should be.
> >>>
> >>> Using refined-jiffies as watchdog in this case breaks the system because
> >>> a) duration calculated by refined-jiffies watchdog is always consistent
> >>> with the watchdog timeout issued using add_timer(), say, around 500ms.
> >>> b) duration calculated by the running clocksource, usually TSC on IA
> >>> platforms, reflects the real time cost, which may be much larger.
> >>> This results in the running clocksource being disabled erroneously.
> >>>
> >>> This is reproduced on ICL because HPET is disabled in x86 early-quirks,
> >>> and also reproduced on a KBL and a WHL platform when HPET is disabled
> >>> via command line.
> >>>
> >>> BTW, commit fd329f276eca
> >>> ("x86/mtrr: Skip cache flushes on CPUs with cache self-snooping") is
> >>> another example that refined-jiffies causes the same problem when ticks
> >>> become slow for some other reason.
> >>
> >> Hi, Zhang Rui, we have met the same problem as you mentioned above. I have
> >> tested the following modification. It can solve the problem. Do you have plan
> >> to push it to upstream ?
> >
> > Hi Liao Yu,
> >
> > Could you provoide more details? Like, what ARCH is the platform (x86
> > or others), client or sever, if sever, how many sockets (2S/4S/8S)?
> >
> > The error kernel log will also be helpful.
>
> Hi, Feng Tang,
>
> It's a X86 Sever. lscpu print the following information:
>
> Architecture: x86_64
> CPU op-mode(s): 32-bit, 64-bit
> Byte Order: Little Endian
> Address sizes: 46 bits physical, 48 bits virtual
> CPU(s): 224
> On-line CPU(s) list: 0-223
> Thread(s) per core: 2
> Core(s) per socket: 28
> Socket(s): 4
> NUMA node(s): 4
> Vendor ID: GenuineIntel
> CPU family: 6
> Model: 85
> Model name: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
> Stepping: 4
> CPU MHz: 3199.379
> CPU max MHz: 3800.0000
> CPU min MHz: 1000.0000
> BogoMIPS: 5000.00
> Virtualization: VT-x
> L1d cache: 3.5 MiB
> L1i cache: 3.5 MiB
> L2 cache: 112 MiB
> L3 cache: 154 MiB
> NUMA node0 CPU(s): 0-27,112-139
> NUMA node1 CPU(s): 28-55,140-167
> NUMA node2 CPU(s): 56-83,168-195
> NUMA node3 CPU(s): 84-111,196-223
>
> Part of the kernel log is as follows.
>
> [ 1.144402] smp: Brought up 4 nodes, 224 CPUs
> [ 1.144402] smpboot: Max logical packages: 4
> [ 1.144402] smpboot: Total of 224 processors activated (1121097.93 BogoMIPS)
> [ 1.520003] clocksource: timekeeping watchdog on CPU2: Marking clocksource
> 'tsc-early' as unstable because the skew is too large:
> [ 1.520010] clocksource: 'refined-jiffies' wd_now:
> fffb7210 wd_last: fffb7018 mask: ffffffff
> [ 1.520013] clocksource: 'tsc-early' cs_now:
> 6606717afddd0 cs_last: 66065eff88ad4 mask: ffffffffffffffff
> [ 1.520015] tsc: Marking TSC unstable due to clocksource watchdog
> [ 5.164635] node 0 initialised, 98233092 pages in 4013ms
> [ 5.209294] node 3 initialised, 98923232 pages in 4057ms
> [ 5.220001] node 2 initialised, 99054870 pages in 4068ms
> [ 5.222282] node 1 initialised, 99054870 pages in 4070ms
Thanks Xiongfeng for the info.
Could you try the below patch? It is kinda extension of
b50db7095fe0 ("x86/tsc: Disable clocksource watchdog for TSC on qualified platorms")
which I have run limited test on some 4 sockets Haswell and Cascadelake
AP x86 servers.
Thanks,
Feng
---
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cafacb2e58cc..b4ea79cb1d1a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1217,7 +1217,7 @@ static void __init check_system_tsc_reliable(void)
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
- nr_online_nodes <= 2)
+ nr_online_nodes <= 8)
tsc_disable_clocksource_watchdog();
}
On 2022/9/30 8:38, Feng Tang wrote:
> On Thu, Sep 29, 2022 at 11:52:28PM +0800, Yu Liao wrote:
>> On 2020/12/2 15:28, Zhang Rui wrote:
>>> On Mon, 2020-11-30 at 20:21 +0100, Thomas Gleixner wrote:
>>>> Feng,
>>>>
>>>> On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
>>>>> On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
>>>>>> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
>>>>>> Yes, that can happen. But OTOH, we should start to think about
>>>>>> the
>>>>>> requirements for using the TSC watchdog.
>>>
>>> My original proposal is to disable jiffies and refined-jiffies as the
>>> clocksource watchdog, because they are not reliable and it's better to
>>> use clocksource that has a hardware counter as watchdog, like the patch
>>> below, which I didn't sent out for upstream.
>>>
>>> >From cf9ce0ecab8851a3745edcad92e072022af3dbd9 Mon Sep 17 00:00:00 2001
>>> From: Zhang Rui <[email protected]>
>>> Date: Fri, 19 Jun 2020 22:03:23 +0800
>>> Subject: [RFC PATCH] time/clocksource: do not use refined-jiffies as watchdog
>>>
>>> On IA platforms, if HPET is disabled, either via x86 early-quirks, or
>>> via kernel commandline, refined-jiffies will be used as clocksource
>>> watchdog in early boot phase, before acpi_pm timer registered.
>>>
>>> This is not a problem if jiffies are accurate.
>>> But in some cases, for example, when serial console is enabled, it may
>>> take several milliseconds to write to the console, with irq disabled,
>>> frequently. Thus many ticks may become longer than it should be.
>>>
>>> Using refined-jiffies as watchdog in this case breaks the system because
>>> a) duration calculated by refined-jiffies watchdog is always consistent
>>> with the watchdog timeout issued using add_timer(), say, around 500ms.
>>> b) duration calculated by the running clocksource, usually TSC on IA
>>> platforms, reflects the real time cost, which may be much larger.
>>> This results in the running clocksource being disabled erroneously.
>>>
>>> This is reproduced on ICL because HPET is disabled in x86 early-quirks,
>>> and also reproduced on a KBL and a WHL platform when HPET is disabled
>>> via command line.
>>>
>>> BTW, commit fd329f276eca
>>> ("x86/mtrr: Skip cache flushes on CPUs with cache self-snooping") is
>>> another example that refined-jiffies causes the same problem when ticks
>>> become slow for some other reason.
>>
>> Hi, Zhang Rui, we have met the same problem as you mentioned above. I have
>> tested the following modification. It can solve the problem. Do you have plan
>> to push it to upstream ?
>
> Hi Liao Yu,
>
> Could you provoide more details? Like, what ARCH is the platform (x86
> or others), client or sever, if sever, how many sockets (2S/4S/8S)?
>
> The error kernel log will also be helpful.
Hi, Feng Tang,
It's an x86 server. lscpu prints the following information:
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 46 bits physical, 48 bits virtual
CPU(s): 224
On-line CPU(s) list: 0-223
Thread(s) per core: 2
Core(s) per socket: 28
Socket(s): 4
NUMA node(s): 4
Vendor ID: GenuineIntel
CPU family: 6
Model: 85
Model name: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
Stepping: 4
CPU MHz: 3199.379
CPU max MHz: 3800.0000
CPU min MHz: 1000.0000
BogoMIPS: 5000.00
Virtualization: VT-x
L1d cache: 3.5 MiB
L1i cache: 3.5 MiB
L2 cache: 112 MiB
L3 cache: 154 MiB
NUMA node0 CPU(s): 0-27,112-139
NUMA node1 CPU(s): 28-55,140-167
NUMA node2 CPU(s): 56-83,168-195
NUMA node3 CPU(s): 84-111,196-223
Part of the kernel log is as follows.
[ 1.144402] smp: Brought up 4 nodes, 224 CPUs
[ 1.144402] smpboot: Max logical packages: 4
[ 1.144402] smpboot: Total of 224 processors activated (1121097.93 BogoMIPS)
[ 1.520003] clocksource: timekeeping watchdog on CPU2: Marking clocksource
'tsc-early' as unstable because the skew is too large:
[ 1.520010] clocksource: 'refined-jiffies' wd_now:
fffb7210 wd_last: fffb7018 mask: ffffffff
[ 1.520013] clocksource: 'tsc-early' cs_now:
6606717afddd0 cs_last: 66065eff88ad4 mask: ffffffffffffffff
[ 1.520015] tsc: Marking TSC unstable due to clocksource watchdog
[ 5.164635] node 0 initialised, 98233092 pages in 4013ms
[ 5.209294] node 3 initialised, 98923232 pages in 4057ms
[ 5.220001] node 2 initialised, 99054870 pages in 4068ms
[ 5.222282] node 1 initialised, 99054870 pages in 4070ms
Thanks,
Xiongfeng
>
> Thanks,
> Feng
>
>> Thanks,
>> Liao Yu
>>
>>>
>>> IMO, the right solution is to only use hardware clocksource as watchdog.
>>> Then even if ticks are slow, both the running clocksource and the watchdog
>>> returns real time cost, and they still match.
>>>
>>> Signed-off-by: Zhang Rui <[email protected]>
>>> ---
>>> kernel/time/clocksource.c | 4 ++++
>>> 1 file changed, 4 insertions(+)
>>>
>>> diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
>>> index 02441ead3c3b..e7e703858fa6 100644
>>> --- a/kernel/time/clocksource.c
>>> +++ b/kernel/time/clocksource.c
>>> @@ -364,6 +364,10 @@ static void clocksource_select_watchdog(bool fallback)
>>> watchdog = NULL;
>>>
>>> list_for_each_entry(cs, &clocksource_list, list) {
>>> + /* Do not use refined-jiffies as clocksource watchdog */
>>> + if (cs->rating <= 2)
>>> + continue;
>>> +
>>> /* cs is a clocksource to be watched. */
>>> if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
>>> continue;
>>
> .
>
On Thu, Sep 29, 2022 at 11:52:28PM +0800, Yu Liao wrote:
> On 2020/12/2 15:28, Zhang Rui wrote:
> > On Mon, 2020-11-30 at 20:21 +0100, Thomas Gleixner wrote:
> >> Feng,
> >>
> >> On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
> >>> On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
> >>>> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> >>>> Yes, that can happen. But OTOH, we should start to think about
> >>>> the
> >>>> requirements for using the TSC watchdog.
> >
> > My original proposal is to disable jiffies and refined-jiffies as the
> > clocksource watchdog, because they are not reliable and it's better to
> > use clocksource that has a hardware counter as watchdog, like the patch
> > below, which I didn't sent out for upstream.
> >
> >>From cf9ce0ecab8851a3745edcad92e072022af3dbd9 Mon Sep 17 00:00:00 2001
> > From: Zhang Rui <[email protected]>
> > Date: Fri, 19 Jun 2020 22:03:23 +0800
> > Subject: [RFC PATCH] time/clocksource: do not use refined-jiffies as watchdog
> >
> > On IA platforms, if HPET is disabled, either via x86 early-quirks, or
> > via kernel commandline, refined-jiffies will be used as clocksource
> > watchdog in early boot phase, before acpi_pm timer registered.
> >
> > This is not a problem if jiffies are accurate.
> > But in some cases, for example, when serial console is enabled, it may
> > take several milliseconds to write to the console, with irq disabled,
> > frequently. Thus many ticks may become longer than it should be.
> >
> > Using refined-jiffies as watchdog in this case breaks the system because
> > a) duration calculated by refined-jiffies watchdog is always consistent
> > with the watchdog timeout issued using add_timer(), say, around 500ms.
> > b) duration calculated by the running clocksource, usually TSC on IA
> > platforms, reflects the real time cost, which may be much larger.
> > This results in the running clocksource being disabled erroneously.
> >
> > This is reproduced on ICL because HPET is disabled in x86 early-quirks,
> > and also reproduced on a KBL and a WHL platform when HPET is disabled
> > via command line.
> >
> > BTW, commit fd329f276eca
> > ("x86/mtrr: Skip cache flushes on CPUs with cache self-snooping") is
> > another example that refined-jiffies causes the same problem when ticks
> > become slow for some other reason.
>
> Hi, Zhang Rui, we have met the same problem as you mentioned above. I have
> tested the following modification. It can solve the problem. Do you have plan
> to push it to upstream ?
Hi Liao Yu,
Could you provide more details? Like, what arch is the platform (x86
or others), client or server? If server, how many sockets (2S/4S/8S)?
The error kernel log will also be helpful.
Thanks,
Feng
> Thanks,
> Liao Yu
>
> >
> > IMO, the right solution is to only use hardware clocksource as watchdog.
> > Then even if ticks are slow, both the running clocksource and the watchdog
> > returns real time cost, and they still match.
> >
> > Signed-off-by: Zhang Rui <[email protected]>
> > ---
> > kernel/time/clocksource.c | 4 ++++
> > 1 file changed, 4 insertions(+)
> >
> > diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> > index 02441ead3c3b..e7e703858fa6 100644
> > --- a/kernel/time/clocksource.c
> > +++ b/kernel/time/clocksource.c
> > @@ -364,6 +364,10 @@ static void clocksource_select_watchdog(bool fallback)
> > watchdog = NULL;
> >
> > list_for_each_entry(cs, &clocksource_list, list) {
> > + /* Do not use refined-jiffies as clocksource watchdog */
> > + if (cs->rating <= 2)
> > + continue;
> > +
> > /* cs is a clocksource to be watched. */
> > if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
> > continue;
>
On Fri, Sep 30, 2022 at 05:45:29PM +0800, Yu Liao wrote:
[...]
> >>>>
> >>>> Hi, Zhang Rui, we have met the same problem as you mentioned above. I have
> >>>> tested the following modification. It can solve the problem. Do you have plan
> >>>> to push it to upstream ?
> >>>
> >>> Hi Liao Yu,
> >>>
> >>> Could you provoide more details? Like, what ARCH is the platform (x86
> >>> or others), client or sever, if sever, how many sockets (2S/4S/8S)?
> >>>
> >>> The error kernel log will also be helpful.
> >>
> >> Hi, Feng Tang,
> >>
> >> It's a X86 Sever. lscpu print the following information:
> >>
> >> Architecture: x86_64
> >> CPU op-mode(s): 32-bit, 64-bit
> >> Byte Order: Little Endian
> >> Address sizes: 46 bits physical, 48 bits virtual
> >> CPU(s): 224
> >> On-line CPU(s) list: 0-223
> >> Thread(s) per core: 2
> >> Core(s) per socket: 28
> >> Socket(s): 4
> >> NUMA node(s): 4
> >> Vendor ID: GenuineIntel
> >> CPU family: 6
> >> Model: 85
> >> Model name: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
> >> Stepping: 4
> >> CPU MHz: 3199.379
> >> CPU max MHz: 3800.0000
> >> CPU min MHz: 1000.0000
> >> BogoMIPS: 5000.00
> >> Virtualization: VT-x
> >> L1d cache: 3.5 MiB
> >> L1i cache: 3.5 MiB
> >> L2 cache: 112 MiB
> >> L3 cache: 154 MiB
> >> NUMA node0 CPU(s): 0-27,112-139
> >> NUMA node1 CPU(s): 28-55,140-167
> >> NUMA node2 CPU(s): 56-83,168-195
> >> NUMA node3 CPU(s): 84-111,196-223
> >>
> >> Part of the kernel log is as follows.
> >>
> >> [ 1.144402] smp: Brought up 4 nodes, 224 CPUs
> >> [ 1.144402] smpboot: Max logical packages: 4
> >> [ 1.144402] smpboot: Total of 224 processors activated (1121097.93 BogoMIPS)
> >> [ 1.520003] clocksource: timekeeping watchdog on CPU2: Marking clocksource
> >> 'tsc-early' as unstable because the skew is too large:
> >> [ 1.520010] clocksource: 'refined-jiffies' wd_now:
> >> fffb7210 wd_last: fffb7018 mask: ffffffff
> >> [ 1.520013] clocksource: 'tsc-early' cs_now:
> >> 6606717afddd0 cs_last: 66065eff88ad4 mask: ffffffffffffffff
> >> [ 1.520015] tsc: Marking TSC unstable due to clocksource watchdog
> >> [ 5.164635] node 0 initialised, 98233092 pages in 4013ms
> >> [ 5.209294] node 3 initialised, 98923232 pages in 4057ms
> >> [ 5.220001] node 2 initialised, 99054870 pages in 4068ms
> >> [ 5.222282] node 1 initialised, 99054870 pages in 4070ms
> >
> > Thanks Xiaofeng for the info.
> >
> > Could you try the below patch? It is kinda extension of
> >
> > b50db7095fe0 ("x86/tsc: Disable clocksource watchdog for TSC on qualified platorms")
> >
> > which I have run limited test on some 4 sockets Haswell and Cascadelake
> > AP x86 servers.
> >
> >
> > Thanks,
> > Feng
> > ---
> > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > index cafacb2e58cc..b4ea79cb1d1a 100644
> > --- a/arch/x86/kernel/tsc.c
> > +++ b/arch/x86/kernel/tsc.c
> > @@ -1217,7 +1217,7 @@ static void __init check_system_tsc_reliable(void)
> > if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
> > boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
> > boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
> > - nr_online_nodes <= 2)
> > + nr_online_nodes <= 8)
> > tsc_disable_clocksource_watchdog();
> > }
> >
> >
> Hi Feng,
>
> I tested this patch on a previous server and it fixes the issue.
Thanks for the testing, please do let us know if there is any TSC
problem after long time or stress running.
Plan to send the patch for merging.
Thanks,
Feng
> Thanks,
> Yu
>
>
On 2022/9/30 9:15, Feng Tang wrote:
> On Fri, Sep 30, 2022 at 09:05:24AM +0800, Xiongfeng Wang wrote:
>>
>>
>> On 2022/9/30 8:38, Feng Tang wrote:
>>> On Thu, Sep 29, 2022 at 11:52:28PM +0800, Yu Liao wrote:
>>>> On 2020/12/2 15:28, Zhang Rui wrote:
>>>>> On Mon, 2020-11-30 at 20:21 +0100, Thomas Gleixner wrote:
>>>>>> Feng,
>>>>>>
>>>>>> On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
>>>>>>> On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
>>>>>>>> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
>>>>>>>> Yes, that can happen. But OTOH, we should start to think about
>>>>>>>> the
>>>>>>>> requirements for using the TSC watchdog.
>>>>>
>>>>> My original proposal is to disable jiffies and refined-jiffies as the
>>>>> clocksource watchdog, because they are not reliable and it's better to
>>>>> use clocksource that has a hardware counter as watchdog, like the patch
>>>>> below, which I didn't sent out for upstream.
>>>>>
>>>>> >From cf9ce0ecab8851a3745edcad92e072022af3dbd9 Mon Sep 17 00:00:00 2001
>>>>> From: Zhang Rui <[email protected]>
>>>>> Date: Fri, 19 Jun 2020 22:03:23 +0800
>>>>> Subject: [RFC PATCH] time/clocksource: do not use refined-jiffies as watchdog
>>>>>
>>>>> On IA platforms, if HPET is disabled, either via x86 early-quirks, or
>>>>> via kernel commandline, refined-jiffies will be used as clocksource
>>>>> watchdog in early boot phase, before acpi_pm timer registered.
>>>>>
>>>>> This is not a problem if jiffies are accurate.
>>>>> But in some cases, for example, when serial console is enabled, it may
>>>>> take several milliseconds to write to the console, with irq disabled,
>>>>> frequently. Thus many ticks may become longer than it should be.
>>>>>
>>>>> Using refined-jiffies as watchdog in this case breaks the system because
>>>>> a) duration calculated by refined-jiffies watchdog is always consistent
>>>>> with the watchdog timeout issued using add_timer(), say, around 500ms.
>>>>> b) duration calculated by the running clocksource, usually TSC on IA
>>>>> platforms, reflects the real time cost, which may be much larger.
>>>>> This results in the running clocksource being disabled erroneously.
>>>>>
>>>>> This is reproduced on ICL because HPET is disabled in x86 early-quirks,
>>>>> and also reproduced on a KBL and a WHL platform when HPET is disabled
>>>>> via command line.
>>>>>
>>>>> BTW, commit fd329f276eca
>>>>> ("x86/mtrr: Skip cache flushes on CPUs with cache self-snooping") is
>>>>> another example that refined-jiffies causes the same problem when ticks
>>>>> become slow for some other reason.
>>>>
>>>> Hi, Zhang Rui, we have met the same problem as you mentioned above. I have
>>>> tested the following modification. It can solve the problem. Do you have plan
>>>> to push it to upstream ?
>>>
>>> Hi Liao Yu,
>>>
>>> Could you provoide more details? Like, what ARCH is the platform (x86
>>> or others), client or sever, if sever, how many sockets (2S/4S/8S)?
>>>
>>> The error kernel log will also be helpful.
>>
>> Hi, Feng Tang,
>>
>> It's a X86 Sever. lscpu print the following information:
>>
>> Architecture: x86_64
>> CPU op-mode(s): 32-bit, 64-bit
>> Byte Order: Little Endian
>> Address sizes: 46 bits physical, 48 bits virtual
>> CPU(s): 224
>> On-line CPU(s) list: 0-223
>> Thread(s) per core: 2
>> Core(s) per socket: 28
>> Socket(s): 4
>> NUMA node(s): 4
>> Vendor ID: GenuineIntel
>> CPU family: 6
>> Model: 85
>> Model name: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
>> Stepping: 4
>> CPU MHz: 3199.379
>> CPU max MHz: 3800.0000
>> CPU min MHz: 1000.0000
>> BogoMIPS: 5000.00
>> Virtualization: VT-x
>> L1d cache: 3.5 MiB
>> L1i cache: 3.5 MiB
>> L2 cache: 112 MiB
>> L3 cache: 154 MiB
>> NUMA node0 CPU(s): 0-27,112-139
>> NUMA node1 CPU(s): 28-55,140-167
>> NUMA node2 CPU(s): 56-83,168-195
>> NUMA node3 CPU(s): 84-111,196-223
>>
>> Part of the kernel log is as follows.
>>
>> [ 1.144402] smp: Brought up 4 nodes, 224 CPUs
>> [ 1.144402] smpboot: Max logical packages: 4
>> [ 1.144402] smpboot: Total of 224 processors activated (1121097.93 BogoMIPS)
>> [ 1.520003] clocksource: timekeeping watchdog on CPU2: Marking clocksource
>> 'tsc-early' as unstable because the skew is too large:
>> [ 1.520010] clocksource: 'refined-jiffies' wd_now:
>> fffb7210 wd_last: fffb7018 mask: ffffffff
>> [ 1.520013] clocksource: 'tsc-early' cs_now:
>> 6606717afddd0 cs_last: 66065eff88ad4 mask: ffffffffffffffff
>> [ 1.520015] tsc: Marking TSC unstable due to clocksource watchdog
>> [ 5.164635] node 0 initialised, 98233092 pages in 4013ms
>> [ 5.209294] node 3 initialised, 98923232 pages in 4057ms
>> [ 5.220001] node 2 initialised, 99054870 pages in 4068ms
>> [ 5.222282] node 1 initialised, 99054870 pages in 4070ms
>
> Thanks Xiaofeng for the info.
>
> Could you try the below patch? It is kinda extension of
>
> b50db7095fe0 ("x86/tsc: Disable clocksource watchdog for TSC on qualified platorms")
>
> which I have run limited test on some 4 sockets Haswell and Cascadelake
> AP x86 servers.
>
>
> Thanks,
> Feng
> ---
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index cafacb2e58cc..b4ea79cb1d1a 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -1217,7 +1217,7 @@ static void __init check_system_tsc_reliable(void)
> if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
> boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
> boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
> - nr_online_nodes <= 2)
> + nr_online_nodes <= 8)
> tsc_disable_clocksource_watchdog();
> }
>
>
Hi Feng,
I tested this patch on a previous server and it fixes the issue.
Thanks,
Yu
On Fri, 2022-09-30 at 18:13 +0800, Feng Tang wrote:
> >
> On Fri, Sep 30, 2022 at 05:45:29PM +0800, Yu Liao wrote:
> [...]
> > > > > > Hi, Zhang Rui, we have met the same problem as you
> > > > > > mentioned above. I have
> > > > > > tested the following modification. It can solve the
> > > > > > problem. Do you have plan
> > > > > > to push it to upstream ?
> > > > >
> > > > > Hi Liao Yu,
> > > > >
> > > > > Could you provoide more details? Like, what ARCH is the
> > > > > platform (x86
> > > > > or others), client or sever, if sever, how many sockets
> > > > > (2S/4S/8S)?
> > > > >
> > > > > The error kernel log will also be helpful.
> > > >
> > > > Hi, Feng Tang,
> > > >
> > > > It's a X86 Sever. lscpu print the following information:
> > > >
> > > > Architecture: x86_64
> > > > CPU op-mode(s): 32-bit, 64-bit
> > > > Byte Order: Little Endian
> > > > Address sizes: 46 bits physical, 48 bits
> > > > virtual
> > > > CPU(s): 224
> > > > On-line CPU(s) list: 0-223
> > > > Thread(s) per core: 2
> > > > Core(s) per socket: 28
> > > > Socket(s): 4
> > > > NUMA node(s): 4
> > > > Vendor ID: GenuineIntel
> > > > CPU family: 6
> > > > Model: 85
> > > > Model name: Intel(R) Xeon(R) Platinum 8180
> > > > CPU @ 2.50GHz
> > > > Stepping: 4
> > > > CPU MHz: 3199.379
> > > > CPU max MHz: 3800.0000
> > > > CPU min MHz: 1000.0000
> > > > BogoMIPS: 5000.00
> > > > Virtualization: VT-x
> > > > L1d cache: 3.5 MiB
> > > > L1i cache: 3.5 MiB
> > > > L2 cache: 112 MiB
> > > > L3 cache: 154 MiB
> > > > NUMA node0 CPU(s): 0-27,112-139
> > > > NUMA node1 CPU(s): 28-55,140-167
> > > > NUMA node2 CPU(s): 56-83,168-195
> > > > NUMA node3 CPU(s): 84-111,196-223
> > > >
> > > > Part of the kernel log is as follows.
> > > >
> > > > [ 1.144402] smp: Brought up 4 nodes, 224 CPUs
> > > > [ 1.144402] smpboot: Max logical packages: 4
> > > > [ 1.144402] smpboot: Total of 224 processors activated
> > > > (1121097.93 BogoMIPS)
> > > > [ 1.520003] clocksource: timekeeping watchdog on CPU2:
> > > > Marking clocksource
> > > > 'tsc-early' as unstable because the skew is too large:
> > > > [ 1.520010] clocksource: 'refined-
> > > > jiffies' wd_now:
> > > > fffb7210 wd_last: fffb7018 mask: ffffffff
> > > > [ 1.520013] clocksource: 'tsc-early'
> > > > cs_now:
> > > > 6606717afddd0 cs_last: 66065eff88ad4 mask: ffffffffffffffff
> > > > [ 1.520015] tsc: Marking TSC unstable due to clocksource
> > > > watchdog
> > > > [ 5.164635] node 0 initialised, 98233092 pages in 4013ms
> > > > [ 5.209294] node 3 initialised, 98923232 pages in 4057ms
> > > > [ 5.220001] node 2 initialised, 99054870 pages in 4068ms
> > > > [ 5.222282] node 1 initialised, 99054870 pages in 4070ms
> > >
> > > Thanks Xiaofeng for the info.
> > >
> > > Could you try the below patch? It is kinda extension of
> > >
> > > b50db7095fe0 ("x86/tsc: Disable clocksource watchdog for TSC on
> > > qualified platorms")
> > >
> > > which I have run limited test on some 4 sockets Haswell and
> > > Cascadelake
> > > AP x86 servers.
> > >
> > >
> > > Thanks,
> > > Feng
> > > ---
> > > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > > index cafacb2e58cc..b4ea79cb1d1a 100644
> > > --- a/arch/x86/kernel/tsc.c
> > > +++ b/arch/x86/kernel/tsc.c
> > > @@ -1217,7 +1217,7 @@ static void __init
> > > check_system_tsc_reliable(void)
> > > if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
> > > boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
> > > boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
> > > - nr_online_nodes <= 2)
> > > + nr_online_nodes <= 8)
> > > tsc_disable_clocksource_watchdog();
> > > }
> > >
> > >
> > Hi Feng,
> >
> > I tested this patch on a previous server and it fixes the issue.
>
> Thanks for the testing, please do let us know if there is any TSC
> problem after long time or stress running.
>
> Plan to send the patch for merging.
>
Good to know.
This patch fixes the TSC issue for IA.
While the patch at
https://lore.kernel.org/lkml/YzZDLBKbDTbNr45b@feng-clx/T/#m34094630193e8320c6d75e9c8aeabe7633e051d2
impacts more arches, I'm not sure whether a similar problem is observed on
other arches as well. So I will hold it for now.
thanks,
rui
On Sat, Oct 01, 2022 at 01:18:05PM +0800, Zhang Rui wrote:
[...]
> > > > Thanks Xiaofeng for the info.
> > > >
> > > > Could you try the below patch? It is kinda extension of
> > > >
> > > > b50db7095fe0 ("x86/tsc: Disable clocksource watchdog for TSC on
> > > > qualified platorms")
> > > >
> > > > which I have run limited test on some 4 sockets Haswell and
> > > > Cascadelake
> > > > AP x86 servers.
> > > >
> > > >
> > > > Thanks,
> > > > Feng
> > > > ---
> > > > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > > > index cafacb2e58cc..b4ea79cb1d1a 100644
> > > > --- a/arch/x86/kernel/tsc.c
> > > > +++ b/arch/x86/kernel/tsc.c
> > > > @@ -1217,7 +1217,7 @@ static void __init
> > > > check_system_tsc_reliable(void)
> > > > if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
> > > > boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
> > > > boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
> > > > - nr_online_nodes <= 2)
> > > > + nr_online_nodes <= 8)
> > > > tsc_disable_clocksource_watchdog();
> > > > }
> > > >
> > > >
> > > Hi Feng,
> > >
> > > I tested this patch on a previous server and it fixes the issue.
> >
> > Thanks for the testing, please do let us know if there is any TSC
> > problem after long time or stress running.
> >
> > Plan to send the patch for merging.
> >
>
> Good to know.
>
> This patch fixes the TSC issue for IA.
> while the patch at
> https://lore.kernel.org/lkml/YzZDLBKbDTbNr45b@feng-clx/T/#m34094630193e8320c6d75e9c8aeabe7633e051d2
> impacts more Arches but I'm not sure if similar problem is observed on
> other Arches as well. So I will hold it for now.
I think your patch is more general and architecture-independent. We've
seen quite a few cases during the boot phase on IA platforms, but I guess
this is also possible for other arches.
Thanks,
Feng
> thanks,
> rui
>