From: Steven Noonan <[email protected]>
AMD processors don't implement any mechanism like Intel's
IA32_TSC_ADJUST MSR to sync the TSC. Instead of just relying on the
BIOS, TSC can be synced by calculating the difference and directly
writing it to the TSC MSR.
Add directsync flag to turn on the TSC sync when the IA32_TSC_ADJUST MSR isn't
available. Attempt 1000 times or for 30 seconds before giving up.
Signed-off-by: Steven Noonan <[email protected]>
Signed-off-by: Muhammad Usama Anjum <[email protected]>
---
.../admin-guide/kernel-parameters.txt | 4 +-
arch/x86/include/asm/tsc.h | 1 +
arch/x86/kernel/tsc.c | 3 ++
arch/x86/kernel/tsc_sync.c | 46 +++++++++++++++----
4 files changed, 43 insertions(+), 11 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index db5de5f0b9d3..f0e6ea580e68 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6271,7 +6271,7 @@
If not specified, "default" is used. In this case,
the RNG's choice is left to each individual trust source.
- tsc= Disable clocksource stability checks for TSC.
+ tsc= Disable clocksource stability checks for TSC or sync the TSC.
Format: <string>
[x86] reliable: mark tsc clocksource as reliable, this
disables clocksource verification at runtime, as well
@@ -6289,6 +6289,8 @@
in situations with strict latency requirements (where
interruptions from clocksource watchdog are not
acceptable).
+ [x86] directsync: attempt to sync the tsc via direct
+ writes if MSR_IA32_TSC_ADJUST isn't available
tsc_early_khz= [X86] Skip early TSC calibration and use the given
value instead. Useful when the early TSC frequency discovery
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index fbdc3d951494..dc70909119e8 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -42,6 +42,7 @@ extern unsigned long native_calibrate_tsc(void);
extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
extern int tsc_clocksource_reliable;
+extern int tsc_allow_direct_sync;
#ifdef CONFIG_X86_TSC
extern bool tsc_async_resets;
#else
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cafacb2e58cc..6345af65a549 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -47,6 +47,7 @@ static unsigned int __initdata tsc_early_khz;
static DEFINE_STATIC_KEY_FALSE(__use_tsc);
int tsc_clocksource_reliable;
+int tsc_allow_direct_sync;
static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
@@ -303,6 +304,8 @@ static int __init tsc_setup(char *str)
mark_tsc_unstable("boot parameter");
if (!strcmp(str, "nowatchdog"))
no_tsc_watchdog = 1;
+ if (!strcmp(str, "directsync"))
+ tsc_allow_direct_sync = 1;
return 1;
}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9452dc9664b5..2a855991f982 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -340,6 +340,8 @@ static cycles_t check_tsc_warp(unsigned int timeout)
*/
static inline unsigned int loop_timeout(int cpu)
{
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return 30;
return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
}
@@ -360,13 +362,16 @@ void check_tsc_sync_source(int cpu)
/*
* Set the maximum number of test runs to
- * 1 if the CPU does not provide the TSC_ADJUST MSR
- * 3 if the MSR is available, so the target can try to adjust
+ * 3 if TSC_ADJUST MSR is available, so the target can try to adjust
+ * 1000 if TSC MSR can be written to compensate
+ * 1 if MSRs cannot be written
*/
- if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
- atomic_set(&test_runs, 1);
- else
+ if (boot_cpu_has(X86_FEATURE_TSC_ADJUST))
atomic_set(&test_runs, 3);
+ else if (tsc_allow_direct_sync)
+ atomic_set(&test_runs, 1000);
+ else
+ atomic_set(&test_runs, 1);
retry:
/*
* Wait for the target to start or to skip the test:
@@ -434,6 +439,21 @@ void check_tsc_sync_source(int cpu)
goto retry;
}
+static inline cycles_t write_tsc_adjustment(cycles_t adjustment)
+{
+ cycles_t adjval, nextval;
+
+ rdmsrl(MSR_IA32_TSC, adjval);
+ adjval += adjustment;
+ wrmsrl(MSR_IA32_TSC, adjval);
+ rdmsrl(MSR_IA32_TSC, nextval);
+
+ /*
+ * Estimated clock cycle overhead for wrmsr + rdmsr
+ */
+ return nextval - adjval;
+}
+
/*
* Freshly booted CPUs call into this:
*/
@@ -441,7 +461,7 @@ void check_tsc_sync_target(void)
{
struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
unsigned int cpu = smp_processor_id();
- cycles_t cur_max_warp, gbl_max_warp;
+ cycles_t cur_max_warp, gbl_max_warp, est_overhead = 0;
int cpus = 2;
/* Also aborts if there is no TSC. */
@@ -521,12 +541,18 @@ void check_tsc_sync_target(void)
* value is used. In the worst case the adjustment needs to go
* through a 3rd run for fine tuning.
*/
- cur->adjusted += cur_max_warp;
+ if (boot_cpu_has(X86_FEATURE_TSC_ADJUST)) {
+ cur->adjusted += cur_max_warp;
- pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
- cpu, cur_max_warp, cur->adjusted);
+ pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
+ cpu, cur_max_warp, cur->adjusted);
- wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
+ wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
+ } else {
+ pr_debug("TSC direct sync: CPU%u observed %lld warp. Overhead: %lld\n",
+ cpu, cur_max_warp, est_overhead);
+ est_overhead = write_tsc_adjustment(cur_max_warp + est_overhead);
+ }
goto retry;
}
--
2.30.2
From: Steven Noonan <[email protected]>
Update watchdog after syncing the TSCs of all the CPUs. This is needed
to avoid getting TSC clocksource marked as unstable after syncing them.
Signed-off-by: Steven Noonan <[email protected]>
Signed-off-by: Muhammad Usama Anjum <[email protected]>
---
arch/x86/kernel/smpboot.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f24227bc3220..4b3a03004a1f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
+#include <linux/clocksource.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -1444,6 +1445,7 @@ void arch_thaw_secondary_cpus_begin(void)
void arch_thaw_secondary_cpus_end(void)
{
+ clocksource_touch_watchdog();
mtrr_aps_init();
}
@@ -1477,6 +1479,8 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done\n");
+ clocksource_touch_watchdog();
+
calculate_max_logical_packages();
/* XXX for now assume numa-in-package and hybrid don't overlap */
--
2.30.2
From: Steven Noonan <[email protected]>
There's some overhead in writing and reading MSR_IA32_TSC. We try to
account for it. But sometimes overhead gets under or over estimated.
When we retry syncing, it sees the clock "go backwards". Hence,
ignore random warps if using direct sync.
Signed-off-by: Steven Noonan <[email protected]>
Signed-off-by: Muhammad Usama Anjum <[email protected]>
---
arch/x86/kernel/tsc_sync.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 2a855991f982..1fc751212a0e 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -405,7 +405,7 @@ void check_tsc_sync_source(int cpu)
pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
smp_processor_id(), cpu);
- } else if (atomic_dec_and_test(&test_runs) || random_warps) {
+ } else if (atomic_dec_and_test(&test_runs) || (random_warps && !tsc_allow_direct_sync)) {
/* Force it to 0 if random warps brought us here */
atomic_set(&test_runs, 0);
--
2.30.2
Hi,
Kind reminder.
On 8/8/22 4:39 PM, Muhammad Usama Anjum wrote:
> From: Steven Noonan <[email protected]>
>
> AMD processors don't implement any mechanism like Intel's
> IA32_TSC_ADJUST MSR to sync the TSC. Instead of just relying on the
> BIOS, TSC can be synced by calculating the difference and directly
> writing it to the TSC MSR.
>
> Add directsync flag to turn on the TSC sync when the IA32_TSC_ADJUST MSR isn't
> available. Attempt 1000 times or for 30 seconds before giving up.
>
> Signed-off-by: Steven Noonan <[email protected]>
> Signed-off-by: Muhammad Usama Anjum <[email protected]>
> ---
> .../admin-guide/kernel-parameters.txt | 4 +-
> arch/x86/include/asm/tsc.h | 1 +
> arch/x86/kernel/tsc.c | 3 ++
> arch/x86/kernel/tsc_sync.c | 46 +++++++++++++++----
> 4 files changed, 43 insertions(+), 11 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index db5de5f0b9d3..f0e6ea580e68 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -6271,7 +6271,7 @@
> If not specified, "default" is used. In this case,
> the RNG's choice is left to each individual trust source.
>
> - tsc= Disable clocksource stability checks for TSC.
> + tsc= Disable clocksource stability checks for TSC or sync the TSC.
> Format: <string>
> [x86] reliable: mark tsc clocksource as reliable, this
> disables clocksource verification at runtime, as well
> @@ -6289,6 +6289,8 @@
> in situations with strict latency requirements (where
> interruptions from clocksource watchdog are not
> acceptable).
> + [x86] directsync: attempt to sync the tsc via direct
> + writes if MSR_IA32_TSC_ADJUST isn't available
>
> tsc_early_khz= [X86] Skip early TSC calibration and use the given
> value instead. Useful when the early TSC frequency discovery
> diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
> index fbdc3d951494..dc70909119e8 100644
> --- a/arch/x86/include/asm/tsc.h
> +++ b/arch/x86/include/asm/tsc.h
> @@ -42,6 +42,7 @@ extern unsigned long native_calibrate_tsc(void);
> extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
>
> extern int tsc_clocksource_reliable;
> +extern int tsc_allow_direct_sync;
> #ifdef CONFIG_X86_TSC
> extern bool tsc_async_resets;
> #else
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index cafacb2e58cc..6345af65a549 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -47,6 +47,7 @@ static unsigned int __initdata tsc_early_khz;
> static DEFINE_STATIC_KEY_FALSE(__use_tsc);
>
> int tsc_clocksource_reliable;
> +int tsc_allow_direct_sync;
>
> static u32 art_to_tsc_numerator;
> static u32 art_to_tsc_denominator;
> @@ -303,6 +304,8 @@ static int __init tsc_setup(char *str)
> mark_tsc_unstable("boot parameter");
> if (!strcmp(str, "nowatchdog"))
> no_tsc_watchdog = 1;
> + if (!strcmp(str, "directsync"))
> + tsc_allow_direct_sync = 1;
> return 1;
> }
>
> diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
> index 9452dc9664b5..2a855991f982 100644
> --- a/arch/x86/kernel/tsc_sync.c
> +++ b/arch/x86/kernel/tsc_sync.c
> @@ -340,6 +340,8 @@ static cycles_t check_tsc_warp(unsigned int timeout)
> */
> static inline unsigned int loop_timeout(int cpu)
> {
> + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
> + return 30;
> return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
> }
>
> @@ -360,13 +362,16 @@ void check_tsc_sync_source(int cpu)
>
> /*
> * Set the maximum number of test runs to
> - * 1 if the CPU does not provide the TSC_ADJUST MSR
> - * 3 if the MSR is available, so the target can try to adjust
> + * 3 if TSC_ADJUST MSR is available, so the target can try to adjust
> + * 1000 if TSC MSR can be written to compensate
> + * 1 if MSRs cannot be written
> */
> - if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
> - atomic_set(&test_runs, 1);
> - else
> + if (boot_cpu_has(X86_FEATURE_TSC_ADJUST))
> atomic_set(&test_runs, 3);
> + else if (tsc_allow_direct_sync)
> + atomic_set(&test_runs, 1000);
> + else
> + atomic_set(&test_runs, 1);
> retry:
> /*
> * Wait for the target to start or to skip the test:
> @@ -434,6 +439,21 @@ void check_tsc_sync_source(int cpu)
> goto retry;
> }
>
> +static inline cycles_t write_tsc_adjustment(cycles_t adjustment)
> +{
> + cycles_t adjval, nextval;
> +
> + rdmsrl(MSR_IA32_TSC, adjval);
> + adjval += adjustment;
> + wrmsrl(MSR_IA32_TSC, adjval);
> + rdmsrl(MSR_IA32_TSC, nextval);
> +
> + /*
> + * Estimated clock cycle overhead for wrmsr + rdmsr
> + */
> + return nextval - adjval;
> +}
> +
> /*
> * Freshly booted CPUs call into this:
> */
> @@ -441,7 +461,7 @@ void check_tsc_sync_target(void)
> {
> struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
> unsigned int cpu = smp_processor_id();
> - cycles_t cur_max_warp, gbl_max_warp;
> + cycles_t cur_max_warp, gbl_max_warp, est_overhead = 0;
> int cpus = 2;
>
> /* Also aborts if there is no TSC. */
> @@ -521,12 +541,18 @@ void check_tsc_sync_target(void)
> * value is used. In the worst case the adjustment needs to go
> * through a 3rd run for fine tuning.
> */
> - cur->adjusted += cur_max_warp;
> + if (boot_cpu_has(X86_FEATURE_TSC_ADJUST)) {
> + cur->adjusted += cur_max_warp;
>
> - pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
> - cpu, cur_max_warp, cur->adjusted);
> + pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
> + cpu, cur_max_warp, cur->adjusted);
>
> - wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
> + wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
> + } else {
> + pr_debug("TSC direct sync: CPU%u observed %lld warp. Overhead: %lld\n",
> + cpu, cur_max_warp, est_overhead);
> + est_overhead = write_tsc_adjustment(cur_max_warp + est_overhead);
> + }
> goto retry;
>
> }
--
Muhammad Usama Anjum
On Mon, Aug 08 2022 at 16:39, Muhammad Usama Anjum wrote:
> From: Steven Noonan <[email protected]>
>
> AMD processors don't implement any mechanism like Intel's
> IA32_TSC_ADJUST MSR to sync the TSC. Instead of just relying on the
> BIOS, TSC can be synced by calculating the difference and directly
> writing it to the TSC MSR.
Why? This has been tried before and is known to be flaky and
unreliable.
> Add directsync flag to turn on the TSC sync when the IA32_TSC_ADJUST MSR isn't
> available. Attempt 1000 times or for 30 seconds before giving up.
Looping 30 seconds with interrupts disabled? Seriously?
Thanks,
tglx
On Mon, Aug 08 2022 at 16:39, Muhammad Usama Anjum wrote:
> There's some overhead in writing and reading MSR_IA32_TSC. We try to
> account for it. But sometimes overhead gets under or over estimated.
> When we retry syncing, it sees the clock "go backwards". Hence,
> ignore random warps if using direct sync.
This is just wrong. If the sync test can observe clock going backwards
then it can be observed during runtime too. Preventing that is the whole
point of the TSC sync exercise.
Thanks,
tglx
On 8/24/22 7:13 PM, Thomas Gleixner wrote:
> On Mon, Aug 08 2022 at 16:39, Muhammad Usama Anjum wrote:
>> From: Steven Noonan <[email protected]>
>>
>> AMD processors don't implement any mechanism like Intel's
>> IA32_TSC_ADJUST MSR to sync the TSC. Instead of just relying on the
>> BIOS, TSC can be synced by calculating the difference and directly
>> writing it to the TSC MSR.
>
> Why? This has been tried before and is known to be flaky and
> unreliable.
I'm sorry. I was trying to find the historic attempts about this. But I
didn't find it. Can someone point me to the history?
Do we have some information on how AMD synchronizes the TSC in BIOS? If
the ADJUST MSR like Intel's isn't present in AMD, they must be syncing
it by directly writing to the TSC MSR like this patch is doing.
>
>> Add directsync flag to turn on the TSC sync when the IA32_TSC_ADJUST MSR isn't
>> available. Attempt 1000 times or for 30 seconds before giving up.
>
> Looping 30 seconds with interrupts disabled? Seriously?
Yeah, that's too much. Some BSD variant uses 1000 attempts. We can
change the 1000 attempts to 5 or 10 attempts as in my experience, 5
attempts at max were always successful every time.
>
> Thanks,
>
> tglx
--
Muhammad Usama Anjum