2010-07-20 00:41:07

by Alok Kataria

[permalink] [raw]
Subject: [PATCH] add an arch-specific delay calibration hook.

Hi,

This patch adds a hook for architectures to specify their own delay calibration
routine. The VMware platform uses it to calculate the lpj (loops_per_jiffy)
value from the tsc_khz and HZ values for all the processors.

Please note that this is a partial revert of -
commit 3da757daf86e498872855f0b5e101f763ba79499
x86: use cpu_khz for loops_per_jiffy calculation

where I added the lpj_fine variable to generic code, so that we can do this
lpj calibration trick just for the boot processor (BP). It was considered wrong
to apply this trick to the application processors (APs), since on physical
systems we can have cases where an AP is brought up at a lower frequency than
the maximum possible, for power reasons.
On VMware's platform the VCPUs always run at the same clock speed as the TSC
frequency, so we can extend this to all CPUs.

Please note that, though the original approach of doing this for just the BP
was a safe way to get around the "IO-APIC + timer doesn't work" problem on
VMware, we still need the APs to have the correct lpj values for the timeouts
to work correctly on our platform for all VCPUs.

Please consider this for the x86 tree; the patch applies on the tip tree.

Signed-off-by: Alok N Kataria <[email protected]>

Index: linux-x86-tree.git/arch/x86/kernel/cpu/vmware.c
===================================================================
--- linux-x86-tree.git.orig/arch/x86/kernel/cpu/vmware.c 2010-07-08 13:53:33.000000000 -0700
+++ linux-x86-tree.git/arch/x86/kernel/cpu/vmware.c 2010-07-19 16:47:53.000000000 -0700
@@ -23,6 +23,7 @@

#include <linux/dmi.h>
#include <linux/module.h>
+#include <linux/delay.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
@@ -42,6 +43,8 @@
"2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
"memory");

+static unsigned long lpj_fine;
+
static inline int __vmware_platform(void)
{
uint32_t eax, ebx, ecx, edx;
@@ -51,7 +54,7 @@ static inline int __vmware_platform(void

static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz;
+ uint64_t tsc_hz, lpj;
uint32_t eax, ebx, ecx, edx;

VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,18 +65,35 @@ static unsigned long vmware_get_tsc_khz(
printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
(unsigned long) tsc_hz / 1000,
(unsigned long) tsc_hz % 1000);
+
+ lpj = ((u64)tsc_hz * 1000);
+ do_div(lpj, HZ);
+ lpj_fine = lpj;
+
return tsc_hz;
}

+/*
+ * We can skip the delay calibration and assign it a value calculated based on
+ * the timer frequency. On VMware's platform all the cpu's run at the same
+ * frequency as the timer frequency, so use this value for all the processors.
+ */
+static unsigned long vmware_calibrate_delay(void)
+{
+ BUG_ON(!lpj_fine);
+ return lpj_fine;
+}
+
static void __init vmware_platform_setup(void)
{
uint32_t eax, ebx, ecx, edx;

VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);

- if (ebx != UINT_MAX)
+ if (ebx != UINT_MAX) {
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
- else
+ arch_calibrate_delay = vmware_calibrate_delay;
+ } else
printk(KERN_WARNING
"Failed to get TSC freq from the hypervisor\n");
}
Index: linux-x86-tree.git/include/linux/delay.h
===================================================================
--- linux-x86-tree.git.orig/include/linux/delay.h 2008-06-26 15:29:48.000000000 -0700
+++ linux-x86-tree.git/include/linux/delay.h 2010-07-19 16:31:21.000000000 -0700
@@ -41,7 +41,7 @@ static inline void ndelay(unsigned long
#define ndelay(x) ndelay(x)
#endif

-extern unsigned long lpj_fine;
+extern unsigned long (*arch_calibrate_delay)(void);
void calibrate_delay(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);
Index: linux-x86-tree.git/init/calibrate.c
===================================================================
--- linux-x86-tree.git.orig/init/calibrate.c 2010-02-07 16:38:44.000000000 -0800
+++ linux-x86-tree.git/init/calibrate.c 2010-07-19 17:00:04.000000000 -0700
@@ -10,8 +10,9 @@
#include <linux/timex.h>
#include <linux/smp.h>

-unsigned long lpj_fine;
unsigned long preset_lpj;
+unsigned long (*arch_calibrate_delay)(void);
+
static int __init lpj_setup(char *str)
{
preset_lpj = simple_strtoul(str,NULL,0);
@@ -130,10 +131,11 @@ void __cpuinit calibrate_delay(void)
if (!printed)
pr_info("Calibrating delay loop (skipped) "
"preset value.. ");
- } else if ((!printed) && lpj_fine) {
- loops_per_jiffy = lpj_fine;
- pr_info("Calibrating delay loop (skipped), "
- "value calculated using timer frequency.. ");
+ } else if (arch_calibrate_delay) {
+ loops_per_jiffy = arch_calibrate_delay();
+ if (!printed)
+ pr_info("Calibrating delay using platform "
+ "specific routine.. ");
} else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
if (!printed)
pr_info("Calibrating delay using timer "
Index: linux-x86-tree.git/arch/x86/kernel/tsc.c
===================================================================
--- linux-x86-tree.git.orig/arch/x86/kernel/tsc.c 2010-07-19 16:30:35.000000000 -0700
+++ linux-x86-tree.git/arch/x86/kernel/tsc.c 2010-07-19 16:46:51.000000000 -0700
@@ -913,7 +913,6 @@ static inline unsigned long calibrate_cp

void __init tsc_init(void)
{
- u64 lpj;
int cpu;

x86_init.timers.tsc_pre_init();
@@ -952,10 +951,6 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;

- lpj = ((u64)tsc_khz * 1000);
- do_div(lpj, HZ);
- lpj_fine = lpj;
-
use_tsc_delay();
/* Check and install the TSC clocksource */
dmi_check_system(bad_tsc_dmi_table);


2010-07-20 04:21:21

by Alok Kataria

[permalink] [raw]
Subject: Re: [PATCH] add an arch-specific delay calibration hook.

On Mon, 2010-07-19 at 17:41 -0700, Alok Kataria wrote:
> Hi,
>
> This patch adds a hook for architectures to specify their own delay calibration
> routine. VMware platform uses it to calculate the lpj value from the tsc_khz &
> HZ value for all the processors.
>
> Please note that this is a partial revert of -
> commit 3da757daf86e498872855f0b5e101f763ba79499
> x86: use cpu_khz for loops_per_jiffy calculation
>
> where I added the lpj_fine variable to generic code, so that we can do this
> lpj calibration trick just for the BP. It was considered wrong to apply this
> trick for the AP's since on physical systems we can have cases where the AP
> is brought up at a lower freq than the maximum possible for power reasons.
> On VMware's platform we have VCPU's always running at the same
> clockspeed as the TSC frequency so we can extend this for all cpus.
>
> Please note that, though the original approach of doing this for just the BP
> was safe to get around the "IO-APIC + timer doesn't work" on VMware, we still
> need the AP's to have the correct lpj values for the timeouts to work correctly
> on our platform for all vcpus.
>
> Please consider this for the x86 tree, applies on the tip.

I assumed that this lpj_fine thing was relevant only for VMware, but
it might be useful for native or other virtualized platforms too. So
as not to regress from the existing behavior, I have reworked this patch
so that we use this hook for the x86 platform too, and when on VMware we
replace it with the VMware-specific routine.

Please take a look and consider this patch instead of the first one.

Thanks,
Alok

--

We use the lpj_fine value to set up loops_per_jiffy just for the BP,
since on physical systems we can have cases where the AP is brought up
at a lower frequency than the maximum possible, for power reasons.
On VMware's platform, though, all the VCPUs always run at
the same clock speed as the TSC frequency, so we can use the lpj_fine
value for all CPUs.

This patch adds a hook for architectures to specify their own delay
calibration routine. x86 defines this by returning the lpj_fine value
for the BP and zero for all others (a zero return makes calibrate_delay()
fall back to the regular calibration). When on VMware we override this with
our routine, which always returns lpj_fine irrespective of which CPU you are
running on.

Patch applies on x86-tip tree.

Signed-off-by: Alok N Kataria <[email protected]>

Index: linux-x86-tree.git/arch/x86/kernel/cpu/vmware.c
===================================================================
--- linux-x86-tree.git.orig/arch/x86/kernel/cpu/vmware.c 2010-07-19 19:57:36.000000000 -0700
+++ linux-x86-tree.git/arch/x86/kernel/cpu/vmware.c 2010-07-19 20:34:36.000000000 -0700
@@ -23,6 +23,7 @@

#include <linux/dmi.h>
#include <linux/module.h>
+#include <linux/delay.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
@@ -65,15 +66,27 @@ static unsigned long vmware_get_tsc_khz(
return tsc_hz;
}

+/*
+ * We can skip the delay calibration and assign it a value calculated based on
+ * the timer frequency. On VMware's platform all the cpu's run at the same
+ * frequency as the timer frequency, so use this value for all the processors.
+ */
+static unsigned long vmware_calibrate_delay(void)
+{
+ BUG_ON(!lpj_fine);
+ return lpj_fine;
+}
+
static void __init vmware_platform_setup(void)
{
uint32_t eax, ebx, ecx, edx;

VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);

- if (ebx != UINT_MAX)
+ if (ebx != UINT_MAX) {
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
- else
+ arch_calibrate_delay = vmware_calibrate_delay;
+ } else
printk(KERN_WARNING
"Failed to get TSC freq from the hypervisor\n");
}
Index: linux-x86-tree.git/include/linux/delay.h
===================================================================
--- linux-x86-tree.git.orig/include/linux/delay.h 2010-07-19 19:57:36.000000000 -0700
+++ linux-x86-tree.git/include/linux/delay.h 2010-07-19 19:58:48.000000000 -0700
@@ -41,7 +41,7 @@ static inline void ndelay(unsigned long
#define ndelay(x) ndelay(x)
#endif

-extern unsigned long lpj_fine;
+extern unsigned long (*arch_calibrate_delay)(void);
void calibrate_delay(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);
Index: linux-x86-tree.git/init/calibrate.c
===================================================================
--- linux-x86-tree.git.orig/init/calibrate.c 2010-07-19 19:57:36.000000000 -0700
+++ linux-x86-tree.git/init/calibrate.c 2010-07-19 20:25:36.000000000 -0700
@@ -10,8 +10,9 @@
#include <linux/timex.h>
#include <linux/smp.h>

-unsigned long lpj_fine;
unsigned long preset_lpj;
+unsigned long (*arch_calibrate_delay)(void);
+
static int __init lpj_setup(char *str)
{
preset_lpj = simple_strtoul(str,NULL,0);
@@ -112,16 +113,12 @@ static unsigned long __cpuinit calibrate
* This is the number of bits of precision for the loops_per_jiffy. Each
* bit takes on average 1.5/HZ seconds. This (like the original) is a little
* better than 1%
- * For the boot cpu we can skip the delay calibration and assign it a value
- * calculated based on the timer frequency.
- * For the rest of the CPUs we cannot assume that the timer frequency is same as
- * the cpu frequency, hence do the calibration for those.
*/
#define LPS_PREC 8

void __cpuinit calibrate_delay(void)
{
- unsigned long ticks, loopbit;
+ unsigned long ticks, loopbit, lpj;
int lps_precision = LPS_PREC;
static bool printed;

@@ -130,10 +127,11 @@ void __cpuinit calibrate_delay(void)
if (!printed)
pr_info("Calibrating delay loop (skipped) "
"preset value.. ");
- } else if ((!printed) && lpj_fine) {
- loops_per_jiffy = lpj_fine;
- pr_info("Calibrating delay loop (skipped), "
- "value calculated using timer frequency.. ");
+ } else if (arch_calibrate_delay && (lpj = arch_calibrate_delay())) {
+ loops_per_jiffy = lpj;
+ if (!printed)
+ pr_info("Calibrating delay using arch specific "
+ "calibration routine.. ");
} else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
if (!printed)
pr_info("Calibrating delay using timer "
Index: linux-x86-tree.git/arch/x86/include/asm/tsc.h
===================================================================
--- linux-x86-tree.git.orig/arch/x86/include/asm/tsc.h 2010-07-19 16:37:33.000000000 -0700
+++ linux-x86-tree.git/arch/x86/include/asm/tsc.h 2010-07-19 19:58:48.000000000 -0700
@@ -16,6 +16,7 @@ typedef unsigned long long cycles_t;

extern unsigned int cpu_khz;
extern unsigned int tsc_khz;
+extern unsigned long lpj_fine;

extern void disable_TSC(void);

Index: linux-x86-tree.git/arch/x86/kernel/tsc.c
===================================================================
--- linux-x86-tree.git.orig/arch/x86/kernel/tsc.c 2010-07-19 19:57:36.000000000 -0700
+++ linux-x86-tree.git/arch/x86/kernel/tsc.c 2010-07-19 21:16:05.000000000 -0700
@@ -26,6 +26,8 @@ EXPORT_SYMBOL(cpu_khz);
unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);

+unsigned long __read_mostly lpj_fine;
+
/*
* TSC can be unstable due to cpufreq or due to unsynced TSCs
*/
@@ -911,6 +913,20 @@ static unsigned long __init calibrate_cp
static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
#endif

+/*
+ * For the boot cpu we can skip the delay calibration and assign it a value
+ * calculated based on the timer frequency.
+ * For the rest of the CPUs we cannot assume that the timer frequency is same as
+ * the cpu frequency, hence do the calibration for those.
+ */
+unsigned long x86_calibrate_delay(void)
+{
+ if (!smp_processor_id())
+ return lpj_fine;
+ else
+ return 0;
+}
+
void __init tsc_init(void)
{
u64 lpj;
@@ -955,6 +971,8 @@ void __init tsc_init(void)
lpj = ((u64)tsc_khz * 1000);
do_div(lpj, HZ);
lpj_fine = lpj;
+ if (!arch_calibrate_delay)
+ arch_calibrate_delay = x86_calibrate_delay;

use_tsc_delay();
/* Check and install the TSC clocksource */

2010-07-22 23:56:38

by Alok Kataria

[permalink] [raw]
Subject: Re: [PATCH] add an arch-specific delay calibration hook.

Ingo/HPA/others,

Just want to make sure that this doesn't fall through the cracks.
Please let me know if you have any comments on this patch.

Thanks,
Alok

On Mon, 2010-07-19 at 21:21 -0700, Alok Kataria wrote:
> On Mon, 2010-07-19 at 17:41 -0700, Alok Kataria wrote:
> > Hi,
> >
> > This patch adds a hook for architectures to specify their own delay calibration
> > routine. VMware platform uses it to calculate the lpj value from the tsc_khz &
> > HZ value for all the processors.
> >
> > Please note that this is a partial revert of -
> > commit 3da757daf86e498872855f0b5e101f763ba79499
> > x86: use cpu_khz for loops_per_jiffy calculation
> >
> > where I added the lpj_fine variable to generic code, so that we can do this
> > lpj calibration trick just for the BP. It was considered wrong to apply this
> > trick for the AP's since on physical systems we can have cases where the AP
> > is brought up at a lower freq than the maximum possible for power reasons.
> > On VMware's platform we have VCPU's always running at the same
> > clockspeed as the TSC frequency so we can extend this for all cpus.
> >
> > Please note that, though the original approach of doing this for just the BP
> > was safe to get around the "IO-APIC + timer doesn't work" on VMware, we still
> > need the AP's to have the correct lpj values for the timeouts to work correctly
> > on our platform for all vcpus.
> >
> > Please consider this for the x86 tree, applies on the tip.
>
> I assumed that this lpj_fine thing was relevant only for VMware, but
> this might be useful for native or other virtualized platforms too. So
> as not to regress from the existing behavior, I have reworked this patch
> so that we use this hook for x86 platform too. And when on VMware we
> replace it with the VMware specific routine.
>
> Please take a look and consider this patch instead of the first one.
>
> Thanks,
> Alok
>
> --
>
> We use the lpj_fine value to setup loops_per_jiffy just for the BP,
> since on physical systems we can have cases where the AP is brought up
> at a lower frequency than the maximum possible, for power reasons.
> Though, on VMware's platform we have all the VCPU's always running at
> the same clockspeed as the TSC frequency, so we can use the lpj_fine
> value for all cpus.
>
> This patch adds a hook for architectures to specify their own delay
> calibration routine, x86 defines this by returning the lpj_fine value
> for BP and zero for all others. When on VMware we override this with our
> routine which always returns lpj_fine irrespective of which CPU you
> running on.
>
> Patch applies on x86-tip tree.
>
> Signed-off-by: Alok N Kataria <[email protected]>
>
> Index: linux-x86-tree.git/arch/x86/kernel/cpu/vmware.c
> ===================================================================
> --- linux-x86-tree.git.orig/arch/x86/kernel/cpu/vmware.c 2010-07-19 19:57:36.000000000 -0700
> +++ linux-x86-tree.git/arch/x86/kernel/cpu/vmware.c 2010-07-19 20:34:36.000000000 -0700
> @@ -23,6 +23,7 @@
>
> #include <linux/dmi.h>
> #include <linux/module.h>
> +#include <linux/delay.h>
> #include <asm/div64.h>
> #include <asm/x86_init.h>
> #include <asm/hypervisor.h>
> @@ -65,15 +66,27 @@ static unsigned long vmware_get_tsc_khz(
> return tsc_hz;
> }
>
> +/*
> + * We can skip the delay calibration and assign it a value calculated based on
> + * the timer frequency. On VMware's platform all the cpu's run at the same
> + * frequency as the timer frequency, so use this value for all the processors.
> + */
> +static unsigned long vmware_calibrate_delay(void)
> +{
> + BUG_ON(!lpj_fine);
> + return lpj_fine;
> +}
> +
> static void __init vmware_platform_setup(void)
> {
> uint32_t eax, ebx, ecx, edx;
>
> VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
>
> - if (ebx != UINT_MAX)
> + if (ebx != UINT_MAX) {
> x86_platform.calibrate_tsc = vmware_get_tsc_khz;
> - else
> + arch_calibrate_delay = vmware_calibrate_delay;
> + } else
> printk(KERN_WARNING
> "Failed to get TSC freq from the hypervisor\n");
> }
> Index: linux-x86-tree.git/include/linux/delay.h
> ===================================================================
> --- linux-x86-tree.git.orig/include/linux/delay.h 2010-07-19 19:57:36.000000000 -0700
> +++ linux-x86-tree.git/include/linux/delay.h 2010-07-19 19:58:48.000000000 -0700
> @@ -41,7 +41,7 @@ static inline void ndelay(unsigned long
> #define ndelay(x) ndelay(x)
> #endif
>
> -extern unsigned long lpj_fine;
> +extern unsigned long (*arch_calibrate_delay)(void);
> void calibrate_delay(void);
> void msleep(unsigned int msecs);
> unsigned long msleep_interruptible(unsigned int msecs);
> Index: linux-x86-tree.git/init/calibrate.c
> ===================================================================
> --- linux-x86-tree.git.orig/init/calibrate.c 2010-07-19 19:57:36.000000000 -0700
> +++ linux-x86-tree.git/init/calibrate.c 2010-07-19 20:25:36.000000000 -0700
> @@ -10,8 +10,9 @@
> #include <linux/timex.h>
> #include <linux/smp.h>
>
> -unsigned long lpj_fine;
> unsigned long preset_lpj;
> +unsigned long (*arch_calibrate_delay)(void);
> +
> static int __init lpj_setup(char *str)
> {
> preset_lpj = simple_strtoul(str,NULL,0);
> @@ -112,16 +113,12 @@ static unsigned long __cpuinit calibrate
> * This is the number of bits of precision for the loops_per_jiffy. Each
> * bit takes on average 1.5/HZ seconds. This (like the original) is a little
> * better than 1%
> - * For the boot cpu we can skip the delay calibration and assign it a value
> - * calculated based on the timer frequency.
> - * For the rest of the CPUs we cannot assume that the timer frequency is same as
> - * the cpu frequency, hence do the calibration for those.
> */
> #define LPS_PREC 8
>
> void __cpuinit calibrate_delay(void)
> {
> - unsigned long ticks, loopbit;
> + unsigned long ticks, loopbit, lpj;
> int lps_precision = LPS_PREC;
> static bool printed;
>
> @@ -130,10 +127,11 @@ void __cpuinit calibrate_delay(void)
> if (!printed)
> pr_info("Calibrating delay loop (skipped) "
> "preset value.. ");
> - } else if ((!printed) && lpj_fine) {
> - loops_per_jiffy = lpj_fine;
> - pr_info("Calibrating delay loop (skipped), "
> - "value calculated using timer frequency.. ");
> + } else if (arch_calibrate_delay && (lpj = arch_calibrate_delay())) {
> + loops_per_jiffy = lpj;
> + if (!printed)
> + pr_info("Calibrating delay using arch specific "
> + "calibration routine.. ");
> } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
> if (!printed)
> pr_info("Calibrating delay using timer "
> Index: linux-x86-tree.git/arch/x86/include/asm/tsc.h
> ===================================================================
> --- linux-x86-tree.git.orig/arch/x86/include/asm/tsc.h 2010-07-19 16:37:33.000000000 -0700
> +++ linux-x86-tree.git/arch/x86/include/asm/tsc.h 2010-07-19 19:58:48.000000000 -0700
> @@ -16,6 +16,7 @@ typedef unsigned long long cycles_t;
>
> extern unsigned int cpu_khz;
> extern unsigned int tsc_khz;
> +extern unsigned long lpj_fine;
>
> extern void disable_TSC(void);
>
> Index: linux-x86-tree.git/arch/x86/kernel/tsc.c
> ===================================================================
> --- linux-x86-tree.git.orig/arch/x86/kernel/tsc.c 2010-07-19 19:57:36.000000000 -0700
> +++ linux-x86-tree.git/arch/x86/kernel/tsc.c 2010-07-19 21:16:05.000000000 -0700
> @@ -26,6 +26,8 @@ EXPORT_SYMBOL(cpu_khz);
> unsigned int __read_mostly tsc_khz;
> EXPORT_SYMBOL(tsc_khz);
>
> +unsigned long __read_mostly lpj_fine;
> +
> /*
> * TSC can be unstable due to cpufreq or due to unsynced TSCs
> */
> @@ -911,6 +913,20 @@ static unsigned long __init calibrate_cp
> static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
> #endif
>
> +/*
> + * For the boot cpu we can skip the delay calibration and assign it a value
> + * calculated based on the timer frequency.
> + * For the rest of the CPUs we cannot assume that the timer frequency is same as
> + * the cpu frequency, hence do the calibration for those.
> + */
> +unsigned long x86_calibrate_delay(void)
> +{
> + if (!smp_processor_id())
> + return lpj_fine;
> + else
> + return 0;
> +}
> +
> void __init tsc_init(void)
> {
> u64 lpj;
> @@ -955,6 +971,8 @@ void __init tsc_init(void)
> lpj = ((u64)tsc_khz * 1000);
> do_div(lpj, HZ);
> lpj_fine = lpj;
> + if (!arch_calibrate_delay)
> + arch_calibrate_delay = x86_calibrate_delay;
>
> use_tsc_delay();
> /* Check and install the TSC clocksource */
>