LinuxLists.cc - [RFC][PATCH] new timeofday core subsystem (v. A2)

[permalink] [raw]

Subject: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

All,
This patch implements the minimal architecture specific hooks to enable
the new time of day subsystem code for i386, x86-64, and ppc64. It
applies on top of my linux-2.6.11-rc1_timeofday-core_A2 patch and with
this patch applied, you can test the new time of day subsystem.

Basically it adds the call to timeofday_interrupt_hook() and cuts alot
of code out of the build via #ifdefs. I know, I know, #ifdefs' are ugly
and bad, and the final patch will just remove the old code. For now this
allows us to be flexible and easily switch between the two
implementations with a single define. Also it makes the patch a bit
easier to read.

New in this version:
o more i386 cleanups
o added arch specific read_persistent_clock interface
o ppc64 hooks!

I look forward to your comments and feedback.

thanks
-john

linux-2.6.11-rc2_timeofday-arch_A2.patch
=======================================
diff -Nru a/arch/i386/Kconfig b/arch/i386/Kconfig
--- a/arch/i386/Kconfig 2005-01-24 13:33:59 -08:00
+++ b/arch/i386/Kconfig 2005-01-24 13:33:59 -08:00
@@ -14,6 +14,10 @@
486, 586, Pentiums, and various instruction-set-compatible chips by
AMD, Cyrix, and others.

+config NEWTOD
+ bool
+ default y
+
config MMU
bool
default y
diff -Nru a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile 2005-01-24 13:33:59 -08:00
+++ b/arch/i386/kernel/Makefile 2005-01-24 13:33:59 -08:00
@@ -10,7 +10,7 @@
doublefault.o quirks.o

obj-y += cpu/
-obj-y += timers/
+obj-$(!CONFIG_NEWTOD) += timers/
obj-$(CONFIG_ACPI_BOOT) += acpi/
obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
obj-$(CONFIG_MCA) += mca.o
diff -Nru a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
--- a/arch/i386/kernel/apm.c 2005-01-24 13:33:59 -08:00
+++ b/arch/i386/kernel/apm.c 2005-01-24 13:33:59 -08:00
@@ -224,6 +224,7 @@
#include <linux/smp_lock.h>
#include <linux/dmi.h>
#include <linux/suspend.h>
+#include <linux/timeofday.h>

#include <asm/system.h>
#include <asm/uaccess.h>
@@ -1204,6 +1205,7 @@
device_suspend(PMSG_SUSPEND);
device_power_down(PMSG_SUSPEND);

+ timeofday_suspend_hook();
/* serialize with the timer interrupt */
write_seqlock_irq(&xtime_lock);

@@ -1231,6 +1233,7 @@
spin_unlock(&i8253_lock);
write_sequnlock_irq(&xtime_lock);

+ timeofday_resume_hook();
if (err == APM_NO_ERROR)
err = APM_SUCCESS;
if (err != APM_SUCCESS)
diff -Nru a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
--- a/arch/i386/kernel/i8259.c 2005-01-24 13:33:59 -08:00
+++ b/arch/i386/kernel/i8259.c 2005-01-24 13:33:59 -08:00
@@ -388,6 +388,48 @@
}
}

+#ifdef CONFIG_NEWTOD
+void setup_pit_timer(void)
+{
+ extern spinlock_t i8253_lock;
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+ outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ udelay(10);
+ outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
+ udelay(10);
+ outb(LATCH >> 8 , PIT_CH0); /* MSB */
+ spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+static int timer_resume(struct sys_device *dev)
+{
+ setup_pit_timer();
+ return 0;
+}
+
+static struct sysdev_class timer_sysclass = {
+ set_kset_name("timer_pit"),
+ .resume = timer_resume,
+};
+
+static struct sys_device device_timer = {
+ .id = 0,
+ .cls = &timer_sysclass,
+};
+
+static int __init init_timer_sysfs(void)
+{
+ int error = sysdev_class_register(&timer_sysclass);
+ if (!error)
+ error = sysdev_register(&device_timer);
+ return error;
+}
+
+device_initcall(init_timer_sysfs);
+#endif
+
void __init init_IRQ(void)
{
int i;
diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
--- a/arch/i386/kernel/time.c 2005-01-24 13:33:59 -08:00
+++ b/arch/i386/kernel/time.c 2005-01-24 13:33:59 -08:00
@@ -68,6 +68,8 @@

#include "io_ports.h"

+#include <linux/timeofday.h>
+
extern spinlock_t i8259A_lock;
int pit_latch_buggy; /* extern */

@@ -86,6 +88,7 @@
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);

+#ifndef CONFIG_NEWTOD
struct timer_opts *cur_timer = &timer_none;

/*
@@ -170,6 +173,7 @@
}

EXPORT_SYMBOL(do_settimeofday);
+#endif

static int set_rtc_mmss(unsigned long nowtime)
{
@@ -195,11 +199,13 @@
* Note: This function is required to return accurate
* time even in the absence of multiple timer ticks.
*/
+#ifndef CONFIG_NEWTOD
unsigned long long monotonic_clock(void)
{
return cur_timer->monotonic_clock();
}
EXPORT_SYMBOL(monotonic_clock);
+#endif

#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
unsigned long profile_pc(struct pt_regs *regs)
@@ -239,6 +245,7 @@

do_timer_interrupt_hook(regs);

+#ifndef CONFIG_NEWTOD
/*
* If we have an externally synchronized Linux clock, then update
* CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
@@ -261,6 +268,7 @@
else
last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */
}
+#endif

if (MCA_bus) {
/* The PS/2 uses level-triggered interrupts. You can't
@@ -293,11 +301,15 @@
*/
write_seqlock(&xtime_lock);

+#ifndef CONFIG_NEWTOD
cur_timer->mark_offset();
+#endif

do_timer_interrupt(irq, NULL, regs);

write_sequnlock(&xtime_lock);
+
+ timeofday_interrupt_hook();
return IRQ_HANDLED;
}

@@ -318,6 +330,40 @@
return retval;
}

+/* arch specific timeofday hooks */
+nsec_t read_persistent_clock(void)
+{
+ return (nsec_t)get_cmos_time() * NSEC_PER_SEC;
+}
+
+void sync_persistent_clock(struct timespec ts)
+{
+ /*
+ * If we have an externally synchronized Linux clock, then update
+ * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+ * called as close as possible to 500 ms before the new second starts.
+ */
+ if (ts.tv_sec > last_rtc_update + 660 &&
+ (ts.tv_nsec / 1000)
+ >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
+ (ts.tv_nsec / 1000)
+ <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) {
+ /* horrible...FIXME */
+ if (efi_enabled) {
+ if (efi_set_rtc_mmss(ts.tv_sec) == 0)
+ last_rtc_update = ts.tv_sec;
+ else
+ last_rtc_update = ts.tv_sec - 600;
+ } else if (set_rtc_mmss(ts.tv_sec) == 0)
+ last_rtc_update = ts.tv_sec;
+ else
+ last_rtc_update = ts.tv_sec - 600; /* do it again in 60 s */
+ }
+}
+
+
+
+#ifndef CONFIG_NEWTOD
static long clock_cmos_diff, sleep_start;

static int timer_suspend(struct sys_device *dev, u32 state)
@@ -351,6 +397,23 @@
wall_jiffies += sleep_length;
return 0;
}
+#else /* !CONFIG_NEWTOD */
+static int timer_suspend(struct sys_device *dev, u32 state)
+{
+ timeofday_suspend_hook();
+ return 0;
+}
+
+static int timer_resume(struct sys_device *dev)
+{
+#ifdef CONFIG_HPET_TIMER
+ if (is_hpet_enabled())
+ hpet_reenable();
+#endif
+ timeofday_resume_hook();
+ return 0;
+}
+#endif

static struct sysdev_class timer_sysclass = {
.resume = timer_resume,
diff -Nru a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c
--- a/arch/i386/lib/delay.c 2005-01-24 13:33:59 -08:00
+++ b/arch/i386/lib/delay.c 2005-01-24 13:33:59 -08:00
@@ -23,10 +23,29 @@

extern struct timer_opts* timer;

+#ifndef CONFIG_NEWTOD
void __delay(unsigned long loops)
{
cur_timer->delay(loops);
}
+#else
+#include <linux/timeofday.h>
+/* XXX - For now just use a simple loop delay
+ * This has cpufreq issues, but so did the old method.
+ */
+void __delay(unsigned long loops)
+{
+ int d0;
+ __asm__ __volatile__(
+ "\tjmp 1f\n"
+ ".align 16\n"
+ "1:\tjmp 2f\n"
+ ".align 16\n"
+ "2:\tdecl %0\n\tjns 2b"
+ :"=&a" (d0)
+ :"0" (loops));
+}
+#endif

inline void __const_udelay(unsigned long xloops)
{
diff -Nru a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig
--- a/arch/ppc64/Kconfig 2005-01-24 13:33:59 -08:00
+++ b/arch/ppc64/Kconfig 2005-01-24 13:33:59 -08:00
@@ -10,6 +10,10 @@
bool
default y

+config NEWTOD
+ bool
+ default y
+
config UID16
bool

diff -Nru a/arch/ppc64/kernel/sys_ppc32.c b/arch/ppc64/kernel/sys_ppc32.c
--- a/arch/ppc64/kernel/sys_ppc32.c 2005-01-24 13:33:59 -08:00
+++ b/arch/ppc64/kernel/sys_ppc32.c 2005-01-24 13:33:59 -08:00
@@ -322,8 +322,10 @@

ret = do_adjtimex(&txc);

+#ifndef CONFIG_NEWTOD
/* adjust the conversion of TB to time of day to track adjtimex */
ppc_adjtimex();
+#endif

if(put_user(txc.modes, &utp->modes) ||
__put_user(txc.offset, &utp->offset) ||
diff -Nru a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c
--- a/arch/ppc64/kernel/time.c 2005-01-24 13:33:59 -08:00
+++ b/arch/ppc64/kernel/time.c 2005-01-24 13:33:59 -08:00
@@ -50,6 +50,7 @@
#include <linux/profile.h>
#include <linux/cpu.h>
#include <linux/security.h>
+#include <linux/timeofday.h>

#include <asm/segment.h>
#include <asm/io.h>
@@ -108,6 +109,7 @@

static unsigned adjusting_time = 0;

+#ifndef CONFIG_NEWTOD
static __inline__ void timer_check_rtc(void)
{
/*
@@ -141,6 +143,52 @@
last_rtc_update += 60;
}
}
+#else /* CONFIG_NEWTOD */
+nsec_t read_persistent_clock(void)
+{
+ struct rtc_time tm;
+ unsigned long sec;
+#ifdef CONFIG_PPC_ISERIES
+ if (!piranha_simulator)
+#endif
+ ppc_md.get_boot_time(&tm);
+
+ sec = mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
+ return (nsec_t)sec * NSEC_PER_SEC;
+}
+void sync_persistent_clock(struct timespec ts)
+{
+ /*
+ * update the rtc when needed, this should be performed on the
+ * right fraction of a second. Half or full second ?
+ * Full second works on mk48t59 clocks, others need testing.
+ * Note that this update is basically only used through
+ * the adjtimex system calls. Setting the HW clock in
+ * any other way is a /dev/rtc and userland business.
+ * This is still wrong by -0.5/+1.5 jiffies because of the
+ * timer interrupt resolution and possible delay, but here we
+ * hit a quantization limit which can only be solved by higher
+ * resolution timers and decoupling time management from timer
+ * interrupts. This is also wrong on the clocks
+ * which require being written at the half second boundary.
+ * We should have an rtc call that only sets the minutes and
+ * seconds like on Intel to avoid problems with non UTC clocks.
+ */
+ if ( ts.tv_sec - last_rtc_update >= 659 &&
+ abs((ts.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ) {
+ struct rtc_time tm;
+ to_tm(ts.tv_sec+1, &tm);
+ tm.tm_year -= 1900;
+ tm.tm_mon -= 1;
+ if (ppc_md.set_rtc_time(&tm) == 0)
+ last_rtc_update = ts.tv_sec+1;
+ else
+ /* Try again one minute later */
+ last_rtc_update += 60;
+ }
+}
+#endif /* CONFIG_NEWTOD */

/*
* This version of gettimeofday has microsecond resolution.
@@ -172,12 +220,14 @@
tv->tv_usec = usec;
}

+#ifndef CONFIG_NEWTOD
void do_gettimeofday(struct timeval *tv)
{
__do_gettimeofday(tv, get_tb());
}

EXPORT_SYMBOL(do_gettimeofday);
+#endif

/* Synchronize xtime with do_gettimeofday */

@@ -312,11 +362,15 @@
write_seqlock(&xtime_lock);
tb_last_stamp = lpaca->next_jiffy_update_tb;
do_timer(regs);
+#ifndef CONFIG_NEWTOD
timer_sync_xtime(lpaca->next_jiffy_update_tb);
timer_check_rtc();
+#endif
write_sequnlock(&xtime_lock);
+#ifndef CONFIG_NEWTOD
if ( adjusting_time && (time_adjust == 0) )
ppc_adjtimex();
+#endif
}
lpaca->next_jiffy_update_tb += tb_ticks_per_jiffy;
}
@@ -334,6 +388,7 @@
}
#endif

+ timeofday_interrupt_hook();
irq_exit();

return 1;
@@ -350,6 +405,7 @@
{
return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift;
}
+#ifndef CONFIG_NEWTOD

int do_settimeofday(struct timespec *tv)
{
@@ -425,6 +481,7 @@
}

EXPORT_SYMBOL(do_settimeofday);
+#endif /* !CONFIG_NEWTOD */

void __init time_init(void)
{
@@ -480,7 +537,9 @@
xtime_sync_interval = tb_ticks_per_sec - (tb_ticks_per_sec/8);
next_xtime_sync_tb = tb_last_stamp + xtime_sync_interval;

+#ifndef CONFIG_NEWTOD
time_freq = 0;
+#endif

xtime.tv_nsec = 0;
last_rtc_update = xtime.tv_sec;
@@ -503,6 +562,7 @@

/* #define DEBUG_PPC_ADJTIMEX 1 */

+#ifndef CONFIG_NEWTOD
void ppc_adjtimex(void)
{
unsigned long den, new_tb_ticks_per_sec, tb_ticks, old_xsec, new_tb_to_xs, new_xsec, new_stamp_xsec;
@@ -630,6 +690,7 @@
write_sequnlock_irqrestore( &xtime_lock, flags );

}
+#endif /* !CONFIG_NEWTOD */

#define TICK_SIZE tick
diff -Nru a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
--- a/arch/x86_64/Kconfig 2005-01-24 13:33:59 -08:00
+++ b/arch/x86_64/Kconfig 2005-01-24 13:33:59 -08:00
@@ -24,6 +24,10 @@
bool
default y

+config NEWTOD
+ bool
+ default y
+
config MMU
bool
default y
diff -Nru a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
--- a/arch/x86_64/kernel/time.c 2005-01-24 13:33:59 -08:00
+++ b/arch/x86_64/kernel/time.c 2005-01-24 13:33:59 -08:00
@@ -35,6 +35,7 @@
#include <asm/sections.h>
#include <linux/cpufreq.h>
#include <linux/hpet.h>
+#include <linux/timeofday.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/apic.h>
#endif
@@ -106,6 +107,7 @@

unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;

+#ifndef CONFIG_NEWTOD
/*
* This version of gettimeofday() has microsecond resolution and better than
* microsecond precision, as we're using at least a 10 MHz (usually 14.31818
@@ -180,6 +182,7 @@
}

EXPORT_SYMBOL(do_settimeofday);
+#endif /* CONFIG_NEWTOD */

unsigned long profile_pc(struct pt_regs *regs)
{
@@ -281,6 +284,7 @@
}

+#ifndef CONFIG_NEWTOD
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
* Note: This function is required to return accurate
* time even in the absence of multiple timer ticks.
@@ -357,6 +361,8 @@
}
#endif
}
+#endif /* CONFIG_NEWTOD */
+

static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
@@ -373,6 +379,7 @@

write_seqlock(&xtime_lock);

+#ifndef CONFIG_NEWTOD
if (vxtime.hpet_address) {
offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
delay = hpet_readl(HPET_COUNTER) - offset;
@@ -422,6 +429,7 @@
handle_lost_ticks(lost, regs);
jiffies += lost;
}
+#endif /* CONFIG_NEWTOD */

/*
* Do the timer stuff.
@@ -445,6 +453,7 @@
smp_local_timer_interrupt(regs);
#endif

+#ifndef CONFIG_NEWTOD
/*
* If we have an externally synchronized Linux clock, then update CMOS clock
* accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
@@ -458,9 +467,11 @@
set_rtc_mmss(xtime.tv_sec);
rtc_update = xtime.tv_sec + 660;
}
-
+#endif /* CONFIG_NEWTOD */
+
write_sequnlock(&xtime_lock);

+ timeofday_interrupt_hook();
return IRQ_HANDLED;
}

@@ -560,6 +571,30 @@
return mktime(year, mon, day, hour, min, sec);
}

+/* arch specific timeofday hooks */
+nsec_t read_persistent_clock(void)
+{
+ return (nsec_t)get_cmos_time() * NSEC_PER_SEC;
+}
+
+void sync_persistent_clock(struct timespec ts)
+{
+ static unsigned long rtc_update = 0;
+ /*
+ * If we have an externally synchronized Linux clock, then update
+ * CMOS clock accordingly every ~11 minutes. set_rtc_mmss() will
+ * be called in the jiffy closest to exactly 500 ms before the
+ * next second. If the update fails, we don't care, as it'll be
+ * updated on the next turn, and the problem (time way off) isn't
+ * likely to go away much sooner anyway.
+ */
+ if (ts.tv_sec > rtc_update &&
+ abs(ts.tv_nsec - 500000000) <= tick_nsec / 2) {
+ set_rtc_mmss(xtime.tv_sec);
+ rtc_update = xtime.tv_sec + 660;
+ }
+}
+
#ifdef CONFIG_CPU_FREQ

/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -955,6 +990,7 @@

__setup("report_lost_ticks", time_setup);

+#ifndef CONFIG_NEWTOD
static long clock_cmos_diff;
static unsigned long sleep_start;

@@ -990,6 +1026,21 @@
wall_jiffies += sleep_length;
return 0;
}
+#else /* !CONFIG_NEWTOD */
+static int timer_suspend(struct sys_device *dev, u32 state)
+{
+ timeofday_suspend_hook();
+ return 0;
+}
+
+static int timer_resume(struct sys_device *dev)
+{
+ if (vxtime.hpet_address)
+ hpet_reenable();
+ timeofday_resume_hook();
+ return 0;
+}
+#endif

static struct sysdev_class timer_sysclass = {
.resume = timer_resume,
diff -Nru a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
--- a/arch/x86_64/kernel/vsyscall.c 2005-01-24 13:33:59 -08:00
+++ b/arch/x86_64/kernel/vsyscall.c 2005-01-24 13:33:59 -08:00
@@ -171,8 +171,12 @@
BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
map_vsyscall();
+/* XXX - disable vsyscall gettimeofday for now */
+#ifndef CONFIG_NEWTOD
sysctl_vsyscall = 1;
-
+#else
+ sysctl_vsyscall = 0;
+#endif
return 0;
}

diff -Nru a/include/asm-generic/div64.h b/include/asm-generic/div64.h
--- a/include/asm-generic/div64.h 2005-01-24 13:33:59 -08:00
+++ b/include/asm-generic/div64.h 2005-01-24 13:33:59 -08:00
@@ -55,4 +55,13 @@

#endif /* BITS_PER_LONG */

+#ifndef div_long_long_rem
+#define div_long_long_rem(dividend,divisor,remainder) \
+({ \
+ u64 result = dividend; \
+ *remainder = do_div(result,divisor); \
+ result; \
+})
+#endif
+
#endif /* _ASM_GENERIC_DIV64_H */
diff -Nru a/include/asm-i386/timer.h b/include/asm-i386/timer.h
--- a/include/asm-i386/timer.h 2005-01-24 13:33:59 -08:00
+++ b/include/asm-i386/timer.h 2005-01-24 13:33:59 -08:00
@@ -2,6 +2,13 @@
#define _ASMi386_TIMER_H
#include <linux/init.h>

+#define TICK_SIZE (tick_nsec / 1000)
+void setup_pit_timer(void);
+/* Modifiers for buggy PIT handling */
+extern int pit_latch_buggy;
+extern int timer_ack;
+
+#ifndef CONFIG_NEWTOD
/**
* struct timer_ops - used to define a timer source
*
@@ -29,18 +36,10 @@
struct timer_opts *opts;
};

-#define TICK_SIZE (tick_nsec / 1000)
-
extern struct timer_opts* __init select_timer(void);
extern void clock_fallback(void);
-void setup_pit_timer(void);
-
-/* Modifiers for buggy PIT handling */
-
-extern int pit_latch_buggy;

extern struct timer_opts *cur_timer;
-extern int timer_ack;

/* list of externed timers */
extern struct timer_opts timer_none;
@@ -60,5 +59,6 @@

#ifdef CONFIG_X86_PM_TIMER
extern struct init_timer_opts timer_pmtmr_init;
+#endif
#endif
#endif

2005-01-24 23:29:05

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Mon, 24 Jan 2005, john stultz wrote:

> +/* __monotonic_clock():
> + * private function, must hold system_time_lock lock when being
> + * called. Returns the monotonically increasing number of
> + * nanoseconds since the system booted (adjusted by NTP scaling)
> + */
> +static nsec_t __monotonic_clock(void)
> +{
> + nsec_t ret, ns_offset;
> + cycle_t now, delta;
> +
> + /* read timesource */
> + now = read_timesource(timesource);
> +
> + /* calculate the delta since the last clock_interrupt */
> + delta = (now - offset_base) & timesource->mask;
> +
> + /* convert to nanoseconds */
> + ns_offset = cyc2ns(timesource, delta, NULL);
> +
> + /* apply the NTP scaling */
> + ns_offset = ntp_scale(ns_offset);

The monotonic clock is the time base for the gettime and gettimeofday
functions. This means ntp_scale() is called every time that the kernel or
an application access time.

As pointed out before this will dramatically worsen the performance
compared to the current code base.

ntp_scale() also will make it difficult to implement optimized arch
specific version of function for timer access.

The fastcalls would have to be disabled on ia64 to make this work. Its
likely extremely difficult to implement a fastcall if it involves
ntp_scale().

2005-01-24 23:33:08

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific timesources (v. A2)

On Mon, 24 Jan 2005, john stultz wrote:

> +/* helper macro to atomically read both cyclone counter registers */
> +#define read_cyclone_counter(low,high) \
> + do{ \
> + high = cyclone_timer[1]; low = cyclone_timer[0]; \
> + } while (high != cyclone_timer[1]);

This is only necessary on 32 bit platforms. On ia64 an atomic read would
do the job. Maybe that logic needs to go into the custom defined readq for
32 bit? Then you could avoid repeating the code for drivers that read 64
bit clocks on 32bit processors.

2005-01-25 00:13:59

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Mon, 24 Jan 2005, john stultz wrote:

> Yep, performance is a big concern. Re-working ntp_scale() is still on my
> TODO list. I just didn't get to it in this release.

This is a hopeless endeavor if you look at the function.
Throw ntp_scale out and calculate a scaling factor during the ticks. At
tick time then you may forward the clock a few ns in order to correct it
otherwise monkey around with the scaling factor.

2005-01-25 00:09:49

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific timesources (v. A2)

On Mon, 2005-01-24 at 15:29 -0800, Christoph Lameter wrote:
> On Mon, 24 Jan 2005, john stultz wrote:
>
> > +/* helper macro to atomically read both cyclone counter registers */
> > +#define read_cyclone_counter(low,high) \
> > + do{ \
> > + high = cyclone_timer[1]; low = cyclone_timer[0]; \
> > + } while (high != cyclone_timer[1]);
>
> This is only necessary on 32 bit platforms. On ia64 an atomic read would
> do the job. Maybe that logic needs to go into the custom defined readq for
> 32 bit? Then you could avoid repeating the code for drivers that read 64
> bit clocks on 32bit processors.

Yea, I still need to convert the cyclone timesource to an
TIMESOURCE_MMIO_64. Hopefully I'll get to that in the next release.

thanks again for the review and feedback!
-john

2005-01-25 00:19:19

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Mon, 2005-01-24 at 15:24 -0800, Christoph Lameter wrote:
> On Mon, 24 Jan 2005, john stultz wrote:
> > + /* convert to nanoseconds */
> > + ns_offset = cyc2ns(timesource, delta, NULL);
> > +
> > + /* apply the NTP scaling */
> > + ns_offset = ntp_scale(ns_offset);
>
> The monotonic clock is the time base for the gettime and gettimeofday
> functions. This means ntp_scale() is called every time that the kernel or
> an application access time.
>
> As pointed out before this will dramatically worsen the performance
> compared to the current code base.
>
> ntp_scale() also will make it difficult to implement optimized arch
> specific version of function for timer access.
>
> The fastcalls would have to be disabled on ia64 to make this work. Its
> likely extremely difficult to implement a fastcall if it involves
> ntp_scale().

Yep, performance is a big concern. Re-working ntp_scale() is still on my
TODO list. I just didn't get to it in this release.

Patches are always welcome. :)

thanks for the feedback!
-john

2005-01-25 00:35:47

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Mon, 2005-01-24 at 16:08 -0800, Christoph Lameter wrote:
> On Mon, 24 Jan 2005, john stultz wrote:
> > Yep, performance is a big concern. Re-working ntp_scale() is still on my
> > TODO list. I just didn't get to it in this release.
>
> This is a hopeless endeavor if you look at the function.
> Throw ntp_scale out and calculate a scaling factor during the ticks. At
> tick time then you may forward the clock a few ns in order to correct it
> otherwise monkey around with the scaling factor.

We talked about this last time. I do intend to re-work ntp_scale() so
its not a function call, much as you describe above.

hopelessly endeavoring,
-john

2005-01-25 01:20:07

[permalink] [raw]

Subject: [RFC][PATCH] new timeofday arch specific timesources (v. A2)

All,
This patch implements most of the time sources for i386, x86-64, and
ppc64 (tsc, pit, cyclone, acpi-pm, hpet and timebase). It applies on top
of my linux-2.6.11-rc2_timeofday-arch_A2 patch. It provides real
timesources (opposed to the example jiffies timesource) that can be used
for more realistic testing.

This patch is the shabbiest of the three. It needs to be broken up, and
cleaned. The i386_pit.c is broken. Also, acpi_pm and hpet need to be
made generic so they can be shared between i386 and x86-64. But for now
it will get you going so you can test and play with the core code.

New in this release:
o ppc64_timebase code
o move tsc code to TIMESOURCE_CYCLES
o add tsc_update_callback for cpufreq changes (untested)
o move hpet code to TIMESOURCE_MMIO_32

thanks
-john

linux-2.6.11-rc2_timeofday-timesources_A2.patch
===================================================
diff -Nru a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile 2005-01-24 13:36:22 -08:00
+++ b/arch/i386/kernel/Makefile 2005-01-24 13:36:22 -08:00
@@ -7,7 +7,7 @@
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
- doublefault.o quirks.o
+ doublefault.o quirks.o tsc.o

obj-y += cpu/
obj-$(!CONFIG_NEWTOD) += timers/
diff -Nru a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
--- a/arch/i386/kernel/setup.c 2005-01-24 13:36:22 -08:00
+++ b/arch/i386/kernel/setup.c 2005-01-24 13:36:22 -08:00
@@ -49,6 +49,7 @@
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/io.h>
+#include <asm/tsc.h>
#include "setup_arch_pre.h"
#include <bios_ebda.h>

@@ -1439,6 +1440,7 @@
conswitchp = &dummy_con;
#endif
#endif
+ tsc_init();
}

#include "setup_arch_post.h"
diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
--- a/arch/i386/kernel/time.c 2005-01-24 13:36:22 -08:00
+++ b/arch/i386/kernel/time.c 2005-01-24 13:36:22 -08:00
@@ -461,6 +461,7 @@

void __init time_init(void)
{
+#ifndef CONFIG_NEWTOD
#ifdef CONFIG_HPET_TIMER
if (is_hpet_capable()) {
/*
@@ -478,6 +479,7 @@

cur_timer = select_timer();
printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+#endif

time_init_hook();
}
diff -Nru a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c
--- a/arch/i386/kernel/timers/common.c 2005-01-24 13:36:22 -08:00
+++ b/arch/i386/kernel/timers/common.c 2005-01-24 13:36:22 -08:00
@@ -22,8 +22,6 @@
* device.
*/

-#define CALIBRATE_TIME (5 * 1000020/HZ)
-
unsigned long __init calibrate_tsc(void)
{
mach_prepare_counter();
diff -Nru a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/arch/i386/kernel/tsc.c 2005-01-24 13:36:22 -08:00
@@ -0,0 +1,111 @@
+#include <linux/init.h>
+#include <linux/timex.h>
+#include <linux/cpufreq.h>
+#include <asm/tsc.h>
+#include "mach_timer.h"
+
+unsigned long cpu_freq_khz;
+#ifdef CONFIG_NEWTOD
+int tsc_disable;
+#endif
+
+void tsc_init(void)
+{
+ unsigned long long start, end;
+ unsigned long count;
+ u64 delta64;
+ int i;
+
+ /* repeat 3 times to make sure the cache is warm */
+ for(i=0; i < 3; i++) {
+ mach_prepare_counter();
+ rdtscll(start);
+ mach_countup(&count);
+ rdtscll(end);
+ }
+ delta64 = end - start;
+
+ /* cpu freq too fast */
+ if(delta64 > (1ULL<<32))
+ return;
+ /* cpu freq too slow */
+ if (delta64 <= CALIBRATE_TIME)
+ return;
+
+ delta64 *= 1000;
+ do_div(delta64,CALIBRATE_TIME);
+ cpu_freq_khz = (unsigned long)delta64;
+
+ cpu_khz = cpu_freq_khz;
+
+ printk("Detected %lu.%03lu MHz processor.\n",
+ cpu_khz / 1000, cpu_khz % 1000);
+
+}
+
+
+/* All of the code below comes from arch/i386/kernel/timers/timer_tsc.c
+ * XXX: severly needs better comments and the ifdef's killed.
+ */
+
+#ifdef CONFIG_CPU_FREQ
+static unsigned int cpufreq_init = 0;
+
+/* If the CPU frequency is scaled, TSC-based delays will need a different
+ * loops_per_jiffy value to function properly.
+ */
+
+static unsigned int ref_freq = 0;
+static unsigned long loops_per_jiffy_ref = 0;
+
+#ifndef CONFIG_SMP
+static unsigned long cpu_khz_ref = 0;
+#endif
+
+static int time_cpufreq_notifier(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+
+ if (val != CPUFREQ_RESUMECHANGE)
+ write_seqlock_irq(&xtime_lock);
+ if (!ref_freq) {
+ ref_freq = freq->old;
+ loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
+#ifndef CONFIG_SMP
+ cpu_khz_ref = cpu_khz;
+#endif
+ }
+
+ if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+ (val == CPUFREQ_RESUMECHANGE)) {
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+#ifndef CONFIG_SMP
+ if (cpu_khz)
+ cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
+#endif
+ }
+
+ if (val != CPUFREQ_RESUMECHANGE)
+ write_sequnlock_irq(&xtime_lock);
+
+ return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+ .notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+ int ret;
+ ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ if (!ret)
+ cpufreq_init = 1;
+ return ret;
+}
+core_initcall(cpufreq_tsc);
+#endif /* CONFIG_CPU_FREQ */
diff -Nru a/drivers/timesource/Makefile b/drivers/timesource/Makefile
--- a/drivers/timesource/Makefile 2005-01-24 13:36:22 -08:00
+++ b/drivers/timesource/Makefile 2005-01-24 13:36:22 -08:00
@@ -1 +1,7 @@
obj-y += jiffies.o
+obj-$(CONFIG_X86) += i386_tsc.o
+obj-$(CONFIG_PPC64) += ppc64_timebase.o
+#obj-$(CONFIG_X86) += i386_pit.o
+obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o
+obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o
+obj-$(CONFIG_X86_64) += x86-64_hpet.o
diff -Nru a/drivers/timesource/acpi_pm.c b/drivers/timesource/acpi_pm.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/acpi_pm.c 2005-01-24 13:36:22 -08:00
@@ -0,0 +1,113 @@
+#include <linux/timesource.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include "mach_timer.h"
+
+/* Number of PMTMR ticks expected during calibration run */
+#define PMTMR_TICKS_PER_SEC 3579545
+#define PMTMR_EXPECTED_RATE \
+ ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10))
+
+
+/* The I/O port the PMTMR resides at.
+ * The location is detected during setup_arch(),
+ * in arch/i386/acpi/boot.c */
+u32 pmtmr_ioport = 0;
+
+#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
+
+static inline u32 read_pmtmr(void)
+{
+ u32 v1=0,v2=0,v3=0;
+ /* It has been reported that because of various broken
+ * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time
+ * source is not latched, so you must read it multiple
+ * times to insure a safe value is read.
+ */
+ do {
+ v1 = inl(pmtmr_ioport);
+ v2 = inl(pmtmr_ioport);
+ v3 = inl(pmtmr_ioport);
+ } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
+ || (v3 > v1 && v3 < v2));
+
+ /* mask the output to 24 bits */
+ return v2 & ACPI_PM_MASK;
+}
+
+
+static cycle_t acpi_pm_read(void)
+{
+ return (cycle_t)read_pmtmr();
+}
+
+struct timesource_t timesource_acpi_pm = {
+ .name = "acpi_pm",
+ .priority = 200,
+ .type = TIMESOURCE_FUNCTION,
+ .read_fnct = acpi_pm_read,
+ .mask = (cycle_t)ACPI_PM_MASK,
+ .mult = 286070,
+ .shift = 10,
+};
+
+/*
+ * Some boards have the PMTMR running way too fast. We check
+ * the PMTMR rate against PIT channel 2 to catch these cases.
+ */
+static int verify_pmtmr_rate(void)
+{
+ u32 value1, value2;
+ unsigned long count, delta;
+
+ mach_prepare_counter();
+ value1 = read_pmtmr();
+ mach_countup(&count);
+ value2 = read_pmtmr();
+ delta = (value2 - value1) & ACPI_PM_MASK;
+
+ /* Check that the PMTMR delta is within 5% of what we expect */
+ if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 ||
+ delta > (PMTMR_EXPECTED_RATE * 21) / 20) {
+ printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE);
+ return -1;
+ }
+
+ return 0;
+}
+
+
+static int init_acpi_pm_timesource(void)
+{
+ u32 value1, value2;
+ unsigned int i;
+
+ if (!pmtmr_ioport)
+ return -ENODEV;
+
+ /* "verify" this timing source */
+ value1 = read_pmtmr();
+ for (i = 0; i < 10000; i++) {
+ value2 = read_pmtmr();
+ if (value2 == value1)
+ continue;
+ if (value2 > value1)
+ goto pm_good;
+ if ((value2 < value1) && ((value2) < 0xFFF))
+ goto pm_good;
+ printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2);
+ return -EINVAL;
+ }
+ printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1);
+ return -ENODEV;
+
+pm_good:
+ if (verify_pmtmr_rate() != 0)
+ return -ENODEV;
+
+ register_timesource(&timesource_acpi_pm);
+ return 0;
+}
+
+module_init(init_acpi_pm_timesource);
diff -Nru a/drivers/timesource/cyclone.c b/drivers/timesource/cyclone.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/cyclone.c 2005-01-24 13:36:22 -08:00
@@ -0,0 +1,154 @@
+#include <linux/timesource.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include "mach_timer.h"
+
+#define CYCLONE_CBAR_ADDR 0xFEB00CD0
+#define CYCLONE_PMCC_OFFSET 0x51A0
+#define CYCLONE_MPMC_OFFSET 0x51D0
+#define CYCLONE_MPCS_OFFSET 0x51A8
+#define CYCLONE_TIMER_FREQ 100000000
+#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */
+
+unsigned long cyclone_freq_khz;
+
+int use_cyclone = 0;
+static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */
+
+/* helper macro to atomically read both cyclone counter registers */
+#define read_cyclone_counter(low,high) \
+ do{ \
+ high = cyclone_timer[1]; low = cyclone_timer[0]; \
+ } while (high != cyclone_timer[1]);
+
+
+static cycle_t cyclone_read(void)
+{
+ u32 low, high;
+ u64 ret;
+
+ read_cyclone_counter(low,high);
+ ret = ((u64)high << 32)|low;
+
+ return (cycle_t)ret;
+}
+
+struct timesource_t timesource_cyclone = {
+ .name = "cyclone",
+ .priority = 100,
+ .type = TIMESOURCE_FUNCTION,
+ .read_fnct = cyclone_read,
+ .mask = (cycle_t)CYCLONE_TIMER_MASK,
+ .mult = 10,
+ .shift = 0,
+};
+
+
+static void calibrate_cyclone(void)
+{
+ u32 startlow, starthigh, endlow, endhigh, delta32;
+ u64 start, end, delta64;
+ unsigned long i, count;
+ /* repeat 3 times to make sure the cache is warm */
+ for(i=0; i < 3; i++) {
+ mach_prepare_counter();
+ read_cyclone_counter(startlow,starthigh);
+ mach_countup(&count);
+ read_cyclone_counter(endlow,endhigh);
+ }
+ start = (u64)starthigh<<32|startlow;
+ end = (u64)endhigh<<32|endlow;
+
+ delta64 = end - start;
+ printk("cyclone delta: %llu\n", delta64);
+ delta64 *= (ACTHZ/1000)>>8;
+ printk("delta*hz = %llu\n", delta64);
+ delta32 = (u32)delta64;
+ cyclone_freq_khz = delta32/CALIBRATE_ITERATION;
+ printk("calculated cyclone_freq: %lu khz\n", cyclone_freq_khz);
+}
+
+static int init_cyclone_timesource(void)
+{
+ u32* reg;
+ u32 base; /* saved cyclone base address */
+ u32 pageaddr; /* page that contains cyclone_timer register */
+ u32 offset; /* offset from pageaddr to cyclone_timer register */
+ int i;
+
+ /*make sure we're on a summit box*/
+ if(!use_cyclone) return -ENODEV;
+
+ printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
+
+ /* find base address */
+ pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK;
+ offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK);
+ set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
+ reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
+ if(!reg){
+ printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
+ return -ENODEV;
+ }
+ base = *reg;
+ if(!base){
+ printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
+ return -ENODEV;
+ }
+
+ /* setup PMCC */
+ pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK;
+ offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK);
+ set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
+ reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
+ if(!reg){
+ printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
+ return -ENODEV;
+ }
+ reg[0] = 0x00000001;
+
+ /* setup MPCS */
+ pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK;
+ offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK);
+ set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
+ reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
+ if(!reg){
+ printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
+ return -ENODEV;
+ }
+ reg[0] = 0x00000001;
+
+ /* map in cyclone_timer */
+ pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK;
+ offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK);
+ set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
+ cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
+ if(!cyclone_timer){
+ printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
+ return -ENODEV;
+ }
+
+ /*quick test to make sure its ticking*/
+ for(i=0; i<3; i++){
+ u32 old = cyclone_timer[0];
+ int stall = 100;
+ while(stall--) barrier();
+ if(cyclone_timer[0] == old){
+ printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
+ cyclone_timer = NULL;
+ return -ENODEV;
+ }
+ }
+ calibrate_cyclone();
+ register_timesource(&timesource_cyclone);
+
+ return 0;
+}
+
+module_init(init_cyclone_timesource);
diff -Nru a/drivers/timesource/i386_pit.c b/drivers/timesource/i386_pit.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/i386_pit.c 2005-01-24 13:36:22 -08:00
@@ -0,0 +1,100 @@
+/* pit timesource: XXX - broken!
+ */
+
+#include <linux/timesource.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/timer.h>
+#include "io_ports.h"
+#include "do_timer.h"
+
+extern u64 jiffies_64;
+extern long jiffies;
+extern spinlock_t i8253_lock;
+
+/* Since the PIT overflows every tick, its not very useful
+ * to just read by itself. So throw jiffies into the mix to
+ * and just return nanoseconds in pit_read().
+ */
+
+static cycle_t pit_read(void)
+{
+ unsigned long flags;
+ int count;
+ unsigned long jiffies_t;
+ static int count_p;
+ static unsigned long jiffies_p = 0;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+
+ outb_p(0x00, PIT_MODE); /* latch the count ASAP */
+
+ count = inb_p(PIT_CH0); /* read the latched count */
+ jiffies_t = jiffies;
+ count |= inb_p(PIT_CH0) << 8;
+
+ /* VIA686a test code... reset the latch if count > max + 1 */
+ if (count > LATCH) {
+ outb_p(0x34, PIT_MODE);
+ outb_p(LATCH & 0xff, PIT_CH0);
+ outb(LATCH >> 8, PIT_CH0);
+ count = LATCH - 1;
+ }
+
+ /*
+ * avoiding timer inconsistencies (they are rare, but they happen)...
+ * there are two kinds of problems that must be avoided here:
+ * 1. the timer counter underflows
+ * 2. hardware problem with the timer, not giving us continuous time,
+ * the counter does small "jumps" upwards on some Pentium systems,
+ * (see c't 95/10 page 335 for Neptun bug.)
+ */
+
+ if( jiffies_t == jiffies_p ) {
+ if( count > count_p ) {
+ /* the nutcase */
+ count = do_timer_overflow(count);
+ }
+ } else
+ jiffies_p = jiffies_t;
+
+ count_p = count;
+
+ spin_unlock_irqrestore(&i8253_lock, flags);
+
+ count = ((LATCH-1) - count) * TICK_SIZE;
+ count = (count + LATCH/2) / LATCH;
+
+ count *= 1000; /* convert count from usec->nsec */
+
+ return (cycle_t)((jiffies_64 * TICK_NSEC) + count);
+}
+
+static cycle_t pit_delta(cycle_t now, cycle_t then)
+{
+ return now - then;
+}
+
+/* just return cyc, as its already in ns */
+static nsec_t pit_cyc2ns(cycle_t cyc, cycle_t* remainder)
+{
+ return (nsec_t)cyc;
+}
+
+static struct timesource_t timesource_pit = {
+ .name = "pit",
+ .priority = 0,
+ .read = pit_read,
+ .delta = pit_delta,
+ .cyc2ns = pit_cyc2ns,
+};
+
+static int init_pit_timesource(void)
+{
+ register_timesource(&timesource_pit);
+ return 0;
+}
+
+module_init(init_pit_timesource);
diff -Nru a/drivers/timesource/i386_tsc.c b/drivers/timesource/i386_tsc.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/i386_tsc.c 2005-01-24 13:36:22 -08:00
@@ -0,0 +1,55 @@
+/* TODO:
+ * o cpufreq code
+ * o better calibration
+ */
+
+#include <linux/timesource.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+
+static void tsc_update_callback(void);
+
+static struct timesource_t timesource_tsc = {
+ .name = "tsc",
+ .priority = 25,
+ .type = TIMESOURCE_CYCLES,
+ .mask = (cycle_t)~0,
+ .mult = 0, /* to be set */
+ .shift = 22,
+ .update_callback = tsc_update_callback,
+};
+
+
+static inline u32 calculate_tsc_mult(unsigned long tsc_khz)
+{
+ unsigned long long x;
+ x = (NSEC_PER_SEC/HZ);
+ x = x << timesource_tsc.shift;
+ do_div(x, tsc_khz);
+ return (u32)x;
+}
+
+static unsigned long current_cpu_khz = 0;
+
+static void tsc_update_callback(void)
+{
+ /* only update if cpu_khz has changed */
+ if (current_cpu_khz != cpu_khz){
+ current_cpu_khz = cpu_khz;
+ timesource_tsc.mult = calculate_tsc_mult(current_cpu_khz);
+ }
+}
+
+static int init_tsc_timesource(void)
+{
+ /* TSC initialization is done in arch/i386/kernel/tsc.c */
+ if (cpu_has_tsc && cpu_khz) {
+ current_cpu_khz = cpu_khz;
+ timesource_tsc.mult = calculate_tsc_mult(current_cpu_khz);
+ register_timesource(&timesource_tsc);
+ }
+ return 0;
+}
+
+module_init(init_tsc_timesource);
+
diff -Nru a/drivers/timesource/ppc64_timebase.c b/drivers/timesource/ppc64_timebase.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/ppc64_timebase.c 2005-01-24 13:36:22 -08:00
@@ -0,0 +1,37 @@
+#include <linux/timesource.h>
+#include <asm/time.h>
+
+static cycle_t timebase_read(void)
+{
+ return (cycle_t)get_tb();
+}
+
+struct timesource_t timesource_timebase = {
+ .name = "timebase",
+ .priority = 200,
+ .type = TIMESOURCE_FUNCTION,
+ .read_fnct = timebase_read,
+ .mask = (cycle_t)-1UL,
+ .mult = 0,
+ .shift = 22,
+};
+
+
+/* XXX - this should be calculated or properly externed! */
+extern unsigned long tb_to_ns_scale;
+extern unsigned long tb_to_ns_shift;
+extern unsigned long tb_ticks_per_sec;
+
+static int init_timebase_timesource(void)
+{
+ unsigned long long x;
+ x = (NSEC_PER_SEC/HZ);
+ x = x << timesource_timebase.shift;
+ do_div(x, (tb_ticks_per_sec/1000));
+ timesource_timebase.mult = (u32)x;
+
+ register_timesource(&timesource_timebase);
+ return 0;
+}
+
+module_init(init_timebase_timesource);
diff -Nru a/drivers/timesource/x86-64_hpet.c b/drivers/timesource/x86-64_hpet.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/x86-64_hpet.c 2005-01-24 13:36:22 -08:00
@@ -0,0 +1,40 @@
+#include <linux/timesource.h>
+#include <linux/hpet.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include <asm/hpet.h>
+
+#define HPET_MASK (~0L)
+#define HPET_SHIFT 32
+
+struct timesource_t timesource_hpet = {
+ .name = "hpet",
+ .priority = 300,
+ .type = TIMESOURCE_MMIO_32,
+ .mmio_ptr = NULL,
+ .mask = (cycle_t)HPET_MASK,
+ .mult = 0, /* set below */
+ .shift = HPET_SHIFT,
+};
+
+static int init_hpet_timesource(void)
+{
+ unsigned long hpet_period, hpet_hz;
+
+ if (!vxtime.hpet_address)
+ return -ENODEV;
+
+ /* calculate the hpet address */
+ timesource_hpet.mmio_ptr =
+ (void __iomem*)fix_to_virt(FIX_HPET_BASE) + HPET_COUNTER;
+
+ /* calculate and set the timesource multiplier */
+ hpet_period = hpet_readl(HPET_PERIOD);
+ hpet_hz = (1000000000000000L + hpet_period / 2) / hpet_period;
+ timesource_hpet.mult = (1000000L << HPET_SHIFT) / hpet_hz;
+
+ register_timesource(&timesource_hpet);
+ return 0;
+}
+module_init(init_hpet_timesource);
diff -Nru a/include/asm-i386/mach-default/mach_timer.h b/include/asm-i386/mach-default/mach_timer.h
--- a/include/asm-i386/mach-default/mach_timer.h 2005-01-24 13:36:22 -08:00
+++ b/include/asm-i386/mach-default/mach_timer.h 2005-01-24 13:36:22 -08:00
@@ -14,8 +14,12 @@
*/
#ifndef _MACH_TIMER_H
#define _MACH_TIMER_H
+#include <linux/jiffies.h>
+#include <asm/io.h>

-#define CALIBRATE_LATCH (5 * LATCH)
+#define CALIBRATE_ITERATION 50
+#define CALIBRATE_LATCH (CALIBRATE_ITERATION * LATCH)
+#define CALIBRATE_TIME (CALIBRATE_ITERATION * 1000020/HZ)

static inline void mach_prepare_counter(void)
{
diff -Nru a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/include/asm-i386/tsc.h 2005-01-24 13:36:22 -08:00
@@ -0,0 +1,6 @@
+#ifndef _ASM_I386_TSC_H
+#define _ASM_I386_TSC_H
+extern unsigned long cpu_freq_khz;
+void tsc_init(void);
+
+#endif
diff -Nru a/include/linux/timeofday.h b/include/linux/timeofday.h
--- a/include/linux/timeofday.h 2005-01-24 13:36:22 -08:00
+++ b/include/linux/timeofday.h 2005-01-24 13:36:22 -08:00
@@ -8,7 +8,9 @@
#define _LINUX_TIMEOFDAY_H
#include <linux/types.h>
#include <linux/time.h>
+#include <linux/timex.h>
#include <asm/div64.h>
+

#ifdef CONFIG_NEWTOD
nsec_t get_lowres_timestamp(void);
diff -Nru a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c 2005-01-24 13:36:22 -08:00
+++ b/kernel/sched.c 2005-01-24 13:36:22 -08:00
@@ -174,6 +174,11 @@
#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
< (long long) (sd)->cache_hot_time)

+
+#ifdef CONFIG_NEWTOD
+#define sched_clock() 0
+#endif
+
/*
* These are the runqueue data structures:
*/

2005-01-25 01:56:02

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Mon, 24 Jan 2005, john stultz wrote:

> We talked about this last time. I do intend to re-work ntp_scale() so
> its not a function call, much as you describe above.
>
> hopelessly endeavoring,

hehe.... But seriously: The easiest approach may be to modify the time
sources to allow a fine tuning of the scaling factor. That way ntp_scale
may be moved into tick processing where it would adjust the scaling of the
time sources up or downward. Thus no ntp_scale in the monotonic clock
processing anymore.

Monotonic clocks could be calculated

monotime = ns_at_last_tick + (time_source_cycles_since_tick *
current_scaling_factor) >> shift_factor.

This would also be easy to implement in asm if necessary.

tick processing could then increment or decrement the current scaling
factor to minimize the error between ticks. It could also add
nanoseconds to ns_at_last_tick to correct the clock forward.

With the appropiate shift_factor one should be able to fine tune time much
more accurately than ntp_scale would do. Over time the necessary
corrections could be minimized to just adding a few ns once in a while.

2005-01-25 02:40:28

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

On Mon, 2005-01-24 at 14:52 -0800, john stultz wrote:
> All,
> This patch implements the minimal architecture specific hooks to enable
> the new time of day subsystem code for i386, x86-64, and ppc64. It
> applies on top of my linux-2.6.11-rc1_timeofday-core_A2 patch and with
> this patch applied, you can test the new time of day subsystem.
>
> Basically it adds the call to timeofday_interrupt_hook() and cuts alot
> of code out of the build via #ifdefs. I know, I know, #ifdefs' are ugly
> and bad, and the final patch will just remove the old code. For now this
> allows us to be flexible and easily switch between the two
> implementations with a single define. Also it makes the patch a bit
> easier to read.

I haven't seen your other patch. Do you mean that with this patch, ppc64
stops using it's own gettimeofday implementation based on the CPU
hardware timebase ?

There are reasons why I plan to keep that. First, our implementation is
very efficient. It allows a timeofday computation without locks or
barriers thanks to carefully hand crafted data dependencies in the
operation. Second, we have an ABI issue here. For historical reasons, we
have this "systemcfg" data structure that can be mmap'ed to userland,
and which contains copy of some of the ppc64 internal time keeping
infos. Some userland stuff use it to implement a fully userland
gettimeofday (again, without barrier nor locks). This is done at least
by IBM's JVM. My still-to-be-merged vDSO patch will also use this for
the userland implementation of gettimeofday syscall itself.

Ben.

2005-01-25 07:43:24

by Ulrich Windl

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On 24 Jan 2005 at 15:24, Christoph Lameter wrote:

> On Mon, 24 Jan 2005, john stultz wrote:
>
> > +/* __monotonic_clock():
> > + * private function, must hold system_time_lock lock when being
> > + * called. Returns the monotonically increasing number of
> > + * nanoseconds since the system booted (adjusted by NTP scaling)
> > + */
> > +static nsec_t __monotonic_clock(void)
> > +{
> > + nsec_t ret, ns_offset;
> > + cycle_t now, delta;
> > +
> > + /* read timesource */
> > + now = read_timesource(timesource);
> > +
> > + /* calculate the delta since the last clock_interrupt */
> > + delta = (now - offset_base) & timesource->mask;
> > +
> > + /* convert to nanoseconds */
> > + ns_offset = cyc2ns(timesource, delta, NULL);
> > +
> > + /* apply the NTP scaling */
> > + ns_offset = ntp_scale(ns_offset);
>
> The monotonic clock is the time base for the gettime and gettimeofday
> functions. This means ntp_scale() is called every time that the kernel or
> an application access time.

It depends on what you want: There is little sense in implementing a nanosecond
clock model with NTP when the result is wrong by several microseconds IMHO. You
don't know what the time is used for, so just get the best you can. Thos only
wanting some crude time, could be happy with the jiffies (or their equivalent),
right?

Regards,
Ulrich

>
> As pointed out before this will dramatically worsen the performance
> compared to the current code base.
>
> ntp_scale() also will make it difficult to implement optimized arch
> specific version of function for timer access.
>
> The fastcalls would have to be disabled on ia64 to make this work. Its
> likely extremely difficult to implement a fastcall if it involves
> ntp_scale().
>

2005-01-25 07:55:48

by Ulrich Windl

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On 24 Jan 2005 at 17:54, Christoph Lameter wrote:

> On Mon, 24 Jan 2005, john stultz wrote:
>
> > We talked about this last time. I do intend to re-work ntp_scale() so
> > its not a function call, much as you describe above.
> >
> > hopelessly endeavoring,
>
> hehe.... But seriously: The easiest approach may be to modify the time
> sources to allow a fine tuning of the scaling factor. That way ntp_scale
> may be moved into tick processing where it would adjust the scaling of the
> time sources up or downward. Thus no ntp_scale in the monotonic clock
> processing anymore.

It depends what you want to have between ticks: If your ticks are too wide, the
clock will do a little jump forward at the start of a new tick; if they are too
narrow, the clock will jump back a bit at the start of a new tick (assuming tick
interpolation and tick generation are correlated. (The old kernel code uses a
constant to scale the timer's register to a tick. However if the time is too fast
or slow, the interpolation will also be). Those being blessed with a GPS or better
clock will be able to demonstrate the quality of the code as well as the tuning
possibilities against frequency errors.

>
> Monotonic clocks could be calculated
>
> monotime = ns_at_last_tick + (time_source_cycles_since_tick *
> current_scaling_factor) >> shift_factor.
>
> This would also be easy to implement in asm if necessary.
>
> tick processing could then increment or decrement the current scaling
> factor to minimize the error between ticks. It could also add
> nanoseconds to ns_at_last_tick to correct the clock forward.

Is that what corresponds to "adjust_nanoscale()" in my PPSkit?

>
> With the appropiate shift_factor one should be able to fine tune time much
> more accurately than ntp_scale would do. Over time the necessary
> corrections could be minimized to just adding a few ns once in a while.
>

Regards,
Ulrich

2005-01-25 08:17:32

by Andi Kleen

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Mon, Jan 24, 2005 at 02:51:29PM -0800, john stultz wrote:
> All,
> Here is a new release of my time of day proposal, which include ppc64
> support as well as suspend/resume and cpufreq hooks. For basic summary
> of my ideas, you can follow this link: http://lwn.net/Articles/100665/

[...]
How do vsyscalls (running gettimeofday in user space) fit into your
architecture? I don't see any provision for this.

Also on x86-64 we plan to keep the cycle time base per CPU, that
will likely require some more changes to your architecture too.

-Andi

2005-01-25 12:29:51

by Tim Schmielau

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

> Monotonic clocks could be calculated
>
> monotime = ns_at_last_tick + (time_source_cycles_since_tick *
> current_scaling_factor) >> shift_factor.
>
> This would also be easy to implement in asm if necessary.
>
> tick processing could then increment or decrement the current scaling
> factor to minimize the error between ticks. It could also add
> nanoseconds to ns_at_last_tick to correct the clock forward.

I'd think that adding nanoseconds to ns_at_last_tick is not a good idea.
It might minimize the error shortly after the tick, but not the total
error average over the whole tick period. And it introduces clock jumps.
Tiny, but unnecessary.

Just as you say,

> With the appropiate shift_factor one should be able to fine tune time much
> more accurately than ntp_scale would do. Over time the necessary
> corrections could be minimized to just adding a few ns once in a while.

finetuning the scaling factor should be enough.

Tim

2005-01-25 23:12:56

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

On Tue, 2005-01-25 at 13:28 +1100, Benjamin Herrenschmidt wrote:
> On Mon, 2005-01-24 at 14:52 -0800, john stultz wrote:
> > All,
> > This patch implements the minimal architecture specific hooks to enable
> > the new time of day subsystem code for i386, x86-64, and ppc64. It
> > applies on top of my linux-2.6.11-rc1_timeofday-core_A2 patch and with
> > this patch applied, you can test the new time of day subsystem.
> >
> > Basically it adds the call to timeofday_interrupt_hook() and cuts alot
> > of code out of the build via #ifdefs. I know, I know, #ifdefs' are ugly
> > and bad, and the final patch will just remove the old code. For now this
> > allows us to be flexible and easily switch between the two
> > implementations with a single define. Also it makes the patch a bit
> > easier to read.
>
> I haven't seen your other patch. Do you mean that with this patch, ppc64
> stops using it's own gettimeofday implementation based on the CPU
> hardware timebase ?

Not quite. It still uses the hardware timebase, but we use a common
infrastructure to calculate time. I believe you'll find the common code
similar to the current ppc64 time code, as it seemed to be one of the
better timeofday implementations (oh the joy of sane hardware time
devices).

> There are reasons why I plan to keep that. First, our implementation is
> very efficient. It allows a timeofday computation without locks or
> barriers thanks to carefully hand crafted data dependencies in the
> operation.

The performance is a concern, and right now there are issues (ntp_scale
being the top of the list) however I hope they can be resolved. Looking
at ppc64's do_gettimeofday() vs this implementation there we do have
more overhead, but maybe you could suggest how we can avoid some of it?

> Second, we have an ABI issue here. For historical reasons, we
> have this "systemcfg" data structure that can be mmap'ed to userland,
> and which contains copy of some of the ppc64 internal time keeping
> infos. Some userland stuff use it to implement a fully userland
> gettimeofday (again, without barrier nor locks). This is done at least
> by IBM's JVM. My still-to-be-merged vDSO patch will also use this for
> the userland implementation of gettimeofday syscall itself.

I still want to support vsyscall gettimeofday, although it does have to
be done on an arch-by-arch basis. It's likely the systemcfg data
structure can still be generated and exported. I'll look into it and see
what can be done.

thanks
-john

2005-01-26 00:03:00

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

On Tue, 2005-01-25 at 15:09 -0800, john stultz wrote:

> The performance is a concern, and right now there are issues (ntp_scale
> being the top of the list) however I hope they can be resolved. Looking
> at ppc64's do_gettimeofday() vs this implementation there we do have
> more overhead, but maybe you could suggest how we can avoid some of it?

I would suggest reclaculating the scale factor and offset for ntp
adjustement regulary from the timer tick or so, not on each gettimeofday
call.

Also, I have some updates to the ppc64 implementation where I regulary
update the pre-scale offset into the post-scale one so that the
timebase-prescale substraction always gives a 32 bits number. I do that
so my fast userland gettimeofday can be implemented more easily and more
efficiently for 32 bits processes. I yet have to check how I can hook
those things into your new scheme.

> I still want to support vsyscall gettimeofday, although it does have to
> be done on an arch-by-arch basis. It's likely the systemcfg data
> structure can still be generated and exported. I'll look into it and see
> what can be done.

Well, since it only contains the prescale and postscale offsets and the
scaling value, it only needs to be updated when they change, so a hook
here would be fine.

Ben.

2005-01-26 00:20:51

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

On Wed, 2005-01-26 at 10:53 +1100, Benjamin Herrenschmidt wrote:
> On Tue, 2005-01-25 at 15:09 -0800, john stultz wrote:
>
> > The performance is a concern, and right now there are issues (ntp_scale
> > being the top of the list) however I hope they can be resolved. Looking
> > at ppc64's do_gettimeofday() vs this implementation there we do have
> > more overhead, but maybe you could suggest how we can avoid some of it?
>
> I would suggest reclaculating the scale factor and offset for ntp
> adjustement regulary from the timer tick or so, not on each gettimeofday
> call.

Agreed. I'll get something like this done for the next release.

> Also, I have some updates to the ppc64 implementation where I regulary
> update the pre-scale offset into the post-scale one so that the
> timebase-prescale substraction always gives a 32 bits number. I do that
> so my fast userland gettimeofday can be implemented more easily and more
> efficiently for 32 bits processes. I yet have to check how I can hook
> those things into your new scheme.

Hmm. In my code, I move the interval delta (similar to your pre-scale
offset) to system_time (seems to be equivalent to the post-scale) at
each call to timeofday_interrupt_hook(). So while 64 bits are normally
used, you could probably get away doing the interval delta calculations
in 32bits if your timesource frequency isn't too large. This would only
be done in the arch-specific 32bit vsyscall code, right?

> > I still want to support vsyscall gettimeofday, although it does have to
> > be done on an arch-by-arch basis. It's likely the systemcfg data
> > structure can still be generated and exported. I'll look into it and see
> > what can be done.
>
> Well, since it only contains the prescale and postscale offsets and the
> scaling value, it only needs to be updated when they change, so a hook
> here would be fine.

Great, thats what I was hoping.

thanks
-john

2005-01-26 00:37:31

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

On Tue, 25 Jan 2005, john stultz wrote:

> Agreed. I'll get something like this done for the next release.
>
> > Well, since it only contains the prescale and postscale offsets and the
> > scaling value, it only needs to be updated when they change, so a hook
> > here would be fine.
>
> Great, thats what I was hoping.

I just hope that the implementation of one arch does not become a standard
without sufficient reflection. Could we first get an explanation of
the rationale of the offsets? From my viewpoint of the ia64 implementation
I have some difficulty understanding why such complicated things as
prescale and postscale are necessary in gettimeday and why the simple
formula that we use in gettimeofday is not sufficient?

Frankly, the direction that the design of the new time subsystem is
taking is bothering me. Work on this on our part would just improve the
situation from drastically worse performance to somewhat worse. So far I
have not seen a benefit of moving away from the existing code base. For
the project to make sense it needs at least to be evident that the design
of the solution would lead to better timer performance in the long run.
Conceptually that seems so far not to be possible.

I'd love simplication of the timer subsystem through the use of
nanosecond offsets. However, the POSIX api always has extra fields
for seconds and nanoseconds and converting back and forth between the
internal representation in 64bit nanoseconds and the POSIX structures may
be another performance penalty since it involves divisions and remainder
processing.

What I think is a priority need is some subsystem that manages
time sources effectively (including the ability of the ntp code to
scale the appropriately) and does that in an arch independent
way so that all the code can be consolidated. Extract the best existing
solutions and work from there.

2005-01-26 01:01:22

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Tue, 2005-01-25 at 09:17 +0100, Andi Kleen wrote:
> On Mon, Jan 24, 2005 at 02:51:29PM -0800, john stultz wrote:
> > All,
> > Here is a new release of my time of day proposal, which include ppc64
> > support as well as suspend/resume and cpufreq hooks. For basic summary
> > of my ideas, you can follow this link: http://lwn.net/Articles/100665/
>
> [...]
> How do vsyscalls (running gettimeofday in user space) fit into your
> architecture? I don't see any provision for this.

Yea, I had some earlier ideas for it, although they were misconceived.
My plan at the moment is to do it similarly to how x86-64 and my i386
patch did it, but still have it on an arch-per-arch basis.

> Also on x86-64 we plan to keep the cycle time base per CPU, that
> will likely require some more changes to your architecture too.

I like to hear more details, if you can discuss it. Its interesting,
because I don't quite see how you'd be able to do this.

thanks
-john

2005-01-26 03:20:44

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

On Tue, 2005-01-25 at 16:17 -0800, john stultz wrote:

> Hmm. In my code, I move the interval delta (similar to your pre-scale
> offset) to system_time (seems to be equivalent to the post-scale) at
> each call to timeofday_interrupt_hook(). So while 64 bits are normally
> used, you could probably get away doing the interval delta calculations
> in 32bits if your timesource frequency isn't too large. This would only
> be done in the arch-specific 32bit vsyscall code, right?

Yes. Looks ok so far, but I need to make sure by looking at the code.
I'll let you know.

> > > I still want to support vsyscall gettimeofday, although it does have to
> > > be done on an arch-by-arch basis. It's likely the systemcfg data
> > > structure can still be generated and exported. I'll look into it and see
> > > what can be done.
> >
> > Well, since it only contains the prescale and postscale offsets and the
> > scaling value, it only needs to be updated when they change, so a hook
> > here would be fine.
>
> Great, thats what I was hoping.
>
> thanks
> -john
--
Benjamin Herrenschmidt <[email protected]>

2005-01-26 03:36:01

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

On Tue, 2005-01-25 at 16:34 -0800, Christoph Lameter wrote:

> I just hope that the implementation of one arch does not become a standard
> without sufficient reflection. Could we first get an explanation of
> the rationale of the offsets? From my viewpoint of the ia64 implementation
> I have some difficulty understanding why such complicated things as
> prescale and postscale are necessary in gettimeday and why the simple
> formula that we use in gettimeofday is not sufficient?

What is complicated here ? The formula, at least as we do on ppc64, is
simply:

time = (hw_value - prescale offset) / scale + post scale offset

Please, don't tell me that a substraction bothers you for performances,
and this first offset, while it could maybe be "folded" into the second
one, is actually handy, especially since it allows you to keep the
(hw_value - prescale offset) in a 32 bits number if your HW timebase
isn't too fast.

Now, for the details, on ppc, we calculate time in what we call "xsecs"
which is 2^20 xsec/sec, so not exactly micro or nanoseconds, but that's
also for simplifying calculations, we may want the generic code to just
fallback on to ns.

> Frankly, the direction that the design of the new time subsystem is
> taking is bothering me. Work on this on our part would just improve the
> situation from drastically worse performance to somewhat worse. So far I
> have not seen a benefit of moving away from the existing code base. For
> the project to make sense it needs at least to be evident that the design
> of the solution would lead to better timer performance in the long run.
> Conceptually that seems so far not to be possible.

The main problem with performances in the new code is the fact that it
does the ntp correction on every call. John is aware that it is a
problem and will fix that.

> I'd love simplication of the timer subsystem through the use of
> nanosecond offsets. However, the POSIX api always has extra fields
> for seconds and nanoseconds and converting back and forth between the
> internal representation in 64bit nanoseconds and the POSIX structures may
> be another performance penalty since it involves divisions and remainder
> processing.
>
> What I think is a priority need is some subsystem that manages
> time sources effectively (including the ability of the ntp code to
> scale the appropriately) and does that in an arch independent
> way so that all the code can be consolidated. Extract the best existing
> solutions and work from there.

Which is what John is trying to do, so help instead of criticizing :)

Ben.

2005-01-26 16:54:45

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

On Wed, 26 Jan 2005, Benjamin Herrenschmidt wrote:

> On Tue, 2005-01-25 at 16:34 -0800, Christoph Lameter wrote:
>
> > I just hope that the implementation of one arch does not become a standard
> > without sufficient reflection. Could we first get an explanation of
> > the rationale of the offsets? From my viewpoint of the ia64 implementation
> > I have some difficulty understanding why such complicated things as
> > prescale and postscale are necessary in gettimeday and why the simple
> > formula that we use in gettimeofday is not sufficient?
>
> What is complicated here ? The formula, at least as we do on ppc64, is
> simply:
>
> time = (hw_value - prescale offset) / scale + post scale offset

Yes that is basically what we do on ia64 but we use different
terminology.

time = ns_at_last_tick + (hw_value - last_tick_hw_value) * scale >> shift

> > What I think is a priority need is some subsystem that manages
> > time sources effectively (including the ability of the ntp code to
> > scale the appropriately) and does that in an arch independent
> > way so that all the code can be consolidated. Extract the best existing
> > solutions and work from there.
>
> Which is what John is trying to do, so help instead of criticizing :)

I sure hope that we will be doing that. But so far this has been
a new implementation instead otherwise ntp_scale would not be in the
gettimeofday function.

2005-01-26 16:58:56

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

On Wed, 26 Jan 2005, Martin Schwidefsky wrote:

> Why not add an if at the start of gettimeofday to check when the last
> ntp updates has been done and if it has been too long since the last time
> then call ntp_scale ? That way the update isn't done on every call to
> gettimeofday and we don't depend on the regular timer tick.

Because ia64 does not support calling arbitrary C functions in fastcalls.

2005-01-26 23:18:14

by David Mosberger

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday arch specific hooks (v. A2)

>>>>> On Wed, 26 Jan 2005 08:52:12 -0800 (PST), Christoph Lameter <[email protected]> said:

Christoph> On Wed, 26 Jan 2005, Martin Schwidefsky wrote:
>> Why not add an if at the start of gettimeofday to check when the
>> last ntp updates has been done and if it has been too long since
>> the last time then call ntp_scale ? That way the update isn't
>> done on every call to gettimeofday and we don't depend on the
>> regular timer tick.

Christoph> Because ia64 does not support calling arbitrary C
Christoph> functions in fastcalls.

However, it can fall back on a heavy-weight syscall easily. We
already do that on a number of occasions, e.g., if we find a spinlock
already taken. I think it would be OK to have gettimeofday
occasionally fall back on the heavy-weight version to do NTP magic.

--david

2005-02-01 22:06:22

by Tim Bird

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

Minor spelling fix, and a question.

john stultz wrote:
> linux-2.6.11-rc2_timeofday-core_A2.patch
> ========================================
> diff -Nru a/drivers/Makefile b/drivers/Makefile
> --- a/drivers/Makefile 2005-01-24 13:30:06 -08:00
> +++ b/drivers/Makefile 2005-01-24 13:30:06 -08:00
...

> + * all systems. It has the same course resolution as
should be "coarse"

Do you replace get_cmos_time() - it doesn't look like it.

You use it in your patch here...

> diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
> --- a/arch/i386/kernel/time.c 2005-01-24 13:33:59 -08:00
> +++ b/arch/i386/kernel/time.c 2005-01-24 13:33:59 -08:00
...

> +/* arch specific timeofday hooks */
> +nsec_t read_persistent_clock(void)
> +{
> + return (nsec_t)get_cmos_time() * NSEC_PER_SEC;
> +}
> +

I didn't scan for all uses of read_persistent_clock, but
in my experience get_cmos_time() has a latency of up to
1 second on x86 because it synchronizes with the rollover
of the RTC seconds.

This comment in timeofday.c:timeofday_suspend_hook
worries me:

> + /* First off, save suspend start time
> + * then quickly read the time source.
> + * These two calls hopefully occur quickly
> + * because the difference will accumulate as
> + * time drift on resume.
> + */
> + suspend_start = read_persistent_clock();

Do you know if the sync problem is an issue here?

=============================
Tim Bird
Architecture Group Chair, CE Linux Forum
Senior Staff Engineer, Sony Electronics
=============================

2005-02-01 22:54:39

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Tue, 2005-02-01 at 14:06 -0800, Tim Bird wrote:
> Minor spelling fix, and a question.
>
> john stultz wrote:
> > linux-2.6.11-rc2_timeofday-core_A2.patch
> > ========================================
> > diff -Nru a/drivers/Makefile b/drivers/Makefile
> > --- a/drivers/Makefile 2005-01-24 13:30:06 -08:00
> > +++ b/drivers/Makefile 2005-01-24 13:30:06 -08:00
> ...
>
> > + * all systems. It has the same course resolution as
> should be "coarse"

Good catch, I'm a terrible speller.

> Do you replace get_cmos_time() - it doesn't look like it.

Nope, its still used on i386 and x86-64, however I had to create an arch
independent abstraction for set/read_persistent_clock().

> You use it in your patch here...
>
> > diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
> > --- a/arch/i386/kernel/time.c 2005-01-24 13:33:59 -08:00
> > +++ b/arch/i386/kernel/time.c 2005-01-24 13:33:59 -08:00
> ...
>
> > +/* arch specific timeofday hooks */
> > +nsec_t read_persistent_clock(void)
> > +{
> > + return (nsec_t)get_cmos_time() * NSEC_PER_SEC;
> > +}
> > +
>
> I didn't scan for all uses of read_persistent_clock, but
> in my experience get_cmos_time() has a latency of up to
> 1 second on x86 because it synchronizes with the rollover
> of the RTC seconds.

I believe you're right. Although we don't call read_persistent_clock()
very frequently, nor do we call it in ways we don't already call
get_cmos_time(). So I'm not sure exactly what the concern is.

> This comment in timeofday.c:timeofday_suspend_hook
> worries me:
>
> > + /* First off, save suspend start time
> > + * then quickly read the time source.
> > + * These two calls hopefully occur quickly
> > + * because the difference will accumulate as
> > + * time drift on resume.
> > + */
> > + suspend_start = read_persistent_clock();
>
> Do you know if the sync problem is an issue here?

I don't believe so. The full context of the code is this:

/* First off, save suspend start time
* then quickly read the time source.
* These two calls hopefully occur quickly
* because the difference will accumulate as
* time drift on resume.
*/
suspend_start = read_persistent_clock();
now = read_timesource(timesource);

Since we call read_persistent_clock(), it should return right as the
second changes, thus we will be marking the new second as closely as
possible with the timesource value. If the order was reversed, I think
it would be a concern.

I've only lightly tested the suspend code, but on my system I didn't see
very much drift appear. Regardless, it should be better then what the
current suspend/resume code does, which doesn't keep any sub-second
resolution across suspend.

thanks so much for the code review!
-john

2005-02-01 23:12:37

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

Hi John and Tim.

On Wed, 2005-02-02 at 09:48, john stultz wrote:
> > I didn't scan for all uses of read_persistent_clock, but
> > in my experience get_cmos_time() has a latency of up to
> > 1 second on x86 because it synchronizes with the rollover
> > of the RTC seconds.
>
> I believe you're right. Although we don't call read_persistent_clock()
> very frequently, nor do we call it in ways we don't already call
> get_cmos_time(). So I'm not sure exactly what the concern is.

Tim and I talked about this at the recent CELF conference. I have a
concern in that suspend-to-disk calls the suspend methods and then
(after the atomic copy) the resume methods. Since the copy usually takes
< 1s, and the suspend and resume methods both make two calls to
get_coms_time, that's an average of 1.5s per suspend call and 1.5s per
resume call - but if the copy does take next to no time (as normal),
it's really 1.5s + 2s = 3.5s average just for getting the time. I
believe Tim has similar issues in code he is working on. It's a concern
if your battery is running out and you're trying to hibernate!

[...]

> I've only lightly tested the suspend code, but on my system I didn't see
> very much drift appear. Regardless, it should be better then what the
> current suspend/resume code does, which doesn't keep any sub-second
> resolution across suspend.

My question is, "Is there a way we can get sub-second resolution without
waiting for the start of a new second four times in a row?" I'm sure
there must be.

Regards,

Nigel

--
Nigel Cunningham
Software Engineer, Canberra, Australia
http://www.cyclades.com

Ph: +61 (2) 6292 8028 Mob: +61 (417) 100 574

2005-02-01 23:32:44

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Wed, 2005-02-02 at 10:14 +1100, Nigel Cunningham wrote:
> Hi John and Tim.
>
> On Wed, 2005-02-02 at 09:48, john stultz wrote:
> > > I didn't scan for all uses of read_persistent_clock, but
> > > in my experience get_cmos_time() has a latency of up to
> > > 1 second on x86 because it synchronizes with the rollover
> > > of the RTC seconds.
> >
> > I believe you're right. Although we don't call read_persistent_clock()
> > very frequently, nor do we call it in ways we don't already call
> > get_cmos_time(). So I'm not sure exactly what the concern is.
>
> Tim and I talked about this at the recent CELF conference. I have a
> concern in that suspend-to-disk calls the suspend methods and then
> (after the atomic copy) the resume methods. Since the copy usually takes
> < 1s, and the suspend and resume methods both make two calls to
> get_coms_time, that's an average of 1.5s per suspend call and 1.5s per
> resume call - but if the copy does take next to no time (as normal),
> it's really 1.5s + 2s = 3.5s average just for getting the time. I
> believe Tim has similar issues in code he is working on. It's a concern
> if your battery is running out and you're trying to hibernate!

Well, counting the atomic copy in the "3.5s average just for getting the
time" doesn't quite seem fair, but I think I understand. Its
interesting, I wasn't aware of the suspend/copy/resume process that
occurs for suspend-to-disk. The thing I don't quite get is why are the
resume methods called before we really suspend to disk?

> > I've only lightly tested the suspend code, but on my system I didn't see
> > very much drift appear. Regardless, it should be better then what the
> > current suspend/resume code does, which doesn't keep any sub-second
> > resolution across suspend.
>
> My question is, "Is there a way we can get sub-second resolution without
> waiting for the start of a new second four times in a row?" I'm sure
> there must be.

Well, I'm not sure what else we could use for the persistent clock, but
I'd be happy to change the read/set_persistent_clock function to use it.

thanks
-john

2005-02-01 23:55:19

by Tim Bird

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

john stultz wrote:
> I believe you're right. Although we don't call read_persistent_clock()
> very frequently, nor do we call it in ways we don't already call
> get_cmos_time(). So I'm not sure exactly what the concern is.

Sorry - I should have given more context. I am worried about
suspend and resume times. An extra (up-to-a) second delay on
suspend it pretty painful for CE devices. (See my SIG for
my other hat in the forum.)

>
> Since we call read_persistent_clock(), it should return right as the
> second changes, thus we will be marking the new second as closely as
> possible with the timesource value. If the order was reversed, I think
> it would be a concern.
>

It sounds like for your code, this synchronization is a valuable.
For many CE products, the synchronization is not needed. I have a
patch that removes the synchronization for i386 and ppc, but
I haven't submitted it because I didn't want to mess up
non-boot-context callers of get_cmos_time which have valid
synchronization needs.

As you can see below, the patch is pretty braindead.
I was wondering if this conflicted with your new timer system or
not.

diffstat:
arch/ppc/kernel/time.c | 10 ++++++++--
include/asm-i386/mach-default/mach_time.h | 6 +++++-
init/Kconfig | 27 +++++++++++++++++++++++++++
3 files changed, 40 insertions(+), 3 deletions(-)

Signed-off-by: Tim Bird <[email protected]>

-----------------------
diff -pruN -X /home/tbird/dontdiff linux-2.6.10.orig/arch/ppc/kernel/time.c linux-2.6.10/arch/ppc/kernel/time.c
--- linux-2.6.10.orig/arch/ppc/kernel/time.c 2004-12-24 13:35:23.000000000 -0800
+++ linux-2.6.10/arch/ppc/kernel/time.c 2005-02-01 15:28:42.539108108 -0800
@@ -291,8 +291,12 @@ EXPORT_SYMBOL(do_settimeofday);
/* This function is only called on the boot processor */
void __init time_init(void)
{
- time_t sec, old_sec;
- unsigned old_stamp, stamp, elapsed;
+ time_t sec;
+ unsigned stamp;
+#ifndef CONFIG_RTC_NO_SYNC
+ time_t old_sec;
+ unsigned old_stamp, elapsed;
+#endif

if (ppc_md.time_init != NULL)
time_offset = ppc_md.time_init();
@@ -317,6 +321,7 @@ void __init time_init(void)
stamp = get_native_tbl();
if (ppc_md.get_rtc_time) {
sec = ppc_md.get_rtc_time();
+#ifndef CONFIG_RTC_NO_SYNC
elapsed = 0;
do {
old_stamp = stamp;
@@ -329,6 +334,7 @@ void __init time_init(void)
} while ( sec == old_sec && elapsed < 2*HZ*tb_ticks_per_jiffy);
if (sec==old_sec)
printk("Warning: real time clock seems stuck!\n");
+#endif
xtime.tv_sec = sec;
xtime.tv_nsec = 0;
/* No update now, we just read the time from the RTC ! */
diff -pruN -X /home/tbird/dontdiff linux-2.6.10.orig/include/asm-i386/mach-default/mach_time.h linux-2.6.10/include/asm-i386/mach-default/mach_time.h
--- linux-2.6.10.orig/include/asm-i386/mach-default/mach_time.h 2004-12-24 13:34:30.000000000 -0800
+++ linux-2.6.10/include/asm-i386/mach-default/mach_time.h 2005-02-01 15:28:48.245009070 -0800
@@ -89,6 +89,7 @@ static inline unsigned long mach_get_cmo
* RTC registers show the second which has precisely just started.
* Let's hope other operating systems interpret the RTC the same way.
*/
+#ifndef CONFIG_RTC_NO_SYNC_ON_READ
/* read RTC exactly on falling edge of update flag */
for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... */
if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)
@@ -96,7 +97,10 @@ static inline unsigned long mach_get_cmo
for (i = 0 ; i < 1000000 ; i++) /* must try at least 2.228 ms */
if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
break;
- do { /* Isn't this overkill ? UIP above should guarantee consistency */
+/* The following is probably overkill because
+ * UIP above should guarantee consistency */
+#endif
+ do {
sec = CMOS_READ(RTC_SECONDS);
min = CMOS_READ(RTC_MINUTES);
hour = CMOS_READ(RTC_HOURS);
diff -pruN -X /home/tbird/dontdiff linux-2.6.10.orig/init/Kconfig linux-2.6.10/init/Kconfig
--- linux-2.6.10.orig/init/Kconfig 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/init/Kconfig 2005-02-01 15:28:48.281002137 -0800
@@ -248,6 +248,33 @@ config IKCONFIG_PROC
This option enables access to the kernel configuration file
through /proc/config.gz.

+menuconfig FASTBOOT
+ bool "Fast boot options"
+ help
+ Say Y here to select among various options that can decrease
+ kernel boot time. These options may involve providing
+ hardcoded values for some parameters that the kernel usually
+ determines automatically.
+
+ This option is useful primarily on embedded systems.
+
+ If unsure, say N.
+
+config RTC_NO_SYNC
+ bool "Disable synch on read of Real Time Clock" if FASTBOOT
+ default n
+ help
+ The Real Time Clock is read aligned by default. That means a
+ series of reads of the RTC are done until it's verified that
+ the RTC's state has just changed. If you enable this feature,
+ this synchronization will not be performed. The result is that
+ the machine will boot up to 1 second faster.
+
+ A drawback is that, with this option enabled, your system
+ clock may drift from the correct value over the course
+ of several boot cycles (under certain circumstances).
+
+ If unsure, say N.

menuconfig EMBEDDED
bool "Configure standard kernel features (for small systems)"

2005-02-02 00:01:53

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

Hi.

On Wed, 2005-02-02 at 10:32, john stultz wrote:
> On Wed, 2005-02-02 at 10:14 +1100, Nigel Cunningham wrote:
> > Hi John and Tim.
> >
> > On Wed, 2005-02-02 at 09:48, john stultz wrote:
> > > > I didn't scan for all uses of read_persistent_clock, but
> > > > in my experience get_cmos_time() has a latency of up to
> > > > 1 second on x86 because it synchronizes with the rollover
> > > > of the RTC seconds.
> > >
> > > I believe you're right. Although we don't call read_persistent_clock()
> > > very frequently, nor do we call it in ways we don't already call
> > > get_cmos_time(). So I'm not sure exactly what the concern is.
> >
> > Tim and I talked about this at the recent CELF conference. I have a
> > concern in that suspend-to-disk calls the suspend methods and then
> > (after the atomic copy) the resume methods. Since the copy usually takes
> > < 1s, and the suspend and resume methods both make two calls to
> > get_coms_time, that's an average of 1.5s per suspend call and 1.5s per
> > resume call - but if the copy does take next to no time (as normal),
> > it's really 1.5s + 2s = 3.5s average just for getting the time. I
> > believe Tim has similar issues in code he is working on. It's a concern
> > if your battery is running out and you're trying to hibernate!
>
> Well, counting the atomic copy in the "3.5s average just for getting the
> time" doesn't quite seem fair, but I think I understand. Its

You're right. Maybe we could say 3s, accounting .5s of that 3.5s average
delay as being for the atomic copy.

> interesting, I wasn't aware of the suspend/copy/resume process that
> occurs for suspend-to-disk. The thing I don't quite get is why are the
> resume methods called before we really suspend to disk?

We call the suspend and resume methods because the suspend is supposed
to achieve atomicity, and the resume is necessary for us to be able to
write the image. (Remember that these calls are invoked as part of the
drivers_suspend and drivers_resume code). Until recently the
sysdev_suspend and resume methods weren't called and things did still
work, but that was an omission and we did then run into time issues.

> > > I've only lightly tested the suspend code, but on my system I didn't see
> > > very much drift appear. Regardless, it should be better then what the
> > > current suspend/resume code does, which doesn't keep any sub-second
> > > resolution across suspend.
> >
> > My question is, "Is there a way we can get sub-second resolution without
> > waiting for the start of a new second four times in a row?" I'm sure
> > there must be.
>
> Well, I'm not sure what else we could use for the persistent clock, but
> I'd be happy to change the read/set_persistent_clock function to use it.

Is it possible to still use the persistent clock, but do the math for
the portions of seconds?

By the way, Tim, I hope I didn't misunderstand anything, and that these
_are_ the same issues you had!

Regards,

Nigel

> thanks
> -john
--
Nigel Cunningham
Software Engineer, Canberra, Australia
http://www.cyclades.com

Ph: +61 (2) 6292 8028 Mob: +61 (417) 100 574

2005-02-02 00:19:43

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Tue, 2005-02-01 at 15:53 -0800, Tim Bird wrote:
> john stultz wrote:
> > I believe you're right. Although we don't call read_persistent_clock()
> > very frequently, nor do we call it in ways we don't already call
> > get_cmos_time(). So I'm not sure exactly what the concern is.
>
> Sorry - I should have given more context. I am worried about
> suspend and resume times. An extra (up-to-a) second delay on
> suspend it pretty painful for CE devices. (See my SIG for
> my other hat in the forum.)

Ok, Nigel clarified it pretty well. Thanks.

> >
> > Since we call read_persistent_clock(), it should return right as the
> > second changes, thus we will be marking the new second as closely as
> > possible with the timesource value. If the order was reversed, I think
> > it would be a concern.
> >
>
> It sounds like for your code, this synchronization is a valuable.

Well, it just affects how much time error we gain on suspend/resume. We
can't be perfect (well, unless our active timesource is persistent
clock), and the comment points that we're just trying to minimize the
error.

> For many CE products, the synchronization is not needed. I have a
> patch that removes the synchronization for i386 and ppc, but
> I haven't submitted it because I didn't want to mess up
> non-boot-context callers of get_cmos_time which have valid
> synchronization needs.

Interesting patch. Indeed, the trade off is just how quickly you want to
boot vs how much drift you gain each suspend/resume cycle. Assuming all
of the clocks are good, your patch could introduce up to 2 seconds of
drift each suspend/resume cycle.

> As you can see below, the patch is pretty braindead.
> I was wondering if this conflicted with your new timer system or
> not.

Not really. The issue is present with or without my code.

thanks
-john

2005-02-02 00:27:49

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Wed, 2005-02-02 at 11:04 +1100, Nigel Cunningham wrote:
> Hi.
>
> On Wed, 2005-02-02 at 10:32, john stultz wrote:
> > interesting, I wasn't aware of the suspend/copy/resume process that
> > occurs for suspend-to-disk. The thing I don't quite get is why are the
> > resume methods called before we really suspend to disk?
>
> We call the suspend and resume methods because the suspend is supposed
> to achieve atomicity, and the resume is necessary for us to be able to
> write the image. (Remember that these calls are invoked as part of the
> drivers_suspend and drivers_resume code). Until recently the
> sysdev_suspend and resume methods weren't called and things did still
> work, but that was an omission and we did then run into time issues.

Ah! Ok, thanks for the summary.

> > > > I've only lightly tested the suspend code, but on my system I didn't see
> > > > very much drift appear. Regardless, it should be better then what the
> > > > current suspend/resume code does, which doesn't keep any sub-second
> > > > resolution across suspend.
> > >
> > > My question is, "Is there a way we can get sub-second resolution without
> > > waiting for the start of a new second four times in a row?" I'm sure
> > > there must be.
> >
> > Well, I'm not sure what else we could use for the persistent clock, but
> > I'd be happy to change the read/set_persistent_clock function to use it.
>
> Is it possible to still use the persistent clock, but do the math for
> the portions of seconds?

I'm not sure what you mean? Given the patch Tim just sent, it seems the
issue is the CMOS only gives us second resolution, so we try to increase
our accuracy by aligning the reads so we return when the second changes.
We can avoid the read-alignment which speeds things up, but introduces
up to a second worth of drift. If that's ok, then the trade off is worth
it.

Alternative persistent clocks like the efi clock might provide better
resolution and could then avoid this issue. Although I don't know for
sure.

thanks
-john

2005-02-02 00:34:55

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

Hi.

On Wed, 2005-02-02 at 11:27, john stultz wrote:
> > We call the suspend and resume methods because the suspend is supposed
> > to achieve atomicity, and the resume is necessary for us to be able to
> > write the image. (Remember that these calls are invoked as part of the
> > drivers_suspend and drivers_resume code). Until recently the
> > sysdev_suspend and resume methods weren't called and things did still
> > work, but that was an omission and we did then run into time issues.
>
> Ah! Ok, thanks for the summary.

No problem.

> > > > > I've only lightly tested the suspend code, but on my system I didn't see
> > > > > very much drift appear. Regardless, it should be better then what the
> > > > > current suspend/resume code does, which doesn't keep any sub-second
> > > > > resolution across suspend.
> > > >
> > > > My question is, "Is there a way we can get sub-second resolution without
> > > > waiting for the start of a new second four times in a row?" I'm sure
> > > > there must be.
> > >
> > > Well, I'm not sure what else we could use for the persistent clock, but
> > > I'd be happy to change the read/set_persistent_clock function to use it.
> >
> > Is it possible to still use the persistent clock, but do the math for
> > the portions of seconds?
>
> I'm not sure what you mean? Given the patch Tim just sent, it seems the
> issue is the CMOS only gives us second resolution, so we try to increase
> our accuracy by aligning the reads so we return when the second changes.
> We can avoid the read-alignment which speeds things up, but introduces
> up to a second worth of drift. If that's ok, then the trade off is worth
> it.
>
> Alternative persistent clocks like the efi clock might provide better
> resolution and could then avoid this issue. Although I don't know for
> sure.

Ah. Okay. I hadn't looked that closely so that I realised the CMOS only
gives the accuracy we're using. Humble apologies. So then, I agree: it
would be best if we can move to something with greater precision and
make mileage from it. Is that an option on all x86 machines though? I
guess cmos is the lowest common denominator :<

Nigel
--
Nigel Cunningham
Software Engineer, Canberra, Australia
http://www.cyclades.com

Ph: +61 (2) 6292 8028 Mob: +61 (417) 100 574

2005-02-02 01:48:37

by Tim Bird

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

john stultz wrote:
> Interesting patch. Indeed, the trade off is just how quickly you want to
> boot vs how much drift you gain each suspend/resume cycle. Assuming all
> of the clocks are good, your patch could introduce up to 2 seconds of
> drift each suspend/resume cycle.

If we're not writing to the RTC on suspend, then I believe the drift is
capped. For some consumer products, 2 seconds of drift is OK.

Nigel, does the RTC get written to, or just read, on suspend?

Also, I'm worried about the clock appearing to run backwards over a suspend.
Unless a suspend/resume cycle took less than 1 second, I don't think this could
happen. Is that right?

=============================
Tim Bird
Architecture Group Chair, CE Linux Forum
Senior Staff Engineer, Sony Electronics
=============================

2005-02-02 02:00:28

[permalink] [raw]

Subject: Re: [RFC][PATCH] new timeofday core subsystem (v. A2)

On Tue, 2005-02-01 at 17:48 -0800, Tim Bird wrote:
> john stultz wrote:
> > Interesting patch. Indeed, the trade off is just how quickly you want to
> > boot vs how much drift you gain each suspend/resume cycle. Assuming all
> > of the clocks are good, your patch could introduce up to 2 seconds of
> > drift each suspend/resume cycle.
>
> If we're not writing to the RTC on suspend, then I believe the drift is
> capped. For some consumer products, 2 seconds of drift is OK.
>
> Nigel, does the RTC get written to, or just read, on suspend?

I'll let Nigel respond, but I don't believe so. The time code only
writes out to the CMOS every X-minutes if we're synced w/ the NTP
server.

> Also, I'm worried about the clock appearing to run backwards over a suspend.
> Unless a suspend/resume cycle took less than 1 second, I don't think this could
> happen. Is that right?

Well (with my code, the existing code might be slightly different), on
suspend we read the persistent clock and we accumulate all the time that
has passed on the timesource. Then on resume we read the persistent
clock, the delta between persistent clock reads (which cannot be
negative unless the CMOS runs backwards) is added to the system time and
a new time interval is started from the current value of the
timesource.

So, unless something tweaks the CMOS between reads, or the hardware has
problems, then time should not go backwards.

thanks
-john

2005-02-02 02:21:05