2005-04-29 22:49:08

by john stultz

[permalink] [raw]
Subject: [RFC][PATCH (1/4)] new timeofday core subsystem (v A4)

All,
This patch implements the architecture independent portion of
the time of day subsystem. For a brief description on the rework, see
here: http://lwn.net/Articles/120850/ (Many thanks to the LWN team for
that clear writeup!)

Mostly this version is just a cleanup of the last release. One neat
feature is the new sysfs interface which allows you to manually override
the selected timesource while the system is running.

Included below is timeofday.c (which includes all the time of day
management and accessor functions), ntp.c (which includes the ntp
scaling calculation code, leapsecond processing, and ntp kernel state
machine code), timesource.c (for timesource specific management
functions), interface definition .h files, the example jiffies
timesource (lowest common denominator time source, mainly for use as
example code) and minimal hooks into arch independent code.

The patch does not function without minimal architecture specific hooks
(i386, x86-64, ppc32, ppc64, ia64 and s390 examples to follow), and it
should be able to be applied to a tree without affecting the code.

New in this version:
o Improved cyc2ns remainder handling
o Added getnstimeofday() interface
o Better timesource management
o Sysfs interface for overriding timesources
o Cleanups from Nish Aravamudan and Matt Mackall

Items still on the TODO list:
o make ntp adjustments be in ppb instead of ppm
o posix-timers integration
o boot time "timesource=" override option

I look forward to your comments and feedback.

thanks
-john

linux-2.6.12-rc2_timeofday-core_A4.patch
=========================================
diff -Nru a/drivers/Makefile b/drivers/Makefile
--- a/drivers/Makefile 2005-04-29 15:12:09 -07:00
+++ b/drivers/Makefile 2005-04-29 15:12:09 -07:00
@@ -64,3 +64,4 @@
obj-$(CONFIG_BLK_DEV_SGIIOC4) += sn/
obj-y += firmware/
obj-$(CONFIG_CRYPTO) += crypto/
+obj-$(CONFIG_NEWTOD) += timesource/
diff -Nru a/drivers/timesource/Makefile b/drivers/timesource/Makefile
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/Makefile 2005-04-29 15:12:09 -07:00
@@ -0,0 +1 @@
+obj-y += jiffies.o
diff -Nru a/drivers/timesource/jiffies.c b/drivers/timesource/jiffies.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/jiffies.c 2005-04-29 15:12:09 -07:00
@@ -0,0 +1,45 @@
+/*
+ * linux/drivers/timesource/jiffies.c
+ *
+ * Copyright (C) 2004 IBM
+ *
+ * This file contains the jiffies based time source.
+ *
+ */
+#include <linux/timesource.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+
+/* The Jiffies based timesource is the lowest common
+ * denominator time source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability for the timer
+ * interrupt hardware to accurately tick at the
+ * requested HZ value. It is also not recommended
+ * for "tick-less" systems.
+ */
+
+/* Timesource read hook: the current 64-bit jiffies count as a cycle value. */
+static cycle_t jiffies_read(void)
+{
+	return (cycle_t)get_jiffies_64();
+}
+
+/* Jiffies timesource descriptor, registered by init_jiffies_timesource().
+ * One "cycle" is one jiffy, read through jiffies_read().
+ */
+struct timesource_t timesource_jiffies = {
+ .name = "jiffies",
+ .priority = 0, /* lowest priority*/
+ .type = TIMESOURCE_FUNCTION,
+ .read_fnct = jiffies_read,
+ /* all 64 bits of the counter are significant */
+ .mask = (cycle_t)-1,
+ /* nanoseconds per jiffy, rounded to nearest; with shift == 0
+ * cyc2ns() reduces to cycles * mult
+ */
+ .mult = (NSEC_PER_SEC+(HZ/2))/HZ,
+ .shift = 0,
+};
+
+/* Registers the jiffies timesource with the timesource core.
+ * Runs once as an initcall, so it is marked __init to let the
+ * kernel discard it after boot. Always returns 0.
+ */
+static int __init init_jiffies_timesource(void)
+{
+	register_timesource(&timesource_jiffies);
+	return 0;
+}
+module_init(init_jiffies_timesource);
diff -Nru a/include/linux/ntp.h b/include/linux/ntp.h
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/include/linux/ntp.h 2005-04-29 15:12:09 -07:00
@@ -0,0 +1,22 @@
+/* linux/include/linux/ntp.h
+ *
+ * Copyright (C) 2003, 2004, 2005 IBM, John Stultz ([email protected])
+ *
+ * This file contains the NTP state machine accessor functions.
+ */
+
+#ifndef _LINUX_NTP_H
+#define _LINUX_NTP_H
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+
+/* NTP state machine interfaces (implemented in kernel/ntp.c).
+ * ntp_scale() is no longer declared here: it was removed from ntp.c
+ * and the ntp adjustment is applied in cyc2ns() instead.
+ */
+
+/* advance the state machine by interval ns; returns the ppm adjustment */
+int ntp_advance(nsec_t interval);
+/* validate and apply a timex request; returns clock state or -EINVAL */
+int ntp_adjtimex(struct timex *tx);
+/* leapsecond processing; returns seconds (-1, 0 or 1) to add to the time */
+int ntp_leapsecond(struct timespec now);
+/* reset error estimates and pending adjustments, mark clock unsynced */
+void ntp_clear(void);
+/* returns the current STA_* status bits */
+int get_ntp_status(void);
+
+#endif
diff -Nru a/include/linux/time.h b/include/linux/time.h
--- a/include/linux/time.h 2005-04-29 15:12:09 -07:00
+++ b/include/linux/time.h 2005-04-29 15:12:09 -07:00
@@ -27,6 +27,10 @@

#ifdef __KERNEL__

+/* timeofday base types */
+typedef u64 nsec_t;
+typedef u64 cycle_t;
+
/* Parameters used to convert the timespec values */
#ifndef USEC_PER_SEC
#define USEC_PER_SEC (1000000L)
diff -Nru a/include/linux/timeofday.h b/include/linux/timeofday.h
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/include/linux/timeofday.h 2005-04-29 15:12:09 -07:00
@@ -0,0 +1,65 @@
+/* linux/include/linux/timeofday.h
+ *
+ * Copyright (C) 2003, 2004, 2005 IBM, John Stultz ([email protected])
+ *
+ * This file contains the interface to the time of day subsystem
+ */
+#ifndef _LINUX_TIMEOFDAY_H
+#define _LINUX_TIMEOFDAY_H
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+#include <asm/div64.h>
+
+#ifdef CONFIG_NEWTOD
+nsec_t get_lowres_timestamp(void);
+nsec_t get_lowres_timeofday(void);
+nsec_t do_monotonic_clock(void);
+
+void do_gettimeofday(struct timeval *tv);
+int do_settimeofday(struct timespec *tv);
+int do_adjtimex(struct timex *tx);
+
+void timeofday_suspend_hook(void);
+void timeofday_resume_hook(void);
+
+void timeofday_init(void);
+
+
+/* Helper functions */
+/* ns2timeval():
+ *	Converts a nanosecond count into a struct timeval, rounding to
+ *	the nearest microsecond.
+ *
+ *	The rounding is applied before the value is split into seconds
+ *	and a remainder. Rounding the remainder afterwards could yield
+ *	tv_usec == 1000000 (e.g. for a remainder of 999999500ns), which
+ *	is not a valid timeval; rounding first carries into tv_sec.
+ */
+static inline struct timeval ns2timeval(nsec_t ns)
+{
+	struct timeval tv;
+
+	ns += NSEC_PER_USEC / 2;
+	tv.tv_sec = div_long_long_rem(ns, NSEC_PER_SEC, &tv.tv_usec);
+	/* remainder is still in nanoseconds here; truncate to usec */
+	tv.tv_usec /= NSEC_PER_USEC;
+	return tv;
+}
+
+/* ns2timespec():
+ *	Splits a nanosecond count into whole seconds and the
+ *	remaining nanoseconds.
+ */
+static inline struct timespec ns2timespec(nsec_t ns)
+{
+	struct timespec result;
+	long nsec_part;
+
+	result.tv_sec = div_long_long_rem(ns, NSEC_PER_SEC, &nsec_part);
+	result.tv_nsec = nsec_part;
+	return result;
+}
+
+/* timespec2ns():
+ *	Flattens a struct timespec into a single nanosecond count.
+ */
+static inline nsec_t timespec2ns(struct timespec* ts)
+{
+	return ((nsec_t)ts->tv_sec) * NSEC_PER_SEC + ts->tv_nsec;
+}
+
+/* timeval2ns():
+ *	Flattens a struct timeval into a single nanosecond count.
+ */
+static inline nsec_t timeval2ns(struct timeval* tv)
+{
+	nsec_t whole_secs = ((nsec_t)tv->tv_sec) * NSEC_PER_SEC;
+
+	return whole_secs + tv->tv_usec * NSEC_PER_USEC;
+}
+#else /* CONFIG_NEWTOD */
+/* No-op stubs for kernels built without CONFIG_NEWTOD. They expand to
+ * a proper statement so that "if (x) timeofday_init();" stays a single
+ * well-formed statement instead of an empty if body.
+ */
+#define timeofday_suspend_hook() do { } while (0)
+#define timeofday_resume_hook() do { } while (0)
+#define timeofday_init() do { } while (0)
+#endif /* CONFIG_NEWTOD */
+#endif /* _LINUX_TIMEOFDAY_H */
diff -Nru a/include/linux/timesource.h b/include/linux/timesource.h
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/include/linux/timesource.h 2005-04-29 15:12:09 -07:00
@@ -0,0 +1,159 @@
+/* linux/include/linux/timesource.h
+ *
+ * Copyright (C) 2003, 2004, 2005 IBM, John Stultz ([email protected])
+ *
+ * This file contains the structure definitions for timesources.
+ *
+ * If you are not a timesource, or the time of day code, you should
+ * not be including this file!
+ */
+#ifndef _LINUX_TIMESOURCE_H
+#define _LINUX_TIMESOURCE_H
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+#include <asm/io.h>
+#include <asm/div64.h>
+
+/* struct timesource_t:
+ * Provides mostly state-free accessors to the underlying hardware.
+ *
+ * name: ptr to timesource name
+ * priority: priority value for selection (higher is better)
+ * type: defines timesource type; read_timesource() switches
+ * on it to pick how the counter is read
+ * read_fnct: returns a cycle value (used by read_timesource()
+ * for TIMESOURCE_FUNCTION)
+ * mmio_ptr: ptr to MMIO'ed counter (used by read_timesource()
+ * for TIMESOURCE_MMIO_32 and TIMESOURCE_MMIO_64)
+ * mask: bitmask for two's complement
+ * subtraction of non 64 bit counters
+ * mult: cycle to nanosecond multiplier
+ * shift: cycle to nanosecond divisor (power of two)
+ * update_callback: called when safe to alter timesource values
+ */
+struct timesource_t {
+ char* name;
+ int priority;
+ enum {
+ TIMESOURCE_FUNCTION,
+ TIMESOURCE_CYCLES,
+ TIMESOURCE_MMIO_32,
+ TIMESOURCE_MMIO_64
+ } type;
+ cycle_t (*read_fnct)(void);
+ void __iomem *mmio_ptr;
+ cycle_t mask;
+ u32 mult;
+ u32 shift;
+ void (*update_callback)(void);
+};
+
+
+/* Helper function that converts a khz counter
+ * frequency to a timesource multiplier, given the
+ * timesource shift value.
+ *
+ * khz: counter frequency in kHz (must be non-zero)
+ * shift_constant: the timesource's shift value
+ * Returns the multiplier, rounded to the nearest integer and
+ * truncated to 32 bits.
+ */
+static inline u32 timesource_khz2mult(u32 khz, u32 shift_constant)
+{
+	/* khz = cyc/(Million ns)
+	 * mult/2^shift = ns/cyc
+	 * mult = ns/cyc * 2^shift
+	 * mult = 1Million/khz * 2^shift
+	 * mult = 1000000 * 2^shift / khz
+	 * mult = (1000000<<shift) / khz
+	 */
+	u64 tmp = ((u64)1000000) << shift_constant;
+
+	/* do_div() truncates; bias by half the divisor to round to nearest */
+	tmp += khz / 2;
+	do_div(tmp, khz);
+	return (u32)tmp;
+}
+
+/* Helper function that converts a hz counter
+ * frequency to a timesource multiplier, given the
+ * timesource shift value.
+ *
+ * hz: counter frequency in Hz (must be non-zero)
+ * shift_constant: the timesource's shift value
+ * Returns the multiplier, rounded to the nearest integer and
+ * truncated to 32 bits.
+ */
+static inline u32 timesource_hz2mult(u32 hz, u32 shift_constant)
+{
+	/* hz = cyc/(Billion ns)
+	 * mult/2^shift = ns/cyc
+	 * mult = ns/cyc * 2^shift
+	 * mult = 1Billion/hz * 2^shift
+	 * mult = 1000000000 * 2^shift / hz
+	 * mult = (1000000000<<shift) / hz
+	 */
+	u64 tmp = ((u64)1000000000) << shift_constant;
+
+	/* do_div() truncates; bias by half the divisor to round to nearest */
+	tmp += hz / 2;
+	do_div(tmp, hz);
+	return (u32)tmp;
+}
+
+
+/* XXX - this should go somewhere better! */
+/* Fallback readq() for configurations that do not define one: builds a
+ * 64-bit value from two 32-bit readl() accesses, taking the low word
+ * at addr and the high word at addr+4.
+ */
+#ifndef readq
+static inline unsigned long long readq(void __iomem *addr)
+{
+ u32 low, high;
+ /* loop is required to make sure we get an atomic read:
+ * the high word is read before and after the low word, and
+ * the read is retried if the high word changed in between
+ */
+ do {
+ high = readl(addr+4);
+ low = readl(addr);
+ } while (high != readl(addr+4));
+
+ return low | (((unsigned long long)high) << 32LL);
+}
+#endif
+
+
+/* read_timesource():
+ *	Reads the current cycle count from the given timesource,
+ *	using the access method selected by ts->type. Any type that
+ *	is not CYCLES or MMIO falls back to the read_fnct hook.
+ */
+static inline cycle_t read_timesource(struct timesource_t *ts)
+{
+	cycle_t now;
+
+	switch (ts->type) {
+	case TIMESOURCE_CYCLES:
+		now = (cycle_t)get_cycles();
+		break;
+	case TIMESOURCE_MMIO_64:
+		now = (cycle_t)readq(ts->mmio_ptr);
+		break;
+	case TIMESOURCE_MMIO_32:
+		now = (cycle_t)readl(ts->mmio_ptr);
+		break;
+	default:/* case: TIMESOURCE_FUNCTION */
+		now = ts->read_fnct();
+		break;
+	}
+
+	return now;
+}
+
+/* cyc2ns():
+ *	Converts a cycle count to nanoseconds using the timesource's
+ *	mult/shift pair, with the ntp adjustment added to the
+ *	multiplier.
+ */
+static inline nsec_t cyc2ns(struct timesource_t *ts, int ntp_adj, cycle_t cycles)
+{
+	u64 scaled = (u64)cycles * (ts->mult + ntp_adj);
+
+	return (nsec_t)(scaled >> ts->shift);
+}
+
+/* cyc2ns_rem():
+ *	Uses the timesource and ntp adjustment interval to
+ *	convert cycle_ts to nanoseconds. Add in remainder portion
+ *	which is stored in ns<<ts->shift units and save the new
+ *	remainder off.
+ *
+ *	rem may be NULL, in which case no remainder is carried and
+ *	the result matches cyc2ns().
+ */
+static inline nsec_t cyc2ns_rem(struct timesource_t *ts, int ntp_adj, cycle_t cycles, u64* rem)
+{
+	u64 ret;
+
+	ret = (u64)cycles;
+	ret *= (ts->mult + ntp_adj);
+	if (rem) {
+		ret += *rem;
+		/* build the mask in 64 bits: a plain "1 << shift" is an
+		 * int shift and overflows once shift reaches 31
+		 */
+		*rem = ret & (((u64)1 << ts->shift) - 1);
+	}
+	ret >>= ts->shift;
+	return (nsec_t)ret;
+}
+
+/* used to install a new time source */
+void register_timesource(struct timesource_t*);
+struct timesource_t* get_next_timesource(void);
+
+#endif
diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c 2005-04-29 15:12:09 -07:00
+++ b/init/main.c 2005-04-29 15:12:09 -07:00
@@ -47,6 +47,7 @@
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
+#include <linux/timeofday.h>

#include <asm/io.h>
#include <asm/bugs.h>
@@ -467,6 +468,7 @@
pidhash_init();
init_timers();
softirq_init();
+ timeofday_init();
time_init();

/*
diff -Nru a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile 2005-04-29 15:12:09 -07:00
+++ b/kernel/Makefile 2005-04-29 15:12:09 -07:00
@@ -9,6 +9,7 @@
rcupdate.o intermodule.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o

+obj-$(CONFIG_NEWTOD) += timeofday.o timesource.o ntp.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
diff -Nru a/kernel/ntp.c b/kernel/ntp.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/kernel/ntp.c 2005-04-29 15:12:09 -07:00
@@ -0,0 +1,498 @@
+/********************************************************************
+* linux/kernel/ntp.c
+*
+* NTP state machine and time scaling code.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz ([email protected])
+*
+* Portions rewritten from kernel/time.c and kernel/timer.c
+* Please see those files for original copyrights.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+* Notes:
+*
+* Hopefully you should never have to understand or touch
+* any of the code below. but don't let that keep you from trying!
+*
+* This code is loosely based on David Mills' RFC 1589 and its
+* updates. Please see the following for more details:
+* http://www.eecis.udel.edu/~mills/database/rfc/rfc1589.txt
+* http://www.eecis.udel.edu/~mills/database/reports/kern/kernb.pdf
+*
+* NOTE: To simplify the code, we do not implement any of
+* the PPS code, as the code that uses it never was merged.
+* [email protected]
+*
+* Revision History:
+* 2004-09-02: A0
+* o First pass sent to lkml for review.
+* 2004-12-07: A1
+* o No changes, sent to lkml for review.
+* 2005-03-11: A3
+* o yanked ntp_scale(), ntp adjustments are done in cyc2ns
+* ??????????: A4
+* o Added conditional debug info
+*
+* TODO List:
+* o Move to using ppb for frequency adjustments
+* o More documentation
+* o More testing
+* o More optimization
+*********************************************************************/
+
+#include <linux/ntp.h>
+#include <linux/errno.h>
+
+/* XXX - remove later */
+#define NTP_DEBUG 0
+
+/* NTP scaling code
+ * Functions:
+ * ----------
+ * (ntp_scale() was removed in A3; the ntp adjustment is now
+ * applied in cyc2ns())
+ * int ntp_advance(nsec_t interval):
+ * Increments the NTP state machine by interval time and
+ * returns the ppm adjustment for the next interval
+ * static int ntp_hardupdate(long offset, struct timeval tv)
+ * ntp_adjtimex helper function
+ * int ntp_adjtimex(struct timex* tx):
+ * Interface to adjust NTP state machine
+ * int ntp_leapsecond(struct timespec now)
+ * Does NTP leapsecond processing. Returns number of
+ * seconds current time should be adjusted by.
+ * void ntp_clear(void):
+ * Clears the ntp kernel state
+ * int get_ntp_status(void):
+ * returns ntp_status value
+ *
+ * Variables:
+ * ----------
+ * ntp kernel state variables:
+ * See below for full list.
+ * ntp_lock:
+ * Protects ntp kernel state variables
+ */
+
+
+
+/* Chapter 5: Kernel Variables [RFC 1589 pg. 28] */
+/* 5.1 Interface Variables */
+static int ntp_status = STA_UNSYNC; /* status */
+static long ntp_offset; /* usec */
+static long ntp_constant = 2; /* ntp magic? */
+static long ntp_maxerror = NTP_PHASE_LIMIT; /* usec */
+static long ntp_esterror = NTP_PHASE_LIMIT; /* usec */
+static const long ntp_tolerance = MAXFREQ; /* shifted ppm */
+static const long ntp_precision = 1; /* constant */
+
+/* 5.2 Phase-Lock Loop Variables */
+static long ntp_freq; /* shifted ppm */
+static long ntp_reftime; /* sec */
+
+/* Extra values */
+static int ntp_state = TIME_OK; /* leapsecond state */
+static long ntp_tick = USEC_PER_SEC/USER_HZ; /* tick length */
+
+static s64 ss_offset_len; /* SINGLESHOT offset adj interval (nsec)*/
+static long singleshot_adj; /* +/- MAX_SINGLESHOT_ADJ (ppm)*/
+static long tick_adj; /* tx->tick adjustment (ppm) */
+static long offset_adj; /* offset adjustment (ppm) */
+
+
+/* lock for the above variables */
+static seqlock_t ntp_lock = SEQLOCK_UNLOCKED;
+
+#define MAX_SINGLESHOT_ADJ 500 /* (ppm) */
+#define SEC_PER_DAY 86400
+
+/* Required to safely shift negative values: right-shifts the magnitude
+ * and restores the sign. Arguments and the whole expansion are
+ * parenthesized so the macro can be used inside larger expressions;
+ * note that x is evaluated more than once.
+ */
+#define shiftR(x,s) (((x) < 0) ? (-((-(x)) >> (s))) : ((x) >> (s)))
+
+/* int ntp_advance(nsec_t interval):
+ *	Periodic hook which increments NTP state machine by interval.
+ *	Returns the signed PPM adjustment to be used for the next interval.
+ *	This is ntp_hardclock in the RFC.
+ *
+ *	For every full second accumulated, maxerror is bumped, the
+ *	next offset_adj slice is taken out of ntp_offset and the next
+ *	slice of any pending singleshot adjustment is selected. The
+ *	singleshot slice is limited to +/- MAX_SINGLESHOT_ADJ ppm per
+ *	second; whatever is left stays pending for the following seconds.
+ */
+int ntp_advance(nsec_t interval)
+{
+	static u64 interval_sum = 0;
+	static long ss_adj = 0;
+	unsigned long flags;
+	long ppm_sum;
+
+	write_seqlock_irqsave(&ntp_lock, flags);
+
+	/* inc interval sum (under the lock, like the rest of the state) */
+	interval_sum += interval;
+
+	/* decrement singleshot offset interval */
+	ss_offset_len -= interval;
+	if (ss_offset_len < 0) /* make sure it doesn't go negative */
+		ss_offset_len = 0;
+
+	/* Do second overflow code */
+	while (interval_sum >= NSEC_PER_SEC) {
+		/* XXX - I'd prefer to smoothly apply this math
+		 * at each call to ntp_advance() rather then each
+		 * second.
+		 */
+		long tmp;
+
+		/* Bump maxerror by ntp_tolerance */
+		ntp_maxerror += shiftR(ntp_tolerance, SHIFT_USEC);
+		if (ntp_maxerror > NTP_PHASE_LIMIT) {
+			ntp_maxerror = NTP_PHASE_LIMIT;
+			ntp_status |= STA_UNSYNC;
+		}
+
+		/* Calculate offset_adj for the next second */
+		tmp = ntp_offset;
+		if (!(ntp_status & STA_FLL))
+			tmp = shiftR(tmp, SHIFT_KG + ntp_constant);
+
+		/* bound the adjustment to MAXPHASE/MINSEC; negate after
+		 * shifting so that no negative value is left-shifted
+		 */
+		tmp = min(tmp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
+		tmp = max(tmp, -((MAXPHASE / MINSEC) << SHIFT_UPDATE));
+
+		offset_adj = shiftR(tmp, SHIFT_UPDATE); /* (usec/sec) = ppm */
+		ntp_offset -= tmp;
+
+		interval_sum -= NSEC_PER_SEC;
+
+		/* calculate singleshot approximation ppm for the next
+		 * second: at most MAX_SINGLESHOT_ADJ, the rest stays pending
+		 */
+		ss_adj = singleshot_adj;
+		if (ss_adj > MAX_SINGLESHOT_ADJ)
+			ss_adj = MAX_SINGLESHOT_ADJ;
+		else if (ss_adj < -MAX_SINGLESHOT_ADJ)
+			ss_adj = -MAX_SINGLESHOT_ADJ;
+		singleshot_adj -= ss_adj;
+	}
+
+	/* calculate total ppm adjustment for the next interval */
+	ppm_sum = tick_adj;
+	ppm_sum += offset_adj;
+	ppm_sum += shiftR(ntp_freq,SHIFT_USEC);
+	ppm_sum += ss_adj;
+
+#if NTP_DEBUG
+{ /*XXX - yank me! just for debug */
+	static int dbg = 0;
+	if(!(dbg++%300000))
+		printk("tick_adj(%ld) + offset_adj(%ld) + ntp_freq(%ld) + ss_adj(%ld) = ppm_sum(%ld)\n", tick_adj, offset_adj, shiftR(ntp_freq,SHIFT_USEC), ss_adj, ppm_sum);
+}
+#endif
+
+	write_sequnlock_irqrestore(&ntp_lock, flags);
+
+	return ppm_sum;
+}
+
+/* static int ntp_hardupdate(long offset, struct timeval tv):
+ *	ntp_adjtimex() helper: loads a new phase offset into the state
+ *	machine and updates the frequency estimate from it.
+ *	Called only by ntp_adjtimex while holding ntp_lock.
+ *
+ *	offset:	phase offset (usec); clamped here to +/- MAXPHASE
+ *	tv:	time of this update; only tv.tv_sec is used, to measure
+ *		the seconds elapsed since the previous update
+ *
+ *	Does nothing and returns 0 unless STA_PLL is set. Otherwise
+ *	returns 0 on success, or -1 if the interval since the last
+ *	update was out of bounds (the caller reports TIME_ERROR).
+ *
+ *	XXX - This function needs more explanation
+ */
+static int ntp_hardupdate(long offset, struct timeval tv)
+{
+	int ret;
+	long tmp, interval, delta;
+
+	ret = 0;
+	if (!(ntp_status & STA_PLL))
+		return ret;
+
+	tmp = offset;
+	/* Make sure offset is bounded by MAXPHASE */
+	tmp = min(tmp, MAXPHASE);
+	tmp = max(tmp, -MAXPHASE);
+	offset = tmp;
+
+	ntp_offset = tmp << SHIFT_UPDATE;
+
+	if ((ntp_status & STA_FREQHOLD) || (ntp_reftime == 0))
+		ntp_reftime = tv.tv_sec;
+
+	/* calculate seconds since last call to hardupdate */
+	interval = tv.tv_sec - ntp_reftime;
+	ntp_reftime = tv.tv_sec;
+
+	if ((ntp_status & STA_FLL) && (interval >= MINSEC)) {
+		/* XXX - should we round here? */
+		tmp = offset / interval; /* ppm (usec/sec)*/
+
+		/* convert to shifted ppm, then apply damping factor.
+		 * The two steps are kept separate: folding them into one
+		 * right shift by (SHIFT_KH - SHIFT_USEC) gives a negative
+		 * shift count, which is undefined. Multiply instead of
+		 * left-shifting because tmp may be negative.
+		 */
+		tmp *= (1L << SHIFT_USEC);
+		delta = shiftR(tmp, SHIFT_KH);
+		ntp_freq += delta;
+
+#if NTP_DEBUG
+		printk("ntp->freq change: %ld\n", delta);
+#endif
+
+	} else if ((ntp_status & STA_PLL) && (interval < MAXSEC)) {
+		long damping;
+		tmp = offset * interval; /* ppm XXX - not quite*/
+
+		/* calculate damping factor - XXX bigger comment!*/
+		damping = (2 * ntp_constant) + SHIFT_KF - SHIFT_USEC;
+
+		/* apply damping factor */
+		delta = shiftR(tmp, damping);
+		ntp_freq += delta;
+
+#if NTP_DEBUG
+		printk("ntp->freq change: %ld\n", delta);
+#endif
+
+	} else { /* interval out of bounds */
+		printk("ntp_hardupdate(): interval out of bounds: %ld\n",
+			interval);
+		ret = -1; /* TIME_ERROR */
+	}
+
+	/* bound ntp_freq */
+	if (ntp_freq > ntp_tolerance)
+		ntp_freq = ntp_tolerance;
+	if (ntp_freq < -ntp_tolerance)
+		ntp_freq = -ntp_tolerance;
+
+	return ret;
+}
+
+/* int ntp_adjtimex(struct timex* tx)
+ *	Interface to change NTP state machine
+ *
+ *	Validates the fields selected by tx->modes, applies them under
+ *	ntp_lock and writes the resulting kernel state back into tx.
+ *	For ADJ_OFFSET_SINGLESHOT, tx->offset returns the singleshot
+ *	adjustment that was still pending before this call.
+ *
+ *	Returns -EINVAL if a selected field is out of range, otherwise
+ *	the clock state (ntp_state, or TIME_ERROR when the clock is
+ *	unsynchronized or the hardupdate failed).
+ *
+ *	NOTE(review): tx->time is handed to ntp_hardupdate() as the
+ *	current time; presumably the caller fills it in before calling
+ *	this function - confirm.
+ */
+int ntp_adjtimex(struct timex* tx)
+{
+	long save_offset;
+	int result;
+	unsigned long flags;
+
+/* Sanity checking
+ */
+	/* frequency adjustment limited to +/- MAXFREQ
+	 * (compared explicitly: abs() would truncate a long to int)
+	 */
+	if ((tx->modes & ADJ_FREQUENCY)
+			&& (tx->freq > MAXFREQ || tx->freq < -MAXFREQ))
+		return -EINVAL;
+
+	/* maxerror adjustment limited to NTP_PHASE_LIMIT */
+	if ((tx->modes & ADJ_MAXERROR)
+			&& (tx->maxerror < 0
+				|| tx->maxerror >= NTP_PHASE_LIMIT))
+		return -EINVAL;
+
+	/* esterror adjustment limited to NTP_PHASE_LIMIT */
+	if ((tx->modes & ADJ_ESTERROR)
+			&& (tx->esterror < 0
+				|| tx->esterror >= NTP_PHASE_LIMIT))
+		return -EINVAL;
+
+	/* constant adjustment must be within 0..MAXTC; it is used as
+	 * a shift count, so it has to be bounded above as well
+	 */
+	if ((tx->modes & ADJ_TIMECONST)
+			&& (tx->constant < 0 || tx->constant > MAXTC))
+		return -EINVAL;
+
+	/* Single shot mode can only be used by itself */
+	if (((tx->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+			&& (tx->modes != ADJ_OFFSET_SINGLESHOT))
+		return -EINVAL;
+
+	/* offset adjustment limited to +/- MAXPHASE */
+	if ((tx->modes != ADJ_OFFSET_SINGLESHOT)
+			&& (tx->modes & ADJ_OFFSET)
+			&& (tx->offset >= MAXPHASE || tx->offset <= -MAXPHASE))
+		return -EINVAL;
+
+	/* tick adjustment limited to 10% */
+	/* XXX - should we round here? */
+	if ((tx->modes & ADJ_TICK)
+			&& ((tx->tick < 900000/USER_HZ)
+				||(tx->tick > 1100000/USER_HZ)))
+		return -EINVAL;
+
+#if NTP_DEBUG
+	/* dbg output XXX - yank me! */
+	if(tx->modes) {
+		printk("adjtimex: tx->offset: %ld tx->freq: %ld\n",
+			tx->offset, tx->freq);
+	}
+#endif
+
+/* Kernel input bits
+ */
+	write_seqlock_irqsave(&ntp_lock, flags);
+
+	result = ntp_state;
+
+	/* For ADJ_OFFSET_SINGLESHOT we must return the old (still
+	 * pending) singleshot adjustment, not the PLL phase offset
+	 */
+	save_offset = singleshot_adj;
+
+	/* Process input parameters */
+	if (tx->modes & ADJ_STATUS) {
+		/* keep the read-only bits, take the rest from the caller */
+		ntp_status &= STA_RONLY;
+		ntp_status |= tx->status & ~STA_RONLY;
+	}
+
+	if (tx->modes & ADJ_FREQUENCY)
+		ntp_freq = tx->freq;
+
+	if (tx->modes & ADJ_MAXERROR)
+		ntp_maxerror = tx->maxerror;
+
+	if (tx->modes & ADJ_ESTERROR)
+		ntp_esterror = tx->esterror;
+
+	if (tx->modes & ADJ_TIMECONST)
+		ntp_constant = tx->constant;
+
+	if (tx->modes & ADJ_OFFSET) {
+		/* check if we're doing a singleshot adjustment */
+		if (tx->modes == ADJ_OFFSET_SINGLESHOT)
+			singleshot_adj = tx->offset;
+		/* otherwise, call hardupdate() */
+		else if (ntp_hardupdate(tx->offset, tx->time))
+			result = TIME_ERROR;
+	}
+
+	if (tx->modes & ADJ_TICK) {
+		/* first calculate usec/user_tick offset */
+		/* XXX - should we round here? */
+		tick_adj = (USEC_PER_SEC/USER_HZ) - tx->tick;
+		/* multiply by user_hz to get usec/sec => ppm */
+		tick_adj *= USER_HZ;
+		/* save tx->tick for future calls to adjtimex */
+		ntp_tick = tx->tick;
+	}
+
+	if ((ntp_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 )
+		result = TIME_ERROR;
+
+/* Kernel output bits
+ */
+	/* write kernel state to user timex values*/
+	if ((tx->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+		tx->offset = save_offset;
+	else
+		tx->offset = shiftR(ntp_offset, SHIFT_UPDATE);
+
+	tx->freq = ntp_freq;
+	tx->maxerror = ntp_maxerror;
+	tx->esterror = ntp_esterror;
+	tx->status = ntp_status;
+	tx->constant = ntp_constant;
+	tx->precision = ntp_precision;
+	tx->tolerance = ntp_tolerance;
+	/* report the tick length saved by the last ADJ_TICK */
+	tx->tick = ntp_tick;
+
+	/* PPS is not implemented, so these are zero */
+	tx->ppsfreq = /*XXX - Not Implemented!*/ 0;
+	tx->jitter = /*XXX - Not Implemented!*/ 0;
+	tx->shift = /*XXX - Not Implemented!*/ 0;
+	tx->stabil = /*XXX - Not Implemented!*/ 0;
+	tx->jitcnt = /*XXX - Not Implemented!*/ 0;
+	tx->calcnt = /*XXX - Not Implemented!*/ 0;
+	tx->errcnt = /*XXX - Not Implemented!*/ 0;
+	tx->stbcnt = /*XXX - Not Implemented!*/ 0;
+
+	write_sequnlock_irqrestore(&ntp_lock, flags);
+
+	return result;
+}
+
+
+/* int ntp_leapsecond(struct timespec now):
+ * NTP leapsecond processing code. Returns the number of
+ * seconds (-1, 0, or 1) that should be added to the current
+ * time to properly adjust for leapseconds: -1 when a leap
+ * second is inserted, 1 when one is deleted, 0 otherwise.
+ *
+ * Only now.tv_sec is used. Steps the ntp_state machine
+ * (TIME_OK -> TIME_INS/TIME_DEL -> TIME_OOP/TIME_WAIT -> TIME_OK)
+ * based on the STA_INS/STA_DEL bits in ntp_status.
+ *
+ * NOTE(review): ntp_state and ntp_status are accessed here
+ * without taking ntp_lock - confirm the caller serializes this.
+ */
+int ntp_leapsecond(struct timespec now)
+{
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second.
+ */
+ static time_t leaptime = 0;
+
+ switch (ntp_state) {
+ case TIME_OK:
+ if (ntp_status & STA_INS) {
+ ntp_state = TIME_INS;
+ /* calculate end of today (23:59:59)*/
+ leaptime = now.tv_sec + SEC_PER_DAY -
+ (now.tv_sec % SEC_PER_DAY) - 1;
+ }
+ else if (ntp_status & STA_DEL) {
+ ntp_state = TIME_DEL;
+ /* calculate end of today (23:59:59)*/
+ leaptime = now.tv_sec + SEC_PER_DAY -
+ (now.tv_sec % SEC_PER_DAY) - 1;
+ }
+ break;
+
+ case TIME_INS:
+ /* Once we are past leaptime (23:59:59), insert the second */
+ if (now.tv_sec > leaptime) {
+ ntp_state = TIME_OOP;
+ printk(KERN_NOTICE
+ "Clock: inserting leap second 23:59:60 UTC\n");
+ return -1;
+ }
+ break;
+
+ case TIME_DEL:
+ /* Once we are at (or past) leaptime, delete the second */
+ if (now.tv_sec >= leaptime) {
+ ntp_state = TIME_WAIT;
+ printk(KERN_NOTICE
+ "Clock: deleting leap second 23:59:59 UTC\n");
+ return 1;
+ }
+ break;
+
+ case TIME_OOP:
+ /* Wait for the end of the leap second*/
+ if (now.tv_sec > (leaptime + 1))
+ ntp_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ /* stay here until both leap request bits have been cleared */
+ if (!(ntp_status & (STA_INS | STA_DEL)))
+ ntp_state = TIME_OK;
+ }
+
+ return 0;
+}
+
+/* void ntp_clear(void):
+ *	Marks the clock unsynchronized, puts the error estimates back
+ *	at their limit and drops the pending ppm adjustments.
+ *	ntp_offset and ntp_freq are left as they are.
+ */
+void ntp_clear(void)
+{
+	unsigned long irq_state;
+
+	write_seqlock_irqsave(&ntp_lock, irq_state);
+
+	/* unsynced, with error bounds at their maximum */
+	ntp_status |= STA_UNSYNC;
+	ntp_esterror = NTP_PHASE_LIMIT;
+	ntp_maxerror = NTP_PHASE_LIMIT;
+
+	/* forget the pending adjustments */
+	offset_adj = 0;
+	tick_adj = 0;
+	singleshot_adj = 0;
+	ss_offset_len = 0;
+
+	write_sequnlock_irqrestore(&ntp_lock, irq_state);
+}
+
+/* int get_ntp_status(void):
+ *	Hands back the current ntp_status bits (read without
+ *	taking ntp_lock).
+ */
+int get_ntp_status(void)
+{
+	int status = ntp_status;
+
+	return status;
+}
+
diff -Nru a/kernel/time.c b/kernel/time.c
--- a/kernel/time.c 2005-04-29 15:12:09 -07:00
+++ b/kernel/time.c 2005-04-29 15:12:09 -07:00
@@ -38,6 +38,7 @@

#include <asm/uaccess.h>
#include <asm/unistd.h>
+#include <linux/timeofday.h>

/*
* The timezone where the local system is located. Used as a default by some
@@ -227,6 +228,7 @@
/* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
+#ifndef CONFIG_NEWTOD
int do_adjtimex(struct timex *txc)
{
long ltemp, mtemp, save_adjust;
@@ -410,6 +412,7 @@
notify_arch_cmos_timer();
return(result);
}
+#endif

asmlinkage long sys_adjtimex(struct timex __user *txc_p)
{
@@ -566,6 +569,7 @@


#else
+#ifndef CONFIG_NEWTOD
/*
* Simulate gettimeofday using do_gettimeofday which only allows a timeval
* and therefore only yields usec accuracy
@@ -578,6 +582,7 @@
tv->tv_sec = x.tv_sec;
tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
}
+#endif /* CONFIG_NEWTOD */
#endif

#if (BITS_PER_LONG < 64)
diff -Nru a/kernel/timeofday.c b/kernel/timeofday.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/kernel/timeofday.c 2005-04-29 15:12:09 -07:00
@@ -0,0 +1,522 @@
+/*********************************************************************
+* linux/kernel/timeofday.c
+*
+* This file contains the functions which access and manage
+* the system's time of day functionality.
+*
+* Copyright (C) 2003, 2004, 2005 IBM, John Stultz ([email protected])
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+* Revision History:
+* 2004-09-02: A0
+* o First pass sent to lkml for review.
+* 2004-12-07: A1
+* o Rework of timesource structure
+* o Sent to lkml for review
+* 2005-01-24: A2
+* o write_seqlock_irq -> writeseqlock_irqsave
+* o arch generic interface for get_cmos_time() equivalents
+* o suspend/resume hooks for sleep/hibernate (lightly tested)
+* o timesource adjust_callback hook
+* o Sent to lkml for review
+* 2005-03-11: A3
+* o periodic_hook (formerly interrupt_hook) now called by softtimer
+* o yanked ntp_scale(), ntp adjustments are done in cyc2ns now
+* o sent to lkml for review
+* ??????????: A4
+* o Improved the cyc2ns remainder handling
+* o Added getnstimeofday
+* o Cleanups from Nish Aravamudan
+* TODO List:
+* o vsyscall/fsyscall infrastructure
+* o clock_was_set hook
+**********************************************************************/
+
+#include <linux/timeofday.h>
+#include <linux/timesource.h>
+#include <linux/ntp.h>
+#include <linux/timex.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/sched.h> /* Needed for capable() */
+
+/* XXX - remove later
+ * TIME_DBG gates the debug printk()s in this file; TIME_DBG_FREQ is
+ * the number of timeofday_periodic_hook() calls between debug lines.
+ */
+#define TIME_DBG 0
+#define TIME_DBG_FREQ 60000
+
+/* only run periodic_hook every 50ms */
+#define PERIODIC_INTERVAL_MS 50
+
+/*[Nanosecond based variables]
+ * system_time:
+ * Monotonically increasing counter of the number of nanoseconds
+ * since boot. Only advanced by the periodic, suspend and resume
+ * hooks; readers add the cycles elapsed since offset_base.
+ * wall_time_offset:
+ * Offset added to system_time to provide accurate time-of-day
+ */
+static nsec_t system_time;
+static nsec_t wall_time_offset;
+
+/*[Cycle based variables]
+ * offset_base:
+ * Value of the timesource at the last timeofday_periodic_hook()
+ * (adjusted only minorly to account for rounded off cycles)
+ */
+static cycle_t offset_base;
+
+/*[Time source data]
+ * timesource:
+ * current timesource pointer
+ */
+static struct timesource_t *timesource;
+
+/*[NTP adjustment]
+ * ntp_adj:
+ * value of the current ntp adjustment,
+ * stored in timesource multiplier units.
+ * NOTE(review): unlike the other variables here this one is not
+ * static - confirm whether external visibility is intended.
+ */
+int ntp_adj;
+
+/*[Locks]
+ * system_time_lock:
+ * generic lock for all locally scoped time values
+ */
+static seqlock_t system_time_lock = SEQLOCK_UNLOCKED;
+
+
+/*[Suspend/Resume info]
+ * time_suspend_state:
+ * variable that keeps track of suspend state
+ * suspend_start:
+ * start of the suspend call (persistent clock reading, in ns)
+ */
+static enum {
+ TIME_RUNNING,
+ TIME_SUSPENDED
+} time_suspend_state = TIME_RUNNING;
+
+static nsec_t suspend_start;
+
+
+/* [XXX - Hacks]
+ * Makes stuff compile: prototypes for the hooks each architecture
+ * has to provide (persistent clock read and write-back).
+ */
+extern nsec_t read_persistent_clock(void);
+extern void sync_persistent_clock(struct timespec ts);
+
+
+/* get_lowres_timestamp():
+ *	Hands back the stored system_time value without reading the
+ *	timesource, i.e. the nanosecond count as of the last
+ *	timeofday_periodic_hook() run (PERIODIC_INTERVAL_MS
+ *	granularity).
+ */
+nsec_t get_lowres_timestamp(void)
+{
+	unsigned long seq;
+	nsec_t snapshot;
+
+	/* re-read if a writer held system_time_lock during the copy */
+	do {
+		seq = read_seqbegin(&system_time_lock);
+		snapshot = system_time;
+	} while (read_seqretry(&system_time_lock, seq));
+
+	return snapshot;
+}
+
+
+/* get_lowres_timeofday():
+ *	Low resolution time of day: the stored system_time plus
+ *	wall_time_offset, as of the last timeofday_periodic_hook()
+ *	run. The timesource itself is not read.
+ */
+nsec_t get_lowres_timeofday(void)
+{
+	unsigned long seq;
+	nsec_t tod;
+
+	/* both values must come from the same writer generation */
+	do {
+		seq = read_seqbegin(&system_time_lock);
+		tod = system_time + wall_time_offset;
+	} while (read_seqretry(&system_time_lock, seq));
+
+	return tod;
+}
+
+
+/* update_legacy_time_values():
+ *	Private helper that mirrors the current timeofday state into
+ *	the legacy globals (xtime, wall_to_monotonic, wall_jiffies)
+ *	under xtime_lock. The caller is expected to already hold
+ *	system_time_lock. Hopefully removable someday.
+ */
+static void update_legacy_time_values(void)
+{
+	unsigned long irq_flags;
+	struct timespec offset;
+
+	write_seqlock_irqsave(&xtime_lock, irq_flags);
+
+	/* xtime mirrors the low-res time of day */
+	xtime = ns2timespec(system_time + wall_time_offset);
+
+	/* wall_to_monotonic is the normalized negation of wall_time_offset */
+	offset = ns2timespec(wall_time_offset);
+	wall_to_monotonic = offset;
+	set_normalized_timespec(&wall_to_monotonic,
+				-offset.tv_sec, -offset.tv_nsec);
+
+	/* jiffies is its own time domain, so only wall_jiffies is resynced */
+	wall_jiffies = jiffies;
+
+	write_sequnlock_irqrestore(&xtime_lock, irq_flags);
+}
+
+
+/* __monotonic_clock():
+ *	Private helper; the caller must hold system_time_lock (read
+ *	or write side). Returns system_time plus the NTP-scaled
+ *	nanoseconds elapsed on the timesource since offset_base.
+ */
+static inline nsec_t __monotonic_clock(void)
+{
+	cycle_t now = read_timesource(timesource);
+	/* cycles since the last rebase, wrapped to the counter's width */
+	cycle_t elapsed = (now - offset_base) & timesource->mask;
+	/* scale the cycle count to nanoseconds using the current ntp_adj */
+	nsec_t elapsed_ns = cyc2ns(timesource, ntp_adj, elapsed);
+
+	return system_time + elapsed_ns;
+}
+
+
+/* do_monotonic_clock():
+ *	Lock-protected wrapper around __monotonic_clock(): returns
+ *	the monotonically increasing nanosecond count since boot.
+ */
+nsec_t do_monotonic_clock(void)
+{
+	unsigned long seq;
+	nsec_t mono;
+
+	for (;;) {
+		seq = read_seqbegin(&system_time_lock);
+		mono = __monotonic_clock();
+		/* done once no writer interfered with the read */
+		if (!read_seqretry(&system_time_lock, seq))
+			break;
+	}
+
+	return mono;
+}
+
+
+/* __gettimeofday():
+ *	Private helper. Returns the time of day in nanoseconds:
+ *	wall_time_offset plus __monotonic_clock(), both sampled in
+ *	the same system_time_lock read section.
+ */
+static inline nsec_t __gettimeofday(void)
+{
+	unsigned long seq;
+	nsec_t offset, mono;
+
+	for (;;) {
+		seq = read_seqbegin(&system_time_lock);
+		offset = wall_time_offset;
+		mono = __monotonic_clock();
+		/* keep the pair only if no writer ran in between */
+		if (!read_seqretry(&system_time_lock, seq))
+			break;
+	}
+
+	return offset + mono;
+}
+
+
+/* getnstimeofday():
+ *	Stores the current time of day into *ts as a timespec.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+	nsec_t now = __gettimeofday();
+
+	*ts = ns2timespec(now);
+}
+EXPORT_SYMBOL(getnstimeofday);
+
+
+/* do_gettimeofday():
+ *	Stores the current time of day into *tv as a timeval.
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+	nsec_t now = __gettimeofday();
+
+	*tv = ns2timeval(now);
+}
+EXPORT_SYMBOL(do_gettimeofday);
+
+
+/* do_settimeofday():
+ *	Sets the time of day to *tv.
+ *
+ *	Returns 0 on success, or -EINVAL if tv->tv_nsec is not a
+ *	valid nanosecond count (negative or >= NSEC_PER_SEC).
+ *
+ *	The monotonic clock is left alone; only wall_time_offset is
+ *	recomputed so that wall_time_offset + monotonic == new time.
+ *	The NTP state is cleared and the legacy time values resynced
+ *	in the same write section.
+ */
+int do_settimeofday(struct timespec *tv)
+{
+	unsigned long flags;
+	nsec_t newtime;
+
+	/* reject a denormalized timespec instead of silently folding it in;
+	 * the unsigned cast also catches negative tv_nsec values
+	 */
+	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	newtime = timespec2ns(tv);
+
+	/* atomically adjust wall_time_offset & clear ntp state machine */
+	write_seqlock_irqsave(&system_time_lock, flags);
+
+	wall_time_offset = newtime - __monotonic_clock();
+	ntp_clear();
+
+	update_legacy_time_values();
+
+	write_sequnlock_irqrestore(&system_time_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(do_settimeofday);
+
+
+/* do_adjtimex():
+ *	Userspace NTP daemon's interface to the kernel NTP variables.
+ *	Returns -EPERM when a modification is requested without
+ *	CAP_SYS_TIME, otherwise whatever ntp_adjtimex() returns.
+ */
+int do_adjtimex(struct timex *tx)
+{
+	/* a non-zero modes field means the caller wants to change something */
+	if (tx->modes) {
+		if (!capable(CAP_SYS_TIME))
+			return -EPERM;
+	}
+
+	/* tx->time has to be filled in first: ntp_adjtimex() uses it */
+	do_gettimeofday(&tx->time);
+
+	return ntp_adjtimex(tx);
+}
+
+
+/* timeofday_suspend_hook():
+ * This function allows the timeofday subsystem to
+ * be shutdown for a period of time. Useful when
+ * going into suspend/hibernate mode. The code is
+ * very similar to the first half of
+ * timeofday_periodic_hook().
+ *
+ * Records the persistent clock reading in suspend_start,
+ * folds the elapsed timesource cycles into system_time and
+ * moves time_suspend_state to TIME_SUSPENDED, all under the
+ * system_time_lock write lock.
+ *
+ * NOTE(review): offset_base is not advanced here; it is only
+ * rebased in timeofday_resume_hook().
+ * NOTE(review): the i386 hooks call this from both the apm.c
+ * suspend path and timer_suspend() - confirm the two can never
+ * run in the same suspend cycle, since a second call would trip
+ * the BUG_ON below.
+ */
+void timeofday_suspend_hook(void)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&system_time_lock, flags);
+
+ /* Make sure time_suspend_state is sane */
+ BUG_ON(time_suspend_state != TIME_RUNNING);
+
+ /* First off, save suspend start time
+ * then quickly call __monotonic_clock.
+ * These two calls hopefully occur quickly
+ * because the difference between reads will
+ * accumulate as time drift on resume.
+ */
+ suspend_start = read_persistent_clock();
+ system_time = __monotonic_clock();
+
+ /* switch states */
+ time_suspend_state = TIME_SUSPENDED;
+
+ write_sequnlock_irqrestore(&system_time_lock, flags);
+}
+
+
+/* timeofday_resume_hook():
+ * This function resumes the timeofday subsystem
+ * from a previous call to timeofday_suspend_hook.
+ *
+ * Adds the persistent-clock time that passed since
+ * suspend_start to system_time, rebases offset_base on the
+ * current timesource value, clears the NTP state and resyncs
+ * the legacy time values, all under the system_time_lock
+ * write lock.
+ */
+void timeofday_resume_hook(void)
+{
+ nsec_t now, suspend_time;
+ unsigned long flags;
+
+ write_seqlock_irqsave(&system_time_lock, flags);
+
+ /* Make sure time_suspend_state is sane */
+ BUG_ON(time_suspend_state != TIME_SUSPENDED);
+
+ /* Read persistent clock to mark the end of
+ * the suspend interval then rebase the
+ * offset_base to current timesource value.
+ * Again, time between these two calls will
+ * not be accounted for and will show up as
+ * time drift.
+ */
+ now = read_persistent_clock();
+ offset_base = read_timesource(timesource);
+
+ /* calculate how long we were out for */
+ /* NOTE(review): no guard for a persistent clock that reads
+ * earlier than suspend_start - confirm how nsec_t behaves then.
+ */
+ suspend_time = now - suspend_start;
+
+ /* update system_time */
+ system_time += suspend_time;
+
+ ntp_clear();
+
+ /* Set us back to running */
+ time_suspend_state = TIME_RUNNING;
+
+ /* finally, update legacy time values */
+ update_legacy_time_values();
+
+ write_sequnlock_irqrestore(&system_time_lock, flags);
+}
+
+/* timer that drives timeofday_periodic_hook(); armed in timeofday_init() */
+struct timer_list timeofday_timer;
+
+/* timeofday_periodic_hook:
+ * Calculates the delta since the last call,
+ * updates system time and clears the offset.
+ * Called via timeofday_timer.
+ *
+ * In order, under the system_time_lock write lock, it:
+ * - accumulates the elapsed cycles into system_time
+ * - advances the NTP state machine and applies leap seconds
+ * - writes back to the persistent clock when NTP is synced
+ * - switches to a newly selected timesource, if any
+ * - converts the NTP ppm value into multiplier units
+ * - resyncs the legacy time values
+ * and then re-arms itself PERIODIC_INTERVAL_MS from now.
+ * The 'unused' argument is the timer data word and is ignored.
+ */
+static void timeofday_periodic_hook(unsigned long unused)
+{
+ cycle_t now, cycle_delta;
+ /* sub-ns remainder carried over from one call to the next */
+ static u64 remainder;
+ nsec_t ns, ns_ntp;
+ long leapsecond;
+ struct timesource_t* next;
+ unsigned long flags;
+ u64 tmp;
+
+ write_seqlock_irqsave(&system_time_lock, flags);
+
+ /* read time source & calc time since last call*/
+ now = read_timesource(timesource);
+ cycle_delta = (now - offset_base) & timesource->mask;
+
+ /* convert cycles to ntp adjusted ns and save remainder */
+ ns_ntp = cyc2ns_rem(timesource, ntp_adj, cycle_delta, &remainder);
+
+ /* convert cycles to raw ns for ntp advance */
+ ns = cyc2ns(timesource, 0, cycle_delta);
+
+#if TIME_DBG
+{ /* XXX - remove later*/
+ static int dbg=0;
+ if(!(dbg++%TIME_DBG_FREQ)){
+ printk(KERN_INFO "now: %lluc - then: %lluc = delta: %lluc -> %llu ns + %llu shift_ns (ntp_adj: %i)\n",
+ (unsigned long long)now, (unsigned long long)offset_base,
+ (unsigned long long)cycle_delta, (unsigned long long)ns,
+ (unsigned long long)remainder, ntp_adj);
+ }
+}
+#endif
+
+ /* update system_time */
+ system_time += ns_ntp;
+
+ /* reset the offset_base */
+ offset_base = now;
+
+ /* advance the ntp state machine by ns interval*/
+ /* the value returned here is still in ppm; it is converted to
+ * multiplier units further down
+ */
+ ntp_adj = ntp_advance(ns);
+
+ /* do ntp leap second processing*/
+ leapsecond = ntp_leapsecond(ns2timespec(system_time+wall_time_offset));
+ wall_time_offset += leapsecond * NSEC_PER_SEC;
+
+ /* sync the persistent clock */
+ /* NOTE(review): this runs with system_time_lock held and
+ * interrupts disabled - confirm the arch hooks are cheap enough.
+ */
+ if (!(get_ntp_status() & STA_UNSYNC))
+ sync_persistent_clock(ns2timespec(system_time + wall_time_offset));
+
+ /* if necessary, switch timesources */
+ next = get_next_timesource();
+ if (next != timesource) {
+ /* immediately set new offset_base */
+ offset_base = read_timesource(next);
+ /* swap timesources */
+ timesource = next;
+ printk(KERN_INFO "Time: %s timesource has been installed.\n",
+ timesource->name);
+ /* NTP state and the carried remainder belonged to the old source */
+ ntp_clear();
+ ntp_adj = 0;
+ remainder = 0;
+ }
+
+ /* now is a safe time, so allow timesource to adjust
+ * itself (for example: to make cpufreq changes).
+ */
+ if(timesource->update_callback)
+ timesource->update_callback();
+
+
+ /* convert the signed ppm to timesource multiplier adjustment */
+ /* i.e. |ntp_adj| * mult / 1000000, with the sign restored after */
+ tmp = abs(ntp_adj);
+ tmp = tmp * timesource->mult;
+ /* XXX - should we round here? */
+ do_div(tmp, 1000000);
+ if (ntp_adj < 0)
+ ntp_adj = -(int)tmp;
+ else
+ ntp_adj = (int)tmp;
+
+ /* sync legacy values */
+ update_legacy_time_values();
+
+ write_sequnlock_irqrestore(&system_time_lock, flags);
+
+ /* Set us up to go off on the next interval */
+ mod_timer(&timeofday_timer, jiffies + (PERIODIC_INTERVAL_MS * HZ / 1000));
+}
+
+
+/* timeofday_init():
+ *	Boot-time setup: selects the initial timesource, seeds the
+ *	cycle base and the wall time offset, resets the NTP state,
+ *	syncs the legacy time values and arms timeofday_timer so
+ *	that timeofday_periodic_hook() runs on the next jiffy.
+ */
+void __init timeofday_init(void)
+{
+	unsigned long irq_flags;
+
+#if TIME_DBG
+	printk(KERN_INFO "timeofday_init: Starting up!\n");
+#endif
+	write_seqlock_irqsave(&system_time_lock, irq_flags);
+
+	/* pick the timesource and take its current count as the cycle base */
+	timesource = get_next_timesource();
+	offset_base = read_timesource(timesource);
+
+	/* wall time starts from whatever the persistent clock reports */
+	wall_time_offset = read_persistent_clock();
+
+	/* no NTP scaling yet, and a clean NTP state */
+	ntp_adj = 0;
+	ntp_clear();
+
+	/* bring the legacy time values in line */
+	update_legacy_time_values();
+
+	write_sequnlock_irqrestore(&system_time_lock, irq_flags);
+
+	/* first periodic hook fires one jiffy from now; it re-arms itself */
+	init_timer(&timeofday_timer);
+	timeofday_timer.expires = jiffies + 1;
+	timeofday_timer.function = timeofday_periodic_hook;
+	add_timer(&timeofday_timer);
+
+#if TIME_DBG
+	printk(KERN_INFO "timeofday_init: finished!\n");
+#endif
+}
diff -Nru a/kernel/timer.c b/kernel/timer.c
--- a/kernel/timer.c 2005-04-29 15:12:09 -07:00
+++ b/kernel/timer.c 2005-04-29 15:12:09 -07:00
@@ -577,6 +577,7 @@
int tickadj = 500/HZ ? : 1; /* microsecs */


+#ifndef CONFIG_NEWTOD
/*
* phase-lock loop variables
*/
@@ -807,6 +808,9 @@
}
} while (ticks);
}
+#else /* CONFIG_NEWTOD */
+#define update_wall_time(x)
+#endif /* CONFIG_NEWTOD */

/*
* Called from the timer interrupt handler to charge one tick to the current
diff -Nru a/kernel/timesource.c b/kernel/timesource.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/kernel/timesource.c 2005-04-29 15:12:09 -07:00
@@ -0,0 +1,210 @@
+/*********************************************************************
+* linux/kernel/timesource.c
+*
+* This file contains the functions which manage timesource drivers.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz ([email protected])
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+* Revision History:
+* 2004-12-07: A1
+* o Rework of timesource structure
+* o Sent to lkml for review
+* ??????????: A4
+* o Keep track of all registered timesources
+* o Add sysfs interface for overriding default selection
+*
+* TODO List:
+* o Allow timesource drivers to be unregistered
+* o Use "clock=xyz" boot option for selection overrides.
+* o get rid of timesource_jiffies extern
+**********************************************************************/
+
+#include <linux/timesource.h>
+#include <linux/sysdev.h>
+#include <linux/init.h>
+
+/* capacity of timesource_list; register_timesource() refuses more */
+#define MAX_TIMESOURCES 10
+
+
+/* XXX - Need to have a better way for initializing curr_timesource */
+extern struct timesource_t timesource_jiffies;
+
+/*[Timesource internal variables]---------
+ * curr_timesource:
+ * currently selected timesource. Initialized to timesource_jiffies.
+ * next_timesource:
+ * pending next selected timesource. NULL when nothing is pending;
+ * consumed by get_next_timesource().
+ * timesource_list:
+ * array of pointers pointing to registered timesources
+ * timesource_list_counter:
+ * value which counts the number of registered timesources
+ * timesource_lock:
+ * protects manipulations to curr_timesource and next_timesource
+ * and the timesource_list
+ */
+static struct timesource_t *curr_timesource = &timesource_jiffies;
+static struct timesource_t *next_timesource;
+static struct timesource_t *timesource_list[MAX_TIMESOURCES];
+static int timesource_list_counter;
+static seqlock_t timesource_lock = SEQLOCK_UNLOCKED;
+
+/* timesource name requested through the sysfs attribute; an empty
+ * string means no override is in effect
+ */
+static char override_name[32];
+
+/* get_next_timesource():
+ *	Promotes a pending next_timesource (if any) to
+ *	curr_timesource and returns the currently selected
+ *	timesource.
+ *
+ *	The return value is captured while timesource_lock is still
+ *	held, so it always matches the state this call established.
+ */
+struct timesource_t* get_next_timesource(void)
+{
+	struct timesource_t *selected;
+
+	write_seqlock(&timesource_lock);
+	if (next_timesource) {
+		curr_timesource = next_timesource;
+		next_timesource = NULL;
+	}
+	/* snapshot under the lock rather than re-reading the global after */
+	selected = curr_timesource;
+	write_sequnlock(&timesource_lock);
+
+	return selected;
+}
+
+/* select_timesource():
+ *	Private function. Finds the best registered timesource.
+ *	Must have a writelock on timesource_lock when called.
+ *
+ *	A registered timesource whose name equals override_name wins
+ *	outright; otherwise the entry with the highest priority is
+ *	returned. Returns timesource_list[0] (NULL if nothing has
+ *	been stored there) when the list is empty.
+ */
+static struct timesource_t* select_timesource(void)
+{
+	struct timesource_t *best = timesource_list[0];
+	int i;
+
+	for (i = 0; i < timesource_list_counter; i++) {
+		/* Check for override: the whole name has to match, a
+		 * prefix such as "jif" must not select "jiffies"
+		 */
+		if (override_name[0] != 0 &&
+		    !strcmp(timesource_list[i]->name, override_name)) {
+			best = timesource_list[i];
+			break;
+		}
+		/* Pick the highest priority */
+		if (timesource_list[i]->priority > best->priority)
+			best = timesource_list[i];
+	}
+	return best;
+}
+
+/* register_timesource():
+ *	Used to install new timesources. Adds t to timesource_list
+ *	unless a timesource with the same name is already registered
+ *	or the list is full (a message is printed in either case),
+ *	then re-runs the selection so the timeofday core picks up
+ *	the best source on its next periodic hook.
+ *
+ *	timesource_lock is also taken from the periodic timer hook
+ *	(via get_next_timesource()), so it is held here with
+ *	interrupts disabled to keep that hook from spinning on a lock
+ *	owned by the context it interrupted.
+ */
+void register_timesource(struct timesource_t* t)
+{
+	char *error_msg = NULL;
+	unsigned long flags;
+	int i;
+
+	write_seqlock_irqsave(&timesource_lock, flags);
+
+	/* check if timesource is already registered; compare whole names
+	 * so that e.g. "tsc" is not rejected because "tsc2" exists
+	 */
+	for (i = 0; i < timesource_list_counter; i++) {
+		if (!strcmp(timesource_list[i]->name, t->name)) {
+			error_msg = "Already registered!";
+			break;
+		}
+	}
+
+	/* check that the list isn't full */
+	if (timesource_list_counter >= MAX_TIMESOURCES)
+		error_msg = "Too many timesources!";
+
+	if (!error_msg)
+		timesource_list[timesource_list_counter++] = t;
+	else
+		printk("register_timesource: Cannot register %s. %s\n",
+			t->name, error_msg);
+
+	/* select next timesource */
+	next_timesource = select_timesource();
+
+	write_sequnlock_irqrestore(&timesource_lock, flags);
+}
+
+/* sysfs_show_timesources():
+ *	Provides sysfs interface for listing registered timesources.
+ *	Writes the space separated names into buf, with a '*' in
+ *	front of the current one, followed by a newline. Returns the
+ *	number of bytes written.
+ *
+ *	timesource_lock is also taken from the periodic timer hook
+ *	(via get_next_timesource()), so it is held here with
+ *	interrupts disabled to avoid deadlocking against that hook.
+ *
+ *	NOTE(review): the output is not bounded against the size of
+ *	buf; it relies on MAX_TIMESOURCES short names fitting.
+ */
+static ssize_t sysfs_show_timesources(struct sys_device *dev, char *buf)
+{
+	unsigned long flags;
+	char *curr = buf;
+	int i;
+
+	write_seqlock_irqsave(&timesource_lock, flags);
+	for (i = 0; i < timesource_list_counter; i++) {
+		/* Mark current timesource w/ a star */
+		if (timesource_list[i] == curr_timesource)
+			curr += sprintf(curr, "*");
+		curr += sprintf(curr, "%s ", timesource_list[i]->name);
+	}
+	write_sequnlock_irqrestore(&timesource_lock, flags);
+
+	curr += sprintf(curr, "\n");
+	return curr - buf;
+}
+
+/* sysfs_override_timesource():
+ *	Takes input from sysfs interface for manually overriding
+ *	the default timesource selection.
+ *
+ *	The written string, minus one trailing newline if present,
+ *	becomes override_name and the selection is re-run. Writing
+ *	just a newline clears the override; an empty write is
+ *	ignored. Input longer than override_name can hold is
+ *	truncated instead of overrunning the buffer. Always returns
+ *	count.
+ *
+ *	timesource_lock is also taken from the periodic timer hook
+ *	(via get_next_timesource()), so it is held here with
+ *	interrupts disabled to avoid deadlocking against that hook.
+ */
+static ssize_t sysfs_override_timesource(struct sys_device *dev,
+					const char *buf, size_t count)
+{
+	unsigned long flags;
+	size_t len = strlen(buf);
+
+	/* nothing written, nothing to change */
+	if (len == 0)
+		return count;
+
+	/* drop the newline "echo" appends, but only if it is really there */
+	if (buf[len - 1] == '\n')
+		len--;
+
+	/* leave room for the terminator in override_name */
+	if (len >= sizeof(override_name))
+		len = sizeof(override_name) - 1;
+
+	write_seqlock_irqsave(&timesource_lock, flags);
+
+	/* copy the name given */
+	memcpy(override_name, buf, len);
+	override_name[len] = 0;
+
+	/* see if we can find it */
+	next_timesource = select_timesource();
+
+	write_sequnlock_irqrestore(&timesource_lock, flags);
+	return count;
+}
+
+/* Sysfs setup bits:
+ * XXX - Is there a simpler way?
+ */
+
+/* "timesource" attribute, mode 0600: reading lists the registered
+ * timesources, writing sets the override name
+ */
+static SYSDEV_ATTR(timesource, 0600, sysfs_show_timesources, sysfs_override_timesource);
+
+/* sysdev class the timesource device is registered under */
+static struct sysdev_class timesource_sysclass = {
+ set_kset_name("timesource"),
+};
+
+/* the single device instance (id 0) that carries the attribute */
+static struct sys_device device_timesource = {
+ .id = 0,
+ .cls = &timesource_sysclass,
+};
+
+/* init_timesource_sysfs():
+ *	Registers the timesource sysdev class and device and creates
+ *	the "timesource" attribute file. Stops at the first step
+ *	that fails and returns its error code; returns 0 when all
+ *	three steps succeed. Run as a device_initcall.
+ */
+static int init_timesource_sysfs(void)
+{
+	int error;
+
+	error = sysdev_class_register(&timesource_sysclass);
+	if (error)
+		return error;
+
+	/* no point creating a file on a device that failed to register */
+	error = sysdev_register(&device_timesource);
+	if (error)
+		return error;
+
+	return sysdev_create_file(&device_timesource, &attr_timesource);
+}
+device_initcall(init_timesource_sysfs);
+
+
+/* XXX - Do we need a boot time override interface? */



2005-04-29 22:52:52

by john stultz

[permalink] [raw]
Subject: [RFC][PATCH (2/4)] new timeofday arch specific hooks (v A4)

All,
This patch implements the minimal architecture specific hooks to
enable the new time of day subsystem code for i386, x86-64, ia64, ppc32,
ppc64 and s390. It applies on top of my linux-2.6.12-rc2_timeofday-
core_A4 patch and with this patch applied, you can test the new time of
day subsystem.

Basically it configs in the NEWTOD code and cuts a lot of code out of the
build via #ifdefs. I know, I know, #ifdefs are ugly and bad, and the
final patch will just remove the old code. For now this allows us to be
flexible and easily switch between the two implementations with a single
define.

While there is code for all of the arches above, I am unable to fully
test all of them with the limited time I have to spend on this work.
i386 and x86-64 are reasonably well tested, ppc32 less so and the others
hopefully still compile :)

New in this version:
o s390 arch support! (Big thanks to Martin Schwidefsky!)

Items still on the TODO list:
o Break up each arch into its own patch
o Get rid of #ifdefs

I look forward to your comments and feedback.

thanks
-john

linux-2.6.12-rc2_timeofday-arch_A4.patch
========================================
diff -Nru a/arch/i386/Kconfig b/arch/i386/Kconfig
--- a/arch/i386/Kconfig 2005-04-29 15:16:59 -07:00
+++ b/arch/i386/Kconfig 2005-04-29 15:16:59 -07:00
@@ -14,6 +14,10 @@
486, 586, Pentiums, and various instruction-set-compatible chips by
AMD, Cyrix, and others.

+config NEWTOD
+ bool
+ default y
+
config MMU
bool
default y
diff -Nru a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
--- a/arch/i386/kernel/apm.c 2005-04-29 15:16:59 -07:00
+++ b/arch/i386/kernel/apm.c 2005-04-29 15:16:59 -07:00
@@ -224,6 +224,7 @@
#include <linux/smp_lock.h>
#include <linux/dmi.h>
#include <linux/suspend.h>
+#include <linux/timeofday.h>

#include <asm/system.h>
#include <asm/uaccess.h>
@@ -1205,6 +1206,7 @@
local_irq_disable();
device_power_down(PMSG_SUSPEND);

+ timeofday_suspend_hook();
/* serialize with the timer interrupt */
write_seqlock(&xtime_lock);

@@ -1234,6 +1236,7 @@
spin_unlock(&i8253_lock);
write_sequnlock(&xtime_lock);

+ timeofday_resume_hook();
if (err == APM_NO_ERROR)
err = APM_SUCCESS;
if (err != APM_SUCCESS)
diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
--- a/arch/i386/kernel/time.c 2005-04-29 15:16:59 -07:00
+++ b/arch/i386/kernel/time.c 2005-04-29 15:16:59 -07:00
@@ -68,6 +68,8 @@

#include "io_ports.h"

+#include <linux/timeofday.h>
+
extern spinlock_t i8259A_lock;
int pit_latch_buggy; /* extern */

@@ -117,6 +119,7 @@
}
EXPORT_SYMBOL(rtc_cmos_write);

+#ifndef CONFIG_NEWTOD
/*
* This version of gettimeofday has microsecond resolution
* and better than microsecond precision on fast x86 machines with TSC.
@@ -199,20 +202,21 @@
}

EXPORT_SYMBOL(do_settimeofday);
+#endif

static int set_rtc_mmss(unsigned long nowtime)
{
int retval;
-
- WARN_ON(irqs_disabled());
+ unsigned long flags;

/* gets recalled with irq locally disabled */
- spin_lock_irq(&rtc_lock);
+ /* XXX - does irqsave resolve this? -johnstul */
+ spin_lock_irqsave(&rtc_lock, flags);
if (efi_enabled)
retval = efi_set_rtc_mmss(nowtime);
else
retval = mach_set_rtc_mmss(nowtime);
- spin_unlock_irq(&rtc_lock);
+ spin_unlock_irqrestore(&rtc_lock, flags);

return retval;
}
@@ -220,6 +224,7 @@

int timer_ack;

+#ifndef CONFIG_NEWTOD
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
* Note: This function is required to return accurate
* time even in the absence of multiple timer ticks.
@@ -229,6 +234,7 @@
return cur_timer->monotonic_clock();
}
EXPORT_SYMBOL(monotonic_clock);
+#endif

#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
unsigned long profile_pc(struct pt_regs *regs)
@@ -300,7 +306,9 @@
*/
write_seqlock(&xtime_lock);

+#ifndef CONFIG_NEWTOD
cur_timer->mark_offset();
+#endif

do_timer_interrupt(irq, NULL, regs);

@@ -324,6 +332,8 @@

return retval;
}
+
+#ifndef CONFIG_NEWTOD
static void sync_cmos_clock(unsigned long dummy);

static struct timer_list sync_cmos_timer =
@@ -373,7 +383,38 @@
{
mod_timer(&sync_cmos_timer, jiffies + 1);
}
+#endif
+
+/* arch specific timeofday hooks */
+
+/* read_persistent_clock():
+ *	Returns the get_cmos_time() seconds value scaled to
+ *	nanoseconds.
+ */
+nsec_t read_persistent_clock(void)
+{
+	nsec_t secs = get_cmos_time();
+
+	return secs * NSEC_PER_SEC;
+}

+/* sync_persistent_clock():
+ *	Writes ts back to the CMOS clock through set_rtc_mmss(), at
+ *	most once per ~11 minutes and only when ts falls inside the
+ *	window around the half second that set_rtc_mmss() needs.
+ *	The caller decides whether the clock is externally synced.
+ */
+void sync_persistent_clock(struct timespec ts)
+{
+	static unsigned long last_rtc_update;
+	long usec = ts.tv_nsec / 1000;
+
+	/* still inside the ~11 minute hold-off since the last update */
+	if (ts.tv_sec <= last_rtc_update + 660)
+		return;
+
+	/* Set_rtc_mmss() has to be called as close as possible to
+	 * 500 ms before the new second starts; skip this round when
+	 * we are outside that window.
+	 */
+	if (usec < USEC_AFTER - ((unsigned) TICK_SIZE) / 2 ||
+	    usec > USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
+		return;
+
+	/* horrible...FIXME: on failure back-date the stamp so that the
+	 * update is tried again in 60 s
+	 */
+	if (set_rtc_mmss(ts.tv_sec) != 0)
+		last_rtc_update = ts.tv_sec - 600;
+	else
+		last_rtc_update = ts.tv_sec;
+}
+
+
+
+#ifndef CONFIG_NEWTOD
static long clock_cmos_diff, sleep_start;

static int timer_suspend(struct sys_device *dev, u32 state)
@@ -407,6 +448,23 @@
wall_jiffies += sleep_length;
return 0;
}
+#else /* CONFIG_NEWTOD */
+/* timer_suspend():
+ *	CONFIG_NEWTOD variant: hands suspend handling over to the
+ *	timeofday core. Neither argument is used; always returns 0.
+ */
+static int timer_suspend(struct sys_device *dev, u32 state)
+{
+	int ret = 0;
+
+	timeofday_suspend_hook();
+
+	return ret;
+}
+
+/* timer_resume():
+ *	CONFIG_NEWTOD variant: re-enables the HPET when one is in use
+ *	(CONFIG_HPET_TIMER builds only), then lets the timeofday core
+ *	account for the time spent suspended. Always returns 0.
+ */
+static int timer_resume(struct sys_device *dev)
+{
+	int ret = 0;
+
+#ifdef CONFIG_HPET_TIMER
+	if (is_hpet_enabled())
+		hpet_reenable();
+#endif
+	timeofday_resume_hook();
+
+	return ret;
+}
+#endif

static struct sysdev_class timer_sysclass = {
.resume = timer_resume,
@@ -436,17 +494,21 @@
/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
+#ifndef CONFIG_NEWTOD
xtime.tv_sec = get_cmos_time();
xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
+#endif

if (hpet_enable() >= 0) {
printk("Using HPET for base-timer\n");
}

+#ifndef CONFIG_NEWTOD
cur_timer = select_timer();
printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+#endif

time_init_hook();
}
@@ -464,6 +526,7 @@
return;
}
#endif
+#ifndef CONFIG_NEWTOD
xtime.tv_sec = get_cmos_time();
xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
set_normalized_timespec(&wall_to_monotonic,
@@ -471,6 +534,7 @@

cur_timer = select_timer();
printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+#endif

time_init_hook();
}
diff -Nru a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c
--- a/arch/i386/lib/delay.c 2005-04-29 15:16:59 -07:00
+++ b/arch/i386/lib/delay.c 2005-04-29 15:16:59 -07:00
@@ -23,10 +23,29 @@

extern struct timer_opts* timer;

+#ifndef CONFIG_NEWTOD
void __delay(unsigned long loops)
{
cur_timer->delay(loops);
}
+#else
+#include <linux/timeofday.h>
+/* XXX - For now just use a simple loop delay
+ * This has cpufreq issues, but so did the old method.
+ *
+ * Busy-waits by counting 'loops' down in a register: the code at
+ * label 2 decrements operand 0 and jumps back to itself for as
+ * long as the result has not gone negative.
+ */
+void __delay(unsigned long loops)
+{
+ /* output operand; shares its register with the 'loops' input */
+ int d0;
+ __asm__ __volatile__(
+ "\tjmp 1f\n"
+ ".align 16\n"
+ "1:\tjmp 2f\n"
+ ".align 16\n"
+ "2:\tdecl %0\n\tjns 2b"
+ :"=&a" (d0)
+ :"0" (loops));
+}
+#endif

inline void __const_udelay(unsigned long xloops)
{
diff -Nru a/arch/ia64/Kconfig b/arch/ia64/Kconfig
--- a/arch/ia64/Kconfig 2005-04-29 15:16:59 -07:00
+++ b/arch/ia64/Kconfig 2005-04-29 15:16:59 -07:00
@@ -18,6 +18,10 @@
page at <http://www.linuxia64.org/> and a mailing list at
<[email protected]>.

+config NEWTOD
+ bool
+ default y
+
config 64BIT
bool
default y
@@ -36,7 +40,7 @@

config TIME_INTERPOLATION
bool
- default y
+ default n

config EFI
bool
diff -Nru a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
--- a/arch/ia64/kernel/asm-offsets.c 2005-04-29 15:16:59 -07:00
+++ b/arch/ia64/kernel/asm-offsets.c 2005-04-29 15:16:59 -07:00
@@ -222,6 +222,7 @@
DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET,
offsetof (struct ia64_mca_cpu, init_stack));
BLANK();
+#ifndef CONFIG_NEWTOD
/* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */
DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr));
DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source));
@@ -235,5 +236,6 @@
DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU);
DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64);
DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32);
+#endif /* CONFIG_NEWTOD */
DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec));
}
diff -Nru a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
--- a/arch/ia64/kernel/fsys.S 2005-04-29 15:16:59 -07:00
+++ b/arch/ia64/kernel/fsys.S 2005-04-29 15:16:59 -07:00
@@ -145,6 +145,7 @@
FSYS_RETURN
END(fsys_set_tid_address)

+#ifndef CONFIG_NEWTOD
/*
* Ensure that the time interpolator structure is compatible with the asm code
*/
@@ -326,6 +327,7 @@
EX(.fail_efault, st8 [r31] = r9)
EX(.fail_efault, st8 [r23] = r21)
FSYS_RETURN
+#endif /* !CONFIG_NEWTOD */
.fail_einval:
mov r8 = EINVAL
mov r10 = -1
@@ -334,6 +336,7 @@
mov r8 = EFAULT
mov r10 = -1
FSYS_RETURN
+#ifndef CONFIG_NEWTOD
END(fsys_gettimeofday)

ENTRY(fsys_clock_gettime)
@@ -347,6 +350,7 @@
shl r30 = r32,15
br.many .gettime
END(fsys_clock_gettime)
+#endif /* !CONFIG_NEWTOD */

/*
* long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize).
@@ -687,7 +691,11 @@
data8 0 // setrlimit
data8 0 // getrlimit // 1085
data8 0 // getrusage
+#ifdef CONFIG_NEWTOD
+ data8 0 // gettimeofday
+#else
data8 fsys_gettimeofday // gettimeofday
+#endif
data8 0 // settimeofday
data8 0 // select
data8 0 // poll // 1090
@@ -854,7 +862,11 @@
data8 0 // timer_getoverrun
data8 0 // timer_delete
data8 0 // clock_settime
+#ifdef CONFIG_NEWTOD
+ data8 0 // clock_gettime
+#else
data8 fsys_clock_gettime // clock_gettime
+#endif
data8 0 // clock_getres // 1255
data8 0 // clock_nanosleep
data8 0 // fstatfs64
diff -Nru a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
--- a/arch/ia64/kernel/time.c 2005-04-29 15:16:59 -07:00
+++ b/arch/ia64/kernel/time.c 2005-04-29 15:16:59 -07:00
@@ -21,6 +21,7 @@
#include <linux/efi.h>
#include <linux/profile.h>
#include <linux/timex.h>
+#include <linux/timeofday.h>

#include <asm/machvec.h>
#include <asm/delay.h>
@@ -45,11 +46,13 @@

#endif

+#ifndef CONFIG_NEWTOD
static struct time_interpolator itc_interpolator = {
.shift = 16,
.mask = 0xffffffffffffffffLL,
.source = TIME_SOURCE_CPU
};
+#endif /* CONFIG_NEWTOD */

static irqreturn_t
timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
@@ -211,6 +214,7 @@
local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<<IA64_NSEC_PER_CYC_SHIFT)
+ itc_freq/2)/itc_freq;

+#ifndef CONFIG_NEWTOD
if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
itc_interpolator.frequency = local_cpu_data->itc_freq;
itc_interpolator.drift = itc_drift;
@@ -229,6 +233,7 @@
#endif
register_time_interpolator(&itc_interpolator);
}
+#endif /* CONFIG_NEWTOD */

/* Setup the CPU local timer tick */
ia64_cpu_local_tick();
@@ -253,3 +258,17 @@
*/
set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec);
}
+
+/* arch specific timeofday hooks */
+
+/* read_persistent_clock():
+ *	Converts the timespec filled in by efi_gettimeofday() to a
+ *	nanosecond count.
+ */
+nsec_t read_persistent_clock(void)
+{
+	struct timespec efi_now;
+	nsec_t ns;
+
+	efi_gettimeofday(&efi_now);
+	ns = (nsec_t)(efi_now.tv_sec * NSEC_PER_SEC + efi_now.tv_nsec);
+
+	return ns;
+}
+
+/* sync_persistent_clock():
+ * Intentionally empty for now: ts is ignored and nothing is
+ * written back to a persistent clock on this architecture.
+ */
+void sync_persistent_clock(struct timespec ts)
+{
+ /* XXX - Something should go here, no? */
+}
+
diff -Nru a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c
--- a/arch/ia64/sn/kernel/sn2/timer.c 2005-04-29 15:16:59 -07:00
+++ b/arch/ia64/sn/kernel/sn2/timer.c 2005-04-29 15:16:59 -07:00
@@ -19,6 +19,7 @@
#include <asm/sn/shub_mmr.h>
#include <asm/sn/clksupport.h>

+#ifndef CONFIG_NEWTOD
extern unsigned long sn_rtc_cycles_per_second;

static struct time_interpolator sn2_interpolator = {
@@ -34,3 +35,8 @@
sn2_interpolator.addr = RTC_COUNTER_ADDR;
register_time_interpolator(&sn2_interpolator);
}
+#else
+void __init sn_timer_init(void)
+{
+}
+#endif
diff -Nru a/arch/ppc/Kconfig b/arch/ppc/Kconfig
--- a/arch/ppc/Kconfig 2005-04-29 15:16:59 -07:00
+++ b/arch/ppc/Kconfig 2005-04-29 15:16:59 -07:00
@@ -8,6 +8,10 @@
bool
default y

+config NEWTOD
+ bool
+ default y
+
config UID16
bool

diff -Nru a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c
--- a/arch/ppc/kernel/time.c 2005-04-29 15:16:59 -07:00
+++ b/arch/ppc/kernel/time.c 2005-04-29 15:16:59 -07:00
@@ -57,6 +57,7 @@
#include <linux/time.h>
#include <linux/init.h>
#include <linux/profile.h>
+#include <linux/timeofday.h>

#include <asm/segment.h>
#include <asm/io.h>
@@ -95,6 +96,46 @@

EXPORT_SYMBOL(rtc_lock);

+#ifdef CONFIG_NEWTOD
+nsec_t read_persistent_clock(void)
+{
+	if (!ppc_md.get_rtc_time) {
+		printk(KERN_ERR "ppc_md.get_rtc_time does not exist???\n");
+		return 0;
+	}
+	/* the hook's value is scaled as seconds; widen it before multiplying */
+	return (nsec_t)ppc_md.get_rtc_time() * NSEC_PER_SEC;
+}
+
+void sync_persistent_clock(struct timespec ts)
+{
+ /*
+ * update the rtc when needed, this should be performed on the
+ * right fraction of a second. Half or full second ?
+ * Full second works on mk48t59 clocks, others need testing.
+ * Note that this update is basically only used through
+ * the adjtimex system calls. Setting the HW clock in
+ * any other way is a /dev/rtc and userland business.
+ * This is still wrong by -0.5/+1.5 jiffies because of the
+ * timer interrupt resolution and possible delay, but here we
+ * hit a quantization limit which can only be solved by higher
+ * resolution timers and decoupling time management from timer
+ * interrupts. This is also wrong on the clocks
+ * which require being written at the half second boundary.
+ * We should have an rtc call that only sets the minutes and
+ * seconds like on Intel to avoid problems with non UTC clocks.
+ */
+ if ( ppc_md.set_rtc_time && ts.tv_sec - last_rtc_update >= 659 &&
+ abs((ts.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ) {
+ if (ppc_md.set_rtc_time(ts.tv_sec + 1 + time_offset) == 0)
+ last_rtc_update = ts.tv_sec+1;
+ else
+ /* Try again one minute later */
+ last_rtc_update += 60;
+ }
+}
+#endif /* CONFIG_NEWTOD */
+
/* Timer interrupt helper function */
static inline int tb_delta(unsigned *jiffy_stamp) {
int delta;
@@ -152,6 +193,7 @@
tb_last_stamp = jiffy_stamp;
do_timer(regs);

+#ifndef CONFIG_NEWTOD
/*
* update the rtc when needed, this should be performed on the
* right fraction of a second. Half or full second ?
@@ -178,6 +220,7 @@
/* Try again one minute later */
last_rtc_update += 60;
}
+#endif
write_sequnlock(&xtime_lock);
}
if ( !disarm_decr[smp_processor_id()] )
@@ -193,6 +236,7 @@
/*
* This version of gettimeofday has microsecond resolution.
*/
+#ifndef CONFIG_NEWTOD
void do_gettimeofday(struct timeval *tv)
{
unsigned long flags;
@@ -281,6 +325,7 @@
}

EXPORT_SYMBOL(do_settimeofday);
+#endif

/* This function is only called on the boot processor */
void __init time_init(void)
@@ -422,6 +467,8 @@
return mlt;
}

+/* XXX - sched_clock mess needs to be sorted out */
+#ifndef CONFIG_NEWTOD
unsigned long long sched_clock(void)
{
unsigned long lo, hi, hi2;
@@ -445,3 +492,4 @@
}
return tb;
}
+#endif
diff -Nru a/arch/ppc/platforms/chrp_time.c b/arch/ppc/platforms/chrp_time.c
--- a/arch/ppc/platforms/chrp_time.c 2005-04-29 15:16:59 -07:00
+++ b/arch/ppc/platforms/chrp_time.c 2005-04-29 15:16:59 -07:00
@@ -115,8 +115,10 @@
chrp_cmos_clock_write(save_control, RTC_CONTROL);
chrp_cmos_clock_write(save_freq_select, RTC_FREQ_SELECT);

+#ifndef CONFIG_NEWTOD
if ( (time_state == TIME_ERROR) || (time_state == TIME_BAD) )
time_state = TIME_OK;
+#endif
spin_unlock(&rtc_lock);
return 0;
}
diff -Nru a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig
--- a/arch/ppc64/Kconfig 2005-04-29 15:16:59 -07:00
+++ b/arch/ppc64/Kconfig 2005-04-29 15:16:59 -07:00
@@ -10,6 +10,10 @@
bool
default y

+config NEWTOD
+ bool
+ default y
+
config UID16
bool

diff -Nru a/arch/ppc64/kernel/sys_ppc32.c b/arch/ppc64/kernel/sys_ppc32.c
--- a/arch/ppc64/kernel/sys_ppc32.c 2005-04-29 15:16:59 -07:00
+++ b/arch/ppc64/kernel/sys_ppc32.c 2005-04-29 15:16:59 -07:00
@@ -322,8 +322,10 @@

ret = do_adjtimex(&txc);

+#ifndef CONFIG_NEWTOD
/* adjust the conversion of TB to time of day to track adjtimex */
ppc_adjtimex();
+#endif

if(put_user(txc.modes, &utp->modes) ||
__put_user(txc.offset, &utp->offset) ||
diff -Nru a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c
--- a/arch/ppc64/kernel/time.c 2005-04-29 15:16:59 -07:00
+++ b/arch/ppc64/kernel/time.c 2005-04-29 15:16:59 -07:00
@@ -50,6 +50,7 @@
#include <linux/profile.h>
#include <linux/cpu.h>
#include <linux/security.h>
+#include <linux/timeofday.h>

#include <asm/segment.h>
#include <asm/io.h>
@@ -107,6 +108,7 @@

static unsigned adjusting_time = 0;

+#ifndef CONFIG_NEWTOD
static __inline__ void timer_check_rtc(void)
{
/*
@@ -140,6 +142,52 @@
last_rtc_update += 60;
}
}
+#else /* CONFIG_NEWTOD */
+nsec_t read_persistent_clock(void)
+{
+ struct rtc_time tm; /* NOTE(review): stays uninitialized if get_boot_time() is skipped below */
+ unsigned long sec;
+#ifdef CONFIG_PPC_ISERIES
+ if (!piranha_simulator)
+#endif
+ ppc_md.get_boot_time(&tm);
+ /* tm_year and tm_mon are offset by 1900 and 1 before being handed to mktime() */
+ sec = mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
+ return (nsec_t)sec * NSEC_PER_SEC; /* seconds -> nanoseconds */
+}
+void sync_persistent_clock(struct timespec ts)
+{
+ /*
+ * update the rtc when needed, this should be performed on the
+ * right fraction of a second. Half or full second ?
+ * Full second works on mk48t59 clocks, others need testing.
+ * Note that this update is basically only used through
+ * the adjtimex system calls. Setting the HW clock in
+ * any other way is a /dev/rtc and userland business.
+ * This is still wrong by -0.5/+1.5 jiffies because of the
+ * timer interrupt resolution and possible delay, but here we
+ * hit a quantization limit which can only be solved by higher
+ * resolution timers and decoupling time management from timer
+ * interrupts. This is also wrong on the clocks
+ * which require being written at the half second boundary.
+ * We should have an rtc call that only sets the minutes and
+ * seconds like on Intel to avoid problems with non UTC clocks.
+ */
+ if ( ts.tv_sec - last_rtc_update >= 659 &&
+ abs((ts.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ) {
+ struct rtc_time tm;
+ to_tm(ts.tv_sec+1, &tm);
+ tm.tm_year -= 1900;
+ tm.tm_mon -= 1;
+ if (ppc_md.set_rtc_time(&tm) == 0)
+ last_rtc_update = ts.tv_sec+1;
+ else
+ /* Try again one minute later */
+ last_rtc_update += 60;
+ }
+}
+#endif /* CONFIG_NEWTOD */

/*
* This version of gettimeofday has microsecond resolution.
@@ -171,12 +219,14 @@
tv->tv_usec = usec;
}

+#ifndef CONFIG_NEWTOD
void do_gettimeofday(struct timeval *tv)
{
__do_gettimeofday(tv, get_tb());
}

EXPORT_SYMBOL(do_gettimeofday);
+#endif

/* Synchronize xtime with do_gettimeofday */

@@ -350,11 +400,15 @@
tb_last_stamp = lpaca->next_jiffy_update_tb;
timer_recalc_offset(lpaca->next_jiffy_update_tb);
do_timer(regs);
+#ifndef CONFIG_NEWTOD
timer_sync_xtime(lpaca->next_jiffy_update_tb);
timer_check_rtc();
+#endif
write_sequnlock(&xtime_lock);
+#ifndef CONFIG_NEWTOD
if ( adjusting_time && (time_adjust == 0) )
ppc_adjtimex();
+#endif
}
lpaca->next_jiffy_update_tb += tb_ticks_per_jiffy;
}
@@ -385,6 +439,7 @@
return 1;
}

+#ifndef CONFIG_NEWTOD
/*
* Scheduler clock - returns current time in nanosec units.
*
@@ -473,6 +528,7 @@
}

EXPORT_SYMBOL(do_settimeofday);
+#endif /* !CONFIG_NEWTOD */

void __init time_init(void)
{
@@ -525,7 +581,9 @@
systemcfg->stamp_xsec = xtime.tv_sec * XSEC_PER_SEC;
systemcfg->tb_to_xs = tb_to_xs;

+#ifndef CONFIG_NEWTOD
time_freq = 0;
+#endif

xtime.tv_nsec = 0;
last_rtc_update = xtime.tv_sec;
@@ -548,6 +606,7 @@

/* #define DEBUG_PPC_ADJTIMEX 1 */

+#ifndef CONFIG_NEWTOD
void ppc_adjtimex(void)
{
unsigned long den, new_tb_ticks_per_sec, tb_ticks, old_xsec, new_tb_to_xs, new_xsec, new_stamp_xsec;
@@ -671,6 +730,7 @@
write_sequnlock_irqrestore( &xtime_lock, flags );

}
+#endif /* !CONFIG_NEWTOD */


#define TICK_SIZE tick
diff -Nru a/arch/s390/Kconfig b/arch/s390/Kconfig
--- a/arch/s390/Kconfig 2005-04-29 15:16:59 -07:00
+++ b/arch/s390/Kconfig 2005-04-29 15:16:59 -07:00
@@ -127,6 +127,10 @@
This allows you to run 32-bit Linux/ELF binaries on your zSeries
in 64 bit mode. Everybody wants this; say Y.

+config NEWTOD
+ bool
+ default y
+
comment "Code generation options"

choice
diff -Nru a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
--- a/arch/s390/kernel/time.c 2005-04-29 15:16:59 -07:00
+++ b/arch/s390/kernel/time.c 2005-04-29 15:16:59 -07:00
@@ -29,6 +29,7 @@
#include <linux/profile.h>
#include <linux/timex.h>
#include <linux/notifier.h>
+#include <linux/timeofday.h>

#include <asm/uaccess.h>
#include <asm/delay.h>
@@ -89,6 +90,7 @@
return (unsigned long) now;
}

+#ifndef CONFIG_NEWTOD
/*
* This version of gettimeofday has microsecond resolution.
*/
@@ -149,7 +151,27 @@
}

EXPORT_SYMBOL(do_settimeofday);
+#endif

+nsec_t read_persistent_clock(void)
+{
+ unsigned long long nsecs;
+ /*
+ * The TOD clock counts from 1900-01-01. Bit 2^12 of the 64 bit register
+ * is micro-seconds. The constant is presumably the 1970 epoch - TODO confirm.
+ */
+ nsecs = get_clock() - 0x7d91048bca000000LL;
+ /*
+ * Calc nsecs * 1000 / 4096 (== (5/8)^3) as three ">> 3, * 5" steps,
+ * without overflow and without losing too many bits.
+ */
+ nsecs = (((((nsecs >> 3) * 5) >> 3) * 5) >> 3) * 5;
+ return (nsec_t) nsecs;
+}
+
+void sync_persistent_clock(struct timespec ts)
+{
+}

#ifdef CONFIG_PROFILING
#define s390_do_profile(regs) profile_tick(CPU_PROFILING, regs)
diff -Nru a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
--- a/arch/x86_64/Kconfig 2005-04-29 15:16:59 -07:00
+++ b/arch/x86_64/Kconfig 2005-04-29 15:16:59 -07:00
@@ -24,6 +24,10 @@
bool
default y

+config NEWTOD
+ bool
+ default y
+
config MMU
bool
default y
diff -Nru a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
--- a/arch/x86_64/kernel/time.c 2005-04-29 15:16:59 -07:00
+++ b/arch/x86_64/kernel/time.c 2005-04-29 15:16:59 -07:00
@@ -35,6 +35,7 @@
#include <asm/sections.h>
#include <linux/cpufreq.h>
#include <linux/hpet.h>
+#include <linux/timeofday.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/apic.h>
#endif
@@ -106,6 +107,7 @@

unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;

+#ifndef CONFIG_NEWTOD
/*
* This version of gettimeofday() has microsecond resolution and better than
* microsecond precision, as we're using at least a 10 MHz (usually 14.31818
@@ -180,6 +182,7 @@
}

EXPORT_SYMBOL(do_settimeofday);
+#endif /* CONFIG_NEWTOD */

unsigned long profile_pc(struct pt_regs *regs)
{
@@ -281,6 +284,7 @@
}


+#ifndef CONFIG_NEWTOD
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
* Note: This function is required to return accurate
* time even in the absence of multiple timer ticks.
@@ -357,6 +361,8 @@
}
#endif
}
+#endif /* CONFIG_NEWTOD */
+

static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
@@ -373,6 +379,7 @@

write_seqlock(&xtime_lock);

+#ifndef CONFIG_NEWTOD
if (vxtime.hpet_address) {
offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
delay = hpet_readl(HPET_COUNTER) - offset;
@@ -422,6 +429,7 @@
handle_lost_ticks(lost, regs);
jiffies += lost;
}
+#endif /* CONFIG_NEWTOD */

/*
* Do the timer stuff.
@@ -445,6 +453,7 @@
smp_local_timer_interrupt(regs);
#endif

+#ifndef CONFIG_NEWTOD
/*
* If we have an externally synchronized Linux clock, then update CMOS clock
* accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
@@ -458,7 +467,8 @@
set_rtc_mmss(xtime.tv_sec);
rtc_update = xtime.tv_sec + 660;
}
-
+#endif /* CONFIG_NEWTOD */
+
write_sequnlock(&xtime_lock);

return IRQ_HANDLED;
@@ -477,6 +487,7 @@
return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}

+#ifndef CONFIG_NEWTOD
unsigned long long sched_clock(void)
{
unsigned long a = 0;
@@ -498,6 +509,7 @@
rdtscll(a);
return cycles_2_ns(a);
}
+#endif /* CONFIG_NEWTOD */

unsigned long get_cmos_time(void)
{
@@ -559,6 +571,30 @@
return mktime(year, mon, day, hour, min, sec);
}

+/* arch specific timeofday hooks */
+nsec_t read_persistent_clock(void)
+{
+ return (nsec_t)get_cmos_time() * NSEC_PER_SEC;
+}
+
+void sync_persistent_clock(struct timespec ts)
+{
+	static unsigned long rtc_update = 0;	/* earliest tv_sec for the next CMOS write */
+	/*
+	 * If we have an externally synchronized Linux clock, then update
+	 * CMOS clock accordingly every ~11 minutes (660 s). set_rtc_mmss()
+	 * will be called in the jiffy closest to exactly 500 ms before the
+	 * next second. Only ts is consulted, so the time that is tested is
+	 * also the time that is written. If the update fails, we don't care,
+	 * as it'll be updated on the next turn anyway.
+	 */
+	if (ts.tv_sec > rtc_update &&
+	    abs(ts.tv_nsec - 500000000) <= tick_nsec / 2) {
+		set_rtc_mmss(ts.tv_sec);
+		rtc_update = ts.tv_sec + 660;
+	}
+}
+
#ifdef CONFIG_CPU_FREQ

/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -954,6 +990,7 @@

__setup("report_lost_ticks", time_setup);

+#ifndef CONFIG_NEWTOD
static long clock_cmos_diff;
static unsigned long sleep_start;

@@ -991,6 +1028,24 @@
wall_jiffies += sleep_length;
return 0;
}
+#else /* !CONFIG_NEWTOD */
+static int timer_suspend(struct sys_device *dev, u32 state)
+{
+ timeofday_suspend_hook();
+ return 0;
+}
+
+static int timer_resume(struct sys_device *dev)
+{
+ if (vxtime.hpet_address)
+ hpet_reenable();
+ else
+ i8254_timer_resume();
+
+ timeofday_resume_hook();
+ return 0;
+}
+#endif

static struct sysdev_class timer_sysclass = {
.resume = timer_resume,
diff -Nru a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
--- a/arch/x86_64/kernel/vsyscall.c 2005-04-29 15:16:59 -07:00
+++ b/arch/x86_64/kernel/vsyscall.c 2005-04-29 15:16:59 -07:00
@@ -217,8 +217,13 @@
BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
map_vsyscall();
+/* XXX - disable vsyscall gettimeofday for now */
+#ifndef CONFIG_NEWTOD
sysctl_vsyscall = 1;
register_sysctl_table(kernel_root_table2, 0);
+#else
+ sysctl_vsyscall = 0;
+#endif
return 0;
}

diff -Nru a/include/asm-generic/div64.h b/include/asm-generic/div64.h
--- a/include/asm-generic/div64.h 2005-04-29 15:16:59 -07:00
+++ b/include/asm-generic/div64.h 2005-04-29 15:16:59 -07:00
@@ -55,4 +55,13 @@

#endif /* BITS_PER_LONG */

+#ifndef div_long_long_rem
+#define div_long_long_rem(dividend,divisor,remainder) \
+({ /* evaluates to the quotient; the remainder is stored through the pointer */ \
+	u64 __dllr_quot = (dividend); /* private name: cannot shadow a caller variable */ \
+	*(remainder) = do_div(__dllr_quot, (divisor)); \
+	__dllr_quot; \
+})
+#endif
+
#endif /* _ASM_GENERIC_DIV64_H */
diff -Nru a/include/asm-ppc64/time.h b/include/asm-ppc64/time.h
--- a/include/asm-ppc64/time.h 2005-04-29 15:16:59 -07:00
+++ b/include/asm-ppc64/time.h 2005-04-29 15:16:59 -07:00
@@ -21,6 +21,7 @@
#include <asm/processor.h>
#include <asm/paca.h>
#include <asm/iSeries/HvCall.h>
+#include <asm/percpu.h>

/* time.c */
extern unsigned long tb_ticks_per_jiffy;
diff -Nru a/include/linux/timeofday.h b/include/linux/timeofday.h
--- a/include/linux/timeofday.h 2005-04-29 15:16:59 -07:00
+++ b/include/linux/timeofday.h 2005-04-29 15:16:59 -07:00
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/time.h>
#include <linux/timex.h>
+#include <linux/jiffies.h> /* For div_long_long_rem */
#include <asm/div64.h>

#ifdef CONFIG_NEWTOD


2005-04-29 22:56:57

by john stultz

[permalink] [raw]
Subject: [RFC][PATCH (3/4)] new timeofday arch specific timesource drivers (v A4)

All,
This patch implements most of the time sources for i386, x86-64,
ppc32 and ppc64 (tsc, pit, cyclone, acpi-pm, hpet and timebase). There
are also initial untested sketches for s390 and the ia64 itc and sn2_rtc
timesources. It applies on top of my linux-2.6.12-rc2_timeofday-arch_A4
patch. It provides real hardware timesources (as opposed to the example
jiffies timesource) that can be used for more realistic testing.

This patch is the shabbiest of the three. It needs to be broken up and
cleaned up. The i386_pit.c timesource is broken. I have attempted to clean
up the hpet and cyclone code so that it can be shared between x86-64, i386
and ia64, but it still needs testing. acpi_pm also needs to be made
arch generic, but for now it will get you going so you can test and play
with the core code.

New in this release:
o s390 support (Big thanks to Martin Schwidefsky!)

Items still on the TODO list:
o Fix TSC C3 stalls
o real ia64 timesources
o make cyclone/acpi_pm arch generic
o example interpolation timesource
o fix i386_pit timesource
o all other arch timesources (volunteers wanted!)
o lots of cleanups
o lots of testing

thanks
-john

linux-2.6.12-rc2_timeofday-timesources_A4.patch
===============================================
diff -Nru a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile 2005-04-29 15:21:27 -07:00
+++ b/arch/i386/kernel/Makefile 2005-04-29 15:21:27 -07:00
@@ -7,10 +7,10 @@
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
- doublefault.o quirks.o
+ doublefault.o quirks.o tsc.o

obj-y += cpu/
-obj-y += timers/
+obj-$(if $(CONFIG_NEWTOD),,y) += timers/
obj-$(CONFIG_ACPI_BOOT) += acpi/
obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
obj-$(CONFIG_MCA) += mca.o
diff -Nru a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
--- a/arch/i386/kernel/acpi/boot.c 2005-04-29 15:21:27 -07:00
+++ b/arch/i386/kernel/acpi/boot.c 2005-04-29 15:21:27 -07:00
@@ -547,7 +547,7 @@


#ifdef CONFIG_HPET_TIMER
-
+#include <asm/hpet.h>
static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
{
struct acpi_table_hpet *hpet_tbl;
@@ -570,18 +570,12 @@
#ifdef CONFIG_X86_64
vxtime.hpet_address = hpet_tbl->addr.addrl |
((long) hpet_tbl->addr.addrh << 32);
-
- printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
- hpet_tbl->id, vxtime.hpet_address);
+ hpet_address = vxtime.hpet_address;
#else /* X86 */
- {
- extern unsigned long hpet_address;
-
hpet_address = hpet_tbl->addr.addrl;
+#endif /* X86 */
printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
hpet_tbl->id, hpet_address);
- }
-#endif /* X86 */

return 0;
}
diff -Nru a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
--- a/arch/i386/kernel/i8259.c 2005-04-29 15:21:27 -07:00
+++ b/arch/i386/kernel/i8259.c 2005-04-29 15:21:27 -07:00
@@ -387,6 +387,48 @@
}
}

+#ifdef CONFIG_NEWTOD
+void setup_pit_timer(void)
+{
+ extern spinlock_t i8253_lock;
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+ outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ udelay(10);
+ outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
+ udelay(10);
+ outb(LATCH >> 8 , PIT_CH0); /* MSB */
+ spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+static int timer_resume(struct sys_device *dev)
+{
+ setup_pit_timer();
+ return 0;
+}
+
+static struct sysdev_class timer_sysclass = {
+ set_kset_name("timer_pit"),
+ .resume = timer_resume,
+};
+
+static struct sys_device device_timer = {
+ .id = 0,
+ .cls = &timer_sysclass,
+};
+
+static int __init init_timer_sysfs(void)
+{
+ int error = sysdev_class_register(&timer_sysclass);
+ if (!error)
+ error = sysdev_register(&device_timer);
+ return error;
+}
+
+device_initcall(init_timer_sysfs);
+#endif
+
void __init init_IRQ(void)
{
int i;
diff -Nru a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
--- a/arch/i386/kernel/setup.c 2005-04-29 15:21:27 -07:00
+++ b/arch/i386/kernel/setup.c 2005-04-29 15:21:27 -07:00
@@ -50,6 +50,7 @@
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/io.h>
+#include <asm/tsc.h>
#include "setup_arch_pre.h"
#include <bios_ebda.h>

@@ -1523,6 +1524,7 @@
conswitchp = &dummy_con;
#endif
#endif
+ tsc_init();
}

#include "setup_arch_post.h"
diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
--- a/arch/i386/kernel/time.c 2005-04-29 15:21:27 -07:00
+++ b/arch/i386/kernel/time.c 2005-04-29 15:21:27 -07:00
@@ -88,7 +88,9 @@
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);

+#ifndef CONFIG_NEWTOD
struct timer_opts *cur_timer = &timer_none;
+#endif

/*
* This is a special lock that is owned by the CPU and holds the index
diff -Nru a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c
--- a/arch/i386/kernel/timers/common.c 2005-04-29 15:21:27 -07:00
+++ b/arch/i386/kernel/timers/common.c 2005-04-29 15:21:27 -07:00
@@ -22,8 +22,6 @@
* device.
*/

-#define CALIBRATE_TIME (5 * 1000020/HZ)
-
unsigned long __init calibrate_tsc(void)
{
mach_prepare_counter();
diff -Nru a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/arch/i386/kernel/tsc.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,111 @@
+#include <linux/init.h>
+#include <linux/timex.h>
+#include <linux/cpufreq.h>
+#include <asm/tsc.h>
+#include "mach_timer.h"
+
+unsigned long cpu_freq_khz;
+#ifdef CONFIG_NEWTOD
+int tsc_disable;
+#endif
+
+void tsc_init(void)
+{
+ unsigned long long start, end;
+ unsigned long count;
+ u64 delta64;
+ int i;
+ /* Time one mach_countup() run with the TSC and publish the result as cpu_khz. */
+ /* repeat 3 times to make sure the cache is warm; only the last pass's readings are kept */
+ for(i=0; i < 3; i++) {
+ mach_prepare_counter();
+ rdtscll(start);
+ mach_countup(&count);
+ rdtscll(end);
+ }
+ delta64 = end - start;
+
+ /* cpu freq too fast: more than 2^32 cycles elapsed; return without setting cpu_khz */
+ if(delta64 > (1ULL<<32))
+ return;
+ /* cpu freq too slow: delta is no larger than CALIBRATE_TIME; return likewise */
+ if (delta64 <= CALIBRATE_TIME)
+ return;
+ /* cpu_freq_khz = delta * 1000 / CALIBRATE_TIME (kHz if CALIBRATE_TIME is in usec - TODO confirm) */
+ delta64 *= 1000;
+ do_div(delta64,CALIBRATE_TIME);
+ cpu_freq_khz = (unsigned long)delta64;
+
+ cpu_khz = cpu_freq_khz;
+
+ printk("Detected %lu.%03lu MHz processor.\n",
+ cpu_khz / 1000, cpu_khz % 1000);
+
+}
+
+
+/* All of the code below comes from arch/i386/kernel/timers/timer_tsc.c
+ * XXX: severely needs better comments and the ifdefs killed.
+ */
+
+#ifdef CONFIG_CPU_FREQ
+static unsigned int cpufreq_init = 0;
+
+/* If the CPU frequency is scaled, TSC-based delays will need a different
+ * loops_per_jiffy value to function properly.
+ */
+
+static unsigned int ref_freq = 0;
+static unsigned long loops_per_jiffy_ref = 0;
+
+#ifndef CONFIG_SMP
+static unsigned long cpu_khz_ref = 0;
+#endif
+
+static int time_cpufreq_notifier(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+
+ if (val != CPUFREQ_RESUMECHANGE)
+ write_seqlock_irq(&xtime_lock);
+ if (!ref_freq) {
+ ref_freq = freq->old;
+ loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
+#ifndef CONFIG_SMP
+ cpu_khz_ref = cpu_khz;
+#endif
+ }
+
+ if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+ (val == CPUFREQ_RESUMECHANGE)) {
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+#ifndef CONFIG_SMP
+ if (cpu_khz)
+ cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
+#endif
+ }
+
+ if (val != CPUFREQ_RESUMECHANGE)
+ write_sequnlock_irq(&xtime_lock);
+
+ return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+ .notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+ int ret;
+ ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ if (!ret)
+ cpufreq_init = 1;
+ return ret;
+}
+core_initcall(cpufreq_tsc);
+#endif /* CONFIG_CPU_FREQ */
diff -Nru a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
--- a/arch/x86_64/kernel/time.c 2005-04-29 15:21:27 -07:00
+++ b/arch/x86_64/kernel/time.c 2005-04-29 15:21:27 -07:00
@@ -59,6 +59,7 @@
#undef HPET_HACK_ENABLE_DANGEROUS

unsigned int cpu_khz; /* TSC clocks / usec, not used here */
+unsigned long hpet_address;
static unsigned long hpet_period; /* fsecs / HPET clock */
unsigned long hpet_tick; /* HPET clocks / interrupt */
unsigned long vxtime_hz = PIT_TICK_RATE;
diff -Nru a/drivers/timesource/Makefile b/drivers/timesource/Makefile
--- a/drivers/timesource/Makefile 2005-04-29 15:21:27 -07:00
+++ b/drivers/timesource/Makefile 2005-04-29 15:21:27 -07:00
@@ -1 +1,15 @@
obj-y += jiffies.o
+obj-$(CONFIG_X86) += tsc.o
+obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o
+obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o
+obj-$(CONFIG_HPET_TIMER) += hpet.o
+obj-$(CONFIG_PPC64) += ppc64_timebase.o
+obj-$(CONFIG_PPC) += ppc_timebase.o
+obj-$(CONFIG_ARCH_S390) += s390_tod.o
+
+# XXX - Known broken
+#obj-$(CONFIG_X86) += i386_pit.o
+
+# XXX - Untested/Uncompiled
+#obj-$(CONFIG_IA64) += itc.o
+#obj-$(CONFIG_IA64_SGI_SN2) += sn2_rtc.o
diff -Nru a/drivers/timesource/acpi_pm.c b/drivers/timesource/acpi_pm.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/acpi_pm.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,116 @@
+#include <linux/timesource.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include "mach_timer.h"
+
+/* Number of PMTMR ticks expected during calibration run */
+#define PMTMR_TICKS_PER_SEC 3579545
+#define PMTMR_EXPECTED_RATE \
+ ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10))
+
+
+/* The I/O port the PMTMR resides at.
+ * The location is detected during setup_arch(),
+ * in arch/i386/kernel/acpi/boot.c */
+u32 pmtmr_ioport = 0;
+
+#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
+
+static inline u32 read_pmtmr(void)
+{
+ u32 v1=0,v2=0,v3=0;
+ /* On various broken chipsets (ICH4, PIIX4 and PIIX4E) the ACPI PM
+ * time source is reportedly not latched, so it must be read
+ * multiple times to ensure a safe value is read. Retry while the
+ * three reads are out of order in one of the ways tested below.
+ */
+ do {
+ v1 = inl(pmtmr_ioport);
+ v2 = inl(pmtmr_ioport);
+ v3 = inl(pmtmr_ioport);
+ } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
+ || (v3 > v1 && v3 < v2));
+
+ /* return the middle read, masked to 24 bits */
+ return v2 & ACPI_PM_MASK;
+}
+
+
+static cycle_t acpi_pm_read(void)
+{
+ return (cycle_t)read_pmtmr();
+}
+
+struct timesource_t timesource_acpi_pm = {
+ .name = "acpi_pm",
+ .priority = 200,
+ .type = TIMESOURCE_FUNCTION,
+ .read_fnct = acpi_pm_read,
+ .mask = (cycle_t)ACPI_PM_MASK,
+ .mult = 0, /* to be calculated in init_acpi_pm_timesource() */
+ .shift = 22,
+};
+
+/*
+ * Some boards have the PMTMR running way too fast. We check
+ * the PMTMR rate against PIT channel 2 to catch these cases.
+ */
+static int verify_pmtmr_rate(void)
+{
+ u32 value1, value2;
+ unsigned long count, delta;
+
+ mach_prepare_counter();
+ value1 = read_pmtmr();
+ mach_countup(&count);
+ value2 = read_pmtmr();
+ delta = (value2 - value1) & ACPI_PM_MASK;
+
+ /* Check that the PMTMR delta is within 5% of what we expect */
+ if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 ||
+ delta > (PMTMR_EXPECTED_RATE * 21) / 20) {
+ printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE);
+ return -1;
+ }
+
+ return 0;
+}
+
+
+static int init_acpi_pm_timesource(void)
+{
+	u32 value1, value2;
+	unsigned int i;
+
+	if (!pmtmr_ioport)	/* no PM timer port has been set */
+		return -ENODEV;
+
+	timesource_acpi_pm.mult = timesource_hz2mult(PMTMR_TICKS_PER_SEC,
+						timesource_acpi_pm.shift);
+
+	/* "verify" this timing source: poll until the counter value changes */
+	value1 = read_pmtmr();
+	for (i = 0; i < 10000; i++) {
+		value2 = read_pmtmr();
+		if (value2 == value1)
+			continue;
+		if (value2 > value1)
+			goto pm_good;
+		if ((value2 < value1) && ((value2) < 0xFFF))	/* smaller, but close to zero: accepted */
+			goto pm_good;
+		printk(KERN_INFO "PM-Timer had inconsistent results: 0x%x, 0x%x - aborting.\n", value1, value2);
+		return -EINVAL;
+	}
+	printk(KERN_INFO "PM-Timer had no reasonable result: 0x%x - aborting.\n", value1);
+	return -ENODEV;
+
+pm_good:
+	if (verify_pmtmr_rate() != 0)
+		return -ENODEV;
+
+	register_timesource(&timesource_acpi_pm);
+	return 0;
+}
+
+module_init(init_acpi_pm_timesource);
diff -Nru a/drivers/timesource/cyclone.c b/drivers/timesource/cyclone.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/cyclone.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,135 @@
+#include <linux/timesource.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include "mach_timer.h"
+
+#define CYCLONE_CBAR_ADDR 0xFEB00CD0 /* base address ptr*/
+#define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to control register */
+#define CYCLONE_MPCS_OFFSET 0x51A8 /* offset to select register */
+#define CYCLONE_MPMC_OFFSET 0x51D0 /* offset to count register */
+#define CYCLONE_TIMER_FREQ 100000000
+#define CYCLONE_TIMER_MASK (0xFFFFFFFF) /* 32 bit mask */
+
+int use_cyclone = 0;
+
+struct timesource_t timesource_cyclone = {
+ .name = "cyclone",
+ .priority = 100,
+ .type = TIMESOURCE_MMIO_32,
+ .mmio_ptr = NULL, /* to be set */
+ .mask = (cycle_t)CYCLONE_TIMER_MASK,
+ .mult = 10,
+ .shift = 0,
+};
+
+static unsigned long calibrate_cyclone(void)
+{
+ unsigned long start, end, delta;
+ unsigned long i, count;
+ unsigned long cyclone_freq_khz;
+
+ /* repeat 3 times to make sure the cache is warm */
+ for(i=0; i < 3; i++) {
+ mach_prepare_counter();
+ start = readl(timesource_cyclone.mmio_ptr);
+ mach_countup(&count);
+ end = readl(timesource_cyclone.mmio_ptr);
+ }
+
+ delta = end - start;
+ printk("cyclone delta: %lu\n", delta);
+ delta *= (ACTHZ/1000)>>8;
+ printk("delta*hz = %lu\n", delta);
+ cyclone_freq_khz = delta/CALIBRATE_ITERATION;
+ printk("calculated cyclone_freq: %lu khz\n", cyclone_freq_khz);
+ return cyclone_freq_khz;
+}
+
+static int init_cyclone_timesource(void)
+{
+	unsigned long base; /* saved value from CBAR */
+	unsigned long offset;
+	u32 __iomem* reg;
+	u32 __iomem* volatile cyclone_timer; /* Cyclone MPMC0 register */
+	unsigned long khz;
+	int i;
+
+	/* make sure we're on a summit box */
+	if (!use_cyclone) return -ENODEV;
+
+	printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
+
+	/* find base address */
+	offset = CYCLONE_CBAR_ADDR;
+	reg = ioremap_nocache(offset, sizeof(*reg));
+	if(!reg){
+		printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
+		return -ENODEV;
+	}
+	/* even on 64bit systems, this is only 32bits */
+	base = readl(reg);
+	iounmap(reg);	/* unmap before the check so the error path cannot leak the mapping */
+	if(!base){
+		printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
+		return -ENODEV;
+	}
+
+	/* setup PMCC: write 1 to the control register */
+	offset = base + CYCLONE_PMCC_OFFSET;
+	reg = ioremap_nocache(offset, sizeof(*reg));
+	if(!reg){
+		printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
+		return -ENODEV;
+	}
+	writel(0x00000001,reg);
+	iounmap(reg);
+
+	/* setup MPCS: write 1 to the select register */
+	offset = base + CYCLONE_MPCS_OFFSET;
+	reg = ioremap_nocache(offset, sizeof(*reg));
+	if(!reg){
+		printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
+		return -ENODEV;
+	}
+	writel(0x00000001,reg);
+	iounmap(reg);
+
+	/* map in cyclone_timer; this mapping is kept while the timesource is in use */
+	offset = base + CYCLONE_MPMC_OFFSET;
+	cyclone_timer = ioremap_nocache(offset, sizeof(u64));
+	if(!cyclone_timer){
+		printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
+		return -ENODEV;
+	}
+
+	/* quick test to make sure it's ticking */
+	for(i=0; i<3; i++){
+		u32 old = readl(cyclone_timer);
+		int stall = 100;
+		while(stall--) barrier();
+		if(readl(cyclone_timer) == old){
+			printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
+			iounmap(cyclone_timer);
+			cyclone_timer = NULL;
+			return -ENODEV;
+		}
+	}
+	timesource_cyclone.mmio_ptr = cyclone_timer;
+
+	/* sort out mult/shift values */
+	khz = calibrate_cyclone();
+	timesource_cyclone.shift = 22;
+	timesource_cyclone.mult = timesource_khz2mult(khz,
+						timesource_cyclone.shift);
+
+	register_timesource(&timesource_cyclone);
+
+	return 0;
+}
+
+module_init(init_cyclone_timesource);
diff -Nru a/drivers/timesource/hpet.c b/drivers/timesource/hpet.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/hpet.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,59 @@
+#include <linux/timesource.h>
+#include <linux/hpet.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include <asm/hpet.h>
+
+#define HPET_MASK (0xFFFFFFFF)
+#define HPET_SHIFT 22
+
+/* FSEC = 10^-15 NSEC = 10^-9 */
+#define FSEC_PER_NSEC 1000000
+
+struct timesource_t timesource_hpet = {
+ .name = "hpet",
+ .priority = 300,
+ .type = TIMESOURCE_MMIO_32,
+ .mmio_ptr = NULL,
+ .mask = (cycle_t)HPET_MASK,
+ .mult = 0, /* set below */
+ .shift = HPET_SHIFT,
+};
+
+/* Map the HPET, derive mult from its period register, and register it. */
+static int init_hpet_timesource(void)
+{
+	unsigned long hpet_period;
+	void __iomem *hpet_base;
+	u64 tmp;
+
+	if (!hpet_address)
+		return -ENODEV;
+
+	/* map the hpet registers; bail out if the mapping fails */
+	hpet_base = (void __iomem *)ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+	if (!hpet_base)
+		return -ENODEV;
+	timesource_hpet.mmio_ptr = hpet_base + HPET_COUNTER;
+
+	/* calculate the frequency */
+	hpet_period = readl(hpet_base + HPET_PERIOD);
+
+	/* hpet period is in femtoseconds per cycle,
+	 * so we need to convert this to ns/cyc units
+	 * approximated by mult/2^shift
+	 *
+	 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+	 * fsec/cyc * 1nsec/1000000fsec * 2^shift = mult
+	 * (fsec/cyc << shift)/1000000 = mult
+	 * (hpet_period << shift)/FSEC_PER_NSEC = mult
+	 */
+	tmp = (u64)hpet_period << HPET_SHIFT;
+	do_div(tmp, FSEC_PER_NSEC);
+	timesource_hpet.mult = (u32)tmp;
+
+	register_timesource(&timesource_hpet);
+	return 0;
+}
+module_init(init_hpet_timesource);
diff -Nru a/drivers/timesource/i386_pit.c b/drivers/timesource/i386_pit.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/i386_pit.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,100 @@
+/* pit timesource: XXX - broken!
+ */
+
+#include <linux/timesource.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/timer.h>
+#include "io_ports.h"
+#include "do_timer.h"
+
+extern u64 jiffies_64;
+extern long jiffies;
+extern spinlock_t i8253_lock;
+
+/* Since the PIT overflows every tick, it's not very useful
+ * to just read it by itself. So throw jiffies into the mix
+ * and just return nanoseconds in pit_read().
+ */
+
+static cycle_t pit_read(void)
+{
+ unsigned long flags;
+ int count;
+ unsigned long jiffies_t;
+ static int count_p;
+ static unsigned long jiffies_p = 0;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+
+ outb_p(0x00, PIT_MODE); /* latch the count ASAP */
+
+ count = inb_p(PIT_CH0); /* read the latched count */
+ jiffies_t = jiffies;
+ count |= inb_p(PIT_CH0) << 8;
+
+ /* VIA686a test code... reset the latch if count > max + 1 */
+ if (count > LATCH) {
+ outb_p(0x34, PIT_MODE);
+ outb_p(LATCH & 0xff, PIT_CH0);
+ outb(LATCH >> 8, PIT_CH0);
+ count = LATCH - 1;
+ }
+
+ /*
+ * avoiding timer inconsistencies (they are rare, but they happen)...
+ * there are two kinds of problems that must be avoided here:
+ * 1. the timer counter underflows
+ * 2. hardware problem with the timer, not giving us continuous time,
+ * the counter does small "jumps" upwards on some Pentium systems,
+ * (see c't 95/10 page 335 for Neptun bug.)
+ */
+
+ if( jiffies_t == jiffies_p ) {
+ if( count > count_p ) {
+ /* the nutcase */
+ count = do_timer_overflow(count);
+ }
+ } else
+ jiffies_p = jiffies_t;
+
+ count_p = count;
+
+ spin_unlock_irqrestore(&i8253_lock, flags);
+
+ count = ((LATCH-1) - count) * TICK_SIZE;
+ count = (count + LATCH/2) / LATCH;
+
+ count *= 1000; /* convert count from usec->nsec */
+
+ return (cycle_t)((jiffies_64 * TICK_NSEC) + count);
+}
+
+static cycle_t pit_delta(cycle_t now, cycle_t then)
+{
+ return now - then;
+}
+
+/* just return cyc, as its already in ns */
+static nsec_t pit_cyc2ns(cycle_t cyc, cycle_t* remainder)
+{
+ return (nsec_t)cyc;
+}
+
+static struct timesource_t timesource_pit = {
+ .name = "pit",
+ .priority = 0,
+ .read = pit_read,
+ .delta = pit_delta,
+ .cyc2ns = pit_cyc2ns,
+};
+
+static int init_pit_timesource(void)
+{
+ register_timesource(&timesource_pit);
+ return 0;
+}
+
+module_init(init_pit_timesource);
diff -Nru a/drivers/timesource/itc.c b/drivers/timesource/itc.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/itc.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,35 @@
+/* XXX - this is totally untested and uncompiled
+ * TODO:
+ * o cpufreq issues
+ * o unsynched ITCs ?
+ */
+#include <linux/timesource.h>
+
+/* XXX - Other includes needed for:
+ * sal_platform_features, IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT,
+ * local_cpu_data->itc_freq
+ * See arch/ia64/kernel/time.c for ideas
+ */
+
+static struct timesource_t timesource_itc = {
+ .name = "itc",
+ .priority = 25,
+ .type = TIMESOURCE_CYCLES,
+ .mask = (cycle_t)-1,
+ .mult = 0, /* to be set */
+ .shift = 22,
+};
+
+static int init_itc_timesource(void)
+{
+ if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
+ /* XXX - I'm not really sure if itc_freq is in cyc/sec */
+ timesource_itc.mult = timesource_hz2mult(local_cpu_data->itc_freq,
+ timesource_itc.shift);
+ register_timesource(&timesource_itc);
+ }
+ return 0;
+}
+
+module_init(init_itc_timesource);
+
diff -Nru a/drivers/timesource/ppc64_timebase.c b/drivers/timesource/ppc64_timebase.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/ppc64_timebase.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,33 @@
+#include <linux/timesource.h>
+#include <asm/time.h>
+
+static cycle_t timebase_read(void)
+{
+ return (cycle_t)get_tb();
+}
+
+struct timesource_t timesource_timebase = {
+ .name = "timebase",
+ .priority = 200,
+ .type = TIMESOURCE_FUNCTION,
+ .read_fnct = timebase_read,
+ .mask = (cycle_t)-1,
+ .mult = 0,
+ .shift = 22,
+};
+
+
+/* XXX - this should be calculated or properly externed! */
+extern unsigned long tb_to_ns_scale;
+extern unsigned long tb_to_ns_shift;
+extern unsigned long tb_ticks_per_sec;
+
+static int init_timebase_timesource(void)
+{
+ timesource_timebase.mult = timesource_hz2mult(tb_ticks_per_sec,
+ timesource_timebase.shift);
+ register_timesource(&timesource_timebase);
+ return 0;
+}
+
+module_init(init_timebase_timesource);
diff -Nru a/drivers/timesource/ppc_timebase.c b/drivers/timesource/ppc_timebase.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/ppc_timebase.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,56 @@
+#include <linux/timesource.h>
+#include <linux/init.h>
+#include <asm/time.h>
+#ifndef CONFIG_PPC64
+
+/* XXX - this should be calculated or properly externed! */
+
+/* DJWONG: tb_to_ns_scale is supposed to be set in time_init.
+ * No idea if that actually _happens_ on a ppc601, though it
+ * seems to work on a B&W G3. :D */
+extern unsigned long tb_to_ns_scale;
+
+static cycle_t ppc_timebase_read(void)
+{
+ unsigned long lo, hi, hi2;
+ unsigned long long tb;
+
+ do {
+ hi = get_tbu();
+ lo = get_tbl();
+ hi2 = get_tbu();
+ } while (hi2 != hi);
+ tb = ((unsigned long long) hi << 32) | lo;
+
+ return (cycle_t)tb;
+}
+
+struct timesource_t timesource_ppc_timebase = {
+ .name = "ppc_timebase",
+ .priority = 200,
+ .type = TIMESOURCE_FUNCTION,
+ .read_fnct = ppc_timebase_read,
+ .mask = (cycle_t)-1,
+ .mult = 0,
+ .shift = 22,
+};
+
+/* Derive mult from the timebase tick rate, log it, and register the source. */
+static int init_ppc_timebase_timesource(void)
+{
+	/* DJWONG: Extrapolated from ppc64 code. */
+	unsigned long tb_ticks_per_sec = tb_ticks_per_jiffy * HZ;
+
+	timesource_ppc_timebase.mult = timesource_hz2mult(tb_ticks_per_sec,
+					timesource_ppc_timebase.shift);
+
+	/* mult is cast so the argument matches %lu whatever its declared width */
+	printk(KERN_INFO "ppc_timebase: tb_ticks_per_sec = %lu, mult = %lu, tb_to_ns = %lu.\n",
+		tb_ticks_per_sec, (unsigned long)timesource_ppc_timebase.mult, tb_to_ns_scale);
+
+	register_timesource(&timesource_ppc_timebase);
+	return 0;
+}
+
+module_init(init_ppc_timebase_timesource);
+#endif /* CONFIG_PPC64 */
diff -Nru a/drivers/timesource/s390_tod.c b/drivers/timesource/s390_tod.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/s390_tod.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,37 @@
+/*
+ * linux/drivers/timesource/s390_tod.c
+ *
+ * (C) Copyright IBM Corp. 2004
+ *
+ * Author(s): Martin Schwidefsky ([email protected]),
+ *
+ * s390 TOD clock time source.
+ */
+
+#include <linux/timesource.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+
+static cycle_t s390_tod_read(void)
+{
+ return get_clock();
+}
+
+struct timesource_t timesource_s390_tod = {
+ .name = "TOD",
+ .priority = 100,
+ .type = TIMESOURCE_FUNCTION,
+ .read_fnct = s390_tod_read,
+ .mask = -1ULL,
+ .mult = 1000,
+ .shift = 12
+};
+
+
+static int init_s390_timesource(void)
+{
+ register_timesource(&timesource_s390_tod);
+ return 0;
+}
+
+module_init(init_s390_timesource);
diff -Nru a/drivers/timesource/sn2_rtc.c b/drivers/timesource/sn2_rtc.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/sn2_rtc.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,29 @@
+#include <linux/timesource.h>
+/* XXX this will need some includes
+ * to find: sn_rtc_cycles_per_second and RTC_COUNTER_ADDR
+ * See arch/ia64/sn/kernel/sn2/timer.c for likely suspects
+ */
+
+#define SN2_RTC_MASK ((1LL << 55) - 1)
+#define SN2_SHIFT 10
+
+struct timesource_t timesource_sn2_rtc = {
+ .name = "sn2_rtc",
+ .priority = 300, /* XXX - not sure what this should be */
+ .type = TIMESOURCE_MMIO_64,
+ .mmio_ptr = NULL,
+ .mask = (cycle_t)SN2_RTC_MASK,
+ .mult = 0, /* set below */
+ .shift = SN2_SHIFT,
+};
+
+/* Fill in mult and the counter address, then register the SN2 RTC source. */
+static int init_sn2_timesource(void)
+{
+	timesource_sn2_rtc.mult = timesource_hz2mult(sn_rtc_cycles_per_second,
+						     SN2_SHIFT);
+	timesource_sn2_rtc.mmio_ptr = RTC_COUNTER_ADDR;
+	register_timesource(&timesource_sn2_rtc);
+	return 0;
+}
+module_init(init_sn2_timesource);
diff -Nru a/drivers/timesource/tsc.c b/drivers/timesource/tsc.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/tsc.c 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,46 @@
+/* TODO:
+ * o better calibration
+ */
+
+#include <linux/timesource.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+
+static void tsc_update_callback(void);
+
+static struct timesource_t timesource_tsc = {
+ .name = "tsc",
+ .priority = 25,
+ .type = TIMESOURCE_CYCLES,
+ .mask = (cycle_t)-1,
+ .mult = 0, /* to be set */
+ .shift = 22,
+ .update_callback = tsc_update_callback,
+};
+
+static unsigned long current_cpu_khz = 0;
+
+/* Rescale mult, but only when cpu_khz has changed since the last update. */
+static void tsc_update_callback(void)
+{
+	if (current_cpu_khz == cpu_khz)
+		return;
+	current_cpu_khz = cpu_khz;
+	timesource_tsc.mult = timesource_khz2mult(current_cpu_khz,
+						  timesource_tsc.shift);
+}
+
+/* TSC setup itself lives in arch/i386/kernel/tsc.c; here we only register. */
+static int init_tsc_timesource(void)
+{
+	if (!cpu_has_tsc || !cpu_khz)
+		return 0;	/* no TSC or no cpu_khz: register nothing */
+	current_cpu_khz = cpu_khz;
+	timesource_tsc.mult = timesource_khz2mult(current_cpu_khz,
+						  timesource_tsc.shift);
+	register_timesource(&timesource_tsc);
+	return 0;
+}
+
+module_init(init_tsc_timesource);
+
diff -Nru a/include/asm-i386/mach-default/mach_timer.h b/include/asm-i386/mach-default/mach_timer.h
--- a/include/asm-i386/mach-default/mach_timer.h 2005-04-29 15:21:27 -07:00
+++ b/include/asm-i386/mach-default/mach_timer.h 2005-04-29 15:21:27 -07:00
@@ -14,8 +14,12 @@
*/
#ifndef _MACH_TIMER_H
#define _MACH_TIMER_H
+#include <linux/jiffies.h>
+#include <asm/io.h>

-#define CALIBRATE_LATCH (5 * LATCH)
+#define CALIBRATE_ITERATION 50
+#define CALIBRATE_LATCH (CALIBRATE_ITERATION * LATCH)
+#define CALIBRATE_TIME (CALIBRATE_ITERATION * 1000020/HZ)

static inline void mach_prepare_counter(void)
{
diff -Nru a/include/asm-i386/timer.h b/include/asm-i386/timer.h
--- a/include/asm-i386/timer.h 2005-04-29 15:21:27 -07:00
+++ b/include/asm-i386/timer.h 2005-04-29 15:21:27 -07:00
@@ -2,6 +2,13 @@
#define _ASMi386_TIMER_H
#include <linux/init.h>

+#define TICK_SIZE (tick_nsec / 1000)
+void setup_pit_timer(void);
+/* Modifiers for buggy PIT handling */
+extern int pit_latch_buggy;
+extern int timer_ack;
+
+#ifndef CONFIG_NEWTOD
/**
* struct timer_ops - used to define a timer source
*
@@ -29,18 +36,10 @@
struct timer_opts *opts;
};

-#define TICK_SIZE (tick_nsec / 1000)
-
extern struct timer_opts* __init select_timer(void);
extern void clock_fallback(void);
-void setup_pit_timer(void);
-
-/* Modifiers for buggy PIT handling */
-
-extern int pit_latch_buggy;

extern struct timer_opts *cur_timer;
-extern int timer_ack;

/* list of externed timers */
extern struct timer_opts timer_none;
@@ -60,5 +59,6 @@

#ifdef CONFIG_X86_PM_TIMER
extern struct init_timer_opts timer_pmtmr_init;
+#endif
#endif
#endif
diff -Nru a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/include/asm-i386/tsc.h 2005-04-29 15:21:27 -07:00
@@ -0,0 +1,6 @@
+#ifndef _ASM_I386_TSC_H
+#define _ASM_I386_TSC_H
+extern unsigned long cpu_freq_khz;
+void tsc_init(void);
+
+#endif
diff -Nru a/include/asm-x86_64/hpet.h b/include/asm-x86_64/hpet.h
--- a/include/asm-x86_64/hpet.h 2005-04-29 15:21:27 -07:00
+++ b/include/asm-x86_64/hpet.h 2005-04-29 15:21:27 -07:00
@@ -1,6 +1,6 @@
#ifndef _ASM_X8664_HPET_H
#define _ASM_X8664_HPET_H 1
-
+#include <asm/fixmap.h>
/*
* Documentation on HPET can be found at:
* http://www.intel.com/ial/home/sp/pcmmspec.htm
@@ -44,6 +44,7 @@
#define HPET_TN_SETVAL 0x040
#define HPET_TN_32BIT 0x100

+extern unsigned long hpet_address; /* hpet memory map physical address */
extern int is_hpet_enabled(void);
extern int hpet_rtc_timer_init(void);
extern int oem_force_hpet_timer(void);
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h 2005-04-29 15:21:27 -07:00
+++ b/include/linux/sched.h 2005-04-29 15:21:27 -07:00
@@ -825,7 +825,11 @@
}
#endif

+#ifndef CONFIG_NEWTOD
extern unsigned long long sched_clock(void);
+#else
+#define sched_clock() 0
+#endif
extern unsigned long long current_sched_time(const task_t *current_task);

/* sched_exec is called by processes performing an exec */


2005-04-29 22:59:39

by john stultz

[permalink] [raw]
Subject: [RFC][PATCH (4/4)] new timeofday vsyscall proof of concept (v A4)

All,

This patch implements vsyscall-gettimeofday() functions for i386 and
x86-64 using the new timeofday core code. This is just a hackish proof
of concept that shows how it could be done and what interfaces are
needed to have a clean separation of the arch independent time keeping
and the very arch specific vsyscall code.

I look forward to your comments and feedback.

thanks
-john

linux-2.6.12-rc2_timeofday-vsyscall_A4.patch
===============================================
diff -Nru a/arch/i386/Kconfig b/arch/i386/Kconfig
--- a/arch/i386/Kconfig 2005-04-29 15:31:09 -07:00
+++ b/arch/i386/Kconfig 2005-04-29 15:31:09 -07:00
@@ -464,6 +464,10 @@
bool "Provide RTC interrupt"
depends on HPET_TIMER && RTC=y

+config NEWTOD_VSYSCALL
+ depends on EXPERIMENTAL
+ bool "VSYSCALL gettimeofday() interface"
+
config SMP
bool "Symmetric multi-processing support"
---help---
diff -Nru a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile 2005-04-29 15:31:09 -07:00
+++ b/arch/i386/kernel/Makefile 2005-04-29 15:31:09 -07:00
@@ -11,6 +11,7 @@

obj-y += cpu/
obj-$(!CONFIG_NEWTOD) += timers/
+obj-$(CONFIG_NEWTOD_VSYSCALL) += vsyscall-gtod.o
obj-$(CONFIG_ACPI_BOOT) += acpi/
obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
obj-$(CONFIG_MCA) += mca.o
diff -Nru a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
--- a/arch/i386/kernel/setup.c 2005-04-29 15:31:09 -07:00
+++ b/arch/i386/kernel/setup.c 2005-04-29 15:31:09 -07:00
@@ -51,6 +51,7 @@
#include <asm/ist.h>
#include <asm/io.h>
#include <asm/tsc.h>
+#include <asm/vsyscall-gtod.h>
#include "setup_arch_pre.h"
#include <bios_ebda.h>

@@ -1525,6 +1526,7 @@
#endif
#endif
tsc_init();
+ vsyscall_init();
}

#include "setup_arch_post.h"
diff -Nru a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
--- a/arch/i386/kernel/vmlinux.lds.S 2005-04-29 15:31:09 -07:00
+++ b/arch/i386/kernel/vmlinux.lds.S 2005-04-29 15:31:09 -07:00
@@ -5,6 +5,8 @@
#include <asm-generic/vmlinux.lds.h>
#include <asm/thread_info.h>
#include <asm/page.h>
+#include <linux/config.h>
+#include <asm/vsyscall-gtod.h>

OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
@@ -51,6 +53,31 @@
.data.cacheline_aligned : { *(.data.cacheline_aligned) }

_edata = .; /* End of data section */
+
+/* VSYSCALL_GTOD data */
+#ifdef CONFIG_NEWTOD_VSYSCALL
+
+ /* vsyscall entry */
+ . = ALIGN(64);
+ .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+
+ .vsyscall_0 VSYSCALL_GTOD_START: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
+ __vsyscall_0 = LOADADDR(.vsyscall_0);
+
+
+ /* generic gtod variables */
+ . = ALIGN(64);
+ .vsyscall_gtod_data : AT ((LOADADDR(.vsyscall_0) + SIZEOF(.vsyscall_0) + 63) & ~(63)) { *(.vsyscall_gtod_data) }
+ vsyscall_gtod_data = LOADADDR(.vsyscall_gtod_data);
+
+ . = ALIGN(16);
+ .vsyscall_gtod_lock : AT ((LOADADDR(.vsyscall_gtod_data) + SIZEOF(.vsyscall_gtod_data) + 15) & ~(15)) { *(.vsyscall_gtod_lock) }
+ vsyscall_gtod_lock = LOADADDR(.vsyscall_gtod_lock);
+
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
+ . = LOADADDR(.vsyscall_0) + 4096;
+#endif
+/* END of VSYSCALL_GTOD data*/

. = ALIGN(THREAD_SIZE); /* init_task */
.data.init_task : { *(.data.init_task) }
diff -Nru a/arch/i386/kernel/vsyscall-gtod.c b/arch/i386/kernel/vsyscall-gtod.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/arch/i386/kernel/vsyscall-gtod.c 2005-04-29 15:31:09 -07:00
@@ -0,0 +1,193 @@
+#include <linux/time.h>
+#include <linux/timeofday.h>
+#include <linux/timesource.h>
+#include <linux/sched.h>
+#include <asm/vsyscall-gtod.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/fixmap.h>
+#include <asm/msr.h>
+#include <asm/timer.h>
+#include <asm/system.h>
+#include <asm/unistd.h>
+#include <asm/errno.h>
+
+struct vsyscall_gtod_data_t {
+ struct timeval wall_time_tv;
+ struct timezone sys_tz;
+ cycle_t offset_base;
+ struct timesource_t timesource;
+};
+
+struct vsyscall_gtod_data_t vsyscall_gtod_data;
+struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data;
+
+seqlock_t vsyscall_gtod_lock = SEQLOCK_UNLOCKED;
+seqlock_t __vsyscall_gtod_lock __section_vsyscall_gtod_lock = SEQLOCK_UNLOCKED;
+
+int errno;
+static inline _syscall2(int,gettimeofday,struct timeval *,tv,struct timezone *,tz);
+
+static int vsyscall_mapped = 0; /* flag variable for remap_vsyscall() */
+extern struct timezone sys_tz;
+
+/* Fill *tv from the __vsyscall_gtod_data snapshot plus the cycles elapsed since it. */
+static inline void do_vgettimeofday(struct timeval* tv)
+{
+	cycle_t now, cycle_delta;
+	nsec_t nsec_delta;
+
+	if (__vsyscall_gtod_data.timesource.type == TIMESOURCE_FUNCTION) {
+		gettimeofday(tv, NULL);
+		return;
+	}
+
+	/* read the timesource and calc cycle_delta */
+	now = read_timesource(&__vsyscall_gtod_data.timesource);
+	cycle_delta = (now - __vsyscall_gtod_data.offset_base)
+			& __vsyscall_gtod_data.timesource.mask;
+
+	/* convert cycles to nsecs */
+	nsec_delta = cycle_delta * __vsyscall_gtod_data.timesource.mult;
+	nsec_delta = nsec_delta >> __vsyscall_gtod_data.timesource.shift;
+	/* add the offset (as usecs) to wall_time_tv, keeping tv_usec < USEC_PER_SEC */
+	*tv = __vsyscall_gtod_data.wall_time_tv;
+	do_div(nsec_delta, NSEC_PER_USEC);
+	tv->tv_usec += (unsigned long) nsec_delta;
+	while (tv->tv_usec >= USEC_PER_SEC) {
+		tv->tv_sec += 1;
+		tv->tv_usec -= USEC_PER_SEC;
+	}
+}
+
+static inline void do_get_tz(struct timezone *tz)
+{
+ *tz = __vsyscall_gtod_data.sys_tz;
+}
+
+static int __vsyscall(0) asmlinkage vgettimeofday(struct timeval *tv, struct timezone *tz)
+{
+ unsigned long seq;
+ do {
+ seq = read_seqbegin(&__vsyscall_gtod_lock);
+
+ if (tv)
+ do_vgettimeofday(tv);
+ if (tz)
+ do_get_tz(tz);
+
+ } while (read_seqretry(&__vsyscall_gtod_lock, seq));
+
+ return 0;
+}
+
+static time_t __vsyscall(1) asmlinkage vtime(time_t * t)
+{
+ struct timeval tv;
+ vgettimeofday(&tv,NULL);
+ if (t)
+ *t = tv.tv_sec;
+ return tv.tv_sec;
+}
+
+struct timesource_t* curr_timesource;
+
+/* Snapshot wall time, offset base, timesource and timezone into vsyscall_gtod_data. */
+void arch_update_vsyscall_gtod(nsec_t wall_time, cycle_t offset_base,
+				struct timesource_t* timesource, int ntp_adj)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&vsyscall_gtod_lock, flags);
+	/* XXX - hackitty hack hack. this is terrible! (done once per timesource switch) */
+	if (curr_timesource != timesource) {
+		if ((timesource->type == TIMESOURCE_MMIO_32)
+				|| (timesource->type == TIMESOURCE_MMIO_64)) {
+			unsigned long vaddr = (unsigned long)timesource->mmio_ptr;
+			pgd_t *pgd = pgd_offset_k(vaddr);
+			pud_t *pud = pud_offset(pgd, vaddr);
+			pmd_t *pmd = pmd_offset(pud,vaddr);
+			pte_t *pte = pte_offset_kernel(pmd, vaddr);
+			pte->pte_low |= _PAGE_USER;
+		}
+		curr_timesource = timesource;
+	}
+
+	/* save off wall time as timeval */
+	vsyscall_gtod_data.wall_time_tv = ns2timeval(wall_time);
+
+	/* save offset_base */
+	vsyscall_gtod_data.offset_base = offset_base;
+
+	/* copy current timesource */
+	vsyscall_gtod_data.timesource = *timesource;
+
+	/* apply ntp adjustment to timesource mult */
+	vsyscall_gtod_data.timesource.mult += ntp_adj;
+
+	/* save off current timezone */
+	vsyscall_gtod_data.sys_tz = sys_tz;
+
+	write_sequnlock_irqrestore(&vsyscall_gtod_lock, flags);
+}
+extern char __vsyscall_0;
+
+static void __init map_vsyscall(void)
+{
+ unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET;
+
+ /* Initially we map the VSYSCALL page w/ PAGE_KERNEL permissions to
+ * keep the alternate_instruction code from bombing out when it
+ * changes the seq_lock memory barriers in vgettimeofday()
+ */
+ __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL);
+}
+
+static int __init remap_vsyscall(void)
+{
+ unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET;
+
+ if (!vsyscall_mapped)
+ return 0;
+
+ /* Remap the VSYSCALL page w/ PAGE_KERNEL_VSYSCALL permissions
+ * after the alternate_instruction code has run
+ */
+ clear_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE);
+ __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+
+ return 0;
+}
+
+int __init vsyscall_init(void)
+{
+ printk("VSYSCALL: consistency checks...");
+ if ((unsigned long) &vgettimeofday != VSYSCALL_ADDR(__NR_vgettimeofday)) {
+ printk("vgettimeofday link addr broken\n");
+ printk("VSYSCALL: vsyscall_init failed!\n");
+ return -EFAULT;
+ }
+ if ((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)) {
+ printk("vtime link addr broken\n");
+ printk("VSYSCALL: vsyscall_init failed!\n");
+ return -EFAULT;
+ }
+ if (VSYSCALL_ADDR(0) != __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE)) {
+ printk("fixmap first vsyscall 0x%lx should be 0x%x\n",
+ __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE),
+ VSYSCALL_ADDR(0));
+ printk("VSYSCALL: vsyscall_init failed!\n");
+ return -EFAULT;
+ }
+
+
+ printk("passed...mapping...");
+ map_vsyscall();
+ printk("done.\n");
+ vsyscall_mapped = 1;
+ printk("VSYSCALL: fixmap virt addr: 0x%lx\n",
+ __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE));
+
+ return 0;
+}
+__initcall(remap_vsyscall);
diff -Nru a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
--- a/arch/x86_64/Kconfig 2005-04-29 15:31:09 -07:00
+++ b/arch/x86_64/Kconfig 2005-04-29 15:31:09 -07:00
@@ -57,6 +57,10 @@
bool
default y

+config NEWTOD_VSYSCALL
+ depends on EXPERIMENTAL
+ bool "VSYSCALL gettimeofday() interface"
+
config GENERIC_ISA_DMA
bool
default y
diff -Nru a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
--- a/arch/x86_64/kernel/time.c 2005-04-29 15:31:09 -07:00
+++ b/arch/x86_64/kernel/time.c 2005-04-29 15:31:09 -07:00
@@ -81,6 +81,7 @@
rdtscll(*tsc);
}

+#ifndef CONFIG_NEWTOD
/*
* do_gettimeoffset() returns microseconds since last timer interrupt was
* triggered by hardware. A memory read of HPET is slower than a register read
@@ -108,7 +109,6 @@

unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;

-#ifndef CONFIG_NEWTOD
/*
* This version of gettimeofday() has microsecond resolution and better than
* microsecond precision, as we're using at least a 10 MHz (usually 14.31818
@@ -976,6 +976,7 @@
/* Some systems will want to disable TSC and use HPET. */
if (oem_force_hpet_timer())
notsc = 1;
+#ifndef CONFIG_NEWTOD
if (vxtime.hpet_address && notsc) {
timetype = "HPET";
vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
@@ -987,6 +988,7 @@
}

printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype);
+#endif /* CONFIG_NEWTOD */
}

__setup("report_lost_ticks", time_setup);
diff -Nru a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
--- a/arch/x86_64/kernel/vmlinux.lds.S 2005-04-29 15:31:09 -07:00
+++ b/arch/x86_64/kernel/vmlinux.lds.S 2005-04-29 15:31:09 -07:00
@@ -71,6 +71,13 @@
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) }
jiffies = LOADADDR(.jiffies);
+
+ .vsyscall_gtod_data : AT AFTER(.jiffies) { *(.vsyscall_gtod_data) }
+ vsyscall_gtod_data = LOADADDR(.vsyscall_gtod_data);
+ .vsyscall_gtod_lock : AT AFTER(.vsyscall_gtod_data) { *(.vsyscall_gtod_lock) }
+ vsyscall_gtod_lock = LOADADDR(.vsyscall_gtod_lock);
+
+
.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
. = LOADADDR(.vsyscall_0) + 4096;

diff -Nru a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
--- a/arch/x86_64/kernel/vsyscall.c 2005-04-29 15:31:09 -07:00
+++ b/arch/x86_64/kernel/vsyscall.c 2005-04-29 15:31:09 -07:00
@@ -19,6 +19,8 @@
* want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/

+#include <linux/timeofday.h>
+#include <linux/timesource.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -40,6 +42,21 @@
int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;

+
+struct vsyscall_gtod_data_t {
+ struct timeval wall_time_tv;
+ struct timezone sys_tz;
+ cycle_t offset_base;
+ struct timesource_t timesource;
+};
+
+extern struct vsyscall_gtod_data_t vsyscall_gtod_data;
+struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data;
+
+extern seqlock_t vsyscall_gtod_lock;
+seqlock_t __vsyscall_gtod_lock __section_vsyscall_gtod_lock = SEQLOCK_UNLOCKED;
+
+
#include <asm/unistd.h>

static force_inline void timeval_normalize(struct timeval * tv)
@@ -52,7 +69,7 @@
tv->tv_sec += __sec;
}
}
-
+#ifndef CONFIG_NEWTOD_VSYSCALL
static force_inline void do_vgettimeofday(struct timeval * tv)
{
long sequence, t;
@@ -82,6 +99,52 @@
tv->tv_sec = sec + usec / 1000000;
tv->tv_usec = usec % 1000000;
}
+#else /* CONFIG_NEWTOD_VSYSCALL */
+/* XXX - this is ugly. gettimeofday() has a label in it so we can't
+ call it twice.
+ */
+static force_inline int syscall_gtod(struct timeval *tv, struct timezone *tz)
+{
+ int ret;
+ asm volatile("syscall"
+ : "=a" (ret)
+ : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
+ return ret;
+}
+/* Fill *tv from the gtod snapshot plus elapsed cycles, retrying on seqlock change. */
+static force_inline void do_vgettimeofday(struct timeval* tv)
+{
+	cycle_t now, cycle_delta;
+	nsec_t nsec_delta;
+	unsigned long seq;
+	do {
+		seq = read_seqbegin(&__vsyscall_gtod_lock);
+		if (__vsyscall_gtod_data.timesource.type == TIMESOURCE_FUNCTION) {
+			syscall_gtod(tv, NULL);
+			return;
+		}
+
+		/* read the timesource and calc cycle_delta */
+		now = read_timesource(&__vsyscall_gtod_data.timesource);
+		cycle_delta = (now - __vsyscall_gtod_data.offset_base)
+				& __vsyscall_gtod_data.timesource.mask;
+
+		/* convert cycles to nsecs */
+		nsec_delta = cycle_delta * __vsyscall_gtod_data.timesource.mult;
+		nsec_delta = nsec_delta >> __vsyscall_gtod_data.timesource.shift;
+
+		/* add the offset (as usecs) to wall_time_tv, keeping tv_usec < USEC_PER_SEC */
+		*tv = __vsyscall_gtod_data.wall_time_tv;
+		do_div(nsec_delta, NSEC_PER_USEC);
+		tv->tv_usec += (unsigned long) nsec_delta;
+		while (tv->tv_usec >= USEC_PER_SEC) {
+			tv->tv_sec += 1;
+			tv->tv_usec -= USEC_PER_SEC;
+		}
+	} while (read_seqretry(&__vsyscall_gtod_lock, seq));
+}
+#endif /* CONFIG_NEWTOD_VSYSCALL */
+

/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
static force_inline void do_get_tz(struct timezone * tz)
@@ -139,6 +202,48 @@
return -ENOSYS;
}

+struct timesource_t* curr_timesource;
+
+void arch_update_vsyscall_gtod(nsec_t wall_time, cycle_t offset_base,
+ struct timesource_t* timesource, int ntp_adj)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&vsyscall_gtod_lock, flags);
+
+ /* XXX - hackitty hack hack. this is terrible! */
+ if (curr_timesource != timesource) {
+ if ((timesource->type == TIMESOURCE_MMIO_32)
+ || (timesource->type == TIMESOURCE_MMIO_64)) {
+ unsigned long vaddr = (unsigned long)timesource->mmio_ptr;
+ pgd_t *pgd = pgd_offset_k(vaddr);
+ pud_t *pud = pud_offset(pgd, vaddr);
+ pmd_t *pmd = pmd_offset(pud,vaddr);
+ pte_t *pte = pte_offset_kernel(pmd, vaddr);
+ *pte = pte_mkread(*pte);
+ }
+ curr_timesource = timesource;
+ }
+
+ /* save off wall time as timeval */
+ vsyscall_gtod_data.wall_time_tv = ns2timeval(wall_time);
+
+ /* save offset_base */
+ vsyscall_gtod_data.offset_base = offset_base;
+
+ /* copy current timesource */
+ vsyscall_gtod_data.timesource = *timesource;
+
+ /* apply ntp adjustment to timesource mult */
+ vsyscall_gtod_data.timesource.mult += ntp_adj;
+
+ /* save off current timezone */
+ vsyscall_gtod_data.sys_tz = sys_tz;
+
+ write_sequnlock_irqrestore(&vsyscall_gtod_lock, flags);
+
+}
+
#ifdef CONFIG_SYSCTL

#define SYSCALL 0x050f
@@ -217,13 +322,8 @@
BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
map_vsyscall();
-/* XXX - disable vsyscall gettimeofday for now */
-#ifndef CONFIG_NEWTOD
sysctl_vsyscall = 1;
register_sysctl_table(kernel_root_table2, 0);
-#else
- sysctl_vsyscall = 0;
-#endif
return 0;
}

diff -Nru a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h
--- a/include/asm-i386/fixmap.h 2005-04-29 15:31:09 -07:00
+++ b/include/asm-i386/fixmap.h 2005-04-29 15:31:09 -07:00
@@ -27,6 +27,7 @@
#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
+#include <asm/vsyscall-gtod.h>
#ifdef CONFIG_HIGHMEM
#include <linux/threads.h>
#include <asm/kmap_types.h>
@@ -53,6 +54,11 @@
enum fixed_addresses {
FIX_HOLE,
FIX_VSYSCALL,
+#ifdef CONFIG_NEWTOD_VSYSCALL
+ FIX_VSYSCALL_GTOD_LAST_PAGE,
+ FIX_VSYSCALL_GTOD_FIRST_PAGE = FIX_VSYSCALL_GTOD_LAST_PAGE
+ + VSYSCALL_GTOD_NUMPAGES - 1,
+#endif
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
diff -Nru a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
--- a/include/asm-i386/pgtable.h 2005-04-29 15:31:09 -07:00
+++ b/include/asm-i386/pgtable.h 2005-04-29 15:31:09 -07:00
@@ -159,6 +159,8 @@
#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+#define __PAGE_KERNEL_VSYSCALL \
+ (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)

#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
@@ -166,6 +168,8 @@
#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL|(__PAGE_KERNEL_RO | _PAGE_PCD))

/*
* The i386 can't do page protection for execute, and considers that
diff -Nru a/include/asm-i386/vsyscall-gtod.h b/include/asm-i386/vsyscall-gtod.h
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/include/asm-i386/vsyscall-gtod.h 2005-04-29 15:31:09 -07:00
@@ -0,0 +1,41 @@
+#ifndef _ASM_i386_VSYSCALL_GTOD_H_
+#define _ASM_i386_VSYSCALL_GTOD_H_
+
+#ifdef CONFIG_NEWTOD_VSYSCALL
+
+/* VSYSCALL_GTOD_START must be the same as
+ * __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE)
+ * and must also be same as addr in vmlinux.lds.S */
+#define VSYSCALL_GTOD_START 0xffffd000
+#define VSYSCALL_GTOD_SIZE 1024
+#define VSYSCALL_GTOD_END (VSYSCALL_GTOD_START + PAGE_SIZE)
+#define VSYSCALL_GTOD_NUMPAGES \
+ ((VSYSCALL_GTOD_END-VSYSCALL_GTOD_START) >> PAGE_SHIFT)
+#define VSYSCALL_ADDR(vsyscall_nr) \
+ (VSYSCALL_GTOD_START+VSYSCALL_GTOD_SIZE*(vsyscall_nr))
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+#include <linux/seqlock.h>
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+
+/* ReadOnly generic time value attributes*/
+#define __section_vsyscall_gtod_data __attribute__ ((unused, __section__ (".vsyscall_gtod_data")))
+
+#define __section_vsyscall_gtod_lock __attribute__ ((unused, __section__ (".vsyscall_gtod_lock")))
+
+
+enum vsyscall_num {
+ __NR_vgettimeofday,
+ __NR_vtime,
+};
+
+int vsyscall_init(void);
+extern char __vsyscall_0;
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+#else /* CONFIG_NEWTOD_VSYSCALL */
+#define vsyscall_init()
+#define vsyscall_set_timesource(x)
+#endif /* CONFIG_NEWTOD_VSYSCALL */
+#endif /* _ASM_i386_VSYSCALL_GTOD_H_ */
diff -Nru a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h
--- a/include/asm-x86_64/vsyscall.h 2005-04-29 15:31:09 -07:00
+++ b/include/asm-x86_64/vsyscall.h 2005-04-29 15:31:09 -07:00
@@ -22,6 +22,8 @@
#define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16)))
#define __section_xtime __attribute__ ((unused, __section__ (".xtime"), aligned(16)))
#define __section_xtime_lock __attribute__ ((unused, __section__ (".xtime_lock"), aligned(16)))
+#define __section_vsyscall_gtod_data __attribute__ ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
+#define __section_vsyscall_gtod_lock __attribute__ ((unused, __section__ (".vsyscall_gtod_lock"),aligned(16)))

#define VXTIME_TSC 1
#define VXTIME_HPET 2
diff -Nru a/kernel/timeofday.c b/kernel/timeofday.c
--- a/kernel/timeofday.c 2005-04-29 15:31:09 -07:00
+++ b/kernel/timeofday.c 2005-04-29 15:31:09 -07:00
@@ -41,7 +41,6 @@
* o Added getnstimeofday
* o Cleanups from Nish Aravamudan
* TODO List:
-* o vsyscall/fsyscall infrastructure
* o clock_was_set hook
**********************************************************************/

@@ -116,6 +115,12 @@
*/
extern nsec_t read_persistent_clock(void);
extern void sync_persistent_clock(struct timespec ts);
+#ifdef CONFIG_NEWTOD_VSYSCALL
+extern void arch_update_vsyscall_gtod(nsec_t wall_time, cycle_t offset_base,
+ struct timesource_t* timesource, int ntp_adj);
+#else
+#define arch_update_vsyscall_gtod(x,y,z,w) {}
+#endif


/* get_lowres_timestamp():
@@ -282,6 +287,9 @@

update_legacy_time_values();

+ arch_update_vsyscall_gtod(system_time + wall_time_offset, offset_base,
+ timesource, ntp_adj);
+
write_sequnlock_irqrestore(&system_time_lock, flags);

return 0;
@@ -473,6 +481,9 @@
/* sync legacy values */
update_legacy_time_values();

+ arch_update_vsyscall_gtod(system_time + wall_time_offset, offset_base,
+ timesource, ntp_adj);
+
write_sequnlock_irqrestore(&system_time_lock, flags);

/* Set us up to go off on the next interval */
@@ -501,6 +512,9 @@
/* clear NTP scaling factor & state machine */
ntp_adj = 0;
ntp_clear();
+
+ arch_update_vsyscall_gtod(system_time + wall_time_offset, offset_base,
+ timesource, ntp_adj);

/* initialize legacy time values */
update_legacy_time_values();


2005-04-29 23:36:38

by Nishanth Aravamudan

[permalink] [raw]
Subject: [RFC][PATCH] new timeofday-based soft-timer subsystem

* john stultz <[email protected]> [2005-0429 15:45:47 -0700]:

> All,
> This patch implements the architecture independent portion of
> the time of day subsystem. For a brief description on the rework, see
> here: http://lwn.net/Articles/120850/ (Many thanks to the LWN team for
> that clear writeup!)

I have been working closely with John to re-work the soft-timer subsystem
to use the new timeofday() subsystem. The following patch attempts to
begin this process. I would greatly appreciate any comments.

Some design points:

1) The patch is small but does a lot.
a) Renames timer_jiffies to last_timer_time (now that we are not
jiffies-based).
b) Converts the soft-timer time-vector's/bucket's entries to
timerinterval (a new unit) width, instead of jiffy width.
c) Defines timerintervals to be the current time as reported by
the new timeofday-subsystem shifted down by 20 bits and masked
to only grab the lower 32 bits. This effectively emulates a
32-bit millisecond value.
d) Uses do_monotonic_clock() (converted to timerintervals) as the
basis for addition and expiration instead of jiffies.
e) Adds some new helper functions for dealing with nanosecond
values.

2) Currently, the patch is dependent upon John's timeofday core rework.
For arches that will not have the new timeofday (or for which the rework
is still in progress), I can emulate the existing system with a
separate patch. The goal of this patch, though, is just to show how easily
the new system can be implemented and the benefits.

3) The reason for the re-work?: Many people complain about all of the
adding of 1 jiffy here or there to fix bugs. This new system is
fundamentally human-time oriented and deals with those issues correctly.

The code is reasonably well commented, but does expect readers to
understand the current system to some degree.

This is my first posting of this re-work, so I expect criticism, but am
happy to make changes.

Thanks,
Nish

Signed-off-by: Nishanth Aravamudan <[email protected]>

diff -urpN 2.6.12-rc2-tod2/include/linux/jiffies.h 2.6.12-rc2-tod2-timer/include/linux/jiffies.h
--- 2.6.12-rc2-tod2/include/linux/jiffies.h 2005-04-04 09:37:51.000000000 -0700
+++ 2.6.12-rc2-tod2-timer/include/linux/jiffies.h 2005-04-29 23:04:47.000000000 -0700
@@ -263,7 +263,7 @@ static inline unsigned int jiffies_to_ms
#endif
}

-static inline unsigned int jiffies_to_usecs(const unsigned long j)
+static inline unsigned long jiffies_to_usecs(const unsigned long j)
{
#if HZ <= 1000000 && !(1000000 % HZ)
return (1000000 / HZ) * j;
@@ -274,6 +274,17 @@ static inline unsigned int jiffies_to_us
#endif
}

+static inline nsec_t jiffies_to_nsecs(const unsigned long j)
+{
+#if HZ <= NSEC_PER_SEC && !(NSEC_PER_SEC % HZ)
+ return (NSEC_PER_SEC / HZ) * (nsec_t)j;
+#elif HZ > NSEC_PER_SEC && !(HZ % NSEC_PER_SEC)
+ return ((nsec_t)j + (HZ / NSEC_PER_SEC) - 1)/(HZ / NSEC_PER_SEC);
+#else
+ return ((nsec_t)j * NSEC_PER_SEC) / HZ;
+#endif
+}
+
static inline unsigned long msecs_to_jiffies(const unsigned int m)
{
if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
@@ -287,7 +298,7 @@ static inline unsigned long msecs_to_jif
#endif
}

-static inline unsigned long usecs_to_jiffies(const unsigned int u)
+static inline unsigned long usecs_to_jiffies(const unsigned long u)
{
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
@@ -300,6 +311,24 @@ static inline unsigned long usecs_to_jif
#endif
}

+static inline unsigned long nsecs_to_jiffies(const nsec_t n)
+{
+ nsec_t temp;
+ if (n > jiffies_to_nsecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+#if HZ <= NSEC_PER_SEC && !(NSEC_PER_SEC % HZ)
+ temp = n + (NSEC_PER_SEC / HZ) - 1;
+ do_div(temp, (NSEC_PER_SEC / HZ));
+ return (unsigned long)temp;
+#elif HZ > NSEC_PER_SEC && !(HZ % NSEC_PER_SEC)
+ return n * (HZ / NSEC_PER_SEC);
+#else
+ temp = n * HZ + NSEC_PER_SEC - 1;
+ do_div(temp, NSEC_PER_SEC);
+ return (unsigned long)temp;
+#endif
+}
+
/*
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
* that a remainder subtract here would not do the right thing as the
diff -urpN 2.6.12-rc2-tod2/include/linux/sched.h 2.6.12-rc2-tod2-timer/include/linux/sched.h
--- 2.6.12-rc2-tod2/include/linux/sched.h 2005-04-29 23:16:59.000000000 -0700
+++ 2.6.12-rc2-tod2-timer/include/linux/sched.h 2005-04-29 23:04:47.000000000 -0700
@@ -182,7 +182,13 @@ extern void scheduler_tick(void);
extern int in_sched_functions(unsigned long addr);

#define MAX_SCHEDULE_TIMEOUT LONG_MAX
+#define MAX_SCHEDULE_TIMEOUT_NSECS ((nsec_t)(-1))
+#define MAX_SCHEDULE_TIMEOUT_MSECS UINT_MAX /* sentinel matches schedule_timeout_msecs()'s unsigned int argument */
+#define MAX_SCHEDULE_TIMEOUT_USECS ULONG_MAX /* sentinel matches schedule_timeout_usecs()'s unsigned long argument */
extern signed long FASTCALL(schedule_timeout(signed long timeout));
+extern nsec_t FASTCALL(schedule_timeout_nsecs(nsec_t timeout_nsecs));
+extern unsigned long FASTCALL(schedule_timeout_usecs(unsigned long timeout_usecs));
+extern unsigned int FASTCALL(schedule_timeout_msecs(unsigned int timeout_msecs));
asmlinkage void schedule(void);

struct namespace;
diff -urpN 2.6.12-rc2-tod2/include/linux/timer.h 2.6.12-rc2-tod2-timer/include/linux/timer.h
--- 2.6.12-rc2-tod2/include/linux/timer.h 2005-04-04 09:39:01.000000000 -0700
+++ 2.6.12-rc2-tod2-timer/include/linux/timer.h 2005-04-29 23:04:47.000000000 -0700
@@ -5,6 +5,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
+#include <linux/timeofday.h>

struct tvec_t_base_s;

@@ -65,27 +66,11 @@ extern void add_timer_on(struct timer_li
extern int del_timer(struct timer_list * timer);
extern int __mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
+extern void add_timer(struct timer_list *timer);
+extern int set_timer_nsecs(struct timer_list *timer, nsec_t expires_nsecs);

extern unsigned long next_timer_interrupt(void);

-/***
- * add_timer - start a timer
- * @timer: the timer to be added
- *
- * The kernel will do a ->function(->data) callback from the
- * timer interrupt at the ->expired point in the future. The
- * current time is 'jiffies'.
- *
- * The timer's ->expired, ->function (and if the handler uses it, ->data)
- * fields must be set prior calling this function.
- *
- * Timers with an ->expired field in the past will be executed in the next
- * timer tick.
- */
-static inline void add_timer(struct timer_list * timer)
-{
- __mod_timer(timer, timer->expires);
-}

#ifdef CONFIG_SMP
extern int del_timer_sync(struct timer_list *timer);
diff -urpN 2.6.12-rc2-tod2/kernel/timer.c 2.6.12-rc2-tod2-timer/kernel/timer.c
--- 2.6.12-rc2-tod2/kernel/timer.c 2005-04-29 23:16:52.000000000 -0700
+++ 2.6.12-rc2-tod2-timer/kernel/timer.c 2005-04-29 23:15:45.000000000 -0700
@@ -33,6 +33,7 @@
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
+#include <linux/timeofday.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -40,6 +41,8 @@
#include <asm/timex.h>
#include <asm/io.h>

+#define TIMER_DBG 0
+
#ifdef CONFIG_TIME_INTERPOLATION
static void time_interpolator_update(long delta_nsec);
#else
@@ -56,6 +59,9 @@ static void time_interpolator_update(lon
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
+#define TIMERINTERVAL_BITS 20
+#define TIMERINTERVAL_SIZE (1 << TIMERINTERVAL_BITS)
+#define TIMERINTERVAL_MASK (TIMERINTERVAL_SIZE - 1)

typedef struct tvec_s {
struct list_head vec[TVN_SIZE];
@@ -67,7 +73,7 @@ typedef struct tvec_root_s {

struct tvec_t_base_s {
spinlock_t lock;
- unsigned long timer_jiffies;
+ unsigned long last_timer_time;
struct timer_list *running_timer;
tvec_root_t tv1;
tvec_t tv2;
@@ -113,11 +119,55 @@ static inline void check_timer(struct ti
check_timer_failed(timer);
}

+/*
+ * nsecs_to_timerintervals - convert nsec value to soft-timer intervals
+ * @nsecs: number of nanoseconds to convert
+ *
+ * This is a "configurable" value, meaning it can be changed at compile-time
+ * and the soft-timer subsystem should change with it.
+ *
+ * Some explanation of the math is necessary:
+ * Currently we emulate milliseconds (but try to stay efficient)
+ * by dividing the nanosecond value by 2^20 (1048576 ~= 1000000)
+ * and masking it to an unsigned long
+ *
+ * To prevent timers from being expired early, we take the ceiling when
+ * we add and the floor when we expire. The ceiling of 0 is 0; it is
+ * special-cased because (0 - 1) would wrap around in the unsigned math.
+ */
+static inline unsigned long nsecs_to_timerintervals_ceiling(nsec_t nsecs)
+{
+ return nsecs ? (unsigned long)((((nsecs - 1) >> TIMERINTERVAL_BITS) & ULONG_MAX) + 1) : 0UL;
+}
+
+static inline unsigned long nsecs_to_timerintervals_floor(nsec_t nsecs)
+{
+ return (unsigned long)((nsecs >> TIMERINTERVAL_BITS) & ULONG_MAX);
+}
+
+/*
+ * jiffies_to_timerintervals - convert absolute jiffies value to soft-timer intervals
+ * @abs_jiffies: number of jiffies to convert
+ *
+ * First, we convert the absolute jiffies parameter to a relative
+ * jiffies value. To maintain precision, we convert the relative
+ * jiffies value to a relative nanosecond value and then convert that
+ * to a relative soft-timer interval unit value. We then add this
+ * relative value to the current time according to the timeofday-
+ * subsystem, converted to soft-timer interval units.
+ *
+ * The relative value is computed as a signed difference so that it
+ * stays correct across jiffies wrap-around. An expiry that is not in
+ * the future maps to the current time, instead of being turned into a
+ * huge unsigned offset that would push the timer far into the future.
+ */
+static inline unsigned long jiffies_to_timerintervals(unsigned long abs_jiffies)
+{
+ unsigned long now = nsecs_to_timerintervals_ceiling(do_monotonic_clock());
+ long relative_jiffies = (long)(abs_jiffies - jiffies);
+
+ /* expires == jiffies, or already in the past: due right now */
+ if (relative_jiffies <= 0)
+ return now;
+ return now + nsecs_to_timerintervals_ceiling(jiffies_to_nsecs(relative_jiffies));
+}

static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
- unsigned long idx = expires - base->timer_jiffies;
+ unsigned long idx = expires - base->last_timer_time;
struct list_head *vec;

if (idx < TVR_SIZE) {
@@ -137,7 +187,7 @@ static void internal_add_timer(tvec_base
* Can happen if you add a timer with expires == jiffies,
* or you set a timer to go off in the past
*/
- vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
+ vec = base->tv1.vec + (base->last_timer_time & TVR_MASK);
} else {
int i;
/* If the timeout is larger than 0xffffffff on 64-bit
@@ -145,7 +195,7 @@ static void internal_add_timer(tvec_base
*/
if (idx > 0xffffffffUL) {
idx = 0xffffffffUL;
- expires = idx + base->timer_jiffies;
+ expires = idx + base->last_timer_time;
}
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
vec = base->tv5.vec + i;
@@ -238,11 +288,36 @@ void add_timer_on(struct timer_list *tim
check_timer(timer);

spin_lock_irqsave(&base->lock, flags);
+ timer->expires = jiffies_to_timerintervals(timer->expires);
internal_add_timer(base, timer);
timer->base = base;
spin_unlock_irqrestore(&base->lock, flags);
}

+/***
+ * add_timer - start a timer
+ * @timer: the timer to be added
+ *
+ * The kernel will do a ->function(->data) callback from the
+ * timer interrupt at the ->expired point in the future. The
+ * current time is 'jiffies'.
+ *
+ * The timer's ->expired, ->function (and if the handler uses it, ->data)
+ * fields must be set prior calling this function.
+ *
+ * Timers with an ->expired field in the past will be executed in the next
+ * timer tick.
+ *
+ * The callers of add_timer() should be aware that the interface is now
+ * deprecated. set_timer_nsecs() is the single interface for adding and
+ * modifying timers.
+ */
+void add_timer(struct timer_list * timer)
+{
+ __mod_timer(timer, jiffies_to_timerintervals(timer->expires));
+}
+
+EXPORT_SYMBOL(add_timer);

/***
* mod_timer - modify a timer's timeout
@@ -262,6 +337,10 @@ void add_timer_on(struct timer_list *tim
* The function returns whether it has modified a pending timer or not.
* (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
* active timer returns 1.)
+ *
+ * The callers of mod_timer() should be aware that the interface is now
+ * deprecated. set_timer_nsecs() is the single interface for adding and
+ * modifying timers.
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
@@ -269,6 +348,7 @@ int mod_timer(struct timer_list *timer,

check_timer(timer);

+ expires = jiffies_to_timerintervals(expires);
/*
* This is a common optimization triggered by the
* networking code - if the timer is re-modified
@@ -282,6 +362,29 @@ int mod_timer(struct timer_list *timer,

EXPORT_SYMBOL(mod_timer);

+/*
+ * set_timer_nsecs - modify a timer's timeout in nsecs
+ * @timer: the timer to be modified
+ *
+ * Do we want to modify via absolute nanoseconds instead of
+ * relative?
+ */
+int set_timer_nsecs(struct timer_list *timer, nsec_t expires_nsecs)
+{
+ unsigned long expires;
+
+ BUG_ON(!timer->function);
+
+ check_timer(timer);
+
+ expires = nsecs_to_timerintervals_ceiling(expires_nsecs);
+ if (timer_pending(timer) && timer->expires == expires)
+ return 1;
+
+ return __mod_timer(timer, expires);
+}
+EXPORT_SYMBOL_GPL(set_timer_nsecs);
+
/***
* del_timer - deactive a timer.
* @timer: the timer to be deactivated
@@ -431,17 +534,17 @@ static int cascade(tvec_base_t *base, tv
* This function cascades all vectors and executes all expired timer
* vectors.
*/
-#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
+#define INDEX(N) (base->last_timer_time >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK

-static inline void __run_timers(tvec_base_t *base)
+static inline void __run_timers(tvec_base_t *base, unsigned long current_timer_time)
{
struct timer_list *timer;

spin_lock_irq(&base->lock);
- while (time_after_eq(jiffies, base->timer_jiffies)) {
+ while (time_after_eq(current_timer_time, base->last_timer_time)) {
struct list_head work_list = LIST_HEAD_INIT(work_list);
struct list_head *head = &work_list;
- int index = base->timer_jiffies & TVR_MASK;
+ int index = base->last_timer_time & TVR_MASK;

/*
* Cascade timers:
@@ -451,7 +554,7 @@ static inline void __run_timers(tvec_bas
(!cascade(base, &base->tv3, INDEX(1))) &&
!cascade(base, &base->tv4, INDEX(2)))
cascade(base, &base->tv5, INDEX(3));
- ++base->timer_jiffies;
+ ++base->last_timer_time;
list_splice_init(base->tv1.vec + index, &work_list);
repeat:
if (!list_empty(head)) {
@@ -500,20 +603,20 @@ unsigned long next_timer_interrupt(void)

base = &__get_cpu_var(tvec_bases);
spin_lock(&base->lock);
- expires = base->timer_jiffies + (LONG_MAX >> 1);
+ expires = base->last_timer_time + (LONG_MAX >> 1);
list = 0;

/* Look for timer events in tv1. */
- j = base->timer_jiffies & TVR_MASK;
+ j = base->last_timer_time & TVR_MASK;
do {
list_for_each_entry(nte, base->tv1.vec + j, entry) {
expires = nte->expires;
- if (j < (base->timer_jiffies & TVR_MASK))
+ if (j < (base->last_timer_time & TVR_MASK))
list = base->tv2.vec + (INDEX(0));
goto found;
}
j = (j + 1) & TVR_MASK;
- } while (j != (base->timer_jiffies & TVR_MASK));
+ } while (j != (base->last_timer_time & TVR_MASK));

/* Check tv2-tv5. */
varray[0] = &base->tv2;
@@ -890,10 +993,14 @@ EXPORT_SYMBOL(xtime_lock);
*/
static void run_timer_softirq(struct softirq_action *h)
{
+ unsigned long current_timer_time;
tvec_base_t *base = &__get_cpu_var(tvec_bases);

- if (time_after_eq(jiffies, base->timer_jiffies))
- __run_timers(base);
+ current_timer_time =
+ nsecs_to_timerintervals_floor(do_monotonic_clock());
+
+ if (time_after_eq(current_timer_time, base->last_timer_time))
+ __run_timers(base, current_timer_time);
}

/*
@@ -1133,6 +1240,69 @@ fastcall signed long __sched schedule_ti

EXPORT_SYMBOL(schedule_timeout);

+fastcall nsec_t __sched schedule_timeout_nsecs(nsec_t timeout_nsecs)
+{
+ struct timer_list timer;
+ nsec_t expires;
+
+ if (timeout_nsecs == MAX_SCHEDULE_TIMEOUT_NSECS) {
+ schedule();
+ goto out;
+ }
+
+ expires = do_monotonic_clock() + timeout_nsecs;
+
+ init_timer(&timer);
+ timer.data = (unsigned long) current;
+ timer.function = process_timeout;
+
+ set_timer_nsecs(&timer, expires);
+ schedule();
+ del_singleshot_timer_sync(&timer);
+
+ timeout_nsecs = do_monotonic_clock();
+ if (expires < timeout_nsecs)
+ timeout_nsecs = (nsec_t)0UL;
+ else
+ timeout_nsecs = expires - timeout_nsecs;
+out:
+ return timeout_nsecs;
+}
+
+EXPORT_SYMBOL_GPL(schedule_timeout_nsecs);
+
+/*
+ * schedule_timeout_usecs - sleep for up to @timeout_usecs microseconds
+ * @timeout_usecs: timeout in microseconds; MAX_SCHEDULE_TIMEOUT_USECS
+ * is passed on as MAX_SCHEDULE_TIMEOUT_NSECS
+ *
+ * Thin wrapper around schedule_timeout_nsecs(). Returns the remaining
+ * time in microseconds, rounded up, or 0 if the timeout fully elapsed.
+ */
+fastcall unsigned long __sched schedule_timeout_usecs(unsigned long timeout_usecs)
+{
+ nsec_t timeout_nsecs;
+
+ if (timeout_usecs == MAX_SCHEDULE_TIMEOUT_USECS)
+ timeout_nsecs = MAX_SCHEDULE_TIMEOUT_NSECS;
+ else
+ timeout_nsecs = timeout_usecs * (nsec_t)1000UL;
+ timeout_nsecs = schedule_timeout_nsecs(timeout_nsecs);
+ /* nothing left: the "- 1" below would wrap the unsigned value */
+ if (!timeout_nsecs)
+ return 0;
+ /* round up: ((n - 1) / 1000) + 1 */
+ timeout_nsecs--;
+ do_div(timeout_nsecs, 1000UL);
+ return (unsigned long)timeout_nsecs + 1UL;
+}
+
+EXPORT_SYMBOL_GPL(schedule_timeout_usecs);
+
+/*
+ * schedule_timeout_msecs - sleep for up to @timeout_msecs milliseconds
+ * @timeout_msecs: timeout in milliseconds; MAX_SCHEDULE_TIMEOUT_MSECS
+ * is passed on as MAX_SCHEDULE_TIMEOUT_NSECS
+ *
+ * Thin wrapper around schedule_timeout_nsecs(). Returns the remaining
+ * time in milliseconds, rounded up, or 0 if the timeout fully elapsed.
+ */
+fastcall unsigned int __sched schedule_timeout_msecs(unsigned int timeout_msecs)
+{
+ nsec_t timeout_nsecs;
+
+ if (timeout_msecs == MAX_SCHEDULE_TIMEOUT_MSECS)
+ timeout_nsecs = MAX_SCHEDULE_TIMEOUT_NSECS;
+ else
+ timeout_nsecs = timeout_msecs * (nsec_t)1000000;
+ timeout_nsecs = schedule_timeout_nsecs(timeout_nsecs);
+ /* nothing left: the "- 1" below would wrap the unsigned value */
+ if (!timeout_nsecs)
+ return 0;
+ /* round up: ((n - 1) / 1000000) + 1 */
+ timeout_nsecs--;
+ do_div(timeout_nsecs, 1000000UL);
+ return (unsigned int)timeout_nsecs + 1;
+}
+
+EXPORT_SYMBOL_GPL(schedule_timeout_msecs);
+
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
{
@@ -1302,7 +1472,11 @@ static void __devinit init_timers_cpu(in
for (j = 0; j < TVR_SIZE; j++)
INIT_LIST_HEAD(base->tv1.vec + j);

- base->timer_jiffies = jiffies;
+ /*
+ * Under the new monotonic_clock() oriented soft-timer subsystem,
+ * we should begin at 0
+ */
+ base->last_timer_time = 0UL;
}

#ifdef CONFIG_HOTPLUG_CPU

2005-04-29 23:48:01

by john stultz

[permalink] [raw]
Subject: Re: [RFC][PATCH (1/4)] new timeofday core subsystem (v A4)

Ack. At the last minute I accidentally committed a line I didn't mean
to, so the timeofday-core_A4 patch doesn't build.

This patch fixes the problem. Please use it instead.

linux-2.6.12-rc2_timeofday-core_A4fix.patch
===========================================
diff -Nru a/drivers/Makefile b/drivers/Makefile
--- a/drivers/Makefile 2005-04-29 16:39:35 -07:00
+++ b/drivers/Makefile 2005-04-29 16:39:35 -07:00
@@ -64,3 +64,4 @@
obj-$(CONFIG_BLK_DEV_SGIIOC4) += sn/
obj-y += firmware/
obj-$(CONFIG_CRYPTO) += crypto/
+obj-$(CONFIG_NEWTOD) += timesource/
diff -Nru a/drivers/timesource/Makefile b/drivers/timesource/Makefile
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/Makefile 2005-04-29 16:39:35 -07:00
@@ -0,0 +1 @@
+obj-y += jiffies.o
diff -Nru a/drivers/timesource/jiffies.c b/drivers/timesource/jiffies.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/drivers/timesource/jiffies.c 2005-04-29 16:39:35 -07:00
@@ -0,0 +1,45 @@
+/*
+ * linux/drivers/timesource/jiffies.c
+ *
+ * Copyright (C) 2004 IBM
+ *
+ * This file contains the jiffies based time source.
+ *
+ */
+#include <linux/timesource.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+
+/* The Jiffies based timesource is the lowest common
+ * denominator time source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability for the timer
+ * interrupt hardware to accurately tick at the
+ * requested HZ value. It is also not recommended
+ * for "tick-less" systems.
+ */
+
+static cycle_t jiffies_read(void)
+{
+ cycle_t ret = get_jiffies_64();
+ return ret;
+}
+
+struct timesource_t timesource_jiffies = {
+ .name = "jiffies",
+ .priority = 0, /* lowest priority*/
+ .type = TIMESOURCE_FUNCTION,
+ .read_fnct = jiffies_read,
+ .mask = (cycle_t)-1,
+ .mult = (NSEC_PER_SEC+(HZ/2))/HZ,
+ .shift = 0,
+};
+
+static int init_jiffies_timesource(void)
+{
+ register_timesource(&timesource_jiffies);
+ return 0;
+}
+module_init(init_jiffies_timesource);
diff -Nru a/include/linux/ntp.h b/include/linux/ntp.h
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/include/linux/ntp.h 2005-04-29 16:39:35 -07:00
@@ -0,0 +1,22 @@
+/* linux/include/linux/ntp.h
+ *
+ * Copyright (C) 2003, 2004, 2005 IBM, John Stultz ([email protected])
+ *
+ * This file contains the NTP state machine accessor functions.
+ */
+
+#ifndef _LINUX_NTP_H
+#define _LINUX_NTP_H
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+
+/* NTP state machine interfaces */
+nsec_t ntp_scale(nsec_t value);
+int ntp_advance(nsec_t value);
+int ntp_adjtimex(struct timex*);
+int ntp_leapsecond(struct timespec now);
+void ntp_clear(void);
+int get_ntp_status(void);
+
+#endif
diff -Nru a/include/linux/time.h b/include/linux/time.h
--- a/include/linux/time.h 2005-04-29 16:39:35 -07:00
+++ b/include/linux/time.h 2005-04-29 16:39:35 -07:00
@@ -27,6 +27,10 @@

#ifdef __KERNEL__

+/* timeofday base types */
+typedef u64 nsec_t;
+typedef u64 cycle_t;
+
/* Parameters used to convert the timespec values */
#ifndef USEC_PER_SEC
#define USEC_PER_SEC (1000000L)
diff -Nru a/include/linux/timeofday.h b/include/linux/timeofday.h
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/include/linux/timeofday.h 2005-04-29 16:39:35 -07:00
@@ -0,0 +1,65 @@
+/* linux/include/linux/timeofday.h
+ *
+ * Copyright (C) 2003, 2004, 2005 IBM, John Stultz ([email protected])
+ *
+ * This file contains the interface to the time of day subsystem
+ */
+#ifndef _LINUX_TIMEOFDAY_H
+#define _LINUX_TIMEOFDAY_H
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+#include <asm/div64.h>
+
+#ifdef CONFIG_NEWTOD
+nsec_t get_lowres_timestamp(void);
+nsec_t get_lowres_timeofday(void);
+nsec_t do_monotonic_clock(void);
+
+void do_gettimeofday(struct timeval *tv);
+int do_settimeofday(struct timespec *tv);
+int do_adjtimex(struct timex *tx);
+
+void timeofday_suspend_hook(void);
+void timeofday_resume_hook(void);
+
+void timeofday_init(void);
+
+
+/* Helper functions */
+/* Convert nanoseconds to a timeval, rounding to the nearest microsecond */
+static inline struct timeval ns2timeval(nsec_t ns)
+{
+ struct timeval tv;
+ tv.tv_sec = div_long_long_rem(ns, NSEC_PER_SEC, &tv.tv_usec);
+ tv.tv_usec = (tv.tv_usec + NSEC_PER_USEC/2) / NSEC_PER_USEC;
+ /* rounding up can reach a full second; carry it into tv_sec */
+ if (tv.tv_usec >= USEC_PER_SEC) {
+ tv.tv_usec -= USEC_PER_SEC;
+ tv.tv_sec++;
+ }
+ return tv;
+}
+
+static inline struct timespec ns2timespec(nsec_t ns)
+{
+ struct timespec ts;
+ ts.tv_sec = div_long_long_rem(ns, NSEC_PER_SEC, &ts.tv_nsec);
+ return ts;
+}
+
+static inline nsec_t timespec2ns(struct timespec* ts)
+{
+ nsec_t ret;
+ ret = ((nsec_t)ts->tv_sec) * NSEC_PER_SEC;
+ ret += ts->tv_nsec;
+ return ret;
+}
+
+static inline nsec_t timeval2ns(struct timeval* tv)
+{
+ nsec_t ret;
+ ret = ((nsec_t)tv->tv_sec) * NSEC_PER_SEC;
+ ret += tv->tv_usec * NSEC_PER_USEC;
+ return ret;
+}
+#else /* CONFIG_NEWTOD */
+#define timeofday_suspend_hook()
+#define timeofday_resume_hook()
+#define timeofday_init()
+#endif /* CONFIG_NEWTOD */
+#endif /* _LINUX_TIMEOFDAY_H */
diff -Nru a/include/linux/timesource.h b/include/linux/timesource.h
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/include/linux/timesource.h 2005-04-29 16:39:35 -07:00
@@ -0,0 +1,159 @@
+/* linux/include/linux/timesource.h
+ *
+ * Copyright (C) 2003, 2004, 2005 IBM, John Stultz ([email protected])
+ *
+ * This file contains the structure definitions for timesources.
+ *
+ * If you are not a timesource, or the time of day code, you should
+ * not be including this file!
+ */
+#ifndef _LINUX_TIMESOURCE_H
+#define _LINUX_TIMESOURCE_H
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+#include <asm/io.h>
+#include <asm/div64.h>
+
+/* struct timesource_t:
+ * Provides mostly state-free accessors to the underlying hardware.
+ *
+ * name: ptr to timesource name
+ * priority: priority value for selection (higher is better)
+ * type: defines timesource type
+ * @read_fnct: returns a cycle value
+ * mmio_ptr: ptr to MMIO'ed counter
+ * mask: bitmask for two's complement
+ * subtraction of non 64 bit counters
+ * mult: cycle to nanosecond multiplier
+ * shift: cycle to nanosecond divisor (power of two)
+ * @update_callback: called when safe to alter timesource values
+ */
+struct timesource_t {
+ char* name;
+ int priority;
+ enum {
+ TIMESOURCE_FUNCTION,
+ TIMESOURCE_CYCLES,
+ TIMESOURCE_MMIO_32,
+ TIMESOURCE_MMIO_64
+ } type;
+ cycle_t (*read_fnct)(void);
+ void __iomem *mmio_ptr;
+ cycle_t mask;
+ u32 mult;
+ u32 shift;
+ void (*update_callback)(void);
+};
+
+
+/* Helper function that converts a khz counter
+ * frequency to a timesource multiplier, given the
+ * timesource shift value
+ */
+static inline u32 timesource_khz2mult(u32 khz, u32 shift_constant)
+{
+ /* khz = cyc/(Million ns)
+ * mult/2^shift = ns/cyc
+ * mult = ns/cyc * 2^shift
+ * mult = 1Million/khz * 2^shift
+ * mult = 1000000 * 2^shift / khz
+ * mult = (1000000<<shift) / khz
+ */
+ u64 tmp = ((u64)1000000) << shift_constant;
+ /* XXX - should we round here? */
+ do_div(tmp, khz);
+ return (u32)tmp;
+}
+
+/* Helper function that converts a hz counter
+ * frequency to a timesource multiplier, given the
+ * timesource shift value
+ */
+static inline u32 timesource_hz2mult(u32 hz, u32 shift_constant)
+{
+ /* hz = cyc/(Billion ns)
+ * mult/2^shift = ns/cyc
+ * mult = ns/cyc * 2^shift
+ * mult = 1Billion/hz * 2^shift
+ * mult = 1000000000 * 2^shift / hz
+ * mult = (1000000000<<shift) / hz
+ */
+ u64 tmp = ((u64)1000000000) << shift_constant;
+ /* XXX - should we round here? */
+ do_div(tmp, hz);
+ return (u32)tmp;
+}
+
+
+/* XXX - this should go somewhere better! */
+#ifndef readq
+static inline unsigned long long readq(void __iomem *addr)
+{
+ u32 low, high;
+ /* loop is required to make sure we get an atomic read */
+ do {
+ high = readl(addr+4);
+ low = readl(addr);
+ } while (high != readl(addr+4));
+
+ return low | (((unsigned long long)high) << 32LL);
+}
+#endif
+
+
+/* read_timesource():
+ * Uses the timesource to return the current cycle_t value
+ */
+static inline cycle_t read_timesource(struct timesource_t *ts)
+{
+ switch (ts->type) {
+ case TIMESOURCE_MMIO_32:
+ return (cycle_t)readl(ts->mmio_ptr);
+ case TIMESOURCE_MMIO_64:
+ return (cycle_t)readq(ts->mmio_ptr);
+ case TIMESOURCE_CYCLES:
+ return (cycle_t)get_cycles();
+ default:/* case: TIMESOURCE_FUNCTION */
+ return ts->read_fnct();
+ }
+}
+
+/* cyc2ns():
+ * Uses the timesource and ntp adjustment interval to
+ * convert cycle_ts to nanoseconds.
+ */
+static inline nsec_t cyc2ns(struct timesource_t *ts, int ntp_adj, cycle_t cycles)
+{
+ u64 ret;
+ ret = (u64)cycles;
+ ret *= (ts->mult + ntp_adj);
+ ret >>= ts->shift;
+ return (nsec_t)ret;
+}
+
+/* cyc2ns_rem():
+ * Uses the timesource and ntp adjustment interval to
+ * convert cycle_ts to nanoseconds. Add in remainder portion
+ * which is stored in ns<<ts->shift units and save the new
+ * remainder off. If rem is NULL, no remainder is added or saved.
+ */
+static inline nsec_t cyc2ns_rem(struct timesource_t *ts, int ntp_adj, cycle_t cycles, u64* rem)
+{
+ u64 ret;
+ ret = (u64)cycles;
+ ret *= (ts->mult + ntp_adj);
+ if (rem) {
+ ret += *rem;
+ /* build the mask in 64 bits: an int "1 << shift" overflows for shift >= 31 */
+ *rem = ret & (((u64)1 << ts->shift) - 1);
+ }
+ ret >>= ts->shift;
+ return (nsec_t)ret;
+}
+
+/* used to install a new time source */
+void register_timesource(struct timesource_t*);
+struct timesource_t* get_next_timesource(void);
+
+#endif
diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c 2005-04-29 16:39:35 -07:00
+++ b/init/main.c 2005-04-29 16:39:35 -07:00
@@ -47,6 +47,7 @@
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
+#include <linux/timeofday.h>

#include <asm/io.h>
#include <asm/bugs.h>
@@ -467,6 +468,7 @@
pidhash_init();
init_timers();
softirq_init();
+ timeofday_init();
time_init();

/*
diff -Nru a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile 2005-04-29 16:39:35 -07:00
+++ b/kernel/Makefile 2005-04-29 16:39:35 -07:00
@@ -9,6 +9,7 @@
rcupdate.o intermodule.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o

+obj-$(CONFIG_NEWTOD) += timeofday.o timesource.o ntp.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
diff -Nru a/kernel/ntp.c b/kernel/ntp.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/kernel/ntp.c 2005-04-29 16:39:35 -07:00
@@ -0,0 +1,500 @@
+/********************************************************************
+* linux/kernel/ntp.c
+*
+* NTP state machine and time scaling code.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz ([email protected])
+*
+* Portions rewritten from kernel/time.c and kernel/timer.c
+* Please see those files for original copyrights.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+* Notes:
+*
+* Hopefully you should never have to understand or touch
+* any of the code below. but don't let that keep you from trying!
+*
+* This code is loosely based on David Mills' RFC 1589 and its
+* updates. Please see the following for more details:
+* http://www.eecis.udel.edu/~mills/database/rfc/rfc1589.txt
+* http://www.eecis.udel.edu/~mills/database/reports/kern/kernb.pdf
+*
+* NOTE: To simplify the code, we do not implement any of
+* the PPS code, as the code that uses it never was merged.
+* [email protected]
+*
+* Revision History:
+* 2004-09-02: A0
+* o First pass sent to lkml for review.
+* 2004-12-07: A1
+* o No changes, sent to lkml for review.
+* 2005-03-11: A3
+* o yanked ntp_scale(), ntp adjustments are done in cyc2ns
+* 2005-04-29: A4
+* o Added conditional debug info
+*
+* TODO List:
+* o Move to using ppb for frequency adjustments
+* o More documentation
+* o More testing
+* o More optimization
+*********************************************************************/
+
+#include <linux/ntp.h>
+#include <linux/errno.h>
+
+/* XXX - remove later */
+#define NTP_DEBUG 0
+
+/* NTP scaling code
+ * Functions:
+ * ----------
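+ * nsec_t ntp_scale(nsec_t value):
+ * Scaled the nsec_t value using ntp kernel state
+ * (removed in A3; ntp adjustments are now done in cyc2ns)
+ * int ntp_advance(nsec_t interval):
+ * Increments the NTP state machine by interval time and
+ * returns the signed ppm adjustment for the next interval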
+ * static int ntp_hardupdate(long offset, struct timeval tv)
+ * ntp_adjtimex helper function
+ * int ntp_adjtimex(struct timex* tx):
+ * Interface to adjust NTP state machine
+ * int ntp_leapsecond(struct timespec now)
+ * Does NTP leapsecond processing. Returns number of
+ * seconds current time should be adjusted by.
+ * void ntp_clear(void):
+ * Clears the ntp kernel state
+ * int get_ntp_status(void):
+ * returns ntp_status value
+ *
+ * Variables:
+ * ----------
+ * ntp kernel state variables:
+ * See below for full list.
+ * ntp_lock:
+ * Protects ntp kernel state variables
+ */
+
+
+
+/* Chapter 5: Kernel Variables [RFC 1589 pg. 28] */
+/* 5.1 Interface Variables */
+static int ntp_status = STA_UNSYNC; /* status (STA_* flag bits) */
+static long ntp_offset; /* usec, stored shifted left by SHIFT_UPDATE */
+static long ntp_constant = 2; /* time constant; added into the damping shifts */
+static long ntp_maxerror = NTP_PHASE_LIMIT; /* usec */
+static long ntp_esterror = NTP_PHASE_LIMIT; /* usec */
+static const long ntp_tolerance = MAXFREQ; /* shifted ppm; bound applied to ntp_freq */
+static const long ntp_precision = 1; /* constant */
+
+/* 5.2 Phase-Lock Loop Variables */
+static long ntp_freq; /* shifted ppm */
+static long ntp_reftime; /* sec; tv_sec seen by the last ntp_hardupdate() */
+
+/* Extra values */
+static int ntp_state = TIME_OK; /* leapsecond state (TIME_* values) */
+static long ntp_tick = USEC_PER_SEC/USER_HZ; /* tick length (usec per USER_HZ tick) */
+
+static s64 ss_offset_len; /* SINGLESHOT offset adj interval (nsec)*/
+static long singleshot_adj; /* +/- MAX_SINGLESHOT_ADJ (ppm)*/
+static long tick_adj; /* tx->tick adjustment (ppm) */
+static long offset_adj; /* offset adjustment (ppm) */
+
+
+/* lock for the above variables */
+static seqlock_t ntp_lock = SEQLOCK_UNLOCKED;
+
+#define MAX_SINGLESHOT_ADJ 500 /* (ppm) */
+#define SEC_PER_DAY 86400
+
+/* Required to safely shift negative values: shifts x right by s bits,
+ * rounding toward zero. A negative x is negated, shifted and negated
+ * again, so the result does not depend on how the compiler shifts
+ * negative numbers. s must be a valid, non-negative shift count.
+ * Every argument and the whole expansion are parenthesized so the
+ * macro can be used inside larger expressions; x is evaluated more
+ * than once and must not have side effects.
+ */
+#define shiftR(x,s) (((x) < 0) ? (-((-(x)) >> (s))) : ((x) >> (s)))
+
+/* int ntp_advance(nsec_t interval):
+ * Periodic hook which increments NTP state machine by interval
+ * (nanoseconds). Each time a full second has accumulated it bumps
+ * ntp_maxerror, recomputes offset_adj from ntp_offset and latches
+ * the pending singleshot adjustment.
+ * Returns the signed PPM adjustment to be used for the next interval
+ * (tick_adj + offset_adj + ntp_freq + latched singleshot adjustment).
+ * This is ntp_hardclock in the RFC.
+ */
+int ntp_advance(nsec_t interval)
+{
+	static u64 interval_sum = 0;
+	static long ss_adj = 0;
+	unsigned long flags;
+	long ppm_sum;
+
+	write_seqlock_irqsave(&ntp_lock, flags);
+
+	/* inc interval sum (done under ntp_lock like the rest of the state) */
+	interval_sum += interval;
+
+	/* decrement singleshot offset interval */
+	ss_offset_len -= interval;
+	if (ss_offset_len < 0) /* make sure it doesn't go negative */
+		ss_offset_len = 0;
+
+	/* Do second overflow code. ">=" so that exactly one accumulated
+	 * second is processed now rather than on a later call.
+	 */
+	while (interval_sum >= NSEC_PER_SEC) {
+		/* XXX - I'd prefer to smoothly apply this math
+		 * at each call to ntp_advance() rather then each
+		 * second.
+		 */
+		long tmp;
+
+		/* Bump maxerror by ntp_tolerance */
+		ntp_maxerror += shiftR(ntp_tolerance, SHIFT_USEC);
+		if (ntp_maxerror > NTP_PHASE_LIMIT) {
+			ntp_maxerror = NTP_PHASE_LIMIT;
+			ntp_status |= STA_UNSYNC;
+		}
+
+		/* Calculate offset_adj for the next second */
+		tmp = ntp_offset;
+		if (!(ntp_status & STA_FLL))
+			tmp = shiftR(tmp, SHIFT_KG + ntp_constant);
+
+		/* bound the adjustment to MAXPHASE/MINSEC; the lower bound
+		 * negates the shifted value instead of shifting a negative one
+		 */
+		tmp = min(tmp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
+		tmp = max(tmp, -((MAXPHASE / MINSEC) << SHIFT_UPDATE));
+
+		offset_adj = shiftR(tmp, SHIFT_UPDATE); /* (usec/sec) = ppm */
+		ntp_offset -= tmp;
+
+		interval_sum -= NSEC_PER_SEC;
+
+		/* calculate singleshot aproximation ppm for the next second */
+		ss_adj = singleshot_adj;
+		singleshot_adj = 0;
+	}
+
+	/* calculate total ppm adjustment for the next interval */
+	ppm_sum = tick_adj;
+	ppm_sum += offset_adj;
+	ppm_sum += shiftR(ntp_freq,SHIFT_USEC);
+	ppm_sum += ss_adj;
+
+#if NTP_DEBUG
+{ /*XXX - yank me! just for debug */
+	static int dbg = 0;
+	/* all printed values are long, hence %ld */
+	if(!(dbg++%300000))
+		printk("tick_adj(%ld) + offset_adj(%ld) + ntp_freq(%ld) + ss_adj(%ld) = ppm_sum(%ld)\n", tick_adj, offset_adj, shiftR(ntp_freq,SHIFT_USEC), ss_adj, ppm_sum);
+}
+#endif
+
+	write_sequnlock_irqrestore(&ntp_lock, flags);
+
+	return ppm_sum;
+}
+
+/* ntp_damp():
+ * Scales value by 2^-damping, rounding toward zero. A negative
+ * damping scales the value up instead; it is handled separately
+ * because shifting by a negative count is undefined.
+ */
+static long ntp_damp(long value, long damping)
+{
+	if (damping < 0)
+		return value * (1L << -damping);
+	return shiftR(value, damping);
+}
+
+/* ntp_hardupdate():
+ * Called only by ntp_adjtimex while holding ntp_lock.
+ * Does nothing (returns 0) unless STA_PLL is set. Otherwise stores
+ * offset, bounded to +/- MAXPHASE, into ntp_offset, measures the
+ * seconds elapsed since the previous call from tv.tv_sec and folds
+ * a damped frequency correction into ntp_freq:
+ *  - STA_FLL set and interval >= MINSEC: based on offset / interval
+ *  - otherwise, interval < MAXSEC: based on offset * interval
+ * ntp_freq is finally bounded to +/- ntp_tolerance.
+ * Returns 0 on success, -1 if the interval fits neither mode.
+ */
+static int ntp_hardupdate(long offset, struct timeval tv)
+{
+	int ret;
+	long tmp, interval;
+
+	ret = 0;
+	if (!(ntp_status & STA_PLL))
+		return ret;
+
+	tmp = offset;
+	/* Make sure offset is bounded by MAXPHASE */
+	tmp = min(tmp, MAXPHASE);
+	tmp = max(tmp, -MAXPHASE);
+
+	ntp_offset = tmp << SHIFT_UPDATE;
+
+	/* restart the measurement interval if frequency is held or unset */
+	if ((ntp_status & STA_FREQHOLD) || (ntp_reftime == 0))
+		ntp_reftime = tv.tv_sec;
+
+	/* calculate seconds since last call to hardupdate */
+	interval = tv.tv_sec - ntp_reftime;
+	ntp_reftime = tv.tv_sec;
+
+	if ((ntp_status & STA_FLL) && (interval >= MINSEC)) {
+		long damping, delta;
+		/* XXX - should we round here? */
+		tmp = offset / interval; /* ppm (usec/sec)*/
+
+		/* convert to shifted ppm, then apply damping factor */
+
+		/* calculate damping factor - XXX bigger comment!
+		 * This is negative whenever SHIFT_KH < SHIFT_USEC,
+		 * which ntp_damp() turns into a scale-up.
+		 */
+		damping = SHIFT_KH - SHIFT_USEC;
+
+		/* apply damping factor */
+		delta = ntp_damp(tmp, damping);
+		ntp_freq += delta;
+#if NTP_DEBUG
+		printk("ntp->freq change: %ld\n", delta);
+#endif
+
+	} else if ((ntp_status & STA_PLL) && (interval < MAXSEC)) {
+		long damping, delta;
+		tmp = offset * interval; /* ppm XXX - not quite*/
+
+		/* calculate damping factor - XXX bigger comment!*/
+		damping = (2 * ntp_constant) + SHIFT_KF - SHIFT_USEC;
+
+		/* apply damping factor */
+		delta = ntp_damp(tmp, damping);
+		ntp_freq += delta;
+
+#if NTP_DEBUG
+		printk("ntp->freq change: %ld\n", delta);
+#endif
+	} else { /* interval out of bounds */
+		printk("ntp_hardupdate(): interval out of bounds: %ld\n",
+			interval);
+		ret = -1; /* TIME_ERROR */
+	}
+
+	/* bound ntp_freq */
+	if (ntp_freq > ntp_tolerance)
+		ntp_freq = ntp_tolerance;
+	if (ntp_freq < -ntp_tolerance)
+		ntp_freq = -ntp_tolerance;
+
+	return ret;
+}
+
+/* int ntp_adjtimex(struct timex* tx)
+ * Interface to change NTP state machine.
+ * First range-checks every field selected by tx->modes, then, under
+ * ntp_lock, applies the selected fields to the kernel NTP variables
+ * and writes the resulting kernel state back into *tx.
+ * tx->time must already hold the current time; it is handed to
+ * ntp_hardupdate() for ADJ_OFFSET requests.
+ * Returns -EINVAL if a selected field is out of range. Otherwise
+ * returns the leapsecond state (ntp_state), or TIME_ERROR when
+ * ntp_hardupdate() failed or STA_UNSYNC/STA_CLOCKERR is set.
+ */
+int ntp_adjtimex(struct timex* tx)
+{
+	long save_offset;
+	int result;
+	unsigned long flags;
+
+/* Sanity checking
+ */
+	/* frequency adjustment limited to +/- MAXFREQ */
+	if ((tx->modes & ADJ_FREQUENCY)
+			&& (abs(tx->freq) > MAXFREQ))
+		return -EINVAL;
+
+	/* maxerror adjustment limited to NTP_PHASE_LIMIT */
+	if ((tx->modes & ADJ_MAXERROR)
+			&& (tx->maxerror < 0
+				|| tx->maxerror >= NTP_PHASE_LIMIT))
+		return -EINVAL;
+
+	/* esterror adjustment limited to NTP_PHASE_LIMIT */
+	if ((tx->modes & ADJ_ESTERROR)
+			&& (tx->esterror < 0
+				|| tx->esterror >= NTP_PHASE_LIMIT))
+		return -EINVAL;
+
+	/* constant adjustment must be positive */
+	if ((tx->modes & ADJ_TIMECONST)
+			&& (tx->constant < 0))
+		return -EINVAL;
+
+	/* Single shot mode can only be used by itself */
+	if (((tx->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+			&& (tx->modes != ADJ_OFFSET_SINGLESHOT))
+		return -EINVAL;
+
+	/* offset adjustment limited to +/- MAXPHASE */
+	if ((tx->modes != ADJ_OFFSET_SINGLESHOT)
+			&& (tx->modes & ADJ_OFFSET)
+			&& (abs(tx->offset)>= MAXPHASE))
+		return -EINVAL;
+
+	/* tick adjustment limited to 10%: the valid range is
+	 * 900000/USER_HZ .. 1100000/USER_HZ usec per tick
+	 */
+	/* XXX - should we round here? */
+	if ((tx->modes & ADJ_TICK)
+			&& ((tx->tick < 900000/USER_HZ)
+				||(tx->tick > 1100000/USER_HZ)))
+		return -EINVAL;
+
+#if NTP_DEBUG
+	/* dbg output XXX - yank me! */
+	if(tx->modes) {
+		printk("adjtimex: tx->offset: %ld tx->freq: %ld\n",
+			tx->offset, tx->freq);
+	}
+#endif
+
+/* Kernel input bits
+ */
+	write_seqlock_irqsave(&ntp_lock, flags);
+
+	result = ntp_state;
+
+	/* For ADJ_OFFSET_SINGLESHOT we must return the old offset */
+	save_offset = shiftR(ntp_offset, SHIFT_UPDATE);
+
+	/* Process input parameters */
+	if (tx->modes & ADJ_STATUS) {
+		/* keep the read-only bits, take the rest from the caller */
+		ntp_status &= STA_RONLY;
+		ntp_status |= tx->status & ~STA_RONLY;
+	}
+
+	if (tx->modes & ADJ_FREQUENCY)
+		ntp_freq = tx->freq;
+
+	if (tx->modes & ADJ_MAXERROR)
+		ntp_maxerror = tx->maxerror;
+
+	if (tx->modes & ADJ_ESTERROR)
+		ntp_esterror = tx->esterror;
+
+	if (tx->modes & ADJ_TIMECONST)
+		ntp_constant = tx->constant;
+
+	if (tx->modes & ADJ_OFFSET) {
+		/* check if we're doing a singleshot adjustment */
+		if (tx->modes == ADJ_OFFSET_SINGLESHOT)
+			singleshot_adj = tx->offset;
+		/* otherwise, call hardupdate() */
+		else if (ntp_hardupdate(tx->offset, tx->time))
+			result = TIME_ERROR;
+	}
+
+	if (tx->modes & ADJ_TICK) {
+		/* first calculate usec/user_tick offset */
+		/* XXX - should we round here? */
+		tick_adj = (USEC_PER_SEC/USER_HZ) - tx->tick;
+		/* multiply by user_hz to get usec/sec => ppm */
+		tick_adj *= USER_HZ;
+		/* save tx->tick for future calls to adjtimex */
+		ntp_tick = tx->tick;
+	}
+
+	if ((ntp_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 )
+		result = TIME_ERROR;
+
+/* Kernel output bits
+ */
+	/* write kernel state to user timex values*/
+	if ((tx->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+		tx->offset = save_offset;
+	else
+		tx->offset = shiftR(ntp_offset, SHIFT_UPDATE);
+
+	tx->freq = ntp_freq;
+	tx->maxerror = ntp_maxerror;
+	tx->esterror = ntp_esterror;
+	tx->status = ntp_status;
+	tx->constant = ntp_constant;
+	tx->precision = ntp_precision;
+	tx->tolerance = ntp_tolerance;
+	/* report the tick value saved by the last ADJ_TICK request */
+	tx->tick = ntp_tick;
+
+	/* PPS is not implemented, so these are zero */
+	tx->ppsfreq = /*XXX - Not Implemented!*/ 0;
+	tx->jitter = /*XXX - Not Implemented!*/ 0;
+	tx->shift = /*XXX - Not Implemented!*/ 0;
+	tx->stabil = /*XXX - Not Implemented!*/ 0;
+	tx->jitcnt = /*XXX - Not Implemented!*/ 0;
+	tx->calcnt = /*XXX - Not Implemented!*/ 0;
+	tx->errcnt = /*XXX - Not Implemented!*/ 0;
+	tx->stbcnt = /*XXX - Not Implemented!*/ 0;
+
+	write_sequnlock_irqrestore(&ntp_lock, flags);
+
+	return result;
+}
+
+
+/* int ntp_leapsecond(struct timespec now):
+ * NTP leapsecond processing code. Steps the leapsecond state
+ * machine (ntp_state) using the time of day passed in as now.
+ * Returns the number of
+ * seconds (-1, 0, or 1) that should be added to the current
+ * time to properly adjust for leapseconds.
+ * ntp_state and ntp_status are accessed here without taking ntp_lock.
+ */
+int ntp_leapsecond(struct timespec now)
+{
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second.
+ * leaptime remembers the last second (23:59:59) of the day
+ * on which the pending leapsecond was armed.
+ */
+ static time_t leaptime = 0;
+
+ switch (ntp_state) {
+ case TIME_OK:
+ if (ntp_status & STA_INS) {
+ ntp_state = TIME_INS;
+ /* calculate end of today (23:59:59)*/
+ leaptime = now.tv_sec + SEC_PER_DAY -
+ (now.tv_sec % SEC_PER_DAY) - 1;
+ }
+ else if (ntp_status & STA_DEL) {
+ ntp_state = TIME_DEL;
+ /* calculate end of today (23:59:59)*/
+ leaptime = now.tv_sec + SEC_PER_DAY -
+ (now.tv_sec % SEC_PER_DAY) - 1;
+ }
+ break;
+
+ case TIME_INS:
+ /* Once we are past leaptime (strictly later than 23:59:59), insert the second by stepping back one */
+ if (now.tv_sec > leaptime) {
+ ntp_state = TIME_OOP;
+ printk(KERN_NOTICE
+ "Clock: inserting leap second 23:59:60 UTC\n");
+ return -1;
+ }
+ break;
+
+ case TIME_DEL:
+ /* Once we are at (or past) leaptime, delete the second by stepping ahead one */
+ if (now.tv_sec >= leaptime) {
+ ntp_state = TIME_WAIT;
+ printk(KERN_NOTICE
+ "Clock: deleting leap second 23:59:59 UTC\n");
+ return 1;
+ }
+ break;
+
+ case TIME_OOP:
+ /* Wait for the end of the leap second.
+ * NOTE(review): if the caller applies the -1 returned above
+ * before its next call, now is already stepped back and this
+ * test holds TIME_OOP until leaptime + 2 - confirm intended.
+ */
+ if (now.tv_sec > (leaptime + 1))
+ ntp_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT: /* stay here until both STA_INS and STA_DEL are cleared */
+ if (!(ntp_status & (STA_INS | STA_DEL)))
+ ntp_state = TIME_OK;
+ }
+
+ return 0;
+}
+
+/* void ntp_clear(void):
+ * Resets the NTP state machine while holding ntp_lock: all pending
+ * rate adjustments are zeroed, both error estimates go back to
+ * NTP_PHASE_LIMIT and the clock is flagged STA_UNSYNC.
+ * ntp_offset and ntp_freq are not touched here.
+ */
+void ntp_clear(void)
+{
+	unsigned long irq_state;
+
+	write_seqlock_irqsave(&ntp_lock, irq_state);
+
+	/* drop every pending adjustment */
+	offset_adj = 0;
+	tick_adj = 0;
+	singleshot_adj = 0;
+	ss_offset_len = 0;
+
+	/* error bounds back to their limit; clock is unsynchronized */
+	ntp_esterror = NTP_PHASE_LIMIT;
+	ntp_maxerror = NTP_PHASE_LIMIT;
+	ntp_status |= STA_UNSYNC;
+
+	write_sequnlock_irqrestore(&ntp_lock, irq_state);
+}
+
+/* int get_ntp_status(void):
+ * Returns the NTP status, i.e. the current value of ntp_status
+ * (STA_* flag bits). The value is read without taking ntp_lock.
+ */
+int get_ntp_status(void)
+{
+ return ntp_status;
+}
+
diff -Nru a/kernel/time.c b/kernel/time.c
--- a/kernel/time.c 2005-04-29 16:39:35 -07:00
+++ b/kernel/time.c 2005-04-29 16:39:35 -07:00
@@ -38,6 +38,7 @@

#include <asm/uaccess.h>
#include <asm/unistd.h>
+#include <linux/timeofday.h>

/*
* The timezone where the local system is located. Used as a default by some
@@ -227,6 +228,7 @@
/* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
+#ifndef CONFIG_NEWTOD
int do_adjtimex(struct timex *txc)
{
long ltemp, mtemp, save_adjust;
@@ -410,6 +412,7 @@
notify_arch_cmos_timer();
return(result);
}
+#endif

asmlinkage long sys_adjtimex(struct timex __user *txc_p)
{
@@ -566,6 +569,7 @@


#else
+#ifndef CONFIG_NEWTOD
/*
* Simulate gettimeofday using do_gettimeofday which only allows a timeval
* and therefore only yields usec accuracy
@@ -578,6 +582,7 @@
tv->tv_sec = x.tv_sec;
tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
}
+#endif /* CONFIG_NEWTOD */
#endif

#if (BITS_PER_LONG < 64)
diff -Nru a/kernel/timeofday.c b/kernel/timeofday.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/kernel/timeofday.c 2005-04-29 16:39:35 -07:00
@@ -0,0 +1,521 @@
+/*********************************************************************
+* linux/kernel/timeofday.c
+*
+* This file contains the functions which access and manage
+* the system's time of day functionality.
+*
+* Copyright (C) 2003, 2004, 2005 IBM, John Stultz ([email protected])
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+* Revision History:
+* 2004-09-02: A0
+* o First pass sent to lkml for review.
+* 2004-12-07: A1
+* o Rework of timesource structure
+* o Sent to lkml for review
+* 2005-01-24: A2
+* o write_seqlock_irq -> writeseqlock_irqsave
+* o arch generic interface for get_cmos_time() equivalents
+* o suspend/resume hooks for sleep/hibernate (lightly tested)
+* o timesource adjust_callback hook
+* o Sent to lkml for review
+* 2005-03-11: A3
+* o periodic_hook (formerly interrupt_hook) now called by softtimer
+* o yanked ntp_scale(), ntp adjustments are done in cyc2ns now
+* o sent to lkml for review
+* 2005-04-29: A4
+* o Improved the cyc2ns remainder handling
+* o Added getnstimeofday
+* o Cleanups from Nish Aravamudan
+* TODO List:
+* o vsyscall/fsyscall infrastructure
+* o clock_was_set hook
+**********************************************************************/
+
+#include <linux/timeofday.h>
+#include <linux/timesource.h>
+#include <linux/ntp.h>
+#include <linux/timex.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/sched.h> /* Needed for capable() */
+
+/* XXX - remove later */
+#define TIME_DBG 0
+#define TIME_DBG_FREQ 60000
+
+/* only run periodic_hook every 50ms */
+#define PERIODIC_INTERVAL_MS 50
+
+/*[Nanosecond based variables]
+ * system_time:
+ * Monotonically increasing counter of the number of nanoseconds
+ * since boot.
+ * wall_time_offset:
+ * Offset added to system_time to provide accurate time-of-day
+ */
+static nsec_t system_time;
+static nsec_t wall_time_offset;
+
+/*[Cycle based variables]
+ * offset_base:
+ * Value of the timesource at the last timeofday_periodic_hook()
+ * (adjusted only minorly to account for rounded off cycles)
+ */
+static cycle_t offset_base;
+
+/*[Time source data]
+ * timesource:
+ * current timesource pointer
+ */
+static struct timesource_t *timesource;
+
+/*[NTP adjustment]
+ * ntp_adj:
+ * value of the current ntp adjustment,
+ * stored in timesource multiplier units.
+ */
+int ntp_adj;
+
+/*[Locks]
+ * system_time_lock:
+ * generic lock for all locally scoped time values
+ */
+static seqlock_t system_time_lock = SEQLOCK_UNLOCKED;
+
+
+/*[Suspend/Resume info]
+ * time_suspend_state:
+ * variable that keeps track of suspend state
+ * suspend_start:
+ * start of the suspend call
+ */
+static enum {
+ TIME_RUNNING,
+ TIME_SUSPENDED
+} time_suspend_state = TIME_RUNNING;
+
+static nsec_t suspend_start;
+
+
+/* [XXX - Hacks]
+ * Makes stuff compile
+ */
+extern nsec_t read_persistent_clock(void);
+extern void sync_persistent_clock(struct timespec ts);
+
+
+/* get_lowres_timestamp():
+ * Returns the stored value of system_time without reading the
+ * timesource, i.e. a low res timestamp w/ PERIODIC_INTERVAL_MS
+ * granularity as calculated at the last invocation of
+ * timeofday_periodic_hook().
+ */
+nsec_t get_lowres_timestamp(void)
+{
+	unsigned long seq;
+	nsec_t snapshot;
+
+	/* retry until the copy was taken without a concurrent writer */
+	for (;;) {
+		seq = read_seqbegin(&system_time_lock);
+		snapshot = system_time;
+		if (!read_seqretry(&system_time_lock, seq))
+			break;
+	}
+
+	return snapshot;
+}
+
+
+/* get_lowres_timeofday():
+ * Returns system_time + wall_time_offset without reading the
+ * timesource, i.e. a low res time of day as calculated at the
+ * last invocation of timeofday_periodic_hook().
+ */
+nsec_t get_lowres_timeofday(void)
+{
+	unsigned long seq;
+	nsec_t tod;
+
+	/* retry until both values were read without a concurrent writer */
+	for (;;) {
+		seq = read_seqbegin(&system_time_lock);
+		tod = system_time + wall_time_offset;
+		if (!read_seqretry(&system_time_lock, seq))
+			break;
+	}
+
+	return tod;
+}
+
+
+/* update_legacy_time_values():
+ * Private function. Copies the current timeofday state into the
+ * legacy variables while holding xtime_lock: xtime becomes
+ * system_time + wall_time_offset, and wall_to_monotonic becomes the
+ * negated, normalized wall_time_offset.
+ * Assumes we have the system_time_lock.
+ * Hopefully someday this function can be removed.
+ */
+static void update_legacy_time_values(void)
+{
+	unsigned long flags;
+	struct timespec wall_off;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	xtime = ns2timespec(system_time + wall_time_offset);
+
+	/* negate the wall offset and let the helper normalize it */
+	wall_off = ns2timespec(wall_time_offset);
+	set_normalized_timespec(&wall_to_monotonic,
+		-wall_off.tv_sec, -wall_off.tv_nsec);
+
+	/* We don't update jiffies here because it is its own time domain */
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
+
+/* __monotonic_clock():
+ * private function, must hold system_time_lock lock when being
+ * called. Reads the timesource, converts the cycles elapsed since
+ * offset_base into NTP adjusted nanoseconds and returns them added
+ * to system_time: the monotonically increasing number of
+ * nanoseconds since the system booted.
+ */
+static inline nsec_t __monotonic_clock(void)
+{
+	cycle_t sample, elapsed;
+
+	/* cycles since the last offset_base update, masked by the timesource mask */
+	sample = read_timesource(timesource);
+	elapsed = (sample - offset_base) & timesource->mask;
+
+	/* scale to nanoseconds and add to the stored system time */
+	return system_time + cyc2ns(timesource, ntp_adj, elapsed);
+}
+
+
+/* do_monotonic_clock():
+ * Lock-safe wrapper around __monotonic_clock(): returns the
+ * monotonically increasing number of nanoseconds since the system
+ * booted, sampled consistently under system_time_lock.
+ */
+nsec_t do_monotonic_clock(void)
+{
+	unsigned long seq;
+	nsec_t mono;
+
+	/* take the sample again if a writer got in the way */
+	for (;;) {
+		seq = read_seqbegin(&system_time_lock);
+		mono = __monotonic_clock();
+		if (!read_seqretry(&system_time_lock, seq))
+			break;
+	}
+
+	return mono;
+}
+
+
+/* __gettimeofday():
+ * private function. Returns the timeofday in nsec_t, i.e.
+ * wall_time_offset plus __monotonic_clock(), with both values
+ * sampled consistently under system_time_lock.
+ */
+static inline nsec_t __gettimeofday(void)
+{
+	unsigned long seq;
+	nsec_t offset, mono;
+
+	/* take both samples again if a writer got in the way */
+	for (;;) {
+		seq = read_seqbegin(&system_time_lock);
+		offset = wall_time_offset;
+		mono = __monotonic_clock();
+		if (!read_seqretry(&system_time_lock, seq))
+			break;
+	}
+
+	return offset + mono;
+}
+
+
+/* getnstimeofday():
+ * Stores the current time of day into *ts as a timespec.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+	nsec_t tod = __gettimeofday();
+
+	*ts = ns2timespec(tod);
+}
+EXPORT_SYMBOL(getnstimeofday);
+
+
+/* do_gettimeofday():
+ * Stores the current time of day into *tv as a timeval.
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+	nsec_t tod = __gettimeofday();
+
+	*tv = ns2timeval(tod);
+}
+EXPORT_SYMBOL(do_gettimeofday);
+
+
+/* do_settimeofday():
+ * Sets the time of day to *tv. Under system_time_lock the
+ * wall_time_offset is recomputed so that wall_time_offset plus the
+ * monotonic clock equals the new time, the NTP state machine is
+ * cleared and the legacy time values are resynced.
+ * Returns 0 on success, or -EINVAL if tv->tv_nsec is negative or
+ * not below NSEC_PER_SEC.
+ */
+int do_settimeofday(struct timespec *tv)
+{
+	unsigned long flags;
+	nsec_t newtime;
+
+	/* reject un-normalized timespecs before converting them; the
+	 * unsigned cast makes negative values fail the test as well
+	 */
+	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	newtime = timespec2ns(tv);
+
+	/* atomically adjust wall_time_offset & clear ntp state machine */
+	write_seqlock_irqsave(&system_time_lock, flags);
+
+	wall_time_offset = newtime - __monotonic_clock();
+	ntp_clear();
+
+	update_legacy_time_values();
+
+	write_sequnlock_irqrestore(&system_time_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(do_settimeofday);
+
+/* do_adjtimex:
+ * Userspace NTP daemon's interface to the kernel NTP variables.
+ * Returns -EPERM when tx->modes is non-zero and the caller lacks
+ * CAP_SYS_TIME; otherwise returns whatever ntp_adjtimex() returns.
+ */
+int do_adjtimex(struct timex *tx)
+{
+	/* a non-zero modes means something is to be modified,
+	 * which requires CAP_SYS_TIME
+	 */
+	if (tx->modes != 0) {
+		if (!capable(CAP_SYS_TIME))
+			return -EPERM;
+	}
+
+	/* tx->time has to be filled in before the call below,
+	 * because ntp_adjtimex uses it
+	 */
+	do_gettimeofday(&tx->time);
+
+	/* hand the request to the NTP code */
+	return ntp_adjtimex(tx);
+}
+
+
+/* timeofday_suspend_hook():
+ * This function allows the timeofday subsystem to
+ * be shutdown for a period of time. Useful when
+ * going into suspend/hibernate mode. The code is
+ * very similar to the first half of
+ * timeofday_periodic_hook().
+ * Under system_time_lock it records the persistent clock
+ * in suspend_start, folds the current monotonic clock
+ * into system_time and marks the state TIME_SUSPENDED.
+ * BUGs if the subsystem is not currently TIME_RUNNING.
+ */
+void timeofday_suspend_hook(void)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&system_time_lock, flags);
+
+ /* Make sure time_suspend_state is sane */
+ BUG_ON(time_suspend_state != TIME_RUNNING);
+
+ /* First off, save suspend start time
+ * then quickly call __monotonic_clock.
+ * These two calls hopefully occur quickly
+ * because the difference between reads will
+ * accumulate as time drift on resume.
+ * Note that offset_base is not updated here.
+ */
+ suspend_start = read_persistent_clock();
+ system_time = __monotonic_clock();
+
+ /* switch states */
+ time_suspend_state = TIME_SUSPENDED;
+
+ write_sequnlock_irqrestore(&system_time_lock, flags);
+}
+
+
+/* timeofday_resume_hook():
+ * This function resumes the timeofday subsystem
+ * from a previous call to timeofday_suspend_hook.
+ * Under system_time_lock it advances system_time by the
+ * persistent-clock time that elapsed since suspend_start,
+ * rebases offset_base, clears the NTP state machine, marks
+ * the state TIME_RUNNING and resyncs the legacy time values.
+ * BUGs if the subsystem is not currently TIME_SUSPENDED.
+ */
+void timeofday_resume_hook(void)
+{
+ nsec_t now, suspend_time;
+ unsigned long flags;
+
+ write_seqlock_irqsave(&system_time_lock, flags);
+
+ /* Make sure time_suspend_state is sane */
+ BUG_ON(time_suspend_state != TIME_SUSPENDED);
+
+ /* Read persistent clock to mark the end of
+ * the suspend interval then rebase the
+ * offset_base to current timesource value.
+ * Again, time between these two calls will
+ * not be accounted for and will show up as
+ * time drift.
+ */
+ now = read_persistent_clock();
+ offset_base = read_timesource(timesource);
+
+ /* calculate how long we were out for */
+ suspend_time = now - suspend_start;
+
+ /* update system_time */
+ system_time += suspend_time;
+
+ ntp_clear();
+
+ /* Set us back to running */
+ time_suspend_state = TIME_RUNNING;
+
+ /* finally, update legacy time values */
+ update_legacy_time_values();
+
+ write_sequnlock_irqrestore(&system_time_lock, flags);
+}
+
+struct timer_list timeofday_timer;
+
+/* timeofday_periodic_hook:
+ * Calculates the delta since the last call,
+ * updates system time and clears the offset.
+ * Called via timeofday_timer; the argument is unused.
+ *
+ * In order, while holding system_time_lock, it:
+ * - accumulates the NTP adjusted nanoseconds since offset_base
+ * into system_time and rebases offset_base
+ * - advances the NTP state machine by the raw nanoseconds
+ * - applies any leapsecond to wall_time_offset
+ * - syncs the persistent clock unless STA_UNSYNC is set
+ * - switches to a newly selected timesource, if there is one
+ * - rescales the ppm adjustment into timesource multiplier units
+ * - resyncs the legacy time values
+ * Finally it re-arms timeofday_timer for the next interval.
+ */
+static void timeofday_periodic_hook(unsigned long unused)
+{
+ cycle_t now, cycle_delta;
+ static u64 remainder; /* fractional ns carried between calls by cyc2ns_rem() */
+ nsec_t ns, ns_ntp;
+ long leapsecond;
+ struct timesource_t* next;
+ unsigned long flags;
+ u64 tmp;
+
+ write_seqlock_irqsave(&system_time_lock, flags);
+
+ /* read time source & calc time since last call*/
+ now = read_timesource(timesource);
+ cycle_delta = (now - offset_base) & timesource->mask;
+
+ /* convert cycles to ntp adjusted ns and save remainder */
+ ns_ntp = cyc2ns_rem(timesource, ntp_adj, cycle_delta, &remainder);
+
+ /* convert cycles to raw ns for ntp advance */
+ ns = cyc2ns(timesource, 0, cycle_delta);
+
+#if TIME_DBG
+{ /* XXX - remove later*/
+ static int dbg=0;
+ if(!(dbg++%TIME_DBG_FREQ)){
+ printk(KERN_INFO "now: %lluc - then: %lluc = delta: %lluc -> %llu ns + %llu shift_ns (ntp_adj: %i)\n",
+ (unsigned long long)now, (unsigned long long)offset_base,
+ (unsigned long long)cycle_delta, (unsigned long long)ns,
+ (unsigned long long)remainder, ntp_adj);
+ }
+}
+#endif
+
+ /* update system_time */
+ system_time += ns_ntp;
+
+ /* reset the offset_base */
+ offset_base = now;
+
+ /* advance the ntp state machine by ns interval; until it is
+ * rescaled further down, ntp_adj holds the returned signed ppm value
+ */
+ ntp_adj = ntp_advance(ns);
+
+ /* do ntp leap second processing*/
+ leapsecond = ntp_leapsecond(ns2timespec(system_time+wall_time_offset));
+ wall_time_offset += leapsecond * NSEC_PER_SEC;
+
+ /* sync the persistent clock */
+ if (!(get_ntp_status() & STA_UNSYNC))
+ sync_persistent_clock(ns2timespec(system_time + wall_time_offset));
+
+ /* if necessary, switch timesources */
+ next = get_next_timesource();
+ if (next != timesource) {
+ /* immediately set new offset_base */
+ offset_base = read_timesource(next);
+ /* swap timesources */
+ timesource = next;
+ printk(KERN_INFO "Time: %s timesource has been installed.\n",
+ timesource->name);
+ /* adjustment and remainder belonged to the old timesource */
+ ntp_clear();
+ ntp_adj = 0;
+ remainder = 0;
+ }
+
+ /* now is a safe time, so allow timesource to adjust
+ * itself (for example: to make cpufreq changes).
+ */
+ if(timesource->update_callback)
+ timesource->update_callback();
+
+
+ /* convert the signed ppm to timesource multiplier adjustment:
+ * |ntp_adj| * mult / 1000000, with the sign restored afterwards
+ */
+ tmp = abs(ntp_adj);
+ tmp = tmp * timesource->mult;
+ /* XXX - should we round here? */
+ do_div(tmp, 1000000);
+ if (ntp_adj < 0)
+ ntp_adj = -(int)tmp;
+ else
+ ntp_adj = (int)tmp;
+
+ /* sync legacy values */
+ update_legacy_time_values();
+
+ write_sequnlock_irqrestore(&system_time_lock, flags);
+
+ /* Set us up to go off on the next interval */
+ mod_timer(&timeofday_timer, jiffies + (PERIODIC_INTERVAL_MS * HZ / 1000));
+}
+
+
+/* timeofday_init():
+ * Initializes time variables: selects the initial timesource,
+ * takes its current reading as offset_base, seeds wall_time_offset
+ * from the persistent clock, resets the NTP state and syncs the
+ * legacy time values, all under system_time_lock. Afterwards the
+ * timer driving timeofday_periodic_hook is installed.
+ */
+void __init timeofday_init(void)
+{
+	unsigned long flags;
+
+#if TIME_DBG
+	printk(KERN_INFO "timeofday_init: Starting up!\n");
+#endif
+	write_seqlock_irqsave(&system_time_lock, flags);
+
+	/* pick the initial timesource and use its reading as the base */
+	timesource = get_next_timesource();
+	offset_base = read_timesource(timesource);
+
+	/* wall time starts out as whatever the persistent clock reports */
+	wall_time_offset = read_persistent_clock();
+
+	/* start with a clean NTP state machine and no scaling factor */
+	ntp_clear();
+	ntp_adj = 0;
+
+	/* initialize legacy time values */
+	update_legacy_time_values();
+
+	write_sequnlock_irqrestore(&system_time_lock, flags);
+
+	/* Install timeofday_periodic_hook timer; first run is one jiffy out */
+	init_timer(&timeofday_timer);
+	timeofday_timer.expires = jiffies + 1;
+	timeofday_timer.function = timeofday_periodic_hook;
+	add_timer(&timeofday_timer);
+
+#if TIME_DBG
+	printk(KERN_INFO "timeofday_init: finished!\n");
+#endif
+}
diff -Nru a/kernel/timer.c b/kernel/timer.c
--- a/kernel/timer.c 2005-04-29 16:39:35 -07:00
+++ b/kernel/timer.c 2005-04-29 16:39:35 -07:00
@@ -577,6 +577,7 @@
int tickadj = 500/HZ ? : 1; /* microsecs */


+#ifndef CONFIG_NEWTOD
/*
* phase-lock loop variables
*/
@@ -807,6 +808,9 @@
}
} while (ticks);
}
+#else /* CONFIG_NEWTOD */
+#define update_wall_time(x)
+#endif /* CONFIG_NEWTOD */

/*
* Called from the timer interrupt handler to charge one tick to the current
diff -Nru a/kernel/timesource.c b/kernel/timesource.c
--- /dev/null Wed Dec 31 16:00:00 196900
+++ b/kernel/timesource.c 2005-04-29 16:39:35 -07:00
@@ -0,0 +1,210 @@
+/*********************************************************************
+* linux/kernel/timesource.c
+*
+* This file contains the functions which manage timesource drivers.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz ([email protected])
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+* Revision History:
+* 2004-12-07: A1
+* o Rework of timesource structure
+* o Sent to lkml for review
+* 2005-04-29: A4
+* o Keep track of all registered timesources
+* o Add sysfs interface for overriding default selection
+*
+* TODO List:
+* o Allow timesource drivers to be unregistered
+* o Use "clock=xyz" boot option for selection overrides.
+* o get rid of timesource_jiffies extern
+**********************************************************************/
+
+#include <linux/timesource.h>
+#include <linux/sysdev.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+/* Capacity of timesource_list; register_timesource() refuses new
+ * entries once this many timesources are registered.
+ */
+#define MAX_TIMESOURCES 10
+
+
+/* XXX - Need to have a better way for initializing curr_timesource */
+extern struct timesource_t timesource_jiffies;
+
+/*[Timesource internal variables]---------
+ * curr_timesource:
+ * currently selected timesource. Initialized to timesource_jiffies.
+ * next_timesource:
+ * pending next selected timesource.
+ * timesource_list:
+ * array of pointers pointing to registered timesources
+ * timesource_list_counter:
+ * value which counts the number of registered timesources
+ * timesource_lock:
+ * protects manipulations to curr_timesource and next_timesource
+ * and the timesource_list
+ * override_name:
+ * timesource name written through the sysfs interface. An empty
+ * string (override_name[0] == 0) means no override is requested.
+ */
+static struct timesource_t *curr_timesource = &timesource_jiffies;
+static struct timesource_t *next_timesource;
+static struct timesource_t *timesource_list[MAX_TIMESOURCES];
+static int timesource_list_counter;
+static seqlock_t timesource_lock = SEQLOCK_UNLOCKED;
+
+static char override_name[32];
+
+/* get_next_timesource():
+ *	Promotes a pending next_timesource (if one is set) to
+ *	curr_timesource, clears the pending slot, and returns the
+ *	timesource that is current after that step.
+ *
+ *	The return value is sampled while timesource_lock is still held,
+ *	so the caller gets the pointer that was current inside the
+ *	critical section rather than a later unlocked re-read.
+ */
+struct timesource_t* get_next_timesource(void)
+{
+	struct timesource_t *selected;
+
+	write_seqlock(&timesource_lock);
+	if (next_timesource) {
+		curr_timesource = next_timesource;
+		next_timesource = NULL;
+	}
+	selected = curr_timesource;
+	write_sequnlock(&timesource_lock);
+
+	return selected;
+}
+
+/* select_timesource():
+ *	Private function. Finds the best registered timesource.
+ *	Must have a writelock on timesource_lock when called.
+ *
+ *	If override_name is non-empty and exactly equals the name of a
+ *	registered timesource, that timesource is returned. A plain
+ *	prefix is not enough, so "foo" never selects "foobar".
+ *	Otherwise the timesource with the highest priority is returned;
+ *	on equal priority the one registered first wins.
+ *
+ *	Returns NULL when no timesource has been registered yet.
+ */
+static struct timesource_t* select_timesource(void)
+{
+	struct timesource_t *best;
+	int i;
+
+	/* Check for override */
+	if (override_name[0] != 0) {
+		for (i = 0; i < timesource_list_counter; i++)
+			if (!strcmp(timesource_list[i]->name, override_name))
+				return timesource_list[i];
+	}
+
+	/* Pick the highest priority; slot 0 is NULL while the list is empty */
+	best = timesource_list[0];
+	for (i = 1; i < timesource_list_counter; i++)
+		if (timesource_list[i]->priority > best->priority)
+			best = timesource_list[i];
+
+	return best;
+}
+
+/* register_timesource():
+ *	Used to install new timesources
+ *
+ *	Appends t to timesource_list unless a timesource with exactly the
+ *	same name is already present or the list already holds
+ *	MAX_TIMESOURCES entries; in either case a message is printed and
+ *	t is not added. Names are compared in full, so two timesources
+ *	whose names merely share a prefix can both be registered.
+ *
+ *	Whether or not t was added, the pending next_timesource is
+ *	recomputed before the lock is dropped.
+ */
+void register_timesource(struct timesource_t* t)
+{
+	const char *error_msg = NULL;
+	int i;
+
+	write_seqlock(&timesource_lock);
+
+	/* check if timesource is already registered */
+	for (i = 0; i < timesource_list_counter; i++) {
+		if (!strcmp(timesource_list[i]->name, t->name)) {
+			error_msg = "Already registered!";
+			break;
+		}
+	}
+
+	/* check that the list isn't full */
+	if (timesource_list_counter >= MAX_TIMESOURCES)
+		error_msg = "Too many timesources!";
+
+	if (!error_msg)
+		timesource_list[timesource_list_counter++] = t;
+	else
+		printk("register_timesource: Cannot register %s. %s\n",
+			t->name, error_msg);
+
+	/* select next timesource */
+	next_timesource = select_timesource();
+
+	write_sequnlock(&timesource_lock);
+}
+
+/* sysfs_show_timesources():
+ *	sysfs "show" handler. Writes the names of all registered
+ *	timesources into buf, each followed by a space, with a '*' in
+ *	front of the one that is currently selected, and ends the line
+ *	with a newline. Returns the number of bytes written.
+ */
+static ssize_t sysfs_show_timesources(struct sys_device *dev, char *buf)
+{
+	ssize_t len = 0;
+	int idx;
+
+	/* hold the lock so the list and curr_timesource stay put */
+	write_seqlock(&timesource_lock);
+	for (idx = 0; idx < timesource_list_counter; idx++) {
+		struct timesource_t *ts = timesource_list[idx];
+
+		/* Mark current timesource w/ a star */
+		if (ts == curr_timesource)
+			len += sprintf(buf + len, "*");
+		len += sprintf(buf + len, "%s ", ts->name);
+	}
+	write_sequnlock(&timesource_lock);
+
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
+/* sysfs_override_timesource():
+ *	Takes input from sysfs interface for manually overriding
+ *	the default timesource selection
+ *
+ *	buf holds the requested timesource name, optionally followed by
+ *	a single trailing newline (as written by "echo"). The name is
+ *	stored in override_name and the pending timesource is
+ *	re-selected. Writing just a newline clears the override.
+ *
+ *	Returns count on success (an empty write is accepted and
+ *	ignored), or -EINVAL if the name does not fit in override_name.
+ */
+static ssize_t sysfs_override_timesource(struct sys_device *dev,
+			const char *buf, size_t count)
+{
+	/* never look past the bytes we were actually handed */
+	size_t len = strnlen(buf, count);
+
+	/* nothing written: leave the current override alone */
+	if (len == 0)
+		return count;
+
+	/* strip the trailing newline only if there really is one */
+	if (buf[len - 1] == '\n')
+		len--;
+
+	/* refuse names that would overflow override_name */
+	if (len >= sizeof(override_name))
+		return -EINVAL;
+
+	write_seqlock(&timesource_lock);
+
+	/* copy the name given */
+	memcpy(override_name, buf, len);
+	override_name[len] = 0;
+
+	/* see if we can find it */
+	next_timesource = select_timesource();
+
+	write_sequnlock(&timesource_lock);
+	return count;
+}
+
+/* Sysfs setup bits:
+ * XXX - Is there a simpler way?
+ *
+ * Declares one sysdev attribute, one sysdev class and one device of
+ * that class; init_timesource_sysfs() registers them.
+ */
+
+/* Attribute "timesource", mode 0600, wired to sysfs_show_timesources
+ * and sysfs_override_timesource.
+ * NOTE(review): presumably this macro is what defines attr_timesource
+ * used by init_timesource_sysfs() - confirm against SYSDEV_ATTR.
+ */
+static SYSDEV_ATTR(timesource, 0600, sysfs_show_timesources, sysfs_override_timesource);
+
+/* sysdev class named "timesource" */
+static struct sysdev_class timesource_sysclass = {
+ set_kset_name("timesource"),
+};
+
+/* the single device (id 0) belonging to timesource_sysclass */
+static struct sys_device device_timesource = {
+ .id = 0,
+ .cls = &timesource_sysclass,
+};
+
+/* init_timesource_sysfs():
+ *	Registers the timesource sysdev class, then the timesource
+ *	device, then creates the "timesource" attribute file on it.
+ *	Stops at the first step that fails and returns its error code;
+ *	returns 0 when all three steps succeed.
+ */
+static int init_timesource_sysfs(void)
+{
+	int error;
+
+	error = sysdev_class_register(&timesource_sysclass);
+	if (error)
+		return error;
+
+	/* don't create the file on a device that failed to register */
+	error = sysdev_register(&device_timesource);
+	if (error)
+		return error;
+
+	return sysdev_create_file(&device_timesource, &attr_timesource);
+}
+device_initcall(init_timesource_sysfs);
+
+
+/* XXX - Do we need a boot time override interface? */


2005-04-29 23:51:52

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [RFC][PATCH (1/4)] new timeofday core subsystem (v A4)

On Fri, 2005-04-29 at 15:45 -0700, john stultz wrote:
> All,
> This patch implements the architecture independent portion of
> the time of day subsystem. For a brief description on the rework, see
> here: http://lwn.net/Articles/120850/ (Many thanks to the LWN team for
> that clear writeup!)
>
> Mostly this version is just a cleanup of the last release. One neat
> feature is the new sysfs interface which allows you to manually override
> the selected timesource while the system is running.
>
> Included below is timeofday.c (which includes all the time of day
> management and accessor functions), ntp.c (which includes the ntp
> scaling calculation code, leapsecond processing, and ntp kernel state
> machine code), timesource.c (for timesource specific management
> functions), interface definition .h files, the example jiffies
> timesource (lowest common denominator time source, mainly for use as
> example code) and minimal hooks into arch independent code.
>
> The patch does not function without minimal architecture specific hooks
> (i386, x86-64, ppc32, ppc64, ia64 and s390 examples to follow), and it
> should be able to be applied to a tree without affecting the code.

My concern at this point is how to deal with the userland gettimofday
implementation in the ppc64 vDSO ...

Ben.


2005-04-30 00:33:41

by john stultz

[permalink] [raw]
Subject: Re: [RFC][PATCH (1/4)] new timeofday core subsystem (v A4)

On Sat, 2005-04-30 at 09:50 +1000, Benjamin Herrenschmidt wrote:
> On Fri, 2005-04-29 at 15:45 -0700, john stultz wrote:
> > All,
> > This patch implements the architecture independent portion of
> > the time of day subsystem. For a brief description on the rework, see
> > here: http://lwn.net/Articles/120850/ (Many thanks to the LWN team for
> > that clear writeup!)
> >
> > Mostly this version is just a cleanup of the last release. One neat
> > feature is the new sysfs interface which allows you to manually override
> > the selected timesource while the system is running.
> >
> > Included below is timeofday.c (which includes all the time of day
> > management and accessor functions), ntp.c (which includes the ntp
> > scaling calculation code, leapsecond processing, and ntp kernel state
> > machine code), timesource.c (for timesource specific management
> > functions), interface definition .h files, the example jiffies
> > timesource (lowest common denominator time source, mainly for use as
> > example code) and minimal hooks into arch independent code.
> >
> > The patch does not function without minimal architecture specific hooks
> > (i386, x86-64, ppc32, ppc64, ia64 and s390 examples to follow), and it
> > should be able to be applied to a tree without affecting the code.
>
> My concern at this point is how to deal with the userland gettimofday
> implementation in the ppc64 vDSO ...

Hopefully in a method very similar to the way the vsyscall patch does
for i386/x86-64. The idea being that the core code makes an arch
specific call passing the core timekeeping values whenever they are
changed. Then the arch specific implementation can use those values in a
similar fashion to calculate time.

I'm not very familiar with ppc64's vDSO implementation, so please let me
know if there is a constraint that would make this difficult.

thanks
-john


2005-05-02 18:41:37

by Darren Hart

[permalink] [raw]
Subject: Re: [RFC][PATCH] new timeofday-based soft-timer subsystem

Nishanth Aravamudan wrote:
> * john stultz <[email protected]> [2005-0429 15:45:47 -0700]:
>
>
>>All,
>> This patch implements the architecture independent portion of
>>the time of day subsystem. For a brief description on the rework, see
>>here: http://lwn.net/Articles/120850/ (Many thanks to the LWN team for
>>that clear writeup!)
>
>
> I have been working closely with John to re-work the soft-timer subsystem
> to use the new timeofday() subsystem. The following patch attempts to
> begin this process. I would greatly appreciate any comments.
>
>

Also working closely with John and Nish, I have been taking advantage of
the new human-time soft-timer subsystem and the NO_IDLE_HZ code to
dynamically schedule interrupts as needed. The idea is to have
interrupt source drivers (PIT, Local APIC, HPET, ppc decrementers, etc)
similar to the time sources in John's timeofday patches.

Because the resolution of the soft-timer subsystem is configurable via
TIMER_INTERVAL_BITS, and the timeofday code is now free of the periodic
system tick, we can move the soft-timers to a dynamically scheduled
interrupt system. We can achieve both sub-millisecond timer resolution
and NO_IDLE_HZ simply by adjusting TIMER_INTERVAL_BITS and scheduling
the next timer interrupt appropriately whenever a soft-timer is added or
removed.

In general at the end of set_timer_nsecs(), we see when the next timer
is due to expire and pass that value (in absolute nanoseconds) to
schedule_next_timer_interrupt(). Each interrupt source driver is then
free to reprogram the hard-timer to the "best" interval. For something
like the local APIC, that may be exactly when the next timer needs to go
off. For the PIT, it may do nothing at all and just fire periodically.

I have a prototype using the PIT, which just demonstrates that the
system will still run this way. Obviously other timers will perform
much better since the PIT is so slow to program.

I feel that this is a clean approach to two soft-timer issues:
resolution and NO_IDLE_HZ. It integrates well with the patches from
John and Nish and is a direct approach to these issues, rather than an
attempt to add support on top of a jiffies based soft-timer subsystem.

I'd appreciate any feedback people have to offer. Particularly those
that have been working on alternative approaches to things like high
resolution timers and NO_IDLE_HZ.

Thanks,


--
Darren Hart
IBM Linux Technology Center
Linux Kernel Team
Phone: 503 578 3185
T/L: 775 3185

2005-05-02 21:16:54

by Pavel Machek

[permalink] [raw]
Subject: Re: [RFC][PATCH (2/4)] new timeofday arch specific hooks (v A4)

Hi!

> ppc64 and s390. It applies on top of my linux-2.6.12-rc2_timeofday-
> core_A4 patch and with this patch applied, you can test the new time of
> day subsystem.
....
> device_power_down(PMSG_SUSPEND);
>
> + timeofday_suspend_hook();
> /* serialize with the timer interrupt */

You should not add hooks like this. Just add your own [sys]_device.
Pavel

--
64 bytes from 195.113.31.123: icmp_seq=28 ttl=51 time=448769.1 ms

2005-05-02 21:28:28

by john stultz

[permalink] [raw]
Subject: Re: [RFC][PATCH (2/4)] new timeofday arch specific hooks (v A4)

On Mon, 2005-05-02 at 23:13 +0200, Pavel Machek wrote:
> Hi!
>
> > ppc64 and s390. It applies on top of my linux-2.6.12-rc2_timeofday-
> > core_A4 patch and with this patch applied, you can test the new time of
> > day subsystem.
> ....
> > device_power_down(PMSG_SUSPEND);
> >
> > + timeofday_suspend_hook();
> > /* serialize with the timer interrupt */
>
> You should not add hooks like this. Just add your own [sys]_device.

Agreed. Sorry I didn't get to it the last time you mentioned it.

thanks
-john




2005-05-03 17:02:38

by Nishanth Aravamudan

[permalink] [raw]
Subject: Re: [RFC][PATCH] new timeofday-based soft-timer subsystem

On 29.04.2005 [16:35:46 -0700], Nishanth Aravamudan wrote:
> * john stultz <[email protected]> [2005-0429 15:45:47 -0700]:
>
> > All,
> > This patch implements the architecture independent portion of
> > the time of day subsystem. For a brief description on the rework, see
> > here: http://lwn.net/Articles/120850/ (Many thanks to the LWN team for
> > that clear writeup!)
>
> I have been working closely with John to re-work the soft-timer subsystem
> to use the new timeofday() subsystem. The following patch attempts to
> begin this process. I would greatly appreciate any comments.

I am not sure if anyone has looked at this patch closely, but I have
noticed one issue: My code assumes that all the rounding will be done
internally (rounding up on addition to find the nearest
timer interval); however, current interfaces do much of the rounding
before passing structures on to the soft-timer subsystem, because the
jiffies-based one always rounds down.

This is most clear in sys_nanosleep(). Without any modifications to the
syscall, but with my patch applied, one will see around 5 millisecond
sleeps for a 1 millisecond request. This occurs, I believe, because
jiffies_to_timespec() rounds up once, we add one if there is any value
and then internally I round up once more. If I rewrite
sys_nanosleep() to use schedule_timeout_nsecs() and thus never convert
from nanoseconds, I see 2 millisecond sleeps for 1 millisecond requests,
which is much closer (and accurate, as our granularity is slightly
greater than 1 millisecond and we are interrupting at HZ=1000 slightly
more often than every millisecond). This seems to be the right solution,
but then there is another issue: the restart_block used by
sys_nanosleep() only allows for 4 unsigned long arguments, when, in
fact, nanoseconds are a 64-bit quantity in the kernel. As long as the
nanosleep() request is no more than around 4 seconds, we should be ok
using unsigned longs. But anything longer will simply truncate
currently. I am not certain of a clean way to modify the restart_block
to incorporate a 64-bit quantity, as it is used by other interfaces as
well.

I still need to update the other version of nanosleep() (nsleep() and
posix) before I post an updated patch. Just wanted to let everyone know
of the issue.

Thanks,
Nish

2005-05-03 17:22:46

by Chris Friesen

[permalink] [raw]
Subject: Re: [RFC][PATCH] new timeofday-based soft-timer subsystem

Nishanth Aravamudan wrote:

> but then there is another issue: the restart_block used by
> sys_nanosleep() only allows for 4 unsigned long arguments, when, in
> fact, nanoseconds are a 64-bit quantity in the kernel. As long as the
> nanosleep() request is no more than around 4 seconds, we should be ok
> using unsigned longs.

My man page for nanosleep specifies that the "nanoseconds" portion of
the timespec must be under 1 billion and is of type "long". Is that no
longer valid?

Chris

2005-05-03 18:11:48

by Nish Aravamudan

[permalink] [raw]
Subject: Re: [RFC][PATCH] new timeofday-based soft-timer subsystem

On 5/3/05, Chris Friesen <[email protected]> wrote:
> Nishanth Aravamudan wrote:
>
> > but then there is another issue: the restart_block used by
> > sys_nanosleep() only allows for 4 unsigned long arguments, when, in
> > fact, nanoseconds are a 64-bit quantity in the kernel. As long as the
> > nanosleep() request is no more than around 4 seconds, we should be ok
> > using unsigned longs.
>
> My man page for nanosleep specifies that the "nanoseconds" portion of
> the timespec must be under 1 billion and is of type "long". Is that no
> longer valid?

Certainly would be, but the problem is if you pass in a timespec ts, where

ts.tv_sec = 10;
ts.tv_nsec = 99999;

This will overflow a 32-bit nanosecond representation internally
(10000099999 > 4294967296). Sorry for the confusion, the unsigned long
I was referring to was the internal representation of the nanoseconds
converted from the timespec parameter.

Thanks,
Nish

2005-05-03 21:47:48

by Nishanth Aravamudan

[permalink] [raw]
Subject: Re: [RFC][PATCH] new timeofday-based soft-timer subsystem

On 03.05.2005 [10:02:24 -0700], Nishanth Aravamudan wrote:
> On 29.04.2005 [16:35:46 -0700], Nishanth Aravamudan wrote:
> > * john stultz <[email protected]> [2005-0429 15:45:47 -0700]:
> >
> > > All,
> > > This patch implements the architecture independent portion of
> > > the time of day subsystem. For a brief description on the rework, see
> > > here: http://lwn.net/Articles/120850/ (Many thanks to the LWN team for
> > > that clear writeup!)
> >
> > I have been working closely with John to re-work the soft-timer subsystem
> > to use the new timeofday() subsystem. The following patch attempts to
> > begin this process. I would greatly appreciate any comments.
>
> I am not sure if anyone has looked at this patch closely, but I have
> noticed one issue: My code assumes that all the rounding will be done
> internally (rounding up on addition to find the nearest
> timer interval); however, current interfaces do much of the rounding
> before passing structures on to the soft-timer subsystem, because the
> jiffies-based one always rounds down.

A for instance: sys_nanosleep() assumes (correctly) that the
jiffies-based soft-timer subsystem rounds down, so it rounds up (twice).
But since I now round-up internally, that is not necessary. Fix
sys_nanosleep() to do this right.

Still todo: change restart->arg0 to be a pointer to an nsec_t.

diff -urpN 2.6.12-rc2-tod/kernel/timer.c 2.6.12-rc2-tod-timer/kernel/timer.c
--- 2.6.12-rc2-tod/kernel/timer.c 2005-05-02 12:59:04.000000000 -0700
+++ 2.6.12-rc2-tod-timer/kernel/timer.c 2005-05-03 09:13:43.000000000 -0700
@@ -1141,21 +1311,21 @@ asmlinkage long sys_gettid(void)

static long __sched nanosleep_restart(struct restart_block *restart)
{
- unsigned long expire = restart->arg0, now = jiffies;
+ nsec_t expire = restart->arg0, now = do_monotonic_clock();
struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
long ret;

/* Did it expire while we handled signals? */
- if (!time_after(expire, now))
+ if (now > expire)
return 0;

- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire - now);
+ set_current_state(TASK_INTERRUPTIBLE);
+ expire = schedule_timeout_nsecs(expire - now);

ret = 0;
if (expire) {
struct timespec t;
- jiffies_to_timespec(expire, &t);
+ t = ns2timespec(expire);

ret = -ERESTART_RESTARTBLOCK;
if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
@@ -1168,7 +1338,7 @@ static long __sched nanosleep_restart(st
asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
{
struct timespec t;
- unsigned long expire;
+ nsec_t expire;
long ret;

if (copy_from_user(&t, rqtp, sizeof(t)))
@@ -1177,20 +1347,20 @@ asmlinkage long sys_nanosleep(struct tim
if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
return -EINVAL;

- expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire);
+ expire = timespec2ns(&t);
+ set_current_state(TASK_INTERRUPTIBLE);
+ expire = schedule_timeout_nsecs(expire);

ret = 0;
if (expire) {
struct restart_block *restart;
- jiffies_to_timespec(expire, &t);
+ t = ns2timespec(expire);
if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
return -EFAULT;

restart = &current_thread_info()->restart_block;
restart->fn = nanosleep_restart;
- restart->arg0 = jiffies + expire;
+ restart->arg0 = do_monotonic_clock() + expire;
restart->arg1 = (unsigned long) rmtp;
ret = -ERESTART_RESTARTBLOCK;
}