2001-10-31 11:36:40

by Tim Schmielau

[permalink] [raw]
Subject: [Patch] Re: Nasty suprise with uptime

The appended patch enables 32 bit linux boxes to display more than
497.1 days of uptime. No user land application changes are needed.

Credit is due to George Anzinger and the High-res-timers project
at https://sourceforge.net/projects/high-res-timers/, where I ripped
out the 64 bit jiffies patch. However, I do claim ownership on
all misdesign and bugs since this is my first kernel patch.

My Intel box just now displays a (faked, of course) uptime of 497 days,
2:38 hours, so the 32 jiffies value has wrapped without problems about 10
minutes ago.

Next step would be to decide what to do with the start_time field of
struct task_struct, which is still 32 bit, so ps on my box believes some
processes to have started 497 days ago. Probably there are tons of other
uses for the upper 32 bit of jiffies as well.

Tim



--- fs/proc/proc_misc.c.orig Wed Oct 31 04:14:05 2001
+++ fs/proc/proc_misc.c Wed Oct 31 11:07:31 2001
@@ -39,6 +39,7 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
+#include <asm/div64.h>


#define LOAD_INT(x) ((x) >> FSHIFT)
@@ -103,15 +104,19 @@
static int uptime_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- unsigned long uptime;
- unsigned long idle;
+ u64 uptime;
+ unsigned long idle, remainder;
int len;

- uptime = jiffies;
+ /* On 32 bit platforms there is a tiny window here
+ * for a race condition every 497.1 days */
+ uptime = jiffies_64;
+ remainder = (unsigned long) do_div(uptime, HZ);
idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;

- /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
- that would overflow about every five days at HZ == 100.
+ /* The formula for the fraction part of the idle time really is
+ ((t * 100) / HZ) % 100, but that would overflow about
+ every five days at HZ == 100.
Therefore the identity a = (a / b) * b + a % b is used so that it is
calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
The part in front of the '+' always evaluates as 0 (mod 100). All divisions
@@ -121,14 +126,14 @@
*/
#if HZ!=100
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- (((uptime % HZ) * 100) / HZ) % 100,
+ (unsigned long) uptime,
+ ((remainder * 100) / HZ) % 100,
idle / HZ,
(((idle % HZ) * 100) / HZ) % 100);
#else
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- uptime % HZ,
+ (unsigned long) uptime,
+ remainder,
idle / HZ,
idle % HZ);
#endif
--- kernel/itimer.c.orig Wed Oct 31 01:14:19 2001
+++ kernel/itimer.c Wed Oct 31 01:15:38 2001
@@ -34,10 +34,10 @@
return HZ*sec+usec;
}

-static void jiffiestotv(unsigned long jiffies, struct timeval *value)
+static void jiffiestotv(unsigned long _jiffies, struct timeval *value)
{
- value->tv_usec = (jiffies % HZ) * (1000000 / HZ);
- value->tv_sec = jiffies / HZ;
+ value->tv_usec = (_jiffies % HZ) * (1000000 / HZ);
+ value->tv_sec = _jiffies / HZ;
}

int do_getitimer(int which, struct itimerval *value)
--- kernel/timer.c.orig Tue Oct 30 23:35:44 2001
+++ kernel/timer.c Wed Oct 31 03:25:12 2001
@@ -65,7 +65,8 @@

extern int do_setitimer(int, struct itimerval *, struct itimerval *);

-unsigned long volatile jiffies;
+#define INITIAL_JIFFIES 0
+volatile u64 jiffies_64 = INITIAL_JIFFIES;

unsigned int * prof_buffer;
unsigned long prof_len;
@@ -117,7 +118,7 @@
INIT_LIST_HEAD(tv1.vec + i);
}

-static unsigned long timer_jiffies;
+static unsigned long timer_jiffies = INITIAL_JIFFIES;

static inline void internal_add_timer(struct timer_list *timer)
{
@@ -638,7 +639,7 @@
}

/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies;
+unsigned long wall_jiffies = INITIAL_JIFFIES;

/*
* This spinlock protect us from races in SMP while playing with xtime. -arca
@@ -673,7 +674,7 @@

void do_timer(struct pt_regs *regs)
{
- (*(unsigned long *)&jiffies)++;
+ (*(u64 *)&jiffies_64)++;
#ifndef CONFIG_SMP
/* SMP process accounting uses the local APIC timer */

--- kernel/ksyms.c.orig Wed Oct 31 00:53:08 2001
+++ kernel/ksyms.c Wed Oct 31 02:22:20 2001
@@ -435,7 +435,7 @@
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
EXPORT_SYMBOL(schedule);
EXPORT_SYMBOL(schedule_timeout);
-EXPORT_SYMBOL(jiffies);
+EXPORT_SYMBOL(jiffies_64);
EXPORT_SYMBOL(xtime);
EXPORT_SYMBOL(do_gettimeofday);
EXPORT_SYMBOL(do_settimeofday);
--- kernel/info.c.orig Wed Oct 31 00:57:24 2001
+++ kernel/info.c Wed Oct 31 10:38:50 2001
@@ -12,15 +12,21 @@
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
+#include <asm/div64.h>

asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
+ u64 uptime;

memset((char *)&val, 0, sizeof(struct sysinfo));

cli();
- val.uptime = jiffies / HZ;
+ /* On 32 bit platforms there is a tiny window here
+ * for a race condition every 497.1 days */
+ uptime = jiffies_64;
+ do_div(uptime, HZ);
+ val.uptime = (unsigned long) uptime;

val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
--- include/linux/time.h.orig Wed Oct 31 01:40:39 2001
+++ include/linux/time.h Wed Oct 31 01:41:33 2001
@@ -42,10 +42,10 @@
}

static __inline__ void
-jiffies_to_timespec(unsigned long jiffies, struct timespec *value)
+jiffies_to_timespec(unsigned long _jiffies, struct timespec *value)
{
- value->tv_nsec = (jiffies % HZ) * (1000000000L / HZ);
- value->tv_sec = jiffies / HZ;
+ value->tv_nsec = (_jiffies % HZ) * (1000000000L / HZ);
+ value->tv_sec = _jiffies / HZ;
}


--- include/linux/sched.h.orig Wed Oct 31 00:48:44 2001
+++ include/linux/sched.h Wed Oct 31 01:49:53 2001
@@ -550,7 +550,15 @@

#include <asm/current.h>

-extern unsigned long volatile jiffies;
+/*
+ * you MUST NOT read jiffies_64 without holding read_lock_irq(&xtime_lock)
+ */
+#if defined(__LITTLE_ENDIAN) || (BITS_PER_LONG > 32)
+#define jiffies (((volatile unsigned long *)&jiffies_64)[0])
+#else
+#define jiffies (((volatile unsigned long *)&jiffies_64)[1])
+#endif
+extern u64 volatile jiffies_64;
extern unsigned long itimer_ticks;
extern unsigned long itimer_next;
extern struct timeval xtime;
--- include/linux/coda_proc.h.orig Wed Oct 31 01:29:56 2001
+++ include/linux/coda_proc.h Wed Oct 31 01:30:24 2001
@@ -14,7 +14,7 @@

void coda_sysctl_init(void);
void coda_sysctl_clean(void);
-void coda_upcall_stats(int opcode, unsigned long jiffies);
+void coda_upcall_stats(int opcode, unsigned long _jiffies);

#include <linux/sysctl.h>
#include <linux/coda_fs_i.h>
--- net/ipv4/ipconfig.c.orig Wed Oct 31 01:44:35 2001
+++ net/ipv4/ipconfig.c Wed Oct 31 01:45:29 2001
@@ -630,7 +630,7 @@
/*
* Send DHCP/BOOTP request to single interface.
*/
-static void __init ic_bootp_send_if(struct ic_device *d, u32 jiffies)
+static void __init ic_bootp_send_if(struct ic_device *d, u32 _jiffies)
{
struct net_device *dev = d->dev;
struct sk_buff *skb;
@@ -677,7 +677,7 @@
b->your_ip = INADDR_NONE;
b->server_ip = INADDR_NONE;
memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
- b->secs = htons(jiffies / HZ);
+ b->secs = htons(_jiffies / HZ);
b->xid = d->xid;

/* add DHCP options or BOOTP extensions */
--- drivers/net/wan/dscc4.c.orig Wed Oct 31 01:45:49 2001
+++ drivers/net/wan/dscc4.c Wed Oct 31 01:46:23 2001
@@ -123,7 +123,7 @@
u32 next;
u32 data;
u32 complete;
- u32 jiffies; /* more hack to come :o) */
+ u32 _jiffies; /* more hack to come :o) */
};

struct RxFD {
--- drivers/block/floppy.c.orig Wed Oct 31 01:16:05 2001
+++ drivers/block/floppy.c Wed Oct 31 01:18:17 2001
@@ -635,7 +635,7 @@
static struct output_log {
unsigned char data;
unsigned char status;
- unsigned long jiffies;
+ unsigned long _jiffies;
} output_log[OLOGSIZE];

static int output_log_pos;
@@ -1163,7 +1163,7 @@
#ifdef FLOPPY_SANITY_CHECK
output_log[output_log_pos].data = byte;
output_log[output_log_pos].status = status;
- output_log[output_log_pos].jiffies = jiffies;
+ output_log[output_log_pos]._jiffies = jiffies;
output_log_pos = (output_log_pos + 1) % OLOGSIZE;
#endif
return 0;
@@ -1851,7 +1851,7 @@
printk("%2x %2x %lu\n",
output_log[(i+output_log_pos) % OLOGSIZE].data,
output_log[(i+output_log_pos) % OLOGSIZE].status,
- output_log[(i+output_log_pos) % OLOGSIZE].jiffies);
+ output_log[(i+output_log_pos) % OLOGSIZE]._jiffies);
printk("last result at %lu\n", resultjiffies);
printk("last redo_fd_request at %lu\n", lastredo);
for (i=0; i<resultsize; i++){


2001-10-31 13:40:46

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wednesday 31 October 2001 11:35, Tim Schmielau wrote:
> The appended patch enables 32 bit linux boxes to display more than
> 497.1 days of uptime. No user land application changes are needed.
>
> Credit is due to George Anzinger and the High-res-timers project
> at https://sourceforge.net/projects/high-res-timers/, where I ripped
> out the 64 bit jiffies patch. However, I do claim ownership on
> all misdesign and bugs since this is my first kernel patch.
>
> My Intel box just now displays a (faked, of course) uptime of 497 days,
> 2:38 hours, so the 32 jiffies value has wrapped without problems about 10
> minutes ago.
>
> Next step would be to decide what to do with the start_time field of
> struct task_struct, which is still 32 bit, so ps on my box believes some
> processes to have started 497 days ago. Probably there are tons of other
> uses for the upper 32 bit of jiffies as well.

Hmm.... 64bit jiffies are attractive.

I'd like to see less #defines in kernel
Some parts of your patch fight with the fact that jiffies
is converted to macro -> it is illegal now to have local vars
called "jiffies". This is ugly. I know that there are tons of similarly
(ab)used macros in the kernel now but let's stop adding more!

This test prog shows how to make overlapping 32bit and 64bit vars.
It works for me.

#include <stdio.h>
typedef unsigned long long u64;

extern u64 jiffies_64;
extern unsigned long jiffies;
extern unsigned long jiffies_hi;

asm(
" .bss\n"
" .align 8\n"
".globl jiffies_64\n"
".globl jiffies\n"
".globl jiffies_hi\n"
"jiffies_64:\n"
// <- a bunch of ifdefs needed here to sort out endianness stuff...
"jiffies:\n"
" .zero 4\n"
"jiffies_hi:\n"
" .zero 4\n"

//not working!? how to return to prev .data/.text/whatever?
//I don't know gas...
//" .previous\n"
);

int main() {
jiffies_64 = 0xFEDCBA9876543210UL;
printf("lo: 0x%08x\n",jiffies);
printf("hi: 0x%08x\n",jiffies_hi);
return 0;
}

Is this better or not? If not, why?
--
vda

2001-10-31 14:32:30

by Richard B. Johnson

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, vda wrote:

[SNIPPED...]

> Hmm.... 64bit jiffies are attractive.
>
> I'd like to see less #defines in kernel
> Some parts of your patch fight with the fact that jiffies
> is converted to macro -> it is illegal now to have local vars
> called "jiffies". This is ugly. I know that there are tons of similarly
> (ab)used macros in the kernel now but let's stop adding more!
>
> This test prog shows how to make overlapping 32bit and 64bit vars.
> It works for me.
>
> #include <stdio.h>
> typedef unsigned long long u64;
>
> extern u64 jiffies_64;
> extern unsigned long jiffies;
> extern unsigned long jiffies_hi;
>
> asm(
> " .bss\n"
> " .align 8\n"
> ".globl jiffies_64\n"
> ".globl jiffies\n"
> ".globl jiffies_hi\n"
> "jiffies_64:\n"
> // <- a bunch of ifdefs needed here to sort out endianness stuff...
> "jiffies:\n"
> " .zero 4\n"
> "jiffies_hi:\n"
> " .zero 4\n"
>
> //not working!? how to return to prev .data/.text/whatever?
> //I don't know gas...
> //" .previous\n"
> );
>
> int main() {
> jiffies_64 = 0xFEDCBA9876543210UL;
> printf("lo: 0x%08x\n",jiffies);
> printf("hi: 0x%08x\n",jiffies_hi);
> return 0;
> }
>
> Is this better or not? If not, why?
> --
> vda

The problem is that a 64-bit jiffies on a 32-bit machine would
require a spin-lock every time the jiffies variable is changed!
This is because there are two (or more) memory accesses for
every 64 bit operation, plus two or more register accesses for
every 64 bit operation. If a context-switch or an interrupt
occurs between those operations, all bets are off about the
result.

The appended small tar.gz file contains some 64-bit assembly
plus some 64-bit C, Look at the assembly and it will become
obvious to you that you don't want to use a 64-bit timer
on a 32 bit machine.


Cheers,
Dick Johnson

Penguin : Linux version 2.4.1 on an i686 machine (799.53 BogoMips).

I was going to compile a list of innovations that could be
attributed to Microsoft. Once I realized that Ctrl-Alt-Del
was handled in the BIOS, I found that there aren't any.


Attachments:
64bits.tar.gz (3.05 kB)

2001-10-31 18:18:32

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Richard B. Johnson wrote:

> On Wed, 31 Oct 2001, vda wrote:
>
> [SNIPPED...]
>
> > Hmm.... 64bit jiffies are attractive.
> >
> > I'd like to see less #defines in kernel
> > Some parts of your patch fight with the fact that jiffies
> > is converted to macro -> it is illegal now to have local vars
> > called "jiffies". This is ugly. I know that there are tons of similarly
> > (ab)used macros in the kernel now but let's stop adding more!
> >
> > This test prog shows how to make overlapping 32bit and 64bit vars.
> > It works for me.
> >

[asm snipped]

> >
> > Is this better or not? If not, why?
> > --
> > vda
>
> The problem is that a 64-bit jiffies on a 32-bit machine would
> require a spin-lock every time the jiffies variable is changed!
> This is because there are two (or more) memory accesses for
> every 64 bit operation, plus two or more register accesses for
> every 64 bit operation. If a context-switch or an interrupt
> occurs between those operations, all bets are off about the
> result.
>
> The appended small tar.gz file contains some 64-bit assembly
> plus some 64-bit C, Look at the assembly and it will become
> obvious to you that you don't want to use a 64-bit timer
> on a 32 bit machine.
>
>
> Cheers,
> Dick Johnson
>


The idea was that all drivers that use the 32 bit jiffies counter have to
be aware of the wraparound anyways, and won't see a difference.
The race only happens for 64 bit accesses to jiffies, but hey, without
the patch these values come out wrong _every_ time, so I believed a
tiny window for a single wrong display of uptime every 497.1 days to be
acceptable.

If we want to make sure to eliminate this possibility as well, the kind of
home-made synchronization may help that I came up with before considering
the race acceptable. It also has the advantage of not touching the jiffies
definition at all.

Tim

--- fs/proc/proc_misc.c.orig Wed Oct 31 17:45:08 2001
+++ fs/proc/proc_misc.c Wed Oct 31 18:49:19 2001
@@ -39,6 +39,7 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
+#include <asm/div64.h>


#define LOAD_INT(x) ((x) >> FSHIFT)
@@ -103,15 +104,28 @@
static int uptime_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- unsigned long uptime;
+ u64 uptime;
+ unsigned long jiffies_tmp, jiffies_high_tmp, remainder;
unsigned long idle;
int len;

- uptime = jiffies;
+ /* We need to make sure jiffies_high does not change while
+ * reading jiffies and jiffies_high */
+ do {
+ jiffies_high_tmp = jiffies_high_shadow;
+ barrier();
+ jiffies_tmp = jiffies;
+ barrier();
+ } while (jiffies_high != jiffies_high_tmp);
+
+ uptime = jiffies_tmp + ((u64)jiffies_high_tmp << BITS_PER_LONG);
+ remainder = (unsigned long) do_div(uptime, HZ);
+
idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;

- /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
- that would overflow about every five days at HZ == 100.
+ /* The formula for the fraction part of the idle time really is
+ ((t * 100) / HZ) % 100, but that would overflow about
+ every five days at HZ == 100.
Therefore the identity a = (a / b) * b + a % b is used so that it is
calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
The part in front of the '+' always evaluates as 0 (mod 100). All divisions
@@ -121,14 +135,14 @@
*/
#if HZ!=100
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- (((uptime % HZ) * 100) / HZ) % 100,
+ (unsigned long) uptime,
+ ((remainder * 100) / HZ) % 100,
idle / HZ,
(((idle % HZ) * 100) / HZ) % 100);
#else
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- uptime % HZ,
+ (unsigned long) uptime,
+ remainder,
idle / HZ,
idle % HZ);
#endif
--- kernel/timer.c.orig Wed Oct 31 17:24:36 2001
+++ kernel/timer.c Wed Oct 31 18:38:47 2001
@@ -65,7 +65,9 @@

extern int do_setitimer(int, struct itimerval *, struct itimerval *);

-unsigned long volatile jiffies;
+#define INITIAL_JIFFIES 0xFFFFD000ul
+unsigned long volatile jiffies = INITIAL_JIFFIES;
+unsigned long volatile jiffies_high, jiffies_high_shadow;

unsigned int * prof_buffer;
unsigned long prof_len;
@@ -117,7 +119,7 @@
INIT_LIST_HEAD(tv1.vec + i);
}

-static unsigned long timer_jiffies;
+static unsigned long timer_jiffies = INITIAL_JIFFIES;

static inline void internal_add_timer(struct timer_list *timer)
{
@@ -638,7 +640,7 @@
}

/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies;
+unsigned long wall_jiffies = INITIAL_JIFFIES;

/*
* This spinlock protect us from races in SMP while playing with xtime. -arca
@@ -673,7 +675,22 @@

void do_timer(struct pt_regs *regs)
{
- (*(unsigned long *)&jiffies)++;
+ /* we assume that two calls to do_timer can never overlap
+ * since they are one jiffie apart in time */
+ if (jiffies != 0xffffffffUL) {
+ jiffies++;
+ } else {
+ /* We still need to care about the race with readers of
+ * jiffies_high. Readers have to discard the values if
+ * jiffies_high != jiffies_high_shadow when read with
+ * proper barriers in between. */
+ jiffies_high++;
+ barrier();
+ jiffies++;
+ barrier();
+ jiffies_high_shadow = jiffies_high;
+ barrier();
+ }
#ifndef CONFIG_SMP
/* SMP process accounting uses the local APIC timer */

--- kernel/info.c.orig Wed Oct 31 17:58:25 2001
+++ kernel/info.c Wed Oct 31 18:48:52 2001
@@ -12,15 +12,28 @@
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
+#include <asm/div64.h>

asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
+ u64 uptime;
+ unsigned long jiffies_tmp, jiffies_high_tmp;

memset((char *)&val, 0, sizeof(struct sysinfo));

cli();
- val.uptime = jiffies / HZ;
+ /* We need to make sure jiffies_high does not change while
+ * reading jiffies and jiffies_high */
+ do {
+ jiffies_high_tmp = jiffies_high_shadow;
+ barrier();
+ jiffies_tmp = jiffies;
+ barrier();
+ } while (jiffies_high != jiffies_high_tmp);
+ uptime = jiffies_tmp + ((u64)jiffies_high_tmp << BITS_PER_LONG);
+ do_div(uptime, HZ);
+ val.uptime = uptime;

val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);

2001-10-31 18:36:42

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

Sorry, I forgot to set INITIAL_JIFFIES to zero before posting the patch.

Testing with high INITIAL_JIFFIES values unfortunately still discloses
instability after the wraparound even for an otherwise unpatched kernel.

I also forgot to mention that the introduced jiffies_high variable is
useless on 64 bit systems, so we might #ifdef it out there.

Tim


On Wed, 31 Oct 2001, Tim Schmielau wrote:

[parts of patch snipped]

> --- kernel/timer.c.orig Wed Oct 31 17:24:36 2001
> +++ kernel/timer.c Wed Oct 31 18:38:47 2001
> @@ -65,7 +65,9 @@
>
> extern int do_setitimer(int, struct itimerval *, struct itimerval *);
>
> -unsigned long volatile jiffies;
> +#define INITIAL_JIFFIES 0xFFFFD000ul
> +unsigned long volatile jiffies = INITIAL_JIFFIES;
> +unsigned long volatile jiffies_high, jiffies_high_shadow;
>

This of course needs to be
#define INITIAL_JIFFIES 0
for correct uptime display

2001-10-31 18:41:12

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Oct 31, 2001 19:16 +0100, Tim Schmielau wrote:
> The idea was that all drivers that use the 32 bit jiffies counter have to
> be aware of the wraparound anyways, and won't see a difference.

Agreed. I also like the change that you initialize jiffies to a pre-wrap
value, so the jiffies wrap bugs can more easily be found/fixed.

> The race only happens for 64 bit accesses to jiffies, but hey, without
> the patch these values come out wrong _every_ time, so I believed a
> tiny window for a single wrong display of uptime every 497.1 days to be
> acceptable.

I would say that the race is so rare that it should not be handled, especially
since it adds extra code in the timer interrupt.

> + /* We need to make sure jiffies_high does not change while
> + * reading jiffies and jiffies_high */
> + do {
> + jiffies_high_tmp = jiffies_high_shadow;
> + barrier();
> + jiffies_tmp = jiffies;
> + barrier();
> + } while (jiffies_high != jiffies_high_tmp);

Maybe this could be condensed into a macro/inline, so that people don't
screw it up (and it looks cleaner). Like get_jiffies64() or so, for
those few places that really care about the full value and can't stand
a miniscule chance of a race (i.e. uptime output is not a candidate).

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2001-10-31 18:55:53

by Gerhard Mack

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

Why exactly do we use the jiffie count for calculating uptime? Why not
just record the startup time and compare when needed?


Gerhard



On Wed, 31 Oct 2001, Andreas Dilger wrote:

> On Oct 31, 2001 19:16 +0100, Tim Schmielau wrote:
> > The idea was that all drivers that use the 32 bit jiffies counter have to
> > be aware of the wraparound anyways, and won't see a difference.
>
> Agreed. I also like the change that you initialize jiffies to a pre-wrap
> value, so the jiffies wrap bugs can more easily be found/fixed.
>
> > The race only happens for 64 bit accesses to jiffies, but hey, without
> > the patch these values come out wrong _every_ time, so I believed a
> > tiny window for a single wrong display of uptime every 497.1 days to be
> > acceptable.
>
> I would say that the race is so rare that it should not be handled, especially
> since it adds extra code in the timer interrupt.
>
> > + /* We need to make sure jiffies_high does not change while
> > + * reading jiffies and jiffies_high */
> > + do {
> > + jiffies_high_tmp = jiffies_high_shadow;
> > + barrier();
> > + jiffies_tmp = jiffies;
> > + barrier();
> > + } while (jiffies_high != jiffies_high_tmp);
>
> Maybe this could be condensed into a macro/inline, so that people don't
> screw it up (and it looks cleaner). Like get_jiffies64() or so, for
> those few places that really care about the full value and can't stand
> a miniscule chance of a race (i.e. uptime output is not a candidate).
>
> Cheers, Andreas
> --
> Andreas Dilger
> http://sourceforge.net/projects/ext2resize/
> http://www-mddsp.enel.ucalgary.ca/People/adilger/
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

--
Gerhard Mack

[email protected]

<>< As a computer I find your faith in technology amusing.

2001-10-31 19:00:43

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Oct 31, 2001 19:35 +0100, Tim Schmielau wrote:
> Sorry, I forgot to set INITIAL_JIFFIES to zero before posting the patch.

Actually, it SHOULD stay non-zero to fix exactly those problems, otherwise
you need to wait a long time to test it... And the Linux development
process is "developers can't possibly test everything, so users need to
do testing as well.

> Testing with high INITIAL_JIFFIES values unfortunately still discloses
> instability after the wraparound even for an otherwise unpatched kernel.

Exactly my point. Leave it in.

> This of course needs to be
> #define INITIAL_JIFFIES 0
> for correct uptime display

Maybe for uptime display only, you could subtract INITIAL_JIFFIES from
jiffies, for printing, but leave the initial value as non-zero?

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2001-10-31 19:14:29

by Richard B. Johnson

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Gerhard Mack wrote:

> Why exactly do we use the jiffie count for calculating uptime? Why not
> just record the startup time and compare when needed?
>
>
> Gerhard
>
Because you get it for free. The counter is necessary for time-outs
so you need it. If it starts at zero, you get uptime in HZ.

Cheers,
Dick Johnson

Penguin : Linux version 2.4.1 on an i686 machine (799.53 BogoMips).

I was going to compile a list of innovations that could be
attributed to Microsoft. Once I realized that Ctrl-Alt-Del
was handled in the BIOS, I found that there aren't any.


2001-10-31 19:26:37

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

> > I would say that the race is so rare that it should not be handled,
> > especially since it adds extra code in the timer interrupt.
>

The expensive code in the timer interupt will be executed every
497.1 days, so that's bearable.


> Race will be handled by readers of 64bit jiffy. There won't be many of them.
>
> > > + /* We need to make sure jiffies_high does not change while
> > > + * reading jiffies and jiffies_high */
> > > + do {
> > > + jiffies_high_tmp = jiffies_high_shadow;
> > > + barrier();
> > > + jiffies_tmp = jiffies;
> > > + barrier();
> > > + } while (jiffies_high != jiffies_high_tmp);
> >
> > Maybe this could be condensed into a macro/inline, so that people don't
> > screw it up (and it looks cleaner). Like get_jiffies64() or so, for
>
> Inline! Inline! Don't use macro unless you must!
>
> // extern or static? which is correct?
> // I see both types in kernel .h :-(
> extern inline u64 get_jiffies64() {
> unsigned long hi,lo;
> do {
> hi = jiffies_hi;
> barrier();
> lo = jiffies;
> barrier();
> } while (hi != jiffies_hi);
> return lo + (((u64)hi) << 32);
> }

should be
return lo + (((u64)hi) << BITS_PER_LONG);
to not break 64 bit platforms.

Admittedly, I broke 64 bit myself by submitting a wrong version of my
patch. The condition
if (jiffies != 0xffffffffUL) {
jiffies++;
in do_timer needs to be
if (jiffies != (unsigned long)-1) {
jiffies++;


>
> And guys, why you don't comment on my hand-crafted asm version of
> union { u32 jiffies; u64 jiffies64; }; ? Is it flawed?
> --
> vda
>

I believe not, but it is platform-specific.

Tim

2001-10-31 19:52:10

by Richard B. Johnson

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Tim Schmielau wrote:

> > > I would say that the race is so rare that it should not be handled,
> > > especially since it adds extra code in the timer interrupt.
> >
>
> The expensive code in the timer interupt will be executed every
> 497.1 days, so that's bearable.

The expensive code in the timer interrupt will be executed every
timer interrupt! There is no magic way to connect two 32-bit long-words
to make a 64-bit object, no matter how well it's hidden by the 'C'
compiler. An increment of the low long-word won't magically bump the
high long-word. This takes code, and the simplist code to do it was
shown. This:
adcl $1,(jiffies) # INCL won't set CY 4 clocks
adcl $0,(jiffies_hi) # 4 clocks
... takes 8 clocks.
This:
incl (jiffies)
... takes 2 clocks

Also gcc isn't as "kind" as this. It generates volumes of strange
code to bump a 'long long'.

That's 6 extra clocks every Hz or 600 clocks per second. By the time
you've reached the 497.1 days, you have wasted.... 0xffffffff/6 =
715,827,882 CPU clocks just so 'uptime' is correct? I don't think
so. I'd reboot.


Cheers,
Dick Johnson

Penguin : Linux version 2.4.1 on an i686 machine (799.53 BogoMips).

I was going to compile a list of innovations that could be
attributed to Microsoft. Once I realized that Ctrl-Alt-Del
was handled in the BIOS, I found that there aren't any.


2001-10-31 20:00:20

by J Sloan

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

"Richard B. Johnson" wrote:

> That's 6 extra clocks every Hz or 600 clocks per second. By the time
> you've reached the 497.1 days, you have wasted.... 0xffffffff/6 =
> 715,827,882 CPU clocks just so 'uptime' is correct? I don't think
> so. I'd reboot.

Actually, you don't need to reboot.

The mail/dns servers I mentioned are
running fine, processing smtp and pop3
mail without a hitch, serving up dns info
for 230 dns zones, and providing ntpd
and big brother services for quite a few
other linux and linux-like systems here.

You only need to reboot if you demand
that the uptime counter be correct -

I just add 497 days for now....

cu

jjs


2001-10-31 20:10:30

by Gerhard Mack

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Richard B. Johnson wrote:

> On Wed, 31 Oct 2001, Gerhard Mack wrote:
>
> > Why exactly do we use the jiffie count for calculating uptime? Why not
> > just record the startup time and compare when needed?
> >
> >
> > Gerhard
> >
> Because you get it for free. The counter is necessary for time-outs
> so you need it. If it starts at zero, you get uptime in HZ.

Yes that I understand and it works right up until the jiffie count wraps.
But now we have people adding cost to everything else just so we can all
have good uptime values. Since AFIK the drivers handle the wrap cleanly
the only thing that it bothers is the uptime stats.

Now we have people making jiffies more expensive just to deal with uptime.
At least as far as I can see it should just be easier/better to make
uptime use something else.

Or am I completly off base?


Gerhard



--
Gerhard Mack

[email protected]

<>< As a computer I find your faith in technology amusing.

2001-10-31 20:20:30

by Charles Cazabon

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

Richard B. Johnson <[email protected]> wrote:
>
> That's 6 extra clocks every Hz or 600 clocks per second. By the time
> you've reached the 497.1 days, you have wasted.... 0xffffffff/6 =
> 715,827,882 CPU clocks just so 'uptime' is correct? I don't think
> so. I'd reboot.

For proportion: 716 million CPU clocks, on an "average" PC today is
less than one second of CPU time. Over the course of 497 days, that's
not much overhead at all.

Charles
--
-----------------------------------------------------------------------
Charles Cazabon <[email protected]>
GPL'ed software available at: http://www.qcc.sk.ca/~charlesc/software/
-----------------------------------------------------------------------

2001-10-31 20:33:30

by Kurt Roeckx

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, Oct 31, 2001 at 02:20:44PM -0600, Charles Cazabon wrote:
> Richard B. Johnson <[email protected]> wrote:
> >
> > That's 6 extra clocks every Hz or 600 clocks per second. By the time
> > you've reached the 497.1 days, you have wasted.... 0xffffffff/6 =
> > 715,827,882 CPU clocks just so 'uptime' is correct? I don't think
> > so. I'd reboot.
>
> For proportion: 716 million CPU clocks, on an "average" PC today is
> less than one second of CPU time. Over the course of 497 days, that's
> not much overhead at all.

Not to mention that it takes alot more clocks to setup and return
from the interupt.


Kurt

2001-10-31 20:39:50

by Richard B. Johnson

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Gerhard Mack wrote:

> On Wed, 31 Oct 2001, Richard B. Johnson wrote:
>
> > On Wed, 31 Oct 2001, Gerhard Mack wrote:
> >
> > > Why exactly do we use the jiffie count for calculating uptime? Why not
> > > just record the startup time and compare when needed?
> > >
> > >
> > > Gerhard
> > >
> > Because you get it for free. The counter is necessary for time-outs
> > so you need it. If it starts at zero, you get uptime in HZ.
>
> Yes that I understand and it works right up until the jiffie count wraps.
> But now we have people adding cost to everything else just so we can all
> have good uptime values. Since AFIK the drivers handle the wrap cleanly
> the only thing that it bothers is the uptime stats.
>
> Now we have people making jiffies more expensive just to deal with uptime.
> At least as far as I can see it should just be easier/better to make
> uptime use something else.
>
> Or am I completly off base?
>
>
> Gerhard

I think that if you just saved time_t (the time() seconds variable,
after the system time has been set, it would be a good win.

Unfortunately, the time gets set either from user-mode via a sys-call,
or initially from CMOS. If the boot time was set during this initial CMOS
setting, it could be very wrong. If it gets set every time the system-call
is executed, it becomes "time since last time the time was set".

A fix exists, though. The existing, possibly wrong, boot time could
be grabbed during the set-time sys-call, the difference between the
existing (wrong) time and the boot time could be calculated, the
new (correct) time could be set, then the boot-time would be changed
to the new correct time, minus the calculated difference. This
would result in a correct boot-time.

dif_time = time - boot_time;
time = new_time;
boot_time = time - dif_time;

Uptime could then be present time minus boot time. No mucking with
timer ticks are required.

BUT, and the big BUT. Demonstration variable 'time' above, isn't
a unique variable. It's derived from jiffies / HZ. So, we are back
to the same problem of 32-bits.

Cheers,
Dick Johnson

Penguin : Linux version 2.4.1 on an i686 machine (799.53 BogoMips).

I was going to compile a list of innovations that could be
attributed to Microsoft. Once I realized that Ctrl-Alt-Del
was handled in the BIOS, I found that there aren't any.


2001-10-31 20:48:00

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, vda wrote:

> On Wednesday 31 October 2001 18:40, Andreas Dilger wrote:
> > On Oct 31, 2001 19:16 +0100, Tim Schmielau wrote:
> > > The idea was that all drivers that use the 32 bit jiffies counter have to
> > > be aware of the wraparound anyways, and won't see a difference.
> >
> > Agreed. I also like the change that you initialize jiffies to a pre-wrap
> > value, so the jiffies wrap bugs can more easily be found/fixed.
>
> Yes indeed. We should chase bugs, not run away from them :-)
> I seriously suggest setting juffy to close to wrap value
> if kernel hacking is enabled.
>
> > I would say that the race is so rare that it should not be handled,
> > especially since it adds extra code in the timer interrupt.
>
> Race will be handled by readers of 64bit jiffy. There won't be many of them.
>
> > > + /* We need to make sure jiffies_high does not change while
> > > + * reading jiffies and jiffies_high */
> > > + do {
> > > + jiffies_high_tmp = jiffies_high_shadow;
> > > + barrier();
> > > + jiffies_tmp = jiffies;
> > > + barrier();
> > > + } while (jiffies_high != jiffies_high_tmp);
> >
> > Maybe this could be condensed into a macro/inline, so that people don't
> > screw it up (and it looks cleaner). Like get_jiffies64() or so, for
>
> Inline! Inline! Don't use macro unless you must!
>
> // extern or static? which is correct?
> // I see both types in kernel .h :-(
> extern inline u64 get_jiffies64() {
> unsigned long hi,lo;
> do {
> hi = jiffies_hi;
> barrier();
> lo = jiffies;
> barrier();
> } while (hi != jiffies_hi);
> return lo + (((u64)hi) << 32);
> }
>

OK, I introduced get_jiffies64, corrected my 64 bit mistake and
subtract INITIAL_JIFFIES to obtain uptime, while leaving it at at pre-wrap
value for error-chasing.
Still this patch introduces jiffies_high on 64 bit platforms which will be
useless until the year 571234830.

************************************************************
* DISCLAIMER: This patch WILL make your linux box unstable *
* unless you set INITIAL_JIFFIES to zero!!! *
************************************************************

btw.: can someone please explain to me why do_timer uses
(*(unsigned long *)&jiffies)++;
instead of just doing jiffies++ ?

Tim



--- fs/proc/proc_misc.c.orig Wed Oct 31 17:45:08 2001
+++ fs/proc/proc_misc.c Wed Oct 31 21:07:11 2001
@@ -39,6 +39,7 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
+#include <asm/div64.h>


#define LOAD_INT(x) ((x) >> FSHIFT)
@@ -103,15 +104,19 @@
static int uptime_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- unsigned long uptime;
+ u64 uptime;
+ unsigned long remainder;
unsigned long idle;
int len;

- uptime = jiffies;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ remainder = (unsigned long) do_div(uptime, HZ);
+
idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;

- /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
- that would overflow about every five days at HZ == 100.
+ /* The formula for the fraction part of the idle time really is
+ ((t * 100) / HZ) % 100, but that would overflow about
+ every five days at HZ == 100.
Therefore the identity a = (a / b) * b + a % b is used so that it is
calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
The part in front of the '+' always evaluates as 0 (mod 100). All divisions
@@ -121,14 +126,14 @@
*/
#if HZ!=100
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- (((uptime % HZ) * 100) / HZ) % 100,
+ (unsigned long) uptime,
+ ((remainder * 100) / HZ) % 100,
idle / HZ,
(((idle % HZ) * 100) / HZ) % 100);
#else
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- uptime % HZ,
+ (unsigned long) uptime,
+ remainder,
idle / HZ,
idle % HZ);
#endif
--- kernel/timer.c.orig Wed Oct 31 17:24:36 2001
+++ kernel/timer.c Wed Oct 31 21:10:39 2001
@@ -65,7 +65,8 @@

extern int do_setitimer(int, struct itimerval *, struct itimerval *);

-unsigned long volatile jiffies;
+unsigned long volatile jiffies = INITIAL_JIFFIES;
+unsigned long volatile jiffies_hi, jiffies_hi_shadow;

unsigned int * prof_buffer;
unsigned long prof_len;
@@ -117,7 +118,7 @@
INIT_LIST_HEAD(tv1.vec + i);
}

-static unsigned long timer_jiffies;
+static unsigned long timer_jiffies = INITIAL_JIFFIES;

static inline void internal_add_timer(struct timer_list *timer)
{
@@ -638,7 +639,7 @@
}

/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies;
+unsigned long wall_jiffies = INITIAL_JIFFIES;

/*
* This spinlock protect us from races in SMP while playing with xtime. -arca
@@ -673,7 +674,22 @@

void do_timer(struct pt_regs *regs)
{
- (*(unsigned long *)&jiffies)++;
+ /* we assume that two calls to do_timer can never overlap
+ * since they are one jiffie apart in time */
+ if (jiffies != (unsigned long)(-1)) {
+ jiffies++;
+ } else {
+ /* We still need to care about the race with readers of
+ * jiffies_hi. Readers have to discard the values if
+ * jiffies_hi != jiffies_hi_shadow when read with
+ * proper barriers in between. */
+ jiffies_hi++;
+ barrier();
+ jiffies++;
+ barrier();
+ jiffies_hi_shadow = jiffies_hi;
+ barrier();
+ }
#ifndef CONFIG_SMP
/* SMP process accounting uses the local APIC timer */

--- kernel/info.c.orig Wed Oct 31 17:58:25 2001
+++ kernel/info.c Wed Oct 31 21:07:28 2001
@@ -12,15 +12,19 @@
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
+#include <asm/div64.h>

asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
+ u64 uptime;

memset((char *)&val, 0, sizeof(struct sysinfo));

cli();
- val.uptime = jiffies / HZ;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ do_div(uptime, HZ);
+ val.uptime = uptime;

val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
--- include/linux/sched.h.orig Wed Oct 31 17:42:41 2001
+++ include/linux/sched.h Wed Oct 31 21:09:21 2001
@@ -543,7 +543,21 @@

#include <asm/current.h>

-extern unsigned long volatile jiffies, jiffies_high, jiffies_high_shadow;
+#define INITIAL_JIFFIES 0xFFFFD000ul
+extern unsigned long volatile jiffies, jiffies_hi, jiffies_hi_shadow;
+static inline u64 get_jiffies64() {
+ unsigned long hi,lo;
+ /* We need to make sure jiffies_hi does not change while
+ * reading jiffies and jiffies_hi */
+ do {
+ hi = jiffies_hi;
+ barrier();
+ lo = jiffies;
+ barrier();
+ } while (hi != jiffies_hi);
+ return lo + (((u64)hi) << BITS_PER_LONG);
+}
+
extern unsigned long itimer_ticks;
extern unsigned long itimer_next;
extern struct timeval xtime;

2001-10-31 20:53:40

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Oct 31, 2001 12:11 -0800, Gerhard Mack wrote:
> Yes that I understand and it works right up until the jiffie count wraps.
> But now we have people adding cost to everything else just so we can all
> have good uptime values. Since AFIK the drivers handle the wrap cleanly
> the only thing that it bothers is the uptime stats.
>
> Now we have people making jiffies more expensive just to deal with uptime.
> At least as far as I can see it should just be easier/better to make
> uptime use something else.

What about the following. Since jiffies wraps are extremely rare, it
should be enough to have something along the lines of the following
in the uptime code only (or globally accessible for any code that
needs to use a full 64-bit jiffies value):

u64 get_jiffies64(void)
{
static unsigned long jiffies_hi = 0;
static unsigned long jiffies_last = INITIAL_JIFFIES;

/* probably need locking for this part */
if (jiffies < jiffies_last) { /* We have a wrap */
jiffies_hi++;
jiffies_last = jiffies;
}

return (jiffies | ((u64)jiffies_hi) << LONG_SHIFT));
}

This means you need to call something that _checks_ the uptime
(or needs the 64-bit jiffies value) at least once every 1.3 years.
If you don't do it at least that often, you probably don't care
about the uptime anyways.

This only impacts anything that really needs a 64-bit jiffies count,
and has zero impact everywhere else.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2001-10-31 21:06:31

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Andreas Dilger wrote:

> What about the following. Since jiffies wraps are extremely rare, it
> should be enough to have something along the lines of the following
> in the uptime code only (or globally accessible for any code that
> needs to use a full 64-bit jiffies value):
>
> u64 get_jiffies64(void)
> {
> static unsigned long jiffies_hi = 0;
> static unsigned long jiffies_last = INITIAL_JIFFIES;
>
> /* probably need locking for this part */
> if (jiffies < jiffies_last) { /* We have a wrap */
> jiffies_hi++;
> jiffies_last = jiffies;
> }
>
> return (jiffies | ((u64)jiffies_hi) << LONG_SHIFT));
> }
>
> This means you need to call something that _checks_ the uptime
> (or needs the 64-bit jiffies value) at least once every 1.3 years.
> If you don't do it at least that often, you probably don't care
> about the uptime anyways.
>
> This only impacts anything that really needs a 64-bit jiffies count,
> and has zero impact everywhere else.
>

I initially thought of that too. My objection was that boxes with long
uptimes typically get forgotten in a corner until years later someone
checks uptime again.

However, I fully agree with your importance argument and believe this
proposal to be the best one.

Tim

2001-10-31 21:09:21

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Oct 31, 2001 21:47 +0100, Tim Schmielau wrote:
> OK, I introduced get_jiffies64, corrected my 64 bit mistake and
> subtract INITIAL_JIFFIES to obtain uptime, while leaving it at at pre-wrap
> value for error-chasing.
> Still this patch introduces jiffies_high on 64 bit platforms which will be
> useless until the year 571234830.

It probably won't be accepted until the whole thing disappears for 64-bit
systems.

> btw.: can someone please explain to me why do_timer uses
> (*(unsigned long *)&jiffies)++;
> instead of just doing jiffies++ ?

Can't say for sure, but it may be a compiler issue, maybe historical.


> */
> #if HZ!=100
> len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> - uptime / HZ,
> - (((uptime % HZ) * 100) / HZ) % 100,
> + (unsigned long) uptime,
> + ((remainder * 100) / HZ) % 100,
> idle / HZ,
> (((idle % HZ) * 100) / HZ) % 100);

Given that uptime is now 64-bits, do we need this mathematical hoop jumping?
Also, won't the remainder already be modulus HZ in this case?

> void do_timer(struct pt_regs *regs)
> {
> - (*(unsigned long *)&jiffies)++;
> + /* we assume that two calls to do_timer can never overlap
> + * since they are one jiffie apart in time */
> + if (jiffies != (unsigned long)(-1)) {
> + jiffies++;
> + } else {
> + /* We still need to care about the race with readers of
> + * jiffies_hi. Readers have to discard the values if
> + * jiffies_hi != jiffies_hi_shadow when read with
> + * proper barriers in between. */
> + jiffies_hi++;
> + barrier();
> + jiffies++;
> + barrier();
> + jiffies_hi_shadow = jiffies_hi;
> + barrier();
> + }

I think this is the part of the patch that people object to. See my
other posting of how to handle 64-bit jiffies with only an impact to
users of the 64-bit value.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2001-10-31 21:11:41

by Richard B. Johnson

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Tim Schmielau wrote:

> On Wed, 31 Oct 2001, vda wrote:
>
[SNIPPED...]
>
> btw.: can someone please explain to me why do_timer uses
> (*(unsigned long *)&jiffies)++;
> instead of just doing jiffies++ ?
>
> Tim

It's an attempt to prevent the 'C' compiler from doing this:

movl (jiffies), %eax # Read
incl %eax # Modify
movl %eax, (jiffies) # Write

Instead, we want it to do this:

movl jiffies, %ebx # init a pointer
incl (%ebx) # Modify directly

With some gcc versions, it works. Others, it doesn't hurt.

Cheers,
Dick Johnson

Penguin : Linux version 2.4.1 on an i686 machine (799.53 BogoMips).

I was going to compile a list of innovations that could be
attributed to Microsoft. Once I realized that Ctrl-Alt-Del
was handled in the BIOS, I found that there aren't any.


2001-10-31 21:12:41

by Petr Vandrovec

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On 31 Oct 01 at 15:39, Richard B. Johnson wrote:
> A fix exists, though. The existing, possibly wrong, boot time could
> be grabbed during the set-time sys-call, the difference between the
> existing (wrong) time and the boot time could be calculated, the
> new (correct) time could be set, then the boot-time would be changed
> to the new correct time, minus the calculated difference. This
> would result in a correct boot-time.
>
> dif_time = time - boot_time;
> time = new_time;
> boot_time = time - dif_time;
>
> Uptime could then be present time minus boot time. No mucking with
> timer ticks are required.

Hi,
this reminds me. Year or so ago there was patch from someone, which
detected jiffies overflow in /proc/uptime proc_read() code, so only thing
you had to do was run 'uptime', 'w', 'top' or something like that
every 497 days - you can schedule it as cron job for Jan 1, 0:00:00,
to find some workoholics.

Patch was something like:

...
static unsigned long long jiffies_hi = 0;
static unsigned long old_jiffies = 0;
unsigned long jiffy_cache;

/* some spinlock or inode lock or something like that */
jiffy_cache = jiffies;
if (jiffy_cache < old_jiffies) {
jiffies_hi += 1ULL << BITS_PER_LONG;
}
old_jiffies = jiffy_cache;
grand_jiffies = jiffies_hi + jiffy_cache;
/* unlock */
/* now do division and all strange things to format number for userspace */
...

Even if you add running uptime every year to your crontab,
it will consume less than 700,000,000 CPU cycles consumed by just 64bit
inc in timer interrupt.
Best regards,
Petr Vandrovec
[email protected]

2001-10-31 21:17:21

by Richard B. Johnson

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Andreas Dilger wrote:

> On Oct 31, 2001 12:11 -0800, Gerhard Mack wrote:
> > Yes that I understand and it works right up until the jiffie count wraps.
> > But now we have people adding cost to everything else just so we can all
> > have good uptime values. Since AFIK the drivers handle the wrap cleanly
> > the only thing that it bothers is the uptime stats.
> >
> > Now we have people making jiffies more expensive just to deal with uptime.
> > At least as far as I can see it should just be easier/better to make
> > uptime use something else.
>
> What about the following. Since jiffies wraps are extremely rare, it
> should be enough to have something along the lines of the following
> in the uptime code only (or globally accessible for any code that
> needs to use a full 64-bit jiffies value):
>
> u64 get_jiffies64(void)
> {
> static unsigned long jiffies_hi = 0;
> static unsigned long jiffies_last = INITIAL_JIFFIES;
>
> /* probably need locking for this part */
> if (jiffies < jiffies_last) { /* We have a wrap */
> jiffies_hi++;
> jiffies_last = jiffies;
> }
>
> return (jiffies | ((u64)jiffies_hi) << LONG_SHIFT));
> }
>
> This means you need to call something that _checks_ the uptime
> (or needs the 64-bit jiffies value) at least once every 1.3 years.
> If you don't do it at least that often, you probably don't care
> about the uptime anyways.
>
> This only impacts anything that really needs a 64-bit jiffies count,
> and has zero impact everywhere else.
>
> Cheers, Andreas

Ah, yes. It's perfect. It could be put right in the 'uptime' code.
It has zero overhead otherwise. Just check the uptime every year
or so and it's always correct. A dummy call to your procedure
every time the system time is set would make it robust.

Cheers,
Dick Johnson

Penguin : Linux version 2.4.1 on an i686 machine (799.53 BogoMips).

I was going to compile a list of innovations that could be
attributed to Microsoft. Once I realized that Ctrl-Alt-Del
was handled in the BIOS, I found that there aren't any.


2001-10-31 21:41:45

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Oct 31, 2001 22:05 +0100, Tim Schmielau wrote:
> > This means you need to call something that _checks_ the uptime
> > (or needs the 64-bit jiffies value) at least once every 1.3 years.
> > If you don't do it at least that often, you probably don't care
> > about the uptime anyways.
> >
> > This only impacts anything that really needs a 64-bit jiffies count,
> > and has zero impact everywhere else.
>
> I initially thought of that too. My objection was that boxes with long
> uptimes typically get forgotten in a corner until years later someone
> checks uptime again.
>
> However, I fully agree with your importance argument and believe this
> proposal to be the best one.

Note that there are several tools that check the uptime, not just the
"uptime" command. For example "top" and "w" also display the uptime
value (reading /proc/uptime). But yes, if people don't check on their
boxes in a year, then this method will lose full multiples of 1.3 years
between checks. Probably not a big deal - we can assume such a system
is an "appliance" at that point, regardless of what kind of real system
it is. If someone wants to ensure their uptime is correct, they can
always put "[ -r /proc/uptime ] && cat /proc/uptime > /dev/null" in their
cron.monthly directory.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2001-10-31 21:46:57

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Oct 31, 2001 22:11 +0000, Petr Vandrovec wrote:
> this reminds me. Year or so ago there was patch from someone, which
> detected jiffies overflow in /proc/uptime proc_read() code, so only thing
> you had to do was run 'uptime', 'w', 'top' or something like that
> every 497 days - you can schedule it as cron job for Jan 1, 0:00:00,
> to find some workoholics.

Sorry, your posting is 14 minutes too late ;-). I just re-invented this
wheel. Such is the pace of Linux kernel development.

> static unsigned long long jiffies_hi = 0;
> static unsigned long old_jiffies = 0;
> unsigned long jiffy_cache;
>
> /* some spinlock or inode lock or something like that */
> jiffy_cache = jiffies;
> if (jiffy_cache < old_jiffies) {
> jiffies_hi += 1ULL << BITS_PER_LONG;
> }

Ah, my code only stored the high 32 bits of the jiffies_hi, so we don't
ever do 64-bit math for this, only a simple increment. Otherwise it is
exactly the same, including the "some spinlock or something" comment ;-).

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2001-10-31 22:54:00

by George Anzinger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

I tried to post this this AM, but it was a no show. Here it is again:

First I want to thank Tim for pointing out (by usage) the do_div()
macro. This has troubled me for some time. Thanks Tim.

As to the method to use for 64-bit jiffies, the one Tim used was
discussed on the list a short time ago under the subject "How should we
do a 64-bit jiffies?" (thread started by me on 10/22/01). The asm
overlay was discussed as well as other methods. Linus proposed the
version that Tim used here and that I am using in the high-res-timers
patch. (See: http://sourceforge.net/projects/high-res-timers) And, yes
it does make it a NO-NO to have a local variable (or a struct member)
named jiffies and I am sure that we will find other uses of such as
additional drivers and platforms are tried.
http://sourceforge.net/projects/high-res-timers
On the issue of locking, jiffies (in the current system) is updated
under the xtime_lock, so this is the lock one would use to read it if
you cared, however, as patched, almost NO ONE cares or needs to. Just
using the low 32 bits and the existing compare macros will do just fine,
thank you. For those that care, i.e. uptime, CLOCK_MONOTOINIC (and
other code in POSIX timers), either the xtime_lock (a read/write lock by
the way, not a spin lock) can be taken.

Here Tim is proposing a while loop which, IMHO, will fail in the SMP
case, but does handle the UP interrupt case quite nicely.

Tim Schmielau wrote:
>
> On Wed, 31 Oct 2001, Richard B. Johnson wrote:
>
> > On Wed, 31 Oct 2001, vda wrote:
> >
> > [SNIPPED...]
> >
> > > Hmm.... 64bit jiffies are attractive.
> > >
> > > I'd like to see less #defines in kernel
> > > Some parts of your patch fight with the fact that jiffies
> > > is converted to macro -> it is illegal now to have local vars
> > > called "jiffies". This is ugly. I know that there are tons of similarly
> > > (ab)used macros in the kernel now but let's stop adding more!
> > >
> > > This test prog shows how to make overlapping 32bit and 64bit vars.
> > > It works for me.
> > >
>
> [asm snipped]
>
> > >
> > > Is this better or not? If not, why?

The principle problem is platform dependence.

> > > --
> > > vda
> >
> > The problem is that a 64-bit jiffies on a 32-bit machine would
> > require a spin-lock every time the jiffies variable is changed!

Ah, but it is only changed in the timer interrupt which is already under
a write_irq lock.

> > This is because there are two (or more) memory accesses for
> > every 64 bit operation, plus two or more register accesses for
> > every 64 bit operation. If a context-switch or an interrupt
> > occurs between those operations, all bets are off about the
> > result.
> >
> > The appended small tar.gz file contains some 64-bit assembly
> > plus some 64-bit C, Look at the assembly and it will become
> > obvious to you that you don't want to use a 64-bit timer
> > on a 32 bit machine.
> >
> >
> > Cheers,
> > Dick Johnson
> >
>
> The idea was that all drivers that use the 32 bit jiffies counter have to
> be aware of the wraparound anyways, and won't see a difference.
> The race only happens for 64 bit accesses to jiffies, but hey, without
> the patch these values come out wrong _every_ time, so I believed a
> tiny window for a single wrong display of uptime every 497.1 days to be
> acceptable.

For display, ok, but not for CLOCK_MONOTONIC and other POSIX timers
usage :)
>
> If we want to make sure to eliminate this possibility as well, the kind of
> home-made synchronization may help that I came up with before considering
> the race acceptable. It also has the advantage of not touching the jiffies
> definition at all.

As pointed out above, this works for interrupts on UP machines, but,
IMHO, will fail on SMP machines.

George

>
> Tim
>
> --- fs/proc/proc_misc.c.orig Wed Oct 31 17:45:08 2001
> +++ fs/proc/proc_misc.c Wed Oct 31 18:49:19 2001
> @@ -39,6 +39,7 @@
> #include <asm/uaccess.h>
> #include <asm/pgtable.h>
> #include <asm/io.h>
> +#include <asm/div64.h>
>
> #define LOAD_INT(x) ((x) >> FSHIFT)
> @@ -103,15 +104,28 @@
> static int uptime_read_proc(char *page, char **start, off_t off,
> int count, int *eof, void *data)
> {
> - unsigned long uptime;
> + u64 uptime;
> + unsigned long jiffies_tmp, jiffies_high_tmp, remainder;
> unsigned long idle;
> int len;
>
> - uptime = jiffies;
> + /* We need to make sure jiffies_high does not change while
> + * reading jiffies and jiffies_high */
> + do {
> + jiffies_high_tmp = jiffies_high_shadow;
> + barrier();
> + jiffies_tmp = jiffies;
> + barrier();
> + } while (jiffies_high != jiffies_high_tmp);
> +
> + uptime = jiffies_tmp + ((u64)jiffies_high_tmp << BITS_PER_LONG);
> + remainder = (unsigned long) do_div(uptime, HZ);
> +
> idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
>
> - /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
> - that would overflow about every five days at HZ == 100.
> + /* The formula for the fraction part of the idle time really is
> + ((t * 100) / HZ) % 100, but that would overflow about
> + every five days at HZ == 100.
> Therefore the identity a = (a / b) * b + a % b is used so that it is
> calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
> The part in front of the '+' always evaluates as 0 (mod 100). All divisions
> @@ -121,14 +135,14 @@
> */
> #if HZ!=100
> len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> - uptime / HZ,
> - (((uptime % HZ) * 100) / HZ) % 100,
> + (unsigned long) uptime,
> + ((remainder * 100) / HZ) % 100,
> idle / HZ,
> (((idle % HZ) * 100) / HZ) % 100);
> #else
> len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> - uptime / HZ,
> - uptime % HZ,
> + (unsigned long) uptime,
> + remainder,
> idle / HZ,
> idle % HZ);
> #endif
> --- kernel/timer.c.orig Wed Oct 31 17:24:36 2001
> +++ kernel/timer.c Wed Oct 31 18:38:47 2001
> @@ -65,7 +65,9 @@
>
> extern int do_setitimer(int, struct itimerval *, struct itimerval *);
>
> -unsigned long volatile jiffies;
> +#define INITIAL_JIFFIES 0xFFFFD000ul
> +unsigned long volatile jiffies = INITIAL_JIFFIES;
> +unsigned long volatile jiffies_high, jiffies_high_shadow;
>
> unsigned int * prof_buffer;
> unsigned long prof_len;
> @@ -117,7 +119,7 @@
> INIT_LIST_HEAD(tv1.vec + i);
> }
>
> -static unsigned long timer_jiffies;
> +static unsigned long timer_jiffies = INITIAL_JIFFIES;
>
> static inline void internal_add_timer(struct timer_list *timer)
> {
> @@ -638,7 +640,7 @@
> }
>
> /* jiffies at the most recent update of wall time */
> -unsigned long wall_jiffies;
> +unsigned long wall_jiffies = INITIAL_JIFFIES;
>
> /*
> * This spinlock protect us from races in SMP while playing with xtime. -arca
> @@ -673,7 +675,22 @@
>
> void do_timer(struct pt_regs *regs)
> {
> - (*(unsigned long *)&jiffies)++;
> + /* we assume that two calls to do_timer can never overlap
> + * since they are one jiffie apart in time */
> + if (jiffies != 0xffffffffUL) {
> + jiffies++;
> + } else {
> + /* We still need to care about the race with readers of
> + * jiffies_high. Readers have to discard the values if
> + * jiffies_high != jiffies_high_shadow when read with
> + * proper barriers in between. */
> + jiffies_high++;
> + barrier();
> + jiffies++;
> + barrier();
> + jiffies_high_shadow = jiffies_high;
> + barrier();
> + }
> #ifndef CONFIG_SMP
> /* SMP process accounting uses the local APIC timer */
>
> --- kernel/info.c.orig Wed Oct 31 17:58:25 2001
> +++ kernel/info.c Wed Oct 31 18:48:52 2001
> @@ -12,15 +12,28 @@
> #include <linux/smp_lock.h>
>
> #include <asm/uaccess.h>
> +#include <asm/div64.h>
>
> asmlinkage long sys_sysinfo(struct sysinfo *info)
> {
> struct sysinfo val;
> + u64 uptime;
> + unsigned long jiffies_tmp, jiffies_high_tmp;
>
> memset((char *)&val, 0, sizeof(struct sysinfo));
>
> cli();
> - val.uptime = jiffies / HZ;
> + /* We need to make sure jiffies_high does not change while
> + * reading jiffies and jiffies_high */
> + do {
> + jiffies_high_tmp = jiffies_high_shadow;
> + barrier();
> + jiffies_tmp = jiffies;
> + barrier();
> + } while (jiffies_high != jiffies_high_tmp);
> + uptime = jiffies_tmp + ((u64)jiffies_high_tmp << BITS_PER_LONG);
> + do_div(uptime, HZ);
> + val.uptime = uptime;
>
> val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
> val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2001-10-31 22:57:50

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Petr Vandrovec wrote:

> Hi,
> this reminds me. Year or so ago there was patch from someone, which
> detected jiffies overflow in /proc/uptime proc_read() code, so only thing
> you had to do was run 'uptime', 'w', 'top' or something like that
> every 497 days - you can schedule it as cron job for Jan 1, 0:00:00,
> to find some workoholics.
>
> Patch was something like:
>
> ...

OK, to summarize and to not loose the patch again, I made up a final
version.

Anyone wanting to donate the stability of his box for linux
kernel-development may feel free to try out this patch.

It moves forward the wraparound of the jiffies counter on 32 bit boxes
from about 1.3 years after boot to approx 2 min after boot, which seemed
to somehow affect stability for my intel box.

As a side effect, if your box does not crash within the next 497.1 days,
this patch will allow it to still display correct uptime.

Next step would be to decide what to do with the start_time field of
struct task_struct, which is still 32 bit and stores seconds times HZ.
Other uses for 64 bit jiffies might be identified as well.

Thanks to all that contributed to the discussion.

Tim


--- linux-2.4.14-pre6/fs/proc/proc_misc.c Thu Oct 11 19:46:57 2001
+++ linux-2.4.14-pre6-jiffies64/fs/proc/proc_misc.c Wed Oct 31 22:53:55 2001
@@ -39,6 +39,7 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
+#include <asm/div64.h>


#define LOAD_INT(x) ((x) >> FSHIFT)
@@ -103,15 +104,19 @@
static int uptime_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- unsigned long uptime;
+ u64 uptime;
+ unsigned long remainder;
unsigned long idle;
int len;

- uptime = jiffies;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ remainder = (unsigned long) do_div(uptime, HZ);
+
idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;

- /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
- that would overflow about every five days at HZ == 100.
+ /* The formula for the fraction part of the idle time really is
+ ((t * 100) / HZ) % 100, but that would overflow about
+ every five days at HZ == 100.
Therefore the identity a = (a / b) * b + a % b is used so that it is
calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
The part in front of the '+' always evaluates as 0 (mod 100). All divisions
@@ -121,14 +126,14 @@
*/
#if HZ!=100
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- (((uptime % HZ) * 100) / HZ) % 100,
+ (unsigned long) uptime,
+ (remainder * 100) / HZ,
idle / HZ,
(((idle % HZ) * 100) / HZ) % 100);
#else
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- uptime % HZ,
+ (unsigned long) uptime,
+ remainder,
idle / HZ,
idle % HZ);
#endif
--- linux-2.4.14-pre6/kernel/timer.c Mon Oct 8 19:41:41 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/timer.c Wed Oct 31 22:49:20 2001
@@ -65,7 +65,7 @@

extern int do_setitimer(int, struct itimerval *, struct itimerval *);

-unsigned long volatile jiffies;
+unsigned long volatile jiffies = INITIAL_JIFFIES;

unsigned int * prof_buffer;
unsigned long prof_len;
@@ -117,7 +117,7 @@
INIT_LIST_HEAD(tv1.vec + i);
}

-static unsigned long timer_jiffies;
+static unsigned long timer_jiffies = INITIAL_JIFFIES;

static inline void internal_add_timer(struct timer_list *timer)
{
@@ -638,7 +638,7 @@
}

/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies;
+unsigned long wall_jiffies = INITIAL_JIFFIES;

/*
* This spinlock protect us from races in SMP while playing with xtime. -arca
@@ -683,6 +683,35 @@
if (TQ_ACTIVE(tq_timer))
mark_bh(TQUEUE_BH);
}
+
+
+#if BITS_PER_LONG < 48
+
+u64 get_jiffies64(void)
+{
+ static unsigned long jiffies_hi = 0;
+ static unsigned long jiffies_last = INITIAL_JIFFIES;
+ static unsigned long jiffies_tmp;
+
+ jiffies_tmp = jiffies; /* avoid races */
+ if (jiffies_tmp < jiffies_last) { /* We have a wrap */
+ jiffies_hi++;
+ jiffies_last = jiffies_tmp;
+ }
+
+ return (jiffies_tmp | ((u64)jiffies_hi) << BITS_PER_LONG);
+}
+
+#else
+ /* jiffies is wide enough to not wrap for 8716 years at HZ==1024 */
+
+static inline u64 get_jiffies64(void)
+{
+ return (u64)jiffies;
+}
+
+#endif /* BITS_PER_LONG < 48 */
+

#if !defined(__alpha__) && !defined(__ia64__)

--- linux-2.4.14-pre6/kernel/info.c Sat Apr 21 01:15:40 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/info.c Wed Oct 31 23:12:56 2001
@@ -12,15 +12,19 @@
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
+#include <asm/div64.h>

asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
+ u64 uptime;

memset((char *)&val, 0, sizeof(struct sysinfo));

cli();
- val.uptime = jiffies / HZ;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ do_div(uptime, HZ);
+ val.uptime = (unsigned long) uptime;

val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
--- linux-2.4.14-pre6/include/linux/sched.h Wed Oct 31 23:06:23 2001
+++ linux-2.4.14-pre6-jiffies64/include/linux/sched.h Wed Oct 31 22:50:34 2001
@@ -543,7 +543,10 @@

#include <asm/current.h>

+#define INITIAL_JIFFIES 0xFFFFD000ul
extern unsigned long volatile jiffies;
+extern u64 get_jiffies64(void);
+
extern unsigned long itimer_ticks;
extern unsigned long itimer_next;
extern struct timeval xtime;

2001-10-31 23:56:44

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Oct 31, 2001 23:58 +0100, Tim Schmielau wrote:
> Next step would be to decide what to do with the start_time field of
> struct task_struct, which is still 32 bit and stores seconds times HZ.
> Other uses for 64 bit jiffies might be identified as well.

I would say that (excluding stability issues because of jiffies wrap)
that this is ready for submission to Linus. He may be of the mind that
he would rather fix the wrap issues sooner rather than later, or he
may want to minimize disruption during the "VM stabilize" period (there
are still a couple of hang issues apparently).

> +u64 get_jiffies64(void)
> +{
> + static unsigned long jiffies_hi = 0;
> + static unsigned long jiffies_last = INITIAL_JIFFIES;
> + static unsigned long jiffies_tmp;
^^^^^^ jiffies_tmp doesn't need to be static.

One suggestion someone had was to put dummy "get_jiffies64()" calls
in some other infrequently used areas to ensure jiffies_hi is valid
if we don't call uptime for 1.3 years after the first wrap. I don't
know if that matters or not.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2001-11-01 00:22:54

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, 31 Oct 2001, Andreas Dilger wrote:

> I would say that (excluding stability issues because of jiffies wrap)
> that this is ready for submission to Linus. He may be of the mind that
> he would rather fix the wrap issues sooner rather than later, or he
> may want to minimize disruption during the "VM stabilize" period (there
> are still a couple of hang issues apparently).
>

I'd rather object for now. I've had a couple of hard freezes within
minutes to hours after jiffies wraparound. Also some KDE applications
behave strange, the KDE panel and kterm need approx 2 min to appear
even before the wraparound (so it seems to be a sign issue rather than
wraparound). I don't want to see this in a stable kernel.

However, I would be pleased if widespread intentional use of the patch
would help to solve the remaining wraparound issues.

> > +u64 get_jiffies64(void)
> > +{
> > + static unsigned long jiffies_hi = 0;
> > + static unsigned long jiffies_last = INITIAL_JIFFIES;
> > + static unsigned long jiffies_tmp;
> ^^^^^^ jiffies_tmp doesn't need to be static.

Yes, cut and paste error, sorry. And I wanted this patch to be final for
today...

>
> One suggestion someone had was to put dummy "get_jiffies64()" calls
> in some other infrequently used areas to ensure jiffies_hi is valid
> if we don't call uptime for 1.3 years after the first wrap. I don't
> know if that matters or not.
>

I don't have enough knowledge of the kernel to find good places.
Any suggestions?


Tim

2001-11-01 00:52:14

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thu, 1 Nov 2001, Tim Schmielau wrote:

> On Wed, 31 Oct 2001, Andreas Dilger wrote:
>
> > > +u64 get_jiffies64(void)
> > > +{
> > > + static unsigned long jiffies_hi = 0;
> > > + static unsigned long jiffies_last = INITIAL_JIFFIES;
> > > + static unsigned long jiffies_tmp;
> > ^^^^^^ jiffies_tmp doesn't need to be static.
>
> Yes, cut and paste error, sorry. And I wanted this patch to be final for
> today...
>


OK, absolutely last patch for today. Sorry to bother everyone, but the
jiffies wraparound logic was broken in the previous patch.

As stated before, I would kindly ask for widespread testing PROVIDED IT IS
OK FOR YOU TO RISK THE STABILITY OF YOUR BOX!


Tim


--- linux-2.4.14-pre6/fs/proc/proc_misc.c Thu Oct 11 19:46:57 2001
+++ linux-2.4.14-pre6-jiffies64/fs/proc/proc_misc.c Wed Oct 31 22:53:55 2001
@@ -39,6 +39,7 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
+#include <asm/div64.h>


#define LOAD_INT(x) ((x) >> FSHIFT)
@@ -103,15 +104,19 @@
static int uptime_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- unsigned long uptime;
+ u64 uptime;
+ unsigned long remainder;
unsigned long idle;
int len;

- uptime = jiffies;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ remainder = (unsigned long) do_div(uptime, HZ);
+
idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;

- /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
- that would overflow about every five days at HZ == 100.
+ /* The formula for the fraction part of the idle time really is
+ ((t * 100) / HZ) % 100, but that would overflow about
+ every five days at HZ == 100.
Therefore the identity a = (a / b) * b + a % b is used so that it is
calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
The part in front of the '+' always evaluates as 0 (mod 100). All divisions
@@ -121,14 +126,14 @@
*/
#if HZ!=100
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- (((uptime % HZ) * 100) / HZ) % 100,
+ (unsigned long) uptime,
+ (remainder * 100) / HZ,
idle / HZ,
(((idle % HZ) * 100) / HZ) % 100);
#else
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- uptime % HZ,
+ (unsigned long) uptime,
+ remainder,
idle / HZ,
idle % HZ);
#endif
--- linux-2.4.14-pre6/kernel/timer.c Mon Oct 8 19:41:41 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/timer.c Thu Nov 1 01:37:12 2001
@@ -65,7 +65,7 @@

extern int do_setitimer(int, struct itimerval *, struct itimerval *);

-unsigned long volatile jiffies;
+unsigned long volatile jiffies = INITIAL_JIFFIES;

unsigned int * prof_buffer;
unsigned long prof_len;
@@ -117,7 +117,7 @@
INIT_LIST_HEAD(tv1.vec + i);
}

-static unsigned long timer_jiffies;
+static unsigned long timer_jiffies = INITIAL_JIFFIES;

static inline void internal_add_timer(struct timer_list *timer)
{
@@ -638,7 +638,7 @@
}

/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies;
+unsigned long wall_jiffies = INITIAL_JIFFIES;

/*
* This spinlock protect us from races in SMP while playing with xtime. -arca
@@ -683,6 +683,34 @@
if (TQ_ACTIVE(tq_timer))
mark_bh(TQUEUE_BH);
}
+
+
+#if BITS_PER_LONG < 48
+
+u64 get_jiffies64(void)
+{
+ static unsigned long jiffies_hi = 0;
+ static unsigned long jiffies_last = INITIAL_JIFFIES;
+ unsigned long jiffies_tmp;
+
+ jiffies_tmp = jiffies; /* avoid races */
+ if (jiffies_tmp < jiffies_last) /* We have a wrap */
+ jiffies_hi++;
+ jiffies_last = jiffies_tmp;
+
+ return (jiffies_tmp | ((u64)jiffies_hi) << BITS_PER_LONG);
+}
+
+#else
+ /* jiffies is wide enough to not wrap for 8716 years at HZ==1024 */
+
+static inline u64 get_jiffies64(void)
+{
+ return (u64)jiffies;
+}
+
+#endif /* BITS_PER_LONG < 48 */
+

#if !defined(__alpha__) && !defined(__ia64__)

--- linux-2.4.14-pre6/kernel/info.c Sat Apr 21 01:15:40 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/info.c Wed Oct 31 23:15:26 2001
@@ -12,15 +12,19 @@
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
+#include <asm/div64.h>

asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
+ u64 uptime;

memset((char *)&val, 0, sizeof(struct sysinfo));

cli();
- val.uptime = jiffies / HZ;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ do_div(uptime, HZ);
+ val.uptime = (unsigned long) uptime;

val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
--- linux-2.4.14-pre6/include/linux/sched.h Wed Oct 31 23:06:23 2001
+++ linux-2.4.14-pre6-jiffies64/include/linux/sched.h Wed Oct 31 23:23:51 2001
@@ -543,7 +543,10 @@

#include <asm/current.h>

+#define INITIAL_JIFFIES 0xFFFFD000ul
extern unsigned long volatile jiffies;
+extern u64 get_jiffies64(void);
+
extern unsigned long itimer_ticks;
extern unsigned long itimer_next;
extern struct timeval xtime;

2001-11-01 07:46:24

by Ville Herva

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wed, Oct 31, 2001 at 04:17:36PM -0500, you [Richard B. Johnson] claimed:
> > u64 get_jiffies64(void)
> > {
> > static unsigned long jiffies_hi = 0;
> > static unsigned long jiffies_last = INITIAL_JIFFIES;
> >
> > /* probably need locking for this part */
> > if (jiffies < jiffies_last) { /* We have a wrap */
> > jiffies_hi++;
> > jiffies_last = jiffies;
> > }
> >
> > return (jiffies | ((u64)jiffies_hi) << LONG_SHIFT));
> > }
>
> Ah, yes. It's perfect. It could be put right in the 'uptime' code.
> It has zero overhead otherwise.

Just my two cents... I would prefer that to be in kernel (it has what, 8
byte overhead), so that /proc/uptime is correct, not just uptime(1) output.
There are other programs that access /proc/uptime as well, so it would be
good to fix it in one place.

I was thinking, could there be a elegant(ish) place in the kernel where one
could drop a dummy call to get_jiffies64 so that it would always be called
at least once a 497 days (I'm not sure wher the 1.3 years value comes from)?

Other than that this seems a good alternative.


-- v --

[email protected]

2001-11-01 10:06:23

by George Anzinger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

First I want to thank Tim for pointing out (by usage) the do_div()
macro. This has troubled me for some time. Thanks Tim.

As to the method to use for 64-bit jiffies, the one Tim used was
discussed on the list a short time ago under the subject "How should we
do a 64-bit jiffies?" (thread started by me on 10/22/01). The asm
overlay was discussed as well as other methods. Linus proposed the
version that Tim used here and that I am using in the high-res-timers
patch. (See: http://sourceforge.net/projects/high-res-timers) And, yes
it does make it a NO-NO to have a local variable (or a struct member)
named jiffies and I am sure that we will find other uses of such as
additional drivers and platforms are tried.
http://sourceforge.net/projects/high-res-timers
On the issue of locking, jiffies (in the current system) is updated
under the xtime_lock, so this is the lock one would use to read it if
you cared, however, as patched, almost NO ONE cares or needs to. Just
using the low 32 bits and the existing compare macros will do just fine,
thank you. For those that care, i.e. uptime, CLOCK_MONOTOINIC (and
other code in POSIX timers), either the xtime_lock (a read/write lock by
the way, not a spin lock) can be taken.

Here Tim is proposing a while loop which, IMHO, will fail in the SMP
case, but does handle the UP interrupt case quite nicely.

Tim Schmielau wrote:
>
> On Wed, 31 Oct 2001, Richard B. Johnson wrote:
>
> > On Wed, 31 Oct 2001, vda wrote:
> >
> > [SNIPPED...]
> >
> > > Hmm.... 64bit jiffies are attractive.
> > >
> > > I'd like to see less #defines in kernel
> > > Some parts of your patch fight with the fact that jiffies
> > > is converted to macro -> it is illegal now to have local vars
> > > called "jiffies". This is ugly. I know that there are tons of similarly
> > > (ab)used macros in the kernel now but let's stop adding more!
> > >
> > > This test prog shows how to make overlapping 32bit and 64bit vars.
> > > It works for me.
> > >
>
> [asm snipped]
>
> > >
> > > Is this better or not? If not, why?

The principle problem is platform dependence.

> > > --
> > > vda
> >
> > The problem is that a 64-bit jiffies on a 32-bit machine would
> > require a spin-lock every time the jiffies variable is changed!

Ah, but it is only changed in the timer interrupt which is already under
a write_irq lock.

> > This is because there are two (or more) memory accesses for
> > every 64 bit operation, plus two or more register accesses for
> > every 64 bit operation. If a context-switch or an interrupt
> > occurs between those operations, all bets are off about the
> > result.
> >
> > The appended small tar.gz file contains some 64-bit assembly
> > plus some 64-bit C, Look at the assembly and it will become
> > obvious to you that you don't want to use a 64-bit timer
> > on a 32 bit machine.
> >
> >
> > Cheers,
> > Dick Johnson
> >
>
> The idea was that all drivers that use the 32 bit jiffies counter have to
> be aware of the wraparound anyways, and won't see a difference.
> The race only happens for 64 bit accesses to jiffies, but hey, without
> the patch these values come out wrong _every_ time, so I believed a
> tiny window for a single wrong display of uptime every 497.1 days to be
> acceptable.

For display, ok, but not for CLOCK_MONOTONIC and other POSIX timers
usage :)
>
> If we want to make sure to eliminate this possibility as well, the kind of
> home-made synchronization may help that I came up with before considering
> the race acceptable. It also has the advantage of not touching the jiffies
> definition at all.

As pointed out above, this works for interrupts on UP machines, but,
IMHO, will fail on SMP machines.

George

>
> Tim
>
> --- fs/proc/proc_misc.c.orig Wed Oct 31 17:45:08 2001
> +++ fs/proc/proc_misc.c Wed Oct 31 18:49:19 2001
> @@ -39,6 +39,7 @@
> #include <asm/uaccess.h>
> #include <asm/pgtable.h>
> #include <asm/io.h>
> +#include <asm/div64.h>
>
> #define LOAD_INT(x) ((x) >> FSHIFT)
> @@ -103,15 +104,28 @@
> static int uptime_read_proc(char *page, char **start, off_t off,
> int count, int *eof, void *data)
> {
> - unsigned long uptime;
> + u64 uptime;
> + unsigned long jiffies_tmp, jiffies_high_tmp, remainder;
> unsigned long idle;
> int len;
>
> - uptime = jiffies;
> + /* We need to make sure jiffies_high does not change while
> + * reading jiffies and jiffies_high */
> + do {
> + jiffies_high_tmp = jiffies_high_shadow;
> + barrier();
> + jiffies_tmp = jiffies;
> + barrier();
> + } while (jiffies_high != jiffies_high_tmp);
> +
> + uptime = jiffies_tmp + ((u64)jiffies_high_tmp << BITS_PER_LONG);
> + remainder = (unsigned long) do_div(uptime, HZ);
> +
> idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
>
> - /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
> - that would overflow about every five days at HZ == 100.
> + /* The formula for the fraction part of the idle time really is
> + ((t * 100) / HZ) % 100, but that would overflow about
> + every five days at HZ == 100.
> Therefore the identity a = (a / b) * b + a % b is used so that it is
> calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
> The part in front of the '+' always evaluates as 0 (mod 100). All divisions
> @@ -121,14 +135,14 @@
> */
> #if HZ!=100
> len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> - uptime / HZ,
> - (((uptime % HZ) * 100) / HZ) % 100,
> + (unsigned long) uptime,
> + ((remainder * 100) / HZ) % 100,
> idle / HZ,
> (((idle % HZ) * 100) / HZ) % 100);
> #else
> len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> - uptime / HZ,
> - uptime % HZ,
> + (unsigned long) uptime,
> + remainder,
> idle / HZ,
> idle % HZ);
> #endif
> --- kernel/timer.c.orig Wed Oct 31 17:24:36 2001
> +++ kernel/timer.c Wed Oct 31 18:38:47 2001
> @@ -65,7 +65,9 @@
>
> extern int do_setitimer(int, struct itimerval *, struct itimerval *);
>
> -unsigned long volatile jiffies;
> +#define INITIAL_JIFFIES 0xFFFFD000ul
> +unsigned long volatile jiffies = INITIAL_JIFFIES;
> +unsigned long volatile jiffies_high, jiffies_high_shadow;
>
> unsigned int * prof_buffer;
> unsigned long prof_len;
> @@ -117,7 +119,7 @@
> INIT_LIST_HEAD(tv1.vec + i);
> }
>
> -static unsigned long timer_jiffies;
> +static unsigned long timer_jiffies = INITIAL_JIFFIES;
>
> static inline void internal_add_timer(struct timer_list *timer)
> {
> @@ -638,7 +640,7 @@
> }
>
> /* jiffies at the most recent update of wall time */
> -unsigned long wall_jiffies;
> +unsigned long wall_jiffies = INITIAL_JIFFIES;
>
> /*
> * This spinlock protect us from races in SMP while playing with xtime. -arca
> @@ -673,7 +675,22 @@
>
> void do_timer(struct pt_regs *regs)
> {
> - (*(unsigned long *)&jiffies)++;
> + /* we assume that two calls to do_timer can never overlap
> + * since they are one jiffie apart in time */
> + if (jiffies != 0xffffffffUL) {
> + jiffies++;
> + } else {
> + /* We still need to care about the race with readers of
> + * jiffies_high. Readers have to discard the values if
> + * jiffies_high != jiffies_high_shadow when read with
> + * proper barriers in between. */
> + jiffies_high++;
> + barrier();
> + jiffies++;
> + barrier();
> + jiffies_high_shadow = jiffies_high;
> + barrier();
> + }
> #ifndef CONFIG_SMP
> /* SMP process accounting uses the local APIC timer */
>
> --- kernel/info.c.orig Wed Oct 31 17:58:25 2001
> +++ kernel/info.c Wed Oct 31 18:48:52 2001
> @@ -12,15 +12,28 @@
> #include <linux/smp_lock.h>
>
> #include <asm/uaccess.h>
> +#include <asm/div64.h>
>
> asmlinkage long sys_sysinfo(struct sysinfo *info)
> {
> struct sysinfo val;
> + u64 uptime;
> + unsigned long jiffies_tmp, jiffies_high_tmp;
>
> memset((char *)&val, 0, sizeof(struct sysinfo));
>
> cli();
> - val.uptime = jiffies / HZ;
> + /* We need to make sure jiffies_high does not change while
> + * reading jiffies and jiffies_high */
> + do {
> + jiffies_high_tmp = jiffies_high_shadow;
> + barrier();
> + jiffies_tmp = jiffies;
> + barrier();
> + } while (jiffies_high != jiffies_high_tmp);
> + uptime = jiffies_tmp + ((u64)jiffies_high_tmp << BITS_PER_LONG);
> + do_div(uptime, HZ);
> + val.uptime = uptime;
>
> val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
> val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2001-11-01 08:03:54

by Petr Vandrovec

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On 1 Nov 01 at 1:52, Tim Schmielau wrote:

> + static unsigned long jiffies_hi = 0;
> + static unsigned long jiffies_last = INITIAL_JIFFIES;
> + unsigned long jiffies_tmp;
> +
> + jiffies_tmp = jiffies; /* avoid races */
> + if (jiffies_tmp < jiffies_last) /* We have a wrap */
> + jiffies_hi++;
> + jiffies_last = jiffies_tmp;

There is very small race - if two processes will call get_jiffies64()
at same time, they can both happen to test jiffies_tmp < jiffies_last
with old jiffies_last - incrementing jiffies_hi twice :-( This race
window is only few microseconds every 497 days, but if we want
correct kernel...
Best regards,
Petr Vandrovec
[email protected]

2001-11-01 11:22:40

by George Anzinger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

Tim Schmielau wrote:
~snip
> @@ -683,6 +683,34 @@
> if (TQ_ACTIVE(tq_timer))
> mark_bh(TQUEUE_BH);
> }
> +
> +
> +#if BITS_PER_LONG < 48
> +
> +u64 get_jiffies64(void)
> +{
> + static unsigned long jiffies_hi = 0;
> + static unsigned long jiffies_last = INITIAL_JIFFIES;
> + unsigned long jiffies_tmp;
> +
> + jiffies_tmp = jiffies; /* avoid races */
> + if (jiffies_tmp < jiffies_last) /* We have a wrap */
> + jiffies_hi++;
> + jiffies_last = jiffies_tmp;
> +
> + return (jiffies_tmp | ((u64)jiffies_hi) << BITS_PER_LONG);

Doesn't this need to be protected on SMP machines? What if two cpus
call get_jiffies64() at the same time... Seems like jiffies_hi could
get bumped twice instead of once.

George

2001-11-01 11:41:30

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thu, 1 Nov 2001, george anzinger wrote:

> Tim Schmielau wrote:
> > +
> > +#if BITS_PER_LONG < 48
> > +
> > +u64 get_jiffies64(void)
> > +{
> > + static unsigned long jiffies_hi = 0;
> > + static unsigned long jiffies_last = INITIAL_JIFFIES;
> > + unsigned long jiffies_tmp;
> > +
> > + jiffies_tmp = jiffies; /* avoid races */
> > + if (jiffies_tmp < jiffies_last) /* We have a wrap */
> > + jiffies_hi++;
> > + jiffies_last = jiffies_tmp;
> > +
> > + return (jiffies_tmp | ((u64)jiffies_hi) << BITS_PER_LONG);
>
> Doesn't this need to be protected on SMP machines? What if two cpus
> call get_jiffies64() at the same time... Seems like jiffies_hi could
> get bumped twice instead of once.
>
> George
>

Yes, it does, my race protection is bogus. Petr Vandrovec also pointed out
that. So we do need either to
a) stuff jiffies_hi and jiffies_last into one atomic type
(16 bits is enough for each) or
b) use locking.
My next patch will use b), but I won't do it until I have resolved the
most annoying stability issues. I won't have time to do this before the
weekend, and don't want to bother the list too much either.

Maybe the lockups are just due to my setting of INITIAL_JIFFIES instead of
waiting 471 days. The time adjustment routines are good candidates for
this kind of mistakes. Any ideas anyone where else I might have forgotten
to introduce INITIAL_JIFFIES ?

Tim

2001-11-01 14:11:12

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Wednesday 31 October 2001 20:47, Tim Schmielau wrote:
> btw.: can someone please explain to me why do_timer uses
> (*(unsigned long *)&jiffies)++;
> instead of just doing jiffies++ ?

This casts away volatile -> gcc generates potentially faster code


> -unsigned long volatile jiffies;
> +unsigned long volatile jiffies = INITIAL_JIFFIES;
> +unsigned long volatile jiffies_hi, jiffies_hi_shadow;

Grepped your patch for uses of jiffies_hi_shadow, found none
except for stores. Seems to be unused?


> void do_timer(struct pt_regs *regs)
> {
> - (*(unsigned long *)&jiffies)++;
> + /* we assume that two calls to do_timer can never overlap
> + * since they are one jiffie apart in time */
> + if (jiffies != (unsigned long)(-1)) {
> + jiffies++;
> + } else {
> + /* We still need to care about the race with readers of
> + * jiffies_hi. Readers have to discard the values if
> + * jiffies_hi != jiffies_hi_shadow when read with
> + * proper barriers in between. */
> + jiffies_hi++;
> + barrier();
> + jiffies++;
> + barrier();
> + jiffies_hi_shadow = jiffies_hi;
> + barrier();
> + }

Looks like gross overkill for 64bit i++. I see no flaw in much simpler:
+ if(++jiffies == 0) {
+ /* barrier(); --vda: needed? I have some doubts... */
+ jiffies_hi++;
+ }

To avoid races 64bit readers must read in reverse order: hi,lo,check hi;
your patch already does this just right:

> +static inline u64 get_jiffies64() {
> + unsigned long hi,lo;
> + /* We need to make sure jiffies_hi does not change while
> + * reading jiffies and jiffies_hi */
> + do {
> + hi = jiffies_hi;
> + barrier();
> + lo = jiffies;
> + barrier();
> + } while (hi != jiffies_hi);
> + return lo + (((u64)hi) << BITS_PER_LONG);
> +}
> +


> +#define INITIAL_JIFFIES 0xFFFFD000ul
> +extern unsigned long volatile jiffies, jiffies_hi, jiffies_hi_shadow;

This belongs to some .h file, not .c (most likely include/linux/sched.h)

Also consider

+#if defined(CONFIG_DEBUG_KERNEL)
+/* Rollover in 1000 secs */
+#define INITIAL_JIFFIES ((unsigned long) -1000*HZ)
+#else
+#define INITIAL_JIFFIES 0
+#endif


Hope your patch will go into mainline soon!
--
vda

2001-11-01 14:36:53

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thursday 01 November 2001 00:52, Tim Schmielau wrote:

> OK, absolutely last patch for today. Sorry to bother everyone, but the
> jiffies wraparound logic was broken in the previous patch.
>
> As stated before, I would kindly ask for widespread testing PROVIDED IT IS
> OK FOR YOU TO RISK THE STABILITY OF YOUR BOX!

I see you dropped jiffies_hi update in timer int.
IMHO argument on wasting 6 CPU cycles or so per each timer int:

- jiffies++;
+ if(++jiffies==0) jiffies_hi++;

is not justified. I'd rather see simple and correct code in timer int
rather than jumping thru the hoops in get_jiffies_64().

For CPU cycle saving zealots: I advocate saving 2 static longs in get_jiffies
instead :-)
--
vda

2001-11-01 15:35:59

by Richard B. Johnson

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thu, 1 Nov 2001, vda wrote:

> On Thursday 01 November 2001 00:52, Tim Schmielau wrote:
>
> > OK, absolutely last patch for today. Sorry to bother everyone, but the
> > jiffies wraparound logic was broken in the previous patch.
> >
> > As stated before, I would kindly ask for widespread testing PROVIDED IT IS
> > OK FOR YOU TO RISK THE STABILITY OF YOUR BOX!
>
> I see you dropped jiffies_hi update in timer int.
> IMHO argument on wasting 6 CPU cycles or so per each timer int:
>
> - jiffies++;
> + if(++jiffies==0) jiffies_hi++;
>
> is not justified. I'd rather see simple and correct code in timer int
> rather than jumping thru the hoops in get_jiffies_64().
>
> For CPU cycle saving zealots: I advocate saving 2 static longs in get_jiffies
> instead :-)
> --
> vda
> -

Well not exactly zealots. I test a lot of stuff. In fact, the code
you propose:

if(++jiffies==0) jiffies_hi++;

... actually works quite well:

Script started on Thu Nov 1 10:23:54 2001
# ./chk
Simple bump = 13
Bump chk and incr = 15
# ./chk
Simple bump = 13
Bump chk and incr = 15
# ./chk
Simple bump = 13
Bump chk and incr = 15
# exit
exit
Script done on Thu Nov 1 10:24:08 2001

It adds only two CPU clock cycles if (iff) the 'C' compiler is
well behaved.

Test code is appended. In the test code, I calculate everything, then
print the results. This is so the 'C' library + system call doesn't
mess up the cache. Note this if you use this as a template to test
other questionable code snippets.


Cheers,
Dick Johnson

Penguin : Linux version 2.4.1 on an i686 machine (799.53 BogoMips).

I was going to compile a list of innovations that could be
attributed to Microsoft. Once I realized that Ctrl-Alt-Del
was handled in the BIOS, I found that there aren't any.


Attachments:
test_jiff.tar.gz (837.00 B)

2001-11-01 17:02:54

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thu, Nov 01, 2001 at 10:34:53AM -0500, Richard B. Johnson wrote:
> Well not exactly zealots. I test a lot of stuff. In fact, the code
> you propose:
>
> if(++jiffies==0) jiffies_hi++;
>
> ... actually works quite well:

Uhm, no, it really doesn't. See how it pairs with other instructions and
what the cost is when it doesn't have to be as bad:

this:
unsigned long a, b;
if (++a == 0) b++;
gives:
movl a, %eax
movl %esp, %ebp
incl %eax
testl %eax, %eax
movl %eax, a
je .L3
.L2:
popl %ebp
ret
.p2align 4,,7
.L3:
incl b
jmp .L2

which is really gross considering that:

unsigned long long c;
c++;

gives:

addl $1, c
adcl $0, c+4

which is quite excellent.

-ben

2001-11-01 18:04:09

by Richard B. Johnson

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thu, 1 Nov 2001, Benjamin LaHaise wrote:

> On Thu, Nov 01, 2001 at 10:34:53AM -0500, Richard B. Johnson wrote:
> > Well not exactly zealots. I test a lot of stuff. In fact, the code
> > you propose:
> >
> > if(++jiffies==0) jiffies_hi++;
> >
> > ... actually works quite well:
>
> Uhm, no, it really doesn't. See how it pairs with other instructions and
> what the cost is when it doesn't have to be as bad:
>
> this:
> unsigned long a, b;
> if (++a == 0) b++;
> gives:
> movl a, %eax
> movl %esp, %ebp
> incl %eax
> testl %eax, %eax
> movl %eax, a
> je .L3
> .L2:
> popl %ebp
> ret
> .p2align 4,,7
> .L3:
> incl b
> jmp .L2
>
> which is really gross considering that:
>
> unsigned long long c;
> c++;
>
> gives:
>
> addl $1, c
> adcl $0, c+4
>
> which is quite excellent.
>
> -ben
>

Look I tested it, and I provided code to test it! You will not
convince the 'C' compiler to do:

addl $1, c
adcl $0, c+4

... with a long long (at least not egcs-2.91.66). Further, that's
not the whole story. If jiffies is a long long, then every operation
on that counter, all the timing queues, sleeps, etc., end up doing
multiple operations on the long long (which I showed on Monday with
some additional, best possible, code).

... which is code I showed initially. Knowing that the C compiler
does the jumps on condition and tests for zero, even after the
flags have been set by the previous operation, I tested what
the result was. It turns out that it's only a couple of clock
cycles, not the 6 extra clocks that the hand calculation shows.

So, if you leave jiffies alone, but bump another variable when it
wraps, you get to eat your cake and keep it too.


Cheers,
Dick Johnson

Penguin : Linux version 2.4.1 on an i686 machine (799.53 BogoMips).

I was going to compile a list of innovations that could be
attributed to Microsoft. Once I realized that Ctrl-Alt-Del
was handled in the BIOS, I found that there aren't any.


2001-11-01 18:35:02

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thu, Nov 01, 2001 at 01:03:32PM -0500, Richard B. Johnson wrote:
> does the jumps on condition and tests for zero, even after the
> flags have been set by the previous operation, I tested what
> the result was. It turns out that it's only a couple of clock
> cycles, not the 6 extra clocks that the hand calculation shows.

*sigh* You're not testing any of the effects on available execution
resources within the processor.

> So, if you leave jiffies alone, but bump another variable when it
> wraps, you get to eat your cake and keep it too.

As Linus pointed out, using casting tricks with a long long will just
work for this case. Sounds good to me.

-ben

2001-11-01 19:28:36

by George Anzinger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

"Richard B. Johnson" wrote:
>
> On Thu, 1 Nov 2001, vda wrote:
>
> > On Thursday 01 November 2001 00:52, Tim Schmielau wrote:
> >
> > > OK, absolutely last patch for today. Sorry to bother everyone, but the
> > > jiffies wraparound logic was broken in the previous patch.
> > >
> > > As stated before, I would kindly ask for widespread testing PROVIDED IT IS
> > > OK FOR YOU TO RISK THE STABILITY OF YOUR BOX!
> >
> > I see you dropped jiffies_hi update in timer int.
> > IMHO argument on wasting 6 CPU cycles or so per each timer int:
> >
> > - jiffies++;
> > + if(++jiffies==0) jiffies_hi++;
> >
> > is not justified. I'd rather see simple and correct code in timer int
> > rather than jumping thru the hoops in get_jiffies_64().
> >
> > For CPU cycle saving zealots: I advocate saving 2 static longs in get_jiffies
> > instead :-)
> > --
> > vda
> > -
>
> Well not exactly zealots. I test a lot of stuff. In fact, the code
> you propose:
>
> if(++jiffies==0) jiffies_hi++;
>
> ... actually works quite well:
>
I think that bumping a u64 is actually faster. We have to remember that
this code is executed every 10 ms. It is not in a loop. This means
that the chance of branch prediction being available is nil. In this
case the processor will predict the branch as not taken and be wrong
(except for once every 1.3 years).

On the other hand, the u64 bump has no branches, just an add carry.
This amounts to 1 add carry each bump (assuming we can get cc to do the
add carry to memory) and no branches to predict. We should also take
care to put the u64 in one cache line, of course.

George



> Script started on Thu Nov 1 10:23:54 2001
> # ./chk
> Simple bump = 13
> Bump chk and incr = 15
> # ./chk
> Simple bump = 13
> Bump chk and incr = 15
> # ./chk
> Simple bump = 13
> Bump chk and incr = 15
> # exit
> exit
> Script done on Thu Nov 1 10:24:08 2001
>
> It adds only two CPU clock cycles if (iff) the 'C' compiler is
> well behaved.
>
> Test code is appended. In the test code, I calculate everything, then
> print the results. This is so the 'C' library + system call doesn't
> mess up the cache. Note this if you use this as a template to test
> other questionable code snippets.
>
> Cheers,
> Dick Johnson
>
> Penguin : Linux version 2.4.1 on an i686 machine (799.53 BogoMips).
>
> I was going to compile a list of innovations that could be
> attributed to Microsoft. Once I realized that Ctrl-Alt-Del
> was handled in the BIOS, I found that there aren't any.
>
> ------------------------------------------------------------------------
> Name: test_jiff.tar.gz
> test_jiff.tar.gz Type: Unix Tape Archive (application/x-tar)
> Encoding: BASE64

2001-11-02 00:28:59

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thu, 1 Nov 2001, Tim Schmielau wrote:

> On Thu, 1 Nov 2001, george anzinger wrote:
>
[...]
> > Doesn't this need to be protected on SMP machines? What if two cpus
> > call get_jiffies64() at the same time... Seems like jiffies_hi could
> > get bumped twice instead of once.
> >
> > George
> >
>
> Yes, it does, my race protection is bogus. Petr Vandrovec also pointed out
> that. So we do need either to
> a) stuff jiffies_hi and jiffies_last into one atomic type
> (16 bits is enough for each) or
> b) use locking.
> My next patch will use b), but I won't do it until I have resolved the
> most annoying stability issues. I won't have time to do this before the
> weekend, and don't want to bother the list too much either.
>
> Maybe the lockups are just due to my setting of INITIAL_JIFFIES instead of
> waiting 471 days. The time adjustment routines are good candidates for
> this kind of mistakes. Any ideas anyone where else I might have forgotten
> to introduce INITIAL_JIFFIES ?
>


Well, I did the next patch without waiting for progress on the stability
front (fsck still in heavy use here). As an excercise I added proper
locking to get_jiffies64().

Actually, it is not decided yet whether jiffies_hi should be incremented
in get_jiffies64() or in do_timer(), and whether jiffies and jiffies_hi
should be glued together to form a 64 bit value, so the innards of
get_jiffies64() might change again.

However, no objections have been raised on the interface, so I started to
add some uses of it. I bumped up the start_time field of struct task_struct
to 64 bits, so that ps output will still be ok after the 32 bit
wraparound. This change might break userland applications that expect the
/proc/#/stat fields to fit in 32 bit variables. Yet this is not a
regression as the kernel was broken for these cases until now.

As a side effect, the problem of the current get_jiffies64()
implementation loosing jiffie wraparounds is weakened further.
With the current patch wraparounds can only be lost if no processes are
started or finished for 497 days, a condition that I can only imagine to
be met by embedded devices.

Tim


--- linux-2.4.14-pre6/fs/proc/proc_misc.c Thu Oct 11 19:46:57 2001
+++ linux-2.4.14-pre6-jiffies64/fs/proc/proc_misc.c Wed Oct 31 22:53:55 2001
@@ -39,6 +39,7 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
+#include <asm/div64.h>


#define LOAD_INT(x) ((x) >> FSHIFT)
@@ -103,15 +104,19 @@
static int uptime_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- unsigned long uptime;
+ u64 uptime;
+ unsigned long remainder;
unsigned long idle;
int len;

- uptime = jiffies;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ remainder = (unsigned long) do_div(uptime, HZ);
+
idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;

- /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
- that would overflow about every five days at HZ == 100.
+ /* The formula for the fraction part of the idle time really is
+ ((t * 100) / HZ) % 100, but that would overflow about
+ every five days at HZ == 100.
Therefore the identity a = (a / b) * b + a % b is used so that it is
calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
The part in front of the '+' always evaluates as 0 (mod 100). All divisions
@@ -121,14 +126,14 @@
*/
#if HZ!=100
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- (((uptime % HZ) * 100) / HZ) % 100,
+ (unsigned long) uptime,
+ (remainder * 100) / HZ,
idle / HZ,
(((idle % HZ) * 100) / HZ) % 100);
#else
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- uptime % HZ,
+ (unsigned long) uptime,
+ remainder,
idle / HZ,
idle % HZ);
#endif
--- linux-2.4.14-pre6/kernel/timer.c Mon Oct 8 19:41:41 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/timer.c Thu Nov 1 23:05:38 2001
@@ -65,7 +65,7 @@

extern int do_setitimer(int, struct itimerval *, struct itimerval *);

-unsigned long volatile jiffies;
+unsigned long volatile jiffies = INITIAL_JIFFIES;

unsigned int * prof_buffer;
unsigned long prof_len;
@@ -117,7 +117,7 @@
INIT_LIST_HEAD(tv1.vec + i);
}

-static unsigned long timer_jiffies;
+static unsigned long timer_jiffies = INITIAL_JIFFIES;

static inline void internal_add_timer(struct timer_list *timer)
{
@@ -638,7 +638,7 @@
}

/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies;
+unsigned long wall_jiffies = INITIAL_JIFFIES;

/*
* This spinlock protect us from races in SMP while playing with xtime. -arca
@@ -683,6 +683,37 @@
if (TQ_ACTIVE(tq_timer))
mark_bh(TQUEUE_BH);
}
+
+
+#if BITS_PER_LONG < 48
+
+u64 get_jiffies64(void)
+{
+ static unsigned long jiffies_hi = 0;
+ static unsigned long jiffies_last = INITIAL_JIFFIES;
+ static spinlock_t jiffies64_lock = SPIN_LOCK_UNLOCKED;
+ unsigned long jiffies_tmp;
+
+ spin_lock(&jiffies64_lock);
+ jiffies_tmp = jiffies; /* avoid races */
+ if (jiffies_tmp < jiffies_last) /* We have a wrap */
+ jiffies_hi++;
+ jiffies_last = jiffies_tmp;
+ spin_unlock(&jiffies64_lock);
+
+ return (jiffies_tmp | ((u64)jiffies_hi) << BITS_PER_LONG);
+}
+
+#else
+ /* jiffies is wide enough to not wrap for 8716 years at HZ==1024 */
+
+static inline u64 get_jiffies64(void)
+{
+ return (u64)jiffies;
+}
+
+#endif /* BITS_PER_LONG < 48 */
+

#if !defined(__alpha__) && !defined(__ia64__)

--- linux-2.4.14-pre6/kernel/info.c Sat Apr 21 01:15:40 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/info.c Wed Oct 31 23:15:26 2001
@@ -12,15 +12,19 @@
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
+#include <asm/div64.h>

asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
+ u64 uptime;

memset((char *)&val, 0, sizeof(struct sysinfo));

cli();
- val.uptime = jiffies / HZ;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ do_div(uptime, HZ);
+ val.uptime = (unsigned long) uptime;

val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
--- linux-2.4.14-pre6/include/linux/sched.h Wed Oct 31 23:06:23 2001
+++ linux-2.4.14-pre6-jiffies64/include/linux/sched.h Fri Nov 2 00:46:39 2001
@@ -351,7 +351,7 @@
unsigned long it_real_incr, it_prof_incr, it_virt_incr;
struct timer_list real_timer;
struct tms times;
- unsigned long start_time;
+ u64 start_time;
long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
@@ -543,7 +543,10 @@

#include <asm/current.h>

+#define INITIAL_JIFFIES ((unsigned long)(-120*HZ))
extern unsigned long volatile jiffies;
+extern u64 get_jiffies64(void);
+
extern unsigned long itimer_ticks;
extern unsigned long itimer_next;
extern struct timeval xtime;
--- linux-2.4.14-pre6/kernel/fork.c Wed Oct 24 02:44:15 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/fork.c Thu Nov 1 22:57:20 2001
@@ -647,7 +647,7 @@
}
#endif
p->lock_depth = -1; /* -1 = no lock */
- p->start_time = jiffies;
+ p->start_time = get_jiffies64();

INIT_LIST_HEAD(&p->local_pages);

--- linux-2.4.14-pre6/kernel/acct.c Mon Mar 19 21:35:08 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/acct.c Fri Nov 2 00:13:37 2001
@@ -56,6 +56,7 @@
#include <linux/tty.h>

#include <asm/uaccess.h>
+#include <asm/div64.h>

/*
* These constants control the amount of freespace that suspend and
@@ -227,20 +228,24 @@
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
* is a 13-bit fraction with a 3-bit (base 8) exponent.
+ *
+ * Bumped up to encode 64 bit values. Unfortunately the result may
+ * overflow now.
*/

#define MANTSIZE 13 /* 13 bit mantissa. */
-#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
+#define EXPSIZE 3 /* 3 bit exponent. */
+#define EXPBASE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */

-static comp_t encode_comp_t(unsigned long value)
+static comp_t encode_comp_t(u64 value)
{
int exp, rnd;

exp = rnd = 0;
while (value > MAXFRACT) {
- rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
- value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
+ rnd = value & (1 << (EXPBASE - 1)); /* Round up? */
+ value >>= EXPBASE; /* Base 8 exponent == 3 bit shift. */
exp++;
}

@@ -248,16 +253,21 @@
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT)) {
- value >>= EXPSIZE;
+ value >>= EXPBASE;
exp++;
}

/*
* Clean it up and polish it off.
*/
- exp <<= MANTSIZE; /* Shift the exponent into place */
- exp += value; /* and add on the mantissa. */
- return exp;
+ if (exp >= (1 << EXPSIZE)) {
+ /* Overflow. Return largest representable number instead */
+ return (1 << (MANTSIZE + EXPSIZE)) - 1;
+ } else {
+ exp <<= MANTSIZE; /* Shift the exponent into place */
+ exp += value; /* and add on the mantissa. */
+ return exp;
+ }
}

/*
@@ -277,6 +287,7 @@
struct acct ac;
mm_segment_t fs;
unsigned long vsize;
+ u64 elapsed;

/*
* First check to see if there is enough free_space to continue
@@ -294,8 +305,10 @@
strncpy(ac.ac_comm, current->comm, ACCT_COMM);
ac.ac_comm[ACCT_COMM - 1] = '\0';

- ac.ac_btime = CT_TO_SECS(current->start_time) + (xtime.tv_sec - (jiffies / HZ));
- ac.ac_etime = encode_comp_t(jiffies - current->start_time);
+ elapsed = get_jiffies64() - current->start_time;
+ ac.ac_etime = encode_comp_t(elapsed);
+ do_div(elapsed, HZ);
+ ac.ac_btime = xtime.tv_sec - elapsed;
ac.ac_utime = encode_comp_t(current->times.tms_utime);
ac.ac_stime = encode_comp_t(current->times.tms_stime);
ac.ac_uid = current->uid;
--- linux-2.4.14-pre6/fs/proc/array.c Thu Oct 11 18:00:01 2001
+++ linux-2.4.14-pre6-jiffies64/fs/proc/array.c Thu Nov 1 23:04:02 2001
@@ -343,7 +343,7 @@
ppid = task->pid ? task->p_opptr->pid : 0;
read_unlock(&tasklist_lock);
res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
-%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
+%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %llu %lu %ld %lu %lu %lu %lu %lu \
%lu %lu %lu %lu %lu %lu %lu %lu %d %d\n",
task->pid,
task->comm,
@@ -366,7 +366,7 @@
nice,
0UL /* removed */,
task->it_real_value,
- task->start_time,
+ (unsigned long long)(task->start_time - INITIAL_JIFFIES),
vsize,
mm ? mm->rss : 0, /* you might want to shift this left 3 */
task->rlim[RLIMIT_RSS].rlim_cur,

2001-11-02 01:24:45

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Nov 02, 2001 01:28 +0100, Tim Schmielau wrote:
> Well, I did the next patch without waiting for progress on the stability
> front (fsck still in heavy use here). As an excercise I added proper
> locking to get_jiffies64().

Looks good.

> idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
> [snip]
> */
> #if HZ!=100
> len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> - uptime / HZ,
> - (((uptime % HZ) * 100) / HZ) % 100,
> + (unsigned long) uptime,
> + (remainder * 100) / HZ,
> idle / HZ,
> (((idle % HZ) * 100) / HZ) % 100);

Probably need to make idle a 64-bit value as well, even if the individual
items are not, just to avoid potential overflow... Calling do_div(idle,HZ)
may end up being just as fast as the hoops we jump through above to calculate
the fractions (2 divides, 2 modulus, and one multiply).

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2001-11-02 09:11:05

by Miquel van Smoorenburg

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

In article <[email protected]>,
Andreas Dilger <[email protected]> wrote:
>Probably need to make idle a 64-bit value as well, even if the individual
>items are not, just to avoid potential overflow...

Well idle will still overflow after a bit more than 497 days on a
typical system that is 99% idle, if init_tasks[0]->times.tms_{u,s}time
are left at 32 bits.

Mike.
--
"Only two things are infinite, the universe and human stupidity,
and I'm not sure about the former" -- Albert Einstein.

2001-11-02 12:54:51

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thursday 01 November 2001 18:34, Benjamin LaHaise wrote:
> > So, if you leave jiffies alone, but bump another variable when it
> > wraps, you get to eat your cake and keep it too.
>
> As Linus pointed out, using casting tricks with a long long will just
> work for this case. Sounds good to me.

I'm using these dirty tricks often. Too often in fact, it hurts readability
and violates "make it simple if you can" principle.
Look at this diff of "simple" and "optimized" version:

-unsigned long jiffies = INITIAL_JIFFIES;
-unsigned long jiffies_hi = 0;
+/* vda: need to enforce 8 byte alignment - how??? */
+#if defined(__LITTLE_ENDIAN) && (BITS_PER_LONG == 32)
+ unsigned long jiffies = INITIAL_JIFFIES;
+ unsigned long jiffies_hi = 0;
+#elif defined(__BIG_ENDIAN) && (BITS_PER_LONG == 32)
+ unsigned long jiffies_hi = 0;
+ unsigned long jiffies = INITIAL_JIFFIES;
+#elif (BITS_PER_LONG == 64)
+ unsigned long jiffies = INITIAL_JIFFIES;
+#else
+#error Fix me somebody please....
+#endif

...

+ if(++jiffies==0) jiffies_hi++;
+#if defined(__LITTLE_ENDIAN) && (BITS_PER_LONG == 32)
+ (*(unsigned long long*)&jiffies)++;
+#elif defined(__BIG_ENDIAN) && (BITS_PER_LONG == 32)
+ (*(unsigned long long*)&jiffies_hi)++;
+#elif (BITS_PER_LONG == 64)
+ jiffies++;
+#else
+#error Fix me somebody please....
+#endif

Does saving 600 CPU cycles per second (0.000001 of your CPU power)
worth this mess?
I think not. Let's hope gcc will get smarter some day :-)
--
vda

2001-11-02 17:18:29

by Tim Schmielau

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Thu, 1 Nov 2001, Andreas Dilger wrote:

> On Nov 02, 2001 01:28 +0100, Tim Schmielau wrote:
> > Well, I did the next patch without waiting for progress on the stability
> > front (fsck still in heavy use here). As an excercise I added proper
> > locking to get_jiffies64().
>
> Looks good.
>
> > idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
> > [snip]
> > */
> > #if HZ!=100
> > len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> > - uptime / HZ,
> > - (((uptime % HZ) * 100) / HZ) % 100,
> > + (unsigned long) uptime,
> > + (remainder * 100) / HZ,
> > idle / HZ,
> > (((idle % HZ) * 100) / HZ) % 100);
>
> Probably need to make idle a 64-bit value as well, even if the individual
> items are not, just to avoid potential overflow... Calling do_div(idle,HZ)
> may end up being just as fast as the hoops we jump through above to calculate
> the fractions (2 divides, 2 modulus, and one multiply).
>
> Cheers, Andreas


I made up another patch as an RFC where this overflow is dealt with in the
same way as the jiffies wraparound. This will prevent wrong display of
idle time if /proc/uptime is read at least every 497 days.
The CPU time of the idle task will still overflow as it will for every
other process getting more than 497.1 days of CPU time, but I don't want
to blow up every time variable. I believe this to be acceptable behavior.

I think I have to give up on my stability problem. I see hard lockups on
SMP as well as UP at random times after the jiffies wrap.
Another problem surprises me: Sometimes I get quite an unresponsive system
even before jiffie wrap, sometimes not. Seems as some important timing
parameter sometimes gets detected wrong at boot time if INITIAL_JIFFIES is
large. The bogomips value, however, is always correct.

As suggested elsewhere, I made the pre-wrap INITIAL_JIFFIES value a config
option, so anybody might decide by himself to hunt down these problems
or not.
I kindly ask people to turn this on and help getting the issues sorted
out, as I don't want to imply a false feeling of safety with this patch
by hiding the jiffies wraparound from the user.


I will let this patch float around for some days to get feedback, and then
suggest it for inclusion into the mainline kernel.
I believe the ongoing discussion of a "real" 64 bit jiffie counter not to
interfere with this goal, since this can also be done at any later time
with the get_jiffies64() interface remaining unchanged.

Thanks all for your comments.

Tim


--- linux-2.4.14-pre6/fs/proc/proc_misc.c Thu Oct 11 19:46:57 2001
+++ linux-2.4.14-pre6-jiffies64/fs/proc/proc_misc.c Fri Nov 2 17:05:19 2001
@@ -39,6 +39,7 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
+#include <asm/div64.h>


#define LOAD_INT(x) ((x) >> FSHIFT)
@@ -100,37 +101,59 @@
return proc_calc_metrics(page, start, off, count, eof, len);
}

+#if BITS_PER_LONG < 48
+
+u64 get_idle64(void)
+{
+ static unsigned long idle_hi, idle_last;
+ static spinlock_t idle64_lock = SPIN_LOCK_UNLOCKED;
+ unsigned long idle;
+
+ spin_lock(&idle64_lock);
+ idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
+ if (idle < idle_last) /* We have a wrap */
+ idle_hi++;
+ idle_last = idle;
+ spin_unlock(&idle64_lock);
+
+ return (idle | ((u64)idle_hi) << BITS_PER_LONG);
+}
+
+#else
+ /* Idle time won't overflow for 8716 years at HZ==1024 */
+
+static inline u64 get_idle64(void)
+{
+ return (u64)(init_tasks[0]->times.tms_utime
+ + init_tasks[0]->times.tms_stime);
+}
+
+#endif /* BITS_PER_LONG < 48 */
+
static int uptime_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- unsigned long uptime;
- unsigned long idle;
+ u64 uptime, idle;
+ unsigned long uptime_remainder, idle_remainder;
int len;

- uptime = jiffies;
- idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ uptime_remainder = (unsigned long) do_div(uptime, HZ);
+ idle = get_idle64();
+ idle_remainder = (unsigned long) do_div(idle, HZ);

- /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
- that would overflow about every five days at HZ == 100.
- Therefore the identity a = (a / b) * b + a % b is used so that it is
- calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
- The part in front of the '+' always evaluates as 0 (mod 100). All divisions
- in the above formulas are truncating. For HZ being a power of 10, the
- calculations simplify to the version in the #else part (if the printf
- format is adapted to the same number of digits as zeroes in HZ.
- */
#if HZ!=100
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- (((uptime % HZ) * 100) / HZ) % 100,
- idle / HZ,
- (((idle % HZ) * 100) / HZ) % 100);
+ (unsigned long) uptime,
+ (uptime_remainder * 100) / HZ,
+ (unsigned long) idle,
+ (idle_remainder * 100) / HZ);
#else
len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- uptime % HZ,
- idle / HZ,
- idle % HZ);
+ (unsigned long) uptime,
+ uptime_remainder,
+ (unsigned long) idle,
+ idle_remainder);
#endif
return proc_calc_metrics(page, start, off, count, eof, len);
}
--- linux-2.4.14-pre6/kernel/timer.c Mon Oct 8 19:41:41 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/timer.c Fri Nov 2 13:52:55 2001
@@ -65,7 +65,7 @@

extern int do_setitimer(int, struct itimerval *, struct itimerval *);

-unsigned long volatile jiffies;
+unsigned long volatile jiffies = INITIAL_JIFFIES;

unsigned int * prof_buffer;
unsigned long prof_len;
@@ -117,7 +117,7 @@
INIT_LIST_HEAD(tv1.vec + i);
}

-static unsigned long timer_jiffies;
+static unsigned long timer_jiffies = INITIAL_JIFFIES;

static inline void internal_add_timer(struct timer_list *timer)
{
@@ -638,7 +638,7 @@
}

/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies;
+unsigned long wall_jiffies = INITIAL_JIFFIES;

/*
* This spinlock protect us from races in SMP while playing with xtime. -arca
@@ -683,6 +683,37 @@
if (TQ_ACTIVE(tq_timer))
mark_bh(TQUEUE_BH);
}
+
+
+#if BITS_PER_LONG < 48
+
+u64 get_jiffies64(void)
+{
+ static unsigned long jiffies_hi = 0;
+ static unsigned long jiffies_last = INITIAL_JIFFIES;
+ static spinlock_t jiffies64_lock = SPIN_LOCK_UNLOCKED;
+ unsigned long jiffies_tmp;
+
+ spin_lock(&jiffies64_lock);
+ jiffies_tmp = jiffies; /* avoid races */
+ if (jiffies_tmp < jiffies_last) /* We have a wrap */
+ jiffies_hi++;
+ jiffies_last = jiffies_tmp;
+ spin_unlock(&jiffies64_lock);
+
+ return (jiffies_tmp | ((u64)jiffies_hi) << BITS_PER_LONG);
+}
+
+#else
+ /* jiffies is wide enough to not wrap for 8716 years at HZ==1024 */
+
+static inline u64 get_jiffies64(void)
+{
+ return (u64)jiffies;
+}
+
+#endif /* BITS_PER_LONG < 48 */
+

#if !defined(__alpha__) && !defined(__ia64__)

--- linux-2.4.14-pre6/kernel/info.c Sat Apr 21 01:15:40 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/info.c Fri Nov 2 13:52:55 2001
@@ -12,15 +12,19 @@
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
+#include <asm/div64.h>

asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
+ u64 uptime;

memset((char *)&val, 0, sizeof(struct sysinfo));

cli();
- val.uptime = jiffies / HZ;
+ uptime = get_jiffies64() - INITIAL_JIFFIES;
+ do_div(uptime, HZ);
+ val.uptime = (unsigned long) uptime;

val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
--- linux-2.4.14-pre6/include/linux/sched.h Wed Oct 31 23:06:23 2001
+++ linux-2.4.14-pre6-jiffies64/include/linux/sched.h Fri Nov 2 17:10:08 2001
@@ -351,7 +351,7 @@
unsigned long it_real_incr, it_prof_incr, it_virt_incr;
struct timer_list real_timer;
struct tms times;
- unsigned long start_time;
+ u64 start_time;
long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
@@ -543,7 +543,14 @@

#include <asm/current.h>

+#ifdef CONFIG_DEBUG_JIFFIEWRAP
+# define INITIAL_JIFFIES ((unsigned long)(-300*HZ))
+#else
+# define INITIAL_JIFFIES 0
+#endif
extern unsigned long volatile jiffies;
+extern u64 get_jiffies64(void);
+
extern unsigned long itimer_ticks;
extern unsigned long itimer_next;
extern struct timeval xtime;
--- linux-2.4.14-pre6/kernel/fork.c Wed Oct 24 02:44:15 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/fork.c Fri Nov 2 13:52:55 2001
@@ -647,7 +647,7 @@
}
#endif
p->lock_depth = -1; /* -1 = no lock */
- p->start_time = jiffies;
+ p->start_time = get_jiffies64();

INIT_LIST_HEAD(&p->local_pages);

--- linux-2.4.14-pre6/kernel/acct.c Mon Mar 19 21:35:08 2001
+++ linux-2.4.14-pre6-jiffies64/kernel/acct.c Fri Nov 2 13:52:55 2001
@@ -56,6 +56,7 @@
#include <linux/tty.h>

#include <asm/uaccess.h>
+#include <asm/div64.h>

/*
* These constants control the amount of freespace that suspend and
@@ -227,20 +228,24 @@
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
* is a 13-bit fraction with a 3-bit (base 8) exponent.
+ *
+ * Bumped up to encode 64 bit values. Unfortunately the result may
+ * overflow now.
*/

#define MANTSIZE 13 /* 13 bit mantissa. */
-#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
+#define EXPSIZE 3 /* 3 bit exponent. */
+#define EXPBASE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */

-static comp_t encode_comp_t(unsigned long value)
+static comp_t encode_comp_t(u64 value)
{
int exp, rnd;

exp = rnd = 0;
while (value > MAXFRACT) {
- rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
- value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
+ rnd = value & (1 << (EXPBASE - 1)); /* Round up? */
+ value >>= EXPBASE; /* Base 8 exponent == 3 bit shift. */
exp++;
}

@@ -248,16 +253,21 @@
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT)) {
- value >>= EXPSIZE;
+ value >>= EXPBASE;
exp++;
}

/*
* Clean it up and polish it off.
*/
- exp <<= MANTSIZE; /* Shift the exponent into place */
- exp += value; /* and add on the mantissa. */
- return exp;
+ if (exp >= (1 << EXPSIZE)) {
+ /* Overflow. Return largest representable number instead */
+ return (1 << (MANTSIZE + EXPSIZE)) - 1;
+ } else {
+ exp <<= MANTSIZE; /* Shift the exponent into place */
+ exp += value; /* and add on the mantissa. */
+ return exp;
+ }
}

/*
@@ -277,6 +287,7 @@
struct acct ac;
mm_segment_t fs;
unsigned long vsize;
+ u64 elapsed;

/*
* First check to see if there is enough free_space to continue
@@ -294,8 +305,10 @@
strncpy(ac.ac_comm, current->comm, ACCT_COMM);
ac.ac_comm[ACCT_COMM - 1] = '\0';

- ac.ac_btime = CT_TO_SECS(current->start_time) + (xtime.tv_sec - (jiffies / HZ));
- ac.ac_etime = encode_comp_t(jiffies - current->start_time);
+ elapsed = get_jiffies64() - current->start_time;
+ ac.ac_etime = encode_comp_t(elapsed);
+ do_div(elapsed, HZ);
+ ac.ac_btime = xtime.tv_sec - elapsed;
ac.ac_utime = encode_comp_t(current->times.tms_utime);
ac.ac_stime = encode_comp_t(current->times.tms_stime);
ac.ac_uid = current->uid;
--- linux-2.4.14-pre6/fs/proc/array.c Thu Oct 11 18:00:01 2001
+++ linux-2.4.14-pre6-jiffies64/fs/proc/array.c Fri Nov 2 13:52:55 2001
@@ -343,7 +343,7 @@
ppid = task->pid ? task->p_opptr->pid : 0;
read_unlock(&tasklist_lock);
res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
-%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
+%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %llu %lu %ld %lu %lu %lu %lu %lu \
%lu %lu %lu %lu %lu %lu %lu %lu %d %d\n",
task->pid,
task->comm,
@@ -366,7 +366,7 @@
nice,
0UL /* removed */,
task->it_real_value,
- task->start_time,
+ (unsigned long long)(task->start_time),
vsize,
mm ? mm->rss : 0, /* you might want to shift this left 3 */
task->rlim[RLIMIT_RSS].rlim_cur,
--- linux-2.4.14-pre6/Documentation/Configure.help Wed Oct 31 23:06:18 2001
+++ linux-2.4.14-pre6-jiffies64/Documentation/Configure.help Fri Nov 2 16:38:34 2001
@@ -18530,6 +18530,14 @@
Say Y here if you want the low-level print routines to direct their
output to the serial port in the DC21285 (Footbridge).

+Debug jiffie counter wraparound (DANGEROUS)
+CONFIG_DEBUG_JIFFIEWRAP
+ Say Y here to initialize the jiffie counter to a value 5 minutes
+ before wraparound. This may make your system UNSTABLE and its
+ only use is to hunt down the causes of this instability.
+ If you don't know what the jiffie counter is or if you want
+ a stable system, say N.
+
Split initialisation functions into discardable section
CONFIG_TEXT_SECTIONS
If you say Y here, kernel code that is only used during
--- linux-2.4.14-pre6/arch/i386/config.in Sun Oct 21 04:17:19 2001
+++ linux-2.4.14-pre6-jiffies64/arch/i386/config.in Fri Nov 2 15:51:47 2001
@@ -404,6 +404,7 @@
bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ
bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK
bool ' Verbose BUG() reporting (adds 70K)' CONFIG_DEBUG_BUGVERBOSE
+ bool ' Debug jiffie counter wraparound (DANGEROUS)' CONFIG_DEBUG_JIFFIEWRAP
fi

endmenu
--- linux-2.4.14-pre6/arch/m68k/config.in Tue Jun 12 04:15:27 2001
+++ linux-2.4.14-pre6-jiffies64/arch/m68k/config.in Fri Nov 2 16:20:30 2001
@@ -545,4 +545,5 @@

#bool 'Debug kmalloc/kfree' CONFIG_DEBUG_MALLOC
bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
+bool 'Debug jiffie counter wraparound (DANGEROUS)' CONFIG_DEBUG_JIFFIEWRAP
endmenu
--- linux-2.4.14-pre6/arch/mips/config.in Mon Oct 15 22:41:34 2001
+++ linux-2.4.14-pre6-jiffies64/arch/mips/config.in Fri Nov 2 16:21:40 2001
@@ -519,4 +519,5 @@
if [ "$CONFIG_SMP" != "y" ]; then
bool 'Run uncached' CONFIG_MIPS_UNCACHED
fi
+bool 'Debug jiffie counter wraparound (DANGEROUS)' CONFIG_DEBUG_JIFFIEWRAP
endmenu
--- linux-2.4.14-pre6/arch/parisc/config.in Wed Apr 18 02:19:25 2001
+++ linux-2.4.14-pre6-jiffies64/arch/parisc/config.in Fri Nov 2 16:22:33 2001
@@ -206,5 +206,6 @@

#bool 'Debug kmalloc/kfree' CONFIG_DEBUG_MALLOC
bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
+bool 'Debug jiffie counter wraparound (DANGEROUS)' CONFIG_DEBUG_JIFFIEWRAP
endmenu

--- linux-2.4.14-pre6/arch/ppc/config.in Tue Aug 28 16:11:33 2001
+++ linux-2.4.14-pre6-jiffies64/arch/ppc/config.in Fri Nov 2 16:23:22 2001
@@ -388,4 +388,5 @@
bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
bool 'Include kgdb kernel debugger' CONFIG_KGDB
bool 'Include xmon kernel debugger' CONFIG_XMON
+bool 'Debug jiffie counter wraparound (DANGEROUS)' CONFIG_DEBUG_JIFFIEWRAP
endmenu
--- linux-2.4.14-pre6/arch/sparc/config.in Tue Jun 12 04:15:27 2001
+++ linux-2.4.14-pre6-jiffies64/arch/sparc/config.in Fri Nov 2 16:23:51 2001
@@ -265,4 +265,5 @@
comment 'Kernel hacking'

bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
+bool 'Debug jiffie counter wraparound (DANGEROUS)' CONFIG_DEBUG_JIFFIEWRAP
endmenu
--- linux-2.4.14-pre6/arch/sh/config.in Mon Oct 15 22:36:48 2001
+++ linux-2.4.14-pre6-jiffies64/arch/sh/config.in Fri Nov 2 16:24:33 2001
@@ -385,4 +385,5 @@
if [ "$CONFIG_SH_STANDARD_BIOS" = "y" ]; then
bool 'Early printk support' CONFIG_SH_EARLY_PRINTK
fi
+bool 'Debug jiffie counter wraparound (DANGEROUS)' CONFIG_DEBUG_JIFFIEWRAP
endmenu
--- linux-2.4.14-pre6/arch/arm/config.in Wed Oct 31 23:06:18 2001
+++ linux-2.4.14-pre6-jiffies64/arch/arm/config.in Fri Nov 2 16:26:10 2001
@@ -578,6 +578,7 @@
bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
bool 'Spinlock debugging' CONFIG_DEBUG_SPINLOCK
dep_bool 'Disable pgtable cache' CONFIG_NO_PGT_CACHE $CONFIG_CPU_26
+bool 'Debug jiffie counter wraparound (DANGEROUS)' CONFIG_DEBUG_JIFFIEWRAP
# These options are only for real kernel hackers who want to get their hands dirty.
dep_bool 'Kernel low-level debugging functions' CONFIG_DEBUG_LL $CONFIG_EXPERIMENTAL
dep_bool ' Kernel low-level debugging messages via footbridge serial port' CONFIG_DEBUG_DC21285_PORT $CONFIG_DEBUG_LL $CONFIG_FOOTBRIDGE
--- linux-2.4.14-pre6/arch/cris/config.in Mon Oct 15 22:42:14 2001
+++ linux-2.4.14-pre6-jiffies64/arch/cris/config.in Fri Nov 2 16:26:36 2001
@@ -250,4 +250,5 @@
if [ "$CONFIG_PROFILE" = "y" ]; then
int ' Profile shift count' CONFIG_PROFILE_SHIFT 2
fi
+bool 'Debug jiffie counter wraparound (DANGEROUS)' CONFIG_DEBUG_JIFFIEWRAP
endmenu


2001-11-02 18:51:26

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

On Nov 02, 2001 18:18 +0100, Tim Schmielau wrote:
> I made up another patch as an RFC where this overflow is dealt with in the
> same way as the jiffies wraparound. This will prevent wrong display of
> idle time if /proc/uptime is read at least every 497 days.
> The CPU time of the idle task will still overflow as it will for every
> other process getting more than 497.1 days of CPU time, but I don't want
> to blow up every time variable. I believe this to be acceptable behavior.

I would agree.

> I think I have to give up on my stability problem. I see hard lockups on
> SMP as well as UP at random times after the jiffies wrap.
> Another problem surprises me: Sometimes I get quite an unresponsive system
> even before jiffie wrap, sometimes not. Seems as some important timing
> parameter sometimes gets detected wrong at boot time if INITIAL_JIFFIES is
> large. The bogomips value, however, is always correct.

This is an expected symptom of broken code not dealing with jiffies wrap
properly. Something is waiting for a timeout that won't happen for another
1.3 years.

AFAIK, in the 2.1 kernel development days there was an audit
of all users of jiffies so that they properly used (there are macros
time_before() and time_after() which are supposed to handle jiffies wrap).
I imagine that if you did a grep on jiffies for the drivers you use, you
would find that lots of code is again doing "time + foo > jiffies" or
similar, which is broken for wrap.

> As suggested elsewhere, I made the pre-wrap INITIAL_JIFFIES value a config
> option, so anybody might decide by himself to hunt down these problems
> or not. I kindly ask people to turn this on and help getting the issues
> sorted out, as I don't want to imply a false feeling of safety with this
> patch by hiding the jiffies wraparound from the user.

Maybe not in the 2.4 kernel, but definitely in the 2.5 kernel Linus will
enable this as a "no-option option", just like he did with SMP in older
development kernels. It _may_ be that the Chief Penguin will put his
foot (flipper?) down and enable it for 2.4 as well, just to get it fixed
ASAP. Otherwise it just means that there will be an upper bound on how
long a 2.4 kernel can run.

> I believe the ongoing discussion of a "real" 64 bit jiffie counter not to
> interfere with this goal, since this can also be done at any later time
> with the get_jiffies64() interface remaining unchanged.

I agree. Do we need a 64-bit jiffies value for gettimeofday, or is this
handled by the CPU TSC these days? I'm not really aware of other places
that would need a 64-bit value in a fast path (such that locking and a
function call to get_jiffies64() is unacceptable).

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2001-11-02 21:36:33

by Tim Schmielau

[permalink] [raw]
Subject: possibly incorrect comparisons of jiffies in linux kernel

On Fri, 2 Nov 2001, Andreas Dilger wrote:

[talking about hard freezes of my box after wraparound of the
jiffie counter]

> AFAIK, in the 2.1 kernel development days there was an audit
> of all users of jiffies so that they properly used (there are macros
> time_before() and time_after() which are supposed to handle jiffies wrap).
> I imagine that if you did a grep on jiffies for the drivers you use, you
> would find that lots of code is again doing "time + foo > jiffies" or
> similar, which is broken for wrap.
>

Well, on a quick inspection of the sources I found 171 files where
jiffies are compared without using time_before() or time_after().
I didn't bother to find out whether or not they actually get the
wraparound wrong. IMHO for ease of audit all of these should use
the macros.
Maybe this audit is a candidate for the CHECKER project?

Unfortunately I currently don't have much time to spend on kernel hacking,
so listing these files is about all I can do.

Tim


Here comes the list. Don't beat me about false positives or false
negatives please, I was quite tired when assembling it. No flamewars were
intended by this post!

net/core/dev.c
net/core/neighbour.c
net/ipv4/arp.c
net/ipv4/route.c
net/ipv4/route.c
net/ipv4/igmp.c
net/ipv4/ip_gre.c
net/ipv4/ipmr.c
net/ipv4/tcp_timer.c
net/ipv4/tcp_ipv4.c
net/ipv4/ipconfig.c
net/ipv4/tcp_minisocks.c
net/decnet/af_decnet.c
net/decnet/dn_dev.c
net/decnet/dn_nsp_out.c
net/decnet/dn_route.c
net/decnet/dn_timer.c
net/econet/af_econet.c
net/sched/sch_generic.c
net/irda/discovery.c
net/irda/irnet/irnet_irda.c
drivers/net/slip.c
drivers/net/ne.c
drivers/net/cs89x0.c
drivers/net/atp.c
drivers/net/eexpress.c
drivers/net/apne.c
drivers/net/hp100.c
drivers/net/wan/sdla_fr.c
drivers/net/wan/sdla_ppp.c
drivers/net/wan/sdla_x25.c
drivers/net/wan/sdla_chdlc.c
drivers/net/wan/comx-hw-comx.c
drivers/net/wan/comx-hw-mixcom.c
drivers/net/wan/lmc/lmc_debug.c
drivers/net/wan/lmc/lmc_main.c
drivers/net/wan/dscc4.c
drivers/net/wan/hdlc.c
drivers/net/wan/sdla_ft1.c
drivers/net/wan/wanpipe_multppp.c
drivers/net/seeq8005.c
drivers/net/3c59x.c
drivers/net/3c523.c
drivers/net/eth16i.c
drivers/net/tokenring/olympic.c
drivers/net/tokenring/lanstreamer.c
drivers/net/hamradio/baycom_par.c
drivers/net/hamradio/baycom_ser_fdx.c
drivers/net/hamradio/mkiss.c
drivers/net/hamradio/soundmodem/sm.c
drivers/net/hamradio/soundmodem/sm_wss.c
drivers/net/hamradio/dmascc.c
drivers/net/hamradio/baycom_epp.c
drivers/net/hamradio/yam.c
drivers/net/shaper.c
drivers/net/es3210.c
drivers/net/oaknet.c
drivers/net/sis900.c
drivers/net/arlan.c
drivers/net/arcnet/arcnet.c
drivers/net/ppp_async.c
drivers/net/aironet4500_core.c
drivers/net/ne2k-pci.c
drivers/net/sb1000.c
drivers/net/dmfe.c
drivers/net/rrunner.c
drivers/net/ariadne2.c
drivers/net/ne2.c
drivers/net/tulip/pnic.c
drivers/net/fc/iph5526.c
drivers/net/pcmcia/3c589_cs.c
drivers/net/pcmcia/pcnet_cs.c
drivers/net/pcmcia/netwave_cs.c
drivers/net/pcmcia/smc91c92_cs.c
drivers/net/pcmcia/wavelan_cs.c
drivers/net/aironet4500_card.c
drivers/net/mac89x0.c
drivers/net/appletalk/cops.c
drivers/block/floppy.c
drivers/block/paride/pseudo.h
drivers/char/lp.c
drivers/char/esp.c
drivers/char/serial.c
drivers/char/tpqic02.c
drivers/char/rtc.c
drivers/char/synclink.c
drivers/char/mxser.c
drivers/char/ip2main.c
drivers/char/ip2/i2lib.c
drivers/char/moxa.c
drivers/char/drm/i810_dma.c
drivers/char/agp/agpgart_be.c
drivers/scsi/eata.c
drivers/scsi/53c7,8xx.c
drivers/scsi/ppa.c
drivers/scsi/eata_generic.h
drivers/scsi/qlogicfc.c
drivers/scsi/BusLogic.c
drivers/scsi/megaraid.c
drivers/scsi/wd7000.c
drivers/scsi/53c7xx.c
drivers/scsi/qlogicfas.c
drivers/scsi/u14-34f.c
drivers/scsi/scsi_obsolete.c
drivers/scsi/gdth_proc.c
drivers/scsi/imm.c
drivers/scsi/mac_scsi.c
drivers/scsi/i91uscsi.c
drivers/scsi/i60uscsi.c
drivers/scsi/sym53c416.c
drivers/scsi/sym53c8xx_defs.h
drivers/scsi/sun3_scsi.c
drivers/scsi/cpqfcTSstructs.h
drivers/scsi/osst.c
drivers/sound/pss.c
drivers/sound/sb_common.c
drivers/sound/ymfpci.c
drivers/sound/vwsnd.c
drivers/cdrom/sonycd535.c
drivers/isdn/isdn_net.c
drivers/isdn/isdn_tty.c
drivers/isdn/eicon/eicon_idi.c
drivers/isdn/eicon/eicon_isa.c
drivers/isdn/tpam/tpam_commands.c
drivers/sbus/char/aurora.c
drivers/ide/ide-cd.c
drivers/ide/ide-features.c
drivers/ide/ide-probe.c
drivers/ide/ide-proc.c
drivers/ide/ide-tape.c
drivers/ide/ide.c
drivers/sgi/char/ds1286.c
drivers/fc4/fc.c
drivers/acorn/net/ether3.c
drivers/usb/usb-uhci.c
drivers/usb/serial/digi_acceleport.c
drivers/usb/serial/keyspan.c
drivers/usb/storage/isd200.c
drivers/tc/zs.c
drivers/atm/iphase.c
drivers/atm/fore200e.c
drivers/pcmcia/i82365.c
drivers/i2c/i2c-algo-bit.c
drivers/i2c/i2c-adap-ite.c
drivers/s390/misc/chandev.c
drivers/media/radio/radio-aimslab.c
drivers/media/video/c-qcam.c
drivers/media/video/w9966.c
drivers/media/video/zr36067.c
drivers/acpi/ospm/processor/prpower.c
drivers/mtd/chips/sharp.c
drivers/mtd/devices/doc1000.c
drivers/md/md.c
drivers/message/fusion/mptscsih.c
drivers/message/i2o/i2o_core.c
arch/i386/kernel/apm.c
arch/i386/kernel/io_apic.c
arch/alpha/kernel/traps.c
arch/alpha/kernel/time.c
arch/mips/jazz/reset.c
arch/mips/baget/vacserial.c
arch/mips/au1000/common/serial.c
arch/ppc/8xx_io/uart.c
arch/ppc/8260_io/uart.c
arch/sparc64/kernel/binfmt_aout32.c
arch/ia64/kernel/irq_ia64.c
arch/ia64/kernel/unaligned.c
arch/cris/drivers/ethernet.c
arch/cris/drivers/ide.c
arch/cris/drivers/serial.c
arch/cris/drivers/usb-host.c

2001-11-04 18:32:23

by linux-kernel

[permalink] [raw]
Subject: Re: [Patch] Re: Nasty suprise with uptime

Another way is to have jiffies still be a normal 32-bit counter, and have
another 32 bit value (high_jiffy) whose low bit is supposed to be equal to
the hight bit of the jiffy value. Set up a timer to repeatedly but rarely
check the high bit of jiffy and if it's different from the low bit of
high_jiffy, increase high_jiffy by one. In the rare cases you need to read
the full 64-bit (or rather, 63-bit) value, do the same test and combine the
two parts dropping one bit in the middle. (high_jiffy access would be locked)

This makes 63-bit read slow, but lets normal jiffy ops proceed at
normal speed. in fact, this method can be used to make any 32-bit counter
into a big counter without real speed loss if full length reads are rare