Hi Linus,
With HZ=1000 even on 32 bit platforms, we really should use the
64 bit jiffies value for exported interfaces like uptime, process start
time, etc.
Otherwise innocent users might get quite surprised when ps output goes
berserk after about 49.7 days, showing processes as having started in the
future.
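(With HZ=1000, 32 bit jiffies wrap after 2^32 / 1000 / 86400 ≈ 49.7 days;
at the old HZ=100 the same counter lasted about 497 days.)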
Note that the appended patch does not change any of the exported
interfaces; it just avoids internal overflows.
These changes were already discussed on lkml long ago, before George
Anzinger introduced the current 64 bit jiffies implementation.
In its current form this patch was posted on lkml last month, and has been
in -dj (modulo the HZ=1000 change) since 2.5.20-dj3.
Tim
Part 1/4: "Infrastructure"
Provide a sane way to avoid unnecessary locking on 64 bit platforms,
and a 64 bit analogue to "jiffies_to_clock_t()".
Naming it "jiffies_64_to_user_HZ()" is the only part of these patches I
expect to be controversial.
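For illustration only (not part of the patch), the two helpers added below
would be used like this:

	u64 now = get_jiffies_64();		/* locked read on 32 bit, plain read on 64 bit */
	u64 ticks = jiffies_64_to_user_HZ(now);	/* scale from HZ to USER_HZ for export */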
--- linux-2.5.46-bk4/include/linux/jiffies.h Mon Nov 4 23:30:04 2002
+++ linux-2.5.46-bk4-j64a/include/linux/jiffies.h Sun Nov 10 09:16:35 2002
@@ -2,14 +2,34 @@
#define _LINUX_JIFFIES_H
#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <asm/system.h>
#include <asm/param.h> /* for HZ */
/*
* The 64-bit value is not volatile - you MUST NOT read it
- * without holding read_lock_irq(&xtime_lock)
+ * without holding read_lock_irq(&xtime_lock).
+ * get_jiffies_64() will do this for you as appropriate.
*/
extern u64 jiffies_64;
extern unsigned long volatile jiffies;
+
+static inline u64 get_jiffies_64(void)
+{
+#if BITS_PER_LONG < 64
+ extern rwlock_t xtime_lock;
+ unsigned long flags;
+ u64 tmp;
+
+ read_lock_irqsave(&xtime_lock, flags);
+ tmp = jiffies_64;
+ read_unlock_irqrestore(&xtime_lock, flags);
+ return tmp;
+#else
+ return (u64)jiffies;
+#endif
+}
+
/*
* These inlines deal with timer wrapping correctly. You are
--- linux-2.5.46-bk4/include/linux/times.h Mon Nov 4 23:30:06 2002
+++ linux-2.5.46-bk4-j64a/include/linux/times.h Sun Nov 10 09:16:35 2002
@@ -2,7 +2,16 @@
#define _LINUX_TIMES_H
#ifdef __KERNEL__
+#include <asm/div64.h>
+#include <asm/types.h>
+
# define jiffies_to_clock_t(x) ((x) / (HZ / USER_HZ))
+
+static inline u64 jiffies_64_to_user_HZ(u64 x)
+{
+ do_div(x, HZ / USER_HZ);
+ return x;
+}
#endif
struct tms {
Part 2/4: fix uptime wrap
Use 64 bit jiffies for reporting uptime.
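As a reminder (illustration only, not part of the patch): do_div() divides
its 64 bit first argument in place and returns the remainder, so the
conversion to seconds used below looks like

	u64 up = get_jiffies_64();
	unsigned long rem = do_div(up, HZ);	/* up is now whole seconds, rem is leftover jiffies */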
--- linux-2.5.46-bk4/kernel/timer.c Sat Nov 9 08:31:13 2002
+++ linux-2.5.46-bk4-j64a/kernel/timer.c Sun Nov 10 09:16:35 2002
@@ -25,8 +25,10 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/notifier.h>
+#include <linux/jiffies.h>
#include <asm/uaccess.h>
+#include <asm/div64.h>
/*
* per-CPU timer vector definitions:
@@ -1060,13 +1062,16 @@
asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
+ u64 uptime;
unsigned long mem_total, sav_total;
unsigned int mem_unit, bitcount;
memset((char *)&val, 0, sizeof(struct sysinfo));
read_lock_irq(&xtime_lock);
- val.uptime = jiffies / HZ;
+ uptime = jiffies_64;
+ do_div(uptime, HZ);
+ val.uptime = (unsigned long) uptime;
val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
--- linux-2.5.46-bk4/fs/proc/proc_misc.c Mon Nov 4 23:30:07 2002
+++ linux-2.5.46-bk4-j64a/fs/proc/proc_misc.c Sun Nov 10 09:16:35 2002
@@ -40,12 +40,14 @@
#include <linux/times.h>
#include <linux/profile.h>
#include <linux/blkdev.h>
+#include <linux/jiffies.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
+#include <asm/div64.h>
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
@@ -97,34 +99,36 @@
static int uptime_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- unsigned long uptime;
- unsigned long idle;
+ u64 uptime;
+ unsigned long uptime_remainder;
int len;
- uptime = jiffies;
- idle = init_task.utime + init_task.stime;
+ uptime = get_jiffies_64();
+ uptime_remainder = (unsigned long) do_div(uptime, HZ);
- /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
- that would overflow about every five days at HZ == 100.
- Therefore the identity a = (a / b) * b + a % b is used so that it is
- calculated as (((t / HZ) * 100) + ((t % HZ) * 100) / HZ) % 100.
- The part in front of the '+' always evaluates as 0 (mod 100). All divisions
- in the above formulas are truncating. For HZ being a power of 10, the
- calculations simplify to the version in the #else part (if the printf
- format is adapted to the same number of digits as zeroes in HZ.
- */
#if HZ!=100
- len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- (((uptime % HZ) * 100) / HZ) % 100,
- idle / HZ,
- (((idle % HZ) * 100) / HZ) % 100);
+ {
+ u64 idle = init_task.utime + init_task.stime;
+ unsigned long idle_remainder;
+
+ idle_remainder = (unsigned long) do_div(idle, HZ);
+ len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
+ (unsigned long) uptime,
+ (uptime_remainder * 100) / HZ,
+ (unsigned long) idle,
+ (idle_remainder * 100) / HZ);
+ }
#else
- len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
- uptime / HZ,
- uptime % HZ,
- idle / HZ,
- idle % HZ);
+ {
+ unsigned long idle = init_task.times.tms_utime
+ + init_task.times.tms_stime;
+
+ len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
+ (unsigned long) uptime,
+ uptime_remainder,
+ idle / HZ,
+ idle % HZ);
+ }
#endif
return proc_calc_metrics(page, start, off, count, eof, len);
}
@@ -320,6 +324,8 @@
};
#endif
+extern rwlock_t xtime_lock;
+
extern struct seq_operations slabinfo_op;
extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *);
static int slabinfo_open(struct inode *inode, struct file *file)
@@ -339,7 +345,7 @@ static int kstat_read_proc(char *page, c
{
int i, len;
extern unsigned long total_forks;
- unsigned long jif = jiffies;
+ u64 jif = get_jiffies_64();
unsigned int sum = 0, user = 0, nice = 0, system = 0, idle = 0, iowait = 0;
int major, disk;
@@ -401,6 +407,7 @@ static int kstat_read_proc(char *page, c
}
}
+ do_div(jif, HZ);
len += sprintf(page + len,
"\nctxt %lu\n"
"btime %lu\n"
@@ -408,7 +415,7 @@ static int kstat_read_proc(char *page, c
"procs_running %lu\n"
"procs_blocked %u\n",
nr_context_switches(),
- xtime.tv_sec - jif / HZ,
+ xtime.tv_sec - (unsigned long) jif,
total_forks,
nr_running(),
atomic_read(&nr_iowait_tasks));
Part 3/4: 64 bit process start time
This prevents processes from being reported as having started in the future
after the 32 bit jiffies value wraps.
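The only slightly subtle spot is kernel/acct.c, where the 64 bit elapsed
time has to be clamped before it goes into encode_comp_t(), which only
takes an unsigned long; roughly (same logic as the hunk below):

	u64 elapsed = get_jiffies_64() - current->start_time;
	ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
				    (unsigned long) elapsed : (unsigned long) -1l);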
--- linux-2.5.46-bk4-j64a/include/linux/sched.h Sat Nov 9 08:31:13 2002
+++ linux-2.5.46-bk4-j64b/include/linux/sched.h Sun Nov 10 09:31:37 2002
@@ -336,7 +336,7 @@ struct task_struct {
unsigned long it_real_incr, it_prof_incr, it_virt_incr;
struct timer_list real_timer;
unsigned long utime, stime, cutime, cstime;
- unsigned long start_time;
+ u64 start_time;
long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
--- linux-2.5.46-bk4-j64a/kernel/fork.c Sat Nov 9 08:31:13 2002
+++ linux-2.5.46-bk4-j64b/kernel/fork.c Sun Nov 10 09:31:37 2002
@@ -26,6 +26,7 @@
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/security.h>
+#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/ptrace.h>
@@ -772,7 +773,7 @@ static struct task_struct *copy_process(
#endif
p->array = NULL;
p->lock_depth = -1; /* -1 = no lock */
- p->start_time = jiffies;
+ p->start_time = get_jiffies_64();
p->security = NULL;
p->core_waiter = 0;
--- linux-2.5.46-bk4-j64a/fs/proc/array.c Mon Nov 4 23:30:32 2002
+++ linux-2.5.46-bk4-j64b/fs/proc/array.c Sun Nov 10 09:31:37 2002
@@ -345,7 +345,7 @@ int proc_pid_stat(struct task_struct *ta
ppid = task->pid ? task->real_parent->pid : 0;
read_unlock(&tasklist_lock);
res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
-%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
+%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %llu %lu %ld %lu %lu %lu %lu %lu \
%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n",
task->pid,
task->comm,
@@ -368,7 +368,7 @@ int proc_pid_stat(struct task_struct *ta
nice,
0UL /* removed */,
jiffies_to_clock_t(task->it_real_value),
- jiffies_to_clock_t(task->start_time),
+ (unsigned long long) jiffies_64_to_user_HZ(task->start_time),
vsize,
mm ? mm->rss : 0, /* you might want to shift this left 3 */
task->rlim[RLIMIT_RSS].rlim_cur,
--- linux-2.5.46-bk4-j64a/kernel/acct.c Mon Nov 4 23:30:31 2002
+++ linux-2.5.46-bk4-j64b/kernel/acct.c Sun Nov 10 09:31:37 2002
@@ -49,7 +49,9 @@
#include <linux/acct.h>
#include <linux/file.h>
#include <linux/tty.h>
+#include <linux/jiffies.h>
#include <asm/uaccess.h>
+#include <asm/div64.h>
/*
* These constants control the amount of freespace that suspend and
@@ -304,6 +306,7 @@ static void do_acct_process(long exitcod
mm_segment_t fs;
unsigned long vsize;
unsigned long flim;
+ u64 elapsed;
/*
* First check to see if there is enough free_space to continue
@@ -321,9 +324,11 @@ static void do_acct_process(long exitcod
strncpy(ac.ac_comm, current->comm, ACCT_COMM);
ac.ac_comm[ACCT_COMM - 1] = '\0';
- ac.ac_btime = CT_TO_SECS(current->start_time) +
- (xtime.tv_sec - (jiffies / HZ));
- ac.ac_etime = encode_comp_t(jiffies - current->start_time);
+ elapsed = get_jiffies_64() - current->start_time;
+ ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+ (unsigned long) elapsed : (unsigned long) -1l);
+ do_div(elapsed, HZ);
+ ac.ac_btime = xtime.tv_sec - elapsed;
ac.ac_utime = encode_comp_t(current->utime);
ac.ac_stime = encode_comp_t(current->stime);
ac.ac_uid = current->uid;
--- linux-2.5.46-bk4-j64a/mm/oom_kill.c Mon Nov 4 23:30:30 2002
+++ linux-2.5.46-bk4-j64b/mm/oom_kill.c Sun Nov 10 09:31:37 2002
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
+#include <linux/jiffies.h>
/* #define DEBUG */
@@ -68,11 +69,10 @@ static int badness(struct task_struct *p
/*
* CPU time is in seconds and run time is in minutes. There is no
* particular reason for this other than that it turned out to work
- * very well in practice. This is not safe against jiffie wraps
- * but we don't care _that_ much...
+ * very well in practice.
*/
cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3);
- run_time = (jiffies - p->start_time) >> (SHIFT_HZ + 10);
+ run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10);
points /= int_sqrt(cpu_time);
points /= int_sqrt(int_sqrt(run_time));
Part 4/4: use unsigned long for cpu statistics
This is more like a band-aid.
Unsigned int counters used to be enough when we had HZ=100 on 32 bit
platforms, and only Alphas had their cpu statistics wrapping after 48.5 days.
Fixing 64 bit platforms is easily done with this patch; fixing 32 bit
platforms would need annoying locking.
Well, it's only statistics after all, and there weren't many complaints
about Alpha, so maybe we just don't fix it at all...
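(For the record: an unsigned int tick counter at Alpha's HZ=1024 wraps after
2^32 / 1024 / 86400 ≈ 48.5 days, while at HZ=100 the same counter lasts
about 497 days.)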
--- linux-2.5.46-bk4-j64b/include/linux/kernel_stat.h Mon Nov 4 23:30:51 2002
+++ linux-2.5.46-bk4-j64/include/linux/kernel_stat.h Sun Nov 10 09:40:24 2002
@@ -14,11 +14,11 @@
*/
struct cpu_usage_stat {
- unsigned int user;
- unsigned int nice;
- unsigned int system;
- unsigned int idle;
- unsigned int iowait;
+ unsigned long user;
+ unsigned long nice;
+ unsigned long system;
+ unsigned long idle;
+ unsigned long iowait;
};
struct kernel_stat {
--- linux-2.5.46-bk4-j64b/fs/proc/proc_misc.c Sun Nov 10 09:31:37 2002
+++ linux-2.5.46-bk4-j64/fs/proc/proc_misc.c Sun Nov 10 09:40:24 2002
@@ -346,7 +346,8 @@ static int kstat_read_proc(char *page, c
int i, len;
extern unsigned long total_forks;
u64 jif = get_jiffies_64();
- unsigned int sum = 0, user = 0, nice = 0, system = 0, idle = 0, iowait = 0;
+ unsigned int sum = 0;
+ unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0;
int major, disk;
for (i = 0 ; i < NR_CPUS; i++) {
@@ -364,7 +365,7 @@ static int kstat_read_proc(char *page, c
#endif
}
- len = sprintf(page, "cpu %u %u %u %u %u\n",
+ len = sprintf(page, "cpu %lu %lu %lu %lu %lu\n",
jiffies_to_clock_t(user),
jiffies_to_clock_t(nice),
jiffies_to_clock_t(system),
@@ -372,7 +373,7 @@ static int kstat_read_proc(char *page, c
jiffies_to_clock_t(iowait));
for (i = 0 ; i < NR_CPUS; i++){
if (!cpu_online(i)) continue;
- len += sprintf(page + len, "cpu%d %u %u %u %u %u\n",
+ len += sprintf(page + len, "cpu%d %lu %lu %lu %lu %lu\n",
i,
jiffies_to_clock_t(kstat_cpu(i).cpustat.user),
jiffies_to_clock_t(kstat_cpu(i).cpustat.nice),
In article <Pine.LNX.4.33.0211101014120.12784-100000@gans.physik3.uni-rostock.de>,
Tim Schmielau <[email protected]> wrote:
>
>Provide a sane way to avoid unnecessary locking on 64 bit platforms,
>and a 64 bit analogue to "jiffies_to_clock_t()".
>Naming it "jiffies_64_to_user_HZ()" is the only part of these patches I
>expect to be controversial.
I disagree with the implementation of this (and yes, I would also prefer
for it to be called "jiffies64_to_clock_t()"):
> # define jiffies_to_clock_t(x) ((x) / (HZ / USER_HZ))
This is my crapola 32-bit-overflow-horror-thing. Please don't look at it
too closely, since it makes you go blind, _and_ it encourages you to
write more crapola like:
>+static inline u64 jiffies_64_to_user_HZ(u64 x)
>+{
>+ do_div(x, HZ / USER_HZ);
>+ return x;
>+}
This is wrong. You should really start off by fixing the 32-bit case,
since even that needs fixing anyway (some interfaces really _are_
32-bit, and cannot be expanded).
It also only works for cases where HZ is a direct multiple of USER_HZ,
and yes, my original code has the same problem, but that's not a good
excuse to make it worse. I think it should be fairly straightforward to
get this right, and have a simple
static inline u64 jiffies64_to_clock_t(u64 x)
{
#if !(HZ % USER_HZ)
	do_div(x, (HZ / USER_HZ));
	return x;
#else
	/*
	 * There are better ways that don't overflow early,
	 * but even this doesn't overflow in hundreds of years
	 * in 64 bits, so..
	 */
	do_div(x * USER_HZ, HZ);
	return x;
#endif
}
(And yes, the above does not return a clock_t, it returns a 64-bit
thing, but people who need to can truncate it to clock_t and live with
the old 497-day overflow of USER_HZ in 32 bits for broken interfaces
like "clock()". This way the caller can use the same function for both
the "true 64-bit result" and the truncated case).
And then we should just remove the "jiffies_to_clock_t()" thing, I
suspect. The 49-day overflow is just too soon for comfort, you're
definitely right about that.
Linus
On Mon, 11 Nov 2002, Linus Torvalds wrote:
> I disagree with the implementation of this (and yes, I would also prefer
> for it to be called "jiffies64_to_clock_t()"):
I'm fine with the name "jiffies_64_to_clock_t()".
I'd like "jiffies64_to_clock_t()" (without the first underscore) even more,
but that would be inconsistent with jiffies_64. (And I don't know if we
want to rename the latter now.)
> This is wrong. You should really start off by fixing the 32-bit case,
> since even that needs fixing anyway (some interfaces really _are_
> 32-bit, and cannot be expanded).
>
> It also only works for cases where HZ is a direct multiple of USER_HZ,
> and yes, my original code has the same problem, but that's not a good
> excuse to make it worse. I think it should be fairly straightforward to
> get this right, and have a simple
OK, changed my patch to
+#if (HZ % USER_HZ)==0
# define jiffies_to_clock_t(x) ((x) / (HZ / USER_HZ))
+#else
+# define jiffies_to_clock_t(x) ((clock_t) jiffies_64_to_clock_t((u64) x))
+#endif
+
+static inline u64 jiffies_64_to_clock_t(u64 x)
+{
+#if (HZ % USER_HZ)==0
+ do_div(x, HZ / USER_HZ);
+#else
+ /*
+ * There are better ways that don't overflow early,
+ * but even this doesn't overflow in hundreds of years
+ * in 64 bits, so..
+ */
+ x *= USER_HZ;
+ do_div(x, HZ);
+#endif
+ return x;
+}
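(With e.g. HZ=1024 and USER_HZ=100 the second branch is taken:
jiffies_64_to_clock_t(1024) = 1024 * 100 / 1024 = 100, i.e. exactly one
second's worth of USER_HZ ticks.)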
I'll send the full new set of patches off-list. They can also be retrieved
from
http://www.physik3.uni-rostock.de/tim/kernel/2.5/jiffies64-32a.patch.gz
(1/4: infrastructure)
http://www.physik3.uni-rostock.de/tim/kernel/2.5/jiffies64-32b.patch.gz
(2/4: fix uptime wrap)
http://www.physik3.uni-rostock.de/tim/kernel/2.5/jiffies64-32c.patch.gz
(3/4: 64 bit process start time)
http://www.physik3.uni-rostock.de/tim/kernel/2.5/jiffies64-32d.patch.gz
(4/4: use unsigned long for cpu statistics)
> (And yes, the above does not return a clock_t, it returns a 64-bit
> thing, but people who need to can truncate it to clock_t and live with
> the old 497-day overflow of USER_HZ in 32 bits for broken interfaces
> like "clock()". This way the caller can use the same function for both
> the "true 64-bit result" and the truncated case).
Hey, this is just why I renamed it from "jiffies_64_to_clock_t()" to
"jiffies_64_to_user_HZ()" before sending it off :-)
> And then we should just remove the "jiffies_to_clock_t()" thing, I
> suspect. The 49-day overflow is just too soon for comfort, you're
> definitely right about that.
OK, I'll prepare a patch for that later.
Tim