The following patch generalizes Andrea's trick of using sequence numbers
to create a reader region/writer lock. It is against the 2.5.30 kernel.
A new composite primitive 'frlock' is defined in include/linux/frlock.h
and used to replace the rwlock xtime_lock, enforcing consistent access to
the clock time variables.
This should solve the DoS problem where applications spinning wildly on
gettimeofday (or reading sysinfo) starve the clock tick from ever
getting a write lock on xtime_lock.
There is no performance difference for normal loads, although since the
reader does no locking, it should be faster than normal reader locks.
The same technique could be used for other rarely updated data that
needs atomic access, such as the owner/group of an inode or 64-bit counters.
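For instance, reading a 64-bit counter on a 32-bit machine takes two
loads; here is an untested sketch of how frlock could cover that
(ctr_lock, counter64 and read_counter64 are hypothetical names, not
part of this patch):

	static frlock_t ctr_lock = FR_LOCK_UNLOCKED;
	static u64 counter64;

	static u64 read_counter64(void)
	{
		unsigned seq;
		u64 val;

		do {
			seq = fr_read_begin(&ctr_lock);
			val = counter64; /* two 32-bit loads on i386 */
		} while (seq != fr_read_end(&ctr_lock));
		return val;
	}

Any writer would bump the counter under fr_write_lock()/fr_write_unlock(),
so a torn read is caught by the sequence check and retried.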
NOTE: this patch works for i386 only, because that's what I could test.
Other architectures require trivial changes to time.c to change
xtime_lock from rwlock to frlock and to use a loop when reading.
diff -urN -X dontdiff linux-2.5/arch/i386/kernel/time.c linux-2.5.exp/arch/i386/kernel/time.c
--- linux-2.5/arch/i386/kernel/time.c Mon Aug 12 10:17:59 2002
+++ linux-2.5.exp/arch/i386/kernel/time.c Mon Aug 12 10:32:54 2002
@@ -43,6 +43,7 @@
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/device.h>
+#include <linux/frlock.h>
#include <asm/io.h>
#include <asm/smp.h>
@@ -81,7 +82,7 @@
*/
unsigned long fast_gettimeoffset_quotient;
-extern rwlock_t xtime_lock;
+extern frlock_t xtime_lock;
extern unsigned long wall_jiffies;
spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
@@ -269,19 +270,21 @@
*/
void do_gettimeofday(struct timeval *tv)
{
- unsigned long flags;
+ unsigned long seq;
unsigned long usec, sec;
- read_lock_irqsave(&xtime_lock, flags);
- usec = do_gettimeoffset();
- {
- unsigned long lost = jiffies - wall_jiffies;
- if (lost)
- usec += lost * (1000000 / HZ);
- }
- sec = xtime.tv_sec;
- usec += xtime.tv_usec;
- read_unlock_irqrestore(&xtime_lock, flags);
+ do {
+ seq = fr_read_begin(&xtime_lock);
+ usec = do_gettimeoffset();
+ {
+ unsigned long lost = jiffies - wall_jiffies;
+ if (lost)
+ usec += lost * (1000000 / HZ);
+ }
+
+ sec = xtime.tv_sec;
+ usec += xtime.tv_usec;
+ } while (seq != fr_read_end(&xtime_lock));
while (usec >= 1000000) {
usec -= 1000000;
@@ -294,7 +297,7 @@
void do_settimeofday(struct timeval *tv)
{
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
/*
* This is revolting. We need to set "xtime" correctly. However, the
* value in this location is the value at the most recent update of
@@ -314,7 +317,7 @@
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
}
/*
@@ -480,7 +483,7 @@
* the irq version of write_lock because as just said we have irq
* locally disabled. -arca
*/
- write_lock(&xtime_lock);
+ fr_write_lock(&xtime_lock);
if (use_tsc)
{
@@ -513,7 +516,7 @@
do_timer_interrupt(irq, NULL, regs);
- write_unlock(&xtime_lock);
+ fr_write_unlock(&xtime_lock);
}
diff -urN -X dontdiff linux-2.5/include/linux/frlock.h linux-2.5.exp/include/linux/frlock.h
--- linux-2.5/include/linux/frlock.h Wed Dec 31 16:00:00 1969
+++ linux-2.5.exp/include/linux/frlock.h Mon Aug 12 10:09:56 2002
@@ -0,0 +1,99 @@
+#ifndef __LINUX_FRLOCK_H
+#define __LINUX_FRLOCK_H
+
+/*
+ * Fast read-write spinlocks.
+ *
+ * Fast reader/writer locks without starving writers. This type of
+ * lock is for data where the reader wants a consistent set of
+ * information and is willing to retry if the information changes.
+ * Readers never block, but they may have to retry if a write is in
+ * progress. Writers do not wait for readers.
+ *
+ * A generalization of the sequence variables used for gettimeofday
+ * on x86-64 by Andrea Arcangeli.
+ *
+ * This is not as cache friendly as brlock. Also, this will not work
+ * for data that contains pointers, because any writer could
+ * invalidate a pointer that a reader was following.
+ *
+ *
+ * Expected reader usage:
+ * do {
+ * seq = fr_read_begin();
+ * ...
+ * } while (seq != fr_read_end());
+ *
+ * On non-SMP the spin locks disappear but the writer still needs
+ * to increment the sequence variables because an interrupt routine could
+ * change the state of the data.
+ */
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+
+typedef struct {
+ spinlock_t lock;
+ unsigned pre_sequence;
+ unsigned post_sequence;
+} frlock_t;
+
+#define FR_LOCK_UNLOCKED { SPIN_LOCK_UNLOCKED, 0, 0 }
+#define frlock_init(x) do { *(x) = FR_LOCK_UNLOCKED; } while (0)
+
+static inline void fr_write_lock(frlock_t *rw)
+{
+ spin_lock(&rw->lock);
+ rw->pre_sequence++;
+ wmb();
+}
+
+static inline void fr_write_unlock(frlock_t *rw)
+{
+ wmb();
+ rw->post_sequence++;
+ spin_unlock(&rw->lock);
+}
+
+static inline int fr_write_trylock(frlock_t *rw)
+{
+ int ret = spin_trylock(&rw->lock);
+
+ if (ret) {
+ ++rw->pre_sequence;
+ wmb();
+ }
+ return ret;
+}
+
+static inline unsigned fr_read_begin(frlock_t *rw)
+{
+ unsigned ret = rw->post_sequence;
+ rmb(); /* read the sequence before the protected data */
+ return ret;
+}
+
+static inline unsigned fr_read_end(frlock_t *rw)
+{
+ rmb();
+ return rw->pre_sequence;
+}
+
+/*
+ * Possible sw/hw IRQ protected versions of the interfaces.
+ */
+#define fr_write_lock_irqsave(lock, flags) \
+ do { local_irq_save(flags); fr_write_lock(lock); } while (0)
+#define fr_write_lock_irq(lock) \
+ do { local_irq_disable(); fr_write_lock(lock); } while (0)
+#define fr_write_lock_bh(lock) \
+ do { local_bh_disable(); fr_write_lock(lock); } while (0)
+
+#define fr_write_unlock_irqrestore(lock, flags) \
+ do { fr_write_unlock(lock); local_irq_restore(flags); } while(0)
+#define fr_write_unlock_irq(lock) \
+ do { fr_write_unlock(lock); local_irq_enable(); } while(0)
+#define fr_write_unlock_bh(lock) \
+ do { fr_write_unlock(lock); local_bh_enable(); } while(0)
+
+#endif /* __LINUX_FRLOCK_H */
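As a second illustration of the header comment above (a minimal,
untested sketch; stats_lock, packets, bytes and the two functions are
hypothetical names, not part of the patch): the writer runs from an
interrupt handler, so on non-SMP, where the spinlock compiles away, it
is the sequence increments that let a reader that was interrupted
mid-read notice the update and retry.

	#include <linux/frlock.h>

	static frlock_t stats_lock = FR_LOCK_UNLOCKED;
	static unsigned long packets, bytes;

	/* Writer, called from an interrupt handler (irqs already off),
	 * like timer_interrupt() above. Readers never block it. */
	static void stats_update(unsigned long len)
	{
		fr_write_lock(&stats_lock);
		packets++;
		bytes += len;
		fr_write_unlock(&stats_lock);
	}

	/* Reader, process context: retries if an update raced with it. */
	static void stats_read(unsigned long *p, unsigned long *b)
	{
		unsigned seq;

		do {
			seq = fr_read_begin(&stats_lock);
			*p = packets;
			*b = bytes;
		} while (seq != fr_read_end(&stats_lock));
	}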
diff -urN -X dontdiff linux-2.5/kernel/time.c linux-2.5.exp/kernel/time.c
--- linux-2.5/kernel/time.c Mon Aug 12 10:18:32 2002
+++ linux-2.5.exp/kernel/time.c Mon Aug 12 10:07:28 2002
@@ -27,6 +27,7 @@
#include <linux/timex.h>
#include <linux/errno.h>
#include <linux/smp_lock.h>
+#include <linux/frlock.h>
#include <asm/uaccess.h>
@@ -38,7 +39,7 @@
/* The xtime_lock is not only serializing the xtime read/writes but it's also
serializing all accesses to the global NTP variables now. */
-extern rwlock_t xtime_lock;
+extern frlock_t xtime_lock;
extern unsigned long last_time_offset;
#if !defined(__alpha__) && !defined(__ia64__)
@@ -80,7 +81,7 @@
return -EPERM;
if (get_user(value, tptr))
return -EFAULT;
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
xtime.tv_sec = value;
xtime.tv_usec = 0;
last_time_offset = 0;
@@ -88,7 +89,7 @@
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
return 0;
}
@@ -127,10 +128,10 @@
*/
inline static void warp_clock(void)
{
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
xtime.tv_sec += sys_tz.tz_minuteswest * 60;
last_time_offset = 0;
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
}
/*
@@ -234,7 +235,7 @@
if (txc->tick < 900000/HZ || txc->tick > 1100000/HZ)
return -EINVAL;
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
result = time_state; /* mostly `TIME_OK' */
/* Save for later - semantics of adjtime is to return old value */
@@ -390,7 +391,7 @@
txc->errcnt = pps_errcnt;
txc->stbcnt = pps_stbcnt;
last_time_offset = 0;
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
do_gettimeofday(&txc->time);
return(result);
}
diff -urN -X dontdiff linux-2.5/kernel/timer.c linux-2.5.exp/kernel/timer.c
--- linux-2.5/kernel/timer.c Mon Aug 12 10:18:32 2002
+++ linux-2.5.exp/kernel/timer.c Mon Aug 12 10:26:47 2002
@@ -24,6 +24,7 @@
#include <linux/interrupt.h>
#include <linux/tqueue.h>
#include <linux/kernel_stat.h>
+#include <linux/frlock.h>
#include <asm/uaccess.h>
@@ -633,7 +634,7 @@
* This read-write spinlock protects us from races in SMP while
* playing with xtime and avenrun.
*/
-rwlock_t xtime_lock __cacheline_aligned_in_smp = RW_LOCK_UNLOCKED;
+frlock_t xtime_lock __cacheline_aligned_in_smp = FR_LOCK_UNLOCKED;
unsigned long last_time_offset;
static inline void update_times(void)
@@ -645,7 +646,7 @@
* just know that the irqs are locally enabled and so we don't
* need to save/restore the flags of the local CPU here. -arca
*/
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
ticks = jiffies - wall_jiffies;
if (ticks) {
@@ -654,7 +655,7 @@
}
last_time_offset = 0;
calc_load(ticks);
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
}
void timer_bh(void)
@@ -922,20 +923,23 @@
asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
+ unsigned long seq;
unsigned long mem_total, sav_total;
unsigned int mem_unit, bitcount;
memset((char *)&val, 0, sizeof(struct sysinfo));
- read_lock_irq(&xtime_lock);
- val.uptime = jiffies / HZ;
+ do {
+ seq = fr_read_begin(&xtime_lock);
+
+ val.uptime = jiffies / HZ;
- val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
- val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
- val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+ val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+ val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+ val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
- val.procs = nr_threads;
- read_unlock_irq(&xtime_lock);
+ val.procs = nr_threads;
+ } while (seq != fr_read_end(&xtime_lock));
si_meminfo(&val);
si_swapinfo(&val);
@@ -980,7 +984,7 @@
val.totalhigh <<= bitcount;
val.freehigh <<= bitcount;
-out:
+ out:
if (copy_to_user(info, &val, sizeof(struct sysinfo)))
return -EFAULT;
> The following patch generalizes Andrea's trick of using sequence numbers
> to create a reader region/writer lock. It is against the 2.5.30 kernel.
Ah! Very nice! I have hit the xtime_lock starvation issue, and was
looking to implement something like vxtime_lock to help it (I'm also
hoping to port the vsyscall gtod to i386 as well).
> A new composite primitive 'frlock' is defined in include/linux/frlock.h
> and used to replace the rwlock xtime_lock, enforcing consistent access to
> the clock time variables.
My only comment is that while you have wrapped it up nicely, I'm not
sure this locking algorithm is worth generalizing. While the vxtime_lock
implementation for x86-64 might take a bit to grasp, the macros are very
short, fairly straightforward, and difficult to confuse with a standard
spin/rw_lock. Additionally, the extra complexity of managing the
sequence number isn't really being hidden (and really, I'm not sure how
one would hide it).
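The closest I can picture is wrapping the retry loop in a statement
macro, something like the untested sketch below, but the accesses still
have to be written knowing they may run more than once, so the
complexity just moves around.

	/* Hypothetical wrapper, not part of the patch; bodies containing
	 * top-level commas would need a variadic macro. */
	#define fr_read(rw, body)					\
		do {							\
			unsigned __seq;					\
			do {						\
				__seq = fr_read_begin(rw);		\
				body;					\
			} while (__seq != fr_read_end(rw));		\
		} while (0)

	/* usage: fr_read(&xtime_lock,
	 *                sec = xtime.tv_sec; usec = xtime.tv_usec); */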
Maybe I'm just partial to the way Andrea did it (especially considering
I'd like to use the lock from userspace as well), but regardless this
style of rwlock is definitely needed for the xtime_lock. Very cool!
thanks
-john
Missed this part, which updates i386 apm.c.
--- linux-2.5/arch/i386/kernel/apm.c Mon Aug 12 10:17:59 2002
+++ linux-2.5.exp/arch/i386/kernel/apm.c Mon Aug 12 17:25:57 2002
@@ -215,6 +215,7 @@
#include <linux/pm.h>
#include <linux/kernel.h>
#include <linux/smp_lock.h>
+#include <linux/frlock.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -222,7 +223,7 @@
#include <linux/sysrq.h>
-extern rwlock_t xtime_lock;
+extern frlock_t xtime_lock;
extern spinlock_t i8253_lock;
extern unsigned long get_cmos_time(void);
extern void machine_real_restart(unsigned char *, int);
@@ -1190,7 +1191,7 @@
printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
}
/* serialize with the timer interrupt */
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
/* protect against access to timer chip registers */
spin_lock(&i8253_lock);
@@ -1202,7 +1203,7 @@
ignore_normal_resume = 1;
spin_unlock(&i8253_lock);
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
if (err == APM_NO_ERROR)
err = APM_SUCCESS;
@@ -1227,10 +1228,10 @@
int err;
/* serialize with the timer interrupt */
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
/* If needed, notify drivers here */
get_time_diff();
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
err = set_system_power_state(APM_STATE_STANDBY);
if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
@@ -1319,9 +1320,9 @@
ignore_bounce = 1;
if ((event != APM_NORMAL_RESUME)
|| (ignore_normal_resume == 0)) {
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
set_time();
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
pm_send_all(PM_RESUME, (void *)0);
queue_event(event, NULL);
}
@@ -1336,9 +1337,9 @@
break;
case APM_UPDATE_TIME:
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
set_time();
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
break;
case APM_CRITICAL_SUSPEND:
The following is an example of how to use frlock with other
architectures. I don't have an ia64 machine handy to test it.
diff -urN -X dontdiff linux-2.5/arch/ia64/kernel/time.c linux-2.5.exp/arch/ia64/kernel/time.c
--- linux-2.5/arch/ia64/kernel/time.c Mon Aug 12 10:18:00 2002
+++ linux-2.5.exp/arch/ia64/kernel/time.c Mon Aug 12 17:35:26 2002
@@ -23,7 +23,7 @@
#include <asm/sal.h>
#include <asm/system.h>
-extern rwlock_t xtime_lock;
+extern frlock_t xtime_lock;
extern unsigned long wall_jiffies;
extern unsigned long last_time_offset;
@@ -87,7 +87,7 @@
void
do_settimeofday (struct timeval *tv)
{
- write_lock_irq(&xtime_lock);
+ fr_write_lock_irq(&xtime_lock);
{
/*
* This is revolting. We need to set "xtime" correctly. However, the value
@@ -109,16 +109,16 @@
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
}
- write_unlock_irq(&xtime_lock);
+ fr_write_unlock_irq(&xtime_lock);
}
void
do_gettimeofday (struct timeval *tv)
{
- unsigned long flags, usec, sec, old;
+ unsigned long seq, usec, sec, old;
- read_lock_irqsave(&xtime_lock, flags);
- {
+ do {
+ seq = fr_read_begin(&xtime_lock);
usec = gettimeoffset();
/*
@@ -135,8 +135,7 @@
sec = xtime.tv_sec;
usec += xtime.tv_usec;
- }
- read_unlock_irqrestore(&xtime_lock, flags);
+ } while (seq != fr_read_end(&xtime_lock));
while (usec >= 1000000) {
usec -= 1000000;
@@ -179,10 +178,10 @@
* another CPU. We need to avoid to SMP race by acquiring the
* xtime_lock.
*/
- write_lock(&xtime_lock);
+ fr_write_lock(&xtime_lock);
do_timer(regs);
local_cpu_data->itm_next = new_itm;
- write_unlock(&xtime_lock);
+ fr_write_unlock(&xtime_lock);
} else
local_cpu_data->itm_next = new_itm;