2005-03-28 16:45:30

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH rc1-mm3] timers: simplify locking

This is the last one, I promise.
On top of "[PATCH rc1-mm3] timers: kill timer_list->lock", see
http://marc.theaimsgroup.com/?l=linux-kernel&m=111193319932543

This patch adds lock_timer() helper, which locks timer's base,
and checks it is still the same and != NULL.

After the previous patch the timer's base is never NULL. With
this patch __mod_timer() temporaly sets ->_base = __TIMER_PENDING
after deletion of timer, so the timer is still pending and can't
be locked by lock_timer().

The result is that __mod_timer() does not need to lock both bases.
Now it can do:
old_base = lock_timer()
list_del(timer)
__set_base(timer, NULL, 1)
unlock(old_base)
// The timer can't be seen by __run_timers(old_base).
// Still pending, del_timer() will call lock_timer().
// Nobody can modify it, lock_timer() will wait.
lock_new(new_base)
add timer
unlock(new_base)

This slightly speedups __mod_timer(), and in my opinion simplifies
the code. I hope it may also improve scalability of timers.

Signed-off-by: Oleg Nesterov <[email protected]>

--- rc1-mm3/include/linux/timer.h~ 2005-03-28 22:42:38.000000000 +0400
+++ rc1-mm3/include/linux/timer.h 2005-03-29 00:42:49.000000000 +0400
@@ -33,11 +33,6 @@ struct timer_list {

#define __TIMER_PENDING 1

-static inline int __timer_pending(struct tvec_t_base_s *base)
-{
- return ((unsigned long)base & __TIMER_PENDING) != 0;
-}
-
#define TIMER_MAGIC 0x4b87ad6e

extern struct fake_timer_base fake_timer_base;
@@ -64,7 +59,7 @@ void fastcall init_timer(struct timer_li
*/
static inline int timer_pending(const struct timer_list * timer)
{
- return __timer_pending(timer->_base);
+ return ((unsigned long)timer->_base & __TIMER_PENDING) != 0;
}

extern void add_timer_on(struct timer_list *timer, int cpu);
--- rc1-mm3/kernel/timer.c~ 2005-03-27 21:07:29.000000000 +0400
+++ rc1-mm3/kernel/timer.c 2005-03-29 00:42:49.000000000 +0400
@@ -199,56 +199,59 @@ static void internal_add_timer(tvec_base
list_add_tail(&timer->entry, vec);
}

+static tvec_base_t *lock_timer(struct timer_list *timer, unsigned long *flags)
+{
+ tvec_base_t *base;
+
+ for (;;) {
+ base = timer_base(timer);
+ /* Can be NULL when __mod_timer switches bases */
+ if (likely(base)) {
+ spin_lock_irqsave(&base->lock, *flags);
+ if (likely(base == timer_base(timer)))
+ return base;
+ spin_unlock_irqrestore(&base->lock, *flags);
+ }
+ cpu_relax();
+ }
+}
+
int __mod_timer(struct timer_list *timer, unsigned long expires)
{
- tvec_base_t *old_base, *new_base;
+ tvec_base_t *base, *new_base;
unsigned long flags;
- int new_lock;
int ret = -1;

BUG_ON(!timer->function);
check_timer(timer);

do {
- local_irq_save(flags);
+ base = lock_timer(timer, &flags);
new_base = &__get_cpu_var(tvec_bases);
- old_base = timer_base(timer);

- /* Prevent AB-BA deadlocks.
- * NOTE: (old_base == new_base) => (new_lock == 0)
- */
- new_lock = old_base < new_base;
- if (new_lock)
- spin_lock(&new_base->lock);
- spin_lock(&old_base->lock);
-
- if (timer_base(timer) != old_base)
+ if (base != new_base
+ && base->running_timer == timer)
goto unlock;

- if (old_base != new_base) {
- /* Ensure the timer is serialized. */
- if (old_base->running_timer == timer)
- goto unlock;
-
- if (!new_lock) {
- spin_lock(&new_base->lock);
- new_lock = 1;
- }
- }
-
ret = 0;
if (timer_pending(timer)) {
list_del(&timer->entry);
ret = 1;
}

+ if (base != new_base) {
+ __set_base(timer, NULL, 1);
+ /* Safe, lock_timer checks base != NULL */
+ spin_unlock(&base->lock);
+ base = new_base;
+ spin_lock(&base->lock);
+ }
+
timer->expires = expires;
- internal_add_timer(new_base, timer);
- __set_base(timer, new_base, 1);
+ internal_add_timer(base, timer);
+ __set_base(timer, base, 1);
unlock:
- if (new_lock)
- spin_unlock(&new_base->lock);
- spin_unlock_irqrestore(&old_base->lock, flags);
+ spin_unlock_irqrestore(&base->lock, flags);
} while (ret == -1);

return ret;
@@ -331,26 +334,22 @@ EXPORT_SYMBOL(mod_timer);
int del_timer(struct timer_list *timer)
{
unsigned long flags;
- tvec_base_t *_base, *base;
+ tvec_base_t *base;
+ int ret = 0;

check_timer(timer);

-repeat:
- _base = timer->_base;
- if (!__timer_pending(_base))
- return 0;
-
- base = (void*)_base - __TIMER_PENDING;
- spin_lock_irqsave(&base->lock, flags);
- if (_base != timer->_base) {
+ if (timer_pending(timer)) {
+ base = lock_timer(timer, &flags);
+ if (timer_pending(timer)) {
+ list_del(&timer->entry);
+ __set_base(timer, base, 0);
+ ret = 1;
+ }
spin_unlock_irqrestore(&base->lock, flags);
- goto repeat;
}
- list_del(&timer->entry);
- __set_base(timer, base, 0);
- spin_unlock_irqrestore(&base->lock, flags);

- return 1;
+ return ret;
}

EXPORT_SYMBOL(del_timer);
@@ -382,15 +381,11 @@ int del_timer_sync(struct timer_list *ti
check_timer(timer);

do {
- base = timer_base(timer);
- spin_lock_irqsave(&base->lock, flags);
+ base = lock_timer(timer, &flags);

if (base->running_timer == timer)
goto unlock;

- if (timer_base(timer) != base)
- goto unlock;
-
ret = 0;
if (timer_pending(timer)) {
list_del(&timer->entry);
@@ -1362,14 +1357,8 @@ static void __devinit migrate_timers(int
new_base = &get_cpu_var(tvec_bases);

local_irq_disable();
- /* Prevent deadlocks via ordering by old_base < new_base. */
- if (old_base < new_base) {
- spin_lock(&new_base->lock);
- spin_lock(&old_base->lock);
- } else {
- spin_lock(&old_base->lock);
- spin_lock(&new_base->lock);
- }
+ spin_lock(&new_base->lock);
+ spin_lock(&old_base->lock);

if (old_base->running_timer)
BUG();


2005-03-28 20:07:31

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH rc1-mm3] timers: simplify locking

Ok. Testing with your latest and greatest patches. Is there any
clarity about what caused the hangs?

2005-03-29 02:05:38

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH rc1-mm3] timers: simplify locking

Oleg Nesterov <[email protected]> wrote:
>
> This is the last one, I promise.
> On top of "[PATCH rc1-mm3] timers: kill timer_list->lock", see
> http://marc.theaimsgroup.com/?l=linux-kernel&m=111193319932543

I thought that earlier patch was a bit weird and I think it would be better
to get to the bottom of these problems which people have been reporting in
the 2.6.12-rc1-mm3 timer code before adding more things, don't you?

2005-03-29 11:21:34

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH rc1-mm3] timers: simplify locking

Christoph Lameter wrote:
>
> Ok. Testing with your latest and greatest patches.

Many thanks.

> Is there any clarity about what caused the hangs?

No, I still hope these hangs are unrelated to these patches.
I am trying to find the bug, but I can't. May be it is because
I do not want to believe that these patches are buggy :)

I tried to stress them with various small tests I wrote and I
have not found any problems.

And I don't know what kernel you are using, there is additional
timer patch in mm2.

Oleg.

2005-03-29 11:23:00

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH rc1-mm3] timers: simplify locking

Andrew Morton wrote:
>
> Oleg Nesterov <[email protected]> wrote:
> >
> > This is the last one, I promise.
> > On top of "[PATCH rc1-mm3] timers: kill timer_list->lock", see
> > http://marc.theaimsgroup.com/?l=linux-kernel&m=111193319932543
>
> I thought that earlier patch was a bit weird

A bit weird, or too weird to be acceptable?

I am very much interested in your and others opinion. You do not like
this fake_timer_base or you don't like the idea that the timer is always
locked through timer_base() ?

These 2 patches should be cleanuped of course.

struct XXX {
spinlock_t lock;
timer_list *running_timer;
};

struct XXX fake_timer_base;

struct tvec_t_base_s {
struct XXX xxx;

long timer_jiffies;
tvec_t tv1;
...
}

struct timer_list {
...
struct XXX *_base;
}

So tvec_t_base_s is used only in __mod_timer() for new_base. This cleanup
would be trivial and without changes in timer.o.

However this global fake_timer_base is really neccessary for this patch and
it is really wierd.

But I like the fact that __mod_timer() takes 2 locks sequentially instead
of 3 at once.

> and I think it would be better
> to get to the bottom of these problems which people have been reporting in
> the 2.6.12-rc1-mm3 timer code before adding more things, don't you?

I just can't imagine how this "del_timer_sync instead of del_singleshot_timer
in schedule_timeout" can reveal any bug in these patches. del_singleshot_timer
calls del_timer_sync anyway when the timer is inactive. The only difference
is that now schedule_timeout()->del_timer_sync() actually deletes this timer
when the caller was waken by a signal/event. And that deletion is very simple,
it just "can't be wrong", and it adds LIST_POISON to timer->entry.

But you are right of course. It is better to forget about these new patches
for a while.

Oleg.