2015-06-05 08:59:47

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.

Solve this by using a scheme similar to regular timers; track the
current running timer in hrtimer_clock_base::running.

Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/hrtimer.h | 47 ++++++-----------
kernel/time/hrtimer.c | 130 ++++++++++++++++++++++++++++++++++++++----------
2 files changed, 122 insertions(+), 55 deletions(-)

--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,34 +53,27 @@ enum hrtimer_restart {
*
* 0x00 inactive
* 0x01 enqueued into rbtree
- * 0x02 callback function running
- * 0x04 timer is migrated to another cpu
+ * 0x02 timer is migrated to another cpu
*
- * Special cases:
- * 0x03 callback function running and enqueued
- * (was requeued on another CPU)
- * 0x05 timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
*
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ * timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
* queued a signal. Between dropping the lock which protects the posix timer
* and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * signal and rearm the timer.
*
* All state transitions are protected by cpu_base->lock.
*/
#define HRTIMER_STATE_INACTIVE 0x00
#define HRTIMER_STATE_ENQUEUED 0x01
-#define HRTIMER_STATE_CALLBACK 0x02
-#define HRTIMER_STATE_MIGRATE 0x04
+#define HRTIMER_STATE_MIGRATE 0x02

/**
* struct hrtimer - the basic hrtimer structure
@@ -167,6 +160,8 @@ enum hrtimer_base_type {
* struct hrtimer_cpu_base - the per cpu clock bases
* @lock: lock protecting the base and associated clock bases
* and timers
+ * @seq: seqcount around __run_hrtimer
+ * @running: pointer to the currently running hrtimer
* @cpu: cpu number
* @active_bases: Bitfield to mark bases with active timers
* @clock_was_set_seq: Sequence counter of clock was set events
@@ -188,6 +183,8 @@ enum hrtimer_base_type {
*/
struct hrtimer_cpu_base {
raw_spinlock_t lock;
+ seqcount_t seq;
+ struct hrtimer *running;
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
@@ -395,15 +392,7 @@ extern ktime_t hrtimer_get_remaining(con

extern u64 hrtimer_get_next_event(void);

-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
- return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);

/*
* Helper function to check, whether the timer is on one of the queues
@@ -419,7 +408,7 @@ static inline int hrtimer_is_queued(stru
*/
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
- return timer->state & HRTIMER_STATE_CALLBACK;
+ return timer->base->cpu_base->running == timer;
}

/* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+ .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
@@ -111,6 +112,19 @@ static inline int hrtimer_clockid_to_bas
#ifdef CONFIG_SMP

/*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+ .seq = SEQCNT_ZERO(migration_cpu_base),
+};
+
+static struct hrtimer_clock_base migration_base = {
+ .cpu_base = &migration_cpu_base,
+};
+
+/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
* locked, and the base itself is locked too.
@@ -119,8 +133,8 @@ static inline int hrtimer_clockid_to_bas
* be found on the lists/queues.
*
* When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
*/
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +144,7 @@ struct hrtimer_clock_base *lock_hrtimer_

for (;;) {
base = timer->base;
- if (likely(base != NULL)) {
+ if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
return base;
@@ -194,8 +208,8 @@ switch_hrtimer_base(struct hrtimer *time
if (unlikely(hrtimer_callback_running(timer)))
return base;

- /* See the comment in lock_timer_base() */
- timer->base = NULL;
+ /* See the comment in lock_hrtimer_base() */
+ timer->base = &migration_base;
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);

@@ -840,11 +854,7 @@ static int enqueue_hrtimer(struct hrtime

base->cpu_base->active_bases |= 1 << base->index;

- /*
- * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
- * state of a possibly running callback.
- */
- timer->state |= HRTIMER_STATE_ENQUEUED;
+ timer->state = HRTIMER_STATE_ENQUEUED;

return timerqueue_add(&base->active, &timer->node);
}
@@ -894,7 +904,6 @@ static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state;
int reprogram;

/*
@@ -908,13 +917,8 @@ remove_hrtimer(struct hrtimer *timer, st
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- /*
- * We must preserve the CALLBACK state flag here,
- * otherwise we could move the timer base in
- * switch_hrtimer_base.
- */
- state = timer->state & HRTIMER_STATE_CALLBACK;
- __remove_hrtimer(timer, base, state, reprogram);
+
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, reprogram);
return 1;
}
return 0;
@@ -1114,6 +1118,70 @@ void hrtimer_init(struct hrtimer *timer,
}
EXPORT_SYMBOL_GPL(hrtimer_init);

+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+ struct hrtimer_cpu_base *cpu_base;
+ unsigned int seq;
+ bool active;
+
+ do {
+ active = false;
+ cpu_base = READ_ONCE(timer->base->cpu_base);
+ seq = raw_read_seqcount(&cpu_base->seq);
+
+ if (timer->state != HRTIMER_STATE_INACTIVE ||
+ cpu_base->running == timer)
+ active = true;
+
+ } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+ cpu_base != READ_ONCE(timer->base->cpu_base));
+
+ return active;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * __run_hrtimer() hrtimer_active()
+ *
+ * [S] cpu_base->running = timer [R] timer->base->cpu_base
+ * [S] seq++ [R] seq
+ * WMB RMB
+ * [S] timer->state = INACTIVE
+ * [R] timer->state
+ * fn(); [R] cpu_base->running
+ *
+ * [S] timer->state = ENQUEUED (optional)
+ * WMB RMB
+ * [S] seq++ [R] seq
+ * [S] cpu_base->running = NULL [R] timer->base->cpu_base
+ *
+ *
+ * The WMBs+seq on the __run_hrtimer() split the thing into 3 distinct sections:
+ *
+ * - queued: the timer is queued
+ * - callback: the timer is being ran
+ * - post: the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * Note: this is unusual seqlock usage in that we do not have the odd/even
+ * thing, all we really care about is both reads happening in the same section
+ * of __run_hrtimer().
+ *
+ * Note: both seq increments are not fully store ordered and this is fine, if
+ * for example the first seq increment is observed before the ->running store
+ * the section edge shifts slightly, but its a safe shift because here ->state
+ * is still ENQUEUED.
+ */
+
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
struct hrtimer *timer, ktime_t *now)
@@ -1121,10 +1189,17 @@ static void __run_hrtimer(struct hrtimer
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;

- WARN_ON(!irqs_disabled());
+ lockdep_assert_held(&cpu_base->lock);

debug_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ cpu_base->running = timer;
+
+ /*
+ * separate the ->running assignment from the ->state assignment
+ */
+ raw_write_seqcount_begin(&cpu_base->seq);
+
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;

@@ -1140,7 +1215,7 @@ static void __run_hrtimer(struct hrtimer
raw_spin_lock(&cpu_base->lock);

/*
- * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+ * Note: We clear the running state after enqueue_hrtimer and
* we do not reprogramm the event hardware. Happens either in
* hrtimer_start_range_ns() or in hrtimer_interrupt()
*
@@ -1152,9 +1227,13 @@ static void __run_hrtimer(struct hrtimer
!(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base);

- WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+ /*
+ * separate the ->running assignment from the ->state assignment
+ */
+ raw_write_seqcount_end(&cpu_base->seq);

- timer->state &= ~HRTIMER_STATE_CALLBACK;
+ WARN_ON_ONCE(cpu_base->running != timer);
+ cpu_base->running = NULL;
}

static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
@@ -1523,11 +1602,10 @@ static void migrate_hrtimer_list(struct
* hrtimer_interrupt after we migrated everything to
* sort out already expired timers and reprogram the
* event device.
+ *
+ * Sets timer->state = HRTIMER_STATE_ENQUEUED.
*/
enqueue_hrtimer(timer, new_base);
-
- /* Clear the migration state bit */
- timer->state &= ~HRTIMER_STATE_MIGRATE;
}
}



2015-06-05 09:48:37

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Fri, 5 Jun 2015, Peter Zijlstra wrote:
> /*
> + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
> + * such that hrtimer_callback_running() can unconditionally dereference
> + * timer->base->cpu_base
> + */
> +static struct hrtimer_cpu_base migration_cpu_base = {
> + .seq = SEQCNT_ZERO(migration_cpu_base),
> +};
> +
> +static struct hrtimer_clock_base migration_base = {
> + .cpu_base = &migration_cpu_base,
> +};

You can spare that extra migration clock base, because
migration_cpu_base already has clock bases inside.

static struct hrtimer_cpu_base migration_cpu_base = {
.seq = SEQCNT_ZERO(migration_cpu_base),
};

and do:

migration_cpu_base.clock_base[CLOCK_MONOTONIC].cpu_base = &migration_cpu_base;

in hrtimer_init.

So migration base becomes:

#define migration_base &migration_cpu_base.clock_base[CLOCK_MONOTONIC]

That's also more efficient in terms of cache because is two adjacent
cache lines instead of two distant ones.

Other than that, this looks good.

Thanks,

tglx

2015-06-07 19:44:46

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On 06/05, Peter Zijlstra wrote:
>
> #define HRTIMER_STATE_INACTIVE 0x00
> #define HRTIMER_STATE_ENQUEUED 0x01
> -#define HRTIMER_STATE_CALLBACK 0x02
> -#define HRTIMER_STATE_MIGRATE 0x04
> +#define HRTIMER_STATE_MIGRATE 0x02

Slightly pff-topic, but I do not understand HRTIMER_STATE_MIGRATE
with or without this change. Unless I am totally confused it looks
buggy and simply unneeded.

migrate_hrtimer_list() sets it to keep hrtimer_active() == T, but
this is not enough: this can fool, say, hrtimer_is_queued() in
dequeue_signal().

Can't migrate_hrtimer_list() simply use HRTIMER_STATE_ENQUEUED?
This fixes the race and we can kill STATE_MIGRATE.

Oleg.


--- x/include/linux/hrtimer.h
+++ x/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
* the handling of the timer.
*
* The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
*
* All state transitions are protected by cpu_base->lock.
*/
#define HRTIMER_STATE_INACTIVE 0x00
#define HRTIMER_STATE_ENQUEUED 0x01
#define HRTIMER_STATE_CALLBACK 0x02
-#define HRTIMER_STATE_MIGRATE 0x04

/**
* struct hrtimer - the basic hrtimer structure
--- x/kernel/time/hrtimer.c
+++ x/kernel/time/hrtimer.c
@@ -1642,11 +1642,11 @@ static void migrate_hrtimer_list(struct
debug_deactivate(timer);

/*
- * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -1657,9 +1657,6 @@ static void migrate_hrtimer_list(struct
* event device.
*/
enqueue_hrtimer(timer, new_base);
-
- /* Clear the migration state bit */
- timer->state &= ~HRTIMER_STATE_MIGRATE;
}
}

2015-06-07 22:34:25

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

Not sure I read this patch correctly, it doesn't apply to Linus's tree.

And I simply can not understand the complication in hrtimer_active(),
please help!

On 06/05, Peter Zijlstra wrote:
>
> +bool hrtimer_active(const struct hrtimer *timer)
> +{
> + struct hrtimer_cpu_base *cpu_base;
> + unsigned int seq;
> + bool active;
> +
> + do {
> + active = false;
> + cpu_base = READ_ONCE(timer->base->cpu_base);
> + seq = raw_read_seqcount(&cpu_base->seq);
> +
> + if (timer->state != HRTIMER_STATE_INACTIVE ||
> + cpu_base->running == timer)
> + active = true;

Why we can't simply return true in this case?

Unless you lock this timer, hrtimer_active() is inherently racy anyway.
Granted, it must not wrongly return False if the timer is pending or
running.

But "false positive" does not differ from the case when (say) the
running timer->function() finishes right after hrtimer_active() returns
True.

> + } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> + cpu_base != READ_ONCE(timer->base->cpu_base));

Why do we need to re-check >cpu_base?

I think we can ignore migrate_hrtimer_list(), it doesn't clear ->state.

Otherwise the timer can change its ->base only if it is not running and
inactive, and again I think we should only eliminate the false negative
return.

And I think there is a problem. Consider a timer TIMER which always
rearms itself using some "default" timeout.

In this case __hrtimer_start_range_ns(&TIMER, ...) must preserve
hrtimer_active(&TIMER) == T. By definition, and currently this is
true.

After this patch this is no longer true (afaics). If the timer is
pending but not running, __hrtimer_start_range_ns()->remove_hrtimer()
will clear ENQUEUED first, then set it again in enqueue_hrtimer().

This means that hrtimer_active() returns false in between. And note
that it doesn't matter if the timer changes its ->base or not, so
that 2nd cpu_base above can't help.

I think that __hrtimer_start_range_ns() should preserve ENQUEUED
like migrate_hrtimer_list() should do (see the previous email).


Finally. Suppose that timer->function() returns HRTIMER_RESTART
and hrtimer_active() is called right after __run_hrtimer() sets
cpu_base->running = NULL. I can't understand why hrtimer_active()
can't miss ENQUEUED in this case. We have wmb() in between, yes,
but then hrtimer_active() should do something like

active = cpu_base->running == timer;
if (!active) {
rmb();
active = state != HRTIMER_STATE_INACTIVE;
}

No?

But I am already sleeping and probably totally confused.

Oleg.

2015-06-07 22:57:41

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On 06/08, Oleg Nesterov wrote:
>
> And I simply can not understand the complication in hrtimer_active(),
> please help!

Sorry for another off-topic email, but I don't even understand the
usage of hrtimer_active().

Say, do_nanosleep()

hrtimer_start_expires(&t->timer, mode);
if (!hrtimer_active(&t->timer))
t->task = NULL;

why? Assuming that hrtimer_active() is correct, it can only return
false if t->task was already cleared by hrtimer_wakeup().


OTOH. perf_cpu_hrtimer_restart() does

if (hrtimer_active(hr))
return;

if (!hrtimer_callback_running(hr))
__hrtimer_start_range_ns(...);

why it can't simply do

if (!hrtimer_active(hr)) // implies !hrtimer_callback_running()
__hrtimer_start_range_ns(...);


Confused.

Oleg.

2015-06-08 08:06:30

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Mon, 8 Jun 2015, Oleg Nesterov wrote:

> On 06/08, Oleg Nesterov wrote:
> >
> > And I simply can not understand the complication in hrtimer_active(),
> > please help!
>
> Sorry for another off-topic email, but I don't even understand the
> usage of hrtimer_active().
>
> Say, do_nanosleep()
>
> hrtimer_start_expires(&t->timer, mode);
> if (!hrtimer_active(&t->timer))
> t->task = NULL;

That's gone in tip/timers/core.

> OTOH. perf_cpu_hrtimer_restart() does
>
> if (hrtimer_active(hr))
> return;
>
> if (!hrtimer_callback_running(hr))
> __hrtimer_start_range_ns(...);
>
> why it can't simply do
>
> if (!hrtimer_active(hr)) // implies !hrtimer_callback_running()
> __hrtimer_start_range_ns(...);

That's fixed as well.

Thanks,

tglx

2015-06-08 09:14:34

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Mon, Jun 08, 2015 at 12:33:17AM +0200, Oleg Nesterov wrote:
> Not sure I read this patch correctly, it doesn't apply to Linus's tree.

I was working on tip/master, there's a number of timer patches in there.

> And I simply can not understand the complication in hrtimer_active(),
> please help!
>
> On 06/05, Peter Zijlstra wrote:
> >
> > +bool hrtimer_active(const struct hrtimer *timer)
> > +{
> > + struct hrtimer_cpu_base *cpu_base;
> > + unsigned int seq;
> > + bool active;
> > +
> > + do {
> > + active = false;
> > + cpu_base = READ_ONCE(timer->base->cpu_base);
> > + seq = raw_read_seqcount(&cpu_base->seq);
> > +
> > + if (timer->state != HRTIMER_STATE_INACTIVE ||
> > + cpu_base->running == timer)
> > + active = true;
>
> Why we can't simply return true in this case?
>
> Unless you lock this timer, hrtimer_active() is inherently racy anyway.
> Granted, it must not wrongly return False if the timer is pending or
> running.
>
> But "false positive" does not differ from the case when (say) the
> running timer->function() finishes right after hrtimer_active() returns
> True.

So the problem with:

static inline bool hrtimer_active(const struct timer *timer)
{
return timer->state != HRTIMER_STATE_INACTIVE ||
timer->base->cpu_base->running == timer;
}

Is that is can indeed return false falsely.

Consider __run_hrtimer:

cpu_base->running = timer;
__remove_hrtimer(..., HRTIMER_STATE_INACTIVE, ...);

Our test could observe INACTIVE but not yet see the ->running store. In
this case it would return false, even though it is in fact active.

> > + } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > + cpu_base != READ_ONCE(timer->base->cpu_base));
>
> Why do we need to re-check >cpu_base?

Because each CPU's cpu_base has independent seq numbers, so if the timer
migrates, the seq numbers might align just so that we fail to detect
change, even though there was -- extremely unlikely, but possible.

> I think we can ignore migrate_hrtimer_list(), it doesn't clear ->state.

I'm inclined to agree, although I did not get around to staring at that
on Friday and am currently in the process of waking up.

> Otherwise the timer can change its ->base only if it is not running and
> inactive, and again I think we should only eliminate the false negative
> return.

Agreed.

> And I think there is a problem. Consider a timer TIMER which always
> rearms itself using some "default" timeout.
>
> In this case __hrtimer_start_range_ns(&TIMER, ...) must preserve
> hrtimer_active(&TIMER) == T. By definition, and currently this is
> true.

I must ask for a little clarification; is this the simple:

->function()
hrtimer_forward_now(&self, timeout);
return HRTIMER_RESTART;

Or something that 'randomly' calls:

hrtimer_start() on a timer?

Because for the latter this isn't currently true for the same reason as
you give here:

> After this patch this is no longer true (afaics). If the timer is
> pending but not running, __hrtimer_start_range_ns()->remove_hrtimer()
> will clear ENQUEUED first, then set it again in enqueue_hrtimer().

That is so even with the current code; the current code uses:

hrtimer->state & CALLBACK

for __remove_hrtimer(.state). In the above case of a pending timer,
that's 0 aka. HRTIMER_STATE_INACTIVE.

> This means that hrtimer_active() returns false in between. And note
> that it doesn't matter if the timer changes its ->base or not, so
> that 2nd cpu_base above can't help.
>
> I think that __hrtimer_start_range_ns() should preserve ENQUEUED
> like migrate_hrtimer_list() should do (see the previous email).

I tend to agree, but I think its a pre-existing problem, not one
introduced by my proposed patch.

> Finally. Suppose that timer->function() returns HRTIMER_RESTART
> and hrtimer_active() is called right after __run_hrtimer() sets
> cpu_base->running = NULL. I can't understand why hrtimer_active()
> can't miss ENQUEUED in this case. We have wmb() in between, yes,
> but then hrtimer_active() should do something like
>
> active = cpu_base->running == timer;
> if (!active) {
> rmb();
> active = state != HRTIMER_STATE_INACTIVE;
> }
>
> No?

Hmm, good point. Let me think about that. It would be nice to be able to
avoid more memory barriers.

2015-06-08 10:55:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> On Mon, Jun 08, 2015 at 12:33:17AM +0200, Oleg Nesterov wrote:
> > Not sure I read this patch correctly, it doesn't apply to Linus's tree.
>
> I was working on tip/master, there's a number of timer patches in there.
>
> > And I simply can not understand the complication in hrtimer_active(),
> > please help!
> >
> > On 06/05, Peter Zijlstra wrote:
> > >
> > > +bool hrtimer_active(const struct hrtimer *timer)
> > > +{
> > > + struct hrtimer_cpu_base *cpu_base;
> > > + unsigned int seq;
> > > + bool active;
> > > +
> > > + do {
> > > + active = false;
> > > + cpu_base = READ_ONCE(timer->base->cpu_base);
> > > + seq = raw_read_seqcount(&cpu_base->seq);
> > > +
> > > + if (timer->state != HRTIMER_STATE_INACTIVE ||
> > > + cpu_base->running == timer)
> > > + active = true;
> >
> > Why we can't simply return true in this case?
> >
> > Unless you lock this timer, hrtimer_active() is inherently racy anyway.
> > Granted, it must not wrongly return False if the timer is pending or
> > running.
> >
> > But "false positive" does not differ from the case when (say) the
> > running timer->function() finishes right after hrtimer_active() returns
> > True.

OK I can't read; you asked why delay the return true inside that loop.

Yes we can as per your argument. I think I ended up being too paranoid
or something.

2015-06-08 12:42:55

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > and hrtimer_active() is called right after __run_hrtimer() sets
> > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > but then hrtimer_active() should do something like
> >
> > active = cpu_base->running == timer;
> > if (!active) {
> > rmb();
> > active = state != HRTIMER_STATE_INACTIVE;
> > }
> >
> > No?
>
> Hmm, good point. Let me think about that. It would be nice to be able to
> avoid more memory barriers.

So your scenario is:

[R] seq
RMB
[S] ->state = ACTIVE
WMB
[S] ->running = NULL
[R] ->running (== NULL)
[R] ->state (== INACTIVE; fail to observe
the ->state store due to
lack of order)
RMB
[R] seq (== seq)
[S] seq++

Conversely, if we re-order the (first) seq++ store such that it comes
first:

[S] seq++

[R] seq
RMB
[R] ->running (== NULL)
[S] ->running = timer;
WMB
[S] ->state = INACTIVE
[R] ->state (== INACTIVE)
RMB
[R] seq (== seq)

And we have another false negative.

And in this case we need the read order the other way around, we'd need:

active = timer->state != HRTIMER_STATE_INACTIVE;
if (!active) {
smp_rmb();
active = cpu_base->running == timer;
}

Now I think we can fix this by either doing:

WMB
seq++
WMB

On both sides of __run_hrtimer(), or do

bool hrtimer_active(const struct hrtimer *timer)
{
struct hrtimer_cpu_base *cpu_base;
unsigned int seq;

do {
cpu_base = READ_ONCE(timer->base->cpu_base);
seq = raw_read_seqcount(&cpu_base->seq);

if (timer->state != HRTIMER_STATE_INACTIVE)
return true;

smp_rmb();

if (cpu_base->running == timer)
return true;

smp_rmb();

if (timer->state != HRTIMER_STATE_INACTIVE)
return true;

} while (read_seqcount_retry(&cpu_base->seq, seq) ||
cpu_base != READ_ONCE(timer->base->cpu_base));

return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);


And since __run_hrtimer() is the more performance critical code, I think
it would be best to reduce the amount of memory barriers there.

2015-06-08 14:04:19

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 12:33:17AM +0200, Oleg Nesterov wrote:
>
> static inline bool hrtimer_active(const struct timer *timer)
> {
> return timer->state != HRTIMER_STATE_INACTIVE ||
> timer->base->cpu_base->running == timer;
> }
>
> Is that is can indeed return false falsely.

Yes sure, this is obviously buggy. But again, only because
"false falsely" is wrong. OK, I see other emails from you, lets
discuss this problem there.

> > > + } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > > + cpu_base != READ_ONCE(timer->base->cpu_base));
> >
> > Why do we need to re-check >cpu_base?
>
> Because each CPU's cpu_base has independent seq numbers, so if the timer
> migrates, the seq numbers might align just so that we fail to detect
> change, even though there was -- extremely unlikely, but possible.

Of course. However. I am not sure, but it seems to me that a) seq numbers
can't really help exactly because we ran read the wrong base, and b) we can
rely on QUEUED check. I'll send another email, but see also below.

> > And I think there is a problem. Consider a timer TIMER which always
> > rearms itself using some "default" timeout.
> >
> > In this case __hrtimer_start_range_ns(&TIMER, ...) must preserve
> > hrtimer_active(&TIMER) == T. By definition, and currently this is
> > true.
>
> I must ask for a little clarification; is this the simple:
>
> ->function()
> hrtimer_forward_now(&self, timeout);
> return HRTIMER_RESTART;
>
> Or something that 'randomly' calls:
>
> hrtimer_start() on a timer?

The latter,

> Because for the latter this isn't currently true for the same reason as
> you give here:
>
> > After this patch this is no longer true (afaics). If the timer is
> > pending but not running, __hrtimer_start_range_ns()->remove_hrtimer()
> > will clear ENQUEUED first, then set it again in enqueue_hrtimer().
>
> That is so even with the current code; the current code uses:
>
> hrtimer->state & CALLBACK

Ah, indeed, I misread this code. So the code is already buggy.

> > I think that __hrtimer_start_range_ns() should preserve ENQUEUED
> > like migrate_hrtimer_list() should do (see the previous email).
>
> I tend to agree, but I think its a pre-existing problem, not one
> introduced by my proposed patch.

Yes. If we know that the timer always rearns itself then hrtimer_active()
must be always true. A "random" hrtimer_start() on this timer must not
create an "inactive" window.

So we should probably fix this in any case. And migrate_hrtimer_list()
too. And (it seems) this can help to simplify hrtimer_inactive().

Oleg.

2015-06-08 14:17:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> On Mon, Jun 08, 2015 at 12:33:17AM +0200, Oleg Nesterov wrote:
> > And I think there is a problem. Consider a timer TIMER which always
> > rearms itself using some "default" timeout.
> >
> > In this case __hrtimer_start_range_ns(&TIMER, ...) must preserve
> > hrtimer_active(&TIMER) == T. By definition, and currently this is
> > true.
> >
> > After this patch this is no longer true (afaics). If the timer is
> > pending but not running, __hrtimer_start_range_ns()->remove_hrtimer()
> > will clear ENQUEUED first, then set it again in enqueue_hrtimer().
>
> That is so even with the current code; the current code uses:
>
> hrtimer->state & CALLBACK
>
> for __remove_hrtimer(.state). In the above case of a pending timer,
> that's 0 aka. HRTIMER_STATE_INACTIVE.
>
> > This means that hrtimer_active() returns false in between. And note
> > that it doesn't matter if the timer changes its ->base or not, so
> > that 2nd cpu_base above can't help.
> >
> > I think that __hrtimer_start_range_ns() should preserve ENQUEUED
> > like migrate_hrtimer_list() should do (see the previous email).
>
> I tend to agree, but I think its a pre-existing problem, not one
> introduced by my proposed patch.

Something like this would fix that I think. It fully preserves
timer->state over hrtimer_start_range_ns().

---
kernel/time/hrtimer.c | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)

--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -891,10 +891,10 @@ static void __remove_hrtimer(struct hrti
* remove hrtimer, called with base lock held
*/
static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state;
+ unsigned long state = timer->state;
int reprogram;

/*
@@ -908,12 +908,15 @@ remove_hrtimer(struct hrtimer *timer, st
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- /*
- * We must preserve the CALLBACK state flag here,
- * otherwise we could move the timer base in
- * switch_hrtimer_base.
- */
- state = timer->state & HRTIMER_STATE_CALLBACK;
+
+ if (!restart) {
+ /*
+ * We must preserve the CALLBACK state flag here,
+ * otherwise we could move the timer base in
+ * switch_hrtimer_base.
+ */
+ state &= HRTIMER_STATE_CALLBACK;
+ }
__remove_hrtimer(timer, base, state, reprogram);
return 1;
}
@@ -938,7 +941,7 @@ void hrtimer_start_range_ns(struct hrtim
base = lock_hrtimer_base(timer, &flags);

/* Remove an active timer from the queue: */
- remove_hrtimer(timer, base);
+ remove_hrtimer(timer, base, true);

if (mode & HRTIMER_MODE_REL) {
tim = ktime_add_safe(tim, base->get_time());
@@ -1007,7 +1010,7 @@ int hrtimer_try_to_cancel(struct hrtimer
base = lock_hrtimer_base(timer, &flags);

if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base);
+ ret = remove_hrtimer(timer, base, false);

unlock_hrtimer_base(timer, &flags);

2015-06-08 14:28:57

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > > and hrtimer_active() is called right after __run_hrtimer() sets
> > > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > > but then hrtimer_active() should do something like
> > >
> > > active = cpu_base->running == timer;
> > > if (!active) {
> > > rmb();
> > > active = state != HRTIMER_STATE_INACTIVE;
> > > }
> > >
> > > No?
> >
> > Hmm, good point. Let me think about that. It would be nice to be able to
> > avoid more memory barriers.

Yes, but otoh, can't we avoid seqcount_t altogether?

To remind, we assume that

- "false positive" is fine. If we observe ENQUEUED or ->running
we can safely return true. It doesn't matter if the timer becomes
"inactive" right after return.

- we need to fix migrate_hrtimer_list() and __hrtimer_start_range_ns()
to preserve ENQUEUED. This fixes the races with hrtimer_is_queued()
and hrtimer_active() we currently have.

Now, can't we simply do

__run_hrtimer()
{

cpu_base->running = timer;

wmb(); // 1

__remove_hrtimer(INACTIVE); // clears ENQUEUED

fn(); // autorearm can set ENQUEUED again

wmb(); // 2

cpu_base->running = NULL; // XXX
}

hrtimer_active(timer)
{
if (timer->state & ENQUEUED)
return true;

rmb(); // pairs with 1


// We do not care if we race with __hrtimer_start_range_ns().
// The running timer can't change its base.
// If it was ENQUEUED, we rely on the previous check.

base = timer->base->cpu_base;
read_barrier_depends();
if (base->running == timer)
return true;

rmb(); // pairs with 2

// Avoid the race with auto-rearming timer. If we see the
// result of XXX above we should also see ENQUEUED if it
// was set by __run_hrtimer() or timer->function().
//
// We do not care if another thread does hrtimer_start()
// and we miss ENQUEUED. In this case we can the "inactive"
// window anyway, we can pretend that hrtimer_start() was
// called after XXX above. So we can equally pretend that
// hrtimer_active() was called in this window.
//
if (timer->state & ENQUEUED)
return true;

return false;
}

Most probably I missed something... I'll try to think more, but perhaps
you see a hole immediately?

Oleg.

2015-06-08 14:42:51

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Mon, Jun 08, 2015 at 04:27:49PM +0200, Oleg Nesterov wrote:
> On 06/08, Peter Zijlstra wrote:
> >
> > On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > > > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > > > and hrtimer_active() is called right after __run_hrtimer() sets
> > > > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > > > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > > > but then hrtimer_active() should do something like
> > > >
> > > > active = cpu_base->running == timer;
> > > > if (!active) {
> > > > rmb();
> > > > active = state != HRTIMER_STATE_INACTIVE;
> > > > }
> > > >
> > > > No?
> > >
> > > Hmm, good point. Let me think about that. It would be nice to be able to
> > > avoid more memory barriers.
>
> Yes, but otoh, can't we avoid seqcount_t altogether?
>
> To remind, we assume that
>
> - "false positive" is fine. If we observe ENQUEUED or ->running
> we can safely return true. It doesn't matter if the timer becomes
> "inactive" right after return.
>
> - we need to fix migrate_hrtimer_list() and __hrtimer_start_range_ns()
> to preserve ENQUEUED. This fixes the races with hrtimer_is_queued()
> and hrtimer_active() we currently have.
>
> Now, can't we simply do
>
> __run_hrtimer()
> {
>
> cpu_base->running = timer;
>
> wmb(); // 1
>
> __remove_hrtimer(INACTIVE); // clears ENQUEUED
>
> fn(); // autorearm can set ENQUEUED again
>
> wmb(); // 2
>
> cpu_base->running = NULL; // XXX
> }
>
> hrtimer_active(timer)
> {
> if (timer->state & ENQUEUED)
> return true;
>
> rmb(); // pairs with 1
>
>
> // We do not care if we race with __hrtimer_start_range_ns().
> // The running timer can't change its base.
> // If it was ENQUEUED, we rely on the previous check.
>
> base = timer->base->cpu_base;
> read_barrier_depends();
> if (base->running == timer)
> return true;
>
> rmb(); // pairs with 2
>
> // Avoid the race with auto-rearming timer. If we see the
> // result of XXX above we should also see ENQUEUED if it
> // was set by __run_hrtimer() or timer->function().
> //
> // We do not care if another thread does hrtimer_start()
> // and we miss ENQUEUED. In this case we can the "inactive"
> // window anyway, we can pretend that hrtimer_start() was
> // called after XXX above. So we can equally pretend that
> // hrtimer_active() was called in this window.
> //
> if (timer->state & ENQUEUED)
> return true;
>
> return false;
> }
>
> Most probably I missed something... I'll try to think more, but perhaps
> you see a hole immediately?

This is something I proposed earlier; Kirill said:

lkml.kernel.org/r/[email protected]

Which I read like the below, imagine our timer expires periodically and
rearms itself:

acquire
cpu_base->running = timer;
wmb
timer->state = INACTIVE;
release
[R] timer->state (== INACTIVE)
fn()
acquire
timer->state = ACTIVE
wmb
cpu_base->running = NULL
release

[R] cpu_base->running (== NULL)

acquire
cpu_base->running = timer;
wmb
timer->state = INACTIVE;
release

[R] timer->state (== INACTIVE)
fn()
acquire
timer->state = ACTIVE
wmb
cpu_base->running = NULL
release


And we have a false negative.

2015-06-08 15:10:14

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Mon, Jun 08, 2015 at 04:27:49PM +0200, Oleg Nesterov wrote:
> - we need to fix migrate_hrtimer_list() and __hrtimer_start_range_ns()
> to preserve ENQUEUED. This fixes the races with hrtimer_is_queued()
> and hrtimer_active() we currently have.

So I have your patch and the one I just send out; together they should
do this.

Can I add your SoB to your patch?

2015-06-08 15:11:49

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes

On 06/08, Peter Zijlstra wrote:
>
> > I tend to agree, but I think its a pre-existing problem, not one
> > introduced by my proposed patch.
>
> Something like this would fix that I think. It fully preserves
> timer->state over hrtimer_start_range_ns().

Yes, but I think we can do a bit better.

Only for initial review, I need to re-check this...

And. I think that after you remove STATE_CALLBACK we can even kill
timer->state altogether, but this is another story.

What do you think?

Oleg.

2015-06-08 15:12:07

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued()

migrate_hrtimer_list() sets HRTIMER_STATE_MIGRATE to avoid the
race with hrtimer_active(), but this can't avoid the same race
with hrtimer_is_queued().

Kill HRTIMER_STATE_MIGRATE, we can simply use STATE_ENQUEUED.
---
include/linux/hrtimer.h | 6 +-----
kernel/time/hrtimer.c | 7 ++-----
2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 05f6df1..44810bc 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
* the handling of the timer.
*
* The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
*
* All state transitions are protected by cpu_base->lock.
*/
#define HRTIMER_STATE_INACTIVE 0x00
#define HRTIMER_STATE_ENQUEUED 0x01
#define HRTIMER_STATE_CALLBACK 0x02
-#define HRTIMER_STATE_MIGRATE 0x04

/**
* struct hrtimer - the basic hrtimer structure
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 76d4bd9..005fd44 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1640,11 +1640,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
debug_deactivate(timer);

/*
- * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -1655,9 +1655,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* event device.
*/
enqueue_hrtimer(timer, new_base);
-
- /* Clear the migration state bit */
- timer->state &= ~HRTIMER_STATE_MIGRATE;
}
}

--
1.5.5.1

2015-06-08 15:12:43

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 2/3] hrtimer: turn newstate arg of __remove_hrtimer() into clear_enqueued

No functional changes. Change __remove_hrtimer() to accept the boolean
and change only the HRTIMER_STATE_ENQUEUED bit. This preserves
HRTIMER_STATE_CALLBACK (which we are going to kill) automatically, the
only complication is that __run_hrtimer() should set it by hand.
---
kernel/time/hrtimer.c | 19 +++++++++----------
1 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 005fd44..5fceb3d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -871,7 +871,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
*/
static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
- unsigned long newstate, int reprogram)
+ bool clear_enqueued, int reprogram)
{
struct timerqueue_node *next_timer;
if (!(timer->state & HRTIMER_STATE_ENQUEUED))
@@ -895,7 +895,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
if (!timerqueue_getnext(&base->active))
base->cpu_base->active_bases &= ~(1 << base->index);
out:
- timer->state = newstate;
+ if (clear_enqueued)
+ timer->state &= ~HRTIMER_STATE_ENQUEUED;
}

/*
@@ -905,7 +906,6 @@ static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state;
int reprogram;

/*
@@ -920,12 +920,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
/*
- * We must preserve the CALLBACK state flag here,
- * otherwise we could move the timer base in
- * switch_hrtimer_base.
+ * This preserves the CALLBACK flag, otherwise we could move
+ * the timer base in switch_hrtimer_base.
*/
- state = timer->state & HRTIMER_STATE_CALLBACK;
- __remove_hrtimer(timer, base, state, reprogram);
+ __remove_hrtimer(timer, base, true, reprogram);
return 1;
}
return 0;
@@ -1204,7 +1202,8 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
WARN_ON(!irqs_disabled());

debug_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ timer->state |= HRTIMER_STATE_CALLBACK;
+ __remove_hrtimer(timer, base, true, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;

@@ -1644,7 +1643,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+ __remove_hrtimer(timer, old_base, false, 0);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
--
1.5.5.1

2015-06-08 15:13:04

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 3/3] hrtimer: fix the __hrtimer_start_range_ns() race with hrtimer_active()

Change __hrtimer_start_range_ns() to preserve ENQUEUED.
---
kernel/time/hrtimer.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5fceb3d..1761f96 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -902,8 +902,8 @@ out:
/*
* remove hrtimer, called with base lock held
*/
-static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+static inline int remove_hrtimer(struct hrtimer *timer,
+ struct hrtimer_clock_base *base, bool clear_enqueued)
{
if (hrtimer_is_queued(timer)) {
int reprogram;
@@ -923,7 +923,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
* This preserves the CALLBACK flag, otherwise we could move
* the timer base in switch_hrtimer_base.
*/
- __remove_hrtimer(timer, base, true, reprogram);
+ __remove_hrtimer(timer, base, clear_enqueued, reprogram);
return 1;
}
return 0;
@@ -940,7 +940,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
base = lock_hrtimer_base(timer, &flags);

/* Remove an active timer from the queue: */
- ret = remove_hrtimer(timer, base);
+ ret = remove_hrtimer(timer, base, false);

if (mode & HRTIMER_MODE_REL) {
tim = ktime_add_safe(tim, base->get_time());
@@ -1061,7 +1061,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
base = lock_hrtimer_base(timer, &flags);

if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base);
+ ret = remove_hrtimer(timer, base, true);

unlock_hrtimer_base(timer, &flags);

--
1.5.5.1

2015-06-08 15:13:38

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued()

migrate_hrtimer_list() sets HRTIMER_STATE_MIGRATE to avoid the
race with hrtimer_active(), but this can't avoid the same race
with hrtimer_is_queued().

Kill HRTIMER_STATE_MIGRATE, we can simply use STATE_ENQUEUED.
---
include/linux/hrtimer.h | 6 +-----
kernel/time/hrtimer.c | 7 ++-----
2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 05f6df1..44810bc 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
* the handling of the timer.
*
* The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
*
* All state transitions are protected by cpu_base->lock.
*/
#define HRTIMER_STATE_INACTIVE 0x00
#define HRTIMER_STATE_ENQUEUED 0x01
#define HRTIMER_STATE_CALLBACK 0x02
-#define HRTIMER_STATE_MIGRATE 0x04

/**
* struct hrtimer - the basic hrtimer structure
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 76d4bd9..005fd44 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1640,11 +1640,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
debug_deactivate(timer);

/*
- * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -1655,9 +1655,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* event device.
*/
enqueue_hrtimer(timer, new_base);
-
- /* Clear the migration state bit */
- timer->state &= ~HRTIMER_STATE_MIGRATE;
}
}

--
1.5.5.1

2015-06-08 15:14:39

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 1/3] hrtimer: kill HRTIMER_STATE_MIGRATE, fix the race with hrtimer_is_queued()

Sorry, for double/wrong posting. Please ignore this one, I've resent
it in reply to 0/3.

On 06/08, Oleg Nesterov wrote:
>
> migrate_hrtimer_list() sets HRTIMER_STATE_MIGRATE to avoid the
> race with hrtimer_active(), but this can't avoid the same race
> with hrtimer_is_queued().
>
> Kill HRTIMER_STATE_MIGRATE, we can simply use STATE_ENQUEUED.
> ---
> include/linux/hrtimer.h | 6 +-----
> kernel/time/hrtimer.c | 7 ++-----
> 2 files changed, 3 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
> index 05f6df1..44810bc 100644
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -70,17 +70,13 @@ enum hrtimer_restart {
> * the handling of the timer.
> *
> * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
> - * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
> - * also affects HRTIMER_STATE_MIGRATE where the preservation is not
> - * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
> - * enqueued on the new cpu.
> + * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
> *
> * All state transitions are protected by cpu_base->lock.
> */
> #define HRTIMER_STATE_INACTIVE 0x00
> #define HRTIMER_STATE_ENQUEUED 0x01
> #define HRTIMER_STATE_CALLBACK 0x02
> -#define HRTIMER_STATE_MIGRATE 0x04
>
> /**
> * struct hrtimer - the basic hrtimer structure
> diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
> index 76d4bd9..005fd44 100644
> --- a/kernel/time/hrtimer.c
> +++ b/kernel/time/hrtimer.c
> @@ -1640,11 +1640,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
> debug_deactivate(timer);
>
> /*
> - * Mark it as STATE_MIGRATE not INACTIVE otherwise the
> + * Mark it as ENQUEUED not INACTIVE otherwise the
> * timer could be seen as !active and just vanish away
> * under us on another CPU
> */
> - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
> + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
> timer->base = new_base;
> /*
> * Enqueue the timers on the new cpu. This does not
> @@ -1655,9 +1655,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
> * event device.
> */
> enqueue_hrtimer(timer, new_base);
> -
> - /* Clear the migration state bit */
> - timer->state &= ~HRTIMER_STATE_MIGRATE;
> }
> }
>
> --
> 1.5.5.1
>

2015-06-08 15:17:40

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 04:27:49PM +0200, Oleg Nesterov wrote:
> > - we need to fix migrate_hrtimer_list() and __hrtimer_start_range_ns()
> > to preserve ENQUEUED. This fixes the races with hrtimer_is_queued()
> > and hrtimer_active() we currently have.
>
> So I have your patch and the one I just send out; together they should
> do this.
>
> Can I add your SoB to your patch?

Yes sure, thanks, but could you look at 1-3 I just sent?

1/3 is the same migrate_hrtimer_list() fix and it looks trivial. I need
to re-check 2-3.

Oleg.

2015-06-08 15:36:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes

On Mon, 2015-06-08 at 17:10 +0200, Oleg Nesterov wrote:
> On 06/08, Peter Zijlstra wrote:
> >
> > > I tend to agree, but I think its a pre-existing problem, not one
> > > introduced by my proposed patch.
> >
> > Something like this would fix that I think. It fully preserves
> > timer->state over hrtimer_start_range_ns().
>
> Yes, but I think we can do a bit better.
>
> Only for initial review, I need to re-check this...

I'm having a wee bit of bother spotting how you version is 'better'.

> And. I think that after you remove STATE_CALLBACK we can even kill
> timer->state altogether, but this is another story.

Ah, yes, we could introduce timerqueue_is_queued() which uses
RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.

2015-06-08 15:50:28

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 04:27:49PM +0200, Oleg Nesterov wrote:
> >
> > Most probably I missed something... I'll try to think more, but perhaps
> > you see a hole immediately?
>
> This is something I proposed earlier; Kirill said:
>
> lkml.kernel.org/r/[email protected]
>
> Which I read like the below, imagine our timer expires periodically and
> rearms itself:
>
> acquire
> cpu_base->running = timer;
> wmb
> timer->state = INACTIVE;
> release
> [R] timer->state (== INACTIVE)
> fn()
> acquire
> timer->state = ACTIVE
> wmb
> cpu_base->running = NULL
> release
>
> [R] cpu_base->running (== NULL)
>
> acquire
> cpu_base->running = timer;
> wmb
> timer->state = INACTIVE;
> release
>
> [R] timer->state (== INACTIVE)

Damn yes. Thanks Kirill and Peter.

And I swear, I swear I was thinking about this race yesterday but
forgot this problem today ;)

Yes it seems that we can't avoid the seq counter. But perhaps we
can increment/check it once in run_hrtimer/hrtimer_inactive ...
I'll try to think.

Thanks!

Oleg.

2015-06-08 15:57:41

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes

On 06/08, Peter Zijlstra wrote:
>
> On Mon, 2015-06-08 at 17:10 +0200, Oleg Nesterov wrote:
> > On 06/08, Peter Zijlstra wrote:
> > >
> > > > I tend to agree, but I think its a pre-existing problem, not one
> > > > introduced by my proposed patch.
> > >
> > > Something like this would fix that I think. It fully preserves
> > > timer->state over hrtimer_start_range_ns().
> >
> > Yes, but I think we can do a bit better.
> >
> > Only for initial review, I need to re-check this...
>
> I'm having a wee bit of bother spotting how you version is 'better'.
>
> > And. I think that after you remove STATE_CALLBACK we can even kill
> > timer->state altogether, but this is another story.
>
> Ah, yes, we could introduce timerqueue_is_queued() which uses
> RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.

Yes exactly.

And to me 2/3 looks like a cleanup in any case, __remove_hrtimer()
should do nothing with other bits. Yes,

timer->state |= HRTIMER_STATE_CALLBACK;
__remove_hrtimer(timer, base, true, 0);

in __run_hrtimer() looks worse than __remove_hrtimer(CALLBACK), but
you are going to kill STATE_CALLBACK. And this should even simplify
your patch a little bit.

But I agree, this is minor and subjective.

Oleg.

2015-06-08 17:11:57

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes

On Mon, 8 Jun 2015, Peter Zijlstra wrote:
> On Mon, 2015-06-08 at 17:10 +0200, Oleg Nesterov wrote:
> > On 06/08, Peter Zijlstra wrote:
> > >
> > > > I tend to agree, but I think its a pre-existing problem, not one
> > > > introduced by my proposed patch.
> > >
> > > Something like this would fix that I think. It fully preserves
> > > timer->state over hrtimer_start_range_ns().
> >
> > Yes, but I think we can do a bit better.
> >
> > Only for initial review, I need to re-check this...
>
> I'm having a wee bit of bother spotting how you version is 'better'.
>
> > And. I think that after you remove STATE_CALLBACK we can even kill
> > timer->state altogether, but this is another story.
>
> Ah, yes, we could introduce timerqueue_is_queued() which uses
> RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.

Which won't work for the migration case unless we have some trickery
like we do with double linked lists (not setting the prev member to
NULL on dequeue).

Thanks,

tglx

2015-06-08 19:09:09

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes

On Mon, 2015-06-08 at 19:11 +0200, Thomas Gleixner wrote:

> > Ah, yes, we could introduce timerqueue_is_queued() which uses
> > RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.
>
> Which won't work for the migration case unless we have some trickery
> like we do with double linked lists (not setting the prev member to
> NULL on dequeue).

Yeah, that dawned on me while away from the computer.

2015-06-08 20:53:52

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 0/3] hrtimer: HRTIMER_STATE_ fixes

On 06/08, Thomas Gleixner wrote:
>
> On Mon, 8 Jun 2015, Peter Zijlstra wrote:
> > On Mon, 2015-06-08 at 17:10 +0200, Oleg Nesterov wrote:
> > > On 06/08, Peter Zijlstra wrote:
> > > >
> > > > > I tend to agree, but I think its a pre-existing problem, not one
> > > > > introduced by my proposed patch.
> > > >
> > > > Something like this would fix that I think. It fully preserves
> > > > timer->state over hrtimer_start_range_ns().
> > >
> > > Yes, but I think we can do a bit better.
> > >
> > > Only for initial review, I need to re-check this...
> >
> > I'm having a wee bit of bother spotting how you version is 'better'.
> >
> > > And. I think that after you remove STATE_CALLBACK we can even kill
> > > timer->state altogether, but this is another story.
> >
> > Ah, yes, we could introduce timerqueue_is_queued() which uses
> > RB_EMPTY_NODE(). Obviating the need for hrtimer::state entirely.
>
> Which won't work for the migration case unless we have some trickery
> like we do with double linked lists (not setting the prev member to
> NULL on dequeue).

Of course, but this is trivial, no? Nevermind, I could easily miss
somthing and right now this is off-topic.

What do you think about this series? To me it makes sense in any case.
But I need (at least)to update the changelogs. In particular 3/3 doesn't
explain why do we need this change. If you missed the previous discussion,
this (hopefully) fixes the problem with the auto-rearming timers, the
"random" hrtimer_restart() wrongly creates a window when this timer looks
as !hrtimer_inactive().




Peter, I tried to think again about ->running and all I can say is that
I am totally confused ;) I'll try to write another email tomorrow.

Oleg.

2015-06-09 21:34:33

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On 06/08, Peter Zijlstra wrote:
>
> On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > > and hrtimer_active() is called right after __run_hrtimer() sets
> > > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > > but then hrtimer_active() should do something like
> > >
> > > active = cpu_base->running == timer;
> > > if (!active) {
> > > rmb();
> > > active = state != HRTIMER_STATE_INACTIVE;
> > > }
> > >
> > > No?
> >
> > Hmm, good point. Let me think about that. It would be nice to be able to
> > avoid more memory barriers.
>
> So your scenario is:
>
> [R] seq
> RMB
> [S] ->state = ACTIVE
> WMB
> [S] ->running = NULL
> [R] ->running (== NULL)
> [R] ->state (== INACTIVE; fail to observe
> the ->state store due to
> lack of order)
> RMB
> [R] seq (== seq)
> [S] seq++
>
> Conversely, if we re-order the (first) seq++ store such that it comes
> first:
>
> [S] seq++
>
> [R] seq
> RMB
> [R] ->running (== NULL)
> [S] ->running = timer;
> WMB
> [S] ->state = INACTIVE
> [R] ->state (== INACTIVE)
> RMB
> [R] seq (== seq)
>
> And we have another false negative.
>
> And in this case we need the read order the other way around, we'd need:
>
> active = timer->state != HRTIMER_STATE_INACTIVE;
> if (!active) {
> smp_rmb();
> active = cpu_base->running == timer;
> }
>
> Now I think we can fix this by either doing:
>
> WMB
> seq++
> WMB
>
> On both sides of __run_hrtimer(), or do
>
> bool hrtimer_active(const struct hrtimer *timer)
> {
> struct hrtimer_cpu_base *cpu_base;
> unsigned int seq;
>
> do {
> cpu_base = READ_ONCE(timer->base->cpu_base);
> seq = raw_read_seqcount(&cpu_base->seq);
>
> if (timer->state != HRTIMER_STATE_INACTIVE)
> return true;
>
> smp_rmb();
>
> if (cpu_base->running == timer)
> return true;
>
> smp_rmb();
>
> if (timer->state != HRTIMER_STATE_INACTIVE)
> return true;
>
> } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> cpu_base != READ_ONCE(timer->base->cpu_base));
>
> return false;
> }

You know, I simply can't convince myself I understand why this code
correct... or not.

But contrary to what I said before, I agree that we need to recheck
timer->base. This probably needs more discussion, to me it is very
unobvious why we can trust this cpu_base != READ_ONCE() check. Yes,
we have a lot of barriers, but they do not pair with each other. Lets
ignore this for now.

> And since __run_hrtimer() is the more performance critical code, I think
> it would be best to reduce the amount of memory barriers there.

Yes, but wmb() is cheap on x86... Perhaps we can make this code
"obviously correct" ?


How about the following..... We add cpu_base->seq as before but
limit its "write" scope so that we cam use the regular read/retry.

So,

hrtimer_active(timer)
{

do {
base = READ_ONCE(timer->base->cpu_base);
seq = read_seqcount_begin(&cpu_base->seq);

if (timer->state & ENQUEUED ||
base->running == timer)
return true;

} while (read_seqcount_retry(&cpu_base->seq, seq) ||
base != READ_ONCE(timer->base->cpu_base));

return false;
}

And we need to avoid the races with 2 transitions in __run_hrtimer().

The first race is trivial, we change __run_hrtimer() to do

write_seqcount_begin(cpu_base->seq);
cpu_base->running = timer;
__remove_hrtimer(timer); // clears ENQUEUED
write_seqcount_end(cpu_base->seq);

and hrtimer_active() obviously can't race with this section.

Then we change enqueue_hrtimer()


+ bool need_lock = base->cpu_base->running == timer;
+ if (need_lock)
+ write_seqcount_begin(cpu_base->seq);
+
timer->state |= HRTIMER_STATE_ENQUEUED;
+
+ if (need_lock)
+ write_seqcount_end(cpu_base->seq);


Now. If the timer is re-queued by the time __run_hrtimer() clears
->running we have the following sequence:

write_seqcount_begin(cpu_base->seq);
timer->state |= HRTIMER_STATE_ENQUEUED;
write_seqcount_end(cpu_base->seq);

base->running = NULL;

and I think this should equally work, because in this case we do not
care if hrtimer_active() misses "running = NULL".

Yes, we only have this 2nd write_seqcount_begin/end if the timer re-
arms itself, but otherwise we do not race. If another thread does
hrtime_start() in between we can pretend that hrtimer_active() hits
the "inactive".

What do you think?


And. Note that we can rewrite these 2 "write" critical sections in
__run_hrtimer() and enqueue_hrtimer() as

cpu_base->running = timer;

write_seqcount_begin(cpu_base->seq);
write_seqcount_end(cpu_base->seq);

__remove_hrtimer(timer);

and

timer->state |= HRTIMER_STATE_ENQUEUED;

write_seqcount_begin(cpu_base->seq);
write_seqcount_end(cpu_base->seq);

base->running = NULL;

So we can probably use write_seqcount_barrier() except I am not sure
about the 2nd wmb...

Oleg.

2015-06-09 21:40:44

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

Damn,

On 06/09, Oleg Nesterov wrote:
>
> Yes, we only have this 2nd write_seqcount_begin/end if the timer re-
> arms itself, but otherwise we do not race. If another thread does
> hrtime_start() in between we can pretend that hrtimer_active() hits
> the "inactive".
^^^^^^^^^^^^^^^

I meant,

hits the "inactive" window we can't avoid. Even if another
thread does this _before_ __run_hrtimer() clears ->running,
we can't distinguish from the case when it does this right
_after_ the timer becomes "inactive".

Oleg.

2015-06-10 06:55:34

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Tue, 2015-06-09 at 23:33 +0200, Oleg Nesterov wrote:
>
> Yes, but wmb() is cheap on x86... Perhaps we can make this code
> "obviously correct" ?

I'll reply to the rest a bit later; got to run some errands first.

The 'problem' of course is ARM/PPC etc.. we would like to keep this
generic code fast, even for the loosely ordered machines.

And wmb() is certainly not cheap for them; although not as bad as a full
barrier.

2015-06-10 07:47:04

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

Hi, Oleg,

В Вт, 09/06/2015 в 23:33 +0200, Oleg Nesterov пишет:
> On 06/08, Peter Zijlstra wrote:
> >
> > On Mon, Jun 08, 2015 at 11:14:17AM +0200, Peter Zijlstra wrote:
> > > > Finally. Suppose that timer->function() returns HRTIMER_RESTART
> > > > and hrtimer_active() is called right after __run_hrtimer() sets
> > > > cpu_base->running = NULL. I can't understand why hrtimer_active()
> > > > can't miss ENQUEUED in this case. We have wmb() in between, yes,
> > > > but then hrtimer_active() should do something like
> > > >
> > > > active = cpu_base->running == timer;
> > > > if (!active) {
> > > > rmb();
> > > > active = state != HRTIMER_STATE_INACTIVE;
> > > > }
> > > >
> > > > No?
> > >
> > > Hmm, good point. Let me think about that. It would be nice to be able to
> > > avoid more memory barriers.
> >
> > So your scenario is:
> >
> > [R] seq
> > RMB
> > [S] ->state = ACTIVE
> > WMB
> > [S] ->running = NULL
> > [R] ->running (== NULL)
> > [R] ->state (== INACTIVE; fail to observe
> > the ->state store due to
> > lack of order)
> > RMB
> > [R] seq (== seq)
> > [S] seq++
> >
> > Conversely, if we re-order the (first) seq++ store such that it comes
> > first:
> >
> > [S] seq++
> >
> > [R] seq
> > RMB
> > [R] ->running (== NULL)
> > [S] ->running = timer;
> > WMB
> > [S] ->state = INACTIVE
> > [R] ->state (== INACTIVE)
> > RMB
> > [R] seq (== seq)
> >
> > And we have another false negative.
> >
> > And in this case we need the read order the other way around, we'd need:
> >
> > active = timer->state != HRTIMER_STATE_INACTIVE;
> > if (!active) {
> > smp_rmb();
> > active = cpu_base->running == timer;
> > }
> >
> > Now I think we can fix this by either doing:
> >
> > WMB
> > seq++
> > WMB
> >
> > On both sides of __run_hrtimer(), or do
> >
> > bool hrtimer_active(const struct hrtimer *timer)
> > {
> > struct hrtimer_cpu_base *cpu_base;
> > unsigned int seq;
> >
> > do {
> > cpu_base = READ_ONCE(timer->base->cpu_base);
> > seq = raw_read_seqcount(&cpu_base->seq);
> >
> > if (timer->state != HRTIMER_STATE_INACTIVE)
> > return true;
> >
> > smp_rmb();
> >
> > if (cpu_base->running == timer)
> > return true;
> >
> > smp_rmb();
> >
> > if (timer->state != HRTIMER_STATE_INACTIVE)
> > return true;
> >
> > } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > cpu_base != READ_ONCE(timer->base->cpu_base));
> >
> > return false;
> > }
>
> You know, I simply can't convince myself I understand why this code
> correct... or not.
>
> But contrary to what I said before, I agree that we need to recheck
> timer->base. This probably needs more discussion, to me it is very
> unobvious why we can trust this cpu_base != READ_ONCE() check. Yes,
> we have a lot of barriers, but they do not pair with each other. Lets
> ignore this for now.
>
> > And since __run_hrtimer() is the more performance critical code, I think
> > it would be best to reduce the amount of memory barriers there.
>
> Yes, but wmb() is cheap on x86... Perhaps we can make this code
> "obviously correct" ?
>
>
> How about the following..... We add cpu_base->seq as before but
> limit its "write" scope so that we cam use the regular read/retry.
>
> So,
>
> hrtimer_active(timer)
> {
>
> do {
> base = READ_ONCE(timer->base->cpu_base);
> seq = read_seqcount_begin(&cpu_base->seq);
>
> if (timer->state & ENQUEUED ||
> base->running == timer)
> return true;
>
> } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> base != READ_ONCE(timer->base->cpu_base));
>
> return false;
> }
>
> And we need to avoid the races with 2 transitions in __run_hrtimer().
>
> The first race is trivial, we change __run_hrtimer() to do
>
> write_seqcount_begin(cpu_base->seq);
> cpu_base->running = timer;
> __remove_hrtimer(timer); // clears ENQUEUED
> write_seqcount_end(cpu_base->seq);

We use seqcount, because we are afraid that hrtimer_active() may miss
timer->state or cpu_base->running, when we are clearing it.

If we use two pairs of write_seqcount_{begin,end} in __run_hrtimer(),
we may protect only the places where we do that:

cpu_base->running = timer;
write_seqcount_begin(cpu_base->seq);
__remove_hrtimer(timer); // clears ENQUEUED
write_seqcount_end(cpu_base->seq);

....

timer->state |= HRTIMER_STATE_ENQUEUED;
write_seqcount_begin(cpu_base->seq);
base->running = NULL;
write_seqcount_end(cpu_base->seq);

>
> and hrtimer_active() obviously can't race with this section.
>
> Then we change enqueue_hrtimer()
>
>
> + bool need_lock = base->cpu_base->running == timer;
> + if (need_lock)
> + write_seqcount_begin(cpu_base->seq);
> +
> timer->state |= HRTIMER_STATE_ENQUEUED;
> +
> + if (need_lock)
> + write_seqcount_end(cpu_base->seq);
>
>
> Now. If the timer is re-queued by the time __run_hrtimer() clears
> ->running we have the following sequence:
>
> write_seqcount_begin(cpu_base->seq);
> timer->state |= HRTIMER_STATE_ENQUEUED;
> write_seqcount_end(cpu_base->seq);
>
> base->running = NULL;
>
> and I think this should equally work, because in this case we do not
> care if hrtimer_active() misses "running = NULL".
>
> Yes, we only have this 2nd write_seqcount_begin/end if the timer re-
> arms itself, but otherwise we do not race. If another thread does
> hrtime_start() in between we can pretend that hrtimer_active() hits
> the "inactive".
>
> What do you think?
>
>
> And. Note that we can rewrite these 2 "write" critical sections in
> __run_hrtimer() and enqueue_hrtimer() as
>
> cpu_base->running = timer;
>
> write_seqcount_begin(cpu_base->seq);
> write_seqcount_end(cpu_base->seq);
>
> __remove_hrtimer(timer);
>
> and
>
> timer->state |= HRTIMER_STATE_ENQUEUED;
>
> write_seqcount_begin(cpu_base->seq);
> write_seqcount_end(cpu_base->seq);
>
> base->running = NULL;
>
> So we can probably use write_seqcount_barrier() except I am not sure
> about the 2nd wmb...

2015-06-10 15:51:22

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

Forgot to mention...

On 06/09, Oleg Nesterov wrote:
>
> And. Note that we can rewrite these 2 "write" critical sections in
> __run_hrtimer() and enqueue_hrtimer() as
>
> cpu_base->running = timer;
>
> write_seqcount_begin(cpu_base->seq);
> write_seqcount_end(cpu_base->seq);
>
> __remove_hrtimer(timer);
>
> and
>
> timer->state |= HRTIMER_STATE_ENQUEUED;
>
> write_seqcount_begin(cpu_base->seq);
> write_seqcount_end(cpu_base->seq);
>
> base->running = NULL;
>
> So we can probably use write_seqcount_barrier() except I am not sure
> about the 2nd wmb...

Or we can change hrtimer_active() to use raw_read_seqcount() likes your
patch does (avoid odd/even games) and then __run_hrtimer() can use
raw_write_seqcount_latch(), it already has wmb's on both sides.

I am not even sure we need the "if (base->cpu_base->running == timer)"
optimization for the 2nd _latch(), we cando this unconditionally in
__run_hrtimer().

And in this case the code will look very much like your patch, but
imo at the same it will look much more understandable. Because, again,
this way we just add 2 critical "write" sections (barriers) into
__run_hrtimer(), no need to explain the usage of memory barriers, etc,
we can simply rely on acquire/release semantics of seqcount_t.

But yes. This adds 2 additional wmb's into run_hrtimer(). Again, we
can make the 2nd _latch() condtitional, we only need it if the timer
requeues itself, but I am not sure.



Finally. You know, I am afraid very much I confused myself completely
and now I am trying to confuse you and Kirill ;)

Oleg.

2015-06-10 16:05:51

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

Hi Kirill,

On 06/10, Kirill Tkhai wrote:
>
> В Вт, 09/06/2015 в 23:33 +0200, Oleg Nesterov пишет:
> >
> > hrtimer_active(timer)
> > {
> >
> > do {
> > base = READ_ONCE(timer->base->cpu_base);
> > seq = read_seqcount_begin(&cpu_base->seq);
> >
> > if (timer->state & ENQUEUED ||
> > base->running == timer)
> > return true;
> >
> > } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > base != READ_ONCE(timer->base->cpu_base));
> >
> > return false;
> > }
> >
> > And we need to avoid the races with 2 transitions in __run_hrtimer().
> >
> > The first race is trivial, we change __run_hrtimer() to do
> >
> > write_seqcount_begin(cpu_base->seq);
> > cpu_base->running = timer;
> > __remove_hrtimer(timer); // clears ENQUEUED
> > write_seqcount_end(cpu_base->seq);
>
> We use seqcount, because we are afraid that hrtimer_active() may miss
> timer->state or cpu_base->running, when we are clearing it.

Yes,

> If we use two pairs of write_seqcount_{begin,end} in __run_hrtimer(),
> we may protect only the places where we do that:
>
> cpu_base->running = timer;
> write_seqcount_begin(cpu_base->seq);
> __remove_hrtimer(timer); // clears ENQUEUED
> write_seqcount_end(cpu_base->seq);
>
> ....
>
> timer->state |= HRTIMER_STATE_ENQUEUED;
> write_seqcount_begin(cpu_base->seq);
> base->running = NULL;
> write_seqcount_end(cpu_base->seq);

Afaics, no. Afaics, the following code is correct:

seqcount_t LOCK;
bool X = true, Y = false;

void read(void)
{
bool x, y;

do {
seq = read_seqcount_begin(&LOCK);

x = X; y = Y;

} while (read_seqcount_retry(&LOCK, seq));

BUG_ON(!x && !y);
}

void write(void)
{
Y = true;

write_seqcount_begin(LOCK);
write_seqcount_end(LOCK);

X = false;
}

If we rely on the "locking" semantics of seqcount_t, this doesn't really
differ from

spinlock_t LOCK;
bool X = true, Y = false;

void read(void)
{
bool x, y;

spin_lock(LOCK);
x = X; y = Y;
spin_unlock(LOCK);

BUG_ON(!x && !y);
}

void write(void)
{
Y = true;

spin_lock(LOCK);
spin_unlock(LOCK);

X = false;
}

If "read" takes the lock before "write", it must see X == true.

Otherwise "read" should see all memory changes done before or
inside the "write" critical section, so it must see Y == true.

No?

Oleg.

2015-06-10 22:37:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Tue, Jun 09, 2015 at 11:33:18PM +0200, Oleg Nesterov wrote:

> And. Note that we can rewrite these 2 "write" critical sections in
> __run_hrtimer() and enqueue_hrtimer() as
>
> cpu_base->running = timer;
>
> write_seqcount_begin(cpu_base->seq);
> write_seqcount_end(cpu_base->seq);
>
> __remove_hrtimer(timer);
>
> and
>
> timer->state |= HRTIMER_STATE_ENQUEUED;
>
> write_seqcount_begin(cpu_base->seq);
> write_seqcount_end(cpu_base->seq);
>
> base->running = NULL;
>
> So we can probably use write_seqcount_barrier() except I am not sure
> about the 2nd wmb...

Which second wmb?

In any case, you use that transform from your reply to Kirill, and I
cannot currently see a hole in that. Lets call this transformation A. It
gets us the quoted bit above.

Now the above is:

seq++;
smp_wmb();
smp_wmb();
seq++;

Now, double barriers are pointless, so I think we can all agree that the
above is identical to the below. Lets call this tranformation B.

seq++;
smp_wmb();
seq++;

And then because you use the traditional seqcount read side, which
stalls when seq&1, we can transform the above into this. Transformation
C.

smp_wmb();
seq += 2;

Which is write_seqcount_barrier(), as you say above.

And since there are no odd numbers possible in that scheme, its
identical to my modified read side with the single increment. Transform
D.

The only difference at this point is that I have my seq increment on the
'wrong' side on the first state.

cpu_base->running = timer;

seq++;
smp_wmb();

timer->state = 0;

...

timer->state = 1;

smp_wmb();
seq++;

cpu_base->running = NULL;

Which, per my previous mail provides the following:

[S] seq++
[R] seq
RMB
[R] ->running (== NULL)
[S] ->running = timer;
WMB
[S] ->state = INACTIVE
[R] ->state (== INACTIVE)
RMB
[R] seq (== seq)

Which is where we had to modify the read side to do:

[R] ->state
RMB
[R] ->running

Now, if we use write_seqcount_barrier() that would become:

__run_hrtimer() hrtimer_active()

[S] ->running = timer; [R] seq
WMB RMB
[S] seq += 2; [R] ->running
[S] ->state = 0; [R] ->state
RMB
[R] seq


Which we can reorder like:

[R] seq
RMB
[R] ->running (== NULL)
[S] ->running = timer
WMB
[S] ->state = 0
[R] ->state (== 0)
RMB
[R] seq (== seq)
[S] seq += 2


Which still gives us that false negative and would still require the
read side to be modified to do:

[R] ->state
RMB
[R] ->running

IOW, one of our transforms (A-D) is faulty for it requires a
modification to the read side.

I suspect its T-C, where we loose the odd count that holds up the read
side.

Because the moment we go from:

Y = true;
seq++
WMB
seq++
X = false;

to:

Y = true;
WMB
seq += 2;
X = false;

It becomes possible to re-order like:

Y = true;
WMB
X = false

seq += 2;

And we loose our read order; or rather, where previously we ordered the
read side by seq, the seq increments are no longer ordered.

With this I think we can prove my code correct, however it also suggests
that:

cpu_base->running = timer;
seq++;
smp_wmb();
seq++;
timer->state = 0;

...

timer->state = 1;
seq++;
smp_wmb();
seq++;
cpu_base->running = NULL;

vs
hrtimer_active(timer)
{

do {
base = READ_ONCE(timer->base->cpu_base);
seq = read_seqcount_begin(&cpu_base->seq);

if (timer->state & ENQUEUED ||
base->running == timer)
return true;

} while (read_seqcount_retry(&cpu_base->seq, seq) ||
base != READ_ONCE(timer->base->cpu_base));

return false;
}

Is the all-round cheapest solution. Those extra seq increments are
almost free on all archs as the cacheline will be hot and modified on
the local cpu.

Only under the very rare condition of a concurrent hrtimer_active() call
will that seq line be pulled into shared state.


I shall go sleep now, and update my patch tomorrow, lets see if I will
still agree with myself after a sleep :-)

2015-06-11 07:31:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

On Wed, Jun 10, 2015 at 06:04:44PM +0200, Oleg Nesterov wrote:
> If we rely on the "locking" semantics of seqcount_t, this doesn't really
> differ from
>
> spinlock_t LOCK;
> bool X = true, Y = false;
>
> void read(void)
> {
> bool x, y;
>
> spin_lock(LOCK);
> x = X; y = Y;
> spin_unlock(LOCK);
>
> BUG_ON(!x && !y);
> }
>
> void write(void)
> {
> Y = true;
>
> spin_lock(LOCK);
> spin_unlock(LOCK);
>
> X = false;
> }

So when I first saw that I went, wait what?

Because I looked at it like:

Y = true;
ACQUIRE
RELEASE
X = false;

And we both know that is not ordered at all.

But its the actual lock semantics that make it work, we cannot acquire
until the other completes, at which time the acquire matches its
release and things end up ordered again.

And its exactly that part we lost in our transformation C yesterday.

In any case, breakfast time, will go do the patches shortly.

2015-06-11 16:25:14

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [PATCH 08/14] hrtimer: Allow hrtimer::function() to free the timer

В Ср, 10/06/2015 в 18:04 +0200, Oleg Nesterov пишет:
> Hi Kirill,
>
> On 06/10, Kirill Tkhai wrote:
> >
> > В Вт, 09/06/2015 в 23:33 +0200, Oleg Nesterov пишет:
> > >
> > > hrtimer_active(timer)
> > > {
> > >
> > > do {
> > > base = READ_ONCE(timer->base->cpu_base);
> > > seq = read_seqcount_begin(&cpu_base->seq);
> > >
> > > if (timer->state & ENQUEUED ||
> > > base->running == timer)
> > > return true;
> > >
> > > } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > > base != READ_ONCE(timer->base->cpu_base));
> > >
> > > return false;
> > > }
> > >
> > > And we need to avoid the races with 2 transitions in __run_hrtimer().
> > >
> > > The first race is trivial, we change __run_hrtimer() to do
> > >
> > > write_seqcount_begin(cpu_base->seq);
> > > cpu_base->running = timer;
> > > __remove_hrtimer(timer); // clears ENQUEUED
> > > write_seqcount_end(cpu_base->seq);
> >
> > We use seqcount, because we are afraid that hrtimer_active() may miss
> > timer->state or cpu_base->running, when we are clearing it.
>
> Yes,
>
> > If we use two pairs of write_seqcount_{begin,end} in __run_hrtimer(),
> > we may protect only the places where we do that:
> >
> > cpu_base->running = timer;
> > write_seqcount_begin(cpu_base->seq);
> > __remove_hrtimer(timer); // clears ENQUEUED
> > write_seqcount_end(cpu_base->seq);
> >
> > ....
> >
> > timer->state |= HRTIMER_STATE_ENQUEUED;
> > write_seqcount_begin(cpu_base->seq);
> > base->running = NULL;
> > write_seqcount_end(cpu_base->seq);
>
> Afaics, no. Afaics, the following code is correct:
>
> seqcount_t LOCK;
> bool X = true, Y = false;
>
> void read(void)
> {
> bool x, y;
>
> do {
> seq = read_seqcount_begin(&LOCK);
>
> x = X; y = Y;
>
> } while (read_seqcount_retry(&LOCK, seq));
>
> BUG_ON(!x && !y);
> }
>
> void write(void)
> {
> Y = true;
>
> write_seqcount_begin(LOCK);
> write_seqcount_end(LOCK);
>
> X = false;
> }
>
> If we rely on the "locking" semantics of seqcount_t, this doesn't really
> differ from
>
> spinlock_t LOCK;
> bool X = true, Y = false;
>
> void read(void)
> {
> bool x, y;
>
> spin_lock(LOCK);
> x = X; y = Y;
> spin_unlock(LOCK);
>
> BUG_ON(!x && !y);
> }
>
> void write(void)
> {
> Y = true;
>
> spin_lock(LOCK);
> spin_unlock(LOCK);
>
> X = false;
> }
>
> If "read" takes the lock before "write", it must see X == true.
>
> Otherwise "read" should see all memory changes done before or
> inside the "write" critical section, so it must see Y == true.
>
> No?

I'm agree with you. Thanks for the explanation.