Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.
Solve this by using a scheme similar to regular timers; track the
current running timer in hrtimer_clock_base::running.
Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/hrtimer.h | 35 ++++++++++++++---------------------
kernel/time/hrtimer.c | 48 ++++++++++++++++++++++--------------------------
2 files changed, 36 insertions(+), 47 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,34 +53,25 @@ enum hrtimer_restart {
*
* 0x00 inactive
* 0x01 enqueued into rbtree
- * 0x02 callback function running
- * 0x04 timer is migrated to another cpu
+ * 0x02 timer is migrated to another cpu
*
- * Special cases:
- * 0x03 callback function running and enqueued
- * (was requeued on another CPU)
- * 0x05 timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
*
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in timer->base->running == timer.
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
* queued a signal. Between dropping the lock which protects the posix timer
* and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * signal and rearm the timer.
*
* All state transitions are protected by cpu_base->lock.
*/
#define HRTIMER_STATE_INACTIVE 0x00
#define HRTIMER_STATE_ENQUEUED 0x01
-#define HRTIMER_STATE_CALLBACK 0x02
-#define HRTIMER_STATE_MIGRATE 0x04
+#define HRTIMER_STATE_MIGRATE 0x02
/**
* struct hrtimer - the basic hrtimer structure
@@ -153,6 +144,7 @@ struct hrtimer_clock_base {
struct timerqueue_head active;
ktime_t (*get_time)(void);
ktime_t offset;
+ struct hrtimer *running;
} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
enum hrtimer_base_type {
@@ -402,7 +394,8 @@ extern u64 hrtimer_get_next_event(void);
*/
static inline int hrtimer_active(const struct hrtimer *timer)
{
- return timer->state != HRTIMER_STATE_INACTIVE;
+ return timer->state != HRTIMER_STATE_INACTIVE ||
+ timer->base->running == timer;
}
/*
@@ -419,7 +412,7 @@ static inline int hrtimer_is_queued(stru
*/
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
- return timer->state & HRTIMER_STATE_CALLBACK;
+ return timer->base->running == timer;
}
/* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -111,6 +111,13 @@ static inline int hrtimer_clockid_to_bas
#ifdef CONFIG_SMP
/*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base.
+ */
+static struct hrtimer_clock_base migration_base;
+
+/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
* locked, and the base itself is locked too.
@@ -119,8 +126,8 @@ static inline int hrtimer_clockid_to_bas
* be found on the lists/queues.
*
* When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
*/
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +137,7 @@ struct hrtimer_clock_base *lock_hrtimer_
for (;;) {
base = timer->base;
- if (likely(base != NULL)) {
+ if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
return base;
@@ -194,8 +201,8 @@ switch_hrtimer_base(struct hrtimer *time
if (unlikely(hrtimer_callback_running(timer)))
return base;
- /* See the comment in lock_timer_base() */
- timer->base = NULL;
+ /* See the comment in lock_hrtimer_base() */
+ timer->base = &migration_base;
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
@@ -840,11 +847,7 @@ static int enqueue_hrtimer(struct hrtime
base->cpu_base->active_bases |= 1 << base->index;
- /*
- * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
- * state of a possibly running callback.
- */
- timer->state |= HRTIMER_STATE_ENQUEUED;
+ timer->state = HRTIMER_STATE_ENQUEUED;
return timerqueue_add(&base->active, &timer->node);
}
@@ -894,7 +897,6 @@ static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state;
int reprogram;
/*
@@ -908,13 +910,8 @@ remove_hrtimer(struct hrtimer *timer, st
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- /*
- * We must preserve the CALLBACK state flag here,
- * otherwise we could move the timer base in
- * switch_hrtimer_base.
- */
- state = timer->state & HRTIMER_STATE_CALLBACK;
- __remove_hrtimer(timer, base, state, reprogram);
+
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, reprogram);
return 1;
}
return 0;
@@ -1124,7 +1121,8 @@ static void __run_hrtimer(struct hrtimer
WARN_ON(!irqs_disabled());
debug_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ base->running = timer;
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
@@ -1140,7 +1138,7 @@ static void __run_hrtimer(struct hrtimer
raw_spin_lock(&cpu_base->lock);
/*
- * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+ * Note: We clear the running state after enqueue_hrtimer and
* we do not reprogramm the event hardware. Happens either in
* hrtimer_start_range_ns() or in hrtimer_interrupt()
*
@@ -1152,9 +1150,8 @@ static void __run_hrtimer(struct hrtimer
!(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base);
- WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
-
- timer->state &= ~HRTIMER_STATE_CALLBACK;
+ WARN_ON_ONCE(base->running != timer);
+ base->running = NULL;
}
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
@@ -1523,11 +1520,10 @@ static void migrate_hrtimer_list(struct
* hrtimer_interrupt after we migrated everything to
* sort out already expired timers and reprogram the
* event device.
+ *
+ * Sets timer->state = HRTIMER_STATE_ENQUEUED.
*/
enqueue_hrtimer(timer, new_base);
-
- /* Clear the migration state bit */
- timer->state &= ~HRTIMER_STATE_MIGRATE;
}
}
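With base->running tracking the callback state, a callback that returns
HRTIMER_NORESTART may now free its own timer, because __run_hrtimer() no
longer dereferences the timer after the callback returns. A minimal sketch
of such a callback (hypothetical driver code, not part of the patch):

	static enum hrtimer_restart my_obj_timeout(struct hrtimer *timer)
	{
		struct my_obj *obj = container_of(timer, struct my_obj, timer);

		/* Frees the object, including the embedded hrtimer. */
		kfree(obj);

		/* Must not request a restart; the timer is gone. */
		return HRTIMER_NORESTART;
	}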
[sorry for the broken formatting]
03.06.2015, 16:55, "Peter Zijlstra" <[email protected]>:
> @@ -402,7 +394,8 @@ extern u64 hrtimer_get_next_event(void);
>  */
> static inline int hrtimer_active(const struct hrtimer *timer)
> {
> - return timer->state != HRTIMER_STATE_INACTIVE;
> + return timer->state != HRTIMER_STATE_INACTIVE ||
> + timer->base->running == timer;
> }
It does not seem good, because the hrtimer_active() check stops
being atomic. So things like hrtimer_try_to_cancel() can race
with the callback of a self-rearming timer and may return a false
positive result.
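A hypothetical interleaving of the race being described, using the names
from the patch above (the reader is delayed between its two loads):

	CPU0: hrtimer_active()                  CPU1: __run_hrtimer(), self-rearming callback
	                                        base->running = timer;
	                                        timer->state = HRTIMER_STATE_INACTIVE;
	timer->state != INACTIVE?  -> no
	    ... preempted / delayed ...
	                                        fn(timer);     /* callback re-arms the timer */
	                                        timer->state = HRTIMER_STATE_ENQUEUED;
	                                        base->running = NULL;
	base->running == timer?    -> no

	=> hrtimer_active() returns false, although the timer was never logically inactive.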
On Wed, 3 Jun 2015, Peter Zijlstra wrote:
> /**
> * struct hrtimer - the basic hrtimer structure
> @@ -153,6 +144,7 @@ struct hrtimer_clock_base {
> struct timerqueue_head active;
> ktime_t (*get_time)(void);
> ktime_t offset;
> + struct hrtimer *running;
Aside from lacking a KernelDoc comment, it expands the struct size on
32bit from 32 bytes to 36 bytes, which undoes some of the recent cache
line optimizations I did. Mooo!
So we might think about storing the running timer pointer in cpu_base
instead for 32bit, which increases the footprint of the migration
base and adds the extra cost of the additional indirection, but it would
keep the cache lines tight for the hot paths.
Other than that, this looks pretty good.
Thanks,
tglx
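For reference, a rough 32-bit layout of hrtimer_clock_base before the change
(assumed field list and type sizes, not measured):

	struct hrtimer_cpu_base *cpu_base;          /*  4 */
	int                     index;              /*  4 */
	clockid_t               clockid;            /*  4 */
	struct timerqueue_head  active;             /*  8  (rb_root + next pointer) */
	ktime_t                 (*get_time)(void);  /*  4 */
	ktime_t                 offset;             /*  8 */
	                                            /* 32 bytes total */

Adding the 4-byte running pointer makes it 36 bytes, and since the struct is
aligned to HRTIMER_CLOCK_BASE_ALIGN (32 on 32bit), each clock base in the
per-cpu array presumably pads out to 64 bytes.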
On Wed, Jun 03, 2015 at 07:26:00PM +0300, Kirill Tkhai wrote:
> > @@ -402,7 +394,8 @@ extern u64 hrtimer_get_next_event(void);
> >  */
> >  static inline int hrtimer_active(const struct hrtimer *timer)
> >  {
> > - return timer->state != HRTIMER_STATE_INACTIVE;
> > + return timer->state != HRTIMER_STATE_INACTIVE ||
> > + timer->base->running == timer;
> >  }
>
> It does not seem good, because the hrtimer_active() check stops
> being atomic. So things like hrtimer_try_to_cancel() can race
> with the callback of a self-rearming timer and may return a false
> positive result.
Hurm.. the below isn't really pretty either I suppose. The best I can
say about it is that it's not too expensive on x86.
I should probably go sleep..
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
* A timer is active, when it is enqueued into the rbtree or the
* callback function is running or it's in the state of being migrated
* to another cpu.
+ *
+ * See __run_hrtimer().
*/
-static inline int hrtimer_active(const struct hrtimer *timer)
+static inline bool hrtimer_active(const struct hrtimer *timer)
{
- return timer->state != HRTIMER_STATE_INACTIVE ||
- timer->base->running == timer;
+ if (timer->state != HRTIMER_STATE_INACTIVE)
+ return true;
+
+ smp_rmb(); /* C matches A */
+
+ if (timer->base->running == timer)
+ return true;
+
+ smp_rmb(); /* D matches B */
+
+ if (timer->state != HRTIMER_STATE_INACTIVE)
+ return true;
+
+ return false;
}
/*
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1122,6 +1122,20 @@ static void __run_hrtimer(struct hrtimer
debug_deactivate(timer);
base->running = timer;
+
+ /*
+ * Pairs with hrtimer_active().
+ *
+ * [S] base->running = timer [L] timer->state
+ * WMB RMB
+ * [S] timer->state = INACTIVE [L] base->running
+ *
+ * BUG_ON(base->running != timer && timer->state != INACTIVE)
+ *
+ * If we observe INACTIVE we must observe base->running == timer.
+ */
+ smp_wmb(); /* A matches C */
+
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
@@ -1150,6 +1164,20 @@ static void __run_hrtimer(struct hrtimer
!(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base);
+ /*
+ * Pairs with hrtimer_active().
+ *
+ * [S] timer->state = ENQUEUED [L] base->running
+ * WMB RMB
+ * [S] base->running = NULL [L] timer->state
+ *
+ * BUG_ON(base->running == NULL && timer->state == INACTIVE)
+ *
+ * If we observe base->running == NULL, we must observe any preceding
+ * enqueue.
+ */
+ smp_wmb(); /* B matches D */
+
WARN_ON_ONCE(base->running != timer);
base->running = NULL;
}
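For reference, a consolidated view of the pairings added above, taken from
the comments in the patch (the callback and a possible re-enqueue happen
between the two store pairs):

	__run_hrtimer()                          hrtimer_active()
	[S] base->running = timer                [L] timer->state      /* INACTIVE?  */
	    smp_wmb();   /* A */                     smp_rmb();  /* C */
	[S] timer->state = INACTIVE              [L] base->running     /* == timer?  */
	    ... fn(timer), may re-enqueue ...        smp_rmb();  /* D */
	[S] timer->state = ENQUEUED              [L] timer->state      /* re-check   */
	    smp_wmb();   /* B */
	[S] base->running = NULL

The intent: a reader that observes the INACTIVE store from the first half
must, after C, also observe base->running == timer (pairing A); a reader that
observes base->running == NULL from the second half must, after D, also
observe the ENQUEUED store of a re-armed timer (pairing B).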
On Wed, Jun 03, 2015 at 07:41:43PM +0200, Thomas Gleixner wrote:
> On Wed, 3 Jun 2015, Peter Zijlstra wrote:
> > /**
> > * struct hrtimer - the basic hrtimer structure
> > @@ -153,6 +144,7 @@ struct hrtimer_clock_base {
> > struct timerqueue_head active;
> > ktime_t (*get_time)(void);
> > ktime_t offset;
> > + struct hrtimer *running;
>
> Aside from lacking a KernelDoc comment, it expands the struct size on
> 32bit from 32 bytes to 36 bytes, which undoes some of the recent cache
> line optimizations I did. Mooo!
>
> So we might think about storing the running timer pointer in cpu_base
> instead for 32bit, which increases the footprint of the migration
> base and adds the extra cost of the additional indirection, but it would
> keep the cache lines tight for the hot paths.
A wee something like this then?
---
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -123,8 +123,10 @@ struct hrtimer_sleeper {
#ifdef CONFIG_64BIT
# define HRTIMER_CLOCK_BASE_ALIGN 64
+# define __timer_base_running(timer) timer->base->running
#else
# define HRTIMER_CLOCK_BASE_ALIGN 32
+# define __timer_base_running(timer) timer->base->cpu_base->running
#endif
/**
@@ -136,6 +138,7 @@ struct hrtimer_sleeper {
* @active: red black tree root node for the active timers
* @get_time: function to retrieve the current time of the clock
* @offset: offset of this clock to the monotonic base
+ * @running: pointer to the currently running hrtimer
*/
struct hrtimer_clock_base {
struct hrtimer_cpu_base *cpu_base;
@@ -144,7 +147,9 @@ struct hrtimer_clock_base {
struct timerqueue_head active;
ktime_t (*get_time)(void);
ktime_t offset;
+#ifdef CONFIG_64BIT
struct hrtimer *running;
+#endif
} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
enum hrtimer_base_type {
@@ -162,6 +167,7 @@ enum hrtimer_base_type {
* @cpu: cpu number
* @active_bases: Bitfield to mark bases with active timers
* @clock_was_set_seq: Sequence counter of clock was set events
+ * @running: pointer to the currently running hrtimer
* @expires_next: absolute time of the next event which was scheduled
* via clock_set_next_event()
* @next_timer: Pointer to the first expiring timer
@@ -183,6 +189,9 @@ struct hrtimer_cpu_base {
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
+#ifndef CONFIG_64BIT
+ struct hrtimer *running;
+#endif
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int in_hrtirq : 1,
hres_active : 1,
@@ -401,7 +410,7 @@ static inline bool hrtimer_active(const
smp_rmb(); /* C matches A */
- if (timer->base->running == timer)
+ if (__timer_base_running(timer) == timer)
return true;
smp_rmb(); /* D matches B */
@@ -426,7 +435,7 @@ static inline int hrtimer_is_queued(stru
*/
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
- return timer->base->running == timer;
+ return __timer_base_running(timer) == timer;
}
/* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -110,12 +110,20 @@ static inline int hrtimer_clockid_to_bas
*/
#ifdef CONFIG_SMP
+#ifndef CONFIG_64BIT
+static struct hrtimer_cpu_base migration_cpu_base;
+
+#define MIGRATION_BASE_INIT { .cpu_base = &migration_cpu_base, }
+#else
+#define MIGRATION_BASE_INIT {}
+#endif
+
/*
* We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
* such that hrtimer_callback_running() can unconditionally dereference
* timer->base.
*/
-static struct hrtimer_clock_base migration_base;
+static struct hrtimer_clock_base migration_base = MIGRATION_BASE_INIT;
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
@@ -1121,7 +1129,7 @@ static void __run_hrtimer(struct hrtimer
WARN_ON(!irqs_disabled());
debug_deactivate(timer);
- base->running = timer;
+ __timer_base_running(timer) = timer;
/*
* Pairs with hrtimer_active().
@@ -1178,8 +1186,8 @@ static void __run_hrtimer(struct hrtimer
*/
smp_wmb(); /* B matches D */
- WARN_ON_ONCE(base->running != timer);
- base->running = NULL;
+ WARN_ON_ONCE(__timer_base_running(timer) != timer);
+ __timer_base_running(timer) = NULL;
}
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
* Peter Zijlstra <[email protected]> wrote:
> On Wed, Jun 03, 2015 at 07:41:43PM +0200, Thomas Gleixner wrote:
> > On Wed, 3 Jun 2015, Peter Zijlstra wrote:
> > > /**
> > > * struct hrtimer - the basic hrtimer structure
> > > @@ -153,6 +144,7 @@ struct hrtimer_clock_base {
> > > struct timerqueue_head active;
> > > ktime_t (*get_time)(void);
> > > ktime_t offset;
> > > + struct hrtimer *running;
> >
> > Aside from lacking a KernelDoc comment, it expands the struct size on
> > 32bit from 32 bytes to 36 bytes, which undoes some of the recent cache
> > line optimizations I did. Mooo!
> >
> > So we might think about storing the running timer pointer in cpu_base
> > instead for 32bit, which increases the footprint of the migration
> > base and adds the extra cost of the additional indirection, but it would
> > keep the cache lines tight for the hot paths.
>
> A wee something like this then?
>
> ---
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -123,8 +123,10 @@ struct hrtimer_sleeper {
>
> #ifdef CONFIG_64BIT
> # define HRTIMER_CLOCK_BASE_ALIGN 64
> +# define __timer_base_running(timer) timer->base->running
> #else
> # define HRTIMER_CLOCK_BASE_ALIGN 32
> +# define __timer_base_running(timer) timer->base->cpu_base->running
> #endif
Please put it into the cpu_base on 64-bit as well: the base pointer is available
already on 64-bit so there should be no measurable performance difference, and
readability is a primary concern with all this code.
Thanks,
Ingo
On Wed, 03/06/2015 at 23:13 +0200, Peter Zijlstra wrote:
> On Wed, Jun 03, 2015 at 07:26:00PM +0300, Kirill Tkhai wrote:
> > > @@ -402,7 +394,8 @@ extern u64 hrtimer_get_next_event(void);
> > > */
> > > static inline int hrtimer_active(const struct hrtimer *timer)
> > > {
> > > - return timer->state != HRTIMER_STATE_INACTIVE;
> > > + return timer->state != HRTIMER_STATE_INACTIVE ||
> > > + timer->base->running == timer;
> > > }
> >
> > It does not seem good, because the hrtimer_active() check stops
> > being atomic. So things like hrtimer_try_to_cancel() can race
> > with the callback of a self-rearming timer and may return a false
> > positive result.
>
> Hurm.. the below isn't really pretty either I suppose. The best I can
> say about it is that it's not too expensive on x86.
>
> I should probably go sleep..
>
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
> * A timer is active, when it is enqueued into the rbtree or the
> * callback function is running or it's in the state of being migrated
> * to another cpu.
> + *
> + * See __run_hrtimer().
> */
> -static inline int hrtimer_active(const struct hrtimer *timer)
> +static inline bool hrtimer_active(const struct hrtimer *timer)
> {
> - return timer->state != HRTIMER_STATE_INACTIVE ||
> - timer->base->running == timer;
> + if (timer->state != HRTIMER_STATE_INACTIVE)
> + return true;
> +
> + smp_rmb(); /* C matches A */
> +
> + if (timer->base->running == timer)
> + return true;
> +
> + smp_rmb(); /* D matches B */
> +
> + if (timer->state != HRTIMER_STATE_INACTIVE)
> + return true;
> +
> + return false;
This races with two sequential timer handlers. hrtimer_active()
is preemptible everywhere, and there are no guarantees that all three "if"
conditions check the same timer tick.
How about transforming hrtimer_bases.lock: raw_spinlock_t --> seqlock_t?
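A hypothetical schedule of the race described above, with two expiries of the
same self-rearming timer and the reader preempted between its checks:

	hrtimer_active()                        CPU expiring the timer
	                                        1st expiry: running = timer; state = INACTIVE;
	timer->state != INACTIVE?   -> no
	    ... preempted ...
	                                        callback re-arms: state = ENQUEUED;
	                                        running = NULL;
	base->running == timer?     -> no
	    ... preempted ...
	                                        2nd expiry: running = timer; state = INACTIVE;
	timer->state != INACTIVE?   -> no

	=> all three checks miss, although the timer was enqueued or running the whole time.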
On Thu, Jun 04, 2015 at 07:59:30AM +0200, Ingo Molnar wrote:
> > --- a/include/linux/hrtimer.h
> > +++ b/include/linux/hrtimer.h
> > @@ -123,8 +123,10 @@ struct hrtimer_sleeper {
> >
> > #ifdef CONFIG_64BIT
> > # define HRTIMER_CLOCK_BASE_ALIGN 64
> > +# define __timer_base_running(timer) timer->base->running
> > #else
> > # define HRTIMER_CLOCK_BASE_ALIGN 32
> > +# define __timer_base_running(timer) timer->base->cpu_base->running
> > #endif
>
> Please put it into the cpu_base on 64-bit as well: the base pointer is available
> already on 64-bit so there should be no measurable performance difference, and
> readability is a primary concern with all this code.
That's an extra pointer chase for no reason :-(
On Thu, Jun 04, 2015 at 12:07:03PM +0300, Kirill Tkhai wrote:
> > --- a/include/linux/hrtimer.h
> > +++ b/include/linux/hrtimer.h
> > @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
> > * A timer is active, when it is enqueued into the rbtree or the
> > * callback function is running or it's in the state of being migrated
> > * to another cpu.
> > + *
> > + * See __run_hrtimer().
> > */
> > -static inline int hrtimer_active(const struct hrtimer *timer)
> > +static inline bool hrtimer_active(const struct hrtimer *timer)
> > {
> > - return timer->state != HRTIMER_STATE_INACTIVE ||
> > - timer->base->running == timer;
> > + if (timer->state != HRTIMER_STATE_INACTIVE)
> > + return true;
> > +
> > + smp_rmb(); /* C matches A */
> > +
> > + if (timer->base->running == timer)
> > + return true;
> > +
> > + smp_rmb(); /* D matches B */
> > +
> > + if (timer->state != HRTIMER_STATE_INACTIVE)
> > + return true;
> > +
> > + return false;
>
> This races with two sequential timer handlers. hrtimer_active()
> is preemptible everywhere, and there are no guarantees that all three "if"
> conditions check the same timer tick.
Indeed.
> How about transforming hrtimer_bases.lock: raw_spinlock_t --> seqlock_t?
Ingo will like that because it means we already need to touch cpu_base.
But I think there's a problem there on timer migration: the timer can
migrate between bases while we do the seq read loop, and then you can get
false positives on the different seqcount numbers.
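A hypothetical sequence of that failure mode, assuming a plain per-cpu_base
seqcount read loop with no further checks:

	hrtimer_active()                              elsewhere
	cpu_base_A = timer->base->cpu_base;
	seq = raw_read_seqcount(&cpu_base_A->seq);
	timer->state == INACTIVE,
	cpu_base_A->running != timer                  timer migrates; it is now enqueued or
	                                              running under cpu_base_B, whose seqcount
	                                              is the one being bumped
	read_seqcount_retry(&cpu_base_A->seq, seq)    -> no retry, A's count never moved

	=> returns inactive although the timer is active under cpu_base_B.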
We could of course do something like the below, but hrtimer_active()
is turning into quite the monster.
Needs more comments at the very least, it's full of trickery.
---
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -59,7 +59,9 @@ enum hrtimer_restart {
* mean touching the timer after the callback, this makes it impossible to free
* the timer from the callback function.
*
- * Therefore we track the callback state in timer->base->running == timer.
+ * Therefore we track the callback state in:
+ *
+ * timer->base->cpu_base->running == timer
*
* On SMP it is possible to have a "callback function running and enqueued"
* status. It happens for example when a posix timer expired and the callback
@@ -144,7 +146,6 @@ struct hrtimer_clock_base {
struct timerqueue_head active;
ktime_t (*get_time)(void);
ktime_t offset;
- struct hrtimer *running;
} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
enum hrtimer_base_type {
@@ -159,6 +160,8 @@ enum hrtimer_base_type {
* struct hrtimer_cpu_base - the per cpu clock bases
* @lock: lock protecting the base and associated clock bases
* and timers
+ * @seq: seqcount around __run_hrtimer
+ * @running: pointer to the currently running hrtimer
* @cpu: cpu number
* @active_bases: Bitfield to mark bases with active timers
* @clock_was_set_seq: Sequence counter of clock was set events
@@ -180,6 +183,8 @@ enum hrtimer_base_type {
*/
struct hrtimer_cpu_base {
raw_spinlock_t lock;
+ seqcount_t seq;
+ struct hrtimer *running;
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
@@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
*/
static inline int hrtimer_active(const struct hrtimer *timer)
{
- return timer->state != HRTIMER_STATE_INACTIVE ||
- timer->base->running == timer;
+ struct hrtimer_cpu_base *cpu_base;
+ unsigned int seq;
+ bool active;
+
+ do {
+ active = false;
+ cpu_base = READ_ONCE(timer->base->cpu_base);
+ seqcount_lockdep_reader_access(&cpu_base->seq);
+ seq = raw_read_seqcount(&cpu_base->seq);
+
+ if (timer->state != HRTIMER_STATE_INACTIVE ||
+ cpu_base->running == timer)
+ active = true;
+
+ } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+ cpu_base != READ_ONCE(timer->base->cpu_base));
+
+ return active;
}
/*
@@ -412,7 +433,7 @@ static inline int hrtimer_is_queued(stru
*/
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
- return timer->base->running == timer;
+ return timer->base->cpu_base->running == timer;
}
/* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+ .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
@@ -113,9 +114,15 @@ static inline int hrtimer_clockid_to_bas
/*
* We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
* such that hrtimer_callback_running() can unconditionally dereference
- * timer->base.
+ * timer->base->cpu_base
*/
-static struct hrtimer_clock_base migration_base;
+static struct hrtimer_cpu_base migration_cpu_base = {
+ .seq = SEQCNT_ZERO(migration_cpu_base),
+};
+
+static struct hrtimer_clock_base migration_base = {
+ .cpu_base = &migration_cpu_base,
+};
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
@@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
- base->running = timer;
+ cpu_base->running = timer;
+
+ /*
+ * separate the ->running assignment from the ->state assignment
+ */
+ write_seqcount_begin(&cpu_base->seq);
+
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
@@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
!(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base);
- WARN_ON_ONCE(base->running != timer);
- base->running = NULL;
+ /*
+ * separate the ->running assignment from the ->state assignment
+ */
+ write_seqcount_end(&cpu_base->seq);
+
+ WARN_ON_ONCE(cpu_base->running != timer);
+ cpu_base->running = NULL;
}
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
On Thu, Jun 04, 2015 at 12:49:02PM +0200, Peter Zijlstra wrote:
> Needs more comments at the very least, it's full of trickery.
>
> @@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
> enum hrtimer_restart (*fn)(struct hrtimer *);
> int restart;
>
> - WARN_ON(!irqs_disabled());
> + lockdep_assert_held(&cpu_base->lock);
>
> debug_deactivate(timer);
> - base->running = timer;
> + cpu_base->running = timer;
> +
> + /*
> + * separate the ->running assignment from the ->state assignment
> + */
> + write_seqcount_begin(&cpu_base->seq);
Maybe these need to be raw_write_seqcount_latch()..
> +
> __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
> timer_stats_account_hrtimer(timer);
> fn = timer->function;
> @@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
> !(timer->state & HRTIMER_STATE_ENQUEUED))
> enqueue_hrtimer(timer, base);
>
> - WARN_ON_ONCE(base->running != timer);
> - base->running = NULL;
> + /*
> + * separate the ->running assignment from the ->state assignment
> + */
> + write_seqcount_end(&cpu_base->seq);
> +
> + WARN_ON_ONCE(cpu_base->running != timer);
> + cpu_base->running = NULL;
> }
>
> static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
On Thu, Jun 04, 2015 at 12:55:37PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 04, 2015 at 12:49:02PM +0200, Peter Zijlstra wrote:
> > Needs more comments at the very least, it's full of trickery.
> >
> > @@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
> > enum hrtimer_restart (*fn)(struct hrtimer *);
> > int restart;
> >
> > - WARN_ON(!irqs_disabled());
> > + lockdep_assert_held(&cpu_base->lock);
> >
> > debug_deactivate(timer);
> > - base->running = timer;
> > + cpu_base->running = timer;
> > +
> > + /*
> > + * separate the ->running assignment from the ->state assignment
> > + */
> > + write_seqcount_begin(&cpu_base->seq);
>
> Maybe these need to be raw_write_seqcount_latch()..
I'm properly confusing myself, so let me write this out in a little more detail;
I didn't think it needed the double wmb because:
[S] running = timer;
[S] seq++;
WMB
[S] state = INACTIVE
I don't think it matters if we re-order the first two stores, since at
that point ->state is still ENQUEUED and we're good.
Similarly for the case below: you can flip the seq increment and the
->running clear, but at that time ->state should already be ENQUEUED
again (if indeed the timer got re-armed), otherwise it's not active
anymore.
> > __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
> > timer_stats_account_hrtimer(timer);
> > fn = timer->function;
> > @@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
> > !(timer->state & HRTIMER_STATE_ENQUEUED))
> > enqueue_hrtimer(timer, base);
> >
> > - WARN_ON_ONCE(base->running != timer);
> > - base->running = NULL;
> > + /*
> > + * separate the ->running assignment from the ->state assignment
> > + */
> > + write_seqcount_end(&cpu_base->seq);
> > +
> > + WARN_ON_ONCE(cpu_base->running != timer);
> > + cpu_base->running = NULL;
> > }
> >
> > static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
* Peter Zijlstra <[email protected]> wrote:
> On Thu, Jun 04, 2015 at 07:59:30AM +0200, Ingo Molnar wrote:
>
> > > --- a/include/linux/hrtimer.h
> > > +++ b/include/linux/hrtimer.h
> > > @@ -123,8 +123,10 @@ struct hrtimer_sleeper {
> > >
> > > #ifdef CONFIG_64BIT
> > > # define HRTIMER_CLOCK_BASE_ALIGN 64
> > > +# define __timer_base_running(timer) timer->base->running
> > > #else
> > > # define HRTIMER_CLOCK_BASE_ALIGN 32
> > > +# define __timer_base_running(timer) timer->base->cpu_base->running
> > > #endif
> >
> > Please put it into the cpu_base on 64-bit as well: the base pointer is available
> > already on 64-bit so there should be no measurable performance difference, and
> > readability is a primary concern with all this code.
>
> That's an extra pointer chase for no reason :-(
Only if we otherwise don't dereference cpu_base - is that the case in the relevant
code paths?
If we already dereference cpu_base (say for the lock) and have its value loaded,
then it's totally equivalent to chasing it down via base->.
Thanks,
Ingo
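For what it's worth, a sketch of the two access chains being weighed here,
with the field names as used in the patches above:

	/* ->running in the clock base (the 64-bit variant): */
	active = (timer->base->running == timer);            /* two dependent loads */

	/* ->running in the cpu base: */
	active = (timer->base->cpu_base->running == timer);  /* three dependent loads,
	                                                      * unless cpu_base is already
	                                                      * loaded, e.g. for the lock */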
On Thu, 04/06/2015 at 12:49 +0200, Peter Zijlstra wrote:
> On Thu, Jun 04, 2015 at 12:07:03PM +0300, Kirill Tkhai wrote:
> > > --- a/include/linux/hrtimer.h
> > > +++ b/include/linux/hrtimer.h
> > > @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
> > > * A timer is active, when it is enqueued into the rbtree or the
> > > * callback function is running or it's in the state of being migrated
> > > * to another cpu.
> > > + *
> > > + * See __run_hrtimer().
> > > */
> > > -static inline int hrtimer_active(const struct hrtimer *timer)
> > > +static inline bool hrtimer_active(const struct hrtimer *timer)
> > > {
> > > - return timer->state != HRTIMER_STATE_INACTIVE ||
> > > - timer->base->running == timer;
> > > + if (timer->state != HRTIMER_STATE_INACTIVE)
> > > + return true;
> > > +
> > > + smp_rmb(); /* C matches A */
> > > +
> > > + if (timer->base->running == timer)
> > > + return true;
> > > +
> > > + smp_rmb(); /* D matches B */
> > > +
> > > + if (timer->state != HRTIMER_STATE_INACTIVE)
> > > + return true;
> > > +
> > > + return false;
> >
> > This races with two sequential timer handlers. hrtimer_active()
> > is preemptible everywhere, and no guarantees that all three "if"
> > conditions check the same timer tick.
>
> Indeed.
>
> > How about transformation of hrtimer_bases.lock: raw_spinlock_t --> seqlock_t?
>
> Ingo will like that because it means we already need to touch cpu_base.
>
> But I think there's a problem there on timer migration: the timer can
> migrate between bases while we do the seq read loop, and then you can get
> false positives on the different seqcount numbers.
>
> We could of course do something like the below, but hrtimer_active()
> is turning into quite the monster.
>
> Needs more comments at the very least, it's full of trickery.
Yeah, it's safe for now, but it may cause difficulties for maintenance
in the future, because the barrier logic is not easy to review. But it seems
we may simplify it a little bit. Please see the comments below.
> ---
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -59,7 +59,9 @@ enum hrtimer_restart {
> * mean touching the timer after the callback, this makes it impossible to free
> * the timer from the callback function.
> *
> - * Therefore we track the callback state in timer->base->running == timer.
> + * Therefore we track the callback state in:
> + *
> + * timer->base->cpu_base->running == timer
> *
> * On SMP it is possible to have a "callback function running and enqueued"
> * status. It happens for example when a posix timer expired and the callback
> @@ -144,7 +146,6 @@ struct hrtimer_clock_base {
> struct timerqueue_head active;
> ktime_t (*get_time)(void);
> ktime_t offset;
> - struct hrtimer *running;
> } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
>
> enum hrtimer_base_type {
> @@ -159,6 +160,8 @@ enum hrtimer_base_type {
> * struct hrtimer_cpu_base - the per cpu clock bases
> * @lock: lock protecting the base and associated clock bases
> * and timers
> + * @seq: seqcount around __run_hrtimer
> + * @running: pointer to the currently running hrtimer
> * @cpu: cpu number
> * @active_bases: Bitfield to mark bases with active timers
> * @clock_was_set_seq: Sequence counter of clock was set events
> @@ -180,6 +183,8 @@ enum hrtimer_base_type {
> */
> struct hrtimer_cpu_base {
> raw_spinlock_t lock;
> + seqcount_t seq;
> + struct hrtimer *running;
> unsigned int cpu;
> unsigned int active_bases;
> unsigned int clock_was_set_seq;
> @@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
> */
> static inline int hrtimer_active(const struct hrtimer *timer)
> {
> - return timer->state != HRTIMER_STATE_INACTIVE ||
> - timer->base->running == timer;
> + struct hrtimer_cpu_base *cpu_base;
> + unsigned int seq;
> + bool active;
> +
> + do {
> + active = false;
> + cpu_base = READ_ONCE(timer->base->cpu_base);
> + seqcount_lockdep_reader_access(&cpu_base->seq);
> + seq = raw_read_seqcount(&cpu_base->seq);
> +
> + if (timer->state != HRTIMER_STATE_INACTIVE ||
> + cpu_base->running == timer)
> + active = true;
> +
> + } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> + cpu_base != READ_ONCE(timer->base->cpu_base));
> +
> + return active;
> }
This may race with migrate_hrtimer_list(), so it needs the write seqcount
too.
>
> /*
> @@ -412,7 +433,7 @@ static inline int hrtimer_is_queued(stru
> */
> static inline int hrtimer_callback_running(struct hrtimer *timer)
> {
> - return timer->base->running == timer;
> + return timer->base->cpu_base->running == timer;
> }
>
> /* Forward a hrtimer so it expires after now: */
> --- a/kernel/time/hrtimer.c
> +++ b/kernel/time/hrtimer.c
> @@ -67,6 +67,7 @@
> DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
> {
> .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
> + .seq = SEQCNT_ZERO(hrtimer_bases.seq),
> .clock_base =
> {
> {
> @@ -113,9 +114,15 @@ static inline int hrtimer_clockid_to_bas
> /*
> * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
> * such that hrtimer_callback_running() can unconditionally dereference
> - * timer->base.
> + * timer->base->cpu_base
> */
> -static struct hrtimer_clock_base migration_base;
> +static struct hrtimer_cpu_base migration_cpu_base = {
> + .seq = SEQCNT_ZERO(migration_cpu_base),
> +};
> +
> > +static struct hrtimer_clock_base migration_base = {
> + .cpu_base = &migration_cpu_base,
> +};
>
> /*
> * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
> @@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
> enum hrtimer_restart (*fn)(struct hrtimer *);
> int restart;
>
> - WARN_ON(!irqs_disabled());
> + lockdep_assert_held(&cpu_base->lock);
>
> debug_deactivate(timer);
> - base->running = timer;
> + cpu_base->running = timer;
My suggestion is not to use seqcounts around long parts of code, but to implement
short primitives for changing the timer state and the cpu_base running timer. Something
like this:
static inline void hrtimer_set_state(struct hrtimer *timer, unsigned long state)
{
struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
lockdep_assert_held(&cpu_base->lock);
write_seqcount_begin(&cpu_base->seq);
timer->state = state;
write_seqcount_end(&cpu_base->seq);
}
static inline void cpu_base_set_running(struct hrtimer_cpu_base *cpu_base,
struct hrtimer *timer)
{
lockdep_assert_held(&cpu_base->lock);
write_seqcount_begin(&cpu_base->seq);
cpu_base->running = timer;
write_seqcount_end(&cpu_base->seq);
}
With this implemented, we have to think less about the right barrier order, because
all changes are made under the seqcount.
static inline int hrtimer_active(const struct hrtimer *timer)
{
struct hrtimer_cpu_base *cpu_base;
struct hrtimer_clock_base *base;
unsigned int seq;
bool active = false;
do {
base = READ_ONCE(timer->base);
if (base == &migration_base) {
active = true;
break;
}
cpu_base = base->cpu_base;
seqcount_lockdep_reader_access(&cpu_base->seq);
seq = raw_read_seqcount(&cpu_base->seq);
if (timer->state != HRTIMER_STATE_INACTIVE ||
cpu_base->running == timer) {
active = true;
break;
}
} while (read_seqcount_retry(&cpu_base->seq, seq) ||
READ_ONCE(timer->base) != base);
return active;
}
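A sketch of how the __run_hrtimer() hunks quoted below might then look with
those primitives (my rearrangement; it assumes __remove_hrtimer() and
enqueue_hrtimer() update timer->state through hrtimer_set_state()):

	cpu_base_set_running(cpu_base, timer);          /* short seq-protected store */
	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
	...
	restart = fn(timer);
	...
	if (restart != HRTIMER_NORESTART &&
	    !(timer->state & HRTIMER_STATE_ENQUEUED))
		enqueue_hrtimer(timer, base);

	WARN_ON_ONCE(cpu_base->running != timer);
	cpu_base_set_running(cpu_base, NULL);           /* short seq-protected store */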
> +
> + /*
> + * separate the ->running assignment from the ->state assignment
> + */
> + write_seqcount_begin(&cpu_base->seq);
> +
> __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
> timer_stats_account_hrtimer(timer);
> fn = timer->function;
> @@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
> !(timer->state & HRTIMER_STATE_ENQUEUED))
> enqueue_hrtimer(timer, base);
>
> - WARN_ON_ONCE(base->running != timer);
> - base->running = NULL;
> + /*
> + * separate the ->running assignment from the ->state assignment
> + */
> + write_seqcount_end(&cpu_base->seq);
> +
> + WARN_ON_ONCE(cpu_base->running != timer);
> + cpu_base->running = NULL;
> }
>
> static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
>
This message is too late, /me going to see new series :)
05.06.2015, 12:02, "Kirill Tkhai" <[email protected]>:
On Fri, Jun 05, 2015 at 12:02:01PM +0300, Kirill Tkhai wrote:
> Yeah, it's safe for now, but it may cause difficulties for maintenance
> in the future, because the barrier logic is not easy to review. But it seems
> we may simplify it a little bit. Please see the comments below.
> > @@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
> > */
> > static inline int hrtimer_active(const struct hrtimer *timer)
> > {
> > + struct hrtimer_cpu_base *cpu_base;
> > + unsigned int seq;
> > + bool active;
> > +
> > + do {
> > + active = false;
> > + cpu_base = READ_ONCE(timer->base->cpu_base);
> > + seqcount_lockdep_reader_access(&cpu_base->seq);
> > + seq = raw_read_seqcount(&cpu_base->seq);
> > +
> > + if (timer->state != HRTIMER_STATE_INACTIVE ||
> > + cpu_base->running == timer)
> > + active = true;
> > +
> > + } while (read_seqcount_retry(&cpu_base->seq, seq) ||
> > + cpu_base != READ_ONCE(timer->base->cpu_base));
> > +
> > + return active;
> > }
>
> This may race with migrate_hrtimer_list(), so it needs the write seqcount
> too.
Let me go stare at that.
> My suggestion is not to use seqcounts around long parts of code, but to implement
> short primitives for changing the timer state and the cpu_base running timer. Something
> like this:
>
> static inline void hrtimer_set_state(struct hrtimer *timer, unsigned long state)
> {
> struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
>
> lockdep_assert_held(&cpu_base->lock);
>
> write_seqcount_begin(&cpu_base->seq);
> timer->state = state;
> write_seqcount_end(&cpu_base->seq);
> }
>
> static inline void cpu_base_set_running(struct hrtimer_cpu_base *cpu_base,
> struct hrtimer *timer)
> {
> lockdep_assert_held(&cpu_base->lock);
>
> write_seqcount_begin(&cpu_base->seq);
> cpu_base->running = timer;
> write_seqcount_end(&cpu_base->seq);
> }
>
> With this implemented, we have to think less about the right barrier order, because
> all changes are made under the seqcount.
The problem with that is that it generates far more memory barriers;
while on x86 these are 'free', this is very much not so for other archs
like ARM / PPC etc.
On Fri, Jun 05, 2015 at 12:03:34PM +0300, Kirill Tkhai wrote:
> This message is too late, /me going to see new series :)
Never too late, and thanks for looking at them. The new patch is
basically the same as the last proposal (except build fixes and a
comment).
05.06.2015, 12:10, "Peter Zijlstra" <[email protected]>:
> The problem with that is that it generates far more memory barriers;
> while on x86 these are 'free', this is very much not so for other archs
> like ARM / PPC etc.
Ok, thanks.
One more way is to take the write seqcount every time we take the base spin_lock;
that way we could group several smp_wmb()s into a single one. But this increases the
write-seqcount-protected code areas and worsens the speed of hrtimer_active(), so it's
not good either.
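A rough sketch of what that would look like (hypothetical helpers, not
proposed in the thread):

	static inline void hrtimer_cpu_base_lock(struct hrtimer_cpu_base *cpu_base)
	{
		raw_spin_lock(&cpu_base->lock);
		write_seqcount_begin(&cpu_base->seq);
	}

	static inline void hrtimer_cpu_base_unlock(struct hrtimer_cpu_base *cpu_base)
	{
		write_seqcount_end(&cpu_base->seq);
		raw_spin_unlock(&cpu_base->lock);
	}

Every lock/unlock of the base then counts as one write section, which is why
the read side of hrtimer_active() would retry more often.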