From: Kirill Tkhai
To: Peter Zijlstra
Cc: umgwanakikbuti@gmail.com, mingo@elte.hu, ktkhai@parallels.com,
    rostedt@goodmis.org, tglx@linutronix.de, juri.lelli@gmail.com,
    pang.xunlei@linaro.org, oleg@redhat.com, wanpeng.li@linux.intel.com,
    linux-kernel@vger.kernel.org
Subject: Re: [PATCH 8/9] hrtimer: Allow hrtimer::function() to free the timer
Date: Fri, 05 Jun 2015 12:02:01 +0300
Message-Id: <5883311433494921@web27h.yandex.ru>
In-Reply-To: <20150604104902.GH3644@twins.programming.kicks-ass.net>
References: <20150603132903.203333087@infradead.org>
 <20150603134023.156059118@infradead.org>
 <214021433348760@web25g.yandex.ru>
 <20150603211324.GC3644@twins.programming.kicks-ass.net>
 <2134411433408823@web8j.yandex.ru>
 <20150604104902.GH3644@twins.programming.kicks-ass.net>

On Thu, 04/06/2015 at 12:49 +0200, Peter Zijlstra wrote:
> On Thu, Jun 04, 2015 at 12:07:03PM +0300, Kirill Tkhai wrote:
> > > --- a/include/linux/hrtimer.h
> > > +++ b/include/linux/hrtimer.h
> > > @@ -391,11 +391,25 @@ extern u64 hrtimer_get_next_event(void);
> > >   * A timer is active, when it is enqueued into the rbtree or the
> > >   * callback function is running or it's in the state of being migrated
> > >   * to another cpu.
> > > + *
> > > + * See __run_hrtimer().
> > >   */
> > > -static inline int hrtimer_active(const struct hrtimer *timer)
> > > +static inline bool hrtimer_active(const struct hrtimer *timer)
> > >  {
> > > -	return timer->state != HRTIMER_STATE_INACTIVE ||
> > > -		timer->base->running == timer;
> > > +	if (timer->state != HRTIMER_STATE_INACTIVE)
> > > +		return true;
> > > +
> > > +	smp_rmb(); /* C matches A */
> > > +
> > > +	if (timer->base->running == timer)
> > > +		return true;
> > > +
> > > +	smp_rmb(); /* D matches B */
> > > +
> > > +	if (timer->state != HRTIMER_STATE_INACTIVE)
> > > +		return true;
> > > +
> > > +	return false;
> >
> > This races with two sequential timer handlers. hrtimer_active()
> > is preemptible everywhere, and there is no guarantee that all three "if"
> > conditions check the same timer tick.
>
> Indeed.
>
> > How about transformation of hrtimer_bases.lock: raw_spinlock_t --> seqlock_t?
>
> Ingo will like that because it means we already need to touch cpu_base.
>
> But I think there's a problem there on timer migration, the timer can
> migrate between bases while we do the seq read loop and then you can get
> false positives on the different seqcount numbers.
>
> We could of course do something like the below, but hrtimer_is_active()
> is turning into quite the monster.
>
> Needs more comments at the very least, it's full of trickery.

Yeah, it's safe for now, but it may become hard to maintain in the future,
because the barrier logic is not easy to review. But it seems we can
simplify it a little bit. Please see the comments below.
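(Side note for reviewers who have not stared at seqcounts recently: below is
a minimal userspace analogue of the read-retry pattern the patch relies on.
The struct, the names and the sequentially consistent C11 atomics are made
up for illustration; this is not the kernel implementation.)

/*
 * The writer bumps ->seq to an odd value before touching the fields and
 * back to an even value afterwards; the reader retries while it sees an
 * odd value or a value that changed under it, so it never acts on a torn
 * snapshot of ->state and ->running.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct fake_cpu_base {
	atomic_uint		seq;	/* odd: an update is in flight */
	_Atomic int		state;	/* stands in for timer->state */
	_Atomic(const void *)	running;/* stands in for cpu_base->running */
};

static void fake_write_section(struct fake_cpu_base *b, int state,
			       const void *running)
{
	atomic_fetch_add(&b->seq, 1);	/* like write_seqcount_begin() */
	atomic_store(&b->state, state);
	atomic_store(&b->running, running);
	atomic_fetch_add(&b->seq, 1);	/* like write_seqcount_end() */
}

static bool fake_timer_active(struct fake_cpu_base *b, const void *timer)
{
	unsigned int seq;
	bool active;

	for (;;) {
		seq = atomic_load(&b->seq);
		if (seq & 1)
			continue;	/* writer in progress, reread */

		active = atomic_load(&b->state) != 0 ||
			 atomic_load(&b->running) == timer;

		if (atomic_load(&b->seq) == seq)
			return active;	/* like read_seqcount_retry() */
	}
}

The kernel's seqcount_t does the same thing with finer-grained barriers and
lockdep annotations; the point is only that the reader rereads until it sees
an even, unchanged sequence number, so both checks come from one consistent
snapshot.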
> ---
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -59,7 +59,9 @@ enum hrtimer_restart {
>   * mean touching the timer after the callback, this makes it impossible to free
>   * the timer from the callback function.
>   *
> - * Therefore we track the callback state in timer->base->running == timer.
> + * Therefore we track the callback state in:
> + *
> + *	timer->base->cpu_base->running == timer
>   *
>   * On SMP it is possible to have a "callback function running and enqueued"
>   * status. It happens for example when a posix timer expired and the callback
> @@ -144,7 +146,6 @@ struct hrtimer_clock_base {
>  	struct timerqueue_head	active;
>  	ktime_t			(*get_time)(void);
>  	ktime_t			offset;
> -	struct hrtimer		*running;
>  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
>
>  enum hrtimer_base_type {
> @@ -159,6 +160,8 @@ enum hrtimer_base_type {
>   * struct hrtimer_cpu_base - the per cpu clock bases
>   * @lock:		lock protecting the base and associated clock bases
>   *			and timers
> + * @seq:		seqcount around __run_hrtimer
> + * @running:		pointer to the currently running hrtimer
>   * @cpu:		cpu number
>   * @active_bases:	Bitfield to mark bases with active timers
>   * @clock_was_set_seq:	Sequence counter of clock was set events
> @@ -180,6 +183,8 @@ enum hrtimer_base_type {
>   */
>  struct hrtimer_cpu_base {
>  	raw_spinlock_t			lock;
> +	seqcount_t			seq;
> +	struct hrtimer			*running;
>  	unsigned int			cpu;
>  	unsigned int			active_bases;
>  	unsigned int			clock_was_set_seq;
> @@ -394,8 +399,24 @@ extern u64 hrtimer_get_next_event(void);
>   */
>  static inline int hrtimer_active(const struct hrtimer *timer)
>  {
> -	return timer->state != HRTIMER_STATE_INACTIVE ||
> -		timer->base->running == timer;
> +	struct hrtimer_cpu_base *cpu_base;
> +	unsigned int seq;
> +	bool active;
> +
> +	do {
> +		active = false;
> +		cpu_base = READ_ONCE(timer->base->cpu_base);
> +		seqcount_lockdep_reader_access(&cpu_base->seq);
> +		seq = raw_read_seqcount(&cpu_base->seq);
> +
> +		if (timer->state != HRTIMER_STATE_INACTIVE ||
> +		    cpu_base->running == timer)
> +			active = true;
> +
> +	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
> +		 cpu_base != READ_ONCE(timer->base->cpu_base));
> +
> +	return active;
>  }

This may race with migrate_hrtimer_list(), so that path needs to take the
write seqcount too.

>
>  /*
> @@ -412,7 +433,7 @@ static inline int hrtimer_is_queued(stru
>   */
>  static inline int hrtimer_callback_running(struct hrtimer *timer)
>  {
> -	return timer->base->running == timer;
> +	return timer->base->cpu_base->running == timer;
>  }
>
>  /* Forward a hrtimer so it expires after now: */
> --- a/kernel/time/hrtimer.c
> +++ b/kernel/time/hrtimer.c
> @@ -67,6 +67,7 @@
>  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
>  {
>  	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
> +	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
>  	.clock_base =
>  	{
>  		{
> @@ -113,9 +114,15 @@ static inline int hrtimer_clockid_to_bas
>  /*
>   * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
>   * such that hrtimer_callback_running() can unconditionally dereference
> - * timer->base.
> + * timer->base->cpu_base
>   */
> -static struct hrtimer_clock_base migration_base;
> +static struct hrtimer_cpu_base migration_cpu_base = {
> +	.seq = SEQCNT_ZERO(migration_cpu_base),
> +};
> +
> +static struct hrtimer_clock_base migration_base = {
> +	.cpu_base = &migration_cpu_base,
> +};
>
>  /*
>   * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
> @@ -1118,10 +1125,16 @@ static void __run_hrtimer(struct hrtimer
>  	enum hrtimer_restart (*fn)(struct hrtimer *);
>  	int restart;
>
> -	WARN_ON(!irqs_disabled());
> +	lockdep_assert_held(&cpu_base->lock);
>
>  	debug_deactivate(timer);
> -	base->running = timer;
> +	cpu_base->running = timer;

My suggestion is not to use the seqcount around long sections of code, but
instead to implement short primitives for changing a timer's state and the
cpu_base's running timer. Something like this:

static inline void hrtimer_set_state(struct hrtimer *timer, unsigned long state)
{
	struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;

	lockdep_assert_held(&cpu_base->lock);

	write_seqcount_begin(&cpu_base->seq);
	timer->state = state;
	write_seqcount_end(&cpu_base->seq);
}

static inline void cpu_base_set_running(struct hrtimer_cpu_base *cpu_base,
					struct hrtimer *timer)
{
	lockdep_assert_held(&cpu_base->lock);

	write_seqcount_begin(&cpu_base->seq);
	cpu_base->running = timer;
	write_seqcount_end(&cpu_base->seq);
}

With these implemented, we have to think less about getting the barrier
ordering right, because every change is made under the seqcount:

static inline int hrtimer_active(const struct hrtimer *timer)
{
	struct hrtimer_cpu_base *cpu_base;
	struct hrtimer_clock_base *base;
	unsigned int seq;
	bool active = false;

	do {
		base = READ_ONCE(timer->base);
		if (base == &migration_base) {
			active = true;
			break;
		}

		cpu_base = base->cpu_base;
		seqcount_lockdep_reader_access(&cpu_base->seq);
		seq = raw_read_seqcount(&cpu_base->seq);

		if (timer->state != HRTIMER_STATE_INACTIVE ||
		    cpu_base->running == timer) {
			active = true;
			break;
		}
	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
		 READ_ONCE(timer->base) != base);

	return active;
}

> +
> +	/*
> +	 * separate the ->running assignment from the ->state assignment
> +	 */
> +	write_seqcount_begin(&cpu_base->seq);
> +
>  	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
>  	timer_stats_account_hrtimer(timer);
>  	fn = timer->function;
> @@ -1150,8 +1163,13 @@ static void __run_hrtimer(struct hrtimer
>  	    !(timer->state & HRTIMER_STATE_ENQUEUED))
>  		enqueue_hrtimer(timer, base);
>
> -	WARN_ON_ONCE(base->running != timer);
> -	base->running = NULL;
> +	/*
> +	 * separate the ->running assignment from the ->state assignment
> +	 */
> +	write_seqcount_end(&cpu_base->seq);
> +
> +	WARN_ON_ONCE(cpu_base->running != timer);
> +	cpu_base->running = NULL;
>  }
>
>  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
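Just to illustrate what I mean (a rough, uncompiled sketch; the lines around
the quoted hunks are reconstructed here and tracing/debug details are elided,
so treat it as pseudo-code rather than the actual tree), __run_hrtimer() could
then use the helpers instead of open-coding a long seqcount write section:

/*
 * Sketch only: __run_hrtimer() with the short write-section helpers from
 * above.  It assumes __remove_hrtimer()/enqueue_hrtimer() update
 * timer->state through hrtimer_set_state(), so every externally visible
 * state change sits in its own small seqcount write section.
 */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
			  struct hrtimer_clock_base *base,
			  struct hrtimer *timer, ktime_t *now)
{
	enum hrtimer_restart (*fn)(struct hrtimer *);
	int restart;

	lockdep_assert_held(&cpu_base->lock);

	debug_deactivate(timer);
	cpu_base_set_running(cpu_base, timer);		/* write section #1 */

	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
	timer_stats_account_hrtimer(timer);
	fn = timer->function;

	raw_spin_unlock(&cpu_base->lock);
	restart = fn(timer);
	raw_spin_lock(&cpu_base->lock);

	if (restart != HRTIMER_NORESTART &&
	    !(timer->state & HRTIMER_STATE_ENQUEUED))
		enqueue_hrtimer(timer, base);

	WARN_ON_ONCE(cpu_base->running != timer);
	cpu_base_set_running(cpu_base, NULL);		/* write section #2 */
}

A reader racing with any of these write sections is then caught by
read_seqcount_retry() in hrtimer_active(), no matter how many handlers run
back to back.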