Date: Mon, 29 Jul 2013 10:43:12 -0700
From: "Paul E. McKenney"
To: Lai Jiangshan
Cc: linux-kernel@vger.kernel.org, mingo@elte.hu, dipankar@in.ibm.com,
	akpm@linux-foundation.org, mathieu.desnoyers@polymtl.ca,
	josh@joshtriplett.org, niv@us.ibm.com, tglx@linutronix.de,
	peterz@infradead.org, rostedt@goodmis.org, dhowells@redhat.com,
	edumazet@google.com, darren@dvhart.com, fweisbec@gmail.com, sbw@mit.edu
Subject: Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
Message-ID: <20130729174312.GQ26694@linux.vnet.ibm.com>
Reply-To: paulmck@linux.vnet.ibm.com
References: <20130726231848.GA12967@linux.vnet.ibm.com>
 <1374880764-14248-1-git-send-email-paulmck@linux.vnet.ibm.com>
 <1374880764-14248-6-git-send-email-paulmck@linux.vnet.ibm.com>
 <51F625A4.1020401@cn.fujitsu.com>
In-Reply-To: <51F625A4.1020401@cn.fujitsu.com>

On Mon, Jul 29, 2013 at 04:19:48PM +0800, Lai Jiangshan wrote:
> On 07/27/2013 07:19 AM, Paul E. McKenney wrote:
> > From: "Paul E. McKenney"
> > 
> > This commit adds the state machine that takes the per-CPU idle data
> > as input and produces a full-system-idle indication as output.  This
> > state machine is driven out of RCU's quiescent-state-forcing
> > mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU
> > idle state and then rcu_sysidle_report() to drive the state machine.
> > 
> > The full-system-idle state is sampled using rcu_sys_is_idle(), which
> > also drives the state machine if RCU is idle (and does so by forcing
> > RCU to become non-idle).  This function returns true if all but the
> > timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long
> > enough to avoid memory contention on the full_sysidle_state state
> > variable.  rcu_sysidle_force_exit() may be called externally to reset
> > the state machine back into the non-idle state.
> > 
> > Signed-off-by: Paul E. McKenney
> > Cc: Frederic Weisbecker
> > Cc: Steven Rostedt
> > ---
> >  include/linux/rcupdate.h |  18 +++
> >  kernel/rcutree.c         |  16 ++-
> >  kernel/rcutree.h         |   5 +
> >  kernel/rcutree_plugin.h  | 284 ++++++++++++++++++++++++++++++++++++++++++++++-
> >  4 files changed, 316 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> > index 48f1ef9..1aa8d8c 100644
> > --- a/include/linux/rcupdate.h
> > +++ b/include/linux/rcupdate.h
> > @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
> >  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
> > 
> > 
> > +/* Only for use by adaptive-ticks code. */
> > +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
> > +extern bool rcu_sys_is_idle(void);
> > +extern void rcu_sysidle_force_exit(void);
> > +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> > +
> > +static inline bool rcu_sys_is_idle(void)
> > +{
> > +	return false;
> > +}
> > +
> > +static inline void rcu_sysidle_force_exit(void)
> > +{
> > +}
> > +
> > +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> > +
> > +
> >  #endif /* __LINUX_RCUPDATE_H */
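
As an aside for anyone reading along, the intended usage pattern for
this pair of APIs looks roughly like the following sketch of a
hypothetical adaptive-ticks caller.  (Illustrative only:
timekeeping_try_stop_tick() and do_stop_tick() are invented names, and
the real wiring into the nohz_full code comes later in the series.)

	/* Sketch: called on the timekeeping CPU with irqs disabled. */
	static void timekeeping_try_stop_tick(void)
	{
		/*
		 * rcu_sys_is_idle() returns true only after all other
		 * CPUs have been idle long enough, so false here just
		 * means "keep ticking and poll again later".
		 */
		if (rcu_sys_is_idle())
			do_stop_tick();		/* hypothetical helper */
	}

	/*
	 * Sketch: if the timekeeping CPU does anything beyond taking
	 * scheduling-clock interrupts, it must kick the state machine
	 * out of full-system-idle state itself.
	 */
	static void timekeeping_do_work(void)
	{
		rcu_sysidle_force_exit();
		/* ... actual non-idle work ... */
	}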
> > diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> > index 725524e..aa6d96e 100644
> > --- a/kernel/rcutree.c
> > +++ b/kernel/rcutree.c
> > @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
> >  					 bool *isidle, unsigned long *maxj)
> >  {
> >  	rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
> > +	rcu_sysidle_check_cpu(rdp, isidle, maxj);
> >  	return (rdp->dynticks_snap & 0x1) == 0;
> >  }
> > 
> > @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
> >  	rsp->n_force_qs++;
> >  	if (fqs_state == RCU_SAVE_DYNTICK) {
> >  		/* Collect dyntick-idle snapshots. */
> > +		if (is_sysidle_rcu_state(rsp)) {
> > +			isidle = 1;
> 
> 	isidle = true;
> the type of isidle is bool
> 
> > +			maxj = jiffies - ULONG_MAX / 4;
> > +		}
> >  		force_qs_rnp(rsp, dyntick_save_progress_counter,
> >  			     &isidle, &maxj);
> > +		rcu_sysidle_report_gp(rsp, isidle, maxj);
> >  		fqs_state = RCU_FORCE_QS;
> >  	} else {
> >  		/* Handle dyntick-idle and offline CPUs. */
> > +		isidle = 0;
> 
> 	isidle = false;
> 
> >  		force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
> >  	}
> >  	/* Clear flag to prevent immediate re-entry. */
> > @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp,
> >  		cpu = rnp->grplo;
> >  		bit = 1;
> >  		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
> > -			if ((rnp->qsmask & bit) != 0 &&
> > -			    f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> > -				mask |= bit;
> > +			if ((rnp->qsmask & bit) != 0) {
> > +				if ((rnp->qsmaskinit & bit) != 0)
> > +					*isidle = 0;
> 
> 	*isidle = false

All good catches, fixed.

> > +				if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> > +					mask |= bit;
> > +			}
> >  		}
> >  		if (mask != 0) {
> > 
> > diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> > index 1895043..e0de5dc 100644
> > --- a/kernel/rcutree.h
> > +++ b/kernel/rcutree.h
> > @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu);
> >  static bool init_nocb_callback_list(struct rcu_data *rdp);
> >  static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
> >  static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
> > +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> > +				  unsigned long *maxj);
> > +static bool is_sysidle_rcu_state(struct rcu_state *rsp);
> > +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> > +				  unsigned long maxj);
> >  static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
> > 
> >  #endif /* #ifndef RCU_TREE_NONCORE */
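
(For readers not familiar with force_qs_rnp(): each rcu_node leaf
covers CPUs grplo through grphi, one ->qsmask bit per CPU, and the
change above just notes non-idleness for any scanned CPU that was ever
online.  Below is a stripped-down user-space sketch of the scan
pattern, not the kernel code; the mask value and predicate are
invented:)

	#include <stdbool.h>
	#include <stdio.h>

	#define GRPLO	0
	#define GRPHI	15			/* leaf covers CPUs 0..15 */

	static unsigned long qsmask = 0xbeef;	/* CPUs still owing a QS */

	static bool cpu_passed_qs(int cpu)	/* stand-in for f() */
	{
		return cpu % 3 == 0;		/* arbitrary demo predicate */
	}

	int main(void)
	{
		unsigned long bit = 1, mask = 0;
		int cpu;

		for (cpu = GRPLO; cpu <= GRPHI; cpu++, bit <<= 1)
			if ((qsmask & bit) && cpu_passed_qs(cpu))
				mask |= bit;	/* CPUs to report below */
		printf("report mask: %#lx\n", mask);
		return 0;
	}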
> > diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> > index 3edae39..ff84bed 100644
> > --- a/kernel/rcutree_plugin.h
> > +++ b/kernel/rcutree_plugin.h
> > @@ -28,7 +28,7 @@
> >  #include
> >  #include
> >  #include
> > -#include
> > +#include "time/tick-internal.h"
> > 
> >  #define RCU_KTHREAD_PRIO 1
> > 
> > @@ -2395,12 +2395,12 @@ static void rcu_kick_nohz_cpu(int cpu)
> >   * most active flavor of RCU.
> >   */
> >  #ifdef CONFIG_PREEMPT_RCU
> > -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state;
> > +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
> >  #else /* #ifdef CONFIG_PREEMPT_RCU */
> > -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state;
> > +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
> >  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
> > 
> > -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */
> > +static int full_sysidle_state;		/* Current system-idle state. */
> >  #define RCU_SYSIDLE_NOT		0	/* Some CPU is not idle. */
> >  #define RCU_SYSIDLE_SHORT	1	/* All CPUs idle for brief period. */
> >  #define RCU_SYSIDLE_LONG	2	/* All CPUs idle for long enough. */
> > @@ -2444,6 +2444,38 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
> >  }
> > 
> >  /*
> > + * Unconditionally force exit from full system-idle state.  This is
> > + * invoked when a normal CPU exits idle, but must be called separately
> > + * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
> > + * is that the timekeeping CPU is permitted to take scheduling-clock
> > + * interrupts while the system is in system-idle state, and of course
> > + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
> > + * interrupt from any other type of interrupt.
> > + */
> > +void rcu_sysidle_force_exit(void)
> > +{
> > +	int oldstate = ACCESS_ONCE(full_sysidle_state);
> > +	int newoldstate;
> > +
> > +	/*
> > +	 * Each pass through the following loop attempts to exit full
> > +	 * system-idle state.  If contention proves to be a problem,
> > +	 * a trylock-based contention tree could be used here.
> > +	 */
> > +	while (oldstate > RCU_SYSIDLE_SHORT) {
> > +		newoldstate = cmpxchg(&full_sysidle_state,
> > +				      oldstate, RCU_SYSIDLE_NOT);
> > +		if (oldstate == newoldstate &&
> > +		    oldstate == RCU_SYSIDLE_FULL_NOTED) {
> > +			rcu_kick_nohz_cpu(tick_do_timer_cpu);
> > +			return; /* We cleared it, done! */
> > +		}
> > +		oldstate = newoldstate;
> > +	}
> > +	smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
> 
> why do we need this mb()?
> which mb() is paired with this?

This one is for the case where we didn't do the cmpxchg() above.  The
idea is that if we saw that oldstate was RCU_SYSIDLE_SHORT or
RCU_SYSIDLE_NOT, then anyone attempting to update the value who has
seen any of our later non-idle activity also "sees" the load from
full_sysidle_state, which reduces the state space a bit.

The barrier pairs with the various cmpxchg() operations that advance
full_sysidle_state.
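
In schematic form (a sketch of the intended pairing, not code from the
patch):

	/*
	 * CPU 0 (idle exit that saw SHORT/NOT and skipped the cmpxchg):
	 *	r1 = full_sysidle_state;	/* early-exit load */
	 *	smp_mb();			/* the barrier in question */
	 *	WRITE non_idle_work;
	 *
	 * CPU 1 (advancing the state machine):
	 *	READ non_idle_work;		/* sees CPU 0's activity */
	 *	cmpxchg(&full_sysidle_state, ...); /* full barrier each side */
	 *
	 * If CPU 1's read sees CPU 0's non-idle work before CPU 1's
	 * cmpxchg(), then CPU 0's earlier load from full_sysidle_state
	 * is also ordered before that cmpxchg(), which is what shrinks
	 * the state space.
	 */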
> > + */ > > + if (smp_processor_id() == tick_do_timer_cpu) > > + return; > > + > > + /* Update system-idle state: We are clearly no longer fully idle! */ > > + rcu_sysidle_force_exit(); > > +} > > + > > +/* > > + * Check to see if the current CPU is idle. Note that usermode execution > > + * does not count as idle. The caller must have disabled interrupts. > > + */ > > +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, > > + unsigned long *maxj) > > +{ > > + int cur; > > + unsigned long j; > > + struct rcu_dynticks *rdtp = rdp->dynticks; > > + > > + /* > > + * If some other CPU has already reported non-idle, if this is > > + * not the flavor of RCU that tracks sysidle state, or if this > > + * is an offline or the timekeeping CPU, nothing to do. > > + */ > > + if (!*isidle || rdp->rsp != rcu_sysidle_state || > > + cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) > > + return; > > + /* WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); */ > > + > > + /* Pick up current idle and NMI-nesting counter and check. */ > > + cur = atomic_read(&rdtp->dynticks_idle); > > + if (cur & 0x1) { > > + *isidle = 0; /* We are not idle! */ > > *isidle = false; > > And other places which use "isidle". > > > + return; > > + } > > + smp_mb(); /* Read counters before timestamps. */ > > + > > + /* Pick up timestamps. */ > > + j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); > > + /* If this CPU entered idle more recently, update maxj timestamp. */ > > + if (ULONG_CMP_LT(*maxj, j)) > > + *maxj = j; > > +} > > + > > +/* > > + * Is this the flavor of RCU that is handling full-system idle? > > + */ > > +static bool is_sysidle_rcu_state(struct rcu_state *rsp) > > +{ > > + return rsp == rcu_sysidle_state; > > +} > > + > > +/* > > + * Return a delay in jiffies based on the number of CPUs, rcu_node > > + * leaf fanout, and jiffies tick rate. The idea is to allow larger > > + * systems more time to transition to full-idle state in order to > > + * avoid the cache thrashing that otherwise occur on the state variable. > > + * Really small systems (less than a couple of tens of CPUs) should > > + * instead use a single global atomically incremented counter, and later > > + * versions of this will automatically reconfigure themselves accordingly. > > + */ > > +static unsigned long rcu_sysidle_delay(void) > > +{ > > + if (nr_cpu_ids <= RCU_SYSIDLE_SMALL) > > + return 0; > > + return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); > > +} > > + > > +/* > > + * Advance the full-system-idle state. This is invoked when all of > > + * the non-timekeeping CPUs are idle. > > + */ > > +static void rcu_sysidle(unsigned long j) > > +{ > > + /* Check the current state. */ > > + switch (ACCESS_ONCE(full_sysidle_state)) { > > + case RCU_SYSIDLE_NOT: > > + > > + /* First time all are idle, so note a short idle period. */ > > + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; > > + break; > > + > > + case RCU_SYSIDLE_SHORT: > > + > > + /* > > + * Idle for a bit, time to advance to next state? > > + * cmpxchg failure means race with non-idle, let them win. > > + */ > > + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) > > + (void)cmpxchg(&full_sysidle_state, > > + RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); > > + break; > > I don't think it will race with any body. > I think ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_LONG is enough. > > note: > rcu_sysidle_force_exit() doesn't change full_sysidle_state if it is RCU_SYSIDLE_SHORT. 
> > +
> > +/*
> > + * Advance the full-system-idle state.  This is invoked when all of
> > + * the non-timekeeping CPUs are idle.
> > + */
> > +static void rcu_sysidle(unsigned long j)
> > +{
> > +	/* Check the current state. */
> > +	switch (ACCESS_ONCE(full_sysidle_state)) {
> > +	case RCU_SYSIDLE_NOT:
> > +
> > +		/* First time all are idle, so note a short idle period. */
> > +		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
> > +		break;
> > +
> > +	case RCU_SYSIDLE_SHORT:
> > +
> > +		/*
> > +		 * Idle for a bit, time to advance to next state?
> > +		 * cmpxchg failure means race with non-idle, let them win.
> > +		 */
> > +		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
> > +			(void)cmpxchg(&full_sysidle_state,
> > +				      RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
> > +		break;
> 
> I don't think it will race with anybody.
> I think ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_LONG is enough.
> 
> note:
> rcu_sysidle_force_exit() doesn't change full_sysidle_state if it is
> RCU_SYSIDLE_SHORT.
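
(For reference, the generic shape of the idiom in question, advance the
state with cmpxchg() and let any concurrent reset win, sketched in
user-space C11.  The state names and sysidle_advance() are invented for
this sketch; this is not the kernel code:)

	#include <stdatomic.h>

	enum { S_NOT, S_SHORT, S_LONG, S_FULL };

	static _Atomic int state = S_SHORT;

	/*
	 * Advance the state exactly one step, but only if nobody reset
	 * it in the meantime: a failed compare-exchange means another
	 * CPU changed the state, and that (non-idle) side wins.
	 */
	static void sysidle_advance(int from)
	{
		int expected = from;

		atomic_compare_exchange_strong(&state, &expected, from + 1);
		/* On failure, do nothing; 'expected' holds the winner. */
	}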
> > +
> > +	case RCU_SYSIDLE_LONG:
> > +
> > +		/*
> > +		 * Do an additional check pass before advancing to full.
> > +		 * cmpxchg failure means race with non-idle, let them win.
> > +		 */
> > +		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
> > +			(void)cmpxchg(&full_sysidle_state,
> > +				      RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
> > +		break;
> > +
> > +	default:
> > +		break;
> > +	}
> > +}
> > +
> > +/*
> > + * Found a non-idle non-timekeeping CPU, so kick the system-idle state
> > + * back to the beginning.
> > + */
> > +static void rcu_sysidle_cancel(void)
> > +{
> > +	smp_mb();
> > +	ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
> > +}
> > +
> > +/*
> > + * Update the sysidle state based on the results of a force-quiescent-state
> > + * scan of the CPUs' dyntick-idle state.
> > + */
> > +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
> > +			       unsigned long maxj, bool gpkt)
> > +{
> > +	if (rsp != rcu_sysidle_state)
> > +		return; /* Wrong flavor, ignore. */
> > +	if (isidle) {
> > +		if (gpkt && nr_cpu_ids > RCU_SYSIDLE_SMALL)
> > +			rcu_sysidle(maxj); /* More idle! */
> > +	} else {
> > +		rcu_sysidle_cancel(); /* Idle is over. */
> > +	}
> > +}
> 
> "gpkt" is always equal to "nr_cpu_ids > RCU_SYSIDLE_SMALL",
> so we can remove the "gpkt" argument and rcu_sysidle_report_gp().
> 
> > +
> > +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> > +				  unsigned long maxj)
> > +{
> > +	rcu_sysidle_report(rsp, isidle, maxj, true);
> > +}
> > +
> > +/* Callback and function for forcing an RCU grace period. */
> > +struct rcu_sysidle_head {
> > +	struct rcu_head rh;
> > +	int inuse;
> > +};
> > +
> > +static void rcu_sysidle_cb(struct rcu_head *rhp)
> > +{
> > +	struct rcu_sysidle_head *rshp;
> > +
> > +	smp_mb(); /* grace period precedes setting inuse. */
> 
> Why do we need this mb()?
> 
> > +	rshp = container_of(rhp, struct rcu_sysidle_head, rh);
> > +	ACCESS_ONCE(rshp->inuse) = 0;
> > +}
> > +
> > +/*
> > + * Check to see if the system is fully idle, other than the timekeeping CPU.
> > + * The caller must have disabled interrupts.
> > + */
> > +bool rcu_sys_is_idle(void)
> > +{
> > +	static struct rcu_sysidle_head rsh;
> > +	int rss = ACCESS_ONCE(full_sysidle_state);
> > +
> > +	if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
> > +		return false;
> > +
> > +	/* Handle small-system case by doing a full scan of CPUs. */
> > +	if (nr_cpu_ids <= RCU_SYSIDLE_SMALL) {
> > +		int oldrss = rss - 1;
> > +
> > +		/*
> > +		 * One pass to advance to each state up to _FULL.
> > +		 * Give up if any pass fails to advance the state.
> > +		 */
> > +		while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
> > +			int cpu;
> > +			bool isidle = true;
> > +			unsigned long maxj = jiffies - ULONG_MAX / 4;
> > +			struct rcu_data *rdp;
> > +
> > +			/* Scan all the CPUs looking for nonidle CPUs. */
> > +			for_each_possible_cpu(cpu) {
> > +				rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
> > +				rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
> > +				if (!isidle)
> > +					break;
> > +			}
> > +			rcu_sysidle_report(rcu_sysidle_state,
> > +					   isidle, maxj, false);
> > +			oldrss = rss;
> > +			rss = ACCESS_ONCE(full_sysidle_state);
> > +		}
> > +	}
> > +
> > +	/* If this is the first observation of an idle period, record it. */
> > +	if (rss == RCU_SYSIDLE_FULL) {
> > +		rss = cmpxchg(&full_sysidle_state,
> > +			      RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
> > +		return rss == RCU_SYSIDLE_FULL;
> > +	}
> > +
> > +	smp_mb(); /* ensure rss load happens before later caller actions. */
> > +
> > +	/* If already fully idle, tell the caller (in case of races). */
> > +	if (rss == RCU_SYSIDLE_FULL_NOTED)
> > +		return true;
> > +
> > +	/*
> > +	 * If we aren't there yet, and a grace period is not in flight,
> > +	 * initiate a grace period.  Either way, tell the caller that
> > +	 * we are not there yet.
> > +	 */
> > +	if (nr_cpu_ids > RCU_SYSIDLE_SMALL &&
> > +	    !rcu_gp_in_progress(rcu_sysidle_state) &&
> > +	    !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
> 
> Why do we need to use xchg()?  Who will it race with?
> 
> > +		call_rcu(&rsh.rh, rcu_sysidle_cb);
> > +	return false;
> >  }
> > 
> >  /*
> > @@ -2496,6 +2757,21 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
> >  {
> >  }
> > 
> > +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> > +				  unsigned long *maxj)
> > +{
> > +}
> > +
> > +static bool is_sysidle_rcu_state(struct rcu_state *rsp)
> > +{
> > +	return false;
> > +}
> > +
> > +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> > +				  unsigned long maxj)
> > +{
> > +}
> > +
> >  static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
> >  {
> >  }