Message-ID: <5541265E.1020005@hp.com>
Date: Wed, 29 Apr 2015 14:43:42 -0400
From: Waiman Long
To: Jason Low
CC: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, linux-kernel@vger.kernel.org,
    "Paul E. McKenney", Andrew Morton, Oleg Nesterov, Frederic Weisbecker,
    Mel Gorman, Rik van Riel, Steven Rostedt, Preeti U Murthy, Mike Galbraith,
    Davidlohr Bueso, Aswin Chandramouleeswaran, Scott J Norton
Subject: Re: [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability
References: <1430251224-5764-1-git-send-email-jason.low2@hp.com> <1430251224-5764-4-git-send-email-jason.low2@hp.com>
In-Reply-To: <1430251224-5764-4-git-send-email-jason.low2@hp.com>

On 04/28/2015 04:00 PM, Jason Low wrote:
> While running a database workload, we found a scalability issue with itimers.
>
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger systems
> (such as a 16-socket machine), more than 30% of total time was spent
> trying to acquire this kernel lock to update these group timer stats.
>
> This patch converts the timers to 64-bit atomic variables and uses
> atomic adds to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%.
>
> Note: On 32-bit systems using the generic 64-bit atomics, this causes
> sample_group_cputimer() to take locks 3 times instead of just 1 time.
> However, we tested this patch on a 32-bit ARM system using the
> generic atomics and did not find the overhead to be much of an issue.
> An explanation for why this isn't an issue is that 32-bit systems usually
> have small numbers of CPUs, and cacheline contention from extra spinlocks
> taken periodically is not really apparent on smaller systems.
>
> Signed-off-by: Jason Low
> ---
>  include/linux/init_task.h      |    7 ++--
>  include/linux/sched.h          |   10 ++----
>  kernel/fork.c                  |    3 --
>  kernel/sched/stats.h           |   15 +++-----
>  kernel/time/posix-cpu-timers.c |   79 +++++++++++++++++++++++++---------------
>  5 files changed, 62 insertions(+), 52 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 696d223..7b9d8b5 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
>  	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
>  	.rlim		= INIT_RLIMITS,					\
>  	.cputimer	= {						\
> -		.cputime = INIT_CPUTIME,				\
> -		.running = 0,						\
> -		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
> +		.utime = ATOMIC64_INIT(0),				\
> +		.stime = ATOMIC64_INIT(0),				\
> +		.sum_exec_runtime = ATOMIC64_INIT(0),			\
> +		.running = 0						\
>  	},								\
>  	.cred_guard_mutex =						\
>  		__MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 604eb7c..c736a47 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -601,9 +601,10 @@ struct task_cputime {
>   * used for thread group CPU timer calculations.
>   */
>  struct thread_group_cputimer {
> -	struct task_cputime cputime;
> +	atomic64_t utime;
> +	atomic64_t stime;
> +	atomic64_t sum_exec_runtime;
>  	int running;
> -	raw_spinlock_t lock;
>  };
>
>  #include
> @@ -2970,11 +2971,6 @@ static __always_inline bool need_resched(void)
>  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
>  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
>
> -static inline void thread_group_cputime_init(struct signal_struct *sig)
> -{
> -	raw_spin_lock_init(&sig->cputimer.lock);
> -}
> -
>  /*
>   * Reevaluate whether the task has signals pending delivery.
>   * Wake the task if so.
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 47c37a4..2e67086 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
>  {
>  	unsigned long cpu_limit;
>
> -	/* Thread group counters. */
> -	thread_group_cputime_init(sig);
> -
>  	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
>  	if (cpu_limit != RLIM_INFINITY) {
>  		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 4ab7043..c6d1c7d 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
>  {
>  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
>
> -	if (!cputimer->running)
> +	/* Check if cputimer isn't running. This is accessed without locking. */
> +	if (!READ_ONCE(cputimer->running))
>  		return false;
>
>  	/*
> @@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.utime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime, &cputimer->utime);
>  }
>
>  /**
> @@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.stime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime, &cputimer->stime);
>  }
>
>  /**
> @@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.sum_exec_runtime += ns;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(ns, &cputimer->sum_exec_runtime);
>  }
> diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> index e072d98..d857306 100644
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
>  	return 0;
>  }
>
> -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
> +/*
> + * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
> + * to avoid race conditions with concurrent updates to cputime.
> + */
> +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
>  {
> -	if (b->utime > a->utime)
> -		a->utime = b->utime;
> +	u64 curr_cputime;
> +retry:
> +	curr_cputime = atomic64_read(cputime);
> +	if (sum_cputime > curr_cputime) {
> +		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
> +			goto retry;
> +	}
> +}
>
> -	if (b->stime > a->stime)
> -		a->stime = b->stime;
> +static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
> +{
> +	__update_gt_cputime(&cputimer->utime, sum->utime);
> +	__update_gt_cputime(&cputimer->stime, sum->stime);
> +	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
> +}
>
> -	if (b->sum_exec_runtime > a->sum_exec_runtime)
> -		a->sum_exec_runtime = b->sum_exec_runtime;
> +/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
> +static inline void sample_group_cputimer(struct task_cputime *times,
> +					 struct thread_group_cputimer *cputimer)
> +{
> +	times->utime = atomic64_read(&cputimer->utime);
> +	times->stime = atomic64_read(&cputimer->stime);
> +	times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
>  }
>
>  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
>  {
>  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
>  	struct task_cputime sum;
> -	unsigned long flags;
>
> -	if (!cputimer->running) {
> +	/* Check if cputimer isn't running. This is accessed without locking. */
> +	if (!READ_ONCE(cputimer->running)) {
>  		/*
>  		 * The POSIX timer interface allows for absolute time expiry
>  		 * values through the TIMER_ABSTIME flag, therefore we have
> -		 * to synchronize the timer to the clock every time we start
> -		 * it.
> +		 * to synchronize the timer to the clock every time we start it.
>  		 */
>  		thread_group_cputime(tsk, &sum);
> -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> -		cputimer->running = 1;
> -		update_gt_cputime(&cputimer->cputime, &sum);
> -	} else
> -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> -	*times = cputimer->cputime;
> -	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> +		update_gt_cputime(cputimer, &sum);
> +
> +		/*
> +		 * We're setting cputimer->running without a lock. Ensure
> +		 * this only gets written to in one operation. We set
> +		 * running after update_gt_cputime() as a small optimization,
> +		 * but barriers are not required because update_gt_cputime()
> +		 * can handle concurrent updates.
> +		 */
> +		WRITE_ONCE(cputimer->running, 1);
> +	}
> +	sample_group_cputimer(times, cputimer);
> }

If there is a possibility that more than one thread will be running this
code concurrently, I think it will be safer to use cmpxchg to set the
running flag:

	if (!READ_ONCE(cputimer->running) &&
	    !cmpxchg(&cputimer->running, 0, 1)) {
		...

This will ensure that only one thread will update it.

Cheers,
Longman
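P.S. Concretely, the start-up path in thread_group_cputimer() could then
look roughly like the sketch below (untested, written against the patched
code quoted above, not a drop-in replacement):

	void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
	{
		struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
		struct task_cputime sum;

		/*
		 * Only the thread that flips running from 0 to 1 performs
		 * the initial clock synchronization; everyone else just
		 * samples the atomics.
		 */
		if (!READ_ONCE(cputimer->running) &&
		    !cmpxchg(&cputimer->running, 0, 1)) {
			thread_group_cputime(tsk, &sum);
			update_gt_cputime(cputimer, &sum);
		}
		sample_group_cputimer(times, cputimer);
	}

One trade-off to note: unlike the patch, this publishes running = 1 before
update_gt_cputime() has finished, so concurrent updaters may start
accounting slightly earlier. Since update_gt_cputime() is written to
tolerate concurrent updates, that should be benign, but it would deserve a
comment.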
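For anyone who wants to experiment with the lockless pattern outside the
kernel, here is a minimal self-contained userspace analogue using C11
atomics (hypothetical names; it mimics the atomic-add accounting and the
cmpxchg retry loop from the patch, not the actual kernel code):

	#include <inttypes.h>
	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Userspace stand-in for struct thread_group_cputimer. */
	struct group_cputimer {
		atomic_uint_least64_t utime;
		atomic_uint_least64_t stime;
		atomic_uint_least64_t sum_exec_runtime;
	};

	/* Lockless accounting: a single atomic add, no spinlock. */
	static void account_utime(struct group_cputimer *c, uint64_t delta)
	{
		atomic_fetch_add(&c->utime, delta);
	}

	/* Monotonic "set if greater", like __update_gt_cputime(). */
	static void update_gt(atomic_uint_least64_t *t, uint64_t sum)
	{
		uint64_t cur = atomic_load(t);

		/* On CAS failure, cur is reloaded and the bound rechecked. */
		while (sum > cur) {
			if (atomic_compare_exchange_weak(t, &cur, sum))
				break;
		}
	}

	int main(void)
	{
		struct group_cputimer c = { 0 };	/* zero-initialized */

		account_utime(&c, 100);		/* utime = 100 */
		update_gt(&c.utime, 250);	/* raises utime to 250 */
		update_gt(&c.utime, 50);	/* no-op: 50 < 250 */

		printf("utime = %" PRIu64 "\n",
		       (uint64_t)atomic_load(&c.utime));
		return 0;
	}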