2015-04-28 20:00:39

by Jason Low

Subject: [PATCH v2 0/5] sched, timer: Improve scalability of itimers

This patchset improves the scalability of itimers and thread_group_cputimer,
and addresses a performance issue we found while running a database
workload, where more than 30% of total time was spent in the kernel
trying to acquire the thread_group_cputimer spinlock.

While we're modifying sched and timer code, patch 1 also converts all
existing usages of ACCESS_ONCE in those areas to the new READ_ONCE and
WRITE_ONCE APIs.

Jason Low (5):
sched, timer: Remove usages of ACCESS_ONCE in the scheduler
sched, numa: Document usages of mm->numa_scan_seq
sched, timer: Use atomics in thread_group_cputimer to improve
scalability
sched, timer: Provide an atomic task_cputime data structure
sched, timer: Use the atomic task_cputime in thread_group_cputimer

include/linux/init_task.h | 5 +-
include/linux/sched.h | 29 +++++++++----
kernel/fork.c | 5 +--
kernel/sched/auto_group.c | 2 +-
kernel/sched/auto_group.h | 2 +-
kernel/sched/core.c | 4 +-
kernel/sched/cputime.c | 2 +-
kernel/sched/deadline.c | 2 +-
kernel/sched/fair.c | 26 +++++++++---
kernel/sched/proc.c | 4 +-
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 2 +-
kernel/sched/stats.h | 15 ++-----
kernel/sched/wait.c | 4 +-
kernel/time/posix-cpu-timers.c | 87 +++++++++++++++++++++++++---------------
15 files changed, 113 insertions(+), 78 deletions(-)

--
1.7.2.5


2015-04-28 20:00:45

by Jason Low

Subject: [PATCH v2 1/5] sched, timer: Remove usages of ACCESS_ONCE in the scheduler

ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes
the rest of the existing usages of ACCESS_ONCE in the scheduler and uses
the new READ_ONCE and WRITE_ONCE APIs instead.
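
For illustration, the conversion pattern applied throughout the series
looks roughly like the sketch below (the struct and field names are
hypothetical and are not taken from the patch):

#include <linux/compiler.h>

/* Hypothetical example structure, for illustration only. */
struct example {
	unsigned long counter;
	unsigned long seq;
};

static unsigned long example_read_counter(struct example *e)
{
	/* Was: ACCESS_ONCE(e->counter) */
	return READ_ONCE(e->counter);
}

static void example_bump_seq(struct example *e)
{
	/* Was: ACCESS_ONCE(e->seq)++; the load and store are now explicit. */
	WRITE_ONCE(e->seq, READ_ONCE(e->seq) + 1);
}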

Signed-off-by: Jason Low <[email protected]>
---
include/linux/sched.h | 4 ++--
kernel/fork.c | 2 +-
kernel/sched/auto_group.c | 2 +-
kernel/sched/auto_group.h | 2 +-
kernel/sched/core.c | 4 ++--
kernel/sched/cputime.c | 2 +-
kernel/sched/deadline.c | 2 +-
kernel/sched/fair.c | 14 +++++++-------
kernel/sched/proc.c | 4 ++--
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 2 +-
kernel/sched/wait.c | 4 ++--
kernel/time/posix-cpu-timers.c | 8 ++++----
13 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8222ae4..604eb7c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3088,13 +3088,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
static inline unsigned long task_rlimit(const struct task_struct *tsk,
unsigned int limit)
{
- return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
+ return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
}

static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
unsigned int limit)
{
- return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
+ return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
}

static inline unsigned long rlimit(unsigned int limit)
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaa..47c37a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1094,7 +1094,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
/* Thread group counters. */
thread_group_cputime_init(sig);

- cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
sig->cputimer.running = 1;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 1a3b58d..750ed60 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -139,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)

p->signal->autogroup = autogroup_kref_get(ag);

- if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ if (!READ_ONCE(sysctl_sched_autogroup_enabled))
goto out;

for_each_thread(p, t)
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd0471..890c95f 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+ int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);

if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9123a8..aa88ff7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);

for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
@@ -2540,7 +2540,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ unsigned long next, now = READ_ONCE(jiffies);

next = rq->last_sched_tick + HZ;

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1e..f5a64ff 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
{
cputime_t old;

- while (new > (old = ACCESS_ONCE(*counter)))
+ while (new > (old = READ_ONCE(*counter)))
cmpxchg_cputime(counter, old, new);
}

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145..890ce95 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);

rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */

/*
* If we are dealing with a -deadline task, we must
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa41..5a44371 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)

static unsigned int task_scan_min(struct task_struct *p)
{
- unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;

@@ -1794,7 +1794,7 @@ static void task_numa_placement(struct task_struct *p)
u64 runtime, period;
spinlock_t *group_lock = NULL;

- seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
@@ -1938,7 +1938,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
}

rcu_read_lock();
- tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+ tsk = READ_ONCE(cpu_rq(cpu)->curr);

if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
@@ -2107,7 +2107,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)

static void reset_ptenuma_scan(struct task_struct *p)
{
- ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}

@@ -4375,7 +4375,7 @@ static unsigned long capacity_orig_of(int cpu)
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+ unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
unsigned long load_avg = rq->cfs.runnable_load_avg;

if (nr_running)
@@ -6037,8 +6037,8 @@ static unsigned long scale_rt_capacity(int cpu)
* Since we're reading these variables without serialization make sure
* we read them once before doing sanity checks on them.
*/
- age_stamp = ACCESS_ONCE(rq->age_stamp);
- avg = ACCESS_ONCE(rq->rt_avg);
+ age_stamp = READ_ONCE(rq->age_stamp);
+ avg = READ_ONCE(rq->rt_avg);
delta = __rq_clock_broken(rq) - age_stamp;

if (unlikely(delta < 0))
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 8ecd552..361e6f1 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -526,7 +526,7 @@ static inline unsigned long get_rq_runnable_load(struct rq *rq)
*/
void update_idle_cpu_load(struct rq *this_rq)
{
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long load = get_rq_runnable_load(this_rq);
unsigned long pending_updates;

@@ -548,7 +548,7 @@ void update_idle_cpu_load(struct rq *this_rq)
void update_cpu_load_nohz(void)
{
struct rq *this_rq = this_rq();
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long pending_updates;

if (curr_jiffies == this_rq->last_load_update_tick)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76..560d2fa 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);

rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */

/*
* If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e1299..014b550 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -707,7 +707,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline u64 __rq_clock_broken(struct rq *rq)
{
- return ACCESS_ONCE(rq->clock);
+ return READ_ONCE(rq->clock);
}

static inline u64 rq_clock(struct rq *rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a..2ccec98 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);

__sched int bit_wait_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);

__sched int bit_wait_io_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da7..e072d98 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -852,10 +852,10 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);

if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -958,11 +958,11 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
cputime_t x;
if (psecs >= hard) {
/*
--
1.7.2.5

2015-04-28 20:00:51

by Jason Low

Subject: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

The p->mm->numa_scan_seq is accessed using READ_ONCE/WRITE_ONCE
and modified without exclusive access. It is not clear why it is
accessed this way. This patch provides some documentation on that.

Signed-off-by: Jason Low <[email protected]>
---
kernel/sched/fair.c | 12 ++++++++++++
1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5a44371..794f7d7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
u64 runtime, period;
spinlock_t *group_lock = NULL;

+ /*
+ * The p->mm->numa_scan_seq gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access.
+ */
seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
@@ -2107,6 +2112,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)

static void reset_ptenuma_scan(struct task_struct *p)
{
+ /*
+ * We only did a read acquisition of the mmap sem, so
+ * p->mm->numa_scan_seq is written to without exclusive access.
+ * That's not much of an issue though, since this is just used
+ * for statistical sampling. Use WRITE_ONCE and READ_ONCE, which
+ * are not expensive, to avoid load/store tearing.
+ */
WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}
--
1.7.2.5

2015-04-28 20:01:03

by Jason Low

Subject: [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability

While running a database workload, we found a scalability issue with itimers.

Much of the problem was caused by the thread_group_cputimer spinlock.
Each time we account for group system/user time, we need to obtain the
thread_group_cputimer's spinlock to update the timers. On larger systems
(such as a 16 socket machine), this caused more than 30% of total time
to be spent trying to obtain this kernel lock to update these group
timer stats.

This patch converts the timers to 64 bit atomic variables and uses
atomic adds to update them without a lock. With this patch, the percentage
of total time spent updating the thread group cputimer timers was reduced
from 30% down to less than 1%.

Note: On 32 bit systems using the generic 64 bit atomics, this causes
sample_group_cputimer() to take a lock 3 times instead of just once.
However, we tested this patch on a 32 bit ARM system using the
generic atomics and did not find the overhead to be much of an issue.
An explanation for why this isn't an issue is that 32 bit systems usually
have small numbers of CPUs, and cacheline contention from extra spinlocks
taken periodically is not really apparent on smaller systems.
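
For reference, on 32 bit machines without native 64 bit atomic
instructions, the generic atomic64 implementation (lib/atomic64.c)
falls back to a small set of hashed spinlocks, which is where the extra
lock acquisitions come from. A simplified sketch of the idea (not the
exact kernel code; names are made up for illustration) is:

#define EXAMPLE_NR_LOCKS	16

static raw_spinlock_t example_locks[EXAMPLE_NR_LOCKS];

/* Hash the counter address onto one of the spinlocks. */
static raw_spinlock_t *example_lock_addr(const atomic64_t *v)
{
	return &example_locks[((unsigned long)v >> 5) % EXAMPLE_NR_LOCKS];
}

static long long example_atomic64_read(const atomic64_t *v)
{
	raw_spinlock_t *lock = example_lock_addr(v);
	unsigned long flags;
	long long val;

	raw_spin_lock_irqsave(lock, flags);
	val = v->counter;
	raw_spin_unlock_irqrestore(lock, flags);
	return val;
}

With a fallback like this, sample_group_cputimer() performs three
atomic64_read() calls and therefore three lock/unlock pairs, instead of
the single spinlock acquisition used before this patch.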

Signed-off-by: Jason Low <[email protected]>
---
include/linux/init_task.h | 7 ++--
include/linux/sched.h | 10 ++----
kernel/fork.c | 3 --
kernel/sched/stats.h | 15 +++-----
kernel/time/posix-cpu-timers.c | 79 +++++++++++++++++++++++++---------------
5 files changed, 62 insertions(+), 52 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d223..7b9d8b5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
.cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
.rlim = INIT_RLIMITS, \
.cputimer = { \
- .cputime = INIT_CPUTIME, \
- .running = 0, \
- .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
+ .utime = ATOMIC64_INIT(0), \
+ .stime = ATOMIC64_INIT(0), \
+ .sum_exec_runtime = ATOMIC64_INIT(0), \
+ .running = 0 \
}, \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 604eb7c..c736a47 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -601,9 +601,10 @@ struct task_cputime {
* used for thread group CPU timer calculations.
*/
struct thread_group_cputimer {
- struct task_cputime cputime;
+ atomic64_t utime;
+ atomic64_t stime;
+ atomic64_t sum_exec_runtime;
int running;
- raw_spinlock_t lock;
};

#include <linux/rwsem.h>
@@ -2970,11 +2971,6 @@ static __always_inline bool need_resched(void)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);

-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
- raw_spin_lock_init(&sig->cputimer.lock);
-}
-
/*
* Reevaluate whether the task has signals pending delivery.
* Wake the task if so.
diff --git a/kernel/fork.c b/kernel/fork.c
index 47c37a4..2e67086 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
unsigned long cpu_limit;

- /* Thread group counters. */
- thread_group_cputime_init(sig);
-
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab7043..c6d1c7d 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

- if (!cputimer->running)
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running))
return false;

/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.utime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->utime);
}

/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.stime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->stime);
}

/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.sum_exec_runtime += ns;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(ns, &cputimer->sum_exec_runtime);
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e072d98..d857306 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0;
}

-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- if (b->utime > a->utime)
- a->utime = b->utime;
+ u64 curr_cputime;
+retry:
+ curr_cputime = atomic64_read(cputime);
+ if (sum_cputime > curr_cputime) {
+ if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+ goto retry;
+ }
+}

- if (b->stime > a->stime)
- a->stime = b->stime;
+static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+{
+ __update_gt_cputime(&cputimer->utime, sum->utime);
+ __update_gt_cputime(&cputimer->stime, sum->stime);
+ __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+}

- if (b->sum_exec_runtime > a->sum_exec_runtime)
- a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
+static inline void sample_group_cputimer(struct task_cputime *times,
+ struct thread_group_cputimer *cputimer)
+{
+ times->utime = atomic64_read(&cputimer->utime);
+ times->stime = atomic64_read(&cputimer->stime);
+ times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
}

void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct task_cputime sum;
- unsigned long flags;

- if (!cputimer->running) {
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running)) {
/*
* The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have
- * to synchronize the timer to the clock every time we start
- * it.
+ * to synchronize the timer to the clock every time we start it.
*/
thread_group_cputime(tsk, &sum);
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 1;
- update_gt_cputime(&cputimer->cputime, &sum);
- } else
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- *times = cputimer->cputime;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ update_gt_cputime(cputimer, &sum);
+
+ /*
+ * We're setting cputimer->running without a lock. Ensure
+ * this only gets written to in one operation. We set
+ * running after update_gt_cputime() as a small optimization,
+ * but barriers are not required because update_gt_cputime()
+ * can handle concurrent updates.
+ */
+ WRITE_ONCE(cputimer->running, 1);
+ }
+ sample_group_cputimer(times, cputimer);
}

/*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
if (!task_cputime_zero(&tsk->cputime_expires))
return false;

- if (tsk->signal->cputimer.running)
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(tsk->signal->cputimer.running))
return false;

return true;
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
}
}

-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
{
struct thread_group_cputimer *cputimer = &sig->cputimer;
- unsigned long flags;

- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 0;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ /* Turn off cputimer->running. This is done without locking. */
+ WRITE_ONCE(cputimer->running, 0);
}

static u32 onecputick;
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
}

sig = tsk->signal;
- if (sig->cputimer.running) {
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(sig->cputimer.running)) {
struct task_cputime group_sample;

- raw_spin_lock(&sig->cputimer.lock);
- group_sample = sig->cputimer.cputime;
- raw_spin_unlock(&sig->cputimer.lock);
+ sample_group_cputimer(&group_sample, &sig->cputimer);

if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* If there are any active process wide timers (POSIX 1.b, itimers,
* RLIMIT_CPU) cputimer must be running.
*/
- if (tsk->signal->cputimer.running)
+ if (READ_ONCE(tsk->signal->cputimer.running))
check_process_timers(tsk, &firing);

/*
--
1.7.2.5

2015-04-28 20:01:07

by Jason Low

Subject: [PATCH v2 4/5] sched, timer: Provide an atomic task_cputime data structure

This patch adds an atomic variant of the task_cputime data structure,
which can be used to store and update task_cputime statistics without
needing to do locking.

Suggested-by: Ingo Molnar <[email protected]>
Signed-off-by: Jason Low <[email protected]>
---
include/linux/sched.h | 17 +++++++++++++++++
1 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c736a47..7092192 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -575,6 +575,23 @@ struct task_cputime {
.sum_exec_runtime = 0, \
}

+/*
+ * This is the atomic variant of task_cputime, which can be used for
+ * storing and updating task_cputime statistics without locking.
+ */
+struct task_cputime_atomic {
+ atomic64_t utime;
+ atomic64_t stime;
+ atomic64_t sum_exec_runtime;
+};
+
+#define INIT_CPUTIME_ATOMIC \
+ (struct task_cputime_atomic) { \
+ .utime = ATOMIC64_INIT(0), \
+ .stime = ATOMIC64_INIT(0), \
+ .sum_exec_runtime = ATOMIC64_INIT(0), \
+ }
+
#ifdef CONFIG_PREEMPT_COUNT
#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
#else
--
1.7.2.5

2015-04-28 20:01:21

by Jason Low

Subject: [PATCH v2 5/5] sched, timer: Use the atomic task_cputime in thread_group_cputimer

Recent optimizations were made to thread_group_cputimer to improve its
scalability by keeping track of cputime stats without a lock. However,
the values were open coded into the structure, leaving them at a
different abstraction level from the regular task_cputime structure.
Furthermore, any subsequent similar optimizations would not be able to
share the new code, since the fields are specific to thread_group_cputimer.

This patch adds the new task_cputime_atomic data structure (introduced in
the previous patch in the series) to thread_group_cputimer for keeping
track of the cputime atomically, which also helps generalize the code.

Suggested-by: Ingo Molnar <[email protected]>
Signed-off-by: Jason Low <[email protected]>
---
include/linux/init_task.h | 6 ++----
include/linux/sched.h | 4 +---
kernel/sched/stats.h | 6 +++---
kernel/time/posix-cpu-timers.c | 26 +++++++++++++-------------
4 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 7b9d8b5..bb9b075 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,10 +50,8 @@ extern struct fs_struct init_fs;
.cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
.rlim = INIT_RLIMITS, \
.cputimer = { \
- .utime = ATOMIC64_INIT(0), \
- .stime = ATOMIC64_INIT(0), \
- .sum_exec_runtime = ATOMIC64_INIT(0), \
- .running = 0 \
+ .cputime_atomic = INIT_CPUTIME_ATOMIC, \
+ .running = 0, \
}, \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7092192..80ce921c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -618,9 +618,7 @@ struct task_cputime_atomic {
* used for thread group CPU timer calculations.
*/
struct thread_group_cputimer {
- atomic64_t utime;
- atomic64_t stime;
- atomic64_t sum_exec_runtime;
+ struct task_cputime_atomic cputime_atomic;
int running;
};

diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c6d1c7d..077ebbd 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -216,7 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- atomic64_add(cputime, &cputimer->utime);
+ atomic64_add(cputime, &cputimer->cputime_atomic.utime);
}

/**
@@ -237,7 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- atomic64_add(cputime, &cputimer->stime);
+ atomic64_add(cputime, &cputimer->cputime_atomic.stime);
}

/**
@@ -258,5 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- atomic64_add(ns, &cputimer->sum_exec_runtime);
+ atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index d857306..892e3da 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -211,20 +211,20 @@ retry:
}
}

-static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
{
- __update_gt_cputime(&cputimer->utime, sum->utime);
- __update_gt_cputime(&cputimer->stime, sum->stime);
- __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+ __update_gt_cputime(&cputime_atomic->utime, sum->utime);
+ __update_gt_cputime(&cputime_atomic->stime, sum->stime);
+ __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
}

-/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
-static inline void sample_group_cputimer(struct task_cputime *times,
- struct thread_group_cputimer *cputimer)
+/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+ struct task_cputime_atomic *atomic_times)
{
- times->utime = atomic64_read(&cputimer->utime);
- times->stime = atomic64_read(&cputimer->stime);
- times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
+ times->utime = atomic64_read(&atomic_times->utime);
+ times->stime = atomic64_read(&atomic_times->stime);
+ times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
}

void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
@@ -240,7 +240,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
* to synchronize the timer to the clock every time we start it.
*/
thread_group_cputime(tsk, &sum);
- update_gt_cputime(cputimer, &sum);
+ update_gt_cputime(&cputimer->cputime_atomic, &sum);

/*
* We're setting cputimer->running without a lock. Ensure
@@ -251,7 +251,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
*/
WRITE_ONCE(cputimer->running, 1);
}
- sample_group_cputimer(times, cputimer);
+ sample_cputime_atomic(times, &cputimer->cputime_atomic);
}

/*
@@ -1137,7 +1137,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (READ_ONCE(sig->cputimer.running)) {
struct task_cputime group_sample;

- sample_group_cputimer(&group_sample, &sig->cputimer);
+ sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);

if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
--
1.7.2.5

2015-04-29 06:39:16

by Hillf Danton

Subject: Re: [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability

>
> While running a database workload, we found a scalability issue with itimers.
>
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger systems
> (such as a 16 socket machine), this caused more than 30% of total time
> spent trying to obtain this kernel lock to update these group timer stats.
>
FYI, another cache line problem was encountered by Mel; see commit
a368ab67aa ("mm: move zone lock to a different cache line than order-0
free page lists").

> This patch converts the timers to 64 bit atomic variables and use
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%.
>
> Note: On 32 bit systems using the generic 64 bit atomics, this causes
> sample_group_cputimer() to take locks 3 times instead of just 1 time.
> However, we tested this patch on a 32 bit system ARM system using the
> generic atomics and did not find the overhead to be much of an issue.
> An explanation for why this isn't an issue is that 32 bit systems usually
> have small numbers of CPUs, and cacheline contention from extra spinlocks
> called periodically is not really apparent on smaller systems.
>

2015-04-29 14:35:35

by Rik van Riel

Subject: Re: [PATCH v2 1/5] sched, timer: Remove usages of ACCESS_ONCE in the scheduler

On 04/28/2015 04:00 PM, Jason Low wrote:
> ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes
> the rest of the existing usages of ACCESS_ONCE in the scheduler, and use
> the new READ_ONCE and WRITE_ONCE APIs.
>
> Signed-off-by: Jason Low <[email protected]>

Acked-by: Rik van Riel <[email protected]>

2015-04-29 14:36:19

by Rik van Riel

Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On 04/28/2015 04:00 PM, Jason Low wrote:
> The p->mm->numa_scan_seq is accessed using READ_ONCE/WRITE_ONCE
> and modified without exclusive access. It is not clear why it is
> accessed this way. This patch provides some documentation on that.
>
> Signed-off-by: Jason Low <[email protected]>

Acked-by: Rik van Riel <[email protected]>

2015-04-29 14:38:25

by Rik van Riel

Subject: Re: [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability

On 04/28/2015 04:00 PM, Jason Low wrote:
> While running a database workload, we found a scalability issue with itimers.
>
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger systems
> (such as a 16 socket machine), this caused more than 30% of total time
> spent trying to obtain this kernel lock to update these group timer stats.
>
> This patch converts the timers to 64 bit atomic variables and use
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%.
>
> Note: On 32 bit systems using the generic 64 bit atomics, this causes
> sample_group_cputimer() to take locks 3 times instead of just 1 time.
> However, we tested this patch on a 32 bit system ARM system using the
> generic atomics and did not find the overhead to be much of an issue.
> An explanation for why this isn't an issue is that 32 bit systems usually
> have small numbers of CPUs, and cacheline contention from extra spinlocks
> called periodically is not really apparent on smaller systems.

I don't see 32 bit systems ever getting so many CPUs
that this becomes an issue :)

> Signed-off-by: Jason Low <[email protected]>

Acked-by: Rik van Riel <[email protected]>

2015-04-29 14:47:53

by Rik van Riel

Subject: Re: [PATCH v2 4/5] sched, timer: Provide an atomic task_cputime data structure

On 04/28/2015 04:00 PM, Jason Low wrote:
> This patch adds an atomic variant of the task_cputime data structure,
> which can be used to store and update task_cputime statistics without
> needing to do locking.
>
> Suggested-by: Ingo Molnar <[email protected]>
> Signed-off-by: Jason Low <[email protected]>

Acked-by: Rik van Riel <[email protected]>

2015-04-29 14:48:17

by Rik van Riel

Subject: Re: [PATCH v2 5/5] sched, timer: Use the atomic task_cputime in thread_group_cputimer

On 04/28/2015 04:00 PM, Jason Low wrote:
> Recent optimizations were made to thread_group_cputimer to improve its
> scalability by keeping track of cputime stats without a lock. However,
> the values were open coded to the structure, causing them to be at
> a different abstraction level from the regular task_cputime structure.
> Furthermore, any subsequent similar optimizations would not be able to
> share the new code, since they are specific to thread_group_cputimer.
>
> This patch adds the new task_cputime_atomic data structure (introduced in
> the previous patch in the series) to thread_group_cputimer for keeping
> track of the cputime atomically, which also helps generalize the code.
>
> Suggested-by: Ingo Molnar <[email protected]>
> Signed-off-by: Jason Low <[email protected]>

Acked-by: Rik van Riel <[email protected]>

2015-04-29 17:06:05

by Waiman Long

Subject: Re: [PATCH v2 1/5] sched, timer: Remove usages of ACCESS_ONCE in the scheduler

On 04/28/2015 04:00 PM, Jason Low wrote:
> ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes
> the rest of the existing usages of ACCESS_ONCE in the scheduler, and use
> the new READ_ONCE and WRITE_ONCE APIs.
>
> Signed-off-by: Jason Low <[email protected]>
> ---
> include/linux/sched.h | 4 ++--
> kernel/fork.c | 2 +-
> kernel/sched/auto_group.c | 2 +-
> kernel/sched/auto_group.h | 2 +-
> kernel/sched/core.c | 4 ++--
> kernel/sched/cputime.c | 2 +-
> kernel/sched/deadline.c | 2 +-
> kernel/sched/fair.c | 14 +++++++-------
> kernel/sched/proc.c | 4 ++--
> kernel/sched/rt.c | 2 +-
> kernel/sched/sched.h | 2 +-
> kernel/sched/wait.c | 4 ++--
> kernel/time/posix-cpu-timers.c | 8 ++++----
> 13 files changed, 26 insertions(+), 26 deletions(-)
>
> ...
> goto no_join;
> @@ -2107,7 +2107,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>
> static void reset_ptenuma_scan(struct task_struct *p)
> {
> - ACCESS_ONCE(p->mm->numa_scan_seq)++;
> + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
> p->mm->numa_scan_offset = 0;
> }
>
>

Generally, I am for replacing ACCESS_ONCE() with the more descriptive
READ_ONCE() and WRITE_ONCE(), except in the above case, where it makes
the code harder to read without any real advantage.

Other than that,

Acked-by: Waiman Long <[email protected]>

Cheers,
Longman

2015-04-29 17:15:37

by Steven Rostedt

Subject: Re: [PATCH v2 1/5] sched, timer: Remove usages of ACCESS_ONCE in the scheduler

On Wed, 29 Apr 2015 13:05:55 -0400
Waiman Long <[email protected]> wrote:

> > goto no_join;
> > @@ -2107,7 +2107,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> >
> > static void reset_ptenuma_scan(struct task_struct *p)
> > {
> > - ACCESS_ONCE(p->mm->numa_scan_seq)++;
> > + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
> > p->mm->numa_scan_offset = 0;
> > }
> >
> >
>
> Generally, I am for replacing ACCESS_ONCE() with the more descriptive
> READ_ONCE() and WRITE_ONCE() except the above case where it makes the
> code harder to read without any real advantage.
>
> Other than that,
>
> Acked-by: Waiman Long <[email protected]>
>

I agree, but I believe this code needs to be updated anyway. Making it
uglier may encourage that to happen.

-- Steve

2015-04-29 18:14:53

by Waiman Long

Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On 04/28/2015 04:00 PM, Jason Low wrote:
> The p->mm->numa_scan_seq is accessed using READ_ONCE/WRITE_ONCE
> and modified without exclusive access. It is not clear why it is
> accessed this way. This patch provides some documentation on that.
>
> Signed-off-by: Jason Low <[email protected]>
> ---
> kernel/sched/fair.c | 12 ++++++++++++
> 1 files changed, 12 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5a44371..794f7d7 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
> u64 runtime, period;
> spinlock_t *group_lock = NULL;
>
> + /*
> + * The p->mm->numa_scan_seq gets updated without
> + * exclusive access. Use READ_ONCE() here to ensure
> + * that the field is read in a single access.
> + */
> seq = READ_ONCE(p->mm->numa_scan_seq);
> if (p->numa_scan_seq == seq)
> return;
> @@ -2107,6 +2112,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>
> static void reset_ptenuma_scan(struct task_struct *p)
> {
> + /*
> + * We only did a read acquisition of the mmap sem, so
> + * p->mm->numa_scan_seq is written to without exclusive access.
> + * That's not much of an issue though, since this is just used
> + * for statistical sampling. Use WRITE_ONCE and READ_ONCE, which
> + * are not expensive, to avoid load/store tearing.
> + */
> WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
> p->mm->numa_scan_offset = 0;
> }

READ_ONCE followed by a WRITE_ONCE won't stop load/store tearing from
happening unless you use an atomic instruction to do the increment. So I
think your comment may be a bit misleading.

Cheers,
Longman

2015-04-29 18:25:16

by Jason Low

Subject: Re: [PATCH v2 1/5] sched, timer: Remove usages of ACCESS_ONCE in the scheduler

On Wed, 2015-04-29 at 13:15 -0400, Steven Rostedt wrote:
> On Wed, 29 Apr 2015 13:05:55 -0400
> Waiman Long <[email protected]> wrote:
>
> > > goto no_join;
> > > @@ -2107,7 +2107,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> > >
> > > static void reset_ptenuma_scan(struct task_struct *p)
> > > {
> > > - ACCESS_ONCE(p->mm->numa_scan_seq)++;
> > > + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
> > > p->mm->numa_scan_offset = 0;
> > > }
> > >
> > >
> >
> > Generally, I am for replacing ACCESS_ONCE() with the more descriptive
> > READ_ONCE() and WRITE_ONCE() except the above case where it makes the
> > code harder to read without any real advantage.
> >
> > Other than that,
> >
> > Acked-by: Waiman Long <[email protected]>
> >
>
> I agree, but I believe this code needs to be updated anyway. Making it
> uglier may encourage that to happen.

Yep, in this case, the ACCESS_ONCE conversion on numa_scan_seq
technically isn't necessary, but I think the best option is to
consistently update all of them, which makes things clearer than using
multiple sets of APIs.

Thanks,
Jason

2015-04-29 18:43:49

by Waiman Long

Subject: Re: [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability

On 04/28/2015 04:00 PM, Jason Low wrote:
> While running a database workload, we found a scalability issue with itimers.
>
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger systems
> (such as a 16 socket machine), this caused more than 30% of total time
> spent trying to obtain this kernel lock to update these group timer stats.
>
> This patch converts the timers to 64 bit atomic variables and use
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%.
>
> Note: On 32 bit systems using the generic 64 bit atomics, this causes
> sample_group_cputimer() to take locks 3 times instead of just 1 time.
> However, we tested this patch on a 32 bit system ARM system using the
> generic atomics and did not find the overhead to be much of an issue.
> An explanation for why this isn't an issue is that 32 bit systems usually
> have small numbers of CPUs, and cacheline contention from extra spinlocks
> called periodically is not really apparent on smaller systems.
>
> Signed-off-by: Jason Low <[email protected]>
> ---
> include/linux/init_task.h | 7 ++--
> include/linux/sched.h | 10 ++----
> kernel/fork.c | 3 --
> kernel/sched/stats.h | 15 +++-----
> kernel/time/posix-cpu-timers.c | 79 +++++++++++++++++++++++++---------------
> 5 files changed, 62 insertions(+), 52 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 696d223..7b9d8b5 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
> .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
> .rlim = INIT_RLIMITS, \
> .cputimer = { \
> - .cputime = INIT_CPUTIME, \
> - .running = 0, \
> - .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
> + .utime = ATOMIC64_INIT(0), \
> + .stime = ATOMIC64_INIT(0), \
> + .sum_exec_runtime = ATOMIC64_INIT(0), \
> + .running = 0 \
> }, \
> .cred_guard_mutex = \
> __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 604eb7c..c736a47 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -601,9 +601,10 @@ struct task_cputime {
> * used for thread group CPU timer calculations.
> */
> struct thread_group_cputimer {
> - struct task_cputime cputime;
> + atomic64_t utime;
> + atomic64_t stime;
> + atomic64_t sum_exec_runtime;
> int running;
> - raw_spinlock_t lock;
> };
>
> #include <linux/rwsem.h>
> @@ -2970,11 +2971,6 @@ static __always_inline bool need_resched(void)
> void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
> void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
>
> -static inline void thread_group_cputime_init(struct signal_struct *sig)
> -{
> - raw_spin_lock_init(&sig->cputimer.lock);
> -}
> -
> /*
> * Reevaluate whether the task has signals pending delivery.
> * Wake the task if so.
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 47c37a4..2e67086 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
> {
> unsigned long cpu_limit;
>
> - /* Thread group counters. */
> - thread_group_cputime_init(sig);
> -
> cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
> if (cpu_limit != RLIM_INFINITY) {
> sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 4ab7043..c6d1c7d 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
> {
> struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
>
> - if (!cputimer->running)
> + /* Check if cputimer isn't running. This is accessed without locking. */
> + if (!READ_ONCE(cputimer->running))
> return false;
>
> /*
> @@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
> if (!cputimer_running(tsk))
> return;
>
> - raw_spin_lock(&cputimer->lock);
> - cputimer->cputime.utime += cputime;
> - raw_spin_unlock(&cputimer->lock);
> + atomic64_add(cputime, &cputimer->utime);
> }
>
> /**
> @@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
> if (!cputimer_running(tsk))
> return;
>
> - raw_spin_lock(&cputimer->lock);
> - cputimer->cputime.stime += cputime;
> - raw_spin_unlock(&cputimer->lock);
> + atomic64_add(cputime, &cputimer->stime);
> }
>
> /**
> @@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
> if (!cputimer_running(tsk))
> return;
>
> - raw_spin_lock(&cputimer->lock);
> - cputimer->cputime.sum_exec_runtime += ns;
> - raw_spin_unlock(&cputimer->lock);
> + atomic64_add(ns, &cputimer->sum_exec_runtime);
> }
> diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> index e072d98..d857306 100644
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
> return 0;
> }
>
> -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
> +/*
> + * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
> + * to avoid race conditions with concurrent updates to cputime.
> + */
> +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
> {
> - if (b->utime > a->utime)
> - a->utime = b->utime;
> + u64 curr_cputime;
> +retry:
> + curr_cputime = atomic64_read(cputime);
> + if (sum_cputime > curr_cputime) {
> + if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
> + goto retry;
> + }
> +}
>
> - if (b->stime > a->stime)
> - a->stime = b->stime;
> +static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
> +{
> + __update_gt_cputime(&cputimer->utime, sum->utime);
> + __update_gt_cputime(&cputimer->stime, sum->stime);
> + __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
> +}
>
> - if (b->sum_exec_runtime > a->sum_exec_runtime)
> - a->sum_exec_runtime = b->sum_exec_runtime;
> +/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
> +static inline void sample_group_cputimer(struct task_cputime *times,
> + struct thread_group_cputimer *cputimer)
> +{
> + times->utime = atomic64_read(&cputimer->utime);
> + times->stime = atomic64_read(&cputimer->stime);
> + times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
> }
>
> void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
> {
> struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
> struct task_cputime sum;
> - unsigned long flags;
>
> - if (!cputimer->running) {
> + /* Check if cputimer isn't running. This is accessed without locking. */
> + if (!READ_ONCE(cputimer->running)) {
> /*
> * The POSIX timer interface allows for absolute time expiry
> * values through the TIMER_ABSTIME flag, therefore we have
> - * to synchronize the timer to the clock every time we start
> - * it.
> + * to synchronize the timer to the clock every time we start it.
> */
> thread_group_cputime(tsk, &sum);
> - raw_spin_lock_irqsave(&cputimer->lock, flags);
> - cputimer->running = 1;
> - update_gt_cputime(&cputimer->cputime, &sum);
> - } else
> - raw_spin_lock_irqsave(&cputimer->lock, flags);
> - *times = cputimer->cputime;
> - raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> + update_gt_cputime(cputimer, &sum);
> +
> + /*
> + * We're setting cputimer->running without a lock. Ensure
> + * this only gets written to in one operation. We set
> + * running after update_gt_cputime() as a small optimization,
> + * but barriers are not required because update_gt_cputime()
> + * can handle concurrent updates.
> + */
> + WRITE_ONCE(cputimer->running, 1);
> + }
> + sample_group_cputimer(times, cputimer);
> }

If there is a possibility that more than one thread will be running this
code concurrently, I think it will be safer to use cmpxchg to set the
running flag:

if (!READ_ONCE(cputimer->running) &&
    !cmpxchg(&cputimer->running, 0, 1)) {
...

This will ensure that only one thread will update it.
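
In context, the suggestion would look roughly like this (an illustrative
sketch, not an actual patch):

void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	struct task_cputime sum;

	/* Only the thread that wins the cmpxchg does the initial sync. */
	if (!READ_ONCE(cputimer->running) &&
	    !cmpxchg(&cputimer->running, 0, 1)) {
		thread_group_cputime(tsk, &sum);
		update_gt_cputime(cputimer, &sum);
	}
	sample_group_cputimer(times, cputimer);
}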

Cheers,
Longman

2015-04-29 18:45:10

by Jason Low

Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On Wed, 2015-04-29 at 14:14 -0400, Waiman Long wrote:
> On 04/28/2015 04:00 PM, Jason Low wrote:
> > The p->mm->numa_scan_seq is accessed using READ_ONCE/WRITE_ONCE
> > and modified without exclusive access. It is not clear why it is
> > accessed this way. This patch provides some documentation on that.
> >
> > Signed-off-by: Jason Low <[email protected]>
> > ---
> > kernel/sched/fair.c | 12 ++++++++++++
> > 1 files changed, 12 insertions(+), 0 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 5a44371..794f7d7 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
> > u64 runtime, period;
> > spinlock_t *group_lock = NULL;
> >
> > + /*
> > + * The p->mm->numa_scan_seq gets updated without
> > + * exclusive access. Use READ_ONCE() here to ensure
> > + * that the field is read in a single access.
> > + */
> > seq = READ_ONCE(p->mm->numa_scan_seq);
> > if (p->numa_scan_seq == seq)
> > return;
> > @@ -2107,6 +2112,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> >
> > static void reset_ptenuma_scan(struct task_struct *p)
> > {
> > + /*
> > + * We only did a read acquisition of the mmap sem, so
> > + * p->mm->numa_scan_seq is written to without exclusive access.
> > + * That's not much of an issue though, since this is just used
> > + * for statistical sampling. Use WRITE_ONCE and READ_ONCE, which
> > + * are not expensive, to avoid load/store tearing.
> > + */
> > WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
> > p->mm->numa_scan_offset = 0;
> > }
>
> READ_ONCE followed by a WRITE_ONCE won't stop load/store tearing from
> happening unless you use an atomic instruction to do the increment. So I
> think your comment may be a bit misleading.

Right, the READ and WRITE operations will still be done separately and
won't be atomic. Here, we're saying that this prevents load/store
tearing on each of those individual write/read operations. Please let me
know if you prefer this to be worded differently.
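
To make the distinction concrete, here is an illustrative sketch (not
code from the series; the variables are hypothetical):

static int plain_seq;
static atomic_t atomic_seq = ATOMIC_INIT(0);

static void bump_plain(void)
{
	/*
	 * Each individual load and store is a single access (no tearing),
	 * but the read-modify-write as a whole is not atomic: two
	 * concurrent callers can read the same value, so an increment
	 * can be lost.
	 */
	WRITE_ONCE(plain_seq, READ_ONCE(plain_seq) + 1);
}

static void bump_atomic(void)
{
	/* A true atomic increment; concurrent increments cannot be lost. */
	atomic_inc(&atomic_seq);
}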

2015-04-29 20:14:34

by Jason Low

Subject: Re: [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability

On Wed, 2015-04-29 at 14:43 -0400, Waiman Long wrote:
> On 04/28/2015 04:00 PM, Jason Low wrote:
> > void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
> > {
> > struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
> > struct task_cputime sum;
> > - unsigned long flags;
> >
> > - if (!cputimer->running) {
> > + /* Check if cputimer isn't running. This is accessed without locking. */
> > + if (!READ_ONCE(cputimer->running)) {
> > /*
> > * The POSIX timer interface allows for absolute time expiry
> > * values through the TIMER_ABSTIME flag, therefore we have
> > - * to synchronize the timer to the clock every time we start
> > - * it.
> > + * to synchronize the timer to the clock every time we start it.
> > */
> > thread_group_cputime(tsk, &sum);
> > - raw_spin_lock_irqsave(&cputimer->lock, flags);
> > - cputimer->running = 1;
> > - update_gt_cputime(&cputimer->cputime, &sum);
> > - } else
> > - raw_spin_lock_irqsave(&cputimer->lock, flags);
> > - *times = cputimer->cputime;
> > - raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> > + update_gt_cputime(cputimer, &sum);
> > +
> > + /*
> > + * We're setting cputimer->running without a lock. Ensure
> > + * this only gets written to in one operation. We set
> > + * running after update_gt_cputime() as a small optimization,
> > + * but barriers are not required because update_gt_cputime()
> > + * can handle concurrent updates.
> > + */
> > + WRITE_ONCE(cputimer->running, 1);
> > + }
> > + sample_group_cputimer(times, cputimer);
> > }
>
> If there is a possibility that more than one thread will be running this
> code concurrently, I think it will be safer to use cmpxchg to set the
> running flag:
>
> if (!READ_ONCE(cputimer->running) &&
>     !cmpxchg(&cputimer->running, 0, 1)) {
> ...
>
> This will ensure that only one thread will update it.

Using cmpxchg to update the running field would be fine too, though
there isn't really much of a problem with multiple threads running this
code concurrently. The update_gt_cputime() already handles concurrent
updates, and this code path is rarely executed because we only enter it
when enabling the timer.

In that case, it might be better to keep it the way it currently is,
since I think it is a bit more readable.

2015-04-29 20:45:42

by Jason Low

Subject: Re: [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability

On Wed, 2015-04-29 at 10:38 -0400, Rik van Riel wrote:
> On 04/28/2015 04:00 PM, Jason Low wrote:
> > While running a database workload, we found a scalability issue with itimers.
> >
> > Much of the problem was caused by the thread_group_cputimer spinlock.
> > Each time we account for group system/user time, we need to obtain a
> > thread_group_cputimer's spinlock to update the timers. On larger systems
> > (such as a 16 socket machine), this caused more than 30% of total time
> > spent trying to obtain this kernel lock to update these group timer stats.
> >
> > This patch converts the timers to 64 bit atomic variables and use
> > atomic add to update them without a lock. With this patch, the percent
> > of total time spent updating thread group cputimer timers was reduced
> > from 30% down to less than 1%.
> >
> > Note: On 32 bit systems using the generic 64 bit atomics, this causes
> > sample_group_cputimer() to take locks 3 times instead of just 1 time.
> > However, we tested this patch on a 32 bit system ARM system using the
> > generic atomics and did not find the overhead to be much of an issue.
> > An explanation for why this isn't an issue is that 32 bit systems usually
> > have small numbers of CPUs, and cacheline contention from extra spinlocks
> > called periodically is not really apparent on smaller systems.
>
> I don't see 32 bit systems ever getting so many CPUs
> that this becomes an issue :)

Yeah, the generic 64 bit atomics are meant to be used on systems with
(<=4 or so) CPUs.

> > Signed-off-by: Jason Low <[email protected]>
>
> Acked-by: Rik van Riel <[email protected]>

Thanks!

2015-04-30 18:42:41

by Waiman Long

Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On 04/29/2015 02:45 PM, Jason Low wrote:
> On Wed, 2015-04-29 at 14:14 -0400, Waiman Long wrote:
>> On 04/28/2015 04:00 PM, Jason Low wrote:
>>> The p->mm->numa_scan_seq is accessed using READ_ONCE/WRITE_ONCE
>>> and modified without exclusive access. It is not clear why it is
>>> accessed this way. This patch provides some documentation on that.
>>>
>>> Signed-off-by: Jason Low <[email protected]>
>>> ---
>>> kernel/sched/fair.c | 12 ++++++++++++
>>> 1 files changed, 12 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 5a44371..794f7d7 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
>>> u64 runtime, period;
>>> spinlock_t *group_lock = NULL;
>>>
>>> + /*
>>> + * The p->mm->numa_scan_seq gets updated without
>>> + * exclusive access. Use READ_ONCE() here to ensure
>>> + * that the field is read in a single access.
>>> + */
>>> seq = READ_ONCE(p->mm->numa_scan_seq);
>>> if (p->numa_scan_seq == seq)
>>> return;
>>> @@ -2107,6 +2112,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>>>
>>> static void reset_ptenuma_scan(struct task_struct *p)
>>> {
>>> + /*
>>> + * We only did a read acquisition of the mmap sem, so
>>> + * p->mm->numa_scan_seq is written to without exclusive access.
>>> + * That's not much of an issue though, since this is just used
>>> + * for statistical sampling. Use WRITE_ONCE and READ_ONCE, which
>>> + * are not expensive, to avoid load/store tearing.
>>> + */
>>> WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
>>> p->mm->numa_scan_offset = 0;
>>> }
>> READ_ONCE followed by a WRITE_ONCE won't stop load/store tearing from
>> happening unless you use an atomic instruction to do the increment. So I
>> think your comment may be a bit misleading.
> Right, the READ and WRITE operations will still be done separately and
> won't be atomic. Here, we're saying that this prevents load/store
> tearing on each of those individual write/read operations. Please let me
> know if you prefer this to be worded differently.

I do have a question about what kind of tearing you are talking about. Do
you mean the tearing due to mm being changed in the middle of the
access? The reason why I don't like this kind of construct is that I am
not sure if the address translation p->mm->numa_scan_seq is being done
once or twice. I looked at the compiled code and the translation is done
only once.

Anyway, the purpose of READ_ONCE and WRITE_ONCE is not to eliminate
data tearing. They are there to make sure that the compiler won't compile
away data accesses and that they are done in the order they appear in the
program. I don't think it is a good idea to associate tearing elimination
with those macros, so I would suggest removing the last sentence in your
comment.

Cheers,
Longman

2015-04-30 18:54:41

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On Thu, 2015-04-30 at 14:42 -0400, Waiman Long wrote:
> On 04/29/2015 02:45 PM, Jason Low wrote:
> > On Wed, 2015-04-29 at 14:14 -0400, Waiman Long wrote:
> >> On 04/28/2015 04:00 PM, Jason Low wrote:
> >>> The p->mm->numa_scan_seq is accessed using READ_ONCE/WRITE_ONCE
> >>> and modified without exclusive access. It is not clear why it is
> >>> accessed this way. This patch provides some documentation on that.
> >>>
> >>> Signed-off-by: Jason Low<[email protected]>
> >>> ---
> >>> kernel/sched/fair.c | 12 ++++++++++++
> >>> 1 files changed, 12 insertions(+), 0 deletions(-)
> >>>
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index 5a44371..794f7d7 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
> >>> u64 runtime, period;
> >>> spinlock_t *group_lock = NULL;
> >>>
> >>> + /*
> >>> + * The p->mm->numa_scan_seq gets updated without
> >>> + * exclusive access. Use READ_ONCE() here to ensure
> >>> + * that the field is read in a single access.
> >>> + */
> >>> seq = READ_ONCE(p->mm->numa_scan_seq);
> >>> if (p->numa_scan_seq == seq)
> >>> return;
> >>> @@ -2107,6 +2112,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> >>>
> >>> static void reset_ptenuma_scan(struct task_struct *p)
> >>> {
> >>> + /*
> >>> + * We only did a read acquisition of the mmap sem, so
> >>> + * p->mm->numa_scan_seq is written to without exclusive access.
> >>> + * That's not much of an issue though, since this is just used
> >>> + * for statistical sampling. Use WRITE_ONCE and READ_ONCE, which
> >>> + * are not expensive, to avoid load/store tearing.
> >>> + */
> >>> WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
> >>> p->mm->numa_scan_offset = 0;
> >>> }
> >> READ_ONCE followed by a WRITE_ONCE won't stop load/store tearing from
> >> happening unless you use an atomic instruction to do the increment. So I
> >> think your comment may be a bit misleading.
> > Right, the READ and WRITE operations will still be done separately and
> > won't be atomic. Here, we're saying that this prevents load/store
> > tearing on each of those individual write/read operations. Please let me
> > know if you prefer this to be worded differently.
>
> I do have a question of what kind of tearing you are talking about. Do
> you mean the tearing due to mm being changed in the middle of the
> access? The reason why I don't like this kind of construct is that I am
> not sure if
> the address translation p->mm->numa_scan_seq is being done once or
> twice. I looked at the compiled code and the translation is done only once.
>
> Anyway, the purpose of READ_ONCE and WRITE_ONCE is not for eliminating
> data tearing. They are to make sure that the compiler won't compile away
> data access and they are done in the order they appear in the program. I
> don't think it is a good idea to associate tearing elimination with
> those macros. So I would suggest removing the last sentence in your comment.

I agree. Related, Linus also had some thoughts about the _very specific_
purposes of these macros:
http://www.spinics.net/lists/linux-next/msg32494.html

I also wonder why this patch is included in a set called
"sched, timer: Improve scalability of itimers" ;)

Thanks,
Davidlohr

2015-04-30 20:58:48

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On 04/30/2015 02:54 PM, Davidlohr Bueso wrote:
> On Thu, 2015-04-30 at 14:42 -0400, Waiman Long wrote:
>> On 04/29/2015 02:45 PM, Jason Low wrote:
>>> On Wed, 2015-04-29 at 14:14 -0400, Waiman Long wrote:
>>>> On 04/28/2015 04:00 PM, Jason Low wrote:
>>>>> The p->mm->numa_scan_seq is accessed using READ_ONCE/WRITE_ONCE
>>>>> and modified without exclusive access. It is not clear why it is
>>>>> accessed this way. This patch provides some documentation on that.
>>>>>
>>>>> Signed-off-by: Jason Low<[email protected]>
>>>>> ---
>>>>> kernel/sched/fair.c | 12 ++++++++++++
>>>>> 1 files changed, 12 insertions(+), 0 deletions(-)
>>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index 5a44371..794f7d7 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
>>>>> u64 runtime, period;
>>>>> spinlock_t *group_lock = NULL;
>>>>>
>>>>> + /*
>>>>> + * The p->mm->numa_scan_seq gets updated without
>>>>> + * exclusive access. Use READ_ONCE() here to ensure
>>>>> + * that the field is read in a single access.
>>>>> + */
>>>>> seq = READ_ONCE(p->mm->numa_scan_seq);
>>>>> if (p->numa_scan_seq == seq)
>>>>> return;
>>>>> @@ -2107,6 +2112,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>>>>>
>>>>> static void reset_ptenuma_scan(struct task_struct *p)
>>>>> {
>>>>> + /*
>>>>> + * We only did a read acquisition of the mmap sem, so
>>>>> + * p->mm->numa_scan_seq is written to without exclusive access.
>>>>> + * That's not much of an issue though, since this is just used
>>>>> + * for statistical sampling. Use WRITE_ONCE and READ_ONCE, which
>>>>> + * are not expensive, to avoid load/store tearing.
>>>>> + */
>>>>> WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
>>>>> p->mm->numa_scan_offset = 0;
>>>>> }
>>>> READ_ONCE followed by a WRITE_ONCE won't stop load/store tearing from
>>>> happening unless you use an atomic instruction to do the increment. So I
>>>> think your comment may be a bit misleading.
>>> Right, the READ and WRITE operations will still be done separately and
>>> won't be atomic. Here, we're saying that this prevents load/store
>>> tearing on each of those individual write/read operations. Please let me
>>> know if you prefer this to be worded differently.
>> I do have a question of what kind of tearing you are talking about. Do
>> you mean the tearing due to mm being changed in the middle of the
>> access? The reason why I don't like this kind of construct is that I am
>> not sure if
>> the address translation p->mm->numa_scan_seq is being done once or
>> twice. I looked at the compiled code and the translation is done only once.
>>
>> Anyway, the purpose of READ_ONCE and WRITE_ONCE is not for eliminating
>> data tearing. They are to make sure that the compiler won't compile away
>> data access and they are done in the order they appear in the program. I
>> don't think it is a good idea to associate tearing elimination with
>> those macros. So I would suggest removing the last sentence in your comment.
> I agree. Related, Linus also had some thoughts about the _very specific_
> purposes of these macros:
> http://www.spinics.net/lists/linux-next/msg32494.html

Actually, I don't think a modern compiler will reload a read value unless
it runs out of usable registers. It is more likely that it will reuse a
previously read value within the same function if READ_ONCE() isn't there.

Cheers,
Longman
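
A hypothetical illustration of the reuse described above (not from the
patch set): without READ_ONCE(), the compiler may hoist the load out of
the loop and keep spinning on the stale value:

	static int done;

	static void wait_for_done(void)
	{
		/*
		 * A plain 'while (!done)' may be compiled into a single load
		 * of 'done' followed by an unconditional loop once the
		 * compiler decides nothing in the loop can change it.
		 * READ_ONCE() forces a fresh load on every iteration.
		 */
		while (!READ_ONCE(done))
			cpu_relax();
	}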

2015-04-30 21:13:17

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On Thu, 2015-04-30 at 14:42 -0400, Waiman Long wrote:

> I do have a question of what kind of tearing you are talking about. Do
> you mean the tearing due to mm being changed in the middle of the
> access? The reason why I don't like this kind of construct is that I am
> not sure if
> the address translation p->mm->numa_scan_seq is being done once or
> twice. I looked at the compiled code and the translation is done only once.
>
> Anyway, the purpose of READ_ONCE and WRITE_ONCE is not for eliminating
> data tearing. They are to make sure that the compiler won't compile away
> data access and they are done in the order they appear in the program. I
> don't think it is a good idea to associate tearing elimination with
> those macros. So I would suggest removing the last sentence in your comment.

Yes, I can remove the last sentence in the comment since the main goal
was to document that we're accessing this field without exclusive access.

In terms of data tearing, an example would be the write operation getting
split into multiple stores (though this is architecture dependent). The
idea was that since we're modifying a seq variable without the write
lock, we want to rule out the optimizations mentioned above and any
unpredictable behavior, and READ_ONCE/WRITE_ONCE add essentially no overhead.
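
To make the tearing concern concrete, a purely hypothetical sketch (no
real compiler is known to do this for an aligned int): a plain store
could in principle be emitted as several narrower stores, whereas
WRITE_ONCE() pins it to a single word-sized access:

	int seq;	/* shared, updated without the write lock */

	void torn_store(void)		/* what a split store would look like */
	{
		*((char *)&seq + 0) = 5;	/* byte at a time, little-endian */
		*((char *)&seq + 1) = 0;
		*((char *)&seq + 2) = 0;
		*((char *)&seq + 3) = 0;	/* a reader can see a half-written value */
	}

	void single_store(void)
	{
		WRITE_ONCE(seq, 5);		/* one word-sized volatile store */
	}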

2015-04-30 21:26:44

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On Thu, 2015-04-30 at 11:54 -0700, Davidlohr Bueso wrote:

> I also wonder why this patch is included in a set called
> "sched, timer: Improve scalability of itimers" ;)

Good point :)

The reason these first 2 patches were included in this patchset is
that patch 3 depended on patch 1 (particularly due to the
modifications to posix_cpu_timers_init_group(), which happens to use
ACCESS_ONCE after initializing the thread_group_cputimer lock). Likewise,
patch 2 depended on patch 1. Thus, I included these changes to make
applying the patches a bit simpler.

2015-05-01 00:28:20

by Jason Low

[permalink] [raw]
Subject: [PATCH v3 2/5] sched, numa: Document usages of mm->numa_scan_seq

On Thu, 2015-04-30 at 14:13 -0700, Jason Low wrote:
> On Thu, 2015-04-30 at 14:42 -0400, Waiman Long wrote:
>
> > I do have a question of what kind of tearing you are talking about. Do
> > you mean the tearing due to mm being changed in the middle of the
> > access? The reason why I don't like this kind of construct is that I am
> > not sure if
> > the address translation p->mm->numa_scan_seq is being done once or
> > twice. I looked at the compiled code and the translation is done only once.
> >
> > Anyway, the purpose of READ_ONCE and WRITE_ONCE is not for eliminating
> > data tearing. They are to make sure that the compiler won't compile away
> > data access and they are done in the order they appear in the program. I
> > don't think it is a good idea to associate tearing elimination with
> > those macros. So I would suggest removing the last sentence in your comment.
>
> Yes, I can remove the last sentence in the comment since the main goal
> was to document that we're access this field without exclusive access.

---
Subject: [PATCH v3 2/5] sched, numa: Document usages of mm->numa_scan_seq

The p->mm->numa_scan_seq is accessed using READ_ONCE/WRITE_ONCE
and modified without exclusive access. It is not clear why it is
accessed this way. This patch provides some documentation on that.

Signed-off-by: Jason Low <[email protected]>
Suggested-by: Ingo Molnar <[email protected]>
Acked-by: Rik van Riel <[email protected]>
---
kernel/sched/fair.c | 13 +++++++++++++
1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5a44371..65a9a1dc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
u64 runtime, period;
spinlock_t *group_lock = NULL;

+ /*
+ * The p->mm->numa_scan_seq gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access.
+ */
seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
@@ -2107,6 +2112,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)

static void reset_ptenuma_scan(struct task_struct *p)
{
+ /*
+ * We only did a read acquisition of the mmap sem, so
+ * p->mm->numa_scan_seq is written to without exclusive access
+ * and the update is not guaranteed to be atomic. That's not
+ * much of an issue though, since this is just used for
+ * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+ * expensive, to avoid any form of compiler optimizations.
+ */
WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}
--
1.7.2.5


2015-05-01 15:22:06

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On Thu, Apr 30, 2015 at 02:13:07PM -0700, Jason Low wrote:
> On Thu, 2015-04-30 at 14:42 -0400, Waiman Long wrote:
>
> > I do have a question of what kind of tearing you are talking about. Do
> > you mean the tearing due to mm being changed in the middle of the
> > access? The reason why I don't like this kind of construct is that I am
> > not sure if
> > the address translation p->mm->numa_scan_seq is being done once or
> > twice. I looked at the compiled code and the translation is done only once.
> >
> > Anyway, the purpose of READ_ONCE and WRITE_ONCE is not for eliminating
> > data tearing. They are to make sure that the compiler won't compile away
> > data access and they are done in the order they appear in the program. I
> > don't think it is a good idea to associate tearing elimination with
> > those macros. So I would suggest removing the last sentence in your comment.
>
> Yes, I can remove the last sentence in the comment since the main goal
> was to document that we're access this field without exclusive access.
>
> In terms of data tearing, an example would be the write operation gets
> split into multiple stores (though this is architecture dependent). The
> idea was that since we're modifying a seq variable without the write
> lock, we want to remove any forms of optimizations as mentioned above or
> unpredictable behavior, since READ_ONCE/WRITE_ONCE adds no overhead.

Just to be clear... READ_ONCE() and WRITE_ONCE() do not avoid data tearing
in cases where the thing read or written is too big for a machine word.
If the thing read/written does fit into a machine word and if the location
read/written is properly aligned, I would be quite surprised if either
READ_ONCE() or WRITE_ONCE() resulted in any sort of tearing.

Thanx, Paul
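
A short sketch of that boundary (hypothetical variable, not from the
series): an aligned u64 is one machine word on 64-bit kernels, but two
words on 32-bit kernels, which is why the cputimer fields use atomic64_t
rather than plain u64 plus READ_ONCE():

	u64 wide;	/* one word on 64-bit kernels, two words on 32-bit */

	u64 sample_wide(void)
	{
		/*
		 * On 64-bit this is a single aligned load, so no tearing is
		 * expected. On 32-bit it necessarily becomes two 32-bit
		 * loads, so a concurrent writer can still be observed half
		 * old / half new; READ_ONCE() cannot prevent that.
		 */
		return READ_ONCE(wide);
	}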

2015-05-01 17:40:19

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq

On Fri, 2015-05-01 at 08:21 -0700, Paul E. McKenney wrote:
> On Thu, Apr 30, 2015 at 02:13:07PM -0700, Jason Low wrote:
> > On Thu, 2015-04-30 at 14:42 -0400, Waiman Long wrote:
> >
> > > I do have a question of what kind of tearing you are talking about. Do
> > > you mean the tearing due to mm being changed in the middle of the
> > > access? The reason why I don't like this kind of construct is that I am
> > > not sure if
> > > the address translation p->mm->numa_scan_seq is being done once or
> > > twice. I looked at the compiled code and the translation is done only once.
> > >
> > > Anyway, the purpose of READ_ONCE and WRITE_ONCE is not for eliminating
> > > data tearing. They are to make sure that the compiler won't compile away
> > > data access and they are done in the order they appear in the program. I
> > > don't think it is a good idea to associate tearing elimination with
> > > those macros. So I would suggest removing the last sentence in your comment.
> >
> > Yes, I can remove the last sentence in the comment since the main goal
> > was to document that we're access this field without exclusive access.
> >
> > In terms of data tearing, an example would be the write operation gets
> > split into multiple stores (though this is architecture dependent). The
> > idea was that since we're modifying a seq variable without the write
> > lock, we want to remove any forms of optimizations as mentioned above or
> > unpredictable behavior, since READ_ONCE/WRITE_ONCE adds no overhead.
>
> Just to be clear... READ_ONCE() and WRITE_ONCE() do not avoid data tearing
> in cases where the thing read or written is too big for a machine word.

Right, that makes sense. I've updated the comment to instead mention
that it's used to avoid "compiler optimizations".

> If the thing read/written does fit into a machine word and if the location
> read/written is properly aligned, I would be quite surprised if either
> READ_ONCE() or WRITE_ONCE() resulted in any sort of tearing.

Subject: [tip:sched/core] sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to READ_ONCE()/WRITE_ONCE()

Commit-ID: 316c1608d15c736439d4065ed12f306db554b3da
Gitweb: http://git.kernel.org/tip/316c1608d15c736439d4065ed12f306db554b3da
Author: Jason Low <[email protected]>
AuthorDate: Tue, 28 Apr 2015 13:00:20 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 8 May 2015 12:11:32 +0200

sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to READ_ONCE()/WRITE_ONCE()

ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes
the rest of the existing usages of ACCESS_ONCE() in the scheduler, and uses
the new READ_ONCE() and WRITE_ONCE() APIs as appropriate.

Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Acked-by: Rik van Riel <[email protected]>
Acked-by: Waiman Long <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Preeti U Murthy <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Steven Rostedt <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/sched.h | 4 ++--
kernel/fork.c | 2 +-
kernel/sched/auto_group.c | 2 +-
kernel/sched/auto_group.h | 2 +-
kernel/sched/core.c | 4 ++--
kernel/sched/cputime.c | 2 +-
kernel/sched/deadline.c | 2 +-
kernel/sched/fair.c | 18 +++++++++---------
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 2 +-
kernel/sched/wait.c | 4 ++--
kernel/time/posix-cpu-timers.c | 8 ++++----
12 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fb650a2..d709103 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3085,13 +3085,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
static inline unsigned long task_rlimit(const struct task_struct *tsk,
unsigned int limit)
{
- return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
+ return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
}

static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
unsigned int limit)
{
- return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
+ return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
}

static inline unsigned long rlimit(unsigned int limit)
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaa..47c37a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1094,7 +1094,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
/* Thread group counters. */
thread_group_cputime_init(sig);

- cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
sig->cputimer.running = 1;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 1a3b58d..750ed60 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -139,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)

p->signal->autogroup = autogroup_kref_get(ag);

- if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ if (!READ_ONCE(sysctl_sched_autogroup_enabled))
goto out;

for_each_thread(p, t)
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd0471..890c95f 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+ int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);

if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 46a5d6f..22b53c8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);

for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
@@ -2526,7 +2526,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ unsigned long next, now = READ_ONCE(jiffies);

next = rq->last_sched_tick + HZ;

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1e..f5a64ff 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
{
cputime_t old;

- while (new > (old = ACCESS_ONCE(*counter)))
+ while (new > (old = READ_ONCE(*counter)))
cmpxchg_cputime(counter, old, new);
}

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145..890ce95 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);

rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */

/*
* If we are dealing with a -deadline task, we must
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4bc6013..d6915a0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)

static unsigned int task_scan_min(struct task_struct *p)
{
- unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;

@@ -1794,7 +1794,7 @@ static void task_numa_placement(struct task_struct *p)
u64 runtime, period;
spinlock_t *group_lock = NULL;

- seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
@@ -1938,7 +1938,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
}

rcu_read_lock();
- tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+ tsk = READ_ONCE(cpu_rq(cpu)->curr);

if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
@@ -2107,7 +2107,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)

static void reset_ptenuma_scan(struct task_struct *p)
{
- ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}

@@ -4451,7 +4451,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
*/
static void update_idle_cpu_load(struct rq *this_rq)
{
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long load = this_rq->cfs.runnable_load_avg;
unsigned long pending_updates;

@@ -4473,7 +4473,7 @@ static void update_idle_cpu_load(struct rq *this_rq)
void update_cpu_load_nohz(void)
{
struct rq *this_rq = this_rq();
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long pending_updates;

if (curr_jiffies == this_rq->last_load_update_tick)
@@ -4558,7 +4558,7 @@ static unsigned long capacity_orig_of(int cpu)
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+ unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
unsigned long load_avg = rq->cfs.runnable_load_avg;

if (nr_running)
@@ -6220,8 +6220,8 @@ static unsigned long scale_rt_capacity(int cpu)
* Since we're reading these variables without serialization make sure
* we read them once before doing sanity checks on them.
*/
- age_stamp = ACCESS_ONCE(rq->age_stamp);
- avg = ACCESS_ONCE(rq->rt_avg);
+ age_stamp = READ_ONCE(rq->age_stamp);
+ avg = READ_ONCE(rq->rt_avg);
delta = __rq_clock_broken(rq) - age_stamp;

if (unlikely(delta < 0))
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76..560d2fa 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);

rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */

/*
* If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 09ed26a..d854555 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -713,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline u64 __rq_clock_broken(struct rq *rq)
{
- return ACCESS_ONCE(rq->clock);
+ return READ_ONCE(rq->clock);
}

static inline u64 rq_clock(struct rq *rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a..2ccec98 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);

__sched int bit_wait_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);

__sched int bit_wait_io_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da7..e072d98 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -852,10 +852,10 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);

if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -958,11 +958,11 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
cputime_t x;
if (psecs >= hard) {
/*

Subject: [tip:sched/core] sched/numa: Document usages of mm->numa_scan_seq

Commit-ID: 7e5a2c1729f1612618ed236249a15bf15f309325
Gitweb: http://git.kernel.org/tip/7e5a2c1729f1612618ed236249a15bf15f309325
Author: Jason Low <[email protected]>
AuthorDate: Thu, 30 Apr 2015 17:28:14 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 8 May 2015 12:13:13 +0200

sched/numa: Document usages of mm->numa_scan_seq

The p->mm->numa_scan_seq is accessed using READ_ONCE/WRITE_ONCE
and modified without exclusive access. It is not clear why it is
accessed this way. This patch provides some documentation on that.

Suggested-by: Ingo Molnar <[email protected]>
Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Acked-by: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Preeti U Murthy <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Waiman Long <[email protected]>
Link: http://lkml.kernel.org/r/1430440094.2475.61.camel@j-VirtualBox
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 13 +++++++++++++
1 file changed, 13 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d6915a0..f18ddb7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
u64 runtime, period;
spinlock_t *group_lock = NULL;

+ /*
+ * The p->mm->numa_scan_seq field gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access:
+ */
seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
@@ -2107,6 +2112,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)

static void reset_ptenuma_scan(struct task_struct *p)
{
+ /*
+ * We only did a read acquisition of the mmap sem, so
+ * p->mm->numa_scan_seq is written to without exclusive access
+ * and the update is not guaranteed to be atomic. That's not
+ * much of an issue though, since this is just used for
+ * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+ * expensive, to avoid any form of compiler optimizations:
+ */
WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}

Subject: [tip:sched/core] sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability

Commit-ID: 1018016c706f7ff9f56fde3a649789c47085a293
Gitweb: http://git.kernel.org/tip/1018016c706f7ff9f56fde3a649789c47085a293
Author: Jason Low <[email protected]>
AuthorDate: Tue, 28 Apr 2015 13:00:22 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 8 May 2015 12:15:31 +0200

sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability

While running a database workload, we found a scalability issue with itimers.

Much of the problem was caused by the thread_group_cputimer spinlock.
Each time we account for group system/user time, we need to obtain a
thread_group_cputimer's spinlock to update the timers. On larger systems
(such as a 16 socket machine), this caused more than 30% of total time
spent trying to obtain this kernel lock to update these group timer stats.

This patch converts the timers to 64-bit atomic variables and uses
atomic adds to update them without a lock. With this patch, the percentage
of total time spent updating thread group cputimer timers was reduced
from 30% down to less than 1%.

Note: On 32-bit systems using the generic 64-bit atomics, this causes
sample_group_cputimer() to take locks 3 times instead of just 1 time.
However, we tested this patch on a 32-bit ARM system using the
generic atomics and did not find the overhead to be much of an issue.
An explanation for why this isn't an issue is that 32-bit systems usually
have small numbers of CPUs, and cacheline contention from extra spinlocks
called periodically is not really apparent on smaller systems.

Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Acked-by: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Preeti U Murthy <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Waiman Long <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/init_task.h | 7 ++--
include/linux/sched.h | 10 ++----
kernel/fork.c | 3 --
kernel/sched/stats.h | 15 +++-----
kernel/time/posix-cpu-timers.c | 79 ++++++++++++++++++++++++++----------------
5 files changed, 62 insertions(+), 52 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d223..7b9d8b5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
.cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
.rlim = INIT_RLIMITS, \
.cputimer = { \
- .cputime = INIT_CPUTIME, \
- .running = 0, \
- .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
+ .utime = ATOMIC64_INIT(0), \
+ .stime = ATOMIC64_INIT(0), \
+ .sum_exec_runtime = ATOMIC64_INIT(0), \
+ .running = 0 \
}, \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d709103..a45874c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -598,9 +598,10 @@ struct task_cputime {
* used for thread group CPU timer calculations.
*/
struct thread_group_cputimer {
- struct task_cputime cputime;
+ atomic64_t utime;
+ atomic64_t stime;
+ atomic64_t sum_exec_runtime;
int running;
- raw_spinlock_t lock;
};

#include <linux/rwsem.h>
@@ -2967,11 +2968,6 @@ static __always_inline bool need_resched(void)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);

-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
- raw_spin_lock_init(&sig->cputimer.lock);
-}
-
/*
* Reevaluate whether the task has signals pending delivery.
* Wake the task if so.
diff --git a/kernel/fork.c b/kernel/fork.c
index 47c37a4..2e67086 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
unsigned long cpu_limit;

- /* Thread group counters. */
- thread_group_cputime_init(sig);
-
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab7043..c6d1c7d 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

- if (!cputimer->running)
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running))
return false;

/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.utime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->utime);
}

/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.stime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->stime);
}

/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.sum_exec_runtime += ns;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(ns, &cputimer->sum_exec_runtime);
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e072d98..d857306 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0;
}

-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- if (b->utime > a->utime)
- a->utime = b->utime;
+ u64 curr_cputime;
+retry:
+ curr_cputime = atomic64_read(cputime);
+ if (sum_cputime > curr_cputime) {
+ if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+ goto retry;
+ }
+}

- if (b->stime > a->stime)
- a->stime = b->stime;
+static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+{
+ __update_gt_cputime(&cputimer->utime, sum->utime);
+ __update_gt_cputime(&cputimer->stime, sum->stime);
+ __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+}

- if (b->sum_exec_runtime > a->sum_exec_runtime)
- a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
+static inline void sample_group_cputimer(struct task_cputime *times,
+ struct thread_group_cputimer *cputimer)
+{
+ times->utime = atomic64_read(&cputimer->utime);
+ times->stime = atomic64_read(&cputimer->stime);
+ times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
}

void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct task_cputime sum;
- unsigned long flags;

- if (!cputimer->running) {
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running)) {
/*
* The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have
- * to synchronize the timer to the clock every time we start
- * it.
+ * to synchronize the timer to the clock every time we start it.
*/
thread_group_cputime(tsk, &sum);
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 1;
- update_gt_cputime(&cputimer->cputime, &sum);
- } else
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- *times = cputimer->cputime;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ update_gt_cputime(cputimer, &sum);
+
+ /*
+ * We're setting cputimer->running without a lock. Ensure
+ * this only gets written to in one operation. We set
+ * running after update_gt_cputime() as a small optimization,
+ * but barriers are not required because update_gt_cputime()
+ * can handle concurrent updates.
+ */
+ WRITE_ONCE(cputimer->running, 1);
+ }
+ sample_group_cputimer(times, cputimer);
}

/*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
if (!task_cputime_zero(&tsk->cputime_expires))
return false;

- if (tsk->signal->cputimer.running)
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(tsk->signal->cputimer.running))
return false;

return true;
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
}
}

-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
{
struct thread_group_cputimer *cputimer = &sig->cputimer;
- unsigned long flags;

- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 0;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ /* Turn off cputimer->running. This is done without locking. */
+ WRITE_ONCE(cputimer->running, 0);
}

static u32 onecputick;
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
}

sig = tsk->signal;
- if (sig->cputimer.running) {
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(sig->cputimer.running)) {
struct task_cputime group_sample;

- raw_spin_lock(&sig->cputimer.lock);
- group_sample = sig->cputimer.cputime;
- raw_spin_unlock(&sig->cputimer.lock);
+ sample_group_cputimer(&group_sample, &sig->cputimer);

if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* If there are any active process wide timers (POSIX 1.b, itimers,
* RLIMIT_CPU) cputimer must be running.
*/
- if (tsk->signal->cputimer.running)
+ if (READ_ONCE(tsk->signal->cputimer.running))
check_process_timers(tsk, &firing);

/*

Subject: [tip:sched/core] sched, timer: Provide an atomic 'struct task_cputime' data structure

Commit-ID: 971e8a985482c76487edb5a49811e99b96e846e1
Gitweb: http://git.kernel.org/tip/971e8a985482c76487edb5a49811e99b96e846e1
Author: Jason Low <[email protected]>
AuthorDate: Tue, 28 Apr 2015 13:00:23 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 8 May 2015 12:17:45 +0200

sched, timer: Provide an atomic 'struct task_cputime' data structure

This patch adds an atomic variant of the 'struct task_cputime' data structure,
which can be used to store and update task_cputime statistics without
needing to do locking.

Suggested-by: Ingo Molnar <[email protected]>
Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Acked-by: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Preeti U Murthy <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Waiman Long <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/sched.h | 17 +++++++++++++++++
1 file changed, 17 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a45874c..6eb78cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -572,6 +572,23 @@ struct task_cputime {
.sum_exec_runtime = 0, \
}

+/*
+ * This is the atomic variant of task_cputime, which can be used for
+ * storing and updating task_cputime statistics without locking.
+ */
+struct task_cputime_atomic {
+ atomic64_t utime;
+ atomic64_t stime;
+ atomic64_t sum_exec_runtime;
+};
+
+#define INIT_CPUTIME_ATOMIC \
+ (struct task_cputime_atomic) { \
+ .utime = ATOMIC64_INIT(0), \
+ .stime = ATOMIC64_INIT(0), \
+ .sum_exec_runtime = ATOMIC64_INIT(0), \
+ }
+
#ifdef CONFIG_PREEMPT_COUNT
#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
#else

Subject: [tip:sched/core] sched, timer: Use the atomic task_cputime in thread_group_cputimer

Commit-ID: 7110744516276e906f9197e2857d026eb2343393
Gitweb: http://git.kernel.org/tip/7110744516276e906f9197e2857d026eb2343393
Author: Jason Low <[email protected]>
AuthorDate: Tue, 28 Apr 2015 13:00:24 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 8 May 2015 12:17:46 +0200

sched, timer: Use the atomic task_cputime in thread_group_cputimer

Recent optimizations were made to thread_group_cputimer to improve its
scalability by keeping track of cputime stats without a lock. However,
the values were open coded in the structure, causing them to be at
a different abstraction level from the regular task_cputime structure.
Furthermore, any subsequent similar optimizations would not be able to
share the new code, since they are specific to thread_group_cputimer.

This patch adds the new task_cputime_atomic data structure (introduced in
the previous patch in the series) to thread_group_cputimer for keeping
track of the cputime atomically, which also helps generalize the code.

Suggested-by: Ingo Molnar <[email protected]>
Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Acked-by: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Preeti U Murthy <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Waiman Long <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/init_task.h | 6 ++----
include/linux/sched.h | 4 +---
kernel/sched/stats.h | 6 +++---
kernel/time/posix-cpu-timers.c | 26 +++++++++++++-------------
4 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 7b9d8b5..bb9b075 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,10 +50,8 @@ extern struct fs_struct init_fs;
.cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
.rlim = INIT_RLIMITS, \
.cputimer = { \
- .utime = ATOMIC64_INIT(0), \
- .stime = ATOMIC64_INIT(0), \
- .sum_exec_runtime = ATOMIC64_INIT(0), \
- .running = 0 \
+ .cputime_atomic = INIT_CPUTIME_ATOMIC, \
+ .running = 0, \
}, \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6eb78cd..4adc536 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -615,9 +615,7 @@ struct task_cputime_atomic {
* used for thread group CPU timer calculations.
*/
struct thread_group_cputimer {
- atomic64_t utime;
- atomic64_t stime;
- atomic64_t sum_exec_runtime;
+ struct task_cputime_atomic cputime_atomic;
int running;
};

diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c6d1c7d..077ebbd 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -216,7 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- atomic64_add(cputime, &cputimer->utime);
+ atomic64_add(cputime, &cputimer->cputime_atomic.utime);
}

/**
@@ -237,7 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- atomic64_add(cputime, &cputimer->stime);
+ atomic64_add(cputime, &cputimer->cputime_atomic.stime);
}

/**
@@ -258,5 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;

- atomic64_add(ns, &cputimer->sum_exec_runtime);
+ atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index d857306..892e3da 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -211,20 +211,20 @@ retry:
}
}

-static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
{
- __update_gt_cputime(&cputimer->utime, sum->utime);
- __update_gt_cputime(&cputimer->stime, sum->stime);
- __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+ __update_gt_cputime(&cputime_atomic->utime, sum->utime);
+ __update_gt_cputime(&cputime_atomic->stime, sum->stime);
+ __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
}

-/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
-static inline void sample_group_cputimer(struct task_cputime *times,
- struct thread_group_cputimer *cputimer)
+/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+ struct task_cputime_atomic *atomic_times)
{
- times->utime = atomic64_read(&cputimer->utime);
- times->stime = atomic64_read(&cputimer->stime);
- times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
+ times->utime = atomic64_read(&atomic_times->utime);
+ times->stime = atomic64_read(&atomic_times->stime);
+ times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
}

void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
@@ -240,7 +240,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
* to synchronize the timer to the clock every time we start it.
*/
thread_group_cputime(tsk, &sum);
- update_gt_cputime(cputimer, &sum);
+ update_gt_cputime(&cputimer->cputime_atomic, &sum);

/*
* We're setting cputimer->running without a lock. Ensure
@@ -251,7 +251,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
*/
WRITE_ONCE(cputimer->running, 1);
}
- sample_group_cputimer(times, cputimer);
+ sample_cputime_atomic(times, &cputimer->cputime_atomic);
}

/*
@@ -1137,7 +1137,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (READ_ONCE(sig->cputimer.running)) {
struct task_cputime group_sample;

- sample_group_cputimer(&group_sample, &sig->cputimer);
+ sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);

if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;

2015-05-08 21:32:24

by Jason Low

[permalink] [raw]
Subject: [PATCH] sched, timer: Fix documentation for 'struct thread_group_cputimer'

On Fri, 2015-05-08 at 06:22 -0700, tip-bot for Jason Low wrote:
> Commit-ID: 1018016c706f7ff9f56fde3a649789c47085a293
> Gitweb: http://git.kernel.org/tip/1018016c706f7ff9f56fde3a649789c47085a293
> Author: Jason Low <[email protected]>
> AuthorDate: Tue, 28 Apr 2015 13:00:22 -0700
> Committer: Ingo Molnar <[email protected]>
> CommitDate: Fri, 8 May 2015 12:15:31 +0200
>
> sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability

The following patch addresses the issue reported by Fengguang Wu
regarding tip commit 1018016c706f.

---------------------------------------------------------------------------
The description for struct thread_group_cputimer contains the 'cputime'
and 'lock' members, which are not valid anymore since

tip commit 1018016c706f ("sched, timer: Replace spinlocks with atomics
in thread_group_cputimer(), to improve scalability")

modified/removed those fields. This patch updates the description
to reflect those changes.

Reported-by: Fengguang Wu <[email protected]>
Signed-off-by: Jason Low <[email protected]>
---
include/linux/sched.h | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6cc4f7e..cb73486 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -606,10 +606,9 @@ struct task_cputime_atomic {

/**
* struct thread_group_cputimer - thread group interval timer counts
- * @cputime: thread group interval timers.
+ * @cputime_atomic: atomic thread group interval timers.
* @running: non-zero when there are timers running and
* @cputime receives updates.
- * @lock: lock for fields in this struct.
*
* This structure contains the version of task_cputime, above, that is
* used for thread group CPU timer calculations.
--
1.7.2.5


Subject: [tip:sched/core] sched, timer: Fix documentation for 'struct thread_group_cputimer'

Commit-ID: 920ce39f6c204d4ce4d8acebe7522f0dfa95f662
Gitweb: http://git.kernel.org/tip/920ce39f6c204d4ce4d8acebe7522f0dfa95f662
Author: Jason Low <[email protected]>
AuthorDate: Fri, 8 May 2015 14:31:50 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Sun, 10 May 2015 12:45:27 +0200

sched, timer: Fix documentation for 'struct thread_group_cputimer'

Fix the docbook build bug reported by Fengguang Wu.

Reported-by: Fengguang Wu <[email protected]>
Signed-off-by: Jason Low <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/1431120710.5136.12.camel@j-VirtualBox
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/sched.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 254d88e..0eceeec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -606,10 +606,9 @@ struct task_cputime_atomic {

/**
* struct thread_group_cputimer - thread group interval timer counts
- * @cputime: thread group interval timers.
+ * @cputime_atomic: atomic thread group interval timers.
* @running: non-zero when there are timers running and
* @cputime receives updates.
- * @lock: lock for fields in this struct.
*
* This structure contains the version of task_cputime, above, that is
* used for thread group CPU timer calculations.