2019-02-18 19:08:38

by Peter Zijlstra

Subject: [RFC][PATCH 03/16] sched: Wrap rq::lock access

In preparation of playing games with rq->lock, abstract the thing
using an accessor.
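
In short: rename the field so nothing can reach it directly, add a helper
that returns a pointer to it, and convert every locking site to go through
the helper. A minimal sketch of the resulting shape (abbreviated; the real
definitions are in the sched.h hunk below, the example caller is only for
illustration):

struct rq {
        /* runqueue lock, only reachable through the accessor: */
        raw_spinlock_t          __lock;
        /* ... */
};

static inline raw_spinlock_t *rq_lockp(struct rq *rq)
{
        return &rq->__lock;
}

static inline void example_rq_lock(struct rq *rq)
{
        raw_spin_lock(rq_lockp(rq));    /* was: raw_spin_lock(&rq->lock) */
        /* ... */
        raw_spin_unlock(rq_lockp(rq));
}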

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c     |   44 ++++++++++----------
kernel/sched/deadline.c |   18 ++++----
kernel/sched/debug.c    |    4 -
kernel/sched/fair.c     |   41 +++++++++----------
kernel/sched/idle.c     |    4 -
kernel/sched/pelt.h     |    2
kernel/sched/rt.c       |    8 +--
kernel/sched/sched.h    |  102 ++++++++++++++++++++++++------------------------
kernel/sched/topology.c |    4 -
9 files changed, 114 insertions(+), 113 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,12 +72,12 @@ struct rq *__task_rq_lock(struct task_st

for (;;) {
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));

while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
@@ -96,7 +96,7 @@ struct rq *task_rq_lock(struct task_stru
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
/*
* move_queued_task() task_rq_lock()
*
@@ -118,7 +118,7 @@ struct rq *task_rq_lock(struct task_stru
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

while (unlikely(task_on_rq_migrating(p)))
@@ -188,7 +188,7 @@ void update_rq_clock(struct rq *rq)
{
s64 delta;

- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
@@ -497,7 +497,7 @@ void resched_curr(struct rq *rq)
struct task_struct *curr = rq->curr;
int cpu;

- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

if (test_tsk_need_resched(curr))
return;
@@ -521,10 +521,10 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;

- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
}

#ifdef CONFIG_SMP
@@ -956,7 +956,7 @@ static inline bool is_cpu_allowed(struct
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
@@ -1070,7 +1070,7 @@ void do_set_cpus_allowed(struct task_str
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
@@ -1203,7 +1203,7 @@ void set_task_cpu(struct task_struct *p,
* task_rq_lock().
*/
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock)));
+ lockdep_is_held(rq_lockp(task_rq(p)))));
#endif
/*
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
@@ -1732,7 +1732,7 @@ ttwu_do_activate(struct rq *rq, struct t
{
int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;

- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
@@ -2123,7 +2123,7 @@ static void try_to_wake_up_local(struct
WARN_ON_ONCE(p == current))
return;

- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

if (!raw_spin_trylock(&p->pi_lock)) {
/*
@@ -2606,10 +2606,10 @@ prepare_lock_switch(struct rq *rq, struc
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ spin_release(&rq_lockp(rq)->dep_map, 1, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
- rq->lock.owner = next;
+ rq_lockp(rq)->owner = next;
#endif
}

@@ -2620,8 +2620,8 @@ static inline void finish_lock_switch(st
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- raw_spin_unlock_irq(&rq->lock);
+ spin_acquire(&rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
+ raw_spin_unlock_irq(rq_lockp(rq));
}

/*
@@ -2771,7 +2771,7 @@ static void __balance_callback(struct rq
void (*func)(struct rq *rq);
unsigned long flags;

- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);
head = rq->balance_callback;
rq->balance_callback = NULL;
while (head) {
@@ -2782,7 +2782,7 @@ static void __balance_callback(struct rq

func(rq);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
}

static inline void balance_callback(struct rq *rq)
@@ -5411,7 +5411,7 @@ void init_idle(struct task_struct *idle,
unsigned long flags;

raw_spin_lock_irqsave(&idle->pi_lock, flags);
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));

__sched_fork(0, idle);
idle->state = TASK_RUNNING;
@@ -5448,7 +5448,7 @@ void init_idle(struct task_struct *idle,
#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);

/* Set the preempt count _outside_ the spinlocks! */
@@ -6016,7 +6016,7 @@ void __init sched_init(void)
struct rq *rq;

rq = cpu_rq(i);
- raw_spin_lock_init(&rq->lock);
+ raw_spin_lock_init(&rq->__lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -80,7 +80,7 @@ void __add_running_bw(u64 dl_bw, struct
{
u64 old = dl_rq->running_bw;

- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_held(rq_lockp((rq_of_dl_rq(dl_rq))));
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
@@ -93,7 +93,7 @@ void __sub_running_bw(u64 dl_bw, struct
{
u64 old = dl_rq->running_bw;

- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_held(rq_lockp((rq_of_dl_rq(dl_rq))));
dl_rq->running_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
@@ -107,7 +107,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq
{
u64 old = dl_rq->this_bw;

- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_held(rq_lockp((rq_of_dl_rq(dl_rq))));
dl_rq->this_bw += dl_bw;
SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
}
@@ -117,7 +117,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq
{
u64 old = dl_rq->this_bw;

- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_held(rq_lockp((rq_of_dl_rq(dl_rq))));
dl_rq->this_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
if (dl_rq->this_bw > old)
@@ -893,7 +893,7 @@ static int start_dl_timer(struct task_st
ktime_t now, act;
s64 delta;

- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

/*
* We want the timer to fire at the deadline, but considering
@@ -1003,9 +1003,9 @@ static enum hrtimer_restart dl_task_time
* If the runqueue is no longer available, migrate the
* task elsewhere. This necessarily changes rq.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ lockdep_unpin_lock(rq_lockp(rq), rf.cookie);
rq = dl_task_offline_migration(rq, p);
- rf.cookie = lockdep_pin_lock(&rq->lock);
+ rf.cookie = lockdep_pin_lock(rq_lockp(rq));
update_rq_clock(rq);

/*
@@ -1620,7 +1620,7 @@ static void migrate_task_rq_dl(struct ta
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
if (p->dl.dl_non_contending) {
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
@@ -1635,7 +1635,7 @@ static void migrate_task_rq_dl(struct ta
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}

static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -515,7 +515,7 @@ void print_cfs_rq(struct seq_file *m, in
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));

- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);
if (rb_first_cached(&cfs_rq->tasks_timeline))
MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
@@ -523,7 +523,7 @@ void print_cfs_rq(struct seq_file *m, in
max_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
SPLIT_NS(MIN_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4966,7 +4966,7 @@ static void __maybe_unused update_runtim
{
struct task_group *tg;

- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -4985,7 +4985,7 @@ static void __maybe_unused unthrottle_of
{
struct task_group *tg;

- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -6743,7 +6743,7 @@ static void migrate_task_rq_fair(struct
* In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
* rq->lock and can modify state directly.
*/
- lockdep_assert_held(&task_rq(p)->lock);
+ lockdep_assert_held(rq_lockp(task_rq(p)));
detach_entity_cfs_rq(&p->se);

} else {
@@ -7317,7 +7317,7 @@ static int task_hot(struct task_struct *
{
s64 delta;

- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));

if (p->sched_class != &fair_sched_class)
return 0;
@@ -7411,7 +7411,7 @@ int can_migrate_task(struct task_struct
{
int tsk_cache_hot;

- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));

/*
* We do not migrate tasks that are:
@@ -7489,7 +7489,7 @@ int can_migrate_task(struct task_struct
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));

p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
@@ -7506,7 +7506,7 @@ static struct task_struct *detach_one_ta
{
struct task_struct *p;

- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));

list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
@@ -7542,7 +7542,7 @@ static int detach_tasks(struct lb_env *e
unsigned long load;
int detached = 0;

- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));

if (env->imbalance <= 0)
return 0;
@@ -7623,7 +7623,7 @@ static int detach_tasks(struct lb_env *e
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
@@ -9164,7 +9164,7 @@ static int load_balance(int this_cpu, st
if (need_active_balance(&env)) {
unsigned long flags;

- raw_spin_lock_irqsave(&busiest->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(busiest), flags);

/*
* Don't kick the active_load_balance_cpu_stop,
@@ -9172,8 +9172,7 @@ static int load_balance(int this_cpu, st
* moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
- raw_spin_unlock_irqrestore(&busiest->lock,
- flags);
+ raw_spin_unlock_irqrestore(rq_lockp(busiest), flags);
env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
@@ -9188,7 +9187,7 @@ static int load_balance(int this_cpu, st
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(busiest), flags);

if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
@@ -9897,7 +9896,7 @@ static void nohz_newidle_balance(struct
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;

- raw_spin_unlock(&this_rq->lock);
+ raw_spin_unlock(rq_lockp(this_rq));
/*
* This CPU is going to be idle and blocked load of idle CPUs
* need to be updated. Run the ilb locally as it is a good
@@ -9906,7 +9905,7 @@ static void nohz_newidle_balance(struct
*/
if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
kick_ilb(NOHZ_STATS_KICK);
- raw_spin_lock(&this_rq->lock);
+ raw_spin_lock(rq_lockp(this_rq));
}

#else /* !CONFIG_NO_HZ_COMMON */
@@ -9966,7 +9965,7 @@ static int idle_balance(struct rq *this_
goto out;
}

- raw_spin_unlock(&this_rq->lock);
+ raw_spin_unlock(rq_lockp(this_rq));

update_blocked_averages(this_cpu);
rcu_read_lock();
@@ -10007,7 +10006,7 @@ static int idle_balance(struct rq *this_
}
rcu_read_unlock();

- raw_spin_lock(&this_rq->lock);
+ raw_spin_lock(rq_lockp(this_rq));

if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
@@ -10443,11 +10442,11 @@ void online_fair_sched_group(struct task
rq = cpu_rq(i);
se = tg->se[i];

- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock_irq(rq_lockp(rq));
update_rq_clock(rq);
attach_entity_cfs_rq(se);
sync_throttle(tg, i);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_unlock_irq(rq_lockp(rq));
}
}

@@ -10470,9 +10469,9 @@ void unregister_fair_sched_group(struct

rq = cpu_rq(cpu);

- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
}
}

--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -390,10 +390,10 @@ pick_next_task_idle(struct rq *rq, struc
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_unlock_irq(rq_lockp(rq));
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock_irq(rq_lockp(rq));
}

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -116,7 +116,7 @@ static inline void update_idle_rq_clock_

static inline u64 rq_clock_pelt(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
assert_clock_updated(rq);

return rq->clock_pelt - rq->lost_idle_time;
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -845,7 +845,7 @@ static int do_sched_rt_period_timer(stru
if (skip)
continue;

- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
update_rq_clock(rq);

if (rt_rq->rt_time) {
@@ -883,7 +883,7 @@ static int do_sched_rt_period_timer(stru

if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}

if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
@@ -2034,9 +2034,9 @@ void rto_push_irq_work_func(struct irq_w
* When it gets updated, a check is made if a push is possible.
*/
if (has_pushable_tasks(rq)) {
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
push_rt_tasks(rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}

raw_spin_lock(&rd->rto_lock);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -806,7 +806,7 @@ extern void rto_push_irq_work_func(struc
*/
struct rq {
/* runqueue lock: */
- raw_spinlock_t lock;
+ raw_spinlock_t __lock;

/*
* nr_running and cpu_load should be in the same cacheline because
@@ -979,6 +979,10 @@ static inline int cpu_of(struct rq *rq)
#endif
}

+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}

#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@@ -1046,7 +1050,7 @@ static inline void assert_clock_updated(

static inline u64 rq_clock(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
assert_clock_updated(rq);

return rq->clock;
@@ -1054,7 +1058,7 @@ static inline u64 rq_clock(struct rq *rq

static inline u64 rq_clock_task(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
assert_clock_updated(rq);

return rq->clock_task;
@@ -1062,7 +1066,7 @@ static inline u64 rq_clock_task(struct r

static inline void rq_clock_skip_update(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
rq->clock_update_flags |= RQCF_REQ_SKIP;
}

@@ -1072,7 +1076,7 @@ static inline void rq_clock_skip_update(
*/
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}

@@ -1091,7 +1095,7 @@ struct rq_flags {

static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rf->cookie = lockdep_pin_lock(rq_lockp(rq));

#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
@@ -1106,12 +1110,12 @@ static inline void rq_unpin_lock(struct
rf->clock_update_flags = RQCF_UPDATED;
#endif

- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ lockdep_unpin_lock(rq_lockp(rq), rf->cookie);
}

static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
- lockdep_repin_lock(&rq->lock, rf->cookie);
+ lockdep_repin_lock(rq_lockp(rq), rf->cookie);

#ifdef CONFIG_SCHED_DEBUG
/*
@@ -1132,7 +1136,7 @@ static inline void __task_rq_unlock(stru
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}

static inline void
@@ -1141,7 +1145,7 @@ task_rq_unlock(struct rq *rq, struct tas
__releases(p->pi_lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}

@@ -1149,7 +1153,7 @@ static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), rf->flags);
rq_pin_lock(rq, rf);
}

@@ -1157,7 +1161,7 @@ static inline void
rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock_irq(rq_lockp(rq));
rq_pin_lock(rq, rf);
}

@@ -1165,7 +1169,7 @@ static inline void
rq_lock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
rq_pin_lock(rq, rf);
}

@@ -1173,7 +1177,7 @@ static inline void
rq_relock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
rq_repin_lock(rq, rf);
}

@@ -1182,7 +1186,7 @@ rq_unlock_irqrestore(struct rq *rq, stru
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), rf->flags);
}

static inline void
@@ -1190,7 +1194,7 @@ rq_unlock_irq(struct rq *rq, struct rq_f
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_unlock_irq(rq_lockp(rq));
}

static inline void
@@ -1198,7 +1202,7 @@ rq_unlock(struct rq *rq, struct rq_flags
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}

static inline struct rq *
@@ -1261,7 +1265,7 @@ queue_balance_callback(struct rq *rq,
struct callback_head *head,
void (*func)(struct rq *rq))
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));

if (unlikely(head->next))
return;
@@ -1917,7 +1921,7 @@ static inline int _double_lock_balance(s
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_unlock(rq_lockp(this_rq));
double_rq_lock(this_rq, busiest);

return 1;
@@ -1936,20 +1940,22 @@ static inline int _double_lock_balance(s
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- int ret = 0;
+ if (rq_lockp(this_rq) == rq_lockp(busiest))
+ return 0;

- if (unlikely(!raw_spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- raw_spin_unlock(&this_rq->lock);
- raw_spin_lock(&busiest->lock);
- raw_spin_lock_nested(&this_rq->lock,
- SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- raw_spin_lock_nested(&busiest->lock,
- SINGLE_DEPTH_NESTING);
+ if (likely(raw_spin_trylock(rq_lockp(busiest))))
+ return 0;
+
+ if (busiest >= this_rq) {
+ raw_spin_lock_nested(rq_lockp(busiest), SINGLE_DEPTH_NESTING);
+ return 0;
}
- return ret;
+
+ raw_spin_unlock(rq_lockp(this_rq));
+ raw_spin_lock(rq_lockp(busiest));
+ raw_spin_lock_nested(rq_lockp(this_rq), SINGLE_DEPTH_NESTING);
+
+ return 1;
}

#endif /* CONFIG_PREEMPT */
@@ -1959,20 +1965,16 @@ static inline int _double_lock_balance(s
*/
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work well under rq->lock */
- raw_spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
-
+ lockdep_assert_irqs_disabled();
return _double_lock_balance(this_rq, busiest);
}

static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ if (rq_lockp(this_rq) != rq_lockp(busiest))
+ raw_spin_unlock(rq_lockp(busiest));
+ lock_set_subclass(&rq_lockp(this_rq)->dep_map, 0, _RET_IP_);
}

static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
@@ -2013,16 +2015,16 @@ static inline void double_rq_lock(struct
__acquires(rq2->lock)
{
BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
+ if (rq_lockp(rq1) == rq_lockp(rq2)) {
+ raw_spin_lock(rq_lockp(rq1));
__acquire(rq2->lock); /* Fake it out ;) */
} else {
if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(rq_lockp(rq1));
+ raw_spin_lock_nested(rq_lockp(rq2), SINGLE_DEPTH_NESTING);
} else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(rq_lockp(rq2));
+ raw_spin_lock_nested(rq_lockp(rq1), SINGLE_DEPTH_NESTING);
}
}
}
@@ -2037,9 +2039,9 @@ static inline void double_rq_unlock(stru
__releases(rq1->lock)
__releases(rq2->lock)
{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
+ raw_spin_unlock(rq_lockp(rq1));
+ if (rq_lockp(rq1) != rq_lockp(rq2))
+ raw_spin_unlock(rq_lockp(rq2));
else
__release(rq2->lock);
}
@@ -2062,7 +2064,7 @@ static inline void double_rq_lock(struct
{
BUG_ON(!irqs_disabled());
BUG_ON(rq1 != rq2);
- raw_spin_lock(&rq1->lock);
+ raw_spin_lock(rq_lockp(rq1));
__acquire(rq2->lock); /* Fake it out ;) */
}

@@ -2077,7 +2079,7 @@ static inline void double_rq_unlock(stru
__releases(rq2->lock)
{
BUG_ON(rq1 != rq2);
- raw_spin_unlock(&rq1->lock);
+ raw_spin_unlock(rq_lockp(rq1));
__release(rq2->lock);
}

--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -442,7 +442,7 @@ void rq_attach_root(struct rq *rq, struc
struct root_domain *old_rd = NULL;
unsigned long flags;

- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);

if (rq->rd) {
old_rd = rq->rd;
@@ -468,7 +468,7 @@ void rq_attach_root(struct rq *rq, struc
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);

- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);

if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);




2019-02-19 16:15:29

by Phil Auld

Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On Mon, Feb 18, 2019 at 05:56:23PM +0100 Peter Zijlstra wrote:
> In preparation of playing games with rq->lock, abstract the thing
> using an accessor.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

Hi Peter,

Sorry... what tree are these for? They don't apply to mainline.
Some branch on tip, I guess?


Thanks,
Phil




--

2019-02-19 16:24:08

by Peter Zijlstra

Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On Tue, Feb 19, 2019 at 11:13:43AM -0500, Phil Auld wrote:
> On Mon, Feb 18, 2019 at 05:56:23PM +0100 Peter Zijlstra wrote:
> > In preparation of playing games with rq->lock, abstract the thing
> > using an accessor.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>
> Hi Peter,
>
> Sorry... what tree are these for? They don't apply to mainline.
> Some branch on tip, I guess?

tip/master I think; any rejects should be trivial though.

2019-02-19 16:38:24

by Phil Auld

Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On Tue, Feb 19, 2019 at 05:22:50PM +0100 Peter Zijlstra wrote:
> On Tue, Feb 19, 2019 at 11:13:43AM -0500, Phil Auld wrote:
> > On Mon, Feb 18, 2019 at 05:56:23PM +0100 Peter Zijlstra wrote:
> > > In preparation of playing games with rq->lock, abstract the thing
> > > using an accessor.
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> >
> > Hi Peter,
> >
> > Sorry... what tree are these for? They don't apply to mainline.
> > Some branch on tip, I guess?
>
> tip/master I think; any rejects should be trivial though.

Yep... git foo failed. I was on an old branch on tip by mistake, which didn't
have rq_clock_pelt.


Thanks,
Phil
--

2019-03-18 15:43:06

by Julien Desfossez

Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

The case where we try to acquire the lock on 2 runqueues belonging to 2
different cores requires the rq_lockp wrapper as well; otherwise we
frequently deadlock in there.

This fixes the crash reported in
[email protected]
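
For what it's worth, the ordering rule the one-line change below switches to
can be illustrated with a stand-alone user-space analogue (pthread mutexes
standing in for the rq locks; purely illustrative, not kernel code): once
several runqueues can share one underlying lock, ordering the double lock by
runqueue address no longer yields a consistent lock order, so the comparison
has to be done on the lock returned by rq_lockp() itself.

#include <pthread.h>

struct runqueue {
        pthread_mutex_t *lockp;         /* may be shared between SMT siblings */
};

static void double_lock(struct runqueue *a, struct runqueue *b)
{
        if (a->lockp == b->lockp) {     /* same underlying lock: take it once */
                pthread_mutex_lock(a->lockp);
                return;
        }
        if (a->lockp < b->lockp) {      /* order by the lock, not by a/b */
                pthread_mutex_lock(a->lockp);
                pthread_mutex_lock(b->lockp);
        } else {
                pthread_mutex_lock(b->lockp);
                pthread_mutex_lock(a->lockp);
        }
}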

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 76fee56..71bb71f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2078,7 +2078,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
raw_spin_lock(rq_lockp(rq1));
__acquire(rq2->lock); /* Fake it out ;) */
} else {
- if (rq1 < rq2) {
+ if (rq_lockp(rq1) < rq_lockp(rq2)) {
raw_spin_lock(rq_lockp(rq1));
raw_spin_lock_nested(rq_lockp(rq2), SINGLE_DEPTH_NESTING);
} else {
--
2.7.4


2019-03-20 02:33:43

by Subhra Mazumdar

Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access


On 3/18/19 8:41 AM, Julien Desfossez wrote:
> The case where we try to acquire the lock on 2 runqueues belonging to 2
> different cores requires the rq_lockp wrapper as well otherwise we
> frequently deadlock in there.
>
> This fixes the crash reported in
> [email protected]
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 76fee56..71bb71f 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2078,7 +2078,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
> raw_spin_lock(rq_lockp(rq1));
> __acquire(rq2->lock); /* Fake it out ;) */
> } else {
> - if (rq1 < rq2) {
> + if (rq_lockp(rq1) < rq_lockp(rq2)) {
> raw_spin_lock(rq_lockp(rq1));
> raw_spin_lock_nested(rq_lockp(rq2), SINGLE_DEPTH_NESTING);
> } else {
With this fix and my previous NULL pointer fix, my stress tests are
surviving. I re-ran my 2 DB instance setup on a 44 core, 2 socket system,
putting each DB instance in a separate core scheduling group. The numbers
look much worse now.

users  baseline  %stdev  %idle  core_sched  %stdev %idle
16     1         0.3     66     -73.4%      136.8 82
24     1         1.6     54     -95.8%      133.2 81
32     1         1.5     42     -97.5%      124.3 89

I also notice that if I enable a bunch of debug configs related to mutexes,
spinlocks, lockdep etc. (which I did earlier to debug the deadlock), it
opens up a can of worms with multiple crashes.

2019-03-21 21:21:47

by Julien Desfossez

Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On Tue, Mar 19, 2019 at 10:31 PM Subhra Mazumdar <[email protected]>
wrote:
> On 3/18/19 8:41 AM, Julien Desfossez wrote:
> > The case where we try to acquire the lock on 2 runqueues belonging to 2
> > different cores requires the rq_lockp wrapper as well otherwise we
> > frequently deadlock in there.
> >
> > This fixes the crash reported in
> > [email protected]
> >
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 76fee56..71bb71f 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -2078,7 +2078,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
> > raw_spin_lock(rq_lockp(rq1));
> > __acquire(rq2->lock); /* Fake it out ;) */
> > } else {
> > - if (rq1 < rq2) {
> > + if (rq_lockp(rq1) < rq_lockp(rq2)) {
> > raw_spin_lock(rq_lockp(rq1));
> > raw_spin_lock_nested(rq_lockp(rq2), SINGLE_DEPTH_NESTING);
> > } else {
> With this fix and my previous NULL pointer fix my stress tests are
> surviving. I
> re-ran my 2 DB instance setup on 44 core 2 socket system by putting each DB
> instance in separate core scheduling group. The numbers look much worse
> now.
>
> users  baseline  %stdev  %idle  core_sched  %stdev %idle
> 16     1         0.3     66     -73.4%      136.8 82
> 24     1         1.6     54     -95.8%      133.2 81
> 32     1         1.5     42     -97.5%      124.3 89

We are also seeing a performance degradation of about 83% on the throughput
of 2 MySQL VMs under a stress test (12 vcpus, 32GB of RAM). The server has 2
NUMA nodes, each with 18 cores (so a total of 72 hardware threads). Each
MySQL VM is pinned to a different NUMA node. The clients for the stress
tests are running on a separate physical machine, each client runs 48 query
threads. Only the MySQL VMs use core scheduling (all vcpus and emulator
threads). Overall the server is 90% idle when the 2 VMs use core scheduling,
and 75% when they don’t.

The rate of preemption vs normal “switch out” is about 1% with and without
core scheduling enabled, but the overall rate of sched_switch is 5 times
higher without core scheduling which suggests some heavy contention in the
scheduling path.

On further investigation, we could see that the contention is mostly in the
way rq locks are taken. With this patchset, we lock the whole core if
cpu.tag is set for at least one cgroup. Due to this, __schedule() is more or
less serialized for the core and that attributes to the performance loss
that we are seeing. We also saw that newidle_balance() takes considerably
long time in load_balance() due to the rq spinlock contention. Do you think
it would help if the core-wide locking was only performed when absolutely
needed ?
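
For context, with this patchset rq_lockp() returns the core-wide lock
instead of the per-CPU one whenever core scheduling is enabled on that core,
roughly as follows (a reconstruction for illustration, not a verbatim quote
from the series):

static inline raw_spinlock_t *rq_lockp(struct rq *rq)
{
        if (sched_core_enabled(rq))             /* some sibling has a tagged task */
                return &rq->core->__lock;       /* one lock shared by all siblings */

        return &rq->__lock;
}

So every rq_lock()/rq_unlock() on any sibling of a tagged core serializes on
the same raw spinlock, which is the contention we are describing above.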

In terms of isolation, we measured the time a thread spends co-scheduled
with either a thread from the same group, the idle thread or a thread from
another group. This is what we see for 60 seconds of a specific busy VM
pinned to a whole NUMA node (all its threads):

no core scheduling:
- local neighbors (19.989 % of process runtime)
- idle neighbors (47.197 % of process runtime)
- foreign neighbors (22.811 % of process runtime)

core scheduling enabled:
- local neighbors (6.763 % of process runtime)
- idle neighbors (93.064 % of process runtime)
- foreign neighbors (0.236 % of process runtime)

As a separate test, we tried to pin all the vcpu threads to a set of cores
(6 cores for 12 vcpus):
no core scheduling:
- local neighbors (88.299 % of process runtime)
- idle neighbors (9.334 % of process runtime)
- foreign neighbors (0.197 % of process runtime)

core scheduling enabled:
- local neighbors (84.570 % of process runtime)
- idle neighbors (15.195 % of process runtime)
- foreign neighbors (0.257 % of process runtime)

Thanks,

Julien

2019-03-22 13:36:04

by Peter Zijlstra

Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On Thu, Mar 21, 2019 at 05:20:17PM -0400, Julien Desfossez wrote:
> On further investigation, we could see that the contention is mostly in the
> way rq locks are taken. With this patchset, we lock the whole core if
> cpu.tag is set for at least one cgroup. Due to this, __schedule() is more or
> less serialized for the core and that attributes to the performance loss
> that we are seeing. We also saw that newidle_balance() takes considerably
> long time in load_balance() due to the rq spinlock contention. Do you think
> it would help if the core-wide locking was only performed when absolutely
> needed ?

Something like that could be done, but then you end up with 2 locks,
something which I was hoping to avoid.

Basically you keep rq->lock as it exists today, but add something like
rq->core->core_lock; you then have to take that second lock (nested
under rq->lock) for every scheduling action involving a tagged task.
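
Spelled out, the nesting would look something like this (purely an
illustrative sketch, not part of the patch below; tagged_sched_action() is a
made-up name):

static void tagged_sched_action(struct rq *rq, struct task_struct *p)
{
	raw_spin_lock(rq_lockp(rq));		/* per-CPU rq lock, as today */
	if (p->core_cookie) {
		/* core-wide lock, nested under the rq lock */
		raw_spin_lock(&rq->core->core_lock);
		/* ... touch rq->core_tree and other core-wide state ... */
		raw_spin_unlock(&rq->core->core_lock);
	}
	raw_spin_unlock(rq_lockp(rq));
}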

It makes things complicated though, because now my head hurts thinking
about pick_next_task().

(this can obviously do away with the whole rq->lock wrappery)

Also, completely untested..

---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -146,6 +146,8 @@ void sched_core_enqueue(struct rq *rq, s
 	if (!p->core_cookie)
 		return;

+	raw_spin_lock(&rq->core->core_lock);
+
 	node = &rq->core_tree.rb_node;
 	parent = *node;

@@ -161,6 +163,8 @@

 	rb_link_node(&p->core_node, parent, node);
 	rb_insert_color(&p->core_node, &rq->core_tree);
+
+	raw_spin_unlock(&rq->core->core_lock);
 }

 void sched_core_dequeue(struct rq *rq, struct task_struct *p)
@@ -170,7 +174,9 @@ void sched_core_dequeue(struct rq *rq, s
 	if (!p->core_cookie)
 		return;

+	raw_spin_lock(&rq->core->core_lock);
 	rb_erase(&p->core_node, &rq->core_tree);
+	raw_spin_unlock(&rq->core->core_lock);
 }

 /*
@@ -181,6 +187,8 @@ struct task_struct *sched_core_find(stru
 	struct rb_node *node = rq->core_tree.rb_node;
 	struct task_struct *node_task, *match;

+	lockdep_assert_held(&rq->core->core_lock);
+
 	/*
 	 * The idle task always matches any cookie!
 	 */
@@ -206,6 +214,8 @@ struct task_struct *sched_core_next(stru
 {
 	struct rb_node *node = &p->core_node;

+	lockdep_assert_held(&rq->core->core_lock);
+
 	node = rb_next(node);
 	if (!node)
 		return NULL;
@@ -3685,6 +3695,8 @@ pick_next_task(struct rq *rq, struct tas
 	 * If there were no {en,de}queues since we picked (IOW, the task
 	 * pointers are all still valid), and we haven't scheduled the last
 	 * pick yet, do so now.
+	 *
+	 * XXX probably OK without ->core_lock
 	 */
 	if (rq->core->core_pick_seq == rq->core->core_task_seq &&
 	    rq->core->core_pick_seq != rq->core_sched_seq) {
@@ -3710,6 +3722,20 @@ pick_next_task(struct rq *rq, struct tas
 	if (!rq->nr_running)
 		newidle_balance(rq, rf);

+	if (!rq->core->core_cookie) {
+		for_each_class(class) {
+			next = pick_task(rq, class, NULL);
+			if (next)
+				break;
+		}
+
+		if (!next->core_cookie) {
+			set_next_task(rq, next);
+			return next;
+		}
+	}
+
+	raw_spin_lock(&rq->core->core_lock);
 	cpu = cpu_of(rq);
 	smt_mask = cpu_smt_mask(cpu);

@@ -3849,6 +3875,7 @@ next_class:;
 	trace_printk("picked: %s/%d %lx\n", next->comm, next->pid, next->core_cookie);

 done:
+	raw_spin_unlock(&rq->core->core_lock);
 	set_next_task(rq, next);
 	return next;
 }
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -966,6 +966,7 @@ struct rq {
 	struct rb_root		core_tree;

 	/* shared state */
+	raw_spinlock_t		core_lock;
 	unsigned int		core_task_seq;
 	unsigned int		core_pick_seq;
 	unsigned long		core_cookie;
@@ -1007,9 +1008,6 @@ static inline bool sched_core_enabled(st

 static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 {
-	if (sched_core_enabled(rq))
-		return &rq->core->__lock;
-
 	return &rq->__lock;
 }


2019-03-22 21:00:53

by Julien Desfossez

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On Fri, Mar 22, 2019 at 9:34 AM Peter Zijlstra <[email protected]> wrote:
> On Thu, Mar 21, 2019 at 05:20:17PM -0400, Julien Desfossez wrote:
> > On further investigation, we could see that the contention is mostly in
> > the way rq locks are taken. With this patchset, we lock the whole core if
> > cpu.tag is set for at least one cgroup. Because of this, __schedule() is
> > more or less serialized for the core, and that contributes to the
> > performance loss we are seeing. We also saw that newidle_balance() takes
> > a considerably long time in load_balance() due to the rq spinlock
> > contention. Do you think it would help if the core-wide locking were only
> > performed when absolutely needed?
>
> Something like that could be done, but then you end up with 2 locks,
> something which I was hoping to avoid.
>
> Basically you keep rq->lock as it exists today, but add something like
> rq->core->core_lock, you then have to take that second lock (nested
> under rq->lock) for every scheduling action involving a tagged task.
>
> It makes things complicated though, because now my head hurts thinking
> about pick_next_task().
>
> (this can obviously do away with the whole rq->lock wrappery)
>
> Also, completely untested..

We tried it and it dies within 30ms of enabling the tag on 2 VMs :-)
Now, after trying to debug this, my head hurts as well!

We'll continue trying to figure this out, but if you want to take a look,
the full dmesg is here: https://paste.debian.net/plainh/0b8f87f3

Thanks,

Julien

2019-03-22 23:29:34

by Tim Chen

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On 3/19/19 7:29 PM, Subhra Mazumdar wrote:
>
> On 3/18/19 8:41 AM, Julien Desfossez wrote:
>> The case where we try to acquire the lock on 2 runqueues belonging to 2
>> different cores requires the rq_lockp wrapper as well otherwise we
>> frequently deadlock in there.
>>
>> This fixes the crash reported in
>> [email protected]
>>
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 76fee56..71bb71f 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -2078,7 +2078,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
>>           raw_spin_lock(rq_lockp(rq1));
>>           __acquire(rq2->lock);    /* Fake it out ;) */
>>       } else {
>> -        if (rq1 < rq2) {
>> +        if (rq_lockp(rq1) < rq_lockp(rq2)) {
>>               raw_spin_lock(rq_lockp(rq1));
>>               raw_spin_lock_nested(rq_lockp(rq2), SINGLE_DEPTH_NESTING);
>>           } else {


Pawan was seeing occasional crashes and lockups that are avoided by doing the following.
We're doing some more tracing to see why pick_next_entity() is returning NULL.

Tim

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5349ebedc645..4c7f353b8900 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7031,6 +7031,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 		}

 		se = pick_next_entity(cfs_rq, curr);
+		if (!se)
+			return NULL;
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);

@@ -7070,6 +7072,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf

 	do {
 		se = pick_next_entity(cfs_rq, NULL);
+		if (!se)
+			return NULL;
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);

2019-03-22 23:45:18

by Tim Chen

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On 3/22/19 4:28 PM, Tim Chen wrote:
> On 3/19/19 7:29 PM, Subhra Mazumdar wrote:
>>
>> On 3/18/19 8:41 AM, Julien Desfossez wrote:
>>> The case where we try to acquire the lock on 2 runqueues belonging to 2
>>> different cores requires the rq_lockp wrapper as well otherwise we
>>> frequently deadlock in there.
>>>
>>> This fixes the crash reported in
>>> [email protected]
>>>
>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>>> index 76fee56..71bb71f 100644
>>> --- a/kernel/sched/sched.h
>>> +++ b/kernel/sched/sched.h
>>> @@ -2078,7 +2078,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
>>>           raw_spin_lock(rq_lockp(rq1));
>>>           __acquire(rq2->lock);    /* Fake it out ;) */
>>>       } else {
>>> -        if (rq1 < rq2) {
>>> +        if (rq_lockp(rq1) < rq_lockp(rq2)) {
>>>               raw_spin_lock(rq_lockp(rq1));
>>>               raw_spin_lock_nested(rq_lockp(rq2), SINGLE_DEPTH_NESTING);
>>>           } else {
>
>
> Pawan was seeing occasional crashes and lockups that are avoided by doing the following.
> We're doing some more tracing to see why pick_next_entity() is returning NULL.
>

We found the root cause was a missing chunk when we ported Subhra's fix to
pick_next_entity():

 	 * Someone really wants this to run. If it's not unfair, run it.
 	 */
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+	if (left && cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)

That fixes the problem of pick_next_entity() returning NULL. Sorry for the noise.

Tim

2019-03-23 00:11:59

by Subhra Mazumdar

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access


On 3/21/19 2:20 PM, Julien Desfossez wrote:
> On Tue, Mar 19, 2019 at 10:31 PM Subhra Mazumdar <[email protected]>
> wrote:
>> On 3/18/19 8:41 AM, Julien Desfossez wrote:
>>
> On further investigation, we could see that the contention is mostly in the
> way rq locks are taken. With this patchset, we lock the whole core if
> cpu.tag is set for at least one cgroup. Due to this, __schedule() is more or
> less serialized for the core and that attributes to the performance loss
> that we are seeing. We also saw that newidle_balance() takes considerably
> long time in load_balance() due to the rq spinlock contention. Do you think
> it would help if the core-wide locking was only performed when absolutely
> needed ?
>
Is the core-wide lock primarily responsible for the regression? I ran up to
patch 12, which also has the core-wide lock for tagged cgroups and also calls
newidle_balance() from pick_next_task(). I don't see any regression. Of
course the core sched version of pick_next_task() may be doing more, but
compared with __pick_next_task() it doesn't look too horrible.

2019-03-27 01:07:05

by Subhra Mazumdar

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access


On 3/22/19 5:06 PM, Subhra Mazumdar wrote:
>
> On 3/21/19 2:20 PM, Julien Desfossez wrote:
>> On Tue, Mar 19, 2019 at 10:31 PM Subhra Mazumdar
>> <[email protected]>
>> wrote:
>>> On 3/18/19 8:41 AM, Julien Desfossez wrote:
>>>
>> On further investigation, we could see that the contention is mostly
>> in the
>> way rq locks are taken. With this patchset, we lock the whole core if
>> cpu.tag is set for at least one cgroup. Due to this, __schedule() is
>> more or
>> less serialized for the core and that attributes to the performance loss
>> that we are seeing. We also saw that newidle_balance() takes
>> considerably
>> long time in load_balance() due to the rq spinlock contention. Do you
>> think
>> it would help if the core-wide locking was only performed when
>> absolutely
>> needed ?
>>
> Is the core-wide lock primarily responsible for the regression? I ran up to
> patch 12, which also has the core-wide lock for tagged cgroups and also
> calls newidle_balance() from pick_next_task(). I don't see any regression.
> Of course the core sched version of pick_next_task() may be doing more, but
> compared with __pick_next_task() it doesn't look too horrible.
I gathered some data with only 1 DB instance running (which also has a 52%
slowdown). Following are the numbers of pick_next_task() calls and their avg
cost for patch 12 and patch 15. The total number of calls is similar, but the
avg cost (in us) has more than doubled. For both patches I had put the DB
instance into a cpu-tagged cgroup.

                             patch12        patch15
count pick_next_task         62317898       58925395
avg cost pick_next_task      0.6566323209   1.4223810108

2019-03-29 13:37:14

by Julien Desfossez

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On Fri, Mar 22, 2019 at 8:09 PM Subhra Mazumdar <[email protected]>
wrote:
> Is the core-wide lock primarily responsible for the regression? I ran up to
> patch 12, which also has the core-wide lock for tagged cgroups and also
> calls newidle_balance() from pick_next_task(). I don't see any regression.
> Of course the core sched version of pick_next_task() may be doing more, but
> compared with __pick_next_task() it doesn't look too horrible.

On further testing and investigation, we also agree that spinlock contention
is not the major cause of the regression, but we believe it is still one of
the major contributing factors to this performance loss.

To reduce the scope of the investigation of the performance regression, we
designed a couple of smaller test cases (compared to big VMs running complex
benchmarks) and it turns out the test case that is most impacted is a simple
disk write-intensive case (up to 99% performance drop). CPU-intensive and
scheduler-intensive tests (perf bench sched) behave pretty well.

On the same server we used before (2x18 cores, 72 hardware threads), with
all the non-essential services disabled, we set up a cpuset of 4 cores (8
hardware threads) and ran sysbench fileio on a dedicated drive (no RAID).
With sysbench running with 8 threads in this cpuset without core scheduling,
we get about 155.23 MiB/s in sequential write. If we enable the tag, we drop
to 0.25 MiB/s. Interestingly, even with 4 threads, we see the same kind of
performance drop.

Command used:

sysbench --test=fileio prepare
cgexec -g cpu,cpuset:test sysbench --threads=4 --test=fileio \
--file-test-mode=seqwr run

If we run this with the data in a ramdisk instead of a real drive, we don't
notice any drop. The amount of performance drop varies a bit depending on
the machine, but it is always significant.

We spent a lot of time in the trace and noticed that, a couple of times
during every run, the sysbench worker threads wait for IO for up to 4
seconds; all the threads wait for the same duration, and during that time we
don't see any block-related softirq coming in. As soon as the interrupt is
processed, sysbench gets woken up immediately. This long wait never happens
without core scheduling. So we are trying to see if there is a place where
interrupts are disabled for an extended period of time. The irqsoff tracer
doesn't seem to pick it up.

Any thoughts about that?

2019-03-29 22:29:13

by Subhra Mazumdar

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access


On 3/29/19 6:35 AM, Julien Desfossez wrote:
> On Fri, Mar 22, 2019 at 8:09 PM Subhra Mazumdar <[email protected]>
> wrote:
>> Is the core wide lock primarily responsible for the regression? I ran
>> upto patch
>> 12 which also has the core wide lock for tagged cgroups and also calls
>> newidle_balance() from pick_next_task(). I don't see any regression. Of
>> course
>> the core sched version of pick_next_task() may be doing more but
>> comparing with
>> the __pick_next_task() it doesn't look too horrible.
> On further testing and investigation, we also agree that spinlock contention
> is not the major cause for the regression, but we feel that it should be one
> of the major contributing factors to this performance loss.
>
>
I finally did some code bisection and found the following lines are basically
responsible for the regression. Commenting them out, I don't see the
regression. Can you confirm? I have yet to figure out whether this is needed
for the correctness of core scheduling and, if so, whether we can do it better.

-------->8-------------

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe3918c..3b3388a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3741,8 +3741,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 				 * If there weren't no cookies; we don't need
 				 * to bother with the other siblings.
 				 */
-				if (i == cpu && !rq->core->core_cookie)
-					goto next_class;
+				//if (i == cpu && !rq->core->core_cookie)
+				//	goto next_class;

 				continue;
 			}

2019-04-01 21:40:34

by Subhra Mazumdar

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access


On 3/29/19 3:23 PM, Subhra Mazumdar wrote:
>
> On 3/29/19 6:35 AM, Julien Desfossez wrote:
>> On Fri, Mar 22, 2019 at 8:09 PM Subhra Mazumdar
>> <[email protected]>
>> wrote:
>>> Is the core wide lock primarily responsible for the regression? I ran
>>> upto patch
>>> 12 which also has the core wide lock for tagged cgroups and also calls
>>> newidle_balance() from pick_next_task(). I don't see any
>>> regression.  Of
>>> course
>>> the core sched version of pick_next_task() may be doing more but
>>> comparing with
>>> the __pick_next_task() it doesn't look too horrible.
>> On further testing and investigation, we also agree that spinlock
>> contention
>> is not the major cause for the regression, but we feel that it should
>> be one
>> of the major contributing factors to this performance loss.
>>
>>
> I finally did some code bisection and found the following lines are
> basically responsible for the regression. Commenting them out I don't see
> the regressions. Can you confirm? I am yet to figure if this is needed
> for
> the correctness of core scheduling and if so can we do this better?
>
> -------->8-------------
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index fe3918c..3b3388a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3741,8 +3741,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  				 * If there weren't no cookies; we don't need
>  				 * to bother with the other siblings.
>  				 */
> -				if (i == cpu && !rq->core->core_cookie)
> -					goto next_class;
> +				//if (i == cpu && !rq->core->core_cookie)
> +				//	goto next_class;
>
> 				continue;
> 			}
AFAICT this condition is not needed for correctness, as cookie matching will
still be enforced. Peter, any thoughts? I get the following numbers with 1 DB
and 2 DB instances.

1 DB instance
users  baseline  %idle  core_sched  %idle
16     1         84     -5.5%       84
24     1         76     -5%         76
32     1         69     -0.45%      69

2 DB instance
users  baseline  %idle  core_sched  %idle
16     1         66     -23.8%      69
24     1         54     -3.1%       57
32     1         42     -21.1%      48

2019-04-02 07:59:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

On Fri, Mar 29, 2019 at 03:23:14PM -0700, Subhra Mazumdar wrote:
>
> On 3/29/19 6:35 AM, Julien Desfossez wrote:
> > On Fri, Mar 22, 2019 at 8:09 PM Subhra Mazumdar <[email protected]>
> > wrote:
> > > Is the core wide lock primarily responsible for the regression? I ran
> > > upto patch
> > > 12 which also has the core wide lock for tagged cgroups and also calls
> > > newidle_balance() from pick_next_task(). I don't see any regression. Of
> > > course
> > > the core sched version of pick_next_task() may be doing more but
> > > comparing with
> > > the __pick_next_task() it doesn't look too horrible.
> > On further testing and investigation, we also agree that spinlock contention
> > is not the major cause for the regression, but we feel that it should be one
> > of the major contributing factors to this performance loss.
> >
> >
> I finally did some code bisection and found the following lines are
> basically responsible for the regression. Commenting them out I don't see
> the regressions. Can you confirm? I am yet to figure if this is needed for
> the correctness of core scheduling and if so can we do this better?

It was meant to be an optimization; specifically, when no cookie was
set, don't bother to schedule the sibling(s).
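
For reference, the check being commented out sits inside pick_next_task()'s
per-sibling picking loop and reads as below (the trailing comment is added
here for clarity, it is not in the patch):

	if (i == cpu && !rq->core->core_cookie)
		goto next_class;	/* no cookies anywhere on this core:
					 * don't bother picking for the other
					 * siblings, move on to the next class */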

> -------->8-------------
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index fe3918c..3b3388a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3741,8 +3741,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  				 * If there weren't no cookies; we don't need
>  				 * to bother with the other siblings.
>  				 */
> -				if (i == cpu && !rq->core->core_cookie)
> -					goto next_class;
> +				//if (i == cpu && !rq->core->core_cookie)
> +				//	goto next_class;
>
> 				continue;
> 			}

2019-04-03 20:17:54

by Julien Desfossez

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

> >>>Is the core wide lock primarily responsible for the regression? I ran
> >>>upto patch
> >>>12 which also has the core wide lock for tagged cgroups and also calls
> >>>newidle_balance() from pick_next_task(). I don't see any regression. 
> >>>Of
> >>>course
> >>>the core sched version of pick_next_task() may be doing more but
> >>>comparing with
> >>>the __pick_next_task() it doesn't look too horrible.
> >>On further testing and investigation, we also agree that spinlock
> >>contention
> >>is not the major cause for the regression, but we feel that it should be
> >>one
> >>of the major contributing factors to this performance loss.
> >>
> >>
> >I finally did some code bisection and found the following lines are
> >basically responsible for the regression. Commenting them out I don't see
> >the regressions. Can you confirm? I am yet to figure if this is needed for
> >the correctness of core scheduling and if so can we do this better?
> >
> >-------->8-------------
> >
> >diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >index fe3918c..3b3388a 100644
> >--- a/kernel/sched/core.c
> >+++ b/kernel/sched/core.c
> >@@ -3741,8 +3741,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > 				 * If there weren't no cookies; we don't need
> > 				 * to bother with the other siblings.
> > 				 */
> >-				if (i == cpu && !rq->core->core_cookie)
> >-					goto next_class;
> >+				//if (i == cpu && !rq->core->core_cookie)
> >+				//	goto next_class;
> >
> >				continue;
> >			}
> AFAICT this condition is not needed for correctness, as cookie matching will
> still be enforced. Peter, any thoughts? I get the following numbers with 1 DB
> and 2 DB instances.
>
> 1 DB instance
> users  baseline  %idle  core_sched  %idle
> 16     1         84     -5.5%       84
> 24     1         76     -5%         76
> 32     1         69     -0.45%      69
>
> 2 DB instance
> users  baseline  %idle  core_sched  %idle
> 16     1         66     -23.8%      69
> 24     1         54     -3.1%       57
> 32     1         42     -21.1%      48

We tried commenting out those lines and it doesn't seem to get rid of the
performance regression we are seeing.
Can you elaborate a bit more on the test you are performing and what kind of
resources it uses?
Can you also try to replicate our test and see if you see the same problem?

cgcreate -g cpu,cpuset:set1
cat /sys/devices/system/cpu/cpu{0,2,4,6}/topology/thread_siblings_list
0,36
2,38
4,40
6,42

echo "0,2,4,6,36,38,40,42" | sudo tee /sys/fs/cgroup/cpuset/set1/cpuset.cpus
echo 0 | sudo tee /sys/fs/cgroup/cpuset/set1/cpuset.mems

echo 1 | sudo tee /sys/fs/cgroup/cpu,cpuacct/set1/cpu.tag

sysbench --test=fileio prepare
cgexec -g cpu,cpuset:set1 sysbench --threads=4 --test=fileio \
--file-test-mode=seqwr run

The reason we create a cpuset is to narrow down the investigation to just 4
cores on a highly powerful machine. It might not be needed if testing on a
smaller machine.

Julien

2019-04-05 01:35:38

by Subhra Mazumdar

[permalink] [raw]
Subject: Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access


> We tried commenting out those lines and it doesn't seem to get rid of the
> performance regression we are seeing.
> Can you elaborate a bit more on the test you are performing and what kind of
> resources it uses?
I am running 1 and 2 Oracle DB instances, each running a TPC-C workload. The
clients driving the instances also run on the same node. Each server-client
pair is put in its own cpu group and tagged.
> Can you also try to replicate our test and see if you see the same problem ?
>
> cgcreate -g cpu,cpuset:set1
> cat /sys/devices/system/cpu/cpu{0,2,4,6}/topology/thread_siblings_list
> 0,36
> 2,38
> 4,40
> 6,42
>
> echo "0,2,4,6,36,38,40,42" | sudo tee /sys/fs/cgroup/cpuset/set1/cpuset.cpus
> echo 0 | sudo tee /sys/fs/cgroup/cpuset/set1/cpuset.mems
>
> echo 1 | sudo tee /sys/fs/cgroup/cpu,cpuacct/set1/cpu.tag
>
> sysbench --test=fileio prepare
> cgexec -g cpu,cpuset:set1 sysbench --threads=4 --test=fileio \
> --file-test-mode=seqwr run
>
> The reason we create a cpuset is to narrow down the investigation to just 4
> cores on a highly powerful machine. It might not be needed if testing on a
> smaller machine.
With this sysbench test I am not seeing any improvement from removing the
condition. Also, with hackbench I found it makes no difference, but that has
a much lower regression to begin with (18%).
>
> Julien