2020-10-23 10:30:00

by Peter Zijlstra

Subject: [PATCH v4 02/19] sched: Fix balance_callback()

The intent of balance_callback() has always been to delay executing
balancing operations until the end of the current rq->lock section.
This is because balance operations must often drop rq->lock, and that
isn't safe in general.

However, as noted by Scott, there were a few holes in that scheme:
balance_callback() was called after rq->lock had been dropped, which means
another CPU could interleave and touch the callback list.
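
To make the window concrete, a rough sketch of one interleaving the old
flow allowed (call sites simplified; assume CPU0 operates on CPU1's rq,
e.g. sched_setscheduler() on a task running there):

  CPU0 (remote)                         CPU1 (owns rq)

  lock(rq->lock)
  queue_balance_callback(rq, ...)
  unlock(rq->lock)
                                        __schedule()
                                          unlock(rq->lock)
                                          balance_callback(rq)
                                            lock(rq->lock)
                                            /* runs CPU0's callback */
                                            unlock(rq->lock)
  balance_callback(rq)
    /* nothing left to run */

CPU0's callback thus ran before CPU0 reached the point where it meant to
run it, and from another CPU's lock section.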

Rework the code to call the balance callbacks before dropping rq->lock
where possible, and otherwise splice the balance list onto a local
stack.

This guarantees that the balance list must be empty when we take
rq->lock. IOW, we'll only ever run our own balance callbacks.
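
Where the callbacks must not run until after the lock section has ended,
the pattern becomes roughly the following (condensed from the
__sched_setscheduler() hunk below; error paths and the pi/cpuset handling
elided):

	struct callback_head *head;

	preempt_disable();			/* keep rq from going away on us */
	head = splice_balance_callbacks(rq);	/* rq->lock held: detach the list */
	task_rq_unlock(rq, p, &rf);

	/* ... adjust the PI chain ... */

	balance_callbacks(rq, head);		/* re-takes rq->lock only if head != NULL */
	preempt_enable();

The remaining call sites touched here (__schedule(), rt_mutex_setprio(),
finish_lock_switch()) instead run __balance_callbacks(rq) directly, right
before dropping rq->lock.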

Reported-by: Scott Wood <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 119 ++++++++++++++++++++++++++++++++-------------------
kernel/sched/sched.h | 3 +
2 files changed, 78 insertions(+), 44 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3494,6 +3494,69 @@ static inline void finish_task(struct ta
#endif
}

+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+ void (*func)(struct rq *rq);
+ struct callback_head *next;
+
+ lockdep_assert_held(&rq->lock);
+
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+
+ func(rq);
+ }
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+ struct callback_head *head = rq->balance_callback;
+
+ lockdep_assert_held(&rq->lock);
+ if (head)
+ rq->balance_callback = NULL;
+
+ return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+ do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+ unsigned long flags;
+
+ if (unlikely(head)) {
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ do_balance_callbacks(rq, head);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+#endif
+
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -3519,6 +3582,7 @@ static inline void finish_lock_switch(st
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+ __balance_callbacks(rq);
raw_spin_unlock_irq(&rq->lock);
}

@@ -3660,43 +3724,6 @@ static struct rq *finish_task_switch(str
return rq;
}

-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
- struct callback_head *head, *next;
- void (*func)(struct rq *rq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- head = rq->balance_callback;
- rq->balance_callback = NULL;
- while (head) {
- func = (void (*)(struct rq *))head->func;
- next = head->next;
- head->next = NULL;
- head = next;
-
- func(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
- if (unlikely(rq->balance_callback))
- __balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
@@ -3716,7 +3743,6 @@ asmlinkage __visible void schedule_tail(
*/

rq = finish_task_switch(prev);
- balance_callback(rq);
preempt_enable();

if (current->set_child_tid)
@@ -4532,10 +4558,11 @@ static void __sched notrace __schedule(b
rq = context_switch(rq, prev, next, &rf);
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- rq_unlock_irq(rq, &rf);
- }

- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ }
}

void __noreturn do_task_dead(void)
@@ -4946,9 +4973,11 @@ void rt_mutex_setprio(struct task_struct
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
- __task_rq_unlock(rq, &rf);

- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_unlock(&rq->lock);
+
preempt_enable();
}
#else
@@ -5222,6 +5251,7 @@ static int __sched_setscheduler(struct t
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
const struct sched_class *prev_class;
+ struct callback_head *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5460,6 +5490,7 @@ static int __sched_setscheduler(struct t

/* Avoid rq from going away on us: */
preempt_disable();
+ head = splice_balance_callbacks(rq);
task_rq_unlock(rq, p, &rf);

if (pi) {
@@ -5468,7 +5499,7 @@ static int __sched_setscheduler(struct t
}

/* Run balance callbacks after we've adjusted the PI chain: */
- balance_callback(rq);
+ balance_callbacks(rq, head);
preempt_enable();

return 0;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1221,6 +1221,9 @@ static inline void rq_pin_lock(struct rq
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
#endif
+#ifdef CONFIG_SMP
+ SCHED_WARN_ON(rq->balance_callback);
+#endif
}

static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)



2020-11-11 08:25:54

by tip-bot2 for Peter Zijlstra

Subject: [tip: sched/core] sched: Fix balance_callback()

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 565790d28b1e33ee2f77bad5348b99f6dfc366fd
Gitweb: https://git.kernel.org/tip/565790d28b1e33ee2f77bad5348b99f6dfc366fd
Author: Peter Zijlstra <[email protected]>
AuthorDate: Mon, 11 May 2020 14:13:00 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 10 Nov 2020 18:38:57 +01:00

sched: Fix balance_callback()

The intent of balance_callback() has always been to delay executing
balancing operations until the end of the current rq->lock section.
This is because balance operations must often drop rq->lock, and that
isn't safe in general.

However, as noted by Scott, there were a few holes in that scheme:
balance_callback() was called after rq->lock had been dropped, which means
another CPU could interleave and touch the callback list.

Rework the code to call the balance callbacks before dropping rq->lock
where possible, and otherwise splice the balance list onto a local
stack.

This guarantees that the balance list must be empty when we take
rq->lock. IOW, we'll only ever run our own balance callbacks.

Reported-by: Scott Wood <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Reviewed-by: Daniel Bristot de Oliveira <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/core.c | 119 ++++++++++++++++++++++++++----------------
kernel/sched/sched.h | 3 +-
2 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e24104..0196a3f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3485,6 +3485,69 @@ static inline void finish_task(struct task_struct *prev)
#endif
}

+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+ void (*func)(struct rq *rq);
+ struct callback_head *next;
+
+ lockdep_assert_held(&rq->lock);
+
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+
+ func(rq);
+ }
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+ struct callback_head *head = rq->balance_callback;
+
+ lockdep_assert_held(&rq->lock);
+ if (head)
+ rq->balance_callback = NULL;
+
+ return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+ do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+ unsigned long flags;
+
+ if (unlikely(head)) {
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ do_balance_callbacks(rq, head);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+#endif
+
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -3510,6 +3573,7 @@ static inline void finish_lock_switch(struct rq *rq)
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+ __balance_callbacks(rq);
raw_spin_unlock_irq(&rq->lock);
}

@@ -3651,43 +3715,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
return rq;
}

-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
- struct callback_head *head, *next;
- void (*func)(struct rq *rq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- head = rq->balance_callback;
- rq->balance_callback = NULL;
- while (head) {
- func = (void (*)(struct rq *))head->func;
- next = head->next;
- head->next = NULL;
- head = next;
-
- func(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
- if (unlikely(rq->balance_callback))
- __balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
@@ -3707,7 +3734,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/

rq = finish_task_switch(prev);
- balance_callback(rq);
preempt_enable();

if (current->set_child_tid)
@@ -4523,10 +4549,11 @@ static void __sched notrace __schedule(bool preempt)
rq = context_switch(rq, prev, next, &rf);
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- rq_unlock_irq(rq, &rf);
- }

- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ }
}

void __noreturn do_task_dead(void)
@@ -4937,9 +4964,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
- __task_rq_unlock(rq, &rf);

- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_unlock(&rq->lock);
+
preempt_enable();
}
#else
@@ -5213,6 +5242,7 @@ static int __sched_setscheduler(struct task_struct *p,
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
const struct sched_class *prev_class;
+ struct callback_head *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5451,6 +5481,7 @@ change:

/* Avoid rq from going away on us: */
preempt_disable();
+ head = splice_balance_callbacks(rq);
task_rq_unlock(rq, p, &rf);

if (pi) {
@@ -5459,7 +5490,7 @@ change:
}

/* Run balance callbacks after we've adjusted the PI chain: */
- balance_callback(rq);
+ balance_callbacks(rq, head);
preempt_enable();

return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfc..738a00b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1221,6 +1221,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
#endif
+#ifdef CONFIG_SMP
+ SCHED_WARN_ON(rq->balance_callback);
+#endif
}

static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)

2020-11-11 20:36:38

by Paul Bolle

Subject: Re: [tip: sched/core] sched: Fix balance_callback()

tip-bot2 for Peter Zijlstra wrote on Wed 11-11-2020 at 08:23 [+0000]:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> [...]
> +static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
> +{
> + void (*func)(struct rq *rq);
> + struct callback_head *next;
> +
> + lockdep_assert_held(&rq->lock);
> +
> + while (head) {
> + func = (void (*)(struct rq *))head->func;
> + next = head->next;
> + head->next = NULL;
> + head = next;

Naive question: is there some subtle C issue that is avoided here by setting
head->next to NULL before moving on to the next entry?

(I know this piece of code only got copied around in this patch and this is
therefore not something that this patch actually introduced.)

> +
> + func(rq);
> + }
> +}

Thanks,


Paul Bolle

2020-11-11 20:48:13

by Peter Zijlstra

Subject: Re: [tip: sched/core] sched: Fix balance_callback()

On Wed, Nov 11, 2020 at 09:30:42PM +0100, Paul Bolle wrote:
> tip-bot2 for Peter Zijlstra wrote on Wed 11-11-2020 at 08:23 [+0000]:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > [...]
> > +static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
> > +{
> > + void (*func)(struct rq *rq);
> > + struct callback_head *next;
> > +
> > + lockdep_assert_held(&rq->lock);
> > +
> > + while (head) {
> > + func = (void (*)(struct rq *))head->func;
> > + next = head->next;
> > + head->next = NULL;
> > + head = next;
>
> Naive question: is there some subtle C issue that is avoided here by setting
> head->next to NULL before moving on to the next entry?
>
> (I know this piece of code only got copied around in this patch and this is
> therefore not something that this patch actually introduced.)

It's like list_del_init(); it zeros the entry before unlinking it.
queue_balance_callback() relies on this.
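
For reference, the producer side looked roughly like this at the time (a
paraphrase of queue_balance_callback() in kernel/sched/sched.h; this patch
doesn't touch it):

static inline void
queue_balance_callback(struct rq *rq,
		       struct callback_head *head,
		       void (*func)(struct rq *rq))
{
	lockdep_assert_held(&rq->lock);

	/*
	 * If this callback_head is still linked from an earlier queueing,
	 * don't queue it again; do_balance_callbacks() clears ->next when
	 * it runs the entry.
	 */
	if (unlikely(head->next))
		return;

	head->func = (void (*)(struct callback_head *))func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}

So zeroing head->next on the consumer side is what lets the same
callback_head be queued again later.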