2022-11-17 01:23:27

by Josh Don

Subject: [PATCH v3] sched: async unthrottling for cfs bandwidth

CFS bandwidth currently distributes new runtime and unthrottles cfs_rq's
inline in an hrtimer callback. Runtime distribution is a per-cpu
operation, and unthrottling is a per-cgroup operation, since a tg walk
is required. On machines with a large number of cpus and large cgroup
hierarchies, this cpus*cgroups work can be too much to do in a single
hrtimer callback: since IRQs are disabled, hard lockups may easily occur.
Specifically, we've found this scalability issue on configurations with
256 cpus, O(1000) cgroups in the hierarchy being throttled, and high
memory bandwidth usage.

To fix this, we can instead unthrottle cfs_rq's asynchronously via a
CSD. Each cpu is responsible for unthrottling itself, thus sharding the
total work more fairly across the system, and avoiding hard lockups.

Signed-off-by: Josh Don <[email protected]>
---
v2: Fixed !CONFIG_SMP build errors
v3: Removed the throttled_csd_count atomic

kernel/sched/fair.c | 127 ++++++++++++++++++++++++++++++++++++++++---
kernel/sched/sched.h | 8 +++
2 files changed, 126 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4cc56c91e06e..012ec9d03811 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5449,10 +5449,77 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq);
}

-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
+{
+ struct rq *rq = arg;
+ struct rq_flags rf;
+ struct cfs_rq *cursor, *tmp;
+
+ rq_lock(rq, &rf);
+
+ /*
+ * Since we hold rq lock we're safe from concurrent manipulation of
+ * the CSD list. However, this RCU critical section annotates the
+ * fact that we pair with sched_free_group_rcu(), so that we cannot
+ * race with group being freed in the window between removing it
+ * from the list and advancing to the next entry in the list.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+ throttled_csd_list) {
+ list_del_init(&cursor->throttled_csd_list);
+
+ if (cfs_rq_throttled(cursor))
+ unthrottle_cfs_rq(cursor);
+ }
+
+ rcu_read_unlock();
+
+ rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (rq == this_rq()) {
+ unthrottle_cfs_rq(cfs_rq);
+ return;
+ }
+
+ /* Already enqueued */
+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ return;
+
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+
+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ lockdep_assert_rq_held(rq_of(cfs_rq));
+
+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->runtime_remaining <= 0))
+ return;
+
+ __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
struct cfs_rq *cfs_rq;
u64 runtime, remaining = 1;
+ bool throttled = false;

rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5460,11 +5527,22 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;

+ if (!remaining) {
+ throttled = true;
+ break;
+ }
+
rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;

- /* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+ /* Already queued for async unthrottle */
+ if (!list_empty(&cfs_rq->throttled_csd_list))
+ goto next;
+#endif
+
+ /* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);

raw_spin_lock(&cfs_b->lock);
@@ -5479,15 +5557,14 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)

/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq(cfs_rq);
+ unthrottle_cfs_rq_async(cfs_rq);

next:
rq_unlock_irqrestore(rq, &rf);
-
- if (!remaining)
- break;
}
rcu_read_unlock();
+
+ return throttled;
}

/*
@@ -5532,10 +5609,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- distribute_cfs_runtime(cfs_b);
+ throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}

/*
@@ -5812,6 +5887,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
}

void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5828,12 +5906,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ int __maybe_unused i;
+
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;

hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
+
+ /*
+ * It is possible that we still have some cfs_rq's pending on a CSD
+ * list, though this race is very rare. In order for this to occur, we
+ * must have raced with the last task leaving the group while there
+ * exist throttled cfs_rq(s), and the period_timer must have queued the
+ * CSD item but the remote cpu has not yet processed it. To handle this,
+ * we can simply flush all pending CSD work inline here. We're
+ * guaranteed at this point that no additional cfs_rq of this group can
+ * join a CSD list.
+ */
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ if (list_empty(&rq->cfsb_csd_list))
+ continue;
+
+ local_irq_save(flags);
+ __cfsb_csd_unthrottle(rq);
+ local_irq_restore(flags);
+ }
+#endif
}

/*
@@ -12462,6 +12566,11 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
}

open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..b3d6e819127c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -645,6 +645,9 @@ struct cfs_rq {
int throttled;
int throttle_count;
struct list_head throttled_list;
+#ifdef CONFIG_SMP
+ struct list_head throttled_csd_list;
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1154,6 +1157,11 @@ struct rq {

/* Scratch cpumask to be temporarily used under rq_lock */
cpumask_var_t scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+ call_single_data_t cfsb_csd;
+ struct list_head cfsb_csd_list;
+#endif
};

#ifdef CONFIG_FAIR_GROUP_SCHED
--
2.38.1.431.g37b22c650d-goog



2022-11-18 14:26:17

by Peter Zijlstra

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Wed, Nov 16, 2022 at 04:54:18PM -0800, Josh Don wrote:

> +#ifdef CONFIG_SMP
> +static void __cfsb_csd_unthrottle(void *arg)
> +{
> + struct rq *rq = arg;
> + struct rq_flags rf;
> + struct cfs_rq *cursor, *tmp;
> +
> + rq_lock(rq, &rf);
> +
> + /*
> + * Since we hold rq lock we're safe from concurrent manipulation of
> + * the CSD list. However, this RCU critical section annotates the
> + * fact that we pair with sched_free_group_rcu(), so that we cannot
> + * race with group being freed in the window between removing it
> + * from the list and advancing to the next entry in the list.
> + */
> + rcu_read_lock();

preempt_disable() -- through rq->lock -- also holds off rcu. Strictly
speaking this here is superfluous. But if you want it as an annotation,
that's fine I suppose.

> +
> + list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
> + throttled_csd_list) {
> + list_del_init(&cursor->throttled_csd_list);
> +
> + if (cfs_rq_throttled(cursor))
> + unthrottle_cfs_rq(cursor);
> + }
> +
> + rcu_read_unlock();
> +
> + rq_unlock(rq, &rf);
> +}
> +
> +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
> +{
> + struct rq *rq = rq_of(cfs_rq);
> +
> + if (rq == this_rq()) {
> + unthrottle_cfs_rq(cfs_rq);
> + return;
> + }

Ideally we'd first queue all the remotes and then process local, but
given how all this is organized that doesn't seem trivial to arrange.

Maybe have this function return false when local and save that cfs_rq in
a local var to process again later, dunno, that might turn messy.

> +
> + /* Already enqueued */
> + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
> + return;
> +
> + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
> +
> + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);

Hurmph.. so I was expecting something like:

first = list_empty(&rq->cfsb_csd_list);
list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
if (first)
smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);

But I suppose I'm remembering the 'old' version. I don't think it is
broken as written. There's a very narrow window where you'll end up
sending a second IPI for naught, but meh.

> +}

Let me go queue this thing, we can always improve upon matters later.

2022-11-18 19:42:26

by Josh Don

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Fri, Nov 18, 2022 at 4:47 AM Peter Zijlstra <[email protected]> wrote:
>
> preempt_disable() -- through rq->lock -- also holds off rcu. Strictly
> speaking this here is superfluous. But if you want it as an annotation,
> that's fine I suppose.

Yep, I purely added this as extra annotation for future readers.

> Ideally we'd first queue all the remotes and then process local, but
> given how all this is organized that doesn't seem trivial to arrange.
>
> Maybe have this function return false when local and save that cfs_rq in
> a local var to process again later, dunno, that might turn messy.

Maybe something like this? Apologies for inline diff formatting.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 012ec9d03811..100dae6023da 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5520,12 +5520,15 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
struct cfs_rq *cfs_rq;
u64 runtime, remaining = 1;
bool throttled = false;
+ int this_cpu = smp_processor_id();
+ struct cfs_rq *local_unthrottle = NULL;
+ struct rq *rq;
+ struct rq_flags rf;

rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
- struct rq *rq = rq_of(cfs_rq);
- struct rq_flags rf;
+ rq = rq_of(cfs_rq);

if (!remaining) {
throttled = true;
@@ -5556,14 +5559,36 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
cfs_rq->runtime_remaining += runtime;

/* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq_async(cfs_rq);
+ if (cfs_rq->runtime_remaining > 0) {
+ if (cpu_of(rq) != this_cpu ||
+ SCHED_WARN_ON(local_unthrottle)) {
+ unthrottle_cfs_rq_async(cfs_rq);
+ } else {
+ local_unthrottle = cfs_rq;
+ }
+ } else {
+ throttled = true;
+ }

next:
rq_unlock_irqrestore(rq, &rf);
}
rcu_read_unlock();

+ /*
+ * We prefer to stage the async unthrottles of all the remote cpus
+ * before we do the inline unthrottle locally. Note that
+ * unthrottle_cfs_rq_async() on the local cpu is actually synchronous,
+ * but it includes extra WARNs to make sure the cfs_rq really is
+ * still throttled.
+ */
+ if (local_unthrottle) {
+ rq = cpu_rq(this_cpu);
+ rq_lock_irqsave(rq, &rf);
+ unthrottle_cfs_rq_async(local_unthrottle);
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
return throttled;
}

Note that one change we definitely want is the extra setting of
throttled = true in the case that cfs_rq->runtime_remaining <= 0, to
catch the case where we run out of runtime to distribute on the last
entity in the list.

> > +
> > + /* Already enqueued */
> > + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
> > + return;
> > +
> > + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
> > +
> > + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
>
> Hurmph.. so I was expecting something like:
>
> first = list_empty(&rq->cfsb_csd_list);
> list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
> if (first)
> smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
>
> But I suppose I'm remembering the 'old' version. I don't think it is
> broken as written. There's a very narrow window where you'll end up
> sending a second IPI for naught, but meh.

The CSD doesn't get unlocked until right before we call the func().
But you're right that that's a (very) narrow window for an extra IPI.
Please feel free to modify the patch with that diff if you like.

>
> > +}
>
> Let me go queue this thing, we can always improve upon matters later.

Thanks! Please add at least the extra assignment of 'throttled = true'
from the diff above, but feel free to squash both the diffs if it
makes sense to you.

2022-11-20 03:01:50

by Chengming Zhou

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On 2022/11/19 03:25, Josh Don wrote:
> On Fri, Nov 18, 2022 at 4:47 AM Peter Zijlstra <[email protected]> wrote:
>>
>> preempt_disable() -- through rq->lock -- also holds off rcu. Strictly
>> speaking this here is superfluous. But if you want it as an annotation,
>> that's fine I suppose.
>
> Yep, I purely added this as extra annotation for future readers.
>
>> Ideally we'd first queue all the remotes and then process local, but
>> given how all this is organized that doesn't seem trivial to arrange.
>>
>> Maybe have this function return false when local and save that cfs_rq in
>> a local var to process again later, dunno, that might turn messy.
>
> Maybe something like this? Apologies for inline diff formatting.
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 012ec9d03811..100dae6023da 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5520,12 +5520,15 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
> struct cfs_rq *cfs_rq;
> u64 runtime, remaining = 1;
> bool throttled = false;
> + int this_cpu = smp_processor_id();
> + struct cfs_rq *local_unthrottle = NULL;
> + struct rq *rq;
> + struct rq_flags rf;
>
> rcu_read_lock();
> list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
> throttled_list) {
> - struct rq *rq = rq_of(cfs_rq);
> - struct rq_flags rf;
> + rq = rq_of(cfs_rq);
>
> if (!remaining) {
> throttled = true;
> @@ -5556,14 +5559,36 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
> cfs_rq->runtime_remaining += runtime;
>
> /* we check whether we're throttled above */
> - if (cfs_rq->runtime_remaining > 0)
> - unthrottle_cfs_rq_async(cfs_rq);
> + if (cfs_rq->runtime_remaining > 0) {
> + if (cpu_of(rq) != this_cpu ||
> + SCHED_WARN_ON(local_unthrottle)) {
> + unthrottle_cfs_rq_async(cfs_rq);
> + } else {
> + local_unthrottle = cfs_rq;
> + }
> + } else {
> + throttled = true;
> + }

Hello,

I don't get the point why local unthrottle is put after all the remote cpus,
since this list is FIFO? (earliest throttled cfs_rq is at the head)

Should we distribute runtime in the FIFO order?

Thanks.

>
> next:
> rq_unlock_irqrestore(rq, &rf);
> }
> rcu_read_unlock();
>
> + /*
> + * We prefer to stage the async unthrottles of all the remote cpus
> + * before we do the inline unthrottle locally. Note that
> + * unthrottle_cfs_rq_async() on the local cpu is actually synchronous,
> + * but it includes extra WARNs to make sure the cfs_rq really is
> + * still throttled.
> + */
> + if (local_unthrottle) {
> + rq = cpu_rq(this_cpu);
> + rq_lock_irqsave(rq, &rf);
> + unthrottle_cfs_rq_async(local_unthrottle);
> + rq_unlock_irqrestore(rq, &rf);
> + }
> +
> return throttled;
> }
>
> Note that one change we definitely want is the extra setting of
> throttled = true in the case that cfs_rq->runtime_remaining <= 0, to
> catch the case where we run out of runtime to distribute on the last
> entity in the list.
>
>>> +
>>> + /* Already enqueued */
>>> + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
>>> + return;
>>> +
>>> + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
>>> +
>>> + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
>>
>> Hurmph.. so I was expecting something like:
>>
>> first = list_empty(&rq->cfsb_csd_list);
>> list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
>> if (first)
>> smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
>>
>> But I suppose I'm remembering the 'old' version. I don't think it is
>> broken as written. There's a very narrow window where you'll end up
>> sending a second IPI for naught, but meh.
>
> The CSD doesn't get unlocked until right before we call the func().
> But you're right that that's a (very) narrow window for an extra IPI.
> Please feel free to modify the patch with that diff if you like.
>
>>
>>> +}
>>
>> Let me go queue this thing, we can always improve upon matters later.
>
> Thanks! Please add at least the extra assignment of 'throttled = true'
> from the diff above, but feel free to squash both the diffs if it
> makes sense to you.

2022-11-21 12:47:09

by Peter Zijlstra

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Sun, Nov 20, 2022 at 10:22:40AM +0800, Chengming Zhou wrote:
> > + if (cfs_rq->runtime_remaining > 0) {
> > + if (cpu_of(rq) != this_cpu ||
> > + SCHED_WARN_ON(local_unthrottle)) {
> > + unthrottle_cfs_rq_async(cfs_rq);
> > + } else {
> > + local_unthrottle = cfs_rq;
> > + }
> > + } else {
> > + throttled = true;
> > + }
>
> Hello,
>
> I don't get the point why local unthrottle is put after all the remote cpus,
> since this list is FIFO? (earliest throttled cfs_rq is at the head)

Let the local completion time for a CPU be W. Then if we queue a remote
work after the local synchronous work, the lower bound for total
completion is at least 2W.

OTOH, if we first queue all remote work and then process the local
synchronous work, the lower bound for total completion is W.

The practical difference is that all relevant CPUs get unthrottled
roughly at the same point in time, unlike with the original case, where
some CPUs have the opportunity to consume W runtime while another is
still throttled.
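
To put toy numbers on that bound, here is a purely illustrative userspace
sketch (W is the per-cpu unthrottle cost defined above; the program and its
numbers are made up and nothing here is kernel code):

#include <stdio.h>

int main(void)
{
	double W = 1.0;	/* per-cpu unthrottle cost, arbitrary units */

	/*
	 * Local synchronous work first, remote CSDs queued afterwards:
	 * the remote cpus cannot start before t = W, so the last of them
	 * cannot finish before t = 2W.
	 */
	double local_first_bound = W + W;

	/*
	 * All remote CSDs queued first, local work done last: remote and
	 * local work overlap, so everything can complete around t = W.
	 */
	double remote_first_bound = W;

	printf("local first:  completion >= %.0fW\n", local_first_bound / W);
	printf("remote first: completion ~= %.0fW\n", remote_first_bound / W);
	return 0;
}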


2022-11-21 13:06:19

by Peter Zijlstra

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Fri, Nov 18, 2022 at 11:25:09AM -0800, Josh Don wrote:

> > Maybe have this function return false when local and save that cfs_rq in
> > a local var to process again later, dunno, that might turn messy.
>
> Maybe something like this? Apologies for inline diff formatting.

That looks entirely reasonable, not nearly as horrible as I feared. Let
me go make that happen.

> Note that one change we definitely want is the extra setting of
> throttled = true in the case that cfs_rq->runtime_remaining <= 0, to
> catch the case where we run out of runtime to distribute on the last
> entity in the list.

Done.

> > > +
> > > + /* Already enqueued */
> > > + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
> > > + return;
> > > +
> > > + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
> > > +
> > > + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
> >
> > Hurmph.. so I was expecting something like:
> >
> > first = list_empty(&rq->cfsb_csd_list);
> > list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
> > if (first)
> > smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
> >
> > But I suppose I'm remembering the 'old' version. I don't think it is
> > broken as written. There's a very narrow window where you'll end up
> > sending a second IPI for naught, but meh.
>
> The CSD doesn't get unlocked until right before we call the func().
> But you're right that that's a (very) narrow window for an extra IPI.
> Please feel free to modify the patch with that diff if you like.

Since I was manually editing things, I did that too.

Please test the final version as found here:

https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e

2022-11-21 18:17:31

by Michal Koutný

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Mon, Nov 21, 2022 at 01:34:33PM +0100, Peter Zijlstra <[email protected]> wrote:
> Please test the final version as found here:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e

I went through the patch and I claim it's
Reviewed-by: Michal Koutný <[email protected]>

(not tested though. And thanks Josh for the notice about the not-double
queueing check.)



2022-11-21 20:21:25

by Josh Don

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Mon, Nov 21, 2022 at 3:58 AM Peter Zijlstra <[email protected]> wrote:
>
> On Sun, Nov 20, 2022 at 10:22:40AM +0800, Chengming Zhou wrote:
> > > + if (cfs_rq->runtime_remaining > 0) {
> > > + if (cpu_of(rq) != this_cpu ||
> > > + SCHED_WARN_ON(local_unthrottle)) {
> > > + unthrottle_cfs_rq_async(cfs_rq);
> > > + } else {
> > > + local_unthrottle = cfs_rq;
> > > + }
> > > + } else {
> > > + throttled = true;
> > > + }
> >
> > Hello,
> >
> > I don't get the point why local unthrottle is put after all the remote cpus,
> > since this list is FIFO? (earliest throttled cfs_rq is at the head)
>
> Let the local completion time for a CPU be W. Then if we queue a remote
> work after the local synchronous work, the lower bound for total
> completion is at least 2W.
>
> OTOH, if we first queue all remote work and then process the local
> synchronous work, the lower bound for total completion is W.
>
> The practical difference is that all relevant CPUs get unthrottled
> roughly at the same point in time, unlike with the original case, where
> some CPUs have the opportunity to consume W runtime while another is
> still throttled.

Yep, this tradeoff feels "best", but there are some edge cases where
this could potentially disrupt fairness. For example, if we have
non-trivial W, a lot of cpus to iterate through for dispatching remote
unthrottle, and quota is small. Doesn't help that the timer is pinned
so that this will continually hit the same cpu. But as I alluded to, I
think the net benefit here is greater with the local unthrottling
ordered last.

2022-11-21 20:39:21

by Josh Don

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

> Please test the final version as found here:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e

Thanks Peter, the patch looks good to me and continues to work
correctly. Note that I needed to make two edits to get it to build
though on sched/core:

- local_cfq_rq doesn't match variable name used in function (local_unthrottle)
- rq_lock_irqrestore -> rq_unlock_irqrestore

Best,
Josh

2022-11-22 06:00:11

by Aaron Lu

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Mon, Nov 21, 2022 at 11:31:20AM -0800, Josh Don wrote:
> > Please test the final version as found here:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e
>
> Thanks Peter, the patch looks good to me and continues to work
> correctly. Note that I needed to make two edits to get it to build
> though on sched/core:
>
> - local_cfq_rq doesn't match variable name used in function (local_unthrottle)
> - rq_lock_irqrestore -> rq_unlock_irqrestore

I also tested it on a bare-metal desktop with 800 cgroups under 2
cgroups that have quota set. Before this patch, the majority of durations
for sched_cfs_period_timer() were in the range of 512us-1ms, with outliers
in the range of 1ms-4ms; after this patch, the majority were in the range
of 128us-512us, with no outliers above 512us.

2022-11-22 06:27:57

by Aaron Lu

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Fri, Nov 18, 2022 at 11:25:09AM -0800, Josh Don wrote:
> On Fri, Nov 18, 2022 at 4:47 AM Peter Zijlstra <[email protected]> wrote:
> >
> > preempt_disable() -- through rq->lock -- also holds off rcu. Strictly
> > speaking this here is superfluous. But if you want it as an annotation,
> > that's fine I suppose.
>
> Yep, I purely added this as extra annotation for future readers.
>
> > Ideally we'd first queue all the remotes and then process local, but
> > given how all this is organized that doesn't seem trivial to arrange.
> >
> > Maybe have this function return false when local and save that cfs_rq in
> > a local var to process again later, dunno, that might turn messy.
>
> Maybe something like this? Apologies for inline diff formatting.
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 012ec9d03811..100dae6023da 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5520,12 +5520,15 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
> struct cfs_rq *cfs_rq;
> u64 runtime, remaining = 1;
> bool throttled = false;
> + int this_cpu = smp_processor_id();
> + struct cfs_rq *local_unthrottle = NULL;
> + struct rq *rq;
> + struct rq_flags rf;
>
> rcu_read_lock();
> list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
> throttled_list) {
> - struct rq *rq = rq_of(cfs_rq);
> - struct rq_flags rf;
> + rq = rq_of(cfs_rq);
>
> if (!remaining) {
> throttled = true;
> @@ -5556,14 +5559,36 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
> cfs_rq->runtime_remaining += runtime;
>
> /* we check whether we're throttled above */
> - if (cfs_rq->runtime_remaining > 0)
> - unthrottle_cfs_rq_async(cfs_rq);
> + if (cfs_rq->runtime_remaining > 0) {
> + if (cpu_of(rq) != this_cpu ||
> + SCHED_WARN_ON(local_unthrottle)) {
> + unthrottle_cfs_rq_async(cfs_rq);
> + } else {
> + local_unthrottle = cfs_rq;
> + }
> + } else {
> + throttled = true;
> + }
>
> next:
> rq_unlock_irqrestore(rq, &rf);
> }
> rcu_read_unlock();
>
> + /*
> + * We prefer to stage the async unthrottles of all the remote cpus
> + * before we do the inline unthrottle locally. Note that
> + * unthrottle_cfs_rq_async() on the local cpu is actually synchronous,
> + * but it includes extra WARNs to make sure the cfs_rq really is
> + * still throttled.

With this said ->

> + */
> + if (local_unthrottle) {
> + rq = cpu_rq(this_cpu);
> + rq_lock_irqsave(rq, &rf);

Should we add:
if (cfs_rq_throttled(local_unthrottle))

before calling into unthrottle_cfs_rq_async(local_unthrottle) to avoid a
potential WARN?

As for whether the local cfs_rq can already be unthrottled by the time the
rq lock is re-acquired, I suppose it can be: e.g. another user sets a new
quota for this task group in the window where the rq lock is dropped in the
above loop and re-acquired here, IIUC.

> + unthrottle_cfs_rq_async(local_unthrottle);
> + rq_unlock_irqrestore(rq, &rf);
> + }
> +
> return throttled;
> }
>

2022-11-22 11:08:03

by Peter Zijlstra

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> Yep, this tradeoff feels "best", but there are some edge cases where
> this could potentially disrupt fairness. For example, if we have
> non-trivial W, a lot of cpus to iterate through for dispatching remote
> unthrottle, and quota is small. Doesn't help that the timer is pinned
> so that this will continually hit the same cpu.

We could -- if we wanted to -- manually rotate the timer around the
relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
though I'm afraid.

2022-11-22 11:12:39

by Peter Zijlstra

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Mon, Nov 21, 2022 at 11:31:20AM -0800, Josh Don wrote:
> > Please test the final version as found here:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e
>
> Thanks Peter, the patch looks good to me and continues to work
> correctly. Note that I needed to make two edits to get it to build
> though on sched/core:
>
> - local_cfq_rq doesn't match variable name used in function (local_unthrottle)
> - rq_lock_irqrestore -> rq_unlock_irqrestore

Bah; and here I thought the .config I build actually had this crud
enabled. Oh well.. fixed that, will push out shortly.

2022-11-22 20:08:19

by Josh Don

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

> > + */
> > + if (local_unthrottle) {
> > + rq = cpu_rq(this_cpu);
> > + rq_lock_irqsave(rq, &rf);
>
> Should we add:
> if (cfs_rq_throttled(local_unthrottle))
>
> before calling into unthrottle_cfs_rq_async(local_unthrottle) to avoid a
> potential WARN?
>
> As for whether the local cfs_rq can be unthrottled now after rq lock is
> re-acquired, I suppose it can be. e.g. another user sets a new quota to
> this task group during the window of rq lock gets dropped in the above
> loop and re-acquired here IIUC.
>
> > + unthrottle_cfs_rq_async(local_unthrottle);
> > + rq_unlock_irqrestore(rq, &rf);
> > + }
> > +
> > return throttled;
> > }

Yes, we should add that check due to the case you described with a
user concurrently configuring bandwidth. And as long as we're doing
that, we might as well make this unthrottle_cfs_rq() instead and snip
the comment. Peter, would you mind adding that delta?
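
For reference, the delta being asked for looks roughly like this (a sketch
reusing the variable names from the inline diff above; the version that was
eventually merged is quoted at the end of this thread):

	if (local_unthrottle) {
		rq = cpu_rq(this_cpu);
		rq_lock_irqsave(rq, &rf);
		/* re-check: quota may have changed while the lock was dropped */
		if (cfs_rq_throttled(local_unthrottle))
			unthrottle_cfs_rq(local_unthrottle);
		rq_unlock_irqrestore(rq, &rf);
	}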

2022-11-24 09:55:43

by Peter Zijlstra

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Tue, Nov 22, 2022 at 11:41:04AM -0800, Josh Don wrote:
> > > + */
> > > + if (local_unthrottle) {
> > > + rq = cpu_rq(this_cpu);
> > > + rq_lock_irqsave(rq, &rf);
> >
> > Should we add:
> > if (cfs_rq_throttled(local_unthrottle))
> >
> > before calling into unthrottle_cfs_rq_async(local_unthrottle) to avoid a
> > potential WARN?
> >
> > As for whether the local cfs_rq can be unthrottled now after rq lock is
> > re-acquired, I suppose it can be. e.g. another user sets a new quota to
> > this task group during the window of rq lock gets dropped in the above
> > loop and re-acquired here IIUC.
> >
> > > + unthrottle_cfs_rq_async(local_unthrottle);
> > > + rq_unlock_irqrestore(rq, &rf);
> > > + }
> > > +
> > > return throttled;
> > > }
>
> Yes, we should add that check due to the case you described with a
> user concurrently configuring bandwidth. And as long as we're doing
> that, we might as well make this unthrottle_cfs_rq() instead and snip
> the comment. Peter, would you mind adding that delta?

Done, should be pushed into the queue.git thing momentarily.

2022-11-25 09:24:21

by Peter Zijlstra

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > Yep, this tradeoff feels "best", but there are some edge cases where
> > this could potentially disrupt fairness. For example, if we have
> > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > so that this will continually hit the same cpu.
>
> We could -- if we wanted to -- manually rotate the timer around the
> relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> though I'm afraid.

Here; something like so should enable us to cycle the bandwidth timer.
Just need to figure out a way to find another CPU or something.

---
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0ee140176f10..f8bd200d678a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -63,8 +63,10 @@ enum hrtimer_mode {
* Return values for the callback function
*/
enum hrtimer_restart {
- HRTIMER_NORESTART, /* Timer is not restarted */
- HRTIMER_RESTART, /* Timer must be restarted */
+ HRTIMER_RESTART = -1, /* Timer must be restarted */
+ HRTIMER_NORESTART = 0, /* Timer is not restarted */
+ HRTIMER_RESTART_MIGRATE = 1,
+ HRTIMER_RESTART_MIGRATE_MAX = HRTIMER_RESTART_MIGRATE + NR_CPUS,
};

/*
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3ae661ab6260..e75033f78a19 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1621,6 +1621,16 @@ bool hrtimer_active(const struct hrtimer *timer)
}
EXPORT_SYMBOL_GPL(hrtimer_active);

+static void raw_spin_lock_double(raw_spinlock_t *a, raw_spinlock_t *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ raw_spin_lock(a);
+ if (b != a)
+ raw_spin_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
/*
* The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
* distinct sections:
@@ -1644,6 +1654,8 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer *timer, ktime_t *now,
unsigned long flags) __must_hold(&cpu_base->lock)
{
+ struct hrtimer_cpu_base *new_cpu_base = cpu_base;
+ struct hrtimer_clock_base *new_base = base;
enum hrtimer_restart (*fn)(struct hrtimer *);
bool expires_in_hardirq;
int restart;
@@ -1686,7 +1698,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,

lockdep_hrtimer_exit(expires_in_hardirq);
trace_hrtimer_expire_exit(timer);
- raw_spin_lock_irq(&cpu_base->lock);
+
+ local_irq_disable();
+
+ if (restart >= HRTIMER_RESTART_MIGRATE) {
+ int cpu = restart - HRTIMER_RESTART_MIGRATE;
+ int b = base - cpu_base->clock_base;
+
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
+ new_base = new_cpu_base->clock_base[b];
+ }
+ raw_spin_lock_double(&cpu_base->lock, &new_cpu_base->lock);

/*
* Note: We clear the running state after enqueue_hrtimer and
@@ -1698,8 +1720,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
* for us already.
*/
if (restart != HRTIMER_NORESTART &&
- !(timer->state & HRTIMER_STATE_ENQUEUED))
- enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+ !(timer->state & HRTIMER_STATE_ENQUEUED)) {
+
+ if (new_cpu_base != cpu_base) {
+ timer->base = new_base;
+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+ raw_spin_unlock(&new_cpu_base->lock);
+ } else {
+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+ }
+ }

/*
* Separate the ->running assignment from the ->state assignment.
@@ -2231,12 +2261,8 @@ int hrtimers_dead_cpu(unsigned int scpu)
local_irq_disable();
old_base = &per_cpu(hrtimer_bases, scpu);
new_base = this_cpu_ptr(&hrtimer_bases);
- /*
- * The caller is globally serialized and nobody else
- * takes two locks at once, deadlock is not possible.
- */
- raw_spin_lock(&new_base->lock);
- raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+ raw_spin_lock_double(&old_base->lock, &new_base->lock);

for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],

2022-11-25 09:24:26

by Peter Zijlstra

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Fri, Nov 25, 2022 at 09:57:09AM +0100, Peter Zijlstra wrote:
> On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> > On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > > Yep, this tradeoff feels "best", but there are some edge cases where
> > > this could potentially disrupt fairness. For example, if we have
> > > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > > so that this will continually hit the same cpu.
> >
> > We could -- if we wanted to -- manually rotate the timer around the
> > relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> > though I'm afraid.
>
> Here; something like so should enable us to cycle the bandwidth timer.
> Just need to figure out a way to find another CPU or something.

Some more preparation...

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5617,7 +5617,7 @@ static int do_sched_cfs_period_timer(str
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
- return 0;
+ return HRTIMER_RESTART;
}

/* account preceding periods in which throttling occurred */
@@ -5641,10 +5641,10 @@ static int do_sched_cfs_period_timer(str
*/
cfs_b->idle = 0;

- return 0;
+ return HRTIMER_RESTART;

out_deactivate:
- return 1;
+ return HRTIMER_NORESTART;
}

/* a cfs_rq won't donate quota below this amount */
@@ -5836,9 +5836,9 @@ static enum hrtimer_restart sched_cfs_pe
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
+ int restart = HRTIMER_RESTART;
unsigned long flags;
int overrun;
- int idle = 0;
int count = 0;

raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -5847,7 +5847,7 @@ static enum hrtimer_restart sched_cfs_pe
if (!overrun)
break;

- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+ restart = do_sched_cfs_period_timer(cfs_b, overrun, flags);

if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
@@ -5880,11 +5880,11 @@ static enum hrtimer_restart sched_cfs_pe
count = 0;
}
}
- if (idle)
+ if (restart == HRTIMER_NORESTART)
cfs_b->period_active = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);

- return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+ return restart;
}

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

2022-11-25 09:26:41

by Peter Zijlstra

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Fri, Nov 25, 2022 at 09:59:23AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 25, 2022 at 09:57:09AM +0100, Peter Zijlstra wrote:
> > On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> > > On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > > > Yep, this tradeoff feels "best", but there are some edge cases where
> > > > this could potentially disrupt fairness. For example, if we have
> > > > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > > > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > > > so that this will continually hit the same cpu.
> > >
> > > We could -- if we wanted to -- manually rotate the timer around the
> > > relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> > > though I'm afraid.
> >
> > Here; something like so should enable us to cycle the bandwidth timer.
> > Just need to figure out a way to find another CPU or something.
>
> Some more preparation...

And then I think something like so.. That migrates the timer to the CPU
of the first throttled entry -- possibly not the best heuristic, but it's
the simplest.

NOTE: none of this has seen a compiler up close.

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5595,13 +5595,21 @@ static bool distribute_cfs_runtime(struc
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- int throttled;
+ struct cfs_rq *first_cfs_rq;
+ int throttled = 0;
+ int cpu;

/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
goto out_deactivate;

- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+ first_cfs_rq = list_first_entry_or_null(&cfs_b->throttled_cfs_rq,
+ struct cfs_rq, throttled_list);
+ if (first_cfs_rq) {
+ throttled = 1;
+ cpu = cpu_of(rq_of(first_cfs_rq));
+ }
+
cfs_b->nr_periods += overrun;

/* Refill extra burst quota even if cfs_b->idle */
@@ -5641,7 +5649,7 @@ static int do_sched_cfs_period_timer(str
*/
cfs_b->idle = 0;

- return HRTIMER_RESTART;
+ return HRTIMER_RESTART_MIGRATE + cpu;

out_deactivate:
return HRTIMER_NORESTART;

2022-11-29 01:49:07

by Josh Don

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

> @@ -1686,7 +1698,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
>
> lockdep_hrtimer_exit(expires_in_hardirq);
> trace_hrtimer_expire_exit(timer);
> - raw_spin_lock_irq(&cpu_base->lock);
> +
> + local_irq_disable();
> +
> + if (restart >= HRTIMER_RESTART_MIGRATE) {
> + int cpu = restart - HRTIMER_RESTART_MIGRATE;

I know this is just a rough draft, but just noting that this wants a
check against MIGRATE_MAX :)
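
Concretely, the bounds check being pointed out could look something like this
against the draft above (just a sketch; HRTIMER_RESTART_MIGRATE and
HRTIMER_RESTART_MIGRATE_MAX are the enum values proposed earlier in the
thread, encoding the target cpu in the return value):

	if (restart >= HRTIMER_RESTART_MIGRATE &&
	    restart < HRTIMER_RESTART_MIGRATE_MAX) {
		int cpu = restart - HRTIMER_RESTART_MIGRATE;

		/* ... pick new_cpu_base/new_base as in the draft above ... */
	}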

> + if (new_cpu_base != cpu_base) {
> + timer->base = new_base;
> + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
> + raw_spin_unlock(&new_cpu_base->lock);

unlock the old base->lock right?

2022-11-29 01:49:25

by Josh Don

Subject: Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

On Fri, Nov 25, 2022 at 1:12 AM Peter Zijlstra <[email protected]> wrote:
>
> On Fri, Nov 25, 2022 at 09:59:23AM +0100, Peter Zijlstra wrote:
> > On Fri, Nov 25, 2022 at 09:57:09AM +0100, Peter Zijlstra wrote:
> > > On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> > > > On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > > > > Yep, this tradeoff feels "best", but there are some edge cases where
> > > > > this could potentially disrupt fairness. For example, if we have
> > > > > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > > > > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > > > > so that this will continually hit the same cpu.
> > > >
> > > > We could -- if we wanted to -- manually rotate the timer around the
> > > > relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> > > > though I'm afraid.
> > >
> > > Here; something like so should enable us to cycle the bandwidth timer.
> > > Just need to figure out a way to find another CPU or something.
> >
> > Some more preparation...
>
> And then I think something like so.. That migrates the timer to the CPU
> of the first throttled entry -- possibly not the best heuristic, but it's
> the simplest.
>
> NOTE: none of this has seen a compiler up close.

Thanks Peter, this overall looks good to me. One question though: I
was expecting to see that when we migrate the timer, we adjust expiry
to account for clock shift between cpus. Was this just not part of the
initial draft here, or is this somehow already accounted for?

Subject: [tip: sched/core] sched: Async unthrottling for cfs bandwidth

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 8ad075c2eb1f6b4b33436144ea1ef2619f3b6398
Gitweb: https://git.kernel.org/tip/8ad075c2eb1f6b4b33436144ea1ef2619f3b6398
Author: Josh Don <[email protected]>
AuthorDate: Wed, 16 Nov 2022 16:54:18 -08:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 27 Dec 2022 12:52:09 +01:00

sched: Async unthrottling for cfs bandwidth

CFS bandwidth currently distributes new runtime and unthrottles cfs_rq's
inline in an hrtimer callback. Runtime distribution is a per-cpu
operation, and unthrottling is a per-cgroup operation, since a tg walk
is required. On machines with a large number of cpus and large cgroup
hierarchies, this cpus*cgroups work can be too much to do in a single
hrtimer callback: since IRQs are disabled, hard lockups may easily occur.
Specifically, we've found this scalability issue on configurations with
256 cpus, O(1000) cgroups in the hierarchy being throttled, and high
memory bandwidth usage.

To fix this, we can instead unthrottle cfs_rq's asynchronously via a
CSD. Each cpu is responsible for unthrottling itself, thus sharding the
total work more fairly across the system, and avoiding hard lockups.

Signed-off-by: Josh Don <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 155 ++++++++++++++++++++++++++++++++++++++----
kernel/sched/sched.h | 8 ++-
2 files changed, 150 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c36aa54..ea81d48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5461,22 +5461,105 @@ unthrottle_throttle:
resched_curr(rq);
}

-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cursor, *tmp;
+ struct rq *rq = arg;
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+
+ /*
+ * Since we hold rq lock we're safe from concurrent manipulation of
+ * the CSD list. However, this RCU critical section annotates the
+ * fact that we pair with sched_free_group_rcu(), so that we cannot
+ * race with group being freed in the window between removing it
+ * from the list and advancing to the next entry in the list.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+ throttled_csd_list) {
+ list_del_init(&cursor->throttled_csd_list);
+
+ if (cfs_rq_throttled(cursor))
+ unthrottle_cfs_rq(cursor);
+ }
+
+ rcu_read_unlock();
+
+ rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ bool first;
+
+ if (rq == this_rq()) {
+ unthrottle_cfs_rq(cfs_rq);
+ return;
+ }
+
+ /* Already enqueued */
+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ return;
+
+ first = list_empty(&rq->cfsb_csd_list);
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+ if (first)
+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ lockdep_assert_rq_held(rq_of(cfs_rq));
+
+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->runtime_remaining <= 0))
+ return;
+
+ __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+ struct cfs_rq *local_unthrottle = NULL;
+ int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
+ bool throttled = false;
+ struct cfs_rq *cfs_rq;
+ struct rq_flags rf;
+ struct rq *rq;

rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
- struct rq *rq = rq_of(cfs_rq);
- struct rq_flags rf;
+ rq = rq_of(cfs_rq);
+
+ if (!remaining) {
+ throttled = true;
+ break;
+ }

rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;

- /* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+ /* Already queued for async unthrottle */
+ if (!list_empty(&cfs_rq->throttled_csd_list))
+ goto next;
+#endif
+
+ /* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);

raw_spin_lock(&cfs_b->lock);
@@ -5490,16 +5573,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
cfs_rq->runtime_remaining += runtime;

/* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq(cfs_rq);
+ if (cfs_rq->runtime_remaining > 0) {
+ if (cpu_of(rq) != this_cpu ||
+ SCHED_WARN_ON(local_unthrottle))
+ unthrottle_cfs_rq_async(cfs_rq);
+ else
+ local_unthrottle = cfs_rq;
+ } else {
+ throttled = true;
+ }

next:
rq_unlock_irqrestore(rq, &rf);
-
- if (!remaining)
- break;
}
rcu_read_unlock();
+
+ if (local_unthrottle) {
+ rq = cpu_rq(this_cpu);
+ rq_lock_irqsave(rq, &rf);
+ if (cfs_rq_throttled(local_unthrottle))
+ unthrottle_cfs_rq(local_unthrottle);
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ return throttled;
}

/*
@@ -5544,10 +5641,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- distribute_cfs_runtime(cfs_b);
+ throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}

/*
@@ -5824,6 +5919,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
}

void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5840,12 +5938,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ int __maybe_unused i;
+
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;

hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
+
+ /*
+ * It is possible that we still have some cfs_rq's pending on a CSD
+ * list, though this race is very rare. In order for this to occur, we
+ * must have raced with the last task leaving the group while there
+ * exist throttled cfs_rq(s), and the period_timer must have queued the
+ * CSD item but the remote cpu has not yet processed it. To handle this,
+ * we can simply flush all pending CSD work inline here. We're
+ * guaranteed at this point that no additional cfs_rq of this group can
+ * join a CSD list.
+ */
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ if (list_empty(&rq->cfsb_csd_list))
+ continue;
+
+ local_irq_save(flags);
+ __cfsb_csd_unthrottle(rq);
+ local_irq_restore(flags);
+ }
+#endif
}

/*
@@ -12474,6 +12598,11 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
}

open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8dd..b3d6e81 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -645,6 +645,9 @@ struct cfs_rq {
int throttled;
int throttle_count;
struct list_head throttled_list;
+#ifdef CONFIG_SMP
+ struct list_head throttled_csd_list;
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1154,6 +1157,11 @@ struct rq {

/* Scratch cpumask to be temporarily used under rq_lock */
cpumask_var_t scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+ call_single_data_t cfsb_csd;
+ struct list_head cfsb_csd_list;
+#endif
};

#ifdef CONFIG_FAIR_GROUP_SCHED