Each time we sync cfs_rq->runtime_expires with cfs_b->runtime_expires,
we should sync its ->expires_seq too. However it is missing
for distribute_cfs_runtime(), especially the slack timer call path.
Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")
Cc: Xunlei Pang <[email protected]>
Cc: Ben Segall <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Signed-off-by: Cong Wang <[email protected]>
---
kernel/sched/fair.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2f0a0be4d344..910c50db3d74 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4857,7 +4857,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+ u64 remaining, u64 expires, int expires_seq)
{
struct cfs_rq *cfs_rq;
u64 runtime;
@@ -4880,6 +4880,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
cfs_rq->runtime_remaining += runtime;
cfs_rq->runtime_expires = expires;
+ cfs_rq->expires_seq = expires_seq;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
@@ -4905,7 +4906,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
- int throttled;
+ int throttled, expires_seq;
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
@@ -4933,6 +4934,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
cfs_b->nr_throttled += overrun;
runtime_expires = cfs_b->runtime_expires;
+ expires_seq = cfs_b->expires_seq;
/*
* This check is repeated as we are holding onto the new bandwidth while
@@ -4946,7 +4948,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
+ runtime_expires, expires_seq);
raw_spin_lock(&cfs_b->lock);
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
@@ -5055,6 +5057,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ int expires_seq;
u64 expires;
/* confirm we're still not at a refresh boundary */
@@ -5068,12 +5071,13 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
runtime = cfs_b->runtime;
expires = cfs_b->runtime_expires;
+ expires_seq = cfs_b->expires_seq;
raw_spin_unlock(&cfs_b->lock);
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime, expires, expires_seq);
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
--
2.14.4
Xunlei Pang <[email protected]> writes:
> Hi Cong,
>
> On 7/28/18 8:24 AM, Cong Wang wrote:
>> Each time we sync cfs_rq->runtime_expires with cfs_b->runtime_expires,
>> we should sync its ->expires_seq too. However it is missing
>> for distribute_cfs_runtime(), especially the slack timer call path.
>
> I don't think it's a problem, as expires_seq will get synced in
> assign_cfs_rq_runtime().
>
> Thanks,
> Xunlei
It does seem unlikely to actually come up since the cfs_rq would have to
not run until the period was expired-locally-but-not-globally, but
there's no reason to not fix it.
>
>>
>> Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")
>> Cc: Xunlei Pang <[email protected]>
>> Cc: Ben Segall <[email protected]>
>> Cc: Linus Torvalds <[email protected]>
>> Cc: Peter Zijlstra <[email protected]>
>> Cc: Thomas Gleixner <[email protected]>
>> Signed-off-by: Cong Wang <[email protected]>
>> ---
>> kernel/sched/fair.c | 12 ++++++++----
>> 1 file changed, 8 insertions(+), 4 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 2f0a0be4d344..910c50db3d74 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4857,7 +4857,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>> }
>>
>> static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
>> - u64 remaining, u64 expires)
>> + u64 remaining, u64 expires, int expires_seq)
>> {
>> struct cfs_rq *cfs_rq;
>> u64 runtime;
>> @@ -4880,6 +4880,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
>>
>> cfs_rq->runtime_remaining += runtime;
>> cfs_rq->runtime_expires = expires;
>> + cfs_rq->expires_seq = expires_seq;
>>
>> /* we check whether we're throttled above */
>> if (cfs_rq->runtime_remaining > 0)
>> @@ -4905,7 +4906,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
>> static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
>> {
>> u64 runtime, runtime_expires;
>> - int throttled;
>> + int throttled, expires_seq;
>>
>> /* no need to continue the timer with no bandwidth constraint */
>> if (cfs_b->quota == RUNTIME_INF)
>> @@ -4933,6 +4934,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
>> cfs_b->nr_throttled += overrun;
>>
>> runtime_expires = cfs_b->runtime_expires;
>> + expires_seq = cfs_b->expires_seq;
>>
>> /*
>> * This check is repeated as we are holding onto the new bandwidth while
>> @@ -4946,7 +4948,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
>> raw_spin_unlock(&cfs_b->lock);
>> /* we can't nest cfs_b->lock while distributing bandwidth */
>> runtime = distribute_cfs_runtime(cfs_b, runtime,
>> - runtime_expires);
>> + runtime_expires, expires_seq);
>> raw_spin_lock(&cfs_b->lock);
>>
>> throttled = !list_empty(&cfs_b->throttled_cfs_rq);
>> @@ -5055,6 +5057,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>> static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>> {
>> u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
>> + int expires_seq;
>> u64 expires;
>>
>> /* confirm we're still not at a refresh boundary */
>> @@ -5068,12 +5071,13 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>> runtime = cfs_b->runtime;
>>
>> expires = cfs_b->runtime_expires;
>> + expires_seq = cfs_b->expires_seq;
>> raw_spin_unlock(&cfs_b->lock);
>>
>> if (!runtime)
>> return;
>>
>> - runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
>> + runtime = distribute_cfs_runtime(cfs_b, runtime, expires, expires_seq);
>>
>> raw_spin_lock(&cfs_b->lock);
>> if (expires == cfs_b->runtime_expires)
>>
On Sun, Jul 29, 2018 at 10:29 PM Xunlei Pang <[email protected]> wrote:
>
> Hi Cong,
>
> On 7/28/18 8:24 AM, Cong Wang wrote:
> > Each time we sync cfs_rq->runtime_expires with cfs_b->runtime_expires,
> > we should sync its ->expires_seq too. However it is missing
> > for distribute_cfs_runtime(), especially the slack timer call path.
>
> I don't think it's a problem, as expires_seq will get synced in
> assign_cfs_rq_runtime().
Sure, but there is a small window during which they are not synced.
Why do you want to wait until the next assign_cfs_rq_runtime() when
you already know runtime_expires is synced?
Also, expire_cfs_rq_runtime() is called before assign_cfs_rq_runtime()
inside __account_cfs_rq_runtime(), which means the check of
cfs_rq->expires_seq is not accurate for the unthrottling case if the clock
drift happens soon enough?
On 7/31/18 1:55 AM, Cong Wang wrote:
> On Sun, Jul 29, 2018 at 10:29 PM Xunlei Pang <[email protected]> wrote:
>>
>> Hi Cong,
>>
>> On 7/28/18 8:24 AM, Cong Wang wrote:
>>> Each time we sync cfs_rq->runtime_expires with cfs_b->runtime_expires,
>>> we should sync its ->expires_seq too. However it is missing
>>> for distribute_cfs_runtime(), especially the slack timer call path.
>>
>> I don't think it's a problem, as expires_seq will get synced in
>> assign_cfs_rq_runtime().
>
> Sure, but there is a small window during which they are not synced.
> Why do you want to wait until the next assign_cfs_rq_runtime() when
> you already know runtime_expires is synced?
>
> Also, expire_cfs_rq_runtime() is called before assign_cfs_rq_runtime()
> inside __account_cfs_rq_runtime(), which means the check of
> cfs_rq->expires_seq is not accurate for unthrottling case if the clock
> drift happens soon enough?
>
expire_cfs_rq_runtime():
if (cfs_rq->expires_seq == cfs_b->expires_seq) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
/* global deadline is ahead, expiration has passed */
cfs_rq->runtime_remaining = 0;
}
So if clock drift happens soon, then expires_seq decides the correct
thing we should do: if cfs_b->expires_seq advanced, then clear the stale
cfs_rq->runtime_remaining from the slack timer of the past period, then
assign_cfs_rq_runtime() will refresh them afterwards, otherwise it is a
real clock drift. I am still not getting where the race is?
Xunlei Pang <[email protected]> writes:
> On 7/31/18 1:55 AM, Cong Wang wrote:
>> On Sun, Jul 29, 2018 at 10:29 PM Xunlei Pang <[email protected]> wrote:
>>>
>>> Hi Cong,
>>>
>>> On 7/28/18 8:24 AM, Cong Wang wrote:
>>>> Each time we sync cfs_rq->runtime_expires with cfs_b->runtime_expires,
>>>> we should sync its ->expires_seq too. However it is missing
>>>> for distribute_cfs_runtime(), especially the slack timer call path.
>>>
>>> I don't think it's a problem, as expires_seq will get synced in
>>> assign_cfs_rq_runtime().
>>
>> Sure, but there is a small window during which they are not synced.
>> Why do you want to wait until the next assign_cfs_rq_runtime() when
>> you already know runtime_expires is synced?
>>
>> Also, expire_cfs_rq_runtime() is called before assign_cfs_rq_runtime()
>> inside __account_cfs_rq_runtime(), which means the check of
>> cfs_rq->expires_seq is not accurate for unthrottling case if the clock
>> drift happens soon enough?
>>
>
> expire_cfs_rq_runtime():
> if (cfs_rq->expires_seq == cfs_b->expires_seq) {
> /* extend local deadline, drift is bounded above by 2 ticks */
> cfs_rq->runtime_expires += TICK_NSEC;
> } else {
> /* global deadline is ahead, expiration has passed */
> cfs_rq->runtime_remaining = 0;
> }
>
> So if clock drift happens soon, then expires_seq decides the correct
> thing we should do: if cfs_b->expires_seq advanced, then clear the stale
> cfs_rq->runtime_remaining from the slack timer of the past period, then
> assign_cfs_rq_runtime() will refresh them afterwards, otherwise it is a
> real clock drift. I am still not getting where the race is?
Nothing /important/ goes wrong because distribute_cfs_runtime only fills
runtime_remaining up to 1, not a real amount.
On Tue, Jul 31, 2018 at 10:13 AM <[email protected]> wrote:
>
> Xunlei Pang <[email protected]> writes:
>
> > On 7/31/18 1:55 AM, Cong Wang wrote:
> >> On Sun, Jul 29, 2018 at 10:29 PM Xunlei Pang <[email protected]> wrote:
> >>>
> >>> Hi Cong,
> >>>
> >>> On 7/28/18 8:24 AM, Cong Wang wrote:
> >>>> Each time we sync cfs_rq->runtime_expires with cfs_b->runtime_expires,
> >>>> we should sync its ->expires_seq too. However it is missing
> >>>> for distribute_cfs_runtime(), especially the slack timer call path.
> >>>
> >>> I don't think it's a problem, as expires_seq will get synced in
> >>> assign_cfs_rq_runtime().
> >>
> >> Sure, but there is a small window during which they are not synced.
> >> Why do you want to wait until the next assign_cfs_rq_runtime() when
> >> you already know runtime_expires is synced?
> >>
> >> Also, expire_cfs_rq_runtime() is called before assign_cfs_rq_runtime()
> >> inside __account_cfs_rq_runtime(), which means the check of
> >> cfs_rq->expires_seq is not accurate for unthrottling case if the clock
> >> drift happens soon enough?
> >>
> >
> > expire_cfs_rq_runtime():
> > if (cfs_rq->expires_seq == cfs_b->expires_seq) {
> > /* extend local deadline, drift is bounded above by 2 ticks */
> > cfs_rq->runtime_expires += TICK_NSEC;
> > } else {
> > /* global deadline is ahead, expiration has passed */
> > cfs_rq->runtime_remaining = 0;
> > }
> >
> > So if clock drift happens soon, then expires_seq decides the correct
> > thing we should do: if cfs_b->expires_seq advanced, then clear the stale
> > cfs_rq->runtime_remaining from the slack timer of the past period, then
> > assign_cfs_rq_runtime() will refresh them afterwards, otherwise it is a
> > real clock drift. I am still not getting where the race is?
But expires_seq is supposed to be the same here, after
distribute_cfs_runtime(), therefore runtime_remaining is not supposed
to be cleared.
Which part am I misunderstanding? Should expires_seq not be the same here?
Or are you saying that wrongly clearing runtime_remaining is fine?
>
> Nothing /important/ goes wrong because distribute_cfs_runtime only fills
> runtime_remaining up to 1, not a real amount.
No, runtime_remaining is updated right before expire_cfs_rq_runtime():
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
expire_cfs_rq_runtime(cfs_rq);
so almost certainly it can't be 1.
Which means the following check could be passed:
4655 if (cfs_rq->runtime_remaining < 0)
4656 return;
Therefore we reach the clock drift logic inside
expire_cfs_rq_runtime(),
where the two expires_seq values are supposed to be the same, as they should have been synced.
So, without the patch, do we wrongly clear runtime_remaining?
Thanks.
Cong Wang <[email protected]> writes:
> On Tue, Jul 31, 2018 at 10:13 AM <[email protected]> wrote:
>>
>> Xunlei Pang <[email protected]> writes:
>>
>> > On 7/31/18 1:55 AM, Cong Wang wrote:
>> >> On Sun, Jul 29, 2018 at 10:29 PM Xunlei Pang <[email protected]> wrote:
>> >>>
>> >>> Hi Cong,
>> >>>
>> >>> On 7/28/18 8:24 AM, Cong Wang wrote:
>> >>>> Each time we sync cfs_rq->runtime_expires with cfs_b->runtime_expires,
>> >>>> we should sync its ->expires_seq too. However it is missing
>> >>>> for distribute_cfs_runtime(), especially the slack timer call path.
>> >>>
>> >>> I don't think it's a problem, as expires_seq will get synced in
>> >>> assign_cfs_rq_runtime().
>> >>
>> >> Sure, but there is a small window during which they are not synced.
>> >> Why do you want to wait until the next assign_cfs_rq_runtime() when
>> >> you already know runtime_expires is synced?
>> >>
>> >> Also, expire_cfs_rq_runtime() is called before assign_cfs_rq_runtime()
>> >> inside __account_cfs_rq_runtime(), which means the check of
>> >> cfs_rq->expires_seq is not accurate for unthrottling case if the clock
>> >> drift happens soon enough?
>> >>
>> >
>> > expire_cfs_rq_runtime():
>> > if (cfs_rq->expires_seq == cfs_b->expires_seq) {
>> > /* extend local deadline, drift is bounded above by 2 ticks */
>> > cfs_rq->runtime_expires += TICK_NSEC;
>> > } else {
>> > /* global deadline is ahead, expiration has passed */
>> > cfs_rq->runtime_remaining = 0;
>> > }
>> >
>> > So if clock drift happens soon, then expires_seq decides the correct
>> > thing we should do: if cfs_b->expires_seq advanced, then clear the stale
>> > cfs_rq->runtime_remaining from the slack timer of the past period, then
>> > assign_cfs_rq_runtime() will refresh them afterwards, otherwise it is a
>> > real clock drift. I am still not getting where the race is?
>
> But expires_seq is supposed to be the same here, after
> distribute_cfs_runtime(), therefore runtime_remaining is not supposed
> to be cleared.
>
> Which part am I misunderstanding? Should expires_seq not be the same here?
> Or are you saying that wrongly clearing runtime_remaining is fine?
>
>
>>
>> Nothing /important/ goes wrong because distribute_cfs_runtime only fills
>> runtime_remaining up to 1, not a real amount.
>
> No, runtime_remaining is updated right before expire_cfs_rq_runtime():
>
> static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
> {
> /* dock delta_exec before expiring quota (as it could span periods) */
> cfs_rq->runtime_remaining -= delta_exec;
> expire_cfs_rq_runtime(cfs_rq);
>
> so almost certainly it can't be 1.
Yes, in practice what's actually going to happen is that the
runtime_remaining will be put to 1 by distribute, the cfs_rq will be
unthrottled, and then when it runs it will go negative immediately and
hit the negative check in expires, so expires_seq being wrong will not
actually matter. In addition, the worst thing that will happen if one of
the account_cfs_rq_runtime(cfs_rq, 0) paths is hit first is that it will
lose 1ns of quota, which also doesn't really matter.
On Tue, Jul 31, 2018 at 8:24 PM Xunlei Pang <[email protected]> wrote:
>
> Let's see the unthrottle cases.
> 1. for the periodic timer
> distribute_cfs_runtime updates the throttled cfs_rq->runtime_expires to
> be a new value, so expire_cfs_rq_runtime does nothing because of:
> rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires < 0
>
> Afterwards assign_cfs_rq_runtime() will sync its expires_seq.
Is there any guarantee rq_clock(cfs_rq) is always ahead of
cfs_rq->runtime_expires in this case?
I doubt it, because cfs_rq->runtime_expires could be assigned
by a sched_clock() on a different CPU running the periodic timer.
Also, rq_clock() is behind sched_clock() on the same CPU too;
sometimes it is merely hundreds of nanoseconds, sometimes it is
tens of thousands of nanoseconds in my environment. (I have a
different patch to address this, but I am still not sure whether it is correct.)
>
> 2. for the slack timer
> the two expires_seq should be the same, so if clock drift happens soon,
> expire_cfs_rq_runtime regards it as true clock drift:
> cfs_rq->runtime_expires += TICK_NSEC
> If it happens that global expires_seq advances, it also doesn't matter,
> expire_cfs_rq_runtime will clear the stale cfs_rq->runtime_remaining as
> expected.
Hmm, it looks like that is due to the runtime_refresh_within() check in
the slack timer.
>
> >
> >>
> >> Nothing /important/ goes wrong because distribute_cfs_runtime only fills
> >> runtime_remaining up to 1, not a real amount.
> >
> > No, runtime_remaining is updated right before expire_cfs_rq_runtime():
> >
> > static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
> > {
> > /* dock delta_exec before expiring quota (as it could span periods) */
> > cfs_rq->runtime_remaining -= delta_exec;
> > expire_cfs_rq_runtime(cfs_rq);
> >
> > so almost certainly it can't be 1.
>
> I think Ben means it first gets a distribution of 1 to run after
> unthrottling, soon it will have a negative runtime_remaining, and go
> to assign_cfs_rq_runtime().
That is obvious; being 1 in distribute_cfs_runtime() is not relevant to the
discussion here.
On Wed, Aug 1, 2018 at 10:17 AM <[email protected]> wrote:
> Yes, in practice what's actually going to happen is that the
> runtime_remaining will be put to 1 by distribute, the cfs_rq will be
> unthrottled, and then when it runs it will go negative immediately and
> hit the negative check in expires, so expires_seq being wrong will not
> actually matter. In addition, the worst thing that will happen if one of
> the account_cfs_rq_runtime(cfs_rq, 0) paths is hit first is that it will
> lose 1ns of quota, which also doesn't really matter.
Ah, I see.
Thanks!