2014-02-19 22:16:07

by Kirill Tkhai

[permalink] [raw]
Subject: [RFC] sched/deadline: Prevent rt_time growth to infinity

Since deadline tasks share rt bandwidth, we must take care that the
bandwidth timer is set. Otherwise rt_time may grow up to infinity
in update_curr_dl(), if there are no other available RT tasks
on the top-level bandwidth.
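
For reference, the accounting path in question looks roughly like this (a
simplified sketch of the tail of update_curr_dl(); the helper name
dl_charge_rt_bw() is made up here just for illustration):

        /*
         * Sketch: how update_curr_dl() charges DL execution to the
         * top-level rt_rq today. Nothing arms the rt_bandwidth period
         * timer when only DL tasks are runnable, so rt_time is never
         * replenished and just keeps growing.
         */
        static void dl_charge_rt_bw(struct rq *rq, u64 delta_exec)
        {
                if (rt_bandwidth_enabled()) {
                        struct rt_rq *rt_rq = &rq->rt;

                        raw_spin_lock(&rt_rq->rt_runtime_lock);
                        rt_rq->rt_time += delta_exec;   /* never replenished */
                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
                }
        }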

I'm proposing to solve the problem the way shown below. It's almost untested,
because I skipped almost all of the recent patches which have to be applied from lkml.

Please say if I missed anything in the idea. Maybe it would be better to put
start_top_rt_bandwidth() into set_curr_task_dl()?

---
kernel/sched/deadline.c | 1 +
kernel/sched/rt.c | 16 +++++++++++++++-
kernel/sched/sched.h | 1 +
3 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ed31ef6..f1d2304 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -720,6 +720,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)

inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
+ start_top_rt_bandwidth(rq_of_dl_rq(dl_rq));
}

static inline
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 72f9ec7..a187eb8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -726,7 +726,7 @@ static inline int balance_runtime(struct rt_rq *rt_rq)

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
- int i, idle = 1, throttled = 0;
+ int i, idle = 1, throttled = 0, has_dl_tasks = 0;
const struct cpumask *span;

span = sched_rt_period_mask();
@@ -781,9 +781,15 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)

if (enqueue)
sched_rt_rq_enqueue(rt_rq);
+
+ if (rt_rq == &rq->rt && rq->dl.dl_nr_running)
+ has_dl_tasks = 1;
raw_spin_unlock(&rq->lock);
}

+ if (has_dl_tasks)
+ return 0;
+
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
return 1;

@@ -1005,6 +1011,10 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

+void start_top_rt_bandwidth(struct rq *rq)
+{
+ start_rt_bandwidth(&rq->rt.tg->rt_bandwidth);
+}
#else /* CONFIG_RT_GROUP_SCHED */

static void
@@ -1016,6 +1026,10 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

+void start_top_rt_bandwidth(struct rq *rq)
+{
+ start_rt_bandwidth(&def_rt_bandwidth);
+}
#endif /* CONFIG_RT_GROUP_SCHED */

static inline
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1bf34c2..a20f8f7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1294,6 +1294,7 @@ static inline void sched_avg_update(struct rq *rq) { }
#endif

extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+extern void start_top_rt_bandwidth(struct rq *rq);

#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT


2014-02-20 21:22:45

by Steven Rostedt

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity

Peter, Juri,

Have you seen this?

-- Steve


On Thu, 20 Feb 2014 02:16:00 +0400
Kirill Tkhai <[email protected]> wrote:

> Since deadline tasks share rt bandwidth, we must care about
> bandwidth timer set. Otherwise rt_time may grow up to infinity
> in update_curr_dl(), if there are no other available RT tasks
> on top level bandwidth.
>
> I'm going to decide the problem the way below. Almost untested
> because of I skipped almost all of recent patches which haveto be applied from lkml.
>
> Please say, if I skipped anything in idea. Maybe better put
> start_top_rt_bandwidth() into set_curr_task_dl()?
>
> ---
> kernel/sched/deadline.c | 1 +
> kernel/sched/rt.c | 16 +++++++++++++++-
> kernel/sched/sched.h | 1 +
> 3 files changed, 17 insertions(+), 1 deletion(-)
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index ed31ef6..f1d2304 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -720,6 +720,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>
> inc_dl_deadline(dl_rq, deadline);
> inc_dl_migration(dl_se, dl_rq);
> + start_top_rt_bandwidth(rq_of_dl_rq(dl_rq));
> }
>
> static inline
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 72f9ec7..a187eb8 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -726,7 +726,7 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
>
> static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
> {
> - int i, idle = 1, throttled = 0;
> + int i, idle = 1, throttled = 0, has_dl_tasks = 0;
> const struct cpumask *span;
>
> span = sched_rt_period_mask();
> @@ -781,9 +781,15 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
>
> if (enqueue)
> sched_rt_rq_enqueue(rt_rq);
> +
> + if (rt_rq == &rq->rt && rq->dl.dl_nr_running)
> + has_dl_tasks = 1;
> raw_spin_unlock(&rq->lock);
> }
>
> + if (has_dl_tasks)
> + return 0;
> +
> if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
> return 1;
>
> @@ -1005,6 +1011,10 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
> }
>
> +void start_top_rt_bandwidth(struct rq *rq)
> +{
> + start_rt_bandwidth(&rq->rt.tg->rt_bandwidth);
> +}
> #else /* CONFIG_RT_GROUP_SCHED */
>
> static void
> @@ -1016,6 +1026,10 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> static inline
> void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
>
> +void start_top_rt_bandwidth(struct rq *rq)
> +{
> + start_rt_bandwidth(&def_rt_bandwidth);
> +}
> #endif /* CONFIG_RT_GROUP_SCHED */
>
> static inline
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 1bf34c2..a20f8f7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1294,6 +1294,7 @@ static inline void sched_avg_update(struct rq *rq) { }
> #endif
>
> extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
> +extern void start_top_rt_bandwidth(struct rq *rq);
>
> #ifdef CONFIG_SMP
> #ifdef CONFIG_PREEMPT

2014-02-21 10:37:22

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity

On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
> Since deadline tasks share rt bandwidth, we must care about
> bandwidth timer set. Otherwise rt_time may grow up to infinity
> in update_curr_dl(), if there are no other available RT tasks
> on top level bandwidth.
>
> I'm going to decide the problem the way below. Almost untested
> because of I skipped almost all of recent patches which haveto be applied from lkml.
>
> Please say, if I skipped anything in idea. Maybe better put
> start_top_rt_bandwidth() into set_curr_task_dl()?

How about we only increment rt_time when there's an RT bandwidth timer
active?


---
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -568,6 +568,12 @@ static inline struct rt_bandwidth *sched

#endif /* CONFIG_RT_GROUP_SCHED */

+bool sched_rt_bandwidth_active(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+ return hrtimer_active(&rt_b->rt_period_timer);
+}
+
#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -587,6 +587,8 @@ int dl_runtime_exceeded(struct rq *rq, s
return 1;
}

+extern bool sched_rt_bandwidth_active(struct rt_rq *rt_rq);
+
/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
@@ -650,11 +652,13 @@ static void update_curr_dl(struct rq *rq
struct rt_rq *rt_rq = &rq->rt;

raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_time += delta_exec;
/*
* We'll let actual RT tasks worry about the overflow here, we
- * have our own CBS to keep us inline -- see above.
+ * have our own CBS to keep us inline; only account when RT
+ * bandwidth is relevant.
*/
+ if (sched_rt_bandwidth_active(rt_rq))
+ rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}

2014-02-21 11:39:13

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity



21.02.2014, 14:37, "Peter Zijlstra" <[email protected]>:
> On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
>
>> Since deadline tasks share rt bandwidth, we must care about
>> bandwidth timer set. Otherwise rt_time may grow up to infinity
>> in update_curr_dl(), if there are no other available RT tasks
>> on top level bandwidth.
>>
>> I'm going to decide the problem the way below. Almost untested
>> because of I skipped almost all of recent patches which haveto be applied from lkml.
>>
>> Please say, if I skipped anything in idea. Maybe better put
>> start_top_rt_bandwidth() into set_curr_task_dl()?
>
> How about we only increment rt_time when there's an RT bandwidth timer
> active?

In this case RT and DL may eat all the time:

-------------- time ------------------>
|RT's working |DL's working|
----------------------------
|rt_runtime   |            |
----------------------------
|        rt_period         |

Or at least more than is allowed.


It looks like, if we want to limit the execution time of the high priority
classes, we have to set the timer anyway.

> ---
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -568,6 +568,12 @@ static inline struct rt_bandwidth *sched

[snipped by dumb mail client]

2014-02-21 12:09:33

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity



21.02.2014, 15:39, "Kirill Tkhai" <[email protected]>:
> 21.02.2014, 14:37, "Peter Zijlstra" <[email protected]>:
>
>> On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
>>> Since deadline tasks share rt bandwidth, we must care about
>>> bandwidth timer set. Otherwise rt_time may grow up to infinity
>>> in update_curr_dl(), if there are no other available RT tasks
>>> on top level bandwidth.
>>>
>>> I'm going to decide the problem the way below. Almost untested
>>> because of I skipped almost all of recent patches which haveto be applied from lkml.
>>>
>>> Please say, if I skipped anything in idea. Maybe better put
>>> start_top_rt_bandwidth() into set_curr_task_dl()?
>> How about we only increment rt_time when there's an RT bandwidth timer
>> active?
>
> This case RT and DL may eat all the time:
>
> -------------- time ------------------>
> |RT's working |DL's working|
> ----------------------------
> |rt_runtime   |            |
> ----------------------------
> |        rt_period         |
>
> Or at least more, than it's allowed.
>
> It looks like, if we want to limit time of high priority classes
> execution, we have to set the timer anyway.

Oh, the above is confusing. Sorry.

I mean one RT task and:

----------------- time -------------------------->
|DL's working    |RT's working| |DL's working    |
--------------------------------------------------
|                |rt_runtime  | |                |
--------------------------------------------------
|                |  rt_period   |                |

In this case FAIR receives a smaller share than (rt_period-rt_runtime)/rt_period.
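
To put numbers on it: with, say, rt_runtime = 40ms and rt_period = 100ms,
fair tasks should be left at least 60ms out of every 100ms; in the pattern
above DL can run right before and right after the RT slice, so over one
period the fair class can get less than that.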

>> ---
>> --- a/kernel/sched/rt.c
>> +++ b/kernel/sched/rt.c
>> @@ -568,6 +568,12 @@ static inline struct rt_bandwidth *sched
>
> [snipped by dumb mail client]

2014-02-21 12:44:26

by Juri Lelli

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity

On Fri, 21 Feb 2014 16:09:25 +0400
Kirill Tkhai <[email protected]> wrote:

>
>
> 21.02.2014, 15:39, "Kirill Tkhai" <[email protected]>:
> > 21.02.2014, 14:37, "Peter Zijlstra" <[email protected]>:
> >
> >> On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
> >>> Since deadline tasks share rt bandwidth, we must care about
> >>> bandwidth timer set. Otherwise rt_time may grow up to infinity
> >>> in update_curr_dl(), if there are no other available RT tasks
> >>> on top level bandwidth.
> >>>
> >>> I'm going to decide the problem the way below. Almost untested
> >>> because of I skipped almost all of recent patches which haveto be applied from lkml.
> >>>
> >>> Please say, if I skipped anything in idea. Maybe better put
> >>> start_top_rt_bandwidth() into set_curr_task_dl()?
> >> How about we only increment rt_time when there's an RT bandwidth timer
> >> active?
> >
> > This case RT and DL may eat all the time:
> >
> > -------------- time ------------------>
> > |RT's working |DL's working|
> > ----------------------------
> > |rt_runtime   |            |
> > ----------------------------
> > |        rt_period         |
> >
> > Or at least more, than it's allowed.
> >
> > It looks like, if we want to limit time of high priority classes
> > execution, we have to set the timer anyway.
>
> Oh, above is confusing. Sorry.
>
> I mean one RT task and
>
> ----------------- time -------------------------->
> |DL's working    |RT's working| |DL's working    |
> --------------------------------------------------
> |                |rt_runtime  | |                |
> --------------------------------------------------
> |                |  rt_period   |                |
>
> In this case FAIR receives less ratio, than (rt_period-rt_runtime)/rt_period.
>

DL tasks won't be allowed to run in this situation, as their bw exceeds
rt_runtime/rt_period.

Thanks,

- Juri

2014-02-21 14:25:12

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity



21.02.2014, 16:44, "Juri Lelli" <[email protected]>:
> On Fri, 21 Feb 2014 16:09:25 +0400
> Kirill Tkhai <[email protected]> wrote:
>
>> 21.02.2014, 15:39, "Kirill Tkhai" <[email protected]>:
>>> 21.02.2014, 14:37, "Peter Zijlstra" <[email protected]>:
>>>> On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
>>>>> Since deadline tasks share rt bandwidth, we must care about
>>>>> bandwidth timer set. Otherwise rt_time may grow up to infinity
>>>>> in update_curr_dl(), if there are no other available RT tasks
>>>>> on top level bandwidth.
>>>>>
>>>>> I'm going to decide the problem the way below. Almost untested
>>>>> because of I skipped almost all of recent patches which haveto be applied from lkml.
>>>>>
>>>>> Please say, if I skipped anything in idea. Maybe better put
>>>>> start_top_rt_bandwidth() into set_curr_task_dl()?
>>>> How about we only increment rt_time when there's an RT bandwidth timer
>>>> active?
>>> This case RT and DL may eat all the time:
>>>
>>> -------------- time ------------------>
>>> |RT's working |DL's working|
>>> ----------------------------
>>> |rt_runtime   |            |
>>> ----------------------------
>>> |        rt_period         |
>>>
>>> Or at least more, than it's allowed.
>>>
>>> It looks like, if we want to limit time of high priority classes
>>> execution, we have to set the timer anyway.
>> Oh, above is confusing. Sorry.
>>
>> I mean one RT task and
>>
>> ----------------- time -------------------------->
>> |DL's working    |RT's working| |DL's working    |
>> --------------------------------------------------
>> |                |rt_runtime  | |                |
>> --------------------------------------------------
>> |                |  rt_period   |                |
>>
>> In this case FAIR receives less ratio, than (rt_period-rt_runtime)/rt_period.
>
> DL tasks won't be allowed to run in this situation, as their bw exceedes
> rt_runtime/rt_period.

Maybe I don't understand. Where does DL control the summary (DL+RT) runtime?

RT does not do this either. But it looks like RT has to do this.

Kirill

2014-02-21 16:36:29

by Juri Lelli

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity

On Fri, 21 Feb 2014 11:37:15 +0100
Peter Zijlstra <[email protected]> wrote:

> On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
> > Since deadline tasks share rt bandwidth, we must care about
> > bandwidth timer set. Otherwise rt_time may grow up to infinity
> > in update_curr_dl(), if there are no other available RT tasks
> > on top level bandwidth.
> >
> > I'm going to decide the problem the way below. Almost untested
> > because of I skipped almost all of recent patches which haveto be applied from lkml.
> >
> > Please say, if I skipped anything in idea. Maybe better put
> > start_top_rt_bandwidth() into set_curr_task_dl()?
>
> How about we only increment rt_time when there's an RT bandwidth timer
> active?
>
>
> ---
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -568,6 +568,12 @@ static inline struct rt_bandwidth *sched
>
> #endif /* CONFIG_RT_GROUP_SCHED */
>
> +bool sched_rt_bandwidth_active(struct rt_rq *rt_rq)
> +{
> + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
> + return hrtimer_active(&rt_b->rt_period_timer);
> +}
> +
> #ifdef CONFIG_SMP
> /*
> * We ran out of runtime, see if we can borrow some from our neighbours.
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -587,6 +587,8 @@ int dl_runtime_exceeded(struct rq *rq, s
> return 1;
> }
>
> +extern bool sched_rt_bandwidth_active(struct rt_rq *rt_rq);
> +
> /*
> * Update the current task's runtime statistics (provided it is still
> * a -deadline task and has not been removed from the dl_rq).
> @@ -650,11 +652,13 @@ static void update_curr_dl(struct rq *rq
> struct rt_rq *rt_rq = &rq->rt;
>
> raw_spin_lock(&rt_rq->rt_runtime_lock);
> - rt_rq->rt_time += delta_exec;
> /*
> * We'll let actual RT tasks worry about the overflow here, we
> - * have our own CBS to keep us inline -- see above.
> + * have our own CBS to keep us inline; only account when RT
> + * bandwidth is relevant.
> */
> + if (sched_rt_bandwidth_active(rt_rq))
> + rt_rq->rt_time += delta_exec;
> raw_spin_unlock(&rt_rq->rt_runtime_lock);
> }
> }

So, I ran some tests with the above and I'd like to share with you what
I've found. You can find here a trace-cmd trace that should be fed
to kernelshark to be able to understand what follows (or feel free to
reproduce the same scenario :)):
http://retis.sssup.it/~jlelli/traces/trace_rt_time.dat

Here you have a DL task (4/10) and a while(1) RT task, both running
inside an rt_bw of 0.5. The RT task is activated 500ms after DL. As I
filtered on sched_rt_period_timer(), you can search for the time instants
when the rt_bw is replenished. It is evident that the first time after
the rt timer is activated back (search for start_bandwidth_timer), we can
eat some bw from FAIR tasks (if any). This is due to the fact that we
reset the rt_bw budget at this time, start decrementing rt_time for both DL
and RT tasks, and throttle RT tasks when rt_time > runtime, but, since DL
tasks actually execute inside their own server, they don't care about
rt_bw. The good news is that the steady state is ok: keeping track of overruns,
we are able to stop eating bw from the other guys.
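
In case anyone wants to reproduce this without the trace, a minimal sketch
of the DL side of the test could look like the program below. It is not the
actual test I used; it assumes the x86_64 syscall number for sched_setattr()
(no glibc wrapper yet) and reads the 4/10 reservation as 40ms every 100ms.
The RT side is just a while(1) SCHED_FIFO task, and the 0.5 rt_bw comes from
/proc/sys/kernel/sched_rt_runtime_us and sched_rt_period_us.

        /* dl_task.c: toy SCHED_DEADLINE task, 40ms every 100ms (sketch). */
        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/syscall.h>

        #ifndef __NR_sched_setattr
        #define __NR_sched_setattr 314          /* x86_64 */
        #endif
        #define SCHED_DEADLINE 6

        struct sched_attr {
                uint32_t size;
                uint32_t sched_policy;
                uint64_t sched_flags;
                int32_t  sched_nice;
                uint32_t sched_priority;
                uint64_t sched_runtime;         /* ns */
                uint64_t sched_deadline;        /* ns */
                uint64_t sched_period;          /* ns */
        };

        int main(void)
        {
                struct sched_attr attr;
                volatile unsigned long i;

                memset(&attr, 0, sizeof(attr));
                attr.size = sizeof(attr);
                attr.sched_policy   = SCHED_DEADLINE;
                attr.sched_runtime  = 40ULL * 1000 * 1000;      /*  40 ms */
                attr.sched_deadline = 100ULL * 1000 * 1000;     /* 100 ms */
                attr.sched_period   = 100ULL * 1000 * 1000;     /* 100 ms */

                if (syscall(__NR_sched_setattr, 0, &attr, 0)) {
                        perror("sched_setattr");
                        return 1;
                }

                for (;;) {
                        /* burn some CPU (calibrate for your machine),
                         * then sleep until roughly the next period */
                        for (i = 0; i < 10000000UL; i++)
                                ;
                        usleep(60 * 1000);
                }
        }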

My thoughts:

- Peter's patch is an easy fix to Kirill's problem (RT tasks were
throttled too early);
- something to add to this solution could be to pre-calculate the bw of
ready DL tasks and subtract it from rt_bw at replenishment time, but
it sounds quite awkward, pessimistic, and I'm not sure it is gonna
work;
- we are stealing bw from best-effort tasks, and just at the beginning
of the transition, is it really a problem?
- I mean, if you want guarantees make your tasks DL! :);
- in the long run we are gonna have RT tasks scheduled inside CBS
servers, and all this will be properly fixed up.

Comments?

BTW, rt timer activation/deactivation should probably be fixed for
!RT_GROUP_SCHED with something like this:

---
kernel/sched/rt.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6161de8..274f992 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -86,12 +86,12 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

-#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
hrtimer_cancel(&rt_b->rt_period_timer);
}

+#ifdef CONFIG_RT_GROUP_SCHED
#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
@@ -1017,8 +1017,12 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
start_rt_bandwidth(&def_rt_bandwidth);
}

-static inline
-void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+static void
+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (!rt_rq->rt_nr_running)
+ destroy_rt_bandwidth(&def_rt_bandwidth);
+}

#endif /* CONFIG_RT_GROUP_SCHED */

--
1.7.9.5

2014-02-21 16:52:54

by Juri Lelli

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity

On Fri, 21 Feb 2014 17:36:41 +0100
Juri Lelli <[email protected]> wrote:

> On Fri, 21 Feb 2014 11:37:15 +0100
> Peter Zijlstra <[email protected]> wrote:
>
> > On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
> > > Since deadline tasks share rt bandwidth, we must care about
> > > bandwidth timer set. Otherwise rt_time may grow up to infinity
> > > in update_curr_dl(), if there are no other available RT tasks
> > > on top level bandwidth.
> > >
> > > I'm going to decide the problem the way below. Almost untested
> > > because of I skipped almost all of recent patches which haveto be applied from lkml.
> > >
> > > Please say, if I skipped anything in idea. Maybe better put
> > > start_top_rt_bandwidth() into set_curr_task_dl()?
> >
> > How about we only increment rt_time when there's an RT bandwidth timer
> > active?
> >
> >
> > ---
> > --- a/kernel/sched/rt.c
> > +++ b/kernel/sched/rt.c
> > @@ -568,6 +568,12 @@ static inline struct rt_bandwidth *sched
> >
> > #endif /* CONFIG_RT_GROUP_SCHED */
> >
> > +bool sched_rt_bandwidth_active(struct rt_rq *rt_rq)
> > +{
> > + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
> > + return hrtimer_active(&rt_b->rt_period_timer);
> > +}
> > +
> > #ifdef CONFIG_SMP
> > /*
> > * We ran out of runtime, see if we can borrow some from our neighbours.
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -587,6 +587,8 @@ int dl_runtime_exceeded(struct rq *rq, s
> > return 1;
> > }
> >
> > +extern bool sched_rt_bandwidth_active(struct rt_rq *rt_rq);
> > +
> > /*
> > * Update the current task's runtime statistics (provided it is still
> > * a -deadline task and has not been removed from the dl_rq).
> > @@ -650,11 +652,13 @@ static void update_curr_dl(struct rq *rq
> > struct rt_rq *rt_rq = &rq->rt;
> >
> > raw_spin_lock(&rt_rq->rt_runtime_lock);
> > - rt_rq->rt_time += delta_exec;
> > /*
> > * We'll let actual RT tasks worry about the overflow here, we
> > - * have our own CBS to keep us inline -- see above.
> > + * have our own CBS to keep us inline; only account when RT
> > + * bandwidth is relevant.
> > */
> > + if (sched_rt_bandwidth_active(rt_rq))
> > + rt_rq->rt_time += delta_exec;
> > raw_spin_unlock(&rt_rq->rt_runtime_lock);
> > }
> > }
>
> So, I ran some tests with the above and I'd like to share with you what
> I've found. You can find here a trace-cmd trace that should be feeded
> to kernelshark to be able to understand what follows (or feel free to
> reproduce same scenario :)):
> http://retis.sssup.it/~jlelli/traces/trace_rt_time.dat
>
> Here you have a DL task (4/10) and a while(1) RT task, both running
> inside a rt_bw of 0.5. RT tasks is activated 500ms after DL. As I
> filtered in sched_rt_period_timer(), you can search for time instants
> when the rt_bw is replenished. It is evident that the first time after
> rt timer is activated back (search for start_bandwidth_timer), we can
> eat some bw to FAIR tasks (if any). This is due to the fact that we
> reset rt_bw budget at this time, start decrementing rt_time for both DL

The reset happens when rt_bw replenishment timer fires, after a bit:

sched_rt_period_timer <-- __run_hrtimer

Apologies,

- Juri

> and RT tasks, throttle RT tasks when rt_time > runtime, but, since DL
> tasks acually executes inside their own server, they don't care about
> rt_bw. Good news is that steady state is ok: keeping track of overruns
> we are able to stop eating bw to other guys.
>
> My thougths:
>
> - Peter's patch is an easy fix to Kirill's problem (RT tasks were
> throttled too early);
> - something to add to this solution could be to pre-calculate bw of
> ready DL tasks and subtract it to rt_bw at replenishment time, but
> it sounds quite awkward, pessimistic, and I'm not sure it is gonna
> work;
> - we are stealing bw to best-effort tasks, and just at the beginning
> of the transistion, is it really a problem?
> - I mean, if you want guarantees make your tasks DL! :);
> - in the long run we are gonna have RT tasks scheduled inside CBS
> servers, and all this will be properly fixed up.
>
> Comments?
>
> BTW, rt timer activation/deactivation should probably be fixed for
> !RT_GROUP_SCHED with something like this:
>
> ---
> kernel/sched/rt.c | 10 +++++++---
> 1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 6161de8..274f992 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -86,12 +86,12 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
> raw_spin_lock_init(&rt_rq->rt_runtime_lock);
> }
>
> -#ifdef CONFIG_RT_GROUP_SCHED
> static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
> {
> hrtimer_cancel(&rt_b->rt_period_timer);
> }
>
> +#ifdef CONFIG_RT_GROUP_SCHED
> #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
>
> static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
> @@ -1017,8 +1017,12 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> start_rt_bandwidth(&def_rt_bandwidth);
> }
>
> -static inline
> -void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
> +static void
> +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> +{
> + if (!rt_rq->rt_nr_running)
> + destroy_rt_bandwidth(&def_rt_bandwidth);
> +}
>
> #endif /* CONFIG_RT_GROUP_SCHED */
>

2014-02-21 23:50:38

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity



21.02.2014, 20:52, "Juri Lelli" <[email protected]>:
> On Fri, 21 Feb 2014 17:36:41 +0100
> Juri Lelli <[email protected]> wrote:
>
>> ?On Fri, 21 Feb 2014 11:37:15 +0100
>> ?Peter Zijlstra <[email protected]> wrote:
>>> ?On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
>>>> ?Since deadline tasks share rt bandwidth, we must care about
>>>> ?bandwidth timer set. Otherwise rt_time may grow up to infinity
>>>> ?in update_curr_dl(), if there are no other available RT tasks
>>>> ?on top level bandwidth.
>>>>
>>>> ?I'm going to decide the problem the way below. Almost untested
>>>> ?because of I skipped almost all of recent patches which haveto be applied from lkml.
>>>>
>>>> ?Please say, if I skipped anything in idea. Maybe better put
>>>> ?start_top_rt_bandwidth() into set_curr_task_dl()?
>>> ?How about we only increment rt_time when there's an RT bandwidth timer
>>> ?active?
>>>
>>> ?---
>>> ?--- a/kernel/sched/rt.c
>>> ?+++ b/kernel/sched/rt.c
>>> ?@@ -568,6 +568,12 @@ static inline struct rt_bandwidth *sched
>>>
>>> ??#endif /* CONFIG_RT_GROUP_SCHED */
>>>
>>> ?+bool sched_rt_bandwidth_active(struct rt_rq *rt_rq)
>>> ?+{
>>> ?+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
>>> ?+ return hrtimer_active(&rt_b->rt_period_timer);
>>> ?+}
>>> ?+
>>> ??#ifdef CONFIG_SMP
>>> ??/*
>>> ???* We ran out of runtime, see if we can borrow some from our neighbours.
>>> ?--- a/kernel/sched/deadline.c
>>> ?+++ b/kernel/sched/deadline.c
>>> ?@@ -587,6 +587,8 @@ int dl_runtime_exceeded(struct rq *rq, s
>>> ??????????return 1;
>>> ??}
>>>
>>> ?+extern bool sched_rt_bandwidth_active(struct rt_rq *rt_rq);
>>> ?+
>>> ??/*
>>> ???* Update the current task's runtime statistics (provided it is still
>>> ???* a -deadline task and has not been removed from the dl_rq).
>>> ?@@ -650,11 +652,13 @@ static void update_curr_dl(struct rq *rq
>>> ??????????????????struct rt_rq *rt_rq = &rq->rt;
>>>
>>> ??????????????????raw_spin_lock(&rt_rq->rt_runtime_lock);
>>> ?- rt_rq->rt_time += delta_exec;
>>> ??????????????????/*
>>> ???????????????????* We'll let actual RT tasks worry about the overflow here, we
>>> ?- * have our own CBS to keep us inline -- see above.
>>> ?+ * have our own CBS to keep us inline; only account when RT
>>> ?+ * bandwidth is relevant.
>>> ???????????????????*/
>>> ?+ if (sched_rt_bandwidth_active(rt_rq))
>>> ?+ rt_rq->rt_time += delta_exec;
>>> ??????????????????raw_spin_unlock(&rt_rq->rt_runtime_lock);
>>> ??????????}
>>> ??}
>> ?So, I ran some tests with the above and I'd like to share with you what
>> ?I've found. You can find here a trace-cmd trace that should be feeded
>> ?to kernelshark to be able to understand what follows (or feel free to
>> ?reproduce same scenario :)):
>> ?http://retis.sssup.it/~jlelli/traces/trace_rt_time.dat
>>
>> ?Here you have a DL task (4/10) and a while(1) RT task, both running
>> ?inside a rt_bw of 0.5. RT tasks is activated 500ms after DL. As I
>> ?filtered in sched_rt_period_timer(), you can search for time instants
>> ?when the rt_bw is replenished. It is evident that the first time after
>> ?rt timer is activated back (search for start_bandwidth_timer), we can
>> ?eat some bw to FAIR tasks (if any). This is due to the fact that we
>> ?reset rt_bw budget at this time, start decrementing rt_time for both DL
>
> The reset happens when rt_bw replenishment timer fires, after a bit:
>
> ?sched_rt_period_timer <-- __run_hrtimer

Juri, sorry, I forgot to write that I mean the situation when only one task
is on_rq at any given moment:

DL, RT, DL, RT, ...

rt_runtime = n;
rt_period = 2n;

| DL's working, RT's sleeping  | RT's working, DL's sleeping  | all sleep                 |
|------------------------------|------------------------------|---------------------------|
| (1)     duration = n         | (2)     duration = n         | (3)     duration = n      | (repeat)
|------------------------------|------------------------------|---------------------------|
| (rt_bw timer is not running) | (rt_bw timer is running)                                 |


According to the patch, the rt_bw timer is working only if we have a queued
RT task.

In the case above, part (1) has no queued RT tasks, so the timer is not working.
rt_time is not being increased either.

We have a ratio of 2/3.
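
Spelling out the arithmetic: over each window of length 3n the DL task runs
for n during (1) with the timer down (so, with the patch, nothing is charged
to rt_time), the RT task runs for n during (2) with the timer armed, and
everything sleeps for n during (3). RT+DL therefore consume 2n out of every
3n, i.e. 2/3 of the CPU, although rt_runtime/rt_period = n/2n = 1/2 was
meant to be the cap.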

Thanks,
Kirill

>
> Apologies,
>
> - Juri
>
>> ?and RT tasks, throttle RT tasks when rt_time > runtime, but, since DL
>> ?tasks acually executes inside their own server, they don't care about
>> ?rt_bw. Good news is that steady state is ok: keeping track of overruns
>> ?we are able to stop eating bw to other guys.
>>
>> ?My thougths:
>>
>> ??- Peter's patch is an easy fix to Kirill's problem (RT tasks were
>> ????throttled too early);
>> ??- something to add to this solution could be to pre-calculate bw of
>> ????ready DL tasks and subtract it to rt_bw at replenishment time, but
>> ????it sounds quite awkward, pessimistic, and I'm not sure it is gonna
>> ????work;
>> ??- we are stealing bw to best-effort tasks, and just at the beginning
>> ????of the transistion, is it really a problem?
>> ??- I mean, if you want guarantees make your tasks DL! :);
>> ??- in the long run we are gonna have RT tasks scheduled inside CBS
>> ????servers, and all this will be properly fixed up.
>>
>> ?Comments?
>>
>> ?BTW, rt timer activation/deactivation should probably be fixed for
>> ?!RT_GROUP_SCHED with something like this:
>>
>> ?---
>> ??kernel/sched/rt.c | ??10 +++++++---
>> ??1 file changed, 7 insertions(+), 3 deletions(-)
>>
>> ?diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
>> ?index 6161de8..274f992 100644
>> ?--- a/kernel/sched/rt.c
>> ?+++ b/kernel/sched/rt.c
>> ?@@ -86,12 +86,12 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
>> ??????????raw_spin_lock_init(&rt_rq->rt_runtime_lock);
>> ??}
>>
>> ?-#ifdef CONFIG_RT_GROUP_SCHED
>> ??static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
>> ??{
>> ??????????hrtimer_cancel(&rt_b->rt_period_timer);
>> ??}
>>
>> ?+#ifdef CONFIG_RT_GROUP_SCHED
>> ??#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
>>
>> ??static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
>> ?@@ -1017,8 +1017,12 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>> ??????????start_rt_bandwidth(&def_rt_bandwidth);
>> ??}
>>
>> ?-static inline
>> ?-void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
>> ?+static void
>> ?+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>> ?+{
>> ?+ if (!rt_rq->rt_nr_running)
>> ?+ destroy_rt_bandwidth(&def_rt_bandwidth);
>> ?+}
>>
>> ??#endif /* CONFIG_RT_GROUP_SCHED */

2014-02-22 00:59:42

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity

On 21.02.2014 20:36, Juri Lelli wrote:
> On Fri, 21 Feb 2014 11:37:15 +0100
> Peter Zijlstra <[email protected]> wrote:
>
>> On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
>>> Since deadline tasks share rt bandwidth, we must care about
>>> bandwidth timer set. Otherwise rt_time may grow up to infinity
>>> in update_curr_dl(), if there are no other available RT tasks
>>> on top level bandwidth.
>>>
>>> I'm going to decide the problem the way below. Almost untested
>>> because of I skipped almost all of recent patches which haveto be applied from lkml.
>>>
>>> Please say, if I skipped anything in idea. Maybe better put
>>> start_top_rt_bandwidth() into set_curr_task_dl()?
>>
>> How about we only increment rt_time when there's an RT bandwidth timer
>> active?
>>
>>
>> ---
>> --- a/kernel/sched/rt.c
>> +++ b/kernel/sched/rt.c
>> @@ -568,6 +568,12 @@ static inline struct rt_bandwidth *sched
>>
>> #endif /* CONFIG_RT_GROUP_SCHED */
>>
>> +bool sched_rt_bandwidth_active(struct rt_rq *rt_rq)
>> +{
>> + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
>> + return hrtimer_active(&rt_b->rt_period_timer);
>> +}
>> +
>> #ifdef CONFIG_SMP
>> /*
>> * We ran out of runtime, see if we can borrow some from our neighbours.
>> --- a/kernel/sched/deadline.c
>> +++ b/kernel/sched/deadline.c
>> @@ -587,6 +587,8 @@ int dl_runtime_exceeded(struct rq *rq, s
>> return 1;
>> }
>>
>> +extern bool sched_rt_bandwidth_active(struct rt_rq *rt_rq);
>> +
>> /*
>> * Update the current task's runtime statistics (provided it is still
>> * a -deadline task and has not been removed from the dl_rq).
>> @@ -650,11 +652,13 @@ static void update_curr_dl(struct rq *rq
>> struct rt_rq *rt_rq = &rq->rt;
>>
>> raw_spin_lock(&rt_rq->rt_runtime_lock);
>> - rt_rq->rt_time += delta_exec;
>> /*
>> * We'll let actual RT tasks worry about the overflow here, we
>> - * have our own CBS to keep us inline -- see above.
>> + * have our own CBS to keep us inline; only account when RT
>> + * bandwidth is relevant.
>> */
>> + if (sched_rt_bandwidth_active(rt_rq))
>> + rt_rq->rt_time += delta_exec;
>> raw_spin_unlock(&rt_rq->rt_runtime_lock);
>> }
>> }
>
> So, I ran some tests with the above and I'd like to share with you what
> I've found. You can find here a trace-cmd trace that should be feeded
> to kernelshark to be able to understand what follows (or feel free to
> reproduce same scenario :)):
> http://retis.sssup.it/~jlelli/traces/trace_rt_time.dat
>
> Here you have a DL task (4/10) and a while(1) RT task, both running
> inside a rt_bw of 0.5. RT tasks is activated 500ms after DL. As I
> filtered in sched_rt_period_timer(), you can search for time instants
> when the rt_bw is replenished. It is evident that the first time after
> rt timer is activated back (search for start_bandwidth_timer), we can
> eat some bw to FAIR tasks (if any). This is due to the fact that we
> reset rt_bw budget at this time, start decrementing rt_time for both DL
> and RT tasks, throttle RT tasks when rt_time > runtime, but, since DL
> tasks acually executes inside their own server, they don't care about
> rt_bw. Good news is that steady state is ok: keeping track of overruns
> we are able to stop eating bw to other guys.
>
> My thougths:
>
> - Peter's patch is an easy fix to Kirill's problem (RT tasks were
> throttled too early);
> - something to add to this solution could be to pre-calculate bw of
> ready DL tasks and subtract it to rt_bw at replenishment time, but
> it sounds quite awkward, pessimistic, and I'm not sure it is gonna
> work;
> - we are stealing bw to best-effort tasks, and just at the beginning
> of the transistion, is it really a problem?
> - I mean, if you want guarantees make your tasks DL! :);
> - in the long run we are gonna have RT tasks scheduled inside CBS
> servers, and all this will be properly fixed up.
>
> Comments?
>
> BTW, rt timer activation/deactivation should probably be fixed for
> !RT_GROUP_SCHED with something like this:
>
> ---
> kernel/sched/rt.c | 10 +++++++---
> 1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 6161de8..274f992 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -86,12 +86,12 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
> raw_spin_lock_init(&rt_rq->rt_runtime_lock);
> }
>
> -#ifdef CONFIG_RT_GROUP_SCHED
> static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
> {
> hrtimer_cancel(&rt_b->rt_period_timer);
> }
>
> +#ifdef CONFIG_RT_GROUP_SCHED
> #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
>
> static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
> @@ -1017,8 +1017,12 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> start_rt_bandwidth(&def_rt_bandwidth);
> }
>
> -static inline
> -void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
> +static void
> +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> +{
> + if (!rt_rq->rt_nr_running)
> + destroy_rt_bandwidth(&def_rt_bandwidth);
> +}
>
> #endif /* CONFIG_RT_GROUP_SCHED */
>

It looks like, with both patches applied, we may get into a situation when
all CPU time is shared between RT and DL tasks:

rt_runtime = n
rt_period = 2n

| RT working, DL sleeping  | DL working, RT sleeping      |
-----------------------------------------------------------
| (1)    duration = n      | (2)    duration = n          | (repeat)
|--------------------------|------------------------------|
| (rt_bw timer is running) | (rt_bw timer is not running) |

No time for fair tasks at all.
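
Spelled out: the RT task runs for n during (1) while the timer is armed; the
DL task runs for n during (2) while the timer is down, so (with both patches
applied) nothing gets charged to rt_time; over every 2n window RT plus DL
consume the whole CPU even though rt_runtime/rt_period is only 1/2, and fair
tasks get nothing.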

Kirill

2014-02-25 14:15:01

by Juri Lelli

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity

On Sat, 22 Feb 2014 04:56:59 +0400
Kirill Tkhai <[email protected]> wrote:

> On 21.02.2014 20:36, Juri Lelli wrote:
> > On Fri, 21 Feb 2014 11:37:15 +0100
> > Peter Zijlstra <[email protected]> wrote:
> >
> >> On Thu, Feb 20, 2014 at 02:16:00AM +0400, Kirill Tkhai wrote:
> >>> Since deadline tasks share rt bandwidth, we must care about
> >>> bandwidth timer set. Otherwise rt_time may grow up to infinity
> >>> in update_curr_dl(), if there are no other available RT tasks
> >>> on top level bandwidth.
> >>>
> >>> I'm going to decide the problem the way below. Almost untested
> >>> because of I skipped almost all of recent patches which haveto be applied from lkml.
> >>>
> >>> Please say, if I skipped anything in idea. Maybe better put
> >>> start_top_rt_bandwidth() into set_curr_task_dl()?
> >>
> >> How about we only increment rt_time when there's an RT bandwidth timer
> >> active?
> >>
> >>
> >> ---
> >> --- a/kernel/sched/rt.c
> >> +++ b/kernel/sched/rt.c
> >> @@ -568,6 +568,12 @@ static inline struct rt_bandwidth *sched
> >>
> >> #endif /* CONFIG_RT_GROUP_SCHED */
> >>
> >> +bool sched_rt_bandwidth_active(struct rt_rq *rt_rq)
> >> +{
> >> + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
> >> + return hrtimer_active(&rt_b->rt_period_timer);
> >> +}
> >> +
> >> #ifdef CONFIG_SMP
> >> /*
> >> * We ran out of runtime, see if we can borrow some from our neighbours.
> >> --- a/kernel/sched/deadline.c
> >> +++ b/kernel/sched/deadline.c
> >> @@ -587,6 +587,8 @@ int dl_runtime_exceeded(struct rq *rq, s
> >> return 1;
> >> }
> >>
> >> +extern bool sched_rt_bandwidth_active(struct rt_rq *rt_rq);
> >> +
> >> /*
> >> * Update the current task's runtime statistics (provided it is still
> >> * a -deadline task and has not been removed from the dl_rq).
> >> @@ -650,11 +652,13 @@ static void update_curr_dl(struct rq *rq
> >> struct rt_rq *rt_rq = &rq->rt;
> >>
> >> raw_spin_lock(&rt_rq->rt_runtime_lock);
> >> - rt_rq->rt_time += delta_exec;
> >> /*
> >> * We'll let actual RT tasks worry about the overflow here, we
> >> - * have our own CBS to keep us inline -- see above.
> >> + * have our own CBS to keep us inline; only account when RT
> >> + * bandwidth is relevant.
> >> */
> >> + if (sched_rt_bandwidth_active(rt_rq))
> >> + rt_rq->rt_time += delta_exec;
> >> raw_spin_unlock(&rt_rq->rt_runtime_lock);
> >> }
> >> }
> >
> > So, I ran some tests with the above and I'd like to share with you what
> > I've found. You can find here a trace-cmd trace that should be feeded
> > to kernelshark to be able to understand what follows (or feel free to
> > reproduce same scenario :)):
> > http://retis.sssup.it/~jlelli/traces/trace_rt_time.dat
> >
> > Here you have a DL task (4/10) and a while(1) RT task, both running
> > inside a rt_bw of 0.5. RT tasks is activated 500ms after DL. As I
> > filtered in sched_rt_period_timer(), you can search for time instants
> > when the rt_bw is replenished. It is evident that the first time after
> > rt timer is activated back (search for start_bandwidth_timer), we can
> > eat some bw to FAIR tasks (if any). This is due to the fact that we
> > reset rt_bw budget at this time, start decrementing rt_time for both DL
> > and RT tasks, throttle RT tasks when rt_time > runtime, but, since DL
> > tasks acually executes inside their own server, they don't care about
> > rt_bw. Good news is that steady state is ok: keeping track of overruns
> > we are able to stop eating bw to other guys.
> >
> > My thougths:
> >
> > - Peter's patch is an easy fix to Kirill's problem (RT tasks were
> > throttled too early);
> > - something to add to this solution could be to pre-calculate bw of
> > ready DL tasks and subtract it to rt_bw at replenishment time, but
> > it sounds quite awkward, pessimistic, and I'm not sure it is gonna
> > work;
> > - we are stealing bw to best-effort tasks, and just at the beginning
> > of the transistion, is it really a problem?
> > - I mean, if you want guarantees make your tasks DL! :);
> > - in the long run we are gonna have RT tasks scheduled inside CBS
> > servers, and all this will be properly fixed up.
> >
> > Comments?
> >
> > BTW, rt timer activation/deactivation should probably be fixed for
> > !RT_GROUP_SCHED with something like this:
> >
> > ---
> > kernel/sched/rt.c | 10 +++++++---
> > 1 file changed, 7 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> > index 6161de8..274f992 100644
> > --- a/kernel/sched/rt.c
> > +++ b/kernel/sched/rt.c
> > @@ -86,12 +86,12 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
> > raw_spin_lock_init(&rt_rq->rt_runtime_lock);
> > }
> >
> > -#ifdef CONFIG_RT_GROUP_SCHED
> > static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
> > {
> > hrtimer_cancel(&rt_b->rt_period_timer);
> > }
> >
> > +#ifdef CONFIG_RT_GROUP_SCHED
> > #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
> >
> > static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
> > @@ -1017,8 +1017,12 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> > start_rt_bandwidth(&def_rt_bandwidth);
> > }
> >
> > -static inline
> > -void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
> > +static void
> > +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> > +{
> > + if (!rt_rq->rt_nr_running)
> > + destroy_rt_bandwidth(&def_rt_bandwidth);
> > +}
> >
> > #endif /* CONFIG_RT_GROUP_SCHED */
> >
>
> It looks with both patches applied, we may get into a situation,
> when all CPU time is shared between RT and DL tasks:
>
> rt_runtime = n
> rt_period = 2n
>
> | RT working, DL sleeping | DL working, RT sleeping |
> -----------------------------------------------------------
> | (1) duration = n | (2) duration = n | (repeat)
> |--------------------------|------------------------------|
> | (rt_bw timer is running) | (rt_bw timer is not running) |
>
> No time for fair tasks at all.

Ok, this situation is pathological. DL bandwidth is guaranteed at
admission control, while RT isn't. In this case RT tasks are doomed by
construction. Still you'd like to let FAIR tasks execute :).
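
For context, the DL admission test boils down to something like the sketch
below. This is an illustrative fragment, not the actual kernel function; in
the real code the check sits in the __sched_setscheduler() path:

        /*
         * Sketch: a new deadline task is admitted only if the sum of
         * runtime_i/period_i over all admitted DL tasks stays within the
         * allowed DL bandwidth.  RT tasks get no such per-task check.
         */
        static int dl_admit_sketch(u64 new_runtime, u64 new_period,
                                   u64 total_bw, u64 max_bw)
        {
                /* bandwidth as a fixed-point fraction of the CPU */
                u64 new_bw = div64_u64(new_runtime << 20, new_period);

                if (total_bw + new_bw > max_bw)
                        return -EBUSY;  /* would overload: reject */

                return 0;
        }

RT tasks, instead, are only subject to the global rt_bw throttling.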

I argued for a slightly different solution in what follows, what do you
think?

Thanks,

- Juri

From e44fe2eef34433a7799cfc153f467f7c62813596 Mon Sep 17 00:00:00 2001
From: Juri Lelli <[email protected]>
Date: Fri, 21 Feb 2014 11:37:15 +0100
Subject: [PATCH] sched/deadline: Prevent rt_time growth to infinity

Kirill Tkhai noted:
Since deadline tasks share rt bandwidth, we must care about
bandwidth timer set. Otherwise rt_time may grow up to infinity
in update_curr_dl(), if there are no other available RT tasks
on top level bandwidth.

RT tasks were in fact throttled right after they got enqueued,
and never executed again (rt_time never again went below rt_runtime).

Peter then proposed to accrue DL execution on rt_time only when
the rt timer is active, and posted a patch (this patch is a slight
modification of that) to implement that behavior. While this
solves Kirill's problem, it has a drawback.

Indeed, Kirill noted again:
It looks like we may get into a situation when all CPU time is shared
between RT and DL tasks:

rt_runtime = n
rt_period = 2n

| RT working, DL sleeping  | DL working, RT sleeping      |
-----------------------------------------------------------
| (1)    duration = n      | (2)    duration = n          | (repeat)
|--------------------------|------------------------------|
| (rt_bw timer is running) | (rt_bw timer is not running) |

No time for fair tasks at all.

While this can happen during the first period, if the rq is always backlogged
RT tasks won't have the opportunity to execute anymore: rt_time reached
rt_runtime during (1); suppose after (2) RT is enqueued back, it gets
throttled since the rt timer didn't fire, and the replenishment is from now on
eaten up by DL tasks that accrue their execution on rt_time (while the rt
timer is active - we have an RT task waiting for replenishment). FAIR tasks
are not touched after this first period. Ok, this is not ideal, and the
situation is even worse!

What's described above (the nice case) practically never happens in reality,
where your rt timer is not aligned to task periods, tasks are in general not
periodic, etc. Long story short, you always risk overloading your system.

This patch is based on Peter's idea, but exploits an additional fact:
if you don't have RT tasks enqueued, it makes little sense to continue
incrementing rt_time once you reached the upper limit (DL tasks have their
own mechanism for throttling).

This cures both problems:

- no matter how many DL instances ran in the past, you'll have an rt_time
only slightly above rt_runtime when an RT task is enqueued, and from that
point on (after the first replenishment), the task will execute normally;

- you can still eat up all the bandwidth during the first period, but not
anymore after that; remember that DL execution will increment rt_time only
till the upper limit is reached.

The situation is still not perfect! But we have a simple solution for now,
one that limits how much you can jeopardize your system, as we keep working
towards the right answer: RT groups scheduled using deadline servers.

Signed-off-by: Juri Lelli <[email protected]>
---
kernel/sched/deadline.c | 8 ++++++--
kernel/sched/rt.c | 8 ++++++++
2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 15cbc17..f59d774 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -564,6 +564,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
return 1;
}

+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
@@ -627,11 +629,13 @@ static void update_curr_dl(struct rq *rq)
struct rt_rq *rt_rq = &rq->rt;

raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_time += delta_exec;
/*
* We'll let actual RT tasks worry about the overflow here, we
- * have our own CBS to keep us inline -- see above.
+ * have our own CBS to keep us inline; only account when RT
+ * bandwidth is relevant.
*/
+ if (sched_rt_bandwidth_account(rt_rq))
+ rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7dba25a..7f372e1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)

#endif /* CONFIG_RT_GROUP_SCHED */

+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ return (hrtimer_active(&rt_b->rt_period_timer) ||
+ rt_rq->rt_time < rt_b->rt_runtime);
+}
+
#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
--
1.7.9.5

2014-02-25 15:00:10

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [RFC] sched/deadline: Prevent rt_time growth to infinity



25.02.2014, 18:14, "Juri Lelli" <[email protected]>:
> On Sat, 22 Feb 2014 04:56:59 +0400
> Kirill Tkhai <[email protected]> wrote:
>
>> [ snip ]
>
> Ok, this situation is pathological. DL bandwidth is guaranteed at
> admission control, while RT isn't. In this case RT tasks are doomed by
> construction. Still you'd like to let FAIR tasks execute :).
>
> I argumented on a slightly different solution in what follows, what you
> think?
>
> Thanks,
>
> - Juri
>
> From e44fe2eef34433a7799cfc153f467f7c62813596 Mon Sep 17 00:00:00 2001
> From: Juri Lelli <[email protected]>
> Date: Fri, 21 Feb 2014 11:37:15 +0100
> Subject: [PATCH] sched/deadline: Prevent rt_time growth to infinity
>
> Kirill Tkhai noted:
> Since deadline tasks share rt bandwidth, we must care about
> bandwidth timer set. Otherwise rt_time may grow up to infinity
> in update_curr_dl(), if there are no other available RT tasks
> on top level bandwidth.
>
> RT tasks were in fact throttled right after they got enqueued,
> and never executed again (rt_time never again went below rt_runtime).
>
> Peter then proposed to accrue DL execution on rt_time only when
> the rt timer is active, and proposed a patch (this patch is a slight
> modification of that) to implement that behavior. While this
> solves Kirill's problem, it has a drawback.
>
> Indeed, Kirill noted again:
> It looks like we may get into a situation where all CPU time is shared
> between RT and DL tasks:
>
> rt_runtime = n
> rt_period  = 2n
>
> | RT working, DL sleeping  | DL working, RT sleeping      |
> -----------------------------------------------------------
> | (1)     duration = n     | (2)     duration = n         | (repeat)
> |--------------------------|------------------------------|
> | (rt_bw timer is running) | (rt_bw timer is not running) |
>
> No time for fair tasks at all.
>
> While this can happen during the first period, if the rq is always
> backlogged RT tasks won't get the opportunity to execute anymore:
> rt_time reached rt_runtime during (1); suppose after (2) RT is enqueued
> back, it gets throttled since the rt timer didn't fire, and replenishment
> is from now on eaten up by DL tasks that accrue their execution on
> rt_time (while the rt timer is active - we have an RT task waiting for
> replenishment). FAIR tasks are not touched after this first period.
> Ok, this is not ideal, and the situation is even worse!
>
> What is described above (the nice case) practically never happens in
> reality, where your rt timer is not aligned to task periods, tasks are
> in general not periodic, etc. Long story short, you always risk
> overloading your system.
>
> This patch is based on Peter's idea, but exploits an additional fact:
> if you don't have RT tasks enqueued, it makes little sense to continue
> incrementing rt_time once you reached the upper limit (DL tasks have their
> own mechanism for throttling).
>
> This cures both problems:
>
>  - no matter how many DL instances ran in the past, you'll have an rt_time
>    slightly above rt_runtime when an RT task is enqueued, and from that
>    point on (after the first replenishment) the task will execute normally;
>
>  - you can still eat up all the bandwidth during the first period, but not
>    anymore after that; remember that DL execution will increment rt_time
>    till the upper limit is reached.
>
> The situation is still not perfect! But we have a simple solution for now
> that limits how much you can jeopardize your system, as we keep working
> towards the right answer: RT groups scheduled using deadline servers.

Excellent, Juri! This is almost perfect.

Thanks,
Kirill

> Signed-off-by: Juri Lelli <[email protected]>
> ---
>  kernel/sched/deadline.c |    8 ++++++--
>  kernel/sched/rt.c       |    8 ++++++++
>  2 files changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 15cbc17..f59d774 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -564,6 +564,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
> return 1;
> }
>
> +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
> +
> /*
> * Update the current task's runtime statistics (provided it is still
> * a -deadline task and has not been removed from the dl_rq).
> @@ -627,11 +629,13 @@ static void update_curr_dl(struct rq *rq)
> struct rt_rq *rt_rq = &rq->rt;
>
> raw_spin_lock(&rt_rq->rt_runtime_lock);
> - rt_rq->rt_time += delta_exec;
> /*
> * We'll let actual RT tasks worry about the overflow here, we
> - * have our own CBS to keep us inline -- see above.
> + * have our own CBS to keep us inline; only account when RT
> + * bandwidth is relevant.
> */
> + if (sched_rt_bandwidth_account(rt_rq))
> + rt_rq->rt_time += delta_exec;
> raw_spin_unlock(&rt_rq->rt_runtime_lock);
> }
> }
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 7dba25a..7f372e1 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
>
> #endif /* CONFIG_RT_GROUP_SCHED */
>
> +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
> +{
> + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
> +
> + return (hrtimer_active(&rt_b->rt_period_timer) ||
> + rt_rq->rt_time < rt_b->rt_runtime);
> +}
> +
> #ifdef CONFIG_SMP
> /*
> * We ran out of runtime, see if we can borrow some from our neighbours.
> --
> 1.7.9.5

Subject: [tip:sched/urgent] sched/deadline: Prevent rt_time growth to infinity

Commit-ID: faa5993736d9b44b508cab4f1f3a77d66641c6f4
Gitweb: http://git.kernel.org/tip/faa5993736d9b44b508cab4f1f3a77d66641c6f4
Author: Juri Lelli <[email protected]>
AuthorDate: Fri, 21 Feb 2014 11:37:15 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 27 Feb 2014 12:29:41 +0100

sched/deadline: Prevent rt_time growth to infinity

Kirill Tkhai noted:

Since deadline tasks share rt bandwidth, we must care about
bandwidth timer set. Otherwise rt_time may grow up to infinity
in update_curr_dl(), if there are no other available RT tasks
on top level bandwidth.

RT tasks were in fact throttled right after they got enqueued,
and never executed again (rt_time never again went below rt_runtime).

Peter then proposed to accrue DL execution on rt_time only when
the rt timer is active, and proposed a patch (this patch is a slight
modification of that) to implement that behavior. While this
solves Kirill's problem, it has a drawback.

Indeed, Kirill noted again:

It looks like we may get into a situation where all CPU time is shared
between RT and DL tasks:

rt_runtime = n
rt_period  = 2n

| RT working, DL sleeping  | DL working, RT sleeping      |
-----------------------------------------------------------
| (1)     duration = n     | (2)     duration = n         | (repeat)
|--------------------------|------------------------------|
| (rt_bw timer is running) | (rt_bw timer is not running) |

No time for fair tasks at all.

While this can happen during the first period, if the rq is always
backlogged RT tasks won't get the opportunity to execute anymore:
rt_time reached rt_runtime during (1); suppose after (2) RT is enqueued
back, it gets throttled since the rt timer didn't fire, and replenishment
is from now on eaten up by DL tasks that accrue their execution on
rt_time (while the rt timer is active - we have an RT task waiting for
replenishment). FAIR tasks are not touched after this first period.
Ok, this is not ideal, and the situation is even worse!

What is described above (the nice case) practically never happens in
reality, where your rt timer is not aligned to task periods, tasks are
in general not periodic, etc. Long story short, you always risk
overloading your system.

This patch is based on Peter's idea, but exploits an additional fact:
if you don't have RT tasks enqueued, it makes little sense to continue
incrementing rt_time once you reached the upper limit (DL tasks have their
own mechanism for throttling).

This cures both problems:

 - no matter how many DL instances ran in the past, you'll have an rt_time
   slightly above rt_runtime when an RT task is enqueued, and from that
   point on (after the first replenishment) the task will execute normally;

 - you can still eat up all the bandwidth during the first period, but not
   anymore after that; remember that DL execution will increment rt_time
   till the upper limit is reached.

The situation is still not perfect! But we have a simple solution for now
that limits how much you can jeopardize your system, as we keep working
towards the right answer: RT groups scheduled using deadline servers.

Reported-by: Kirill Tkhai <[email protected]>
Signed-off-by: Juri Lelli <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Steven Rostedt <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/deadline.c | 8 ++++++--
kernel/sched/rt.c | 8 ++++++++
2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aecf930..6e79b3f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -562,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
return 1;
}

+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
@@ -625,11 +627,13 @@ static void update_curr_dl(struct rq *rq)
struct rt_rq *rt_rq = &rq->rt;

raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_time += delta_exec;
/*
* We'll let actual RT tasks worry about the overflow here, we
- * have our own CBS to keep us inline -- see above.
+ * have our own CBS to keep us inline; only account when RT
+ * bandwidth is relevant.
*/
+ if (sched_rt_bandwidth_account(rt_rq))
+ rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b7..1999021 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)

#endif /* CONFIG_RT_GROUP_SCHED */

+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ return (hrtimer_active(&rt_b->rt_period_timer) ||
+ rt_rq->rt_time < rt_b->rt_runtime);
+}
+
#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
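
As a side note, here is a minimal user-space sketch of the rule that
sched_rt_bandwidth_account() encodes (illustrative only, not kernel code;
the types, names and numbers below are made up): DL execution keeps
feeding rt_time while the rt period timer is active or rt_time is still
below rt_runtime, and stops accruing once neither condition holds, so
rt_time can no longer grow without bound.

/*
 * Stand-alone illustration of the accounting rule above.
 */
#include <stdbool.h>
#include <stdio.h>

struct rt_bw {
	bool timer_active;		/* stands in for hrtimer_active() */
	unsigned long long rt_runtime;	/* allowed rt time per period */
	unsigned long long rt_time;	/* rt time consumed this period */
};

static bool rt_bandwidth_account(const struct rt_bw *b)
{
	return b->timer_active || b->rt_time < b->rt_runtime;
}

static void dl_tick(struct rt_bw *b, unsigned long long delta_exec)
{
	if (rt_bandwidth_account(b))
		b->rt_time += delta_exec;
	/* else: no RT task waiting and limit reached, stop accruing */
}

int main(void)
{
	struct rt_bw b = { .timer_active = false, .rt_runtime = 10, .rt_time = 0 };
	int i;

	for (i = 0; i < 100; i++)	/* 100 DL ticks, no RT task queued */
		dl_tick(&b, 1);

	/* rt_time saturates at rt_runtime instead of growing to 100 */
	printf("rt_time = %llu (rt_runtime = %llu)\n", b.rt_time, b.rt_runtime);
	return 0;
}

With the pre-patch behaviour (an unconditional rt_time += delta_exec) the
same loop would end with rt_time = 100, i.e. the unbounded growth this
thread is about.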