Subject: [PATCH v4 6/7] sched/deadline: Deferrable dl server

Among the motivations for the DL servers is the real-time throttling
mechanism. This mechanism works by throttling the rt_rq after it has
been running for a long period without leaving space for fair tasks.

The base dl server avoids this problem by boosting fair tasks instead
of throttling the rt_rq. The point is that it boosts without waiting
for potential starvation, causing some non-intuitive cases.

For example, an IRQ dispatches two tasks on an idle system, a fair
and an RT. The DL server will be activated, running the fair task
before the RT one. This problem can be avoided by deferring the
dl server activation.

When the deferring option is passed, the dl_server dispatches a
throttled SCHED_DEADLINE reservation, with the replenishment
timer set for (period - runtime) ns from the start time. The
fair rq is thus boosted at its 0-laxity time with respect
to the rt_rq.

The fair server will be scheduled under EDF, with a new
period at the replenishment time, thus not breaking dl tasks.

Signed-off-by: Daniel Bristot de Oliveira <[email protected]>
---
include/linux/sched.h | 7 +++++
kernel/sched/deadline.c | 61 ++++++++++++++++++++++++++++++++++++++---
kernel/sched/fair.c | 10 ++++---
kernel/sched/rt.c | 6 ++++
kernel/sched/sched.h | 12 +++++++-
5 files changed, 87 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 40fbf3f034e0..38d0b3de03b2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -609,6 +609,12 @@ struct sched_rt_entity {
typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);

+enum dl_server_state {
+ DL_SERVER_STOPPED = 0,
+ DL_SERVER_DEFER,
+ DL_SERVER_RUNNING
+};
+
struct sched_dl_entity {
struct rb_node rb_node;

@@ -685,6 +691,7 @@ struct sched_dl_entity {
struct rq *rq;
dl_server_has_tasks_f server_has_tasks;
dl_server_pick_f server_pick;
+ enum dl_server_state server_state;

#ifdef CONFIG_RT_MUTEXES
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7844cfb73029..7f1c52bfe78f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -422,7 +422,7 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
if (dl_entity_is_special(dl_se))
return;

- WARN_ON(dl_se->dl_non_contending);
+ WARN_ON_ONCE(dl_se->dl_non_contending);

zerolag_time = dl_se->deadline -
div64_long((dl_se->runtime * dl_se->dl_period),
@@ -1155,6 +1155,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct rq_flags rf;

rq_lock(rq, &rf);
+
if (dl_se->dl_throttled) {
sched_clock_tick();
update_rq_clock(rq);
@@ -1165,9 +1166,12 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
__push_dl_task(rq, &rf);
} else {
replenish_dl_entity(dl_se);
+ task_non_contending(dl_se);
}

}
+
+ dl_se->server_state = DL_SERVER_RUNNING;
rq_unlock(rq, &rf);

return HRTIMER_NORESTART;
@@ -1441,18 +1445,65 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
}

-void dl_server_start(struct sched_dl_entity *dl_se)
+void dl_server_start(struct sched_dl_entity *dl_se, int defer)
{
+ if (dl_se->server_state != DL_SERVER_STOPPED) {
+ WARN_ON_ONCE(!(on_dl_rq(dl_se) || dl_se->dl_throttled));
+ return;
+ }
+
+ if (defer) {
+ /*
+ * Postpone the replenishment to the (next period - the execution time)
+ *
+ * With this in place, we have two cases:
+ *
+ * In the absence of DL tasks:
+ * The server will start at the replenishment time, getting
+ * its runtime before now + period. This is the expected
+ * throttling behavior.
+ *
+ * In the presence of DL tasks:
+ * The server will be replenished, and then it will be
+ * scheduled according to EDF, not breaking SCHED_DEADLINE.
+ *
+ * In the first cycle the server will be postponed by at most
+ * period + period - runtime. But then the
+ * server will receive its runtime/period.
+ *
+ * The server will, however, run on top of any RT task, which
+ * is the expected throttling behavior.
+ */
+ dl_se->deadline = rq_clock(dl_se->rq) + dl_se->dl_period - dl_se->dl_runtime;
+ /* Zero the runtime */
+ dl_se->runtime = 0;
+ /* throttle the server */
+ dl_se->dl_throttled = 1;
+
+ dl_se->server_state = DL_SERVER_DEFER;
+ start_dl_timer(dl_se);
+ return;
+ }
+
if (!dl_server(dl_se)) {
dl_se->dl_server = 1;
setup_new_dl_entity(dl_se);
}
+
+ dl_se->server_state = DL_SERVER_RUNNING;
enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
}

void dl_server_stop(struct sched_dl_entity *dl_se)
{
+ if (dl_se->server_state == DL_SERVER_STOPPED)
+ return;
+
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+
+ dl_se->dl_throttled = 0;
+ dl_se->server_state = DL_SERVER_STOPPED;
}

void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
@@ -1462,6 +1513,8 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_se->rq = rq;
dl_se->server_has_tasks = has_tasks;
dl_se->server_pick = pick;
+ dl_se->server_state = DL_SERVER_STOPPED;
+ dl_se->dl_server = 1;
}

/*
@@ -1817,8 +1870,9 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
* (the task moves from "active contending" to "active non contending"
* or "inactive")
*/
- if (flags & DEQUEUE_SLEEP)
+ if (flags & DEQUEUE_SLEEP && !dl_server(dl_se))
task_non_contending(dl_se);
+
}

static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1875,7 +1929,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
enqueue_pushable_dl_task(rq, p);
}

-
static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 580e6764a68b..b9d0f08dc8ca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6499,9 +6499,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);

- if (!rq->cfs.h_nr_running)
- dl_server_start(&rq->fair_server);
-
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
@@ -6568,6 +6565,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_overutilized_status(rq);

enqueue_throttle:
+ if (sched_fair_server_needed(rq))
+ dl_server_start(&rq->fair_server, rq->fair_server_defer);
+
assert_list_leaf_cfs_rq(rq);

hrtick_update(rq);
@@ -6646,7 +6646,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
rq->next_balance = jiffies;

dequeue_throttle:
- if (!rq->cfs.h_nr_running)
+ if (!sched_fair_server_needed(rq))
dl_server_stop(&rq->fair_server);

util_est_update(&rq->cfs, p, task_sleep);
@@ -8317,6 +8317,8 @@ void fair_server_init(struct rq *rq)
dl_se->dl_deadline = 1000 * NSEC_PER_MSEC;
dl_se->dl_period = 1000 * NSEC_PER_MSEC;

+ rq->fair_server_defer = 1;
+
dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick);
}

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e23cc67c9467..7595110a5a3e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1537,6 +1537,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)

if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ if (sched_fair_server_needed(rq))
+ dl_server_start(&rq->fair_server, rq->fair_server_defer);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1547,6 +1550,9 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se, flags);

dequeue_pushable_task(rq, p);
+
+ if (!sched_fair_server_needed(rq))
+ dl_server_stop(&rq->fair_server);
}

/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ac94c386741c..510c4db379be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -345,7 +345,7 @@ extern int dl_bw_check_overflow(int cpu);
* dl_server_init() -- initializes the server.
*/
extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
-extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_start(struct sched_dl_entity *dl_se, int defer);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
@@ -1027,6 +1027,7 @@ struct rq {
struct dl_rq dl;

struct sched_dl_entity fair_server;
+ int fair_server_defer;

#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
@@ -2394,6 +2395,15 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_running > 0;
}

+static inline bool sched_fair_server_needed(struct rq *rq)
+{
+ /*
+ * The fair server will activate anytime a fair task can starve
+ * because of real-time tasks.
+ */
+ return (sched_rt_runnable(rq) && sched_fair_runnable(rq));
+}
+
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq);

--
2.40.1



Subject: Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server

On 9/7/23 10:07, Peter Zijlstra wrote:
> On Wed, Sep 06, 2023 at 04:58:11PM +0200, Daniel Bristot de Oliveira wrote:
>
>>> Yeah, it's a wee hack to move it to the zero-laxity point. I was
>>> considering if it makes sense to push that down and make it available
>>> for all DL tasks, but I'm not sure..
>>
>> It might be useful in the future, like when DL dominates all other schedulers, so
>> we can have a way to schedule a deferred work, like kworkers... :-) But it might be
>> too early for that..
>
> So... that scheme I was pushing where we unconditionally decrement
> fair_server.dl_runtime from update_curr_fair(), that relies on it being
> a proper zero-laxity scheduler, and doesn't work with the proposed defer
> hack.
>
> That is, it relies on dl_runtime > 0 during throttle, and you explicitly
> set it 0.
>
> Now, I've not looked at all this code in detail in a minute, but would
> not something like the below work?
>
> AFAICT the regular dl_task_timer() callback works to make it go, because
> replenish will see positive runtime (or not, when already consumed) and
> DTRT.
>
>
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h
> +++ linux-2.6/include/linux/sched.h
> @@ -657,6 +657,7 @@ struct sched_dl_entity {
> unsigned int dl_non_contending : 1;
> unsigned int dl_overrun : 1;
> unsigned int dl_server : 1;
> + unsigned int dl_zerolax : 1;
>
> /*
> * Bandwidth enforcement timer. Each -deadline task has its
> Index: linux-2.6/kernel/sched/deadline.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched/deadline.c
> +++ linux-2.6/kernel/sched/deadline.c
> @@ -895,6 +895,16 @@ static void replenish_dl_entity(struct s
> dl_se->dl_yielded = 0;
> if (dl_se->dl_throttled)
> dl_se->dl_throttled = 0;
> +
> + /*
> + * If this is a zero-laxity task, and we're before the zero-laxity
> + * point, throttle it.
> + */
> + if (dl_se->dl_zerolax &&
> + dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
> + if (!is_dl_boosted(dl_se) && start_dl_timer(dl_se))
> + dl_se->dl_throttled = 1;
> + }
> }
>
> /*
> @@ -1078,7 +1088,12 @@ static int start_dl_timer(struct sched_d
> * that it is actually coming from rq->clock and not from
> * hrtimer's time base reading.
> */
> - act = ns_to_ktime(dl_next_period(dl_se));
> + if (dl_se->dl_zerolax && !dl_se->dl_throttled) {
> + act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
> + } else {
> + act = ns_to_ktime(dl_next_period(dl_se));
> + }
> +
> now = hrtimer_cb_get_time(timer);
> delta = ktime_to_ns(now) - rq_clock(rq);
> act = ktime_add_ns(act, delta);
> @@ -1794,6 +1809,13 @@ enqueue_dl_entity(struct sched_dl_entity
> setup_new_dl_entity(dl_se);
> }
>
> + /*
> + * If we are still throttled, eg. we got replenished but are a
> + * zero-laxity task and still got to wait, don't enqueue.
> + */
> + if (dl_se->dl_throttled)
> + return;
> +
> __enqueue_dl_entity(dl_se);
> }

Let me see if I got it:

- Always start the server, but throttled with full runtime...
- Unconditionally decrement fair_server.dl_runtime from update_curr_fair()
(check if it is not decremented twice as it runs)
- When the dl timer fires, replenish or throttle for the next period?

is that the base for it?

-- Daniel