2022-03-11 22:17:53

by chenying

[permalink] [raw]
Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset

We add a time offset to the se->vruntime when the idle sched_entity
is enqueued, so that the idle entity will always be on the right of
the non-idle in the runqueue. This can allow non-idle tasks to be
selected and run before the idle.

A use-case is using sched_idle for background tasks and non-idle for
foreground tasks. The foreground tasks are latency sensitive and do
not want to be disturbed by the background. It is well known that
idle tasks can be preempted by non-idle tasks on wakeup, but the
scheduler does not distinguish between idle and non-idle when picking
the next entity. This may cause background tasks to disturb the foreground.

Test results as below:

~$ ./loop.sh &
[1] 764
~$ chrt -i 0 ./loop.sh &
[2] 765
~$ taskset -p 04 764
~$ taskset -p 04 765

~$ top -p 764 -p 765
top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
%Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si, 
0.0 st
KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
  764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
  765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh

The non-idle process (764) can run at 100% and without being disturbed by
the idle process (765).

~$ cat /sys/fs/cgroup/cpu/background/cgroup.procs
765
~$ cat /sys/fs/cgroup/cpu/foreground/cgroup.procs
764
~$ top -p 764 -p 765
top - 13:17:19 up 9 min,  2 users,  load average: 2.00, 1.64, 0.86
Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
%Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.5 id,  0.0 wa,  0.0 hi, 0.0 si, 
0.0 st
KiB Mem : 16393492 total, 16139576 free,   112732 used,   141184 buff/cache
KiB Swap:   385836 total,   385836 free,        0 used. 16036236 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
  764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 8:23.51 loop.sh
  765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh

The non-idle group can run at 100% and without being disturbed by the
idle group.

Co-developed-by: chengming zhou <[email protected]>
Signed-off-by: chenying <[email protected]>
---
 include/linux/sched.h   |  1 +
 kernel/sched/core.c     |  6 +++++-
 kernel/sched/debug.c    |  2 ++
 kernel/sched/fair.c     | 26 ++++++++++++++++++++++----
 kernel/sched/features.h |  2 ++
 kernel/sched/sched.h    |  1 +
 6 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75ba8aa60248..20412f353cad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -545,6 +545,7 @@ struct sched_entity {
     u64                exec_start;
     u64                sum_exec_runtime;
     u64                vruntime;
+    u64                vruntime_offset;
     u64                prev_sum_exec_runtime;

     u64                nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9745613d531c..beb9d6f54c52 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4239,6 +4239,7 @@ static void __sched_fork(unsigned long
clone_flags, struct task_struct *p)
     p->se.prev_sum_exec_runtime    = 0;
     p->se.nr_migrations        = 0;
     p->se.vruntime            = 0;
+    p->se.vruntime_offset        = 0;
     INIT_LIST_HEAD(&p->se.group_node);

 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7211,8 +7212,11 @@ static void __setscheduler_params(struct
task_struct *p,

     if (dl_policy(policy))
         __setparam_dl(p, attr);
-    else if (fair_policy(policy))
+    else if (fair_policy(policy)) {
         p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+        p->se.vruntime_offset = 0;
+    } else if (idle_policy(policy))
+        p->se.vruntime_offset = sched_idle_vruntime_offset;

     /*
      * __sched_setscheduler() ensures attr->sched_priority == 0 when
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aa29211de1bf..701496626830 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -460,6 +460,7 @@ static void print_cfs_group_stats(struct seq_file
*m, int cpu, struct task_group

     PN(se->exec_start);
     PN(se->vruntime);
+    PN(se->vruntime_offset);
     PN(se->sum_exec_runtime);

     if (schedstat_enabled()) {
@@ -969,6 +970,7 @@ void proc_sched_show_task(struct task_struct *p,
struct pid_namespace *ns,

     PN(se.exec_start);
     PN(se.vruntime);
+    PN(se.vruntime_offset);
     PN(se.sum_exec_runtime);

     nr_switches = p->nvcsw + p->nivcsw;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5146163bfabb..6a2cba63b4a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -92,6 +92,8 @@ static unsigned int
normalized_sysctl_sched_wakeup_granularity    = 1000000UL;

 const_debug unsigned int sysctl_sched_migration_cost    = 500000UL;

+unsigned long long sched_idle_vruntime_offset    = 2592000000000000; /*
30 days */
+
 int sched_thermal_decay_shift;
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
@@ -535,10 +537,19 @@ static inline u64 min_vruntime(u64 min_vruntime,
u64 vruntime)
     return min_vruntime;
 }

+static inline s64  vtime_diff(struct sched_entity *a,
+                struct sched_entity *b)
+{
+    if (sched_feat(VRUNTIME_OFFSET))
+        return (s64)(a->vruntime_offset - b->vruntime_offset);
+    else
+        return 0;
+}
+
 static inline bool entity_before(struct sched_entity *a,
                 struct sched_entity *b)
 {
-    return (s64)(a->vruntime - b->vruntime) < 0;
+    return (s64)(a->vruntime - b->vruntime + vtime_diff(a, b)) < 0;
 }

 #define __node_2_se(node) \
@@ -4445,7 +4456,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct
sched_entity *curr)
         return;

     se = __pick_first_entity(cfs_rq);
-    delta = curr->vruntime - se->vruntime;
+    delta = curr->vruntime - se->vruntime + vtime_diff(curr, se);

     if (delta < 0)
         return;
@@ -7036,7 +7047,7 @@ static unsigned long wakeup_gran(struct
sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 {
-    s64 gran, vdiff = curr->vruntime - se->vruntime;
+    s64 gran, vdiff = curr->vruntime - se->vruntime + vtime_diff(curr, se);

     if (vdiff <= 0)
         return -1;
@@ -11131,7 +11142,7 @@ bool cfs_prio_less(struct task_struct *a, struct
task_struct *b, bool in_fi)
      * min_vruntime_fi, which would have been updated in prior calls
      * to se_fi_update().
      */
-    delta = (s64)(sea->vruntime - seb->vruntime) +
+    delta = (s64)(sea->vruntime - seb->vruntime + vtime_diff(sea, seb)) +
         (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);

     return delta > 0;
@@ -11190,6 +11201,9 @@ static void task_fork_fair(struct task_struct *p)
     }
     place_entity(cfs_rq, se, 1);

+    if (task_has_idle_policy(p))
+        se->vruntime_offset = sched_idle_vruntime_offset;
+
     if (sysctl_sched_child_runs_first && curr && entity_before(curr,
se)) {
         /*
          * Upon rescheduling, sched_class::put_prev_task() will place
@@ -11655,6 +11669,10 @@ int sched_group_set_idle(struct task_group *tg,
long idle)
         rq_lock_irqsave(rq, &rf);

         grp_cfs_rq->idle = idle;
+        if (idle)
+            se->vruntime_offset = sched_idle_vruntime_offset;
+        else
+            se->vruntime_offset = 0;
         if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
             goto next_cpu;

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1cf435bbcd9c..f59f507e6dba 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -100,3 +100,5 @@ SCHED_FEAT(LATENCY_WARN, false)

 SCHED_FEAT(ALT_PERIOD, true)
 SCHED_FEAT(BASE_SLICE, true)
+
+SCHED_FEAT(VRUNTIME_OFFSET, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de53be905739..1bc0c0756fd4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -95,6 +95,7 @@ extern __read_mostly int scheduler_running;

 extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;
+extern unsigned long long sched_idle_vruntime_offset;

 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
--
2.11.0


2022-03-12 13:31:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset

On Fri, Mar 11, 2022 at 03:58:47PM +0800, chenying wrote:
> We add a time offset to the se->vruntime when the idle sched_entity
> is enqueued, so that the idle entity will always be on the right of
> the non-idle in the runqueue. This can allow non-idle tasks to be
> selected and run before the idle.
>
> A use-case is that sched_idle for background tasks and non-idle
> for foreground. The foreground tasks are latency sensitive and do
> not want to be disturbed by the background. It is well known that
> the idle tasks can be preempted by the non-idle tasks when waking up,
> but will not distinguish between idle and non-idle when pick the next
> entity. This may cause background tasks to disturb the foreground.
>
> Test results as below:
>
> ~$ ./loop.sh &
> [1] 764
> ~$ chrt -i 0 ./loop.sh &
> [2] 765
> ~$ taskset -p 04 764
> ~$ taskset -p 04 765
>
> ~$ top -p 764 -p 765
> top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
> Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
> %Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si,  0.0
> st
> KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
> KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem
>
>   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
>   764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
>   765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh
>
> The non-idle process (764) can run at 100% and without being disturbed by
> the idle process (765).

Did you just do a very complicated true idle time scheduler, with all
the problems that brings?

2022-03-13 10:59:16

by chenying

[permalink] [raw]
Subject: Re: [External] Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset

在 2022/3/12 20:03, Peter Zijlstra 写道:
> On Fri, Mar 11, 2022 at 03:58:47PM +0800, chenying wrote:
>> We add a time offset to the se->vruntime when the idle sched_entity
>> is enqueued, so that the idle entity will always be on the right of
>> the non-idle in the runqueue. This can allow non-idle tasks to be
>> selected and run before the idle.
>>
>> A use-case is that sched_idle for background tasks and non-idle
>> for foreground. The foreground tasks are latency sensitive and do
>> not want to be disturbed by the background. It is well known that
>> the idle tasks can be preempted by the non-idle tasks when waking up,
>> but will not distinguish between idle and non-idle when pick the next
>> entity. This may cause background tasks to disturb the foreground.
>>
>> Test results as below:
>>
>> ~$ ./loop.sh &
>> [1] 764
>> ~$ chrt -i 0 ./loop.sh &
>> [2] 765
>> ~$ taskset -p 04 764
>> ~$ taskset -p 04 765
>>
>> ~$ top -p 764 -p 765
>> top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
>> Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
>> %Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si,  0.0
>> st
>> KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
>> KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem
>>
>>   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
>>   764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
>>   765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh
>>
>> The non-idle process (764) can run at 100% and without being disturbed by
>> the idle process (765).
>
> Did you just do a very complicated true idle time scheduler, with all
> the problems that brings?

Colocating CPU-intensive jobs with latency-sensitive services can
improve CPU utilization, but it is difficult to meet the stringent
tail-latency requirements of latency-sensitive services. We use a true
idle time scheduler for CPU-intensive jobs to minimize the impact on
latency-sensitive services.

2022-03-14 12:00:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [External] Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset

On Sun, Mar 13, 2022 at 01:37:37PM +0800, chenying wrote:
> 在 2022/3/12 20:03, Peter Zijlstra 写道:
> > On Fri, Mar 11, 2022 at 03:58:47PM +0800, chenying wrote:
> > > We add a time offset to the se->vruntime when the idle sched_entity
> > > is enqueued, so that the idle entity will always be on the right of
> > > the non-idle in the runqueue. This can allow non-idle tasks to be
> > > selected and run before the idle.
> > >
> > > A use-case is that sched_idle for background tasks and non-idle
> > > for foreground. The foreground tasks are latency sensitive and do
> > > not want to be disturbed by the background. It is well known that
> > > the idle tasks can be preempted by the non-idle tasks when waking up,
> > > but will not distinguish between idle and non-idle when pick the next
> > > entity. This may cause background tasks to disturb the foreground.
> > >
> > > Test results as below:
> > >
> > > ~$ ./loop.sh &
> > > [1] 764
> > > ~$ chrt -i 0 ./loop.sh &
> > > [2] 765
> > > ~$ taskset -p 04 764
> > > ~$ taskset -p 04 765
> > >
> > > ~$ top -p 764 -p 765
> > > top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
> > > Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
> > > %Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si,  0.0
> > > st
> > > KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
> > > KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem
> > >
> > >   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
> > >   764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
> > >   765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh
> > >
> > > The non-idle process (764) can run at 100% and without being disturbed by
> > > the idle process (765).
> >
> > Did you just do a very complicated true idle time scheduler, with all
> > the problems that brings?
>
> When colocating CPU-intensive jobs with latency-sensitive services can
> improve CPU utilization but it is difficult to meet the stringent
> tail-latency requirements of latency-sensitive services. We use a true idle
> time scheduler for CPU-intensive jobs to minimize the impact on
> latency-sensitive services.

Hard NAK on any true idle-time scheduler until you make the whole kernel
immune to lock holder starvation issues.

And as said; this is a terrible way to do a true idle-time scheduler.