2024-04-05 11:06:07

by Peter Zijlstra

Subject: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

Allow applications to directly set a suggested request/slice length using
sched_attr::sched_runtime.

The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.

Applications should strive to use their periodic runtime at a high
confidence interval (95%+) as the target slice. Using a smaller slice
will introduce undue preemptions, while using a larger value will
increase latency.
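
A minimal userspace sketch (not part of this patch) of how an application
could pass such a suggestion. It assumes a kernel with this change applied;
the struct below mirrors the sched_attr UAPI layout and the raw syscall is
used since libc may not wrap sched_setattr():

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Local mirror of the UAPI struct sched_attr layout. */
struct sched_attr_req {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;         /* slice suggestion, in nanoseconds */
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct sched_attr_req attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.sched_policy = 0;                  /* SCHED_OTHER */
        attr.sched_runtime = 500 * 1000ULL;     /* suggest a 0.5ms slice; the kernel clamps to [0.1ms, 100ms] */

        if (syscall(SYS_sched_setattr, 0, &attr, 0))
                perror("sched_setattr");
        return 0;
}

Reading the attributes back via sched_getattr() reports the effective slice
in sched_attr::sched_runtime (see the get_params() hunk below).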

For all the following examples assume a scheduling quantum of 8, and for
consistency all examples have W=4:

{A,B,C,D}(w=1,r=8):

ABCD...
+---+---+---+---

t=0, V=1.5 t=1, V=3.5
A |------< A |------<
B |------< B |------<
C |------< C |------<
D |------< D |------<
---+*------+-------+--- ---+--*----+-------+---

t=2, V=5.5 t=3, V=7.5
A |------< A |------<
B |------< B |------<
C |------< C |------<
D |------< D |------<
---+----*--+-------+--- ---+------*+-------+---

Note: 4 identical tasks in FIFO order

~~~

{A,B}(w=1,r=16) C(w=2,r=16)

AACCBBCC...
+---+---+---+---

t=0, V=1.25 t=2, V=5.25
A |--------------< A |--------------<
B |--------------< B |--------------<
C |------< C |------<
---+*------+-------+--- ---+----*--+-------+---

t=4, V=8.25 t=6, V=12.25
A |--------------< A |--------------<
B |--------------< B |--------------<
C |------< C |------<
---+-------*-------+--- ---+-------+---*---+---

Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2
task doesn't go below q.

Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length.

Note: the period of the heavy task is half the full period at:
W*(r_i/w_i) = 4*(2q/2) = 4q

~~~

{A,C,D}(w=1,r=16) B(w=1,r=8):

BAACCBDD...
+---+---+---+---

t=0, V=1.5 t=1, V=3.5
A |--------------< A |---------------<
B |------< B |------<
C |--------------< C |--------------<
D |--------------< D |--------------<
---+*------+-------+--- ---+--*----+-------+---

t=3, V=7.5 t=5, V=11.5
A |---------------< A |---------------<
B |------< B |------<
C |--------------< C |--------------<
D |--------------< D |--------------<
---+------*+-------+--- ---+-------+--*----+---

t=6, V=13.5
A |---------------<
B |------<
C |--------------<
D |--------------<
---+-------+----*--+---

Note: 1 short task -- again double r so that the deadline of the short task
won't be below q. Made B short because it's not the leftmost task, but is
eligible with the 0,1,2,3 spread.

Note: like with the heavy task, the period of the short task observes:
W*(r_i/w_i) = 4*(1q/1) = 4q

~~~

A(w=1,r=16) B(w=1,r=8) C(w=2,r=16)

BCCAABCC...
+---+---+---+---

t=0, V=1.25 t=1, V=3.25
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+*------+-------+--- ---+--*----+-------+---

t=3, V=7.25 t=5, V=11.25
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+------*+-------+--- ---+-------+--*----+---

t=6, V=13.25
A |--------------<
B |------<
C |------<
---+-------+----*--+---

Note: 1 heavy and 1 short task -- combine them all.

Note: both the short and heavy task end up with a period of 4q

~~~

A(w=1,r=16) B(w=2,r=16) C(w=1,r=8)

BBCAABBC...
+---+---+---+---

t=0, V=1 t=2, V=5
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+*------+-------+--- ---+----*--+-------+---

t=3, V=7 t=5, V=11
A |--------------< A |--------------<
B |------< B |------<
C |------< C |------<
---+------*+-------+--- ---+-------+--*----+---

t=7, V=15
A |--------------<
B |------<
C |------<
---+-------+------*+---

Note: as before but permuted

~~~

From all this it can be deduced that, for the steady state:

- the total period (P) of a schedule is: W*max(r_i/w_i)
- the average period of a task is: W*(r_i/w_i)
- each task obtains the fair share: w_i/W of each full period P
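
For instance, checking the mixed example above (A(w=1,r=16) B(w=1,r=8)
C(w=2,r=16), q=8) against these formulas: P = W*max(r_i/w_i) = 4*2q = 8q,
the period of B is W*(r_B/w_B) = 4*1q = 4q, and the per-period shares are
A: 1/4, B: 1/4, C: 2/4 -- matching the BCCAABCC pattern, where C runs 4 of
the 8 quanta and A and B run 2 each.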

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched.h | 5 ++++-
kernel/sched/core.c | 33 ++++++++++++++++++++++++++-------
kernel/sched/debug.c | 3 ++-
kernel/sched/fair.c | 6 ++++--
4 files changed, 36 insertions(+), 11 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -542,7 +542,10 @@ struct sched_entity {

struct list_head group_node;
unsigned int on_rq;
- unsigned int sched_delayed;
+
+ unsigned char sched_delayed;
+ unsigned char custom_slice;
+ /* 2 byte hole */

u64 exec_start;
u64 sum_exec_runtime;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4550,7 +4550,6 @@ static void __sched_fork(unsigned long c
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
- p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -4801,6 +4800,8 @@ int sched_fork(unsigned long clone_flags

p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;

/*
* We don't need the reset flag anymore after the fork. It has
@@ -7627,10 +7628,20 @@ static void __setscheduler_params(struct

p->policy = policy;

- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.custom_slice = 1;
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }

/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
@@ -7812,7 +7823,9 @@ static int __sched_setscheduler(struct t
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
@@ -7958,6 +7971,9 @@ static int _sched_setscheduler(struct ta
.sched_nice = PRIO_TO_NICE(p->static_prio),
};

+ if (p->se.custom_slice)
+ attr.sched_runtime = p->se.slice;
+
/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
@@ -8124,12 +8140,14 @@ static int sched_copy_attr(struct sched_

static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}

/**
@@ -10084,6 +10102,7 @@ void __init sched_init(void)
}

set_load_weight(&init_task, false);
+ init_task.se.slice = sysctl_sched_base_slice;

/*
* The boot idle thread does lazy MMU switching as well:
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -579,11 +579,12 @@ print_task(struct seq_file *m, struct rq
else
SEQ_printf(m, " %c", task_state_to_char(p));

- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
SPLIT_NS(p->se.deadline),
+ p->se.custom_slice ? 'S' : ' ',
SPLIT_NS(p->se.slice),
SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -984,7 +984,8 @@ static void update_deadline(struct cfs_r
* nice) while the request time r_i is determined by
* sysctl_sched_base_slice.
*/
- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;

/*
* EEVDF: vd_i = ve_i + r_i / w_i
@@ -5169,7 +5170,8 @@ place_entity(struct cfs_rq *cfs_rq, stru
u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;

- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);

/*




2024-04-06 08:17:27

by Hillf Danton

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

On Fri, 05 Apr 2024 12:28:04 +0200 Peter Zijlstra <[email protected]>
> Allow applications to directly set a suggested request/slice length using
> sched_attr::sched_runtime.
>
> The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
> which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
>
Given HZ=100 for example, what is preventing applications with a suggested
slice=0.5ms from running 5ms per tick? If slice is 90ms otoh, is the tick able
to kick the curr that has been on cpu for 10ms off cpu, given
cfs_rq->nr_running > 1?

> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -984,7 +984,8 @@ static void update_deadline(struct cfs_r
> * nice) while the request time r_i is determined by
> * sysctl_sched_base_slice.
> */
> - se->slice = sysctl_sched_base_slice;
> + if (!se->custom_slice)
> + se->slice = sysctl_sched_base_slice;
>
> /*
> * EEVDF: vd_i = ve_i + r_i / w_i

2024-05-07 05:35:48

by Mike Galbraith

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

On Fri, 2024-04-05 at 12:28 +0200, Peter Zijlstra wrote:
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7812,7 +7823,9 @@ static int __sched_setscheduler(struct t
>          * but store a possible modification of reset_on_fork.
>          */
>         if (unlikely(policy == p->policy)) {
> -               if (fair_policy(policy) && attr->sched_nice != task_nice(p))
> +               if (fair_policy(policy) &&
> +                   (attr->sched_nice != task_nice(p) ||
> +                    (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
>                         goto change;

Can we make that only look at attr->sched_runtime != p->se.slice?
Doing so lets you clear a custom slice by.. clearing it.. rather than
making tools rummage about for the proper value to overwrite.

-Mike

2024-05-07 15:16:40

by Chen Yu

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

On 2024-04-05 at 12:28:04 +0200, Peter Zijlstra wrote:
> Allow applications to directly set a suggested request/slice length using
> sched_attr::sched_runtime.
>
> The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
> which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
>
> Applications should strive to use their periodic runtime at a high
> confidence interval (95%+) as the target slice. Using a smaller slice
> will introduce undue preemptions, while using a larger value will
> increase latency.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>

Is it possible to leverage this task slice to do better task wakeup placement?
The idea is that, the smaller the slice the wakee has, the less idle CPU it
should scan. This can reduce wake latency and inhibit costly task migration,
especially on large systems.

We did some experiments and got some performance improvements:


From 9cb806476586d7048fcbd0f66d0101f0dbb8fd2b Mon Sep 17 00:00:00 2001
From: Chen Yu <[email protected]>
Date: Tue, 7 May 2024 22:36:29 +0800
Subject: [RFC PATCH] sched/eevdf: Use customized slice to reduce wakeup latency
and inhibit task migration

Problem 1:
The overhead of task migration is high on a many-core system. The overhead
brings a performance penalty due to broken cache locality/higher cache-to-cache
latency.

Problem 2:
During wakeup, the time spent searching for an idle CPU is costly on a
many-core system. Besides, accessing other CPUs' rq statistics brings
cache contention:

available_idle_cpu(cpu) -> idle_cpu(cpu) -> {rq->curr, rq->nr_running}

Although SIS_UTIL throttles the scan depth based on system utilization,
there is a requirement to further limit the scan depth for specific workloads,
especially for short-duration wakees.

Now we have the interface to customize the request/slice. The smaller the
slice is, the earlier the task can be picked up, and the lower the wakeup
latency the task expects. Leverage the wakee's slice to further throttle the
idle CPU scan depth - the shorter the slice, the fewer CPUs to scan.
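
For example, with a hypothetical nr_idle_scan of 30, the 0.1 msec custom
slice used in the test below, against the 3 msec base slice, reduces the
scan depth to 30 * 0.1 / 3 = 1 CPU, while a task without a custom slice
keeps the full depth.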

Tested on a 240-CPU, 2-socket system. With SNC (sub-NUMA-cluster) enabled,
each LLC domain has 60 CPUs. There is a noticeable improvement in netperf.
(With SNC disabled, more improvement should be seen because C2C latency is higher.)

The global slice is 3 msec (sysctl_sched_base_slice) by default on my Ubuntu
22.04, and the customized slice is set to 0.1 msec for both netperf and netserver:

for i in $(seq 1 $job); do
netperf_slice -e 100000 -4 -H 127.0.01 -t TCP_RR -c -C -l 100 &
done

case load baseline(std%) compare%( std%)
TCP_RR 60-threads 1.00 ( 1.60) +0.35 ( 1.73)
TCP_RR 120-threads 1.00 ( 1.34) -0.96 ( 1.24)
TCP_RR 180-threads 1.00 ( 1.59) +92.20 ( 4.24)
TCP_RR 240-threads 1.00 ( 9.71) +43.11 ( 2.97)

Suggested-by: Tim Chen <[email protected]>
Signed-off-by: Chen Yu <[email protected]>
---
kernel/sched/fair.c | 23 ++++++++++++++++++++---
kernel/sched/features.h | 1 +
2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index edc23f6588a3..f269ae7d6e24 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7368,6 +7368,24 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd

#endif /* CONFIG_SCHED_SMT */

+/*
+ * Scale the scan number of idle CPUs according to customized
+ * wakee's slice. The smaller the slice is, the earlier the task
+ * wants be picked up, thus the lower wakeup latency the task expects.
+ * The baseline is the global sysctl_sched_base_slice. Task slice
+ * smaller than the global one would shrink the scan number.
+ */
+static int adjust_idle_scan(struct task_struct *p, int nr)
+{
+ if (!sched_feat(SIS_FAST))
+ return nr;
+
+ if (!p->se.custom_slice || p->se.slice >= sysctl_sched_base_slice)
+ return nr;
+
+ return div_u64(nr * p->se.slice, sysctl_sched_base_slice);
+}
+
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -7384,10 +7402,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (sched_feat(SIS_UTIL)) {
sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
if (sd_share) {
- /* because !--nr is the condition to stop scan */
- nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+ nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan));
/* overloaded LLC is unlikely to have idle cpu/core */
- if (nr == 1)
+ if (nr <= 0)
return -1;
}
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 143f55df890b..176324236018 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -50,6 +50,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
SCHED_FEAT(SIS_UTIL, true)
+SCHED_FEAT(SIS_FAST, true)

/*
* Issue a WARN when we do multiple update_rq_clock() calls
--
2.25.1


2024-05-08 13:54:12

by Mike Galbraith

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

On Tue, 2024-05-07 at 23:15 +0800, Chen Yu wrote:
> On 2024-04-05 at 12:28:04 +0200, Peter Zijlstra wrote:
> > Allow applications to directly set a suggested request/slice length using
> > sched_attr::sched_runtime.
> >
> > The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
> > which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
> >
> > Applications should strive to use their periodic runtime at a high
> > confidence interval (95%+) as the target slice. Using a smaller slice
> > will introduce undue preemptions, while using a larger value will
> > increase latency.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> >
>
> Is it possible to leverage this task slice to do better task wakeup placement?

Slice being unrelated to placement makes its use in a placement-related
knob look wrong. Even the smallest possible slice is orders of
magnitude larger than the cycle time of TCP_RR, making slice nearly
irrelevant to the issue being demonstrated via TCP_RR. Even for that
huge socket box, as cycle time increases toward that smallest possible
slice, it won't take long for the cost of needless waiting to bury the
placement decision costs.

> The idea is that, the smaller the slice the wakee has, the less idle CPU it
> should scan. This can reduce wake latency and inhibit costly task migration,
> especially on large systems.

Sure, this is an age old issue that's scaled up to size extra ugly in
that huge socket box. Any solution needs to scale as well methinks, a
simple fixed yardstick won't work, as the costs being mitigated vary
wildly with platform size/shape.

-Mike

2024-05-09 03:49:08

by Chen Yu

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

On 2024-05-08 at 15:52:32 +0200, Mike Galbraith wrote:
> On Tue, 2024-05-07 at 23:15 +0800, Chen Yu wrote:
> > On 2024-04-05 at 12:28:04 +0200, Peter Zijlstra wrote:
> > > Allow applications to directly set a suggested request/slice length using
> > > sched_attr::sched_runtime.
> > >
> > > The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
> > > which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
> > >
> > > Applications should strive to use their periodic runtime at a high
> > > confidence interval (95%+) as the target slice. Using a smaller slice
> > > will introduce undue preemptions, while using a larger value will
> > > increase latency.
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > >
> >
> > Is it possible to leverage this task slice to do better task wakeup placement?
>
> Slice being unrelated to placement makes its use in a placement related
> knob look wrong. Even the smallest possible slice is orders of
> magnitude larger than the cycle time of TCP_RR, making slice nearly
> irrelevant to the issue being demonstrating via TCP_RR.

Yes, I agree that there is no direct relationship between the slice and
the task placement. The idea is to use the slice as an input hint from the user
to tell the kernel how much the user cares about latency (but not to expect the
task to last for that long).

> Even for that huge socket box, it won't take long as cycle time increases toward that
> smallest possible slice for the cost of needless wait to bury placement
> decision costs.
>

I see. The wakeup latency is composed of:
1. The time to make the placement decision.
2. The time spent waiting in the runqueue to be picked.

Even if (1) has been reduced, (2) might take more time if the runqueue
is busy. We can mitigate this by checking if there is <= 1 short-duration task
on that target CPU.

> > The idea is that, the smaller the slice the wakee has, the less idle CPU it
> > should scan. This can reduce wake latency and inhibit costly task migration,
> > especially on large systems.
>
> Sure, this is an age old issue that's scaled up to size extra ugly in
> that huge socket box. Any solution needs to scale as well methinks, a
> simple fixed yardstick won't work, as the costs being mitigated vary
> wildly with platform size/shape.
>

Understood, this slice-based placement was chosen for experimental purposes,
to seek direction. And I agree we should take the platform size (such
as the CPU count) into consideration.

thanks,
Chenyu

2024-05-09 05:01:18

by Mike Galbraith

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

On Thu, 2024-05-09 at 11:48 +0800, Chen Yu wrote:

> And I agree we should take the platform size(such
> as CPU number) into consideration.

I think you'll need more than that, because size agnostic, when it
comes to latency, an idle CPU is damn hard to beat, making migration
restrictions (traditional and obvious target) tend to leave highly
annoying piles of collateral damage in their wake.

(spoken from BTDT perspective, have t-shirt, got butt kicked;)

-Mike

2024-05-13 04:07:29

by K Prateek Nayak

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

Hello Chenyu,

On 5/7/2024 8:45 PM, Chen Yu wrote:
> On 2024-04-05 at 12:28:04 +0200, Peter Zijlstra wrote:
>> Allow applications to directly set a suggested request/slice length using
>> sched_attr::sched_runtime.
>>
>> The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
>> which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
>>
>> Applications should strive to use their periodic runtime at a high
>> confidence interval (95%+) as the target slice. Using a smaller slice
>> will introduce undue preemptions, while using a larger value will
>> increase latency.
>>
>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>>
>
> Is it possible to leverage this task slice to do better task wakeup placement?
> The idea is that, the smaller the slice the wakee has, the less idle CPU it
> should scan. This can reduce wake latency and inhibit costly task migration,
> especially on large systems.
>
> We did some experiments and got some performance improvements:
>
>
> From 9cb806476586d7048fcbd0f66d0101f0dbb8fd2b Mon Sep 17 00:00:00 2001
> From: Chen Yu <[email protected]>
> Date: Tue, 7 May 2024 22:36:29 +0800
> Subject: [RFC PATCH] sched/eevdf: Use customized slice to reduce wakeup latency
> and inhibit task migration
>
> Problem 1:
> The overhead of task migration is high on many-core system. The overhead
> brings performance penalty due to broken cache locality/higher cache-to-cache
> latency.
>
> Problem 2:
> During wakeup, the time spent on searching for an idle CPU is costly on
> many-core system. Besides, access to other CPU's rq statistics brings
> cace contention:
>
> available_idle_cpu(cpu) -> idle_cpu(cpu) -> {rq->curr, rq->nr_running}
>
> Although SIS_UTIL throttles the scan depth based on system utilization,
> there is requirement to further limit the scan depth for specific workload,
> especially for short duration wakee.
>
> Now we have the interface to customize the request/slice. The smaller the
> slice is, the earlier the task can be picked up, and the lower wakeup latency
> the task expects. Leverage the wakee's slice to further throttle the
> idle CPU scan depth - the shorter slice, the less CPUs to scan.
>
> Test on 240 CPUs, 2 sockets system. With SNC(sub-numa-cluster) enabled,
> each LLC domain has 60 CPUs. There is noticeable improvement of netperf.
> (With SNC disabled, more improvements should be seen because C2C is higher)
>
> The global slice is 3 msec(sysctl_sched_base_slice) by default on my ubuntu
> 22.04, and the customized slice is set to 0.1 msec for both netperf and netserver:
>
> for i in $(seq 1 $job); do
> netperf_slice -e 100000 -4 -H 127.0.01 -t TCP_RR -c -C -l 100 &
> done
>
> case load baseline(std%) compare%( std%)
> TCP_RR 60-threads 1.00 ( 1.60) +0.35 ( 1.73)
> TCP_RR 120-threads 1.00 ( 1.34) -0.96 ( 1.24)
> TCP_RR 180-threads 1.00 ( 1.59) +92.20 ( 4.24)
> TCP_RR 240-threads 1.00 ( 9.71) +43.11 ( 2.97)
>
> Suggested-by: Tim Chen <[email protected]>
> Signed-off-by: Chen Yu <[email protected]>
> ---
> kernel/sched/fair.c | 23 ++++++++++++++++++++---
> kernel/sched/features.h | 1 +
> 2 files changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index edc23f6588a3..f269ae7d6e24 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7368,6 +7368,24 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
>
> #endif /* CONFIG_SCHED_SMT */
>
> +/*
> + * Scale the scan number of idle CPUs according to customized
> + * wakee's slice. The smaller the slice is, the earlier the task
> + * wants be picked up, thus the lower wakeup latency the task expects.
> + * The baseline is the global sysctl_sched_base_slice. Task slice
> + * smaller than the global one would shrink the scan number.
> + */
> +static int adjust_idle_scan(struct task_struct *p, int nr)
> +{
> + if (!sched_feat(SIS_FAST))
> + return nr;
> +
> + if (!p->se.custom_slice || p->se.slice >= sysctl_sched_base_slice)
> + return nr;
> +
> + return div_u64(nr * p->se.slice, sysctl_sched_base_slice);
> +}
> +
> /*
> * Scan the LLC domain for idle CPUs; this is dynamically regulated by
> * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
> @@ -7384,10 +7402,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> if (sched_feat(SIS_UTIL)) {
> sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
> if (sd_share) {
> - /* because !--nr is the condition to stop scan */
> - nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
> + nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan));
> /* overloaded LLC is unlikely to have idle cpu/core */
> - if (nr == 1)
> + if (nr <= 0)

I was wondering if this would preserve the current behavior with
SIS_FAST toggled off? Since the implementation below still does a
"--nr <= 0" , wouldn't it effectively visit one CPU less overall now?

Have you tried something similar to the below hunk?

/* because !--nr is the condition to stop scan */
nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan)) + 1;
if (nr == 1)
return -1;

I agree with Mike that looking at slice to limit scan-depth seems odd.
My experience with netperf is that the workload cares more about the
server-client being co-located on the closest cache domain and by
limiting scan-depth using slice, this is indirectly achieved since all
the wakeups carry the WF_SYNC flag.

P.S. have you tried using the slice in __select_idle_cpu()? Similar to
sched_idle_cpu() check, perhaps an additional sched_preempt_short_cpu()
which compares rq->curr->se.slice with the waking task's slice and
returns that cpu if SIS_SHORT can help run the workload quicker? Note:
This will not work if the SIS scan itself is the largest overhead in the
wakeup cycle and not the task placement itself. Previously during
SIS_UTIL testing, to measure the overheads of scan vs placement, we
would do a full scan but return the result that SIS_UTIL would have
returned to determine the overhead of the search itself.
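
A rough sketch of that idea (the helper name, the SIS_SHORT feature bit and
the exact condition are all hypothetical, not from any posted patch):

/*
 * Prefer @cpu if the task currently running there has a larger custom
 * slice than the waking task @p, i.e. the wakee is the "shorter" one
 * and should be picked quickly under EEVDF.
 */
static inline bool sched_preempt_short_cpu(struct task_struct *p, int cpu)
{
        struct task_struct *curr = READ_ONCE(cpu_rq(cpu)->curr);

        if (!sched_feat(SIS_SHORT))
                return false;

        if (!p->se.custom_slice || !curr->se.custom_slice)
                return false;

        /* wakee requests a strictly shorter slice than the current occupant */
        return p->se.slice < curr->se.slice;
}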

> return -1;
> }
> }
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 143f55df890b..176324236018 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -50,6 +50,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
> * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
> */
> SCHED_FEAT(SIS_UTIL, true)
> +SCHED_FEAT(SIS_FAST, true)
>
> /*
> * Issue a WARN when we do multiple update_rq_clock() calls

--
Thanks and Regards,
Prateek

2024-05-14 09:19:38

by Chen Yu

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

Hi Prateek,

On 2024-05-13 at 09:37:07 +0530, K Prateek Nayak wrote:
> Hello Chenyu,
>
> On 5/7/2024 8:45 PM, Chen Yu wrote:
> > On 2024-04-05 at 12:28:04 +0200, Peter Zijlstra wrote:
> >> Allow applications to directly set a suggested request/slice length using
> >> sched_attr::sched_runtime.
> >>
> >> The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
> >> which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.
> >>
> >> Applications should strive to use their periodic runtime at a high
> >> confidence interval (95%+) as the target slice. Using a smaller slice
> >> will introduce undue preemptions, while using a larger value will
> >> increase latency.
> >>
> >> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> >>
> >
> > Is it possible to leverage this task slice to do better task wakeup placement?
> > The idea is that, the smaller the slice the wakee has, the less idle CPU it
> > should scan. This can reduce wake latency and inhibit costly task migration,
> > especially on large systems.
> >
> > We did some experiments and got some performance improvements:
> >
> >
> > From 9cb806476586d7048fcbd0f66d0101f0dbb8fd2b Mon Sep 17 00:00:00 2001
> > From: Chen Yu <[email protected]>
> > Date: Tue, 7 May 2024 22:36:29 +0800
> > Subject: [RFC PATCH] sched/eevdf: Use customized slice to reduce wakeup latency
> > and inhibit task migration
> >
> > Problem 1:
> > The overhead of task migration is high on many-core system. The overhead
> > brings performance penalty due to broken cache locality/higher cache-to-cache
> > latency.
> >
> > Problem 2:
> > During wakeup, the time spent on searching for an idle CPU is costly on
> > many-core system. Besides, access to other CPU's rq statistics brings
> > cace contention:
> >
> > available_idle_cpu(cpu) -> idle_cpu(cpu) -> {rq->curr, rq->nr_running}
> >
> > Although SIS_UTIL throttles the scan depth based on system utilization,
> > there is requirement to further limit the scan depth for specific workload,
> > especially for short duration wakee.
> >
> > Now we have the interface to customize the request/slice. The smaller the
> > slice is, the earlier the task can be picked up, and the lower wakeup latency
> > the task expects. Leverage the wakee's slice to further throttle the
> > idle CPU scan depth - the shorter slice, the less CPUs to scan.
> >
> > Test on 240 CPUs, 2 sockets system. With SNC(sub-numa-cluster) enabled,
> > each LLC domain has 60 CPUs. There is noticeable improvement of netperf.
> > (With SNC disabled, more improvements should be seen because C2C is higher)
> >
> > The global slice is 3 msec(sysctl_sched_base_slice) by default on my ubuntu
> > 22.04, and the customized slice is set to 0.1 msec for both netperf and netserver:
> >
> > for i in $(seq 1 $job); do
> > netperf_slice -e 100000 -4 -H 127.0.01 -t TCP_RR -c -C -l 100 &
> > done
> >
> > case load baseline(std%) compare%( std%)
> > TCP_RR 60-threads 1.00 ( 1.60) +0.35 ( 1.73)
> > TCP_RR 120-threads 1.00 ( 1.34) -0.96 ( 1.24)
> > TCP_RR 180-threads 1.00 ( 1.59) +92.20 ( 4.24)
> > TCP_RR 240-threads 1.00 ( 9.71) +43.11 ( 2.97)
> >
> > Suggested-by: Tim Chen <[email protected]>
> > Signed-off-by: Chen Yu <[email protected]>
> > ---
> > kernel/sched/fair.c | 23 ++++++++++++++++++++---
> > kernel/sched/features.h | 1 +
> > 2 files changed, 21 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index edc23f6588a3..f269ae7d6e24 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7368,6 +7368,24 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
> >
> > #endif /* CONFIG_SCHED_SMT */
> >
> > +/*
> > + * Scale the scan number of idle CPUs according to customized
> > + * wakee's slice. The smaller the slice is, the earlier the task
> > + * wants be picked up, thus the lower wakeup latency the task expects.
> > + * The baseline is the global sysctl_sched_base_slice. Task slice
> > + * smaller than the global one would shrink the scan number.
> > + */
> > +static int adjust_idle_scan(struct task_struct *p, int nr)
> > +{
> > + if (!sched_feat(SIS_FAST))
> > + return nr;
> > +
> > + if (!p->se.custom_slice || p->se.slice >= sysctl_sched_base_slice)
> > + return nr;
> > +
> > + return div_u64(nr * p->se.slice, sysctl_sched_base_slice);
> > +}
> > +
> > /*
> > * Scan the LLC domain for idle CPUs; this is dynamically regulated by
> > * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
> > @@ -7384,10 +7402,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> > if (sched_feat(SIS_UTIL)) {
> > sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
> > if (sd_share) {
> > - /* because !--nr is the condition to stop scan */
> > - nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
> > + nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan));
> > /* overloaded LLC is unlikely to have idle cpu/core */
> > - if (nr == 1)
> > + if (nr <= 0)
>
> I was wondering if this would preserve the current behavior with
> SIS_FAST toggled off? Since the implementation below still does a
> "--nr <= 0" , wouldn't it effectively visit one CPU less overall now?
>
> Have you tried something similar to the below hunk?
>
> /* because !--nr is the condition to stop scan */
> nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan)) + 1;
> if (nr == 1)
> return -1;
>

Yeah, right, to keep the scan depth consistent, the "+1" should be kept.

> I agree with Mike that looking at slice to limit scan-depth seems odd.
> My experience with netperf is that the workload cares more about the
> server-client being co-located on the closest cache domain and by
> limiting scan-depth using slice, this is indirectly achieved since all
> the wakeups carry the WF_SYNc flag.
>

Exactly. This is the original motivation.

> P.S. have you tried using the slice in __select_idle_cpu()? Similar to
> sched_idle_cpu() check, perhaps an additional sched_preempt_short_cpu()
> which compares rq->curr->se.slice with the waking task's slice and
> returs that cpu if SIS_SHORT can help run the workload quicker?

This is a good idea, it seems it would benefit PREEMPT_SHORT. If the customized
task slice is introduced, we can leverage this hint for latency-related
optimization. Task wakeup is one thing; I can also think of other aspects,
like idle load balance, etc. I'm not sure what the proper usage of the
task slice is though, which is why I sent this RFC.

> Note:
> This will not work if the SIS scan itself is the largest overhead in the
> wakeup cycle and not the task placement itself. Previously during
> SIS_UTIL testing, to measure the overheads of scan vs placement, we
> would do a full scan but return the result that SIS_UTIL would have
> returned to determine the overhead of the search itself.
>

Regarding the task placement, do you mean the time between when a task is enqueued
and when it is picked up? Do you have any recommendation for which workload can
expose the scan overhead the most?

thanks,
Chenyu

> > return -1;
> > }
> > }
> > diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> > index 143f55df890b..176324236018 100644
> > --- a/kernel/sched/features.h
> > +++ b/kernel/sched/features.h
> > @@ -50,6 +50,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
> > * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
> > */
> > SCHED_FEAT(SIS_UTIL, true)
> > +SCHED_FEAT(SIS_FAST, true)
> >
> > /*
> > * Issue a WARN when we do multiple update_rq_clock() calls
>
> --
> Thanks and Regards,
> Prateek

2024-05-14 15:23:37

by K Prateek Nayak

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

Hello Chenyu,

On 5/14/2024 2:48 PM, Chen Yu wrote:
>>> [..snip..]
>>> /*
>>> * Scan the LLC domain for idle CPUs; this is dynamically regulated by
>>> * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
>>> @@ -7384,10 +7402,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>> if (sched_feat(SIS_UTIL)) {
>>> sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
>>> if (sd_share) {
>>> - /* because !--nr is the condition to stop scan */
>>> - nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
>>> + nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan));
>>> /* overloaded LLC is unlikely to have idle cpu/core */
>>> - if (nr == 1)
>>> + if (nr <= 0)
>>
>> I was wondering if this would preserve the current behavior with
>> SIS_FAST toggled off? Since the implementation below still does a
>> "--nr <= 0" , wouldn't it effectively visit one CPU less overall now?
>>
>> Have you tried something similar to the below hunk?
>>
>> /* because !--nr is the condition to stop scan */
>> nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan)) + 1;
>> if (nr == 1)
>> return -1;
>>
>
> Yeah, right, to keep the scan depth consistent, the "+1" should be kept.
>
>> I agree with Mike that looking at slice to limit scan-depth seems odd.
>> My experience with netperf is that the workload cares more about the
>> server-client being co-located on the closest cache domain and by
>> limiting scan-depth using slice, this is indirectly achieved since all
>> the wakeups carry the WF_SYNc flag.
>>
>
> Exactly. This is the original motivation.
>
>> P.S. have you tried using the slice in __select_idle_cpu()? Similar to
>> sched_idle_cpu() check, perhaps an additional sched_preempt_short_cpu()
>> which compares rq->curr->se.slice with the waking task's slice and
>> returs that cpu if SIS_SHORT can help run the workload quicker?
>
> This is a good idea, it seems to be benefit PREEMPT_SHORT. If the customized
> task slice is introduced, we can leverage this hint for latency related
> optimization. Task wakeup is one thing, I can also think of other aspects,
> like idle load balance, etc. I'm not sure what is the proper usage of the
> task slice though, this is why I sent this RFC.
>
>> Note:
>> This will not work if the SIS scan itself is the largest overhead in the
>> wakeup cycle and not the task placement itself. Previously during
>> SIS_UTIL testing, to measure the overheads of scan vs placement, we
>> would do a full scan but return the result that SIS_UTIL would have
>> returned to determine the overhead of the search itself.
>>
>
> Regarding the task placement, do you mean the time between a task is enqueued
> and picked up? Do you have any recommendation which workload can expose the
> scan overhead most?

Sorry for not being clear here. From what I've observed in the past,
there are two dimensions to select_idle_sibling():

i) Placement: Final CPU select_idle_sibling() returns
ii) Search: Do we find an idle core/CPU in select_idle_sibling()

In the case of netperf, I've observed that i) is more important than ii),
wherein a placement of the client on the same core/thread as that of the server
results in better performance vs finding an idle CPU on a remote LLC.
For hackbench/tbench, when runqueues are under high utilization (~75%),
reduction in search time ii) seems to be more beneficial.

There was also a wakeup-with-IPI vs. without-IPI angle, which Mathieu
highlighted last year, that I never quite got to the bottom of. I'll go
get some more data on that front and give your patch a try. Expect
results in a couple of days.

Thank you.

>
> thanks,
> Chenyu
>
>>> return -1;
>>> }
>>> }
>>> [..snip..]
--
Thanks and Regards,
Prateek

2024-05-14 16:16:56

by Chen Yu

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

On 2024-05-14 at 20:53:16 +0530, K Prateek Nayak wrote:
> Hello Chenyu,
>
> On 5/14/2024 2:48 PM, Chen Yu wrote:
> >>> [..snip..]
> >>> /*
> >>> * Scan the LLC domain for idle CPUs; this is dynamically regulated by
> >>> * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
> >>> @@ -7384,10 +7402,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> >>> if (sched_feat(SIS_UTIL)) {
> >>> sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
> >>> if (sd_share) {
> >>> - /* because !--nr is the condition to stop scan */
> >>> - nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
> >>> + nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan));
> >>> /* overloaded LLC is unlikely to have idle cpu/core */
> >>> - if (nr == 1)
> >>> + if (nr <= 0)
> >>
> >> I was wondering if this would preserve the current behavior with
> >> SIS_FAST toggled off? Since the implementation below still does a
> >> "--nr <= 0" , wouldn't it effectively visit one CPU less overall now?
> >>
> >> Have you tried something similar to the below hunk?
> >>
> >> /* because !--nr is the condition to stop scan */
> >> nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan)) + 1;
> >> if (nr == 1)
> >> return -1;
> >>
> >
> > Yeah, right, to keep the scan depth consistent, the "+1" should be kept.
> >
> >> I agree with Mike that looking at slice to limit scan-depth seems odd.
> >> My experience with netperf is that the workload cares more about the
> >> server-client being co-located on the closest cache domain and by
> >> limiting scan-depth using slice, this is indirectly achieved since all
> >> the wakeups carry the WF_SYNc flag.
> >>
> >
> > Exactly. This is the original motivation.
> >
> >> P.S. have you tried using the slice in __select_idle_cpu()? Similar to
> >> sched_idle_cpu() check, perhaps an additional sched_preempt_short_cpu()
> >> which compares rq->curr->se.slice with the waking task's slice and
> >> returs that cpu if SIS_SHORT can help run the workload quicker?
> >
> > This is a good idea, it seems to be benefit PREEMPT_SHORT. If the customized
> > task slice is introduced, we can leverage this hint for latency related
> > optimization. Task wakeup is one thing, I can also think of other aspects,
> > like idle load balance, etc. I'm not sure what is the proper usage of the
> > task slice though, this is why I sent this RFC.
> >
> >> Note:
> >> This will not work if the SIS scan itself is the largest overhead in the
> >> wakeup cycle and not the task placement itself. Previously during
> >> SIS_UTIL testing, to measure the overheads of scan vs placement, we
> >> would do a full scan but return the result that SIS_UTIL would have
> >> returned to determine the overhead of the search itself.
> >>
> >
> > Regarding the task placement, do you mean the time between a task is enqueued
> > and picked up? Do you have any recommendation which workload can expose the
> > scan overhead most?
>
> Sorry for not being clear here. From what I've observed in the past,
> there are two dimensions to slect_idle_sibling():
>
> i) Placement: Final CPU select_idle_sibling() returns
> ii) Search: Do we find an idle core/CPU in select_idle_sibling()
>

I see.

> In case of netperf, I've observed that i) is more important than ii)
> wherin a placement of client on same core/thread as that of the server
> results in better performance vs finding an idle CPU on a remote LLC.

How about placement of client on same core/thread vs finding an idle CPU
on a local LLC?

> For hackbench/tbench, when runqueues are under high utilization (~75%),
> reduction in search time ii) seems to be more beneficial.
>

I can understand that hackbench is idle-cpu sensitive because it has an
MxN wakeup relationship and could easily result in task stacking. As for
tbench, it should be similar to netperf, so I'm not sure why it does not
fall into the i) > ii) case. Is it because you were testing netperf TCP_RR (full-duplex)
and tbench is half-duplex (the former has a stronger cache locality requirement)?

> There was also a wakeup from IPI / without IPI angle that I never quite
> got to the bottom of that Mathieu has highlighted last year. I'll go
> get some more data on that front and give your patch a try. Expect
> results in a couple of days.

Sure, I'd be glad to try your patch.

thanks,
Chenyu

2024-05-15 10:13:38

by Peter Zijlstra

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

On Tue, May 07, 2024 at 07:34:54AM +0200, Mike Galbraith wrote:
> On Fri, 2024-04-05 at 12:28 +0200, Peter Zijlstra wrote:
> >
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -7812,7 +7823,9 @@ static int __sched_setscheduler(struct t
> >          * but store a possible modification of reset_on_fork.
> >          */
> >         if (unlikely(policy == p->policy)) {
> > -               if (fair_policy(policy) && attr->sched_nice != task_nice(p))
> > +               if (fair_policy(policy) &&
> > +                   (attr->sched_nice != task_nice(p) ||
> > +                    (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
> >                         goto change;
>
> Can we make that only look at attr->sched_runtime != p->se.slice?
> Doing so lets you clear a custom slice by.. clearing it.. rather than
> making tools rummage about for the proper value to overwrite.

Duh yes. Seems I already did the right thing in __setscheduler_params()
for that.

Done.

2024-05-22 14:49:21

by Chen Yu

Subject: Re: [RFC][PATCH 10/10] sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion

On 2024-05-14 at 20:53:16 +0530, K Prateek Nayak wrote:
> Hello Chenyu,
>
> On 5/14/2024 2:48 PM, Chen Yu wrote:
> >>> [..snip..]
> >>> /*
> >>> * Scan the LLC domain for idle CPUs; this is dynamically regulated by
> >>> * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
> >>> @@ -7384,10 +7402,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> >>> if (sched_feat(SIS_UTIL)) {
> >>> sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
> >>> if (sd_share) {
> >>> - /* because !--nr is the condition to stop scan */
> >>> - nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
> >>> + nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan));
> >>> /* overloaded LLC is unlikely to have idle cpu/core */
> >>> - if (nr == 1)
> >>> + if (nr <= 0)
> >>
> >> I was wondering if this would preserve the current behavior with
> >> SIS_FAST toggled off? Since the implementation below still does a
> >> "--nr <= 0" , wouldn't it effectively visit one CPU less overall now?
> >>
> >> Have you tried something similar to the below hunk?
> >>
> >> /* because !--nr is the condition to stop scan */
> >> nr = adjust_idle_scan(p, READ_ONCE(sd_share->nr_idle_scan)) + 1;
> >> if (nr == 1)
> >> return -1;
> >>
> >
> > Yeah, right, to keep the scan depth consistent, the "+1" should be kept.
> >
> >> I agree with Mike that looking at slice to limit scan-depth seems odd.
> >> My experience with netperf is that the workload cares more about the
> >> server-client being co-located on the closest cache domain and by
> >> limiting scan-depth using slice, this is indirectly achieved since all
> >> the wakeups carry the WF_SYNc flag.
> >>
> >
> > Exactly. This is the original motivation.
> >
> >> P.S. have you tried using the slice in __select_idle_cpu()? Similar to
> >> sched_idle_cpu() check, perhaps an additional sched_preempt_short_cpu()
> >> which compares rq->curr->se.slice with the waking task's slice and
> >> returs that cpu if SIS_SHORT can help run the workload quicker?
> >
> > This is a good idea, it seems to be benefit PREEMPT_SHORT. If the customized
> > task slice is introduced, we can leverage this hint for latency related
> > optimization. Task wakeup is one thing, I can also think of other aspects,
> > like idle load balance, etc. I'm not sure what is the proper usage of the
> > task slice though, this is why I sent this RFC.
> >
> >> Note:
> >> This will not work if the SIS scan itself is the largest overhead in the
> >> wakeup cycle and not the task placement itself. Previously during
> >> SIS_UTIL testing, to measure the overheads of scan vs placement, we
> >> would do a full scan but return the result that SIS_UTIL would have
> >> returned to determine the overhead of the search itself.
> >>
> >
> > Regarding the task placement, do you mean the time between a task is enqueued
> > and picked up? Do you have any recommendation which workload can expose the
> > scan overhead most?
>
> Sorry for not being clear here. From what I've observed in the past,
> there are two dimensions to slect_idle_sibling():
>
> i) Placement: Final CPU select_idle_sibling() returns
> ii) Search: Do we find an idle core/CPU in select_idle_sibling()
>
> In case of netperf, I've observed that i) is more important than ii)
> wherin a placement of client on same core/thread as that of the server
> results in better performance vs finding an idle CPU on a remote LLC.
> For hackbench/tbench, when runqueues are under high utilization (~75%),
> reduction in search time ii) seems to be more beneficial.
>

Usually task stacking is not preferred because it hurts latency. But as we
discovered on AMD's and Intel's large servers, sometimes the overhead of task
migration offsets the benefit of running on an idle CPU.

Inspired by your description of SIS_SHORT, and also Mike's feedback, I respun
the SIS_SHORT patch to consider the task's customized slice:

Leverage WF_SYNC to achieve better cache locality.
If both the waker and the wakee are short-duration tasks, wake up the wakee on
the waker's CPU directly. To avoid task stacking as much as possible:

1. Only when has_idle_core is false does SIS_SYNC take effect.

2. Only if the user has shrunk the task slice does SIS_SYNC take effect.

3. The waker must be the only runnable task on the runqueue.

4. The duration threshold is based on the number of CPUs within the LLC.
The more CPUs there are, the lower the bar for a task to be treated as
a short-duration one, and vice versa (a worked example follows this list).

threshold = sysctl_sched_migration_cost * llc_weight^2 / 256^2

LLC_WEIGHT     threshold
8              0.5 usec
16             2 usec
32             8 usec
64             31 usec
128            125 usec
256            500 usec

5. Honor idle-CPU-first for CPUs that share the L2 cache (Cluster). Only
when there is no idle CPU within the Cluster domain does SIS_SYNC take
effect.
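
To make the thresholds above concrete, assuming the default
sysctl_sched_migration_cost of 500000 ns:

threshold(llc_weight=8)   = 500000 * 8^2   / 256^2 ~= 488 ns    ~= 0.5 usec
threshold(llc_weight=128) = 500000 * 128^2 / 256^2  = 125000 ns  = 125 usec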

Benchmark:
Tested on 4 platforms: significant throughput improvements on tbench, netperf,
stress-ng, and will-it-scale, and reduced latency in lmbench. No performance
difference was observed in an Online Transaction Processing (OLTP) test.

Platform1, 240 CPUs, 2 sockets Intel(R) Xeon(R)
========================================================================
netperf
=======
case load baseline(std%) compare%( std%)
TCP_RR 60-threads 1.00 ( 1.11) +8.22 ( 1.06)
TCP_RR 120-threads 1.00 ( 1.74) +38.71 ( 0.74)
TCP_RR 180-threads 1.00 ( 1.21) +108.12 ( 1.07)
TCP_RR 240-threads 1.00 ( 4.75) +160.98 ( 4.01)
UDP_RR 60-threads 1.00 ( 11.70) +7.17 ( 10.89)
UDP_RR 120-threads 1.00 ( 10.82) +5.54 ( 7.17)
UDP_RR 180-threads 1.00 ( 9.77) +10.29 ( 10.87)
UDP_RR 240-threads 1.00 ( 12.38) +4.45 ( 16.69)

tbench
======
case load baseline(std%) compare%( std%)
loopback 60-threads 1.00 ( 0.22) +8.67 ( 0.12)
loopback 120-threads 1.00 ( 0.20) +42.12 ( 1.16)
loopback 180-threads 1.00 ( 0.16) +96.36 ( 1.34)
loopback 240-threads 1.00 ( 0.23) +122.68 ( 4.87)

schbench
========
No noticeable difference in the 99.0th percentile wakeup/request latency or the 50.0th percentile RPS.
schbench -m 2 -r 100

                                 baseline    sis_sync
Wakeup Latencies  99.0th usec          27          25
Request Latencies 99.0th usec       15376       15376
RPS percentiles   50.0th            16608       16608

Platform2, 48 CPUs 2 sockets Intel(R) Xeon(R) CPU E5-2697
========================================================================
lmbench3: lmbench3.PIPE.latency.us 33.8% improvement
lmbench3: lmbench3.AF_UNIX.sock.stream.latency.us 30.6% improvement

Platform3: 224 threads 2 sockets Intel(R) Xeon(R) Platinum 8480
=======================================================================
stress-ng: stress-ng.vm-rw.ops_per_sec 250.8% improvement
will-it-scale: will-it-scale.per_process_ops 42.1% improvement

Platform4: 288 CPUs, 2 sockets, 4 CPUs share the L2 cache, Intel(R) Xeon(R)
=========================================================================
netperf
=======
case load baseline(std%) compare%( std%)
TCP_RR 72-threads 1.00 ( 0.89) +6.14 ( 0.95)
TCP_RR 144-threads 1.00 ( 1.53) +5.18 ( 1.24)
TCP_RR 216-threads 1.00 ( 10.76) +23.31 ( 1.67)
TCP_RR 288-threads 1.00 ( 2.28) +1.72 ( 2.10)
UDP_RR 72-threads 1.00 ( 3.44) +0.42 ( 3.11)
UDP_RR 144-threads 1.00 ( 14.53) +1.40 ( 16.11)
UDP_RR 216-threads 1.00 ( 17.30) +1.18 ( 23.00)
UDP_RR 288-threads 1.00 ( 21.30) -0.15 ( 20.14)


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1437c276c915..b2b838d499ea 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1003,7 +1003,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
#include "pelt.h"
#ifdef CONFIG_SMP

-static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu, int sync);
static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);

@@ -7410,16 +7410,36 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd

#endif /* CONFIG_SCHED_SMT */

+static int short_task(struct task_struct *p, int llc)
+{
+ if (!p->se.custom_slice || p->se.slice >= sysctl_sched_base_slice)
+ return 0;
+
+ /*
+ * Scale the threshold by the LLC CPU number.
+ * The more CPUs there are, the more likely the
+ * task is regarded as a small one.
+ *
+ */
+ if ((p->duration_avg << 16) >=
+ (sysctl_sched_migration_cost * llc * llc))
+ return 0;
+
+ return 1;
+}
+
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target,
+ int sync)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
struct sched_domain_shared *sd_share;
+ int llc_weight = per_cpu(sd_llc_size, target);

cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

@@ -7458,6 +7478,20 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
}
}

+ /*
+ * When reached here, next step is to scan idle CPUs that do not share L2.
+ * However, the Core-to-Core cache latency could be large on big system.
+ * Give target another chance if waker and wakee are mutually waking up
+ * each other.
+ */
+ if (sched_feat(SIS_SYNC) &&
+ target == smp_processor_id() && !has_idle_core &&
+ sync && this_rq()->nr_running <= 1 &&
+ short_task(p, llc_weight) &&
+ short_task(current, llc_weight)) {
+ return target;
+ }
+
for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -7550,7 +7584,7 @@ static inline bool asym_fits_cpu(unsigned long util,
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
-static int select_idle_sibling(struct task_struct *p, int prev, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target, int sync)
{
bool has_idle_core = false;
struct sched_domain *sd;
@@ -7659,7 +7693,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
}

- i = select_idle_cpu(p, sd, has_idle_core, target);
+ i = select_idle_cpu(p, sd, has_idle_core, target, sync);
if ((unsigned)i < nr_cpumask_bits)
return i;

@@ -8259,7 +8293,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
- new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu, sync);
}
rcu_read_unlock();

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 143f55df890b..7e5968d01dcb 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -50,6 +50,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
SCHED_FEAT(SIS_UTIL, true)
+SCHED_FEAT(SIS_SYNC, true)

/*
* Issue a WARN when we do multiple update_rq_clock() calls
--
2.25.1



Preparation patch to record the task's duration:


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 61591ac6eab6..9e8dfdc81048 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1339,6 +1339,9 @@ struct task_struct {
struct callback_head cid_work;
#endif

+ u64 prev_sleep_sum_runtime;
+ u64 duration_avg;
+
struct tlbflush_unmap_batch tlb_ubc;

/* Cache last used pipe for splice(): */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4cc0522..c4288d613374 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4566,6 +4566,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->migration_pending = NULL;
#endif
init_sched_mm_cid(p);
+ p->prev_sleep_sum_runtime = 0;
+ p->duration_avg = 0;
}

DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a5b1ae0aa55..1437c276c915 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6833,6 +6833,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

static void set_next_buddy(struct sched_entity *se);

+static inline void dur_avg_update(struct task_struct *p)
+{
+ u64 dur;
+
+ dur = p->se.sum_exec_runtime - p->prev_sleep_sum_runtime;
+ p->prev_sleep_sum_runtime = p->se.sum_exec_runtime;
+ update_avg(&p->duration_avg, dur);
+}
+
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
@@ -6905,6 +6914,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
+ if (task_sleep)
+ dur_avg_update(p);
+
hrtick_update(rq);
}

--
2.25.1