This patchset restarts the work on adding a latency nice priority to
describe the latency tolerance of cfs tasks.
Patches [1-4] were done by Parth:
https://lore.kernel.org/lkml/[email protected]/
I have only rebased them and moved the setting of the latency priority
outside the priority update. I have removed the Reviewed-by tags because
the patches are 2 years old.
Patches [5-6] use the latency nice priority to decide whether a cfs task
can preempt the currently running task. Patch 5 gives some test results
with cyclictest and hackbench to highlight the benefit of latency nice
priority for short interactive tasks or long CPU-intensive tasks.
Parth Shah (4):
sched: Introduce latency-nice as a per-task attribute
sched/core: Propagate parent task's latency requirements to the child
task
sched: Allow sched_{get,set}attr to change latency_nice of the task
sched/core: Add permission checks for setting the latency_nice value
Vincent Guittot (2):
sched/fair: Take into account latency nice at wakeup
sched/fair: Add sched group latency support
include/linux/sched.h | 3 +
include/uapi/linux/sched.h | 4 +-
include/uapi/linux/sched/types.h | 19 +++++++
init/init_task.c | 1 +
kernel/sched/core.c | 97 ++++++++++++++++++++++++++++++++
kernel/sched/debug.c | 1 +
kernel/sched/fair.c | 92 +++++++++++++++++++++++++++++-
kernel/sched/sched.h | 34 +++++++++++
tools/include/uapi/linux/sched.h | 4 +-
9 files changed, 251 insertions(+), 4 deletions(-)
--
2.17.1
From: Parth Shah <[email protected]>
Clone the parent task's latency_nice attribute to the forked child task.
Reset the latency_nice value to the default when the child task has
sched_reset_on_fork set.
Also, initialize init_task.latency_nice with the DEFAULT_LATENCY_NICE
value.
Signed-off-by: Parth Shah <[email protected]>
[rebase]
Signed-off-by: Vincent Guittot <[email protected]>
---
init/init_task.c | 1 +
kernel/sched/core.c | 4 ++++
2 files changed, 5 insertions(+)
diff --git a/init/init_task.c b/init/init_task.c
index 73cc8f03511a..2afa249c253b 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,6 +78,7 @@ struct task_struct init_task
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
+ .latency_nice = 0,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1d863d7f6ad7..157eef880d1d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4393,6 +4393,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
*/
p->prio = current->normal_prio;
+ /* Propagate the parent's latency requirements to the child as well */
+ p->latency_nice = current->latency_nice;
+
uclamp_fork(p);
/*
@@ -4409,6 +4412,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->latency_nice = DEFAULT_LATENCY_NICE;
/*
* We don't need the reset flag anymore after the fork. It has
* fulfilled its duty:
--
2.17.1
Tasks can set their latency priority, which is then used to decide whether
to preempt the currently running entity of the cfs rq, but sched group
entities still have the default latency priority.
Add a latency field in the task group to set the latency priority of the
group, which will be used against other entities in the parent cfs rq.
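As an illustration only (not part of this patch), a group could be marked
latency sensitive by writing to the new cpu.latency.nice file. The cgroup v2
mount point and the "app" group below are assumptions:

/*
 * Minimal sketch: write -20 (most latency sensitive) to the
 * cpu.latency.nice file added by this patch. The path is an example;
 * adjust it to the actual cgroup hierarchy.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/app/cpu.latency.nice";
	const char *val = "-20";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
		perror("write");
	close(fd);
	return 0;
}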
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/core.c | 41 +++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 32 ++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 4 ++++
3 files changed, 77 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 547b0da01efe..e0668652dd24 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10635,6 +10635,30 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
+
+static s64 cpu_latency_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return css_tg(css)->latency_prio;
+}
+
+static int cpu_latency_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 latency_prio)
+{
+ return sched_group_set_latency(css_tg(css), latency_prio);
+}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return LATENCY_TO_NICE(css_tg(css)->latency_prio);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 latency_nice)
+{
+ return sched_group_set_latency(css_tg(css), NICE_TO_LATENCY(latency_nice));
+}
#endif
static struct cftype cpu_legacy_files[] = {
@@ -10649,6 +10673,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency",
+ .read_s64 = cpu_latency_read_s64,
+ .write_s64 = cpu_latency_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -10866,6 +10895,18 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_read_s64,
+ .write_s64 = cpu_latency_write_s64,
+ },
+ {
+ .name = "latency.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 506c482a0e48..cbccef025089 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11496,6 +11496,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;
tg->shares = NICE_0_LOAD;
+ tg->latency_prio = DEFAULT_LATENCY_PRIO;
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
@@ -11594,6 +11595,7 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}
se->my_q = cfs_rq;
+ se->latency_weight = sched_latency_to_weight[tg->latency_prio];
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -11724,6 +11726,36 @@ int sched_group_set_idle(struct task_group *tg, long idle)
return 0;
}
+int sched_group_set_latency(struct task_group *tg, long latency_prio)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ if (latency_prio < 0 ||
+ latency_prio >= LATENCY_NICE_WIDTH)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->latency_prio == latency_prio) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->latency_prio = latency_prio;
+
+ for_each_possible_cpu(i) {
+ struct sched_entity *se = tg->se[i];
+
+ WRITE_ONCE(se->latency_weight, sched_latency_to_weight[latency_prio]);
+ }
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dd92aa9c36f9..885d1c809329 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -429,6 +429,8 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
+ /* latency priority of the group. */
+ int latency_prio;
#ifdef CONFIG_SMP
/*
@@ -542,6 +544,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
+extern int sched_group_set_latency(struct task_group *tg, long latency);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
--
2.17.1
From: Parth Shah <[email protected]>
Latency-nice indicates the latency requirements of a task with respect
to the other tasks in the system. The value of the attribute can be within
the range [-20, 19], both inclusive, in line with task nice values.
latency_nice = -20 indicates that the task should get the lowest latency,
as compared to tasks having latency_nice = +19.
latency_nice only affects the CFS sched class, which gets the latency
requirements from userspace.
Additionally, add debugging bits for the newly added latency_nice
attribute.
Signed-off-by: Parth Shah <[email protected]>
[rebase]
Signed-off-by: Vincent Guittot <[email protected]>
---
include/linux/sched.h | 1 +
kernel/sched/debug.c | 1 +
kernel/sched/sched.h | 18 ++++++++++++++++++
3 files changed, 20 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 508b91d57470..2aa889a59054 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -779,6 +779,7 @@ struct task_struct {
int static_prio;
int normal_prio;
unsigned int rt_priority;
+ int latency_nice;
struct sched_entity se;
struct sched_rt_entity rt;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 102d6f70e84d..5d76a8927888 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1043,6 +1043,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#endif
P(policy);
P(prio);
+ P(latency_nice);
if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9b33ba9c3c42..456ad2159eb1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -105,6 +105,24 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+/*
+ * Latency nice is meant to provide scheduler hints about the relative
+ * latency requirements of a task with respect to other tasks.
+ * Thus a task with latency_nice == 19 can be hinted as the task with no
+ * latency requirements, in contrast to the task with latency_nice == -20
+ * which should be given priority in terms of lower latency.
+ */
+#define MAX_LATENCY_NICE 19
+#define MIN_LATENCY_NICE -20
+
+#define LATENCY_NICE_WIDTH \
+ (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1)
+
+/*
+ * Default tasks should be treated as a task with latency_nice = 0.
+ */
+#define DEFAULT_LATENCY_NICE 0
+
/*
* Increase resolution of nice-level calculations for 64-bit architectures.
* The extra resolution improves shares distribution and load balancing of
--
2.17.1
From: Parth Shah <[email protected]>
Introduce the latency_nice attribute to sched_attr and provide a
mechanism to change the value with the sched_setattr/sched_getattr
syscalls.
Also add a new flag, "SCHED_FLAG_LATENCY_NICE", to indicate a change of
the task's latency_nice on a sched_setattr syscall.
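For illustration only (not part of this patch), a task could request a
latency_nice of -10 roughly as follows. The locally defined struct mirrors
the uapi layout added below and would need adjusting if the ABI changes;
note that, per the permission patch, lowering latency_nice below the
current value may require CAP_SYS_NICE:

#define _GNU_SOURCE
#include <linux/types.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#define SCHED_FLAG_LATENCY_NICE	0x80	/* mirrors the new uapi flag */

struct sched_attr {
	__u32 size;
	__u32 sched_policy;
	__u64 sched_flags;
	__s32 sched_nice;
	__u32 sched_priority;
	__u64 sched_runtime;
	__u64 sched_deadline;
	__u64 sched_period;
	__u32 sched_util_min;
	__u32 sched_util_max;
	__s32 sched_latency_nice;	/* new in SCHED_ATTR_SIZE_VER2 */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);		/* 60 == SCHED_ATTR_SIZE_VER2 */
	attr.sched_policy = 0;			/* SCHED_NORMAL */
	attr.sched_flags = SCHED_FLAG_LATENCY_NICE;
	attr.sched_latency_nice = -10;		/* more latency sensitive */

	if (syscall(__NR_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}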
Signed-off-by: Parth Shah <[email protected]>
[rebase and add a dedicated __setscheduler_latency ]
Signed-off-by: Vincent Guittot <[email protected]>
---
include/uapi/linux/sched.h | 4 +++-
include/uapi/linux/sched/types.h | 19 +++++++++++++++++++
kernel/sched/core.c | 26 ++++++++++++++++++++++++++
tools/include/uapi/linux/sched.h | 4 +++-
4 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..b2e932c25be6 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbf..0aa4e3b6ed59 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -10,6 +10,7 @@ struct sched_param {
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
+#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */
/*
* Extended scheduling parameters data structure.
@@ -98,6 +99,22 @@ struct sched_param {
* scheduled on a CPU with no more capacity than the specified value.
*
* A task utilization boundary can be reset by setting the attribute to -1.
+ *
+ * Latency Tolerance Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows specifying the relative latency
+ * requirements of a task with respect to the other tasks running/queued in the
+ * system.
+ *
+ * @ sched_latency_nice task's latency_nice value
+ *
+ * The latency_nice of a task can have any value in a range of
+ * [LATENCY_NICE_MIN..LATENCY_NICE_MAX].
+ *
+ * A task with latency_nice set to LATENCY_NICE_MIN can be considered to
+ * have lower latency requirements than a task with a higher latency_nice
+ * value.
*/
struct sched_attr {
__u32 size;
@@ -120,6 +137,8 @@ struct sched_attr {
__u32 sched_util_min;
__u32 sched_util_max;
+ /* latency requirement hints */
+ __s32 sched_latency_nice;
};
#endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 157eef880d1d..3edba1a38ecb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7219,6 +7219,16 @@ static void __setscheduler_params(struct task_struct *p,
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
set_load_weight(p, true);
+
+}
+
+static void __setscheduler_latency(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+ p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice);
+ set_latency_weight(p);
+ }
}
/*
@@ -7345,6 +7355,13 @@ static int __sched_setscheduler(struct task_struct *p,
return retval;
}
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+ if (attr->sched_latency_nice > MAX_LATENCY_NICE)
+ return -EINVAL;
+ if (attr->sched_latency_nice < MIN_LATENCY_NICE)
+ return -EINVAL;
+ }
+
if (pi)
cpuset_read_lock();
@@ -7379,6 +7396,9 @@ static int __sched_setscheduler(struct task_struct *p,
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
+ attr->sched_latency_nice != p->latency_nice)
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
retval = 0;
@@ -7467,6 +7487,7 @@ static int __sched_setscheduler(struct task_struct *p,
__setscheduler_params(p, attr);
__setscheduler_prio(p, newprio);
}
+ __setscheduler_latency(p, attr);
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -7677,6 +7698,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
+ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
+ size < SCHED_ATTR_SIZE_VER2)
+ return -EINVAL;
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -7914,6 +7938,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
get_params(p, &kattr);
kattr.sched_flags &= SCHED_FLAG_ALL;
+ kattr.sched_latency_nice = p->latency_nice;
+
#ifdef CONFIG_UCLAMP_TASK
/*
* This could race with another potential updater, but this is fine
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
index 3bac0a8ceab2..ecc4884bfe4b 100644
--- a/tools/include/uapi/linux/sched.h
+++ b/tools/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
+#define SCHED_FLAG_LATENCY_NICE 0x80
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
- SCHED_FLAG_UTIL_CLAMP)
+ SCHED_FLAG_UTIL_CLAMP | \
+ SCHED_FLAG_LATENCY_NICE)
#endif /* _UAPI_LINUX_SCHED_H */
--
2.17.1
Take into account the latency nice priority of a thread when deciding
whether to preempt the currently running thread. We don't want to provide
more CPU bandwidth to a thread, but to reorder the scheduling so that a
latency sensitive task runs first whenever possible.
As long as a thread has not used up its bandwidth, it will be able to
preempt the current thread.
Conversely, a thread with a low latency priority will preempt the current
thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it
will wait for the tick to get its sched slice.
curr vruntime
|
sysctl_sched_wakeup_granularity
<-->
----------------------------------|----|-----------------------|---------------
| |<--------------------->
| . sysctl_sched_latency
| .
default/current latency entity | .
| .
1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
| .
| .
| .
low latency entity | .
---------------------->|
% of sysctl_sched_latency |
1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
preempt ------------------------------------------------->|<- do not preempt --
| .
| .
| .
high latency entity | .
|<-----------------------| .
| % of sysctl_sched_latency .
111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
preempt->|<- se doesn't preempt curr ------------------------------------------
Test results of latency nice impact on a heavy load like hackbench:
hackbench -l (2560 / group) -g group
group latency 0 latency 19
1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
hackbench -p -l (2560 / group) -g group
group latency 0 latency 19
1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
By decreasing the latency priority, we reduce the number of preemptions at
wakeup and help hackbench make progress.
Test results of latency nice impact on a short-lived load like cyclictest
while competing with a heavy load like hackbench:
hackbench -l 10000 -g group &
cyclictest --policy other -D 5 -q -n
latency 0 latency -20
group min avg max min avg max
0 16 17 28 15 17 27
1 61 382 10603 63 89 4628
4 52 437 15455 54 98 16238
8 56 728 38499 61 125 28983
16 53 1215 52207 61 183 80751
group = 0 means that hackbench is not running.
The avg is significantly improved with latency nice -20, especially with a
large number of groups, but min and max remain quite similar. If we add the
histogram parameters to get details of the latencies, we have:
hackbench -l 10000 -g 16 &
cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
latency 0 latency -20
Min Latencies: 63 62
Avg Latencies: 1397 218
Max Latencies: 44926 42291
50% latencies: 100 98
75% latencies: 762 116
85% latencies: 1118 126
90% latencies: 1540 130
95% latencies: 5610 138
99% latencies: 13738 266
With the percentile details, we see the benefit of latency nice -20: only
1% of the latencies are above 266us, whereas with the default latency 25%
are above 762us and 10% are above 1ms.
Signed-off-by: Vincent Guittot <[email protected]>
---
include/linux/sched.h | 4 ++-
init/init_task.c | 2 +-
kernel/sched/core.c | 32 +++++++++++++++++++----
kernel/sched/debug.c | 2 +-
kernel/sched/fair.c | 60 +++++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 12 +++++++++
6 files changed, 102 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2aa889a59054..9aeb157e819b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -560,6 +560,8 @@ struct sched_entity {
unsigned long runnable_weight;
#endif
+ int latency_weight;
+
#ifdef CONFIG_SMP
/*
* Per entity load average tracking.
@@ -779,7 +781,7 @@ struct task_struct {
int static_prio;
int normal_prio;
unsigned int rt_priority;
- int latency_nice;
+ int latency_prio;
struct sched_entity se;
struct sched_rt_entity rt;
diff --git a/init/init_task.c b/init/init_task.c
index 2afa249c253b..e98c71f24981 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,7 +78,7 @@ struct task_struct init_task
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
- .latency_nice = 0,
+ .latency_prio = NICE_WIDTH - 20,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8f8b102a75c4..547b0da01efe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1241,6 +1241,11 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}
+static void set_latency_weight(struct task_struct *p)
+{
+ p->se.latency_weight = sched_latency_to_weight[p->latency_prio];
+}
+
#ifdef CONFIG_UCLAMP_TASK
/*
* Serializes updates of utilization clamp values
@@ -4394,7 +4399,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = current->normal_prio;
/* Propagate the parent's latency requirements to the child as well */
- p->latency_nice = current->latency_nice;
+ p->latency_prio = current->latency_prio;
uclamp_fork(p);
@@ -4412,7 +4417,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
- p->latency_nice = DEFAULT_LATENCY_NICE;
+ p->latency_prio = NICE_TO_LATENCY(0);
/*
* We don't need the reset flag anymore after the fork. It has
* fulfilled its duty:
@@ -4420,6 +4425,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_reset_on_fork = 0;
}
+ /* Once latency_prio is set, update the latency weight */
+ set_latency_weight(p);
+
if (dl_prio(p->prio))
return -EAGAIN;
else if (rt_prio(p->prio))
@@ -7361,7 +7369,7 @@ static int __sched_setscheduler(struct task_struct *p,
if (attr->sched_latency_nice < MIN_LATENCY_NICE)
return -EINVAL;
/* Use the same security checks as NICE */
- if (attr->sched_latency_nice < p->latency_nice &&
+ if (attr->sched_latency_nice < LATENCY_TO_NICE(p->latency_prio) &&
!capable(CAP_SYS_NICE))
return -EPERM;
}
@@ -7401,7 +7409,7 @@ static int __sched_setscheduler(struct task_struct *p,
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
- attr->sched_latency_nice != p->latency_nice)
+ attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))
goto change;
p->sched_reset_on_fork = reset_on_fork;
@@ -7942,7 +7950,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
get_params(p, &kattr);
kattr.sched_flags &= SCHED_FLAG_ALL;
- kattr.sched_latency_nice = p->latency_nice;
+ kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -10954,6 +10962,20 @@ const u32 sched_prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
+/*
+ * latency weight for wakeup preemption
+ */
+const int sched_latency_to_weight[40] = {
+ /* -20 */ 1024, 973, 922, 870, 819,
+ /* -15 */ 768, 717, 666, 614, 563,
+ /* -10 */ 512, 461, 410, 358, 307,
+ /* -5 */ 256, 205, 154, 102, 51,
+ /* 0 */ 0, -51, -102, -154, -205,
+ /* 5 */ -256, -307, -358, -410, -461,
+ /* 10 */ -512, -563, -614, -666, -717,
+ /* 15 */ -768, -819, -870, -922, -973,
+};
+
void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
trace_sched_update_nr_running_tp(rq, count);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5d76a8927888..253e52ec73fb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1043,7 +1043,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
#endif
P(policy);
P(prio);
- P(latency_nice);
+ P(latency_prio);
if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c4bfffe8c2c..506c482a0e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5555,6 +5555,35 @@ static int sched_idle_cpu(int cpu)
}
#endif
+static void set_next_buddy(struct sched_entity *se);
+
+static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se)
+{
+ struct sched_entity *next;
+
+ if (se->latency_weight <= 0)
+ return;
+
+ if (cfs->nr_running <= 1)
+ return;
+ /*
+ * When waking from idle, we don't need to check to preempt at wakeup
+ * the idle thread and don't set next buddy as a candidate for being
+ * picked in priority.
+ * In case of simultaneous wakeup from idle, the latency sensitive tasks
+ * lost opportunity to preempt non sensitive tasks which woke up
+ * simultaneously.
+ */
+
+ if (cfs->next)
+ next = cfs->next;
+ else
+ next = __pick_first_entity(cfs);
+
+ if (next && wakeup_preempt_entity(next, se) == 1)
+ set_next_buddy(se);
+}
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5648,6 +5677,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
update_overutilized_status(rq);
+ if (rq->curr == rq->idle)
+ check_preempt_from_idle(cfs_rq_of(&p->se), &p->se);
+
enqueue_throttle:
if (cfs_bandwidth_used()) {
/*
@@ -5669,8 +5701,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
hrtick_update(rq);
}
-static void set_next_buddy(struct sched_entity *se);
-
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
@@ -6970,6 +7000,27 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
#endif /* CONFIG_SMP */
+static long wakeup_latency_gran(int latency_weight)
+{
+ long thresh = sysctl_sched_latency;
+
+ if (!latency_weight)
+ return 0;
+
+ if (sched_feat(GENTLE_FAIR_SLEEPERS))
+ thresh >>= 1;
+
+ /*
+ * Clamp the delta to stay in the scheduler period range
+ * [-sysctl_sched_latency:sysctl_sched_latency]
+ */
+ latency_weight = clamp_t(long, latency_weight,
+ -1 * NICE_LATENCY_WEIGHT_MAX,
+ NICE_LATENCY_WEIGHT_MAX);
+
+ return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
+}
+
static unsigned long wakeup_gran(struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
@@ -7008,6 +7059,10 @@ static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
s64 gran, vdiff = curr->vruntime - se->vruntime;
+ int latency_weight = se->latency_weight - curr->latency_weight;
+
+ latency_weight = min(latency_weight, se->latency_weight);
+ vdiff += wakeup_latency_gran(latency_weight);
if (vdiff <= 0)
return -1;
@@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
update_curr(cfs_rq_of(se));
+
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 456ad2159eb1..dd92aa9c36f9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
* Default tasks should be treated as a task with latency_nice = 0.
*/
#define DEFAULT_LATENCY_NICE 0
+#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static latency [ 0..39 ],
+ * and back.
+ */
+#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
+#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
+#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
+#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
/*
* Increase resolution of nice-level calculations for 64-bit architectures.
@@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
extern const int sched_prio_to_weight[40];
extern const u32 sched_prio_to_wmult[40];
+extern const int sched_latency_to_weight[40];
/*
* {de,en}queue flags:
--
2.17.1
From: Parth Shah <[email protected]>
Since latency_nice uses infrastructure similar to NICE, reuse the existing
CAP_SYS_NICE security checks for latency_nice. This returns -EPERM when a
non-privileged user tries to set a task's latency_nice value lower than its
current value.
Signed-off-by: Parth Shah <[email protected]>
[rebase]
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/core.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3edba1a38ecb..8f8b102a75c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7360,6 +7360,10 @@ static int __sched_setscheduler(struct task_struct *p,
return -EINVAL;
if (attr->sched_latency_nice < MIN_LATENCY_NICE)
return -EINVAL;
+ /* Use the same security checks as NICE */
+ if (attr->sched_latency_nice < p->latency_nice &&
+ !capable(CAP_SYS_NICE))
+ return -EPERM;
}
if (pi)
--
2.17.1
On Fri, Mar 11, 2022 at 8:15 AM Vincent Guittot
<[email protected]> wrote:
>
[snip]
>
> static struct cftype cpu_legacy_files[] = {
> @@ -10649,6 +10673,11 @@ static struct cftype cpu_legacy_files[] = {
> .read_s64 = cpu_idle_read_s64,
> .write_s64 = cpu_idle_write_s64,
> },
> + {
> + .name = "latency",
> + .read_s64 = cpu_latency_read_s64,
> + .write_s64 = cpu_latency_write_s64,
> + },
> #endif
> #ifdef CONFIG_CFS_BANDWIDTH
> {
> @@ -10866,6 +10895,18 @@ static struct cftype cpu_files[] = {
> .read_s64 = cpu_idle_read_s64,
> .write_s64 = cpu_idle_write_s64,
> },
> + {
> + .name = "latency",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .read_s64 = cpu_latency_read_s64,
> + .write_s64 = cpu_latency_write_s64,
> + },
> + {
> + .name = "latency.nice",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .read_s64 = cpu_latency_nice_read_s64,
> + .write_s64 = cpu_latency_nice_write_s64,
> + },
Something I considered when adding cpu.idle was that negative values
could be used to indicate increasing latency sensitivity. Folding the
above latency property into cpu.idle could help consolidate the
"latency" behavior, especially given that it shouldn't really be
possible to configure an entity as both latency sensitive and idle.
On Tue, 15 Mar 2022 at 01:53, Josh Don <[email protected]> wrote:
>
> Hi Vincent,
>
> On Fri, Mar 11, 2022 at 8:21 AM Vincent Guittot
> <[email protected]> wrote:
> >
> [snip]
> > +
> > +static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se)
> > +{
> > + struct sched_entity *next;
> > +
> > + if (se->latency_weight <= 0)
> > + return;
> > +
> > + if (cfs->nr_running <= 1)
> > + return;
>
> I don't quite understand this nr_running check.
Just to return early if there is no other task running on the cfs rq.
>
> > + /*
> > + * When waking from idle, we don't need to check to preempt at wakeup
> > + * the idle thread and don't set next buddy as a candidate for being
> > + * picked in priority.
> > + * In case of simultaneous wakeup from idle, the latency sensitive tasks
> > + * lost opportunity to preempt non sensitive tasks which woke up
> > + * simultaneously.
> > + */
> > +
> > + if (cfs->next)
> > + next = cfs->next;
> > + else
> > + next = __pick_first_entity(cfs);
> > +
> > + if (next && wakeup_preempt_entity(next, se) == 1)
> > + set_next_buddy(se);
> > +}
> > +
>
> What's the motivation to do this with the next buddy vs using wakeup
> entity placement to achieve a similar result? The latter would also
I don't want to modify vruntime because of the latency prio, because it
would mean impacting the cpu bandwidth of the task, which is managed
with the nice priority. Latency nice is only about preempting the
currently running task.
> more generically work when we aren't transitioning from idle. It also
> doesn't suffer from some slight weirdness here in the interaction with
> core scheduling, where rq->curr can be idle despite the presence of
> runnable tasks if the cpu is forced idle.
TBH, I haven't looked in detail at the core scheduling part, which adds
another level of complexity when selecting which task should run on
the core.
On Tue, 15 Mar 2022 at 01:59, Josh Don <[email protected]> wrote:
>
> On Fri, Mar 11, 2022 at 8:15 AM Vincent Guittot
> <[email protected]> wrote:
> >
> [snip]
> >
> > static struct cftype cpu_legacy_files[] = {
> > @@ -10649,6 +10673,11 @@ static struct cftype cpu_legacy_files[] = {
> > .read_s64 = cpu_idle_read_s64,
> > .write_s64 = cpu_idle_write_s64,
> > },
> > + {
> > + .name = "latency",
> > + .read_s64 = cpu_latency_read_s64,
> > + .write_s64 = cpu_latency_write_s64,
> > + },
> > #endif
> > #ifdef CONFIG_CFS_BANDWIDTH
> > {
> > @@ -10866,6 +10895,18 @@ static struct cftype cpu_files[] = {
> > .read_s64 = cpu_idle_read_s64,
> > .write_s64 = cpu_idle_write_s64,
> > },
> > + {
> > + .name = "latency",
> > + .flags = CFTYPE_NOT_ON_ROOT,
> > + .read_s64 = cpu_latency_read_s64,
> > + .write_s64 = cpu_latency_write_s64,
> > + },
> > + {
> > + .name = "latency.nice",
> > + .flags = CFTYPE_NOT_ON_ROOT,
> > + .read_s64 = cpu_latency_nice_read_s64,
> > + .write_s64 = cpu_latency_nice_write_s64,
> > + },
>
> Something I considered when adding cpu.idle was that negative values
> could be used to indicate increasing latency sensitivity. Folding the
> above latency property into cpu.idle could help consolidate the
> "latency" behavior, especially given that it shouldn't really be
> possible to configure an entity as both latency sensitive and idle.
The range of latency nice is [-20:19] and it doesn't touch the weight,
whereas the sched_idle behavior impacts both the shares and the
preemption, so I was afraid of possible confusion with what latency
nice is doing.
Hi Vincent,
On Fri, Mar 11, 2022 at 8:21 AM Vincent Guittot
<[email protected]> wrote:
>
[snip]
> +
> +static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se)
> +{
> + struct sched_entity *next;
> +
> + if (se->latency_weight <= 0)
> + return;
> +
> + if (cfs->nr_running <= 1)
> + return;
I don't quite understand this nr_running check.
> + /*
> + * When waking from idle, we don't need to check to preempt at wakeup
> + * the idle thread and don't set next buddy as a candidate for being
> + * picked in priority.
> + * In case of simultaneous wakeup from idle, the latency sensitive tasks
> + * lost opportunity to preempt non sensitive tasks which woke up
> + * simultaneously.
> + */
> +
> + if (cfs->next)
> + next = cfs->next;
> + else
> + next = __pick_first_entity(cfs);
> +
> + if (next && wakeup_preempt_entity(next, se) == 1)
> + set_next_buddy(se);
> +}
> +
What's the motivation to do this with the next buddy vs using wakeup
entity placement to achieve a similar result? The latter would also
more generically work when we aren't transitioning from idle. It also
doesn't suffer from some slight weirdness here in the interaction with
core scheduling, where rq->curr can be idle despite the presence of
runnable tasks if the cpu is forced idle.
Hello,
On Fri, Mar 11, 2022 at 05:14:06PM +0100, Vincent Guittot wrote:
> Tasks can set its latency priority which is then used to decide to preempt
> the current running entity of the cfs but sched group entities still have
> the default latency priority.
>
> Add a latency field in task group to set the latency priority of the group
> which will be used against other entity in the parent cfs.
One thing that bothers me about this interface is that the configuration
values aren't well defined. We have the same problems with the nice levels
but at least have them map to well defined weight values, which likely won't
change in any foreseeable future. The fact that we have the
not-too-well-defined nice levels as an interface shouldn't be a reason we
add another one. Provided that this is something scheduler folks want, it'd
be really great if the interface can be better thought through. What are the
numbers actually encoding?
Thanks.
--
tejun
On Fri, 2022-03-11 at 17:14 +0100, Vincent Guittot wrote:
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> init/init_task.c | 1 +
> kernel/sched/core.c | 4 ++++
> 2 files changed, 5 insertions(+)
>
> diff --git a/init/init_task.c b/init/init_task.c
> index 73cc8f03511a..2afa249c253b 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -78,6 +78,7 @@ struct task_struct init_task
> .prio = MAX_PRIO - 20,
> .static_prio = MAX_PRIO - 20,
> .normal_prio = MAX_PRIO - 20,
> + .latency_nice = 0,
Probably better to use a non-hardcoded value here:
.latency_nice = DEFAULT_LATENCY_NICE;
> .policy = SCHED_NORMAL,
> .cpus_ptr = &init_task.cpus_mask,
> .user_cpus_ptr = NULL,
>
Tim
On Fri, 2022-03-11 at 17:14 +0100, Vincent Guittot wrote:
>
> +static void __setscheduler_latency(struct task_struct *p,
> + const struct sched_attr *attr)
> +{
> + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
> + p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice);
NICE_TO_LATENCY is used here but is defined later in patch 5. This will
break bisect.
> + set_latency_weight(p);
> + }
> }
>
>
Tim
On Tue, 22 Mar 2022 at 01:22, Tim Chen <[email protected]> wrote:
>
> On Fri, 2022-03-11 at 17:14 +0100, Vincent Guittot wrote:
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> > init/init_task.c | 1 +
> > kernel/sched/core.c | 4 ++++
> > 2 files changed, 5 insertions(+)
> >
> > diff --git a/init/init_task.c b/init/init_task.c
> > index 73cc8f03511a..2afa249c253b 100644
> > --- a/init/init_task.c
> > +++ b/init/init_task.c
> > @@ -78,6 +78,7 @@ struct task_struct init_task
> > .prio = MAX_PRIO - 20,
> > .static_prio = MAX_PRIO - 20,
> > .normal_prio = MAX_PRIO - 20,
> > + .latency_nice = 0,
>
> Probably better to use non hardcoded number here
> .latency_nice = DEFAULT_LATENCY_NICE;
yes
>
> > .policy = SCHED_NORMAL,
> > .cpus_ptr = &init_task.cpus_mask,
> > .user_cpus_ptr = NULL,
> >
>
> Tim
>
Hi Tejun,
On Mon, 21 Mar 2022 at 18:24, Tejun Heo <[email protected]> wrote:
>
> Hello,
>
> On Fri, Mar 11, 2022 at 05:14:06PM +0100, Vincent Guittot wrote:
> > Tasks can set its latency priority which is then used to decide to preempt
> > the current running entity of the cfs but sched group entities still have
> > the default latency priority.
> >
> > Add a latency field in task group to set the latency priority of the group
> > which will be used against other entity in the parent cfs.
>
> One thing that bothers me about this interface is that the configuration
> values aren't well defined. We have the same problems with the nice levels
> but at least have them map to well defined weight values, which likely won't
> change in any foreseeable future. The fact that we have the
> not-too-well-defined nice levels as an interface shouldn't be a reason we
> add another one. Provided that this is something scheduler folks want, it'd
> be really great if the interface can be better thought through. What are the
> numbers actually encoding?
latency_nice is quite similar to nice. The latency nice value is used as
an index to get a latency weight in the range [-1024:1024]. latency_nice
is in the range [-20:19] and latency_prio shifts it into the range
[0:39]. This index is then used to get the latency weight, similar to
how the nice prio is used to get a weight. That being said, the exposed
latency value should probably reflect the latency_weight instead of the
latency_prio in order to be aligned with the weight and weight.nice
fields of cgroups.
As described in the patch 5 commit message, the weight is then used to
compute a relative offset to check whether the waking task can preempt
the currently running task.
Vincent
>
> Thanks.
>
> --
> tejun
Hello,
On Tue, Mar 22, 2022 at 05:10:36PM +0100, Vincent Guittot wrote:
> latency_nice is quite similar to nice. The nice latency is used as an
> index to get a latency weight in the range [-1024:1024]. latency_nice
> is in the range [-20:19] and latency_prio shifts it in the range
> [0:40] . This index is then used to get the latency weight similar to
> how the nice prio is used to get a weight. That being said, the
> latency should probably reflect the latency_weight instead of the
> latency_prio in order to be aligned with the weight and weight.nice
> fields of cgroups.
>
> As described in patch 5 commit message, the weight is then used to
> compute a relative offset to check whether the waking task can preempt
> the current running task.
So, what I'm trying to say is if it is actually a weight, just use weight
values instead of arbitrary mapped nice values. Nobody can tell how the
latency nice value of -2 compares against, say, 3. If you can define it
clearly in terms of weights (or something else clearly describable), it'd be
a lot better.
Thanks.
--
tejun
Hi Vincent
Thanks for reviving this patchset!
On 03/11/22 17:14, Vincent Guittot wrote:
> This patchset restarts the work about adding a latency nice priority to
> describe the latency tolerance of cfs tasks.
>
> The patches [1-4] have been done by Parth:
> https://lore.kernel.org/lkml/[email protected]/
>
> I have just rebased and moved the set of latency priority outside the
> priority update. I have removed the reviewed tag because the patches
> are 2 years old.
AFAIR the blocking issue we had back then was agreeing on the interface. Has
this been resolved now? I didn't see any further discussion since then.
>
> The patches [5-6] use latency nice priority to decide if a cfs task can
> preempt the current running task. Patch 5 gives some tests results with
> cyclictests and hackbench to highlight the benefit of latency nice
> priority for short interactive task or long intensive tasks.
This is a new use case AFAICT. For Android, we want to do something in the
EAS path to skip feec() and revert to select_idle_capacity() (prefer_idle).
I think Oracle's use case was to control the search depth in the LB.
I am not keen on this new use case. It looks awfully similar to how nice
works. And if I tweak nice values I can certainly achieve similar effects
without this new addition:
--::((TESTING NICE 0))::--
hackbench -l $((2560 / $group)) -g $group
count mean std min ... 90% 95% 99% max
group ...
1 20.0 0.69315 0.119378 0.545 ... 0.8309 0.84725 0.97265 1.004
4 20.0 0.54650 0.063123 0.434 ... 0.6363 0.64840 0.65448 0.656
8 20.0 0.51025 0.042268 0.441 ... 0.5725 0.57830 0.59806 0.603
16 20.0 0.54545 0.031041 0.483 ... 0.5824 0.58655 0.59491 0.597
hackbench -p -l $((2560 / $group)) -g $group
count mean std min ... 90% 95% 99% max
group ...
1 20.0 0.48135 0.036292 0.430 ... 0.5300 0.5481 0.54962 0.550
4 20.0 0.42925 0.050890 0.339 ... 0.4838 0.5094 0.51548 0.517
8 20.0 0.33655 0.049839 0.269 ... 0.4002 0.4295 0.43710 0.439
16 20.0 0.31775 0.031001 0.280 ... 0.3530 0.3639 0.39278 0.400
hackbench -l 10000 -g 16 &
cyclictest --policy other -D 5 -q -H 20000 --histfile data.txt
# Min Latencies: 00005
# Avg Latencies: 00342
# Max Latencies: 23562
--::((TESTING NICE -20))::--
hackbench -l $((2560 / $group)) -g $group
count mean std min ... 90% 95% 99% max
group ...
1 20.0 0.76990 0.126582 0.585 ... 0.9169 0.9316 1.03192 1.057
4 20.0 0.67715 0.105558 0.505 ... 0.8202 0.8581 0.85962 0.860
8 20.0 0.75715 0.061286 0.631 ... 0.8276 0.8425 0.85010 0.852
16 20.0 0.72085 0.089566 0.578 ... 0.8493 0.8818 0.92436 0.935
hackbench -p -l $((2560 / $group)) -g $group
count mean std min ... 90% 95% 99% max
group ...
1 20.0 0.50245 0.055636 0.388 ... 0.5839 0.60185 0.61477 0.618
4 20.0 0.56280 0.139277 0.354 ... 0.7280 0.75075 0.82295 0.841
8 20.0 0.58005 0.091819 0.412 ... 0.6969 0.71400 0.71400 0.714
16 20.0 0.52110 0.081465 0.323 ... 0.6169 0.63685 0.68017 0.691
hackbench -l 10000 -g 16 &
cyclictest --policy other -D 5 -q -H 20000 --histfile data.txt
# Min Latencies: 00007
# Avg Latencies: 00081
# Max Latencies: 20560
--::((TESTING NICE 19))::--
hackbench -l $((2560 / $group)) -g $group
count mean std min ... 90% 95% 99% max
group ...
1 20.0 0.46560 0.013694 0.448 ... 0.4782 0.49805 0.49881 0.499
4 20.0 0.43705 0.014979 0.414 ... 0.4550 0.45540 0.46148 0.463
8 20.0 0.45800 0.013471 0.436 ... 0.4754 0.47925 0.48305 0.484
16 20.0 0.53025 0.007239 0.522 ... 0.5391 0.54040 0.54648 0.548
hackbench -p -l $((2560 / $group)) -g $group
count mean std min ... 90% 95% 99% max
group ...
1 20.0 0.27480 0.013725 0.247 ... 0.2892 0.29125 0.29505 0.296
4 20.0 0.25095 0.011637 0.234 ... 0.2700 0.27010 0.27162 0.272
8 20.0 0.25250 0.010097 0.240 ... 0.2632 0.27415 0.27643 0.277
16 20.0 0.26700 0.007595 0.257 ... 0.2751 0.27645 0.28329 0.285
hackbench -l 10000 -g 16 &
cyclictest --policy other -D 5 -q -H 20000 --histfile data.txt
# Min Latencies: 00058
# Avg Latencies: 77232
# Max Latencies: 696375
For hackbench, the relationship seems to be inverted: a better nice value
produces worse results. But for cyclictest, the avg goes down considerably,
similar to your results.
Aren't we just manipulating the same thing with your new proposal or did
I miss something? Can we impact preemption in isolation without having any
impact on bandwidth?
I am worried about how userspace can reason about the expected outcome when
nice and latency_nice are combined together.
Thanks
--
Qais Yousef
On Tue, 22 Mar 2022 at 01:22, Tim Chen <[email protected]> wrote:
>
> On Fri, 2022-03-11 at 17:14 +0100, Vincent Guittot wrote:
> >
> > +static void __setscheduler_latency(struct task_struct *p,
> > + const struct sched_attr *attr)
> > +{
> > + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
> > + p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice);
>
> NICE_TO_LATENCY used here but has defined later in patch 5. This will break
> bisect.
Yes, I made a mistake when reorganizing the patchset.
latency_prio replaces latency_nice only in patch 5, so it should still be
the latency_nice field here instead of latency_prio.
Will fix it.
>
> > + set_latency_weight(p);
> > + }
> > }
> >
> >
>
> Tim
>
On Mon, 2022-03-21 at 07:24 -1000, Tejun Heo wrote:
> Hello,
>
> On Fri, Mar 11, 2022 at 05:14:06PM +0100, Vincent Guittot wrote:
> > Tasks can set its latency priority which is then used to decide to preempt
> > the current running entity of the cfs but sched group entities still have
> > the default latency priority.
> >
> > Add a latency field in task group to set the latency priority of the group
> > which will be used against other entity in the parent cfs.
>
> One thing that bothers me about this interface is that the configuration
> values aren't well defined. We have the same problems with the nice levels
> but at least have them map to well defined weight values, which likely won't
> change in any foreseeable future. The fact that we have the
> not-too-well-defined nice levels as an interface shouldn't be a reason we
> add another one. Provided that this is something scheduler folks want, it'd
> be really great if the interface can be better thought through. What are the
> numbers actually encoding?
>
>
The way I was interpreting the latency_nice number is as follows:
given runnable tasks that have not exceeded their time slice,
the task with the lowest latency nice number runs first.
The current patchset takes care of the
case of tasks within a run queue. I think we also need to
consider which cpu should be selected for a waking
task if we cannot find an idle CPU.
It seems reasonable to wake a task on a CPU that is running a task with
a higher latency nice value and lower load than the previous CPU the waking
task has run on. That way we can wake latency sensitive tasks faster in
a busy system.
Tim
On Tue, 22 Mar 2022 at 17:40, Tejun Heo <[email protected]> wrote:
>
> Hello,
>
> On Tue, Mar 22, 2022 at 05:10:36PM +0100, Vincent Guittot wrote:
> > latency_nice is quite similar to nice. The nice latency is used as an
> > index to get a latency weight in the range [-1024:1024]. latency_nice
> > is in the range [-20:19] and latency_prio shifts it in the range
> > [0:40] . This index is then used to get the latency weight similar to
> > how the nice prio is used to get a weight. That being said, the
> > latency should probably reflect the latency_weight instead of the
> > latency_prio in order to be aligned with the weight and weight.nice
> > fields of cgroups.
> >
> > As described in patch 5 commit message, the weight is then used to
> > compute a relative offset to check whether the waking task can preempt
> > the current running task.
>
> So, what I'm trying to say is if it is actually a weight, just use weight
> values instead of arbitrary mapped nice values. Nobody can tell how the
> latency nice value of -2 compares against, say, 3. If you can define it
> clearly in terms of weights (or something else clearly describable), it'd be
> a lot better.
The current use maps to a weight because a weight makes sense when
comparing vruntime, but others might want to map the latency nice to
something else in other places of the scheduler. Tim and others
have mentioned the depth of the loop when looking for an idle CPU.
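To make the -2 vs 3 example concrete, here is an illustrative walk-through
(not from the patches themselves) using the sched_latency_to_weight table
and wakeup_latency_gran() from patch 5, assuming the default
sysctl_sched_latency and GENTLE_FAIR_SLEEPERS:

/* illustration only -- values follow the mapping added in patch 5 */
se->latency_weight   = sched_latency_to_weight[NICE_TO_LATENCY(-2)]; /* idx 18 ->  102 */
curr->latency_weight = sched_latency_to_weight[NICE_TO_LATENCY(3)];  /* idx 23 -> -154 */

latency_weight = se->latency_weight - curr->latency_weight;          /* 256 */
latency_weight = min(latency_weight, se->latency_weight);            /* clamped to 102 */

/*
 * wakeup_latency_gran() then returns (thresh * 102) >> 10 with
 * thresh = sysctl_sched_latency / 2, i.e. the waking task gets roughly
 * 10% of half the scheduling period added to vdiff in its favour when
 * wakeup_preempt_entity() decides whether to preempt.
 */

So the nice values are only indexes; what the scheduler actually compares
are the weights, and -2 vs 3 translates into a bounded vruntime offset
rather than an absolute guarantee.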
>
> Thanks.
>
> --
> tejun
On 3/23/22 11:04 AM, Vincent Guittot wrote:
> On Tue, 22 Mar 2022 at 17:40, Tejun Heo <[email protected]> wrote:
>>
>> Hello,
>>
>> On Tue, Mar 22, 2022 at 05:10:36PM +0100, Vincent Guittot wrote:
>>> latency_nice is quite similar to nice. The nice latency is used as an
>>> index to get a latency weight in the range [-1024:1024]. latency_nice
>>> is in the range [-20:19] and latency_prio shifts it in the range
>>> [0:40] . This index is then used to get the latency weight similar to
>>> how the nice prio is used to get a weight. That being said, the
>>> latency should probably reflect the latency_weight instead of the
>>> latency_prio in order to be aligned with the weight and weight.nice
>>> fields of cgroups.
>>>
>>> As described in patch 5 commit message, the weight is then used to
>>> compute a relative offset to check whether the waking task can preempt
>>> the current running task.
>>
>> So, what I'm trying to say is if it is actually a weight, just use weight
>> values instead of arbitrary mapped nice values. Nobody can tell how the
>> latency nice value of -2 compares against, say, 3. If you can define it
>> clearly in terms of weights (or something else clearly describable), it'd be
>> a lot better.
>
> The current use is mapped to weight because weight makes sense when
> comparing vruntime but others might want to map the latency nice to
> something else in other places of the scheduler. Time but also others
> have mentioned the depth of the loop for looking an idle CPU
Reinforcing what Vincent said, about a year and a half ago there were four different projects/ideas looking to do
different but relatively compatible things based on the latency sensitivity of a task. We were looking for a number to
describe this sensitivity, say like an integer. One of those projects wanted to do "bad for latency" power saving things
so we then needed "good for latency", "neutral" and "potentially bad for latency". Call neutral 0, negative good and
positive bad and there was a fair similarity to nice. Given that was an abstraction, we did recognize that beyond
smaller numbers being more sensitive there really isn't much more you can say about the relationship between numbers.
Certainly no guarantees of linearity.
-chrish
On Tue, 22 Mar 2022 at 17:39, Qais Yousef <[email protected]> wrote:
>
> Hi Vincent
>
> Thanks for reviving this patchset!
>
> On 03/11/22 17:14, Vincent Guittot wrote:
> > This patchset restarts the work about adding a latency nice priority to
> > describe the latency tolerance of cfs tasks.
> >
> > The patches [1-4] have been done by Parth:
> > https://lore.kernel.org/lkml/[email protected]/
> >
> > I have just rebased and moved the set of latency priority outside the
> > priority update. I have removed the reviewed tag because the patches
> > are 2 years old.
>
> AFAIR the blocking issue we had then is on agreement on the interface. Has this
> been resolved now? I didn't see any further discussion since then.
I think there was an agreement about using a latency nice priority
in the range [-20:19], with -20 meaning the task is sensitive to latency
whereas 19 means the task doesn't care about scheduling latency. The
open point was how to use this input in the scheduler, with some of the
proposed behaviors being opposed.
>
> >
> > The patches [5-6] use latency nice priority to decide if a cfs task can
> > preempt the current running task. Patch 5 gives some tests results with
> > cyclictests and hackbench to highlight the benefit of latency nice
> > priority for short interactive task or long intensive tasks.
>
> This is a new use case AFAICT. For Android, we want to do something in EAS path
I don't think it's new; it's about being able to run some tasks first
as long as they haven't used all their cpu bandwidth. And this has the
main advantage of not depending on finding an idle cpu: if the task is
latency sensitive, it will run first whether the cpu is idle or not.
> to skip feec() and revert to select_idle_capacity() (prefer_idle). I think
> Oracle's use case was control the search depth in the LB.
>
> I am not keen on this new use case. It looks awefully similar to how nice
> works. And if I tweak nice values I can certainly achieve similar effects
> without this new addition:
nice is about the cpu bandwidth allocated to a task and not about
scheduling one task before another. If you give all the cpu bandwidth
to a task, it will most probably have low latency because the others
will always have consumed all their time slice.
>
>
> --::((TESTING NICE 0))::--
>
> hackbench -l $((2560 / $group)) -g $group
>
> count mean std min ... 90% 95% 99% max
> group ...
> 1 20.0 0.69315 0.119378 0.545 ... 0.8309 0.84725 0.97265 1.004
> 4 20.0 0.54650 0.063123 0.434 ... 0.6363 0.64840 0.65448 0.656
> 8 20.0 0.51025 0.042268 0.441 ... 0.5725 0.57830 0.59806 0.603
> 16 20.0 0.54545 0.031041 0.483 ... 0.5824 0.58655 0.59491 0.597
>
> hackbench -p -l $((2560 / $group)) -g $group
>
> count mean std min ... 90% 95% 99% max
> group ...
> 1 20.0 0.48135 0.036292 0.430 ... 0.5300 0.5481 0.54962 0.550
> 4 20.0 0.42925 0.050890 0.339 ... 0.4838 0.5094 0.51548 0.517
> 8 20.0 0.33655 0.049839 0.269 ... 0.4002 0.4295 0.43710 0.439
> 16 20.0 0.31775 0.031001 0.280 ... 0.3530 0.3639 0.39278 0.400
>
> hackbench -l 10000 -g 16 &
> cyclictest --policy other -D 5 -q -H 20000 --histfile data.txt
>
> # Min Latencies: 00005
> # Avg Latencies: 00342
> # Max Latencies: 23562
>
>
> --::((TESTING NICE -20))::--
>
> hackbench -l $((2560 / $group)) -g $group
>
> count mean std min ... 90% 95% 99% max
> group ...
> 1 20.0 0.76990 0.126582 0.585 ... 0.9169 0.9316 1.03192 1.057
> 4 20.0 0.67715 0.105558 0.505 ... 0.8202 0.8581 0.85962 0.860
> 8 20.0 0.75715 0.061286 0.631 ... 0.8276 0.8425 0.85010 0.852
> 16 20.0 0.72085 0.089566 0.578 ... 0.8493 0.8818 0.92436 0.935
>
> hackbench -p -l $((2560 / $group)) -g $group
>
> count mean std min ... 90% 95% 99% max
> group ...
> 1 20.0 0.50245 0.055636 0.388 ... 0.5839 0.60185 0.61477 0.618
> 4 20.0 0.56280 0.139277 0.354 ... 0.7280 0.75075 0.82295 0.841
> 8 20.0 0.58005 0.091819 0.412 ... 0.6969 0.71400 0.71400 0.714
> 16 20.0 0.52110 0.081465 0.323 ... 0.6169 0.63685 0.68017 0.691
>
> hackbench -l 10000 -g 16 &
> cyclictest --policy other -D 5 -q -H 20000 --histfile data.txt
>
> # Min Latencies: 00007
> # Avg Latencies: 00081
> # Max Latencies: 20560
>
>
> --::((TESTING NICE 19))::--
>
> hackbench -l $((2560 / $group)) -g $group
>
> count mean std min ... 90% 95% 99% max
> group ...
> 1 20.0 0.46560 0.013694 0.448 ... 0.4782 0.49805 0.49881 0.499
> 4 20.0 0.43705 0.014979 0.414 ... 0.4550 0.45540 0.46148 0.463
> 8 20.0 0.45800 0.013471 0.436 ... 0.4754 0.47925 0.48305 0.484
> 16 20.0 0.53025 0.007239 0.522 ... 0.5391 0.54040 0.54648 0.548
>
> hackbench -p -l $((2560 / $group)) -g $group
>
> count mean std min ... 90% 95% 99% max
> group ...
> 1 20.0 0.27480 0.013725 0.247 ... 0.2892 0.29125 0.29505 0.296
> 4 20.0 0.25095 0.011637 0.234 ... 0.2700 0.27010 0.27162 0.272
> 8 20.0 0.25250 0.010097 0.240 ... 0.2632 0.27415 0.27643 0.277
> 16 20.0 0.26700 0.007595 0.257 ... 0.2751 0.27645 0.28329 0.285
>
> hackbench -l 10000 -g 16 &
> cyclictest --policy other -D 5 -q -H 20000 --histfile data.txt
>
> # Min Latencies: 00058
> # Avg Latencies: 77232
> # Max Latencies: 696375
>
> For hackbench, the relationship seems to be inversed. Better nice value
> produces worse result. But for the cycletest, the avg goes down considerably
> similar to your results.
>
> Aren't we just manipulating the same thing with your new proposal or did
> I miss something? Can we impact preemption in isolation without having any
> impact on bandwidth?
>
> I am worried about how userspace can reason about the expected outcome when
> nice and latency_nice are combined together.
>
>
> Thanks
>
> --
> Qais Yousef
On Tue, 22 Mar 2022 at 17:41, Tim Chen <[email protected]> wrote:
>
> On Mon, 2022-03-21 at 07:24 -1000, Tejun Heo wrote:
> > Hello,
> >
> > On Fri, Mar 11, 2022 at 05:14:06PM +0100, Vincent Guittot wrote:
> > > Tasks can set its latency priority which is then used to decide to preempt
> > > the current running entity of the cfs but sched group entities still have
> > > the default latency priority.
> > >
> > > Add a latency field in task group to set the latency priority of the group
> > > which will be used against other entity in the parent cfs.
> >
> > One thing that bothers me about this interface is that the configuration
> > values aren't well defined. We have the same problems with the nice levels
> > but at least have them map to well defined weight values, which likely won't
> > change in any foreseeable future. The fact that we have the
> > not-too-well-defined nice levels as an interface shouldn't be a reason we
> > add another one. Provided that this is something scheduler folks want, it'd
> > be really great if the interface can be better thought through. What are the
> > numbers actually encoding?
> >
> >
>
> The way I was interpreting the latency_nice number is as follows:
> Given runnable tasks that have not exceeded their time slice,
> the task with the lowest latency nice number runs first.
The current version is not as binary in its decision, because I wanted
a smoother level of preemption between close latency nice priorities.
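As a rough illustration of that graded behaviour, here is a small standalone
sketch (not kernel code) of the offset that the wakeup_latency_gran() helper
quoted later in this thread adds to vdiff, assuming sysctl_sched_latency = 6 ms,
GENTLE_FAIR_SLEEPERS enabled, and the +/-1024 weight range used in the series:

#include <stdio.h>

#define NICE_LATENCY_SHIFT      10
#define NICE_LATENCY_WEIGHT_MAX (1LL << NICE_LATENCY_SHIFT)

static long long wakeup_latency_gran(long long latency_weight)
{
        long long thresh = 6000000LL;           /* sysctl_sched_latency, in ns */

        if (!latency_weight)
                return 0;

        thresh >>= 1;                           /* GENTLE_FAIR_SLEEPERS */

        if (latency_weight < -NICE_LATENCY_WEIGHT_MAX)
                latency_weight = -NICE_LATENCY_WEIGHT_MAX;
        if (latency_weight > NICE_LATENCY_WEIGHT_MAX)
                latency_weight = NICE_LATENCY_WEIGHT_MAX;

        return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
}

int main(void)
{
        /* weight 1024 ~ latency nice -20, 0 ~ default, -1024 ~ latency nice 19 */
        printf("%lld %lld %lld\n",
               wakeup_latency_gran(1024),       /* +3000000 ns: preempts earlier */
               wakeup_latency_gran(0),          /*        0 ns: default */
               wakeup_latency_gran(-1024));     /* -3000000 ns: preempts later */
        return 0;
}

Intermediate weights give proportionally smaller offsets, which is what makes
the preemption decision smooth rather than a strict priority ordering.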
>
> The current patchset takes care of the
> case of tasks within a run queue. I think we also need to
> consider which cpu should be selected for a waking
> task, if we cannot find an idle CPU.
>
> It seems reasonable that we wake a task on a CPU that is running a task with
> a higher latency nice value and a lower load than the previous CPU the waking
> task has run on. That way we can wake latency sensitive tasks faster in a busy system.
>
> Tim
>
On 03/23/22 16:32, Vincent Guittot wrote:
> On Tue, 22 Mar 2022 at 17:39, Qais Yousef <[email protected]> wrote:
> >
> > Hi Vincent
> >
> > Thanks for reviving this patchset!
> >
> > On 03/11/22 17:14, Vincent Guittot wrote:
> > > This patchset restarts the work about adding a latency nice priority to
> > > describe the latency tolerance of cfs tasks.
> > >
> > > The patches [1-4] have been done by Parth:
> > > https://lore.kernel.org/lkml/[email protected]/
> > >
> > > I have just rebased and moved the set of latency priority outside the
> > > priority update. I have removed the reviewed tag because the patches
> > > are 2 years old.
> >
> > AFAIR the blocking issue we had then is on agreement on the interface. Has this
> > been resolved now? I didn't see any further discussion since then.
>
> I think that there was an agreement about using a latency nice
> priority in the range [-20:19] with -20 meaning sensitive to latency
> whereas 19 means that task doesn't care about scheduling latency. The
> open point was about how to use this input in the scheduler with some
> behavior being opposed.
What I remember is that the problem was to consolidate on the use cases first
and then discuss the interfaces.
See https://lwn.net/Articles/820659/
" Youssef said that the interface to all of this is the sticking
point. Thomas Gleixner agreed, saying that the -20..19 range "requires
a crystal ball" to use properly. Zijlstra repeated his call to
enumerate the use cases before getting into the interface details.
Giani repeated that the interface does not look correct now, and agreed
that a more comprehensive look at the use cases was needed. Things were
being done backwards currently, he said. "
>
> >
> > >
> > > The patches [5-6] use latency nice priority to decide if a cfs task can
> > > preempt the current running task. Patch 5 gives some tests results with
> > > cyclictests and hackbench to highlight the benefit of latency nice
> > > priority for short interactive task or long intensive tasks.
> >
> > This is a new use case AFAICT. For Android, we want to do something in EAS path
>
> I don't think it's new, it's about being able to run some tasks in
I meant a new use case for the latency-nice interface. I don't think we had
this in any of our discussions before? I don't mind it, but it'd be good to
clarify whether it has any relation to the other use cases and what should
happen to them.
Thanks
--
Qais Yousef
Removed Dhaval's email address, which returns an error.
On Thu, 24 Mar 2022 at 18:25, Qais Yousef <[email protected]> wrote:
>
> On 03/23/22 16:32, Vincent Guittot wrote:
> > On Tue, 22 Mar 2022 at 17:39, Qais Yousef <[email protected]> wrote:
> > >
> > > Hi Vincent
> > >
> > > Thanks for reviving this patchset!
> > >
> > > On 03/11/22 17:14, Vincent Guittot wrote:
> > > > This patchset restarts the work about adding a latency nice priority to
> > > > describe the latency tolerance of cfs tasks.
> > > >
> > > > The patches [1-4] have been done by Parth:
> > > > https://lore.kernel.org/lkml/[email protected]/
> > > >
> > > > I have just rebased and moved the set of latency priority outside the
> > > > priority update. I have removed the reviewed tag because the patches
> > > > are 2 years old.
> > >
> > > AFAIR the blocking issue we had then is on agreement on the interface. Has this
> > > been resolved now? I didn't see any further discussion since then.
> >
> > I think that there was an agreement about using a latency nice
> > priority in the range [-20:19] with -20 meaning sensitive to latency
> > whereas 19 means that task doesn't care about scheduling latency. The
> > open point was about how to use this input in the scheduler with some
> > behavior being opposed.
>
> What I remember is that the problem was to consolidate on use cases then
> discuss interfaces.
>
> See https://lwn.net/Articles/820659/
>
> " Youssef said that the interface to all of this is the sticking
> point. Thomas Gleixner agreed, saying that the -20..19 range "requires
> a crystal ball" to use properly. Zijlstra repeated his call to
> enumerate the use cases before getting into the interface details.
> Giani repeated that the interface does not look correct now, and agreed
> that a more comprehensive look at the use cases was needed. Things were
> being done backwards currently, he said. "
>
At LPC, everybody seemed aligned with latency_nice so I assumed that
there was an agreement on this interface.
Latency_nice fits well with my proposal because it's all about a
relative comparison between the running task and the others. The
current nice priority is used to set how much CPU bandwidth a task
will have compared to the others, and latency_nice is used in a similar
way to decide which one should run compared to the others.
> >
> > >
> > > >
> > > > The patches [5-6] use latency nice priority to decide if a cfs task can
> > > > preempt the current running task. Patch 5 gives some tests results with
> > > > cyclictests and hackbench to highlight the benefit of latency nice
> > > > priority for short interactive task or long intensive tasks.
> > >
> > > This is a new use case AFAICT. For Android, we want to do something in EAS path
> >
> > I don't think it's new, it's about being able to run some tasks in
>
> I meant new use case to latency-nice interface. I don't think we had this in
> any of our discussions before? I don't mind it, but it'd be good to clarify if
> it has any relation about the other use cases and what should happen to the
> other use cases.
Several discussions have happened about changing the preemption policy of
CFS. I have Mel's hackbench example in mind, where we want to reduce the
preemption capabilities of the threads, and on the other side the
multimedia tasks which complain about having to wait before being
scheduled. All of this is about whether or not to preempt the others, and
it has been kept outside any topology consideration, applying only to the
local run queue.
Regards,
Vincent
>
>
> Thanks
>
> --
> Qais Yousef
On Mon, 28 Mar 2022 at 11:24, Dietmar Eggemann <[email protected]> wrote:
>
> On 11/03/2022 17:14, Vincent Guittot wrote:
>
> [...]
>
> > @@ -4412,7 +4417,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > p->prio = p->normal_prio = p->static_prio;
> > set_load_weight(p, false);
> >
> > - p->latency_nice = DEFAULT_LATENCY_NICE;
> > + p->latency_prio = NICE_TO_LATENCY(0);
> > /*
> > * We don't need the reset flag anymore after the fork. It has
> > * fulfilled its duty:
> > @@ -4420,6 +4425,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > p->sched_reset_on_fork = 0;
> > }
> >
> > + /* Once latency_prio is set, update the latency weight */
> > + set_latency_weight(p);
>
> I thought we only have to do this in the `sched_reset_on_fork` case?
> Like we do with set_load_weight(). Can we not rely on dup_task_struct()
> in the other case?
>
> [...]
>
> > @@ -5648,6 +5677,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > if (!task_new)
> > update_overutilized_status(rq);
> >
> > + if (rq->curr == rq->idle)
> > + check_preempt_from_idle(cfs_rq_of(&p->se), &p->se);
>
> This is done here (1) because check_preempt_wakeup() (2) is only called
> if p and rq->curr have CFS sched class?
Yes
>
>
> ttwu_do_activate()
> activate_task()
> enqueue_task <-- (1)
> ttwu_do_wakeup()
> check_preempt_curr()
> if (p->sched_class == rq->curr->sched_class)
> rq->curr->sched_class->check_preempt_curr() <-- (2)
>
> [...]
>
> > @@ -7008,6 +7059,10 @@ static int
> > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > {
> > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > + int latency_weight = se->latency_weight - curr->latency_weight;
> > +
> > + latency_weight = min(latency_weight, se->latency_weight);
>
> Why the min out of latency_weight_diff(se, curr) and se->latency_weight
> here?
When there are 2 low latency tasks (weight 1024), there is no reason
to favor the waking task, so we take the diff (0 in this case).
When there are 2 high latency tolerant tasks (weight -1024), we want
to make sure not to preempt the current task, so we take the weight (-1024)
instead of the diff.
>
> [...]
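To make the min() rule above concrete, a tiny sketch (not kernel code) using
the +/-1024 example weights from Vincent's reply:

int waker_latency_weight(int se_weight, int curr_weight)
{
        int diff = se_weight - curr_weight;

        return diff < se_weight ? diff : se_weight;     /* min(diff, se_weight) */
}

/*
 * waker_latency_weight(1024, 1024)   ->     0 : two low-latency tasks,
 *                                            no extra boost for the waker
 * waker_latency_weight(-1024, -1024) -> -1024 : two latency-tolerant tasks,
 *                                            avoid preempting current
 */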
On 11/03/2022 17:14, Vincent Guittot wrote:
[...]
> @@ -4412,7 +4417,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->prio = p->normal_prio = p->static_prio;
> set_load_weight(p, false);
>
> - p->latency_nice = DEFAULT_LATENCY_NICE;
> + p->latency_prio = NICE_TO_LATENCY(0);
> /*
> * We don't need the reset flag anymore after the fork. It has
> * fulfilled its duty:
> @@ -4420,6 +4425,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->sched_reset_on_fork = 0;
> }
>
> + /* Once latency_prio is set, update the latency weight */
> + set_latency_weight(p);
I thought we only have to do this in the `sched_reset_on_fork` case?
Like we do with set_load_weight(). Can we not rely on dup_task_struct()
in the other case?
[...]
> @@ -5648,6 +5677,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> if (!task_new)
> update_overutilized_status(rq);
>
> + if (rq->curr == rq->idle)
> + check_preempt_from_idle(cfs_rq_of(&p->se), &p->se);
This is done here (1) because check_preempt_wakeup() (2) is only called
if p and rq->curr have CFS sched class?
ttwu_do_activate()
activate_task()
enqueue_task <-- (1)
ttwu_do_wakeup()
check_preempt_curr()
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr() <-- (2)
[...]
> @@ -7008,6 +7059,10 @@ static int
> wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> {
> s64 gran, vdiff = curr->vruntime - se->vruntime;
> + int latency_weight = se->latency_weight - curr->latency_weight;
> +
> + latency_weight = min(latency_weight, se->latency_weight);
Why the min out of latency_weight_diff(se, curr) and se->latency_weight
here?
[...]
On Mon, 28 Mar 2022 at 11:23, Dietmar Eggemann <[email protected]> wrote:
>
> On 11/03/2022 17:14, Vincent Guittot wrote:
>
> [...]
>
> > @@ -98,6 +99,22 @@ struct sched_param {
> > * scheduled on a CPU with no more capacity than the specified value.
> > *
> > * A task utilization boundary can be reset by setting the attribute to -1.
> > + *
> > + * Latency Tolerance Attributes
> > + * ===========================
> > + *
> > + * A subset of sched_attr attributes allows to specify the relative latency
> > + * requirements of a task with respect to the other tasks running/queued in the
> > + * system.
> > + *
> > + * @ sched_latency_nice task's latency_nice value
> > + *
> > + * The latency_nice of a task can have any value in a range of
> > + * [LATENCY_NICE_MIN..LATENCY_NICE_MAX].
>
> s/LATENCY_NICE_MIN/MIN_LATENCY_NICE
> s/LATENCY_NICE_MAX/MAX_LATENCY_NICE
yes
>
> > + * A task with latency_nice with the value of LATENCY_NICE_MIN can be
> > + * taken for a task with lower latency requirements as opposed to the task with
> > + * higher latency_nice.
>
> low latency nice (priority): -20 -> high weight: 1024 ... Doesn't a task
> with MIN_LATENCY_NICE -20 have the highest latency requirements?
IIUC, what Parth wanted to say here is that a low latency nice
(priority) of -20 is taken for a task which needs low latency. The "low"
applies to the latency, not to the requirements.
>
> [...]
On 03/25/22 14:27, Vincent Guittot wrote:
> removed Dhaval's email which returns error
>
> On Thu, 24 Mar 2022 at 18:25, Qais Yousef <[email protected]> wrote:
> >
> > On 03/23/22 16:32, Vincent Guittot wrote:
> > > On Tue, 22 Mar 2022 at 17:39, Qais Yousef <[email protected]> wrote:
> > > >
> > > > Hi Vincent
> > > >
> > > > Thanks for reviving this patchset!
> > > >
> > > > On 03/11/22 17:14, Vincent Guittot wrote:
> > > > > This patchset restarts the work about adding a latency nice priority to
> > > > > describe the latency tolerance of cfs tasks.
> > > > >
> > > > > The patches [1-4] have been done by Parth:
> > > > > https://lore.kernel.org/lkml/[email protected]/
> > > > >
> > > > > I have just rebased and moved the set of latency priority outside the
> > > > > priority update. I have removed the reviewed tag because the patches
> > > > > are 2 years old.
> > > >
> > > > AFAIR the blocking issue we had then is on agreement on the interface. Has this
> > > > been resolved now? I didn't see any further discussion since then.
> > >
> > > I think that there was an agreement about using a latency nice
> > > priority in the range [-20:19] with -20 meaning sensitive to latency
> > > whereas 19 means that task doesn't care about scheduling latency. The
> > > open point was about how to use this input in the scheduler with some
> > > behavior being opposed.
> >
> > What I remember is that the problem was to consolidate on use cases then
> > discuss interfaces.
> >
> > See https://lwn.net/Articles/820659/
> >
> > " Youssef said that the interface to all of this is the sticking
> > point. Thomas Gleixner agreed, saying that the -20..19 range "requires
> > a crystal ball" to use properly. Zijlstra repeated his call to
> > enumerate the use cases before getting into the interface details.
> > Giani repeated that the interface does not look correct now, and agreed
> > that a more comprehensive look at the use cases was needed. Things were
> > being done backwards currently, he said. "
> >
>
> At LPC, everybody seemed aligned with latency_nice so I assumed that
> there was an agreement on this interface.
> Latency_nice fits well with my proposal because it's all about
> relative comparison between the running task to the others. The
> current nice priority is used to set how much cpu bandwidth a task
> will have compared to others and the latency_nice is used in a similar
> way to know which one should run compared to the others.
I think the users were happy, but not the maintainers :-)
I am still happy with it, but I just want to make sure that our use case is
something we still care about having upstream and that we'd still like to use
this interface to achieve it. I don't want it to be blocked because the
interface is deemed not suitable. So it should be taken into consideration
that this is not a replacement for our previous use case, at least.
The concept of converting latency_nice to a weight is something new and I
don't think any of the other users requires it. So we need to keep the
user-visible interface detached from the weight, which is an internal
implementation detail of your use case.
>
> > >
> > > >
> > > > >
> > > > > The patches [5-6] use latency nice priority to decide if a cfs task can
> > > > > preempt the current running task. Patch 5 gives some tests results with
> > > > > cyclictests and hackbench to highlight the benefit of latency nice
> > > > > priority for short interactive task or long intensive tasks.
> > > >
> > > > This is a new use case AFAICT. For Android, we want to do something in EAS path
> > >
> > > I don't think it's new, it's about being able to run some tasks in
> >
> > I meant new use case to latency-nice interface. I don't think we had this in
> > any of our discussions before? I don't mind it, but it'd be good to clarify if
> > it has any relation about the other use cases and what should happen to the
> > other use cases.
>
> Several discussions happened about changing the preemption policy of
> CFS. I have Mel's example in mind with hackbench where we want to
> reduce the preemption capabilities for the threads and on the other
> side the multimedia tasks which complain about having to wait before
> being scheduled. All this is about preempting or not the others. And
> all this has been kept outside topology consideration but only for the
> local run queue
Cool. I can see its usefulness, though I still have to convince myself that you
can affect preemption without impacting bandwidth, and that this is not just a
subtler way to modify nice.
Thanks
--
Qais Yousef
On 11/03/2022 17:14, Vincent Guittot wrote:
> This patchset restarts the work about adding a latency nice priority to
> describe the latency tolerance of cfs tasks.
>
> The patches [1-4] have been done by Parth:
> https://lore.kernel.org/lkml/[email protected]/
>
> I have just rebased and moved the set of latency priority outside the
> priority update. I have removed the reviewed tag because the patches
> are 2 years old.
>
> The patches [5-6] use latency nice priority to decide if a cfs task can
> preempt the current running task. Patch 5 gives some tests results with
> cyclictests and hackbench to highlight the benefit of latency nice
> priority for short interactive task or long intensive tasks.
The Android-specific `latency_nice` use case `Skip energy aware task
placement` (in Android, `latency_sensitive` means latency_nice < 0) favors
an idle CPU over the EAS search path for a `latency_sensitive` task.
https://lkml.kernel.org/r/[email protected]
This is Android proprietary code similar to what we have in
find_idlest_group_cpu() in mainline.
We talked to the Android folks last week and IMHO they are not convinced
that they can switch this to the proposed `latency_nice->tweak
preemption` use case.
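Schematically, and purely as an illustration (this is neither the Android nor
the mainline code; in mainline the corresponding paths would be
find_energy_efficient_cpu() and find_idlest_group_cpu()):

/* A latency_sensitive task (latency_nice < 0) takes any idle CPU and skips
 * the energy aware placement; idle_cpu < 0 means no idle CPU was found. */
int select_wake_cpu(int latency_nice, int idle_cpu, int eas_cpu)
{
        if (latency_nice < 0 && idle_cpu >= 0)
                return idle_cpu;
        return eas_cpu;
}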
On 11/03/2022 17:14, Vincent Guittot wrote:
[...]
> @@ -98,6 +99,22 @@ struct sched_param {
> * scheduled on a CPU with no more capacity than the specified value.
> *
> * A task utilization boundary can be reset by setting the attribute to -1.
> + *
> + * Latency Tolerance Attributes
> + * ===========================
> + *
> + * A subset of sched_attr attributes allows to specify the relative latency
> + * requirements of a task with respect to the other tasks running/queued in the
> + * system.
> + *
> + * @ sched_latency_nice task's latency_nice value
> + *
> + * The latency_nice of a task can have any value in a range of
> + * [LATENCY_NICE_MIN..LATENCY_NICE_MAX].
s/LATENCY_NICE_MIN/MIN_LATENCY_NICE
s/LATENCY_NICE_MAX/MAX_LATENCY_NICE
> + * A task with latency_nice with the value of LATENCY_NICE_MIN can be
> + * taken for a task with lower latency requirements as opposed to the task with
> + * higher latency_nice.
low latency nice (priority): -20 -> high weight: 1024 ... Doesn't a task
with MIN_LATENCY_NICE -20 have the highest latency requirements?
[...]
On Mon, 28 Mar 2022 at 18:27, Qais Yousef <[email protected]> wrote:
>
> On 03/25/22 14:27, Vincent Guittot wrote:
> > removed Dhaval's email which returns error
> >
> > On Thu, 24 Mar 2022 at 18:25, Qais Yousef <[email protected]> wrote:
> > >
> > > On 03/23/22 16:32, Vincent Guittot wrote:
> > > > On Tue, 22 Mar 2022 at 17:39, Qais Yousef <[email protected]> wrote:
> > > > >
> > > > > Hi Vincent
> > > > >
> > > > > Thanks for reviving this patchset!
> > > > >
> > > > > On 03/11/22 17:14, Vincent Guittot wrote:
> > > > > > This patchset restarts the work about adding a latency nice priority to
> > > > > > describe the latency tolerance of cfs tasks.
> > > > > >
> > > > > > The patches [1-4] have been done by Parth:
> > > > > > https://lore.kernel.org/lkml/[email protected]/
> > > > > >
> > > > > > I have just rebased and moved the set of latency priority outside the
> > > > > > priority update. I have removed the reviewed tag because the patches
> > > > > > are 2 years old.
> > > > >
> > > > > AFAIR the blocking issue we had then is on agreement on the interface. Has this
> > > > > been resolved now? I didn't see any further discussion since then.
> > > >
> > > > I think that there was an agreement about using a latency nice
> > > > priority in the range [-20:19] with -20 meaning sensitive to latency
> > > > whereas 19 means that task doesn't care about scheduling latency. The
> > > > open point was about how to use this input in the scheduler with some
> > > > behavior being opposed.
> > >
> > > What I remember is that the problem was to consolidate on use cases then
> > > discuss interfaces.
> > >
> > > See https://lwn.net/Articles/820659/
> > >
> > > " Youssef said that the interface to all of this is the sticking
> > > point. Thomas Gleixner agreed, saying that the -20..19 range "requires
> > > a crystal ball" to use properly. Zijlstra repeated his call to
> > > enumerate the use cases before getting into the interface details.
> > > Giani repeated that the interface does not look correct now, and agreed
> > > that a more comprehensive look at the use cases was needed. Things were
> > > being done backwards currently, he said. "
> > >
> >
> > At LPC, everybody seemed aligned with latency_nice so I assumed that
> > there was an agreement on this interface.
> > Latency_nice fits well with my proposal because it's all about
> > relative comparison between the running task to the others. The
> > current nice priority is used to set how much cpu bandwidth a task
> > will have compared to others and the latency_nice is used in a similar
> > way to know which one should run compared to the others.
>
> I think the users were happy, but not the maintainers :-)
>
> I am still happy with it, but I just want to make sure that our use case is
> something we still care about having in upstream and we'd still like to use
> this interface to achieve that. I don't want it to be blocked based on
> interface not suitable. So this should be taken into consideration that this is
> not a replacement to at least our previous use case.
>
> The concept of latency_nice conversion to weight is something new and I don't
> think any of the other users requires it. So we need to keep the user visible
> interface detached from weight which is internal implementation detail for your
> use case.
Note that the weight is only another way to describe the relative priority,
but I will keep that in mind for the next version.
>
> >
> > > >
> > > > >
> > > > > >
> > > > > > The patches [5-6] use latency nice priority to decide if a cfs task can
> > > > > > preempt the current running task. Patch 5 gives some tests results with
> > > > > > cyclictests and hackbench to highlight the benefit of latency nice
> > > > > > priority for short interactive task or long intensive tasks.
> > > > >
> > > > > This is a new use case AFAICT. For Android, we want to do something in EAS path
> > > >
> > > > I don't think it's new, it's about being able to run some tasks in
> > >
> > > I meant new use case to latency-nice interface. I don't think we had this in
> > > any of our discussions before? I don't mind it, but it'd be good to clarify if
> > > it has any relation about the other use cases and what should happen to the
> > > other use cases.
> >
> > Several discussions happened about changing the preemption policy of
> > CFS. I have Mel's example in mind with hackbench where we want to
> > reduce the preemption capabilities for the threads and on the other
> > side the multimedia tasks which complain about having to wait before
> > being scheduled. All this is about preempting or not the others. And
> > all this has been kept outside topology consideration but only for the
> > local run queue
>
> Cool. I can see its usefulness. Though I still have to convince myself that you
> can affect preemption without impacting bandwidth and is not a subtler way to
> modify nice.
This has been one of my main goals too: to not modify CPU bandwidth.
Thanks
Vincent
>
>
> Thanks
>
> --
> Qais Yousef
On Mon, May 02, 2022 at 11:08:15PM +0800, Tao Zhou wrote:
> On Mon, May 02, 2022 at 02:30:55PM +0200, Vincent Guittot wrote:
>
> > On Mon, 2 May 2022 at 11:54, Vincent Guittot <[email protected]> wrote:
> > >
> > > Hi Tao,
> > >
> > > On Sun, 1 May 2022 at 17:58, Tao Zhou <[email protected]> wrote:
> > > >
> > > > Hi Vincent,
> > > >
> > > > Change to Valentin Schneider's now using mail address.
> > >
> > > Thanks
> > >
> > > >
> > > > On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
> > > >
> > > > > Take into account the nice latency priority of a thread when deciding to
> > > > > preempt the current running thread. We don't want to provide more CPU
> > > > > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > > > > task first whenever possible.
> > > > >
> > > > > As long as a thread didn't use its bandwidth, it will be able to preempt
> > > > > the current thread.
> > > > >
> > > > > At the opposite, a thread with a low latency priority will preempt current
> > > > > thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> > > > > wait for the tick to get its sched slice.
> > > > >
> > > > > curr vruntime
> > > > > |
> > > > > sysctl_sched_wakeup_granularity
> > > > > <-->
> > > > > ----------------------------------|----|-----------------------|---------------
> > > > > | |<--------------------->
> > > > > | . sysctl_sched_latency
> > > > > | .
> > > > > default/current latency entity | .
> > > > > | .
> > > > > 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> > > > > se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> > > > > | .
> > > > > | .
> > > > > | .
> > > > > low latency entity | .
> > > > > ---------------------->|
> > > > > % of sysctl_sched_latency |
> > > > > 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> > > > > preempt ------------------------------------------------->|<- do not preempt --
> > > > > | .
> > > > > | .
> > > > > | .
> > > > > high latency entity | .
> > > > > |<-----------------------| .
> > > > > | % of sysctl_sched_latency .
> > > > > 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> > > > > preempt->|<- se doesn't preempt curr ------------------------------------------
> > > > >
> > > > > Tests results of nice latency impact on heavy load like hackbench:
> > > > >
> > > > > hackbench -l (2560 / group) -g group
> > > > > group latency 0 latency 19
> > > > > 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> > > > > 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> > > > > 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> > > > > 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
> > > > >
> > > > > hackbench -p -l (2560 / group) -g group
> > > > > group
> > > > > 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> > > > > 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> > > > > 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> > > > > 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
> > > > >
> > > > > By deacreasing the latency prio, we reduce the number of preemption at
> > > >
> > > > s/deacreasing/decreasing/
> > >
> > > yes
> > >
> > > > s/reduce/increase/
> > >
> > > not in the case of hackbench tests above. By decreasing the latency
> > > prio of all hackbench threads, we make sure that they will not preempt
> > > the current thread and let it move forward so we reduce the number of
> > > preemption.
> > >
> > > >
> > > > > wakeup and help hackbench making progress.
> > > > >
> > > > > Test results of nice latency impact on short live load like cyclictest
> > > > > while competing with heavy load like hackbench:
> > > > >
> > > > > hackbench -l 10000 -g group &
> > > > > cyclictest --policy other -D 5 -q -n
> > > > > latency 0 latency -20
> > > > > group min avg max min avg max
> > > > > 0 16 17 28 15 17 27
> > > > > 1 61 382 10603 63 89 4628
> > > > > 4 52 437 15455 54 98 16238
> > > > > 8 56 728 38499 61 125 28983
> > > > > 16 53 1215 52207 61 183 80751
> > > > >
> > > > > group = 0 means that hackbench is not running.
> > > > >
> > > > > The avg is significantly improved with nice latency -20 especially with
> > > > > large number of groups but min and max remain quite similar. If we add the
> > > > > histogram parameters to get details of latency, we have :
> > > > >
> > > > > hackbench -l 10000 -g 16 &
> > > > > cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> > > > > latency 0 latency -20
> > > > > Min Latencies: 63 62
> > > > > Avg Latencies: 1397 218
> > > > > Max Latencies: 44926 42291
> > > > > 50% latencies: 100 98
> > > > > 75% latencies: 762 116
> > > > > 85% latencies: 1118 126
> > > > > 90% latencies: 1540 130
> > > > > 95% latencies: 5610 138
> > > > > 99% latencies: 13738 266
> > > > >
> > > > > With percentile details, we see the benefit of nice latency -20 as
> > > > > 1% of the latencies stays above 266us whereas the default latency has
> > > > > got 25% are above 762us and 10% over the 1ms.
> > > > >
> > >
> > > [..]
> > >
> > > > > +static long wakeup_latency_gran(int latency_weight)
> > > > > +{
> > > > > + long thresh = sysctl_sched_latency;
> > > > > +
> > > > > + if (!latency_weight)
> > > > > + return 0;
> > > > > +
> > > > > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > > > > + thresh >>= 1;
> > > > > +
> > > > > + /*
> > > > > + * Clamp the delta to stay in the scheduler period range
> > > > > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > > > > + */
> > > > > + latency_weight = clamp_t(long, latency_weight,
> > > > > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > > > > + NICE_LATENCY_WEIGHT_MAX);
> > > > > +
> > > > > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > > > > +}
> > > > > +
> > > > > static unsigned long wakeup_gran(struct sched_entity *se)
> > > > > {
> > > > > unsigned long gran = sysctl_sched_wakeup_granularity;
> > > > > @@ -7008,6 +7059,10 @@ static int
> > > > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > > > {
> > > > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > > > + int latency_weight = se->latency_weight - curr->latency_weight;
> > > > > +
> > > > > + latency_weight = min(latency_weight, se->latency_weight);
> > > >
> > > > Let lable A=se->latency_weight, B=curr->latency_weight, C=A-B.
> > > >
> > > > 1 A>0 B>0
> > > > ::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
> > > > what A is. That means it can be very 'long'(-sched_latency) and vdiff is
> > > > more possible to be in <= 0 case and return -1.
> > > > ::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limit to
> > >
> > > A>0 and B>0 then min(C=A-B, A) = C
>
> It's my mistake. And in this case the calculation of the value added to vdiff
> for a latency increase and subtracted from vdiff for a latency decrease is the same.
>
> > >
> > > > A/1024*sched_latency.
> > > > When latecy is decreased, the negtive value added to vdiff is larger than the
> > > > positive value added to vdiff when latency is increased.
> > >
> > > When the weight > 0, the tasks have some latency requirements so we
> > > use their relative priority to decide if we can preempt current which
> > > also has some latency requirement
> > >
> > > >
> > > > 2 A>0 B<0
> > > > ::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
> >
> > For this one we want to use delta like for UC 1 above
> >
> > > >
> > > > 3 A<0 B<0
> > > > ::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0, I think if latency
> > > > increase means the value added to vdiff will be a positive value to increase
> > > > the chance to return 1. I would say it is max(C,A)=C
> > > > ::C=A-B<0, min(C,A)=A, latency decrease and the real negtive value.
> > >
> > > When the weight < 0, the tasks haven't latency requirement and even
> > > don't care of being scheduled "quickly". In this case, we don't care
> > > about the relative priority but we want to minimize the preemption of
> > > current so we use the weight
> > >
> > > >
> > > > 4 A<0,B>0
> > > > ::C=A-B<0, min(C,A)=C, latency decrease and the real negtive value.
> >
> > And for this one we probably want to use A like for other UC with A < 0
> >
> > I'm going to update the way the weight is computed to match this
>
> According to your explanations, the min(C,A) is used for A and B both in
> the negtive region or in the postive region, the max(C,A) is use for A and
> B both not in the same region.
>
> if ((A>>31)^(B>>31))
> max(C,A)
> else
> min(C,A)
Not right.
if ((A&(1<<31))^(B&(1<<31)))
max(C,A)
else
min(C,A)
>
> wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> {
> s64 gran, vdiff = curr->vruntime - se->vruntime;
> int latency_weight = se->latency_weight - curr->latency_weight;
>
> if ((se->latency_weight>>(WMULT_SHIFT-1)) ^
> curr->latency_weight>>(WMULT_SHIFT-1))
> latency_weight = max(latency_weight, se->latency_weight);
> else
> latency_weight = min(latency_weight, se->latency_weight);
>
> vdiff += wakeup_latency_gran(latency_weight);
> ...
> }
Not right.
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
s64 gran, vdiff = curr->vruntime - se->vruntime;
int latency_weight = se->latency_weight - curr->latency_weight;
if ((se->latency_weight & (1 << WMULT_SHIFT-1)) ^
curr->latency_weight & (1 << WMULT_SHIFT-1))
latency_weight = max(latency_weight, se->latency_weight);
else
latency_weight = min(latency_weight, se->latency_weight);
vdiff += wakeup_latency_gran(latency_weight);
...
}
> > > >
> > > > Is there a reason that the value when latency increase and latency decrease
> > > > be treated differently. Latency increase value is limited to se's latency_weight
> > >
> > > I have tried to explain why I treat differently if weight is > 0 or < 0.
> > >
> > > > but latency decrease value can extend to -sched_latency or treat them the same.
> > > > Penalty latency decrease and conserve latency increase.
> > > >
> > > >
> > > > There is any value that this latency_weight can be considered to be a average.
> > > >
> > > > The delta value choose is ~%5 to 1024. %5*sched_latency=0.05*6ms=0.3ms.(no scale)
> > > > I do not think over that vdiff equation and others though.
> > > >
> > > > Thanks,
> > > > Tao
> > > > > + vdiff += wakeup_latency_gran(latency_weight);
> > > > >
> > > > > if (vdiff <= 0)
> > > > > return -1;
> > > > > @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> > > > > return;
> > > > >
> > > > > update_curr(cfs_rq_of(se));
> > > > > +
> > > > > if (wakeup_preempt_entity(se, pse) == 1) {
> > > > > /*
> > > > > * Bias pick_next to pick the sched entity that is
> > > > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > > > > index 456ad2159eb1..dd92aa9c36f9 100644
> > > > > --- a/kernel/sched/sched.h
> > > > > +++ b/kernel/sched/sched.h
> > > > > @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> > > > > * Default tasks should be treated as a task with latency_nice = 0.
> > > > > */
> > > > > #define DEFAULT_LATENCY_NICE 0
> > > > > +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> > > > > +
> > > > > +/*
> > > > > + * Convert user-nice values [ -20 ... 0 ... 19 ]
> > > > > + * to static latency [ 0..39 ],
> > > > > + * and back.
> > > > > + */
> > > > > +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> > > > > +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> > > > > +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> > > > > +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
> > > > >
> > > > > /*
> > > > > * Increase resolution of nice-level calculations for 64-bit architectures.
> > > > > @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
> > > > >
> > > > > extern const int sched_prio_to_weight[40];
> > > > > extern const u32 sched_prio_to_wmult[40];
> > > > > +extern const int sched_latency_to_weight[40];
> > > > >
> > > > > /*
> > > > > * {de,en}queue flags:
> > > > > --
> > > > > 2.17.1
> > > > >
On Mon, 2 May 2022 at 11:54, Vincent Guittot <[email protected]> wrote:
>
> Hi Tao,
>
> On Sun, 1 May 2022 at 17:58, Tao Zhou <[email protected]> wrote:
> >
> > Hi Vincent,
> >
> > Change to Valentin Schneider's now using mail address.
>
> Thanks
>
> >
> > On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
> >
> > > Take into account the nice latency priority of a thread when deciding to
> > > preempt the current running thread. We don't want to provide more CPU
> > > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > > task first whenever possible.
> > >
> > > As long as a thread didn't use its bandwidth, it will be able to preempt
> > > the current thread.
> > >
> > > At the opposite, a thread with a low latency priority will preempt current
> > > thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> > > wait for the tick to get its sched slice.
> > >
> > > curr vruntime
> > > |
> > > sysctl_sched_wakeup_granularity
> > > <-->
> > > ----------------------------------|----|-----------------------|---------------
> > > | |<--------------------->
> > > | . sysctl_sched_latency
> > > | .
> > > default/current latency entity | .
> > > | .
> > > 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> > > se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> > > | .
> > > | .
> > > | .
> > > low latency entity | .
> > > ---------------------->|
> > > % of sysctl_sched_latency |
> > > 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> > > preempt ------------------------------------------------->|<- do not preempt --
> > > | .
> > > | .
> > > | .
> > > high latency entity | .
> > > |<-----------------------| .
> > > | % of sysctl_sched_latency .
> > > 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> > > preempt->|<- se doesn't preempt curr ------------------------------------------
> > >
> > > Tests results of nice latency impact on heavy load like hackbench:
> > >
> > > hackbench -l (2560 / group) -g group
> > > group latency 0 latency 19
> > > 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> > > 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> > > 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> > > 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
> > >
> > > hackbench -p -l (2560 / group) -g group
> > > group
> > > 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> > > 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> > > 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> > > 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
> > >
> > > By deacreasing the latency prio, we reduce the number of preemption at
> >
> > s/deacreasing/decreasing/
>
> yes
>
> > s/reduce/increase/
>
> not in the case of hackbench tests above. By decreasing the latency
> prio of all hackbench threads, we make sure that they will not preempt
> the current thread and let it move forward so we reduce the number of
> preemption.
>
> >
> > > wakeup and help hackbench making progress.
> > >
> > > Test results of nice latency impact on short live load like cyclictest
> > > while competing with heavy load like hackbench:
> > >
> > > hackbench -l 10000 -g group &
> > > cyclictest --policy other -D 5 -q -n
> > > latency 0 latency -20
> > > group min avg max min avg max
> > > 0 16 17 28 15 17 27
> > > 1 61 382 10603 63 89 4628
> > > 4 52 437 15455 54 98 16238
> > > 8 56 728 38499 61 125 28983
> > > 16 53 1215 52207 61 183 80751
> > >
> > > group = 0 means that hackbench is not running.
> > >
> > > The avg is significantly improved with nice latency -20 especially with
> > > large number of groups but min and max remain quite similar. If we add the
> > > histogram parameters to get details of latency, we have :
> > >
> > > hackbench -l 10000 -g 16 &
> > > cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> > > latency 0 latency -20
> > > Min Latencies: 63 62
> > > Avg Latencies: 1397 218
> > > Max Latencies: 44926 42291
> > > 50% latencies: 100 98
> > > 75% latencies: 762 116
> > > 85% latencies: 1118 126
> > > 90% latencies: 1540 130
> > > 95% latencies: 5610 138
> > > 99% latencies: 13738 266
> > >
> > > With percentile details, we see the benefit of nice latency -20 as
> > > 1% of the latencies stays above 266us whereas the default latency has
> > > got 25% are above 762us and 10% over the 1ms.
> > >
>
> [..]
>
> > > +static long wakeup_latency_gran(int latency_weight)
> > > +{
> > > + long thresh = sysctl_sched_latency;
> > > +
> > > + if (!latency_weight)
> > > + return 0;
> > > +
> > > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > > + thresh >>= 1;
> > > +
> > > + /*
> > > + * Clamp the delta to stay in the scheduler period range
> > > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > > + */
> > > + latency_weight = clamp_t(long, latency_weight,
> > > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > > + NICE_LATENCY_WEIGHT_MAX);
> > > +
> > > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > > +}
> > > +
> > > static unsigned long wakeup_gran(struct sched_entity *se)
> > > {
> > > unsigned long gran = sysctl_sched_wakeup_granularity;
> > > @@ -7008,6 +7059,10 @@ static int
> > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > {
> > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > + int latency_weight = se->latency_weight - curr->latency_weight;
> > > +
> > > + latency_weight = min(latency_weight, se->latency_weight);
> >
> > Let lable A=se->latency_weight, B=curr->latency_weight, C=A-B.
> >
> > 1 A>0 B>0
> > ::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
> > what A is. That means it can be very 'long'(-sched_latency) and vdiff is
> > more possible to be in <= 0 case and return -1.
> > ::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limit to
>
> A>0 and B>0 then min(C=A-B, A) = C
>
> > A/1024*sched_latency.
> > When latecy is decreased, the negtive value added to vdiff is larger than the
> > positive value added to vdiff when latency is increased.
>
> When the weight > 0, the tasks have some latency requirements so we
> use their relative priority to decide if we can preempt current which
> also has some latency requirement
>
> >
> > 2 A>0 B<0
> > ::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
For this one we want to use delta like for UC 1 above
> >
> > 3 A<0 B<0
> > ::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0, I think if latency
> > increase means the value added to vdiff will be a positive value to increase
> > the chance to return 1. I would say it is max(C,A)=C
> > ::C=A-B<0, min(C,A)=A, latency decrease and the real negtive value.
>
> When the weight < 0, the tasks haven't latency requirement and even
> don't care of being scheduled "quickly". In this case, we don't care
> about the relative priority but we want to minimize the preemption of
> current so we use the weight
>
> >
> > 4 A<0,B>0
> > ::C=A-B<0, min(C,A)=C, latency decrease and the real negtive value.
And for this one we probably want to use A like for other UC with A < 0
I'm going to update the way the weight is computed to match this
> >
> > Is there a reason that the value when latency increase and latency decrease
> > be treated differently. Latency increase value is limited to se's latency_weight
>
> I have tried to explain why I treat differently if weight is > 0 or < 0.
>
> > but latency decrease value can extend to -sched_latency or treat them the same.
> > Penalty latency decrease and conserve latency increase.
> >
> >
> > There is any value that this latency_weight can be considered to be a average.
> >
> > The delta value choose is ~%5 to 1024. %5*sched_latency=0.05*6ms=0.3ms.(no scale)
> > I do not think over that vdiff equation and others though.
> >
> > Thanks,
> > Tao
> > > + vdiff += wakeup_latency_gran(latency_weight);
> > >
> > > if (vdiff <= 0)
> > > return -1;
> > > @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> > > return;
> > >
> > > update_curr(cfs_rq_of(se));
> > > +
> > > if (wakeup_preempt_entity(se, pse) == 1) {
> > > /*
> > > * Bias pick_next to pick the sched entity that is
> > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > > index 456ad2159eb1..dd92aa9c36f9 100644
> > > --- a/kernel/sched/sched.h
> > > +++ b/kernel/sched/sched.h
> > > @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> > > * Default tasks should be treated as a task with latency_nice = 0.
> > > */
> > > #define DEFAULT_LATENCY_NICE 0
> > > +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> > > +
> > > +/*
> > > + * Convert user-nice values [ -20 ... 0 ... 19 ]
> > > + * to static latency [ 0..39 ],
> > > + * and back.
> > > + */
> > > +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> > > +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> > > +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> > > +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
> > >
> > > /*
> > > * Increase resolution of nice-level calculations for 64-bit architectures.
> > > @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
> > >
> > > extern const int sched_prio_to_weight[40];
> > > extern const u32 sched_prio_to_wmult[40];
> > > +extern const int sched_latency_to_weight[40];
> > >
> > > /*
> > > * {de,en}queue flags:
> > > --
> > > 2.17.1
> > >
On Monday, 02 May 2022 at 23:47:32 (+0800), Tao Zhou wrote:
> On Mon, May 02, 2022 at 11:26:14PM +0800, Tao Zhou wrote:
> > On Mon, May 02, 2022 at 11:08:15PM +0800, Tao Zhou wrote:
> > > On Mon, May 02, 2022 at 02:30:55PM +0200, Vincent Guittot wrote:
> > >
> > > > On Mon, 2 May 2022 at 11:54, Vincent Guittot <[email protected]> wrote:
> > > > >
> > > > > Hi Tao,
> > > > >
> > > > > On Sun, 1 May 2022 at 17:58, Tao Zhou <[email protected]> wrote:
> > > > > >
> > > > > > Hi Vincent,
> > > > > >
> > > > > > Change to Valentin Schneider's now using mail address.
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
> > > > > >
> > > > > > > Take into account the nice latency priority of a thread when deciding to
> > > > > > > preempt the current running thread. We don't want to provide more CPU
> > > > > > > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > > > > > > task first whenever possible.
> > > > > > >
> > > > > > > As long as a thread didn't use its bandwidth, it will be able to preempt
> > > > > > > the current thread.
> > > > > > >
> > > > > > > At the opposite, a thread with a low latency priority will preempt current
> > > > > > > thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> > > > > > > wait for the tick to get its sched slice.
> > > > > > >
> > > > > > > curr vruntime
> > > > > > > |
> > > > > > > sysctl_sched_wakeup_granularity
> > > > > > > <-->
> > > > > > > ----------------------------------|----|-----------------------|---------------
> > > > > > > | |<--------------------->
> > > > > > > | . sysctl_sched_latency
> > > > > > > | .
> > > > > > > default/current latency entity | .
> > > > > > > | .
> > > > > > > 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> > > > > > > se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> > > > > > > | .
> > > > > > > | .
> > > > > > > | .
> > > > > > > low latency entity | .
> > > > > > > ---------------------->|
> > > > > > > % of sysctl_sched_latency |
> > > > > > > 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> > > > > > > preempt ------------------------------------------------->|<- do not preempt --
> > > > > > > | .
> > > > > > > | .
> > > > > > > | .
> > > > > > > high latency entity | .
> > > > > > > |<-----------------------| .
> > > > > > > | % of sysctl_sched_latency .
> > > > > > > 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> > > > > > > preempt->|<- se doesn't preempt curr ------------------------------------------
> > > > > > >
> > > > > > > Tests results of nice latency impact on heavy load like hackbench:
> > > > > > >
> > > > > > > hackbench -l (2560 / group) -g group
> > > > > > > group latency 0 latency 19
> > > > > > > 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> > > > > > > 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> > > > > > > 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> > > > > > > 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
> > > > > > >
> > > > > > > hackbench -p -l (2560 / group) -g group
> > > > > > > group
> > > > > > > 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> > > > > > > 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> > > > > > > 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> > > > > > > 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
> > > > > > >
> > > > > > > By deacreasing the latency prio, we reduce the number of preemption at
> > > > > >
> > > > > > s/deacreasing/decreasing/
> > > > >
> > > > > yes
> > > > >
> > > > > > s/reduce/increase/
> > > > >
> > > > > not in the case of hackbench tests above. By decreasing the latency
> > > > > prio of all hackbench threads, we make sure that they will not preempt
> > > > > the current thread and let it move forward so we reduce the number of
> > > > > preemption.
> > > > >
> > > > > >
> > > > > > > wakeup and help hackbench making progress.
> > > > > > >
> > > > > > > Test results of nice latency impact on short live load like cyclictest
> > > > > > > while competing with heavy load like hackbench:
> > > > > > >
> > > > > > > hackbench -l 10000 -g group &
> > > > > > > cyclictest --policy other -D 5 -q -n
> > > > > > > latency 0 latency -20
> > > > > > > group min avg max min avg max
> > > > > > > 0 16 17 28 15 17 27
> > > > > > > 1 61 382 10603 63 89 4628
> > > > > > > 4 52 437 15455 54 98 16238
> > > > > > > 8 56 728 38499 61 125 28983
> > > > > > > 16 53 1215 52207 61 183 80751
> > > > > > >
> > > > > > > group = 0 means that hackbench is not running.
> > > > > > >
> > > > > > > The avg is significantly improved with nice latency -20 especially with
> > > > > > > large number of groups but min and max remain quite similar. If we add the
> > > > > > > histogram parameters to get details of latency, we have :
> > > > > > >
> > > > > > > hackbench -l 10000 -g 16 &
> > > > > > > cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> > > > > > > latency 0 latency -20
> > > > > > > Min Latencies: 63 62
> > > > > > > Avg Latencies: 1397 218
> > > > > > > Max Latencies: 44926 42291
> > > > > > > 50% latencies: 100 98
> > > > > > > 75% latencies: 762 116
> > > > > > > 85% latencies: 1118 126
> > > > > > > 90% latencies: 1540 130
> > > > > > > 95% latencies: 5610 138
> > > > > > > 99% latencies: 13738 266
> > > > > > >
> > > > > > > With percentile details, we see the benefit of nice latency -20 as
> > > > > > > 1% of the latencies stays above 266us whereas the default latency has
> > > > > > > got 25% are above 762us and 10% over the 1ms.
> > > > > > >
> > > > >
> > > > > [..]
> > > > >
> > > > > > > +static long wakeup_latency_gran(int latency_weight)
> > > > > > > +{
> > > > > > > + long thresh = sysctl_sched_latency;
> > > > > > > +
> > > > > > > + if (!latency_weight)
> > > > > > > + return 0;
> > > > > > > +
> > > > > > > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > > > > > > + thresh >>= 1;
> > > > > > > +
> > > > > > > + /*
> > > > > > > + * Clamp the delta to stay in the scheduler period range
> > > > > > > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > > > > > > + */
> > > > > > > + latency_weight = clamp_t(long, latency_weight,
> > > > > > > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > > > > > > + NICE_LATENCY_WEIGHT_MAX);
> > > > > > > +
> > > > > > > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > > > > > > +}
> > > > > > > +
> > > > > > > static unsigned long wakeup_gran(struct sched_entity *se)
> > > > > > > {
> > > > > > > unsigned long gran = sysctl_sched_wakeup_granularity;
> > > > > > > @@ -7008,6 +7059,10 @@ static int
> > > > > > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > > > > > {
> > > > > > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > > > > > + int latency_weight = se->latency_weight - curr->latency_weight;
> > > > > > > +
> > > > > > > + latency_weight = min(latency_weight, se->latency_weight);
> > > > > >
> > > > > > Let lable A=se->latency_weight, B=curr->latency_weight, C=A-B.
> > > > > >
> > > > > > 1 A>0 B>0
> > > > > > ::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
> > > > > > what A is. That means it can be very 'long'(-sched_latency) and vdiff is
> > > > > > more possible to be in <= 0 case and return -1.
> > > > > > ::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limit to
> > > > >
> > > > > A>0 and B>0 then min(C=A-B, A) = C
> > >
> > > It's my mistake. And in this case the calculating of value added to vdiff
> > > for latency increase and deleted to vdiff for latency decrease is the same.
> > >
> > > > >
> > > > > > A/1024*sched_latency.
> > > > > > When latecy is decreased, the negtive value added to vdiff is larger than the
> > > > > > positive value added to vdiff when latency is increased.
> > > > >
> > > > > When the weight > 0, the tasks have some latency requirements so we
> > > > > use their relative priority to decide if we can preempt current which
> > > > > also has some latency requirement
> > > > >
> > > > > >
> > > > > > 2 A>0 B<0
> > > > > > ::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
> > > >
> > > > For this one we want to use delta like for UC 1 above
> > > >
> > > > > >
> > > > > > 3 A<0 B<0
> > > > > > ::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0, I think if latency
> > > > > > increase means the value added to vdiff will be a positive value to increase
> > > > > > the chance to return 1. I would say it is max(C,A)=C
> > > > > > ::C=A-B<0, min(C,A)=A, latency decrease and the real negtive value.
> > > > >
> > > > > When the weight < 0, the tasks haven't latency requirement and even
> > > > > don't care of being scheduled "quickly". In this case, we don't care
> > > > > about the relative priority but we want to minimize the preemption of
> > > > > current so we use the weight
> > > > >
> > > > > >
> > > > > > 4 A<0,B>0
> > > > > > ::C=A-B<0, min(C,A)=C, latency decrease and the real negtive value.
> > > >
> > > > And for this one we probably want to use A like for other UC with A < 0
> > > >
> > > > I'm going to update the way the weight is computed to match this
> > >
> > > According to your explanations, the min(C,A) is used for A and B both in
> > > the negtive region or in the postive region, the max(C,A) is use for A and
> > > B both not in the same region.
> > >
> > > if ((A>>31)^(B>>31))
> > > max(C,A)
> > > else
> > > min(C,A)
> >
> > Not right.
> >
> > if ((A&(1<<31))^(B(1<<31)))
> > max(C,A)
> > else
> > min(C,A)
> >
> > >
> > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > {
> > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > int latency_weight = se->latency_weight - curr->latency_weight;
> > >
> > > if ((se->latency_weight>>(WMULT_SHIFT-1)) ^
> > > curr->latency_weight>>(WMULT_SHIFT-1))
> > > latency_weight = max(latency_weight, se->latency_weight);
> > > else
> > > latency_weight = min(latency_weight, se->latency_weight);
> > >
> > > vdiff += wakeup_latency_gran(latency_weight);
> > > ...
> > > }
> >
> > Not right.
> >
> > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > {
> > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > int latency_weight = se->latency_weight - curr->latency_weight;
> >
> > if ((se->latency_weight & (1 << WMULT_SHIFT-1)) ^
> > curr->latency_weight & (1 << WMULT_SHIFT-1))
> > latency_weight = max(latency_weight, se->latency_weight);
> > else
> > latency_weight = min(latency_weight, se->latency_weight);
> >
> > vdiff += wakeup_latency_gran(latency_weight);
> > ...
> > }
>
> I'm already in bed, but I think they are the same.. heh
Cases 1 & 2 use the delta and have A > 0,
cases 3 & 4 use A and have A < 0,
so I plan to use:
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
s64 gran, vdiff = curr->vruntime - se->vruntime;
+ int latency_weight = se->latency_weight;
+
+ /*
+ * A positive latency weight means that se has some latency requirements that
+ * need to be evaluated against the current thread.
+ * Otherwise, use the latency weight to evaluate how much scheduling
+ * delay is acceptable to se.
+ */
+ if (latency_weight > 0)
+ latency_weight -= curr->latency_weight;
+
+ vdiff += wakeup_latency_gran(latency_weight);
if (vdiff <= 0)
return -1;
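
To make the asymmetry concrete, here is a small standalone sketch (plain
userspace C, not kernel code) of the rule proposed above, applied to
hypothetical weights for the four sign cases discussed in this thread:

#include <stdio.h>

/*
 * Sketch of the proposed rule: a waking entity with a positive latency
 * weight is compared against curr's weight (relative priority), while an
 * entity with a zero or negative weight only exposes its own tolerance.
 */
static int effective_latency_weight(int se_w, int curr_w)
{
	int w = se_w;

	if (w > 0)
		w -= curr_w;

	return w;
}

int main(void)
{
	/* hypothetical weights, roughly in the sched_latency_to_weight[] range */
	int cases[4][2] = {
		{  512,  256 },	/* case 1: A > 0, B > 0 -> delta =  256 */
		{  512, -256 },	/* case 2: A > 0, B < 0 -> delta =  768 */
		{ -512, -256 },	/* case 3: A < 0, B < 0 -> A     = -512 */
		{ -512,  256 },	/* case 4: A < 0, B > 0 -> A     = -512 */
	};

	for (int i = 0; i < 4; i++)
		printf("case %d: A=%5d B=%5d -> weight=%5d\n", i + 1,
		       cases[i][0], cases[i][1],
		       effective_latency_weight(cases[i][0], cases[i][1]));

	return 0;
}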
>
> > > > > >
> > > > > > Is there a reason that the value when latency increase and latency decrease
> > > > > > be treated differently. Latency increase value is limited to se's latency_weight
> > > > >
> > > > > I have tried to explain why I treat differently if weight is > 0 or < 0.
> > > > >
> > > > > > but latency decrease value can extend to -sched_latency or treat them the same.
> > > > > > Penalty latency decrease and conserve latency increase.
> > > > > >
> > > > > >
> > > > > > There is any value that this latency_weight can be considered to be a average.
> > > > > >
> > > > > > The delta value choose is ~%5 to 1024. %5*sched_latency=0.05*6ms=0.3ms.(no scale)
> > > > > > I do not think over that vdiff equation and others though.
> > > > > >
> > > > > > Thanks,
> > > > > > Tao
> > > > > > > + vdiff += wakeup_latency_gran(latency_weight);
> > > > > > >
> > > > > > > if (vdiff <= 0)
> > > > > > > return -1;
> > > > > > > @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> > > > > > > return;
> > > > > > >
> > > > > > > update_curr(cfs_rq_of(se));
> > > > > > > +
> > > > > > > if (wakeup_preempt_entity(se, pse) == 1) {
> > > > > > > /*
> > > > > > > * Bias pick_next to pick the sched entity that is
> > > > > > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > > > > > > index 456ad2159eb1..dd92aa9c36f9 100644
> > > > > > > --- a/kernel/sched/sched.h
> > > > > > > +++ b/kernel/sched/sched.h
> > > > > > > @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> > > > > > > * Default tasks should be treated as a task with latency_nice = 0.
> > > > > > > */
> > > > > > > #define DEFAULT_LATENCY_NICE 0
> > > > > > > +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> > > > > > > +
> > > > > > > +/*
> > > > > > > + * Convert user-nice values [ -20 ... 0 ... 19 ]
> > > > > > > + * to static latency [ 0..39 ],
> > > > > > > + * and back.
> > > > > > > + */
> > > > > > > +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> > > > > > > +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> > > > > > > +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> > > > > > > +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
> > > > > > >
> > > > > > > /*
> > > > > > > * Increase resolution of nice-level calculations for 64-bit architectures.
> > > > > > > @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
> > > > > > >
> > > > > > > extern const int sched_prio_to_weight[40];
> > > > > > > extern const u32 sched_prio_to_wmult[40];
> > > > > > > +extern const int sched_latency_to_weight[40];
> > > > > > >
> > > > > > > /*
> > > > > > > * {de,en}queue flags:
> > > > > > > --
> > > > > > > 2.17.1
> > > > > > >
Hi Vincent,
Changed to Valentin Schneider's currently used mail address.
On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
> Take into account the nice latency priority of a thread when deciding to
> preempt the current running thread. We don't want to provide more CPU
> bandwidth to a thread but reorder the scheduling to run latency sensitive
> task first whenever possible.
>
> As long as a thread didn't use its bandwidth, it will be able to preempt
> the current thread.
>
> At the opposite, a thread with a low latency priority will preempt current
> thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> wait for the tick to get its sched slice.
>
> curr vruntime
> |
> sysctl_sched_wakeup_granularity
> <-->
> ----------------------------------|----|-----------------------|---------------
> | |<--------------------->
> | . sysctl_sched_latency
> | .
> default/current latency entity | .
> | .
> 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> | .
> | .
> | .
> low latency entity | .
> ---------------------->|
> % of sysctl_sched_latency |
> 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> preempt ------------------------------------------------->|<- do not preempt --
> | .
> | .
> | .
> high latency entity | .
> |<-----------------------| .
> | % of sysctl_sched_latency .
> 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> preempt->|<- se doesn't preempt curr ------------------------------------------
>
> Tests results of nice latency impact on heavy load like hackbench:
>
> hackbench -l (2560 / group) -g group
> group latency 0 latency 19
> 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
>
> hackbench -p -l (2560 / group) -g group
> group
> 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
>
> By deacreasing the latency prio, we reduce the number of preemption at
s/deacreasing/decreasing/
s/reduce/increase/
> wakeup and help hackbench making progress.
>
> Test results of nice latency impact on short live load like cyclictest
> while competing with heavy load like hackbench:
>
> hackbench -l 10000 -g group &
> cyclictest --policy other -D 5 -q -n
> latency 0 latency -20
> group min avg max min avg max
> 0 16 17 28 15 17 27
> 1 61 382 10603 63 89 4628
> 4 52 437 15455 54 98 16238
> 8 56 728 38499 61 125 28983
> 16 53 1215 52207 61 183 80751
>
> group = 0 means that hackbench is not running.
>
> The avg is significantly improved with nice latency -20 especially with
> large number of groups but min and max remain quite similar. If we add the
> histogram parameters to get details of latency, we have :
>
> hackbench -l 10000 -g 16 &
> cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> latency 0 latency -20
> Min Latencies: 63 62
> Avg Latencies: 1397 218
> Max Latencies: 44926 42291
> 50% latencies: 100 98
> 75% latencies: 762 116
> 85% latencies: 1118 126
> 90% latencies: 1540 130
> 95% latencies: 5610 138
> 99% latencies: 13738 266
>
> With percentile details, we see the benefit of nice latency -20 as
> 1% of the latencies stays above 266us whereas the default latency has
> got 25% are above 762us and 10% over the 1ms.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> include/linux/sched.h | 4 ++-
> init/init_task.c | 2 +-
> kernel/sched/core.c | 32 +++++++++++++++++++----
> kernel/sched/debug.c | 2 +-
> kernel/sched/fair.c | 60 +++++++++++++++++++++++++++++++++++++++++--
> kernel/sched/sched.h | 12 +++++++++
> 6 files changed, 102 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2aa889a59054..9aeb157e819b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -560,6 +560,8 @@ struct sched_entity {
> unsigned long runnable_weight;
> #endif
>
> + int latency_weight;
> +
> #ifdef CONFIG_SMP
> /*
> * Per entity load average tracking.
> @@ -779,7 +781,7 @@ struct task_struct {
> int static_prio;
> int normal_prio;
> unsigned int rt_priority;
> - int latency_nice;
> + int latency_prio;
>
> struct sched_entity se;
> struct sched_rt_entity rt;
> diff --git a/init/init_task.c b/init/init_task.c
> index 2afa249c253b..e98c71f24981 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -78,7 +78,7 @@ struct task_struct init_task
> .prio = MAX_PRIO - 20,
> .static_prio = MAX_PRIO - 20,
> .normal_prio = MAX_PRIO - 20,
> - .latency_nice = 0,
> + .latency_prio = NICE_WIDTH - 20,
> .policy = SCHED_NORMAL,
> .cpus_ptr = &init_task.cpus_mask,
> .user_cpus_ptr = NULL,
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8f8b102a75c4..547b0da01efe 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1241,6 +1241,11 @@ static void set_load_weight(struct task_struct *p, bool update_load)
> }
> }
>
> +static void set_latency_weight(struct task_struct *p)
> +{
> + p->se.latency_weight = sched_latency_to_weight[p->latency_prio];
> +}
> +
> #ifdef CONFIG_UCLAMP_TASK
> /*
> * Serializes updates of utilization clamp values
> @@ -4394,7 +4399,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->prio = current->normal_prio;
>
> /* Propagate the parent's latency requirements to the child as well */
> - p->latency_nice = current->latency_nice;
> + p->latency_prio = current->latency_prio;
>
> uclamp_fork(p);
>
> @@ -4412,7 +4417,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->prio = p->normal_prio = p->static_prio;
> set_load_weight(p, false);
>
> - p->latency_nice = DEFAULT_LATENCY_NICE;
> + p->latency_prio = NICE_TO_LATENCY(0);
> /*
> * We don't need the reset flag anymore after the fork. It has
> * fulfilled its duty:
> @@ -4420,6 +4425,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->sched_reset_on_fork = 0;
> }
>
> + /* Once latency_prio is set, update the latency weight */
> + set_latency_weight(p);
> +
> if (dl_prio(p->prio))
> return -EAGAIN;
> else if (rt_prio(p->prio))
> @@ -7361,7 +7369,7 @@ static int __sched_setscheduler(struct task_struct *p,
> if (attr->sched_latency_nice < MIN_LATENCY_NICE)
> return -EINVAL;
> /* Use the same security checks as NICE */
> - if (attr->sched_latency_nice < p->latency_nice &&
> + if (attr->sched_latency_nice < LATENCY_TO_NICE(p->latency_prio) &&
> !capable(CAP_SYS_NICE))
> return -EPERM;
> }
> @@ -7401,7 +7409,7 @@ static int __sched_setscheduler(struct task_struct *p,
> if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
> goto change;
> if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
> - attr->sched_latency_nice != p->latency_nice)
> + attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))
> goto change;
>
> p->sched_reset_on_fork = reset_on_fork;
> @@ -7942,7 +7950,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
> get_params(p, &kattr);
> kattr.sched_flags &= SCHED_FLAG_ALL;
>
> - kattr.sched_latency_nice = p->latency_nice;
> + kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);
>
> #ifdef CONFIG_UCLAMP_TASK
> /*
> @@ -10954,6 +10962,20 @@ const u32 sched_prio_to_wmult[40] = {
> /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
> };
>
> +/*
> + * latency weight for wakeup preemption
> + */
> +const int sched_latency_to_weight[40] = {
> + /* -20 */ 1024, 973, 922, 870, 819,
> + /* -15 */ 768, 717, 666, 614, 563,
> + /* -10 */ 512, 461, 410, 358, 307,
> + /* -5 */ 256, 205, 154, 102, 51,
> + /* 0 */ 0, -51, -102, -154, -205,
> + /* 5 */ -256, -307, -358, -410, -461,
> + /* 10 */ -512, -563, -614, -666, -717,
> + /* 15 */ -768, -819, -870, -922, -973,
> +};
> +
> void call_trace_sched_update_nr_running(struct rq *rq, int count)
> {
> trace_sched_update_nr_running_tp(rq, count);
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 5d76a8927888..253e52ec73fb 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -1043,7 +1043,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
> #endif
> P(policy);
> P(prio);
> - P(latency_nice);
> + P(latency_prio);
> if (task_has_dl_policy(p)) {
> P(dl.runtime);
> P(dl.deadline);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5c4bfffe8c2c..506c482a0e48 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5555,6 +5555,35 @@ static int sched_idle_cpu(int cpu)
> }
> #endif
>
> +static void set_next_buddy(struct sched_entity *se);
> +
> +static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se)
> +{
> + struct sched_entity *next;
> +
> + if (se->latency_weight <= 0)
> + return;
> +
> + if (cfs->nr_running <= 1)
> + return;
> + /*
> + * When waking from idle, we don't need to check to preempt at wakeup
> + * the idle thread and don't set next buddy as a candidate for being
> + * picked in priority.
> + * In case of simultaneous wakeup from idle, the latency sensitive tasks
> + * lost opportunity to preempt non sensitive tasks which woke up
> + * simultaneously.
> + */
> +
> + if (cfs->next)
> + next = cfs->next;
> + else
> + next = __pick_first_entity(cfs);
> +
> + if (next && wakeup_preempt_entity(next, se) == 1)
> + set_next_buddy(se);
> +}
> +
> /*
> * The enqueue_task method is called before nr_running is
> * increased. Here we update the fair scheduling stats and
> @@ -5648,6 +5677,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> if (!task_new)
> update_overutilized_status(rq);
>
> + if (rq->curr == rq->idle)
> + check_preempt_from_idle(cfs_rq_of(&p->se), &p->se);
> +
> enqueue_throttle:
> if (cfs_bandwidth_used()) {
> /*
> @@ -5669,8 +5701,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> hrtick_update(rq);
> }
>
> -static void set_next_buddy(struct sched_entity *se);
> -
> /*
> * The dequeue_task method is called before nr_running is
> * decreased. We remove the task from the rbtree and
> @@ -6970,6 +7000,27 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> }
> #endif /* CONFIG_SMP */
>
> +static long wakeup_latency_gran(int latency_weight)
> +{
> + long thresh = sysctl_sched_latency;
> +
> + if (!latency_weight)
> + return 0;
> +
> + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> + thresh >>= 1;
> +
> + /*
> + * Clamp the delta to stay in the scheduler period range
> + * [-sysctl_sched_latency:sysctl_sched_latency]
> + */
> + latency_weight = clamp_t(long, latency_weight,
> + -1 * NICE_LATENCY_WEIGHT_MAX,
> + NICE_LATENCY_WEIGHT_MAX);
> +
> + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> +}
> +
> static unsigned long wakeup_gran(struct sched_entity *se)
> {
> unsigned long gran = sysctl_sched_wakeup_granularity;
> @@ -7008,6 +7059,10 @@ static int
> wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> {
> s64 gran, vdiff = curr->vruntime - se->vruntime;
> + int latency_weight = se->latency_weight - curr->latency_weight;
> +
> + latency_weight = min(latency_weight, se->latency_weight);
Let label A = se->latency_weight, B = curr->latency_weight, C = A - B.
1 A>0 B>0
::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
what A is. That means it can be very 'long' (-sched_latency) and vdiff is
more likely to fall into the <= 0 case and return -1.
::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limited to
A/1024*sched_latency.
When latency is decreased, the negative value added to vdiff is larger than the
positive value added to vdiff when latency is increased.
2 A>0 B<0
::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
3 A<0 B<0
::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0. I think a latency
increase means the value added to vdiff should be a positive value to increase
the chance to return 1. I would say it should be max(C,A)=C.
::C=A-B<0, min(C,A)=A, latency decrease and the real negative value.
4 A<0,B>0
::C=A-B<0, min(C,A)=C, latency decrease and the real negative value.
Is there a reason that the values for a latency increase and a latency decrease
are treated differently? The latency increase value is limited to se's latency_weight
but the latency decrease value can extend to -sched_latency; or treat them the same:
penalize latency decrease and be conservative for latency increase.
Is there any value for which this latency_weight can be considered to be an average?
The delta value chosen is ~5% of 1024. 5%*sched_latency = 0.05*6ms = 0.3ms (no scale).
I have not thought through the vdiff equation and the others, though.
Thanks,
Tao
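
As a quick aside, here is a standalone sketch (plain userspace C, with
made-up weight values) that evaluates the min(C, A) rule from the hunk
above for the four sign combinations enumerated in this analysis:

#include <stdio.h>

/* original hunk: latency_weight = min(A - B, A), with A = se, B = curr */
static int min_rule(int a, int b)
{
	int c = a - b;

	return c < a ? c : a;
}

int main(void)
{
	int ab[4][2] = {
		{  512,  256 },	/* 1: A>0 B>0 -> min( 256,  512) =  256 */
		{  512, -256 },	/* 2: A>0 B<0 -> min( 768,  512) =  512 */
		{ -512, -256 },	/* 3: A<0 B<0 -> min(-256, -512) = -512 */
		{ -512,  256 },	/* 4: A<0 B>0 -> min(-768, -512) = -768 */
	};

	for (int i = 0; i < 4; i++)
		printf("case %d: A=%5d B=%5d -> min(C,A)=%5d\n", i + 1,
		       ab[i][0], ab[i][1], min_rule(ab[i][0], ab[i][1]));

	return 0;
}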
> + vdiff += wakeup_latency_gran(latency_weight);
>
> if (vdiff <= 0)
> return -1;
> @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> return;
>
> update_curr(cfs_rq_of(se));
> +
> if (wakeup_preempt_entity(se, pse) == 1) {
> /*
> * Bias pick_next to pick the sched entity that is
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 456ad2159eb1..dd92aa9c36f9 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> * Default tasks should be treated as a task with latency_nice = 0.
> */
> #define DEFAULT_LATENCY_NICE 0
> +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> +
> +/*
> + * Convert user-nice values [ -20 ... 0 ... 19 ]
> + * to static latency [ 0..39 ],
> + * and back.
> + */
> +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
>
> /*
> * Increase resolution of nice-level calculations for 64-bit architectures.
> @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
>
> extern const int sched_prio_to_weight[40];
> extern const u32 sched_prio_to_wmult[40];
> +extern const int sched_latency_to_weight[40];
>
> /*
> * {de,en}queue flags:
> --
> 2.17.1
>
On Mon, May 02, 2022 at 11:26:14PM +0800, Tao Zhou wrote:
> On Mon, May 02, 2022 at 11:08:15PM +0800, Tao Zhou wrote:
> > On Mon, May 02, 2022 at 02:30:55PM +0200, Vincent Guittot wrote:
> >
> > > On Mon, 2 May 2022 at 11:54, Vincent Guittot <[email protected]> wrote:
> > > >
> > > > Hi Tao,
> > > >
> > > > On Sun, 1 May 2022 at 17:58, Tao Zhou <[email protected]> wrote:
> > > > >
> > > > > Hi Vincent,
> > > > >
> > > > > Change to Valentin Schneider's now using mail address.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
> > > > >
> > > > > > Take into account the nice latency priority of a thread when deciding to
> > > > > > preempt the current running thread. We don't want to provide more CPU
> > > > > > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > > > > > task first whenever possible.
> > > > > >
> > > > > > As long as a thread didn't use its bandwidth, it will be able to preempt
> > > > > > the current thread.
> > > > > >
> > > > > > At the opposite, a thread with a low latency priority will preempt current
> > > > > > thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> > > > > > wait for the tick to get its sched slice.
> > > > > >
> > > > > > curr vruntime
> > > > > > |
> > > > > > sysctl_sched_wakeup_granularity
> > > > > > <-->
> > > > > > ----------------------------------|----|-----------------------|---------------
> > > > > > | |<--------------------->
> > > > > > | . sysctl_sched_latency
> > > > > > | .
> > > > > > default/current latency entity | .
> > > > > > | .
> > > > > > 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> > > > > > se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> > > > > > | .
> > > > > > | .
> > > > > > | .
> > > > > > low latency entity | .
> > > > > > ---------------------->|
> > > > > > % of sysctl_sched_latency |
> > > > > > 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> > > > > > preempt ------------------------------------------------->|<- do not preempt --
> > > > > > | .
> > > > > > | .
> > > > > > | .
> > > > > > high latency entity | .
> > > > > > |<-----------------------| .
> > > > > > | % of sysctl_sched_latency .
> > > > > > 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> > > > > > preempt->|<- se doesn't preempt curr ------------------------------------------
> > > > > >
> > > > > > Tests results of nice latency impact on heavy load like hackbench:
> > > > > >
> > > > > > hackbench -l (2560 / group) -g group
> > > > > > group latency 0 latency 19
> > > > > > 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> > > > > > 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> > > > > > 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> > > > > > 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
> > > > > >
> > > > > > hackbench -p -l (2560 / group) -g group
> > > > > > group
> > > > > > 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> > > > > > 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> > > > > > 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> > > > > > 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
> > > > > >
> > > > > > By deacreasing the latency prio, we reduce the number of preemption at
> > > > >
> > > > > s/deacreasing/decreasing/
> > > >
> > > > yes
> > > >
> > > > > s/reduce/increase/
> > > >
> > > > not in the case of hackbench tests above. By decreasing the latency
> > > > prio of all hackbench threads, we make sure that they will not preempt
> > > > the current thread and let it move forward so we reduce the number of
> > > > preemption.
> > > >
> > > > >
> > > > > > wakeup and help hackbench making progress.
> > > > > >
> > > > > > Test results of nice latency impact on short live load like cyclictest
> > > > > > while competing with heavy load like hackbench:
> > > > > >
> > > > > > hackbench -l 10000 -g group &
> > > > > > cyclictest --policy other -D 5 -q -n
> > > > > > latency 0 latency -20
> > > > > > group min avg max min avg max
> > > > > > 0 16 17 28 15 17 27
> > > > > > 1 61 382 10603 63 89 4628
> > > > > > 4 52 437 15455 54 98 16238
> > > > > > 8 56 728 38499 61 125 28983
> > > > > > 16 53 1215 52207 61 183 80751
> > > > > >
> > > > > > group = 0 means that hackbench is not running.
> > > > > >
> > > > > > The avg is significantly improved with nice latency -20 especially with
> > > > > > large number of groups but min and max remain quite similar. If we add the
> > > > > > histogram parameters to get details of latency, we have :
> > > > > >
> > > > > > hackbench -l 10000 -g 16 &
> > > > > > cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> > > > > > latency 0 latency -20
> > > > > > Min Latencies: 63 62
> > > > > > Avg Latencies: 1397 218
> > > > > > Max Latencies: 44926 42291
> > > > > > 50% latencies: 100 98
> > > > > > 75% latencies: 762 116
> > > > > > 85% latencies: 1118 126
> > > > > > 90% latencies: 1540 130
> > > > > > 95% latencies: 5610 138
> > > > > > 99% latencies: 13738 266
> > > > > >
> > > > > > With percentile details, we see the benefit of nice latency -20 as
> > > > > > 1% of the latencies stays above 266us whereas the default latency has
> > > > > > got 25% are above 762us and 10% over the 1ms.
> > > > > >
> > > >
> > > > [..]
> > > >
> > > > > > +static long wakeup_latency_gran(int latency_weight)
> > > > > > +{
> > > > > > + long thresh = sysctl_sched_latency;
> > > > > > +
> > > > > > + if (!latency_weight)
> > > > > > + return 0;
> > > > > > +
> > > > > > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > > > > > + thresh >>= 1;
> > > > > > +
> > > > > > + /*
> > > > > > + * Clamp the delta to stay in the scheduler period range
> > > > > > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > > > > > + */
> > > > > > + latency_weight = clamp_t(long, latency_weight,
> > > > > > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > > > > > + NICE_LATENCY_WEIGHT_MAX);
> > > > > > +
> > > > > > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > > > > > +}
> > > > > > +
> > > > > > static unsigned long wakeup_gran(struct sched_entity *se)
> > > > > > {
> > > > > > unsigned long gran = sysctl_sched_wakeup_granularity;
> > > > > > @@ -7008,6 +7059,10 @@ static int
> > > > > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > > > > {
> > > > > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > > > > + int latency_weight = se->latency_weight - curr->latency_weight;
> > > > > > +
> > > > > > + latency_weight = min(latency_weight, se->latency_weight);
> > > > >
> > > > > Let lable A=se->latency_weight, B=curr->latency_weight, C=A-B.
> > > > >
> > > > > 1 A>0 B>0
> > > > > ::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
> > > > > what A is. That means it can be very 'long'(-sched_latency) and vdiff is
> > > > > more possible to be in <= 0 case and return -1.
> > > > > ::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limit to
> > > >
> > > > A>0 and B>0 then min(C=A-B, A) = C
> >
> > It's my mistake. And in this case the calculating of value added to vdiff
> > for latency increase and deleted to vdiff for latency decrease is the same.
> >
> > > >
> > > > > A/1024*sched_latency.
> > > > > When latecy is decreased, the negtive value added to vdiff is larger than the
> > > > > positive value added to vdiff when latency is increased.
> > > >
> > > > When the weight > 0, the tasks have some latency requirements so we
> > > > use their relative priority to decide if we can preempt current which
> > > > also has some latency requirement
> > > >
> > > > >
> > > > > 2 A>0 B<0
> > > > > ::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
> > >
> > > For this one we want to use delta like for UC 1 above
> > >
> > > > >
> > > > > 3 A<0 B<0
> > > > > ::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0, I think if latency
> > > > > increase means the value added to vdiff will be a positive value to increase
> > > > > the chance to return 1. I would say it is max(C,A)=C
> > > > > ::C=A-B<0, min(C,A)=A, latency decrease and the real negtive value.
> > > >
> > > > When the weight < 0, the tasks haven't latency requirement and even
> > > > don't care of being scheduled "quickly". In this case, we don't care
> > > > about the relative priority but we want to minimize the preemption of
> > > > current so we use the weight
> > > >
> > > > >
> > > > > 4 A<0,B>0
> > > > > ::C=A-B<0, min(C,A)=C, latency decrease and the real negtive value.
> > >
> > > And for this one we probably want to use A like for other UC with A < 0
> > >
> > > I'm going to update the way the weight is computed to match this
> >
> > According to your explanations, the min(C,A) is used for A and B both in
> > the negtive region or in the postive region, the max(C,A) is use for A and
> > B both not in the same region.
> >
> > if ((A>>31)^(B>>31))
> > max(C,A)
> > else
> > min(C,A)
>
> Not right.
>
> if ((A&(1<<31))^(B(1<<31)))
> max(C,A)
> else
> min(C,A)
>
> >
> > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > {
> > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > int latency_weight = se->latency_weight - curr->latency_weight;
> >
> > if ((se->latency_weight>>(WMULT_SHIFT-1)) ^
> > curr->latency_weight>>(WMULT_SHIFT-1))
> > latency_weight = max(latency_weight, se->latency_weight);
> > else
> > latency_weight = min(latency_weight, se->latency_weight);
> >
> > vdiff += wakeup_latency_gran(latency_weight);
> > ...
> > }
>
> Not right.
>
> wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> {
> s64 gran, vdiff = curr->vruntime - se->vruntime;
> int latency_weight = se->latency_weight - curr->latency_weight;
>
> if ((se->latency_weight & (1 << WMULT_SHIFT-1)) ^
> curr->latency_weight & (1 << WMULT_SHIFT-1))
> latency_weight = max(latency_weight, se->latency_weight);
> else
> latency_weight = min(latency_weight, se->latency_weight);
>
> vdiff += wakeup_latency_gran(latency_weight);
> ...
> }
I'm already in bed, but I think they are the same.. heh
> > > > >
> > > > > Is there a reason that the value when latency increase and latency decrease
> > > > > be treated differently. Latency increase value is limited to se's latency_weight
> > > >
> > > > I have tried to explain why I treat differently if weight is > 0 or < 0.
> > > >
> > > > > but latency decrease value can extend to -sched_latency or treat them the same.
> > > > > Penalty latency decrease and conserve latency increase.
> > > > >
> > > > >
> > > > > There is any value that this latency_weight can be considered to be a average.
> > > > >
> > > > > The delta value choose is ~%5 to 1024. %5*sched_latency=0.05*6ms=0.3ms.(no scale)
> > > > > I do not think over that vdiff equation and others though.
> > > > >
> > > > > Thanks,
> > > > > Tao
> > > > > > + vdiff += wakeup_latency_gran(latency_weight);
> > > > > >
> > > > > > if (vdiff <= 0)
> > > > > > return -1;
> > > > > > @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> > > > > > return;
> > > > > >
> > > > > > update_curr(cfs_rq_of(se));
> > > > > > +
> > > > > > if (wakeup_preempt_entity(se, pse) == 1) {
> > > > > > /*
> > > > > > * Bias pick_next to pick the sched entity that is
> > > > > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > > > > > index 456ad2159eb1..dd92aa9c36f9 100644
> > > > > > --- a/kernel/sched/sched.h
> > > > > > +++ b/kernel/sched/sched.h
> > > > > > @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> > > > > > * Default tasks should be treated as a task with latency_nice = 0.
> > > > > > */
> > > > > > #define DEFAULT_LATENCY_NICE 0
> > > > > > +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> > > > > > +
> > > > > > +/*
> > > > > > + * Convert user-nice values [ -20 ... 0 ... 19 ]
> > > > > > + * to static latency [ 0..39 ],
> > > > > > + * and back.
> > > > > > + */
> > > > > > +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> > > > > > +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> > > > > > +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> > > > > > +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
> > > > > >
> > > > > > /*
> > > > > > * Increase resolution of nice-level calculations for 64-bit architectures.
> > > > > > @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
> > > > > >
> > > > > > extern const int sched_prio_to_weight[40];
> > > > > > extern const u32 sched_prio_to_wmult[40];
> > > > > > +extern const int sched_latency_to_weight[40];
> > > > > >
> > > > > > /*
> > > > > > * {de,en}queue flags:
> > > > > > --
> > > > > > 2.17.1
> > > > > >
Hi Tao,
On Sun, 1 May 2022 at 17:58, Tao Zhou <[email protected]> wrote:
>
> Hi Vincent,
>
> Change to Valentin Schneider's now using mail address.
Thanks
>
> On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
>
> > Take into account the nice latency priority of a thread when deciding to
> > preempt the current running thread. We don't want to provide more CPU
> > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > task first whenever possible.
> >
> > As long as a thread didn't use its bandwidth, it will be able to preempt
> > the current thread.
> >
> > At the opposite, a thread with a low latency priority will preempt current
> > thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> > wait for the tick to get its sched slice.
> >
> > curr vruntime
> > |
> > sysctl_sched_wakeup_granularity
> > <-->
> > ----------------------------------|----|-----------------------|---------------
> > | |<--------------------->
> > | . sysctl_sched_latency
> > | .
> > default/current latency entity | .
> > | .
> > 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> > se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> > | .
> > | .
> > | .
> > low latency entity | .
> > ---------------------->|
> > % of sysctl_sched_latency |
> > 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> > preempt ------------------------------------------------->|<- do not preempt --
> > | .
> > | .
> > | .
> > high latency entity | .
> > |<-----------------------| .
> > | % of sysctl_sched_latency .
> > 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> > preempt->|<- se doesn't preempt curr ------------------------------------------
> >
> > Tests results of nice latency impact on heavy load like hackbench:
> >
> > hackbench -l (2560 / group) -g group
> > group latency 0 latency 19
> > 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> > 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> > 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> > 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
> >
> > hackbench -p -l (2560 / group) -g group
> > group
> > 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> > 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> > 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> > 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
> >
> > By deacreasing the latency prio, we reduce the number of preemption at
>
> s/deacreasing/decreasing/
yes
> s/reduce/increase/
not in the case of the hackbench tests above. By decreasing the latency
prio of all hackbench threads, we make sure that they will not preempt
the current thread and let it move forward, so we reduce the number of
preemptions.
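
As a rough illustration of the mechanism, here is a back-of-the-envelope
sketch assuming the default 6ms sysctl_sched_latency with
GENTLE_FAIR_SLEEPERS enabled (so thresh = 3ms) and weights taken from the
sched_latency_to_weight[] table in the patch; it mirrors the
wakeup_latency_gran() arithmetic to show the offset applied to vdiff:

#include <stdio.h>

#define NICE_LATENCY_SHIFT	10	/* SCHED_FIXEDPOINT_SHIFT */

/*
 * Mirrors the wakeup_latency_gran() expression from the patch, with thresh
 * hard-coded to 3ms (6ms sysctl_sched_latency halved by
 * GENTLE_FAIR_SLEEPERS) and expressed in nanoseconds.
 */
static long long latency_gran_ns(long long latency_weight)
{
	long long thresh = 3000000LL;

	return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
}

int main(void)
{
	/* sched_latency_to_weight[] entries for latency nice -20, 0 and 19 */
	printf("latency nice -20: %+lld us\n", latency_gran_ns(1024) / 1000);
	printf("latency nice   0: %+lld us\n", latency_gran_ns(0) / 1000);
	printf("latency nice  19: %+lld us\n", latency_gran_ns(-973) / 1000);

	return 0;
}

Under these assumptions, a latency nice of 19 subtracts roughly 2.85ms from
vdiff, so a waking hackbench thread only preempts curr once curr is already
far behind on vruntime, while a latency nice of -20 adds about 3ms and lets
a task like cyclictest preempt almost immediately.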
>
> > wakeup and help hackbench making progress.
> >
> > Test results of nice latency impact on short live load like cyclictest
> > while competing with heavy load like hackbench:
> >
> > hackbench -l 10000 -g group &
> > cyclictest --policy other -D 5 -q -n
> > latency 0 latency -20
> > group min avg max min avg max
> > 0 16 17 28 15 17 27
> > 1 61 382 10603 63 89 4628
> > 4 52 437 15455 54 98 16238
> > 8 56 728 38499 61 125 28983
> > 16 53 1215 52207 61 183 80751
> >
> > group = 0 means that hackbench is not running.
> >
> > The avg is significantly improved with nice latency -20 especially with
> > large number of groups but min and max remain quite similar. If we add the
> > histogram parameters to get details of latency, we have :
> >
> > hackbench -l 10000 -g 16 &
> > cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> > latency 0 latency -20
> > Min Latencies: 63 62
> > Avg Latencies: 1397 218
> > Max Latencies: 44926 42291
> > 50% latencies: 100 98
> > 75% latencies: 762 116
> > 85% latencies: 1118 126
> > 90% latencies: 1540 130
> > 95% latencies: 5610 138
> > 99% latencies: 13738 266
> >
> > With percentile details, we see the benefit of nice latency -20 as
> > 1% of the latencies stays above 266us whereas the default latency has
> > got 25% are above 762us and 10% over the 1ms.
> >
[..]
> > +static long wakeup_latency_gran(int latency_weight)
> > +{
> > + long thresh = sysctl_sched_latency;
> > +
> > + if (!latency_weight)
> > + return 0;
> > +
> > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > + thresh >>= 1;
> > +
> > + /*
> > + * Clamp the delta to stay in the scheduler period range
> > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > + */
> > + latency_weight = clamp_t(long, latency_weight,
> > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > + NICE_LATENCY_WEIGHT_MAX);
> > +
> > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > +}
> > +
> > static unsigned long wakeup_gran(struct sched_entity *se)
> > {
> > unsigned long gran = sysctl_sched_wakeup_granularity;
> > @@ -7008,6 +7059,10 @@ static int
> > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > {
> > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > + int latency_weight = se->latency_weight - curr->latency_weight;
> > +
> > + latency_weight = min(latency_weight, se->latency_weight);
>
> Let lable A=se->latency_weight, B=curr->latency_weight, C=A-B.
>
> 1 A>0 B>0
> ::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
> what A is. That means it can be very 'long'(-sched_latency) and vdiff is
> more possible to be in <= 0 case and return -1.
> ::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limit to
A>0 and B>0 then min(C=A-B, A) = C
> A/1024*sched_latency.
> When latecy is decreased, the negtive value added to vdiff is larger than the
> positive value added to vdiff when latency is increased.
When the weight > 0, the tasks have some latency requirements, so we
use their relative priority to decide if we can preempt current, which
also has some latency requirement
>
> 2 A>0 B<0
> ::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
>
> 3 A<0 B<0
> ::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0, I think if latency
> increase means the value added to vdiff will be a positive value to increase
> the chance to return 1. I would say it is max(C,A)=C
> ::C=A-B<0, min(C,A)=A, latency decrease and the real negtive value.
When the weight < 0, the tasks have no latency requirement and don't
even care about being scheduled "quickly". In this case, we don't care
about the relative priority but we want to minimize the preemption of
current, so we use the weight
>
> 4 A<0,B>0
> ::C=A-B<0, min(C,A)=C, latency decrease and the real negtive value.
>
> Is there a reason that the value when latency increase and latency decrease
> be treated differently. Latency increase value is limited to se's latency_weight
I have tried to explain why I treat the weight differently depending on whether it is > 0 or < 0.
> but latency decrease value can extend to -sched_latency or treat them the same.
> Penalty latency decrease and conserve latency increase.
>
>
> There is any value that this latency_weight can be considered to be a average.
>
> The delta value choose is ~%5 to 1024. %5*sched_latency=0.05*6ms=0.3ms.(no scale)
> I do not think over that vdiff equation and others though.
>
> Thanks,
> Tao
> > + vdiff += wakeup_latency_gran(latency_weight);
> >
> > if (vdiff <= 0)
> > return -1;
> > @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> > return;
> >
> > update_curr(cfs_rq_of(se));
> > +
> > if (wakeup_preempt_entity(se, pse) == 1) {
> > /*
> > * Bias pick_next to pick the sched entity that is
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 456ad2159eb1..dd92aa9c36f9 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> > * Default tasks should be treated as a task with latency_nice = 0.
> > */
> > #define DEFAULT_LATENCY_NICE 0
> > +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> > +
> > +/*
> > + * Convert user-nice values [ -20 ... 0 ... 19 ]
> > + * to static latency [ 0..39 ],
> > + * and back.
> > + */
> > +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> > +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> > +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> > +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
> >
> > /*
> > * Increase resolution of nice-level calculations for 64-bit architectures.
> > @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
> >
> > extern const int sched_prio_to_weight[40];
> > extern const u32 sched_prio_to_wmult[40];
> > +extern const int sched_latency_to_weight[40];
> >
> > /*
> > * {de,en}queue flags:
> > --
> > 2.17.1
> >
On Mon, May 02, 2022 at 02:30:55PM +0200, Vincent Guittot wrote:
> On Mon, 2 May 2022 at 11:54, Vincent Guittot <[email protected]> wrote:
> >
> > Hi Tao,
> >
> > On Sun, 1 May 2022 at 17:58, Tao Zhou <[email protected]> wrote:
> > >
> > > Hi Vincent,
> > >
> > > Change to Valentin Schneider's now using mail address.
> >
> > Thanks
> >
> > >
> > > On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
> > >
> > > > Take into account the nice latency priority of a thread when deciding to
> > > > preempt the current running thread. We don't want to provide more CPU
> > > > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > > > task first whenever possible.
> > > >
> > > > As long as a thread didn't use its bandwidth, it will be able to preempt
> > > > the current thread.
> > > >
> > > > At the opposite, a thread with a low latency priority will preempt current
> > > > thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> > > > wait for the tick to get its sched slice.
> > > >
> > > > curr vruntime
> > > > |
> > > > sysctl_sched_wakeup_granularity
> > > > <-->
> > > > ----------------------------------|----|-----------------------|---------------
> > > > | |<--------------------->
> > > > | . sysctl_sched_latency
> > > > | .
> > > > default/current latency entity | .
> > > > | .
> > > > 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> > > > se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> > > > | .
> > > > | .
> > > > | .
> > > > low latency entity | .
> > > > ---------------------->|
> > > > % of sysctl_sched_latency |
> > > > 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> > > > preempt ------------------------------------------------->|<- do not preempt --
> > > > | .
> > > > | .
> > > > | .
> > > > high latency entity | .
> > > > |<-----------------------| .
> > > > | % of sysctl_sched_latency .
> > > > 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> > > > preempt->|<- se doesn't preempt curr ------------------------------------------
> > > >
> > > > Tests results of nice latency impact on heavy load like hackbench:
> > > >
> > > > hackbench -l (2560 / group) -g group
> > > > group latency 0 latency 19
> > > > 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> > > > 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> > > > 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> > > > 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
> > > >
> > > > hackbench -p -l (2560 / group) -g group
> > > > group
> > > > 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> > > > 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> > > > 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> > > > 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
> > > >
> > > > By deacreasing the latency prio, we reduce the number of preemption at
> > >
> > > s/deacreasing/decreasing/
> >
> > yes
> >
> > > s/reduce/increase/
> >
> > not in the case of hackbench tests above. By decreasing the latency
> > prio of all hackbench threads, we make sure that they will not preempt
> > the current thread and let it move forward so we reduce the number of
> > preemption.
> >
> > >
> > > > wakeup and help hackbench making progress.
> > > >
> > > > Test results of nice latency impact on short live load like cyclictest
> > > > while competing with heavy load like hackbench:
> > > >
> > > > hackbench -l 10000 -g group &
> > > > cyclictest --policy other -D 5 -q -n
> > > > latency 0 latency -20
> > > > group min avg max min avg max
> > > > 0 16 17 28 15 17 27
> > > > 1 61 382 10603 63 89 4628
> > > > 4 52 437 15455 54 98 16238
> > > > 8 56 728 38499 61 125 28983
> > > > 16 53 1215 52207 61 183 80751
> > > >
> > > > group = 0 means that hackbench is not running.
> > > >
> > > > The avg is significantly improved with nice latency -20 especially with
> > > > large number of groups but min and max remain quite similar. If we add the
> > > > histogram parameters to get details of latency, we have :
> > > >
> > > > hackbench -l 10000 -g 16 &
> > > > cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> > > > latency 0 latency -20
> > > > Min Latencies: 63 62
> > > > Avg Latencies: 1397 218
> > > > Max Latencies: 44926 42291
> > > > 50% latencies: 100 98
> > > > 75% latencies: 762 116
> > > > 85% latencies: 1118 126
> > > > 90% latencies: 1540 130
> > > > 95% latencies: 5610 138
> > > > 99% latencies: 13738 266
> > > >
> > > > With percentile details, we see the benefit of nice latency -20 as
> > > > 1% of the latencies stays above 266us whereas the default latency has
> > > > got 25% are above 762us and 10% over the 1ms.
> > > >
> >
> > [..]
> >
> > > > +static long wakeup_latency_gran(int latency_weight)
> > > > +{
> > > > + long thresh = sysctl_sched_latency;
> > > > +
> > > > + if (!latency_weight)
> > > > + return 0;
> > > > +
> > > > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > > > + thresh >>= 1;
> > > > +
> > > > + /*
> > > > + * Clamp the delta to stay in the scheduler period range
> > > > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > > > + */
> > > > + latency_weight = clamp_t(long, latency_weight,
> > > > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > > > + NICE_LATENCY_WEIGHT_MAX);
> > > > +
> > > > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > > > +}
> > > > +
> > > > static unsigned long wakeup_gran(struct sched_entity *se)
> > > > {
> > > > unsigned long gran = sysctl_sched_wakeup_granularity;
> > > > @@ -7008,6 +7059,10 @@ static int
> > > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > > {
> > > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > > + int latency_weight = se->latency_weight - curr->latency_weight;
> > > > +
> > > > + latency_weight = min(latency_weight, se->latency_weight);
> > >
> > > Let lable A=se->latency_weight, B=curr->latency_weight, C=A-B.
> > >
> > > 1 A>0 B>0
> > > ::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
> > > what A is. That means it can be very 'long'(-sched_latency) and vdiff is
> > > more possible to be in <= 0 case and return -1.
> > > ::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limit to
> >
> > A>0 and B>0 then min(C=A-B, A) = C
It's my mistake. And in this case the value added to vdiff for a latency increase
and the value subtracted from vdiff for a latency decrease are computed the same way.
> >
> > > A/1024*sched_latency.
> > > When latecy is decreased, the negtive value added to vdiff is larger than the
> > > positive value added to vdiff when latency is increased.
> >
> > When the weight > 0, the tasks have some latency requirements so we
> > use their relative priority to decide if we can preempt current which
> > also has some latency requirement
> >
> > >
> > > 2 A>0 B<0
> > > ::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
>
> For this one we want to use delta like for UC 1 above
>
> > >
> > > 3 A<0 B<0
> > > ::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0, I think if latency
> > > increase means the value added to vdiff will be a positive value to increase
> > > the chance to return 1. I would say it is max(C,A)=C
> > > ::C=A-B<0, min(C,A)=A, latency decrease and the real negtive value.
> >
> > When the weight < 0, the tasks haven't latency requirement and even
> > don't care of being scheduled "quickly". In this case, we don't care
> > about the relative priority but we want to minimize the preemption of
> > current so we use the weight
> >
> > >
> > > 4 A<0,B>0
> > > ::C=A-B<0, min(C,A)=C, latency decrease and the real negtive value.
>
> And for this one we probably want to use A like for other UC with A < 0
>
> I'm going to update the way the weight is computed to match this
According to your explanations, min(C,A) is used when A and B are both in
the negative region or both in the positive region, and max(C,A) is used
when A and B are not in the same region.
if ((A>>31)^(B>>31))
	max(C,A)
else
	min(C,A)

wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	s64 gran, vdiff = curr->vruntime - se->vruntime;
	int latency_weight = se->latency_weight - curr->latency_weight;

	if ((se->latency_weight>>(WMULT_SHIFT-1)) ^
	    curr->latency_weight>>(WMULT_SHIFT-1))
		latency_weight = max(latency_weight, se->latency_weight);
	else
		latency_weight = min(latency_weight, se->latency_weight);

	vdiff += wakeup_latency_gran(latency_weight);
	...
}
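
For what it's worth, here is a tiny standalone check (assuming 32-bit int,
arithmetic right shift for negative values, and WMULT_SHIFT == 32 as in
kernel/sched/fair.c) suggesting that the shift-based test above and the
mask-based variant proposed in the follow-up select the same branch for
every sign combination:

#include <stdio.h>

#define WMULT_SHIFT 32

/* assumes arithmetic right shift for negative values, as the kernel does */
static int shift_test(int a, int b)
{
	return (a >> (WMULT_SHIFT - 1)) ^ (b >> (WMULT_SHIFT - 1));
}

static int mask_test(int a, int b)
{
	unsigned int sign = 1u << (WMULT_SHIFT - 1);

	return ((a & sign) ^ (b & sign)) != 0;
}

int main(void)
{
	int vals[] = { 1024, 512, 0, -512, -973 };
	int n = sizeof(vals) / sizeof(vals[0]);

	for (int i = 0; i < n; i++)
		for (int j = 0; j < n; j++)
			printf("A=%5d B=%5d  shift:%d  mask:%d\n",
			       vals[i], vals[j],
			       !!shift_test(vals[i], vals[j]),
			       !!mask_test(vals[i], vals[j]));

	return 0;
}

Both expressions are nonzero exactly when the two weights have different
sign bits, so either form picks max() for mixed signs and min() otherwise.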
> > >
> > > Is there a reason that the value when latency increase and latency decrease
> > > be treated differently. Latency increase value is limited to se's latency_weight
> >
> > I have tried to explain why I treat differently if weight is > 0 or < 0.
> >
> > > but latency decrease value can extend to -sched_latency or treat them the same.
> > > Penalty latency decrease and conserve latency increase.
> > >
> > >
> > > There is any value that this latency_weight can be considered to be a average.
> > >
> > > The delta value choose is ~%5 to 1024. %5*sched_latency=0.05*6ms=0.3ms.(no scale)
> > > I do not think over that vdiff equation and others though.
> > >
> > > Thanks,
> > > Tao
> > > > + vdiff += wakeup_latency_gran(latency_weight);
> > > >
> > > > if (vdiff <= 0)
> > > > return -1;
> > > > @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> > > > return;
> > > >
> > > > update_curr(cfs_rq_of(se));
> > > > +
> > > > if (wakeup_preempt_entity(se, pse) == 1) {
> > > > /*
> > > > * Bias pick_next to pick the sched entity that is
> > > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > > > index 456ad2159eb1..dd92aa9c36f9 100644
> > > > --- a/kernel/sched/sched.h
> > > > +++ b/kernel/sched/sched.h
> > > > @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> > > > * Default tasks should be treated as a task with latency_nice = 0.
> > > > */
> > > > #define DEFAULT_LATENCY_NICE 0
> > > > +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> > > > +
> > > > +/*
> > > > + * Convert user-nice values [ -20 ... 0 ... 19 ]
> > > > + * to static latency [ 0..39 ],
> > > > + * and back.
> > > > + */
> > > > +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> > > > +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> > > > +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> > > > +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
> > > >
> > > > /*
> > > > * Increase resolution of nice-level calculations for 64-bit architectures.
> > > > @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
> > > >
> > > > extern const int sched_prio_to_weight[40];
> > > > extern const u32 sched_prio_to_wmult[40];
> > > > +extern const int sched_latency_to_weight[40];
> > > >
> > > > /*
> > > > * {de,en}queue flags:
> > > > --
> > > > 2.17.1
> > > >
On Mon, May 02, 2022 at 06:21:34PM +0200, Vincent Guittot wrote:
> On Monday 02 May 2022 at 23:47:32 (+0800), Tao Zhou wrote:
> > On Mon, May 02, 2022 at 11:26:14PM +0800, Tao Zhou wrote:
> > > On Mon, May 02, 2022 at 11:08:15PM +0800, Tao Zhou wrote:
> > > > On Mon, May 02, 2022 at 02:30:55PM +0200, Vincent Guittot wrote:
> > > >
> > > > > On Mon, 2 May 2022 at 11:54, Vincent Guittot <[email protected]> wrote:
> > > > > >
> > > > > > Hi Tao,
> > > > > >
> > > > > > On Sun, 1 May 2022 at 17:58, Tao Zhou <[email protected]> wrote:
> > > > > > >
> > > > > > > Hi Vincent,
> > > > > > >
> > > > > > > Change to Valentin Schneider's now using mail address.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
> > > > > > >
> > > > > > > > Take into account the nice latency priority of a thread when deciding to
> > > > > > > > preempt the current running thread. We don't want to provide more CPU
> > > > > > > > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > > > > > > > task first whenever possible.
> > > > > > > >
> > > > > > > > As long as a thread didn't use its bandwidth, it will be able to preempt
> > > > > > > > the current thread.
> > > > > > > >
> > > > > > > > At the opposite, a thread with a low latency priority will preempt current
> > > > > > > > thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> > > > > > > > wait for the tick to get its sched slice.
> > > > > > > >
> > > > > > > > curr vruntime
> > > > > > > > |
> > > > > > > > sysctl_sched_wakeup_granularity
> > > > > > > > <-->
> > > > > > > > ----------------------------------|----|-----------------------|---------------
> > > > > > > > | |<--------------------->
> > > > > > > > | . sysctl_sched_latency
> > > > > > > > | .
> > > > > > > > default/current latency entity | .
> > > > > > > > | .
> > > > > > > > 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> > > > > > > > se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> > > > > > > > | .
> > > > > > > > | .
> > > > > > > > | .
> > > > > > > > low latency entity | .
> > > > > > > > ---------------------->|
> > > > > > > > % of sysctl_sched_latency |
> > > > > > > > 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> > > > > > > > preempt ------------------------------------------------->|<- do not preempt --
> > > > > > > > | .
> > > > > > > > | .
> > > > > > > > | .
> > > > > > > > high latency entity | .
> > > > > > > > |<-----------------------| .
> > > > > > > > | % of sysctl_sched_latency .
> > > > > > > > 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> > > > > > > > preempt->|<- se doesn't preempt curr ------------------------------------------
> > > > > > > >
> > > > > > > > Tests results of nice latency impact on heavy load like hackbench:
> > > > > > > >
> > > > > > > > hackbench -l (2560 / group) -g group
> > > > > > > > group latency 0 latency 19
> > > > > > > > 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> > > > > > > > 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> > > > > > > > 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> > > > > > > > 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
> > > > > > > >
> > > > > > > > hackbench -p -l (2560 / group) -g group
> > > > > > > > group
> > > > > > > > 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> > > > > > > > 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> > > > > > > > 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> > > > > > > > 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
> > > > > > > >
> > > > > > > > By deacreasing the latency prio, we reduce the number of preemption at
> > > > > > >
> > > > > > > s/deacreasing/decreasing/
> > > > > >
> > > > > > yes
> > > > > >
> > > > > > > s/reduce/increase/
> > > > > >
> > > > > > not in the case of hackbench tests above. By decreasing the latency
> > > > > > prio of all hackbench threads, we make sure that they will not preempt
> > > > > > the current thread and let it move forward so we reduce the number of
> > > > > > preemption.
> > > > > >
> > > > > > >
> > > > > > > > wakeup and help hackbench making progress.
> > > > > > > >
> > > > > > > > Test results of nice latency impact on short live load like cyclictest
> > > > > > > > while competing with heavy load like hackbench:
> > > > > > > >
> > > > > > > > hackbench -l 10000 -g group &
> > > > > > > > cyclictest --policy other -D 5 -q -n
> > > > > > > > latency 0 latency -20
> > > > > > > > group min avg max min avg max
> > > > > > > > 0 16 17 28 15 17 27
> > > > > > > > 1 61 382 10603 63 89 4628
> > > > > > > > 4 52 437 15455 54 98 16238
> > > > > > > > 8 56 728 38499 61 125 28983
> > > > > > > > 16 53 1215 52207 61 183 80751
> > > > > > > >
> > > > > > > > group = 0 means that hackbench is not running.
> > > > > > > >
> > > > > > > > The avg is significantly improved with nice latency -20 especially with
> > > > > > > > large number of groups but min and max remain quite similar. If we add the
> > > > > > > > histogram parameters to get details of latency, we have :
> > > > > > > >
> > > > > > > > hackbench -l 10000 -g 16 &
> > > > > > > > cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> > > > > > > > latency 0 latency -20
> > > > > > > > Min Latencies: 63 62
> > > > > > > > Avg Latencies: 1397 218
> > > > > > > > Max Latencies: 44926 42291
> > > > > > > > 50% latencies: 100 98
> > > > > > > > 75% latencies: 762 116
> > > > > > > > 85% latencies: 1118 126
> > > > > > > > 90% latencies: 1540 130
> > > > > > > > 95% latencies: 5610 138
> > > > > > > > 99% latencies: 13738 266
> > > > > > > >
> > > > > > > > With percentile details, we see the benefit of nice latency -20 as
> > > > > > > > 1% of the latencies stays above 266us whereas the default latency has
> > > > > > > > got 25% are above 762us and 10% over the 1ms.
> > > > > > > >
> > > > > >
> > > > > > [..]
> > > > > >
> > > > > > > > +static long wakeup_latency_gran(int latency_weight)
> > > > > > > > +{
> > > > > > > > + long thresh = sysctl_sched_latency;
> > > > > > > > +
> > > > > > > > + if (!latency_weight)
> > > > > > > > + return 0;
> > > > > > > > +
> > > > > > > > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > > > > > > > + thresh >>= 1;
> > > > > > > > +
> > > > > > > > + /*
> > > > > > > > + * Clamp the delta to stay in the scheduler period range
> > > > > > > > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > > > > > > > + */
> > > > > > > > + latency_weight = clamp_t(long, latency_weight,
> > > > > > > > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > > > > > > > + NICE_LATENCY_WEIGHT_MAX);
> > > > > > > > +
> > > > > > > > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > static unsigned long wakeup_gran(struct sched_entity *se)
> > > > > > > > {
> > > > > > > > unsigned long gran = sysctl_sched_wakeup_granularity;
> > > > > > > > @@ -7008,6 +7059,10 @@ static int
> > > > > > > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > > > > > > {
> > > > > > > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > > > > > > + int latency_weight = se->latency_weight - curr->latency_weight;
> > > > > > > > +
> > > > > > > > + latency_weight = min(latency_weight, se->latency_weight);
> > > > > > >
> > > > > > > Let label A=se->latency_weight, B=curr->latency_weight, C=A-B.
> > > > > > >
> > > > > > > 1 A>0 B>0
> > > > > > > ::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
> > > > > > > what A is. That means it can be very 'long' (down to -sched_latency) and vdiff
> > > > > > > is more likely to fall into the <= 0 case and return -1.
> > > > > > > ::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limited to
> > > > > >
> > > > > > A>0 and B>0 then min(C=A-B, A) = C
> > > >
> > > > It's my mistake. And in this case the value added to vdiff for a latency
> > > > increase and the value subtracted from vdiff for a latency decrease are
> > > > computed the same way.
> > > >
> > > > > >
> > > > > > > A/1024*sched_latency.
> > > > > > > When latency is decreased, the negative value added to vdiff is larger (in
> > > > > > > magnitude) than the positive value added to vdiff when latency is increased.
> > > > > >
> > > > > > When the weight > 0, the tasks have some latency requirements, so we
> > > > > > use their relative priority to decide if we can preempt current, which
> > > > > > also has some latency requirement.
> > > > > >
> > > > > > >
> > > > > > > 2 A>0 B<0
> > > > > > > ::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
> > > > >
> > > > > For this one we want to use delta like for UC 1 above
> > > > >
> > > > > > >
> > > > > > > 3 A<0 B<0
> > > > > > > ::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0. I think a latency
> > > > > > > increase should mean the value added to vdiff is positive, to increase
> > > > > > > the chance of returning 1. I would say it should be max(C,A)=C.
> > > > > > > ::C=A-B<0, min(C,A)=A, latency decrease and the real negative value.
> > > > > >
> > > > > > When the weight < 0, the tasks have no latency requirement and don't
> > > > > > even care about being scheduled "quickly". In this case, we don't care
> > > > > > about the relative priority, but we want to minimize the preemption of
> > > > > > current, so we use the weight.
> > > > > >
> > > > > > >
> > > > > > > 4 A<0,B>0
> > > > > > > ::C=A-B<0, min(C,A)=C, latency decrease and the real negative value.
> > > > >
> > > > > And for this one we probably want to use A like for other UC with A < 0
> > > > >
> > > > > I'm going to update the way the weight is computed to match this
> > > >
> > > > According to your explanations, min(C,A) is used when A and B are both in
> > > > the negative region or both in the positive region, and max(C,A) is used
> > > > when A and B are not in the same region.
> > > >
> > > > if ((A>>31)^(B>>31))
> > > > max(C,A)
> > > > else
> > > > min(C,A)
> > >
> > > Not right.
> > >
> > > if ((A&(1<<31))^(B&(1<<31)))
> > > max(C,A)
> > > else
> > > min(C,A)
> > >
> > > >
> > > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > > {
> > > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > > int latency_weight = se->latency_weight - curr->latency_weight;
> > > >
> > > > if ((se->latency_weight>>(WMULT_SHIFT-1)) ^
> > > > curr->latency_weight>>(WMULT_SHIFT-1))
> > > > latency_weight = max(latency_weight, se->latency_weight);
> > > > else
> > > > latency_weight = min(latency_weight, se->latency_weight);
> > > >
> > > > vdiff += wakeup_latency_gran(latency_weight);
> > > > ...
> > > > }
> > >
> > > Not right.
> > >
> > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > {
> > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > int latency_weight = se->latency_weight - curr->latency_weight;
> > >
> > > if ((se->latency_weight & (1 << WMULT_SHIFT-1)) ^
> > > curr->latency_weight & (1 << WMULT_SHIFT-1))
> > > latency_weight = max(latency_weight, se->latency_weight);
> > > else
> > > latency_weight = min(latency_weight, se->latency_weight);
> > >
> > > vdiff += wakeup_latency_gran(latency_weight);
> > > ...
> > > }
> >
> > I'm already in bed, but I think they are the same.. heh
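They do appear equivalent as far as the sign test is concerned; a quick standalone check (note that right-shifting a negative int is implementation-defined, so this assumes the arithmetic shift gcc and clang use in practice):

#include <stdio.h>

/* Both helpers answer "do a and b have different signs?". */
static int signs_differ_shift(int a, int b)
{
	return ((a >> 31) ^ (b >> 31)) != 0;
}

static int signs_differ_mask(int a, int b)
{
	return (((unsigned)a ^ (unsigned)b) & (1u << 31)) != 0;
}

int main(void)
{
	int pairs[][2] = { { 5, 3 }, { 5, -3 }, { -5, 3 }, { -5, -3 } };

	for (unsigned i = 0; i < 4; i++) {
		int a = pairs[i][0], b = pairs[i][1];

		printf("a=%2d b=%2d  shift:%d  mask:%d\n",
		       a, b, signs_differ_shift(a, b), signs_differ_mask(a, b));
	}
	return 0;
}

Both columns print the same value for all four sign combinations.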
>
> case 1 & 2 use delta and have A > 0
>
> case 3 & 4 use A and have A < 0
>
> so I plan to use :
>
> wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> {
> s64 gran, vdiff = curr->vruntime - se->vruntime;
> + int latency_weight = se->latency_weight;
> +
> + /*
> + * A positive latency weight means that se has some latency requirements that
> + * need to be evaluated against the current thread.
> + * Otherwise, use the latency weight to evaluate how much scheduling
> + * delay is acceptable to se.
> + */
> + if (latency_weight > 0)
> + latency_weight -= curr->latency_weight;
> +
> + vdiff += wakeup_latency_gran(latency_weight);
>
> if (vdiff <= 0)
> return -1;
Simple enough.
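To make the arithmetic of that plan concrete, a small userspace sketch follows. The constants are assumptions for illustration only: sysctl_sched_latency of 6 ms, GENTLE_FAIR_SLEEPERS enabled, and the two weights taken from the sched_latency_to_weight table of the patch (1024 for latency nice -20, -973 for latency nice 19). It mirrors the expressions quoted above; it is not the kernel code itself.

#include <stdio.h>

#define NICE_LATENCY_SHIFT	10
#define NICE_LATENCY_WEIGHT_MAX	(1L << NICE_LATENCY_SHIFT)

static long sysctl_sched_latency = 6000000;	/* 6 ms in ns, assumed default */

static long clamp_long(long v, long lo, long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Same expression as wakeup_latency_gran() in the patch; arithmetic
 * right shift of a negative value assumed, as in the kernel. */
static long wakeup_latency_gran(long latency_weight)
{
	long thresh = sysctl_sched_latency;

	if (!latency_weight)
		return 0;

	thresh >>= 1;	/* GENTLE_FAIR_SLEEPERS assumed enabled */

	latency_weight = clamp_long(latency_weight,
				    -1 * NICE_LATENCY_WEIGHT_MAX,
				    NICE_LATENCY_WEIGHT_MAX);

	return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
}

/* The planned selection: use the relative weight only when se itself has
 * a latency requirement (positive weight); otherwise keep se's own
 * negative weight, which only describes how much delay se tolerates. */
static long latency_offset(long se_weight, long curr_weight)
{
	long latency_weight = se_weight;

	if (latency_weight > 0)
		latency_weight -= curr_weight;

	return wakeup_latency_gran(latency_weight);
}

int main(void)
{
	/* latency nice -20 (weight 1024) waking against a default curr */
	printf("boost  : %+ld ns added to vdiff\n", latency_offset(1024, 0));
	/* latency nice 19 (weight -973) waking against a default curr */
	printf("penalty: %+ld ns added to vdiff\n", latency_offset(-973, 0));
	return 0;
}

So a -20 entity gets up to half a scheduling latency added to vdiff in its favour, while a 19 entity gives roughly the same amount away.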
> >
> > > > > > >
> > > > > > > Is there a reason that the values for latency increase and latency decrease
> > > > > > > are treated differently? The latency increase value is limited to se's latency_weight
> > > > > >
> > > > > > I have tried to explain why I treat differently if weight is > 0 or < 0.
> > > > > >
> > > > > > > but the latency decrease value can extend to -sched_latency; or should they be
> > > > > > > treated the same? Penalize latency decrease and conserve latency increase.
> > > > > > >
> > > > > > >
> > > > > > > Is there any value that this latency_weight can be considered to be an average?
> > > > > > >
> > > > > > > The delta value chosen is ~5% of 1024. 5%*sched_latency = 0.05*6ms = 0.3ms (no scaling).
> > > > > > > I have not thought through the vdiff equation and the rest, though.
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Tao
> > > > > > > > + vdiff += wakeup_latency_gran(latency_weight);
> > > > > > > >
> > > > > > > > if (vdiff <= 0)
> > > > > > > > return -1;
> > > > > > > > @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> > > > > > > > return;
> > > > > > > >
> > > > > > > > update_curr(cfs_rq_of(se));
> > > > > > > > +
> > > > > > > > if (wakeup_preempt_entity(se, pse) == 1) {
> > > > > > > > /*
> > > > > > > > * Bias pick_next to pick the sched entity that is
> > > > > > > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > > > > > > > index 456ad2159eb1..dd92aa9c36f9 100644
> > > > > > > > --- a/kernel/sched/sched.h
> > > > > > > > +++ b/kernel/sched/sched.h
> > > > > > > > @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> > > > > > > > * Default tasks should be treated as a task with latency_nice = 0.
> > > > > > > > */
> > > > > > > > #define DEFAULT_LATENCY_NICE 0
> > > > > > > > +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> > > > > > > > +
> > > > > > > > +/*
> > > > > > > > + * Convert user-nice values [ -20 ... 0 ... 19 ]
> > > > > > > > + * to static latency [ 0..39 ],
> > > > > > > > + * and back.
> > > > > > > > + */
> > > > > > > > +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> > > > > > > > +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> > > > > > > > +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> > > > > > > > +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
> > > > > > > >
> > > > > > > > /*
> > > > > > > > * Increase resolution of nice-level calculations for 64-bit architectures.
> > > > > > > > @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
> > > > > > > >
> > > > > > > > extern const int sched_prio_to_weight[40];
> > > > > > > > extern const u32 sched_prio_to_wmult[40];
> > > > > > > > +extern const int sched_latency_to_weight[40];
> > > > > > > >
> > > > > > > > /*
> > > > > > > > * {de,en}queue flags:
> > > > > > > > --
> > > > > > > > 2.17.1
> > > > > > > >
On Sun, May 01, 2022 at 11:58:42PM +0800, Tao Zhou wrote:
> Hi Vincent,
>
> Change to Valentin Schneider's now using mail address.
>
> On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
>
> > Take into account the nice latency priority of a thread when deciding to
> > preempt the current running thread. We don't want to provide more CPU
> > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > task first whenever possible.
> >
> > As long as a thread didn't use its bandwidth, it will be able to preempt
> > the current thread.
> >
> > At the opposite, a thread with a low latency priority will preempt current
> > thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> > wait for the tick to get its sched slice.
> >
> > curr vruntime
> > |
> > sysctl_sched_wakeup_granularity
> > <-->
> > ----------------------------------|----|-----------------------|---------------
> > | |<--------------------->
> > | . sysctl_sched_latency
> > | .
> > default/current latency entity | .
> > | .
> > 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> > se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> > | .
> > | .
> > | .
> > low latency entity | .
> > ---------------------->|
> > % of sysctl_sched_latency |
> > 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> > preempt ------------------------------------------------->|<- do not preempt --
> > | .
> > | .
> > | .
> > high latency entity | .
> > |<-----------------------| .
> > | % of sysctl_sched_latency .
> > 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> > preempt->|<- se doesn't preempt curr ------------------------------------------
> >
> > Tests results of nice latency impact on heavy load like hackbench:
> >
> > hackbench -l (2560 / group) -g group
> > group latency 0 latency 19
> > 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> > 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> > 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> > 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
> >
> > hackbench -p -l (2560 / group) -g group
> > group
> > 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> > 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> > 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> > 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
> >
> > By deacreasing the latency prio, we reduce the number of preemption at
>
> s/deacreasing/decreasing/
> s/reduce/increase/
>
> > wakeup and help hackbench making progress.
> >
> > Test results of nice latency impact on short live load like cyclictest
> > while competing with heavy load like hackbench:
> >
> > hackbench -l 10000 -g group &
> > cyclictest --policy other -D 5 -q -n
> > latency 0 latency -20
> > group min avg max min avg max
> > 0 16 17 28 15 17 27
> > 1 61 382 10603 63 89 4628
> > 4 52 437 15455 54 98 16238
> > 8 56 728 38499 61 125 28983
> > 16 53 1215 52207 61 183 80751
> >
> > group = 0 means that hackbench is not running.
> >
> > The avg is significantly improved with nice latency -20 especially with
> > large number of groups but min and max remain quite similar. If we add the
> > histogram parameters to get details of latency, we have :
> >
> > hackbench -l 10000 -g 16 &
> > cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> > latency 0 latency -20
> > Min Latencies: 63 62
> > Avg Latencies: 1397 218
> > Max Latencies: 44926 42291
> > 50% latencies: 100 98
> > 75% latencies: 762 116
> > 85% latencies: 1118 126
> > 90% latencies: 1540 130
> > 95% latencies: 5610 138
> > 99% latencies: 13738 266
> >
> > With percentile details, we see the benefit of nice latency -20 as
> > 1% of the latencies stays above 266us whereas the default latency has
> > got 25% are above 762us and 10% over the 1ms.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> > include/linux/sched.h | 4 ++-
> > init/init_task.c | 2 +-
> > kernel/sched/core.c | 32 +++++++++++++++++++----
> > kernel/sched/debug.c | 2 +-
> > kernel/sched/fair.c | 60 +++++++++++++++++++++++++++++++++++++++++--
> > kernel/sched/sched.h | 12 +++++++++
> > 6 files changed, 102 insertions(+), 10 deletions(-)
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 2aa889a59054..9aeb157e819b 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -560,6 +560,8 @@ struct sched_entity {
> > unsigned long runnable_weight;
> > #endif
> >
> > + int latency_weight;
> > +
> > #ifdef CONFIG_SMP
> > /*
> > * Per entity load average tracking.
> > @@ -779,7 +781,7 @@ struct task_struct {
> > int static_prio;
> > int normal_prio;
> > unsigned int rt_priority;
> > - int latency_nice;
> > + int latency_prio;
> >
> > struct sched_entity se;
> > struct sched_rt_entity rt;
> > diff --git a/init/init_task.c b/init/init_task.c
> > index 2afa249c253b..e98c71f24981 100644
> > --- a/init/init_task.c
> > +++ b/init/init_task.c
> > @@ -78,7 +78,7 @@ struct task_struct init_task
> > .prio = MAX_PRIO - 20,
> > .static_prio = MAX_PRIO - 20,
> > .normal_prio = MAX_PRIO - 20,
> > - .latency_nice = 0,
> > + .latency_prio = NICE_WIDTH - 20,
> > .policy = SCHED_NORMAL,
> > .cpus_ptr = &init_task.cpus_mask,
> > .user_cpus_ptr = NULL,
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 8f8b102a75c4..547b0da01efe 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1241,6 +1241,11 @@ static void set_load_weight(struct task_struct *p, bool update_load)
> > }
> > }
> >
> > +static void set_latency_weight(struct task_struct *p)
> > +{
> > + p->se.latency_weight = sched_latency_to_weight[p->latency_prio];
> > +}
> > +
> > #ifdef CONFIG_UCLAMP_TASK
> > /*
> > * Serializes updates of utilization clamp values
> > @@ -4394,7 +4399,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > p->prio = current->normal_prio;
> >
> > /* Propagate the parent's latency requirements to the child as well */
> > - p->latency_nice = current->latency_nice;
> > + p->latency_prio = current->latency_prio;
> >
> > uclamp_fork(p);
> >
> > @@ -4412,7 +4417,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > p->prio = p->normal_prio = p->static_prio;
> > set_load_weight(p, false);
> >
> > - p->latency_nice = DEFAULT_LATENCY_NICE;
> > + p->latency_prio = NICE_TO_LATENCY(0);
> > /*
> > * We don't need the reset flag anymore after the fork. It has
> > * fulfilled its duty:
> > @@ -4420,6 +4425,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > p->sched_reset_on_fork = 0;
> > }
> >
> > + /* Once latency_prio is set, update the latency weight */
> > + set_latency_weight(p);
> > +
> > if (dl_prio(p->prio))
> > return -EAGAIN;
> > else if (rt_prio(p->prio))
> > @@ -7361,7 +7369,7 @@ static int __sched_setscheduler(struct task_struct *p,
> > if (attr->sched_latency_nice < MIN_LATENCY_NICE)
> > return -EINVAL;
> > /* Use the same security checks as NICE */
> > - if (attr->sched_latency_nice < p->latency_nice &&
> > + if (attr->sched_latency_nice < LATENCY_TO_NICE(p->latency_prio) &&
> > !capable(CAP_SYS_NICE))
> > return -EPERM;
> > }
> > @@ -7401,7 +7409,7 @@ static int __sched_setscheduler(struct task_struct *p,
> > if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
> > goto change;
> > if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
> > - attr->sched_latency_nice != p->latency_nice)
> > + attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))
> > goto change;
> >
> > p->sched_reset_on_fork = reset_on_fork;
> > @@ -7942,7 +7950,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
> > get_params(p, &kattr);
> > kattr.sched_flags &= SCHED_FLAG_ALL;
> >
> > - kattr.sched_latency_nice = p->latency_nice;
> > + kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);
> >
> > #ifdef CONFIG_UCLAMP_TASK
> > /*
> > @@ -10954,6 +10962,20 @@ const u32 sched_prio_to_wmult[40] = {
> > /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
> > };
> >
> > +/*
> > + * latency weight for wakeup preemption
> > + */
> > +const int sched_latency_to_weight[40] = {
> > + /* -20 */ 1024, 973, 922, 870, 819,
> > + /* -15 */ 768, 717, 666, 614, 563,
> > + /* -10 */ 512, 461, 410, 358, 307,
> > + /* -5 */ 256, 205, 154, 102, 51,
> > + /* 0 */ 0, -51, -102, -154, -205,
> > + /* 5 */ -256, -307, -358, -410, -461,
> > + /* 10 */ -512, -563, -614, -666, -717,
> > + /* 15 */ -768, -819, -870, -922, -973,
> > +};
> > +
> > void call_trace_sched_update_nr_running(struct rq *rq, int count)
> > {
> > trace_sched_update_nr_running_tp(rq, count);
> > diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> > index 5d76a8927888..253e52ec73fb 100644
> > --- a/kernel/sched/debug.c
> > +++ b/kernel/sched/debug.c
> > @@ -1043,7 +1043,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
> > #endif
> > P(policy);
> > P(prio);
> > - P(latency_nice);
> > + P(latency_prio);
> > if (task_has_dl_policy(p)) {
> > P(dl.runtime);
> > P(dl.deadline);
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 5c4bfffe8c2c..506c482a0e48 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5555,6 +5555,35 @@ static int sched_idle_cpu(int cpu)
> > }
> > #endif
> >
> > +static void set_next_buddy(struct sched_entity *se);
> > +
> > +static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se)
> > +{
> > + struct sched_entity *next;
> > +
> > + if (se->latency_weight <= 0)
> > + return;
> > +
> > + if (cfs->nr_running <= 1)
> > + return;
> > + /*
> > + * When waking from idle, we don't need to check to preempt at wakeup
> > + * the idle thread and don't set next buddy as a candidate for being
> > + * picked in priority.
> > + * In case of simultaneous wakeup from idle, the latency sensitive tasks
> > + * lost opportunity to preempt non sensitive tasks which woke up
> > + * simultaneously.
> > + */
> > +
> > + if (cfs->next)
> > + next = cfs->next;
> > + else
> > + next = __pick_first_entity(cfs);
> > +
> > + if (next && wakeup_preempt_entity(next, se) == 1)
> > + set_next_buddy(se);
> > +}
> > +
> > /*
> > * The enqueue_task method is called before nr_running is
> > * increased. Here we update the fair scheduling stats and
> > @@ -5648,6 +5677,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > if (!task_new)
> > update_overutilized_status(rq);
> >
> > + if (rq->curr == rq->idle)
> > + check_preempt_from_idle(cfs_rq_of(&p->se), &p->se);
> > +
> > enqueue_throttle:
> > if (cfs_bandwidth_used()) {
> > /*
> > @@ -5669,8 +5701,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > hrtick_update(rq);
> > }
> >
> > -static void set_next_buddy(struct sched_entity *se);
> > -
> > /*
> > * The dequeue_task method is called before nr_running is
> > * decreased. We remove the task from the rbtree and
> > @@ -6970,6 +7000,27 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > }
> > #endif /* CONFIG_SMP */
> >
> > +static long wakeup_latency_gran(int latency_weight)
> > +{
> > + long thresh = sysctl_sched_latency;
> > +
> > + if (!latency_weight)
> > + return 0;
> > +
> > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > + thresh >>= 1;
> > +
> > + /*
> > + * Clamp the delta to stay in the scheduler period range
> > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > + */
> > + latency_weight = clamp_t(long, latency_weight,
> > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > + NICE_LATENCY_WEIGHT_MAX);
> > +
> > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > +}
> > +
> > static unsigned long wakeup_gran(struct sched_entity *se)
> > {
> > unsigned long gran = sysctl_sched_wakeup_granularity;
> > @@ -7008,6 +7059,10 @@ static int
> > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > {
> > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > + int latency_weight = se->latency_weight - curr->latency_weight;
> > +
> > + latency_weight = min(latency_weight, se->latency_weight);
>
> Let label A=se->latency_weight, B=curr->latency_weight, C=A-B.
>
> 1 A>0 B>0
> ::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
> what A is. That means it can be very 'long' (down to -sched_latency) and vdiff
> is more likely to fall into the <= 0 case and return -1.
> ::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limited to
> A/1024*sched_latency.
> When latency is decreased, the negative value added to vdiff is larger (in
> magnitude) than the positive value added to vdiff when latency is increased.
>
> 2 A>0 B<0
> ::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
>
> 3 A<0 B<0
> ::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0. I think a latency
> increase should mean the value added to vdiff is positive, to increase
> the chance of returning 1. I would say it should be max(C,A)=C.
> ::C=A-B<0, min(C,A)=A, latency decrease and the real negative value.
>
> 4 A<0,B>0
> ::C=A-B<0, min(C,A)=C, latency decrease and the real negative value.
>
> Is there a reason that the values for latency increase and latency decrease
> are treated differently? The latency increase value is limited to se's latency_weight
> but the latency decrease value can extend to -sched_latency; or should they be
> treated the same? Penalize latency decrease and conserve latency increase.
>
>
> Is there any value that this latency_weight can be considered to be an average?
>
> The delta value chosen is ~5% of 1024. 5%*sched_latency = 0.05*6ms = 0.3ms (no scaling).
> I have not thought through the vdiff equation and the rest, though.
vruntime is the accumulated, weight-scaled time history.
vdiff is the scaled time delta.
latency_gran = [delta_weight|latency_weight]/1024*thresh.
What about also scaling this latency_gran to reflect the load weight
distribution within sched_latency, like:
latency_gran*1024/load_weight = [delta_weight|latency_weight]/load_weight*thresh.
About [delta_weight|latency_weight]/load_weight: because the latency nice and prio
nice ranges are the same, a const table could be used for this calculation. But I am
not sure it is worth making this change. Just some random inputs..
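If I read that suggestion correctly, it amounts to something like the helper below; the function name is hypothetical and the NICE_0 load weight is taken as the user-visible 1024, so this is only a sketch of the idea, not code from the series.

/* Hypothetical helper for the scaling suggested above: express the
 * latency granule in the entity's virtual-time units by dividing it
 * by the entity's load weight (NICE_0 weight assumed to be 1024). */
static long scale_latency_gran_by_load(long latency_gran, unsigned long load_weight)
{
	return latency_gran * 1024 / (long)load_weight;
}

A heavier (lower nice) entity would then see a proportionally smaller granule once expressed in its virtual time.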
> Thanks,
> Tao
> > + vdiff += wakeup_latency_gran(latency_weight);
> >
> > if (vdiff <= 0)
> > return -1;
> > @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> > return;
> >
> > update_curr(cfs_rq_of(se));
> > +
> > if (wakeup_preempt_entity(se, pse) == 1) {
> > /*
> > * Bias pick_next to pick the sched entity that is
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 456ad2159eb1..dd92aa9c36f9 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> > * Default tasks should be treated as a task with latency_nice = 0.
> > */
> > #define DEFAULT_LATENCY_NICE 0
> > +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> > +
> > +/*
> > + * Convert user-nice values [ -20 ... 0 ... 19 ]
> > + * to static latency [ 0..39 ],
> > + * and back.
> > + */
> > +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> > +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> > +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> > +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
> >
> > /*
> > * Increase resolution of nice-level calculations for 64-bit architectures.
> > @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
> >
> > extern const int sched_prio_to_weight[40];
> > extern const u32 sched_prio_to_wmult[40];
> > +extern const int sched_latency_to_weight[40];
> >
> > /*
> > * {de,en}queue flags:
> > --
> > 2.17.1
> >
On Tue, 3 May 2022 at 04:29, Tao Zhou <[email protected]> wrote:
>
> On Sun, May 01, 2022 at 11:58:42PM +0800, Tao Zhou wrote:
> > Hi Vincent,
> >
> > Change to Valentin Schneider's now using mail address.
> >
> > On Fri, Mar 11, 2022 at 05:14:05PM +0100, Vincent Guittot wrote:
> >
> > > Take into account the nice latency priority of a thread when deciding to
> > > preempt the current running thread. We don't want to provide more CPU
> > > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > > task first whenever possible.
> > >
> > > As long as a thread didn't use its bandwidth, it will be able to preempt
> > > the current thread.
> > >
> > > At the opposite, a thread with a low latency priority will preempt current
> > > thread at wakeup only to keep fair CPU bandwidth sharing. Otherwise it will
> > > wait for the tick to get its sched slice.
> > >
> > > curr vruntime
> > > |
> > > sysctl_sched_wakeup_granularity
> > > <-->
> > > ----------------------------------|----|-----------------------|---------------
> > > | |<--------------------->
> > > | . sysctl_sched_latency
> > > | .
> > > default/current latency entity | .
> > > | .
> > > 1111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-
> > > se preempts curr at wakeup ------>|<- se doesn't preempt curr -----------------
> > > | .
> > > | .
> > > | .
> > > low latency entity | .
> > > ---------------------->|
> > > % of sysctl_sched_latency |
> > > 1111111111111111111111111111111111111111111111111111111111|0000|-1-1-1-1-1-1-1-
> > > preempt ------------------------------------------------->|<- do not preempt --
> > > | .
> > > | .
> > > | .
> > > high latency entity | .
> > > |<-----------------------| .
> > > | % of sysctl_sched_latency .
> > > 111111111|0000|-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1
> > > preempt->|<- se doesn't preempt curr ------------------------------------------
> > >
> > > Tests results of nice latency impact on heavy load like hackbench:
> > >
> > > hackbench -l (2560 / group) -g group
> > > group latency 0 latency 19
> > > 1 1.450(+/- 0.60%) 1.376(+/- 0.54%) + 5%
> > > 4 1.537(+/- 1.75%) 1.335(+/- 1.81%) +13%
> > > 8 1.414(+/- 1.91%) 1.348(+/- 1.88%) + 5%
> > > 16 1.423(+/- 1.65%) 1.374(+/- 0.58%) + 3%
> > >
> > > hackbench -p -l (2560 / group) -g group
> > > group
> > > 1 1.416(+/- 3.45%) 0.886(+/- 0.54%) +37%
> > > 4 1.634(+/- 7.14%) 0.888(+/- 5.40%) +45%
> > > 8 1.449(+/- 2.14%) 0.804(+/- 4.55%) +44%
> > > 16 0.917(+/- 4.12%) 0.777(+/- 1.41%) +15%
> > >
> > > By deacreasing the latency prio, we reduce the number of preemption at
> >
> > s/deacreasing/decreasing/
> > s/reduce/increase/
> >
> > > wakeup and help hackbench making progress.
> > >
> > > Test results of nice latency impact on short live load like cyclictest
> > > while competing with heavy load like hackbench:
> > >
> > > hackbench -l 10000 -g group &
> > > cyclictest --policy other -D 5 -q -n
> > > latency 0 latency -20
> > > group min avg max min avg max
> > > 0 16 17 28 15 17 27
> > > 1 61 382 10603 63 89 4628
> > > 4 52 437 15455 54 98 16238
> > > 8 56 728 38499 61 125 28983
> > > 16 53 1215 52207 61 183 80751
> > >
> > > group = 0 means that hackbench is not running.
> > >
> > > The avg is significantly improved with nice latency -20 especially with
> > > large number of groups but min and max remain quite similar. If we add the
> > > histogram parameters to get details of latency, we have :
> > >
> > > hackbench -l 10000 -g 16 &
> > > cyclictest --policy other -D 5 -q -n -H 20000 --histfile data.txt
> > > latency 0 latency -20
> > > Min Latencies: 63 62
> > > Avg Latencies: 1397 218
> > > Max Latencies: 44926 42291
> > > 50% latencies: 100 98
> > > 75% latencies: 762 116
> > > 85% latencies: 1118 126
> > > 90% latencies: 1540 130
> > > 95% latencies: 5610 138
> > > 99% latencies: 13738 266
> > >
> > > With percentile details, we see the benefit of nice latency -20 as
> > > 1% of the latencies stays above 266us whereas the default latency has
> > > got 25% are above 762us and 10% over the 1ms.
> > >
> > > Signed-off-by: Vincent Guittot <[email protected]>
> > > ---
> > > include/linux/sched.h | 4 ++-
> > > init/init_task.c | 2 +-
> > > kernel/sched/core.c | 32 +++++++++++++++++++----
> > > kernel/sched/debug.c | 2 +-
> > > kernel/sched/fair.c | 60 +++++++++++++++++++++++++++++++++++++++++--
> > > kernel/sched/sched.h | 12 +++++++++
> > > 6 files changed, 102 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > > index 2aa889a59054..9aeb157e819b 100644
> > > --- a/include/linux/sched.h
> > > +++ b/include/linux/sched.h
> > > @@ -560,6 +560,8 @@ struct sched_entity {
> > > unsigned long runnable_weight;
> > > #endif
> > >
> > > + int latency_weight;
> > > +
> > > #ifdef CONFIG_SMP
> > > /*
> > > * Per entity load average tracking.
> > > @@ -779,7 +781,7 @@ struct task_struct {
> > > int static_prio;
> > > int normal_prio;
> > > unsigned int rt_priority;
> > > - int latency_nice;
> > > + int latency_prio;
> > >
> > > struct sched_entity se;
> > > struct sched_rt_entity rt;
> > > diff --git a/init/init_task.c b/init/init_task.c
> > > index 2afa249c253b..e98c71f24981 100644
> > > --- a/init/init_task.c
> > > +++ b/init/init_task.c
> > > @@ -78,7 +78,7 @@ struct task_struct init_task
> > > .prio = MAX_PRIO - 20,
> > > .static_prio = MAX_PRIO - 20,
> > > .normal_prio = MAX_PRIO - 20,
> > > - .latency_nice = 0,
> > > + .latency_prio = NICE_WIDTH - 20,
> > > .policy = SCHED_NORMAL,
> > > .cpus_ptr = &init_task.cpus_mask,
> > > .user_cpus_ptr = NULL,
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index 8f8b102a75c4..547b0da01efe 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -1241,6 +1241,11 @@ static void set_load_weight(struct task_struct *p, bool update_load)
> > > }
> > > }
> > >
> > > +static void set_latency_weight(struct task_struct *p)
> > > +{
> > > + p->se.latency_weight = sched_latency_to_weight[p->latency_prio];
> > > +}
> > > +
> > > #ifdef CONFIG_UCLAMP_TASK
> > > /*
> > > * Serializes updates of utilization clamp values
> > > @@ -4394,7 +4399,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > > p->prio = current->normal_prio;
> > >
> > > /* Propagate the parent's latency requirements to the child as well */
> > > - p->latency_nice = current->latency_nice;
> > > + p->latency_prio = current->latency_prio;
> > >
> > > uclamp_fork(p);
> > >
> > > @@ -4412,7 +4417,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > > p->prio = p->normal_prio = p->static_prio;
> > > set_load_weight(p, false);
> > >
> > > - p->latency_nice = DEFAULT_LATENCY_NICE;
> > > + p->latency_prio = NICE_TO_LATENCY(0);
> > > /*
> > > * We don't need the reset flag anymore after the fork. It has
> > > * fulfilled its duty:
> > > @@ -4420,6 +4425,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> > > p->sched_reset_on_fork = 0;
> > > }
> > >
> > > + /* Once latency_prio is set, update the latency weight */
> > > + set_latency_weight(p);
> > > +
> > > if (dl_prio(p->prio))
> > > return -EAGAIN;
> > > else if (rt_prio(p->prio))
> > > @@ -7361,7 +7369,7 @@ static int __sched_setscheduler(struct task_struct *p,
> > > if (attr->sched_latency_nice < MIN_LATENCY_NICE)
> > > return -EINVAL;
> > > /* Use the same security checks as NICE */
> > > - if (attr->sched_latency_nice < p->latency_nice &&
> > > + if (attr->sched_latency_nice < LATENCY_TO_NICE(p->latency_prio) &&
> > > !capable(CAP_SYS_NICE))
> > > return -EPERM;
> > > }
> > > @@ -7401,7 +7409,7 @@ static int __sched_setscheduler(struct task_struct *p,
> > > if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
> > > goto change;
> > > if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
> > > - attr->sched_latency_nice != p->latency_nice)
> > > + attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))
> > > goto change;
> > >
> > > p->sched_reset_on_fork = reset_on_fork;
> > > @@ -7942,7 +7950,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
> > > get_params(p, &kattr);
> > > kattr.sched_flags &= SCHED_FLAG_ALL;
> > >
> > > - kattr.sched_latency_nice = p->latency_nice;
> > > + kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);
> > >
> > > #ifdef CONFIG_UCLAMP_TASK
> > > /*
> > > @@ -10954,6 +10962,20 @@ const u32 sched_prio_to_wmult[40] = {
> > > /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
> > > };
> > >
> > > +/*
> > > + * latency weight for wakeup preemption
> > > + */
> > > +const int sched_latency_to_weight[40] = {
> > > + /* -20 */ 1024, 973, 922, 870, 819,
> > > + /* -15 */ 768, 717, 666, 614, 563,
> > > + /* -10 */ 512, 461, 410, 358, 307,
> > > + /* -5 */ 256, 205, 154, 102, 51,
> > > + /* 0 */ 0, -51, -102, -154, -205,
> > > + /* 5 */ -256, -307, -358, -410, -461,
> > > + /* 10 */ -512, -563, -614, -666, -717,
> > > + /* 15 */ -768, -819, -870, -922, -973,
> > > +};
> > > +
> > > void call_trace_sched_update_nr_running(struct rq *rq, int count)
> > > {
> > > trace_sched_update_nr_running_tp(rq, count);
> > > diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> > > index 5d76a8927888..253e52ec73fb 100644
> > > --- a/kernel/sched/debug.c
> > > +++ b/kernel/sched/debug.c
> > > @@ -1043,7 +1043,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
> > > #endif
> > > P(policy);
> > > P(prio);
> > > - P(latency_nice);
> > > + P(latency_prio);
> > > if (task_has_dl_policy(p)) {
> > > P(dl.runtime);
> > > P(dl.deadline);
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 5c4bfffe8c2c..506c482a0e48 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -5555,6 +5555,35 @@ static int sched_idle_cpu(int cpu)
> > > }
> > > #endif
> > >
> > > +static void set_next_buddy(struct sched_entity *se);
> > > +
> > > +static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se)
> > > +{
> > > + struct sched_entity *next;
> > > +
> > > + if (se->latency_weight <= 0)
> > > + return;
> > > +
> > > + if (cfs->nr_running <= 1)
> > > + return;
> > > + /*
> > > + * When waking from idle, we don't need to check to preempt at wakeup
> > > + * the idle thread and don't set next buddy as a candidate for being
> > > + * picked in priority.
> > > + * In case of simultaneous wakeup from idle, the latency sensitive tasks
> > > + * lost opportunity to preempt non sensitive tasks which woke up
> > > + * simultaneously.
> > > + */
> > > +
> > > + if (cfs->next)
> > > + next = cfs->next;
> > > + else
> > > + next = __pick_first_entity(cfs);
> > > +
> > > + if (next && wakeup_preempt_entity(next, se) == 1)
> > > + set_next_buddy(se);
> > > +}
> > > +
> > > /*
> > > * The enqueue_task method is called before nr_running is
> > > * increased. Here we update the fair scheduling stats and
> > > @@ -5648,6 +5677,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > > if (!task_new)
> > > update_overutilized_status(rq);
> > >
> > > + if (rq->curr == rq->idle)
> > > + check_preempt_from_idle(cfs_rq_of(&p->se), &p->se);
> > > +
> > > enqueue_throttle:
> > > if (cfs_bandwidth_used()) {
> > > /*
> > > @@ -5669,8 +5701,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > > hrtick_update(rq);
> > > }
> > >
> > > -static void set_next_buddy(struct sched_entity *se);
> > > -
> > > /*
> > > * The dequeue_task method is called before nr_running is
> > > * decreased. We remove the task from the rbtree and
> > > @@ -6970,6 +7000,27 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > > }
> > > #endif /* CONFIG_SMP */
> > >
> > > +static long wakeup_latency_gran(int latency_weight)
> > > +{
> > > + long thresh = sysctl_sched_latency;
> > > +
> > > + if (!latency_weight)
> > > + return 0;
> > > +
> > > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > > + thresh >>= 1;
> > > +
> > > + /*
> > > + * Clamp the delta to stay in the scheduler period range
> > > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > > + */
> > > + latency_weight = clamp_t(long, latency_weight,
> > > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > > + NICE_LATENCY_WEIGHT_MAX);
> > > +
> > > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > > +}
> > > +
> > > static unsigned long wakeup_gran(struct sched_entity *se)
> > > {
> > > unsigned long gran = sysctl_sched_wakeup_granularity;
> > > @@ -7008,6 +7059,10 @@ static int
> > > wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
> > > {
> > > s64 gran, vdiff = curr->vruntime - se->vruntime;
> > > + int latency_weight = se->latency_weight - curr->latency_weight;
> > > +
> > > + latency_weight = min(latency_weight, se->latency_weight);
> >
> > Let label A=se->latency_weight, B=curr->latency_weight, C=A-B.
> >
> > 1 A>0 B>0
> > ::C=A-B<0, min(C,A)=C, latency decrease, C is the real diff value no matter
> > what A is. That means it can be very 'long' (down to -sched_latency) and vdiff
> > is more likely to fall into the <= 0 case and return -1.
> > ::C=A-B>0, min(C,A)=A, latency increase, but it is conservative. Limited to
> > A/1024*sched_latency.
> > When latency is decreased, the negative value added to vdiff is larger (in
> > magnitude) than the positive value added to vdiff when latency is increased.
> >
> > 2 A>0 B<0
> > ::C=A-B>0 and C>A, min(C,A)=A, latency increase and it is conservative.
> >
> > 3 A<0 B<0
> > ::C=A-B>0, min(C,A)=A, latency increase but min(C,A)<0. I think a latency
> > increase should mean the value added to vdiff is positive, to increase
> > the chance of returning 1. I would say it should be max(C,A)=C.
> > ::C=A-B<0, min(C,A)=A, latency decrease and the real negative value.
> >
> > 4 A<0,B>0
> > ::C=A-B<0, min(C,A)=C, latency decrease and the real negative value.
> >
> > Is there a reason that the values for latency increase and latency decrease
> > are treated differently? The latency increase value is limited to se's latency_weight
> > but the latency decrease value can extend to -sched_latency; or should they be
> > treated the same? Penalize latency decrease and conserve latency increase.
> >
> >
> > Is there any value that this latency_weight can be considered to be an average?
> >
> > The delta value chosen is ~5% of 1024. 5%*sched_latency = 0.05*6ms = 0.3ms (no scaling).
> > I have not thought through the vdiff equation and the rest, though.
>
> vruntime is the accumulated, weight-scaled time history.
> vdiff is the scaled time delta.
>
> latency_gran = [delta_weight|latency_weight]/1024*thresh.
> What about also scaling this latency_gran to reflect the load weight
> distribution within sched_latency, like:
>
> latency_gran*1024/load_weight = [delta_weight|latency_weight]/load_weight*thresh.
wakeup_gran already scales with se load_weight
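Numerically, that existing scaling looks roughly like the sketch below, which ignores the fixed-point inverse-weight path calc_delta_fair() really takes and simply uses the user-visible weight table values (1024 for nice 0, 3121 for nice -5); an illustration, not the kernel function.

#include <stdio.h>

/* Sketch of the scaling wakeup_gran() already applies: the wakeup
 * granularity is converted from real time into the waking entity's
 * virtual time, i.e. multiplied by NICE_0 weight / se load weight. */
static unsigned long scaled_wakeup_gran(unsigned long gran_ns,
					unsigned long se_load_weight)
{
	return gran_ns * 1024UL / se_load_weight;
}

int main(void)
{
	/* 1 ms assumed for sysctl_sched_wakeup_granularity */
	printf("nice  0: %lu ns\n", scaled_wakeup_gran(1000000UL, 1024UL));
	printf("nice -5: %lu ns\n", scaled_wakeup_gran(1000000UL, 3121UL));
	return 0;
}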
>
> About [delta_weight|latency_weight]/load_weight: because the latency nice and prio
> nice ranges are the same, a const table could be used for this calculation. But I am
> not sure it is worth making this change. Just some random inputs..
>
> > Thanks,
> > Tao
> > > + vdiff += wakeup_latency_gran(latency_weight);
> > >
> > > if (vdiff <= 0)
> > > return -1;
> > > @@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> > > return;
> > >
> > > update_curr(cfs_rq_of(se));
> > > +
> > > if (wakeup_preempt_entity(se, pse) == 1) {
> > > /*
> > > * Bias pick_next to pick the sched entity that is
> > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > > index 456ad2159eb1..dd92aa9c36f9 100644
> > > --- a/kernel/sched/sched.h
> > > +++ b/kernel/sched/sched.h
> > > @@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
> > > * Default tasks should be treated as a task with latency_nice = 0.
> > > */
> > > #define DEFAULT_LATENCY_NICE 0
> > > +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
> > > +
> > > +/*
> > > + * Convert user-nice values [ -20 ... 0 ... 19 ]
> > > + * to static latency [ 0..39 ],
> > > + * and back.
> > > + */
> > > +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
> > > +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
> > > +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT)
> > > +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT)
> > >
> > > /*
> > > * Increase resolution of nice-level calculations for 64-bit architectures.
> > > @@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);
> > >
> > > extern const int sched_prio_to_weight[40];
> > > extern const u32 sched_prio_to_wmult[40];
> > > +extern const int sched_latency_to_weight[40];
> > >
> > > /*
> > > * {de,en}queue flags:
> > > --
> > > 2.17.1
> > >
On Sat, Mar 12, 2022 at 7:11 AM Vincent Guittot
<[email protected]> wrote:
>
> Take into account the nice latency priority of a thread when deciding to
> preempt the current running thread. We don't want to provide more CPU
> bandwidth to a thread but reorder the scheduling to run latency sensitive
> task first whenever possible.
>
---------->8-------------------
> #endif /* CONFIG_SMP */
>
> +static long wakeup_latency_gran(int latency_weight)
> +{
> + long thresh = sysctl_sched_latency;
If I understood correctly, this takes the latency weight into account to
'shrink/expand' the current task's time slice and thus facilitate preemption.
May I know why we don't use __sched_period() instead of using
sysctl_sched_latency directly? Is it possible that the rq has more than
8 (sched_nr_latency) tasks, so that the period is longer than
sysctl_sched_latency?
Thanks,
Chenyu
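For context, a standalone sketch of what __sched_period() computes in this era of the tree, using the base defaults before CPU-count scaling (6 ms latency, 0.75 ms minimum granularity, hence sched_nr_latency of 8); an illustration only, not the kernel source.

#include <stdio.h>

/* The period stretches beyond sysctl_sched_latency once a cfs_rq holds
 * more than sched_nr_latency runnable tasks. */
static unsigned long long sched_period(unsigned long nr_running)
{
	const unsigned long long sysctl_sched_latency = 6000000ULL;	/* 6 ms */
	const unsigned long long sysctl_sched_min_granularity = 750000ULL;
	const unsigned long sched_nr_latency = 8;

	if (nr_running > sched_nr_latency)
		return nr_running * sysctl_sched_min_granularity;
	return sysctl_sched_latency;
}

int main(void)
{
	printf("8 tasks : %llu ns\n", sched_period(8));
	printf("16 tasks: %llu ns\n", sched_period(16));
	return 0;
}

With 16 runnable tasks the period is already 12 ms, i.e. twice sysctl_sched_latency, which is what the question is getting at.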
> +
> + if (!latency_weight)
> + return 0;
> +
> + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> + thresh >>= 1;
> +
> + /*
> + * Clamp the delta to stay in the scheduler period range
> + * [-sysctl_sched_latency:sysctl_sched_latency]
> + */
> + latency_weight = clamp_t(long, latency_weight,
> + -1 * NICE_LATENCY_WEIGHT_MAX,
> + NICE_LATENCY_WEIGHT_MAX);
> +
> + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> +}
> +
On Wed, 4 May 2022 at 13:15, Chen Yu <[email protected]> wrote:
>
> On Sat, Mar 12, 2022 at 7:11 AM Vincent Guittot
> <[email protected]> wrote:
> >
> > Take into account the nice latency priority of a thread when deciding to
> > preempt the current running thread. We don't want to provide more CPU
> > bandwidth to a thread but reorder the scheduling to run latency sensitive
> > task first whenever possible.
> >
> ---------->8-------------------
> > #endif /* CONFIG_SMP */
> >
> > +static long wakeup_latency_gran(int latency_weight)
> > +{
> > + long thresh = sysctl_sched_latency;
> If I understood correctly, this takes the latency weight into account to
> 'shrink/expand' the current task's time slice and thus facilitate preemption.
> May I know why we don't use __sched_period() instead of using
> sysctl_sched_latency directly? Is it possible that the rq has more than
> 8 (sched_nr_latency) tasks, so that the period is longer than
> sysctl_sched_latency?
The main reason is to stay aligned with place_entity(), which also uses
sysctl_sched_latency to cap an entity's vruntime so that it is not lower
than min_vruntime - sysctl_sched_latency.
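For reference, the sleeper placement being referred to behaves roughly like the standalone sketch below; the 6 ms sysctl_sched_latency is an assumed default and the helper name is made up for illustration.

#include <stdio.h>

/* Sketch of the wakeup placement: the woken entity's vruntime is not
 * allowed to fall below min_vruntime - thresh, where thresh is
 * sysctl_sched_latency, halved when GENTLE_FAIR_SLEEPERS is enabled. */
static unsigned long long place_sleeper(unsigned long long min_vruntime,
					unsigned long long se_vruntime,
					int gentle_fair_sleepers)
{
	unsigned long long thresh = 6000000ULL;	/* sysctl_sched_latency, assumed */
	unsigned long long vruntime = min_vruntime;

	if (gentle_fair_sleepers)
		thresh >>= 1;
	vruntime -= thresh;

	/* never move the entity backwards in virtual time */
	return se_vruntime > vruntime ? se_vruntime : vruntime;
}

int main(void)
{
	/* a long sleeper gets at most one (halved) latency period of credit */
	printf("%llu\n", place_sleeper(100000000ULL, 1000ULL, 1));
	return 0;
}

Capping wakeup_latency_gran() with the same sysctl_sched_latency keeps the preemption bonus in the same range as that placement credit.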
>
> Thanks,
> Chenyu
> > +
> > + if (!latency_weight)
> > + return 0;
> > +
> > + if (sched_feat(GENTLE_FAIR_SLEEPERS))
> > + thresh >>= 1;
> > +
> > + /*
> > + * Clamp the delta to stay in the scheduler period range
> > + * [-sysctl_sched_latency:sysctl_sched_latency]
> > + */
> > + latency_weight = clamp_t(long, latency_weight,
> > + -1 * NICE_LATENCY_WEIGHT_MAX,
> > + NICE_LATENCY_WEIGHT_MAX);
> > +
> > + return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
> > +}
> > +