2022-09-25 15:08:58

by Vincent Guittot

[permalink] [raw]
Subject: [PATCH v5 5/7] sched/fair: Add sched group latency support

A task can set its latency priority with sched_setattr(), which is then used
to set the latency offset of its sched_entity, but sched group entities
still have the default latency offset value.

Add a latency.nice field in cpu cgroup controller to set the latency
priority of the group similarly to sched_setattr(). The latency priority
is then used to set the offset of the sched_entities of the group.

Signed-off-by: Vincent Guittot <[email protected]>
---
Documentation/admin-guide/cgroup-v2.rst | 8 ++++
kernel/sched/core.c | 53 +++++++++++++++++++++++++
kernel/sched/fair.c | 33 +++++++++++++++
kernel/sched/sched.h | 4 ++
4 files changed, 98 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index be4a77baf784..d8ae7e411f9c 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1095,6 +1095,14 @@ All time durations are in microseconds.
values similar to the sched_setattr(2). This maximum utilization
value is used to clamp the task specific maximum utilization clamp.

+ cpu.latency.nice
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ The nice value is in the range [-20, 19].
+
+ This interface file allows reading and setting the group's latency
+ nice value using the same values as sched_setattr(2).


Memory
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 00fa2da12506..e8a1105bc87d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10890,6 +10890,48 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
+
+/* Report the group's latency offset as the closest latency nice value. */
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	int prio, delta, last_delta = INT_MAX;
+	s64 weight;
+
+	/* latency_offset is an int: widen first so the product is 64-bit. */
+	weight = (s64)css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
+	weight = div_s64(weight, sysctl_sched_latency);
+	/* Closest nice value: stop once an entry is no closer than the last */
+	for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) {
+		delta = abs(sched_latency_to_weight[prio] - weight);
+		if (delta >= last_delta)
+			break;
+		last_delta = delta;
+	}
+
+	return LATENCY_TO_NICE(prio-1);
+}
+
+/* Convert a latency nice value to an offset and apply it to the group. */
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+				       struct cftype *cft, s64 nice)
+{
+	s64 latency_offset, weight;
+	int idx;
+
+	if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
+		return -ERANGE;
+
+	idx = NICE_TO_LATENCY(nice);
+	idx = array_index_nospec(idx, LATENCY_NICE_WIDTH);
+	weight = sched_latency_to_weight[idx];
+	/* s64 weight keeps the product 64-bit even where long is 32 bits */
+	latency_offset = sysctl_sched_latency * weight;
+	latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);
+
+	return sched_group_set_latency(css_tg(css), latency_offset);
+}
+
#endif

static struct cftype cpu_legacy_files[] = {
@@ -10904,6 +10946,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -11121,6 +11168,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c3f857630dcf..74e42d19c1ce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11768,6 +11768,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;

tg->shares = NICE_0_LOAD;
+ tg->latency_offset = 0;

init_cfs_bandwidth(tg_cfs_bandwidth(tg));

@@ -11866,6 +11867,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}

se->my_q = cfs_rq;
+
+ se->latency_offset = tg->latency_offset;
+
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -11996,6 +12000,35 @@ int sched_group_set_idle(struct task_group *tg, long idle)
return 0;
}

+/*
+ * Set the latency offset of a task group and of its per-CPU group entities.
+ *
+ * Returns -EINVAL for the root task group or when |latency| exceeds
+ * sysctl_sched_latency, 0 otherwise (including when nothing changed).
+ */
+int sched_group_set_latency(struct task_group *tg, s64 latency)
+{
+	int cpu;
+
+	if (tg == &root_task_group || abs(latency) > sysctl_sched_latency)
+		return -EINVAL;
+
+	mutex_lock(&shares_mutex);
+
+	/* Skip the per-CPU walk when the group already has this offset. */
+	if (tg->latency_offset != latency) {
+		tg->latency_offset = latency;
+
+		/* Update the group's entity on every possible CPU. */
+		for_each_possible_cpu(cpu)
+			WRITE_ONCE(tg->se[cpu]->latency_offset, latency);
+	}
+
+	mutex_unlock(&shares_mutex);
+
+	return 0;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */

void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 80c4d2f5827f..a15fb955092c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -407,6 +407,8 @@ struct task_group {

/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
+ /* latency constraint of the group. */
+ int latency_offset;

#ifdef CONFIG_SMP
/*
@@ -517,6 +519,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);

extern int sched_group_set_idle(struct task_group *tg, long idle);

+extern int sched_group_set_latency(struct task_group *tg, s64 latency);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
--
2.17.1


2022-10-12 14:26:56

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v5 5/7] sched/fair: Add sched group latency support

On 09/25/22 16:39, Vincent Guittot wrote:
> Task can set its latency priority with sched_setattr(), which is then used
> to set the latency offset of its sched_entity, but sched group entities
> still have the default latency offset value.
>
> Add a latency.nice field in cpu cgroup controller to set the latency
> priority of the group similarly to sched_setattr(). The latency priority
> is then used to set the offset of the sched_entities of the group.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> Documentation/admin-guide/cgroup-v2.rst | 8 ++++
> kernel/sched/core.c | 53 +++++++++++++++++++++++++
> kernel/sched/fair.c | 33 +++++++++++++++
> kernel/sched/sched.h | 4 ++
> 4 files changed, 98 insertions(+)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index be4a77baf784..d8ae7e411f9c 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1095,6 +1095,14 @@ All time durations are in microseconds.
> values similar to the sched_setattr(2). This maximum utilization
> value is used to clamp the task specific maximum utilization clamp.
>
> + cpu.latency.nice
> + A read-write single value file which exists on non-root
> + cgroups. The default is "0".
> +
> + The nice value is in the range [-20, 19].
> +
> + This interface file allows reading and setting latency using the
> + same values used by sched_setattr(2).

I still don't understand how tasks will inherit the latency_nice value from
cgroups they're attached to.

For example, in EAS path we operate at task level only. If the task's
p->latency_nice = 0, but it belongs to a task group tg->latency_nice = -19;
what should the task's latency_nice be in this case? If it's in a hierarchy,
how would the effective value be calculated?


Thanks

--
Qais Yousef

2022-10-12 16:14:37

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH v5 5/7] sched/fair: Add sched group latency support

On Wed, 12 Oct 2022 at 16:22, Qais Yousef <[email protected]> wrote:
>
> On 09/25/22 16:39, Vincent Guittot wrote:
> > Task can set its latency priority with sched_setattr(), which is then used
> > to set the latency offset of its sched_entity, but sched group entities
> > still have the default latency offset value.
> >
> > Add a latency.nice field in cpu cgroup controller to set the latency
> > priority of the group similarly to sched_setattr(). The latency priority
> > is then used to set the offset of the sched_entities of the group.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> > Documentation/admin-guide/cgroup-v2.rst | 8 ++++
> > kernel/sched/core.c | 53 +++++++++++++++++++++++++
> > kernel/sched/fair.c | 33 +++++++++++++++
> > kernel/sched/sched.h | 4 ++
> > 4 files changed, 98 insertions(+)
> >
> > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > index be4a77baf784..d8ae7e411f9c 100644
> > --- a/Documentation/admin-guide/cgroup-v2.rst
> > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > @@ -1095,6 +1095,14 @@ All time durations are in microseconds.
> > values similar to the sched_setattr(2). This maximum utilization
> > value is used to clamp the task specific maximum utilization clamp.
> >
> > + cpu.latency.nice
> > + A read-write single value file which exists on non-root
> > + cgroups. The default is "0".
> > +
> > + The nice value is in the range [-20, 19].
> > +
> > + This interface file allows reading and setting latency using the
> > + same values used by sched_setattr(2).
>
> I still don't understand how tasks will inherit the latency_nice value from
> cgroups they're attached to.

The behavior is the same as for sched_entity weight. The latency is
applied on the sched_entity of the group

>
> For example, in EAS path we operate at task level only. If the task's
> p->latency_nice = 0, but it belongs to a task group tg->latency_nice = -19;
> what should the task's latency_nice be in this case? If it's in a hierarchy,
> how would the effective value be calculated?
>
>
> Thanks
>
> --
> Qais Yousef

2022-10-12 16:21:18

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v5 5/7] sched/fair: Add sched group latency support

On 10/12/22 17:42, Vincent Guittot wrote:
> On Wed, 12 Oct 2022 at 16:22, Qais Yousef <[email protected]> wrote:
> >
> > On 09/25/22 16:39, Vincent Guittot wrote:
> > > Task can set its latency priority with sched_setattr(), which is then used
> > > to set the latency offset of its sched_entity, but sched group entities
> > > still have the default latency offset value.
> > >
> > > Add a latency.nice field in cpu cgroup controller to set the latency
> > > priority of the group similarly to sched_setattr(). The latency priority
> > > is then used to set the offset of the sched_entities of the group.
> > >
> > > Signed-off-by: Vincent Guittot <[email protected]>
> > > ---
> > > Documentation/admin-guide/cgroup-v2.rst | 8 ++++
> > > kernel/sched/core.c | 53 +++++++++++++++++++++++++
> > > kernel/sched/fair.c | 33 +++++++++++++++
> > > kernel/sched/sched.h | 4 ++
> > > 4 files changed, 98 insertions(+)
> > >
> > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > > index be4a77baf784..d8ae7e411f9c 100644
> > > --- a/Documentation/admin-guide/cgroup-v2.rst
> > > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > > @@ -1095,6 +1095,14 @@ All time durations are in microseconds.
> > > values similar to the sched_setattr(2). This maximum utilization
> > > value is used to clamp the task specific maximum utilization clamp.
> > >
> > > + cpu.latency.nice
> > > + A read-write single value file which exists on non-root
> > > + cgroups. The default is "0".
> > > +
> > > + The nice value is in the range [-20, 19].
> > > +
> > > + This interface file allows reading and setting latency using the
> > > + same values used by sched_setattr(2).
> >
> > I still don't understand how tasks will inherit the latency_nice value from
> > cgroups they're attached to.
>
> The behavior is the same as for sched_entity weight. The latency is
> applied on the sched_entity of the group

But this is the point I am raising. Not all users behave the same as weight.

In EAS we just look at the effective value of the task (see uclamp for
example). We don't care about the group value except to calculate how it
impacts the task's value.

Or am I missing something here?


Cheers

--
Qais Yousef