Add constraint validation for CFS bandwidth hierarchies.
Validate that:
sum(child bandwidth) <= parent_bandwidth
In a quota-limited hierarchy, an unconstrained entity
(e.g. bandwidth==RUNTIME_INF) inherits the bandwidth of its parent.
Since bandwidth periods may be non-uniform we normalize to the maximum allowed
period, 1 second.
This behavior may be disabled (allowing child bandwidth to exceed parent) via
kernel.sched_cfs_bandwidth_consistent=0
Signed-off-by: Paul Turner <[email protected]>
---
include/linux/sched.h | 8 ++
kernel/sched.c | 137 +++++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched_fair.c | 8 ++
kernel/sysctl.c | 11 ++++
4 files changed, 151 insertions(+), 13 deletions(-)
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -249,6 +249,7 @@ struct cfs_bandwidth {
raw_spinlock_t lock;
ktime_t period;
u64 quota;
+ s64 hierarchal_quota;
#endif
};
@@ -8789,12 +8790,7 @@ unsigned long sched_group_shares(struct
}
#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
@@ -8802,6 +8798,13 @@ static unsigned long to_ratio(u64 period
return div64_u64(runtime << 20, period);
}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
/* Must be called with tasklist_lock held */
static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8822,7 +8825,7 @@ struct rt_schedulable_data {
u64 rt_runtime;
};
-static int tg_schedulable(struct task_group *tg, void *data)
+static int tg_rt_schedulable(struct task_group *tg, void *data)
{
struct rt_schedulable_data *d = data;
struct task_group *child;
@@ -8886,7 +8889,7 @@ static int __rt_schedulable(struct task_
.rt_runtime = runtime,
};
- return walk_tg_tree(tg_schedulable, tg_nop, &data);
+ return walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
}
static int tg_set_rt_bandwidth(struct task_group *tg,
@@ -9177,14 +9180,17 @@ static u64 cpu_shares_read_u64(struct cg
}
#ifdef CONFIG_CFS_BANDWIDTH
+static DEFINE_MUTEX(cfs_constraints_mutex);
+
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
- int i;
+ int i, ret = 0;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- static DEFINE_MUTEX(mutex);
if (tg == &root_task_group)
return -EINVAL;
@@ -9205,7 +9211,13 @@ static int tg_set_cfs_bandwidth(struct t
if (period > max_cfs_quota_period)
return -EINVAL;
- mutex_lock(&mutex);
+ mutex_lock(&cfs_constraints_mutex);
+ if (sysctl_sched_cfs_bandwidth_consistent) {
+ ret = __cfs_schedulable(tg, period, quota);
+ if (ret)
+ goto out_unlock;
+ }
+
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -9220,9 +9232,10 @@ static int tg_set_cfs_bandwidth(struct t
cfs_rq->runtime_remaining = 0;
raw_spin_unlock_irq(&rq->lock);
}
- mutex_unlock(&mutex);
+out_unlock:
+ mutex_unlock(&cfs_constraints_mutex);
- return 0;
+ return ret;
}
int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
@@ -9296,6 +9309,104 @@ static int cpu_cfs_period_write_u64(stru
return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
}
+
+struct cfs_schedulable_data {
+ struct task_group *tg;
+ u64 period, quota;
+};
+
+/*
+ * normalize group quota/period to be quota/max_period
+ * note: units are usecs
+ */
+static u64 normalize_cfs_quota(struct task_group *tg,
+ struct cfs_schedulable_data *d)
+{
+ u64 quota, period;
+
+ if (tg == d->tg) {
+ period = d->period;
+ quota = d->quota;
+ } else {
+ period = tg_get_cfs_period(tg);
+ quota = tg_get_cfs_quota(tg);
+ }
+
+ if (quota == RUNTIME_INF)
+ return RUNTIME_INF;
+
+ return to_ratio(period, quota);
+}
+
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+{
+ struct cfs_schedulable_data *d = data;
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ s64 quota = 0, parent_quota = -1;
+
+ quota = normalize_cfs_quota(tg, d);
+ if (!tg->parent) {
+ quota = RUNTIME_INF;
+ } else {
+ struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
+
+ parent_quota = parent_b->hierarchal_quota;
+ if (parent_quota != RUNTIME_INF) {
+ parent_quota -= quota;
+ /* invalid hierarchy, child bandwidth exceeds parent */
+ if (parent_quota < 0)
+ return -EINVAL;
+ }
+
+ /* if no inherent limit then inherit parent quota */
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ parent_b->hierarchal_quota = parent_quota;
+ }
+ cfs_b->hierarchal_quota = quota;
+
+ return 0;
+}
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+{
+ struct cfs_schedulable_data data = {
+ .tg = tg,
+ .period = period,
+ .quota = quota,
+ };
+
+ if (!sysctl_sched_cfs_bandwidth_consistent)
+ return 0;
+
+ if (quota != RUNTIME_INF) {
+ do_div(data.period, NSEC_PER_USEC);
+ do_div(data.quota, NSEC_PER_USEC);
+ }
+
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+}
+
+int sched_cfs_consistent_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&cfs_constraints_mutex);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (!ret && write && sysctl_sched_cfs_bandwidth_consistent) {
+ ret = __cfs_schedulable(NULL, 0, 0);
+
+ /* must be consistent to enable */
+ if (ret)
+ sysctl_sched_cfs_bandwidth_consistent = 0;
+ }
+ mutex_unlock(&cfs_constraints_mutex);
+
+ return ret;
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
Index: tip/kernel/sysctl.c
===================================================================
--- tip.orig/kernel/sysctl.c
+++ tip/kernel/sysctl.c
@@ -367,6 +367,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rt_handler,
},
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .procname = "sched_cfs_bandwidth_consistent",
+ .data = &sysctl_sched_cfs_bandwidth_consistent,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_cfs_consistent_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
Index: tip/include/linux/sched.h
===================================================================
--- tip.orig/include/linux/sched.h
+++ tip/include/linux/sched.h
@@ -1950,6 +1950,14 @@ int sched_rt_handler(struct ctl_table *t
void __user *buffer, size_t *lenp,
loff_t *ppos);
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_consistent;
+
+int sched_cfs_consistent_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+#endif
+
#ifdef CONFIG_SCHED_AUTOGROUP
extern unsigned int sysctl_sched_autogroup_enabled;
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -88,6 +88,14 @@ const_debug unsigned int sysctl_sched_mi
*/
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * Whether a CFS bandwidth hierarchy is required to be consistent, that is:
+ * sum(child_bandwidth) <= parent_bandwidth
+ */
+unsigned int sysctl_sched_cfs_bandwidth_consistent = 1;
+#endif
+
static const struct sched_class fair_sched_class;
/**************************************************************
Description typos + one bug.
(2011/05/03 18:28), Paul Turner wrote:
> Add constraints validation for CFS bandwidth hierachies.
hierarchies
>
> Validate that:
> sum(child bandwidth) <= parent_bandwidth
>
> In a quota limited hierarchy, an unconstrainted entity
unconstrained
> (e.g. bandwidth==RUNTIME_INF) inherits the bandwidth of its parent.
>
> Since bandwidth periods may be non-uniform we normalize to the maximum allowed
> period, 1 second.
>
> This behavior may be disabled (allowing child bandwidth to exceed parent) via
> kernel.sched_cfs_bandwidth_consistent=0
>
> Signed-off-by: Paul Turner <[email protected]>
>
> ---
(snip)
> +/*
> + * normalize group quota/period to be quota/max_period
> + * note: units are usecs
> + */
> +static u64 normalize_cfs_quota(struct task_group *tg,
> + struct cfs_schedulable_data *d)
> +{
> + u64 quota, period;
> +
> + if (tg == d->tg) {
> + period = d->period;
> + quota = d->quota;
> + } else {
> + period = tg_get_cfs_period(tg);
> + quota = tg_get_cfs_quota(tg);
> + }
> +
> + if (quota == RUNTIME_INF)
> + return RUNTIME_INF;
> +
> + return to_ratio(period, quota);
> +}
Since tg_get_cfs_quota() doesn't return RUNTIME_INF but -1,
this function needs a fix like following.
For fixed version, feel free to add:
Reviewed-by: Hidetoshi Seto <[email protected]>
Thanks,
H.Seto
---
kernel/sched.c | 7 ++++---
1 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index d2562aa..f171ba5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9465,16 +9465,17 @@ static u64 normalize_cfs_quota(struct task_group *tg,
u64 quota, period;
if (tg == d->tg) {
+ if (d->quota == RUNTIME_INF)
+ return RUNTIME_INF;
period = d->period;
quota = d->quota;
} else {
+ if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+ return RUNTIME_INF;
period = tg_get_cfs_period(tg);
quota = tg_get_cfs_quota(tg);
}
- if (quota == RUNTIME_INF)
- return RUNTIME_INF;
-
return to_ratio(period, quota);
}
On Tue, May 10, 2011 at 12:20 AM, Hidetoshi Seto
<[email protected]> wrote:
> Description typos + one bug.
>
> (2011/05/03 18:28), Paul Turner wrote:
>> Add constraints validation for CFS bandwidth hierachies.
>
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? hierarchies
>
>>
>> Validate that:
>> ? ?sum(child bandwidth) <= parent_bandwidth
>>
>> In a quota limited hierarchy, an unconstrainted entity
>
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? unconstrained
>
>> (e.g. bandwidth==RUNTIME_INF) inherits the bandwidth of its parent.
>>
>> Since bandwidth periods may be non-uniform we normalize to the maximum allowed
>> period, 1 second.
>>
>> This behavior may be disabled (allowing child bandwidth to exceed parent) via
>> kernel.sched_cfs_bandwidth_consistent=0
>>
>> Signed-off-by: Paul Turner <[email protected]>
>>
>> ---
> (snip)
>> +/*
>> + * normalize group quota/period to be quota/max_period
>> + * note: units are usecs
>> + */
>> +static u64 normalize_cfs_quota(struct task_group *tg,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cfs_schedulable_data *d)
>> +{
>> + ? ? u64 quota, period;
>> +
>> + ? ? if (tg == d->tg) {
>> + ? ? ? ? ? ? period = d->period;
>> + ? ? ? ? ? ? quota = d->quota;
>> + ? ? } else {
>> + ? ? ? ? ? ? period = tg_get_cfs_period(tg);
>> + ? ? ? ? ? ? quota = tg_get_cfs_quota(tg);
>> + ? ? }
>> +
>> + ? ? if (quota == RUNTIME_INF)
>> + ? ? ? ? ? ? return RUNTIME_INF;
>> +
>> + ? ? return to_ratio(period, quota);
>> +}
>
> Since tg_get_cfs_quota() doesn't return RUNTIME_INF but -1,
> this function needs a fix like following.
>
> For fixed version, feel free to add:
>
> Reviewed-by: Hidetoshi Seto <[email protected]>
>
> Thanks,
> H.Seto
>
> ---
> ?kernel/sched.c | ? ?7 ++++---
> ?1 files changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index d2562aa..f171ba5 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -9465,16 +9465,17 @@ static u64 normalize_cfs_quota(struct task_group *tg,
> ? ? ? ?u64 quota, period;
>
> ? ? ? ?if (tg == d->tg) {
> + ? ? ? ? ? ? ? if (d->quota == RUNTIME_INF)
> + ? ? ? ? ? ? ? ? ? ? ? return RUNTIME_INF;
> ? ? ? ? ? ? ? ?period = d->period;
> ? ? ? ? ? ? ? ?quota = d->quota;
> ? ? ? ?} else {
> + ? ? ? ? ? ? ? if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
> + ? ? ? ? ? ? ? ? ? ? ? return RUNTIME_INF;
> ? ? ? ? ? ? ? ?period = tg_get_cfs_period(tg);
> ? ? ? ? ? ? ? ?quota = tg_get_cfs_quota(tg);
> ? ? ? ?}
>
Good catch!
Just modifying:
+if (quota == RUNTIME_INF || quota == -1)
+ ? ? ? ? ? ? ? ? ? ? ? return RUNTIME_INF;
Seems simpler.
Although really there's no reason for tg_get_cfs_runtime (and
sched_group_rt_runtime from which it's cloned) not to be returning
RUNTIME_INF and then doing the conversion within the cgroupfs handler.
Fixing both is probably a better clean-up.
> - ? ? ? if (quota == RUNTIME_INF)
> - ? ? ? ? ? ? ? return RUNTIME_INF;
> -
> ? ? ? ?return to_ratio(period, quota);
> ?}
>
>
>
On Tue, 2011-05-03 at 02:28 -0700, Paul Turner wrote:
> Since bandwidth periods may be non-uniform we normalize to the maximum allowed
> period, 1 second.
I'm still somewhat confused on this point, what does it mean to have a
(parent) group with 0.1s period with child-groups that have 1s periods?
On Tue, 2011-05-03 at 02:28 -0700, Paul Turner wrote:
> This behavior may be disabled (allowing child bandwidth to exceed parent) via
> kernel.sched_cfs_bandwidth_consistent=0
why? this needs very good justification.
On Mon, May 16, 2011 at 2:43 AM, Peter Zijlstra <[email protected]> wrote:
>
> On Tue, 2011-05-03 at 02:28 -0700, Paul Turner wrote:
> > This behavior may be disabled (allowing child bandwidth to exceed parent) via
> > kernel.sched_cfs_bandwidth_consistent=0
>
> why? this needs very good justification.
I think it was lost in other discussion before, but I think there are
two useful use-cases for it:
Posting (condensed) relevant snippet:
-----------------------------------------------------------
Consider:
- I have some application that I want to limit to 3 cpus
I have 2 workers in that application; across a period I would like
those workers to use a maximum of say 2.5 cpus each (suppose they
serve some sort of co-processor request per user and we want to
prevent a single user eating our entire limit and starving out
everything else).
The goal in this case is not preventing increasing availability within a
given limit, while not destroying the (relatively) work-conserving aspect of
its performance in general.
(...)
- There's also the case of managing an abusive user, use cases such
as the above means that users can usefully be given write permission
to their relevant sub-hierarchy.
If the system size changes, or a user becomes newly abusive then being
able to set non-conformant constraint avoids the adversarial problem of having
to find and bring all of their set (possibly maliciously large) limits
within the global limit.
-----------------------------------------------------------
(Previously: https://lkml.org/lkml/2011/2/24/477)
On Mon, 2011-05-16 at 05:32 -0700, Paul Turner wrote:
> On Mon, May 16, 2011 at 2:43 AM, Peter Zijlstra <[email protected]> wrote:
> >
> > On Tue, 2011-05-03 at 02:28 -0700, Paul Turner wrote:
> > > This behavior may be disabled (allowing child bandwidth to exceed parent) via
> > > kernel.sched_cfs_bandwidth_consistent=0
> >
> > why? this needs very good justification.
>
> I think it was lost in other discussion before, but I think there are
> two useful use-cases for it:
>
> Posting (condensed) relevant snippet:
Such stuff should really live in the changelog
> -----------------------------------------------------------
> Consider:
>
> - I have some application that I want to limit to 3 cpus
> I have a 2 workers in that application, across a period I would like
> those workers to use a maximum of say 2.5 cpus each (suppose they
> serve some sort of co-processor request per user and we want to
> prevent a single user eating our entire limit and starving out
> everything else).
>
> The goal in this case is not preventing increasing availability within a
> given limit, while not destroying the (relatively) work-conserving aspect of
> its performance in general.
>
> (...)
>
> - There's also the case of managing an abusive user, use cases such
> as the above means that users can usefully be given write permission
> to their relevant sub-hierarchy.
>
> If the system size changes, or a user becomes newly abusive then being
> able to set non-conformant constraint avoids the adversarial problem of having
> to find and bring all of their set (possibly maliciously large) limits
> within the global limit.
> -----------------------------------------------------------
But what about those where they want both behaviours on the same machine
but for different sub-trees?
Also, without the constraints, what does the hierarchy mean?
On Tue, May 17, 2011 at 8:26 AM, Peter Zijlstra <[email protected]> wrote:
> On Mon, 2011-05-16 at 05:32 -0700, Paul Turner wrote:
>> On Mon, May 16, 2011 at 2:43 AM, Peter Zijlstra <[email protected]> wrote:
>> >
>> > On Tue, 2011-05-03 at 02:28 -0700, Paul Turner wrote:
>> > > This behavior may be disabled (allowing child bandwidth to exceed parent) via
>> > > kernel.sched_cfs_bandwidth_consistent=0
>> >
>> > why? this needs very good justification.
>>
>> I think it was lost in other discussion before, but I think there are
>> two useful use-cases for it:
>>
>> Posting (condensed) relevant snippet:
>
> Such stuff should really live in the changelog
>
Given the discussion below it would seem to make sense to split the CL
into one part that adds the consistency checking. And (potentially,
depending on the discussion below) another that provides these state
semantics. This would also give us a chance to clearly call these
details out in the commit description.
>> -----------------------------------------------------------
>> Consider:
>>
>> - I have some application that I want to limit to 3 cpus
>> I have a 2 workers in that application, across a period I would like
>> those workers to use a maximum of say 2.5 cpus each (suppose they
>> serve some sort of co-processor request per user and we want to
>> prevent a single user eating our entire limit and starving out
>> everything else).
>>
>> The goal in this case is not preventing increasing availability within a
>> given limit, while not destroying the (relatively) work-conserving aspect of
>> its performance in general.
>>
>> (...)
>>
>> - There's also the case of managing an abusive user, use cases such
>> as the above means that users can usefully be given write permission
>> to their relevant sub-hierarchy.
>>
>> If the system size changes, or a user becomes newly abusive then being
>> able to set non-conformant constraint avoids the adversarial problem of having
>> to find and bring all of their set (possibly maliciously large) limits
>> within the global limit.
>> -----------------------------------------------------------
>
>
> But what about those where they want both behaviours on the same machine
> but for different sub-trees?
I originally considered a per-tg tunable. I made the assumption that
users would either handle this themselves (=0) or rely on the kernel
to do it (=1). There are some additional complexities that lead me to
withdraw from the per-cg approach in this pass given the known
resistance to it.
One concern was the potential ambiguity in the nesting of these values.
When an inconsistent entity is nested under a consistent one:
A) Do we allow this?
B) How do we treat it?
I think if this was the case that it would make sense to allow it and
that each inconsistent entity should effectively be treated as
terminal from the parent's point of view, and as the new root from the
child's point of view.
Does this make sense? While this is the most intuitive definition for
me there are certainly several other interpretations that could be
argued for.
Would you prefer this approach be taken to consistency vs at a global
level? Do the use-cases above have sufficient merit that we even make
this an option in the first place? Should we just always force
hierarchies to be consistent instead? I'm open on this.
>
> Also, without the constraints, what does the hierarchy mean?
>
It's still an upper-bound for usage, however it may not be achievable
in an inconsistent hierarchy. Whereas in a consistent one it should
always be achievable.
On Wed, 2011-05-18 at 00:16 -0700, Paul Turner wrote:
> >
> > But what about those where they want both behaviours on the same machine
> > but for different sub-trees?
>
> I originally considered a per-tg tunable. I made the assumption that
> users would either handle this themselves (=0) or rely on the kernel
> to do it (=1). There are some additional complexities that lead me to
> withdraw from the per-cg approach in this pass given the known
> resistance to it.
Yeah, that's quite horrid too, you chose wisely by not going there ;-)
> One concern was the potential ambiguity in the nesting of these values.
>
> When an inconsistent entity is nested under a consistent one:
>
> A) Do we allow this?
> B) How do we treat it?
>
> I think if this was the case that it would make sense to allow it and
> that each inconsistent entity should effectively be treated as
> terminal from the parent's point of view, and as the new root from the
> child's point of view.
>
> Does this make sense? While this is the most intuitive definition for
> me there are certainly several other interpretations that could be
> argued for.
I'm not quite sure I get it, so what you're saying is: where the
semantics are violated we draw a border and we only look at local
consistency, thereby side-stepping the whole problem.
Doesn't fly for me, also, see below, by not having any invariants you
don't have clear semantics at all.
> Would you prefer this approach be taken to consistency vs at a global
> level? Do the use-cases above have sufficient merit that we even make
> this an option in the first place? Should we just always force
> hierarchies to be consistent instead? I'm open on this.
Yeah, I think the use cases do make sense, its just that I don't like
the two different semantics and the confusion that goes with it.
> >
> > Also, without the constraints, what does the hierarchy mean?
> >
>
> It's still an upper-bound for usage, however it may not be achievable
> in an inconsistent hierarchy. Whereas in a consistent one it should
> always be achievable.
See that doesn't quite make sense to me, if its not achievable its
simply not and the meaning is no more.
So lets consider these cases again:
> - I have some application that I want to limit to 3 cpus
> I have a 2 workers in that application, across a period I would like
> those workers to use a maximum of say 2.5 cpus each (suppose they
> serve some sort of co-processor request per user and we want to
> prevent a single user eating our entire limit and starving out
> everything else).
>
> The goal in this case is not preventing increasing availability within a
> given limit, while not destroying the (relatively) work-conserving aspect of
> its performance in general.
So the problem here is that 2.5+2.5 > 3, right? So maybe our constraint
isn't quite right, since clearly the whole SCHED_OTHER bandwidth crap
has the purpose of allowing overload.
What about instead of using: \Sum u_i =< U, we use max(u_i) =< U, that
would allow the above case, and mean that the bandwidth limit placed on
the parent is the maximum allowed limit in that subtree. In overload
situations things go back to proportional parts of the subtree limit.
> >> - There's also the case of managing an abusive user, use cases such
> >> as the above means that users can usefully be given write permission
> >> to their relevant sub-hierarchy.
> >>
> >> If the system size changes, or a user becomes newly abusive then being
> >> able to set non-conformant constraint avoids the adversarial problem of having
> >> to find and bring all of their set (possibly maliciously large) limits
> >> within the global limit.
Right, so this example is a little more contrived in that if you had
managed it from the get-go the problem wouldn't be that big (you'd have
had sane limits to begin with).
So one solution is to co-mount the freezer cgroup with your cpu cgroup
and simply freeze the whole subtree while you sort out the settings :-)
Another possibility would be to allow something like:
$ echo force:50000 > cfs_quota_us
Where the "force:" thing requires CAP_SYS_ADMIN and updates the entire
sub-tree such that the above invariant is kept.