Currently, when pids.max limit is breached in the hierarchy, the event
is counted and reported in the cgroup where the forking task resides.
This decouples the limit and the notification caused by the limit making
it hard to detect when the actual limit was effected.
Let's introduce new events:
max
The number of times the limit of the cgroup was hit.
max.imposed
The number of times fork failed in the cgroup because of self
or ancestor limit.
Since it changes semantics of the original "max" event, we introduce
this change only in the v2 API of the controller.
Signed-off-by: Michal Koutný <[email protected]>
---
Documentation/admin-guide/cgroup-v1/pids.rst | 3 +-
Documentation/admin-guide/cgroup-v2.rst | 12 ++++
kernel/cgroup/pids.c | 71 ++++++++++++++++----
3 files changed, 73 insertions(+), 13 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst
index 6acebd9e72c8..0f9f9a7b1f6c 100644
--- a/Documentation/admin-guide/cgroup-v1/pids.rst
+++ b/Documentation/admin-guide/cgroup-v1/pids.rst
@@ -36,7 +36,8 @@ superset of parent/child/pids.current.
The pids.events file contains event counters:
- - max: Number of times fork failed because limit was hit.
+ - max: Number of times fork failed in the cgroup because limit was hit in
+ self or ancestors.
Example
-------
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 17e6e9565156..4f04538d688c 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2186,6 +2186,18 @@ PID Interface Files
The number of processes currently in the cgroup and its
descendants.
+ pids.events
+ A read-only flat-keyed file which exists on non-root cgroups. Unless
+ specified otherwise, a value change in this file generates a file modified
+ event. The following entries are defined.
+
+ max
+ The number of times the limit of the cgroup was hit.
+
+ max.imposed
+ The number of times fork failed in the cgroup because of self
+ or ancestor limit.
+
Organisational operations are not blocked by cgroup policies, so it is
possible to have pids.current > pids.max. This can be done by either
setting the limit to be smaller than pids.current, or attaching enough
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 0e5ec7d59b4d..471562609eef 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -38,6 +38,14 @@
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
+enum pidcg_event {
+ /* Fork failed in subtree because this pids_cgroup limit was hit. */
+ PIDCG_MAX,
+ /* Fork failed in this pids_cgroup because self or ancestor limit was hit. */
+ PIDCG_MAX_IMPOSED,
+ NR_PIDCG_EVENTS, /* number of counters; sizes pids_cgroup::events[] */
+};
+
struct pids_cgroup {
struct cgroup_subsys_state css;
@@ -52,8 +60,7 @@ struct pids_cgroup {
/* Handle for "pids.events" */
struct cgroup_file events_file;
- /* Number of times fork failed because limit was hit. */
- atomic64_t events_limit;
+ atomic64_t events[NR_PIDCG_EVENTS];
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -148,12 +155,13 @@ static void pids_charge(struct pids_cgroup *pids, int num)
* pids_try_charge - hierarchically try to charge the pid count
* @pids: the pid cgroup state
* @num: the number of pids to charge
+ * @fail: storage of pid cgroup causing the fail
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
* succeeded, otherwise -EAGAIN.
*/
-static int pids_try_charge(struct pids_cgroup *pids, int num)
+static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
{
struct pids_cgroup *p, *q;
@@ -166,9 +174,10 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
* p->limit is %PIDS_MAX then we know that this test will never
* fail.
*/
- if (new > limit)
+ if (new > limit) {
+ *fail = p;
goto revert;
-
+ }
/*
* Not technically accurate if we go over limit somewhere up
* the hierarchy, but that's tolerable for the watermark.
@@ -236,7 +245,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
- struct pids_cgroup *pids;
+ struct pids_cgroup *pids, *pids_over_limit;
int err;
if (cset)
@@ -244,15 +253,23 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset)
else
css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
- err = pids_try_charge(pids, 1);
+ err = pids_try_charge(pids, 1, &pids_over_limit);
if (err) {
- /* Only log the first time events_limit is incremented. */
- if (atomic64_inc_return(&pids->events_limit) == 1) {
+ /* compatibility on v1 where events were notified in leaves. */
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys))
+ pids_over_limit = pids;
+
+ /* Only log the first time limit is hit. */
+ if (atomic64_inc_return(&pids->events[PIDCG_MAX_IMPOSED]) == 1) {
pr_info("cgroup: fork rejected by pids controller in ");
- pr_cont_cgroup_path(css->cgroup);
+ pr_cont_cgroup_path(pids->css.cgroup);
pr_cont("\n");
}
+ atomic64_inc(&pids_over_limit->events[PIDCG_MAX]);
+
cgroup_file_notify(&pids->events_file);
+ if (pids_over_limit != pids)
+ cgroup_file_notify(&pids_over_limit->events_file);
}
return err;
}
@@ -341,7 +358,16 @@ static int pids_events_show(struct seq_file *sf, void *v)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
- seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events[PIDCG_MAX]));
+ seq_printf(sf, "max.imposed %lld\n", (s64)atomic64_read(&pids->events[PIDCG_MAX_IMPOSED]));
+ return 0;
+}
+
+/* Show handler for the legacy (v1) "events" file: one "max <count>" line fed by PIDCG_MAX_IMPOSED. */
+static int pids_events_show_legacy(struct seq_file *sf, void *v)
+{
+ struct pids_cgroup *pids = css_pids(seq_css(sf));
+
+ /* Keep the space between key and value, as the pre-patch "max %lld" output had. */
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events[PIDCG_MAX_IMPOSED]));
return 0;
}
@@ -371,6 +397,27 @@ static struct cftype pids_files[] = {
{ } /* terminate */
};
+static struct cftype pids_files_legacy[] = { /* table assigned to .legacy_cftypes in pids_cgrp_subsys */
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "events",
+ .seq_show = pids_events_show_legacy, /* prints only the single legacy "max" counter */
+ .file_offset = offsetof(struct pids_cgroup, events_file), /* handle passed to cgroup_file_notify() */
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
struct cgroup_subsys pids_cgrp_subsys = {
.css_alloc = pids_css_alloc,
.css_free = pids_css_free,
@@ -379,7 +426,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
.release = pids_release,
- .legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
+ .legacy_cftypes = pids_files_legacy,
.threaded = true,
};
--
2.44.0
Hello,
On Fri, Apr 05, 2024 at 07:05:41PM +0200, Michal Koutný wrote:
> Currently, when pids.max limit is breached in the hierarchy, the event
> is counted and reported in the cgroup where the forking task resides.
>
> This decouples the limit and the notification caused by the limit making
> it hard to detect when the actual limit was effected.
>
> Let's introduce new events:
> max
> The number of times the limit of the cgroup was hit.
>
> max.imposed
> The number of times fork failed in the cgroup because of self
> or ancestor limit.
The whole series makes sense to me. I'm not sure about max.imposed field
name. Maybe a name which clearly signifies rejection of forks would be
clearer? Johannes, what do you think?
Thanks.
--
tejun
On Mon, Apr 08, 2024 at 07:55:38AM -1000, Tejun Heo wrote:
> Hello,
>
> On Fri, Apr 05, 2024 at 07:05:41PM +0200, Michal Koutný wrote:
> > Currently, when pids.max limit is breached in the hierarchy, the event
> > is counted and reported in the cgroup where the forking task resides.
> >
> > This decouples the limit and the notification caused by the limit making
> > it hard to detect when the actual limit was effected.
> >
> > Let's introduce new events:
> > max
> > The number of times the limit of the cgroup was hit.
> >
> > max.imposed
> > The number of times fork failed in the cgroup because of self
> > or ancestor limit.
>
> The whole series makes sense to me. I'm not sure about max.imposed field
> name. Maybe a name which clearly signifies rejection of forks would be
> clearer? Johannes, what do you think?
The max event at the level where the limit is set (and up, for
hierarchical accounting) makes sense to me.
max.imposed is conceptually not entirely unprecedented, but something
we've tried to avoid. Usually the idea is that events correspond to
specific cgroup limitations at that level. Failures due to constraints
higher up could be from anything, including system-level shortages.
IOW, events are supposed to be more about "how many times did this
limit here trigger", and less about "how many times did something
happen to the tasks local to this group".
It's a bit arbitrary and not perfectly followed everywhere, but I
think there is value in trying to maintain that distinction, so that
somebody looking at those files doesn't have to rack their brains or
look up every counter in the docs to figure out what it's tracking.
It's at least true for the misc controller, and for most of memcg -
with the weird exception of the swap.max events which we've tried to
fix before...
For "things that are happening to the tasks in this group", would it
make more sense to have an e.g. pids.stat::forkfail instead?
(Or just not have that event at all? I'm not sure if it's actually
needed or whether you kept it only to maintain some form of the
information that is currently provided by the pr_info()).
On Mon, Apr 08, 2024 at 07:55:38AM -1000, Tejun Heo <[email protected]> wrote:
> The whole series makes sense to me.
Including the migration charging?
(Asking whether I should keep it stacked in v4 posting.)
Thanks,
Michal
On Fri, Apr 12, 2024 at 04:23:24PM +0200, Michal Koutný wrote:
> On Mon, Apr 08, 2024 at 07:55:38AM -1000, Tejun Heo <[email protected]> wrote:
> > The whole series makes sense to me.
>
> Including the migration charging?
> (Asking whether I should keep it stacked in v4 posting.)
Oh, let's separate that part out. I'm not sure about that. The problem with
can_attach failures is that they're really opaque and the more we do it the
less we'll be able to tell where the failures are coming from, so I'm not
very enthusiastic about them.
Thanks.
--
tejun