This patch introduces a new setting called "fork_remaining". When
positive, each successful fork decrements the value, and once it
reaches zero, no further forking is allowed, no matter how many of
those processes are still alive. The special value "unlimited"
disables the fork limit.
The goal of this limit is to have another safeguard against fork
bombs. It gives processes a chance to set up their child processes /
threads, but will be stopped once they attempt to waste resources by
continuously exiting and cloning new processes. This can be useful
for short-lived processes such as CGI programs.
This is a resubmission; my first attempt to get this feature merged
was as a separate cgroup controller called "fork", but the idea was
rejected (http://thread.gmane.org/gmane.linux.kernel/1210878). This
time, I'm trying to get this feature into the new "pids" controller,
which implements a similar idea.
Signed-off-by: Max Kellermann <[email protected]>
---
Documentation/cgroups/pids.txt | 31 ++++++++++
kernel/cgroup_pids.c | 123 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 153 insertions(+), 1 deletion(-)
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroups/pids.txt
index 1a078b5..c5707bc 100644
--- a/Documentation/cgroups/pids.txt
+++ b/Documentation/cgroups/pids.txt
@@ -83,3 +83,34 @@ sh: fork: Resource temporary unavailable
# /bin/echo "We can't even spawn a single process now."
sh: fork: Resource temporary unavailable
#
+
+Fork Limit
+----------
+
+Apart from limiting the total number of processes in a cgroup, the
+`pids` controller can also limit the number of fork()/clone() calls,
+no matter how many of those processes are still alive. That setting
+is controlled by "pids.fork_remaining". The default value is
+"unlimited", and it can be set to any non-negative integer. Each
+successful fork()/clone() decrements the counter, until it hits zero.
+At this point, further fork()/clone() fail.
+
+Example:
+
+# mkdir /sys/fs/cgroup/pids/parent
+# echo 2 > /sys/fs/cgroup/pids/parent/pids.fork_remaining
+# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+# cat /sys/fs/cgroup/pids/parent/pids.fork_remaining
+1
+# cat /sys/fs/cgroup/pids/parent/pids.fork_remaining
+0
+# cat /sys/fs/cgroup/pids/parent/pids.fork_remaining
+sh: fork: Resource temporary unavailable
+
+Note that the first `cat` returns "1"; that is because at this point,
+the counter has already been decremented by launching `cat` inside
+that cgroup.
+
+To lift the limit, write "unlimited" to "pids.fork_remaining":
+
+# echo unlimited > /sys/fs/cgroup/pids/parent/pids.fork_remaining
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 806cd76..d902efb 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -40,6 +40,9 @@
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
+#define PIDS_UNLIMITED -1
+#define PIDS_UNLIMITED_STR "unlimited"
+
struct pids_cgroup {
struct cgroup_subsys_state css;
@@ -49,6 +52,12 @@ struct pids_cgroup {
*/
atomic64_t counter;
int64_t limit;
+
+ /**
+ * The remaining number of forks allowed. -1 is the magic
+ * value for "unlimited".
+ */
+ atomic_t fork_remaining;
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -72,6 +81,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
pids->limit = PIDS_MAX;
atomic64_set(&pids->counter, 0);
+ atomic_set(&pids->fork_remaining, -1);
return &pids->css;
}
@@ -162,6 +172,61 @@ revert:
return -EAGAIN;
}
+/**
+ * pids_cancel_fork_remaining - uncharge fork_remaining counter.
+ *
+ * Adds @n back to @pids->fork_remaining, unless the counter currently
+ * holds -1 (the "unlimited" marker), in which case it is left alone.
+ */
+static void pids_cancel_fork_remaining(struct pids_cgroup *pids, int n)
+{
+ atomic_add_unless(&pids->fork_remaining, n, -1);
+}
+
+/**
+ * pids_cancel_fork_remaining_until - uncharge fork_remaining counter,
+ * traversing the parent chain, until (not including) the given last
+ * one.
+ *
+ * @pids: first cgroup to re-credit
+ * @last: ancestor at which to stop; it is not modified (NULL walks the
+ *        whole chain)
+ * @n: number of forks to give back at each level
+ */
+static void pids_cancel_fork_remaining_until(struct pids_cgroup *pids,
+					     struct pids_cgroup *last, int n)
+{
+	/* Honour @n instead of a hard-coded 1 so callers get what they ask for. */
+	for (; pids != last; pids = parent_pids(pids))
+		pids_cancel_fork_remaining(pids, n);
+}
+
+/**
+ * pids_cancel_fork_remaining_all - uncharge fork_remaining counter,
+ * traversing the whole parent chain.
+ *
+ * Passes NULL as the stop point, so the walk covers @pids and every
+ * ancestor until parent_pids() returns NULL.
+ */
+static void pids_cancel_fork_remaining_all(struct pids_cgroup *pids, int n)
+{
+ pids_cancel_fork_remaining_until(pids, NULL, n);
+}
+
+/**
+ * pids_try_fork_remaining - check if forking is allowed according to
+ * fork_remaining, and decrement the fork_remaining counter.
+ *
+ * Walks from @pids up through its ancestors, decrementing each
+ * positive counter.  Returns 0 on success.  If a counter is already 0,
+ * the levels visited so far are re-credited and -EAGAIN is returned.
+ */
+static int pids_try_fork_remaining(struct pids_cgroup *pids)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p)) {
+ int new = atomic_dec_if_positive(&p->fork_remaining);
+
+ if (new == -1)
+ /*
+ * The old value was 0 which means we're not
+ * allowed to fork.
+ */
+ goto revert;
+ }
+
+ return 0;
+
+revert:
+ pids_cancel_fork_remaining_until(pids, p, 1);
+ return -EAGAIN;
+}
+
static int pids_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
@@ -220,10 +285,16 @@ static int pids_can_fork(struct task_struct *task, void **priv_p)
css = task_get_css(current, pids_cgrp_id);
pids = css_pids(css);
- err = pids_try_charge(pids, 1);
+ err = pids_try_fork_remaining(pids);
if (err)
goto err_css_put;
+ err = pids_try_charge(pids, 1);
+ if (err) {
+ pids_cancel_fork_remaining_all(pids, 1);
+ goto err_css_put;
+ }
+
*priv_p = css;
return 0;
@@ -237,6 +308,7 @@ static void pids_cancel_fork(struct task_struct *task, void *priv)
struct cgroup_subsys_state *css = priv;
struct pids_cgroup *pids = css_pids(css);
+ pids_cancel_fork_remaining_all(pids, 1);
pids_uncharge(pids, 1);
css_put(css);
}
@@ -327,6 +399,49 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
+/*
+ * pids_fork_remaining_write - handler for writes to "fork_remaining".
+ *
+ * Accepts either the string "unlimited" (stored as PIDS_UNLIMITED) or an
+ * integer in the range [0, INT_MAX].
+ *
+ * Returns @nbytes on success, the kstrtoll() error code if the input is
+ * not a valid number, or -EINVAL if the number is out of range.  The
+ * return type is ssize_t to match the cftype ->write() callback and to
+ * avoid truncating @nbytes.
+ */
+static ssize_t pids_fork_remaining_write(struct kernfs_open_file *of, char *buf,
+					 size_t nbytes, loff_t off)
+{
+	struct cgroup_subsys_state *css = of_css(of);
+	struct pids_cgroup *pids = css_pids(css);
+	int fork_remaining;
+	int64_t value;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, PIDS_UNLIMITED_STR)) {
+		fork_remaining = PIDS_UNLIMITED;
+	} else {
+		err = kstrtoll(buf, 0, &value);
+		if (err)
+			return err;
+
+		/* Negative values are reserved for the "unlimited" marker. */
+		if (value < 0 || value > INT_MAX)
+			return -EINVAL;
+
+		fork_remaining = (int)value;
+	}
+
+	atomic_set(&pids->fork_remaining, fork_remaining);
+	return nbytes;
+}
+
+/*
+ * Show handler for "fork_remaining": prints "unlimited" when the counter
+ * holds PIDS_UNLIMITED, otherwise the remaining count as a decimal number.
+ * Always returns 0.
+ */
+static int pids_fork_remaining_show(struct seq_file *sf, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(sf);
+ struct pids_cgroup *pids = css_pids(css);
+ int fork_remaining = atomic_read(&pids->fork_remaining);
+
+ if (fork_remaining == PIDS_UNLIMITED)
+ seq_printf(sf, "%s\n", PIDS_UNLIMITED_STR);
+ else
+ seq_printf(sf, "%d\n", fork_remaining);
+
+ return 0;
+}
+
static struct cftype pids_files[] = {
{
.name = "max",
@@ -338,6 +453,12 @@ static struct cftype pids_files[] = {
.name = "current",
.read_s64 = pids_current_read,
},
+ {
+ .name = "fork_remaining",
+ .write = pids_fork_remaining_write,
+ .seq_show = pids_fork_remaining_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{ } /* terminate */
};
On Tue, Nov 10, 2015 at 03:06:46PM +0100, Max Kellermann wrote:
> This patch introduces a new setting called "fork_remaining". When
> positive, each successful fork decrements the value, and once it
> reaches zero, no further forking is allowed, no matter how many of
> those processes are still alive. The special value "unlimited"
> disables the fork limit.
>
> The goal of this limit is to have another safeguard against fork
> bombs. It gives processes a chance to set up their child processes /
> threads, but will be stopped once they attempt to waste resources by
> continuously exiting and cloning new processes. This can be useful
> for short-lived processes such as CGI programs.
But what's the resource here? All first-order resources which can be
consumed by forking repeatedly already have proper controllers.
What's the point of adding an extra second-order controller? Where do
we go from there? Limit on the number of syscalls?
Thanks.
--
tejun
Hi Max,
I agree with what Tejun said, I just wanted to make a few specific points:
> This patch introduces a new setting called "fork_remaining". When
> positive, each successful fork decrements the value, and once it
> reaches zero, no further forking is allowed, no matter how many of
> those processes are still alive. The special value "unlimited"
> disables the fork limit.
I don't understand *why* you want this. cgroups are meant to deal with
the accounting of *legitimate resources*, and I don't see how "the
number of forks ever in a given cgroup" is a reasonable resource. Not
to mention that it doesn't make sense IMO to make a limit that gets
smaller over time when processes that have died and been freed no
longer consume anything.
> The goal of this limit is to have another safeguard against fork
> bombs. It gives processes a chance to set up their child processes /
> threads, but will be stopped once they attempt to waste resources by
> continuously exiting and cloning new processes. This can be useful
> for short-lived processes such as CGI programs.
Processes don't "use up resources" after they've died and been freed
(which is dealt with inside PIDs). Yes, lots of small processes that
die quickly could (in principle) make hard work for the scheduler, but
I don't see how "time spent scheduling in general" is a resource...
Fork bombs aren't bad because they cause a lot of fork()s, they're bad
because they *create a bunch of processes that use up memory*, which
happens because they call fork() a bunch of times and **don't
exit()**.
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
On 2015/11/10 16:12, Tejun Heo <[email protected]> wrote:
> On Tue, Nov 10, 2015 at 03:06:46PM +0100, Max Kellermann wrote:
> > This patch introduces a new setting called "fork_remaining". When
> > positive, each successful fork decrements the value, and once it
> > reaches zero, no further forking is allowed, no matter how many of
> > those processes are still alive. The special value "unlimited"
> > disables the fork limit.
> >
> > The goal of this limit is to have another safeguard against fork
> > bombs. It gives processes a chance to set up their child processes /
> > threads, but will be stopped once they attempt to waste resources by
> > continuously exiting and cloning new processes. This can be useful
> > for short-lived processes such as CGI programs.
>
> But what's the resource here?
CPU consumption and memory bandwidth. A fork/clone is an operation
that puts considerable load on a machine, most of which happens in
kernel space (copying page tables etc.).
> All first-order resources which can be consumed by forking
> repeatedly already have proper controllers.
They do?
I can limit CPU time with RLIMIT_CPU, but that's per-process and thus
completely useless. There's no cgroup controller with such a feature.
There's "cpu" which changes priority, "cpuset" selects CPUs, "cpuacct"
only does accounting and "freezer" (somewhat related). But nothing
that limits CPU usage according to configured rules.
I can limit absolute memory usage with memcg, which is a good thing,
but is orthogonal to this feature. Note that there are already
various RLIMITs about memory usage, and despite that, memcg was
merged due to RLIMIT shortcomings.
"pids" was merged even though there already was RLIMIT_NPROC. Again,
RLIMITs have their shortcomings.
But which controllers can I use to achieve the same effect as my fork
limit feature? Did I miss one?
> What's the point of adding an extra second-order controller?
I explained that, and you just cited my explanation.
> Where do we go from there? Limit on the number of syscalls?
No idea. Are these questions really relevant for my patch?
Max
Hello, Max.
On Tue, Nov 10, 2015 at 04:37:46PM +0100, Max Kellermann wrote:
> > But what's the resource here?
>
> CPU consumption and memory bandwidth. A fork/clone is an operation
Both are abstracted as CPU usage and controlled by the cpu controller.
> that puts considerable load on a machine, most of which happens in
> kernel space (copying page tables etc.).
>
> > All first-order resources which can be consumed by forking
> > repeatedly already have proper controllers.
>
> They do?
Yes.
> I can limit CPU time with RLIMIT_CPU, but that's per-process and thus
> completely useless. There's no cgroup controller with such a feature.
There's the cpu controller
> There's "cpu" which changes priority, "cpuset" selects CPUs, "cpuacct"
The cpu controller can limit both in terms of relative weight and
absolute CPU cycle bandwidth.
> only does accounting and "freezer" (somewhat related). But nothing
> that limits CPU usage according to configured rules.
>
> I can limit absolute memory usage with memcg, which is a good thing,
> but is orthogonal to this feature. Note that there are already
> various RLIMITs about memory usage, and despite that, memcg was
> merged due to RLIMIT shortcomings.
>
> "pids" was merged even though there already was RLIMIT_NPROC. Again,
> RLIMITs have their shortcomings.
Because pids turned out to be a first-order resource which is not
constrained by memory due to the limited pid space.
> But which controllers can I use to achieve the same effect as my fork
> limit feature? Did I miss one?
Apparently.
> > What's the point of adding an extra second-order controller?
>
> I explained that, and you just cited my explanation.
>
> > Where do we go from there? Limit on the number of syscalls?
>
> No idea. Are these questions really relevant for my patch?
Well, it's relevant to the fact that it's failing to distinguish what
are actual resources and what aren't.
Thanks.
--
tejun
On 2015-11-10 10:25, Aleksa Sarai wrote:
> Processes don't "use up resources" after they've died and been freed
> (which is dealt with inside PIDs). Yes, lots of small processes that
> die quickly could (in principle) make hard work for the scheduler, but
> I don't see how "time spent scheduling in general" is a resource...
> Fork bombs aren't bad because they cause a lot of fork()s, they're bad
> because the *create a bunch of processes that use up memory*, which
> happens because they call fork() a bunch of times and **don't
> exit()**.
While I'm indifferent about the patch, I would like to point out that
fork-bombs are also bad because they eat _a lot_ of processor time, and
I've seen ones designed to bring a system to its knees just by
saturating the processor with calls to fork() (which is as slow as or
slower than stat() on many commodity systems, setting up the various
structures for a new process is an expensive operation) and clogging up
the scheduler. This isn't as evident of course when you run a fork-bomb
on a laptop or similar system, because you run out of memory and PIDs
before the latency from scheduling and so many processes calling fork
really starts to become noticeable, but when you start to look at really
big systems (on the order of hundreds of GB of RAM), it does become much
more noticeable.
On 2015/11/10 16:25, Aleksa Sarai <[email protected]> wrote:
> > The goal of this limit is to have another safeguard against fork
> > bombs. It gives processes a chance to set up their child processes /
> > threads, but will be stopped once they attempt to waste resources by
> > continuously exiting and cloning new processes. This can be useful
> > for short-lived processes such as CGI programs.
>
> Processes don't "use up resources" after they've died and been freed
> (which is dealt with inside PIDs).
That is true, but misses the point.
At some point, while the fork was in progress, those processes did
consume a considerable amount of resources. At that very range of
time, the server was occupied with executing these forks, and was
unable to give CPU time to other processes.
Now if the kernel had stopped that fork bomb earlier, it would have
had more capacity to execute other jobs which are waiting in the
queue. That fork bomb did do its damage, even though the number of
processes was limited - and the goal of the fork limit feature is to
detect it early and stop it from spreading larger.
Some jobs are predictable in how many forks will happen. Just like
some jobs are predictable in how many processes there will be at a
time, how many open files it has at a time, how much memory it will
consume at a time. All those limits are useful.
That's the big difference: existing cgroups limit a given resource at
one point in time, while "fork limit" is a counter that expires after
a certain amount of resources is consumed (integrated over time). It
is about "consumption", not about "usage".
This is similar to RLIMIT_CPU, which does not rate-limit the CPU
usage, but the total amount of time spent executing.
> Fork bombs aren't bad because they cause a lot of fork()s, they're bad
> because the *create a bunch of processes that use up memory*, which
> happens because they call fork() a bunch of times and **don't
> exit()**.
That is partly true, but is just one side of the story.
The fork() calls themselves are expensive, and a process forking and
exiting over and over can put heavy load on your server. All within
"pids" and "memcg" limits.
The goal of my patch is to stop the fork bomb as early as possible,
with an additional limit that is reasonable, which no "good" job
implementation will need to cross.
I developed this feature long before cgroups were invented
(actually I developed something similar to cgroups/namespaces back
then). It has been proven very successful in a large CGI hosting
cluster. It's perfectly ok for me to maintain this in my private
branch forever ...
Max
On Tue, Nov 10, 2015 at 9:28 PM, Austin S Hemmelgarn
<[email protected]> wrote:
> On 2015-11-10 10:25, Aleksa Sarai wrote:
>>
>> Processes don't "use up resources" after they've died and been freed
>> (which is dealt with inside PIDs). Yes, lots of small processes that
>> die quickly could (in principle) make hard work for the scheduler, but
>> I don't see how "time spent scheduling in general" is a resource...
>> Fork bombs aren't bad because they cause a lot of fork()s, they're bad
>> because the *create a bunch of processes that use up memory*, which
>> happens because they call fork() a bunch of times and **don't
>> exit()**.
>
> While I'm indifferent about the patch, I would like to point out that
> fork-bombs are also bad because they eat _a lot_ of processor time, and I've
> seen ones designed to bring a system to it's knees just by saturating the
> processor with calls to fork() (which is as slow as or slower than stat() on
> many commodity systems, setting up the various structures for a new process
> is an expensive operation) and clogging up the scheduler.
Isn't the cpu cgroup helpful there to limit it?
Are you saying the time spent in the scheduler is large enough that it
actually affects the scheduling of other processes and threads?
If so, could you share a little more insight into how that time is
measured outside of the cpu cgroup's cycles? Just so that it's helpful
to a wider audience.
> This isn't as
> evident of course when you run a fork-bomb on a laptop or similar system,
> because you run out of memory and PID's before the latency from scheduling
> and so many processes calling fork really starts to become noticeable, but
> when you start to look at really big systems (on the order of hundreds of GB
> of RAM), it does become much more noticeable.
>
On 2015/11/10 16:44, Tejun Heo <[email protected]> wrote:
> On Tue, Nov 10, 2015 at 04:37:46PM +0100, Max Kellermann wrote:
> > There's "cpu" which changes priority
>
> The cpu controller can limit both in terms of relative weight and
> absolute CPU cycle bandwidth.
No, Tejun, the "cpu" controller does not do what my feature does: like
I said, it only changes the priority, or let's rephrase (to account
for the "absolute CPU cycle bandwidth" thing): it changes the amount of
CPU cycles a process gets every period.
But it does NOT put an upper limit on total consumed CPU cycles! It
will only slow down a frantic process, but it will not stop it.
Stopping it is what I want. Once a process crosses the limits I
configured, there's no point in keeping it running.
You may disagree that the feature I implemented is useful, and you may
not want it merged, but do not say that I missed a kernel feature,
because that's not true.
The Linux kernel currently does not have a feature that can emulate
the fork limit that I implemented. Useful or not, it doesn't exist.
Max
On Tue, Nov 10, 2015 at 06:06:12PM +0100, Max Kellermann wrote:
> No, Tejun, the "cpu" controller does not do what my feature does: like
> I said, it only changes the priority, or let's rephrase (to account
> for the "absolute CPU cycle bandwith" thing): it changes the amount of
> CPU cycles a process gets every period.
>
> But it does NOT put an upper limit on total consumed CPU cycles! It
> will only slow down a frantic process, but it will not stop it.
> Stopping it is what I want. Once process crosses the limits I
> configured, there's no point in keeping it running.
It's not a stateful resource. Of course the resource is controlled in
terms of bandwidth, not absolute amount consumed. That's what we do
with all stateless resources. It's absurd to limit the absolute amount
of CPU cycles. The only action possible from there on would be
terminating the group. If you wanna do that, do so from userspace.
> You may disagree that the feature I implemented is useful, and you may
> not want it merged, but do not say that I missed a kernel feature,
> because that's not true.
>
> The Linux kernel currently does not have a feature that can emulate
> the fork limit that I implemented. Useful or not, it doesn't exist.
The point is that the missing "feature" is really a non-starter. What
if the process falls into infinite loop on fork failures? It's just a
silly thing to implement.
Thanks.
--
tejun
On 2015/11/10 18:29, Tejun Heo <[email protected]> wrote:
> It's not a stateful resource. Of course the resource is controlled in
> terms of bandwidth not absoulte amount consumed.
I'm glad we now agree on the basic facts.
> It's absurd to limit absoulte amount for CPU cycles.
And yet there's an "absurd" feature called RLIMIT_CPU.
It's absurd because it's per-process. Not because the general idea is
absurd. The idea is good, and I wish the "cpu" or "cpuacct"
controller had such a knob. But that's just my opinion.
> The only action possible from there on would be terminating the
> group. If you wanna do that, do so from userspace.
The kernel already has a documented solution: SIGXCPU and SIGKILL
(already implemented for RLIMIT_CPU).
By the way, I'm not saying RLIMIT_CPU solves my problem - not at all!
I was just explaining why your suggestions don't solve my problem.
> The point is that the missing "feature" is really a non-starter. What
> if the process falls into infinite loop on fork failures? It's just a
> silly thing to implement.
Again, you're resorting to useless rhetorical questions to argue why
a feature is silly.
No, the feature is not silly just because it doesn't solve all
problems at once (which is what your rhetorical question implies).
You need other measures to account for endless loops (be it hostile or
out of stupidity). We do have such measures in our kernel fork.
Other kernel resource limits don't solve all corner cases, but they
were merged anyway.
For example, I can limit I/O and network bandwidth, but I can still
easily stall the whole kernel because the NFS client keeps inode
mutexes locked while waiting for the server, stalling the shrinker,
stalling everything else waiting for the shrinker. That is a real
problem for us. But the existence of that problem doesn't make the
net_prio controller bad - it's just one corner case no controller is
currently able to catch. (In this example, the root cause is bad
kernel code, not a frantic userspace process. But I hope you get the
point.)
To solve problems with frantic processes, I need more tools, not lame
excuses. My "fork limit" patch is one tool that has proven to be very
useful. Maybe one day somebody has a better idea to solve my problem,
but what you said does not.
Max
On 2015-11-10 11:19, Parav Pandit wrote:
> On Tue, Nov 10, 2015 at 9:28 PM, Austin S Hemmelgarn
> <[email protected]> wrote:
>> On 2015-11-10 10:25, Aleksa Sarai wrote:
>>>
>>> Processes don't "use up resources" after they've died and been freed
>>> (which is dealt with inside PIDs). Yes, lots of small processes that
>>> die quickly could (in principle) make hard work for the scheduler, but
>>> I don't see how "time spent scheduling in general" is a resource...
>>> Fork bombs aren't bad because they cause a lot of fork()s, they're bad
>>> because the *create a bunch of processes that use up memory*, which
>>> happens because they call fork() a bunch of times and **don't
>>> exit()**.
>>
>> While I'm indifferent about the patch, I would like to point out that
>> fork-bombs are also bad because they eat _a lot_ of processor time, and I've
>> seen ones designed to bring a system to it's knees just by saturating the
>> processor with calls to fork() (which is as slow as or slower than stat() on
>> many commodity systems, setting up the various structures for a new process
>> is an expensive operation) and clogging up the scheduler.
>
> Isn't cpu cgroup helpful there to limit it?
Possibly, I don't know the specifics of how it handles stuff executing
in a context technically outside of a process on behalf of that process.
I'm almost 100% certain that there is no sane way it can account and
limit time spent in the scheduler because a process is spawning lots of
children.
> Are you saying time spent by scheduler is more that actually affects
> the scheduling of processes of other threads?
In some cases yes, although this is very dependent on the system itself
(for example, if you have a really low /proc/sys/kernel/pid_max, it will
never be an issue, but that will also make it easier for a fork-bomb to
make your system unusable). The scheduler on Linux is comparatively fast for
how feature rich it is, but it still slows down as you have more and
more processes to schedule. If you have a lot of RAM proportionate to
your processing power (as in, multiple GB on a processor running at only
a few MHz, and yes such systems do exist), then the scheduling overhead
is much more significant than the memory overhead. Even without such a
situation, it's fully possible to weigh down the system with overhead
from the kernel. As an example, on a Raspberry Pi (single core 700MHz
ARM11stj-z CPU, 512MB of RAM), you can spawn a few hundred processes
each just sitting on an interval timer set so that every time the
scheduler runs, at least 10% of them are runnable (and I've seen
fork-bombs that do this), and you will render the system unusable not
because of memory consumption, but the scheduling and timer overhead.
> If so, could you share little more insight on how that time measure
> outside of the cpu's cgroup cycles? Just so that its helpful to wider
> audience.
Well, there are a number of things that I can think of that the kernel
does on behalf of processes that can consume processor time that isn't
trivial to account:
* Updating timers on behalf of userspace processes (itimers or similar).
* Sending certain kernel generated signals to processes (that is,
stuff generated by the kernel like SIGFPE, SIGSEGV, and so forth).
* Queuing events from dnotify/inotify/fanotify.
* TLB misses, page faults, and swapping.
* Setting up new processes prior to them actually running.
* Scheduling.
All of these are things that fork-bombs can and (other than TLB misses)
do exploit to bring a system down, and the cpu cgroup is by no means a
magic bullet to handle this.
>> If so, could you share little more insight on how that time measure
>> outside of the cpu's cgroup cycles? Just so that its helpful to wider
>> audience.
>
> Well, there are a number of things that I can think of that the kernel does
> on behalf of processes that can consume processor time that isn't trivial to
> account:
> * Updating timers on behalf of userspace processes (itimers or similar).
> * Sending certain kernel generated signals to processes (that is, stuff
> generated by the kernel like SIGFPE, SIGSEGV, and so forth).
> * Queuing events from dnotify/inotify/fanotify.
> * TLB misses, page faults, and swapping.
> * Setting up new processes prior to them actually running.
> * Scheduling.
> All of these are things that fork-bombs can and (other than TLB misses) do
> exploit to bring a system down, and the cpu cgroup is by no means a magic
> bullet to handle this.
I feel like these are backed by different resources, and we should
work on limiting those *at the source* in the context of a controller
rather than just patching up the symptoms (too many forks causing
issues), because these are symptoms of a larger issue IMO.
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
On 2015-11-15 08:36, Aleksa Sarai wrote:
>>> If so, could you share little more insight on how that time measure
>>> outside of the cpu's cgroup cycles? Just so that its helpful to wider
>>> audience.
>>
>> Well, there are a number of things that I can think of that the kernel does
>> on behalf of processes that can consume processor time that isn't trivial to
>> account:
>> * Updating timers on behalf of userspace processes (itimers or similar).
>> * Sending certain kernel generated signals to processes (that is, stuff
>> generated by the kernel like SIGFPE, SIGSEGV, and so forth).
>> * Queuing events from dnotify/inotify/fanotify.
>> * TLB misses, page faults, and swapping.
>> * Setting up new processes prior to them actually running.
>> * Scheduling.
>> All of these are things that fork-bombs can and (other than TLB misses) do
>> exploit to bring a system down, and the cpu cgroup is by no means a magic
>> bullet to handle this.
>
> I feel like these are backed by different resources, and we should
> work on limiting those *at the source* in the context of a controller
> rather than just patching up the symptoms (too many forks causing
> issues), because these are symptoms of a larger issue IMO.
OK, what specific resources back each of the things that I mentioned?
Other than setting up a new process (which in retrospect I realize
should probably just be accounted as processor time for the parent), I
can't really see much that most of these are backed by, other than
processor time (and until someone demonstrates otherwise, I stand by my
statement that they are non-trivial to account properly as processor time).