2014-06-17 17:28:46

by Jack Miller

Subject: [RESEND] shm: shm exit scalability fixes

[ RESEND note: Adding relevant CCs, fixed a couple of typos in commit message,
patches unchanged. Original intro follows. ]

All -

This is a small set of patches our team has had kicking around internally
for a few versions; it fixes tasks getting hung in exit_shm when there are
many threads hammering it at once.

Anton wrote a simple test to cause the issue:

http://ozlabs.org/~anton/junkcode/bust_shm_exit.c
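
For reference, the test follows roughly this pattern (a minimal sketch
written from the description in this thread, not Anton's actual code; the
argv convention of segment count then thread count is assumed from the
invocations quoted below):

/*
 * Sketch only, NOT the real bust_shm_exit.c: create many SysV shm
 * segments up front, then continuously spawn short-lived threads.
 * Every thread exit runs exit_shm(), which (before this patchset)
 * walks every segment in the namespace under a write lock.
 * Build with: gcc -pthread sketch.c
 */
#include <pthread.h>
#include <stdlib.h>
#include <sys/ipc.h>
#include <sys/shm.h>

static void *thread_fn(void *arg)
{
	(void)arg;
	return NULL;	/* exit immediately; the cost is in exit_shm() */
}

int main(int argc, char *argv[])
{
	int nsegs = argc > 1 ? atoi(argv[1]) : 1024;
	int nthreads = argc > 2 ? atoi(argv[2]) : 160;
	pthread_t *t = calloc(nthreads, sizeof(*t));
	int i;

	/* Fill the namespace with segments for exit_shm() to walk. */
	for (i = 0; i < nsegs; i++)
		shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	for (;;) {
		for (i = 0; i < nthreads; i++)
			pthread_create(&t[i], NULL, thread_fn, NULL);
		for (i = 0; i < nthreads; i++)
			pthread_join(t[i], NULL);
	}
}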

Before applying this patchset, this test code will cause either hung task
tracebacks or pthread out-of-memory errors.

After this patchset, it will still produce output like:

root@somehost:~# ./bust_shm_exit 1024 160
...
INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 116, t=2111
jiffies, g=241, c=240, q=7113)
INFO: Stall ended before state dump start
...
But the task will continue to run along happily, so we consider this an
improvement over hanging, even if it's a bit noisy.

I didn't author these patches, but I'd be happy to take any feedback and
address any issues.

- Jack


2014-06-17 17:28:45

by Jack Miller

Subject: [PATCH 1/3] shm: Make exit_shm work proportional to task activity

exit_shm obtains the ipc_ns shm rwsem for write and holds it while
it walks every shared memory segment in the namespace. Thus the
amount of work is related to the number of shm segments in the
namespace, not the number of segments that might need to be cleaned up.

In addition, this occurs after the task has been notified that the
thread has exited, so the number of tasks waiting for the ns shm
rwsem can grow without bound until memory is exhausted.

Add a list to the task struct of all shmids allocated by this task.
Init the list head in copy_process. Use the ns->rwsem for locking.
Add segments to the list after the id is added, and remove them from
the list before removing the id.

On unshare of CLONE_NEWIPC, orphan any ids as if the task had exited,
similar to the handling of semaphore undo.

I chose a define for the init sequence since it's a simple list init;
otherwise it would require a function call to avoid include loops
between the shm code and the task struct. Converting the
list_del to list_del_init for the unshare case would remove the
exit-followed-by-init, but I left it so it blows up if not inited.

Signed-off-by: Milton Miller <[email protected]>
Signed-off-by: Jack Miller <[email protected]>
---
include/linux/sched.h | 2 ++
include/linux/shm.h | 16 +++++++++++++++-
ipc/shm.c | 22 +++++++++++-----------
kernel/fork.c | 6 ++++++
4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 221b2bd..4833ecf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -33,6 +33,7 @@ struct sched_param {

#include <linux/smp.h>
#include <linux/sem.h>
+#include <linux/shm.h>
#include <linux/signal.h>
#include <linux/compiler.h>
#include <linux/completion.h>
@@ -1344,6 +1345,7 @@ struct task_struct {
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
+ struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
diff --git a/include/linux/shm.h b/include/linux/shm.h
index 1e2cd2e..38a70a2 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -1,6 +1,7 @@
#ifndef _LINUX_SHM_H_
#define _LINUX_SHM_H_

+#include <linux/list.h>
#include <asm/page.h>
#include <uapi/linux/shm.h>

@@ -21,6 +22,7 @@ struct shmid_kernel /* private to the kernel */

/* The task created the shm object. NULL if the task is dead. */
struct task_struct *shm_creator;
+ struct list_head shm_clist; /* list by creator */
};

/* shm_mode upper byte flags */
@@ -45,11 +47,20 @@ struct shmid_kernel /* private to the kernel */
#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)

#ifdef CONFIG_SYSVIPC
+struct sysv_shm {
+ struct list_head shm_clist;
+};
+
long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
unsigned long shmlba);
extern int is_file_shm_hugepages(struct file *file);
-extern void exit_shm(struct task_struct *task);
+void exit_shm(struct task_struct *task);
+#define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist)
#else
+struct sysv_shm {
+ /* empty */
+};
+
static inline long do_shmat(int shmid, char __user *shmaddr,
int shmflg, unsigned long *addr,
unsigned long shmlba)
@@ -63,6 +74,9 @@ static inline int is_file_shm_hugepages(struct file *file)
static inline void exit_shm(struct task_struct *task)
{
}
+static inline void shm_init_task(struct task_struct *task)
+{
+}
#endif

#endif /* _LINUX_SHM_H_ */
diff --git a/ipc/shm.c b/ipc/shm.c
index 7645961..9790a0e 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -178,6 +178,7 @@ static void shm_rcu_free(struct rcu_head *head)

static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
{
+ list_del(&s->shm_clist);
ipc_rmid(&shm_ids(ns), &s->shm_perm);
}

@@ -268,14 +269,10 @@ static void shm_close(struct vm_area_struct *vma)
}

/* Called with ns->shm_ids(ns).rwsem locked */
-static int shm_try_destroy_current(int id, void *p, void *data)
+static void shm_mark_orphan(struct shmid_kernel *shp, struct ipc_namespace *ns)
{
- struct ipc_namespace *ns = data;
- struct kern_ipc_perm *ipcp = p;
- struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
-
- if (shp->shm_creator != current)
- return 0;
+ if (WARN_ON(shp->shm_creator != current)) /* Remove me when it works */
+ return;

/*
* Mark it as orphaned to destroy the segment when
@@ -289,13 +286,12 @@ static int shm_try_destroy_current(int id, void *p, void *data)
* is not set, it shouldn't be deleted here.
*/
if (!ns->shm_rmid_forced)
- return 0;
+ return;

if (shm_may_destroy(ns, shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}
- return 0;
}

/* Called with ns->shm_ids(ns).rwsem locked */
@@ -333,14 +329,17 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
void exit_shm(struct task_struct *task)
{
struct ipc_namespace *ns = task->nsproxy->ipc_ns;
+ struct shmid_kernel *shp, *n;

if (shm_ids(ns).in_use == 0)
return;

/* Destroy all already created segments, but not mapped yet */
down_write(&shm_ids(ns).rwsem);
- if (shm_ids(ns).in_use)
- idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
+ list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist)
+ shm_mark_orphan(shp, ns);
+ /* remove the list head from any segments still attached */
+ list_del(&task->sysvshm.shm_clist);
up_write(&shm_ids(ns).rwsem);
}

@@ -557,6 +556,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
shp->shm_nattch = 0;
shp->shm_file = file;
shp->shm_creator = current;
+ list_add(&shp->shm_clist, &current->sysvshm.shm_clist);

/*
* shmid gets reported as "inode#" in /proc/pid/maps.
diff --git a/kernel/fork.c b/kernel/fork.c
index 54a8d26..5301efb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1328,6 +1328,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (retval)
goto bad_fork_cleanup_policy;
/* copy all the process information */
+ shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
@@ -1867,6 +1868,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
*/
exit_sem(current);
}
+ if (unshare_flags & CLONE_NEWIPC) {
+ /* Orphan segments in old ns (see sem above). */
+ exit_shm(current);
+ shm_init_task(current);
+ }

if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
--
1.9.1

2014-06-17 17:29:48

by Jack Miller

Subject: [PATCH 2/3] shm: Allow exit_shm in parallel if only marking orphans

If shm_rmid_forced is not set (the default), then the shmids are only
marked as orphaned; this does not require any add, delete, or locking of
the tree structure.

Separate the sysctl on and off cases, and only obtain the read lock in the
off case. The newly added list head can be deleted under the read lock
because we are only called with current and will only change the shmids
allocated by this task and not manipulate the list.

This commit assumes that up_read includes a sufficient memory barrier for
the writes to be seen by others that later obtain a write lock.

Signed-off-by: Milton Miller <[email protected]>
Signed-off-by: Jack Miller <[email protected]>
---
ipc/shm.c | 67 +++++++++++++++++++++++++++++++++------------------------------
1 file changed, 35 insertions(+), 32 deletions(-)

diff --git a/ipc/shm.c b/ipc/shm.c
index 9790a0e..0dde176 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -269,32 +269,6 @@ static void shm_close(struct vm_area_struct *vma)
}

/* Called with ns->shm_ids(ns).rwsem locked */
-static void shm_mark_orphan(struct shmid_kernel *shp, struct ipc_namespace *ns)
-{
- if (WARN_ON(shp->shm_creator != current)) /* Remove me when it works */
- return;
-
- /*
- * Mark it as orphaned to destroy the segment when
- * kernel.shm_rmid_forced is changed.
- * It is noop if the following shm_may_destroy() returns true.
- */
- shp->shm_creator = NULL;
-
- /*
- * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID
- * is not set, it shouldn't be deleted here.
- */
- if (!ns->shm_rmid_forced)
- return;
-
- if (shm_may_destroy(ns, shp)) {
- shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
- }
-}
-
-/* Called with ns->shm_ids(ns).rwsem locked */
static int shm_try_destroy_orphaned(int id, void *p, void *data)
{
struct ipc_namespace *ns = data;
@@ -325,20 +299,49 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
up_write(&shm_ids(ns).rwsem);
}

-
+/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
struct ipc_namespace *ns = task->nsproxy->ipc_ns;
struct shmid_kernel *shp, *n;

- if (shm_ids(ns).in_use == 0)
+ if (list_empty(&task->sysvshm.shm_clist))
+ return;
+
+ /*
+ * If kernel.shm_rmid_forced is not set then only keep track of
+ * which shmids are orphaned, so that a later set of the sysctl
+ * can clean them up.
+ */
+ if (!ns->shm_rmid_forced) {
+ down_read(&shm_ids(ns).rwsem);
+ list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
+ shp->shm_creator = NULL;
+ /*
+ * Only under read lock but we are only called on current
+ * so no entry on the list will be shared.
+ */
+ list_del(&task->sysvshm.shm_clist);
+ up_read(&shm_ids(ns).rwsem);
return;
+ }

- /* Destroy all already created segments, but not mapped yet */
+ /*
+ * Destroy all already created segments, that were not yet mapped,
+ * and mark any mapped as orphan to cover the sysctl toggling.
+ * Destroy is skipped if shm_may_destroy() returns false.
+ */
down_write(&shm_ids(ns).rwsem);
- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist)
- shm_mark_orphan(shp, ns);
- /* remove the list head from any segments still attached */
+ list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
+ shp->shm_creator = NULL;
+
+ if (shm_may_destroy(ns, shp)) {
+ shm_lock_by_ptr(shp);
+ shm_destroy(ns, shp);
+ }
+ }
+
+ /* Remove the list head from any segments still attached. */
list_del(&task->sysvshm.shm_clist);
up_write(&shm_ids(ns).rwsem);
}
--
1.9.1

2014-06-17 17:32:33

by Jack Miller

Subject: [PATCH 3/3] shm: Remove unneeded extern for function

A small cleanup while changing adjacent code. Extern is not needed
for function declarations, and only one declaration had it, so remove
it from the odd line out.

Signed-off-by: Milton Miller <[email protected]>
Signed-off-by: Jack Miller <[email protected]>
---
include/linux/shm.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/shm.h b/include/linux/shm.h
index 38a70a2..e0daca0 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -53,7 +53,7 @@ struct sysv_shm {

long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
unsigned long shmlba);
-extern int is_file_shm_hugepages(struct file *file);
+int is_file_shm_hugepages(struct file *file);
void exit_shm(struct task_struct *task);
#define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist)
#else
--
1.9.1

2014-06-17 17:48:36

by Davidlohr Bueso

Subject: Re: [RESEND] shm: shm exit scalability fixes

On Tue, 2014-06-17 at 12:27 -0500, Jack Miller wrote:
> [ RESEND note: Adding relevant CCs, fixed a couple of typos in commit message,
> patches unchanged. Original intro follows. ]
>
> All -
>
> This is a small set of patches our team has had kicking around internally
> for a few versions; it fixes tasks getting hung in exit_shm when there are
> many threads hammering it at once.
>
> Anton wrote a simple test to cause the issue:
>
> http://ozlabs.org/~anton/junkcode/bust_shm_exit.c

I'm actually in the process of adding shm microbenchmarks to perf-bench
so I might steal this :-)

>
> Before applying this patchset, this test code will cause either hung task
> tracebacks or pthread out-of-memory errors.

Are you seeing this issue in any real world setups? While the program
does stress the path you mention quite well, I fear it is very
unrealistic... how many shared mem segments do real applications
actually use/create for scaling issues to appear?

I normally wouldn't mind optimizing synthetic cases like this, but a
quick look at patch 1/3 shows that we're adding an extra overhead (16
bytes) in the task_struct.

In any case, I will take a closer look at the set.

Thanks,
Davidlohr

2014-06-17 19:56:11

by Jack Miller

Subject: Re: [RESEND] shm: shm exit scalability fixes

On Tue, Jun 17, 2014 at 10:48:32AM -0700, Davidlohr Bueso wrote:
> On Tue, 2014-06-17 at 12:27 -0500, Jack Miller wrote:
> > [ RESEND note: Adding relevant CCs, fixed a couple of typos in commit message,
> > patches unchanged. Original intro follows. ]
> >
> > All -
> >
> > This is a small set of patches our team has had kicking around internally
> > for a few versions; it fixes tasks getting hung in exit_shm when there are
> > many threads hammering it at once.
> >
> > Anton wrote a simple test to cause the issue:
> >
> > http://ozlabs.org/~anton/junkcode/bust_shm_exit.c
>
> I'm actually in the process of adding shm microbenchmarks to perf-bench
> so I might steal this :-)
>

Cool!

> >
> > Before applying this patchset, this test code will cause either hung task
> > tracebacks or pthread out-of-memory errors.
>
> Are you seeing this issue in any real world setups? While the program
> does stress the path you mention quite well, I fear it is very
> unrealistic... how many shared mem segments do real applications
> actually use/create for scaling issues to appear?

We've seen this while running multiple workloads on the same machine: one
workload that used shared memory extensively, and one that created many
short-lived threads. The testcase is just simulating these two workloads
running simultaneously, so I don't think it's too unreasonable to expect it
could happen in the wild.

Even if this is synthetic, the testcase could also be seen as proof of an
unprivileged denial of service, as an arbitrary user could run bust_shm_exit
and subsequently start overloading the system.

>
> I normally wouldn't mind optimizing synthetic cases like this, but a
> quick look at patch 1/3 shows that we're adding an extra overhead (16
> bytes) in the task_struct.

Yeah, that's definitely not to be done lightly, but I think it's worth it to
make the work on exit proportional to the actual task usage instead of the
number of segments in the namespace.

>
> In any case, I will take a closer look at the set.

Thanks! I'd appreciate any feedback.

- Jack

2014-06-18 00:15:48

by Davidlohr Bueso

Subject: Re: [RESEND] shm: shm exit scalability fixes

On Tue, 2014-06-17 at 14:55 -0500, Jack Miller wrote:
> Even if this is synthetic, the testcase could also be seen as proof of an
> unprivileged denial of service as an arbitrary user could run bust_shm_exit
> and subsequently start overloading the system.

We have the shmmni limit (and friends) for that.

Thanks,
Davidlohr

2014-06-18 02:53:37

by Anton Blanchard

Subject: Re: [RESEND] shm: shm exit scalability fixes

Hi David,

> > Anton wrote a simple test to cause the issue:
> >
> > http://ozlabs.org/~anton/junkcode/bust_shm_exit.c
>
> I'm actually in the process of adding shm microbenchmarks to
> perf-bench so I might steal this :-)

Sounds good!

> Are you seeing this issue in any real world setups? While the program
> does stress the path you mention quite well, I fear it is very
> unrealistic... how many shared mem segments do real applications
> actually use/create for scaling issues to appear?

As Jack mentioned, we were asked to debug a box that was crawling. Each
process took over 10 minutes to execute, which made it very hard to
analyse. We eventually narrowed it down to this.

> I normally wouldn't mind optimizing synthetic cases like this, but a
> quick look at patch 1/3 shows that we're adding an extra overhead (16
> bytes) in the task_struct.

The testcase is synthetic but I wrote it based on the application that
would, given enough time, take the box down.

> We have the shmmni limit (and friends) for that.

If we want to use this to guard against the problem, we may need to
drop shmmni. Looking at my notes, I could take down a box with 4096
segments and 16 threads. This is where I got to before the box disappeared:

# ./bust_shm_exit 4096 16
# uptime
03:00:50 up 8 days, 18:05, 5 users, load average: 6076.98, 2494.09, 910.37

Anton

2014-06-18 04:41:32

by Michael Ellerman

Subject: Re: [RESEND] shm: shm exit scalability fixes

On Wed, 2014-06-18 at 12:53 +1000, Anton Blanchard wrote:
> > I normally wouldn't mind optimizing synthetic cases like this, but a
> > quick look at patch 1/3 shows that we're adding an extra overhead (16
> > bytes) in the task_struct.
>
> > We have the shmmni limit (and friends) for that.
>
> If we want to use this to guard against the problem, we may need to
> drop shmmni. Looking at my notes, I could take down a box with 4096
> segments and 16 threads. This is where I got to before it disappeared:
>
> # ./bust_shm_exit 4096 16
> # uptime
> 03:00:50 up 8 days, 18:05 5 users,load average: 6076.98, 2494.09, 910.37

I win, using 4096 segments and 16 threads:

# uptime
13:50:46 up 1 day, 19:41, 2 users, load average: 7621.57, 1718.39, 943.13
13:52:35 up 1 day, 19:43, 2 users, load average: 15422.64, 7409.90, 3156.82

That's on a 16 cpu box running 3.16-rc1.

In contrast, if you run it with 1 segment and 16 threads, it maxes out at about:

# uptime
13:58:00 up 1 min, 2 users, load average: 1.81, 0.46, 0.15

And the box is entirely responsive.

cheers

2014-06-18 22:55:52

by Andrew Morton

Subject: Re: [RESEND] shm: shm exit scalability fixes

On Tue, 17 Jun 2014 12:27:44 -0500 Jack Miller <[email protected]> wrote:

> [ RESEND note: Adding relevant CCs, fixed a couple of typos in commit message,
> patches unchanged. Original intro follows. ]
>
> All -
>
> This is a small set of patches our team has had kicking around internally
> for a few versions; it fixes tasks getting hung in exit_shm when there are
> many threads hammering it at once.
>
> Anton wrote a simple test to cause the issue:
>
> http://ozlabs.org/~anton/junkcode/bust_shm_exit.c
>
> Before applying this patchset, this test code will cause either hung task
> tracebacks or pthread out-of-memory errors.
>
> After this patchset, it will still produce output like:
>
> root@somehost:~# ./bust_shm_exit 1024 160
> ...
> INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 116, t=2111
> jiffies, g=241, c=240, q=7113)
> INFO: Stall ended before state dump start
> ...
> But the task will continue to run along happily, so we consider this an
> improvement over hanging, even if it's a bit noisy.
>
> I didn't author these patches, but I'd be happy to take any feedback and
> address any issues.

As it stands, these patches will be recorded as authored by yourself.
You can override this by putting

From: Someone Else <email-address>

at the start of the changelog. Please let me know if you'd like me to
make such changes.


I agree that adding 8/16 bytes to the task_struct (if CONFIG_SYSVIPC)
is regrettable. But the poor thing is a dumping ground anyway :( I
suppose one could replace this and sysvsem with a

struct ipc_stuff *ipc_stuff; /* NULL if I never used IPC */

but there are probably lots of similar changes we could (and one day
may) make to the task_struct.
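
For illustration only, a rough sketch of that idea (hypothetical: neither
struct ipc_stuff nor this layout exists in the kernel today):

/*
 * Hypothetical sketch, not existing kernel code: the per-task SysV
 * IPC state moves behind a single pointer, allocated lazily on first
 * IPC use and left NULL for tasks that never touch SysV IPC.
 */
struct ipc_stuff {
	struct sysv_sem sysvsem;	/* semaphore undo state */
	struct sysv_shm sysvshm;	/* shm_clist from patch 1/3 */
};

/* ... and in task_struct the two embedded structs shrink to: */
#ifdef CONFIG_SYSVIPC
	struct ipc_stuff *ipc_stuff;	/* NULL if task never used IPC */
#endif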

I queued the patches, tagged for 3.16-rc1 - hopefully Davidlohr and
Manfred will be able to find the time to go over them closely.

2014-06-18 23:00:03

by Andrew Morton

Subject: Re: [PATCH 1/3] shm: Make exit_shm work proportional to task activity

On Tue, 17 Jun 2014 12:27:45 -0500 Jack Miller <[email protected]> wrote:

> --- a/include/linux/shm.h
> +++ b/include/linux/shm.h
> @@ -1,6 +1,7 @@
> #ifndef _LINUX_SHM_H_
> #define _LINUX_SHM_H_
>
> +#include <linux/list.h>

yup. shm.h only compiles by sheer luck - it uses file, task_struct,
user_struct, time_t, pid_t, __user, and probably other things without
including the required header files.
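
To make that concrete, a sketch of roughly what shm.h would need to be
self-contained (a guess at the missing pieces, untested):

/*
 * Sketch only (untested): includes and forward declarations shm.h
 * would need to stand alone instead of compiling by luck.
 */
#include <linux/list.h>
#include <linux/types.h>	/* time_t, pid_t */
#include <linux/compiler.h>	/* __user */

struct file;
struct task_struct;
struct user_struct;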