2006-12-13 07:38:39

by Hidehiro Kawai

[permalink] [raw]
Subject: [PATCH] binfmt_elf: core dump masking support

Hi,

This patch provides a feature which enables you to specify the memory
segment types you don't want to dump into a core file. You can specify
them per process via /proc/<pid>/coremask file. This file represents
the bitmask of memory segment types which are not written out when the
<pid> process is dumped. Currently, bit 0 is only available. This
means anonymous shared memory, which includes IPC shared memory and
some of mmap(2)'ed memory.

This patch can be applied against 2.6.19-rc6-mm2, and tested on i386,
x86_64, and ia64 platforms.

Here is the background. Some software programs share huge memory among
hundreds of processes. If a failure occurs on one of these processes,
they can be signaled by a monitoring process to generate core files
and restart the service. However, it can develop into a system-wide
failure such as system slow down for a long time and disk space
shortage because the total size of the core files is very huge!

To avoid the above situation we can limit the core file size by
setrlimit(2) or ulimit(1). But this method can lose important data
such as stack because core dumping is done from lower address to
higher address and terminated halfway.
So I suggest keeping shared memory segments from being dumped for
particular processes. Because the shared memory attached to processes
is common in them, we don't need to dump the shared memory every time.

If you don't want to dump all shared memory segments attached to
pid 1234, set the bit 0 of the process's coremask to 1:

$ echo 1 > /proc/1234/coremask

Additionally, you can check its hexadecimal value by reading the file:

$ cat /proc/1234/coremask
00000001

When a new process is created, the process inherits the coremask
setting from its parent. It is useful to set the coremask before
the program runs. For example:

$ echo 1 > /proc/self/coremask
$ ./some_program

Thanks,

Hidehiro Kawai
Hitachi, Ltd., Systems Development Laboratory

Signed-off-by: Hidehiro Kawai <[email protected]>
---
Documentation/filesystems/proc.txt | 36 ++++++++++
fs/binfmt_elf.c | 10 +-
fs/proc/base.c | 96 +++++++++++++++++++++++++++
include/linux/binfmts.h | 3
include/linux/sched.h | 1
kernel/fork.c | 1
6 files changed, 144 insertions(+), 3 deletions(-)

Index: linux-2.6.19-rc6-mm2/fs/binfmt_elf.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/binfmt_elf.c
+++ linux-2.6.19-rc6-mm2/fs/binfmt_elf.c
@@ -1189,9 +1189,13 @@
if (vma->vm_flags & (VM_IO | VM_RESERVED))
return 0;

- /* Dump shared memory only if mapped from an anonymous file. */
- if (vma->vm_flags & VM_SHARED)
- return vma->vm_file->f_path.dentry->d_inode->i_nlink == 0;
+ /* Dump shared memory which mapped from an anonymous file and
+ * not masked. */
+ if (vma->vm_flags & VM_SHARED) {
+ if (vma->vm_file->f_dentry->d_inode->i_nlink)
+ return 0;
+ return (vma->vm_mm->core_mask & CORE_MASK_ANONSHM) == 0;
+ }

/* If it hasn't been written to, don't write it out */
if (!vma->anon_vma)
Index: linux-2.6.19-rc6-mm2/fs/proc/base.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/proc/base.c
+++ linux-2.6.19-rc6-mm2/fs/proc/base.c
@@ -73,6 +73,7 @@
#include <linux/poll.h>
#include <linux/nsproxy.h>
#include <linux/oom.h>
+#include <linux/elf.h>
#include "internal.h"

/* NOTE:
@@ -912,6 +913,95 @@
};
#endif

+#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+static ssize_t proc_coremask_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+ struct mm_struct *mm;
+ char buffer[PROC_NUMBUF];
+ size_t len;
+ unsigned int coremask;
+ loff_t __ppos = *ppos;
+ int ret;
+
+ ret = -ESRCH;
+ if (!task)
+ goto out_no_task;
+
+ ret = 0;
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_no_mm;
+ coremask = mm->core_mask;
+
+ len = snprintf(buffer, sizeof(buffer), "%08x\n", coremask);
+ if (__ppos >= len)
+ goto out;
+ if (count > len - __ppos)
+ count = len - __ppos;
+
+ ret = -EFAULT;
+ if (copy_to_user(buf, buffer + __ppos, count))
+ goto out;
+
+ ret = count;
+ *ppos = __ppos + count;
+
+ out:
+ mmput(mm);
+ out_no_mm:
+ put_task_struct(task);
+ out_no_task:
+ return ret;
+}
+
+static ssize_t proc_coremask_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ struct mm_struct *mm;
+ char buffer[PROC_NUMBUF], *end;
+ unsigned int coremask;
+ int ret;
+
+ ret = -EFAULT;
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count))
+ goto out_no_task;
+
+ ret = -EINVAL;
+ coremask = (unsigned int)simple_strtoul(buffer, &end, 0);
+ if (*end == '\n')
+ end++;
+ if (end - buffer == 0)
+ goto out_no_task;
+
+ ret = -ESRCH;
+ task = get_proc_task(file->f_dentry->d_inode);
+ if (!task)
+ goto out_no_task;
+
+ ret = end - buffer;
+ mm = get_task_mm(task);
+ if (mm) {
+ mm->core_mask = coremask;
+ mmput(mm);
+ }
+
+ put_task_struct(task);
+ out_no_task:
+ return ret;
+}
+
+static struct file_operations proc_coremask_operations = {
+ .read = proc_coremask_read,
+ .write = proc_coremask_write,
+};
+#endif
+
static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
@@ -1879,6 +1969,9 @@
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
#endif
+#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+ REG("coremask", S_IRUGO|S_IWUSR, coremask),
+#endif
};

static int proc_tgid_base_readdir(struct file * filp,
@@ -2157,6 +2250,9 @@
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
#endif
+#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+ REG("coremask", S_IRUGO|S_IWUSR, coremask),
+#endif
};

static int proc_tid_base_readdir(struct file * filp,
Index: linux-2.6.19-rc6-mm2/include/linux/sched.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/linux/sched.h
+++ linux-2.6.19-rc6-mm2/include/linux/sched.h
@@ -369,6 +369,7 @@
/* coredumping support */
int core_waiters;
struct completion *core_startup_done, core_done;
+ unsigned int core_mask;

/* aio bits */
rwlock_t ioctx_list_lock;
Index: linux-2.6.19-rc6-mm2/kernel/fork.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/kernel/fork.c
+++ linux-2.6.19-rc6-mm2/kernel/fork.c
@@ -332,6 +332,7 @@
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_waiters = 0;
+ mm->core_mask = (current->mm) ? current->mm->core_mask : 0;
mm->nr_ptes = 0;
set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0);
Index: linux-2.6.19-rc6-mm2/include/linux/binfmts.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/linux/binfmts.h
+++ linux-2.6.19-rc6-mm2/include/linux/binfmts.h
@@ -79,6 +79,9 @@
#define EXSTACK_DISABLE_X 1 /* Disable executable stacks */
#define EXSTACK_ENABLE_X 2 /* Enable executable stacks */

+/* core dump masking */
+#define CORE_MASK_ANONSHM 0x1 /* Don't dump shared memory. */
+
extern int setup_arg_pages(struct linux_binprm * bprm,
unsigned long stack_top,
int executable_stack);
Index: linux-2.6.19-rc6-mm2/Documentation/filesystems/proc.txt
===================================================================
--- linux-2.6.19-rc6-mm2.orig/Documentation/filesystems/proc.txt
+++ linux-2.6.19-rc6-mm2/Documentation/filesystems/proc.txt
@@ -41,6 +41,7 @@
2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
2.12 /proc/<pid>/oom_adj - Adjust the oom-killer score
2.13 /proc/<pid>/oom_score - Display current oom-killer score
+ 2.14 /proc/<pid>/coremask - Core dump setting for segment types

------------------------------------------------------------------------------
Preface
@@ -1981,6 +1982,41 @@
any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which
process should be killed in an out-of-memory situation.

+2.14 /proc/<pid>/coremask - Core dump setting for segment types
+---------------------------------------------------------------------
+The /proc/<pid>/coremask file controls the core dump routine. Its content is
+bitmask of segment types you don't want to dump. When the <pid> process is
+dumped, the core dump routine decides whether a given memory segment should be
+dumped into a core file or not, based on the type of the memory segment and
+bitmask.
+
+Currently, only valid bit is bit 0, which means anonymous shared memory
+segment. There are three types of anonymous shared memory:
+
+ - IPC shared memory
+ - the memory created by mmap(2) with MAP_ANONYMOUS and MAP_SHARED flags
+ - the memory where an already unlinked file is mapped with MAP_SHARED flag
+
+Because current core dump routine doesn't distinguish these segments, you can
+only choose either dumping all anonymous shared memory segments or not.
+
+If you don't want to dump all shared memory segments attached to pid 1234, set
+the bit 0 of the process's coremask to 1:
+
+ $ echo 1 > /proc/1234/coremask
+
+Additionally, you can check its hexadecimal value by reading the file:
+
+ $ cat /proc/1234/coremask
+ 00000001
+
+When a new process is created, the process inherits the coremask setting from
+its parent. It is useful to set the coremask before the program runs. For
+example:
+
+ $ echo 1 > /proc/self/coremask
+ $ ./some_program
+
------------------------------------------------------------------------------
Summary
------------------------------------------------------------------------------





2006-12-13 21:24:19

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

On Wed, 13 Dec 2006 16:14:08 +0900
"Kawai, Hidehiro" <[email protected]> wrote:

> This patch provides a feature which enables you to specify the memory
> segment types you don't want to dump into a core file. You can specify
> them per process via /proc/<pid>/coremask file. This file represents
> the bitmask of memory segment types which are not written out when the
> <pid> process is dumped. Currently, bit 0 is only available. This
> means anonymous shared memory, which includes IPC shared memory and
> some of mmap(2)'ed memory.
>
> This patch can be applied against 2.6.19-rc6-mm2, and tested on i386,
> x86_64, and ia64 platforms.
>
> Here is the background. Some software programs share huge memory among
> hundreds of processes. If a failure occurs on one of these processes,
> they can be signaled by a monitoring process to generate core files
> and restart the service. However, it can develop into a system-wide
> failure such as system slow down for a long time and disk space
> shortage because the total size of the core files is very huge!
>
> To avoid the above situation we can limit the core file size by
> setrlimit(2) or ulimit(1). But this method can lose important data
> such as stack because core dumping is done from lower address to
> higher address and terminated halfway.
> So I suggest keeping shared memory segments from being dumped for
> particular processes. Because the shared memory attached to processes
> is common in them, we don't need to dump the shared memory every time.
>
> If you don't want to dump all shared memory segments attached to
> pid 1234, set the bit 0 of the process's coremask to 1:
>
> $ echo 1 > /proc/1234/coremask
>
> Additionally, you can check its hexadecimal value by reading the file:
>
> $ cat /proc/1234/coremask
> 00000001
>
> When a new process is created, the process inherits the coremask
> setting from its parent. It is useful to set the coremask before
> the program runs. For example:
>
> $ echo 1 > /proc/self/coremask
> $ ./some_program

The requirement makes sense, I guess.

Regarding the implementation: if we add

unsigned char coredump_omit_anon_memory:1;

into the mm_struct right next to `dumpable' then we avoid increasing the
size of the mm_struct, and the code gets neater.


Modification of this field is racy, and we don't have a suitable lock in
mm_struct to fix that. I don't think we care about that a lot, but it'd be
best to find some way of fixing it.


Really we should convert binfmt_aout.c and any other coredumpers too.


Does this feature have any security implications? For example, there might
be system administration programs which force a coredump on a "bad"
process, and leave the core somewhere for the administrator to look at.
With this change, we permit hiding of that corefile's anon memory from the
administrator. OK, lame example, but perhaps there are better ones.

2006-12-18 08:10:20

by Hidehiro Kawai

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hello Andrew,

Thank you for your reply and advice.
I'll send the revised patchset after I fix what you pointed out.

Andrew Morton wrote:

> Regarding the implementation: if we add
>
> unsigned char coredump_omit_anon_memory:1;
>
> into the mm_struct right next to `dumpable' then we avoid increasing the
> size of the mm_struct, and the code gets neater.
>
> Modification of this field is racy, and we don't have a suitable lock in
> mm_struct to fix that. I don't think we care about that a lot, but it'd be
> best to find some way of fixing it.

OK, I'll put a bit field right next to `dumpable' member and add a global
lock to protect them from the race.
I have the perception that only writing to these bit-fields needs to
acquire a lock. Simultaneous writes to both bit-fields can cause either one
to be overwritten with its old value. But simultaneous read and write
from/to separate bit-fields is safe because write to one bit-field
doesn't affect read from the other.

The dumpable can be modified at following timing:

- before starting core dumping in do_coredump()
- when initialize mm_struct in flush_old_exec()
- when *uid or *gid is changed by the coresponding system call
- when the dumpable is modified directly by prctl(2)

I expect that these don't occur so much frequently, so I consider that
the performance impact by using a global lock is small.


> Really we should convert binfmt_aout.c and any other coredumpers too.

Currently, binfmt_aout.c and binfmt_elf_fdpic.c have their own core dump
routines as well as binfmt_elf.c. However, as far as I know,
binfmt_aout.c never dumps shared memory.
So I will convert only binfmt_elf_fdpic.c to support this feature.


> Does this feature have any security implications? For example, there might
> be system administration programs which force a coredump on a "bad"
> process, and leave the core somewhere for the administrator to look at.
> With this change, we permit hiding of that corefile's anon memory from the
> administrator. OK, lame example, but perhaps there are better ones.

I think we can avoid it by providing a sysctl parameter which
disables/enables this feature.

Another idea is that we provide a sysctl parameter to prohibit non-root
user from writing to /proc/<pid>/coremask. If the administrator want to
force a full coredump on a bad process, he/she only has to clear the
coremask after setting the sysctl parameter.

For now, I will implement the first idea, because its design and
implementation are simple and it is easy to use.

Best regards,

--
Hidehiro Kawai
Hitachi, Ltd., Systems Development Laboratory


2006-12-20 15:41:11

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hi!

> > When a new process is created, the process inherits the coremask
> > setting from its parent. It is useful to set the coremask before
> > the program runs. For example:
> >
> > $ echo 1 > /proc/self/coremask
> > $ ./some_program
>
> The requirement makes sense, I guess.
>
> Regarding the implementation: if we add
>
> unsigned char coredump_omit_anon_memory:1;
>
> into the mm_struct right next to `dumpable' then we avoid increasing the
> size of the mm_struct, and the code gets neater.
>
>
> Modification of this field is racy, and we don't have a suitable lock in
> mm_struct to fix that. I don't think we care about that a lot, but it'd be
> best to find some way of fixing it.
>
>
> Really we should convert binfmt_aout.c and any other coredumpers too.
>
>
> Does this feature have any security implications? For example, there might
> be system administration programs which force a coredump on a "bad"
> process, and leave the core somewhere for the administrator to look at.
> With this change, we permit hiding of that corefile's anon memory from the
> administrator. OK, lame example, but perhaps there are better ones.

User can already ulimit -c 0 on himself, perhaps we want to use same
interface here? ulimit -cmask=(bitmask)?
Pavel
--
Thanks for all the (sleeping) penguins.

2007-01-09 01:09:23

by Hidehiro Kawai

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hi, Pavel

Pavel Machek wrote:

> > When a new process is created, the process inherits the coremask
> > setting from its parent. It is useful to set the coremask before
> > the program runs. For example:
> >
> > $ echo 1 > /proc/self/coremask
> > $ ./some_program
>
> User can already ulimit -c 0 on himself, perhaps we want to use same
> interface here? ulimit -cmask=(bitmask)?

Are you saying that 1) it is good to change ulimit (shell programs)
so that shell programs will read/write /proc/self/coremask when
the -cmask option is given to ulimit?
Or, 2) it is good to change ulimit and get/setrlimit so that shell
programs will invoke get/setrlimit with new parameter?

If the changes are acceptable to bash or other shell community, I think
the first approach is nice.
But the second approach is problematic because the bitmask doesn't
conform to the usage of setrlimit. You know, setrlimit controls amount
of resources the system can use by the soft limit and hard limit.
These limitations don't suit for the bitmask.


By the way, the /proc/<pid>/coremask method has an advantage over the
ulimit method. It allows system administrator to change the bitmask of
any process anytime.
That's why I decided to use the /proc/<pid>/ interface.

Best regards,
--
Hidehiro Kawai
Hitachi, Ltd., Systems Development Laboratory


2007-01-09 14:39:35

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hi!

> > > When a new process is created, the process inherits the coremask
> > > setting from its parent. It is useful to set the coremask before
> > > the program runs. For example:
> > >
> > > $ echo 1 > /proc/self/coremask
> > > $ ./some_program
> >
> > User can already ulimit -c 0 on himself, perhaps we want to use same
> > interface here? ulimit -cmask=(bitmask)?
>
> Are you saying that 1) it is good to change ulimit (shell programs)
> so that shell programs will read/write /proc/self/coremask when
> the -cmask option is given to ulimit?
> Or, 2) it is good to change ulimit and get/setrlimit so that shell
> programs will invoke get/setrlimit with new parameter?

I'm trying to say 2).

> If the changes are acceptable to bash or other shell community, I think
> the first approach is nice.
> But the second approach is problematic because the bitmask doesn't
> conform to the usage of setrlimit. You know, setrlimit controls amount
> of resources the system can use by the soft limit and hard limit.
> These limitations don't suit for the bitmask.

Well, you can have it as set of 0-1 "limits"...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2007-01-12 08:50:47

by Hidehiro Kawai

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hi,

>>>> $ echo 1 > /proc/self/coremask
>>>> $ ./some_program
>>>
>>>User can already ulimit -c 0 on himself, perhaps we want to use same
>>>interface here? ulimit -cmask=(bitmask)?
>>
>>Are you saying that 1) it is good to change ulimit (shell programs)
>>so that shell programs will read/write /proc/self/coremask when
>>the -cmask option is given to ulimit?
>>Or, 2) it is good to change ulimit and get/setrlimit so that shell
>>programs will invoke get/setrlimit with new parameter?

>>But the second approach is problematic because the bitmask doesn't
>>conform to the usage of setrlimit. You know, setrlimit controls amount
>>of resources the system can use by the soft limit and hard limit.
>>These limitations don't suit for the bitmask.
>
> Well, you can have it as set of 0-1 "limits"...

I have come up with a similar idea of regarding the ulimit
value as a bitmask, and I think it may work.
But it will be confusable for users to add the new concept of
0-1 limitation into the traditional resouce limitation feature.
Additionaly, this approach needs a modification of each shell
command.
What do you think about these demerits?

The /proc/<pid>/ approach doesn't have these demerits, and it
has an advantage that users can change the bitmask of any process
at anytime.

So I'm going to post the 2nd version patchset with /proc/<pid>/
interface. If the 2nd version has crucial problems, I'll try
the ulimit approach.

Best regards,
--
Hidehiro Kawai
Hitachi, Ltd., Systems Development Laboratory



2007-01-14 20:02:14

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hi!

> > Well, you can have it as set of 0-1 "limits"...
>
> I have come up with a similar idea of regarding the ulimit
> value as a bitmask, and I think it may work.
> But it will be confusable for users to add the new concept of
> 0-1 limitation into the traditional resouce limitation feature.
> Additionaly, this approach needs a modification of each shell
> command.
> What do you think about these demerits?

> The /proc/<pid>/ approach doesn't have these demerits, and it
> has an advantage that users can change the bitmask of any process
> at anytime.

Well... not sure if it is advantage. Semantics of ulimit inheritance
are well given, for example. How is this going to be inherited?

Anyway, yes, I see 0/1 "limits" have bad sides, too, so...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2007-01-19 00:42:21

by Hidehiro Kawai

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hi Pavel,

>>>Well, you can have it as set of 0-1 "limits"...
>>
>>I have come up with a similar idea of regarding the ulimit
>>value as a bitmask, and I think it may work.
>>But it will be confusable for users to add the new concept of
>>0-1 limitation into the traditional resouce limitation feature.
>>Additionaly, this approach needs a modification of each shell
>>command.
>>What do you think about these demerits?
>
>>The /proc/<pid>/ approach doesn't have these demerits, and it
>>has an advantage that users can change the bitmask of any process
>>at anytime.
>
> Well... not sure if it is advantage.

For example, consider the following case:
a process forks many children and system administrator wants to
allow only one of these processes to dump shared memory.

This is accomplished as follows:

$ echo 1 > /proc/self/coremask
$ ./some_program
(fork children)
$ echo 0 > /proc/<a child's pid>/coremask

With the /proc/<pid>/ interface, we don't need to modify the
user program. In contrast, with the ulimit or setrlimit interface,
the administrator can't do it without modifying the user program
to call setrlimit. This will not be preferred.


> Semantics of ulimit inheritance
> are well given, for example. How is this going to be inherited?

The coremask setting is inherited in mm_init(), which is called
once as an extention of do_fork(), do_execve() or compat_do_execve().
Inheritance is done by copying the bitmask from current->mm->coremask.
However, if current->mm is NULL, the inherited bitmask is set to 0
(default value).

Best regards,
--
Hidehiro Kawai
Hitachi, Ltd., Systems Development Laboratory


2007-01-19 00:46:10

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

On Fri 2007-01-19 09:40:39, Kawai, Hidehiro wrote:
> Hi Pavel,
>
> >>>Well, you can have it as set of 0-1 "limits"...
> >>
> >>I have come up with a similar idea of regarding the ulimit
> >>value as a bitmask, and I think it may work.
> >>But it will be confusable for users to add the new concept of
> >>0-1 limitation into the traditional resouce limitation feature.
> >>Additionaly, this approach needs a modification of each shell
> >>command.
> >>What do you think about these demerits?
> >
> >>The /proc/<pid>/ approach doesn't have these demerits, and it
> >>has an advantage that users can change the bitmask of any process
> >>at anytime.
> >
> > Well... not sure if it is advantage.
>
> For example, consider the following case:
> a process forks many children and system administrator wants to
> allow only one of these processes to dump shared memory.
>
> This is accomplished as follows:
>
> $ echo 1 > /proc/self/coremask
> $ ./some_program
> (fork children)
> $ echo 0 > /proc/<a child's pid>/coremask
>
> With the /proc/<pid>/ interface, we don't need to modify the
> user program. In contrast, with the ulimit or setrlimit interface,
> the administrator can't do it without modifying the user program
> to call setrlimit. This will not be preferred.

Yep, otoh process coremask setting can change while it is running,
that is not expected. Hmm, it can also change while it is dumping
core, are you sure it is not racy?

(run echo 1 > coremask, echo 0 > coremask in a loop while dumping
core. Do you have enough locking to make it work as expected?)


Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2007-01-22 02:31:22

by Hidehiro Kawai

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hi Pavel,

>>>>The /proc/<pid>/ approach doesn't have these demerits, and it
>>>>has an advantage that users can change the bitmask of any process
>>>>at anytime.
>>>
>>>Well... not sure if it is advantage.
>>
>>For example, consider the following case:
>> a process forks many children and system administrator wants to
>> allow only one of these processes to dump shared memory.
>>
>>This is accomplished as follows:
>>
>> $ echo 1 > /proc/self/coremask
>> $ ./some_program
>> (fork children)
>> $ echo 0 > /proc/<a child's pid>/coremask
>>
>>With the /proc/<pid>/ interface, we don't need to modify the
>>user program. In contrast, with the ulimit or setrlimit interface,
>>the administrator can't do it without modifying the user program
>>to call setrlimit. This will not be preferred.
>
> Yep, otoh process coremask setting can change while it is running,
> that is not expected. Hmm, it can also change while it is dumping
> core, are you sure it is not racy?

Good point, thanks. I never thought of that.
We can change the coremask setting while dumping the process's
memory, and it is problematic.

maydump() function which decides a given VMA may be dumped or not
is invoked twice per VMAs. One is at the time of writing a program
header for a VMA, another is at the time of writing its contents.
If the coremask setting differs between the two, the program
header will point wrong place in the core file as its contents.


> (run echo 1 > coremask, echo 0 > coremask in a loop while dumping
> core. Do you have enough locking to make it work as expected?)

Currently, any lock isn't acquired. But I think the kernel only
have to preserve the coremask setting in a local variable at the
begining of core dumping. I'm going to do this in the next version.

Best regards,
--
Hidehiro Kawai
Hitachi, Ltd., Systems Development Laboratory


2007-01-22 10:06:31

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

On Mon 2007-01-22 11:29:40, Kawai, Hidehiro wrote:
> Hi Pavel,
>
> >>>>The /proc/<pid>/ approach doesn't have these demerits, and it
> >>>>has an advantage that users can change the bitmask of any process
> >>>>at anytime.
> >>>
> >>>Well... not sure if it is advantage.
> >>
> >>For example, consider the following case:
> >> a process forks many children and system administrator wants to
> >> allow only one of these processes to dump shared memory.
> >>
> >>This is accomplished as follows:
> >>
> >> $ echo 1 > /proc/self/coremask
> >> $ ./some_program
> >> (fork children)
> >> $ echo 0 > /proc/<a child's pid>/coremask
> >>
> >>With the /proc/<pid>/ interface, we don't need to modify the
> >>user program. In contrast, with the ulimit or setrlimit interface,
> >>the administrator can't do it without modifying the user program
> >>to call setrlimit. This will not be preferred.
> >
> > Yep, otoh process coremask setting can change while it is running,
> > that is not expected. Hmm, it can also change while it is dumping
> > core, are you sure it is not racy?
>
> Good point, thanks. I never thought of that.
> We can change the coremask setting while dumping the process's
> memory, and it is problematic.
>
> maydump() function which decides a given VMA may be dumped or not
> is invoked twice per VMAs. One is at the time of writing a program
> header for a VMA, another is at the time of writing its contents.
> If the coremask setting differs between the two, the program
> header will point wrong place in the core file as its contents.
>
>
> > (run echo 1 > coremask, echo 0 > coremask in a loop while dumping
> > core. Do you have enough locking to make it work as expected?)
>
> Currently, any lock isn't acquired. But I think the kernel only
> have to preserve the coremask setting in a local variable at the
> begining of core dumping. I'm going to do this in the next version.

No, I do not think that is enough. At minimum, you'd need atomic_t
variable. But I'd recomend against it. Playing with locking is tricky.

Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2007-01-23 04:43:46

by Hidehiro Kawai

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hi,

>>>(run echo 1 > coremask, echo 0 > coremask in a loop while dumping
>>>core. Do you have enough locking to make it work as expected?)
>>
>>Currently, any lock isn't acquired. But I think the kernel only
>>have to preserve the coremask setting in a local variable at the
>>begining of core dumping. I'm going to do this in the next version.
>
> No, I do not think that is enough. At minimum, you'd need atomic_t
> variable. But I'd recomend against it. Playing with locking is tricky.

Why do you think it is not enough? I think that any locking is not
needed.
My design principle is that the core dump routine is controlled by
the bitmask which was assigned to the dumping process at the time of
starting core dump. So if a coremask setting is changed while
core dumping, the change doesn't affect current dumping process.
This can be implemented as follows:

core_dump_start:
unsigned int mask = current->mm->coremask;
for each VMA {
write a header which depends on the result of maydump(vma, mask)
}
for each VMA {
write a body which depends on the result of maydump(vma, mask)
}

NOTE:
maydump() is the central function, which decides whether a given
VMA should be dumped or not.

What do you think about this?


Best regards,
--
Hidehiro Kawai
Hitachi, Ltd., Systems Development Laboratory


2007-01-23 09:18:27

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

On Tue 2007-01-23 13:42:00, Kawai, Hidehiro wrote:
> Hi,
>
> >>>(run echo 1 > coremask, echo 0 > coremask in a loop while dumping
> >>>core. Do you have enough locking to make it work as expected?)
> >>
> >>Currently, any lock isn't acquired. But I think the kernel only
> >>have to preserve the coremask setting in a local variable at the
> >>begining of core dumping. I'm going to do this in the next version.
> >
> > No, I do not think that is enough. At minimum, you'd need atomic_t
> > variable. But I'd recomend against it. Playing with locking is tricky.
>
> Why do you think it is not enough? I think that any locking is not
> needed.

You are wrong.

> unsigned int mask = current->mm->coremask;

Not a valid C; coremask can change from another cpu.

Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2007-01-23 12:19:22

by Hidehiro Kawai

[permalink] [raw]
Subject: Re: [PATCH] binfmt_elf: core dump masking support

Hi,

>>>>>(run echo 1 > coremask, echo 0 > coremask in a loop while dumping
>>>>>core. Do you have enough locking to make it work as expected?)
>>>>
>>>>Currently, any lock isn't acquired. But I think the kernel only
>>>>have to preserve the coremask setting in a local variable at the
>>>>begining of core dumping. I'm going to do this in the next version.
>>>
>>>No, I do not think that is enough. At minimum, you'd need atomic_t
>>>variable. But I'd recomend against it. Playing with locking is tricky.
>>
>>Why do you think it is not enough? I think that any locking is not
>>needed.
>
> You are wrong.
>
>> unsigned int mask = current->mm->coremask;
>
> Not a valid C; coremask can change from another cpu.

Sure. Although I thought it was no problem because coremask uses only
1 bit, it will be problematic when coremask grows beyond byte alignment.


>>>No, I do not think that is enough. At minimum, you'd need atomic_t
>>>variable. But I'd recomend against it. Playing with locking is tricky.

But Andrew proposed:

> Regarding the implementation: if we add
>
> unsigned char coredump_omit_anon_memory:1;
>
> into the mm_struct right next to `dumpable' then we avoid increasing the
> size of the mm_struct, and the code gets neater.

So I'm going to use a bit field to store the bitmask. Because of this,
I can't use an atomic_t variable.
Instead, I'll use a spin lock to read/write atomically.

Best regards,
--
Hidehiro Kawai
Hitachi, Ltd., Systems Development Laboratory