2009-11-13 22:53:34

by Masami Hiramatsu

[permalink] [raw]
Subject: [PATCH -tip 1/3] Pass mm->flags to binfmt core_dump for bitflag consistency

Pass mm->flags to binfmt core_dump for bitflag consistency.
Since mm->flags bit flags is not protected by locks, it will be
changed while dumping core. This patch copies mm->flags to a
mm_flags local variable at the beginning of do_coredump() and
use it while dumping. mm_flags also includes dump_filter which
filters elf sections from core file in elf_core_dump().
So, this patch also passes mm_flags to each binfmt->core_dump().

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Hidehiro Kawai <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Roland McGrath <[email protected]>
---

fs/binfmt_aout.c | 4 ++--
fs/binfmt_elf.c | 12 ++----------
fs/binfmt_elf_fdpic.c | 13 +++----------
fs/binfmt_flat.c | 4 ++--
fs/binfmt_som.c | 2 +-
fs/exec.c | 14 ++++++++++----
include/linux/binfmts.h | 2 +-
7 files changed, 21 insertions(+), 30 deletions(-)

diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index b639dcf..ef1d4aa 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -32,7 +32,7 @@

static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
static int load_aout_library(struct file*);
-static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
+static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit, unsigned long mm_flags);

static struct linux_binfmt aout_format = {
.module = THIS_MODULE,
@@ -89,7 +89,7 @@ if (file->f_op->llseek) { \
* dumping of the process results in another error..
*/

-static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
+static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit, unsigned long mm_flags)
{
mm_segment_t fs;
int has_dumped = 0;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb5..dd7256c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -45,7 +45,7 @@ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
* don't even try.
*/
#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
-static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
+static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit, unsigned long mm_flags);
#else
#define elf_core_dump NULL
#endif
@@ -1906,7 +1906,7 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
* and then they are actually written out. If we run out of core limit
* we just truncate.
*/
-static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
+static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit, unsigned long mm_flags)
{
int has_dumped = 0;
mm_segment_t fs;
@@ -1915,7 +1915,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
struct vm_area_struct *vma, *gate_vma;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff, foffset;
- unsigned long mm_flags;
struct elf_note_info info;

/*
@@ -1980,13 +1979,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un

dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);

- /*
- * We must use the same mm->flags while dumping core to avoid
- * inconsistency between the program headers and bodies, otherwise an
- * unusable core file can be generated.
- */
- mm_flags = current->mm->flags;
-
/* Write program headers for segments dump */
for (vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma)) {
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 38502c6..8af3633 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -76,7 +76,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *,
struct file *, struct mm_struct *);

#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
-static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit);
+static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit, unsigned long mm_flags);
#endif

static struct linux_binfmt elf_fdpic_format = {
@@ -1582,7 +1582,8 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
* we just truncate.
*/
static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
- struct file *file, unsigned long limit)
+ struct file *file, unsigned long limit,
+ unsigned long mm_flags)
{
#define NUM_NOTES 6
int has_dumped = 0;
@@ -1605,7 +1606,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
#endif
int thread_status_size = 0;
elf_addr_t *auxv;
- unsigned long mm_flags;

/*
* We no longer stop all VM operations.
@@ -1736,13 +1736,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);

- /*
- * We must use the same mm->flags while dumping core to avoid
- * inconsistency between the program headers and bodies, otherwise an
- * unusable core file can be generated.
- */
- mm_flags = current->mm->flags;
-
/* write program headers for segments dump */
for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
struct elf_phdr phdr;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index a279665..6477678 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -87,7 +87,7 @@ static int load_flat_shared_library(int id, struct lib_info *p);
#endif

static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
-static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
+static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit, unsigned long mm_flags);

static struct linux_binfmt flat_format = {
.module = THIS_MODULE,
@@ -102,7 +102,7 @@ static struct linux_binfmt flat_format = {
* Currently only a stub-function.
*/

-static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
+static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit, unsigned long mm_flags)
{
printk("Process %s:%d received signr %d and should have core dumped\n",
current->comm, current->pid, (int) signr);
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index eff74b9..983c01d 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -43,7 +43,7 @@ static int load_som_library(struct file *);
* don't even try.
*/
#if 0
-static int som_core_dump(long signr, struct pt_regs *regs, unsigned long limit);
+static int som_core_dump(long signr, struct pt_regs *regs, unsigned long limit, unsigned long mm_flags);
#else
#define som_core_dump NULL
#endif
diff --git a/fs/exec.c b/fs/exec.c
index ba112bd..dc418e3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1722,7 +1722,7 @@ int get_dumpable(struct mm_struct *mm)
{
int ret;

- ret = mm->flags & 0x3;
+ ret = mm->flags & MMF_DUMPABLE_MASK;
return (ret >= 2) ? 2 : ret;
}

@@ -1754,6 +1754,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
struct core_state core_state;
char corename[CORENAME_MAX_SIZE + 1];
struct mm_struct *mm = current->mm;
+ /*
+ * We must use the same mm->flags while dumping core to avoid
+ * inconsistency of bit flags, since this flag is not protected
+ * by any locks.
+ */
+ unsigned long mm_flags = mm->flags;
struct linux_binfmt * binfmt;
struct inode * inode;
struct file * file;
@@ -1784,7 +1790,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
/*
* If another thread got here first, or we are not dumpable, bail out.
*/
- if (mm->core_state || !get_dumpable(mm)) {
+ if (mm->core_state || !(mm_flags & MMF_DUMPABLE_MASK)) {
up_write(&mm->mmap_sem);
put_cred(cred);
goto fail;
@@ -1795,7 +1801,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
* process nor do we know its entire history. We only know it
* was tainted so we dump it as root in mode 2.
*/
- if (get_dumpable(mm) == 2) { /* Setuid core dump mode */
+ if (mm_flags & (1 << MMF_DUMP_SECURELY)) { /* Setuid core dump mode */
flag = O_EXCL; /* Stop rewrite attacks */
cred->fsuid = 0; /* Dump root private */
}
@@ -1901,7 +1907,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0)
goto close_fail;

- retval = binfmt->core_dump(signr, regs, file, core_limit);
+ retval = binfmt->core_dump(signr, regs, file, core_limit, mm_flags);

if (retval)
current->signal->group_exit_code |= 0x80;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index aece486..6e65c31 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -77,7 +77,7 @@ struct linux_binfmt {
struct module *module;
int (*load_binary)(struct linux_binprm *, struct pt_regs * regs);
int (*load_shlib)(struct file *);
- int (*core_dump)(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
+ int (*core_dump)(long signr, struct pt_regs *regs, struct file *file, unsigned long limit, unsigned long mm_flags);
unsigned long min_coredump; /* minimal dump size */
int hasvdso;
};


--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]


2009-11-13 22:52:55

by Masami Hiramatsu

[permalink] [raw]
Subject: [PATCH -tip 2/3] Add coredump tracepoint

Add coredump tracepoint for tracing coredump event. This event shows coredump
caused signal, limit size, dumpable flag, dump-filter, and core name.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Roland McGrath <[email protected]>
---

fs/exec.c | 9 +++++++++
include/trace/events/sched.h | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index dc418e3..6b77d10 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,6 +56,7 @@
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/pipe_fs_i.h>
+#include <trace/events/sched.h>

#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -1907,6 +1908,14 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0)
goto close_fail;

+ /*
+ * Trace core dump event.
+ * Casting signr to int is safe because it comes from
+ * si->signo int field.
+ */
+ trace_sched_process_coredump((int)signr, core_limit, mm_flags,
+ corename);
+
retval = binfmt->core_dump(signr, regs, file, core_limit, mm_flags);

if (retval)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index b50b985..ff9d5fb 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -320,6 +320,40 @@ TRACE_EVENT(sched_process_fork,
);

/*
+ * Tracepoint for core_dump:
+ */
+TRACE_EVENT(sched_process_coredump,
+
+ TP_PROTO(int sig, unsigned long core_limit, unsigned long mm_flags,
+ const char *core_name),
+
+ TP_ARGS(sig, core_limit, mm_flags, core_name),
+
+ TP_STRUCT__entry(
+ __field( int, sig )
+ __field( unsigned long, limit )
+ __field( unsigned long, flags )
+ __string( name, core_name )
+ ),
+
+
+ TP_fast_assign(
+ __entry->sig = sig;
+ __entry->limit = core_limit;
+ __entry->flags = mm_flags;
+ __assign_str(name, core_name);
+ ),
+
+ TP_printk("sig: %d limit: %lu dumpable: %lx dump_filter: %lx "
+ "corename: %s",
+ __entry->sig, __entry->limit,
+ __entry->flags & MMF_DUMPABLE_MASK,
+ (__entry->flags & MMF_DUMP_FILTER_MASK) >>
+ MMF_DUMP_FILTER_SHIFT,
+ __get_str(name))
+);
+
+/*
* Tracepoint for sending a signal:
*/
TRACE_EVENT(sched_signal_send,


--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2009-11-13 22:53:14

by Masami Hiramatsu

[permalink] [raw]
Subject: [PATCH -tip 3/3] Add get_signal tracepoint

Add a tracepoint where a process gets a signal. This tracepoint
shows signal-number, sa-handler and sa-flag.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Roland McGrath <[email protected]>
---

include/trace/events/sched.h | 26 ++++++++++++++++++++++++++
kernel/signal.c | 3 +++
2 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ff9d5fb..de8ca7e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -354,6 +354,32 @@ TRACE_EVENT(sched_process_coredump,
);

/*
+ * Tracepoint for getting a signal:
+ */
+TRACE_EVENT(sched_process_get_signal,
+
+ TP_PROTO(int sig, struct k_sigaction *ka),
+
+ TP_ARGS(sig, ka),
+
+ TP_STRUCT__entry(
+ __field( int, sig )
+ __field( unsigned long, sa_handler )
+ __field( unsigned long, sa_flags )
+ ),
+
+
+ TP_fast_assign(
+ __entry->sig = sig;
+ __entry->sa_handler = (unsigned long)ka->sa.sa_handler;
+ __entry->sa_flags = ka->sa.sa_flags;
+ ),
+
+ TP_printk("sig: %d sa_handler: %lx sa_flags: %lx",
+ __entry->sig, __entry->sa_handler, __entry->sa_flags)
+);
+
+/*
* Tracepoint for sending a signal:
*/
TRACE_EVENT(sched_signal_send,
diff --git a/kernel/signal.c b/kernel/signal.c
index fe08008..44614fb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1859,6 +1859,9 @@ relock:
ka = &sighand->action[signr-1];
}

+ /* Trace the actual queued signals including SIG_IGN.*/
+ trace_sched_process_get_signal(signr, ka);
+
if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
continue;
if (ka->sa.sa_handler != SIG_DFL) {


--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2009-11-13 23:09:31

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] Pass mm->flags to binfmt core_dump for bitflag consistency

On Fri, 13 Nov 2009 17:52:27 -0500
Masami Hiramatsu <[email protected]> wrote:

> Pass mm->flags to binfmt core_dump for bitflag consistency.
> Since mm->flags bit flags is not protected by locks, it will be
> changed while dumping core. This patch copies mm->flags to a
> mm_flags local variable at the beginning of do_coredump() and
> use it while dumping. mm_flags also includes dump_filter which
> filters elf sections from core file in elf_core_dump().
> So, this patch also passes mm_flags to each binfmt->core_dump().

I can kind-of guess the answer, but it would be much more reliable if
we were to hear this from yourself:

Why did you write this patch? What problem is being observed?

2009-11-13 23:16:25

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] Pass mm->flags to binfmt core_dump for bitflag consistency

To clarify, proc_coredump_filter_write() is the one place that can change
mm->flags during a core dump. I don't think any other is possible while
all the other tasks sharing that mm are prevented from running. Is there
any other way that mm->flags might change during do_coredump()?

I don't see anything wrong with this change. But (assuming that is the
only case), there is another approach we could take instead. That is,
have proc_coredump_filter_write() do:

down_read(&mm->mmap_sem);
ret = mm->core_state ? -EBUSY : 0;
up_read(&mm->mmap_sem);


Thanks,
Roland

2009-11-13 23:23:23

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] Pass mm->flags to binfmt core_dump for bitflag consistency


* Roland McGrath <[email protected]> wrote:

> To clarify, proc_coredump_filter_write() is the one place that can
> change mm->flags during a core dump. I don't think any other is
> possible while all the other tasks sharing that mm are prevented from
> running. Is there any other way that mm->flags might change during
> do_coredump()?
>
> I don't see anything wrong with this change. But (assuming that is
> the only case), there is another approach we could take instead. That
> is, have proc_coredump_filter_write() do:
>
> down_read(&mm->mmap_sem);
> ret = mm->core_state ? -EBUSY : 0;
> up_read(&mm->mmap_sem);

this would fix the (probably harmless) race too, but isnt the whole
approach taken by the patch more robust, i.e. to take a snapshot of
mm->flags value and pass it along coredump processing?

That makes it evidently immutable in the future too. It also makes the
code a bit easier to read IMO - instead of get_dumpable() we use the
mm_flags.

The only other observation i have is that for this parameter set:

long signr, struct pt_regs *regs, struct file *file,
unsigned long limit, unsigned long mm_flags)

we should probably introduce a coredump parameter struct, and pass that
along:

struct coredump_params {
long signr;
struct pt_regs *regs;
struct file *file;
unsigned long limit;
unsigned long mm_flags;
}

Had this been done in the past this present patch would be a lot simpler
as well: we could have added mm_flags to coredump_params, instead of
having to propagate it through ~6 function interface surfaces.

Thanks,

Ingo

2009-11-13 23:25:10

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] Pass mm->flags to binfmt core_dump for bitflag consistency


* Andrew Morton <[email protected]> wrote:

> On Fri, 13 Nov 2009 17:52:27 -0500
> Masami Hiramatsu <[email protected]> wrote:
>
> > Pass mm->flags to binfmt core_dump for bitflag consistency.
> > Since mm->flags bit flags is not protected by locks, it will be
> > changed while dumping core. This patch copies mm->flags to a
> > mm_flags local variable at the beginning of do_coredump() and
> > use it while dumping. mm_flags also includes dump_filter which
> > filters elf sections from core file in elf_core_dump().
> > So, this patch also passes mm_flags to each binfmt->core_dump().
>
> I can kind-of guess the answer, but it would be much more reliable if
> we were to hear this from yourself:
>
> Why did you write this patch? What problem is being observed?

i'm not Masami so i'm only guessing that while writing the tracepoint a
race got noticed but that otherwise there's no big practical effect,
'just' a cleanliness problem fixed.

Ingo

2009-11-13 23:30:30

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] Pass mm->flags to binfmt core_dump for bitflag consistency

> this would fix the (probably harmless) race too, but isnt the whole
> approach taken by the patch more robust, i.e. to take a snapshot of
> mm->flags value and pass it along coredump processing?

Yes, I think it is a fine thing to do. I'm just being explicit about what
I think the (only) concrete issues are motivating a change.

> we should probably introduce a coredump parameter struct, and pass that
> along:

Also sounds fine to me.


Thanks,
Roland

2009-11-13 23:39:26

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH -tip 2/3] Add coredump tracepoint

I can't really see what this has to do with "sched" to warrant that name.
But, whatever.

Note that you put the tracepoint where it won't get called in the various
cases where no dump is really being made because of RLIMIT_CORE or file
failures. I suspect you would like to get those reported. (Perhaps
especially so, since there won't be any file around to notice later.)

Also, it seems nice to give the tracepoint the chance to look at the actual
open file in case a fancy one wants to do that.

e.g.

diff --git a/fs/exec.c b/fs/exec.c
index ba112bd..0000000 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1822,9 +1822,7 @@ void do_coredump(long signr, int exit_co
ispipe = format_corename(corename, signr);
unlock_kernel();

- if ((!ispipe) && (core_limit < binfmt->min_coredump))
- goto fail_unlock;
-
+ file = NULL;
if (ispipe) {
if (core_limit == 0) {
/*
@@ -1845,7 +1843,7 @@ void do_coredump(long signr, int exit_co
"Process %d(%s) has RLIMIT_CORE set to 0\n",
task_tgid_vnr(current), current->comm);
printk(KERN_WARNING "Aborting core\n");
- goto fail_unlock;
+ goto nopipe;
}

dump_count = atomic_inc_return(&core_dump_count);
@@ -1853,14 +1851,14 @@ void do_coredump(long signr, int exit_co
printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
task_tgid_vnr(current), current->comm);
printk(KERN_WARNING "Skipping core dump\n");
- goto fail_dropcount;
+ goto nopipe;
}

helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
if (!helper_argv) {
printk(KERN_WARNING "%s failed to allocate memory\n",
__func__);
- goto fail_dropcount;
+ goto nopipe;
}

core_limit = RLIM_INFINITY;
@@ -1870,13 +1868,19 @@ void do_coredump(long signr, int exit_co
&file)) {
printk(KERN_INFO "Core dump to %s pipe failed\n",
corename);
- goto fail_dropcount;
+ goto nopipe;
}
- } else
+ } else if (core_limit >= binfmt->min_coredump) {
file = filp_open(corename,
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
0600);
- if (IS_ERR(file))
+ }
+
+nopipe:
+ trace_process_coredump((int) signr, core_limit, mm_flags,
+ corename, file);
+
+ if (!file || IS_ERR(file))
goto fail_dropcount;
inode = file->f_path.dentry->d_inode;
if (inode->i_nlink > 1)

2009-11-13 23:45:32

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] Pass mm->flags to binfmt core_dump for bitflag consistency

Ingo Molnar wrote:
>
> * Andrew Morton<[email protected]> wrote:
>
>> On Fri, 13 Nov 2009 17:52:27 -0500
>> Masami Hiramatsu<[email protected]> wrote:
>>
>>> Pass mm->flags to binfmt core_dump for bitflag consistency.
>>> Since mm->flags bit flags is not protected by locks, it will be
>>> changed while dumping core. This patch copies mm->flags to a
>>> mm_flags local variable at the beginning of do_coredump() and
>>> use it while dumping. mm_flags also includes dump_filter which
>>> filters elf sections from core file in elf_core_dump().
>>> So, this patch also passes mm_flags to each binfmt->core_dump().
>>
>> I can kind-of guess the answer, but it would be much more reliable if
>> we were to hear this from yourself:
>>
>> Why did you write this patch? What problem is being observed?
>
> i'm not Masami so i'm only guessing that while writing the tracepoint a
> race got noticed but that otherwise there's no big practical effect,
> 'just' a cleanliness problem fixed.

Right, I'd like to add a tracepoint of coredump event with
its information. And also, this patch may fix a small
dumpable inconsistency issue below code

---
1787 if (mm->core_state || !get_dumpable(mm)) { <- (1)
1788 up_write(&mm->mmap_sem);
1789 put_cred(cred);
1790 goto fail;
1791 }
1792
[...]
1798 if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ <-(2)
1799 flag = O_EXCL; /* Stop rewrite attacks */
1800 cred->fsuid = 0; /* Dump root private */
1801 }

Since dumpable bits are not protected by lock, there is a
chance to change these bits between (1) and (2).

This patch copies mm->flags to a local variable and check the variable
for consistency.

Thank you,

--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2009-11-13 23:53:52

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint

This is orthogonal to the core-dump tracepoint, I don't see why you
call them a unified patch series.

The proper name for this event is "signal delivery". But since the
proper name for "send_signal" is "signal generation", I suppose "get"
is analogously improper to the existing "send" tracepoint. ;-)

Especially if you call this "get" rather than "deliver", there is
another place that should invoke this tracepoint (or perhaps a third
one). sys_rt_sigtimedwait "gets" a signal without delivering it. In
POSIX terminology this is called "accepting" the signal: the three
things that can happen in the life of a signal are "generate",
"deliver", and "accept". If you are trying to match up what happened to
a signal generated by kill() or whatnot, then you want to notice both
delivery and acceptance as the complementary event.

(And again I have no clue why this signal stuff should be called
"sched" at all.)


Thanks,
Roland

2009-11-14 00:00:47

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH -tip 2/3] Add coredump tracepoint

Roland McGrath wrote:
> I can't really see what this has to do with "sched" to warrant that name.
> But, whatever.
>
> Note that you put the tracepoint where it won't get called in the various
> cases where no dump is really being made because of RLIMIT_CORE or file
> failures. I suspect you would like to get those reported. (Perhaps
> especially so, since there won't be any file around to notice later.)

Exactly, yes.

> Also, it seems nice to give the tracepoint the chance to look at the actual
> open file in case a fancy one wants to do that.

Ah, that's very nice to me! thanks!

>
> e.g.
>
> diff --git a/fs/exec.c b/fs/exec.c
> index ba112bd..0000000 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1822,9 +1822,7 @@ void do_coredump(long signr, int exit_co
> ispipe = format_corename(corename, signr);
> unlock_kernel();
>
> - if ((!ispipe)&& (core_limit< binfmt->min_coredump))
> - goto fail_unlock;
> -
> + file = NULL;
> if (ispipe) {
> if (core_limit == 0) {
> /*
> @@ -1845,7 +1843,7 @@ void do_coredump(long signr, int exit_co
> "Process %d(%s) has RLIMIT_CORE set to 0\n",
> task_tgid_vnr(current), current->comm);
> printk(KERN_WARNING "Aborting core\n");
> - goto fail_unlock;
> + goto nopipe;
> }
>
> dump_count = atomic_inc_return(&core_dump_count);
> @@ -1853,14 +1851,14 @@ void do_coredump(long signr, int exit_co
> printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
> task_tgid_vnr(current), current->comm);
> printk(KERN_WARNING "Skipping core dump\n");
> - goto fail_dropcount;
> + goto nopipe;
> }
>
> helper_argv = argv_split(GFP_KERNEL, corename+1,&helper_argc);
> if (!helper_argv) {
> printk(KERN_WARNING "%s failed to allocate memory\n",
> __func__);
> - goto fail_dropcount;
> + goto nopipe;
> }
>
> core_limit = RLIM_INFINITY;
> @@ -1870,13 +1868,19 @@ void do_coredump(long signr, int exit_co
> &file)) {
> printk(KERN_INFO "Core dump to %s pipe failed\n",
> corename);
> - goto fail_dropcount;
> + goto nopipe;
> }
> - } else
> + } else if (core_limit>= binfmt->min_coredump) {
> file = filp_open(corename,
> O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
> 0600);
> - if (IS_ERR(file))
> + }
> +
> +nopipe:
> + trace_process_coredump((int) signr, core_limit, mm_flags,
> + corename, file);
> +
> + if (!file || IS_ERR(file))
> goto fail_dropcount;
> inode = file->f_path.dentry->d_inode;
> if (inode->i_nlink> 1)

--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2009-11-14 00:02:48

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip 2/3] Add coredump tracepoint


* Roland McGrath <[email protected]> wrote:

> I can't really see what this has to do with "sched" to warrant that
> name. But, whatever.

That's a misnomer indeed. I'd suggest to introduce a separate 'signal'
subsystem category.

> Note that you put the tracepoint where it won't get called in the
> various cases where no dump is really being made because of
> RLIMIT_CORE or file failures. I suspect you would like to get those
> reported. (Perhaps especially so, since there won't be any file
> around to notice later.)

Yeah, good point.

> Also, it seems nice to give the tracepoint the chance to look at the
> actual open file in case a fancy one wants to do that.

Oh, so SystemTap is the motivator of these tracepoints? Do you know
about exact usecases where these tracepoints would be utilized? Would be
interesting (and relevant) to list them.

Ingo

2009-11-14 00:07:04

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH -tip 2/3] Add coredump tracepoint

> Oh, so SystemTap is the motivator of these tracepoints?

Nope. My suggestion about arguments to pass was just generic based on the
innate context of this tracepoint.

> Do you know about exact usecases where these tracepoints would be
> utilized? Would be interesting (and relevant) to list them.

Masami's colleagues do have something in particular in mind,
but I'm not sure whether they intend to use systemtap as part
of the implementation of that or not.


Thanks,
Roland

2009-11-14 00:12:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint


* Roland McGrath <[email protected]> wrote:

> This is orthogonal to the core-dump tracepoint, I don't see why you
> call them a unified patch series.
>
> The proper name for this event is "signal delivery". But since the
> proper name for "send_signal" is "signal generation", I suppose "get"
> is analogously improper to the existing "send" tracepoint. ;-)

I'd suggest to add include/trace/events/signal.h and put these
tracepoints there.

> Especially if you call this "get" rather than "deliver", there is
> another place that should invoke this tracepoint (or perhaps a third
> one). sys_rt_sigtimedwait "gets" a signal without delivering it. In
> POSIX terminology this is called "accepting" the signal: the three
> things that can happen in the life of a signal are "generate",
> "deliver", and "accept". If you are trying to match up what happened
> to a signal generated by kill() or whatnot, then you want to notice
> both delivery and acceptance as the complementary event.
>
> (And again I have no clue why this signal stuff should be called
> "sched" at all.)

it shouldnt be called 'sched' - it should go into 'events/signal.h'.

But we also need fuller coverage than this. Coredumps and signal
delivery events are just a small part of all things signals, we also
want:

- signal generation events (send_sig*() variants)

- signal IPI/wakeup events

- signal loss events (queue overflow)

- [ optional: signal blocking/unblocking events ]

- [ optional: specific signal handler installation/deinstallation ]

That's what we generally require of new events: they should form a
coherent whole, a logical set of events that 'make sense' and explain
the workings of a subsystem on a given level of detail.

How finegrained or coarse the level of details is is an open question,
but if a given level of detail has been picked, we want completeness on
that level.

So for example in the list above, the '[ optional ]' events are
finegrained ones that could be left out of the initial version.

We've done this consistently for all subsystems that added tracepoints:
scheduling, locking, timers, workqueues, block IO, SLAB, IRQs, etc., and
we want a similar approach for newly covered subsystems (such as
signals) as well.

Thanks,

Ingo

2009-11-14 00:14:10

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip 2/3] Add coredump tracepoint


* Roland McGrath <[email protected]> wrote:

> > Do you know about exact usecases where these tracepoints would be
> > utilized? Would be interesting (and relevant) to list them.
>
> Masami's colleagues do have something in particular in mind, but I'm
> not sure whether they intend to use systemtap as part of the
> implementation of that or not.

Some sort of flexible core dump server perhaps, driven by user-space
logic? (vastly more flexible than coredump-filter or coredump ulimits
are today)

Ingo

2009-11-14 00:27:04

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH -tip 2/3] Add coredump tracepoint

Roland McGrath wrote:
>> Oh, so SystemTap is the motivator of these tracepoints?
>
> Nope. My suggestion about arguments to pass was just generic based on the
> innate context of this tracepoint.
>
>> Do you know about exact usecases where these tracepoints would be
>> utilized? Would be interesting (and relevant) to list them.
>
> Masami's colleagues do have something in particular in mind,
> but I'm not sure whether they intend to use systemtap as part
> of the implementation of that or not.

Yeah, we'd like to use this tracepoint to analyze coredump
miss-configuration. Sometimes, users miss-configure dump-filter
and rlimit, that will cause system-slowdown when several processes,
which share a large amount of memory among them (e.g. database),
start core-dump with shared memory. And then, it can cause a
system-switching on HA cluster system.

With this tracepoint, we can analyze why the coredumps were slow.

Thank you,

--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2009-11-14 00:30:38

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint

Roland McGrath wrote:
> This is orthogonal to the core-dump tracepoint, I don't see why you
> call them a unified patch series.

Agreed, I'll split them.

> The proper name for this event is "signal delivery". But since the
> proper name for "send_signal" is "signal generation", I suppose "get"
> is analogously improper to the existing "send" tracepoint. ;-)

Ah, I see. 'deliver_signal' is good to me :-).

Thank you,

> Especially if you call this "get" rather than "deliver", there is
> another place that should invoke this tracepoint (or perhaps a third
> one). sys_rt_sigtimedwait "gets" a signal without delivering it. In
> POSIX terminology this is called "accepting" the signal: the three
> things that can happen in the life of a signal are "generate",
> "deliver", and "accept". If you are trying to match up what happened to
> a signal generated by kill() or whatnot, then you want to notice both
> delivery and acceptance as the complementary event.
>
> (And again I have no clue why this signal stuff should be called
> "sched" at all.)
>
>
> Thanks,
> Roland
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2009-11-14 01:49:27

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH -tip 2/3] Add coredump tracepoint

> Some sort of flexible core dump server perhaps, driven by user-space
> logic? (vastly more flexible than coredump-filter or coredump ulimits
> are today)

Tracepoints are not a very good fit for that. I think people working on
things like that use the corename=|command support.


Thanks,
Roland

2009-11-16 21:51:56

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint

Ingo Molnar wrote:
>> Especially if you call this "get" rather than "deliver", there is
>> another place that should invoke this tracepoint (or perhaps a third
>> one). sys_rt_sigtimedwait "gets" a signal without delivering it. In
>> POSIX terminology this is called "accepting" the signal: the three
>> things that can happen in the life of a signal are "generate",
>> "deliver", and "accept". If you are trying to match up what happened
>> to a signal generated by kill() or whatnot, then you want to notice
>> both delivery and acceptance as the complementary event.
>>
>> (And again I have no clue why this signal stuff should be called
>> "sched" at all.)
>
> it shouldnt be called 'sched' - it should go into 'events/signal.h'.
>
> But we also need fuller coverage than this. Coredumps and signal
> delivery events are just a small part of all things signals, we also
> want:

That's a good idea. I'll put coredump and signal related events
into events/signal.h.

>
> - signal generation events (send_sig*() variants)

Those events finally calls __send_signal(), so I think
trace_signal_send() can trace those events.

> - signal IPI/wakeup events

All signals might be used for IPI, isn't it? :-)
Or, did you mean SIGSTOP/SIGCONT pare?

> - signal loss events (queue overflow)

Perhaps, this event is only for rt-signals, since
legacy signals just overwritten if it was sent.

Thank you,

--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2009-11-16 22:12:31

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint

(I CC'd Oleg, who doesn't care much about tracepoints, but always stays up
to speed about all things related to signals.)

> > - signal IPI/wakeup events
>
> All signals might be used for IPI, isn't it? :-)
> Or, did you mean SIGSTOP/SIGCONT pare?

I suspect he means something approximating signal_wake_up() calls. If a
signal is blocked, ignored, already pending, etc., then the "sending" event
does not lead to any kind of wakeup or interrupt.

i.e. perhaps something like:

@@ -529,8 +529,11 @@ void signal_wake_up(struct task_struct *t, int resume)
mask = TASK_INTERRUPTIBLE;
if (resume)
mask |= TASK_WAKEKILL;
- if (!wake_up_state(t, mask))
+ trace_signal_wakeup(t, resume);
+ if (!wake_up_state(t, mask)) {
kick_process(t);
+ trace_signal_ipi(t, resume);
+ }

OTOH, kick_process() is only called from here.
Perhaps tracepoints in the sched layer can cover these.

Anyway, Ingo can be more precise about what he has in mind.

> > - signal loss events (queue overflow)
>
> Perhaps, this event is only for rt-signals, since
> legacy signals just overwritten if it was sent.

Not exactly. Nothing is ever "overwritten". If a non-RT signal is already
pending, then you just leave the existing queue elements alone--i.e. the
first one isn't overwritten, rather the second one is dropped. But this is
not really the point.

The "queue overflow" happens in two ways. For RT signals it really is a
"signal loss" event--but that's also reported to the sender as -EAGAIN. So
a signal-generation tracepoint that reports the return value would already
cover that in a way.

For non-RT signals, a new signal is never lost. But __sigqueue_alloc() can
still fail. In this case, you get no queue element and thus no siginfo_t
stored, so you can lose some information about the signal. You don't lose
the signal itself, but will later dequeue it with an all-zeros siginfo_t.
While calling this a "signal loss" is inaccurate, it is indeed a silent
failure of sorts (unlike the RT signal case, which the userland caller
knows about from the return value).


Thanks,
Roland

2009-11-16 22:42:01

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint

Roland McGrath wrote:
>>> - signal loss events (queue overflow)
>>
>> Perhaps, this event is only for rt-signals, since
>> legacy signals just overwritten if it was sent.
>
> Not exactly. Nothing is ever "overwritten". If a non-RT signal is already
> pending, then you just leave the existing queue elements alone--i.e. the
> first one isn't overwritten, rather the second one is dropped. But this is
> not really the point.
>
> The "queue overflow" happens in two ways. For RT signals it really is a
> "signal loss" event--but that's also reported to the sender as -EAGAIN. So
> a signal-generation tracepoint that reports the return value would already
> cover that in a way.
>
> For non-RT signals, a new signal is never lost. But __sigqueue_alloc() can
> still fail. In this case, you get no queue element and thus no siginfo_t
> stored, so you can lose some information about the signal. You don't lose
> the signal itself, but will later dequeue it with an all-zeros siginfo_t.
> While calling this a "signal loss" is inaccurate, it is indeed a silent
> failure of sorts (unlike the RT signal case, which the userland caller
> knows about from the return value).

Hmm, actually, trace_signal_send() doesn't record the return value.
So, what about trace_signal_overflow() for RT-signals and
trace_signal_loss_info() for non-RT?

e.g.
@@ -918,12 +918,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
break;
}
} else if (!is_si_special(info)) {
- if (sig >= SIGRTMIN && info->si_code != SI_USER)
+ if (sig >= SIGRTMIN && info->si_code != SI_USER) {
/*
* Queue overflow, abort. We may abort if the signal was rt
* and sent by user using something other than kill().
*/
+ trace_signal_overflow(sig, t);
return -EAGAIN;
+ }
+ trace_signal_loss_info(sig, info);
}

Thank you,

--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2009-11-16 23:00:59

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint

> Hmm, actually, trace_signal_send() doesn't record the return value.

Is that because it's called before the action really happens?
Is it important that it be called beforehand? If it's called
afterwards, it's easy to pass the return value.

> So, what about trace_signal_overflow() for RT-signals and
> trace_signal_loss_info() for non-RT?

Really you can distinguish those just by looking at sig and info, so
perhaps a single tracepoint is enough. I guess it really depends on what
filtering you would want and how inconvenient it is to have to apply that
filtering. Having these two distinct tracepoints lets you trivially trace
only "silent information loss" without seeing the events where userland
gets full information (if applications are paying attention).

If you want to have a full suite of tracepoints where each one covers one
unambiguous corner of the semantics, then there are more than these just
for sending. e.g. see below.

Thanks,
Roland


--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -838,8 +841,10 @@ static int __send_signal(int sig, struct

assert_spin_locked(&t->sighand->siglock);

- if (!prepare_signal(sig, t, from_ancestor_ns))
+ if (!prepare_signal(sig, t, from_ancestor_ns)) {
+ trace_signal_generate_ignored(sig, group, info);
return 0;
+ }

pending = group ? &t->signal->shared_pending : &t->pending;
/*
@@ -847,8 +852,10 @@ static int __send_signal(int sig, struct
* exactly one non-rt signal, so that we can get more
* detailed information about the cause of the signal.
*/
- if (legacy_queue(pending, sig))
+ if (legacy_queue(pending, sig)) {
+ trace_signal_generate_dropped_duplicate(sig, group, info);
return 0;
+ }
/*
* fast-pathed signals for kernel-internal things like SIGSTOP
* or SIGKILL.
@@ -896,12 +903,22 @@ static int __send_signal(int sig, struct
break;
}
} else if (!is_si_special(info)) {
- if (sig >= SIGRTMIN && info->si_code != SI_USER)
- /*
- * Queue overflow, abort. We may abort if the signal was rt
- * and sent by user using something other than kill().
- */
+ if (sig >= SIGRTMIN && info->si_code != SI_USER) {
+ /*
+ * Queue overflow, abort. We may abort if the
+ * signal was rt and sent by user using something
+ * other than kill().
+ */
+ trace_signal_generate_overflow_fail(sig, group, info);
return -EAGAIN;
+ } else {
+ /*
+ * This is a silent loss of information. We still
+ * send the signal, but the *info bits are lost.
+ */
+ trace_signal_generate_overflow_lose_info(sig, group,
+ info);
+ }
}

out_set:

2009-11-16 23:45:56

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint

Roland McGrath wrote:
>> Hmm, actually, trace_signal_send() doesn't record the return value.
>
> Is that because it's called before the action really happens?
> Is it important that it be called beforehand? If it's called
> afterwards, it's easy to pass the return value.

I'm not so sure why signal sending events was put beforehand.
However, I assume that original intent might be recording
the *timing* of all signal generation (including SIGSTOP/CONT).

>> So, what about trace_signal_overflow() for RT-signals and
>> trace_signal_loss_info() for non-RT?
>
> Really you can distinguish those just by looking at sig and info, so
> perhaps a single tracepoint is enough.

Ah, right :-)

> I guess it really depends on what
> filtering you would want and how inconvenient it is to have to apply that
> filtering. Having these two distinct tracepoints lets you trivially trace
> only "silent information loss" without seeing the events where userland
> gets full information (if applications are paying attention).
>
> If you want to have a full suite of tracepoints where each one covers one
> unambiguous corner of the semantics, then there are more than these just
> for sending. e.g. see below.

As Ingo said, I think this kind of finegrained events are optional.
I don't think we really need these events soon. IMHO, just adding
signal-loss event is enough at the first step.

But anyway, thank you so much for suggesting those tracepoints!

Thank you,


--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]

2009-11-17 06:01:51

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint


* Masami Hiramatsu <[email protected]> wrote:

> > - signal IPI/wakeup events
>
> All signals might be used for IPI, isn't it? :-)

I mean, to analyze the various dynamic delivery details of how a signal
send affects a target task:

1) which task/PID was selected to be woken

2) if the task got woken (from sleep) due to the signal sending

3) if it was already woken, whether it needed an IPI via kick_process()

What proportion of signals were wakeups and what proportion hit an
already running task is a relevant question to ask when analyzing the
performance characteristics of signals.

Ingo

2009-11-17 15:24:38

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH -tip 3/3] Add get_signal tracepoint

Ingo Molnar wrote:
>
> * Masami Hiramatsu <[email protected]> wrote:
>
>>> - signal IPI/wakeup events
>>
>> All signals might be used for IPI, isn't it? :-)
>
> I mean, to analyze the various dynamic delivery details of how a signal
> send affects a target task:
>
> 1) which task/PID was selected to be woken
>
> 2) if the task got woken (from sleep) due to the signal sending
>
> 3) if it was already woken, whether it needed an IPI via kick_process()

Hmm, as far as I can see, some of these events can be caught by
sched layer too.
- trace_signal_send() will record target task.
- wake_up_state() just calls try_to_wake_up(), and trace_sched_wakeup()
will be called from it.
- kick_process() might better have its own tracepoint.

And also, I think signal_wake_up() might not be a good tracepoint for
signal event, since there is no signr. Moreover some signal_wake_up()
caller(e.g. recalc_sigpending*) silently wake up processes :-(.

> What proportion of signals were wakeups and what proportion hit an
> already running task is a relevant question to ask when analyzing the
> performance characteristics of signals.

Hmm, does it really require wakeup events in signal layer?
I think that we can analyze the characteristics by combination
of signal events and sched events.

Thank you,
--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: [email protected]