From: Hiroshi Shimamoto <[email protected]>
Because the binfmt is not different between threads in the same process,
it can be moved from task_struct to signal_struct.
Signed-off-by: Hiroshi Shimamoto <[email protected]>
---
fs/exec.c | 6 +++---
include/linux/sched.h | 2 +-
kernel/exit.c | 4 ++--
kernel/fork.c | 7 ++++---
4 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 4a8849e..402d3b8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1377,13 +1377,13 @@ out_ret:
int set_binfmt(struct linux_binfmt *new)
{
- struct linux_binfmt *old = current->binfmt;
+ struct linux_binfmt *old = current->signal->binfmt;
if (new) {
if (!try_module_get(new->module))
return -1;
}
- current->binfmt = new;
+ current->signal->binfmt = new;
if (old)
module_put(old->module);
return 0;
@@ -1730,7 +1730,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
audit_core_dumps(signr);
- binfmt = current->binfmt;
+ binfmt = current->signal->binfmt;
if (!binfmt || !binfmt->core_dump)
goto fail;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0085d75..2a6eb81 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -576,6 +576,7 @@ struct signal_struct {
int leader;
struct tty_struct *tty; /* NULL if no tty */
+ struct linux_binfmt *binfmt;
/*
* Cumulative resource counters for dead threads in the group,
@@ -1211,7 +1212,6 @@ struct task_struct {
struct mm_struct *mm, *active_mm;
/* task state */
- struct linux_binfmt *binfmt;
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
diff --git a/kernel/exit.c b/kernel/exit.c
index 628d41f..708348a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -971,8 +971,8 @@ NORET_TYPE void do_exit(long code)
disassociate_ctty(1);
module_put(task_thread_info(tsk)->exec_domain->module);
- if (tsk->binfmt)
- module_put(tsk->binfmt->module);
+ if (tsk->signal->binfmt)
+ module_put(tsk->signal->binfmt->module);
proc_exit_connector(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b..dfb9d0a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -847,6 +847,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->leader = 0; /* session leadership doesn't inherit */
sig->tty_old_pgrp = NULL;
sig->tty = NULL;
+ sig->binfmt = current->signal->binfmt;
sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
sig->gtime = cputime_zero;
@@ -1014,7 +1015,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- if (p->binfmt && !try_module_get(p->binfmt->module))
+ if (p->signal->binfmt && !try_module_get(p->signal->binfmt->module))
goto bad_fork_cleanup_put_domain;
p->did_exec = 0;
@@ -1301,8 +1302,8 @@ bad_fork_cleanup_cgroup:
#endif
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
- if (p->binfmt)
- module_put(p->binfmt->module);
+ if (p->signal->binfmt)
+ module_put(p->signal->binfmt->module);
bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
--
1.6.2.5
From: Hiroshi Shimamoto <[email protected]>
The binfmt is a member of signal_struct, so it can be handled per
signal_struct instead of task_struct.
Signed-off-by: Hiroshi Shimamoto <[email protected]>
---
kernel/exit.c | 2 --
kernel/fork.c | 12 ++++++------
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index 708348a..187a211 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -971,8 +971,6 @@ NORET_TYPE void do_exit(long code)
disassociate_ctty(1);
module_put(task_thread_info(tsk)->exec_domain->module);
- if (tsk->signal->binfmt)
- module_put(tsk->signal->binfmt->module);
proc_exit_connector(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index dfb9d0a..636c06f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -848,6 +848,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->tty_old_pgrp = NULL;
sig->tty = NULL;
sig->binfmt = current->signal->binfmt;
+ if (sig->binfmt && !try_module_get(sig->binfmt->module)) {
+ kmem_cache_free(signal_cachep, sig);
+ return -EAGAIN;
+ }
sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
sig->gtime = cputime_zero;
@@ -876,6 +880,8 @@ void __cleanup_signal(struct signal_struct *sig)
{
thread_group_cputime_free(sig);
tty_kref_put(sig->tty);
+ if (sig->binfmt)
+ module_put(sig->binfmt->module);
kmem_cache_free(signal_cachep, sig);
}
@@ -1015,9 +1021,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- if (p->signal->binfmt && !try_module_get(p->signal->binfmt->module))
- goto bad_fork_cleanup_put_domain;
-
p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
@@ -1302,9 +1305,6 @@ bad_fork_cleanup_cgroup:
#endif
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
- if (p->signal->binfmt)
- module_put(p->signal->binfmt->module);
-bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
--
1.6.2.5
On Fri, 10 Jul 2009 17:42:31 +0900
Hiroshi Shimamoto <[email protected]> wrote:
> From: Hiroshi Shimamoto <[email protected]>
>
> Because the binfmt is not different between threads in the same process,
> it can be moved from task_struct to signal_struct.
>
Seems like a logical cleanup to me.
We should rename signal_struct.
> > Because the binfmt is not different between threads in the same process,
> > it can be moved from task_struct to signal_struct.
>
> Seems like a logical cleanup to me.
IMHO binfmt belongs in mm_struct.
> We should rename signal_struct.
Indeed, it's in fact 'struct process'. But historically Linus has objected
to admitting that there is really a concept of process in the POSIX sense
that exists deep in the implementation (as it truly does for years now),
beyond "tasks that happen to share a lot of CLONE_* stuff". The name
signal_struct is sufficiently misleading to make it easier to retain the
fantasy he prefers. ;-)
Thanks,
Roland
On 07/22, Roland McGrath wrote:
>
> > > Because the binfmt is not different between threads in the same process,
> > > it can be moved from task_struct to signal_struct.
> >
> > Seems like a logical cleanup to me.
>
> IMHO binfmt belongs in mm_struct.
Ah, agreed.
Hiroshi, what do you think?
Oleg.
Oleg Nesterov wrote:
> On 07/22, Roland McGrath wrote:
>>>> Because the binfmt is not different between threads in the same process,
>>>> it can be moved from task_struct to signal_struct.
>>> Seems like a logical cleanup to me.
>> IMHO binfmt belongs in mm_struct.
>
> Ah, agreed.
>
> Hiroshi, what do you think?
yeah, it sounds better. I'll try to update.
Thanks,
Hiroshi
From: Hiroshi Shimamoto <[email protected]>
Because the binfmt is not different between threads in the same process,
it can be moved from task_struct to mm_struct.
Signed-off-by: Hiroshi Shimamoto <[email protected]>
---
fs/exec.c | 10 +++++++---
include/linux/mm_types.h | 2 ++
include/linux/sched.h | 1 -
kernel/exit.c | 5 +++--
kernel/fork.c | 6 +++---
5 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 4a8849e..75441aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1377,13 +1377,17 @@ out_ret:
int set_binfmt(struct linux_binfmt *new)
{
- struct linux_binfmt *old = current->binfmt;
+ struct linux_binfmt *old;
+ if (!current->mm)
+ return -1;
+
+ old = current->mm->binfmt;
if (new) {
if (!try_module_get(new->module))
return -1;
}
- current->binfmt = new;
+ current->mm->binfmt = new;
if (old)
module_put(old->module);
return 0;
@@ -1730,7 +1734,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
audit_core_dumps(signr);
- binfmt = current->binfmt;
+ binfmt = current->mm ? current->mm->binfmt : NULL;
if (!binfmt || !binfmt->core_dump)
goto fail;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7acc843..6719040 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -240,6 +240,8 @@ struct mm_struct {
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
+ struct linux_binfmt *binfmt;
+
s8 oom_adj; /* OOM kill score adjustment (bit shift) */
cpumask_t cpu_vm_mask;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3ab08e4..940b070 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,6 @@ struct task_struct {
struct mm_struct *mm, *active_mm;
/* task state */
- struct linux_binfmt *binfmt;
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc22..07524fa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -953,6 +953,9 @@ NORET_TYPE void do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
+ if (tsk->mm && tsk->mm->binfmt)
+ module_put(tsk->mm->binfmt->module);
+
exit_mm(tsk);
if (group_dead)
@@ -970,8 +973,6 @@ NORET_TYPE void do_exit(long code)
disassociate_ctty(1);
module_put(task_thread_info(tsk)->exec_domain->module);
- if (tsk->binfmt)
- module_put(tsk->binfmt->module);
proc_exit_connector(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index fe2b1aa..b81223a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1008,7 +1008,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- if (p->binfmt && !try_module_get(p->binfmt->module))
+ if (p->mm && p->mm->binfmt && !try_module_get(p->mm->binfmt->module))
goto bad_fork_cleanup_put_domain;
p->did_exec = 0;
@@ -1295,8 +1295,8 @@ bad_fork_cleanup_cgroup:
#endif
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
- if (p->binfmt)
- module_put(p->binfmt->module);
+ if (p->mm && p->mm->binfmt)
+ module_put(p->mm->binfmt->module);
bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
--
1.6.3.3
From: Hiroshi Shimamoto <[email protected]>
The binfmt is a member of mm_struct, so it can be handled per mm_struct
instead of task_struct.
Signed-off-by: Hiroshi Shimamoto <[email protected]>
---
kernel/exit.c | 3 ---
kernel/fork.c | 11 +++++------
2 files changed, 5 insertions(+), 9 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index 07524fa..77b01be 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -953,9 +953,6 @@ NORET_TYPE void do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
- if (tsk->mm && tsk->mm->binfmt)
- module_put(tsk->mm->binfmt->module);
-
exit_mm(tsk);
if (group_dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index b81223a..3f3de29 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -494,6 +494,8 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
}
put_swap_token(mm);
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
mmdrop(mm);
}
}
@@ -619,6 +621,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
+ if (mm->binfmt && !try_module_get(mm->binfmt->module))
+ goto free_pt;
+
return mm;
free_pt:
@@ -1008,9 +1013,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- if (p->mm && p->mm->binfmt && !try_module_get(p->mm->binfmt->module))
- goto bad_fork_cleanup_put_domain;
-
p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
@@ -1295,9 +1297,6 @@ bad_fork_cleanup_cgroup:
#endif
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
- if (p->mm && p->mm->binfmt)
- module_put(p->mm->binfmt->module);
-bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
--
1.6.3.3
On 07/24, Hiroshi Shimamoto wrote:
>
> int set_binfmt(struct linux_binfmt *new)
> {
> - struct linux_binfmt *old = current->binfmt;
> + struct linux_binfmt *old;
>
> + if (!current->mm)
> + return -1;
> +
> + old = current->mm->binfmt;
> if (new) {
> if (!try_module_get(new->module))
> return -1;
> }
> - current->binfmt = new;
> + current->mm->binfmt = new;
Hmm. Of-topic, but I think set_binfmt() is buggy (with or without this patch),
it should use __module_get(). I'll send the fix in a minute.
> @@ -1730,7 +1734,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
>
> audit_core_dumps(signr);
>
> - binfmt = current->binfmt;
> + binfmt = current->mm ? current->mm->binfmt : NULL;
current->mm can't be NULL here. And please note we already have
struct mm_struct *mm = current->mm, so the above should be
binfmt = mm->binfmt;
> @@ -953,6 +953,9 @@ NORET_TYPE void do_exit(long code)
> tsk->exit_code = code;
> taskstats_exit(tsk, group_dead);
>
> + if (tsk->mm && tsk->mm->binfmt)
> + module_put(tsk->mm->binfmt->module);
This is not right. We leak ->binfmt on exec.
Seems to be fixed by the next patch, but still this is not good.
I'd suggest you to merge these 2 patches into single patch, because
module_put(->binfmt) should go to mmput() from the very beginning.
Oleg.
Oleg Nesterov wrote:
> On 07/24, Hiroshi Shimamoto wrote:
>> int set_binfmt(struct linux_binfmt *new)
>> {
>> - struct linux_binfmt *old = current->binfmt;
>> + struct linux_binfmt *old;
>>
>> + if (!current->mm)
>> + return -1;
>> +
>> + old = current->mm->binfmt;
>> if (new) {
>> if (!try_module_get(new->module))
>> return -1;
>> }
>> - current->binfmt = new;
>> + current->mm->binfmt = new;
>
> Hmm. Of-topic, but I think set_binfmt() is buggy (with or without this patch),
> it should use __module_get(). I'll send the fix in a minute.
>
>> @@ -1730,7 +1734,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
>>
>> audit_core_dumps(signr);
>>
>> - binfmt = current->binfmt;
>> + binfmt = current->mm ? current->mm->binfmt : NULL;
>
> current->mm can't be NULL here. And please note we already have
> struct mm_struct *mm = current->mm, so the above should be
>
> binfmt = mm->binfmt;
>
>> @@ -953,6 +953,9 @@ NORET_TYPE void do_exit(long code)
>> tsk->exit_code = code;
>> taskstats_exit(tsk, group_dead);
>>
>> + if (tsk->mm && tsk->mm->binfmt)
>> + module_put(tsk->mm->binfmt->module);
>
> This is not right. We leak ->binfmt on exec.
>
> Seems to be fixed by the next patch, but still this is not good.
> I'd suggest you to merge these 2 patches into single patch, because
> module_put(->binfmt) should go to mmput() from the very beginning.
Hi Oleg, thank you very much for comments, here is an update patch.
This patch can be applied after your set_binfmt() fix.
========
From: Hiroshi Shimamoto <[email protected]>
Subject: [PATCH] task_struct cleanup: move binfmt field to mm_struct
Because the binfmt is not different between threads in the same process,
it can be moved from task_struct to mm_struct. And binfmt moudle is handled
per mm_struct instead of task_struct.
Signed-off-by: Hiroshi Shimamoto <[email protected]>
---
fs/exec.c | 11 +++++++----
include/linux/mm_types.h | 2 ++
include/linux/sched.h | 1 -
kernel/exit.c | 2 --
kernel/fork.c | 11 +++++------
5 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 61d5be2..41aea26 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1377,10 +1377,13 @@ out_ret:
void set_binfmt(struct linux_binfmt *new)
{
- if (current->binfmt)
- module_put(current->binfmt->module);
+ struct mm_struct *mm = current->mm;
+
+ BUG_ON(!mm);
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
- current->binfmt = new;
+ mm->binfmt = new;
if (new)
__module_get(new->module);
}
@@ -1726,7 +1729,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
audit_core_dumps(signr);
- binfmt = current->binfmt;
+ binfmt = mm->binfmt;
if (!binfmt || !binfmt->core_dump)
goto fail;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7acc843..6719040 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -240,6 +240,8 @@ struct mm_struct {
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
+ struct linux_binfmt *binfmt;
+
s8 oom_adj; /* OOM kill score adjustment (bit shift) */
cpumask_t cpu_vm_mask;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3ab08e4..940b070 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,6 @@ struct task_struct {
struct mm_struct *mm, *active_mm;
/* task state */
- struct linux_binfmt *binfmt;
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc22..77b01be 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -970,8 +970,6 @@ NORET_TYPE void do_exit(long code)
disassociate_ctty(1);
module_put(task_thread_info(tsk)->exec_domain->module);
- if (tsk->binfmt)
- module_put(tsk->binfmt->module);
proc_exit_connector(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b42695..9c21984 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -494,6 +494,8 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
}
put_swap_token(mm);
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
mmdrop(mm);
}
}
@@ -619,6 +621,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
+ if (mm->binfmt && !try_module_get(mm->binfmt->module))
+ goto free_pt;
+
return mm;
free_pt:
@@ -1013,9 +1018,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- if (p->binfmt && !try_module_get(p->binfmt->module))
- goto bad_fork_cleanup_put_domain;
-
p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
@@ -1300,9 +1302,6 @@ bad_fork_cleanup_cgroup:
#endif
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
- if (p->binfmt)
- module_put(p->binfmt->module);
-bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
--
1.6.3.3
(add Rusty)
On 07/27, Hiroshi Shimamoto wrote:
>
> void set_binfmt(struct linux_binfmt *new)
> {
> - if (current->binfmt)
> - module_put(current->binfmt->module);
> + struct mm_struct *mm = current->mm;
> +
> + BUG_ON(!mm);
> + if (mm->binfmt)
I am not sure we need this BUG_ON() above. If mm == NULL we will
have the same debug info after the crash.
> @@ -619,6 +621,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
> mm->hiwater_rss = get_mm_rss(mm);
> mm->hiwater_vm = mm->total_vm;
>
> + if (mm->binfmt && !try_module_get(mm->binfmt->module))
> + goto free_pt;
free_pt does mmput() which calls module_put(mm->binfmt->module). This
is not right when try_module_get() fails. Hmm, the same if dup_mmap()
fails, we are doing an extra module_put().
Perhaps we can use __module_get() again, current->mm->binfmt holds a
reference. But this is a user-visible change, currently fork() can't
succeed if ->module->state == MODULE_STATE_GOING.
So I think we need another small change:
free_pt:
+ mm->binfmt = NULL;
mmput(mm);
Oleg.
Oleg Nesterov wrote:
> (add Rusty)
>
> On 07/27, Hiroshi Shimamoto wrote:
>> void set_binfmt(struct linux_binfmt *new)
>> {
>> - if (current->binfmt)
>> - module_put(current->binfmt->module);
>> + struct mm_struct *mm = current->mm;
>> +
>> + BUG_ON(!mm);
>> + if (mm->binfmt)
>
> I am not sure we need this BUG_ON() above. If mm == NULL we will
> have the same debug info after the crash.
Ah, right. It will crash at accessing mm->binfmt and we'll get the
same information. I didn't think seriously.
BTW, now I noticed that it looks similar the security issue with
NULL page mapping. If the kernel has the page mapping to NULL,
it causes unstable behavior, no?
>
>> @@ -619,6 +621,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
>> mm->hiwater_rss = get_mm_rss(mm);
>> mm->hiwater_vm = mm->total_vm;
>>
>> + if (mm->binfmt && !try_module_get(mm->binfmt->module))
>> + goto free_pt;
>
> free_pt does mmput() which calls module_put(mm->binfmt->module). This
> is not right when try_module_get() fails. Hmm, the same if dup_mmap()
> fails, we are doing an extra module_put().
Good catch, should avoid over putting binfmt->module.
>
> Perhaps we can use __module_get() again, current->mm->binfmt holds a
> reference. But this is a user-visible change, currently fork() can't
> succeed if ->module->state == MODULE_STATE_GOING.
>
> So I think we need another small change:
>
> free_pt:
> + mm->binfmt = NULL;
> mmput(mm);
looks good for me.
Will send an update.
thanks,
Hiroshi
On 07/28, Hiroshi Shimamoto wrote:
>
> Oleg Nesterov wrote:
> > (add Rusty)
> >
> > On 07/27, Hiroshi Shimamoto wrote:
> >> void set_binfmt(struct linux_binfmt *new)
> >> {
> >> - if (current->binfmt)
> >> - module_put(current->binfmt->module);
> >> + struct mm_struct *mm = current->mm;
> >> +
> >> + BUG_ON(!mm);
> >> + if (mm->binfmt)
> >
> > I am not sure we need this BUG_ON() above. If mm == NULL we will
> > have the same debug info after the crash.
>
> Ah, right. It will crash at accessing mm->binfmt and we'll get the
> same information. I didn't think seriously.
> BTW, now I noticed that it looks similar the security issue with
> NULL page mapping. If the kernel has the page mapping to NULL,
> it causes unstable behavior, no?
The kernel can't have NULL mapped. The task can, but in this case
it must have a valid ->mm != NULL.
IOW, if current->mm == NULL, current uses init_mm and thus NULL
can't be mapped.
Oleg.
This is task_struct cleanup patch.
This update patch replaces the below patches;
- task_struct-cleanup-move-binfmt-field-to-signal_struct.patch
- task_struct-cleanup-make-binfmt-module-get-and-put-per-signal_struct.patch
they have been already dropped from -mm.
This patch can be applied Oleg's patch;
- exec-fix-set_binfmt-vs-sys_delete_module-race.patch
thanks,
Hiroshi
From: Hiroshi Shimamoto <[email protected]>
Because the binfmt is not different between threads in the same process,
it can be moved from task_struct to mm_struct. And binfmt moudle is handled
per mm_struct instead of task_struct.
Signed-off-by: Hiroshi Shimamoto <[email protected]>
---
fs/exec.c | 10 ++++++----
include/linux/mm_types.h | 2 ++
include/linux/sched.h | 1 -
kernel/exit.c | 2 --
kernel/fork.c | 13 +++++++------
5 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 6751a25..1d9b7a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1380,10 +1380,12 @@ out_ret:
*/
void set_binfmt(struct linux_binfmt *new)
{
- if (current->binfmt)
- module_put(current->binfmt->module);
+ struct mm_struct *mm = current->mm;
+
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
- current->binfmt = new;
+ mm->binfmt = new;
if (new)
__module_get(new->module);
}
@@ -1729,7 +1731,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
audit_core_dumps(signr);
- binfmt = current->binfmt;
+ binfmt = mm->binfmt;
if (!binfmt || !binfmt->core_dump)
goto fail;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7acc843..6719040 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -240,6 +240,8 @@ struct mm_struct {
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
+ struct linux_binfmt *binfmt;
+
s8 oom_adj; /* OOM kill score adjustment (bit shift) */
cpumask_t cpu_vm_mask;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3ab08e4..940b070 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,6 @@ struct task_struct {
struct mm_struct *mm, *active_mm;
/* task state */
- struct linux_binfmt *binfmt;
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc22..77b01be 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -970,8 +970,6 @@ NORET_TYPE void do_exit(long code)
disassociate_ctty(1);
module_put(task_thread_info(tsk)->exec_domain->module);
- if (tsk->binfmt)
- module_put(tsk->binfmt->module);
proc_exit_connector(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b42695..653da52 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -494,6 +494,8 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
}
put_swap_token(mm);
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
mmdrop(mm);
}
}
@@ -619,9 +621,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
+ if (mm->binfmt && !try_module_get(mm->binfmt->module))
+ goto free_pt;
+
return mm;
free_pt:
+ /* don't put binfmt in mmput, we haven't got module yet */
+ mm->binfmt = NULL;
mmput(mm);
fail_nomem:
@@ -1013,9 +1020,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- if (p->binfmt && !try_module_get(p->binfmt->module))
- goto bad_fork_cleanup_put_domain;
-
p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
@@ -1300,9 +1304,6 @@ bad_fork_cleanup_cgroup:
#endif
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
- if (p->binfmt)
- module_put(p->binfmt->module);
-bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
--
1.6.3.3
On 07/30, Hiroshi Shimamoto wrote:
>
> Because the binfmt is not different between threads in the same process,
> it can be moved from task_struct to mm_struct. And binfmt moudle is handled
> per mm_struct instead of task_struct.
Looks like a nice patch to me.
Oleg.
> Signed-off-by: Hiroshi Shimamoto <[email protected]>
> ---
> fs/exec.c | 10 ++++++----
> include/linux/mm_types.h | 2 ++
> include/linux/sched.h | 1 -
> kernel/exit.c | 2 --
> kernel/fork.c | 13 +++++++------
> 5 files changed, 15 insertions(+), 13 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 6751a25..1d9b7a1 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1380,10 +1380,12 @@ out_ret:
> */
> void set_binfmt(struct linux_binfmt *new)
> {
> - if (current->binfmt)
> - module_put(current->binfmt->module);
> + struct mm_struct *mm = current->mm;
> +
> + if (mm->binfmt)
> + module_put(mm->binfmt->module);
>
> - current->binfmt = new;
> + mm->binfmt = new;
> if (new)
> __module_get(new->module);
> }
> @@ -1729,7 +1731,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
>
> audit_core_dumps(signr);
>
> - binfmt = current->binfmt;
> + binfmt = mm->binfmt;
> if (!binfmt || !binfmt->core_dump)
> goto fail;
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 7acc843..6719040 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -240,6 +240,8 @@ struct mm_struct {
>
> unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
>
> + struct linux_binfmt *binfmt;
> +
> s8 oom_adj; /* OOM kill score adjustment (bit shift) */
>
> cpumask_t cpu_vm_mask;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 3ab08e4..940b070 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1220,7 +1220,6 @@ struct task_struct {
> struct mm_struct *mm, *active_mm;
>
> /* task state */
> - struct linux_binfmt *binfmt;
> int exit_state;
> int exit_code, exit_signal;
> int pdeath_signal; /* The signal sent when the parent dies */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 869dc22..77b01be 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -970,8 +970,6 @@ NORET_TYPE void do_exit(long code)
> disassociate_ctty(1);
>
> module_put(task_thread_info(tsk)->exec_domain->module);
> - if (tsk->binfmt)
> - module_put(tsk->binfmt->module);
>
> proc_exit_connector(tsk);
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9b42695..653da52 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -494,6 +494,8 @@ void mmput(struct mm_struct *mm)
> spin_unlock(&mmlist_lock);
> }
> put_swap_token(mm);
> + if (mm->binfmt)
> + module_put(mm->binfmt->module);
> mmdrop(mm);
> }
> }
> @@ -619,9 +621,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
> mm->hiwater_rss = get_mm_rss(mm);
> mm->hiwater_vm = mm->total_vm;
>
> + if (mm->binfmt && !try_module_get(mm->binfmt->module))
> + goto free_pt;
> +
> return mm;
>
> free_pt:
> + /* don't put binfmt in mmput, we haven't got module yet */
> + mm->binfmt = NULL;
> mmput(mm);
>
> fail_nomem:
> @@ -1013,9 +1020,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> if (!try_module_get(task_thread_info(p)->exec_domain->module))
> goto bad_fork_cleanup_count;
>
> - if (p->binfmt && !try_module_get(p->binfmt->module))
> - goto bad_fork_cleanup_put_domain;
> -
> p->did_exec = 0;
> delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
> copy_flags(clone_flags, p);
> @@ -1300,9 +1304,6 @@ bad_fork_cleanup_cgroup:
> #endif
> cgroup_exit(p, cgroup_callbacks_done);
> delayacct_tsk_free(p);
> - if (p->binfmt)
> - module_put(p->binfmt->module);
> -bad_fork_cleanup_put_domain:
> module_put(task_thread_info(p)->exec_domain->module);
> bad_fork_cleanup_count:
> atomic_dec(&p->cred->user->processes);
> --
> 1.6.3.3
>
> On 07/30, Hiroshi Shimamoto wrote:
> >
> > Because the binfmt is not different between threads in the same process,
> > it can be moved from task_struct to mm_struct. And binfmt moudle is handled
> > per mm_struct instead of task_struct.
>
> Looks like a nice patch to me.
I agree! It's a good clean-up to move this field.
Thanks,
Roland