2020-03-05 21:19:11

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex


The cred_guard_mutex is problematic. The cred_guard_mutex is held
over the userspace accesses as the arguments from userspace are read.
The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
threads are killed. The cred_guard_mutex is held over
"put_user(0, tsk->clear_child_tid)" in exit_mm().

Any of those can result in deadlock, as the cred_guard_mutex is held
over a possible indefinite userspace waits for userspace.

Add exec_update_mutex that is only held over exec updating process
with the new contents of exec, so that code that needs not to be
confused by exec changing the mm and the cred in ways that can not
happen during ordinary execution of a process can take.

The plan is to switch the users of cred_guard_mutex to
exed_udpate_mutex one by one. This lets us move forward while still
being careful and not introducing any regressions.

Link: https://lore.kernel.org/lkml/[email protected]/
Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
Link: https://lore.kernel.org/lkml/[email protected]/
Link: https://lore.kernel.org/lkml/[email protected]/
Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/exec.c | 4 ++++
include/linux/sched/signal.h | 9 ++++++++-
kernel/fork.c | 1 +
3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/exec.c b/fs/exec.c
index c243f9660d46..ad7b518f906d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1182,6 +1182,7 @@ static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
release_task(leader);
}

+ mutex_lock(&current->signal->exec_update_mutex);
bprm->unrecoverable = true;
sig->group_exit_task = NULL;
sig->notify_count = 0;
@@ -1425,6 +1426,8 @@ static void free_bprm(struct linux_binprm *bprm)
{
free_arg_pages(bprm);
if (bprm->cred) {
+ if (bprm->unrecoverable)
+ mutex_unlock(&current->signal->exec_update_mutex);
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
@@ -1474,6 +1477,7 @@ void install_exec_creds(struct linux_binprm *bprm)
* credentials; any time after this it may be unlocked.
*/
security_bprm_committed_creds(bprm);
+ mutex_unlock(&current->signal->exec_update_mutex);
mutex_unlock(&current->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 88050259c466..a29df79540ce 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -224,7 +224,14 @@ struct signal_struct {

struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
- * (notably. ptrace) */
+ * (notably. ptrace)
+ * Deprecated do not use in new code.
+ * Use exec_update_mutex instead.
+ */
+ struct mutex exec_update_mutex; /* Held while task_struct is being
+ * updated during exec, and may have
+ * inconsistent permissions.
+ */
} __randomize_layout;

/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 60a1295f4384..12896a6ecee6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_score_adj_min = current->signal->oom_score_adj_min;

mutex_init(&sig->cred_guard_mutex);
+ mutex_init(&sig->exec_update_mutex);

return 0;
}
--
2.25.0


2020-03-05 21:52:37

by Bernd Edlinger

[permalink] [raw]
Subject: Re: [PATCH 2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex

On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>
> The cred_guard_mutex is problematic. The cred_guard_mutex is held
> over the userspace accesses as the arguments from userspace are read.
> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
> threads are killed. The cred_guard_mutex is held over
> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>
> Any of those can result in deadlock, as the cred_guard_mutex is held
> over a possible indefinite userspace waits for userspace.
>
> Add exec_update_mutex that is only held over exec updating process
> with the new contents of exec, so that code that needs not to be
> confused by exec changing the mm and the cred in ways that can not
> happen during ordinary execution of a process can take.
>
> The plan is to switch the users of cred_guard_mutex to
> exed_udpate_mutex one by one. This lets us move forward while still
> being careful and not introducing any regressions.
>
> Link: https://lore.kernel.org/lkml/[email protected]/
> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
> Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
> Link: https://lore.kernel.org/lkml/[email protected]/
> Link: https://lore.kernel.org/lkml/[email protected]/
> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
> fs/exec.c | 4 ++++
> include/linux/sched/signal.h | 9 ++++++++-
> kernel/fork.c | 1 +
> 3 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index c243f9660d46..ad7b518f906d 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1182,6 +1182,7 @@ static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
> release_task(leader);
> }
>
> + mutex_lock(&current->signal->exec_update_mutex);
> bprm->unrecoverable = true;
> sig->group_exit_task = NULL;
> sig->notify_count = 0;
> @@ -1425,6 +1426,8 @@ static void free_bprm(struct linux_binprm *bprm)
> {
> free_arg_pages(bprm);
> if (bprm->cred) {
> + if (bprm->unrecoverable)
> + mutex_unlock(&current->signal->exec_update_mutex);
> mutex_unlock(&current->signal->cred_guard_mutex);
> abort_creds(bprm->cred);
> }
> @@ -1474,6 +1477,7 @@ void install_exec_creds(struct linux_binprm *bprm)
> * credentials; any time after this it may be unlocked.
> */
> security_bprm_committed_creds(bprm);
> + mutex_unlock(&current->signal->exec_update_mutex);
> mutex_unlock(&current->signal->cred_guard_mutex);
> }
> EXPORT_SYMBOL(install_exec_creds);
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 88050259c466..a29df79540ce 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -224,7 +224,14 @@ struct signal_struct {
>
> struct mutex cred_guard_mutex; /* guard against foreign influences on
> * credential calculations
> - * (notably. ptrace) */
> + * (notably. ptrace)
> + * Deprecated do not use in new code.
> + * Use exec_update_mutex instead.
> + */
> + struct mutex exec_update_mutex; /* Held while task_struct is being
> + * updated during exec, and may have
> + * inconsistent permissions.
> + */
> } __randomize_layout;
>
> /*
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 60a1295f4384..12896a6ecee6 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
> sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>
> mutex_init(&sig->cred_guard_mutex);
> + mutex_init(&sig->exec_update_mutex);
>
> return 0;
> }
>
Don't you need to add something like this to init/init_task.c ?

.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),


Bernd.

2020-03-06 05:20:52

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex

Bernd Edlinger <[email protected]> writes:

> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>
>> The cred_guard_mutex is problematic. The cred_guard_mutex is held
>> over the userspace accesses as the arguments from userspace are read.
>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>> threads are killed. The cred_guard_mutex is held over
>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>
>> Any of those can result in deadlock, as the cred_guard_mutex is held
>> over a possible indefinite userspace waits for userspace.
>>
>> Add exec_update_mutex that is only held over exec updating process
>> with the new contents of exec, so that code that needs not to be
>> confused by exec changing the mm and the cred in ways that can not
>> happen during ordinary execution of a process can take.
>>
>> The plan is to switch the users of cred_guard_mutex to
>> exed_udpate_mutex one by one. This lets us move forward while still
>> being careful and not introducing any regressions.
>>
>> Link: https://lore.kernel.org/lkml/[email protected]/
>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>> Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
>> Link: https://lore.kernel.org/lkml/[email protected]/
>> Link: https://lore.kernel.org/lkml/[email protected]/
>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>> Signed-off-by: "Eric W. Biederman" <[email protected]>
>> ---
>> fs/exec.c | 4 ++++
>> include/linux/sched/signal.h | 9 ++++++++-
>> kernel/fork.c | 1 +
>> 3 files changed, 13 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/exec.c b/fs/exec.c
>> index c243f9660d46..ad7b518f906d 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1182,6 +1182,7 @@ static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
>> release_task(leader);
>> }
>>
>> + mutex_lock(&current->signal->exec_update_mutex);
>> bprm->unrecoverable = true;
>> sig->group_exit_task = NULL;
>> sig->notify_count = 0;
>> @@ -1425,6 +1426,8 @@ static void free_bprm(struct linux_binprm *bprm)
>> {
>> free_arg_pages(bprm);
>> if (bprm->cred) {
>> + if (bprm->unrecoverable)
>> + mutex_unlock(&current->signal->exec_update_mutex);
>> mutex_unlock(&current->signal->cred_guard_mutex);
>> abort_creds(bprm->cred);
>> }
>> @@ -1474,6 +1477,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>> * credentials; any time after this it may be unlocked.
>> */
>> security_bprm_committed_creds(bprm);
>> + mutex_unlock(&current->signal->exec_update_mutex);
>> mutex_unlock(&current->signal->cred_guard_mutex);
>> }
>> EXPORT_SYMBOL(install_exec_creds);
>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>> index 88050259c466..a29df79540ce 100644
>> --- a/include/linux/sched/signal.h
>> +++ b/include/linux/sched/signal.h
>> @@ -224,7 +224,14 @@ struct signal_struct {
>>
>> struct mutex cred_guard_mutex; /* guard against foreign influences on
>> * credential calculations
>> - * (notably. ptrace) */
>> + * (notably. ptrace)
>> + * Deprecated do not use in new code.
>> + * Use exec_update_mutex instead.
>> + */
>> + struct mutex exec_update_mutex; /* Held while task_struct is being
>> + * updated during exec, and may have
>> + * inconsistent permissions.
>> + */
>> } __randomize_layout;
>>
>> /*
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 60a1295f4384..12896a6ecee6 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>> sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>>
>> mutex_init(&sig->cred_guard_mutex);
>> + mutex_init(&sig->exec_update_mutex);
>>
>> return 0;
>> }
>>
> Don't you need to add something like this to init/init_task.c ?
>
> .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),

Yes. I overlooked that. Thank you.

Eric

2020-03-06 11:47:36

by Bernd Edlinger

[permalink] [raw]
Subject: Re: [PATCH 2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex



Am 06.03.20 um 06:17 schrieb Eric W. Biederman:
> Bernd Edlinger <[email protected]> writes:
>
>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>
>>> The cred_guard_mutex is problematic. The cred_guard_mutex is held
>>> over the userspace accesses as the arguments from userspace are read.
>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>> threads are killed. The cred_guard_mutex is held over
>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>
>>> Any of those can result in deadlock, as the cred_guard_mutex is held
>>> over a possible indefinite userspace waits for userspace.
>>>
>>> Add exec_update_mutex that is only held over exec updating process
>>> with the new contents of exec, so that code that needs not to be
>>> confused by exec changing the mm and the cred in ways that can not
>>> happen during ordinary execution of a process can take.
>>>
>>> The plan is to switch the users of cred_guard_mutex to
>>> exed_udpate_mutex one by one. This lets us move forward while still
>>> being careful and not introducing any regressions.
>>>
>>> Link: https://lore.kernel.org/lkml/[email protected]/
>>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>>> Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
>>> Link: https://lore.kernel.org/lkml/[email protected]/
>>> Link: https://lore.kernel.org/lkml/[email protected]/
>>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>>> Signed-off-by: "Eric W. Biederman" <[email protected]>
>>> ---
>>> fs/exec.c | 4 ++++
>>> include/linux/sched/signal.h | 9 ++++++++-
>>> kernel/fork.c | 1 +
>>> 3 files changed, 13 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/fs/exec.c b/fs/exec.c
>>> index c243f9660d46..ad7b518f906d 100644
>>> --- a/fs/exec.c
>>> +++ b/fs/exec.c
>>> @@ -1182,6 +1182,7 @@ static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
>>> release_task(leader);
>>> }
>>>
>>> + mutex_lock(&current->signal->exec_update_mutex);

And by the way, could you make this mutex_lock_killable?


Bernd.

2020-03-06 19:16:44

by Bernd Edlinger

[permalink] [raw]
Subject: Re: [PATCH 2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex

On 3/6/20 6:17 AM, Eric W. Biederman wrote:
> Bernd Edlinger <[email protected]> writes:
>
>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>
>>> The cred_guard_mutex is problematic. The cred_guard_mutex is held
>>> over the userspace accesses as the arguments from userspace are read.
>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>> threads are killed. The cred_guard_mutex is held over
>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>

I am all for this patch, and the direction it is heading, Eric.

I just wanted to add a note that I think it is
possible that exec_mm_release can also invoke put_user(0, tsk->clear_child_tid),
under the new exec_update_mutex, since vm_access increments the
mm->mm_users, under the cred_update_mutex, but releases the mutex,
and the caller can hold the reference for a while and then exec_mmap is not
releasing the last reference.


Bernd.

2020-03-06 21:22:25

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex

Bernd Edlinger <[email protected]> writes:

> Am 06.03.20 um 06:17 schrieb Eric W. Biederman:
>> Bernd Edlinger <[email protected]> writes:
>>
>>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>>
>>>> The cred_guard_mutex is problematic. The cred_guard_mutex is held
>>>> over the userspace accesses as the arguments from userspace are read.
>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>> threads are killed. The cred_guard_mutex is held over
>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>
>>>> Any of those can result in deadlock, as the cred_guard_mutex is held
>>>> over a possible indefinite userspace waits for userspace.
>>>>
>>>> Add exec_update_mutex that is only held over exec updating process
>>>> with the new contents of exec, so that code that needs not to be
>>>> confused by exec changing the mm and the cred in ways that can not
>>>> happen during ordinary execution of a process can take.
>>>>
>>>> The plan is to switch the users of cred_guard_mutex to
>>>> exed_udpate_mutex one by one. This lets us move forward while still
>>>> being careful and not introducing any regressions.
>>>>
>>>> Link: https://lore.kernel.org/lkml/[email protected]/
>>>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>>>> Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
>>>> Link: https://lore.kernel.org/lkml/[email protected]/
>>>> Link: https://lore.kernel.org/lkml/[email protected]/
>>>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>>>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>>>> Signed-off-by: "Eric W. Biederman" <[email protected]>
>>>> ---
>>>> fs/exec.c | 4 ++++
>>>> include/linux/sched/signal.h | 9 ++++++++-
>>>> kernel/fork.c | 1 +
>>>> 3 files changed, 13 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/exec.c b/fs/exec.c
>>>> index c243f9660d46..ad7b518f906d 100644
>>>> --- a/fs/exec.c
>>>> +++ b/fs/exec.c
>>>> @@ -1182,6 +1182,7 @@ static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
>>>> release_task(leader);
>>>> }
>>>>
>>>> + mutex_lock(&current->signal->exec_update_mutex);
>
> And by the way, could you make this mutex_lock_killable?

For some reason when I first read this suggestion I thought making this
mutex_lock_killable would cause me to rework the logic of when I
set unrecoverable and when I unlock the mutex. I blame a tired brain.
If a process has received a fatal signal none of that matters.

So yes I will do that just to make things robust in case we miss
something that would still make it possible to deadlock in with the new
mutex.

I am a little worried that the new mutex might still cover a little too
much. But past a certain point I we are not being able to make this
code perfect in the first change. The best we can do is to be careful
and avoid regressions. Whatever slips through we can fix when we spot
the problem.

Eric

2020-03-06 22:01:56

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex

Bernd Edlinger <[email protected]> writes:

> On 3/6/20 6:17 AM, Eric W. Biederman wrote:
>> Bernd Edlinger <[email protected]> writes:
>>
>>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>>
>>>> The cred_guard_mutex is problematic. The cred_guard_mutex is held
>>>> over the userspace accesses as the arguments from userspace are read.
>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>> threads are killed. The cred_guard_mutex is held over
>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>
>
> I am all for this patch, and the direction it is heading, Eric.
>
> I just wanted to add a note that I think it is
> possible that exec_mm_release can also invoke put_user(0, tsk->clear_child_tid),
> under the new exec_update_mutex, since vm_access increments the
> mm->mm_users, under the cred_update_mutex, but releases the mutex,
> and the caller can hold the reference for a while and then exec_mmap is not
> releasing the last reference.

Good catch. I really appreciate your close look at the details.

I am wondering if process_vm_readv and process_vm_writev could be
safely changed to use mmgrab and mmdrop, instead of mmget and mmput.

That would resolve the potential issue you have pointed out. I just
haven't figured out if it is safe. The mm code has been seriously
refactored since I knew how it all worked.

Eric

2020-03-06 22:32:36

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex

[email protected] (Eric W. Biederman) writes:

> Bernd Edlinger <[email protected]> writes:
>
>> On 3/6/20 6:17 AM, Eric W. Biederman wrote:
>>> Bernd Edlinger <[email protected]> writes:
>>>
>>>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>>>
>>>>> The cred_guard_mutex is problematic. The cred_guard_mutex is held
>>>>> over the userspace accesses as the arguments from userspace are read.
>>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>>> threads are killed. The cred_guard_mutex is held over
>>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>>
>>
>> I am all for this patch, and the direction it is heading, Eric.
>>
>> I just wanted to add a note that I think it is
>> possible that exec_mm_release can also invoke put_user(0, tsk->clear_child_tid),
>> under the new exec_update_mutex, since vm_access increments the
>> mm->mm_users, under the cred_update_mutex, but releases the mutex,
>> and the caller can hold the reference for a while and then exec_mmap is not
>> releasing the last reference.
>
> Good catch. I really appreciate your close look at the details.
>
> I am wondering if process_vm_readv and process_vm_writev could be
> safely changed to use mmgrab and mmdrop, instead of mmget and mmput.
>
> That would resolve the potential issue you have pointed out. I just
> haven't figured out if it is safe. The mm code has been seriously
> refactored since I knew how it all worked.

Nope, mmget can not be replaced by mmgrab.

It might be possible to do something creative like store a cred in place
of the userns on the mm and use that for mm_access permission checks.
Still we are talking a pretty narrow window, and a case that no one has
figured out how to trigger yet. So I will leave that corner case as
something for future improvements.

Eric

2020-03-07 01:06:58

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex

[email protected] (Eric W. Biederman) writes:

> [email protected] (Eric W. Biederman) writes:
>
>> Bernd Edlinger <[email protected]> writes:
>>
>>> On 3/6/20 6:17 AM, Eric W. Biederman wrote:
>>>> Bernd Edlinger <[email protected]> writes:
>>>>
>>>>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>>>>
>>>>>> The cred_guard_mutex is problematic. The cred_guard_mutex is held
>>>>>> over the userspace accesses as the arguments from userspace are read.
>>>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>>>> threads are killed. The cred_guard_mutex is held over
>>>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>>>
>>>
>>> I am all for this patch, and the direction it is heading, Eric.
>>>
>>> I just wanted to add a note that I think it is
>>> possible that exec_mm_release can also invoke put_user(0, tsk->clear_child_tid),
>>> under the new exec_update_mutex, since vm_access increments the
>>> mm->mm_users, under the cred_update_mutex, but releases the mutex,
>>> and the caller can hold the reference for a while and then exec_mmap is not
>>> releasing the last reference.
>>
>> Good catch. I really appreciate your close look at the details.
>>
>> I am wondering if process_vm_readv and process_vm_writev could be
>> safely changed to use mmgrab and mmdrop, instead of mmget and mmput.
>>
>> That would resolve the potential issue you have pointed out. I just
>> haven't figured out if it is safe. The mm code has been seriously
>> refactored since I knew how it all worked.
>
> Nope, mmget can not be replaced by mmgrab.
>
> It might be possible to do something creative like store a cred in place
> of the userns on the mm and use that for mm_access permission checks.
> Still we are talking a pretty narrow window, and a case that no one has
> figured out how to trigger yet. So I will leave that corner case as
> something for future improvements.

My brain is restless and keep looking at it.

The worst case is processes created with CLONE_VM|CLONE_CHILD_CLEARTID
but not CLONE_THREAD. For those that put_user will occur ever time
in exec_mmap.

The only solution that I can see is to move taking the new mutex after
exec_mm_release. Which may be feasible given how close exec_mmap
follows de_thread.

I am going to sleep on that and perhaps I will be able to see how to
move taking the mutex lower.

It would be very nice not to have a known issue going into this set of
changes.

Eric

2020-03-08 12:59:15

by Bernd Edlinger

[permalink] [raw]
Subject: [PATCH] exec: make de_thread alloc new signal struct earlier

It was pointed out that de_thread may return -ENOMEM
when it already terminated threads, and returning
an error from execve, except when a fatal signal is
being delivered is not an option any more.

Allocate the memory for the signal table earlier,
and make sure that -ENOMEM is returned before the
unrecoverable actions are started.

Signed-off-by: Bernd Edlinger <[email protected]>
---
Eric, what do you think, might this be helpful
to move the "point of no return" lower, and simplify
your patch?

fs/exec.c | 31 +++++++++++++++++++++++--------
1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 74d88da..a0328dc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1057,16 +1057,26 @@ static int exec_mmap(struct mm_struct *mm)
* disturbing other processes. (Other processes might share the signal
* table via the CLONE_SIGHAND option to clone().)
*/
-static int de_thread(struct task_struct *tsk)
+static int de_thread(void)
{
+ struct task_struct *tsk = current;
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
+ struct sighand_struct *newsighand = NULL;

if (thread_group_empty(tsk))
goto no_thread_group;

/*
+ * This is the last time for an out of memory error.
+ * After this point only fatal signals are are okay.
+ */
+ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+ if (!newsighand)
+ return -ENOMEM;
+
+ /*
* Kill all other threads in the thread group.
*/
spin_lock_irq(lock);
@@ -1076,7 +1086,7 @@ static int de_thread(struct task_struct *tsk)
* return so that the signal is processed.
*/
spin_unlock_irq(lock);
- return -EAGAIN;
+ goto err_free;
}

sig->group_exit_task = tsk;
@@ -1191,14 +1201,16 @@ static int de_thread(struct task_struct *tsk)
#endif

if (refcount_read(&oldsighand->count) != 1) {
- struct sighand_struct *newsighand;
/*
* This ->sighand is shared with the CLONE_SIGHAND
* but not CLONE_THREAD task, switch to the new one.
*/
- newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
- if (!newsighand)
- return -ENOMEM;
+ if (!newsighand) {
+ newsighand = kmem_cache_alloc(sighand_cachep,
+ GFP_KERNEL);
+ if (!newsighand)
+ return -ENOMEM;
+ }

refcount_set(&newsighand->count, 1);
memcpy(newsighand->action, oldsighand->action,
@@ -1211,7 +1223,8 @@ static int de_thread(struct task_struct *tsk)
write_unlock_irq(&tasklist_lock);

__cleanup_sighand(oldsighand);
- }
+ } else if (newsighand)
+ kmem_cache_free(sighand_cachep, newsighand);

BUG_ON(!thread_group_leader(tsk));
return 0;
@@ -1222,6 +1235,8 @@ static int de_thread(struct task_struct *tsk)
sig->group_exit_task = NULL;
sig->notify_count = 0;
read_unlock(&tasklist_lock);
+err_free:
+ kmem_cache_free(sighand_cachep, newsighand);
return -EAGAIN;
}

@@ -1262,7 +1277,7 @@ int flush_old_exec(struct linux_binprm * bprm)
* Make sure we have a private signal table and that
* we are unassociated from the previous thread group.
*/
- retval = de_thread(current);
+ retval = de_thread();
if (retval)
goto out;

--
1.9.1

2020-03-08 18:15:23

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] exec: make de_thread alloc new signal struct earlier

Bernd Edlinger <[email protected]> writes:

> It was pointed out that de_thread may return -ENOMEM
> when it already terminated threads, and returning
> an error from execve, except when a fatal signal is
> being delivered is not an option any more.
>
> Allocate the memory for the signal table earlier,
> and make sure that -ENOMEM is returned before the
> unrecoverable actions are started.
>
> Signed-off-by: Bernd Edlinger <[email protected]>
> ---
> Eric, what do you think, might this be helpful
> to move the "point of no return" lower, and simplify
> your patch?

Good thinking but no. In this case it is possible to move the entire
allocation lower. As well as the posix timer cleanup. That code is
actually much clearer outside of de_thread. I will post a patch in that
direction in a moment.

It is something of a bad idea to move the new sighand allocation sooner
because in practice it does not happen. It only exists to support the
CLONE_VM | CLONE_SIGHAND without CLONE_SIGNAL case which is not used
by the modern posix thread libraries.

There are just enough old executables floating out there that I don't
think we can remove the CLONE_SIGHAND case in general but I keep
dreaming about it. We get a lot of complexity in the code to support
something that no one really does anymore.

Eric

> fs/exec.c | 31 +++++++++++++++++++++++--------
> 1 file changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 74d88da..a0328dc 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1057,16 +1057,26 @@ static int exec_mmap(struct mm_struct *mm)
> * disturbing other processes. (Other processes might share the signal
> * table via the CLONE_SIGHAND option to clone().)
> */
> -static int de_thread(struct task_struct *tsk)
> +static int de_thread(void)
> {
> + struct task_struct *tsk = current;
> struct signal_struct *sig = tsk->signal;
> struct sighand_struct *oldsighand = tsk->sighand;
> spinlock_t *lock = &oldsighand->siglock;
> + struct sighand_struct *newsighand = NULL;
>
> if (thread_group_empty(tsk))
> goto no_thread_group;
>
> /*
> + * This is the last time for an out of memory error.
> + * After this point only fatal signals are are okay.
> + */
> + newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
> + if (!newsighand)
> + return -ENOMEM;
> +
> + /*
> * Kill all other threads in the thread group.
> */
> spin_lock_irq(lock);
> @@ -1076,7 +1086,7 @@ static int de_thread(struct task_struct *tsk)
> * return so that the signal is processed.
> */
> spin_unlock_irq(lock);
> - return -EAGAIN;
> + goto err_free;
> }
>
> sig->group_exit_task = tsk;
> @@ -1191,14 +1201,16 @@ static int de_thread(struct task_struct *tsk)
> #endif
>
> if (refcount_read(&oldsighand->count) != 1) {
> - struct sighand_struct *newsighand;
> /*
> * This ->sighand is shared with the CLONE_SIGHAND
> * but not CLONE_THREAD task, switch to the new one.
> */
> - newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
> - if (!newsighand)
> - return -ENOMEM;
> + if (!newsighand) {
> + newsighand = kmem_cache_alloc(sighand_cachep,
> + GFP_KERNEL);
> + if (!newsighand)
> + return -ENOMEM;
> + }
>
> refcount_set(&newsighand->count, 1);
> memcpy(newsighand->action, oldsighand->action,
> @@ -1211,7 +1223,8 @@ static int de_thread(struct task_struct *tsk)
> write_unlock_irq(&tasklist_lock);
>
> __cleanup_sighand(oldsighand);
> - }
> + } else if (newsighand)
> + kmem_cache_free(sighand_cachep, newsighand);
>
> BUG_ON(!thread_group_leader(tsk));
> return 0;
> @@ -1222,6 +1235,8 @@ static int de_thread(struct task_struct *tsk)
> sig->group_exit_task = NULL;
> sig->notify_count = 0;
> read_unlock(&tasklist_lock);
> +err_free:
> + kmem_cache_free(sighand_cachep, newsighand);
> return -EAGAIN;
> }
>
> @@ -1262,7 +1277,7 @@ int flush_old_exec(struct linux_binprm * bprm)
> * Make sure we have a private signal table and that
> * we are unassociated from the previous thread group.
> */
> - retval = de_thread(current);
> + retval = de_thread();
> if (retval)
> goto out;