On Mon 17-10-16 11:39:49, Eric W. Biederman wrote:
>
> During exec dumpable is cleared if the file that is being executed is
> not readable by the user executing the file. A bug in
> ptrace_may_access allows reading the file if the executable happens to
> enter into a subordinate user namespace (aka clone(CLONE_NEWUSER),
> unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER).
>
> This problem is fixed with only necessary userspace breakage by adding
> a user namespace owner to mm_struct, captured at the time of exec,
> so it is clear in which user namespace CAP_SYS_PTRACE must be present
> in to be able to safely give read permission to the executable.
>
> The function ptrace_may_access is modified to verify that the ptracer
> has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns.
> This ensures that if the task changes it's cred into a subordinate
> user namespace it does not become ptraceable.
I haven't studied your patch too deeply but one thing that immediately
raised a red flag was that mm might be shared between processes (aka
thread groups). What prevents those two to sit in different user
namespaces?
I am primarily asking because this generated a lot of headache for the
memcg handling as those processes might sit in different cgroups while
there is only one correct memcg for them which can disagree with the
cgroup associated with one of the processes.
> Cc: [email protected]
> Fixes: 8409cca70561 ("userns: allow ptrace from non-init user namespaces")
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
>
> It turns out that dumpable needs to be fixed to be user namespace
> aware to fix this issue. When this patch is ready I plan to place it in
> my userns tree and send it to Linus, hopefully for -rc2.
>
> include/linux/mm_types.h | 1 +
> kernel/fork.c | 9 ++++++---
> kernel/ptrace.c | 17 ++++++-----------
> mm/init-mm.c | 2 ++
> 4 files changed, 15 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 4a8acedf4b7d..08d947fc4c59 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -473,6 +473,7 @@ struct mm_struct {
> */
> struct task_struct __rcu *owner;
> #endif
> + struct user_namespace *user_ns;
>
> /* store ref to file /proc/<pid>/exe symlink points to */
> struct file __rcu *exe_file;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 623259fc794d..fd85c68c2791 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -742,7 +742,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
> #endif
> }
>
> -static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
> +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> + struct user_namespace *user_ns)
> {
> mm->mmap = NULL;
> mm->mm_rb = RB_ROOT;
> @@ -782,6 +783,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
> if (init_new_context(p, mm))
> goto fail_nocontext;
>
> + mm->user_ns = get_user_ns(user_ns);
> return mm;
>
> fail_nocontext:
> @@ -827,7 +829,7 @@ struct mm_struct *mm_alloc(void)
> return NULL;
>
> memset(mm, 0, sizeof(*mm));
> - return mm_init(mm, current);
> + return mm_init(mm, current, current_user_ns());
> }
>
> /*
> @@ -842,6 +844,7 @@ void __mmdrop(struct mm_struct *mm)
> destroy_context(mm);
> mmu_notifier_mm_destroy(mm);
> check_mm(mm);
> + put_user_ns(mm->user_ns);
> free_mm(mm);
> }
> EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -1123,7 +1126,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
>
> memcpy(mm, oldmm, sizeof(*mm));
>
> - if (!mm_init(mm, tsk))
> + if (!mm_init(mm, tsk, mm->user_ns))
> goto fail_nomem;
>
> err = dup_mmap(mm, oldmm);
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 2a99027312a6..f2d1b9afb3f8 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -220,7 +220,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
> static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
> {
> const struct cred *cred = current_cred(), *tcred;
> - int dumpable = 0;
> + struct mm_struct *mm;
> kuid_t caller_uid;
> kgid_t caller_gid;
>
> @@ -271,16 +271,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
> return -EPERM;
> ok:
> rcu_read_unlock();
> - smp_rmb();
> - if (task->mm)
> - dumpable = get_dumpable(task->mm);
> - rcu_read_lock();
> - if (dumpable != SUID_DUMP_USER &&
> - !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
> - rcu_read_unlock();
> - return -EPERM;
> - }
> - rcu_read_unlock();
> + mm = task->mm;
> + if (!mm ||
> + ((get_dumpable(mm) != SUID_DUMP_USER) &&
> + !ptrace_has_cap(mm->user_ns, mode)))
> + return -EPERM;
>
> return security_ptrace_access_check(task, mode);
> }
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index a56a851908d2..975e49f00f34 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -6,6 +6,7 @@
> #include <linux/cpumask.h>
>
> #include <linux/atomic.h>
> +#include <linux/user_namespace.h>
> #include <asm/pgtable.h>
> #include <asm/mmu.h>
>
> @@ -21,5 +22,6 @@ struct mm_struct init_mm = {
> .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
> .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
> .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
> + .user_ns = &init_user_ns,
> INIT_MM_CONTEXT(init_mm)
> };
> --
> 2.8.3
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
--
Michal Hocko
SUSE Labs
On Tue, Oct 18, 2016 at 03:50:32PM +0200, Michal Hocko wrote:
> On Mon 17-10-16 11:39:49, Eric W. Biederman wrote:
> >
> > During exec dumpable is cleared if the file that is being executed is
> > not readable by the user executing the file. A bug in
> > ptrace_may_access allows reading the file if the executable happens to
> > enter into a subordinate user namespace (aka clone(CLONE_NEWUSER),
> > unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER).
> >
> > This problem is fixed with only necessary userspace breakage by adding
> > a user namespace owner to mm_struct, captured at the time of exec,
> > so it is clear in which user namespace CAP_SYS_PTRACE must be present
> > in to be able to safely give read permission to the executable.
> >
> > The function ptrace_may_access is modified to verify that the ptracer
> > has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns.
> > This ensures that if the task changes it's cred into a subordinate
> > user namespace it does not become ptraceable.
>
> I haven't studied your patch too deeply but one thing that immediately
> raised a red flag was that mm might be shared between processes (aka
> thread groups).
You're conflating things. Threads always share memory, but sharing memory
doesn't imply being part of the same thread group.
> What prevents those two to sit in different user
> namespaces?
For thread groups: You can't change user namespace in a thread group
with more than one task.
For shared mm: Yeah, I think that could happen - but it doesn't matter.
The patch just needs the mm to determine the namespace in which the mm
was created, and that's always the same for tasks that share mm.
Michal Hocko <[email protected]> writes:
> On Mon 17-10-16 11:39:49, Eric W. Biederman wrote:
>>
>> During exec dumpable is cleared if the file that is being executed is
>> not readable by the user executing the file. A bug in
>> ptrace_may_access allows reading the file if the executable happens to
>> enter into a subordinate user namespace (aka clone(CLONE_NEWUSER),
>> unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER).
>>
>> This problem is fixed with only necessary userspace breakage by adding
>> a user namespace owner to mm_struct, captured at the time of exec,
>> so it is clear in which user namespace CAP_SYS_PTRACE must be present
>> in to be able to safely give read permission to the executable.
>>
>> The function ptrace_may_access is modified to verify that the ptracer
>> has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns.
>> This ensures that if the task changes it's cred into a subordinate
>> user namespace it does not become ptraceable.
>
> I haven't studied your patch too deeply but one thing that immediately
> raised a red flag was that mm might be shared between processes (aka
> thread groups). What prevents those two to sit in different user
> namespaces?
>
> I am primarily asking because this generated a lot of headache for the
> memcg handling as those processes might sit in different cgroups while
> there is only one correct memcg for them which can disagree with the
> cgroup associated with one of the processes.
That is a legitimate concern, but I do not see any of those kinds of
issues here.
Part of the memcg pain comes from the fact that control groups are
process centric, and part of the pain comes from the fact that it is
possible to change control groups. What I am doing is making the mm
owned by a user namespace (at creation time), and I am not allowing
changes to that ownership. The credentials of the tasks that use that mm
may be in the same user namespace or descendent user namespaces.
The core goal is to enforce the unreadability of an mm when an
non-readable file is executed. This is a time of mm creation property.
The enforcement of which fits very well with the security/permission
checking role of the user namespace.
Could this use of mm->user_ns be extended for some kind of
accounting/limiting in the future? Possibly. I can imagine a limit on
the total number of page table entries a group of processes are allowed
to have as being a sane kind of limit in this setting much like
RLIMIT_AS is sane on a single mm level. Pages don't belong to mm's so I
can't imagine anything like the memcg being built on this kind of
infrastructure.
Eric
On Tue, Oct 18, 2016 at 09:56:53AM -0500, Eric W. Biederman wrote:
> Michal Hocko <[email protected]> writes:
>
> > On Mon 17-10-16 11:39:49, Eric W. Biederman wrote:
> >>
> >> During exec dumpable is cleared if the file that is being executed is
> >> not readable by the user executing the file. A bug in
> >> ptrace_may_access allows reading the file if the executable happens to
> >> enter into a subordinate user namespace (aka clone(CLONE_NEWUSER),
> >> unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER).
> >>
> >> This problem is fixed with only necessary userspace breakage by adding
> >> a user namespace owner to mm_struct, captured at the time of exec,
> >> so it is clear in which user namespace CAP_SYS_PTRACE must be present
> >> in to be able to safely give read permission to the executable.
> >>
> >> The function ptrace_may_access is modified to verify that the ptracer
> >> has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns.
> >> This ensures that if the task changes it's cred into a subordinate
> >> user namespace it does not become ptraceable.
> >
> > I haven't studied your patch too deeply but one thing that immediately
> > raised a red flag was that mm might be shared between processes (aka
> > thread groups). What prevents those two to sit in different user
> > namespaces?
> >
> > I am primarily asking because this generated a lot of headache for the
> > memcg handling as those processes might sit in different cgroups while
> > there is only one correct memcg for them which can disagree with the
> > cgroup associated with one of the processes.
>
> That is a legitimate concern, but I do not see any of those kinds of
> issues here.
>
> Part of the memcg pain comes from the fact that control groups are
> process centric, and part of the pain comes from the fact that it is
> possible to change control groups. What I am doing is making the mm
> owned by a user namespace (at creation time), and I am not allowing
> changes to that ownership. The credentials of the tasks that use that mm
> may be in the same user namespace or descendent user namespaces.
>
> The core goal is to enforce the unreadability of an mm when an
> non-readable file is executed. This is a time of mm creation property.
> The enforcement of which fits very well with the security/permission
> checking role of the user namespace.
How is that going to work? I thought the core goal was better security for
entering containers.
If I want to dump a non-readable file, afaik, I can just make a new user
namespace, then run the file in there and dump its memory.
I guess you could fix that by entirely prohibiting the execution of a
non-readable file whose owner UID is not mapped. (Adding more dumping
restrictions wouldn't help much because you could still e.g. supply a
malicious dynamic linker if you control the mount namespace.)
Jann Horn <[email protected]> writes:
> On Tue, Oct 18, 2016 at 09:56:53AM -0500, Eric W. Biederman wrote:
>> Michal Hocko <[email protected]> writes:
>>
>> > On Mon 17-10-16 11:39:49, Eric W. Biederman wrote:
>> >>
>> >> During exec dumpable is cleared if the file that is being executed is
>> >> not readable by the user executing the file. A bug in
>> >> ptrace_may_access allows reading the file if the executable happens to
>> >> enter into a subordinate user namespace (aka clone(CLONE_NEWUSER),
>> >> unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER).
>> >>
>> >> This problem is fixed with only necessary userspace breakage by adding
>> >> a user namespace owner to mm_struct, captured at the time of exec,
>> >> so it is clear in which user namespace CAP_SYS_PTRACE must be present
>> >> in to be able to safely give read permission to the executable.
>> >>
>> >> The function ptrace_may_access is modified to verify that the ptracer
>> >> has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns.
>> >> This ensures that if the task changes it's cred into a subordinate
>> >> user namespace it does not become ptraceable.
>> >
>> > I haven't studied your patch too deeply but one thing that immediately
>> > raised a red flag was that mm might be shared between processes (aka
>> > thread groups). What prevents those two to sit in different user
>> > namespaces?
>> >
>> > I am primarily asking because this generated a lot of headache for the
>> > memcg handling as those processes might sit in different cgroups while
>> > there is only one correct memcg for them which can disagree with the
>> > cgroup associated with one of the processes.
>>
>> That is a legitimate concern, but I do not see any of those kinds of
>> issues here.
>>
>> Part of the memcg pain comes from the fact that control groups are
>> process centric, and part of the pain comes from the fact that it is
>> possible to change control groups. What I am doing is making the mm
>> owned by a user namespace (at creation time), and I am not allowing
>> changes to that ownership. The credentials of the tasks that use that mm
>> may be in the same user namespace or descendent user namespaces.
>>
>> The core goal is to enforce the unreadability of an mm when an
>> non-readable file is executed. This is a time of mm creation property.
>> The enforcement of which fits very well with the security/permission
>> checking role of the user namespace.
>
> How is that going to work? I thought the core goal was better security for
> entering containers.
The better security when entering containers came from fixing the the
check for unreadable files. Because that is fundamentally what
the mm dumpable settings are for.
> If I want to dump a non-readable file, afaik, I can just make a new user
> namespace, then run the file in there and dump its memory.
> I guess you could fix that by entirely prohibiting the execution of a
> non-readable file whose owner UID is not mapped. (Adding more dumping
> restrictions wouldn't help much because you could still e.g. supply a
> malicious dynamic linker if you control the mount namespace.)
That seems to be a part of this puzzle I have incompletely addressed,
thank you.
It looks like I need to change either the owning user namespace or
fail the exec. Malicious dynamic linkers are doubly interesting.
As mount name spaces are also owned if I have privileges I can address
the possibility of a malicious dynamic linker that way. AKA who cares
about the link if the owner of the mount namespace has permissions to
read the file.
I am going to look at failing the exec if the owning user namespace
of the mm would not have permissions to read the file. That should just
be a couple of lines of code and easy to maintain. Plus it does not
appear that non-readable executables are particularly common.
Eric
On Tue 18-10-16 09:56:53, Eric W. Biederman wrote:
> Michal Hocko <[email protected]> writes:
>
> > On Mon 17-10-16 11:39:49, Eric W. Biederman wrote:
> >>
> >> During exec dumpable is cleared if the file that is being executed is
> >> not readable by the user executing the file. A bug in
> >> ptrace_may_access allows reading the file if the executable happens to
> >> enter into a subordinate user namespace (aka clone(CLONE_NEWUSER),
> >> unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER).
> >>
> >> This problem is fixed with only necessary userspace breakage by adding
> >> a user namespace owner to mm_struct, captured at the time of exec,
> >> so it is clear in which user namespace CAP_SYS_PTRACE must be present
> >> in to be able to safely give read permission to the executable.
> >>
> >> The function ptrace_may_access is modified to verify that the ptracer
> >> has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns.
> >> This ensures that if the task changes it's cred into a subordinate
> >> user namespace it does not become ptraceable.
> >
> > I haven't studied your patch too deeply but one thing that immediately
> > raised a red flag was that mm might be shared between processes (aka
> > thread groups). What prevents those two to sit in different user
> > namespaces?
> >
> > I am primarily asking because this generated a lot of headache for the
> > memcg handling as those processes might sit in different cgroups while
> > there is only one correct memcg for them which can disagree with the
> > cgroup associated with one of the processes.
>
> That is a legitimate concern, but I do not see any of those kinds of
> issues here.
>
> Part of the memcg pain comes from the fact that control groups are
> process centric, and part of the pain comes from the fact that it is
> possible to change control groups. What I am doing is making the mm
> owned by a user namespace (at creation time), and I am not allowing
> changes to that ownership. The credentials of the tasks that use that mm
> may be in the same user namespace or descendent user namespaces.
OK, then my worries about this weird "threading" model is void.
Thanks for the clarification.
--
Michal Hocko
SUSE Labs
On Tue, Oct 18, 2016 at 10:35:23AM -0500, Eric W. Biederman wrote:
> Jann Horn <[email protected]> writes:
>
> > On Tue, Oct 18, 2016 at 09:56:53AM -0500, Eric W. Biederman wrote:
> >> Michal Hocko <[email protected]> writes:
> >>
> >> > On Mon 17-10-16 11:39:49, Eric W. Biederman wrote:
> >> >>
> >> >> During exec dumpable is cleared if the file that is being executed is
> >> >> not readable by the user executing the file. A bug in
> >> >> ptrace_may_access allows reading the file if the executable happens to
> >> >> enter into a subordinate user namespace (aka clone(CLONE_NEWUSER),
> >> >> unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER).
> >> >>
> >> >> This problem is fixed with only necessary userspace breakage by adding
> >> >> a user namespace owner to mm_struct, captured at the time of exec,
> >> >> so it is clear in which user namespace CAP_SYS_PTRACE must be present
> >> >> in to be able to safely give read permission to the executable.
> >> >>
> >> >> The function ptrace_may_access is modified to verify that the ptracer
> >> >> has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns.
> >> >> This ensures that if the task changes it's cred into a subordinate
> >> >> user namespace it does not become ptraceable.
> >> >
> >> > I haven't studied your patch too deeply but one thing that immediately
> >> > raised a red flag was that mm might be shared between processes (aka
> >> > thread groups). What prevents those two to sit in different user
> >> > namespaces?
> >> >
> >> > I am primarily asking because this generated a lot of headache for the
> >> > memcg handling as those processes might sit in different cgroups while
> >> > there is only one correct memcg for them which can disagree with the
> >> > cgroup associated with one of the processes.
> >>
> >> That is a legitimate concern, but I do not see any of those kinds of
> >> issues here.
> >>
> >> Part of the memcg pain comes from the fact that control groups are
> >> process centric, and part of the pain comes from the fact that it is
> >> possible to change control groups. What I am doing is making the mm
> >> owned by a user namespace (at creation time), and I am not allowing
> >> changes to that ownership. The credentials of the tasks that use that mm
> >> may be in the same user namespace or descendent user namespaces.
> >>
> >> The core goal is to enforce the unreadability of an mm when an
> >> non-readable file is executed. This is a time of mm creation property.
> >> The enforcement of which fits very well with the security/permission
> >> checking role of the user namespace.
> >
> > How is that going to work? I thought the core goal was better security for
> > entering containers.
>
> The better security when entering containers came from fixing the the
> check for unreadable files. Because that is fundamentally what
> the mm dumpable settings are for.
Oh, interesting.
> > If I want to dump a non-readable file, afaik, I can just make a new user
> > namespace, then run the file in there and dump its memory.
> > I guess you could fix that by entirely prohibiting the execution of a
> > non-readable file whose owner UID is not mapped. (Adding more dumping
> > restrictions wouldn't help much because you could still e.g. supply a
> > malicious dynamic linker if you control the mount namespace.)
>
> That seems to be a part of this puzzle I have incompletely addressed,
> thank you.
>
> It looks like I need to change either the owning user namespace or
> fail the exec. Malicious dynamic linkers are doubly interesting.
>
> As mount name spaces are also owned if I have privileges I can address
> the possibility of a malicious dynamic linker that way. AKA who cares
> about the link if the owner of the mount namespace has permissions to
> read the file.
If you just check the owner of the mount namespace, someone could still
use a user namespace to chroot() the process. That should also be
sufficient to get the evil linker in. I think it really needs to be the
user namespace of the executing process that's checked, not the user
namespace associated with some mount namespace.
> I am going to look at failing the exec if the owning user namespace
> of the mm would not have permissions to read the file. That should just
> be a couple of lines of code and easy to maintain. Plus it does not
> appear that non-readable executables are particularly common.
Hm. Yeah, I guess mode 04111 probably isn't sooo common.
>From a security perspective, I think that should work.
Jann Horn <[email protected]> writes:
> On Tue, Oct 18, 2016 at 10:35:23AM -0500, Eric W. Biederman wrote:
>> Jann Horn <[email protected]> writes:
>>
>> > On Tue, Oct 18, 2016 at 09:56:53AM -0500, Eric W. Biederman wrote:
>> >> Michal Hocko <[email protected]> writes:
>> >>
>> >> > On Mon 17-10-16 11:39:49, Eric W. Biederman wrote:
>> >> >>
>> >> >> During exec dumpable is cleared if the file that is being executed is
>> >> >> not readable by the user executing the file. A bug in
>> >> >> ptrace_may_access allows reading the file if the executable happens to
>> >> >> enter into a subordinate user namespace (aka clone(CLONE_NEWUSER),
>> >> >> unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER).
>> >> >>
>> >> >> This problem is fixed with only necessary userspace breakage by adding
>> >> >> a user namespace owner to mm_struct, captured at the time of exec,
>> >> >> so it is clear in which user namespace CAP_SYS_PTRACE must be present
>> >> >> in to be able to safely give read permission to the executable.
>> >> >>
>> >> >> The function ptrace_may_access is modified to verify that the ptracer
>> >> >> has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns.
>> >> >> This ensures that if the task changes it's cred into a subordinate
>> >> >> user namespace it does not become ptraceable.
>> >> >
>> >> > I haven't studied your patch too deeply but one thing that immediately
>> >> > raised a red flag was that mm might be shared between processes (aka
>> >> > thread groups). What prevents those two to sit in different user
>> >> > namespaces?
>> >> >
>> >> > I am primarily asking because this generated a lot of headache for the
>> >> > memcg handling as those processes might sit in different cgroups while
>> >> > there is only one correct memcg for them which can disagree with the
>> >> > cgroup associated with one of the processes.
>> >>
>> >> That is a legitimate concern, but I do not see any of those kinds of
>> >> issues here.
>> >>
>> >> Part of the memcg pain comes from the fact that control groups are
>> >> process centric, and part of the pain comes from the fact that it is
>> >> possible to change control groups. What I am doing is making the mm
>> >> owned by a user namespace (at creation time), and I am not allowing
>> >> changes to that ownership. The credentials of the tasks that use that mm
>> >> may be in the same user namespace or descendent user namespaces.
>> >>
>> >> The core goal is to enforce the unreadability of an mm when an
>> >> non-readable file is executed. This is a time of mm creation property.
>> >> The enforcement of which fits very well with the security/permission
>> >> checking role of the user namespace.
>> >
>> > How is that going to work? I thought the core goal was better security for
>> > entering containers.
>>
>> The better security when entering containers came from fixing the the
>> check for unreadable files. Because that is fundamentally what
>> the mm dumpable settings are for.
>
> Oh, interesting.
>
>
>> > If I want to dump a non-readable file, afaik, I can just make a new user
>> > namespace, then run the file in there and dump its memory.
>> > I guess you could fix that by entirely prohibiting the execution of a
>> > non-readable file whose owner UID is not mapped. (Adding more dumping
>> > restrictions wouldn't help much because you could still e.g. supply a
>> > malicious dynamic linker if you control the mount namespace.)
>>
>> That seems to be a part of this puzzle I have incompletely addressed,
>> thank you.
>>
>> It looks like I need to change either the owning user namespace or
>> fail the exec. Malicious dynamic linkers are doubly interesting.
>>
>> As mount name spaces are also owned if I have privileges I can address
>> the possibility of a malicious dynamic linker that way. AKA who cares
>> about the link if the owner of the mount namespace has permissions to
>> read the file.
>
> If you just check the owner of the mount namespace, someone could still
> use a user namespace to chroot() the process. That should also be
> sufficient to get the evil linker in. I think it really needs to be the
> user namespace of the executing process that's checked, not the user
> namespace associated with some mount namespace.
Something. I will just note that this is hard to analyze and
theoretically possible for now, since I don't intend to pursue
that solution.
>> I am going to look at failing the exec if the owning user namespace
>> of the mm would not have permissions to read the file. That should just
>> be a couple of lines of code and easy to maintain. Plus it does not
>> appear that non-readable executables are particularly common.
>
> Hm. Yeah, I guess mode 04111 probably isn't sooo common.
> From a security perspective, I think that should work.
Well there is at least one common distro that installs sudo
that way so I would not say uncommon. But we already ignore
the suid and sgid bit when executing such executables as without
having the uid or gid mapping into a user namespace suid and sgid
can not be supported.
So the only case that could cause a real regression/loss of
functionality is if there are unreadable executables without the suid or
sgid bit set. I can't find any of those.
Patch for this second bug in a moment.
Eric
When the user namespace support was merged the need to prevent
ptracing an executable that is not readable was overlooked.
Correct this oversight by not letting exec succeed if during exec an
executable is not readable and the current user namespace capabilities
do not apply to the executable's file.
While it happens that distros install some files setuid and
non-readable I have not found any executable files just installed
non-readalbe. Executables that are setuid to a user not mapped in a
user namespace are worthless, so I don't expect this to introduce
any problems in practice.
There may be a way to allow this execution to happen by setting
mm->user_ns to a more privileged user namespace and watching out for
the possibility of using dynamic linkers or other shared libraries
that the kernel loads into the mm to bypass the read-only
restriction. But the analysis is more difficult and it would
require more code churn so I don't think the effort is worth it.
Cc: [email protected]
Reported-by: Jann Horn <[email protected]>
Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.")
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
Tossing this out for review in case I missed something silly but this
patch seems pretty trivial.
arch/x86/ia32/ia32_aout.c | 4 +++-
fs/binfmt_aout.c | 4 +++-
fs/binfmt_elf.c | 4 +++-
fs/binfmt_elf_fdpic.c | 4 +++-
fs/binfmt_flat.c | 4 +++-
fs/exec.c | 19 ++++++++++++++++---
include/linux/binfmts.h | 6 +++++-
7 files changed, 36 insertions(+), 9 deletions(-)
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index cb26f18d43af..7ad20dedd929 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -294,7 +294,9 @@ static int load_aout_binary(struct linux_binprm *bprm)
set_personality(PER_LINUX);
set_personality_ia32(false);
- setup_new_exec(bprm);
+ retval = setup_new_exec(bprm);
+ if (retval)
+ return retval;
regs->cs = __USER32_CS;
regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ae1b5404fced..b7b8aa03ccd0 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -242,7 +242,9 @@ static int load_aout_binary(struct linux_binprm * bprm)
#else
set_personality(PER_LINUX);
#endif
- setup_new_exec(bprm);
+ retval = setup_new_exec(bprm);
+ if (retval)
+ return retval;
current->mm->end_code = ex.a_text +
(current->mm->start_code = N_TXTADDR(ex));
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 2472af2798c7..423fece0b8c4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -852,7 +852,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
current->flags |= PF_RANDOMIZE;
- setup_new_exec(bprm);
+ retval = setup_new_exec(bprm);
+ if (retval)
+ goto out_free_dentry;
install_exec_creds(bprm);
/* Do this so that we can load the interpreter, if need be. We will
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 464a972e88c1..d3099caff96d 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -352,7 +352,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
- setup_new_exec(bprm);
+ retval = setup_new_exec(bprm);
+ if (retval)
+ goto error;
set_binfmt(&elf_fdpic_format);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 9b2917a30294..25ca68940ad4 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -524,7 +524,9 @@ static int load_flat_file(struct linux_binprm *bprm,
/* OK, This is the point of no return */
set_personality(PER_LINUX_32BIT);
- setup_new_exec(bprm);
+ ret = setup_new_exec(bprm);
+ if (ret)
+ goto err;
}
/*
diff --git a/fs/exec.c b/fs/exec.c
index 6fcfb3f7b137..f724ed94ba7a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1270,12 +1270,21 @@ EXPORT_SYMBOL(flush_old_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
{
- if (inode_permission(file_inode(file), MAY_READ) < 0)
+ struct inode *inode = file_inode(file);
+ if (inode_permission(inode, MAY_READ) < 0) {
+ struct user_namespace *user_ns = current->mm->user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+
+ /* May the user_ns root read the executable? */
+ if (!kuid_has_mapping(user_ns, inode->i_uid) ||
+ !kgid_has_mapping(user_ns, inode->i_gid)) {
+ bprm->interp_flags |= BINPRM_FLAGS_EXEC_INACCESSIBLE;
+ }
+ }
}
EXPORT_SYMBOL(would_dump);
-void setup_new_exec(struct linux_binprm * bprm)
+int setup_new_exec(struct linux_binprm * bprm)
{
arch_pick_mmap_layout(current->mm);
@@ -1296,12 +1305,15 @@ void setup_new_exec(struct linux_binprm * bprm)
*/
current->mm->task_size = TASK_SIZE;
+ would_dump(bprm, bprm->file);
+ if (bprm->interp_flags & BINPRM_FLAGS_EXEC_INACCESSIBLE)
+ return -EPERM;
+
/* install the new credentials */
if (!uid_eq(bprm->cred->uid, current_euid()) ||
!gid_eq(bprm->cred->gid, current_egid())) {
current->pdeath_signal = 0;
} else {
- would_dump(bprm, bprm->file);
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
set_dumpable(current->mm, suid_dumpable);
}
@@ -1311,6 +1323,7 @@ void setup_new_exec(struct linux_binprm * bprm)
current->self_exec_id++;
flush_signal_handlers(current, 0);
do_close_on_exec(current->files);
+ return 0;
}
EXPORT_SYMBOL(setup_new_exec);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 1303b570b18c..8e5fb9eca2ee 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -57,6 +57,10 @@ struct linux_binprm {
#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
+/* executable is inaccessible for performing exec */
+#define BINPRM_FLAGS_EXEC_INACCESSIBLE_BIT 3
+#define BINPRM_FLAGS_EXEC_INACCESSIBLE (1 << BINPRM_FLAGS_EXEC_INACCESSIBLE_BIT)
+
/* Function parameter for binfmt->coredump */
struct coredump_params {
const siginfo_t *siginfo;
@@ -100,7 +104,7 @@ extern int prepare_binprm(struct linux_binprm *);
extern int __must_check remove_arg_zero(struct linux_binprm *);
extern int search_binary_handler(struct linux_binprm *);
extern int flush_old_exec(struct linux_binprm * bprm);
-extern void setup_new_exec(struct linux_binprm * bprm);
+extern int setup_new_exec(struct linux_binprm * bprm);
extern void would_dump(struct linux_binprm *, struct file *);
extern int suid_dumpable;
--
2.8.3
On Wed, Oct 19, 2016 at 12:15 AM, Eric W. Biederman
<[email protected]> wrote:
>
> When the user namespace support was merged the need to prevent
> ptracing an executable that is not readable was overlooked.
>
> Correct this oversight by not letting exec succeed if during exec an
> executable is not readable and the current user namespace capabilities
> do not apply to the executable's file.
>
> While it happens that distros install some files setuid and
> non-readable I have not found any executable files just installed
> non-readalbe. Executables that are setuid to a user not mapped in a
> user namespace are worthless, so I don't expect this to introduce
> any problems in practice.
>
> There may be a way to allow this execution to happen by setting
> mm->user_ns to a more privileged user namespace and watching out for
> the possibility of using dynamic linkers or other shared libraries
> that the kernel loads into the mm to bypass the read-only
> restriction. But the analysis is more difficult and it would
> require more code churn so I don't think the effort is worth it.
>
> Cc: [email protected]
> Reported-by: Jann Horn <[email protected]>
> Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.")
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
>
> Tossing this out for review in case I missed something silly but this
> patch seems pretty trivial.
>
> arch/x86/ia32/ia32_aout.c | 4 +++-
> fs/binfmt_aout.c | 4 +++-
> fs/binfmt_elf.c | 4 +++-
> fs/binfmt_elf_fdpic.c | 4 +++-
> fs/binfmt_flat.c | 4 +++-
> fs/exec.c | 19 ++++++++++++++++---
> include/linux/binfmts.h | 6 +++++-
> 7 files changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
> index cb26f18d43af..7ad20dedd929 100644
> --- a/arch/x86/ia32/ia32_aout.c
> +++ b/arch/x86/ia32/ia32_aout.c
> @@ -294,7 +294,9 @@ static int load_aout_binary(struct linux_binprm *bprm)
> set_personality(PER_LINUX);
> set_personality_ia32(false);
>
> - setup_new_exec(bprm);
> + retval = setup_new_exec(bprm);
> + if (retval)
> + return retval;
>
> regs->cs = __USER32_CS;
> regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
> diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
> index ae1b5404fced..b7b8aa03ccd0 100644
> --- a/fs/binfmt_aout.c
> +++ b/fs/binfmt_aout.c
> @@ -242,7 +242,9 @@ static int load_aout_binary(struct linux_binprm * bprm)
> #else
> set_personality(PER_LINUX);
> #endif
> - setup_new_exec(bprm);
> + retval = setup_new_exec(bprm);
> + if (retval)
> + return retval;
>
> current->mm->end_code = ex.a_text +
> (current->mm->start_code = N_TXTADDR(ex));
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 2472af2798c7..423fece0b8c4 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -852,7 +852,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
> if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
> current->flags |= PF_RANDOMIZE;
>
> - setup_new_exec(bprm);
> + retval = setup_new_exec(bprm);
> + if (retval)
> + goto out_free_dentry;
> install_exec_creds(bprm);
>
> /* Do this so that we can load the interpreter, if need be. We will
> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
> index 464a972e88c1..d3099caff96d 100644
> --- a/fs/binfmt_elf_fdpic.c
> +++ b/fs/binfmt_elf_fdpic.c
> @@ -352,7 +352,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
> if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
> current->personality |= READ_IMPLIES_EXEC;
>
> - setup_new_exec(bprm);
> + retval = setup_new_exec(bprm);
> + if (retval)
> + goto error;
>
> set_binfmt(&elf_fdpic_format);
>
> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
> index 9b2917a30294..25ca68940ad4 100644
> --- a/fs/binfmt_flat.c
> +++ b/fs/binfmt_flat.c
> @@ -524,7 +524,9 @@ static int load_flat_file(struct linux_binprm *bprm,
>
> /* OK, This is the point of no return */
> set_personality(PER_LINUX_32BIT);
> - setup_new_exec(bprm);
> + ret = setup_new_exec(bprm);
> + if (ret)
> + goto err;
> }
>
> /*
> diff --git a/fs/exec.c b/fs/exec.c
> index 6fcfb3f7b137..f724ed94ba7a 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1270,12 +1270,21 @@ EXPORT_SYMBOL(flush_old_exec);
>
> void would_dump(struct linux_binprm *bprm, struct file *file)
> {
> - if (inode_permission(file_inode(file), MAY_READ) < 0)
> + struct inode *inode = file_inode(file);
> + if (inode_permission(inode, MAY_READ) < 0) {
> + struct user_namespace *user_ns = current->mm->user_ns;
> bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
> +
> + /* May the user_ns root read the executable? */
> + if (!kuid_has_mapping(user_ns, inode->i_uid) ||
> + !kgid_has_mapping(user_ns, inode->i_gid)) {
> + bprm->interp_flags |= BINPRM_FLAGS_EXEC_INACCESSIBLE;
> + }
This feels like it should belong inside
inode_permission(file_inode(file), MAY_EXEC)
which hopefully should be checked long before getting here??
> + }
> }
> EXPORT_SYMBOL(would_dump);
>
> -void setup_new_exec(struct linux_binprm * bprm)
> +int setup_new_exec(struct linux_binprm * bprm)
> {
> arch_pick_mmap_layout(current->mm);
>
> @@ -1296,12 +1305,15 @@ void setup_new_exec(struct linux_binprm * bprm)
> */
> current->mm->task_size = TASK_SIZE;
>
> + would_dump(bprm, bprm->file);
> + if (bprm->interp_flags & BINPRM_FLAGS_EXEC_INACCESSIBLE)
> + return -EPERM;
> +
> /* install the new credentials */
> if (!uid_eq(bprm->cred->uid, current_euid()) ||
> !gid_eq(bprm->cred->gid, current_egid())) {
> current->pdeath_signal = 0;
> } else {
> - would_dump(bprm, bprm->file);
> if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
> set_dumpable(current->mm, suid_dumpable);
> }
> @@ -1311,6 +1323,7 @@ void setup_new_exec(struct linux_binprm * bprm)
> current->self_exec_id++;
> flush_signal_handlers(current, 0);
> do_close_on_exec(current->files);
> + return 0;
> }
> EXPORT_SYMBOL(setup_new_exec);
>
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 1303b570b18c..8e5fb9eca2ee 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -57,6 +57,10 @@ struct linux_binprm {
> #define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
> #define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
>
> +/* executable is inaccessible for performing exec */
> +#define BINPRM_FLAGS_EXEC_INACCESSIBLE_BIT 3
> +#define BINPRM_FLAGS_EXEC_INACCESSIBLE (1 << BINPRM_FLAGS_EXEC_INACCESSIBLE_BIT)
> +
> /* Function parameter for binfmt->coredump */
> struct coredump_params {
> const siginfo_t *siginfo;
> @@ -100,7 +104,7 @@ extern int prepare_binprm(struct linux_binprm *);
> extern int __must_check remove_arg_zero(struct linux_binprm *);
> extern int search_binary_handler(struct linux_binprm *);
> extern int flush_old_exec(struct linux_binprm * bprm);
> -extern void setup_new_exec(struct linux_binprm * bprm);
> +extern int setup_new_exec(struct linux_binprm * bprm);
> extern void would_dump(struct linux_binprm *, struct file *);
>
> extern int suid_dumpable;
> --
> 2.8.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Amir Goldstein <[email protected]> writes:
>> diff --git a/fs/exec.c b/fs/exec.c
>> index 6fcfb3f7b137..f724ed94ba7a 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1270,12 +1270,21 @@ EXPORT_SYMBOL(flush_old_exec);
>>
>> void would_dump(struct linux_binprm *bprm, struct file *file)
>> {
>> - if (inode_permission(file_inode(file), MAY_READ) < 0)
>> + struct inode *inode = file_inode(file);
>> + if (inode_permission(inode, MAY_READ) < 0) {
>> + struct user_namespace *user_ns = current->mm->user_ns;
>> bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
>> +
>> + /* May the user_ns root read the executable? */
>> + if (!kuid_has_mapping(user_ns, inode->i_uid) ||
>> + !kgid_has_mapping(user_ns, inode->i_gid)) {
>> + bprm->interp_flags |= BINPRM_FLAGS_EXEC_INACCESSIBLE;
>> + }
>
> This feels like it should belong inside
> inode_permission(file_inode(file), MAY_EXEC)
> which hopefully should be checked long before getting here??
It is the active ingredient in capable_wrt_inode_uidgid and is indeed
inside of inode_permission.
What I am testing for here is if I have a process with a full
set of capabilities in current->mm->user_ns will the inode be readable.
I can see an argument for calling prepare_creds stuffing the new cred
full of capabilities. Calling override_cred. Calling inode_permission,
restoring the credentials. But it seems very much like overkill and
more error prone because of the more code involved.
So I have done the simple thing that doesn't hide what is really going on.
Eric
On Tue, Oct 18, 2016 at 2:15 PM, Eric W. Biederman
<[email protected]> wrote:
>
> When the user namespace support was merged the need to prevent
> ptracing an executable that is not readable was overlooked.
Before getting too excited about this fix, isn't there a much bigger
hole that's been there forever? Simply ptrace yourself, exec the
program, and then dump the program out. A program that really wants
to be unreadable should have a stub: the stub is setuid and readable,
but all the stub does is to exec the real program, and the real
program should have mode 0500 or similar.
ISTM the "right" check would be to enforce that the program's new
creds can read the program, but that will break backwards
compatibility.
--Andy
Andy Lutomirski <[email protected]> writes:
> On Tue, Oct 18, 2016 at 2:15 PM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> When the user namespace support was merged the need to prevent
>> ptracing an executable that is not readable was overlooked.
>
> Before getting too excited about this fix, isn't there a much bigger
> hole that's been there forever?
In this case it was a newish hole (2011) that the user namespace support
added that I am closing. I am not super excited but I figure it is
useful to make the kernel semantics at least as secure as they were
before.
> Simply ptrace yourself, exec the
> program, and then dump the program out. A program that really wants
> to be unreadable should have a stub: the stub is setuid and readable,
> but all the stub does is to exec the real program, and the real
> program should have mode 0500 or similar.
>
> ISTM the "right" check would be to enforce that the program's new
> creds can read the program, but that will break backwards
> compatibility.
Last I looked I had the impression that exec of a setuid program kills
the ptrace.
If we are talking about a exec of a simple unreadable executable (aka
something that sets undumpable but is not setuid or setgid). Then I
agree it should break the ptrace as well and since those programs are as
rare as hens teeth I don't see any problem with changing the ptrace behavior
in that case.
Eric
[email protected] (Eric W. Biederman) writes:
> Amir Goldstein <[email protected]> writes:
>
>>> diff --git a/fs/exec.c b/fs/exec.c
>>> index 6fcfb3f7b137..f724ed94ba7a 100644
>>> --- a/fs/exec.c
>>> +++ b/fs/exec.c
>>> @@ -1270,12 +1270,21 @@ EXPORT_SYMBOL(flush_old_exec);
>>>
>>> void would_dump(struct linux_binprm *bprm, struct file *file)
>>> {
>>> - if (inode_permission(file_inode(file), MAY_READ) < 0)
>>> + struct inode *inode = file_inode(file);
>>> + if (inode_permission(inode, MAY_READ) < 0) {
>>> + struct user_namespace *user_ns = current->mm->user_ns;
>>> bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
>>> +
>>> + /* May the user_ns root read the executable? */
>>> + if (!kuid_has_mapping(user_ns, inode->i_uid) ||
>>> + !kgid_has_mapping(user_ns, inode->i_gid)) {
>>> + bprm->interp_flags |= BINPRM_FLAGS_EXEC_INACCESSIBLE;
>>> + }
>>
>> This feels like it should belong inside
>> inode_permission(file_inode(file), MAY_EXEC)
>> which hopefully should be checked long before getting here??
>
> It is the active ingredient in capable_wrt_inode_uidgid and is indeed
> inside of inode_permission.
>
> What I am testing for here is if I have a process with a full
> set of capabilities in current->mm->user_ns will the inode be readable.
>
> I can see an argument for calling prepare_creds stuffing the new cred
> full of capabilities. Calling override_cred. Calling inode_permission,
> restoring the credentials. But it seems very much like overkill and
> more error prone because of the more code involved.
>
> So I have done the simple thing that doesn't hide what is really going on.
At the same time I can see the addition of a helper function
bool ns_inode(struct user_namespace *user_ns, struct inode *inode)
{
return kuid_has_mapping(user_ns, inode->i_uid) &&
kgid_has_mapping(user_ns, inode->i_gid);
}
That abstracts out the concept instead of open codes it.
Eric
On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
> Andy Lutomirski <[email protected]> writes:
> > Simply ptrace yourself, exec the
> > program, and then dump the program out. A program that really wants
> > to be unreadable should have a stub: the stub is setuid and readable,
> > but all the stub does is to exec the real program, and the real
> > program should have mode 0500 or similar.
> >
> > ISTM the "right" check would be to enforce that the program's new
> > creds can read the program, but that will break backwards
> > compatibility.
>
> Last I looked I had the impression that exec of a setuid program kills
> the ptrace.
>
> If we are talking about a exec of a simple unreadable executable (aka
> something that sets undumpable but is not setuid or setgid). Then I
> agree it should break the ptrace as well and since those programs are as
> rare as hens teeth I don't see any problem with changing the ptrace behavior
> in that case.
Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
and e.g. ptracers stay attached.
Same thing happens if the fs struct is shared with another process or if
NO_NEW_PRIVS is active.
(Actually, it's still a bit like normal setuid execution: IIRC AT_SECURE
stays active, and the resulting process still won't be dumpable, so it's
not possible for a *new* ptracer to attach afterwards. But this is just
from memory, I'm not entirely sure.)
On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <[email protected]> wrote:
> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
>> Andy Lutomirski <[email protected]> writes:
>> > Simply ptrace yourself, exec the
>> > program, and then dump the program out. A program that really wants
>> > to be unreadable should have a stub: the stub is setuid and readable,
>> > but all the stub does is to exec the real program, and the real
>> > program should have mode 0500 or similar.
>> >
>> > ISTM the "right" check would be to enforce that the program's new
>> > creds can read the program, but that will break backwards
>> > compatibility.
>>
>> Last I looked I had the impression that exec of a setuid program kills
>> the ptrace.
>>
>> If we are talking about a exec of a simple unreadable executable (aka
>> something that sets undumpable but is not setuid or setgid). Then I
>> agree it should break the ptrace as well and since those programs are as
>> rare as hens teeth I don't see any problem with changing the ptrace behavior
>> in that case.
>
> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
> and e.g. ptracers stay attached.
I think you're right. I ought to be completely sure because I rewrote
that code back in 2005 or so back when I thought kernel programming
was only for the cool kids. It was probably my first kernel patch
ever and it closed an awkward-to-exploit root hole. But it's been a
while. (Too bad my second (IIRC) kernel patch was more mundane and
fixed the mute button on "new" Lenovo X60-era laptops and spend
several years in limbo...)
--Andy
Andy Lutomirski <[email protected]> writes:
> On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <[email protected]> wrote:
>> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
>>> Andy Lutomirski <[email protected]> writes:
>>> > Simply ptrace yourself, exec the
>>> > program, and then dump the program out. A program that really wants
>>> > to be unreadable should have a stub: the stub is setuid and readable,
>>> > but all the stub does is to exec the real program, and the real
>>> > program should have mode 0500 or similar.
>>> >
>>> > ISTM the "right" check would be to enforce that the program's new
>>> > creds can read the program, but that will break backwards
>>> > compatibility.
>>>
>>> Last I looked I had the impression that exec of a setuid program kills
>>> the ptrace.
>>>
>>> If we are talking about a exec of a simple unreadable executable (aka
>>> something that sets undumpable but is not setuid or setgid). Then I
>>> agree it should break the ptrace as well and since those programs are as
>>> rare as hens teeth I don't see any problem with changing the ptrace behavior
>>> in that case.
>>
>> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
>> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
>> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
>> and e.g. ptracers stay attached.
>
> I think you're right. I ought to be completely sure because I rewrote
> that code back in 2005 or so back when I thought kernel programming
> was only for the cool kids. It was probably my first kernel patch
> ever and it closed an awkward-to-exploit root hole. But it's been a
> while. (Too bad my second (IIRC) kernel patch was more mundane and
> fixed the mute button on "new" Lenovo X60-era laptops and spend
> several years in limbo...)
Ah yes and this is only a problem if the ptracer does not have
CAP_SYS_PTRACE.
If the tracer does not have sufficient permissions any opinions on
failing the exec or kicking out the ptracer? I am leaning towards failing
the exec as it is more obvious if someone cares. Dropping the ptracer
could be a major mystery.
Eric
On Wed, Oct 19, 2016 at 10:55 AM, Eric W. Biederman
<[email protected]> wrote:
> Andy Lutomirski <[email protected]> writes:
>
>> On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <[email protected]> wrote:
>>> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
>>>> Andy Lutomirski <[email protected]> writes:
>>>> > Simply ptrace yourself, exec the
>>>> > program, and then dump the program out. A program that really wants
>>>> > to be unreadable should have a stub: the stub is setuid and readable,
>>>> > but all the stub does is to exec the real program, and the real
>>>> > program should have mode 0500 or similar.
>>>> >
>>>> > ISTM the "right" check would be to enforce that the program's new
>>>> > creds can read the program, but that will break backwards
>>>> > compatibility.
>>>>
>>>> Last I looked I had the impression that exec of a setuid program kills
>>>> the ptrace.
>>>>
>>>> If we are talking about a exec of a simple unreadable executable (aka
>>>> something that sets undumpable but is not setuid or setgid). Then I
>>>> agree it should break the ptrace as well and since those programs are as
>>>> rare as hens teeth I don't see any problem with changing the ptrace behavior
>>>> in that case.
>>>
>>> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
>>> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
>>> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
>>> and e.g. ptracers stay attached.
>>
>> I think you're right. I ought to be completely sure because I rewrote
>> that code back in 2005 or so back when I thought kernel programming
>> was only for the cool kids. It was probably my first kernel patch
>> ever and it closed an awkward-to-exploit root hole. But it's been a
>> while. (Too bad my second (IIRC) kernel patch was more mundane and
>> fixed the mute button on "new" Lenovo X60-era laptops and spend
>> several years in limbo...)
>
> Ah yes and this is only a problem if the ptracer does not have
> CAP_SYS_PTRACE.
>
> If the tracer does not have sufficient permissions any opinions on
> failing the exec or kicking out the ptracer? I am leaning towards failing
> the exec as it is more obvious if someone cares. Dropping the ptracer
> could be a major mystery.
I would suggest leaving it alone. Changing it could break enough
things that a sysctl would be needed, and I just don't see how this is
a significant issue, especially since it's been insecure forever.
Anyone who cares should do the stub executable trick:
/sbin/foo: 04755, literally just does execve("/sbin/foo-helper");
/sbin/foo-helper: 0500.
--Andy
Andy Lutomirski <[email protected]> writes:
> On Wed, Oct 19, 2016 at 10:55 AM, Eric W. Biederman
> <[email protected]> wrote:
>> Andy Lutomirski <[email protected]> writes:
>>
>>> On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <[email protected]> wrote:
>>>> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
>>>>> Andy Lutomirski <[email protected]> writes:
>>>>> > Simply ptrace yourself, exec the
>>>>> > program, and then dump the program out. A program that really wants
>>>>> > to be unreadable should have a stub: the stub is setuid and readable,
>>>>> > but all the stub does is to exec the real program, and the real
>>>>> > program should have mode 0500 or similar.
>>>>> >
>>>>> > ISTM the "right" check would be to enforce that the program's new
>>>>> > creds can read the program, but that will break backwards
>>>>> > compatibility.
>>>>>
>>>>> Last I looked I had the impression that exec of a setuid program kills
>>>>> the ptrace.
>>>>>
>>>>> If we are talking about a exec of a simple unreadable executable (aka
>>>>> something that sets undumpable but is not setuid or setgid). Then I
>>>>> agree it should break the ptrace as well and since those programs are as
>>>>> rare as hens teeth I don't see any problem with changing the ptrace behavior
>>>>> in that case.
>>>>
>>>> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
>>>> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
>>>> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
>>>> and e.g. ptracers stay attached.
>>>
>>> I think you're right. I ought to be completely sure because I rewrote
>>> that code back in 2005 or so back when I thought kernel programming
>>> was only for the cool kids. It was probably my first kernel patch
>>> ever and it closed an awkward-to-exploit root hole. But it's been a
>>> while. (Too bad my second (IIRC) kernel patch was more mundane and
>>> fixed the mute button on "new" Lenovo X60-era laptops and spend
>>> several years in limbo...)
>>
>> Ah yes and this is only a problem if the ptracer does not have
>> CAP_SYS_PTRACE.
>>
>> If the tracer does not have sufficient permissions any opinions on
>> failing the exec or kicking out the ptracer? I am leaning towards failing
>> the exec as it is more obvious if someone cares. Dropping the ptracer
>> could be a major mystery.
>
> I would suggest leaving it alone. Changing it could break enough
> things that a sysctl would be needed, and I just don't see how this is
> a significant issue, especially since it's been insecure forever.
> Anyone who cares should do the stub executable trick:
>
> /sbin/foo: 04755, literally just does execve("/sbin/foo-helper");
>
> /sbin/foo-helper: 0500.
I can't imagine what non-malware would depend on being able to
circumvent file permissions and ptrace a read-only executable. Is there
something you are thinking of?
I know I saw someone depending on read-only executables being read-only
earlier this week on the security list, and it could definitely act as
part of a counter measure to make binaries harder to exploit.
So given that people actually expect no-read permissions to be honored
on executables (with what seem valid and sensible use cases), that
I can't see any valid reason not to honor no-read permissions, that it
takes a really convoluted setup to bypass the current no-read
permissions, and that I can't believe anyone cares about the current
behavior of ptrace I think this is worth fixing.
Eric
On Oct 19, 2016 2:28 PM, "Eric W. Biederman" <[email protected]> wrote:
>
> Andy Lutomirski <[email protected]> writes:
>
> > On Wed, Oct 19, 2016 at 10:55 AM, Eric W. Biederman
> > <[email protected]> wrote:
> >> Andy Lutomirski <[email protected]> writes:
> >>
> >>> On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <[email protected]> wrote:
> >>>> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
> >>>>> Andy Lutomirski <[email protected]> writes:
> >>>>> > Simply ptrace yourself, exec the
> >>>>> > program, and then dump the program out. A program that really wants
> >>>>> > to be unreadable should have a stub: the stub is setuid and readable,
> >>>>> > but all the stub does is to exec the real program, and the real
> >>>>> > program should have mode 0500 or similar.
> >>>>> >
> >>>>> > ISTM the "right" check would be to enforce that the program's new
> >>>>> > creds can read the program, but that will break backwards
> >>>>> > compatibility.
> >>>>>
> >>>>> Last I looked I had the impression that exec of a setuid program kills
> >>>>> the ptrace.
> >>>>>
> >>>>> If we are talking about a exec of a simple unreadable executable (aka
> >>>>> something that sets undumpable but is not setuid or setgid). Then I
> >>>>> agree it should break the ptrace as well and since those programs are as
> >>>>> rare as hens teeth I don't see any problem with changing the ptrace behavior
> >>>>> in that case.
> >>>>
> >>>> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
> >>>> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
> >>>> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
> >>>> and e.g. ptracers stay attached.
> >>>
> >>> I think you're right. I ought to be completely sure because I rewrote
> >>> that code back in 2005 or so back when I thought kernel programming
> >>> was only for the cool kids. It was probably my first kernel patch
> >>> ever and it closed an awkward-to-exploit root hole. But it's been a
> >>> while. (Too bad my second (IIRC) kernel patch was more mundane and
> >>> fixed the mute button on "new" Lenovo X60-era laptops and spend
> >>> several years in limbo...)
> >>
> >> Ah yes and this is only a problem if the ptracer does not have
> >> CAP_SYS_PTRACE.
> >>
> >> If the tracer does not have sufficient permissions any opinions on
> >> failing the exec or kicking out the ptracer? I am leaning towards failing
> >> the exec as it is more obvious if someone cares. Dropping the ptracer
> >> could be a major mystery.
> >
> > I would suggest leaving it alone. Changing it could break enough
> > things that a sysctl would be needed, and I just don't see how this is
> > a significant issue, especially since it's been insecure forever.
> > Anyone who cares should do the stub executable trick:
> >
> > /sbin/foo: 04755, literally just does execve("/sbin/foo-helper");
> >
> > /sbin/foo-helper: 0500.
>
> I can't imagine what non-malware would depend on being able to
> circumvent file permissions and ptrace a read-only executable. Is there
> something you are thinking of?
$ strace sudo foobar
or
$ strace auditctl
I find the current behavior somewhat odd, but I've taken advantage of
it on a semi-regular basis.
That being said, the "May the user_ns root read the executable?" test
in your patch is not strictly correct. Do we keep a struct cred
around for the ns root?
With everyone heading to Kernel Summit and Plumbers I put this set of
patches down temporarily. Now is the time to take it back up and to
make certain I am not missing something stupid in this set of patches.
There are other issues in this area as well, but these are the pieces
that I can see clearly, and have tested fixes for.
Andy as to your criticism about using strace sudo I can't possibly see
how that is effective or useful. Under strace sudo won't run as root
today, and will immediately exit because it is not root. Furthermore
the only place I can find non-readable executables is people hardening
suid root executables so they are more difficult to trace. So I
definitely think we should honor the unix permissions and people's
expressed wishes.
Eric W. Biederman (3):
ptrace: Capture the ptracer's creds not PT_PTRACE_CAP
exec: Don't allow ptracing an exec of an unreadable file
exec: Ensure mm->user_ns contains the execed files
fs/exec.c | 26 +++++++++++++++++++++++---
include/linux/capability.h | 2 ++
include/linux/ptrace.h | 1 -
include/linux/sched.h | 1 +
kernel/capability.c | 36 ++++++++++++++++++++++++++++++++++--
kernel/ptrace.c | 12 +++++++-----
6 files changed, 67 insertions(+), 11 deletions(-)
When the flag PT_PTRACE_CAP was added the PTRACE_TRACEME path was
overlooked. This can result in incorrect behavior when an application
like strace traces an exec of a setuid executable.
Further PT_PTRACE_CAP does not have enough information for making good
security decisions as it does not report which user namespace the
capability is in. This has already allowed one mistake through
insufficient granulariy.
I found this issue when I was testing another corner case of exec and
discovered that I could not get strace to set PT_PTRACE_CAP even when
running strace as root with a full set of caps.
This change fixes the above issue with strace allowing stracing as
root a setuid executable without disabling setuid. More fundamentaly
this change allows what is allowable at all times, by using the correct
information in it's decision.
Cc: [email protected]
Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12")
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/exec.c | 2 +-
include/linux/capability.h | 1 +
include/linux/ptrace.h | 1 -
include/linux/sched.h | 1 +
kernel/capability.c | 20 ++++++++++++++++++++
kernel/ptrace.c | 12 +++++++-----
6 files changed, 30 insertions(+), 7 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 6fcfb3f7b137..fdec760bfac3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1401,7 +1401,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
unsigned n_fs;
if (p->ptrace) {
- if (p->ptrace & PT_PTRACE_CAP)
+ if (ptracer_capable(p, current_user_ns()))
bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
else
bprm->unsafe |= LSM_UNSAFE_PTRACE;
diff --git a/include/linux/capability.h b/include/linux/capability.h
index dbc21c719ce6..d6088e2a7668 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -242,6 +242,7 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
#endif /* CONFIG_MULTIUSER */
extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
+extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
/* audit system wants to get cap info from files as well */
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 504c98a278d4..e13bfdf7f314 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -19,7 +19,6 @@
#define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */
#define PT_PTRACED 0x00000001
#define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */
-#define PT_PTRACE_CAP 0x00000004 /* ptracer can follow suid-exec */
#define PT_OPT_FLAG_SHIFT 3
/* PT_TRACE_* event enable flags */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b0ec92..8fe58255d219 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1656,6 +1656,7 @@ struct task_struct {
struct list_head cpu_timers[3];
/* process credentials */
+ const struct cred __rcu *ptracer_cred; /* Tracer's dredentials at attach */
const struct cred __rcu *real_cred; /* objective and real subjective task
* credentials (COW) */
const struct cred __rcu *cred; /* effective (overridable) subjective task
diff --git a/kernel/capability.c b/kernel/capability.c
index 00411c82dac5..dfa0e4528b0b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -473,3 +473,23 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
kgid_has_mapping(ns, inode->i_gid);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
+
+/**
+ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
+ * @tsk: The task that may be ptraced
+ * @ns: The user namespace to search for CAP_SYS_PTRACE in
+ *
+ * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
+ * in the specified user namespace.
+ */
+bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
+{
+ int ret = 0; /* An absent tracer adds no restrictions */
+ const struct cred *cred;
+ rcu_read_lock();
+ cred = rcu_dereference(tsk->ptracer_cred);
+ if (cred)
+ ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+ rcu_read_unlock();
+ return (ret == 0);
+}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 44a25a1e6e83..982505497680 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -39,6 +39,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
BUG_ON(!list_empty(&child->ptrace_entry));
list_add(&child->ptrace_entry, &new_parent->ptraced);
child->parent = new_parent;
+ rcu_read_lock();
+ child->ptracer_cred = get_cred(__task_cred(new_parent));
+ rcu_read_unlock();
}
/**
@@ -71,12 +74,16 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
*/
void __ptrace_unlink(struct task_struct *child)
{
+ const struct cred *old_cred;
BUG_ON(!child->ptrace);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
+ old_cred = child->ptracer_cred;
+ child->ptracer_cred = NULL;
+ put_cred(old_cred);
spin_lock(&child->sighand->siglock);
child->ptrace = 0;
@@ -326,11 +333,6 @@ static int ptrace_attach(struct task_struct *task, long request,
task_lock(task);
retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
- if (!retval) {
- struct mm_struct *mm = task->mm;
- if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE))
- flags |= PT_PTRACE_CAP;
- }
task_unlock(task);
if (retval)
goto unlock_creds;
--
2.10.1
It is the reasonable expectation that if an executable file is not
readable there will be no way for a user without special privileges to
read the file. This is enforced in ptrace_attach but if we are
already attached there is no enforcement if a readonly executable
is exec'd.
Therefore do the simple thing and if there is a non-dumpable
executable that we are tracing without privilege fail to exec it.
Fixes: v1.0
Cc: [email protected]
Reported-by: Andy Lutomirski <[email protected]>
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/exec.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/exec.c b/fs/exec.c
index fdec760bfac3..de107f74e055 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1230,6 +1230,11 @@ int flush_old_exec(struct linux_binprm * bprm)
{
int retval;
+ /* Fail if the tracer can't read the executable */
+ if ((bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) &&
+ !ptracer_capable(current, bprm->mm->user_ns))
+ return -EPERM;
+
/*
* Make sure we have a private signal table and that
* we are unassociated from the previous thread group.
@@ -1301,7 +1306,6 @@ void setup_new_exec(struct linux_binprm * bprm)
!gid_eq(bprm->cred->gid, current_egid())) {
current->pdeath_signal = 0;
} else {
- would_dump(bprm, bprm->file);
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
set_dumpable(current->mm, suid_dumpable);
}
@@ -1736,6 +1740,8 @@ static int do_execveat_common(int fd, struct filename *filename,
if (retval < 0)
goto out;
+ would_dump(bprm, bprm->file);
+
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
--
2.10.1
When the user namespace support was merged the need to prevent
ptrace from revealing the contents of an unreadable executable
was overlooked.
Correct this oversight by ensuring that the executed file
or files are in mm->user_ns, by adjusting mm->user_ns.
Use the new function privileged_wrt_inode_uidgid to see if
the executable is a member of the user namespace, and as such
if having CAP_SYS_PTRACE in the user namespace should allow
tracing the executable. If not update mm->user_ns to
the parent user namespace until an appropriate parent is found.
Cc: [email protected]
Reported-by: Jann Horn <[email protected]>
Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.")
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/exec.c | 16 +++++++++++++++-
include/linux/capability.h | 1 +
kernel/capability.c | 16 ++++++++++++++--
3 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index de107f74e055..4ce5d68d6f5b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1275,8 +1275,22 @@ EXPORT_SYMBOL(flush_old_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
{
- if (inode_permission(file_inode(file), MAY_READ) < 0)
+ struct inode *inode = file_inode(file);
+ if (inode_permission(inode, MAY_READ) < 0) {
+ struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+
+ /* Ensure mm->user_ns contains the executable */
+ user_ns = old = bprm->mm->user_ns;
+ while ((user_ns != &init_user_ns) &&
+ !privileged_wrt_inode_uidgid(user_ns, inode))
+ user_ns = user_ns->parent;
+
+ if (old != user_ns) {
+ bprm->mm->user_ns = get_user_ns(user_ns);
+ put_user_ns(old);
+ }
+ }
}
EXPORT_SYMBOL(would_dump);
diff --git a/include/linux/capability.h b/include/linux/capability.h
index d6088e2a7668..6ffb67e10c06 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -240,6 +240,7 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
return true;
}
#endif /* CONFIG_MULTIUSER */
+extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
diff --git a/kernel/capability.c b/kernel/capability.c
index dfa0e4528b0b..4984e1f552eb 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -457,6 +457,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
EXPORT_SYMBOL(file_ns_capable);
/**
+ * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
+ * @ns: The user namespace in question
+ * @inode: The inode in question
+ *
+ * Return true if the inode uid and gid are within the namespace.
+ */
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+{
+ return kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
+}
+
+/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
@@ -469,8 +482,7 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
- kgid_has_mapping(ns, inode->i_gid);
+ return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
--
2.10.1
On Thu, Nov 17, 2016 at 11:08:22AM -0600, Eric W. Biederman wrote:
>
> It is the reasonable expectation that if an executable file is not
> readable there will be no way for a user without special privileges to
> read the file. This is enforced in ptrace_attach but if we are
> already attached there is no enforcement if a readonly executable
> is exec'd.
I'm really scared by this Eric. At least you want to make it a hardening
option that can be disabled at run time, otherwise it can easily break a
lot of userspace :
admin@aloha:~$ ll /bin/bash /bin/coreutils /bin/ls /usr/bin/telnet
-r-xr-x--x 1 root adm 549272 Oct 28 16:25 /bin/bash
-rwx--x--x 1 root adm 765624 Oct 28 16:27 /bin/coreutils
lrwxrwxrwx 1 root root 9 Oct 28 16:27 /bin/ls -> coreutils
-r-xr-x--x 1 root adm 70344 Oct 28 16:34 /usr/bin/telnet
And I've not invented it, I've being taught to do this more than 20
years ago and been doing this since on any slightly hardened server
just because in pratice it's efficient at stopping quite a bunch of
rootkits which require to copy and modify your executables. Sure
they could get the contents using ptrace, but using cp is much more
common than ptrace in scripts and that works. This has prooven quite
efficient in field at stopping some rootkits several times over the
last two decades and I know I'm not the only one to do it. In fact
I *never* install an executable with read permissions for users if
there's no need for random users to copy it. Does it mean that
nobody should be able to see why their favorite utility doesn't
work anymore ? Not in my opinion, at least not by default.
So here I fear that we'll break strace at many places where strace
precisely matters to debug things.
However I'd love to have this feature controlled by a sysctl (to
enforce it by default where possible).
Thanks,
Willy
On Thu, Nov 17, 2016 at 12:47 PM, Willy Tarreau <[email protected]> wrote:
> On Thu, Nov 17, 2016 at 11:08:22AM -0600, Eric W. Biederman wrote:
>>
>> It is the reasonable expectation that if an executable file is not
>> readable there will be no way for a user without special privileges to
>> read the file. This is enforced in ptrace_attach but if we are
>> already attached there is no enforcement if a readonly executable
>> is exec'd.
>
> I'm really scared by this Eric. At least you want to make it a hardening
> option that can be disabled at run time, otherwise it can easily break a
> lot of userspace :
>
> admin@aloha:~$ ll /bin/bash /bin/coreutils /bin/ls /usr/bin/telnet
> -r-xr-x--x 1 root adm 549272 Oct 28 16:25 /bin/bash
> -rwx--x--x 1 root adm 765624 Oct 28 16:27 /bin/coreutils
> lrwxrwxrwx 1 root root 9 Oct 28 16:27 /bin/ls -> coreutils
> -r-xr-x--x 1 root adm 70344 Oct 28 16:34 /usr/bin/telnet
>
> And I've not invented it, I've being taught to do this more than 20
> years ago and been doing this since on any slightly hardened server
> just because in pratice it's efficient at stopping quite a bunch of
> rootkits which require to copy and modify your executables. Sure
> they could get the contents using ptrace, but using cp is much more
> common than ptrace in scripts and that works. This has prooven quite
> efficient in field at stopping some rootkits several times over the
> last two decades and I know I'm not the only one to do it. In fact
> I *never* install an executable with read permissions for users if
> there's no need for random users to copy it. Does it mean that
> nobody should be able to see why their favorite utility doesn't
> work anymore ? Not in my opinion, at least not by default.
>
> So here I fear that we'll break strace at many places where strace
> precisely matters to debug things.
>
> However I'd love to have this feature controlled by a sysctl (to
> enforce it by default where possible).
I'm not opposed to a sysctl for this. Regardless, I think we need to
embrace this idea now, though, since we'll soon end up with
architectures that enforce executable-only memory, in which case
ptrace will again fail. Almost better to get started here and then not
have more surprises later.
-Kees
--
Kees Cook
Nexus Security
On Thu, Nov 17, 2016 at 01:07:33PM -0800, Kees Cook wrote:
> I'm not opposed to a sysctl for this. Regardless, I think we need to
> embrace this idea now, though, since we'll soon end up with
> architectures that enforce executable-only memory, in which case
> ptrace will again fail. Almost better to get started here and then not
> have more surprises later.
Also that makes me realize that by far the largest use case of ptrace
is strace and that strace needs very little capabilities. I guess that
most users would be fine with having only pointers and not contents
for addresses or read/write of data, as they have on some other OSes,
when the process is not readable. But in my opinion when a process
is executable we should be able to trace its execution (even without
memory read access).
Willy
Willy Tarreau <[email protected]> writes:
> On Thu, Nov 17, 2016 at 01:07:33PM -0800, Kees Cook wrote:
>> I'm not opposed to a sysctl for this. Regardless, I think we need to
>> embrace this idea now, though, since we'll soon end up with
>> architectures that enforce executable-only memory, in which case
>> ptrace will again fail. Almost better to get started here and then not
>> have more surprises later.
>
> Also that makes me realize that by far the largest use case of ptrace
> is strace and that strace needs very little capabilities. I guess that
> most users would be fine with having only pointers and not contents
> for addresses or read/write of data, as they have on some other OSes,
> when the process is not readable. But in my opinion when a process
> is executable we should be able to trace its execution (even without
> memory read access).
Given all of this I will respin this series with a replacement patch
that adds a permission check ion the path where ptrace calls
access_process_vm.
I avoided it because the patch is a bit larger and with full ptrace control
is much better at leaking information. Even if you can't read the
data. But ptrace works even if it won't give you the memory based
arguments to system calls anymore.
Eric
It is the reasonable expectation that if an executable file is not
readable there will be no way for a user without special privileges to
read the file. This is enforced in ptrace_attach but if ptrace
is already attached before exec there is no enforcement for read-only
executables.
As the only way to read such an mm is through access_process_vm
spin a variant called ptrace_access_vm that will fail if the
target process is not being ptraced by the current process, or
the current process did not have sufficient privileges when ptracing
began to read the target processes mm.
In the ptrace implementations replace access_process_vm by
ptrace_access_vm. There remain several ptrace sites that still use
access_process_vm as they are reading the target executables
instructions (for kernel consumption) or register stacks. As such it
does not appear necessary to add a permission check to those calls.
This bug has always existed in Linux.
Fixes: v1.0
Cc: [email protected]
Reported-by: Andy Lutomirski <[email protected]>
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
arch/alpha/kernel/ptrace.c | 2 +-
arch/blackfin/kernel/ptrace.c | 4 ++--
arch/cris/arch-v32/kernel/ptrace.c | 2 +-
arch/ia64/kernel/ptrace.c | 2 +-
arch/mips/kernel/ptrace32.c | 4 ++--
arch/powerpc/kernel/ptrace32.c | 4 ++--
include/linux/mm.h | 2 ++
include/linux/ptrace.h | 3 +++
kernel/ptrace.c | 41 ++++++++++++++++++++++++++++++++------
mm/memory.c | 2 +-
mm/nommu.c | 2 +-
11 files changed, 51 insertions(+), 17 deletions(-)
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index d9ee81769899..619d8b4bc890 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -281,7 +281,7 @@ long arch_ptrace(struct task_struct *child, long request,
/* When I and D space are separate, these will need to be fixed. */
case PTRACE_PEEKTEXT: /* read word at location addr. */
case PTRACE_PEEKDATA:
- copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+ copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp), 0);
ret = -EIO;
if (copied != sizeof(tmp))
break;
diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index 8b8fe671b1a6..7d8ece6a93fb 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -270,7 +270,7 @@ long arch_ptrace(struct task_struct *child, long request,
switch (bfin_mem_access_type(addr, to_copy)) {
case BFIN_MEM_ACCESS_CORE:
case BFIN_MEM_ACCESS_CORE_ONLY:
- copied = access_process_vm(child, addr, &tmp,
+ copied = ptrace_access_vm(child, addr, &tmp,
to_copy, 0);
if (copied)
break;
@@ -323,7 +323,7 @@ long arch_ptrace(struct task_struct *child, long request,
switch (bfin_mem_access_type(addr, to_copy)) {
case BFIN_MEM_ACCESS_CORE:
case BFIN_MEM_ACCESS_CORE_ONLY:
- copied = access_process_vm(child, addr, &data,
+ copied = ptrace_access_vm(child, addr, &data,
to_copy, 1);
break;
case BFIN_MEM_ACCESS_DMA:
diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c
index f085229cf870..04251c6cb5f9 100644
--- a/arch/cris/arch-v32/kernel/ptrace.c
+++ b/arch/cris/arch-v32/kernel/ptrace.c
@@ -147,7 +147,7 @@ long arch_ptrace(struct task_struct *child, long request,
/* The trampoline page is globally mapped, no page table to traverse.*/
tmp = *(unsigned long*)addr;
} else {
- copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+ copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp), 0);
if (copied != sizeof(tmp))
break;
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 6f54d511cc50..4c46672f3ac1 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1156,7 +1156,7 @@ arch_ptrace (struct task_struct *child, long request,
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
/* read word at location addr */
- if (access_process_vm(child, addr, &data, sizeof(data), 0)
+ if (ptrace_access_vm(child, addr, &data, sizeof(data), 0)
!= sizeof(data))
return -EIO;
/* ensure return value is not mistaken for error code */
diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c
index 283b5a1967d1..114b577c5a51 100644
--- a/arch/mips/kernel/ptrace32.c
+++ b/arch/mips/kernel/ptrace32.c
@@ -69,7 +69,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0)
break;
- copied = access_process_vm(child, (u64)addrOthers, &tmp,
+ copied = ptrace_access_vm(child, (u64)addrOthers, &tmp,
sizeof(tmp), 0);
if (copied != sizeof(tmp))
break;
@@ -178,7 +178,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0)
break;
ret = 0;
- if (access_process_vm(child, (u64)addrOthers, &data,
+ if (ptrace_access_vm(child, (u64)addrOthers, &data,
sizeof(data), 1) == sizeof(data))
break;
ret = -EIO;
diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c
index f52b7db327c8..2e4f01dc9d64 100644
--- a/arch/powerpc/kernel/ptrace32.c
+++ b/arch/powerpc/kernel/ptrace32.c
@@ -73,7 +73,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
break;
- copied = access_process_vm(child, (u64)addrOthers, &tmp,
+ copied = ptrace_access_vm(child, (u64)addrOthers, &tmp,
sizeof(tmp), 0);
if (copied != sizeof(tmp))
break;
@@ -178,7 +178,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
break;
ret = 0;
- if (access_process_vm(child, (u64)addrOthers, &tmp,
+ if (ptrace_access_vm(child, (u64)addrOthers, &tmp,
sizeof(tmp), 1) == sizeof(tmp))
break;
ret = -EIO;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e9caec6a51e9..f49727403cce 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1269,6 +1269,8 @@ static inline int fixup_user_fault(struct task_struct *tsk,
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write);
+extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, int write);
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index e13bfdf7f314..7ef2f2b0a02e 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -8,6 +8,9 @@
#include <linux/pid_namespace.h> /* For task_active_pid_ns. */
#include <uapi/linux/ptrace.h>
+extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, int write);
+
/*
* Ptrace flags
*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 982505497680..20288a3b3796 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -27,6 +27,35 @@
#include <linux/cn_proc.h>
#include <linux/compat.h>
+/*
+ * Access another process' address space via ptrace.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ if (!tsk->ptrace ||
+ (current != tsk->parent) ||
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptracer_capable(tsk, mm->user_ns))) {
+ mmput(mm);
+ return 0;
+ }
+
+ ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
+ mmput(mm);
+
+ return ret;
+}
+
/*
* ptrace a task: make the debugger its new parent and
@@ -535,7 +564,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
- retval = access_process_vm(tsk, src, buf, this_len, 0);
+ retval = ptrace_access_vm(tsk, src, buf, this_len, 0);
if (!retval) {
if (copied)
break;
@@ -562,7 +591,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
- retval = access_process_vm(tsk, dst, buf, this_len, 1);
+ retval = ptrace_access_vm(tsk, dst, buf, this_len, 1);
if (!retval) {
if (copied)
break;
@@ -1125,7 +1154,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
unsigned long tmp;
int copied;
- copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+ copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), 0);
if (copied != sizeof(tmp))
return -EIO;
return put_user(tmp, (unsigned long __user *)data);
@@ -1136,7 +1165,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
{
int copied;
- copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+ copied = ptrace_access_vm(tsk, addr, &data, sizeof(data), 1);
return (copied == sizeof(data)) ? 0 : -EIO;
}
@@ -1153,7 +1182,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
switch (request) {
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
- ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+ ret = ptrace_access_vm(child, addr, &word, sizeof(word), 0);
if (ret != sizeof(word))
ret = -EIO;
else
@@ -1162,7 +1191,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
case PTRACE_POKETEXT:
case PTRACE_POKEDATA:
- ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+ ret = ptrace_access_vm(child, addr, &data, sizeof(data), 1);
ret = (ret != sizeof(data) ? -EIO : 0);
break;
diff --git a/mm/memory.c b/mm/memory.c
index fc1987dfd8cc..87bed1520690 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3868,7 +3868,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
* Access another process' address space as given in mm. If non-NULL, use the
* given task for page fault accounting.
*/
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
struct vm_area_struct *vma;
diff --git a/mm/nommu.c b/mm/nommu.c
index 95daf81a4855..281d5adda9ef 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1816,7 +1816,7 @@ void filemap_map_pages(struct fault_env *fe,
}
EXPORT_SYMBOL(filemap_map_pages);
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
struct vm_area_struct *vma;
--
2.10.1
On Thu, Nov 17, 2016 at 9:05 AM, Eric W. Biederman
<[email protected]> wrote:
>
> When the flag PT_PTRACE_CAP was added the PTRACE_TRACEME path was
> overlooked. This can result in incorrect behavior when an application
> like strace traces an exec of a setuid executable.
>
> Further PT_PTRACE_CAP does not have enough information for making good
> security decisions as it does not report which user namespace the
> capability is in. This has already allowed one mistake through
> insufficient granulariy.
>
> I found this issue when I was testing another corner case of exec and
> discovered that I could not get strace to set PT_PTRACE_CAP even when
> running strace as root with a full set of caps.
>
> This change fixes the above issue with strace allowing stracing as
> root a setuid executable without disabling setuid. More fundamentaly
> this change allows what is allowable at all times, by using the correct
> information in it's decision.
>
> Cc: [email protected]
> Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12")
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
> [...]
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 348f51b0ec92..8fe58255d219 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1656,6 +1656,7 @@ struct task_struct {
> struct list_head cpu_timers[3];
>
> /* process credentials */
> + const struct cred __rcu *ptracer_cred; /* Tracer's dredentials at attach */
Typo: credentials.
-Kees
--
Kees Cook
Nexus Security
On Thu, Nov 17, 2016 at 2:50 PM, Eric W. Biederman
<[email protected]> wrote:
>
> It is the reasonable expectation that if an executable file is not
> readable there will be no way for a user without special privileges to
> read the file. This is enforced in ptrace_attach but if ptrace
> is already attached before exec there is no enforcement for read-only
> executables.
Given the corner cases being fixed here, it might make sense to add
some simple tests to tools/testing/sefltests/ptrace/ to validate these
changes and avoid future regressions.
Regardless, it'll be nice to have this fixed. :)
-Kees
--
Kees Cook
Nexus Security
On Thu, Nov 17, 2016 at 9:05 AM, Eric W. Biederman
<[email protected]> wrote:
>
> When the flag PT_PTRACE_CAP was added the PTRACE_TRACEME path was
> overlooked. This can result in incorrect behavior when an application
> like strace traces an exec of a setuid executable.
>
> Further PT_PTRACE_CAP does not have enough information for making good
> security decisions as it does not report which user namespace the
> capability is in. This has already allowed one mistake through
> insufficient granulariy.
>
> I found this issue when I was testing another corner case of exec and
> discovered that I could not get strace to set PT_PTRACE_CAP even when
> running strace as root with a full set of caps.
>
> This change fixes the above issue with strace allowing stracing as
> root a setuid executable without disabling setuid. More fundamentaly
> this change allows what is allowable at all times, by using the correct
> information in it's decision.
>
> Cc: [email protected]
> Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12")
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
> fs/exec.c | 2 +-
> include/linux/capability.h | 1 +
> include/linux/ptrace.h | 1 -
> include/linux/sched.h | 1 +
> kernel/capability.c | 20 ++++++++++++++++++++
> kernel/ptrace.c | 12 +++++++-----
> 6 files changed, 30 insertions(+), 7 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 6fcfb3f7b137..fdec760bfac3 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1401,7 +1401,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
> unsigned n_fs;
>
> if (p->ptrace) {
> - if (p->ptrace & PT_PTRACE_CAP)
> + if (ptracer_capable(p, current_user_ns()))
IIRC PT_PTRACE_CAP was added to prevent TOCTOU races. What prevents
that type of race now? For that matter, what guarantees that we've
already switched to new creds here and will continue to do so in the
future?
--Andy
On Thu, Nov 17, 2016 at 1:07 PM, Kees Cook <[email protected]> wrote:
> On Thu, Nov 17, 2016 at 12:47 PM, Willy Tarreau <[email protected]> wrote:
>> On Thu, Nov 17, 2016 at 11:08:22AM -0600, Eric W. Biederman wrote:
>>>
>>> It is the reasonable expectation that if an executable file is not
>>> readable there will be no way for a user without special privileges to
>>> read the file. This is enforced in ptrace_attach but if we are
>>> already attached there is no enforcement if a readonly executable
>>> is exec'd.
>>
>> I'm really scared by this Eric. At least you want to make it a hardening
>> option that can be disabled at run time, otherwise it can easily break a
>> lot of userspace :
>>
>> admin@aloha:~$ ll /bin/bash /bin/coreutils /bin/ls /usr/bin/telnet
>> -r-xr-x--x 1 root adm 549272 Oct 28 16:25 /bin/bash
>> -rwx--x--x 1 root adm 765624 Oct 28 16:27 /bin/coreutils
>> lrwxrwxrwx 1 root root 9 Oct 28 16:27 /bin/ls -> coreutils
>> -r-xr-x--x 1 root adm 70344 Oct 28 16:34 /usr/bin/telnet
>>
>> And I've not invented it, I've being taught to do this more than 20
>> years ago and been doing this since on any slightly hardened server
>> just because in pratice it's efficient at stopping quite a bunch of
>> rootkits which require to copy and modify your executables. Sure
>> they could get the contents using ptrace, but using cp is much more
>> common than ptrace in scripts and that works. This has prooven quite
>> efficient in field at stopping some rootkits several times over the
>> last two decades and I know I'm not the only one to do it. In fact
>> I *never* install an executable with read permissions for users if
>> there's no need for random users to copy it. Does it mean that
>> nobody should be able to see why their favorite utility doesn't
>> work anymore ? Not in my opinion, at least not by default.
>>
>> So here I fear that we'll break strace at many places where strace
>> precisely matters to debug things.
>>
>> However I'd love to have this feature controlled by a sysctl (to
>> enforce it by default where possible).
>
> I'm not opposed to a sysctl for this. Regardless, I think we need to
> embrace this idea now, though, since we'll soon end up with
> architectures that enforce executable-only memory, in which case
> ptrace will again fail. Almost better to get started here and then not
> have more surprises later.
That won't be a problem because exec-only memory is going to need to
allow ptrace to read it anyway.
--Andy
On Thu, Nov 17, 2016 at 9:08 AM, Eric W. Biederman
<[email protected]> wrote:
>
> It is the reasonable expectation that if an executable file is not
> readable there will be no way for a user without special privileges to
> read the file. This is enforced in ptrace_attach but if we are
> already attached there is no enforcement if a readonly executable
> is exec'd.
>
> Therefore do the simple thing and if there is a non-dumpable
> executable that we are tracing without privilege fail to exec it.
>
> Fixes: v1.0
> Cc: [email protected]
> Reported-by: Andy Lutomirski <[email protected]>
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
> fs/exec.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index fdec760bfac3..de107f74e055 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1230,6 +1230,11 @@ int flush_old_exec(struct linux_binprm * bprm)
> {
> int retval;
>
> + /* Fail if the tracer can't read the executable */
> + if ((bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) &&
> + !ptracer_capable(current, bprm->mm->user_ns))
> + return -EPERM;
> +
At the very least, I think that BINPRM_FLAGS_ENFORCE_NONDUMP needs to
check capable_wrt_inode_uidgid too. Otherwise we risk breaking:
$ gcc whatever.c
$ chmod 400 a.out
$ strace a.out
Andy Lutomirski <[email protected]> writes:
> On Thu, Nov 17, 2016 at 9:05 AM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> When the flag PT_PTRACE_CAP was added the PTRACE_TRACEME path was
>> overlooked. This can result in incorrect behavior when an application
>> like strace traces an exec of a setuid executable.
>>
>> Further PT_PTRACE_CAP does not have enough information for making good
>> security decisions as it does not report which user namespace the
>> capability is in. This has already allowed one mistake through
>> insufficient granulariy.
>>
>> I found this issue when I was testing another corner case of exec and
>> discovered that I could not get strace to set PT_PTRACE_CAP even when
>> running strace as root with a full set of caps.
>>
>> This change fixes the above issue with strace allowing stracing as
>> root a setuid executable without disabling setuid. More fundamentaly
>> this change allows what is allowable at all times, by using the correct
>> information in it's decision.
>>
>> Cc: [email protected]
>> Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12")
>> Signed-off-by: "Eric W. Biederman" <[email protected]>
>> ---
>> fs/exec.c | 2 +-
>> include/linux/capability.h | 1 +
>> include/linux/ptrace.h | 1 -
>> include/linux/sched.h | 1 +
>> kernel/capability.c | 20 ++++++++++++++++++++
>> kernel/ptrace.c | 12 +++++++-----
>> 6 files changed, 30 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/exec.c b/fs/exec.c
>> index 6fcfb3f7b137..fdec760bfac3 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1401,7 +1401,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
>> unsigned n_fs;
>>
>> if (p->ptrace) {
>> - if (p->ptrace & PT_PTRACE_CAP)
>> + if (ptracer_capable(p, current_user_ns()))
>
> IIRC PT_PTRACE_CAP was added to prevent TOCTOU races. What prevents
> that type of race now? For that matter, what guarantees that we've
> already switched to new creds here and will continue to do so in the
> future?
Because instead of capturing a single bit we now capture tracers
entire credentials in tsk->ptracer_cred. As such tsk->ptracer_cred
never changes except when ptracing begins or ends, and we remain
safe for TOCTOU races.
We do hold cred_guard_mutex here so that guarantees we get a new
ptracer. So the worst that can happen here is our tracer detaches
and ptracer_capable will uncondintionally return true.
Eric
Andy Lutomirski <[email protected]> writes:
> On Thu, Nov 17, 2016 at 9:08 AM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> It is the reasonable expectation that if an executable file is not
>> readable there will be no way for a user without special privileges to
>> read the file. This is enforced in ptrace_attach but if we are
>> already attached there is no enforcement if a readonly executable
>> is exec'd.
>>
>> Therefore do the simple thing and if there is a non-dumpable
>> executable that we are tracing without privilege fail to exec it.
>>
>> Fixes: v1.0
>> Cc: [email protected]
>> Reported-by: Andy Lutomirski <[email protected]>
>> Signed-off-by: "Eric W. Biederman" <[email protected]>
>> ---
>> fs/exec.c | 8 +++++++-
>> 1 file changed, 7 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/exec.c b/fs/exec.c
>> index fdec760bfac3..de107f74e055 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1230,6 +1230,11 @@ int flush_old_exec(struct linux_binprm * bprm)
>> {
>> int retval;
>>
>> + /* Fail if the tracer can't read the executable */
>> + if ((bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) &&
>> + !ptracer_capable(current, bprm->mm->user_ns))
>> + return -EPERM;
>> +
>
> At the very least, I think that BINPRM_FLAGS_ENFORCE_NONDUMP needs to
> check capable_wrt_inode_uidgid too. Otherwise we risk breaking:
>
> $ gcc whatever.c
> $ chmod 400 a.out
> $ strace a.out
It is an invariant that if you have caps in mm->user_ns you will
also be capable_write_inode_uidgid of all files that a process exec's.
My third patch winds up changing mm->user_ns to maintain this invariant.
It is also true that Willy convinced me while this check is trivial it
will break historic uses so I have replaced this patch with:
"ptrace: Don't allow accessing an undumpable mm.
Eric
On Thu, Nov 17, 2016 at 3:55 PM, Eric W. Biederman
<[email protected]> wrote:
> Andy Lutomirski <[email protected]> writes:
>
>> On Thu, Nov 17, 2016 at 9:08 AM, Eric W. Biederman
>> <[email protected]> wrote:
>>>
>>> It is the reasonable expectation that if an executable file is not
>>> readable there will be no way for a user without special privileges to
>>> read the file. This is enforced in ptrace_attach but if we are
>>> already attached there is no enforcement if a readonly executable
>>> is exec'd.
>>>
>>> Therefore do the simple thing and if there is a non-dumpable
>>> executable that we are tracing without privilege fail to exec it.
>>>
>>> Fixes: v1.0
>>> Cc: [email protected]
>>> Reported-by: Andy Lutomirski <[email protected]>
>>> Signed-off-by: "Eric W. Biederman" <[email protected]>
>>> ---
>>> fs/exec.c | 8 +++++++-
>>> 1 file changed, 7 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/fs/exec.c b/fs/exec.c
>>> index fdec760bfac3..de107f74e055 100644
>>> --- a/fs/exec.c
>>> +++ b/fs/exec.c
>>> @@ -1230,6 +1230,11 @@ int flush_old_exec(struct linux_binprm * bprm)
>>> {
>>> int retval;
>>>
>>> + /* Fail if the tracer can't read the executable */
>>> + if ((bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) &&
>>> + !ptracer_capable(current, bprm->mm->user_ns))
>>> + return -EPERM;
>>> +
>>
>> At the very least, I think that BINPRM_FLAGS_ENFORCE_NONDUMP needs to
>> check capable_wrt_inode_uidgid too. Otherwise we risk breaking:
>>
>> $ gcc whatever.c
>> $ chmod 400 a.out
>> $ strace a.out
>
> It is an invariant that if you have caps in mm->user_ns you will
> also be capable_write_inode_uidgid of all files that a process exec's.
I meant to check whether you *are* the owner, too.
>
> My third patch winds up changing mm->user_ns to maintain this invariant.
>
> It is also true that Willy convinced me while this check is trivial it
> will break historic uses so I have replaced this patch with:
> "ptrace: Don't allow accessing an undumpable mm.
I think that's better.
>
> Eric
>
>
--
Andy Lutomirski
AMA Capital Management, LLC
Andy Lutomirski <[email protected]> writes:
> On Thu, Nov 17, 2016 at 3:55 PM, Eric W. Biederman
> <[email protected]> wrote:
>> Andy Lutomirski <[email protected]> writes:
>>
>>> On Thu, Nov 17, 2016 at 9:08 AM, Eric W. Biederman
>>> <[email protected]> wrote:
>>>>
>>>> It is the reasonable expectation that if an executable file is not
>>>> readable there will be no way for a user without special privileges to
>>>> read the file. This is enforced in ptrace_attach but if we are
>>>> already attached there is no enforcement if a readonly executable
>>>> is exec'd.
>>>>
>>>> Therefore do the simple thing and if there is a non-dumpable
>>>> executable that we are tracing without privilege fail to exec it.
>>>>
>>>> Fixes: v1.0
>>>> Cc: [email protected]
>>>> Reported-by: Andy Lutomirski <[email protected]>
>>>> Signed-off-by: "Eric W. Biederman" <[email protected]>
>>>> ---
>>>> fs/exec.c | 8 +++++++-
>>>> 1 file changed, 7 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/exec.c b/fs/exec.c
>>>> index fdec760bfac3..de107f74e055 100644
>>>> --- a/fs/exec.c
>>>> +++ b/fs/exec.c
>>>> @@ -1230,6 +1230,11 @@ int flush_old_exec(struct linux_binprm * bprm)
>>>> {
>>>> int retval;
>>>>
>>>> + /* Fail if the tracer can't read the executable */
>>>> + if ((bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) &&
>>>> + !ptracer_capable(current, bprm->mm->user_ns))
>>>> + return -EPERM;
>>>> +
>>>
>>> At the very least, I think that BINPRM_FLAGS_ENFORCE_NONDUMP needs to
>>> check capable_wrt_inode_uidgid too. Otherwise we risk breaking:
>>>
>>> $ gcc whatever.c
>>> $ chmod 400 a.out
>>> $ strace a.out
>>
>> It is an invariant that if you have caps in mm->user_ns you will
>> also be capable_write_inode_uidgid of all files that a process exec's.
>
> I meant to check whether you *are* the owner, too.
I don't follow. BINPRM_FLAGS_ENFORCE_NONDUMP is only set if
the caller of exec does not have inode_permission(inode, MAY_READ).
Which in your example would have guaranteed that
BINPRM_FLAGS_ENFORCE_NONDUMP would have be unset.
The ptracer_capable thing is only asking in this instance if we can
ignore the nondumpable status because we have CAP_SYS_PTRACE over
a user namespace that includes all of the files that would_dump
was called on (mm->user_ns).
ptrace_access_vm in the replacement patch has essentially the same
permission check. It is just at PTRACE_PEEKTEXT, PTRACE_PEEKDATA,
PTRACE_POKETEXT, or PTRACE_POKEDATA time.
So I am curious if you are seeing something that is worth fixing.
>> My third patch winds up changing mm->user_ns to maintain this invariant.
>>
>> It is also true that Willy convinced me while this check is trivial it
>> will break historic uses so I have replaced this patch with:
>> "ptrace: Don't allow accessing an undumpable mm.
>
> I think that's better.
Eric
Kees Cook <[email protected]> writes:
> On Thu, Nov 17, 2016 at 9:05 AM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> When the flag PT_PTRACE_CAP was added the PTRACE_TRACEME path was
>> overlooked. This can result in incorrect behavior when an application
>> like strace traces an exec of a setuid executable.
>>
>> Further PT_PTRACE_CAP does not have enough information for making good
>> security decisions as it does not report which user namespace the
>> capability is in. This has already allowed one mistake through
>> insufficient granulariy.
>>
>> I found this issue when I was testing another corner case of exec and
>> discovered that I could not get strace to set PT_PTRACE_CAP even when
>> running strace as root with a full set of caps.
>>
>> This change fixes the above issue with strace allowing stracing as
>> root a setuid executable without disabling setuid. More fundamentaly
>> this change allows what is allowable at all times, by using the correct
>> information in it's decision.
>>
>> Cc: [email protected]
>> Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12")
>> Signed-off-by: "Eric W. Biederman" <[email protected]>
>> ---
>> [...]
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 348f51b0ec92..8fe58255d219 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1656,6 +1656,7 @@ struct task_struct {
>> struct list_head cpu_timers[3];
>>
>> /* process credentials */
>> + const struct cred __rcu *ptracer_cred; /* Tracer's dredentials at attach */
>
> Typo: credentials.
Thank you, fixed.
Eric
Hi Eric,
On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
>
> With everyone heading to Kernel Summit and Plumbers I put this set of
> patches down temporarily. Now is the time to take it back up and to
> make certain I am not missing something stupid in this set of patches.
I couldn't get your patch set to apply to any of the kernels I tried,
I manually adjusted some parts but the second one has too many rejects.
What kernel should I apply this to ? Or maybe some preliminary patches
are needed ?
Thanks,
Willy
On Sat, Nov 19, 2016 at 08:17:00AM +0100, Willy Tarreau wrote:
> Hi Eric,
>
> On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
> >
> > With everyone heading to Kernel Summit and Plumbers I put this set of
> > patches down temporarily. Now is the time to take it back up and to
> > make certain I am not missing something stupid in this set of patches.
>
> I couldn't get your patch set to apply to any of the kernels I tried,
> I manually adjusted some parts but the second one has too many rejects.
> What kernel should I apply this to ? Or maybe some preliminary patches
> are needed ?
OK I finally managed to get it to work on top of 4.8.9 (required less changes
than master). I also had to drop the user_ns changes since there's no such
user_ns in mm_struct there.
I could run a test on it, that looks reasonable :
FS:
admin@vm:~$ strace -e trace=fstat,uname,ioctl,open uname
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7f3f9a1663e3, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = 3
fstat(3, {...}) = 0
open(0x7ffd01bbee80, O_RDONLY|O_CLOEXEC) = 3
fstat(3, {...}) = 0
uname({...}) = 0
fstat(1, {...}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, 0x7ffd01bbf400) = 0
admin@vm:~$ sudo strace -e trace=fstat,uname,ioctl,open uname
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib64/tls/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib64/tls/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib64/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0555, st_size=101312, ...}) = 0
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0555, st_size=1479016, ...}) = 0
uname({sys="Linux", node="vm", ...}) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(4, 64), ...}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
Network:
admin@vm:~$ strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {...}, 110) = -1 ENOENT (No such file or directory)
socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {...}, 110) = -1 ENOENT (No such file or directory)
socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
setsockopt(3, SOL_SOCKET, SO_REUSEADDR, 0x7ffd2c26bdbc, 4) = 0
connect(3, {...}, 16) = 0
admin@vm:~$ sudo strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("198.18.0.3")}, 16) = 0
So in short now we can at least see what syscall fails eventhough we can't
know why. I think it can be an acceptable trade-off.
Thanks,
Willy
On Sat, Nov 19, 2016 at 10:28:04AM +0100, Willy Tarreau wrote:
> On Sat, Nov 19, 2016 at 08:17:00AM +0100, Willy Tarreau wrote:
> > Hi Eric,
> >
> > On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
> > >
> > > With everyone heading to Kernel Summit and Plumbers I put this set of
> > > patches down temporarily. Now is the time to take it back up and to
> > > make certain I am not missing something stupid in this set of patches.
> >
> > I couldn't get your patch set to apply to any of the kernels I tried,
> > I manually adjusted some parts but the second one has too many rejects.
> > What kernel should I apply this to ? Or maybe some preliminary patches
> > are needed ?
>
> OK I finally managed to get it to work on top of 4.8.9 (required less changes
> than master). I also had to drop the user_ns changes since there's no such
> user_ns in mm_struct there.
>
> I could run a test on it, that looks reasonable :
>
> FS:
>
> admin@vm:~$ strace -e trace=fstat,uname,ioctl,open uname
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7f3f9a1663e3, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {...}) = 0
> open(0x7ffd01bbee80, O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {...}) = 0
> uname({...}) = 0
> fstat(1, {...}) = 0
> ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, 0x7ffd01bbf400) = 0
>
> admin@vm:~$ sudo strace -e trace=fstat,uname,ioctl,open uname
> open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/tls/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/tls/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0555, st_size=101312, ...}) = 0
> open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0555, st_size=1479016, ...}) = 0
> uname({sys="Linux", node="vm", ...}) = 0
> fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(4, 64), ...}) = 0
> ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
>
> Network:
>
> admin@vm:~$ strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {...}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {...}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
> setsockopt(3, SOL_SOCKET, SO_REUSEADDR, 0x7ffd2c26bdbc, 4) = 0
> connect(3, {...}, 16) = 0
>
> admin@vm:~$ sudo strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
> setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
> connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("198.18.0.3")}, 16) = 0
>
> So in short now we can at least see what syscall fails eventhough we can't
> know why. I think it can be an acceptable trade-off.
I also tested with gdb and it behaves as desired :
admin@vm:~$ sleep 100000 &
[1] 1615
admin@vm:~$ /tmp/gdb-x86_64 -q -p 1615
Attaching to process 1615
ptrace: Operation not permitted.
(gdb) quit
admin@vm:~$ sudo cp /bin/sleep /var/tmp/
admin@vm:~$ sudo chmod 755 /var/tmp/sleep
admin@vm:~$ /tmp/sleep 100000 &
[1] 1620
admin@vm:~$ /tmp/gdb-x86_64 -q -p 1620
Attaching to process 1620
Reading symbols from /var/tmp/sleep...(no debugging symbols found)...done.
(...)
0x00007f6723d561b0 in nanosleep () from /lib64/libpthread.so.0
(gdb) quit
Willy
Willy Tarreau <[email protected]> writes:
> Hi Eric,
>
> On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
>>
>> With everyone heading to Kernel Summit and Plumbers I put this set of
>> patches down temporarily. Now is the time to take it back up and to
>> make certain I am not missing something stupid in this set of patches.
>
> I couldn't get your patch set to apply to any of the kernels I tried,
> I manually adjusted some parts but the second one has too many rejects.
> What kernel should I apply this to ? Or maybe some preliminary patches
> are needed ?
It is against my for-next branch, and there is one patch in there
that is significant.
The entire patchset should be at:
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next
Eric
[email protected] (Eric W. Biederman) writes:
> Willy Tarreau <[email protected]> writes:
>
>> Hi Eric,
>>
>> On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
>>>
>>> With everyone heading to Kernel Summit and Plumbers I put this set of
>>> patches down temporarily. Now is the time to take it back up and to
>>> make certain I am not missing something stupid in this set of patches.
>>
>> I couldn't get your patch set to apply to any of the kernels I tried,
>> I manually adjusted some parts but the second one has too many rejects.
>> What kernel should I apply this to ? Or maybe some preliminary patches
>> are needed ?
>
> It is against my for-next branch, and there is one patch in there
> that is significant.
>
> The entire patchset should be at:
> git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next
Grr. I typed too fast, the branch above is the base:
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-testing
should be the entire thing.
Eric
Willy Tarreau <[email protected]> writes:
> On Sat, Nov 19, 2016 at 08:17:00AM +0100, Willy Tarreau wrote:
>> Hi Eric,
>>
>> On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
>> >
>> > With everyone heading to Kernel Summit and Plumbers I put this set of
>> > patches down temporarily. Now is the time to take it back up and to
>> > make certain I am not missing something stupid in this set of patches.
>>
>> I couldn't get your patch set to apply to any of the kernels I tried,
>> I manually adjusted some parts but the second one has too many rejects.
>> What kernel should I apply this to ? Or maybe some preliminary patches
>> are needed ?
>
> OK I finally managed to get it to work on top of 4.8.9 (required less changes
> than master). I also had to drop the user_ns changes since there's no such
> user_ns in mm_struct there.
>
> I could run a test on it, that looks reasonable :
>
> FS:
>
> admin@vm:~$ strace -e trace=fstat,uname,ioctl,open uname
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7f3f9a1663e3, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {...}) = 0
> open(0x7ffd01bbee80, O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {...}) = 0
> uname({...}) = 0
> fstat(1, {...}) = 0
> ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, 0x7ffd01bbf400) = 0
>
> admin@vm:~$ sudo strace -e trace=fstat,uname,ioctl,open uname
> open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/tls/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/tls/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0555, st_size=101312, ...}) = 0
> open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0555, st_size=1479016, ...}) = 0
> uname({sys="Linux", node="vm", ...}) = 0
> fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(4, 64), ...}) = 0
> ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
>
> Network:
>
> admin@vm:~$ strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {...}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {...}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
> setsockopt(3, SOL_SOCKET, SO_REUSEADDR, 0x7ffd2c26bdbc, 4) = 0
> connect(3, {...}, 16) = 0
>
> admin@vm:~$ sudo strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
> setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
> connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("198.18.0.3")}, 16) = 0
>
> So in short now we can at least see what syscall fails eventhough we can't
> know why. I think it can be an acceptable trade-off.
Thanks for testing, and thanks for you acceptance even if I didn't make
it easy for you.
Eric