PTRACE_GETFD is a generic ptrace API that allows the tracer to
get file descriptors from the traceee.
The primary reason to use this syscall is to allow sandboxers to
take action on an FD on behalf of the tracee. For example, this
can be combined with seccomp's user notification feature to extract
a file descriptor and call privileged syscalls, like binding
a socket to a privileged port.
Signed-off-by: Sargun Dhillon <[email protected]>
---
include/uapi/linux/ptrace.h | 5 +++++
kernel/ptrace.c | 39 +++++++++++++++++++++++++++++++++++--
2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index a71b6e3b03eb..2b69f759826a 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -101,6 +101,11 @@ struct ptrace_syscall_info {
};
};
+/* This gets a file descriptor from a running process. It doesn't require the
+ * process to be stopped.
+ */
+#define PTRACE_GETFD 0x420f
+
/*
* These values are stored in task->ptrace_message
* by tracehook_report_syscall_* to describe the current syscall-stop.
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index cb9ddcc08119..a1d7b289fe8e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -31,6 +31,7 @@
#include <linux/cn_proc.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <linux/fdtable.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -994,6 +995,37 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
+static int ptrace_getfd(struct task_struct *child, unsigned long fd)
+{
+ struct files_struct *files;
+ struct file *file;
+ int ret = 0;
+
+ files = get_files_struct(child);
+ if (!files)
+ return -ENOENT;
+
+ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (!file)
+ ret = -EBADF;
+ else
+ get_file(file);
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+ if (ret)
+ goto out;
+
+ ret = get_unused_fd_flags(0);
+ if (ret >= 0)
+ fd_install(ret, file);
+
+ fput(file);
+out:
+ return ret;
+}
+
int ptrace_request(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
@@ -1222,7 +1254,9 @@ int ptrace_request(struct task_struct *child, long request,
case PTRACE_SECCOMP_GET_METADATA:
ret = seccomp_get_metadata(child, addr, datavp);
break;
-
+ case PTRACE_GETFD:
+ ret = ptrace_getfd(child, data);
+ break;
default:
break;
}
@@ -1265,7 +1299,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
}
ret = ptrace_check_attach(child, request == PTRACE_KILL ||
- request == PTRACE_INTERRUPT);
+ request == PTRACE_INTERRUPT ||
+ request == PTRACE_GETFD);
if (ret < 0)
goto out_put_task_struct;
--
2.20.1
On Fri, Dec 6, 2019 at 12:44 AM Sargun Dhillon <[email protected]> wrote:
> PTRACE_GETFD is a generic ptrace API that allows the tracer to
> get file descriptors from the traceee.
typo: tracee
> The primary reason to use this syscall is to allow sandboxers to
> take action on an FD on behalf of the tracee. For example, this
> can be combined with seccomp's user notification feature to extract
> a file descriptor and call privileged syscalls, like binding
> a socket to a privileged port.
[...]
> +/* This gets a file descriptor from a running process. It doesn't require the
> + * process to be stopped.
> + */
> +#define PTRACE_GETFD 0x420f
[...]
> +static int ptrace_getfd(struct task_struct *child, unsigned long fd)
I'd make the "fd" parameter of this function an "unsigned int", given
that that's also the argument type of fcheck_files().
> +{
> + struct files_struct *files;
> + struct file *file;
> + int ret = 0;
> +
> + files = get_files_struct(child);
> + if (!files)
> + return -ENOENT;
> +
> + spin_lock(&files->file_lock);
> + file = fcheck_files(files, fd);
> + if (!file)
> + ret = -EBADF;
> + else
> + get_file(file);
> + spin_unlock(&files->file_lock);
> + put_files_struct(files);
> +
> + if (ret)
> + goto out;
> +
> + ret = get_unused_fd_flags(0);
You're hardcoding the flags for the fd as 0, which means that there is
no way for the caller to enable O_CLOEXEC on the fd in a way that is
race-free against a concurrent execve(). If you can't easily plumb
through an O_CLOEXEC flag from userspace to here, you should probably
hardcode O_CLOEXEC here.
> + if (ret >= 0)
> + fd_install(ret, file);
> +
> + fput(file);
Annoyingly, this isn't how fd_install() works. fd_install() has
slightly weird semantics and consumes the reference passed to it, so
this should be:
if (ret >= 0)
fd_install(ret, file);
else
fput(file);
> +out:
> + return ret;
> +}
On Thu, Dec 5, 2019 at 6:38 PM Jann Horn <[email protected]> wrote:
>
> On Fri, Dec 6, 2019 at 12:44 AM Sargun Dhillon <[email protected]> wrote:
> > PTRACE_GETFD is a generic ptrace API that allows the tracer to
> > get file descriptors from the traceee.
>
> typo: tracee
>
> > The primary reason to use this syscall is to allow sandboxers to
> > take action on an FD on behalf of the tracee. For example, this
> > can be combined with seccomp's user notification feature to extract
> > a file descriptor and call privileged syscalls, like binding
> > a socket to a privileged port.
> [...]
> > +/* This gets a file descriptor from a running process. It doesn't require the
> > + * process to be stopped.
> > + */
> > +#define PTRACE_GETFD 0x420f
> [...]
> > +static int ptrace_getfd(struct task_struct *child, unsigned long fd)
>
> I'd make the "fd" parameter of this function an "unsigned int", given
> that that's also the argument type of fcheck_files().
>
> > +{
> > + struct files_struct *files;
> > + struct file *file;
> > + int ret = 0;
> > +
> > + files = get_files_struct(child);
> > + if (!files)
> > + return -ENOENT;
> > +
> > + spin_lock(&files->file_lock);
> > + file = fcheck_files(files, fd);
> > + if (!file)
> > + ret = -EBADF;
> > + else
> > + get_file(file);
> > + spin_unlock(&files->file_lock);
> > + put_files_struct(files);
> > +
> > + if (ret)
> > + goto out;
> > +
> > + ret = get_unused_fd_flags(0);
>
> You're hardcoding the flags for the fd as 0, which means that there is
> no way for the caller to enable O_CLOEXEC on the fd in a way that is
> race-free against a concurrent execve(). If you can't easily plumb
> through an O_CLOEXEC flag from userspace to here, you should probably
> hardcode O_CLOEXEC here.
>
I thought about making addr used for flags. It seems a little weird, given the
name, but it'll do the job. Alternatively, it could be a point to an
options struct.
If we introduce options, one of the nice things we could add is add the ability
to cleanse the FD of certain information, like cgroups.
> > + if (ret >= 0)
> > + fd_install(ret, file);
> > +
> > + fput(file);
>
> Annoyingly, this isn't how fd_install() works. fd_install() has
> slightly weird semantics and consumes the reference passed to it, so
> this should be:
>
> if (ret >= 0)
> fd_install(ret, file);
> else
> fput(file);
>
> > +out:
> > + return ret;
> > +}
On 2019-12-05, Sargun Dhillon <[email protected]> wrote:
> On Thu, Dec 5, 2019 at 6:38 PM Jann Horn <[email protected]> wrote:
> >
> > On Fri, Dec 6, 2019 at 12:44 AM Sargun Dhillon <[email protected]> wrote:
> > > PTRACE_GETFD is a generic ptrace API that allows the tracer to
> > > get file descriptors from the traceee.
> >
> > typo: tracee
> >
> > > The primary reason to use this syscall is to allow sandboxers to
> > > take action on an FD on behalf of the tracee. For example, this
> > > can be combined with seccomp's user notification feature to extract
> > > a file descriptor and call privileged syscalls, like binding
> > > a socket to a privileged port.
> > [...]
> > > +/* This gets a file descriptor from a running process. It doesn't require the
> > > + * process to be stopped.
> > > + */
> > > +#define PTRACE_GETFD 0x420f
> > [...]
> > > +static int ptrace_getfd(struct task_struct *child, unsigned long fd)
> >
> > I'd make the "fd" parameter of this function an "unsigned int", given
> > that that's also the argument type of fcheck_files().
> >
> > > +{
> > > + struct files_struct *files;
> > > + struct file *file;
> > > + int ret = 0;
> > > +
> > > + files = get_files_struct(child);
> > > + if (!files)
> > > + return -ENOENT;
> > > +
> > > + spin_lock(&files->file_lock);
> > > + file = fcheck_files(files, fd);
> > > + if (!file)
> > > + ret = -EBADF;
> > > + else
> > > + get_file(file);
> > > + spin_unlock(&files->file_lock);
> > > + put_files_struct(files);
> > > +
> > > + if (ret)
> > > + goto out;
> > > +
> > > + ret = get_unused_fd_flags(0);
> >
> > You're hardcoding the flags for the fd as 0, which means that there is
> > no way for the caller to enable O_CLOEXEC on the fd in a way that is
> > race-free against a concurrent execve(). If you can't easily plumb
> > through an O_CLOEXEC flag from userspace to here, you should probably
> > hardcode O_CLOEXEC here.
> >
> I thought about making addr used for flags. It seems a little weird,
> given the name, but it'll do the job. Alternatively, it could be a
> point to an options struct. If we introduce options, one of the nice
> things we could add is add the ability to cleanse the FD of certain
> information, like cgroups.
If you do end up opting for an options struct, please make sure you use
copy_struct_from_user() or something similar so that we can painlessly
extend it in the future (if necessary). Since there isn't an additional
argument, you might want to do what perf_event_open() does and embed the
size as the first field of the options struct.
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
[+ Oleg, the maintainer. This needs to see his review before anything
can happen to this series.]
On Thu, Dec 05, 2019 at 11:44:53PM +0000, Sargun Dhillon wrote:
> PTRACE_GETFD is a generic ptrace API that allows the tracer to
> get file descriptors from the traceee.
>
> The primary reason to use this syscall is to allow sandboxers to
> take action on an FD on behalf of the tracee. For example, this
> can be combined with seccomp's user notification feature to extract
> a file descriptor and call privileged syscalls, like binding
> a socket to a privileged port.
>
> Signed-off-by: Sargun Dhillon <[email protected]>
> ---
> include/uapi/linux/ptrace.h | 5 +++++
> kernel/ptrace.c | 39 +++++++++++++++++++++++++++++++++++--
> 2 files changed, 42 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> index a71b6e3b03eb..2b69f759826a 100644
> --- a/include/uapi/linux/ptrace.h
> +++ b/include/uapi/linux/ptrace.h
> @@ -101,6 +101,11 @@ struct ptrace_syscall_info {
> };
> };
>
> +/* This gets a file descriptor from a running process. It doesn't require the
> + * process to be stopped.
> + */
> +#define PTRACE_GETFD 0x420f
> +
> /*
> * These values are stored in task->ptrace_message
> * by tracehook_report_syscall_* to describe the current syscall-stop.
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index cb9ddcc08119..a1d7b289fe8e 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -31,6 +31,7 @@
> #include <linux/cn_proc.h>
> #include <linux/compat.h>
> #include <linux/sched/signal.h>
> +#include <linux/fdtable.h>
>
> #include <asm/syscall.h> /* for syscall_get_* */
>
> @@ -994,6 +995,37 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
> }
> #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
>
> +static int ptrace_getfd(struct task_struct *child, unsigned long fd)
> +{
> + struct files_struct *files;
> + struct file *file;
> + int ret = 0;
> +
> + files = get_files_struct(child);
> + if (!files)
> + return -ENOENT;
> +
> + spin_lock(&files->file_lock);
> + file = fcheck_files(files, fd);
> + if (!file)
> + ret = -EBADF;
> + else
> + get_file(file);
> + spin_unlock(&files->file_lock);
> + put_files_struct(files);
> +
> + if (ret)
> + goto out;
> +
> + ret = get_unused_fd_flags(0);
> + if (ret >= 0)
> + fd_install(ret, file);
> +
> + fput(file);
> +out:
> + return ret;
> +}
> +
> int ptrace_request(struct task_struct *child, long request,
> unsigned long addr, unsigned long data)
> {
> @@ -1222,7 +1254,9 @@ int ptrace_request(struct task_struct *child, long request,
> case PTRACE_SECCOMP_GET_METADATA:
> ret = seccomp_get_metadata(child, addr, datavp);
> break;
> -
> + case PTRACE_GETFD:
> + ret = ptrace_getfd(child, data);
> + break;
> default:
> break;
> }
> @@ -1265,7 +1299,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
> }
>
> ret = ptrace_check_attach(child, request == PTRACE_KILL ||
> - request == PTRACE_INTERRUPT);
> + request == PTRACE_INTERRUPT ||
> + request == PTRACE_GETFD);
> if (ret < 0)
> goto out_put_task_struct;
>
> --
> 2.20.1
>
> On Thu, Dec 05, 2019 at 11:44:53PM +0000, Sargun Dhillon wrote:
>
> > +static int ptrace_getfd(struct task_struct *child, unsigned long fd)
> > +{
> > + struct files_struct *files;
> > + struct file *file;
> > + int ret = 0;
> > +
> > + files = get_files_struct(child);
> > + if (!files)
> > + return -ENOENT;
> > +
> > + spin_lock(&files->file_lock);
> > + file = fcheck_files(files, fd);
> > + if (!file)
> > + ret = -EBADF;
> > + else
> > + get_file(file);
> > + spin_unlock(&files->file_lock);
> > + put_files_struct(files);
may be someone can finally create a helper for this, it can have more users.
say,
struct file *get_task_file(task, fd)
{
struct file *file = NULL;
task_lock(task);
rcu_read_lock();
if (task->files) {
file = fcheck_files(task->files, fd);
if (file)
get_file(file);
}
rcu_read_unlock();
task_unlock(task);
return file;
}
no need to get/put files_struct, no need to take ->file_lock.
> > +
> > + if (ret)
> > + goto out;
> > +
> > + ret = get_unused_fd_flags(0);
> > + if (ret >= 0)
> > + fd_install(ret, file);
> > +
> > + fput(file);
this looks wrong or I am totally confused...
if (ret >= 0)
fd_install(file);
else
fput(file);
?
> > @@ -1265,7 +1299,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
> > }
> >
> > ret = ptrace_check_attach(child, request == PTRACE_KILL ||
> > - request == PTRACE_INTERRUPT);
> > + request == PTRACE_INTERRUPT ||
> > + request == PTRACE_GETFD);
Hmm. not sure why do you want this... But OK, we do not need to stop the tracee.
Oleg.
On Thu, Dec 05, 2019 at 11:44:53PM +0000, Sargun Dhillon wrote:
> PTRACE_GETFD is a generic ptrace API that allows the tracer to
> get file descriptors from the traceee.
>
> The primary reason to use this syscall is to allow sandboxers to
I might change this to "one motivation to use this ptrace command",
because I'm sure people will invent other crazy uses soon after it's
added :)
> take action on an FD on behalf of the tracee. For example, this
> can be combined with seccomp's user notification feature to extract
> a file descriptor and call privileged syscalls, like binding
> a socket to a privileged port.
This can already be accomplished via injecting parasite code like CRIU
does; adding a ptrace() command like this makes it much nicer to be
sure, but it is redundant.
Tycho
On Fri, Dec 6, 2019 at 6:10 AM Tycho Andersen <[email protected]> wrote:
>
> On Thu, Dec 05, 2019 at 11:44:53PM +0000, Sargun Dhillon wrote:
> > PTRACE_GETFD is a generic ptrace API that allows the tracer to
> > get file descriptors from the traceee.
> >
> > The primary reason to use this syscall is to allow sandboxers to
>
> I might change this to "one motivation to use this ptrace command",
> because I'm sure people will invent other crazy uses soon after it's
> added :)
>
Another use-case that's come up has been transparent proxy for
service meshes. Rather than doing intercept at L4 (iptables), or
DNS, just rewriting the connect is nicer. A side benefit is that
getpeername still works.
> > take action on an FD on behalf of the tracee. For example, this
> > can be combined with seccomp's user notification feature to extract
> > a file descriptor and call privileged syscalls, like binding
> > a socket to a privileged port.
>
> This can already be accomplished via injecting parasite code like CRIU
> does; adding a ptrace() command like this makes it much nicer to be
> sure, but it is redundant.
>
> Tycho
How can you do this if the tracee doesn't have privilege? For example,
if the tracee doesn't have CAP_SYS_BIND_SERVICE, how could you
get it to bind to a port that's privileged without taking the file descriptor
and doing it in a process that does have CAP_SYS_BIND_SERVICE?
The other aspect is that doing the parasitic code thing is kind of slow,
in that it requires quite a few operations.
On Fri, Dec 6, 2019 at 8:05 PM Jann Horn <[email protected]> wrote:
> On Fri, Dec 6, 2019 at 8:03 PM Sargun Dhillon <[email protected]> wrote:
> > On Fri, Dec 6, 2019 at 6:10 AM Tycho Andersen <[email protected]> wrote:
> > > On Thu, Dec 05, 2019 at 11:44:53PM +0000, Sargun Dhillon wrote:
> > > > take action on an FD on behalf of the tracee. For example, this
> > > > can be combined with seccomp's user notification feature to extract
> > > > a file descriptor and call privileged syscalls, like binding
> > > > a socket to a privileged port.
> > >
> > > This can already be accomplished via injecting parasite code like CRIU
> > > does; adding a ptrace() command like this makes it much nicer to be
> > > sure, but it is redundant.
> > >
> > How can you do this if the tracee doesn't have privilege? For example,
> > if the tracee doesn't have CAP_SYS_BIND_SERVICE, how could you
> > get it to bind to a port that's privileged without taking the file descriptor
> > and doing it in a process that does have CAP_SYS_BIND_SERVICE?
>
> (You can let the parasite send the fd to the tracer via
> SCM_CREDENTIALS if you have previously set up a unix domain socket for
> this.)
Eh, sorry, of course I meant SCM_RIGHTS.
On Fri, Dec 6, 2019 at 8:03 PM Sargun Dhillon <[email protected]> wrote:
> On Fri, Dec 6, 2019 at 6:10 AM Tycho Andersen <[email protected]> wrote:
> > On Thu, Dec 05, 2019 at 11:44:53PM +0000, Sargun Dhillon wrote:
> > > take action on an FD on behalf of the tracee. For example, this
> > > can be combined with seccomp's user notification feature to extract
> > > a file descriptor and call privileged syscalls, like binding
> > > a socket to a privileged port.
> >
> > This can already be accomplished via injecting parasite code like CRIU
> > does; adding a ptrace() command like this makes it much nicer to be
> > sure, but it is redundant.
> >
> How can you do this if the tracee doesn't have privilege? For example,
> if the tracee doesn't have CAP_SYS_BIND_SERVICE, how could you
> get it to bind to a port that's privileged without taking the file descriptor
> and doing it in a process that does have CAP_SYS_BIND_SERVICE?
(You can let the parasite send the fd to the tracer via
SCM_CREDENTIALS if you have previously set up a unix domain socket for
this.)
> On Dec 5, 2019, at 3:44 PM, Sargun Dhillon <[email protected]> wrote:
>
> PTRACE_GETFD is a generic ptrace API that allows the tracer to
> get file descriptors from the traceee.
>
> The primary reason to use this syscall is to allow sandboxers to
> take action on an FD on behalf of the tracee. For example, this
> can be combined with seccomp's user notification feature to extract
> a file descriptor and call privileged syscalls, like binding
> a socket to a privileged port.
Can you document the circumstances under which you can call this?
Does it require that you be attached as a tracer? Does the tracee need to be stopped? Does it work equivalently if the tracee is ptrace-stopped vs stopped by seccomp?
If the tracee is running and is single-threaded, is the locking correct?
>
> Signed-off-by: Sargun Dhillon <[email protected]>
> ---
> include/uapi/linux/ptrace.h | 5 +++++
> kernel/ptrace.c | 39 +++++++++++++++++++++++++++++++++++--
> 2 files changed, 42 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> index a71b6e3b03eb..2b69f759826a 100644
> --- a/include/uapi/linux/ptrace.h
> +++ b/include/uapi/linux/ptrace.h
> @@ -101,6 +101,11 @@ struct ptrace_syscall_info {
> };
> };
>
> +/* This gets a file descriptor from a running process. It doesn't require the
> + * process to be stopped.
> + */
> +#define PTRACE_GETFD 0x420f
> +
> /*
> * These values are stored in task->ptrace_message
> * by tracehook_report_syscall_* to describe the current syscall-stop.
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index cb9ddcc08119..a1d7b289fe8e 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -31,6 +31,7 @@
> #include <linux/cn_proc.h>
> #include <linux/compat.h>
> #include <linux/sched/signal.h>
> +#include <linux/fdtable.h>
>
> #include <asm/syscall.h> /* for syscall_get_* */
>
> @@ -994,6 +995,37 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
> }
> #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
>
> +static int ptrace_getfd(struct task_struct *child, unsigned long fd)
> +{
> + struct files_struct *files;
> + struct file *file;
> + int ret = 0;
> +
> + files = get_files_struct(child);
> + if (!files)
> + return -ENOENT;
> +
> + spin_lock(&files->file_lock);
> + file = fcheck_files(files, fd);
> + if (!file)
> + ret = -EBADF;
> + else
> + get_file(file);
> + spin_unlock(&files->file_lock);
> + put_files_struct(files);
> +
> + if (ret)
> + goto out;
> +
> + ret = get_unused_fd_flags(0);
> + if (ret >= 0)
> + fd_install(ret, file);
> +
> + fput(file);
> +out:
> + return ret;
> +}
> +
> int ptrace_request(struct task_struct *child, long request,
> unsigned long addr, unsigned long data)
> {
> @@ -1222,7 +1254,9 @@ int ptrace_request(struct task_struct *child, long request,
> case PTRACE_SECCOMP_GET_METADATA:
> ret = seccomp_get_metadata(child, addr, datavp);
> break;
> -
> + case PTRACE_GETFD:
> + ret = ptrace_getfd(child, data);
> + break;
> default:
> break;
> }
> @@ -1265,7 +1299,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
> }
>
> ret = ptrace_check_attach(child, request == PTRACE_KILL ||
> - request == PTRACE_INTERRUPT);
> + request == PTRACE_INTERRUPT ||
> + request == PTRACE_GETFD);
> if (ret < 0)
> goto out_put_task_struct;
>
> --
> 2.20.1
>