PTRACE_GETFD is a generic ptrace API that allows the tracer to
get file descriptors from the tracee.
One reason to use this is to allow sandboxers to take actions on file
descriptors on the behalf of a tracee. For example, this can be
combined with seccomp-bpf's user notification to ptrace on-demand and
capture an fd without requiring the tracer to always be attached to
the process. The tracer can then take a privileged action on behalf
of the tracee, like binding a socket to a privileged port.
It works whether or not the tracee is stopped. The only prior requirement
is that the tracer is attached to the process via PTRACE_ATTACH or
PTRACE_SEIZE. Stopping the process breaks certain runtimes that expect
to be able to preempt syscalls (quickly). In addition, it is meant to be
used in an on-demand fashion to avoid breaking debuggers.
The ptrace call takes a pointer to ptrace_getfd_args in data, and the
size of the structure in addr. There is an options field, which can
be used to state whether the fd should be opened with CLOEXEC, or not.
This options field may be extended in the future to include the ability
to clear cgroup information about the file descriptor at a later point.
If the structure is from a newer kernel, and includes members which
make it larger than the structure that's known to this kernel version,
E2BIG will be returned.
The requirement that the tracer has attached to the tracee prior to the
capture of the file descriptor may be lifted at a later point.
Signed-off-by: Sargun Dhillon <[email protected]>
---
include/uapi/linux/ptrace.h | 15 +++++++++++++++
kernel/ptrace.c | 35 +++++++++++++++++++++++++++++++++--
2 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index a71b6e3b03eb..c84655bcc453 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -101,6 +101,21 @@ struct ptrace_syscall_info {
};
};
+/*
+ * This gets a file descriptor from a process. It requires that the process
+ * has either been attached to. It does not require that the process is
+ * stopped.
+ */
+#define PTRACE_GETFD 0x420f
+
+/* options to pass in to ptrace_getfd_args */
+#define PTRACE_GETFD_O_CLOEXEC (1 << 0) /* open the fd with cloexec */
+
+struct ptrace_getfd_args {
+ __u32 fd; /* the tracee's file descriptor to get */
+ __u32 options;
+} __attribute__((packed));
+
/*
* These values are stored in task->ptrace_message
* by tracehook_report_syscall_* to describe the current syscall-stop.
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index cb9ddcc08119..8f619dceac6f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -31,6 +31,7 @@
#include <linux/cn_proc.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <linux/fdtable.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -994,6 +995,33 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
+static int ptrace_getfd(struct task_struct *child, unsigned long user_size,
+ void __user *datavp)
+{
+ struct ptrace_getfd_args args;
+ unsigned int fd_flags = 0;
+ struct file *file;
+ int ret;
+
+ ret = copy_struct_from_user(&args, sizeof(args), datavp, user_size);
+ if (ret)
+ goto out;
+ if ((args.options & ~(PTRACE_GETFD_O_CLOEXEC)) != 0)
+ return -EINVAL;
+ if (args.options & PTRACE_GETFD_O_CLOEXEC)
+ fd_flags &= O_CLOEXEC;
+ file = get_task_file(child, args.fd);
+ if (!file)
+ return -EBADF;
+ ret = get_unused_fd_flags(fd_flags);
+ if (ret >= 0)
+ fd_install(ret, file);
+ else
+ fput(file);
+out:
+ return ret;
+}
+
int ptrace_request(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
@@ -1222,7 +1250,9 @@ int ptrace_request(struct task_struct *child, long request,
case PTRACE_SECCOMP_GET_METADATA:
ret = seccomp_get_metadata(child, addr, datavp);
break;
-
+ case PTRACE_GETFD:
+ ret = ptrace_getfd(child, addr, datavp);
+ break;
default:
break;
}
@@ -1265,7 +1295,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
}
ret = ptrace_check_attach(child, request == PTRACE_KILL ||
- request == PTRACE_INTERRUPT);
+ request == PTRACE_INTERRUPT ||
+ request == PTRACE_GETFD);
if (ret < 0)
goto out_put_task_struct;
--
2.20.1
On Mon, Dec 09, 2019 at 07:06:24AM +0000, Sargun Dhillon wrote:
> PTRACE_GETFD is a generic ptrace API that allows the tracer to
> get file descriptors from the tracee.
>
> One reason to use this is to allow sandboxers to take actions on file
> descriptors on the behalf of a tracee. For example, this can be
> combined with seccomp-bpf's user notification to ptrace on-demand and
> capture an fd without requiring the tracer to always be attached to
> the process. The tracer can then take a privileged action on behalf
> of the tracee, like binding a socket to a privileged port.
>
> It works whether or not the tracee is stopped. The only prior requirement
> is that the tracer is attached to the process via PTRACE_ATTACH or
> PTRACE_SEIZE. Stopping the process breaks certain runtimes that expect
> to be able to preempt syscalls (quickly). In addition, it is meant to be
> used in an on-demand fashion to avoid breaking debuggers.
>
> The ptrace call takes a pointer to ptrace_getfd_args in data, and the
> size of the structure in addr. There is an options field, which can
> be used to state whether the fd should be opened with CLOEXEC, or not.
> This options field may be extended in the future to include the ability
> to clear cgroup information about the file descriptor at a later point.
> If the structure is from a newer kernel, and includes members which
> make it larger than the structure that's known to this kernel version,
> E2BIG will be returned.
>
> The requirement that the tracer has attached to the tracee prior to the
> capture of the file descriptor may be lifted at a later point.
>
> Signed-off-by: Sargun Dhillon <[email protected]>
> ---
> include/uapi/linux/ptrace.h | 15 +++++++++++++++
> kernel/ptrace.c | 35 +++++++++++++++++++++++++++++++++--
> 2 files changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> index a71b6e3b03eb..c84655bcc453 100644
> --- a/include/uapi/linux/ptrace.h
> +++ b/include/uapi/linux/ptrace.h
> @@ -101,6 +101,21 @@ struct ptrace_syscall_info {
> };
> };
>
> +/*
> + * This gets a file descriptor from a process. It requires that the process
> + * has either been attached to. It does not require that the process is
> + * stopped.
> + */
> +#define PTRACE_GETFD 0x420f
> +
> +/* options to pass in to ptrace_getfd_args */
> +#define PTRACE_GETFD_O_CLOEXEC (1 << 0) /* open the fd with cloexec */
Hey Sargun,
Thanks for the patch!
Why not simply accept O_CLOEXEC as flag? If that's not possible for some
reason I'd say
#define PTRACE_GETFD_O_CLOEXEC O_CLOEXEC /* open the fd with cloexec */
is the right thing to do. This is fairly common:
include/uapi/linux/timerfd.h:#define TFD_CLOEXEC O_CLOEXEC
include/uapi/drm/drm.h:#define DRM_CLOEXEC O_CLOEXEC
include/linux/userfaultfd_k.h:#define UFFD_CLOEXEC O_CLOEXEC
include/linux/eventfd.h:#define EFD_CLOEXEC O_CLOEXEC
include/uapi/linux/eventpoll.h:#define EPOLL_CLOEXEC O_CLOEXEC
include/uapi/linux/inotify.h:/* For O_CLOEXEC and O_NONBLOCK */
include/uapi/linux/inotify.h:#define IN_CLOEXEC O_CLOEXEC
include/uapi/linux/mount.h:#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
You can also add a compile-time assert to ptrace like we did for
fs/namespace.c's OPEN_TREE_CLOEXEC:
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
And I'd remove the _O if you go with a separate flag, i.e.:
#define PTRACE_GETFD_CLOEXEC O_CLOEXEC /* open the fd with cloexec */
> +
> +struct ptrace_getfd_args {
> + __u32 fd; /* the tracee's file descriptor to get */
> + __u32 options;
Nit and I'm not set on it at all but "flags" might just be better.
> +} __attribute__((packed));
What's the benefit in using __attribute__((packed)) here? Seems to me that:
+struct ptrace_getfd_args {
+ __u32 fd; /* the tracee's file descriptor to get */
+ __u32 options;
+};
would just work fine.
> +
> /*
> * These values are stored in task->ptrace_message
> * by tracehook_report_syscall_* to describe the current syscall-stop.
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index cb9ddcc08119..8f619dceac6f 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -31,6 +31,7 @@
> #include <linux/cn_proc.h>
> #include <linux/compat.h>
> #include <linux/sched/signal.h>
> +#include <linux/fdtable.h>
>
> #include <asm/syscall.h> /* for syscall_get_* */
>
> @@ -994,6 +995,33 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
> }
> #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
>
> +static int ptrace_getfd(struct task_struct *child, unsigned long user_size,
> + void __user *datavp)
> +{
> + struct ptrace_getfd_args args;
> + unsigned int fd_flags = 0;
> + struct file *file;
> + int ret;
> +
> + ret = copy_struct_from_user(&args, sizeof(args), datavp, user_size);
> + if (ret)
> + goto out;
Why is this goto out and not just return ret?
> + if ((args.options & ~(PTRACE_GETFD_O_CLOEXEC)) != 0)
> + return -EINVAL;
> + if (args.options & PTRACE_GETFD_O_CLOEXEC)
> + fd_flags &= O_CLOEXEC;
> + file = get_task_file(child, args.fd);
> + if (!file)
> + return -EBADF;
> + ret = get_unused_fd_flags(fd_flags);
Why isn't that whole thing just:
ret = get_unused_fd_flags(fd_flags & {PTRACE_GETFD_}O_CLOEXEC);
> + if (ret >= 0)
> + fd_install(ret, file);
> + else
> + fput(file);
> +out:
> + return ret;
> +}
So sm like:
static int ptrace_getfd(struct task_struct *child, unsigned long user_size,
void __user *datavp)
{
struct ptrace_getfd_args args;
unsigned int fd_flags = 0;
struct file *file;
int ret;
ret = copy_struct_from_user(&args, sizeof(args), datavp, user_size);
if (ret)
return ret;
if ((args.options & ~(PTRACE_GETFD_O_CLOEXEC)) != 0)
return -EINVAL;
file = get_task_file(child, args.fd);
if (!file)
return -EBADF;
/* PTRACE_GETFD_CLOEXEC == O_CLOEXEC */
ret = get_unused_fd_flags(fd_flags & PTRACE_GETFD_O_CLOEXEC);
if (ret >= 0)
fd_install(ret, file);
else
fput(file);
return ret;
}
On Mon, Dec 9, 2019 at 1:39 AM Christian Brauner
<[email protected]> wrote:
>
> Hey Sargun,
>
> Thanks for the patch!
Thanks for your review.
>
> Why not simply accept O_CLOEXEC as flag? If that's not possible for some
> reason I'd say
>
I did this initially. My plan is to use this options field for other
(future) things as well, like
clearing (cgroup) metadata on sockets (assuming we figure out a safe
way to do it).
If we use O_CLOEXEC, it takes up an arbitrary bit which is different
on different
platforms, and working around that seems messy
Another way around this would be to have two members. One member which
is something
like fdflags, that just takes the fd flags, like O_CLOEXEC, and then
later on, we can add
an options member to enable these future use cases.
What do you think?
> #define PTRACE_GETFD_O_CLOEXEC O_CLOEXEC /* open the fd with cloexec */
>
> is the right thing to do. This is fairly common:
>
> include/uapi/linux/timerfd.h:#define TFD_CLOEXEC O_CLOEXEC
> include/uapi/drm/drm.h:#define DRM_CLOEXEC O_CLOEXEC
> include/linux/userfaultfd_k.h:#define UFFD_CLOEXEC O_CLOEXEC
> include/linux/eventfd.h:#define EFD_CLOEXEC O_CLOEXEC
> include/uapi/linux/eventpoll.h:#define EPOLL_CLOEXEC O_CLOEXEC
> include/uapi/linux/inotify.h:/* For O_CLOEXEC and O_NONBLOCK */
> include/uapi/linux/inotify.h:#define IN_CLOEXEC O_CLOEXEC
> include/uapi/linux/mount.h:#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
>
> You can also add a compile-time assert to ptrace like we did for
> fs/namespace.c's OPEN_TREE_CLOEXEC:
> BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
>
> And I'd remove the _O if you go with a separate flag, i.e.:
>
> #define PTRACE_GETFD_CLOEXEC O_CLOEXEC /* open the fd with cloexec */
>
> > +
> > +struct ptrace_getfd_args {
> > + __u32 fd; /* the tracee's file descriptor to get */
> > + __u32 options;
>
> Nit and I'm not set on it at all but "flags" might just be better.
>
> > +} __attribute__((packed));
>
> What's the benefit in using __attribute__((packed)) here? Seems to me that:
>
1) Are we always to assume that the compiler will give us 4-byte
alignment (paranoia)
2) If we're to add new non-4-byte aligned members later on, is it
kosher to add packed
later on?
> +struct ptrace_getfd_args {
> + __u32 fd; /* the tracee's file descriptor to get */
> + __u32 options;
> +};
>
> would just work fine.
>
> > +
> > /*
> > * These values are stored in task->ptrace_message
> > * by tracehook_report_syscall_* to describe the current syscall-stop.
> > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > index cb9ddcc08119..8f619dceac6f 100644
> > --- a/kernel/ptrace.c
> > +++ b/kernel/ptrace.c
> > @@ -31,6 +31,7 @@
> > #include <linux/cn_proc.h>
> > #include <linux/compat.h>
> > #include <linux/sched/signal.h>
> > +#include <linux/fdtable.h>
> >
> > #include <asm/syscall.h> /* for syscall_get_* */
> >
> > @@ -994,6 +995,33 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
> > }
> > #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
> >
> > +static int ptrace_getfd(struct task_struct *child, unsigned long user_size,
> > + void __user *datavp)
> > +{
> > + struct ptrace_getfd_args args;
> > + unsigned int fd_flags = 0;
> > + struct file *file;
> > + int ret;
> > +
> > + ret = copy_struct_from_user(&args, sizeof(args), datavp, user_size);
> > + if (ret)
> > + goto out;
>
> Why is this goto out and not just return ret?
>
> > + if ((args.options & ~(PTRACE_GETFD_O_CLOEXEC)) != 0)
> > + return -EINVAL;
> > + if (args.options & PTRACE_GETFD_O_CLOEXEC)
> > + fd_flags &= O_CLOEXEC;
> > + file = get_task_file(child, args.fd);
> > + if (!file)
> > + return -EBADF;
> > + ret = get_unused_fd_flags(fd_flags);
>
> Why isn't that whole thing just:
>
> ret = get_unused_fd_flags(fd_flags & {PTRACE_GETFD_}O_CLOEXEC);
>
> > + if (ret >= 0)
> > + fd_install(ret, file);
> > + else
> > + fput(file);
> > +out:
> > + return ret;
> > +}
>
> So sm like:
>
> static int ptrace_getfd(struct task_struct *child, unsigned long user_size,
> void __user *datavp)
> {
> struct ptrace_getfd_args args;
> unsigned int fd_flags = 0;
> struct file *file;
> int ret;
>
> ret = copy_struct_from_user(&args, sizeof(args), datavp, user_size);
> if (ret)
> return ret;
>
> if ((args.options & ~(PTRACE_GETFD_O_CLOEXEC)) != 0)
> return -EINVAL;
>
> file = get_task_file(child, args.fd);
> if (!file)
> return -EBADF;
>
> /* PTRACE_GETFD_CLOEXEC == O_CLOEXEC */
> ret = get_unused_fd_flags(fd_flags & PTRACE_GETFD_O_CLOEXEC);
Wouldn't this always be 0, since fd_flags is always 0?
> if (ret >= 0)
> fd_install(ret, file);
> else
> fput(file);
>
> return ret;
> }
On Mon, Dec 09, 2019 at 02:55:40AM -0800, Sargun Dhillon wrote:
> On Mon, Dec 9, 2019 at 1:39 AM Christian Brauner
> <[email protected]> wrote:
> >
> > Hey Sargun,
> >
> > Thanks for the patch!
> Thanks for your review.
>
> >
> > Why not simply accept O_CLOEXEC as flag? If that's not possible for some
> > reason I'd say
> >
> I did this initially. My plan is to use this options field for other
> (future) things as well, like
> clearing (cgroup) metadata on sockets (assuming we figure out a safe
> way to do it).
> If we use O_CLOEXEC, it takes up an arbitrary bit which is different
> on different
> platforms, and working around that seems messy
>
> Another way around this would be to have two members. One member which
> is something
> like fdflags, that just takes the fd flags, like O_CLOEXEC, and then
> later on, we can add
> an options member to enable these future use cases.
That honestly sounds cleaner to me. I really don't like this flag
explosion for CLOEXEC. Every new kernel api we add that deals with fds
comes with their own set of CLOEXEC flags. The minimal thing to do is
to make sure that at least
NEW_CLOEXEC_THINGY == O_CLOEXEC.
>
> What do you think?
> > #define PTRACE_GETFD_O_CLOEXEC O_CLOEXEC /* open the fd with cloexec */
> >
> > is the right thing to do. This is fairly common:
> >
> > include/uapi/linux/timerfd.h:#define TFD_CLOEXEC O_CLOEXEC
> > include/uapi/drm/drm.h:#define DRM_CLOEXEC O_CLOEXEC
> > include/linux/userfaultfd_k.h:#define UFFD_CLOEXEC O_CLOEXEC
> > include/linux/eventfd.h:#define EFD_CLOEXEC O_CLOEXEC
> > include/uapi/linux/eventpoll.h:#define EPOLL_CLOEXEC O_CLOEXEC
> > include/uapi/linux/inotify.h:/* For O_CLOEXEC and O_NONBLOCK */
> > include/uapi/linux/inotify.h:#define IN_CLOEXEC O_CLOEXEC
> > include/uapi/linux/mount.h:#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
> >
> > You can also add a compile-time assert to ptrace like we did for
> > fs/namespace.c's OPEN_TREE_CLOEXEC:
> > BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
> >
> > And I'd remove the _O if you go with a separate flag, i.e.:
> >
> > #define PTRACE_GETFD_CLOEXEC O_CLOEXEC /* open the fd with cloexec */
> >
> > > +
> > > +struct ptrace_getfd_args {
> > > + __u32 fd; /* the tracee's file descriptor to get */
> > > + __u32 options;
> >
> > Nit and I'm not set on it at all but "flags" might just be better.
> >
> > > +} __attribute__((packed));
> >
> > What's the benefit in using __attribute__((packed)) here? Seems to me that:
> >
> 1) Are we always to assume that the compiler will give us 4-byte
> alignment (paranoia)
> 2) If we're to add new non-4-byte aligned members later on, is it
> kosher to add packed
> later on?
Using explicit padding is the more common way we do this (e.g. struct
open_how, struct statx and a lot more add explicit padding to guarantee
alignment.).
>
> > +struct ptrace_getfd_args {
> > + __u32 fd; /* the tracee's file descriptor to get */
> > + __u32 options;
> > +};
> >
> > would just work fine.
> >
> > > +
> > > /*
> > > * These values are stored in task->ptrace_message
> > > * by tracehook_report_syscall_* to describe the current syscall-stop.
> > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > > index cb9ddcc08119..8f619dceac6f 100644
> > > --- a/kernel/ptrace.c
> > > +++ b/kernel/ptrace.c
> > > @@ -31,6 +31,7 @@
> > > #include <linux/cn_proc.h>
> > > #include <linux/compat.h>
> > > #include <linux/sched/signal.h>
> > > +#include <linux/fdtable.h>
> > >
> > > #include <asm/syscall.h> /* for syscall_get_* */
> > >
> > > @@ -994,6 +995,33 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
> > > }
> > > #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
> > >
> > > +static int ptrace_getfd(struct task_struct *child, unsigned long user_size,
> > > + void __user *datavp)
> > > +{
> > > + struct ptrace_getfd_args args;
> > > + unsigned int fd_flags = 0;
> > > + struct file *file;
> > > + int ret;
> > > +
> > > + ret = copy_struct_from_user(&args, sizeof(args), datavp, user_size);
> > > + if (ret)
> > > + goto out;
> >
> > Why is this goto out and not just return ret?
> >
> > > + if ((args.options & ~(PTRACE_GETFD_O_CLOEXEC)) != 0)
> > > + return -EINVAL;
> > > + if (args.options & PTRACE_GETFD_O_CLOEXEC)
> > > + fd_flags &= O_CLOEXEC;
> > > + file = get_task_file(child, args.fd);
> > > + if (!file)
> > > + return -EBADF;
> > > + ret = get_unused_fd_flags(fd_flags);
> >
> > Why isn't that whole thing just:
> >
> > ret = get_unused_fd_flags(fd_flags & {PTRACE_GETFD_}O_CLOEXEC);
> >
> > > + if (ret >= 0)
> > > + fd_install(ret, file);
> > > + else
> > > + fput(file);
> > > +out:
> > > + return ret;
> > > +}
> >
> > So sm like:
> >
> > static int ptrace_getfd(struct task_struct *child, unsigned long user_size,
> > void __user *datavp)
> > {
> > struct ptrace_getfd_args args;
> > unsigned int fd_flags = 0;
> > struct file *file;
> > int ret;
> >
> > ret = copy_struct_from_user(&args, sizeof(args), datavp, user_size);
> > if (ret)
> > return ret;
> >
> > if ((args.options & ~(PTRACE_GETFD_O_CLOEXEC)) != 0)
> > return -EINVAL;
> >
> > file = get_task_file(child, args.fd);
> > if (!file)
> > return -EBADF;
> >
> > /* PTRACE_GETFD_CLOEXEC == O_CLOEXEC */
> > ret = get_unused_fd_flags(fd_flags & PTRACE_GETFD_O_CLOEXEC);
> Wouldn't this always be 0, since fd_flags is always 0?
You're right, I missed this. Maybe rather:
if (args.options & PTRACE_GETFD_O_CLOEXEC)
fd_flags |= O_CLOEXEC;
Another question is if we shouldn't just make them cloexec by default?
The notifier fds and pidfds already are.
Christian