LinuxLists.cc - [RFC][PATCH v3] procfs: Always expose /proc/<pid>/map

2015-02-12 02:29:47

Subject: [RFC][PATCH v3] procfs: Always expose /proc/<pid>/map_files/ and make it readable

Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and is
only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface is
useful for enumerating the files mapped into a process when the more
verbose information in /proc/<pid>/maps is not needed. It also allows
access to file descriptors for files that have been deleted and closed
but are still mmapped into a process, which can be very useful for
introspection and debugging.

This patch moves the folder out from behind CHECKPOINT_RESTORE, and
removes the CAP_SYS_ADMIN restrictions. With that change alone,
accessing this interface would have required PTRACE_MODE_READ like the
links in /proc/<pid>/fd/*.

However, a discussion on lkml concluded that MODE_READ is not
sufficient, both because write access to the inodes these links point
to allows direct modification of a process's address space, and
because it exposes files that users may have overlooked permissions on
because it was assumed they would be inaccessible (either deleted as
per above, or created via O_TMPFILE).

So, in addition to the above, this patch enforces PTRACE_MODE_ATTACH on
all the map_files/ operations. Since this is the same check that
determines if access to /proc/<pid>/mem is allowed, it will not allow an
attacker to do anything that was not already possible through that
interface.

Signed-off-by: Calvin Owens <[email protected]>
---
Changes in v3: Changed permission checks to use PTRACE_MODE_ATTACH
instead of PTRACE_MODE_READ, and added a stub to
enforce MODE_ATTACH on follow_link() as well.

Changes in v2: Removed the follow_link() stub that returned -EPERM if
the caller didn't have CAP_SYS_ADMIN, since the caller
in my chroot() scenario gets -EACCES anyway.

fs/proc/base.c | 59 ++++++++++++++++++++++++++++++++++++----------------------
1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3f3d7ae..1355a4d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1632,8 +1632,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}

-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1660,17 +1658,12 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;

- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = dentry->d_inode;
task = get_proc_task(inode);
if (!task)
goto out_notask;

- mm = mm_access(task, PTRACE_MODE_READ);
+ mm = mm_access(task, PTRACE_MODE_ATTACH);
if (IS_ERR_OR_NULL(mm))
goto out;

@@ -1753,6 +1746,39 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};

+/*
+ * Enforce stronger PTRACE_MODE_ATTACH permissions on the symlinks under
+ * /proc/<pid>/map_files, since these links may refer to deleted or O_TMPFILE
+ * files that users might assume are inaccessible regardless of their
+ * ownership/permissions.
+ */
+static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode *inode = dentry->d_inode;
+ struct task_struct *task;
+ int allowed = 0;
+
+ task = get_proc_task(inode);
+ if (task) {
+ allowed = ptrace_may_access(task, PTRACE_MODE_ATTACH);
+ put_task_struct(task);
+ }
+
+ if (!allowed)
+ return ERR_PTR(-EACCES);
+
+ return proc_pid_follow_link(dentry, nd);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_map_files_follow_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1768,7 +1794,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;

- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;

@@ -1792,17 +1818,13 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;

- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
goto out;

result = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH))
goto out_put_task;

result = -ENOENT;
@@ -1849,17 +1871,13 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;

- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
goto out;

ret = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH))
goto out_put_task;

ret = 0;
@@ -2040,7 +2058,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */

static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2537,9 +2554,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
--
1.8.1

2015-02-12 07:45:49

by Cyrill Gorcunov

[permalink] [raw]

Subject: Re: [RFC][PATCH v3] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Wed, Feb 11, 2015 at 06:29:10PM -0800, Calvin Owens wrote:
> Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and is
> only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface is
> useful for enumerating the files mapped into a process when the more
> verbose information in /proc/<pid>/maps is not needed. It also allows
> access to file descriptors for files that have been deleted and closed
> but are still mmapped into a process, which can be very useful for
> introspection and debugging.
...
>
> +/*
> + * Enforce stronger PTRACE_MODE_ATTACH permissions on the symlinks under
> + * /proc/<pid>/map_files, since these links may refer to deleted or O_TMPFILE
> + * files that users might assume are inaccessible regardless of their
> + * ownership/permissions.
> + */
> +static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
> +{
> + struct inode *inode = dentry->d_inode;
> + struct task_struct *task;
> + int allowed = 0;
> +
> + task = get_proc_task(inode);
> + if (task) {
> + allowed = ptrace_may_access(task, PTRACE_MODE_ATTACH);
> + put_task_struct(task);
> + }

else
return ERR_PTR(-ESRCH);

Other than that, looks good to me, thanks!

Rewieved-by: Cyrill Gorcunov <[email protected]>

2015-02-14 20:40:30

by Calvin Owens

[permalink] [raw]

Subject: [RFC][PATCH v4] procfs: Always expose /proc/<pid>/map_files/ and make it readable

Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface is
very useful for enumerating the files mapped into a process when the
more verbose information in /proc/<pid>/maps is not needed. It also
allows access to file descriptors for files that have been deleted and
closed but are still mmapped into a process, which can be very useful
for introspection and debugging.

This patch moves the folder out from behind CHECKPOINT_RESTORE, and
removes the CAP_SYS_ADMIN restrictions. With that change alone,
following the links would have required PTRACE_MODE_READ like the
links in /proc/<pid>/fd/*.

However, a discussion on lkml concluded that MODE_READ is not
sufficient, both because write access to the inodes these links point
to allows direct modification of a process's address space, and
because it exposes files that users may have overlooked permissions on
because it was assumed they would be inaccessible (either deleted as
per above, or created via O_TMPFILE).

So, in addition to the above, this patch enforces PTRACE_MODE_ATTACH on
all the map_files operations. Since this is the same check that
determines if access to /proc/<pid>/mem is allowed, it will not allow an
attacker to do anything that was not already possible through that
interface.

Signed-off-by: Calvin Owens <[email protected]>
---
Changes in v4: Return -ESRCH from follow_link() when get_proc_task()
returns NULL.

Changes in v3: Changed permission checks to use PTRACE_MODE_ATTACH
instead of PTRACE_MODE_READ, and added a stub to
enforce MODE_ATTACH on follow_link() as well.

Changes in v2: Removed the follow_link() stub that returned -EPERM if
the caller didn't have CAP_SYS_ADMIN, since the caller
in my chroot() scenario gets -EACCES anyway.

fs/proc/base.c | 61 +++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3f3d7ae..b918692 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1632,8 +1632,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}

-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1660,17 +1658,12 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;

- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = dentry->d_inode;
task = get_proc_task(inode);
if (!task)
goto out_notask;

- mm = mm_access(task, PTRACE_MODE_READ);
+ mm = mm_access(task, PTRACE_MODE_ATTACH);
if (IS_ERR_OR_NULL(mm))
goto out;

@@ -1753,6 +1746,41 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};

+/*
+ * Enforce stronger PTRACE_MODE_ATTACH permissions on the symlinks under
+ * /proc/<pid>/map_files, since these links may refer to deleted or O_TMPFILE
+ * files that users might assume are inaccessible regardless of their
+ * ownership/permissions.
+ */
+static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode *inode = dentry->d_inode;
+ struct task_struct *task;
+ int allowed = 0;
+
+ task = get_proc_task(inode);
+ if (task) {
+ allowed = ptrace_may_access(task, PTRACE_MODE_ATTACH);
+ put_task_struct(task);
+ } else {
+ return ERR_PTR(-ESRCH);
+ }
+
+ if (!allowed)
+ return ERR_PTR(-EACCES);
+
+ return proc_pid_follow_link(dentry, nd);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_map_files_follow_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1768,7 +1796,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;

- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;

@@ -1792,17 +1820,13 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;

- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
goto out;

result = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH))
goto out_put_task;

result = -ENOENT;
@@ -1849,17 +1873,13 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;

- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
goto out;

ret = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH))
goto out_put_task;

ret = 0;
@@ -2040,7 +2060,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */

static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2537,9 +2556,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
--
1.8.1

2015-02-14 20:45:18

by Calvin Owens

[permalink] [raw]

Subject: [PATCH] procfs: Return -ESRCH on /proc/N/fd/* when PID N doesn't exist

Currently, readlink() and follow_link() for the symbolic links in
/proc/<pid>/fd/* will return -EACCES in the case where looking up the
task finds that it does not exist.

This patch inlines the logic from proc_fd_access_allowed() into these
two functions such that they will return -ESRCH if the lookup in /proc
races with the task exiting. Since those were the only two callers of
that helper function, it also removes it.

Signed-off-by: Calvin Owens <[email protected]>
---
fs/proc/base.c | 47 ++++++++++++++++++++++++++---------------------
1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3f3d7ae..308fcbd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -485,23 +485,6 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
/* Here the fs part begins */
/************************************************************************/

-/* permission checks */
-static int proc_fd_access_allowed(struct inode *inode)
-{
- struct task_struct *task;
- int allowed = 0;
- /* Allow access to a task's file descriptors if it is us or we
- * may use ptrace attach to the process and find out that
- * information.
- */
- task = get_proc_task(inode);
- if (task) {
- allowed = ptrace_may_access(task, PTRACE_MODE_READ);
- put_task_struct(task);
- }
- return allowed;
-}
-
int proc_setattr(struct dentry *dentry, struct iattr *attr)
{
int error;
@@ -1375,10 +1358,21 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
struct path path;
- int error = -EACCES;
+ int error = -ESRCH;
+ int allowed = 0;
+ struct task_struct *task;

/* Are we allowed to snoop on the tasks file descriptors? */
- if (!proc_fd_access_allowed(inode))
+ task = get_proc_task(inode);
+ if (task) {
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+ put_task_struct(task);
+ } else {
+ goto out;
+ }
+
+ error = -EACCES;
+ if (!allowed)
goto out;

error = PROC_I(inode)->op.proc_get_link(dentry, &path);
@@ -1417,12 +1411,23 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)

static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
{
- int error = -EACCES;
+ int error = -ESRCH;
+ int allowed = 0;
+ struct task_struct *task;
struct inode *inode = dentry->d_inode;
struct path path;

/* Are we allowed to snoop on the tasks file descriptors? */
- if (!proc_fd_access_allowed(inode))
+ task = get_proc_task(inode);
+ if (task) {
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+ put_task_struct(task);
+ } else {
+ goto out;
+ }
+
+ error = -EACCES;
+ if (!allowed)
goto out;

error = PROC_I(inode)->op.proc_get_link(dentry, &path);
--
1.8.1

2015-04-28 22:24:19

by Calvin Owens

[permalink] [raw]

Subject: Re: [RFC][PATCH v4] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Wednesday 03/11 at 01:17 +0300, Cyrill Gorcunov wrote:
> On Sat, Feb 14, 2015 at 12:40:09PM -0800, Calvin Owens wrote:
> > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> > is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface is
> > very useful for enumerating the files mapped into a process when the
> > more verbose information in /proc/<pid>/maps is not needed. It also
> > allows access to file descriptors for files that have been deleted and
> > closed but are still mmapped into a process, which can be very useful
> > for introspection and debugging.
>
> Guys, I'm really-really sorry for not replying the email that long.
> If I understand correctly all concerns were addressed, right?

Ping!

I thought everybody was happy after the permission check was changed
to be PTRACE_MODE_ATTACH. But I'll resend one more time if you prefer.

Thanks,
Calvin

2015-04-29 07:32:59

by Cyrill Gorcunov

[permalink] [raw]

Subject: Re: [RFC][PATCH v4] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Tue, Apr 28, 2015 at 03:23:53PM -0700, Calvin Owens wrote:
> >
> > Guys, I'm really-really sorry for not replying the email that long.
> > If I understand correctly all concerns were addressed, right?
>
> Ping!
>
> I thought everybody was happy after the permission check was changed
> to be PTRACE_MODE_ATTACH. But I'll resend one more time if you prefer.

Yes please. I thought that PTRACE_MODE_ATTACH suits all as well, but
lets give it another review shot.

2015-05-19 03:10:18

by Calvin Owens

[permalink] [raw]

Subject: [PATCH v5] procfs: Always expose /proc/<pid>/map_files/ and make it readable

Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface is
very useful for enumerating the files mapped into a process when the
more verbose information in /proc/<pid>/maps is not needed. It also
allows access to file descriptors for files that have been deleted and
closed but are still mmapped into a process, which can be very useful
for introspection and debugging.

This patch moves the folder out from behind CHECKPOINT_RESTORE, and
removes the CAP_SYS_ADMIN restrictions. With that change alone,
following the links would have required PTRACE_MODE_READ like the
links in /proc/<pid>/fd/*.

However, a discussion on lkml concluded that MODE_READ is not
sufficient, both because write access to the inodes these links point
to allows direct modification of a process's address space, and
because it exposes files that users may have overlooked permissions on
because it was assumed they would be inaccessible (either deleted as
per above, or created via O_TMPFILE).

So, in addition to the above, this patch enforces PTRACE_MODE_ATTACH on
all the map_files operations. Since this is the same check that
determines if access to /proc/<pid>/mem is allowed, it will not allow an
attacker to do anything that was not already possible through that
interface.

Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Reviewed-by: Cyrill Gorcunov <[email protected]>
Signed-off-by: Calvin Owens <[email protected]>
---
Changes in v5: s/dentry->d_inode/d_inode(dentry)/g

Changes in v4: Return -ESRCH from follow_link() when get_proc_task()
returns NULL.

Changes in v3: Changed permission checks to use PTRACE_MODE_ATTACH
instead of PTRACE_MODE_READ, and added a stub to
enforce MODE_ATTACH on follow_link() as well.

Changes in v2: Removed the follow_link() stub that returned -EPERM if
the caller didn't have CAP_SYS_ADMIN, since the caller
in my chroot() scenario gets -EACCES anyway.

fs/proc/base.c | 61 +++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 093ca14..22d95a7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1641,8 +1641,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}

-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1669,17 +1667,12 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;

- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = d_inode(dentry);
task = get_proc_task(inode);
if (!task)
goto out_notask;

- mm = mm_access(task, PTRACE_MODE_READ);
+ mm = mm_access(task, PTRACE_MODE_ATTACH);
if (IS_ERR_OR_NULL(mm))
goto out;

@@ -1762,6 +1755,41 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};

+/*
+ * Enforce stronger PTRACE_MODE_ATTACH permissions on the symlinks under
+ * /proc/<pid>/map_files, since these links may refer to deleted or O_TMPFILE
+ * files that users might assume are inaccessible regardless of their
+ * ownership/permissions.
+ */
+static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode *inode = d_inode(dentry);
+ struct task_struct *task;
+ int allowed = 0;
+
+ task = get_proc_task(inode);
+ if (task) {
+ allowed = ptrace_may_access(task, PTRACE_MODE_ATTACH);
+ put_task_struct(task);
+ } else {
+ return ERR_PTR(-ESRCH);
+ }
+
+ if (!allowed)
+ return ERR_PTR(-EACCES);
+
+ return proc_pid_follow_link(dentry, nd);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_map_files_follow_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1777,7 +1805,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;

- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;

@@ -1801,17 +1829,13 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;

- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
goto out;

result = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH))
goto out_put_task;

result = -ENOENT;
@@ -1858,17 +1882,13 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;

- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
goto out;

ret = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH))
goto out_put_task;

ret = 0;
@@ -2050,7 +2070,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */

static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2549,9 +2568,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
--
1.8.1

2015-05-19 03:29:25

by Joe Perches

[permalink] [raw]

Subject: Re: [PATCH v5] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Mon, 2015-05-18 at 20:10 -0700, Calvin Owens wrote:
> Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface is
> very useful for enumerating the files mapped into a process when the
> more verbose information in /proc/<pid>/maps is not needed. It also
> allows access to file descriptors for files that have been deleted and
> closed but are still mmapped into a process, which can be very useful
> for introspection and debugging.

style trivia:

> diff --git a/fs/proc/base.c b/fs/proc/base.c
[]
> +/*
> + * Enforce stronger PTRACE_MODE_ATTACH permissions on the symlinks under
> + * /proc/<pid>/map_files, since these links may refer to deleted or O_TMPFILE
> + * files that users might assume are inaccessible regardless of their
> + * ownership/permissions.
> + */
> +static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
> +{
> + struct inode *inode = d_inode(dentry);
> + struct task_struct *task;
> + int allowed = 0;
> +
> + task = get_proc_task(inode);
> + if (task) {
> + allowed = ptrace_may_access(task, PTRACE_MODE_ATTACH);
> + put_task_struct(task);
> + } else {
> + return ERR_PTR(-ESRCH);
> + }
> +
> + if (!allowed)
> + return ERR_PTR(-EACCES);
> +
> + return proc_pid_follow_link(dentry, nd);
> +}

It'd perhaps be clearer to read this with an
immediate return after a failure in get_proc_task.

Maybe something like (move initializations as desired):

static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
{
int allowed;
struct iode *inode = d_inode(dentry);
struct task_struct task = get_proc_task(inode);

if (!task)
return ERR_PTR(-ESRCH);

allowed = ptrace_may_access(task, PTRACE_MODE_ATTACH);

put_task_struct(task);

if (!allowed)
return ERR_PTR(-EACCES);

return proc_pic_follow_link(dentry, nd);
}

2015-05-19 18:05:02

by Andy Lutomirski

[permalink] [raw]

Subject: Re: [PATCH v5] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Mon, May 18, 2015 at 8:10 PM, Calvin Owens <[email protected]> wrote:
> Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface is
> very useful for enumerating the files mapped into a process when the
> more verbose information in /proc/<pid>/maps is not needed. It also
> allows access to file descriptors for files that have been deleted and
> closed but are still mmapped into a process, which can be very useful
> for introspection and debugging.
>
> This patch moves the folder out from behind CHECKPOINT_RESTORE, and

I'm fine with this.

> removes the CAP_SYS_ADMIN restrictions. With that change alone,
> following the links would have required PTRACE_MODE_READ like the
> links in /proc/<pid>/fd/*.

I'm still not at all convinced that this is safe. Here are a few ways
that it could have unintended consequences:

1. Mmap a dma-buf and then open /proc/self/map_files/addr. You get an
fd pointing at a different inode than you mapped. (kdbus would have
the same problem if it were merged.)

2. Open a file with O_RDONLY, mmap it with PROT_READ, close the file,
then open /proc/self/map_files/addr with O_RDWR. I don't see anything
preventing that from succeeding.

3. Open a file, mmap it, close the fd, chroot, drop privileges, open
/proc/self/map_files/addr, then call ftruncate.

So NAK as-is, I think.

Fixing #1 would involve changing the way mmap works, I think. Fixing
#2 would require similar infrastructure to what we'd need to fix the
existing /proc/pid/fd mode holes. I have no clue how to even approach
fixing #3.

What's the use case of this patch?

--Andy

2015-05-21 01:52:25

by Calvin Owens

[permalink] [raw]

Subject: Re: [PATCH v5] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Tuesday 05/19 at 11:04 -0700, Andy Lutomirski wrote:
> On Mon, May 18, 2015 at 8:10 PM, Calvin Owens <[email protected]> wrote:
> > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> > is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface is
> > very useful for enumerating the files mapped into a process when the
> > more verbose information in /proc/<pid>/maps is not needed. It also
> > allows access to file descriptors for files that have been deleted and
> > closed but are still mmapped into a process, which can be very useful
> > for introspection and debugging.
> >
> > This patch moves the folder out from behind CHECKPOINT_RESTORE, and
>
> I'm fine with this.
>
> > removes the CAP_SYS_ADMIN restrictions. With that change alone,
> > following the links would have required PTRACE_MODE_READ like the
> > links in /proc/<pid>/fd/*.
>
> I'm still not at all convinced that this is safe. Here are a few ways
> that it could have unintended consequences:
>
> 1. Mmap a dma-buf and then open /proc/self/map_files/addr. You get an
> fd pointing at a different inode than you mapped. (kdbus would have
> the same problem if it were merged.)
>
> 2. Open a file with O_RDONLY, mmap it with PROT_READ, close the file,
> then open /proc/self/map_files/addr with O_RDWR. I don't see anything
> preventing that from succeeding.

Hmm, that's a good point: it lets you bypass the permission checks on
all the path components you would normally walk through to get to the
file. But it still only works if you actually have permission to open
the file in question for writing.

Also, this is already how the /proc/N/fd/* symlinks work, isn't it?

> 3. Open a file, mmap it, close the fd, chroot, drop privileges, open
> /proc/self/map_files/addr, then call ftruncate.

This doesn't work unless the privileges you dropped to actually allow
you to open the mmapped file for writing. It's really the same
fundamental problem as (2), where you're allowing direct access to a
file without trying to walk the path down to it, right?

> So NAK as-is, I think.

Limiting ->follow_link() to CAP_SYS_ADMIN wouldn't affect anything I
imagine using this interface for (see below), so I have no problem with
putting that back in. I think that would alleviate all your concerns
above, right?

(That said, I don't think it makes sense to limit readdir() or
readlink() on map_files/* to CAP_SYS_ADMIN, since that alone is a subset
of what you can get from /proc/N/maps.)

> Fixing #1 would involve changing the way mmap works, I think. Fixing
> #2 would require similar infrastructure to what we'd need to fix the
> existing /proc/pid/fd mode holes. I have no clue how to even approach
> fixing #3.
>
> What's the use case of this patch?

The biggest use case: it enables you to stat() files that have been
deleted but are still mapped by some process.

This enables a much quicker and more accurate answer to the question
"How much disk space is being consumed by files that are deleted but
still mapped?" than is currently possible.

It also allows you to know how much space a specific mapped-but-deleted
file is using on a specific filesystem, which is currently impossible
from userspace AFAIK.

Thanks,
Calvin

> --Andy

2015-05-21 02:10:45

by Andy Lutomirski

[permalink] [raw]

Subject: Re: [PATCH v5] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Wed, May 20, 2015 at 6:52 PM, Calvin Owens <[email protected]> wrote:
> On Tuesday 05/19 at 11:04 -0700, Andy Lutomirski wrote:
>> On Mon, May 18, 2015 at 8:10 PM, Calvin Owens <[email protected]> wrote:
>> > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
>> > is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface is
>> > very useful for enumerating the files mapped into a process when the
>> > more verbose information in /proc/<pid>/maps is not needed. It also
>> > allows access to file descriptors for files that have been deleted and
>> > closed but are still mmapped into a process, which can be very useful
>> > for introspection and debugging.
>> >
>> > This patch moves the folder out from behind CHECKPOINT_RESTORE, and
>>
>> I'm fine with this.
>>
>> > removes the CAP_SYS_ADMIN restrictions. With that change alone,
>> > following the links would have required PTRACE_MODE_READ like the
>> > links in /proc/<pid>/fd/*.
>>
>> I'm still not at all convinced that this is safe. Here are a few ways
>> that it could have unintended consequences:
>>
>> 1. Mmap a dma-buf and then open /proc/self/map_files/addr. You get an
>> fd pointing at a different inode than you mapped. (kdbus would have
>> the same problem if it were merged.)
>>
>> 2. Open a file with O_RDONLY, mmap it with PROT_READ, close the file,
>> then open /proc/self/map_files/addr with O_RDWR. I don't see anything
>> preventing that from succeeding.
>
> Hmm, that's a good point: it lets you bypass the permission checks on
> all the path components you would normally walk through to get to the
> file. But it still only works if you actually have permission to open
> the file in question for writing.

But you might not still have that permission.

>
> Also, this is already how the /proc/N/fd/* symlinks work, isn't it?

Yes, but only for files that are open. Also, I hope to fix that some day.

>
>> 3. Open a file, mmap it, close the fd, chroot, drop privileges, open
>> /proc/self/map_files/addr, then call ftruncate.
>
> This doesn't work unless the privileges you dropped to actually allow
> you to open the mmapped file for writing. It's really the same
> fundamental problem as (2), where you're allowing direct access to a
> file without trying to walk the path down to it, right?

Yes, although I can imagine this actually happening. Also, there's issue #1.

>
>> So NAK as-is, I think.
>
> Limiting ->follow_link() to CAP_SYS_ADMIN wouldn't affect anything I
> imagine using this interface for (see below), so I have no problem with
> putting that back in. I think that would alleviate all your concerns
> above, right?

I think so. You could still maybe do awful things due to #1, but at
least you'd have to be privileged.

>
> (That said, I don't think it makes sense to limit readdir() or
> readlink() on map_files/* to CAP_SYS_ADMIN, since that alone is a subset
> of what you can get from /proc/N/maps.)

Agreed.

>
>> Fixing #1 would involve changing the way mmap works, I think. Fixing
>> #2 would require similar infrastructure to what we'd need to fix the
>> existing /proc/pid/fd mode holes. I have no clue how to even approach
>> fixing #3.
>>
>> What's the use case of this patch?
>
> The biggest use case: it enables you to stat() files that have been
> deleted but are still mapped by some process.
>
> This enables a much quicker and more accurate answer to the question
> "How much disk space is being consumed by files that are deleted but
> still mapped?" than is currently possible.
>
> It also allows you to know how much space a specific mapped-but-deleted
> file is using on a specific filesystem, which is currently impossible
> from userspace AFAIK.

Seems reasonable.

It might be nice to have a general interface for enumerating
deleted-but-still-in-use files on a filesystem some day, too.

--Andy

2015-06-09 03:40:24

by Calvin Owens

[permalink] [raw]

Subject: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
is only exposed if CONFIG_CHECKPOINT_RESTORE is set.

This interface very useful because it allows userspace to stat()
deleted files that are still mapped by some process, which enables a
much quicker and more accurate answer to the question "How much disk
space is being consumed by files that are deleted but still mapped?"
than is currently possible.

This patch moves map_files/ out from behind CONFIG_CHECKPOINT_RESTORE,
and adjusts the permissions enforced on it as follows:

* proc_map_files_lookup()
* proc_map_files_readdir()
* map_files_d_revalidate()

Remove the CAP_SYS_ADMIN restriction, leaving only the current
restriction requiring PTRACE_MODE_READ.

In earlier versions of this patch, I changed the ptrace checks
in the functions above to enforce MODE_ATTACH instead of
MODE_READ. That was an oversight: all the information exposed
by the above three functions is already available with
MODE_READ from /proc/PID/maps. I was only being asked to
strengthen the protection around functionality provided by
follow_link(), not the above.

So, I've left the checks for MODE_READ as-is, since AFAICS all
objections raised so far are addressed by the new CAP_SYS_ADMIN
check in follow_link(), explained below.

* proc_map_files_follow_link()

This stub has been added, and requires that the user have
CAP_SYS_ADMIN in order to follow the links in map_files/,
since there was concern on LKML both about the potential for
bypassing permissions on ancestor directories in the path to
files pointed to, and about what happens with more exotic
memory mappings created by some drivers (ie dma-buf).

Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Cc: Joe Perches <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Signed-off-by: Calvin Owens <[email protected]>
---
Changes in v6: Require CAP_SYS_ADMIN for follow_link(). Leave other
PTRACE_MODE_READ checks as-is, since CAP_SYS_ADMIN
alone addresses all concerns raised AFAICS.

Changes in v5: s/dentry->d_inode/d_inode(dentry)/g

Changes in v4: Return -ESRCH from follow_link() when get_proc_task()
returns NULL.

Changes in v3: Changed permission checks to use PTRACE_MODE_ATTACH
instead of PTRACE_MODE_READ, and added a stub to
enforce MODE_ATTACH on follow_link() as well.

Changes in v2: Removed the follow_link() stub that returned -EPERM if
the caller didn't have CAP_SYS_ADMIN, since the caller
in my chroot() scenario gets -EACCES anyway.

fs/proc/base.c | 42 +++++++++++++++++++++++-------------------
1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 093ca14..0270191 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1641,8 +1641,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}

-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1669,11 +1667,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;

- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = d_inode(dentry);
task = get_proc_task(inode);
if (!task)
@@ -1762,6 +1755,28 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};

+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return proc_pid_follow_link(dentry, nd);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_map_files_follow_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1777,7 +1792,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;

- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;

@@ -1801,10 +1816,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;

- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
@@ -1858,10 +1869,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;

- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2050,7 +2057,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */

static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2549,9 +2555,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
--
1.8.1

2015-06-09 17:27:46

by Kees Cook

[permalink] [raw]

Subject: Re: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Mon, Jun 8, 2015 at 8:39 PM, Calvin Owens <[email protected]> wrote:
> Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
>
> This interface very useful because it allows userspace to stat()
> deleted files that are still mapped by some process, which enables a
> much quicker and more accurate answer to the question "How much disk
> space is being consumed by files that are deleted but still mapped?"
> than is currently possible.
>
> This patch moves map_files/ out from behind CONFIG_CHECKPOINT_RESTORE,
> and adjusts the permissions enforced on it as follows:
>
> * proc_map_files_lookup()
> * proc_map_files_readdir()
> * map_files_d_revalidate()
>
> Remove the CAP_SYS_ADMIN restriction, leaving only the current
> restriction requiring PTRACE_MODE_READ.
>
> In earlier versions of this patch, I changed the ptrace checks
> in the functions above to enforce MODE_ATTACH instead of
> MODE_READ. That was an oversight: all the information exposed
> by the above three functions is already available with
> MODE_READ from /proc/PID/maps. I was only being asked to
> strengthen the protection around functionality provided by
> follow_link(), not the above.
>
> So, I've left the checks for MODE_READ as-is, since AFAICS all
> objections raised so far are addressed by the new CAP_SYS_ADMIN
> check in follow_link(), explained below.
>
> * proc_map_files_follow_link()
>
> This stub has been added, and requires that the user have
> CAP_SYS_ADMIN in order to follow the links in map_files/,
> since there was concern on LKML both about the potential for
> bypassing permissions on ancestor directories in the path to
> files pointed to, and about what happens with more exotic
> memory mappings created by some drivers (ie dma-buf).
>
> Cc: Andrew Morton <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Cyrill Gorcunov <[email protected]>
> Cc: Joe Perches <[email protected]>
> Cc: Kees Cook <[email protected]>
> Cc: Kirill A. Shutemov <[email protected]>
> Signed-off-by: Calvin Owens <[email protected]>
> ---
> Changes in v6: Require CAP_SYS_ADMIN for follow_link(). Leave other
> PTRACE_MODE_READ checks as-is, since CAP_SYS_ADMIN
> alone addresses all concerns raised AFAICS.
>
> Changes in v5: s/dentry->d_inode/d_inode(dentry)/g
>
> Changes in v4: Return -ESRCH from follow_link() when get_proc_task()
> returns NULL.
>
> Changes in v3: Changed permission checks to use PTRACE_MODE_ATTACH
> instead of PTRACE_MODE_READ, and added a stub to
> enforce MODE_ATTACH on follow_link() as well.
>
> Changes in v2: Removed the follow_link() stub that returned -EPERM if
> the caller didn't have CAP_SYS_ADMIN, since the caller
> in my chroot() scenario gets -EACCES anyway.
>
> fs/proc/base.c | 42 +++++++++++++++++++++++-------------------
> 1 file changed, 23 insertions(+), 19 deletions(-)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 093ca14..0270191 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1641,8 +1641,6 @@ end_instantiate:
> return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
> }
>
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> -
> /*
> * dname_to_vma_addr - maps a dentry name into two unsigned longs
> * which represent vma start and end addresses.
> @@ -1669,11 +1667,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
> if (flags & LOOKUP_RCU)
> return -ECHILD;
>
> - if (!capable(CAP_SYS_ADMIN)) {
> - status = -EPERM;
> - goto out_notask;
> - }
> -
> inode = d_inode(dentry);
> task = get_proc_task(inode);
> if (!task)
> @@ -1762,6 +1755,28 @@ struct map_files_info {
> unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
> };
>
> +/*
> + * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
> + * symlinks may be used to bypass permissions on ancestor directories in the
> + * path to the file in question.
> + */

Cool, I think this looks good. Thanks!

Reviewed-by: Kees Cook <[email protected]>

-Kees

> +static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
> +{
> + if (!capable(CAP_SYS_ADMIN))
> + return ERR_PTR(-EPERM);
> +
> + return proc_pid_follow_link(dentry, nd);
> +}
> +
> +/*
> + * Identical to proc_pid_link_inode_operations except for follow_link()
> + */
> +static const struct inode_operations proc_map_files_link_inode_operations = {
> + .readlink = proc_pid_readlink,
> + .follow_link = proc_map_files_follow_link,
> + .setattr = proc_setattr,
> +};
> +
> static int
> proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
> struct task_struct *task, const void *ptr)
> @@ -1777,7 +1792,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
> ei = PROC_I(inode);
> ei->op.proc_get_link = proc_map_files_get_link;
>
> - inode->i_op = &proc_pid_link_inode_operations;
> + inode->i_op = &proc_map_files_link_inode_operations;
> inode->i_size = 64;
> inode->i_mode = S_IFLNK;
>
> @@ -1801,10 +1816,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
> int result;
> struct mm_struct *mm;
>
> - result = -EPERM;
> - if (!capable(CAP_SYS_ADMIN))
> - goto out;
> -
> result = -ENOENT;
> task = get_proc_task(dir);
> if (!task)
> @@ -1858,10 +1869,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
> struct map_files_info *p;
> int ret;
>
> - ret = -EPERM;
> - if (!capable(CAP_SYS_ADMIN))
> - goto out;
> -
> ret = -ENOENT;
> task = get_proc_task(file_inode(file));
> if (!task)
> @@ -2050,7 +2057,6 @@ static const struct file_operations proc_timers_operations = {
> .llseek = seq_lseek,
> .release = seq_release_private,
> };
> -#endif /* CONFIG_CHECKPOINT_RESTORE */
>
> static int proc_pident_instantiate(struct inode *dir,
> struct dentry *dentry, struct task_struct *task, const void *ptr)
> @@ -2549,9 +2555,7 @@ static const struct inode_operations proc_task_inode_operations;
> static const struct pid_entry tgid_base_stuff[] = {
> DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
> DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
> -#endif
> DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
> DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
> #ifdef CONFIG_NET
> --
> 1.8.1
>

--
Kees Cook
Chrome OS Security

2015-06-09 18:09:46

by Andy Lutomirski

[permalink] [raw]

Subject: Re: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Tue, Jun 9, 2015 at 10:27 AM, Kees Cook <[email protected]> wrote:
> On Mon, Jun 8, 2015 at 8:39 PM, Calvin Owens <[email protected]> wrote:
>> Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
>> is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
>>
>> This interface very useful because it allows userspace to stat()
>> deleted files that are still mapped by some process, which enables a
>> much quicker and more accurate answer to the question "How much disk
>> space is being consumed by files that are deleted but still mapped?"
>> than is currently possible.
>>
>> This patch moves map_files/ out from behind CONFIG_CHECKPOINT_RESTORE,
>> and adjusts the permissions enforced on it as follows:
>>
>> * proc_map_files_lookup()
>> * proc_map_files_readdir()
>> * map_files_d_revalidate()
>>
>> Remove the CAP_SYS_ADMIN restriction, leaving only the current
>> restriction requiring PTRACE_MODE_READ.
>>
>> In earlier versions of this patch, I changed the ptrace checks
>> in the functions above to enforce MODE_ATTACH instead of
>> MODE_READ. That was an oversight: all the information exposed
>> by the above three functions is already available with
>> MODE_READ from /proc/PID/maps. I was only being asked to
>> strengthen the protection around functionality provided by
>> follow_link(), not the above.
>>
>> So, I've left the checks for MODE_READ as-is, since AFAICS all
>> objections raised so far are addressed by the new CAP_SYS_ADMIN
>> check in follow_link(), explained below.
>>
>> * proc_map_files_follow_link()
>>
>> This stub has been added, and requires that the user have
>> CAP_SYS_ADMIN in order to follow the links in map_files/,
>> since there was concern on LKML both about the potential for
>> bypassing permissions on ancestor directories in the path to
>> files pointed to, and about what happens with more exotic
>> memory mappings created by some drivers (ie dma-buf).
>>
>> Cc: Andrew Morton <[email protected]>
>> Cc: Andy Lutomirski <[email protected]>
>> Cc: Cyrill Gorcunov <[email protected]>
>> Cc: Joe Perches <[email protected]>
>> Cc: Kees Cook <[email protected]>
>> Cc: Kirill A. Shutemov <[email protected]>
>> Signed-off-by: Calvin Owens <[email protected]>
>> ---
>> Changes in v6: Require CAP_SYS_ADMIN for follow_link(). Leave other
>> PTRACE_MODE_READ checks as-is, since CAP_SYS_ADMIN
>> alone addresses all concerns raised AFAICS.
>>
>> Changes in v5: s/dentry->d_inode/d_inode(dentry)/g
>>
>> Changes in v4: Return -ESRCH from follow_link() when get_proc_task()
>> returns NULL.
>>
>> Changes in v3: Changed permission checks to use PTRACE_MODE_ATTACH
>> instead of PTRACE_MODE_READ, and added a stub to
>> enforce MODE_ATTACH on follow_link() as well.
>>
>> Changes in v2: Removed the follow_link() stub that returned -EPERM if
>> the caller didn't have CAP_SYS_ADMIN, since the caller
>> in my chroot() scenario gets -EACCES anyway.
>>
>> fs/proc/base.c | 42 +++++++++++++++++++++++-------------------
>> 1 file changed, 23 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index 093ca14..0270191 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -1641,8 +1641,6 @@ end_instantiate:
>> return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
>> }
>>
>> -#ifdef CONFIG_CHECKPOINT_RESTORE
>> -
>> /*
>> * dname_to_vma_addr - maps a dentry name into two unsigned longs
>> * which represent vma start and end addresses.
>> @@ -1669,11 +1667,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
>> if (flags & LOOKUP_RCU)
>> return -ECHILD;
>>
>> - if (!capable(CAP_SYS_ADMIN)) {
>> - status = -EPERM;
>> - goto out_notask;
>> - }
>> -
>> inode = d_inode(dentry);
>> task = get_proc_task(inode);
>> if (!task)
>> @@ -1762,6 +1755,28 @@ struct map_files_info {
>> unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
>> };
>>
>> +/*
>> + * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
>> + * symlinks may be used to bypass permissions on ancestor directories in the
>> + * path to the file in question.
>> + */
>
> Cool, I think this looks good. Thanks!
>
> Reviewed-by: Kees Cook <[email protected]>
>

Looks good to me, too.

--Andy

> -Kees
>
>> +static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
>> +{
>> + if (!capable(CAP_SYS_ADMIN))
>> + return ERR_PTR(-EPERM);
>> +
>> + return proc_pid_follow_link(dentry, nd);
>> +}
>> +
>> +/*
>> + * Identical to proc_pid_link_inode_operations except for follow_link()
>> + */
>> +static const struct inode_operations proc_map_files_link_inode_operations = {
>> + .readlink = proc_pid_readlink,
>> + .follow_link = proc_map_files_follow_link,
>> + .setattr = proc_setattr,
>> +};
>> +
>> static int
>> proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
>> struct task_struct *task, const void *ptr)
>> @@ -1777,7 +1792,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
>> ei = PROC_I(inode);
>> ei->op.proc_get_link = proc_map_files_get_link;
>>
>> - inode->i_op = &proc_pid_link_inode_operations;
>> + inode->i_op = &proc_map_files_link_inode_operations;
>> inode->i_size = 64;
>> inode->i_mode = S_IFLNK;
>>
>> @@ -1801,10 +1816,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
>> int result;
>> struct mm_struct *mm;
>>
>> - result = -EPERM;
>> - if (!capable(CAP_SYS_ADMIN))
>> - goto out;
>> -
>> result = -ENOENT;
>> task = get_proc_task(dir);
>> if (!task)
>> @@ -1858,10 +1869,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
>> struct map_files_info *p;
>> int ret;
>>
>> - ret = -EPERM;
>> - if (!capable(CAP_SYS_ADMIN))
>> - goto out;
>> -
>> ret = -ENOENT;
>> task = get_proc_task(file_inode(file));
>> if (!task)
>> @@ -2050,7 +2057,6 @@ static const struct file_operations proc_timers_operations = {
>> .llseek = seq_lseek,
>> .release = seq_release_private,
>> };
>> -#endif /* CONFIG_CHECKPOINT_RESTORE */
>>
>> static int proc_pident_instantiate(struct inode *dir,
>> struct dentry *dentry, struct task_struct *task, const void *ptr)
>> @@ -2549,9 +2555,7 @@ static const struct inode_operations proc_task_inode_operations;
>> static const struct pid_entry tgid_base_stuff[] = {
>> DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
>> DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
>> -#ifdef CONFIG_CHECKPOINT_RESTORE
>> DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
>> -#endif
>> DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
>> DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
>> #ifdef CONFIG_NET
>> --
>> 1.8.1
>>
>
>
>
> --
> Kees Cook
> Chrome OS Security

--
Andy Lutomirski
AMA Capital Management, LLC

2015-06-09 18:39:39

by Cyrill Gorcunov

[permalink] [raw]

Subject: Re: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Tue, Jun 09, 2015 at 10:47:31AM -0700, Andy Lutomirski wrote:
...
> >
> > Cool, I think this looks good. Thanks!
> >
> > Reviewed-by: Kees Cook <[email protected]>
> >
>
> Looks good to me, too.

Wow! Great job, Calvin, thanks!

2015-06-09 21:13:12

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Mon, 8 Jun 2015 20:39:33 -0700 Calvin Owens <[email protected]> wrote:

> Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
>
> This interface very useful because it allows userspace to stat()
> deleted files that are still mapped by some process, which enables a
> much quicker and more accurate answer to the question "How much disk
> space is being consumed by files that are deleted but still mapped?"
> than is currently possible.

Why is that information useful?

I could perhaps think of some use for "How much disk space is being
consumed by files that are deleted but still open", but to count the
mmapped-then-unlinked files while excluding the opened-then-unlinked
files seems damned peculiar.

IOW, this changelog failed to explain the value of the patch. Bad
changelog! Please sell it to us. Preferably with real-world use
cases.

2015-06-10 01:39:52

by Calvin Owens

[permalink] [raw]

Subject: Re: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Tuesday 06/09 at 14:13 -0700, Andrew Morton wrote:
> On Mon, 8 Jun 2015 20:39:33 -0700 Calvin Owens <[email protected]> wrote:
>
> > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> > is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
> >
> > This interface very useful because it allows userspace to stat()
> > deleted files that are still mapped by some process, which enables a
> > much quicker and more accurate answer to the question "How much disk
> > space is being consumed by files that are deleted but still mapped?"
> > than is currently possible.
>
> Why is that information useful?
>
> I could perhaps think of some use for "How much disk space is being
> consumed by files that are deleted but still open", but to count the
> mmapped-then-unlinked files while excluding the opened-then-unlinked
> files seems damned peculiar.

Let's phrase the question a bit more generically:

"How much disk space is being consumed by files that have been
unlinked, but are still referenced by some process?"

There are two pieces to this problem:
1) Unlinked files that are still open (whether mapped or not)
2) Unlinked files that are not open, but are still mapped

You can track down everything in (1) using /proc/<pid>/fd/*, and you
can use stat() to figure out how much space they're using.

But directly measuring how much space (2) consumes is actually not
currently possible from userspace: there's no way to stat() the files.
You can get the inode number from /proc/<pid>/maps, but that still
doesn't get you anywhere because it's been unlinked from the
filesystem.

So I'm not looking to measure (2) and exclude (1): I'm looking to have
a way to directly measure (2) at all.

The reason I say "directly", and I say "quicker and more accurate" in
the original message, is that there is a very ugly way to answer this
question right now: you sum up the number of blocks used by every file
on the disk and subtract it from what statfs() tells you. This
obviously stinks, and becomes untenable once your filesystem is large
enough.

> IOW, this changelog failed to explain the value of the patch. Bad
> changelog! Please sell it to us. Preferably with real-world use
> cases.

The real-world use case is catching long-lived processes that leak
references to temporary files and waste space on the disk. When such
processes leak file-backed mappings, this wasted space is especially
difficult to detect until it gets out of hand. The map_files/
interface eliminates this difficulty.

I've included a little test program at the end of this file to illustrate
what I'm getting at here. It creates a file at /tmp/DELETEDFILE:

calvinowens@Haydn:~$ gcc test.c
calvinowens@Haydn:~$ ./a.out &
[1] 5832
Holding mapping at 0x7fe74d1ea000
calvinowens@Haydn:~$ lsof -p `pgrep a.out`
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
a.out 5832 calvinowens cwd DIR 254,1 4096 3413033 /home/calvinowens
a.out 5832 calvinowens rtd DIR 254,1 4096 2 /
a.out 5832 calvinowens txt REG 254,1 7512 3408268 /home/calvinowens/a.out
a.out 5832 calvinowens mem REG 254,1 1729984 4456767 /lib/x86_64-linux-gnu/libc-2.19.so
a.out 5832 calvinowens mem REG 254,1 140928 4456619 /lib/x86_64-linux-gnu/ld-2.19.so
a.out 5832 calvinowens mem REG 0,32 32768 184946 /tmp/DELETEDFILE
a.out 5832 calvinowens 0u CHR 136,2 0t0 5 /dev/pts/2
a.out 5832 calvinowens 1u CHR 136,2 0t0 5 /dev/pts/2
a.out 5832 calvinowens 2u CHR 136,2 0t0 5 /dev/pts/2
calvinowens@Haydn:~$ killall a.out
[1]+ Terminated ./a.out
calvinowens@Haydn:~$ gcc -DDO_UNLINK test.c
calvinowens@Haydn:~$ ./a.out &
[1] 5842
Holding mapping at 0x7fec8ae63000
calvinowens@Haydn:~$ lsof -p `pgrep a.out`
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
a.out 5842 calvinowens cwd DIR 254,1 4096 3413033 /home/calvinowens
a.out 5842 calvinowens rtd DIR 254,1 4096 2 /
a.out 5842 calvinowens txt REG 254,1 7640 3408268 /home/calvinowens/a.out
a.out 5842 calvinowens mem REG 254,1 1729984 4456767 /lib/x86_64-linux-gnu/libc-2.19.so
a.out 5842 calvinowens mem REG 254,1 140928 4456619 /lib/x86_64-linux-gnu/ld-2.19.so
a.out 5842 calvinowens DEL REG 0,32 184946 /tmp/DELETEDFILE
a.out 5842 calvinowens 0u CHR 136,2 0t0 5 /dev/pts/2
a.out 5842 calvinowens 1u CHR 136,2 0t0 5 /dev/pts/2
a.out 5842 calvinowens 2u CHR 136,2 0t0 5 /dev/pts/2

Notice the gap under "SIZE/OFF" in the 2nd output? This is because lsof
has no possible way to actually determine the leaked file's size.
That's the functionality "hole" I'm trying to fill with this patch.

Does that all seem sensible?

Thanks,
Calvin

--
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(void)
{
int ret, fd;
void *map;

fd = open("/tmp/DELETEDFILE", O_CREAT|O_TRUNC|O_RDWR, 0777);
if (fd == -1)
return -1;

ret = ftruncate(fd, 32768);
if (ret == -1)
return -1;

map = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
fd, 0);
if (map == MAP_FAILED)
return -1;

close(fd);
#ifdef DO_UNLINK
unlink("/tmp/DELETEDFILE");
#endif

printf("Holding mapping at %p\n", map);
while (1)
sleep(UINT_MAX);
}

2015-06-10 20:58:35

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Tue, 9 Jun 2015 18:39:02 -0700 Calvin Owens <[email protected]> wrote:

> On Tuesday 06/09 at 14:13 -0700, Andrew Morton wrote:
> > On Mon, 8 Jun 2015 20:39:33 -0700 Calvin Owens <[email protected]> wrote:
> >
> > > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> > > is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
> > >
> > > This interface very useful because it allows userspace to stat()
> > > deleted files that are still mapped by some process, which enables a
> > > much quicker and more accurate answer to the question "How much disk
> > > space is being consumed by files that are deleted but still mapped?"
> > > than is currently possible.
> >
> > Why is that information useful?
> >
> > I could perhaps think of some use for "How much disk space is being
> > consumed by files that are deleted but still open", but to count the
> > mmapped-then-unlinked files while excluding the opened-then-unlinked
> > files seems damned peculiar.
>
> Let's phrase the question a bit more generically:
>
> "How much disk space is being consumed by files that have been
> unlinked, but are still referenced by some process?"
>
> There are two pieces to this problem:
> 1) Unlinked files that are still open (whether mapped or not)
> 2) Unlinked files that are not open, but are still mapped
>
> You can track down everything in (1) using /proc/<pid>/fd/*, and you
> can use stat() to figure out how much space they're using.

This doesn't work if the mapped file has been unlinked? What does the
/proc/pid/map_files listing look like for these?

> Does that all seem sensible?

Spose so. Please capture all this info in the changelog.

It all seems a bit awkward though. If we want to know "how much disk
space is this process using" (or similar) then I wonder what a syscall
(or prctl mode?) which does this would look like.

2015-06-11 11:10:55

by Alexey Dobriyan

[permalink] [raw]

Subject: Re: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Wed, Jun 10, 2015 at 11:58 PM, Andrew Morton
<[email protected]> wrote:
> On Tue, 9 Jun 2015 18:39:02 -0700 Calvin Owens <[email protected]> wrote:
>
>> On Tuesday 06/09 at 14:13 -0700, Andrew Morton wrote:
>> > On Mon, 8 Jun 2015 20:39:33 -0700 Calvin Owens <[email protected]> wrote:
>> >
>> > > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
>> > > is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
>> > >
>> > > This interface very useful because it allows userspace to stat()
>> > > deleted files that are still mapped by some process, which enables a
>> > > much quicker and more accurate answer to the question "How much disk
>> > > space is being consumed by files that are deleted but still mapped?"
>> > > than is currently possible.
>> >
>> > Why is that information useful?
>> >
>> > I could perhaps think of some use for "How much disk space is being
>> > consumed by files that are deleted but still open", but to count the
>> > mmapped-then-unlinked files while excluding the opened-then-unlinked
>> > files seems damned peculiar.
>>
>> Let's phrase the question a bit more generically:
>>
>> "How much disk space is being consumed by files that have been
>> unlinked, but are still referenced by some process?"
>>
>> There are two pieces to this problem:
>> 1) Unlinked files that are still open (whether mapped or not)
>> 2) Unlinked files that are not open, but are still mapped
>>
>> You can track down everything in (1) using /proc/<pid>/fd/*, and you
>> can use stat() to figure out how much space they're using.
>
> This doesn't work if the mapped file has been unlinked? What does the
> /proc/pid/map_files listing look like for these?

It says "(deleted)" like /proc/*/exe or any other symlink.

>> Does that all seem sensible?
>
> Spose so. Please capture all this info in the changelog.
>
>
> It all seems a bit awkward though. If we want to know "how much disk
> space is this process using" (or similar) then I wonder what a syscall
> (or prctl mode?) which does this would look like.

I believe something like this is needed for checkpointing,
otherwise mmaped but unlinked files could not be restored fully
(how do you reach them?).

2015-06-11 18:49:54

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Thu, 11 Jun 2015 14:10:45 +0300 Alexey Dobriyan <[email protected]> wrote:

> On Wed, Jun 10, 2015 at 11:58 PM, Andrew Morton
> <[email protected]> wrote:
> > On Tue, 9 Jun 2015 18:39:02 -0700 Calvin Owens <[email protected]> wrote:
> >
> >> On Tuesday 06/09 at 14:13 -0700, Andrew Morton wrote:
> >> > On Mon, 8 Jun 2015 20:39:33 -0700 Calvin Owens <[email protected]> wrote:
> >> >
> >> > > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> >> > > is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
> >> > >
> >> > > This interface very useful because it allows userspace to stat()
> >> > > deleted files that are still mapped by some process, which enables a
> >> > > much quicker and more accurate answer to the question "How much disk
> >> > > space is being consumed by files that are deleted but still mapped?"
> >> > > than is currently possible.
> >> >
> >> > Why is that information useful?
> >> >
> >> > I could perhaps think of some use for "How much disk space is being
> >> > consumed by files that are deleted but still open", but to count the
> >> > mmapped-then-unlinked files while excluding the opened-then-unlinked
> >> > files seems damned peculiar.
> >>
> >> Let's phrase the question a bit more generically:
> >>
> >> "How much disk space is being consumed by files that have been
> >> unlinked, but are still referenced by some process?"
> >>
> >> There are two pieces to this problem:
> >> 1) Unlinked files that are still open (whether mapped or not)
> >> 2) Unlinked files that are not open, but are still mapped
> >>
> >> You can track down everything in (1) using /proc/<pid>/fd/*, and you
> >> can use stat() to figure out how much space they're using.
> >
> > This doesn't work if the mapped file has been unlinked? What does the
> > /proc/pid/map_files listing look like for these?
>
> It says "(deleted)" like /proc/*/exe or any other symlink.

Actually the symlink directs at "/home/akpm/foo (deleted)".

And lo, if you do `stat -L' on the symlink, you get the info for the
unlinked-but-still-mmapped inode. I never knew that. And I wouldn't
have learned it from the documentation, which is careful to keep all
this a secret.

2015-06-12 09:56:01

by Alexey Dobriyan

[permalink] [raw]

Subject: Re: [PATCH v6] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Thu, Jun 11, 2015 at 9:49 PM, Andrew Morton
<[email protected]> wrote:
> On Thu, 11 Jun 2015 14:10:45 +0300 Alexey Dobriyan <[email protected]> wrote:
>
>> On Wed, Jun 10, 2015 at 11:58 PM, Andrew Morton
>> <[email protected]> wrote:
>> > On Tue, 9 Jun 2015 18:39:02 -0700 Calvin Owens <[email protected]> wrote:
>> >
>> >> On Tuesday 06/09 at 14:13 -0700, Andrew Morton wrote:
>> >> > On Mon, 8 Jun 2015 20:39:33 -0700 Calvin Owens <[email protected]> wrote:
>> >> >
>> >> > > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
>> >> > > is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
>> >> > >
>> >> > > This interface very useful because it allows userspace to stat()
>> >> > > deleted files that are still mapped by some process, which enables a
>> >> > > much quicker and more accurate answer to the question "How much disk
>> >> > > space is being consumed by files that are deleted but still mapped?"
>> >> > > than is currently possible.
>> >> >
>> >> > Why is that information useful?
>> >> >
>> >> > I could perhaps think of some use for "How much disk space is being
>> >> > consumed by files that are deleted but still open", but to count the
>> >> > mmapped-then-unlinked files while excluding the opened-then-unlinked
>> >> > files seems damned peculiar.
>> >>
>> >> Let's phrase the question a bit more generically:
>> >>
>> >> "How much disk space is being consumed by files that have been
>> >> unlinked, but are still referenced by some process?"
>> >>
>> >> There are two pieces to this problem:
>> >> 1) Unlinked files that are still open (whether mapped or not)
>> >> 2) Unlinked files that are not open, but are still mapped
>> >>
>> >> You can track down everything in (1) using /proc/<pid>/fd/*, and you
>> >> can use stat() to figure out how much space they're using.
>> >
>> > This doesn't work if the mapped file has been unlinked? What does the
>> > /proc/pid/map_files listing look like for these?
>>
>> It says "(deleted)" like /proc/*/exe or any other symlink.
>
> Actually the symlink directs at "/home/akpm/foo (deleted)".

I meant exactly that: full path + (deleted).

> And lo, if you do `stat -L' on the symlink, you get the info for the
> unlinked-but-still-mmapped inode. I never knew that. And I wouldn't
> have learned it from the documentation, which is careful to keep all
> this a secret.

Yes, map_files symlinks allow to reach descriptors in the very same way
/proc/*/fd symlinks allow to reach descriptors.

2015-06-19 02:33:20

by Calvin Owens

[permalink] [raw]

Subject: [PATCH v7] procfs: Always expose /proc/<pid>/map_files/ and make it readable

Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
is only exposed if CONFIG_CHECKPOINT_RESTORE is set.

Each mapped file region gets a symlink in /proc/<pid>/map_files/
corresponding to the virtual address range at which it is mapped. The
symlinks work like the symlinks in /proc/<pid>/fd/, so you can follow
them to the backing file even if that backing file has been unlinked.

Currently, files which are mapped, unlinked, and closed are impossible
to stat() from userspace. Exposing /proc/<pid>/map_files/ closes this
functionality "hole".

Not being able to stat() such files makes noticing and explicitly
accounting for the space they use on the filesystem impossible. You can
work around this by summing up the space used by every file in the
filesystem and subtracting that total from what statfs() tells you, but
that obviously isn't great, and it becomes unworkable once your
filesystem becomes large enough.

This patch moves map_files/ out from behind CONFIG_CHECKPOINT_RESTORE,
and adjusts the permissions enforced on it as follows:

* proc_map_files_lookup()
* proc_map_files_readdir()
* map_files_d_revalidate()

Remove the CAP_SYS_ADMIN restriction, leaving only the current
restriction requiring PTRACE_MODE_READ. The information made
available to userspace by these three functions is already
available in /proc/PID/maps with MODE_READ, so I don't see any
reason to limit them any further (see below for more detail).

* proc_map_files_follow_link()

This stub has been added, and requires that the user have
CAP_SYS_ADMIN in order to follow the links in map_files/,
since there was concern on LKML both about the potential for
bypassing permissions on ancestor directories in the path to
files pointed to, and about what happens with more exotic
memory mappings created by some drivers (ie dma-buf).

In older versions of this patch, I changed every permission check in
the four functions above to enforce MODE_ATTACH instead of MODE_READ.
This was an oversight on my part, and after revisiting the discussion
it seems that nobody was concerned about anything outside of what is
made possible by ->follow_link(). So in this version, I've left the
checks for PTRACE_MODE_READ as-is.

Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Cc: Joe Perches <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Reviewed-by: Kees Cook <[email protected]>
Signed-off-by: Calvin Owens <[email protected]>
---
Changes in v7: Better commit message (hopefully), patch is otherwise
identical to v6.

Changes in v6: Require CAP_SYS_ADMIN for follow_link(). Leave other
PTRACE_MODE_READ checks as-is, since CAP_SYS_ADMIN
alone addresses all concerns raised AFAICS.

Changes in v5: s/dentry->d_inode/d_inode(dentry)/g

Changes in v4: Return -ESRCH from follow_link() when get_proc_task()
returns NULL.

Changes in v3: Changed permission checks to use PTRACE_MODE_ATTACH
instead of PTRACE_MODE_READ, and added a stub to
enforce MODE_ATTACH on follow_link() as well.

Changes in v2: Removed the follow_link() stub that returned -EPERM if
the caller didn't have CAP_SYS_ADMIN, since the caller
in my chroot() scenario gets -EACCES anyway.

fs/proc/base.c | 42 +++++++++++++++++++++++-------------------
1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 093ca14..0270191 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1641,8 +1641,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}

-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1669,11 +1667,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;

- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = d_inode(dentry);
task = get_proc_task(inode);
if (!task)
@@ -1762,6 +1755,28 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};

+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return proc_pid_follow_link(dentry, nd);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_map_files_follow_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1777,7 +1792,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;

- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;

@@ -1801,10 +1816,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;

- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
@@ -1858,10 +1869,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;

- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2050,7 +2057,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */

static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2549,9 +2555,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
--
1.8.1

2015-07-15 22:21:52

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH v7] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Thu, 18 Jun 2015 19:32:18 -0700 Calvin Owens <[email protected]> wrote:

> Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
>
> Each mapped file region gets a symlink in /proc/<pid>/map_files/
> corresponding to the virtual address range at which it is mapped. The
> symlinks work like the symlinks in /proc/<pid>/fd/, so you can follow
> them to the backing file even if that backing file has been unlinked.
>
> Currently, files which are mapped, unlinked, and closed are impossible
> to stat() from userspace. Exposing /proc/<pid>/map_files/ closes this
> functionality "hole".
>
> Not being able to stat() such files makes noticing and explicitly
> accounting for the space they use on the filesystem impossible. You can
> work around this by summing up the space used by every file in the
> filesystem and subtracting that total from what statfs() tells you, but
> that obviously isn't great, and it becomes unworkable once your
> filesystem becomes large enough.
>
> This patch moves map_files/ out from behind CONFIG_CHECKPOINT_RESTORE,
> and adjusts the permissions enforced on it as follows:

proc_pid_follow_link() got changed while you weren't looking, causing

fs/proc/base.c: In function 'proc_map_files_follow_link':
fs/proc/base.c:1963: warning: passing argument 2 of 'proc_pid_follow_link' from incompatible pointer type
fs/proc/base.c:1578: note: expected 'void **' but argument is of type 'struct nameidata *'
fs/proc/base.c:1963: warning: return discards qualifiers from pointer target type
fs/proc/base.c: At top level:
fs/proc/base.c:1971: warning: initialization from incompatible pointer type

I just changed it to pass NULL:

--- a/fs/proc/base.c~procfs-always-expose-proc-pid-map_files-and-make-it-readable-fix
+++ a/fs/proc/base.c
@@ -1955,12 +1955,13 @@ struct map_files_info {
* symlinks may be used to bypass permissions on ancestor directories in the
* path to the file in question.
*/
-static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *
+proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
{
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);

- return proc_pid_follow_link(dentry, nd);
+ return proc_pid_follow_link(dentry, NULL);
}

/*
_

2015-07-15 23:40:07

by Calvin Owens

[permalink] [raw]

Subject: Re: [PATCH v7] procfs: Always expose /proc/<pid>/map_files/ and make it readable

On Wednesday 07/15 at 15:21 -0700, Andrew Morton wrote:
> On Thu, 18 Jun 2015 19:32:18 -0700 Calvin Owens <[email protected]> wrote:
>
> > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> > is only exposed if CONFIG_CHECKPOINT_RESTORE is set.
> >
> > Each mapped file region gets a symlink in /proc/<pid>/map_files/
> > corresponding to the virtual address range at which it is mapped. The
> > symlinks work like the symlinks in /proc/<pid>/fd/, so you can follow
> > them to the backing file even if that backing file has been unlinked.
> >
> > Currently, files which are mapped, unlinked, and closed are impossible
> > to stat() from userspace. Exposing /proc/<pid>/map_files/ closes this
> > functionality "hole".
> >
> > Not being able to stat() such files makes noticing and explicitly
> > accounting for the space they use on the filesystem impossible. You can
> > work around this by summing up the space used by every file in the
> > filesystem and subtracting that total from what statfs() tells you, but
> > that obviously isn't great, and it becomes unworkable once your
> > filesystem becomes large enough.
> >
> > This patch moves map_files/ out from behind CONFIG_CHECKPOINT_RESTORE,
> > and adjusts the permissions enforced on it as follows:
>
> proc_pid_follow_link() got changed while you weren't looking, causing
>
> fs/proc/base.c: In function 'proc_map_files_follow_link':
> fs/proc/base.c:1963: warning: passing argument 2 of 'proc_pid_follow_link' from incompatible pointer type
> fs/proc/base.c:1578: note: expected 'void **' but argument is of type 'struct nameidata *'
> fs/proc/base.c:1963: warning: return discards qualifiers from pointer target type
> fs/proc/base.c: At top level:
> fs/proc/base.c:1971: warning: initialization from incompatible pointer type
>
> I just changed it to pass NULL:

Thanks for cleaning this up, I'll make sure to check outstanding patches
against new -rcs and -nexts in the future.

Thanks,
Calvin

> --- a/fs/proc/base.c~procfs-always-expose-proc-pid-map_files-and-make-it-readable-fix
> +++ a/fs/proc/base.c
> @@ -1955,12 +1955,13 @@ struct map_files_info {
> * symlinks may be used to bypass permissions on ancestor directories in the
> * path to the file in question.
> */
> -static void *proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
> +static void *
> +proc_map_files_follow_link(struct dentry *dentry, struct nameidata *nd)
> {
> if (!capable(CAP_SYS_ADMIN))
> return ERR_PTR(-EPERM);
>
> - return proc_pid_follow_link(dentry, nd);
> + return proc_pid_follow_link(dentry, NULL);
> }
>
> /*
> _
>