The suid root helper of grantpt /usr/lib/pt_chown remains in use today
because of sloppy userspace code that mount devpts and does not realize
their change in mount options also applies to the primary system devpts
instance. As the system devpts instance looses it gid=5 mount option
/usr/lib/pt_chown becomes required to set the gid properly on slave
pty instances.
That can be trivially fixed by making each and every mount of devpts
their own independent filesystems. Which can be done by always
forcing the existing newinstance option on.
To make the fix work one more thing has to be accomplished. The
device node /dev/ptmx needs to associate itself with the instance
of the devpts filesystem currently mounted at /dev/pts.
The bulk of this patch adds path walking code to the implementation
of /dev/ptmx so that it finds the devpts filesystem to create a
pty on by looking at the pts entry in the device nodes parent
directory.
The path walker is called kern_path_pts and is stored in fs/namei.c so
the are no weird vfs exports are necessary. The path walker is also
special in that it performs no permission checks for the path walk.
This patch addresses one additional practical problem with the opening
of /dev/ptmx. How to find which instance of the devpts filesystem
userspace is dealing with. The opening of /dev/ptmx now ensures that
fstat on the file descriptor returns st_dev of devpts filesystem on
which the slave pty resides, and that readlink /proc/self/NNN returns
the path to ptmx on the devpts filesystem.
Forcing newinstance for every mount of the devpts filesystem actually
requires the association between /dev/ptmx and the currently mounted
instance of devpts at /dev/pts. Simply remembering the first mount of
the devpts filesystem and associating that with /dev/ptmx is not
enough. I am aware of at least one instance where an initramfs mounts
devpts before the main system instance of devpts is mounted. In that
system ptys simply did not work after boot when I tested associating
/dev/ptmx with the first mount of the devpts filesystem.
Similary replacing the /dev/ptmx node in devtmpfs with a symlink to
/dev/pts/ptmx is not sufficient as there are some versions of udev
that look at sysfs see that a device node is supposed to be at
/dev/ptmx and replace the symlink with the device node.
Signed-off-by: Eric W. Biederman <[email protected]>
---
Greg I assume this change should go through your tty tree?
drivers/tty/pty.c | 4 +++
fs/devpts/inode.c | 46 ++++++++++++++++++++++++++++++-
fs/namei.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++
include/linux/devpts_fs.h | 1 +
include/linux/namei.h | 1 +
5 files changed, 120 insertions(+), 1 deletion(-)
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index a45660f62db5..81ae0945cd53 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -738,6 +738,10 @@ static int ptmx_open(struct inode *inode, struct file *filp)
int retval;
int index;
+ inode = devpts_ptmx(inode, filp);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
nonseekable_open(inode, filp);
/* We refuse fsnotify events on ptmx, since it's a shared resource */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c35ffdc12bba..79e8d60ba0fe 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -136,6 +136,50 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
return sb->s_fs_info;
}
+struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
+{
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+ struct path path, old;
+ struct super_block *sb;
+ struct dentry *root;
+
+ if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+ return inode;
+
+ old = filp->f_path;
+ path = old;
+ path_get(&path);
+ if (kern_path_pts(&path)) {
+ path_put(&path);
+ return ERR_PTR(-EINVAL);
+ }
+
+ sb = path.mnt->mnt_sb;
+ if (sb->s_magic != DEVPTS_SUPER_MAGIC) {
+ path_put(&path);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Advance path to the ptmx dentry */
+ root = path.dentry;
+ path.dentry = dget(DEVPTS_SB(sb)->ptmx_dentry);
+ dput(root);
+
+ /*
+ * Update filp with the new path so that userspace can use
+ * fstat to know which instance of devpts is open, and so
+ * userspace can use readlink /proc/self/fd/NNN to find the
+ * path to the devpts filesystem for reporting slave inodes.
+ */
+ inode = path.dentry->d_inode;
+ filp->f_path = path;
+ filp->f_inode = inode;
+ filp->f_mapping = inode->i_mapping;
+ path_put(&old);
+#endif
+ return inode;
+}
+
static inline struct super_block *pts_sb_from_inode(struct inode *inode)
{
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
@@ -175,7 +219,7 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
/* newinstance makes sense only on initial mount */
if (op == PARSE_MOUNT)
- opts->newinstance = 0;
+ opts->newinstance = 1;
while ((p = strsep(&data, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
diff --git a/fs/namei.c b/fs/namei.c
index d84d7c7515fc..bd19db26a898 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2515,6 +2515,75 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path,
}
EXPORT_SYMBOL(kern_path_mountpoint);
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+int kern_path_pts(struct path *path)
+{
+ /* This is a path walk to ${path}/../pts without permission checks */
+ struct path root;
+ struct dentry *parent, *pts;
+ struct qstr this;
+ int err;
+
+ get_fs_root(current->fs, &root);
+
+ /* Find the parent of the /dev/ptmx device node.
+ * This is a variation of follow_dotdot.
+ */
+ err = -ENOENT;
+ while (1) {
+ struct dentry *old = path->dentry;
+
+ if ((old == root.dentry) &&
+ (path->mnt == root.mnt))
+ goto fail;
+
+ if (old != path->mnt->mnt_root) {
+ path->dentry = dget_parent(old);
+ dput(old);
+ if (unlikely(!path_connected(path)))
+ goto fail;
+ break;
+ }
+ if (!follow_up(path))
+ goto fail;
+ }
+ follow_mount(path);
+
+ /* In the parent directory find the cached pts dentry. The
+ * dentry must be cached if it is a mountpoint for the devpts
+ * filesystem.
+ */
+ parent = path->dentry;
+ this.name = "pts";
+ this.len = 3;
+ this.hash = full_name_hash(this.name, this.len);
+ if (parent->d_flags & DCACHE_OP_HASH) {
+ err = parent->d_op->d_hash(parent, &this);
+ if (err < 0)
+ goto fail;
+ }
+
+ err = -ENOENT;
+ mutex_lock(&parent->d_inode->i_mutex);
+ pts = d_lookup(parent, &this);
+ mutex_unlock(&parent->d_inode->i_mutex);
+ if (!pts)
+ goto fail;
+
+ /* Find what is mounted on pts */
+ path->dentry = pts;
+ dput(parent);
+ follow_mount(path);
+
+ path_put(&root);
+ return 0;
+
+fail:
+ path_put(&root);
+ return err;
+}
+#endif
+
int __check_sticky(struct inode *dir, struct inode *inode)
{
kuid_t fsuid = current_fsuid();
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index 251a2090a554..8834dba07ff9 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -17,6 +17,7 @@
#ifdef CONFIG_UNIX98_PTYS
+struct inode *devpts_ptmx(struct inode *inode, struct file *filp);
int devpts_new_index(struct inode *ptmx_inode);
void devpts_kill_index(struct inode *ptmx_inode, int idx);
/* mknod in devpts */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..cfac431bcd31 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -75,6 +75,7 @@ extern struct dentry *user_path_create(int, const char __user *, struct path *,
extern void done_path_create(struct path *, struct dentry *);
extern struct dentry *kern_path_locked(const char *, struct path *);
extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
+extern int kern_path_pts(struct path *path);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
--
2.6.3
On Fri, Dec 11, 2015 at 11:40 AM, Eric W. Biederman
<[email protected]> wrote:
>
>
> +struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
> +{
> +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
> + struct path path, old;
> + struct super_block *sb;
> + struct dentry *root;
> +
> + if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
> + return inode;
> +
> + old = filp->f_path;
> + path = old;
> + path_get(&path);
> + if (kern_path_pts(&path)) {
> + path_put(&path);
> + return ERR_PTR(-EINVAL);
> + }
So this is definitely crap.
You can't return an error. You should just return the old inode. If
somebody doesn't have /dev/pts/ mounted there, the legacy /dev/ptmx
should still work, not return ENOENT or whatever.
> + sb = path.mnt->mnt_sb;
> + if (sb->s_magic != DEVPTS_SUPER_MAGIC) {
> + path_put(&path);
> + return ERR_PTR(-EINVAL);
> + }
Same deal. Returning an error is wrong.
Of, alternatively, make the caller not consider an error an error, but
fall back to the old behavior in the caller.
> + /*
> + * Update filp with the new path so that userspace can use
> + * fstat to know which instance of devpts is open, and so
> + * userspace can use readlink /proc/self/fd/NNN to find the
> + * path to the devpts filesystem for reporting slave inodes.
> + */
Hmm. I'm not 100% convinced about this. Normally we do *not* allow
f_path and f_inode to change. I guess this file descriptor hasn't been
exposed yet, so it might be ok, but it makes me a bit nervous that
this code violates the basic filp rules..
Linus
Linus Torvalds <[email protected]> writes:
> On Fri, Dec 11, 2015 at 11:40 AM, Eric W. Biederman
> <[email protected]> wrote:
>>
>>
>> +struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
>> +{
>> +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
>> + struct path path, old;
>> + struct super_block *sb;
>> + struct dentry *root;
>> +
>> + if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
>> + return inode;
>> +
>> + old = filp->f_path;
>> + path = old;
>> + path_get(&path);
>> + if (kern_path_pts(&path)) {
>> + path_put(&path);
>> + return ERR_PTR(-EINVAL);
>> + }
>
> So this is definitely crap.
>
> You can't return an error. You should just return the old inode. If
> somebody doesn't have /dev/pts/ mounted there, the legacy /dev/ptmx
> should still work, not return ENOENT or whatever.
I return an error because an association is not found. -EINVAL is the
historical error when that happens.
We can't actually support the old devpts_mnt hack, because we are
forcing newinstance and so the devpts instance association devpts_mnt
will never be delivered to userspace. Furthermore things such as
initramfs images mounting devpts guarantee that even if we did try to
support devpts_mnt it would not work in practice.
So I really don't see a point in attemptting to support something that
won't actually matter, and won't work even if I try. That is really
crap.
>> + sb = path.mnt->mnt_sb;
>> + if (sb->s_magic != DEVPTS_SUPER_MAGIC) {
>> + path_put(&path);
>> + return ERR_PTR(-EINVAL);
>> + }
>
> Same deal. Returning an error is wrong.
>
> Of, alternatively, make the caller not consider an error an error, but
> fall back to the old behavior in the caller.
I understand why it would be nice to keep the old path still working,
but we can't.
>> + /*
>> + * Update filp with the new path so that userspace can use
>> + * fstat to know which instance of devpts is open, and so
>> + * userspace can use readlink /proc/self/fd/NNN to find the
>> + * path to the devpts filesystem for reporting slave inodes.
>> + */
>
> Hmm. I'm not 100% convinced about this. Normally we do *not* allow
> f_path and f_inode to change. I guess this file descriptor hasn't been
> exposed yet, so it might be ok, but it makes me a bit nervous that
> this code violates the basic filp rules..
It is not my favorite choice but it is backwards compatible. So we can
at least write a single version of ptsname that makes sense and works on
old and new kernels in all the crazy corner cases without too much
pain.
If we don't expose this in such a way that a general purpose version of
ptsname can be written and exposed to userspace we might as well pack
our bags and go home because there will be no way to make grantpt race
free in the face of multiple distinct instances of devpts.
Eric
On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman wrote:
> + inode = path.dentry->d_inode;
> + filp->f_path = path;
> + filp->f_inode = inode;
> + filp->f_mapping = inode->i_mapping;
> + path_put(&old);
Don't. You are creating a fairly subtle constraint on what the code in
fs/open.c and fs/namei.c can do, for no good reason. You can bloody
well maintain the information you need without that.
I'm less than thrilled about the whole approach as it is, but this definitely
pushes it into the NAK-on-sight territory. We'd been through that 3 years
ago when you brought that kind of hacks up last time; the objections still
stand.
Don't make it any more of a special snowflake than it absolutely has to be.
We already have more than enough potential headache sources to keep track
of when doing any kind of work on core kernel and extra ones are not
appreciated. Really.
Al Viro <[email protected]> writes:
> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman wrote:
>
>> + inode = path.dentry->d_inode;
>> + filp->f_path = path;
>> + filp->f_inode = inode;
>> + filp->f_mapping = inode->i_mapping;
>> + path_put(&old);
>
> Don't. You are creating a fairly subtle constraint on what the code in
> fs/open.c and fs/namei.c can do, for no good reason. You can bloody
> well maintain the information you need without that.
There is a good reason. We can not write a race free version of ptsname
without it.
If it would help I am happy to add a helper to change the filp path that
lives next to d_dentry_open so that this is not insane to maintain. I
am not ready to consider yet another attempt by kernel people to solve
userspace problems that is a half thought out mess that means we will
still have bugs and problems 20 years hence.
We need to actually solve the problem and anything less than that is
just stupid.
Eric
On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
<[email protected]> wrote:
> Al Viro <[email protected]> writes:
>
>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman wrote:
>>
>>> + inode = path.dentry->d_inode;
>>> + filp->f_path = path;
>>> + filp->f_inode = inode;
>>> + filp->f_mapping = inode->i_mapping;
>>> + path_put(&old);
>>
>> Don't. You are creating a fairly subtle constraint on what the code in
>> fs/open.c and fs/namei.c can do, for no good reason. You can bloody
>> well maintain the information you need without that.
>
> There is a good reason. We can not write a race free version of ptsname
> without it.
As long as this is for new userspace code, would it make sense to just
add a new ioctl to ask "does this ptmx fd match this /dev/pts fd?"
--Andy
On 12/11/15 13:48, Andy Lutomirski wrote:
> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
> <[email protected]> wrote:
>> Al Viro <[email protected]> writes:
>>
>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman wrote:
>>>
>>>> + inode = path.dentry->d_inode;
>>>> + filp->f_path = path;
>>>> + filp->f_inode = inode;
>>>> + filp->f_mapping = inode->i_mapping;
>>>> + path_put(&old);
>>>
>>> Don't. You are creating a fairly subtle constraint on what the code in
>>> fs/open.c and fs/namei.c can do, for no good reason. You can bloody
>>> well maintain the information you need without that.
>>
>> There is a good reason. We can not write a race free version of ptsname
>> without it.
>
> As long as this is for new userspace code, would it make sense to just
> add a new ioctl to ask "does this ptmx fd match this /dev/pts fd?"
>
For the newinstance case st_dev should match between the master and the
slave. Unfortunately this is not the case for a legacy ptmx, as a
stat() on the master descriptor still returns the st_dev, st_rdev, and
st_ino for the ptmx device node.
-hpa
On Fri, Dec 11, 2015 at 2:07 PM, H. Peter Anvin <[email protected]> wrote:
> On 12/11/15 13:48, Andy Lutomirski wrote:
>> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
>> <[email protected]> wrote:
>>> Al Viro <[email protected]> writes:
>>>
>>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman wrote:
>>>>
>>>>> + inode = path.dentry->d_inode;
>>>>> + filp->f_path = path;
>>>>> + filp->f_inode = inode;
>>>>> + filp->f_mapping = inode->i_mapping;
>>>>> + path_put(&old);
>>>>
>>>> Don't. You are creating a fairly subtle constraint on what the code in
>>>> fs/open.c and fs/namei.c can do, for no good reason. You can bloody
>>>> well maintain the information you need without that.
>>>
>>> There is a good reason. We can not write a race free version of ptsname
>>> without it.
>>
>> As long as this is for new userspace code, would it make sense to just
>> add a new ioctl to ask "does this ptmx fd match this /dev/pts fd?"
>>
>
> For the newinstance case st_dev should match between the master and the
> slave. Unfortunately this is not the case for a legacy ptmx, as a
> stat() on the master descriptor still returns the st_dev, st_rdev, and
> st_ino for the ptmx device node.
Sure, but I'm not talking about stat. I'm saying that we could add a
new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx) that
answers the question "does this ptmx logically belong to the given
devpts filesystem".
Since it's not stat, we can make it do whatever we want, including
following a link to the devpts instance that isn't f_path or f_inode.
--Andy
On 12/11/15 14:12, Andy Lutomirski wrote:
>>
>> For the newinstance case st_dev should match between the master and the
>> slave. Unfortunately this is not the case for a legacy ptmx, as a
>> stat() on the master descriptor still returns the st_dev, st_rdev, and
>> st_ino for the ptmx device node.
>
> Sure, but I'm not talking about stat. I'm saying that we could add a
> new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx) that
> answers the question "does this ptmx logically belong to the given
> devpts filesystem".
>
> Since it's not stat, we can make it do whatever we want, including
> following a link to the devpts instance that isn't f_path or f_inode.
>
Sure. My thinking, though, was whether or not we can do something that
works on legacy kernels, and/or is less intrusive than new ioctls.
What is the actual operation we need?
-hpa
On Fri, Dec 11, 2015 at 2:18 PM, H. Peter Anvin <[email protected]> wrote:
> On 12/11/15 14:12, Andy Lutomirski wrote:
>>>
>>> For the newinstance case st_dev should match between the master and the
>>> slave. Unfortunately this is not the case for a legacy ptmx, as a
>>> stat() on the master descriptor still returns the st_dev, st_rdev, and
>>> st_ino for the ptmx device node.
>>
>> Sure, but I'm not talking about stat. I'm saying that we could add a
>> new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx) that
>> answers the question "does this ptmx logically belong to the given
>> devpts filesystem".
>>
>> Since it's not stat, we can make it do whatever we want, including
>> following a link to the devpts instance that isn't f_path or f_inode.
>>
>
> Sure. My thinking, though, was whether or not we can do something that
> works on legacy kernels, and/or is less intrusive than new ioctls.
>
> What is the actual operation we need?
To do the whole shebang at once:
ioctl(ptmx_fd, TIOCWHATEVER, fd_to_devpts_mount);
returns the slave number if fd_to_devpts_mount points to the right
place or an error if not.
ptsname(fd) logically does:
fd_to_devpts_mount = open("/dev/pts", O_RDONLY | O_DIRECTORY);
int n = ioctl(fd, TIOCWHATEVER, fd_to_devpts_mount);
close(fd_to_devpts_mount);
if (n < 0)
return some error;
return "/dev/pts/" + n;
I think that all kinds of variants are possible.
--Andy
>
> -hpa
>
--
Andy Lutomirski
AMA Capital Management, LLC
On 12/11/15 14:24, Andy Lutomirski wrote:
>
> To do the whole shebang at once:
>
> ioctl(ptmx_fd, TIOCWHATEVER, fd_to_devpts_mount);
>
> returns the slave number if fd_to_devpts_mount points to the right
> place or an error if not.
>
> ptsname(fd) logically does:
>
> fd_to_devpts_mount = open("/dev/pts", O_RDONLY | O_DIRECTORY);
> int n = ioctl(fd, TIOCWHATEVER, fd_to_devpts_mount);
> close(fd_to_devpts_mount);
> if (n < 0)
> return some error;
> return "/dev/pts/" + n;
>
> I think that all kinds of variants are possible.
>
If we're going to invent new names, any reason to not simply have
TIOCPTSNAME, since we can find the pts inode from the ptm fd, and then
walk the dentry tree (returning error if unreachable)?
Or does that open up entirely new issues?
-hpa
Andy Lutomirski <[email protected]> writes:
> On Fri, Dec 11, 2015 at 2:07 PM, H. Peter Anvin <[email protected]> wrote:
>> On 12/11/15 13:48, Andy Lutomirski wrote:
>>> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
>>> <[email protected]> wrote:
>>>> Al Viro <[email protected]> writes:
>>>>
>>>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman wrote:
>>>>>
>>>>>> + inode = path.dentry->d_inode;
>>>>>> + filp->f_path = path;
>>>>>> + filp->f_inode = inode;
>>>>>> + filp->f_mapping = inode->i_mapping;
>>>>>> + path_put(&old);
>>>>>
>>>>> Don't. You are creating a fairly subtle constraint on what the code in
>>>>> fs/open.c and fs/namei.c can do, for no good reason. You can bloody
>>>>> well maintain the information you need without that.
>>>>
>>>> There is a good reason. We can not write a race free version of ptsname
>>>> without it.
>>>
>>> As long as this is for new userspace code, would it make sense to just
>>> add a new ioctl to ask "does this ptmx fd match this /dev/pts fd?"
>>>
>>
>> For the newinstance case st_dev should match between the master and the
>> slave. Unfortunately this is not the case for a legacy ptmx, as a
>> stat() on the master descriptor still returns the st_dev, st_rdev, and
>> st_ino for the ptmx device node.
>
> Sure, but I'm not talking about stat. I'm saying that we could add a
> new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx) that
> answers the question "does this ptmx logically belong to the given
> devpts filesystem".
>
> Since it's not stat, we can make it do whatever we want, including
> following a link to the devpts instance that isn't f_path or f_inode.
The useful ioctl to add in my opinion would be one that actually opens
the slave, at which point ptsname could become ttyname, and that closes
races in grantpt.
I even posted an implementation earlier in the discussion and no one was
interested.
Honestly the more weird special cases we add to devpts the less likely
userspace will be to get things right. We have been trying since 1998
and devpts is still a poor enough design we have not been able to get
rid of /usr/lib/pt_chown. Adding another case where we have to sand on
one foot and touch our nose does not seem to likely to achieve
widespread adoption. How many version of libc are there now?
So I think the following incremental patch makes sense to improve the
maintainability of what I have written, but I haven't seen any arguments
that it is actually a bad idea.
Especially given that ptys are a core part of unix and they are used by
everyone all of the time.
Eric
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 79e8d60ba0fe..588e0a049daf 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -139,15 +139,14 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
{
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
- struct path path, old;
+ struct path path;
struct super_block *sb;
struct dentry *root;
if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
return inode;
- old = filp->f_path;
- path = old;
+ path = filp->f_path;
path_get(&path);
if (kern_path_pts(&path)) {
path_put(&path);
@@ -172,10 +171,8 @@ struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
* path to the devpts filesystem for reporting slave inodes.
*/
inode = path.dentry->d_inode;
- filp->f_path = path;
- filp->f_inode = inode;
- filp->f_mapping = inode->i_mapping;
- path_put(&old);
+ filp_set_path(filp, &path);
+ path_put(&path);
#endif
return inode;
}
diff --git a/fs/open.c b/fs/open.c
index b6f1e96a7c0b..5234a791d9ae 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -679,6 +679,19 @@ int open_check_o_direct(struct file *f)
return 0;
}
+void filp_set_path(struct file *filp, struct path *path)
+{
+ /* Only safe during open */
+ struct path old = filp->f_path;
+ struct inode *inode = path->dentry->d_inode;
+
+ path_get(path);
+ filp->f_path = *path;
+ filp->f_inode = inode;
+ filp->f_mapping = inode->i_mapping;
+ path_put(&old);
+}
+
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *),
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3aa514254161..f3659a8a2eec 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2220,6 +2220,7 @@ extern struct file *file_open_root(struct dentry *, struct vfsmount *,
const char *, int);
extern struct file * dentry_open(const struct path *, int, const struct cred *);
extern int filp_close(struct file *, fl_owner_t id);
+extern void filp_set_path(struct file *filp, struct path *path);
extern struct filename *getname_flags(const char __user *, int, int *);
extern struct filename *getname(const char __user *);
On Fri, Dec 11, 2015 at 2:35 PM, Eric W. Biederman
<[email protected]> wrote:
> Andy Lutomirski <[email protected]> writes:
>
>> On Fri, Dec 11, 2015 at 2:07 PM, H. Peter Anvin <[email protected]> wrote:
>>> On 12/11/15 13:48, Andy Lutomirski wrote:
>>>> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
>>>> <[email protected]> wrote:
>>>>> Al Viro <[email protected]> writes:
>>>>>
>>>>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman wrote:
>>>>>>
>>>>>>> + inode = path.dentry->d_inode;
>>>>>>> + filp->f_path = path;
>>>>>>> + filp->f_inode = inode;
>>>>>>> + filp->f_mapping = inode->i_mapping;
>>>>>>> + path_put(&old);
>>>>>>
>>>>>> Don't. You are creating a fairly subtle constraint on what the code in
>>>>>> fs/open.c and fs/namei.c can do, for no good reason. You can bloody
>>>>>> well maintain the information you need without that.
>>>>>
>>>>> There is a good reason. We can not write a race free version of ptsname
>>>>> without it.
>>>>
>>>> As long as this is for new userspace code, would it make sense to just
>>>> add a new ioctl to ask "does this ptmx fd match this /dev/pts fd?"
>>>>
>>>
>>> For the newinstance case st_dev should match between the master and the
>>> slave. Unfortunately this is not the case for a legacy ptmx, as a
>>> stat() on the master descriptor still returns the st_dev, st_rdev, and
>>> st_ino for the ptmx device node.
>>
>> Sure, but I'm not talking about stat. I'm saying that we could add a
>> new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx) that
>> answers the question "does this ptmx logically belong to the given
>> devpts filesystem".
>>
>> Since it's not stat, we can make it do whatever we want, including
>> following a link to the devpts instance that isn't f_path or f_inode.
>
> The useful ioctl to add in my opinion would be one that actually opens
> the slave, at which point ptsname could become ttyname, and that closes
> races in grantpt.
Unfortunately, ptsname is POSIX, so we can't get rid of it. It's a
bad idea, but it's in the standard.
>
> I even posted an implementation earlier in the discussion and no one was
> interested.
>
> Honestly the more weird special cases we add to devpts the less likely
> userspace will be to get things right. We have been trying since 1998
> and devpts is still a poor enough design we have not been able to get
> rid of /usr/lib/pt_chown. Adding another case where we have to sand on
> one foot and touch our nose does not seem to likely to achieve
> widespread adoption. How many version of libc are there now?
Old libc can stay buggy, I think. Given that this mess is partially a
libc mis-design, I don't see why it needs to be fixed entirely in the
kernel.
For new systems, it would be really nice if we can make a /dev/ptmx
symlink be 100% functional.
In any event, this is semi-moot. If we actually want to retire the
newinstance=0 thing from the kernel, we apparently need a magic
/dev/ptmx node. That doesn't mean that new userspace needs to *use*
that magic node. So why not implement the magic node without fiddling
with f_path?
--Andy
On December 11, 2015 2:35:16 PM PST, [email protected] wrote:
>Andy Lutomirski <[email protected]> writes:
>
>> On Fri, Dec 11, 2015 at 2:07 PM, H. Peter Anvin <[email protected]>
>wrote:
>>> On 12/11/15 13:48, Andy Lutomirski wrote:
>>>> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
>>>> <[email protected]> wrote:
>>>>> Al Viro <[email protected]> writes:
>>>>>
>>>>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman
>wrote:
>>>>>>
>>>>>>> + inode = path.dentry->d_inode;
>>>>>>> + filp->f_path = path;
>>>>>>> + filp->f_inode = inode;
>>>>>>> + filp->f_mapping = inode->i_mapping;
>>>>>>> + path_put(&old);
>>>>>>
>>>>>> Don't. You are creating a fairly subtle constraint on what the
>code in
>>>>>> fs/open.c and fs/namei.c can do, for no good reason. You can
>bloody
>>>>>> well maintain the information you need without that.
>>>>>
>>>>> There is a good reason. We can not write a race free version of
>ptsname
>>>>> without it.
>>>>
>>>> As long as this is for new userspace code, would it make sense to
>just
>>>> add a new ioctl to ask "does this ptmx fd match this /dev/pts fd?"
>>>>
>>>
>>> For the newinstance case st_dev should match between the master and
>the
>>> slave. Unfortunately this is not the case for a legacy ptmx, as a
>>> stat() on the master descriptor still returns the st_dev, st_rdev,
>and
>>> st_ino for the ptmx device node.
>>
>> Sure, but I'm not talking about stat. I'm saying that we could add a
>> new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx) that
>> answers the question "does this ptmx logically belong to the given
>> devpts filesystem".
>>
>> Since it's not stat, we can make it do whatever we want, including
>> following a link to the devpts instance that isn't f_path or f_inode.
>
>The useful ioctl to add in my opinion would be one that actually opens
>the slave, at which point ptsname could become ttyname, and that closes
>races in grantpt.
>
>I even posted an implementation earlier in the discussion and no one
>was
>interested.
>
>Honestly the more weird special cases we add to devpts the less likely
>userspace will be to get things right. We have been trying since 1998
>and devpts is still a poor enough design we have not been able to get
>rid of /usr/lib/pt_chown. Adding another case where we have to sand on
>one foot and touch our nose does not seem to likely to achieve
>widespread adoption. How many version of libc are there now?
>
>So I think the following incremental patch makes sense to improve the
>maintainability of what I have written, but I haven't seen any
>arguments
>that it is actually a bad idea.
>
>Especially given that ptys are a core part of unix and they are used by
>everyone all of the time.
>
>Eric
>
>diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
>index 79e8d60ba0fe..588e0a049daf 100644
>--- a/fs/devpts/inode.c
>+++ b/fs/devpts/inode.c
>@@ -139,15 +139,14 @@ static inline struct pts_fs_info
>*DEVPTS_SB(struct super_block *sb)
> struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
> {
> #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
>- struct path path, old;
>+ struct path path;
> struct super_block *sb;
> struct dentry *root;
>
> if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
> return inode;
>
>- old = filp->f_path;
>- path = old;
>+ path = filp->f_path;
> path_get(&path);
> if (kern_path_pts(&path)) {
> path_put(&path);
>@@ -172,10 +171,8 @@ struct inode *devpts_ptmx(struct inode *inode,
>struct file *filp)
> * path to the devpts filesystem for reporting slave inodes.
> */
> inode = path.dentry->d_inode;
>- filp->f_path = path;
>- filp->f_inode = inode;
>- filp->f_mapping = inode->i_mapping;
>- path_put(&old);
>+ filp_set_path(filp, &path);
>+ path_put(&path);
> #endif
> return inode;
> }
>diff --git a/fs/open.c b/fs/open.c
>index b6f1e96a7c0b..5234a791d9ae 100644
>--- a/fs/open.c
>+++ b/fs/open.c
>@@ -679,6 +679,19 @@ int open_check_o_direct(struct file *f)
> return 0;
> }
>
>+void filp_set_path(struct file *filp, struct path *path)
>+{
>+ /* Only safe during open */
>+ struct path old = filp->f_path;
>+ struct inode *inode = path->dentry->d_inode;
>+
>+ path_get(path);
>+ filp->f_path = *path;
>+ filp->f_inode = inode;
>+ filp->f_mapping = inode->i_mapping;
>+ path_put(&old);
>+}
>+
> static int do_dentry_open(struct file *f,
> struct inode *inode,
> int (*open)(struct inode *, struct file *),
>diff --git a/include/linux/fs.h b/include/linux/fs.h
>index 3aa514254161..f3659a8a2eec 100644
>--- a/include/linux/fs.h
>+++ b/include/linux/fs.h
>@@ -2220,6 +2220,7 @@ extern struct file *file_open_root(struct dentry
>*, struct vfsmount *,
> const char *, int);
>extern struct file * dentry_open(const struct path *, int, const struct
>cred *);
> extern int filp_close(struct file *, fl_owner_t id);
>+extern void filp_set_path(struct file *filp, struct path *path);
>
>extern struct filename *getname_flags(const char __user *, int, int *);
> extern struct filename *getname(const char __user *);
I'm calling bullshit on that. pt_chown is not and has not been needed on anything but severely misconfigured userspace since devpts was constructed. The problem, rather, is that by not disabling pt_chown at the same time they switched to devpts distros allowed these severe misconfigurations to go on unnoticed.
So pt_chown *created* the problem.
The other problem we are trying to deal with is that the layout is suboptimal for the multi instance case, a legacy from an older SysV layout with /dev/ptm/# and /dev/pts/#, the former replaced with the multiplex device /dev/ptmx, but that just shows the sheer amount of inertia we are dealing with.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On Fri, Dec 11, 2015 at 02:52:01PM -0800, Andy Lutomirski wrote:
> On Fri, Dec 11, 2015 at 2:35 PM, Eric W. Biederman
> <[email protected]> wrote:
> > Andy Lutomirski <[email protected]> writes:
> >
> >> On Fri, Dec 11, 2015 at 2:07 PM, H. Peter Anvin <[email protected]> wrote:
> >>> On 12/11/15 13:48, Andy Lutomirski wrote:
> >>>> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
> >>>> <[email protected]> wrote:
> >>>>> Al Viro <[email protected]> writes:
> >>>>>
> >>>>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman wrote:
> >>>>>>
> >>>>>>> + inode = path.dentry->d_inode;
> >>>>>>> + filp->f_path = path;
> >>>>>>> + filp->f_inode = inode;
> >>>>>>> + filp->f_mapping = inode->i_mapping;
> >>>>>>> + path_put(&old);
> >>>>>>
> >>>>>> Don't. You are creating a fairly subtle constraint on what the code in
> >>>>>> fs/open.c and fs/namei.c can do, for no good reason. You can bloody
> >>>>>> well maintain the information you need without that.
> >>>>>
> >>>>> There is a good reason. We can not write a race free version of ptsname
> >>>>> without it.
> >>>>
> >>>> As long as this is for new userspace code, would it make sense to just
> >>>> add a new ioctl to ask "does this ptmx fd match this /dev/pts fd?"
> >>>>
> >>>
> >>> For the newinstance case st_dev should match between the master and the
> >>> slave. Unfortunately this is not the case for a legacy ptmx, as a
> >>> stat() on the master descriptor still returns the st_dev, st_rdev, and
> >>> st_ino for the ptmx device node.
> >>
> >> Sure, but I'm not talking about stat. I'm saying that we could add a
> >> new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx) that
> >> answers the question "does this ptmx logically belong to the given
> >> devpts filesystem".
> >>
> >> Since it's not stat, we can make it do whatever we want, including
> >> following a link to the devpts instance that isn't f_path or f_inode.
> >
> > The useful ioctl to add in my opinion would be one that actually opens
> > the slave, at which point ptsname could become ttyname, and that closes
> > races in grantpt.
>
> Unfortunately, ptsname is POSIX, so we can't get rid of it. It's a
> bad idea, but it's in the standard.
But then ptsname could become "open the slave, call ttyname() on it, close
the slave". Unless opening the slave would have side effects?
On Fri, Dec 11, 2015 at 2:58 PM, Jann Horn <[email protected]> wrote:
> On Fri, Dec 11, 2015 at 02:52:01PM -0800, Andy Lutomirski wrote:
>> On Fri, Dec 11, 2015 at 2:35 PM, Eric W. Biederman
>> <[email protected]> wrote:
>> > Andy Lutomirski <[email protected]> writes:
>> >
>> >> On Fri, Dec 11, 2015 at 2:07 PM, H. Peter Anvin <[email protected]> wrote:
>> >>> On 12/11/15 13:48, Andy Lutomirski wrote:
>> >>>> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
>> >>>> <[email protected]> wrote:
>> >>>>> Al Viro <[email protected]> writes:
>> >>>>>
>> >>>>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman wrote:
>> >>>>>>
>> >>>>>>> + inode = path.dentry->d_inode;
>> >>>>>>> + filp->f_path = path;
>> >>>>>>> + filp->f_inode = inode;
>> >>>>>>> + filp->f_mapping = inode->i_mapping;
>> >>>>>>> + path_put(&old);
>> >>>>>>
>> >>>>>> Don't. You are creating a fairly subtle constraint on what the code in
>> >>>>>> fs/open.c and fs/namei.c can do, for no good reason. You can bloody
>> >>>>>> well maintain the information you need without that.
>> >>>>>
>> >>>>> There is a good reason. We can not write a race free version of ptsname
>> >>>>> without it.
>> >>>>
>> >>>> As long as this is for new userspace code, would it make sense to just
>> >>>> add a new ioctl to ask "does this ptmx fd match this /dev/pts fd?"
>> >>>>
>> >>>
>> >>> For the newinstance case st_dev should match between the master and the
>> >>> slave. Unfortunately this is not the case for a legacy ptmx, as a
>> >>> stat() on the master descriptor still returns the st_dev, st_rdev, and
>> >>> st_ino for the ptmx device node.
>> >>
>> >> Sure, but I'm not talking about stat. I'm saying that we could add a
>> >> new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx) that
>> >> answers the question "does this ptmx logically belong to the given
>> >> devpts filesystem".
>> >>
>> >> Since it's not stat, we can make it do whatever we want, including
>> >> following a link to the devpts instance that isn't f_path or f_inode.
>> >
>> > The useful ioctl to add in my opinion would be one that actually opens
>> > the slave, at which point ptsname could become ttyname, and that closes
>> > races in grantpt.
>>
>> Unfortunately, ptsname is POSIX, so we can't get rid of it. It's a
>> bad idea, but it's in the standard.
>
> But then ptsname could become "open the slave, call ttyname() on it, close
> the slave". Unless opening the slave would have side effects?
Hmm, fair enough. So maybe that does make sense after all.
Anyway, I still think there are two pieces here:
1. Fix /dev/ptmx so that we can banish newinstance=0.
2. Fix libc. If that needs kernel help, then so be it.
ISTM we could still implement the "open the slave" operation for (2)
as an ioctl that does the appropriate magic the fd is /dev/ptmx as
opposed to /dev/pts/ptmx.
--Andy
On December 11, 2015 3:00:49 PM PST, Andy Lutomirski <[email protected]> wrote:
>On Fri, Dec 11, 2015 at 2:58 PM, Jann Horn <[email protected]> wrote:
>> On Fri, Dec 11, 2015 at 02:52:01PM -0800, Andy Lutomirski wrote:
>>> On Fri, Dec 11, 2015 at 2:35 PM, Eric W. Biederman
>>> <[email protected]> wrote:
>>> > Andy Lutomirski <[email protected]> writes:
>>> >
>>> >> On Fri, Dec 11, 2015 at 2:07 PM, H. Peter Anvin <[email protected]>
>wrote:
>>> >>> On 12/11/15 13:48, Andy Lutomirski wrote:
>>> >>>> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
>>> >>>> <[email protected]> wrote:
>>> >>>>> Al Viro <[email protected]> writes:
>>> >>>>>
>>> >>>>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman
>wrote:
>>> >>>>>>
>>> >>>>>>> + inode = path.dentry->d_inode;
>>> >>>>>>> + filp->f_path = path;
>>> >>>>>>> + filp->f_inode = inode;
>>> >>>>>>> + filp->f_mapping = inode->i_mapping;
>>> >>>>>>> + path_put(&old);
>>> >>>>>>
>>> >>>>>> Don't. You are creating a fairly subtle constraint on what
>the code in
>>> >>>>>> fs/open.c and fs/namei.c can do, for no good reason. You can
>bloody
>>> >>>>>> well maintain the information you need without that.
>>> >>>>>
>>> >>>>> There is a good reason. We can not write a race free version
>of ptsname
>>> >>>>> without it.
>>> >>>>
>>> >>>> As long as this is for new userspace code, would it make sense
>to just
>>> >>>> add a new ioctl to ask "does this ptmx fd match this /dev/pts
>fd?"
>>> >>>>
>>> >>>
>>> >>> For the newinstance case st_dev should match between the master
>and the
>>> >>> slave. Unfortunately this is not the case for a legacy ptmx, as
>a
>>> >>> stat() on the master descriptor still returns the st_dev,
>st_rdev, and
>>> >>> st_ino for the ptmx device node.
>>> >>
>>> >> Sure, but I'm not talking about stat. I'm saying that we could
>add a
>>> >> new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx)
>that
>>> >> answers the question "does this ptmx logically belong to the
>given
>>> >> devpts filesystem".
>>> >>
>>> >> Since it's not stat, we can make it do whatever we want,
>including
>>> >> following a link to the devpts instance that isn't f_path or
>f_inode.
>>> >
>>> > The useful ioctl to add in my opinion would be one that actually
>opens
>>> > the slave, at which point ptsname could become ttyname, and that
>closes
>>> > races in grantpt.
>>>
>>> Unfortunately, ptsname is POSIX, so we can't get rid of it. It's a
>>> bad idea, but it's in the standard.
>>
>> But then ptsname could become "open the slave, call ttyname() on it,
>close
>> the slave". Unless opening the slave would have side effects?
>
>Hmm, fair enough. So maybe that does make sense after all.
>
>Anyway, I still think there are two pieces here:
>
>1. Fix /dev/ptmx so that we can banish newinstance=0.
>
>2. Fix libc. If that needs kernel help, then so be it.
>
>ISTM we could still implement the "open the slave" operation for (2)
>as an ioctl that does the appropriate magic the fd is /dev/ptmx as
>opposed to /dev/pts/ptmx.
>
>
>--Andy
I want to be clear:
If /dev/ptmx -> pts/ptmx and devpts is mounted with the proper options, I believe ask the remaining parts of userspace should be fine, and pt_chown can be removed even with glibc.
The magic ptmx we are talking about is all about dealing with a mismanaged /dev.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On Fri, Dec 11, 2015 at 3:07 PM, H. Peter Anvin <[email protected]> wrote:
> On December 11, 2015 3:00:49 PM PST, Andy Lutomirski <[email protected]> wrote:
>>On Fri, Dec 11, 2015 at 2:58 PM, Jann Horn <[email protected]> wrote:
>>> On Fri, Dec 11, 2015 at 02:52:01PM -0800, Andy Lutomirski wrote:
>>>> On Fri, Dec 11, 2015 at 2:35 PM, Eric W. Biederman
>>>> <[email protected]> wrote:
>>>> > Andy Lutomirski <[email protected]> writes:
>>>> >
>>>> >> On Fri, Dec 11, 2015 at 2:07 PM, H. Peter Anvin <[email protected]>
>>wrote:
>>>> >>> On 12/11/15 13:48, Andy Lutomirski wrote:
>>>> >>>> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
>>>> >>>> <[email protected]> wrote:
>>>> >>>>> Al Viro <[email protected]> writes:
>>>> >>>>>
>>>> >>>>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman
>>wrote:
>>>> >>>>>>
>>>> >>>>>>> + inode = path.dentry->d_inode;
>>>> >>>>>>> + filp->f_path = path;
>>>> >>>>>>> + filp->f_inode = inode;
>>>> >>>>>>> + filp->f_mapping = inode->i_mapping;
>>>> >>>>>>> + path_put(&old);
>>>> >>>>>>
>>>> >>>>>> Don't. You are creating a fairly subtle constraint on what
>>the code in
>>>> >>>>>> fs/open.c and fs/namei.c can do, for no good reason. You can
>>bloody
>>>> >>>>>> well maintain the information you need without that.
>>>> >>>>>
>>>> >>>>> There is a good reason. We can not write a race free version
>>of ptsname
>>>> >>>>> without it.
>>>> >>>>
>>>> >>>> As long as this is for new userspace code, would it make sense
>>to just
>>>> >>>> add a new ioctl to ask "does this ptmx fd match this /dev/pts
>>fd?"
>>>> >>>>
>>>> >>>
>>>> >>> For the newinstance case st_dev should match between the master
>>and the
>>>> >>> slave. Unfortunately this is not the case for a legacy ptmx, as
>>a
>>>> >>> stat() on the master descriptor still returns the st_dev,
>>st_rdev, and
>>>> >>> st_ino for the ptmx device node.
>>>> >>
>>>> >> Sure, but I'm not talking about stat. I'm saying that we could
>>add a
>>>> >> new ioctl that works on any ptmx fd (/dev/ptmx or /dev/pts/ptmx)
>>that
>>>> >> answers the question "does this ptmx logically belong to the
>>given
>>>> >> devpts filesystem".
>>>> >>
>>>> >> Since it's not stat, we can make it do whatever we want,
>>including
>>>> >> following a link to the devpts instance that isn't f_path or
>>f_inode.
>>>> >
>>>> > The useful ioctl to add in my opinion would be one that actually
>>opens
>>>> > the slave, at which point ptsname could become ttyname, and that
>>closes
>>>> > races in grantpt.
>>>>
>>>> Unfortunately, ptsname is POSIX, so we can't get rid of it. It's a
>>>> bad idea, but it's in the standard.
>>>
>>> But then ptsname could become "open the slave, call ttyname() on it,
>>close
>>> the slave". Unless opening the slave would have side effects?
>>
>>Hmm, fair enough. So maybe that does make sense after all.
>>
>>Anyway, I still think there are two pieces here:
>>
>>1. Fix /dev/ptmx so that we can banish newinstance=0.
>>
>>2. Fix libc. If that needs kernel help, then so be it.
>>
>>ISTM we could still implement the "open the slave" operation for (2)
>>as an ioctl that does the appropriate magic the fd is /dev/ptmx as
>>opposed to /dev/pts/ptmx.
>>
>>
>>--Andy
>
> I want to be clear:
>
> If /dev/ptmx -> pts/ptmx and devpts is mounted with the proper options, I believe ask the remaining parts of userspace should be fine, and pt_chown can be removed even with glibc.
>
> The magic ptmx we are talking about is all about dealing with a mismanaged /dev.
> --
> Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
I think you're right, modulo the one stupidity that a configuration
like that is prone to breakage with container apps running on the same
system.
Hmm. Could userspace be changed to set newinstance=1 on its /dev/pts
mount to work around that?
--Andy
--
Andy Lutomirski
AMA Capital Management, LLC
On December 11, 2015 3:16:48 PM PST, Andy Lutomirski <[email protected]> wrote:
>On Fri, Dec 11, 2015 at 3:07 PM, H. Peter Anvin <[email protected]> wrote:
>> On December 11, 2015 3:00:49 PM PST, Andy Lutomirski
><[email protected]> wrote:
>>>On Fri, Dec 11, 2015 at 2:58 PM, Jann Horn <[email protected]> wrote:
>>>> On Fri, Dec 11, 2015 at 02:52:01PM -0800, Andy Lutomirski wrote:
>>>>> On Fri, Dec 11, 2015 at 2:35 PM, Eric W. Biederman
>>>>> <[email protected]> wrote:
>>>>> > Andy Lutomirski <[email protected]> writes:
>>>>> >
>>>>> >> On Fri, Dec 11, 2015 at 2:07 PM, H. Peter Anvin <[email protected]>
>>>wrote:
>>>>> >>> On 12/11/15 13:48, Andy Lutomirski wrote:
>>>>> >>>> On Fri, Dec 11, 2015 at 1:11 PM, Eric W. Biederman
>>>>> >>>> <[email protected]> wrote:
>>>>> >>>>> Al Viro <[email protected]> writes:
>>>>> >>>>>
>>>>> >>>>>> On Fri, Dec 11, 2015 at 01:40:40PM -0600, Eric W. Biederman
>>>wrote:
>>>>> >>>>>>
>>>>> >>>>>>> + inode = path.dentry->d_inode;
>>>>> >>>>>>> + filp->f_path = path;
>>>>> >>>>>>> + filp->f_inode = inode;
>>>>> >>>>>>> + filp->f_mapping = inode->i_mapping;
>>>>> >>>>>>> + path_put(&old);
>>>>> >>>>>>
>>>>> >>>>>> Don't. You are creating a fairly subtle constraint on what
>>>the code in
>>>>> >>>>>> fs/open.c and fs/namei.c can do, for no good reason. You
>can
>>>bloody
>>>>> >>>>>> well maintain the information you need without that.
>>>>> >>>>>
>>>>> >>>>> There is a good reason. We can not write a race free
>version
>>>of ptsname
>>>>> >>>>> without it.
>>>>> >>>>
>>>>> >>>> As long as this is for new userspace code, would it make
>sense
>>>to just
>>>>> >>>> add a new ioctl to ask "does this ptmx fd match this /dev/pts
>>>fd?"
>>>>> >>>>
>>>>> >>>
>>>>> >>> For the newinstance case st_dev should match between the
>master
>>>and the
>>>>> >>> slave. Unfortunately this is not the case for a legacy ptmx,
>as
>>>a
>>>>> >>> stat() on the master descriptor still returns the st_dev,
>>>st_rdev, and
>>>>> >>> st_ino for the ptmx device node.
>>>>> >>
>>>>> >> Sure, but I'm not talking about stat. I'm saying that we could
>>>add a
>>>>> >> new ioctl that works on any ptmx fd (/dev/ptmx or
>/dev/pts/ptmx)
>>>that
>>>>> >> answers the question "does this ptmx logically belong to the
>>>given
>>>>> >> devpts filesystem".
>>>>> >>
>>>>> >> Since it's not stat, we can make it do whatever we want,
>>>including
>>>>> >> following a link to the devpts instance that isn't f_path or
>>>f_inode.
>>>>> >
>>>>> > The useful ioctl to add in my opinion would be one that actually
>>>opens
>>>>> > the slave, at which point ptsname could become ttyname, and that
>>>closes
>>>>> > races in grantpt.
>>>>>
>>>>> Unfortunately, ptsname is POSIX, so we can't get rid of it. It's
>a
>>>>> bad idea, but it's in the standard.
>>>>
>>>> But then ptsname could become "open the slave, call ttyname() on
>it,
>>>close
>>>> the slave". Unless opening the slave would have side effects?
>>>
>>>Hmm, fair enough. So maybe that does make sense after all.
>>>
>>>Anyway, I still think there are two pieces here:
>>>
>>>1. Fix /dev/ptmx so that we can banish newinstance=0.
>>>
>>>2. Fix libc. If that needs kernel help, then so be it.
>>>
>>>ISTM we could still implement the "open the slave" operation for (2)
>>>as an ioctl that does the appropriate magic the fd is /dev/ptmx as
>>>opposed to /dev/pts/ptmx.
>>>
>>>
>>>--Andy
>>
>> I want to be clear:
>>
>> If /dev/ptmx -> pts/ptmx and devpts is mounted with the proper
>options, I believe ask the remaining parts of userspace should be fine,
>and pt_chown can be removed even with glibc.
>>
>> The magic ptmx we are talking about is all about dealing with a
>mismanaged /dev.
>> --
>> Sent from my Android device with K-9 Mail. Please excuse brevity and
>formatting.
>
>I think you're right, modulo the one stupidity that a configuration
>like that is prone to breakage with container apps running on the same
>system.
>
>Hmm. Could userspace be changed to set newinstance=1 on its /dev/pts
>mount to work around that?
>
>--Andy
The newinstance option was always meant to be transitory, to let the kernel know that userspace has been properly enabled with a ptmx symlink, except that the user space enabling never happened, just as it was never done properly with devpts in the first place.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On 12/11/2015 11:40 AM, Eric W. Biederman wrote:
> Forcing newinstance for every mount of the devpts filesystem actually
> requires the association between /dev/ptmx and the currently mounted
> instance of devpts at /dev/pts. Simply remembering the first mount of
> the devpts filesystem and associating that with /dev/ptmx is not
> enough. I am aware of at least one instance where an initramfs mounts
> devpts before the main system instance of devpts is mounted.
Can you point me to that usage please?
I ask because there's a patch to move devpts init from module initcall
up to fs initcall (neither devpts nor the pty driver is actually built
as a module anyway), and I'd like to look at what the consequences
might be for that userspace configuration.
> In that system ptys simply did not work after boot when I tested
> associating /dev/ptmx with the first mount of the devpts filesystem.
Assuming userspace isn't broken by that patch, is a fixed association
with first mount otherwise an acceptable solution for magic /dev/ptmx
(where /dev/ptmx is not a symlink to /dev/pts/ptmx)?
Regards,
Peter Hurley
On 12/14/15 11:47, Peter Hurley wrote:
> On 12/11/2015 11:40 AM, Eric W. Biederman wrote:
>> Forcing newinstance for every mount of the devpts filesystem actually
>> requires the association between /dev/ptmx and the currently mounted
>> instance of devpts at /dev/pts. Simply remembering the first mount of
>> the devpts filesystem and associating that with /dev/ptmx is not
>> enough. I am aware of at least one instance where an initramfs mounts
>> devpts before the main system instance of devpts is mounted.
>
> Can you point me to that usage please?
>
> I ask because there's a patch to move devpts init from module initcall
> up to fs initcall (neither devpts nor the pty driver is actually built
> as a module anyway), and I'd like to look at what the consequences
> might be for that userspace configuration.
>
>
>> In that system ptys simply did not work after boot when I tested
>> associating /dev/ptmx with the first mount of the devpts filesystem.
>
> Assuming userspace isn't broken by that patch, is a fixed association
> with first mount otherwise an acceptable solution for magic /dev/ptmx
> (where /dev/ptmx is not a symlink to /dev/pts/ptmx)?
>
The problem is containers, I would think, if they create a new /dev/ptmx
and then mount a separate devpts instance instead of doing a bind mount.
-hpa
Peter Hurley <[email protected]> writes:
> On 12/11/2015 11:40 AM, Eric W. Biederman wrote:
>> Forcing newinstance for every mount of the devpts filesystem actually
>> requires the association between /dev/ptmx and the currently mounted
>> instance of devpts at /dev/pts. Simply remembering the first mount of
>> the devpts filesystem and associating that with /dev/ptmx is not
>> enough. I am aware of at least one instance where an initramfs mounts
>> devpts before the main system instance of devpts is mounted.
>
> Can you point me to that usage please?
I have found that the Dracut versions in CentOS5 and CentOS6 generates
initial ramdisks that mount devpts before the primary OS mount of devpts
on /dev/pts. I have also found that openwrt-15.05 without an initial
ramdisk does something strange during startup and boots devpts twice as
well.
I have looked but I haven't seen that pattern elsewhere but my search
space of 15ish distros is small compared to what is out there. Given
that mounting devpts multiple times has been implemented at least twice
independently I would not be surprised if mounting devpts multiple times
during boot shows up somewhere else.
> I ask because there's a patch to move devpts init from module initcall
> up to fs initcall (neither devpts nor the pty driver is actually built
> as a module anyway), and I'd like to look at what the consequences
> might be for that userspace configuration.
I don't expect there are any. As all of this happens before userspace
initializes anyway. We have enough variation in the kernel anyway
that the device number the first devpts is mounted on varies between
kernels already.
>> In that system ptys simply did not work after boot when I tested
>> associating /dev/ptmx with the first mount of the devpts filesystem.
>
> Assuming userspace isn't broken by that patch, is a fixed association
> with first mount otherwise an acceptable solution for magic /dev/ptmx
> (where /dev/ptmx is not a symlink to /dev/pts/ptmx)?
I do not believe a fixed association with the first mount is an
acceptable solution for implementing /dev/ptmx in association with
a change to cause mount of devpts to be an independent filesystem.
Such an association fails to be backwards compatible with existing
userspace, and it is extremely fragile.
If the association between the device node and the filesystem in the
mount namespace is insufficient for backwards compatibility I do not
believe full backwards compatibility is acheivable with a magic version
of /dev/ptmx.
On the flip side the consequences of a ptmx symlink in devpts pointing
to pts/ptmx look extremely minor. Of my test cases only openwrt-15.05
and CentOS5 fail, as they don't use devtmpfs. While debian-6.0.2,
debian-7.9, debian-8.2, CentOS6, CentOS7, fedora32, magia-5, mint-17.3,
opensuse-42.1, slackware-14.1, unbuntu-14.04.3 and ubuntu-15.10 all
work.
By making the change in behavior controlled by a kernel command line
option (devpts.newinstance is what I have been testing with) that allows
me to build a single kernel that works on everything. Which is enough
backwards compatibility for me.
I still have not quite reached the point of testing what the real world
consequences for programs such as lxc that currently use the newinstance
option are. There is a possibility that if someone is bind mounting
/dev/pts/ptmx over /dev/ptmx they might break. Similarly there may be a
few cases do "mknod ptmx c 5 2" and that will start failing.
I don't expect running into weird userspace cases that fail will change
my opinion on a path forward, but it will be good to know what the
consequences are of flipping the option. As so far everything thing
looks like it will just work.
Right now having a nano-flag day and putting a symlink in devtmps looks
a whole lot cleaner in both implementation, maintenance and use than a
magic /dev/ptmx.
Eric
[email protected] (Eric W. Biederman) writes:
>>> In that system ptys simply did not work after boot when I tested
>>> associating /dev/ptmx with the first mount of the devpts filesystem.
>>
>> Assuming userspace isn't broken by that patch, is a fixed association
>> with first mount otherwise an acceptable solution for magic /dev/ptmx
>> (where /dev/ptmx is not a symlink to /dev/pts/ptmx)?
>
> I do not believe a fixed association with the first mount is an
> acceptable solution for implementing /dev/ptmx in association with
> a change to cause mount of devpts to be an independent filesystem.
> Such an association fails to be backwards compatible with existing
> userspace, and it is extremely fragile.
Ugh. After reviewing the userspace code that mounts devpts we have
to do use a magic /dev/ptmx to solve the issue we are trying to solve.
The fragility of detecting the primary system devpts seems solvable.
CentOS5 and openwrt-15.05 mount devpts, unmount devpts,
then mount devpts again. So a rule of mouting the internal devpts if it
isn't mounted would work for those.
CentOS6 uses switch_root and moves it's early mount of devpts onto the
primary root, and then because devpts is also in /etc/fstab tries and
fails to mount devpts once more at the same location. Implying
newinstance will make that mounting devpts twice. That sounds solvable
but I don't see a clean way of detecting that case yet.
Ugh.
I am going to pound my head up against what is needed to find the
primary system mount of devpts for a bit more and see if I can solve
that. Otherwise this exercise is pointless.
Eric
On December 19, 2015 8:11:50 PM PST, [email protected] wrote:
>[email protected] (Eric W. Biederman) writes:
>
>>>> In that system ptys simply did not work after boot when I tested
>>>> associating /dev/ptmx with the first mount of the devpts
>filesystem.
>>>
>>> Assuming userspace isn't broken by that patch, is a fixed
>association
>>> with first mount otherwise an acceptable solution for magic
>/dev/ptmx
>>> (where /dev/ptmx is not a symlink to /dev/pts/ptmx)?
>>
>> I do not believe a fixed association with the first mount is an
>> acceptable solution for implementing /dev/ptmx in association with
>> a change to cause mount of devpts to be an independent filesystem.
>> Such an association fails to be backwards compatible with existing
>> userspace, and it is extremely fragile.
>
>Ugh. After reviewing the userspace code that mounts devpts we have
>to do use a magic /dev/ptmx to solve the issue we are trying to solve.
>
>The fragility of detecting the primary system devpts seems solvable.
>
>CentOS5 and openwrt-15.05 mount devpts, unmount devpts,
>then mount devpts again. So a rule of mouting the internal devpts if
>it
>isn't mounted would work for those.
>
>CentOS6 uses switch_root and moves it's early mount of devpts onto the
>primary root, and then because devpts is also in /etc/fstab tries and
>fails to mount devpts once more at the same location. Implying
>newinstance will make that mounting devpts twice. That sounds solvable
>but I don't see a clean way of detecting that case yet.
>
>Ugh.
>
>I am going to pound my head up against what is needed to find the
>primary system mount of devpts for a bit more and see if I can solve
>that. Otherwise this exercise is pointless.
>
>Eric
Does it matter if it mounts devpts twice? It seems like a waste of a minuscule amount of memory, and nothing else.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
"H. Peter Anvin" <[email protected]> writes:
> Does it matter if it mounts devpts twice? It seems like a waste of a
> minuscule amount of memory, and nothing else.
It breaks system("mknod /tmp/ptmx c 5 2"); open("/tmp/ptmx");
As it opens a pty in an inaccessible instance of devpts. When
previously the instance of devpts was accessible. So backwards
compatibility is broken.
It doubly matters as we have evidence that b0rken userspace actually
does that things like that.
I will probably get a grumble or two but it turns out it isn't
particularly hard to deal with the overmounting that happens in CentOS6,
and the mounting then unmounting then mounting again that happens in
CentOS5, and openwrt.
For the cases I know to test for I have something that works now. I
am going to sleep on it and then see if I can find think of other
things to test before I push out a patch.
Eric
[email protected] (Eric W. Biederman) writes:
> "H. Peter Anvin" <[email protected]> writes:
>
>> Does it matter if it mounts devpts twice? It seems like a waste of a
>> minuscule amount of memory, and nothing else.
> It breaks system("mknod /tmp/ptmx c 5 2"); open("/tmp/ptmx");
Correction.
It does break the above but that isn't the real reason we need to
support that. We only have evidence of pople doing:
"mkdir -p dev/pts; mknod c dev/ptmx 5 2; mount -t devpts dev/pts/"
Where the relatives paths would work.
What actually breaks is "echo NNN > /proc/sys/kernel/pty/reserve"
Which allows the primary instance of devpts to have access to more
ptys than any other instance.
Ultimately if we are going to be backwards compatible we need to
preserve as much of the current behavior as possible so we don't forget
something in the analysis and break something we don't intend to break
by accident.
Eric
On Mon, Dec 21, 2015 at 2:03 PM, Eric W. Biederman
<[email protected]> wrote:
>
> What actually breaks is "echo NNN > /proc/sys/kernel/pty/reserve"
> Which allows the primary instance of devpts to have access to more
> ptys than any other instance.
Ok. We'll probably need to fix that similarly (make it look up
/dev/pts to figure out *which* pty reserve to fix), but I think that's
entirely separate from the ptmx issue for now.
Linus
To recap the situation for those who have not been following closely.
There are programs such as xen-create-image that run as root and setup
a chroot environment with:
"mknod dev/ptmx c 5 2"
"mkdir dev/pts"
"mount -t devpts none dev/pts"
Which mostly works but stomps the mount options of the system /dev/pts.
In particular the options of "gid=5,mode=620" are lost resulting in a
situation where creating a new pty by opening /dev/ptmx results in
that pty having the wrong permissions.
Some distributions have been working around this problem by continuing
to install a setuid root pt_chown binary that will be called by glibc
to fix the permissions.
This solution isn't too scary as long as there is only one instance
of devpts but add a second instance of devpts and it becomes possible
to trick the setuid root pt_chown binary into operating on the wrong
files and directories.
The following patchset attempts to dig use out of this mess by carefully
chaning devpts in a way that does not induce any userspace regressions.
My expectation is that userspace developers will love us as it makes
their problems go away, and kernel developers will not be happy with the
changes because what is required to preserve backwards compatibility is
not what anyone would have designed with a clean sheet implementation.
To dig our selves out of this mess it has been generally agreed that it
makes sense for each mount of devpts to result in a separate instance of
devpts. That works for ensuring that a new mount of devpts does not
stomp the permissions of another mount of devpts but then programs such
as xen-create-image break as they expect opening of /dev/ptmx to create
new ptys.
The problem of /dev/ptmx needing to create ptys on different instances
of devpts can be resolved by performing a path based look up from the
"/dev/ptmx" node to find the devpts filesystem.
How we get each mount of devpts to result in a distinct instance of
devpts matters. The kernel treats the system instance of devpts
specially and to maintain backwards compatibility that needs to be
preserved.
As there must be a system instance of devpts the code continues in it's
technique of mounting the system instance of devpts internally and then
exporting it userspace when devpts is mounted under the right
circumstances.
There is one pattern in userspace where an intial ramdisk mounts devpts
then unmounts devpts and the usual system startup scripts then mount
devpts on /dev/pts. I believe Centos5 and Openwrt use this pattern.
There is another pattern of userspace where devpts is mounted in the
initial ramdisk and then "mount --move" is used to move it onto the
final /dev/pts. However devpts remains listed in /etc/fstab and later
in the boot a "mount -a" honors that listing starts mounting devpts on
top of itself. Fails but only after succeeding in changing the mount
options. I believe this is the pattenr that Centos6 uses.
Then there is the question of which permissions checks should apply when
/dev/ptmx is opened. The way it works in v4.6-rc1 is that a pty on the
system instance of devtps can be created either by opening /dev/ptmx or
/dev/pts/ptmx (which happens to reside on the system instance. The only
restriction are the normal unix permission of opening those files. A
pty on a non-system instance of devpts must be created by opening the
ptmx file on the non-systme devpts instance. The default permisions for
/dev/ptmx are 0666 and for /dev/pts/ptmx 0000. Today for non-system
instances of devpts the permission on the ptmx file are always changed to
something else (typically 0666), and typically on the system instance
the permission of ptmx on devpts are ignoreed and left at 0000.
Which leaves the question of what to do about cases such as
xen-create-image where the new /dev/ptmx path based lookup is exercised
to find the mount of devpts. In that case out of an abundance of
caution I require the code to verify that the opener also has permission
to open the ptmx file on the non-system instance of devpts. To make
that work I have changed the default permissions on the ptmx file for
all non-system instance of devpts to 0666. Where the permission check
comes in as useful is on any system where a non-system instance of
devpts and has permissions on it's ptmx file that are not 0666. If that
was not done a simple bind mount of the /dev/ptmx device node would
allow overriding the policy of who is allowed to create ptys in an
instance of devpts.
As you can see above my guiding principle this round has been be very
careful to keep existing userspace working. I have tested this code on
as wide a range of distributions as I could to look for intesting
behavior. Those I managed to get setup and running in a vm for testing
are: on openwrt-15.05, centos5, centos6, centos7, debian-6.0.2,
debian-7.9, debian-8.2, ubuntu-14.04.3, ubuntu-15.10, fedora23, magia-5,
mint-17.3, opensuse-42.1, slackware-14.1, gentoo-20151225 (13.0?),
archlinux-2015-12-01.
As I could not find an image of Android that was easy to get running
in a VM, so I audited the code to see what Android does. Unlike
reports earlier in this conversation Android does not use a shell
script. Android has a daemon that listens on netlink for device events,
consults it's policy data base and creates the device node in a tmpfs
instance mounted on /dev according to policy (assuming the policy allows
that device node). Furthermore at the start of that daemon devpts is
mounted exactly once. Which thankfully means Android poses no special
problems for this patchset.
I have also run xen-create-image on debian 8.2 (where it was easily
installed with apt-get) and confirmed that without these changes it
stomps the mount options of devpts and with these changes it only uses
atypical mount options on a separate instance of devpts.
The first change in this series adds magic to /dev/ptmx. The rest of
the changes deal with all of the little issues needed to ensure that
every mount of devpts is a distinct instance.
Eric W. Biederman (13):
devpts: Teach /dev/ptmx to find the associated devpts via path lookup
devpts: More obvious check for the system devpts in pty allocation
devpts: Cleanup newinstance parsing
devpts: Stop rolling devpts_remount by hand in devpts_mount
devpts: Fail early (if appropriate) on overmount
devpts: Use the same default mode for both /dev/ptmx and dev/pts/ptmx
devpts: Move parse_mount_options into fill_super
devpts: Make devpts_kill_sb safe if fsi is NULL
devpts: Move the creation of /dev/pts/ptmx into fill_super
devpts: Simplify devpts_mount by using mount_nodev
vfs: Implement mount_super_once
devpts: Always return a distinct instance when mounting
devpts: Kill the DEVPTS_MULTIPLE_INSTANCE config option
Documentation/filesystems/devpts.txt | 122 +++++++----------
drivers/tty/Kconfig | 11 --
drivers/tty/pty.c | 4 +
drivers/tty/tty_io.c | 5 +-
fs/devpts/inode.c | 250 ++++++++++++++++++++---------------
fs/namei.c | 64 +++++++--
fs/namespace.c | 8 ++
fs/open.c | 19 +++
fs/super.c | 12 ++
include/linux/devpts_fs.h | 5 +
include/linux/fs.h | 5 +
include/linux/namei.h | 3 +
12 files changed, 302 insertions(+), 206 deletions(-)
This code is also available at:
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git devpts-for-testing
Eric
This is in preparation for forcing each mount of devpts to be a
distinct filesystem. The goal of this change is to have as few user
visible changes from the kernel today as possible.
On each open of /dev/ptmx look at the relative path pts and see if
devpts is mounted there.
If the filesystem found via the path lookup is the system devpts
do exactly what we do today.
If the filesystem found is not the system devpts make it appear
to the rest of the system that the /dev/pts/ptmx node was opened.
This includes respecting the permission checks of /dev/pts/ptmx
and updating the file's path to point to /dev/pts/ptmx.
In practice I expect the permission checks are a non-issue as the
permissions on /dev/ptmx and /dev/pts/ptmx are always 0666. If
someone happens to change the permission on /dev/pts/ptmx is seems
important to honor them for the sake of backwards compatibility. As
/dev/ptmx can be bind mounted to set next to any devpts filesystem not
honoring the permissions would provide a nice permission bypass under
the right circumstances if we did not check the permissions.
Similarly reflect the instance of devpts that was opened in f_path so
that we preserve the only way available today to test if someone is
attempting to confuse a pty creating program.
This winds up using 3 new vfs helpers path_parent, path_pts, and
update_file_path.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
drivers/tty/pty.c | 4 +++
fs/devpts/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++
fs/namei.c | 64 ++++++++++++++++++++++++++++++-------
fs/open.c | 19 +++++++++++
include/linux/devpts_fs.h | 2 ++
include/linux/fs.h | 2 ++
include/linux/namei.h | 3 ++
7 files changed, 164 insertions(+), 11 deletions(-)
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index e16a49b507ef..557858ef00f5 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -725,6 +725,10 @@ static int ptmx_open(struct inode *inode, struct file *filp)
int retval;
int index;
+ inode = devpts_ptmx(inode, filp);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
nonseekable_open(inode, filp);
/* We refuse fsnotify events on ptmx, since it's a shared resource */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 655f21f99160..c14d51795577 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/namei.h>
+#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/tty.h>
@@ -136,6 +137,86 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
return sb->s_fs_info;
}
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+static int devpts_path_ptmx(struct file *filp)
+{
+ struct pts_fs_info *fsi;
+ struct path root, path;
+ struct dentry *old;
+ int err = -ENOENT;
+ int ret;
+
+ /* Can the pts filesystem be found with a path walk? */
+ path = filp->f_path;
+ path_get(&path);
+ get_fs_root(current->fs, &root);
+ ret = path_parent(&root, &path);
+ path_put(&root);
+ if (ret != 1)
+ goto fail;
+
+ /* Remember the result of this permission check for later */
+ ret = inode_permission(path.dentry->d_inode, MAY_EXEC);
+ if (path_pts(&path))
+ goto fail;
+
+ /* Is path the root of a devpts filesystem? */
+ if ((path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) ||
+ (path.mnt->mnt_root != path.mnt->mnt_sb->s_root))
+ goto fail;
+ fsi = DEVPTS_SB(path.mnt->mnt_sb);
+
+ /* Get out if the path walk resulted in the default devpts instance */
+ if (devpts_mnt->mnt_sb == path.mnt->mnt_sb)
+ goto fail;
+
+ /* Don't allow bypassing the existing /dev/pts/ptmx permission check */
+ err = ret;
+ if (!err)
+ err = inode_permission(path.dentry->d_inode, MAY_EXEC);
+ if (!err)
+ err = inode_permission(fsi->ptmx_dentry->d_inode,
+ ACC_MODE(filp->f_flags));
+ if (err)
+ goto fail;
+
+ /* Advance path to the ptmx dentry */
+ old = path.dentry;
+ path.dentry = dget(fsi->ptmx_dentry);
+ dput(old);
+
+ /* Make it look like /dev/pts/ptmx was opened */
+ err = update_file_path(filp, &path);
+ if (err)
+ goto fail;
+
+ return 0;
+fail:
+ path_put(&path);
+ return err;
+}
+#else
+static inline int devpts_path_ptmx(struct file *filp)
+{
+ return -ENOENT;
+}
+#endif
+
+struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
+{
+ int err;
+ if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+ return inode;
+
+ err = devpts_path_ptmx(filp);
+ if (err == 0)
+ return filp->f_inode;
+ if (err != -ENOENT)
+ return ERR_PTR(err);
+
+ return inode;
+}
+
static inline struct super_block *pts_sb_from_inode(struct inode *inode)
{
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
diff --git a/fs/namei.c b/fs/namei.c
index 794f81dce766..afb5137ca199 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1415,29 +1415,41 @@ static void follow_mount(struct path *path)
}
}
-static int follow_dotdot(struct nameidata *nd)
+int path_parent(struct path *root, struct path *path)
{
+ int ret = 0;
+
while(1) {
- struct dentry *old = nd->path.dentry;
+ struct dentry *old = path->dentry;
- if (nd->path.dentry == nd->root.dentry &&
- nd->path.mnt == nd->root.mnt) {
+ if (old == root->dentry &&
+ path->mnt == root->mnt) {
break;
}
- if (nd->path.dentry != nd->path.mnt->mnt_root) {
+ if (old != path->mnt->mnt_root) {
/* rare case of legitimate dget_parent()... */
- nd->path.dentry = dget_parent(nd->path.dentry);
+ path->dentry = dget_parent(path->dentry);
dput(old);
- if (unlikely(!path_connected(&nd->path)))
+ if (unlikely(!path_connected(path)))
return -ENOENT;
+ ret = 1;
break;
}
- if (!follow_up(&nd->path))
+ if (!follow_up(path))
break;
}
- follow_mount(&nd->path);
- nd->inode = nd->path.dentry->d_inode;
- return 0;
+ follow_mount(path);
+ return ret;
+}
+
+static int follow_dotdot(struct nameidata *nd)
+{
+ int ret = path_parent(&nd->root, &nd->path);
+ if (ret >= 0) {
+ ret = 0;
+ nd->inode = nd->path.dentry->d_inode;
+ }
+ return ret;
}
/*
@@ -2374,6 +2386,36 @@ struct dentry *lookup_one_len_unlocked(const char *name,
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
+#ifdef CONFIG_UNIX98_PTYS
+int path_pts(struct path *path)
+{
+ struct dentry *child, *parent = path->dentry;
+ struct qstr this;
+
+ if (!d_can_lookup(parent))
+ return -ENOENT;
+
+ this.name = "pts";
+ this.len = 3;
+ this.hash = full_name_hash(this.name, this.len);
+ if (parent->d_flags & DCACHE_OP_HASH) {
+ int err = parent->d_op->d_hash(parent, &this);
+ if (err < 0)
+ return err;
+ }
+ inode_lock(parent->d_inode);
+ child = d_lookup(parent, &this);
+ inode_unlock(parent->d_inode);
+ if (!child)
+ return -ENOENT;
+
+ path->dentry = child;
+ dput(parent);
+ follow_mount(path);
+ return 0;
+}
+#endif
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
diff --git a/fs/open.c b/fs/open.c
index 17cb6b1dab75..e1ed78fa474b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -679,6 +679,24 @@ int open_check_o_direct(struct file *f)
return 0;
}
+int update_file_path(struct file *filp, struct path *path)
+{
+ /* Only valid during f_op->open, and even in open use very carefully */
+ struct path old;
+ struct inode *inode;
+
+ if (filp->f_mode & FMODE_WRITER)
+ return -EINVAL;
+
+ old = filp->f_path;
+ inode = path->dentry->d_inode;
+ filp->f_path = *path;
+ filp->f_inode = inode;
+ filp->f_mapping = inode->i_mapping;
+ path_put(&old);
+ return 0;
+}
+
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *),
@@ -736,6 +754,7 @@ static int do_dentry_open(struct file *f,
error = open(inode, f);
if (error)
goto cleanup_all;
+ inode = f->f_inode;
}
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(inode);
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index e0ee0b3000b2..260190690674 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -17,6 +17,8 @@
#ifdef CONFIG_UNIX98_PTYS
+struct inode *devpts_ptmx(struct inode *inode, struct file *filp);
+
int devpts_new_index(struct inode *ptmx_inode);
void devpts_kill_index(struct inode *ptmx_inode, int idx);
void devpts_add_ref(struct inode *ptmx_inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 14a97194b34b..045bbfe2ecfc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2272,6 +2272,8 @@ extern struct filename *getname(const char __user *);
extern struct filename *getname_kernel(const char *);
extern void putname(struct filename *name);
+extern int update_file_path(struct file *filp, struct path *path);
+
enum {
FILE_CREATED = 1,
FILE_OPENED = 2
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 77d01700daf7..647d97a4869b 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -45,6 +45,9 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
#define LOOKUP_ROOT 0x2000
#define LOOKUP_EMPTY 0x4000
+extern int path_parent(struct path *root, struct path *path);
+extern int path_pts(struct path *path);
+
extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
--
2.6.3
Just use devpts_remount and by doing so ensuring that ptxmode
actually get propogated to /dev/pts/ptmx on the initial mount
of devpts.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 26 ++++++++++++--------------
1 file changed, 12 insertions(+), 14 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c3d53d2f7c3e..14be886f987c 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -338,12 +338,6 @@ static int mknod_ptmx(struct super_block *sb)
inode_lock(d_inode(root));
- /* If we have already created ptmx node, return */
- if (fsi->ptmx_dentry) {
- rc = 0;
- goto out;
- }
-
dentry = d_alloc_name(root, "ptmx");
if (!dentry) {
pr_err("Unable to alloc dentry for ptmx node\n");
@@ -552,16 +546,20 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
if (error)
goto out_undo_sget;
- s->s_flags |= MS_ACTIVE;
- }
- error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
- if (error)
- goto out_undo_sget;
+ error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
+ if (error)
+ goto out_undo_sget;
+
+ error = mknod_ptmx(s);
+ if (error)
+ goto out_undo_sget;
- error = mknod_ptmx(s);
- if (error)
- goto out_undo_sget;
+ s->s_flags |= MS_ACTIVE;
+ } else {
+ /* Match mount_single ignore errors on remount */
+ devpts_remount(s, &flags, data);
+ }
return dget(s->s_root);
--
2.6.3
The default mode is 0666 on /dev/ptmx so we really don't gain anything
except a bunch of hassle by not having the default mode on
/dev/pts/ptmx also being 0666.
Leave the default of ptmxmode at 0000 on the system mount of devpts to
avoid the off chance that it would open an security hole on existing
setups that change the permissions of devpts.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
drivers/tty/tty_io.c | 5 +++--
fs/devpts/inode.c | 11 +++--------
include/linux/devpts_fs.h | 3 +++
3 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 8d26ed79bb4c..438b2209ea41 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3597,9 +3597,10 @@ static char *tty_devnode(struct device *dev, umode_t *mode)
{
if (!mode)
return NULL;
- if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) ||
- dev->devt == MKDEV(TTYAUX_MAJOR, 2))
+ if (dev->devt == MKDEV(TTYAUX_MAJOR, 0))
*mode = 0666;
+ if (dev->devt == MKDEV(TTYAUX_MAJOR, PTMX_MINOR))
+ *mode = DEVPTS_DEFAULT_PTMX_MODE;
return NULL;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index cb0cc4e33c3f..80c78bb472a9 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -30,14 +30,6 @@
#include <linux/seq_file.h>
#define DEVPTS_DEFAULT_MODE 0600
-/*
- * ptmx is a new node in /dev/pts and will be unused in legacy (single-
- * instance) mode. To prevent surprises in user space, set permissions of
- * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
- * permissions.
- */
-#define DEVPTS_DEFAULT_PTMX_MODE 0000
-#define PTMX_MINOR 2
/*
* sysctl support for setting limits on the number of Unix98 ptys allocated.
@@ -260,6 +252,9 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
opts->mode = DEVPTS_DEFAULT_MODE;
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
opts->max = NR_UNIX98_PTY_MAX;
+ if (!devpts_mnt ||
+ (&DEVPTS_SB(devpts_mnt->mnt_sb)->mount_opts == opts))
+ opts->ptmxmode = 0;
while ((p = strsep(&data, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index 260190690674..80908db52667 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -15,6 +15,9 @@
#include <linux/errno.h>
+#define DEVPTS_DEFAULT_PTMX_MODE 0666
+#define PTMX_MINOR 2
+
#ifdef CONFIG_UNIX98_PTYS
struct inode *devpts_ptmx(struct inode *inode, struct file *filp);
--
2.6.3
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 46d633ab16c7..266a7b7501a6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -597,7 +597,8 @@ static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
- ida_destroy(&fsi->allocated_ptys);
+ if (fsi)
+ ida_destroy(&fsi->allocated_ptys);
kfree(fsi);
kill_litter_super(sb);
}
--
2.6.3
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 80c78bb472a9..46d633ab16c7 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -462,6 +462,7 @@ static int
devpts_fill_super(struct super_block *s, void *data, int silent)
{
struct inode *inode;
+ int error;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
@@ -469,10 +470,16 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &devpts_sops;
s->s_time_gran = 1;
+ error = -ENOMEM;
s->s_fs_info = new_pts_fs_info();
if (!s->s_fs_info)
goto fail;
+ error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
+ if (error)
+ goto fail;
+
+ error = -ENOMEM;
inode = new_inode(s);
if (!inode)
goto fail;
@@ -490,7 +497,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
pr_err("get root dentry failed\n");
fail:
- return -ENOMEM;
+ return error;
}
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
@@ -557,10 +564,6 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
if (error)
goto out_undo_sget;
- error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
- if (error)
- goto out_undo_sget;
-
error = mknod_ptmx(s);
if (error)
goto out_undo_sget;
--
2.6.3
Retain the code that was previously enabled with
DEVPTS_MULTIPLE_INSTANCES and remove the config option, and kill the
little bit of code that existed only when DEVPTS_MULTIPLE_INSTANCES
was not selected. With the recently updated semantics userspace
actively depends on having multiple instances of devpts for correct
operation.
Having each mount of devpts return a distinct instance ensures that
user space will not accidentally stomp gid or mode devpts options, of
the primary system devpts, by mounting devpts in a chroot environment.
A guarantee that userspace will not stomp attributes of system devpts
removes the need for a setuid root pt_chown executable.
Running a userspace without DEVPTS_MULTIPLE_INSTANCES that has depends
on the current behavior and has removed a setuid root pt_chown
exectuable will allow the system devpts gid and mode options to be
stomped breaking userspace. Which makes the ability to disable the
code previously selected by DEVPTS_MULTIPLE_INSTANCES actively wrong.
The size increase by always using the DEVPTS_MULTIPLE_INSTANCE
path is minimal, and the code is much easier to maintain and
use without having two different code paths to worry about.
The documentation has been updated to relfect this change.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
Documentation/filesystems/devpts.txt | 122 ++++++++++++++---------------------
drivers/tty/Kconfig | 11 ----
fs/devpts/inode.c | 41 ------------
3 files changed, 47 insertions(+), 127 deletions(-)
diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt
index 30d2fcb32f72..984b645ac341 100644
--- a/Documentation/filesystems/devpts.txt
+++ b/Documentation/filesystems/devpts.txt
@@ -1,38 +1,32 @@
-To support containers, we now allow multiple instances of devpts filesystem,
-such that indices of ptys allocated in one instance are independent of indices
-allocated in other instances of devpts.
-
-To preserve backward compatibility, this support for multiple instances is
-enabled only if:
-
- - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and
- - '-o newinstance' mount option is specified while mounting devpts
-
-IOW, devpts now supports both single-instance and multi-instance semantics.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and
-this referred to as the "legacy" mode. In this mode, the new mount options
-(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message
-on console.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the
-'newinstance' option (as in current start-up scripts) the new mount binds
-to the initial kernel mount of devpts. This mode is referred to as the
-'single-instance' mode and the current, single-instance semantics are
-preserved, i.e PTYs are common across the system.
-
-The only difference between this single-instance mode and the legacy mode
-is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which
-can safely be ignored.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified,
-the mount is considered to be in the multi-instance mode and a new instance
-of the devpts fs is created. Any ptys created in this instance are independent
-of ptys in other instances of devpts. Like in the single-instance mode, the
-/dev/pts/ptmx node is present. To effectively use the multi-instance mode,
-open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or
-bind-mount.
+Each mount of the devpts filesystem is now a distinct instance from
+all other mounts of devpts. Mount options and indicies of ptys
+allocated in one instance are independent of indices allocated in
+other instances of devpts.
+
+If devpts is mounted without the 'newinstance' option (as in current
+start-up scripts) and if the initial kernel mount of devpts has not
+been exported to userspace the new mount binds to the initial kernel
+mount of devpts.
+
+If devpts is mounted with the 'newinstance' option or the initial
+internal mount of devpts has already been mounted a new instance of
+the devpts filesystem is created. Any ptys created in this instance
+are independent of the ptys created in other instances of devpts, and
+the initial permissions of those ptys are independent from the initial
+permissions from any other instance of the devpts filesystem.
+
+Ideally people will make use of the /dev/pts/ptmx device node to
+create ptys on the devpts filesystem. This can be done by updating
+userspace, bind mounting /dev/pts/ptmx onto /dev/ptmx or making
+/dev/ptmx a symlink to /dev/pts/ptmx.
+
+To be seemlessly backwards compatible an open of /dev/ptmx will look
+to see if the name pts in the same directory is the root directory of
+a devpts filesystem. If that is the case and it is not the initial
+instance of devpts /dev/pts/ptmx is opened. Otherwise the initial
+instance of devpts is opened. In older kernels /dev/ptmx did not
+perform this redirection.
Eg: A container startup script could do the following:
@@ -60,44 +54,34 @@ Per-instance limit could be set by adding mount option "max=<count>".
This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve.
In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit.
-User-space changes
-------------------
+What user-space needs to do
+----------------------------
-In multi-instance mode (i.e '-o newinstance' mount option is specified at least
-once), following user-space issues should be noted.
+1. If the devpts filesystem is only mounted once /dev/pts/ptmx can be
+ ignored and no change is needed to system-startup scripts.
-1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored
- and no change is needed to system-startup scripts.
+2. For best results userspace libraries and applications should be updated
+ to try opening /dev/pts/ptmx before /dev/ptmx, as /dev/pts/ptmx is less
+ ambiguous and higher performance.
-2. To effectively use multi-instance mode (i.e -o newinstance is specified)
- administrators or startup scripts should "redirect" open of /dev/ptmx to
- /dev/pts/ptmx using either a bind mount or symlink.
+3. To effectively use a new instance of devpts open of /dev/ptmx should
+ be redirected to /dev/pts/ptmx using either a bind mount or symlink.
$ mount -t devpts -o newinstance devpts /dev/pts
followed by either
- $ rm /dev/ptmx
$ ln -s pts/ptmx /dev/ptmx
$ chmod 666 /dev/pts/ptmx
or
$ mount -o bind /dev/pts/ptmx /dev/ptmx
-3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it
- enables better error-reporting and treats both single-instance and
- multi-instance mounts similarly.
-
- But this method requires that system-startup scripts set the mode of
- /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the
- mode by, either
+4. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it
+ enables better error-reporting and treats all cases the same.
- - adding ptmxmode mount option to devpts entry in /etc/fstab, or
- - using 'chmod 0666 /dev/pts/ptmx'
-
-4. If multi-instance mode mount is needed for containers, but the system
- startup scripts have not yet been updated, container-startup scripts
- should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single-
- instance mounts.
+5. If the system startup scripts do not create /dev/ptmx as a symlink,
+ container-startup scripts should bind mount /dev/ptmx to /dev/pts/ptmx
+ to avoid breaking the rest of the system.
Or, in general, container-startup scripts should use:
@@ -106,11 +90,9 @@ once), following user-space issues should be noted.
mount -o bind /dev/pts/ptmx /dev/ptmx
fi
- When all devpts mounts are multi-instance, /dev/ptmx can permanently be
- a symlink to pts/ptmx and the bind mount can be ignored.
-
-5. A multi-instance mount that is not accompanied by the /dev/ptmx to
- /dev/pts/ptmx redirection would result in an unusable/unreachable pty.
+6. A mount of devpts that is not accompanied by the /dev/ptmx to
+ /dev/pts/ptmx redirection will result in an unusable/unreachable pty,
+ on older kernels.
mount -t devpts -o newinstance lxcpts /dev/pts
@@ -121,21 +103,11 @@ once), following user-space issues should be noted.
would create a pty, say /dev/pts/7, in the initial kernel mount.
But /dev/pts/7 would be invisible in the new mount.
-6. The permissions for /dev/pts/ptmx node should be specified when mounting
- /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000).
+7. The permissions for /dev/pts/ptmx node should be specified when mounting
+ /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0666 was 0000).
mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts
The permissions can be later be changed as usual with 'chmod'.
chmod 666 /dev/pts/ptmx
-
-7. A mount of devpts without the 'newinstance' option results in binding to
- initial kernel mount. This behavior while preserving legacy semantics,
- does not provide strict isolation in a container environment. i.e by
- mounting devpts without the 'newinstance' option, a container could
- get visibility into the 'host' or root container's devpts.
-
- To workaround this and have strict isolation, all mounts of devpts,
- including the mount in the root container, should use the newinstance
- option.
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index 82c4d2e45319..95103054c0e4 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -120,17 +120,6 @@ config UNIX98_PTYS
All modern Linux systems use the Unix98 ptys. Say Y unless
you're on an embedded system and want to conserve memory.
-config DEVPTS_MULTIPLE_INSTANCES
- bool "Support multiple instances of devpts"
- depends on UNIX98_PTYS
- default n
- ---help---
- Enable support for multiple instances of devpts filesystem.
- If you want to have isolated PTY namespaces (eg: in containers),
- say Y here. Otherwise, say N. If enabled, each mount of devpts
- filesystem with the '-o newinstance' option will create an
- independent PTY namespace.
-
config LEGACY_PTYS
bool "Legacy (BSD) PTY support"
default y
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f86fae8dac0b..9bed39d89cb3 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -109,11 +109,9 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_mode, "mode=%o"},
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
{Opt_ptmxmode, "ptmxmode=%o"},
{Opt_newinstance, "newinstance"},
{Opt_max, "max=%d"},
-#endif
{Opt_err, NULL}
};
@@ -128,7 +126,6 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
return sb->s_fs_info;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
static int devpts_path_ptmx(struct file *filp)
{
struct pts_fs_info *fsi;
@@ -186,12 +183,6 @@ fail:
path_put(&path);
return err;
}
-#else
-static inline int devpts_path_ptmx(struct file *filp)
-{
- return -ENOENT;
-}
-#endif
struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
{
@@ -210,10 +201,8 @@ struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
static inline struct super_block *pts_sb_from_inode(struct inode *inode)
{
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
return inode->i_sb;
-#endif
if (!devpts_mnt)
return NULL;
return devpts_mnt->mnt_sb;
@@ -289,7 +278,6 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
return -EINVAL;
opts->mode = option & S_IALLUGO;
break;
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
case Opt_ptmxmode:
if (match_octal(&args[0], &option))
return -EINVAL;
@@ -303,7 +291,6 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
return -EINVAL;
opts->max = option;
break;
-#endif
default:
pr_err("called with bogus options\n");
return -EINVAL;
@@ -313,7 +300,6 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
return 0;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
static int mknod_ptmx(struct super_block *sb)
{
int mode;
@@ -374,16 +360,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
}
}
-#else
-static inline void update_ptmx_mode(struct pts_fs_info *fsi)
-{
- return;
-}
-static inline int mknod_ptmx(struct super_block *sb)
-{
- return 0;
-}
-#endif
static int devpts_remount(struct super_block *sb, int *flags, char *data)
{
@@ -417,11 +393,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, opts->gid));
seq_printf(seq, ",mode=%03o", opts->mode);
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
if (opts->max < NR_UNIX98_PTY_MAX)
seq_printf(seq, ",max=%d", opts->max);
-#endif
return 0;
}
@@ -512,7 +486,6 @@ fail:
return error;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
/*
* devpts_mount()
*
@@ -565,18 +538,6 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
return root;
}
-#else
-/*
- * This supports only the legacy single-instance semantics (no
- * multiple-instance semantics)
- */
-static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
-{
- return mount_single(fs_type, flags, data, devpts_fill_super);
-}
-#endif
-
static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
@@ -591,9 +552,7 @@ static struct file_system_type devpts_fs_type = {
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
.fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
-#endif
};
/*
--
2.6.3
Update the vfs with a may_overmount superblock operation, allowing
devpts to fail early if the primary mount of devpts is going to be
mounted on top of itself.
This change is in preparation for each mount of devpts being distinct
from every other mount of devpts. To maintain a backward compatible
notion of a primary mount of devpts we need overmounts of the mount to
fail (as they do now), which requires a little vfs support so the case
can be detected.
Cause failed over mounts of devpts to go through the devpts remount
path. This already happens as overmounts have previously been detected
late, and it looks like CentOS 6 may actually depend on this behavior
to allow changing devpts mount options by placing them in /etc/fstab.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 15 +++++++++++++++
fs/namespace.c | 8 ++++++++
include/linux/fs.h | 1 +
3 files changed, 24 insertions(+)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14be886f987c..cb0cc4e33c3f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -427,10 +427,25 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+bool devpts_may_overmount(struct super_block *sb,
+ int flags, const char *dev_name, void *data)
+{
+ if ((sb == devpts_mnt->mnt_sb) &&
+ (current_user_ns() == &init_user_ns) &&
+ !parse_newinstance(data)) {
+ down_write(&sb->s_umount);
+ devpts_remount(sb, &flags, data);
+ up_write(&sb->s_umount);
+ return false;
+ }
+ return true;
+}
+
static const struct super_operations devpts_sops = {
.statfs = simple_statfs,
.remount_fs = devpts_remount,
.show_options = devpts_show_options,
+ .may_overmount = devpts_may_overmount,
};
static void *new_pts_fs_info(void)
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fb1691b4355..90bbbabfe3c9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2386,6 +2386,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
{
struct file_system_type *type;
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+ struct super_block *path_sb;
struct vfsmount *mnt;
int err;
@@ -2414,6 +2415,13 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
}
}
+ path_sb = path->mnt->mnt_sb;
+ if ((path_sb->s_type == type) &&
+ (path->mnt->mnt_root == path->dentry) &&
+ path_sb->s_op->may_overmount &&
+ !path_sb->s_op->may_overmount(path_sb, flags, name, data))
+ return -EBUSY;
+
mnt = vfs_kern_mount(type, flags, name, data);
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 045bbfe2ecfc..02a980bfad5c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1755,6 +1755,7 @@ struct super_operations {
struct shrink_control *);
long (*free_cached_objects)(struct super_block *,
struct shrink_control *);
+ bool (*may_overmount)(struct super_block *, int, const char *, void *);
};
/*
--
2.6.3
Add a dedicated parsing routing for newinstance that does not modify
data, so parsing out newinstance can be separate from the parsing of
the other mount options.
Allways pass as data to parse_mount_options the filesystem specific
portion of the super_block that holds the mount options.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 44 ++++++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 9f22c959d1f7..c3d53d2f7c3e 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -105,7 +105,6 @@ struct pts_mount_opts {
kgid_t gid;
umode_t mode;
umode_t ptmxmode;
- int newinstance;
int max;
};
@@ -228,19 +227,27 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
return devpts_mnt->mnt_sb;
}
-#define PARSE_MOUNT 0
-#define PARSE_REMOUNT 1
+static bool parse_newinstance(const char *data)
+{
+ while (data) {
+ const char *p = strchr(data, ',');
+ size_t len = p ? p - data : strlen(data);
+ if ((len == 11) && (memcmp(data, "newinstance", 11) == 0)) {
+ return true;
+ }
+ data = p ? p + 1 : NULL;
+ }
+ return false;
+}
/*
* parse_mount_options():
* Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value. The exception is
- * 'newinstance' option which can only be set/cleared on a mount (i.e.
- * cannot be changed during remount).
+ * specified in @data, set it to its default value.
*
* Note: @data may be NULL (in which case all options are set to default).
*/
-static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
+static int parse_mount_options(char *data, struct pts_mount_opts *opts)
{
char *p;
kuid_t uid;
@@ -254,10 +261,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
opts->max = NR_UNIX98_PTY_MAX;
- /* newinstance makes sense only on initial mount */
- if (op == PARSE_MOUNT)
- opts->newinstance = 0;
-
while ((p = strsep(&data, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
int token;
@@ -298,9 +301,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
opts->ptmxmode = option & S_IALLUGO;
break;
case Opt_newinstance:
- /* newinstance makes sense only on initial mount */
- if (op == PARSE_MOUNT)
- opts->newinstance = 1;
break;
case Opt_max:
if (match_int(&args[0], &option) ||
@@ -399,7 +399,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
struct pts_mount_opts *opts = &fsi->mount_opts;
sync_filesystem(sb);
- err = parse_mount_options(data, PARSE_REMOUNT, opts);
+ err = parse_mount_options(data, opts);
/*
* parse_mount_options() restores options to default values
@@ -528,20 +528,18 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
int error;
- struct pts_mount_opts opts;
struct super_block *s;
+ bool newinstance;
- error = parse_mount_options(data, PARSE_MOUNT, &opts);
- if (error)
- return ERR_PTR(error);
+ newinstance = parse_newinstance(data);
/* Require newinstance for all user namespace mounts to ensure
* the mount options are not changed.
*/
- if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
+ if ((current_user_ns() != &init_user_ns) && !newinstance)
return ERR_PTR(-EINVAL);
- if (opts.newinstance)
+ if (newinstance)
s = sget(fs_type, NULL, set_anon_super, flags, NULL);
else
s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
@@ -557,7 +555,9 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
s->s_flags |= MS_ACTIVE;
}
- memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
+ error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
+ if (error)
+ goto out_undo_sget;
error = mknod_ptmx(s);
if (error)
--
2.6.3
This makes the logic of the test clearer, and removes a confusing
use of opts.newinstance, which allows later changes not to worry
if newinstance is set on the primary mount of devpts or not.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c14d51795577..9f22c959d1f7 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -622,7 +622,7 @@ retry:
mutex_lock(&allocated_ptys_lock);
if (pty_count >= pty_limit -
- (fsi->mount_opts.newinstance ? pty_reserve : 0)) {
+ ((devpts_mnt->mnt_sb == sb) ? pty_reserve : 0)) {
mutex_unlock(&allocated_ptys_lock);
return -ENOSPC;
}
--
2.6.3
The code makes more sense here and things are just clearer.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 22 +++++++++++++++-------
1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 266a7b7501a6..5356a68863b8 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -379,6 +379,10 @@ static inline void update_ptmx_mode(struct pts_fs_info *fsi)
{
return;
}
+static inline int mknod_ptmx(struct super_block *sb)
+{
+ return 0;
+}
#endif
static int devpts_remount(struct super_block *sb, int *flags, char *data)
@@ -491,11 +495,19 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
set_nlink(inode, 2);
s->s_root = d_make_root(inode);
- if (s->s_root)
- return 0;
+ if (!s->s_root) {
+ pr_err("get root dentry failed\n");
+ goto fail;
+ }
- pr_err("get root dentry failed\n");
+ error = mknod_ptmx(s);
+ if (error)
+ goto fail_dput;
+ return 0;
+fail_dput:
+ dput(s->s_root);
+ s->s_root = NULL;
fail:
return error;
}
@@ -564,10 +576,6 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
if (error)
goto out_undo_sget;
- error = mknod_ptmx(s);
- if (error)
- goto out_undo_sget;
-
s->s_flags |= MS_ACTIVE;
} else {
/* Match mount_single ignore errors on remount */
--
2.6.3
When devpts is mounted and the newinstance flag is not passed the code
first checks to see if the system devpts instance has been exported to
userspace. If it has not the system devpts instance is returned
otherwise a fresh instance of devpts is allocated and returned.
If newinstance is passed a fresh devpts instance is always returned.
Combined with the earlier work to cause mounts of devpts to fail
if devpts is mounted over itself, this ensures that the system devpts
is mounted on /dev/pts on all of the systems I have tested.
This has been verified to work properly on openwrt-15.05, centos5,
centos6, centos7, debian-6.0.2, debian-7.9, debian-8.2, ubuntu-14.04.3,
ubuntu-15.10, fedora23, magia-5, mint-17.3, opensuse-42.1, slackware-14.1,
gentoo-20151225 (13.0?), archlinux-2015-12-01
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 33 ++++++++++++---------------------
1 file changed, 12 insertions(+), 21 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 7b4fe0d4018d..f86fae8dac0b 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -513,13 +513,6 @@ fail:
}
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-static int compare_init_pts_sb(struct super_block *s, void *p)
-{
- if (devpts_mnt)
- return devpts_mnt->mnt_sb == s;
- return 0;
-}
-
/*
* devpts_mount()
*
@@ -550,28 +543,26 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- struct super_block *s;
+ struct dentry *root;
bool newinstance;
newinstance = parse_newinstance(data);
+ if (flags & MS_KERNMOUNT)
+ newinstance = true;
- /* Require newinstance for all user namespace mounts to ensure
+ /* Force newinstance for all user namespace mounts to ensure
* the mount options are not changed.
*/
- if ((current_user_ns() != &init_user_ns) && !newinstance)
- return ERR_PTR(-EINVAL);
-
- if (newinstance)
- return mount_nodev(fs_type, flags, data, devpts_fill_super);
-
- s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, NULL);
- if (IS_ERR(s))
- return ERR_CAST(s);
+ if (current_user_ns() != &init_user_ns)
+ newinstance = true;
- /* Match mount_single ignore errors on remount */
- devpts_remount(s, &flags, data);
+ root = NULL;
+ if (!newinstance)
+ root = mount_super_once(devpts_mnt->mnt_sb, flags, data);
+ if (IS_ERR_OR_NULL(root))
+ root = mount_nodev(fs_type, flags, data, devpts_fill_super);
- return dget(s->s_root);
+ return root;
}
#else
--
2.6.3
The devpts filesystem has a notion of a system or primary instance of
devpts. To retain the notion of a primary system instance of devpts
the code needs a way to allow userspace to mount the internally
mounted instance of devpts when it is not currently mounted by
userspace. The new helper mount_super_once allows that.
Ideally mount_super_once would ignore still referenced lazy unmounts,
but in testing I was not able to find an existing distribution that
cared. Since no one actually cares I did not try and solve the
formidable challenges of adding a test to ignore still referenced lazy
mounts in a race free way.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/super.c | 12 ++++++++++++
include/linux/fs.h | 2 ++
2 files changed, 14 insertions(+)
diff --git a/fs/super.c b/fs/super.c
index 74914b1bae70..98a569412036 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1101,6 +1101,18 @@ struct dentry *mount_single(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_single);
+struct dentry *mount_super_once(struct super_block *sb, int flags, void *data)
+{
+ /* Allow mounting the specified superblock by userspace if there
+ * are not any existing userspace mounts of it.
+ */
+ if (atomic_cmpxchg(&sb->s_active, 1, 2) != 1)
+ return ERR_PTR(-EBUSY);
+ down_write(&sb->s_umount);
+ do_remount_sb(sb, flags, data, 0);
+ return dget(sb->s_root);
+}
+
struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 02a980bfad5c..515f874a6907 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2002,6 +2002,8 @@ extern struct dentry *mount_single(struct file_system_type *fs_type,
extern struct dentry *mount_nodev(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int));
+extern struct dentry *mount_super_once(struct super_block *sb,
+ int flags, void *data);
extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
--
2.6.3
Now that all of the work of setting up a superblock has been moved to
devpts_fill_super simplify devpts_mount by calling mount_nodev instead
of rolling mount_nodev by hand.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 23 ++++-------------------
1 file changed, 4 insertions(+), 19 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5356a68863b8..7b4fe0d4018d 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -550,7 +550,6 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- int error;
struct super_block *s;
bool newinstance;
@@ -563,30 +562,16 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
return ERR_PTR(-EINVAL);
if (newinstance)
- s = sget(fs_type, NULL, set_anon_super, flags, NULL);
- else
- s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
- NULL);
+ return mount_nodev(fs_type, flags, data, devpts_fill_super);
+ s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
- if (!s->s_root) {
- error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
- if (error)
- goto out_undo_sget;
-
- s->s_flags |= MS_ACTIVE;
- } else {
- /* Match mount_single ignore errors on remount */
- devpts_remount(s, &flags, data);
- }
+ /* Match mount_single ignore errors on remount */
+ devpts_remount(s, &flags, data);
return dget(s->s_root);
-
-out_undo_sget:
- deactivate_locked_super(s);
- return ERR_PTR(error);
}
#else
--
2.6.3
On Mon, Apr 04, 2016 at 08:29:17PM -0500, Eric W. Biederman wrote:
> +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
> +static int devpts_path_ptmx(struct file *filp)
> +{
> + struct pts_fs_info *fsi;
> + struct path root, path;
> + struct dentry *old;
> + int err = -ENOENT;
> + int ret;
> +
> + /* Can the pts filesystem be found with a path walk? */
> + path = filp->f_path;
> + path_get(&path);
> + get_fs_root(current->fs, &root);
> + ret = path_parent(&root, &path);
> + path_put(&root);
> + if (ret != 1)
> + goto fail;
That, I take it, is a lookup for .. and buggering off if it fails *or* if
we had been in caller's root or something that overmount it? Not that the
latter had been possible - root is a directory and can be overmounted only
by another such, and we are called from ->open() of a device node.
> + /* Remember the result of this permission check for later */
> + ret = inode_permission(path.dentry->d_inode, MAY_EXEC);
> + if (path_pts(&path))
> + goto fail;
Egads, man - you've just introduced a special function for looking up
something named "pts" in a given directory!
The reason not to use kern_path() would be what, the fact that it doesn't
allow starting at given location? So let's make a variant that would - and
rather than bothering with RCU, just go for something like (completely
untested)
/* on success overwrite *path with the result of walk; do _not_ drop the
reference to old contents - let the caller arrange that */
int kern_path_relative(struct path *path, const char *s, int flags)
{
int err;
struct nameidata nd = {.path = *path};
struct filename *name;
if (!*s || *s == '/' || flags & (LOOKUP_ROOT | LOOKUP_RCU))
return -EINVAL;
name = getname_kernel(s);
if (IS_ERR(name))
return PTR_ERR(name);
set_nameidata(&nd, AT_FDCWD, name);
nd.last_type = LAST_ROOT;
nd.flags = flags | LOOKUP_REVAL | LOOKUP_JUMPED | LOOKUP_PARENT;
nd.m_seq = read_seqbegin(&mount_lock);
path_get(&nd.path);
nd.inode = nd.path.dentry->d_inode;
while (!(err = link_path_walk(s, &nd))
&& ((err = lookup_last(&nd)) > 0)) {
s = trailing_symlink(&nd);
if (IS_ERR(s)) {
err = PTR_ERR(s);
break;
}
}
if (!err)
err = complete_walk(&nd);
if (!err && flags & LOOKUP_DIRECTORY)
if (!d_can_lookup(nd.path.dentry))
err = -ENOTDIR;
if (!err) {
*path = nd.path;
nd.path.mnt = NULL;
nd.path.dentry = NULL;
}
terminate_walk(&nd);
restore_nameidata();
putname(name);
return err;
}
and use it as
path = filp->f_path;
err = kern_path_relative(&path, "../pts", LOOKUP_DIRECTORY);
if (err)
return err;
/* from here on we need to path_put() it */
if (path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC)
goto fail;
/* must be its root; no other directories on that puppy */
> + fsi = DEVPTS_SB(path.mnt->mnt_sb);
> +
> + /* Get out if the path walk resulted in the default devpts instance */
> + if (devpts_mnt->mnt_sb == path.mnt->mnt_sb)
> + goto fail;
> +
> + /* Don't allow bypassing the existing /dev/pts/ptmx permission check */
err = inode_permission(path.dentry->d_inode, MAY_EXEC);
if (err)
goto fail;
err = inode_permission(fsi->ptmx_dentry->d_inode,
ACC_MODE(filp->f_flags));
if (err)
goto fail;
> + /* Advance path to the ptmx dentry */
> + old = path.dentry;
> + path.dentry = dget(fsi->ptmx_dentry);
> + dput(old);
> +
> + /* Make it look like /dev/pts/ptmx was opened */
> + err = update_file_path(filp, &path);
> + if (err)
> + goto fail;
> +
> + return 0;
> +fail:
> + path_put(&path);
> + return err;
> +}
> +#else
> +static inline int devpts_path_ptmx(struct file *filp)
> +{
> + return -ENOENT;
> +}
> +#endif
> +
> +struct inode *devpts_ptmx(struct inode *inode, struct file *filp)
> +{
> + int err;
> + if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
> + return inode;
> +
> + err = devpts_path_ptmx(filp);
> + if (err == 0)
> + return filp->f_inode;
> + if (err != -ENOENT)
> + return ERR_PTR(err);
> +
> + return inode;
> +}
Umm... I'm not sure it makes for good calling conventions - the caller can
do inode = file_inode(filp) just as well, so why not simply return 0 or -E...?
"return inode;" cases become simply return 0...
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -1415,29 +1415,41 @@ static void follow_mount(struct path *path)
> }
> }
>
> -static int follow_dotdot(struct nameidata *nd)
> +int path_parent(struct path *root, struct path *path)
Please, don't.
> +#ifdef CONFIG_UNIX98_PTYS
> +int path_pts(struct path *path)
Fuck, no.
> index 17cb6b1dab75..e1ed78fa474b 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -679,6 +679,24 @@ int open_check_o_direct(struct file *f)
> return 0;
> }
>
> +int update_file_path(struct file *filp, struct path *path)
> +{
> + /* Only valid during f_op->open, and even in open use very carefully */
> + struct path old;
> + struct inode *inode;
> +
> + if (filp->f_mode & FMODE_WRITER)
> + return -EINVAL;
That really needs to be commented.
> + old = filp->f_path;
> + inode = path->dentry->d_inode;
> + filp->f_path = *path;
> + filp->f_inode = inode;
> + filp->f_mapping = inode->i_mapping;
> + path_put(&old);
> + return 0;
> +}
> +
> static int do_dentry_open(struct file *f,
> struct inode *inode,
> int (*open)(struct inode *, struct file *),
> @@ -736,6 +754,7 @@ static int do_dentry_open(struct file *f,
> error = open(inode, f);
> if (error)
> goto cleanup_all;
> + inode = f->f_inode;
> }
> if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
> i_readcount_inc(inode);
BTW, have you looked through the callers of dentry_open()? It can hit that
case as well...
On Tue, Apr 05, 2016 at 03:54:25AM +0100, Al Viro wrote:
> That, I take it, is a lookup for .. and buggering off if it fails *or* if
> we had been in caller's root or something that overmount it? Not that the
> latter had been possible - root is a directory and can be overmounted only
> by another such, and we are called from ->open() of a device node.
>
> > + /* Remember the result of this permission check for later */
> > + ret = inode_permission(path.dentry->d_inode, MAY_EXEC);
> > + if (path_pts(&path))
> > + goto fail;
>
> Egads, man - you've just introduced a special function for looking up
> something named "pts" in a given directory!
>
> The reason not to use kern_path() would be what, the fact that it doesn't
> allow starting at given location? So let's make a variant that would - and
> rather than bothering with RCU, just go for something like (completely
> untested)
Ah... Right, that would demand exec permissions on the starting point.
Still, this is incredibly ugly ;-/ I'll try to come up with something
more tolerable, but this "path_pts" thing is too ugly to live. Seriously.
On Mon, Apr 4, 2016 at 6:29 PM, Eric W. Biederman <[email protected]> wrote:
>
> In practice I expect the permission checks are a non-issue as the
> permissions on /dev/ptmx and /dev/pts/ptmx are always 0666.
So I think this is still entirely wrongheaded, and thinking about the
problem the wrong way around.
The issue is *not* that the "permissions on /dev/ptmx and
/dev/pts/ptmx are always 0666". Not at all.
The permissions of /dev/ptmx and /dev/pts/ptmx are simply *irrelevant*.
We're not interested in opening /dev/ptmx. We are interested in
looking up *which* ptmx that pty is associated with.
Those are two totally different issues.
We never opened /dev./ptmx before either, and we never ever cared
about the permiossions of it. We just hardcoded which superblock we
were using, regardless of those permissions.
We should basically continue to do the exact same thing. We don't care
about the permissions of the ptmx entry, and we're not even interested
in opening it (it's sufficient to just find the "pts" subdirectory),
we are _purely_ asking "which superblock/mount am I associated with".
In other words, we *could* do this by doing some insane parsing of
/proc/mounts, but that would be stupid.
My point is, talking about permissions of these nodes is _wrong_. It's
actively misleading. It is exactly the wrong thing to do, because it
confuses people into thinking that we somehow care, and that we
somehow open the new node. We don't. We're opening the *old* pathname,
the one whose permissions we already checked when we walked it, and
we're just looking up the pts directory so that we don't hardcode
which set of pty's we're talking about.
So I think the part of the patch where you check permissions is wrong.
I think the part of the commit message where you talk about this is
confused.
You should make this about looking up the superblock, and explicitly
talk about how this is *not* about permissions.
So get rid of all the pointless "inode_permission()" crap. We already
checked that by virtue of us opening "/dev/ptmx". THOSE permissions
matter, but they were already done. Now we're just saying "ok, the
user has a right to open the ptmx node, now _which_ devpts is that
ptmx node for?"
So also get rid of this:
+ /* Advance path to the ptmx dentry */
+ old = path.dentry;
+ path.dentry = dget(fsi->ptmx_dentry);
+ dput(old);
entirely. It's wrong. It's entirely pointless. We don't even care
about "what does pts/ptmx point to". We care about "which superblock
do we get when we look up the "pts/" subdirectory in the dentry cache
for this user (without permissions)"/
So get rid of all the pathname games. Just save the superblock pointer
in file->f_private or somewhere like that, and make it really clear
that what we are doing is making "/dev/ptmx" work sanely! The user is
not looking up "/dev/pts/ptmx". They are looking up "/dev/ptmx".
See the difference?
Linus
Linus Torvalds <[email protected]> writes:
> On Mon, Apr 4, 2016 at 6:29 PM, Eric W. Biederman <[email protected]> wrote:
>>
>> In practice I expect the permission checks are a non-issue as the
>> permissions on /dev/ptmx and /dev/pts/ptmx are always 0666.
>
> So I think this is still entirely wrongheaded, and thinking about the
> problem the wrong way around.
No. You are missing my concern.
My concern is that I suspect someone somewhere has created a chroot
environment. That chroot environment has devpts mounted with
"-o newinstance" and has set the permissions of /dev/pts/ptmx such
that only users in that container can create ptys on that instance
of devpts.
Being a mischevious user outside the container I can create a new user
namespace and a new mount namespace and bind mount our new and improved
version of /dev/ptmx right next to the chroot's /dev/pts mount.
Then because the permissions on /dev/ptmx are different than on the
chroots /dev/pts/ptmx I can create ptys that I could not have before
hand. Bypassing the existing permissions.
Given that concern under the rule we don't break userspace we have to
check the permissions of /dev/pts/ptmx when we are creating a new pty,
on a instance of devpts that was created with newinstance.
Short of saying we simply don't care about such users I don't see a way
we can allow bypassing the existing permission check.
Now I do think we can remove the permission check altogether. At this
point POSIX does not even require the existince of any files or device
nodes, and FreeBSD proves out that users of ptys don't care by
implementing a dedicated system call to create ptys that does contain
any permission checks. So only the small set of linux specific
chroot/container creating applications might care. As the permissions
were not in any way the focus of this patchset I choose not to tackle
a possible user visible change like this.
>
> So get rid of all the pointless "inode_permission()" crap. We already
> checked that by virtue of us opening "/dev/ptmx". THOSE permissions
> matter, but they were already done. Now we're just saying "ok, the
> user has a right to open the ptmx node, now _which_ devpts is that
> ptmx node for?"
I wish I could in conscience do that. But unless we decide that
permission are irrelevant we are adding a permission bypass for an
existing operation. Typically that is called a security bug. I am not
comfortable doing that unless we simply decide we don't care.
If we decide we don't care I will add a patch at the front of the
patchset that implements don't care before all of the rest of this.
> So also get rid of this:
>
> + /* Advance path to the ptmx dentry */
> + old = path.dentry;
> + path.dentry = dget(fsi->ptmx_dentry);
> + dput(old);
>
> entirely. It's wrong. It's entirely pointless. We don't even care
> about "what does pts/ptmx point to". We care about "which superblock
> do we get when we look up the "pts/" subdirectory in the dentry cache
> for this user (without permissions)"/
Actually it is not pointless. There is a second issue in all of this.
Right now it is possible to confuse the pt_chown setuid root binary
about which instance of devpts it should be calling chmod on. I am not
certain it even ensures it is calling chown on a devpts entry. It is
just hard coded paths today.
Right now userspace does something like:
masterfd = posix_openpt(O_RDWR);
grantpt(masterfd);
char *slave_name = ptsname(masterfd);
slavefd = open(slave_name);
Furthemore invoked by grantpt execs the pt_chown binary which does:
int pty_number;
ioctl(masterfd, TIOCGPTN, &pty_number)
sprintf(slave_name, "/dev/pty/%u", pty_number);
chown(slave_name, some_uid, some_gid);
It would be very nice if we could have a way to close the races
in this mess and allow a program like pt_chown to actually only affect
the pty it cares about. There is only one way I know to implement this
in a backwards compatible way and that is to have
readlink("/proc/self/fd/${masterfd}"), stat("/proc/self/fd/${masterfd}",
and open("/proc/self/fd/${masterfd}") talk about "/dev/pts/ptmx"
for a non-system instance of devpts
That will at least allow ptsname to find the proper instance of devpts.
I suppose it becomes hameless if grantpt stops calling a setuid root
exectuable if the connection between the master and the slave pty
gets confused and pt_chown, does not exist anymore. But it feels
wrong to allow userspace no way to ask the question which mounted
instance of devpts does this masterfd belong to. Especially
as readlink("/proc/self/fd/${masterfd}") is the natural way to ask that.
If anyone has a better idea on how userspace should connect the master
pty file descriptor the slave file descriptor, I would be willing to
implement that instead.
> So get rid of all the pathname games. Just save the superblock pointer
> in file->f_private or somewhere like that, and make it really clear
> that what we are doing is making "/dev/ptmx" work sanely! The user is
> not looking up "/dev/pts/ptmx". They are looking up "/dev/ptmx".
>
> See the difference?
The change that introduces devpts_add_ref devpts_del_ref takes care of
all of the needed reference counting of the super block level.
Apparently the interactions with /dev/tty require the tty's to have a
reference to the superblock.
The natural place to store the superblock in tty->driver_data. I even
took a look at that before posting my patches. Unfortunately there
is at least devpts_pty_kill that actually need the slave inode. So for
the slave side there does not appear to be a location where we can just
use the superblock. Furthermore most of the methods between the master
side and the slave side are shared so having tty->driver_data be an
inode pointer in one place and super_block pointer in another is tricky
to implement.
Given that it was all crazy weird and goofy and I was have a very hard
time tracing it, I figured the better part of valour was not attempting
to change code I was having trouble tracing.
That is the only reason why I do not refer to the super_block directly
from the tty layer. I actually have patches that get as far as killing
pts_sb_from_inode.
Or in summary the decisions you question most I have made not for
implementation reasons but for user space api reasons. The permission
check because I don't want to break existing userspace, and the
change of file path to allow the natural way to ask the question which
devpts does this master file descriptor belong to.
I am open to better ideas.
Eric
p.s. In the long term I think we should just update glibc and the
handful of ther libraries that care to prefer to use /dev/pts/ptmx if it
is available over /dev/ptmx and then make /dev/ptmx and all of the hacks
for supporting it a configuration option which in a decade or so can be
turned off by default.
Al Viro <[email protected]> writes:
> On Tue, Apr 05, 2016 at 03:54:25AM +0100, Al Viro wrote:
>
>> That, I take it, is a lookup for .. and buggering off if it fails *or* if
>> we had been in caller's root or something that overmount it? Not that the
>> latter had been possible - root is a directory and can be overmounted only
>> by another such, and we are called from ->open() of a device node.
>>
>> > + /* Remember the result of this permission check for later */
>> > + ret = inode_permission(path.dentry->d_inode, MAY_EXEC);
>> > + if (path_pts(&path))
>> > + goto fail;
>>
>> Egads, man - you've just introduced a special function for looking up
>> something named "pts" in a given directory!
>>
>> The reason not to use kern_path() would be what, the fact that it doesn't
>> allow starting at given location? So let's make a variant that would - and
>> rather than bothering with RCU, just go for something like (completely
>> untested)
>
> Ah... Right, that would demand exec permissions on the starting point.
> Still, this is incredibly ugly ;-/ I'll try to come up with something
> more tolerable, but this "path_pts" thing is too ugly to live.
> Seriously.
Given that I can think of no other reason than this special case to ever
want to use this code. I figured having something incredibily special
case and obviously so was the way to go. Then at least no one would
mistake it for a general purpose facility.
Eric
On Fri, Apr 8, 2016 at 11:51 AM, Eric W. Biederman
<[email protected]> wrote:
>
> Given that concern under the rule we don't break userspace we have to
> check the permissions of /dev/pts/ptmx when we are creating a new pty,
> on a instance of devpts that was created with newinstance.
The rule is that we don't break existing installations.
If somebody has root and installs a "ptmx" node in an existing mount
space next to a pts subdirectory, that's not a security issue, nor is
it going to break any existing installation.
The whole point of the patch is that yes, we change semantics. A
change of semantics means that people will see situations where the
behavior is different. But that's not "breaking user space", that's
just "ok, you can see a difference".
Linus
Linus Torvalds <[email protected]> writes:
> On Fri, Apr 8, 2016 at 11:51 AM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> Given that concern under the rule we don't break userspace we have to
>> check the permissions of /dev/pts/ptmx when we are creating a new pty,
>> on a instance of devpts that was created with newinstance.
>
> The rule is that we don't break existing installations.
>
> If somebody has root and installs a "ptmx" node in an existing mount
> space next to a pts subdirectory, that's not a security issue, nor is
> it going to break any existing installation.
Anyone can do that with "mount --bind". All it takes is root in a user
namespace. I can get root in a user namespace as no one special.
So someone may have set such a thing up, and it may now be possible
to defeat such a regime as anyone.
In practice I suspect all such cases are handled by actually hiding the
mount of devpts in another mount namespace.
> The whole point of the patch is that yes, we change semantics. A
> change of semantics means that people will see situations where the
> behavior is different. But that's not "breaking user space", that's
> just "ok, you can see a difference".
If we don't want to care about this case, and if someone complains about
a security regression readd my permission checks I am fine with that.
But I don't want to let a possibility of breaking someone (that I don't
know how to test for, and would be silent breakage) slip through.
Eric
On Apr 8, 2016 12:05 PM, "Linus Torvalds" <[email protected]> wrote:
>
> On Fri, Apr 8, 2016 at 11:51 AM, Eric W. Biederman
> <[email protected]> wrote:
> >
> > Given that concern under the rule we don't break userspace we have to
> > check the permissions of /dev/pts/ptmx when we are creating a new pty,
> > on a instance of devpts that was created with newinstance.
>
> The rule is that we don't break existing installations.
>
> If somebody has root and installs a "ptmx" node in an existing mount
> space next to a pts subdirectory, that's not a security issue, nor is
> it going to break any existing installation.
What Eric's saying is that you don't have to be root for this.
But Eric, I think there might be a better mitigation. For a ptmx
chardev, just fail the open if the chardev's vfsmount or the devpts's
vfsmount doesn't belong to the same userns as the devpts's superblock.
After all, setting this attack up requires the caps on one of the
vfsmounts, and if you have those caps you could attack your own devpts
instance quite easily. Would that work?
>
> The whole point of the patch is that yes, we change semantics. A
> change of semantics means that people will see situations where the
> behavior is different. But that's not "breaking user space", that's
> just "ok, you can see a difference".
>
> Linus
Andy Lutomirski <[email protected]> writes:
> On Apr 8, 2016 12:05 PM, "Linus Torvalds" <[email protected]> wrote:
>>
>> On Fri, Apr 8, 2016 at 11:51 AM, Eric W. Biederman
>> <[email protected]> wrote:
>> >
>> > Given that concern under the rule we don't break userspace we have to
>> > check the permissions of /dev/pts/ptmx when we are creating a new pty,
>> > on a instance of devpts that was created with newinstance.
>>
>> The rule is that we don't break existing installations.
>>
>> If somebody has root and installs a "ptmx" node in an existing mount
>> space next to a pts subdirectory, that's not a security issue, nor is
>> it going to break any existing installation.
>
> What Eric's saying is that you don't have to be root for this.
>
> But Eric, I think there might be a better mitigation. For a ptmx
> chardev, just fail the open if the chardev's vfsmount or the devpts's
> vfsmount doesn't belong to the same userns as the devpts's superblock.
> After all, setting this attack up requires the caps on one of the
> vfsmounts, and if you have those caps you could attack your own devpts
> instance quite easily. Would that work?
I don't think so. For one it depends on getting s_user_ns which should
happen but is not there yet. For another the way you describe
it you would break the case of
unshare(CLONE_NEWUSER);
unshare(CLONE_NEWNS);
open("/dev/ptmx");
Which is actually more likely to break userspace than anything else we
have considered. I know people actually do that.
Also using any property from a mount namespace or a vfs mount is usually
an error, as it is an inconsistent model.
Plus I don't think what you are suggesting would make anything simpler
or easier to reason about. It only costs me about 3 lines of code to
perform the permission checks. The complaint is that they exist at all.
Eric
On Fri, Apr 8, 2016 at 2:29 PM, Eric W. Biederman <[email protected]> wrote:
>
> I don't think so. For one it depends on getting s_user_ns which should
> happen but is not there yet. For another the way you describe
> it you would break the case of
>
> unshare(CLONE_NEWUSER);
> unshare(CLONE_NEWNS);
> open("/dev/ptmx");
>
> Which is actually more likely to break userspace than anything else we
> have considered. I know people actually do that.
.. but you could just check that the ptmx node is actually the same
superblock that the pts directory is mounted on. If it's a bind mount,
that wouldn't be true.
But more fundamentally I still don't actually understand why you even
really care.
We get the wrong pts case *today*. We'd get a different wrong pts
namespace when somebody tries to do odd things. Why would we care? It
would be a _better_ guess.
I don't see the security issue. If you do tricks to get pty's in
another group, what's the problem? You have to do it consciously, and
I don't see what the downside is. You get what you ask for, and I
don't see a new attack surface.
The whole "somebody used chmod on /dev/pts/" argument sounds bogus.
That's an insane thing to do. If you want a private namespace, you
make *all* of /dev private, you don't go "oh, I'll just make the pts
subdirectory private".
In other words, your whole scenario sounds totally made up to begin
with. And even if it happens, I don't see what would be so disastrous
about it.
I mean, right now, /dev/ptmx is world read-write in the root container
and everybody gets access to the same underlying set of ptys. And
that's not some horrible security issue. It's how things are
*supposed* to work.
So I really don't see the argument. You guys are just making shit up.
Linus
On Fri, Apr 8, 2016 at 2:29 PM, Eric W. Biederman <[email protected]> wrote:
> Andy Lutomirski <[email protected]> writes:
>
>> On Apr 8, 2016 12:05 PM, "Linus Torvalds" <[email protected]> wrote:
>>>
>>> On Fri, Apr 8, 2016 at 11:51 AM, Eric W. Biederman
>>> <[email protected]> wrote:
>>> >
>>> > Given that concern under the rule we don't break userspace we have to
>>> > check the permissions of /dev/pts/ptmx when we are creating a new pty,
>>> > on a instance of devpts that was created with newinstance.
>>>
>>> The rule is that we don't break existing installations.
>>>
>>> If somebody has root and installs a "ptmx" node in an existing mount
>>> space next to a pts subdirectory, that's not a security issue, nor is
>>> it going to break any existing installation.
>>
>> What Eric's saying is that you don't have to be root for this.
>>
>> But Eric, I think there might be a better mitigation. For a ptmx
>> chardev, just fail the open if the chardev's vfsmount or the devpts's
>> vfsmount doesn't belong to the same userns as the devpts's superblock.
>> After all, setting this attack up requires the caps on one of the
>> vfsmounts, and if you have those caps you could attack your own devpts
>> instance quite easily. Would that work?
>
> I don't think so. For one it depends on getting s_user_ns which should
> happen but is not there yet. For another the way you describe
> it you would break the case of
>
> unshare(CLONE_NEWUSER);
> unshare(CLONE_NEWNS);
> open("/dev/ptmx");
>
> Which is actually more likely to break userspace than anything else we
> have considered. I know people actually do that.
>
Hmm, you're right. Never mind.
--Andy
Linus Torvalds <[email protected]> writes:
> But more fundamentally I still don't actually understand why you even
> really care.
At this point I care because there is a failure of communication.
Until this email no one has ever said: "Ok that actually could happen
but we don't actually care."
Right now I am a bit paranoid because I have seen a few too many cases
where some little detail was glossed over and someone clever turned it
into a great big CVE they could drive a truck through. So I am once
bitten twice shy and all of that.
> We get the wrong pts case *today*. We'd get a different wrong pts
> namespace when somebody tries to do odd things. Why would we care? It
> would be a _better_ guess.
>
> I don't see the security issue. If you do tricks to get pty's in
> another group, what's the problem? You have to do it consciously, and
> I don't see what the downside is. You get what you ask for, and I
> don't see a new attack surface.
>
> The whole "somebody used chmod on /dev/pts/" argument sounds bogus.
> That's an insane thing to do. If you want a private namespace, you
> make *all* of /dev private, you don't go "oh, I'll just make the pts
> subdirectory private".
Oh I pretty much agree it is an insane thing to do. At the same time I
know that people can make a lot of little sane decisions that can lead
to an insane situation, so just because it is insane I can't rule
it out automatically.
The actual sane thing to do, and what I think most of userspace does
at this point is to create it's own mount namespace so nothing is
visible to outsiders.
> In other words, your whole scenario sounds totally made up to begin
> with. And even if it happens, I don't see what would be so disastrous
> about it.
In general I agree. The scenario is made up. I would be surprised if
it happens.
> I mean, right now, /dev/ptmx is world read-write in the root container
> and everybody gets access to the same underlying set of ptys. And
> that's not some horrible security issue. It's how things are
> *supposed* to work.
I agree.
> So I really don't see the argument. You guys are just making shit up.
I don't see why we have the linux extension of supporting anything
except mode 0666 on /dev/ptmx or /dev/pts/ptmx. This is really about
not breaking that linux extension by overlooking some little detail.
On the attack analysis front the worst thing I can see happening is a
denial of service attack. I see two possible denial of service attacks.
One possible attack creates a pty and prevents devpts from being
unmounted. Another possible attack creates all possible ptys on a
devpts instance, and prevents legitimate tty creations from happening.
At the end of the day as you say it would be a pretty crazy person who
isolated a mount of devpts with just the permissions of /dev/pts/ptmx.
So if we don't want to care knowing those stupid attacks above are
possible I am happy not to care. They don't look all that serious to
me.
Eric
> If anyone has a better idea on how userspace should connect the master
> pty file descriptor the slave file descriptor, I would be willing to
> implement that instead.
If we are willing to go away from the existing mess of a tty interface
inflicted on us by BSD and then mashed up by POSIX then a syscall of
int err = ptypair(int fd[2], int perms, int flags);
[where flags is the O_ ones we usually need to cover (CLOEXEC etc) and
maybe even some kind of "private" flag to say don't even expose it via
devpts).
would do remarkably sane things to the majoirty of use cases as it breaks
the dependence on grantpt and also the historic screwup that pty pairs
aren't allocated atomically with both file handles returned as pipe()
does.
Alan
On April 9, 2016 6:09:09 AM PDT, One Thousand Gnomes <[email protected]> wrote:
>
>> If anyone has a better idea on how userspace should connect the
>master
>> pty file descriptor the slave file descriptor, I would be willing to
>> implement that instead.
>
>If we are willing to go away from the existing mess of a tty interface
>inflicted on us by BSD and then mashed up by POSIX then a syscall of
>
> int err = ptypair(int fd[2], int perms, int flags);
>
>[where flags is the O_ ones we usually need to cover (CLOEXEC etc) and
>maybe even some kind of "private" flag to say don't even expose it via
>devpts).
>
>would do remarkably sane things to the majoirty of use cases as it
>breaks
>the dependence on grantpt and also the historic screwup that pty pairs
>aren't allocated atomically with both file handles returned as pipe()
>does.
>
>Alan
We don't even need to do that if we'd be willing to change the user space interface... if we could rely on the POSIX interface then posix_openpt() could simply open /dev/pts/ptmx and everything would just work.
The trick here is how to make it work in the presence of some extremely bad practices in existing userspace.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
"H. Peter Anvin" <[email protected]> writes:
> On April 9, 2016 6:09:09 AM PDT, One Thousand Gnomes <[email protected]> wrote:
>>
>>> If anyone has a better idea on how userspace should connect the
>>master
>>> pty file descriptor the slave file descriptor, I would be willing to
>>> implement that instead.
>>
>>If we are willing to go away from the existing mess of a tty interface
>>inflicted on us by BSD and then mashed up by POSIX then a syscall of
>>
>> int err = ptypair(int fd[2], int perms, int flags);
>>
>>[where flags is the O_ ones we usually need to cover (CLOEXEC etc) and
>>maybe even some kind of "private" flag to say don't even expose it via
>>devpts).
>>
>>would do remarkably sane things to the majoirty of use cases as it
>>breaks
>>the dependence on grantpt and also the historic screwup that pty pairs
>>aren't allocated atomically with both file handles returned as pipe()
>>does.
>>
>>Alan
>
> We don't even need to do that if we'd be willing to change the user
> space interface... if we could rely on the POSIX interface then
> posix_openpt() could simply open /dev/pts/ptmx and everything would
> just work.
At a quick skim it does look like userspace uses posix_openpt for the
most part. Certainly portable apps that can run on FreeBSD do.
And just grepping through binaries all of the ones I have checked so far
are calling posix_openpt.
Peter if you or someone could start updating the userspace version of
posix_openpt to use /dev/pts/ptmx when available over /dev/ptmx in
parallel to the kernel work to always allow mount of devpts to give
distinct instances that would be great.
> The trick here is how to make it work in the presence of some
> extremely bad practices in existing userspace.
Yeah. I am going to look and see if I can move this controversial bit
to a separate patch so we can discuss it more conviniently.
Eric
On April 9, 2016 7:45:46 AM PDT, [email protected] wrote:
>"H. Peter Anvin" <[email protected]> writes:
>
>> On April 9, 2016 6:09:09 AM PDT, One Thousand Gnomes
><[email protected]> wrote:
>>>
>>>> If anyone has a better idea on how userspace should connect the
>>>master
>>>> pty file descriptor the slave file descriptor, I would be willing
>to
>>>> implement that instead.
>>>
>>>If we are willing to go away from the existing mess of a tty
>interface
>>>inflicted on us by BSD and then mashed up by POSIX then a syscall of
>>>
>>> int err = ptypair(int fd[2], int perms, int flags);
>>>
>>>[where flags is the O_ ones we usually need to cover (CLOEXEC etc)
>and
>>>maybe even some kind of "private" flag to say don't even expose it
>via
>>>devpts).
>>>
>>>would do remarkably sane things to the majoirty of use cases as it
>>>breaks
>>>the dependence on grantpt and also the historic screwup that pty
>pairs
>>>aren't allocated atomically with both file handles returned as pipe()
>>>does.
>>>
>>>Alan
>>
>> We don't even need to do that if we'd be willing to change the user
>> space interface... if we could rely on the POSIX interface then
>> posix_openpt() could simply open /dev/pts/ptmx and everything would
>> just work.
>
>At a quick skim it does look like userspace uses posix_openpt for the
>most part. Certainly portable apps that can run on FreeBSD do.
>And just grepping through binaries all of the ones I have checked so
>far
>are calling posix_openpt.
>
>Peter if you or someone could start updating the userspace version of
>posix_openpt to use /dev/pts/ptmx when available over /dev/ptmx in
>parallel to the kernel work to always allow mount of devpts to give
>distinct instances that would be great.
>
>> The trick here is how to make it work in the presence of some
>> extremely bad practices in existing userspace.
>
>Yeah. I am going to look and see if I can move this controversial bit
>to a separate patch so we can discuss it more conviniently.
>
>Eric
On the flipside, if we were to allow ourselves to break userspace, at this point I would suggest making /dev/pts/ptmx have a different device number and make the legacy /dev/ptmx print a warning message, after which it can at least eventually be deleted.
This might not be a bad idea anyway.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On Sat, Apr 9, 2016 at 3:37 PM, H. Peter Anvin <[email protected]> wrote:
>
> On the flipside, if we were to allow ourselves to break userspace, at this point I would suggest making /dev/pts/ptmx have a different device number and make the legacy /dev/ptmx print a warning message, after which it can at least eventually be deleted.
You don't need a different device number.
The /dev/pts/ptmx file may look like it's the same node as /dev/ptmx,
but it is trivial to recognize as the pts one:
if (dentry->d_sb->s_magic == DEVPTS_SUPER_MAGIC)
and you're done.
But nobody actually uses /dev/pts/ptmx, because it has never had sane
permissions.
So the fact is, /dev/ptmx is what people use, and we're not breaking userspace.
But when we fix bad semantics (and always just looking up the initial
pts mount really is crazy semantics) that doesn't mean that we have to
bend over backwards to not make the changed semantics visible. We
don't _break_ user space, but we also don't care about some random
test-program that checks for particular semantics.
And I can pretty much _guarantee_ that nobody has ever done the "let's
bind-mount a 'ptmx' node in a /dev directory, and then expect that to
bind to some _other_ pts thing than the one in /dev/pts/".
Except as a test-program, or possibly as a "why the f*ck doesn't this
work? Oh, I need to use the single-instance thing because the
multi-instance pts thing is broken. Damn shitty implementation".
Linus
On April 9, 2016 5:01:27 PM PDT, Linus Torvalds <[email protected]> wrote:
>On Sat, Apr 9, 2016 at 3:37 PM, H. Peter Anvin <[email protected]> wrote:
>>
>> On the flipside, if we were to allow ourselves to break userspace, at
>this point I would suggest making /dev/pts/ptmx have a different device
>number and make the legacy /dev/ptmx print a warning message, after
>which it can at least eventually be deleted.
>
>You don't need a different device number.
>
>The /dev/pts/ptmx file may look like it's the same node as /dev/ptmx,
>but it is trivial to recognize as the pts one:
>
> if (dentry->d_sb->s_magic == DEVPTS_SUPER_MAGIC)
>
>and you're done.
>
>But nobody actually uses /dev/pts/ptmx, because it has never had sane
>permissions.
>
>So the fact is, /dev/ptmx is what people use, and we're not breaking
>userspace.
>
>But when we fix bad semantics (and always just looking up the initial
>pts mount really is crazy semantics) that doesn't mean that we have to
>bend over backwards to not make the changed semantics visible. We
>don't _break_ user space, but we also don't care about some random
>test-program that checks for particular semantics.
>
>And I can pretty much _guarantee_ that nobody has ever done the "let's
>bind-mount a 'ptmx' node in a /dev directory, and then expect that to
>bind to some _other_ pts thing than the one in /dev/pts/".
>
>Except as a test-program, or possibly as a "why the f*ck doesn't this
>work? Oh, I need to use the single-instance thing because the
>multi-instance pts thing is broken. Damn shitty implementation".
>
> Linus
Fixing the default permissions is trivial, of course. The intent from the beginning was to make a ptmx -> pts/ptmx, but user space never did...
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On Sat, Apr 9, 2016 at 5:06 PM, H. Peter Anvin <[email protected]> wrote:
>
> Fixing the default permissions is trivial, of course. The intent from the beginning was to make a ptmx -> pts/ptmx, but user space never did...
That wasn't my point.
Because the permissions have never been usable, I pretty much
guarantee that no current user space uses /dev/pts/ptmx.
So that node is almost entirely irrelevant. Us fixing the permissions
at this point isn't going to make it any more relevant, we might as
well ignore it.
Which all means that the way forward really is to just make /dev/ptmx
work. It's not going away, and it _is_ fairly easy to fix.
But I don't think the fix should care about permissions - and we might
as well leave the existing pts/ptmx node with broken permissions.
Because we've never been actually interested in looking up
/dev/pts/ptmx - all we actually care about is to look up which devpts
instance it is.
And that's not about the ptmx node, that's really about the
mount-point. So the right thing to do - conceptually - is *literally*
to just say "ok, what is mounted at 'pts'". Note how at no point do we
want to _open_ anything.
That's why I said that conceptually we could just open /proc/mounts.
Because *that* is really the operation we care about. We don't care
about lookup, and we don't care about permissions on the ptmx node.
Those are completely and utterly irrelevant to what we're actually
after.
So I think the permission thing is not just extra code with more
failure points. I think it's conceptually entirely the wrong thing to
do, and just confuses people into thinking that we're doing something
that we aren't.
Linus
On Sat, Apr 9, 2016 at 5:16 PM, Linus Torvalds
<[email protected]> wrote:
> On Sat, Apr 9, 2016 at 5:06 PM, H. Peter Anvin <[email protected]> wrote:
>>
>> Fixing the default permissions is trivial, of course. The intent from the beginning was to make a ptmx -> pts/ptmx, but user space never did...
>
> That wasn't my point.
>
> Because the permissions have never been usable, I pretty much
> guarantee that no current user space uses /dev/pts/ptmx.
>
> So that node is almost entirely irrelevant. Us fixing the permissions
> at this point isn't going to make it any more relevant, we might as
> well ignore it.
>
> Which all means that the way forward really is to just make /dev/ptmx
> work. It's not going away, and it _is_ fairly easy to fix.
>
> But I don't think the fix should care about permissions - and we might
> as well leave the existing pts/ptmx node with broken permissions.
> Because we've never been actually interested in looking up
> /dev/pts/ptmx - all we actually care about is to look up which devpts
> instance it is.
>
> And that's not about the ptmx node, that's really about the
> mount-point. So the right thing to do - conceptually - is *literally*
> to just say "ok, what is mounted at 'pts'". Note how at no point do we
> want to _open_ anything.
>
> That's why I said that conceptually we could just open /proc/mounts.
> Because *that* is really the operation we care about. We don't care
> about lookup, and we don't care about permissions on the ptmx node.
> Those are completely and utterly irrelevant to what we're actually
> after.
>
> So I think the permission thing is not just extra code with more
> failure points. I think it's conceptually entirely the wrong thing to
> do, and just confuses people into thinking that we're doing something
> that we aren't.
What we *do* want to do, though, is to prevent the following:
Root (or a container manager or whatever) does:
mknod /foo/ptmx c 5 2
chmod 600 /foo/ptmx
chmod 666 /dev/ptmx
mount -t devpts -o newinstance none /foo/pts
Evil user does:
$ unshare -urm
# mount --bind /dev /mnt/foo
# mount --bind /foo/pts /mnt/foo/pts
# open /mnt/foo/ptmx
The issue is that the evil user has the ability to open /mnt/foo/ptmx
(because it's 666), and the relative path 'pts' points to /foo/pts,
which the evil user should *not* be able to access. IOW, with a naive
implementation, we can match up the ptmx node with the wrong devpts
instance because the evil user unshared their mount namespace and
screwed around.
I don't immediately see how to fix this without playing permission games.
--Andy
On April 9, 2016 6:27:36 PM PDT, Linus Torvalds <[email protected]> wrote:
>On Apr 9, 2016 5:45 PM, "Andy Lutomirski" <[email protected]> wrote:
>>
>>
>> What we *do* want to do, though, is to prevent the following:
>
>I don't see the point. Why do you bring up this insane scenario that
>nobody
>can possibly care about?
>
>So you actually have any reason to believe somebody does that?
>
>I already asked about that earlier, and the silence was deafening.
>
> Linus
Here is an entire different approach, I don't know if it is sane or not: when *mounting* the devpts filesystem, it could automagically create the bins mount for ptmx in the parent of its mount point. Presumably the would be a mount option to disable that behavior.
Does anyone see an obvious problem with that?
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On Sat, Apr 9, 2016 at 6:27 PM, Linus Torvalds
<[email protected]> wrote:
>
> On Apr 9, 2016 5:45 PM, "Andy Lutomirski" <[email protected]> wrote:
>>
>>
>> What we *do* want to do, though, is to prevent the following:
>
> I don't see the point. Why do you bring up this insane scenario that nobody
> can possibly care about?
>
> So you actually have any reason to believe somebody does that?
>
> I already asked about that earlier, and the silence was deafening.
I have no idea, but I'm generally uncomfortable with magical things
that bypass normal security policy.
That being said, here's an idea for fixing this, at least in the long
run. Add a new devpts mount option "no_ptmx_redirect" that turns off
this behavior for the super in question. That is, opening /dev/ptmx
if "pts/ptmx" points to something with no_ptmx_redirect set will fail.
Distros shipping new kernels could be encouraged to (finally!) make
/dev/ptmx a symlink and set this option.
We just might be able to get away with spelling that option "newinstance".
Linus Torvalds <[email protected]> writes:
> On Sat, Apr 9, 2016 at 3:37 PM, H. Peter Anvin <[email protected]> wrote:
>>
>> On the flipside, if we were to allow ourselves to break userspace, at this point I would suggest making /dev/pts/ptmx have a different device number and make the legacy /dev/ptmx print a warning message, after which it can at least eventually be deleted.
>
> You don't need a different device number.
>
> The /dev/pts/ptmx file may look like it's the same node as /dev/ptmx,
> but it is trivial to recognize as the pts one:
>
> if (dentry->d_sb->s_magic == DEVPTS_SUPER_MAGIC)
>
> and you're done.
We can actually do better and set f_ops in devpts and bypass the whole
lookup by device number. In my pile of cleanups that are waiting for
the mess to resolve I actually have a patch that does that for the slave
ttys.
>
> But nobody actually uses /dev/pts/ptmx, because it has never had sane
> permissions.
Now that is an interesting misconception to see. There is actually a
lot more software that uses /dev/pts/ptmx (with a symlink from /dev/ptmx
or a bind mount to /dev/ptmx) than there is that actually needs the new
compatibility behavior.
Carefully written and maintained container software like lxc and docker
do use "-o newinstance". Fixing the permissions and redirecting the
/dev/ptmx path to /dev/pts/ptmx are not a problem when you know you are
setting up a special environment.
It is the one off sloppily created automation scripts like
xen-create-image that gets this wrong. I say sloppily created as in
practice every mount of devpts needs "-o gid=5,mode=620". If
xen-create-image and related software had gotting those mount options
correct pt_chown could have been done away with, with no one noticing a
long time ago.
Those sloppy pieces of code are probably the things we want to
break least as they work after their fasion and whoever wrote them
is likely long gone. So I would be surprised if there was anyone to
pick up the pieces if we break them.
> But when we fix bad semantics (and always just looking up the initial
> pts mount really is crazy semantics) that doesn't mean that we have to
> bend over backwards to not make the changed semantics visible. We
> don't _break_ user space, but we also don't care about some random
> test-program that checks for particular semantics.
No. Bending over backwards as you call it just makes the software test
matrix smaller.
That part is now settled in my book and those extra permission checks
will not be in the next version of this patchset.
Eric
Andy Lutomirski <[email protected]> writes:
> On Sat, Apr 9, 2016 at 6:27 PM, Linus Torvalds
> <[email protected]> wrote:
>>
>> On Apr 9, 2016 5:45 PM, "Andy Lutomirski" <[email protected]> wrote:
>>>
>>>
>>> What we *do* want to do, though, is to prevent the following:
>>
>> I don't see the point. Why do you bring up this insane scenario that nobody
>> can possibly care about?
>>
>> So you actually have any reason to believe somebody does that?
>>
>> I already asked about that earlier, and the silence was deafening.
>
> I have no idea, but I'm generally uncomfortable with magical things
> that bypass normal security policy.
>
> That being said, here's an idea for fixing this, at least in the long
> run. Add a new devpts mount option "no_ptmx_redirect" that turns off
> this behavior for the super in question. That is, opening /dev/ptmx
> if "pts/ptmx" points to something with no_ptmx_redirect set will fail.
> Distros shipping new kernels could be encouraged to (finally!) make
> /dev/ptmx a symlink and set this option.
>
> We just might be able to get away with spelling that option "newinstance".
Interesting point. Very interesting point. At this point I don't know
that it is worth it, but that would trivially prevent any non-sense,
that might possibly happen. The downside would be that the semantics
of /dev/ptmx would be more complicated.
Eric
On April 11, 2016 1:12:22 PM PDT, Andy Lutomirski <[email protected]> wrote:
>On Sat, Apr 9, 2016 at 6:27 PM, Linus Torvalds
><[email protected]> wrote:
>>
>> On Apr 9, 2016 5:45 PM, "Andy Lutomirski" <[email protected]>
>wrote:
>>>
>>>
>>> What we *do* want to do, though, is to prevent the following:
>>
>> I don't see the point. Why do you bring up this insane scenario that
>nobody
>> can possibly care about?
>>
>> So you actually have any reason to believe somebody does that?
>>
>> I already asked about that earlier, and the silence was deafening.
>
>I have no idea, but I'm generally uncomfortable with magical things
>that bypass normal security policy.
>
>That being said, here's an idea for fixing this, at least in the long
>run. Add a new devpts mount option "no_ptmx_redirect" that turns off
>this behavior for the super in question. That is, opening /dev/ptmx
>if "pts/ptmx" points to something with no_ptmx_redirect set will fail.
>Distros shipping new kernels could be encouraged to (finally!) make
>/dev/ptmx a symlink and set this option.
>
>We just might be able to get away with spelling that option
>"newinstance".
What about the idea of making the bind mount automatic?
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
"H. Peter Anvin" <[email protected]> writes:
> On April 11, 2016 1:12:22 PM PDT, Andy Lutomirski <[email protected]> wrote:
>>On Sat, Apr 9, 2016 at 6:27 PM, Linus Torvalds
>><[email protected]> wrote:
>>>
>>> On Apr 9, 2016 5:45 PM, "Andy Lutomirski" <[email protected]>
>>wrote:
>>>>
>>>>
>>>> What we *do* want to do, though, is to prevent the following:
>>>
>>> I don't see the point. Why do you bring up this insane scenario that
>>nobody
>>> can possibly care about?
>>>
>>> So you actually have any reason to believe somebody does that?
>>>
>>> I already asked about that earlier, and the silence was deafening.
>>
>>I have no idea, but I'm generally uncomfortable with magical things
>>that bypass normal security policy.
>>
>>That being said, here's an idea for fixing this, at least in the long
>>run. Add a new devpts mount option "no_ptmx_redirect" that turns off
>>this behavior for the super in question. That is, opening /dev/ptmx
>>if "pts/ptmx" points to something with no_ptmx_redirect set will fail.
>>Distros shipping new kernels could be encouraged to (finally!) make
>>/dev/ptmx a symlink and set this option.
>>
>>We just might be able to get away with spelling that option
>>"newinstance".
>
> What about the idea of making the bind mount automatic?
We could almost do that cleanly by playing with the /dev/ptmx dentry
and implementing a d_automount method. That still needs the crazy path
based lookup without permission checks.
Unfortunately the filesystem not the device owns the dentry operations.
My practical concern if we worked through the implementation details
would be how would it interact with people who bind mount /dev/pts/ptmx
on top of /dev/ptmx. We might get some strange new errors.
Eric
Linus Torvalds <[email protected]> writes:
> On Apr 9, 2016 5:45 PM, "Andy Lutomirski" <[email protected]> wrote:
>>
>>
>> What we *do* want to do, though, is to prevent the following:
>
> I don't see the point. Why do you bring up this insane scenario that
> nobody can possibly care about?
>
> So you actually have any reason to believe somebody does that?
>
> I already asked about that earlier, and the silence was deafening.
I replied earlier. Did you not see my reply?
Eric
On Mon, Apr 11, 2016 at 4:37 PM, Eric W. Biederman
<[email protected]> wrote:
>
> My practical concern if we worked through the implementation details
> would be how would it interact with people who bind mount /dev/pts/ptmx
> on top of /dev/ptmx. We might get some strange new errors.
Yes, please don't let's play "clever" games. The semantics should be
fairly straightforward.
I still don't understand why people think that you shouldn't be able
to access a 'pts' subsystem that is accessible to others. If you can
bind-mount the pts directory somewhere, then you can damn well already
see that pts mount, claiming that somehow it should be sacred ground
and you shouldn't be able to access it with a ptmx node outside of it
is just insane.
So people have been bringing that up as an issue, but nobody has ever
actually been able to articulate why anybody should ever care.
Now people are just making up random odd semantics. Nobody has ever
explained why the _simple_ "ptmx binds to the pts directory next to
it" is actually problem. Even for a bind mount, you have to be able to
open the point you're mounting, so we know that the "attacker" already
had access to the pts subdirectory.
If somebody wants to keep the pts mount private, they should damn well
keep it _private_. I don't understand peoples "oh, you can access it
but you can't access it".excuses.
Linus
On Mon, Apr 11, 2016 at 4:49 PM, Eric W. Biederman
<[email protected]> wrote:
>
> I replied earlier. Did you not see my reply?
Are you talking about the one where you agreed that the scenario was
made up and insane? The one where you said that you're worried about
breaking out "extension" where ptmx is non-0666?
That was never an extension. It was a simple situation of people (a)
not knowing what the tty group should be in the kernel and (b) then
thinking that using a permission model of "no permission" somehow made
it saner.
What it actually resulted in was that most distros just ignore it
entirely, and just use /dev/ptmx.
Yes, you *can* then chmod it in user space and use a symlink, but so
what? Nobody who actually uses that node uses anythinig but 0666.
Because that would break pretty much everything that uses pty's.
So the whole "we need to worry about permission 0000" is complete and
uttter garbage. We really don't. The situation doesn't come up, and
it's not relevant. The standard part to access ptmx is /dev/ptmx, and
no amount of wishing it were otherwise will make it any different.
Seriously. Just look at the opengroup documentation. It talks about
/dev/ptmx. The whole /dev/pts/ptmx thing was a mistake. WE SHOULD NOT
EXTEND ON THAT MISTAKE.
We should just FIX the mistake. Ignore /dev/pts/ptmx, because that
node is non-standard SHIT.
Really. Really really.
Linus
Linus Torvalds <[email protected]> writes:
> On Mon, Apr 11, 2016 at 4:37 PM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> My practical concern if we worked through the implementation details
>> would be how would it interact with people who bind mount /dev/pts/ptmx
>> on top of /dev/ptmx. We might get some strange new errors.
>
> Yes, please don't let's play "clever" games. The semantics should be
> fairly straightforward.
Actually for me this is about keeping the semantics simpler, and coming
up with a higher performance implementation.
A dentry that does an automount is already well defined.
Making the rule that accessing /dev/ptmx causes an automount of
/dev/pts/ptmx on top of the device node at /dev/ptmx is really simple,
with no special games. It also makes it more obvious to userspace what
is going on. AKA allows userspace to know which superblock does an open
ptmx master tty belongs to (and it happens in a backwards and forwards
compatible way).
My only concern is with this very minor change in semantics will
anything care. I need to implement and test to find out.
I think I see an implementation that Al won't grumble too loudly about.
Anyway I am going to try this and see what I can see.
Eric
Linus Torvalds <[email protected]> writes:
> On Mon, Apr 11, 2016 at 4:49 PM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> I replied earlier. Did you not see my reply?
>
>
> Are you talking about the one where you agreed that the scenario was
> made up and insane? The one where you said that you're worried about
> breaking out "extension" where ptmx is non-0666?
I meant the one where I conceded that the only think that it could
possible protect against was a denial of service attack, from which we
probably don't care.
I just want to be certain that the emails are getting through. As the
meaning certainly has not been.
I do think I called a permision check in posix_open aka on /dev/ptmx
a linux specific extension in that email. But seriously it was all
about reducing the scope of the change. Reducing the size of the test
matrix. I simply had not looked far enough to see if there was anything
you could reasonable protect with those permissions.
As I agreed with you that it was unnecessary I was just puzzled why you
called what was essentially agreement with you deafening silence.
Eric
On Mon, Apr 11, 2016 at 5:22 PM, Eric W. Biederman
<[email protected]> wrote:
>
> I meant the one where I conceded that the only think that it could
> possible protect against was a denial of service attack, from which we
> probably don't care.
Yeah, that's the same email I was talking about, I was just quoting
another part.
> As I agreed with you that it was unnecessary I was just puzzled why you
> called what was essentially agreement with you deafening silence.
The "deafening silence" was about _why_ this all would be a problem,
and why the security checks would be needed.
Basically, I think that if /dev/pts/ is accessible, we should just say
"ok, you can open a pty on it".
The fact that you could open a pty by bind-mounting it somewhere else,
and then adding a "ptmx" node to the same directory is not a security
issue: it's simply how devpts works.
In no actual sane and relevant situation is that a problem, for the
simple fact that there will *already* be a ptmx node that is
world-accessible in the same directory that has that /dev/pts/ mount.
Anything else is insane and irrelevant. This is *literally* what POSIX
says. Sure, POSIX also has that whole language about "posix_openpt()",
but that's just BS and irrelevant. The very page that mentions
"posix_openpt()" also says
"On implementations supporting the /dev/ptmx clone device, opening
the master device of a pseudo-terminal is simply:
mfdp = open("/dev/ptmx", oflag );
if (mfdp < 0)
return -1;"
and Linux unquestionably falls in that "supports /dev/ptmx" camp.
So I claim that the _only_ sane use of devpts is to already have a
world-accessible ptmx node there, and nothing else makes sense.
And if you want to be private, you had better make the whole /dev/
subdirectory private (which also takes care of any bind mount issues)
Linus
On April 11, 2016 5:10:47 PM PDT, [email protected] wrote:
>Linus Torvalds <[email protected]> writes:
>
>> On Mon, Apr 11, 2016 at 4:37 PM, Eric W. Biederman
>> <[email protected]> wrote:
>>>
>>> My practical concern if we worked through the implementation details
>>> would be how would it interact with people who bind mount
>/dev/pts/ptmx
>>> on top of /dev/ptmx. We might get some strange new errors.
>>
>> Yes, please don't let's play "clever" games. The semantics should be
>> fairly straightforward.
>
>Actually for me this is about keeping the semantics simpler, and coming
>up with a higher performance implementation.
>
>A dentry that does an automount is already well defined.
>
>Making the rule that accessing /dev/ptmx causes an automount of
>/dev/pts/ptmx on top of the device node at /dev/ptmx is really simple,
>with no special games. It also makes it more obvious to userspace what
>is going on. AKA allows userspace to know which superblock does an
>open
>ptmx master tty belongs to (and it happens in a backwards and forwards
>compatible way).
>
>My only concern is with this very minor change in semantics will
>anything care. I need to implement and test to find out.
>
>I think I see an implementation that Al won't grumble too loudly about.
>
>Anyway I am going to try this and see what I can see.
>
>Eric
Why bother with an automount? You can look up ../ptmx from the devpts get_super method and just do the bind mount once. No fuss, no muss. What's wrong with that?
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On Mon, Apr 11, 2016 at 6:06 PM, H. Peter Anvin <[email protected]> wrote:
>
> Why bother with an automount? You can look up ../ptmx from the devpts get_super method and just do the bind mount once. No fuss, no muss. What's wrong with that?
Ehh. What if somebody wants to mount the same devpts in multiple
places? So now you need to do the bind mount every time devpts is
bindmounted?
None of this makes sense.
Let's just take Eric's patch, and strip out the permission check, and
strip out the code that fakes a new path for it.
That gets rid of 90% of devpts_path_ptmx(): all that remains is pretty
much the "are we already in devpts" and the call to "path_pts()"
thing.
No update_file_path(), no inode_permissions, no fsi->ptmx_dentry
games. Just get a reference to the "pts_fs_info", and it's all done.
(Getting a ref on the pts_fs_info might require us to have a ref to
the superblock, I didn't check that part. But rather than updating the
file path, just save it off in the file data).
Linus
On Mon, Apr 11, 2016 at 07:48:28AM -0700, H. Peter Anvin wrote:
> Here is an entire different approach, I don't know if it is sane or not: when *mounting* the devpts filesystem, it could automagically create the bins mount for ptmx in the parent of its mount point. Presumably the would be a mount option to disable that behavior.
>
> Does anyone see an obvious problem with that?
Yes. ->mount() doesn't (and fucking *shouldn't*) know anything about the
mountpoint to be. Not to mention that the same superblock can easily end
up being visible in many places, etc. This is insane.
"H. Peter Anvin" <[email protected]> writes:
> On April 11, 2016 5:10:47 PM PDT, [email protected] wrote:
>>Linus Torvalds <[email protected]> writes:
>>
>>> On Mon, Apr 11, 2016 at 4:37 PM, Eric W. Biederman
>>> <[email protected]> wrote:
>>>>
>>>> My practical concern if we worked through the implementation details
>>>> would be how would it interact with people who bind mount
>>/dev/pts/ptmx
>>>> on top of /dev/ptmx. We might get some strange new errors.
>>>
>>> Yes, please don't let's play "clever" games. The semantics should be
>>> fairly straightforward.
>>
>>Actually for me this is about keeping the semantics simpler, and coming
>>up with a higher performance implementation.
>>
>>A dentry that does an automount is already well defined.
>>
>>Making the rule that accessing /dev/ptmx causes an automount of
>>/dev/pts/ptmx on top of the device node at /dev/ptmx is really simple,
>>with no special games. It also makes it more obvious to userspace what
>>is going on. AKA allows userspace to know which superblock does an
>>open
>>ptmx master tty belongs to (and it happens in a backwards and forwards
>>compatible way).
>>
>>My only concern is with this very minor change in semantics will
>>anything care. I need to implement and test to find out.
>>
>>I think I see an implementation that Al won't grumble too loudly about.
>>
>>Anyway I am going to try this and see what I can see.
>>
>>Eric
>
> Why bother with an automount? You can look up ../ptmx from the devpts
> get_super method and just do the bind mount once. No fuss, no muss.
> What's wrong with that?
Perhaps I am reading the code wrong but as I read it that information is
simply not available in get_super.
Eric
On Mon, Apr 11, 2016 at 07:10:47PM -0500, Eric W. Biederman wrote:
> Actually for me this is about keeping the semantics simpler, and coming
> up with a higher performance implementation.
>
> A dentry that does an automount is already well defined.
>
> Making the rule that accessing /dev/ptmx causes an automount of
> /dev/pts/ptmx on top of the device node at /dev/ptmx is really simple,
> with no special games. It also makes it more obvious to userspace what
> is going on. AKA allows userspace to know which superblock does an open
> ptmx master tty belongs to (and it happens in a backwards and forwards
> compatible way).
_What_ dentry? Which filesystem would that be done to? Whatever we have
on /dev? Or we suddenly get the fucking dentry operations change when
dentry is attached to magical cdev inode?
On Mon, Apr 11, 2016 at 08:23:00PM -0500, Eric W. Biederman wrote:
> Perhaps I am reading the code wrong but as I read it that information is
> simply not available in get_super.
Correct. For a very good reason - the same superblock can bloody well end up
in many places; having it tied to _the_ mountpoint is just plain wrong.
You know how it would look done right?
a) mount --after support
b) devpts containing both /ptmx and /pts/1,...
c) mount --type devpts --after none /dev
That's it. And this would be way, way more useful that overlay-style unions;
it would *NOT* recurse into subdirectories. Just a search list done right...
Not an option, unfortunately, since it obviously breaks userland setups -
even if we go ahead and implement that kind of non-recursive unions. But
if we had a chance to design it from scratch, that would've been an interesting
option.
Al Viro <[email protected]> writes:
> On Mon, Apr 11, 2016 at 07:10:47PM -0500, Eric W. Biederman wrote:
>> Actually for me this is about keeping the semantics simpler, and coming
>> up with a higher performance implementation.
>>
>> A dentry that does an automount is already well defined.
>>
>> Making the rule that accessing /dev/ptmx causes an automount of
>> /dev/pts/ptmx on top of the device node at /dev/ptmx is really simple,
>> with no special games. It also makes it more obvious to userspace what
>> is going on. AKA allows userspace to know which superblock does an open
>> ptmx master tty belongs to (and it happens in a backwards and forwards
>> compatible way).
>
> _What_ dentry? Which filesystem would that be done to? Whatever we have
> on /dev? Or we suddenly get the fucking dentry operations change when
> dentry is attached to magical cdev inode?
Which dentry? Any dentry that corresponds to the /dev/ptmx inode.
No filesystem changes just magic in init_special_inode that I have
not completely figured out yet.
If we can get an automount method in follow_automount from somewhere
cdev specific then a cdev can perform an automount comparitively
cleanly. file_operations is attractive I am have not yet figured out a
clean method for passing the automount method yet.
For my proof of concept I am hardcoding things based on i_rdev. Ugly
but servicable for testing out the idea.
A snip of my proof of concept code that seems to be working:
diff --git a/fs/inode.c b/fs/inode.c
index 69b8b526c194..d3de77b01a84 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
+#include <linux/devpts_fs.h>
#include <trace/events/writeback.h>
#include "internal.h"
@@ -1917,6 +1918,11 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
if (S_ISCHR(mode)) {
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
+#if CONFIG_UNIX98_PTYS
+ if (rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)) {
+ inode->i_flags |= S_AUTOMOUNT;
+ }
+#endif
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
diff --git a/fs/namei.c b/fs/namei.c
index afb5137ca199..8894cf5fb43e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -35,6 +35,8 @@
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
+#include <linux/devpts_fs.h>
+#include <linux/major.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -1087,10 +1089,19 @@ EXPORT_SYMBOL(follow_up);
static int follow_automount(struct path *path, struct nameidata *nd,
bool *need_mntput)
{
+ struct vfsmount *(*automount)(struct path *) = NULL;
struct vfsmount *mnt;
int err;
- if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
+ if (path->dentry->d_op)
+ automount = path->dentry->d_op->d_automount;
+#if CONFIG_UNIX98_PTYS
+ if (path->dentry->d_inode &&
+ path->dentry->d_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)) {
+ automount = ptmx_automount;
+ }
+#endif
+ if (!automount)
return -EREMOTE;
/* We don't want to mount if someone's just doing a stat -
@@ -1113,7 +1124,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
if (nd->total_link_count >= 40)
return -ELOOP;
- mnt = path->dentry->d_op->d_automount(path);
+ mnt = automount(path);
if (IS_ERR(mnt)) {
/*
* The filesystem is allowed to return -EISDIR here to indicate
On Mon, Apr 11, 2016 at 1:12 PM, Andy Lutomirski <[email protected]> wrote:
> On Sat, Apr 9, 2016 at 6:27 PM, Linus Torvalds
> <[email protected]> wrote:
>>
>> On Apr 9, 2016 5:45 PM, "Andy Lutomirski" <[email protected]> wrote:
>>>
>>>
>>> What we *do* want to do, though, is to prevent the following:
>>
>> I don't see the point. Why do you bring up this insane scenario that nobody
>> can possibly care about?
>>
>> So you actually have any reason to believe somebody does that?
>>
>> I already asked about that earlier, and the silence was deafening.
>
> I have no idea, but I'm generally uncomfortable with magical things
> that bypass normal security policy.
>
> That being said, here's an idea for fixing this, at least in the long
> run. Add a new devpts mount option "no_ptmx_redirect" that turns off
> this behavior for the super in question. That is, opening /dev/ptmx
> if "pts/ptmx" points to something with no_ptmx_redirect set will fail.
> Distros shipping new kernels could be encouraged to (finally!) make
> /dev/ptmx a symlink and set this option.
>
> We just might be able to get away with spelling that option "newinstance".
Linus, you said that people who want to protect their pts should deny
execute. So I set it up:
# ls -l
total 0
crw-------. 1 root root 5, 2 Apr 12 10:38 ptmx
drwx------. 2 root root 0 Apr 2 11:35 pts
$ unshare -urm
# ls -l
total 0
crw-------. 1 nfsnobody nfsnobody 5, 2 Apr 12 10:38 ptmx
drwx------. 2 nfsnobody nfsnobody 0 Apr 2 11:35 pts
# mount --bind /dev/ptmx ptmx
# ls -l
total 0
crw-rw-rw-. 1 nfsnobody nfsnobody 5, 2 Apr 12 10:42 ptmx
drwx------. 2 nfsnobody nfsnobody 0 Apr 2 11:35 pts
And there goes your protection. So the whole /dev directory would
have to deny execute to protect against this.
But I think that gating this on mount options might be fine. If
devpts is mounted with newinstance, then /dev/ptmx *already doesn't
work for it*, right? So can we just say that the magic ptmx ->
pts/ptmx redirect doesn't work if the pts filesystem in question is
mounted with newinstance?
--Andy
On Tue, Apr 12, 2016 at 10:44 AM, Andy Lutomirski <[email protected]> wrote:
> Linus, you said that people who want to protect their pts should deny
> execute. So I set it up:
>
> # ls -l
> total 0
> crw-------. 1 root root 5, 2 Apr 12 10:38 ptmx
> drwx------. 2 root root 0 Apr 2 11:35 pts
No you didn't. You're root, and you still have access to /dev/ptmx.
> And there goes your protection. So the whole /dev directory would
> have to deny execute to protect against this.
Exactly. That's what I'm saying. If you want your ptmx to be private,
you need to make your /dev private.
Now, you can avoid the other attack that was talked about (which
involved bind-mounting the pts/ directory somewhere else) by making
just the pts/ directory non-execute, because afaik bind mount requires
the ability to do the lookup.
> But I think that gating this on mount options might be fine. If
> devpts is mounted with newinstance, then /dev/ptmx *already doesn't
> work for it*, right? So can we just say that the magic ptmx ->
> pts/ptmx redirect doesn't work if the pts filesystem in question is
> mounted with newinstance?
No, the problem that started this whole discussion is that
(a) newinstance should go the f*ck away, because this whole duality is broken.
(b) people wanted single instances and we couldn't even enable
default kernel support for DEVPTS_MULTIPLE_INSTANCES, because multiple
instances just don't work with /dev/ptmx.
So what I want to happen is to "just make /dev/ptmx work". Get rid of
the broken "single instance" crap. The only reason it exists is
exactly because /dev/ptmx does not work.
I think the current situation is completely and utterly broken. We
should never have done what we did. I want to *fix* the kernel, not
add random new magic crap.
And I think we _can_ fix the kernel. Not add new mount options that
people already don't use (because they are broken for the normal
situation).
Linus
On 04/12/16 11:12, Linus Torvalds wrote:
>
> So what I want to happen is to "just make /dev/ptmx work". Get rid of
> the broken "single instance" crap. The only reason it exists is
> exactly because /dev/ptmx does not work.
>
> I think the current situation is completely and utterly broken. We
> should never have done what we did. I want to *fix* the kernel, not
> add random new magic crap.
>
Agreed. As far as I'm concerned, there seem to be two realistic
variants, talking semantically as opposed to implementation-wise:
1. Change the default mode of /dev/pts/ptmx to default to 0666, and make
/dev/ptmx have the effective semantics of the symlink which userspace
and userdev/devramfs should have provided all along.
2. Make /dev/ptmx simply look up the pts superblock from its path and
then act like /dev/pts/ptmx. In that case we can probably remove the
ptmx device node unless the ptmxmode mount option is given (in which
case user space probably enabled the symlink.)
-hpa
To recap the situation for those who have not been following closely.
There are programs such as xen-create-image that run as root and setup
a chroot environment with:
"mknod dev/ptmx c 5 2"
"mkdir dev/pts"
"mount -t devpts none dev/pts"
Which mostly works but stomps the mount options of the system /dev/pts.
In particular the options of "gid=5,mode=620" are lost resulting in a
situation where creating a new pty by opening /dev/ptmx results in
that pty having the wrong permissions.
Some distributions have been working around this problem by continuing
to install a setuid root pt_chown binary that will be called by glibc
to fix the permissions.
Maintaining a setuid root pt_chown binary is not too scary until
multiple instances of devpts are considered at which point it becomes
possible to trick the setuid root pt_chown binary into operating on the
wrong files and directories. Leading to all of the things one might
fear when a setuid root binary goes wrong.
The following patchset digs us out of that hole by carefully devpts and
/dev/ptmx in a way that does not introduce any userspace regressions,
while making each mount of devpts distinct (so pt_chown is unnecessary)
and arranging things so that enough information is available so
that a secure pt_chown binary is possible to write if that is ever
needed.
The approach I have chosen to take is to first enhance the /dev/ptmx
device node to automount /dev/pts/ptmx on top of it. This leads to a
simple high performance solution that allows applications such as
xen-create-image (that call "mknod ptmx c 5 2" and mount devpts)
to continue to run as before even when they are given a non-system
instance of devpts.
Using automountic bind mounts of /dev/pts/ptmx results in no new
security cases to consider as this can already be done, and actually
results in a simplification of the analysis of the code. As now all
opens of ptmx are of /dev/pts/ptmx. /dev/ptmx is now just a magic
mountpoint that does the right thing.
Allowing each mount of devpts to be distinct is also a bit interesting
as there is a concept in the code of the primary system devpts instance.
/dev/ptmx automounts the primary system instance of devpts if can not
find an appropriate devpts instance by path lookup. The sysctl
sys.kernel.pty.max is a global maximum of the number of ptys in the
system with sys.kernel.pty.reserve the number of those ptys reserved
exclusively for the system instance of devpts. Overmounting the system
instance of devpts with itself is expected to fail but update the devpts
mount options anyway.
In my testing I have found pieces of code that depend or at least appear
to depend on all of these propeties.
The particular challenge in all of this have been distro's that mount
devpts in initial ram disks, and then mount devpts again during regular
system startup. It took a little bit of careful arranging to ensure
that it is the system instance of devpts that always winds up on
/dev/pts for all distros. CentOS5 and CentOS6 were particularly
challenging examples.
To look for surprising userspace behavior I have attempted to test this
patchset on a representative sample of linux distributions. The
distributions I managed to setup and test in vms are: on openwrt-15.05,
centos5, centos6, centos7, debian-6.0.2, debian-7.9, debian-8.2,
ubuntu-14.04.3, ubuntu-15.10, fedora23, magia-5, mint-17.3,
opensuse-42.1, slackware-14.1, gentoo-20151225 (13.0?),
archlinux-2015-12-01.
I wanted to test Android (as it is one of the most unique linux
distributions) but I could not find a freely available image that was
easy to get going in a VM, so I audited the Android code instead.
Android has a daemon that is responsible for everything under /dev
that listens on for netlink device events, consultis it's policy data
base and if the Android policy allows creates the device node in a
tmpfs instance mounted on /dev with the attributes specified by policy.
Furthermore at system startup this daemon mounts devpts exactly once,
which thankfully presents no interesting challenges.
I have also run xen-create-image on debian 8.2 (where it was easily
installed with apt-get) and confirmed that without these changes it
stomps the mount options of devpts and with these changes it only uses
atypical mount options on a separate instance of devpts.
The current technique of automounting /dev/pts/ptmx onto /dev/ptmx
results in the best userspace semantics and the easiest to understand
and maintain kernel code that I have seen implemented or heard proposed
in this discussion, as semantically and in the implementation each piece
is tasked with doing one thing.
Eric W. Biederman (16):
devpts: Use the same default mode for both /dev/ptmx and dev/pts/ptmx
devpts: Set the proper fops for /dev/pts/ptmx
vfs: Implement vfs_loopback_mount
devpts: Teach /dev/ptmx to automount the appropriate devpts via path lookup
vfs: Allow unlink, and rename on expirable file mounts
devpts: More obvious check for the system devpts in pty allocation
devpts: Cleanup newinstance parsing
devpts: Stop rolling devpts_remount by hand in devpts_mount
devpts: Fail early (if appropriate) on overmount
devpts: Move parse_mount_options into fill_super
devpts: Make devpts_kill_sb safe if fsi is NULL
devpts: Move the creation of /dev/pts/ptmx into fill_super
devpts: Simplify devpts_mount by using mount_nodev
vfs: Implement mount_super_once
devpts: Always return a distinct instance when mounting
devpts: Kill the DEVPTS_MULTIPLE_INSTANCE config option
Documentation/filesystems/devpts.txt | 122 +++++++-----------
drivers/tty/Kconfig | 11 --
drivers/tty/pty.c | 2 +-
drivers/tty/tty_io.c | 5 +-
fs/devpts/inode.c | 234 +++++++++++++++++++----------------
fs/inode.c | 3 +
fs/namei.c | 83 +++++++++++--
fs/namespace.c | 25 +++-
fs/super.c | 34 +++++
include/linux/devpts_fs.h | 18 +++
include/linux/fs.h | 3 +
include/linux/mount.h | 1 +
include/linux/namei.h | 2 +
13 files changed, 330 insertions(+), 213 deletions(-)
This code is also available at:
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git devpts-for-testing
Eric
The default mode is 0666 on /dev/ptmx so we really don't gain anything
except a bunch of hassle by not having the default mode on
/dev/pts/ptmx also being 0666.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
drivers/tty/tty_io.c | 5 +++--
fs/devpts/inode.c | 8 --------
include/linux/devpts_fs.h | 3 +++
3 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 8d26ed79bb4c..438b2209ea41 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3597,9 +3597,10 @@ static char *tty_devnode(struct device *dev, umode_t *mode)
{
if (!mode)
return NULL;
- if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) ||
- dev->devt == MKDEV(TTYAUX_MAJOR, 2))
+ if (dev->devt == MKDEV(TTYAUX_MAJOR, 0))
*mode = 0666;
+ if (dev->devt == MKDEV(TTYAUX_MAJOR, PTMX_MINOR))
+ *mode = DEVPTS_DEFAULT_PTMX_MODE;
return NULL;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 655f21f99160..e8bf61cd1bf8 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -29,14 +29,6 @@
#include <linux/seq_file.h>
#define DEVPTS_DEFAULT_MODE 0600
-/*
- * ptmx is a new node in /dev/pts and will be unused in legacy (single-
- * instance) mode. To prevent surprises in user space, set permissions of
- * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
- * permissions.
- */
-#define DEVPTS_DEFAULT_PTMX_MODE 0000
-#define PTMX_MINOR 2
/*
* sysctl support for setting limits on the number of Unix98 ptys allocated.
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index e0ee0b3000b2..db329cd3d14e 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -15,6 +15,9 @@
#include <linux/errno.h>
+#define DEVPTS_DEFAULT_PTMX_MODE 0666
+#define PTMX_MINOR 2
+
#ifdef CONFIG_UNIX98_PTYS
int devpts_new_index(struct inode *ptmx_inode);
--
2.8.1
Simplify things and accelerate the open a bit by setting ptmx_fops
directly on the /dev/pts/ptmx inode.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
drivers/tty/pty.c | 2 +-
fs/devpts/inode.c | 6 +++---
include/linux/devpts_fs.h | 2 ++
3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index e16a49b507ef..c71d5b6ddbb8 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -803,7 +803,7 @@ err_file:
return retval;
}
-static struct file_operations ptmx_fops;
+struct file_operations ptmx_fops;
static void __init unix98_pty_init(void)
{
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index e8bf61cd1bf8..4fc6c49b0efd 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -232,7 +232,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
static int mknod_ptmx(struct super_block *sb)
{
- int mode;
int rc = -ENOMEM;
struct dentry *dentry;
struct inode *inode;
@@ -274,8 +273,9 @@ static int mknod_ptmx(struct super_block *sb)
inode->i_ino = 2;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- mode = S_IFCHR|opts->ptmxmode;
- init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
+ inode->i_mode = S_IFCHR|opts->ptmxmode;
+ inode->i_fop = &ptmx_fops;
+ inode->i_rdev = MKDEV(TTYAUX_MAJOR, PTMX_MINOR);
inode->i_uid = root_uid;
inode->i_gid = root_gid;
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index db329cd3d14e..ff2b7c274435 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -20,6 +20,8 @@
#ifdef CONFIG_UNIX98_PTYS
+extern struct file_operations ptmx_fops;
+
int devpts_new_index(struct inode *ptmx_inode);
void devpts_kill_index(struct inode *ptmx_inode, int idx);
void devpts_add_ref(struct inode *ptmx_inode);
--
2.8.1
Update the vfs with a may_overmount superblock operation, allowing
devpts to fail early if the primary mount of devpts is going to be
mounted on top of itself.
This change is in preparation for each mount of devpts being distinct
from every other mount of devpts. To maintain a backward compatible
notion of a primary mount of devpts we need overmounts of the mount to
fail (as they do now), which requires a little vfs support so the case
can be detected.
Cause failed over mounts of devpts to go through the devpts remount
path. This already happens as overmounts have previously been detected
late, and it looks like CentOS 6 may actually depend on this behavior
to allow changing devpts mount options by placing them in /etc/fstab.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 15 +++++++++++++++
fs/namespace.c | 8 ++++++++
include/linux/fs.h | 1 +
3 files changed, 24 insertions(+)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1266abd3251e..051a2657be36 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -339,10 +339,25 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+bool devpts_may_overmount(struct super_block *sb,
+ int flags, const char *dev_name, void *data)
+{
+ if ((sb == devpts_mnt->mnt_sb) &&
+ (current_user_ns() == &init_user_ns) &&
+ !parse_newinstance(data)) {
+ down_write(&sb->s_umount);
+ devpts_remount(sb, &flags, data);
+ up_write(&sb->s_umount);
+ return false;
+ }
+ return true;
+}
+
static const struct super_operations devpts_sops = {
.statfs = simple_statfs,
.remount_fs = devpts_remount,
.show_options = devpts_show_options,
+ .may_overmount = devpts_may_overmount,
};
static void *new_pts_fs_info(void)
diff --git a/fs/namespace.c b/fs/namespace.c
index 278ecb7b8e8e..01ad3207c3a2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2401,6 +2401,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
{
struct file_system_type *type;
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+ struct super_block *path_sb;
struct vfsmount *mnt;
int err;
@@ -2429,6 +2430,13 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
}
}
+ path_sb = path->mnt->mnt_sb;
+ if ((path_sb->s_type == type) &&
+ (path->mnt->mnt_root == path->dentry) &&
+ path_sb->s_op->may_overmount &&
+ !path_sb->s_op->may_overmount(path_sb, flags, name, data))
+ return -EBUSY;
+
mnt = vfs_kern_mount(type, flags, name, data);
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 14a97194b34b..aade033bed49 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1755,6 +1755,7 @@ struct super_operations {
struct shrink_control *);
long (*free_cached_objects)(struct super_block *,
struct shrink_control *);
+ bool (*may_overmount)(struct super_block *, int, const char *, void *);
};
/*
--
2.8.1
Add a helper to turn a struct path into a mount.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/namespace.c | 9 +++++++++
include/linux/mount.h | 1 +
2 files changed, 10 insertions(+)
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fb1691b4355..714778360c2b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1047,6 +1047,15 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
return ERR_PTR(err);
}
+struct vfsmount *vfs_loopback_mount(struct path *path)
+{
+ struct mount *p;
+ p = clone_mnt(real_mount(path->mnt), path->dentry, 0);
+ if (IS_ERR(p))
+ return ERR_CAST(p);
+ return &p->mnt;
+}
+
static void cleanup_mnt(struct mount *mnt)
{
/*
diff --git a/include/linux/mount.h b/include/linux/mount.h
index f822c3c11377..8f6039ad898e 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -89,6 +89,7 @@ struct file_system_type;
extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
int flags, const char *name,
void *data);
+struct vfsmount *vfs_loopback_mount(struct path *path);
extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
extern void mark_mounts_for_expiry(struct list_head *mounts);
--
2.8.1
Just use devpts_remount and by doing so ensuring that ptxmode
actually get propogated to /dev/pts/ptmx on the initial mount
of devpts.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 26 ++++++++++++--------------
1 file changed, 12 insertions(+), 14 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d65be43b6fe9..1266abd3251e 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -249,12 +249,6 @@ static int mknod_ptmx(struct super_block *sb)
inode_lock(d_inode(root));
- /* If we have already created ptmx node, return */
- if (fsi->ptmx_dentry) {
- rc = 0;
- goto out;
- }
-
dentry = d_alloc_name(root, "ptmx");
if (!dentry) {
pr_err("Unable to alloc dentry for ptmx node\n");
@@ -464,16 +458,20 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
if (error)
goto out_undo_sget;
- s->s_flags |= MS_ACTIVE;
- }
- error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
- if (error)
- goto out_undo_sget;
+ error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
+ if (error)
+ goto out_undo_sget;
+
+ error = mknod_ptmx(s);
+ if (error)
+ goto out_undo_sget;
- error = mknod_ptmx(s);
- if (error)
- goto out_undo_sget;
+ s->s_flags |= MS_ACTIVE;
+ } else {
+ /* Match mount_single ignore errors on remount */
+ devpts_remount(s, &flags, data);
+ }
return dget(s->s_root);
--
2.8.1
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 051a2657be36..2f2760142ed2 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -379,6 +379,7 @@ static int
devpts_fill_super(struct super_block *s, void *data, int silent)
{
struct inode *inode;
+ int error;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
@@ -386,10 +387,16 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &devpts_sops;
s->s_time_gran = 1;
+ error = -ENOMEM;
s->s_fs_info = new_pts_fs_info();
if (!s->s_fs_info)
goto fail;
+ error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
+ if (error)
+ goto fail;
+
+ error = -ENOMEM;
inode = new_inode(s);
if (!inode)
goto fail;
@@ -407,7 +414,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
pr_err("get root dentry failed\n");
fail:
- return -ENOMEM;
+ return error;
}
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
@@ -474,10 +481,6 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
if (error)
goto out_undo_sget;
- error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
- if (error)
- goto out_undo_sget;
-
error = mknod_ptmx(s);
if (error)
goto out_undo_sget;
--
2.8.1
Add a dedicated parsing routing for newinstance that does not modify
data, so parsing out newinstance can be separate from the parsing of
the other mount options.
Allways pass as data to parse_mount_options the filesystem specific
portion of the super_block that holds the mount options.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 44 ++++++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 752b2cc6be1c..d65be43b6fe9 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -97,7 +97,6 @@ struct pts_mount_opts {
kgid_t gid;
umode_t mode;
umode_t ptmxmode;
- int newinstance;
int max;
};
@@ -140,19 +139,27 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
return devpts_mnt->mnt_sb;
}
-#define PARSE_MOUNT 0
-#define PARSE_REMOUNT 1
+static bool parse_newinstance(const char *data)
+{
+ while (data) {
+ const char *p = strchr(data, ',');
+ size_t len = p ? p - data : strlen(data);
+ if ((len == 11) && (memcmp(data, "newinstance", 11) == 0)) {
+ return true;
+ }
+ data = p ? p + 1 : NULL;
+ }
+ return false;
+}
/*
* parse_mount_options():
* Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value. The exception is
- * 'newinstance' option which can only be set/cleared on a mount (i.e.
- * cannot be changed during remount).
+ * specified in @data, set it to its default value.
*
* Note: @data may be NULL (in which case all options are set to default).
*/
-static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
+static int parse_mount_options(char *data, struct pts_mount_opts *opts)
{
char *p;
kuid_t uid;
@@ -166,10 +173,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
opts->max = NR_UNIX98_PTY_MAX;
- /* newinstance makes sense only on initial mount */
- if (op == PARSE_MOUNT)
- opts->newinstance = 0;
-
while ((p = strsep(&data, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
int token;
@@ -210,9 +213,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
opts->ptmxmode = option & S_IALLUGO;
break;
case Opt_newinstance:
- /* newinstance makes sense only on initial mount */
- if (op == PARSE_MOUNT)
- opts->newinstance = 1;
break;
case Opt_max:
if (match_int(&args[0], &option) ||
@@ -311,7 +311,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
struct pts_mount_opts *opts = &fsi->mount_opts;
sync_filesystem(sb);
- err = parse_mount_options(data, PARSE_REMOUNT, opts);
+ err = parse_mount_options(data, opts);
/*
* parse_mount_options() restores options to default values
@@ -440,20 +440,18 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
int error;
- struct pts_mount_opts opts;
struct super_block *s;
+ bool newinstance;
- error = parse_mount_options(data, PARSE_MOUNT, &opts);
- if (error)
- return ERR_PTR(error);
+ newinstance = parse_newinstance(data);
/* Require newinstance for all user namespace mounts to ensure
* the mount options are not changed.
*/
- if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
+ if ((current_user_ns() != &init_user_ns) && !newinstance)
return ERR_PTR(-EINVAL);
- if (opts.newinstance)
+ if (newinstance)
s = sget(fs_type, NULL, set_anon_super, flags, NULL);
else
s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
@@ -469,7 +467,9 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
s->s_flags |= MS_ACTIVE;
}
- memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
+ error = parse_mount_options(data, &DEVPTS_SB(s)->mount_opts);
+ if (error)
+ goto out_undo_sget;
error = mknod_ptmx(s);
if (error)
--
2.8.1
This makes the logic of the test clearer, and removes a confusing
use of opts.newinstance, which allows later changes not to worry
if newinstance is set on the primary mount of devpts or not.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 0b84063a1e14..752b2cc6be1c 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -534,7 +534,7 @@ retry:
mutex_lock(&allocated_ptys_lock);
if (pty_count >= pty_limit -
- (fsi->mount_opts.newinstance ? pty_reserve : 0)) {
+ ((devpts_mnt->mnt_sb == sb) ? pty_reserve : 0)) {
mutex_unlock(&allocated_ptys_lock);
return -ENOSPC;
}
--
2.8.1
This is in preparation for forcing each mount of devpts to be a
distinct filesystem. The goal of this change is to cleanly allow
each mount of devpts to be a distince filesystem while not
introducing regressions in userspace.
On each open of /dev/ptmx look at the relative path ../pts and see if
devpts is mounted there.
If a devpts filesystem is found via the path lookup mount it's
ptmx node on /dev/ptmx.
If no devpts filesystem is found via the path lookup mount the
system devpts ptmx node on /dev/ptmx. This retains backwards
compatibility for weird setups.
This winds up using 3 new vfs helpers path_parent, path_pts, and
vfs_loopback_mount.
Additionally init_special_inode and follow_automount are updated
with calls to is_dev_ptmx to add a tiny bit of extra code in
those functions to allow /dev/ptmx to hook into the automount
path.
I endeavored to keep the vfs changes clean, but I did not strive for
generality.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 55 +++++++++++++++++++++++++++++++
fs/inode.c | 3 ++
fs/namei.c | 83 +++++++++++++++++++++++++++++++++++++++--------
include/linux/devpts_fs.h | 13 ++++++++
include/linux/namei.h | 2 ++
5 files changed, 143 insertions(+), 13 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 4fc6c49b0efd..0b84063a1e14 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/namei.h>
+#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/tty.h>
@@ -695,6 +696,60 @@ void devpts_pty_kill(struct inode *inode)
inode_unlock(d_inode(root));
}
+static void ptmx_expire_automounts(struct work_struct *work);
+static LIST_HEAD(ptmx_automounts);
+static DECLARE_DELAYED_WORK(ptmx_automount_work, ptmx_expire_automounts);
+static unsigned long ptmx_automount_timeout = 10 * 60 * HZ;
+
+static void ptmx_expire_automounts(struct work_struct *work)
+{
+ struct list_head *list = &ptmx_automounts;
+
+ mark_mounts_for_expiry(list);
+ if (!list_empty(list))
+ schedule_delayed_work(&ptmx_automount_work,
+ ptmx_automount_timeout);
+}
+
+struct vfsmount *ptmx_automount(struct path *input_path)
+{
+ struct vfsmount *newmnt;
+ struct path path;
+ struct dentry *old;
+
+ /* Can the pts filesystem be found with a path walk? */
+ path = *input_path;
+ path_get(&path);
+
+ if ((path_pts(&path) != 0) ||
+ /* Is the path the root of a devpts filesystem? */
+ (path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) ||
+ (path.mnt->mnt_root != path.mnt->mnt_sb->s_root)) {
+ /* No devpts filesystem found use the system devpts */
+ path_put(&path);
+ path.mnt = devpts_mnt;
+ path.dentry = DEVPTS_SB(devpts_mnt->mnt_sb)->ptmx_dentry;
+ path_get(&path);
+ }
+ else {
+ /* Advance path to the ptmx dentry */
+ old = path.dentry;
+ path.dentry = dget(DEVPTS_SB(path.mnt->mnt_sb)->ptmx_dentry);
+ dput(old);
+ }
+
+ newmnt = vfs_loopback_mount(&path);
+ if (IS_ERR(newmnt))
+ goto fail;
+
+ mntget(newmnt);
+ mnt_set_expiry(newmnt, &ptmx_automounts);
+ schedule_delayed_work(&ptmx_automount_work, ptmx_automount_timeout);
+fail:
+ path_put(&path);
+ return newmnt;
+}
+
static int __init init_devpts_fs(void)
{
int err = register_filesystem(&devpts_fs_type);
diff --git a/fs/inode.c b/fs/inode.c
index 69b8b526c194..251330ba336e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
+#include <linux/devpts_fs.h>
#include <trace/events/writeback.h>
#include "internal.h"
@@ -1917,6 +1918,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
if (S_ISCHR(mode)) {
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
+ if (is_dev_ptmx(inode))
+ inode->i_flags |= S_AUTOMOUNT;
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
diff --git a/fs/namei.c b/fs/namei.c
index 794f81dce766..a4bdbeec8067 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -35,6 +35,7 @@
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
+#include <linux/devpts_fs.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -1087,10 +1088,15 @@ EXPORT_SYMBOL(follow_up);
static int follow_automount(struct path *path, struct nameidata *nd,
bool *need_mntput)
{
+ struct vfsmount *(*automount)(struct path *) = NULL;
struct vfsmount *mnt;
int err;
- if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
+ if (path->dentry->d_op)
+ automount = path->dentry->d_op->d_automount;
+ if (path->dentry->d_inode && is_dev_ptmx(path->dentry->d_inode))
+ automount = ptmx_automount;
+ if (!automount)
return -EREMOTE;
/* We don't want to mount if someone's just doing a stat -
@@ -1113,7 +1119,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
if (nd->total_link_count >= 40)
return -ELOOP;
- mnt = path->dentry->d_op->d_automount(path);
+ mnt = automount(path);
if (IS_ERR(mnt)) {
/*
* The filesystem is allowed to return -EISDIR here to indicate
@@ -1415,29 +1421,41 @@ static void follow_mount(struct path *path)
}
}
-static int follow_dotdot(struct nameidata *nd)
+static int path_parent(struct path *root, struct path *path)
{
+ int ret = 0;
+
while(1) {
- struct dentry *old = nd->path.dentry;
+ struct dentry *old = path->dentry;
- if (nd->path.dentry == nd->root.dentry &&
- nd->path.mnt == nd->root.mnt) {
+ if (old == root->dentry &&
+ path->mnt == root->mnt) {
break;
}
- if (nd->path.dentry != nd->path.mnt->mnt_root) {
+ if (old != path->mnt->mnt_root) {
/* rare case of legitimate dget_parent()... */
- nd->path.dentry = dget_parent(nd->path.dentry);
+ path->dentry = dget_parent(path->dentry);
dput(old);
- if (unlikely(!path_connected(&nd->path)))
+ if (unlikely(!path_connected(path)))
return -ENOENT;
+ ret = 1;
break;
}
- if (!follow_up(&nd->path))
+ if (!follow_up(path))
break;
}
- follow_mount(&nd->path);
- nd->inode = nd->path.dentry->d_inode;
- return 0;
+ follow_mount(path);
+ return ret;
+}
+
+static int follow_dotdot(struct nameidata *nd)
+{
+ int ret = path_parent(&nd->root, &nd->path);
+ if (ret >= 0) {
+ ret = 0;
+ nd->inode = nd->path.dentry->d_inode;
+ }
+ return ret;
}
/*
@@ -2374,6 +2392,45 @@ struct dentry *lookup_one_len_unlocked(const char *name,
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
+#ifdef CONFIG_UNIX98_PTYS
+int path_pts(struct path *path)
+{
+ /* A pathwalk of "../pts" with no permission checks. */
+ struct dentry *child, *parent = path->dentry;
+ struct qstr this;
+ struct path root;
+ int ret;
+
+ get_fs_root(current->fs, &root);
+ ret = path_parent(&root, path);
+ path_put(&root);
+ if (ret != 1)
+ return -ENOENT;
+
+ if (!d_can_lookup(parent))
+ return -ENOENT;
+
+ this.name = "pts";
+ this.len = 3;
+ this.hash = full_name_hash(this.name, this.len);
+ if (parent->d_flags & DCACHE_OP_HASH) {
+ int err = parent->d_op->d_hash(parent, &this);
+ if (err < 0)
+ return err;
+ }
+ inode_lock(parent->d_inode);
+ child = d_lookup(parent, &this);
+ inode_unlock(parent->d_inode);
+ if (!child)
+ return -ENOENT;
+
+ path->dentry = child;
+ dput(parent);
+ follow_mount(path);
+ return 0;
+}
+#endif
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index ff2b7c274435..5b2f6d6cd386 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -19,9 +19,12 @@
#define PTMX_MINOR 2
#ifdef CONFIG_UNIX98_PTYS
+#include <linux/major.h>
extern struct file_operations ptmx_fops;
+struct vfsmount *ptmx_automount(struct path *path);
+
int devpts_new_index(struct inode *ptmx_inode);
void devpts_kill_index(struct inode *ptmx_inode, int idx);
void devpts_add_ref(struct inode *ptmx_inode);
@@ -34,6 +37,10 @@ void *devpts_get_priv(struct inode *pts_inode);
/* unlink */
void devpts_pty_kill(struct inode *inode);
+static inline bool is_dev_ptmx(struct inode *inode)
+{
+ return inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR);
+}
#else
/* Dummy stubs in the no-pty case */
@@ -52,6 +59,12 @@ static inline void *devpts_get_priv(struct inode *pts_inode)
}
static inline void devpts_pty_kill(struct inode *inode) { }
+#define ptmx_automount NULL
+
+static inline bool is_dev_ptmx(struct inode *inode)
+{
+ return false;
+}
#endif
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 77d01700daf7..f29abda31e6d 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -45,6 +45,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
#define LOOKUP_ROOT 0x2000
#define LOOKUP_EMPTY 0x4000
+extern int path_pts(struct path *path);
+
extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
--
2.8.1
This is motivated by /dev/ptmx now being an automount. Given that
automounts eventually will go away I don't see that there is a point
in making unlink and rename to wait until the automounts actually go
away before allowing an operation.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/namespace.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 714778360c2b..278ecb7b8e8e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -718,8 +718,14 @@ bool __is_local_mountpoint(struct dentry *dentry)
down_read(&namespace_sem);
list_for_each_entry(mnt, &ns->list, mnt_list) {
is_covered = (mnt->mnt_mountpoint == dentry);
- if (is_covered)
+ if (is_covered) {
+ if (!d_is_dir(dentry) &&
+ (mnt->mnt.mnt_flags & MNT_SHRINKABLE)) {
+ is_covered = false;
+ continue;
+ }
break;
+ }
}
up_read(&namespace_sem);
out:
--
2.8.1
The devpts filesystem has a notion of a system or primary instance of
devpts. To retain the notion of a primary system instance of devpts
the code needs a way to allow userspace to mount the internally
mounted instance of devpts when it is not currently mounted by
userspace. The new helper mount_super_once allows that.
The function mount_super_once ignores vfsmounts that are in use but
are not mounted in userspace. The set of ignored mounts includes
internal mounts, lazily unmounted instances of mounts, and
loopback/bind mounts show root directory is not a filesystems root
directory.
Furthermore mount_super_once only allows exporting a mount to
userspace if it can account for every reference in s_active
on the superblock. Ensuring races do not allow two simultaneous
mount requestions export the same filesystem to userspace.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/super.c | 34 ++++++++++++++++++++++++++++++++++
include/linux/fs.h | 2 ++
2 files changed, 36 insertions(+)
diff --git a/fs/super.c b/fs/super.c
index 74914b1bae70..4a6552395fad 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,6 +33,7 @@
#include <linux/cleancache.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
+#include "mount.h"
#include "internal.h"
@@ -1101,6 +1102,39 @@ struct dentry *mount_single(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_single);
+struct dentry *mount_super_once(struct super_block *sb, int flags, void *data)
+{
+ /* Allow mounting the specified superblock by userspace if there
+ * are not any existing userspace mounts of it.
+ */
+ struct mount *mnt;
+ int count = 0;
+
+ /* Walk through the existing mounts of this superblock. Fail
+ * if any of those mounts are exported to userspace. Otherwise
+ * increment s_active if the superblock s_active count matches
+ * the number of mounts in the list. This ensures that multiple
+ * simultaneous calls to mount_super_once do not race and result
+ * in the filesystem exported to userspace multiple times.
+ */
+ lock_mount_hash();
+ list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+ if (is_mounted(&mnt->mnt) && (mnt->mnt.mnt_root == sb->s_root)) {
+ unlock_mount_hash();
+ return ERR_PTR(-EBUSY);
+ }
+ count++;
+ }
+ if (atomic_cmpxchg(&sb->s_active, count, count + 1) != count) {
+ unlock_mount_hash();
+ return ERR_PTR(-EBUSY);
+ }
+ unlock_mount_hash();
+ down_write(&sb->s_umount);
+ do_remount_sb(sb, flags, data, 0);
+ return dget(sb->s_root);
+}
+
struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index aade033bed49..465c1155bf72 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2002,6 +2002,8 @@ extern struct dentry *mount_single(struct file_system_type *fs_type,
extern struct dentry *mount_nodev(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int));
+extern struct dentry *mount_super_once(struct super_block *sb,
+ int flags, void *data);
extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
--
2.8.1
Retain the code that was previously enabled with
DEVPTS_MULTIPLE_INSTANCES and remove the config option, and kill the
little bit of code that existed only when DEVPTS_MULTIPLE_INSTANCES
was not selected. With the recently updated semantics userspace
actively depends on having multiple instances of devpts for correct
operation.
Having each mount of devpts return a distinct instance ensures that
user space will not accidentally stomp gid or mode devpts options, of
the primary system devpts, by mounting devpts in a chroot environment.
A guarantee that userspace will not stomp attributes of system devpts
removes the need for a setuid root pt_chown executable.
Running a userspace without DEVPTS_MULTIPLE_INSTANCES that has depends
on the current behavior and has removed a setuid root pt_chown
exectuable will allow the system devpts gid and mode options to be
stomped breaking userspace. Which makes the ability to disable the
code previously selected by DEVPTS_MULTIPLE_INSTANCES actively wrong.
The size increase by always using the DEVPTS_MULTIPLE_INSTANCE
path is minimal, and the code is much easier to maintain and
use without having two different code paths to worry about.
The documentation has been updated to relfect this change.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
Documentation/filesystems/devpts.txt | 122 ++++++++++++++---------------------
drivers/tty/Kconfig | 11 ----
fs/devpts/inode.c | 34 ----------
3 files changed, 47 insertions(+), 120 deletions(-)
diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt
index 30d2fcb32f72..984b645ac341 100644
--- a/Documentation/filesystems/devpts.txt
+++ b/Documentation/filesystems/devpts.txt
@@ -1,38 +1,32 @@
-To support containers, we now allow multiple instances of devpts filesystem,
-such that indices of ptys allocated in one instance are independent of indices
-allocated in other instances of devpts.
-
-To preserve backward compatibility, this support for multiple instances is
-enabled only if:
-
- - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and
- - '-o newinstance' mount option is specified while mounting devpts
-
-IOW, devpts now supports both single-instance and multi-instance semantics.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and
-this referred to as the "legacy" mode. In this mode, the new mount options
-(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message
-on console.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the
-'newinstance' option (as in current start-up scripts) the new mount binds
-to the initial kernel mount of devpts. This mode is referred to as the
-'single-instance' mode and the current, single-instance semantics are
-preserved, i.e PTYs are common across the system.
-
-The only difference between this single-instance mode and the legacy mode
-is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which
-can safely be ignored.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified,
-the mount is considered to be in the multi-instance mode and a new instance
-of the devpts fs is created. Any ptys created in this instance are independent
-of ptys in other instances of devpts. Like in the single-instance mode, the
-/dev/pts/ptmx node is present. To effectively use the multi-instance mode,
-open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or
-bind-mount.
+Each mount of the devpts filesystem is now a distinct instance from
+all other mounts of devpts. Mount options and indicies of ptys
+allocated in one instance are independent of indices allocated in
+other instances of devpts.
+
+If devpts is mounted without the 'newinstance' option (as in current
+start-up scripts) and if the initial kernel mount of devpts has not
+been exported to userspace the new mount binds to the initial kernel
+mount of devpts.
+
+If devpts is mounted with the 'newinstance' option or the initial
+internal mount of devpts has already been mounted a new instance of
+the devpts filesystem is created. Any ptys created in this instance
+are independent of the ptys created in other instances of devpts, and
+the initial permissions of those ptys are independent from the initial
+permissions from any other instance of the devpts filesystem.
+
+Ideally people will make use of the /dev/pts/ptmx device node to
+create ptys on the devpts filesystem. This can be done by updating
+userspace, bind mounting /dev/pts/ptmx onto /dev/ptmx or making
+/dev/ptmx a symlink to /dev/pts/ptmx.
+
+To be seemlessly backwards compatible an open of /dev/ptmx will look
+to see if the name pts in the same directory is the root directory of
+a devpts filesystem. If that is the case and it is not the initial
+instance of devpts /dev/pts/ptmx is opened. Otherwise the initial
+instance of devpts is opened. In older kernels /dev/ptmx did not
+perform this redirection.
Eg: A container startup script could do the following:
@@ -60,44 +54,34 @@ Per-instance limit could be set by adding mount option "max=<count>".
This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve.
In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit.
-User-space changes
-------------------
+What user-space needs to do
+----------------------------
-In multi-instance mode (i.e '-o newinstance' mount option is specified at least
-once), following user-space issues should be noted.
+1. If the devpts filesystem is only mounted once /dev/pts/ptmx can be
+ ignored and no change is needed to system-startup scripts.
-1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored
- and no change is needed to system-startup scripts.
+2. For best results userspace libraries and applications should be updated
+ to try opening /dev/pts/ptmx before /dev/ptmx, as /dev/pts/ptmx is less
+ ambiguous and higher performance.
-2. To effectively use multi-instance mode (i.e -o newinstance is specified)
- administrators or startup scripts should "redirect" open of /dev/ptmx to
- /dev/pts/ptmx using either a bind mount or symlink.
+3. To effectively use a new instance of devpts open of /dev/ptmx should
+ be redirected to /dev/pts/ptmx using either a bind mount or symlink.
$ mount -t devpts -o newinstance devpts /dev/pts
followed by either
- $ rm /dev/ptmx
$ ln -s pts/ptmx /dev/ptmx
$ chmod 666 /dev/pts/ptmx
or
$ mount -o bind /dev/pts/ptmx /dev/ptmx
-3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it
- enables better error-reporting and treats both single-instance and
- multi-instance mounts similarly.
-
- But this method requires that system-startup scripts set the mode of
- /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the
- mode by, either
+4. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it
+ enables better error-reporting and treats all cases the same.
- - adding ptmxmode mount option to devpts entry in /etc/fstab, or
- - using 'chmod 0666 /dev/pts/ptmx'
-
-4. If multi-instance mode mount is needed for containers, but the system
- startup scripts have not yet been updated, container-startup scripts
- should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single-
- instance mounts.
+5. If the system startup scripts do not create /dev/ptmx as a symlink,
+ container-startup scripts should bind mount /dev/ptmx to /dev/pts/ptmx
+ to avoid breaking the rest of the system.
Or, in general, container-startup scripts should use:
@@ -106,11 +90,9 @@ once), following user-space issues should be noted.
mount -o bind /dev/pts/ptmx /dev/ptmx
fi
- When all devpts mounts are multi-instance, /dev/ptmx can permanently be
- a symlink to pts/ptmx and the bind mount can be ignored.
-
-5. A multi-instance mount that is not accompanied by the /dev/ptmx to
- /dev/pts/ptmx redirection would result in an unusable/unreachable pty.
+6. A mount of devpts that is not accompanied by the /dev/ptmx to
+ /dev/pts/ptmx redirection will result in an unusable/unreachable pty,
+ on older kernels.
mount -t devpts -o newinstance lxcpts /dev/pts
@@ -121,21 +103,11 @@ once), following user-space issues should be noted.
would create a pty, say /dev/pts/7, in the initial kernel mount.
But /dev/pts/7 would be invisible in the new mount.
-6. The permissions for /dev/pts/ptmx node should be specified when mounting
- /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000).
+7. The permissions for /dev/pts/ptmx node should be specified when mounting
+ /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0666 was 0000).
mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts
The permissions can be later be changed as usual with 'chmod'.
chmod 666 /dev/pts/ptmx
-
-7. A mount of devpts without the 'newinstance' option results in binding to
- initial kernel mount. This behavior while preserving legacy semantics,
- does not provide strict isolation in a container environment. i.e by
- mounting devpts without the 'newinstance' option, a container could
- get visibility into the 'host' or root container's devpts.
-
- To workaround this and have strict isolation, all mounts of devpts,
- including the mount in the root container, should use the newinstance
- option.
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index 82c4d2e45319..95103054c0e4 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -120,17 +120,6 @@ config UNIX98_PTYS
All modern Linux systems use the Unix98 ptys. Say Y unless
you're on an embedded system and want to conserve memory.
-config DEVPTS_MULTIPLE_INSTANCES
- bool "Support multiple instances of devpts"
- depends on UNIX98_PTYS
- default n
- ---help---
- Enable support for multiple instances of devpts filesystem.
- If you want to have isolated PTY namespaces (eg: in containers),
- say Y here. Otherwise, say N. If enabled, each mount of devpts
- filesystem with the '-o newinstance' option will create an
- independent PTY namespace.
-
config LEGACY_PTYS
bool "Legacy (BSD) PTY support"
default y
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index e5e533fe824a..9ae15ef06475 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -109,11 +109,9 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_mode, "mode=%o"},
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
{Opt_ptmxmode, "ptmxmode=%o"},
{Opt_newinstance, "newinstance"},
{Opt_max, "max=%d"},
-#endif
{Opt_err, NULL}
};
@@ -130,10 +128,8 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
static inline struct super_block *pts_sb_from_inode(struct inode *inode)
{
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
return inode->i_sb;
-#endif
if (!devpts_mnt)
return NULL;
return devpts_mnt->mnt_sb;
@@ -206,7 +202,6 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
return -EINVAL;
opts->mode = option & S_IALLUGO;
break;
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
case Opt_ptmxmode:
if (match_octal(&args[0], &option))
return -EINVAL;
@@ -220,7 +215,6 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
return -EINVAL;
opts->max = option;
break;
-#endif
default:
pr_err("called with bogus options\n");
return -EINVAL;
@@ -230,7 +224,6 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
return 0;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
static int mknod_ptmx(struct super_block *sb)
{
int rc = -ENOMEM;
@@ -291,16 +284,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
}
}
-#else
-static inline void update_ptmx_mode(struct pts_fs_info *fsi)
-{
- return;
-}
-static inline int mknod_ptmx(struct super_block *sb)
-{
- return 0;
-}
-#endif
static int devpts_remount(struct super_block *sb, int *flags, char *data)
{
@@ -334,11 +317,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, opts->gid));
seq_printf(seq, ",mode=%03o", opts->mode);
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
if (opts->max < NR_UNIX98_PTY_MAX)
seq_printf(seq, ",max=%d", opts->max);
-#endif
return 0;
}
@@ -429,7 +410,6 @@ fail:
return error;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
/*
* devpts_mount()
*
@@ -482,18 +462,6 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
return root;
}
-#else
-/*
- * This supports only the legacy single-instance semantics (no
- * multiple-instance semantics)
- */
-static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
-{
- return mount_single(fs_type, flags, data, devpts_fill_super);
-}
-#endif
-
static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
@@ -508,9 +476,7 @@ static struct file_system_type devpts_fs_type = {
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
.fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
-#endif
};
/*
--
2.8.1
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 2f2760142ed2..f33744debb59 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -514,7 +514,8 @@ static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
- ida_destroy(&fsi->allocated_ptys);
+ if (fsi)
+ ida_destroy(&fsi->allocated_ptys);
kfree(fsi);
kill_litter_super(sb);
}
--
2.8.1
The code makes more sense here and things are just clearer.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 22 +++++++++++++++-------
1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f33744debb59..3aa223803a08 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -296,6 +296,10 @@ static inline void update_ptmx_mode(struct pts_fs_info *fsi)
{
return;
}
+static inline int mknod_ptmx(struct super_block *sb)
+{
+ return 0;
+}
#endif
static int devpts_remount(struct super_block *sb, int *flags, char *data)
@@ -408,11 +412,19 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
set_nlink(inode, 2);
s->s_root = d_make_root(inode);
- if (s->s_root)
- return 0;
+ if (!s->s_root) {
+ pr_err("get root dentry failed\n");
+ goto fail;
+ }
- pr_err("get root dentry failed\n");
+ error = mknod_ptmx(s);
+ if (error)
+ goto fail_dput;
+ return 0;
+fail_dput:
+ dput(s->s_root);
+ s->s_root = NULL;
fail:
return error;
}
@@ -481,10 +493,6 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
if (error)
goto out_undo_sget;
- error = mknod_ptmx(s);
- if (error)
- goto out_undo_sget;
-
s->s_flags |= MS_ACTIVE;
} else {
/* Match mount_single ignore errors on remount */
--
2.8.1
Now that all of the work of setting up a superblock has been moved to
devpts_fill_super simplify devpts_mount by calling mount_nodev instead
of rolling mount_nodev by hand.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 23 ++++-------------------
1 file changed, 4 insertions(+), 19 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 3aa223803a08..c1c346190f35 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -467,7 +467,6 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- int error;
struct super_block *s;
bool newinstance;
@@ -480,30 +479,16 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
return ERR_PTR(-EINVAL);
if (newinstance)
- s = sget(fs_type, NULL, set_anon_super, flags, NULL);
- else
- s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
- NULL);
+ return mount_nodev(fs_type, flags, data, devpts_fill_super);
+ s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
- if (!s->s_root) {
- error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
- if (error)
- goto out_undo_sget;
-
- s->s_flags |= MS_ACTIVE;
- } else {
- /* Match mount_single ignore errors on remount */
- devpts_remount(s, &flags, data);
- }
+ /* Match mount_single ignore errors on remount */
+ devpts_remount(s, &flags, data);
return dget(s->s_root);
-
-out_undo_sget:
- deactivate_locked_super(s);
- return ERR_PTR(error);
}
#else
--
2.8.1
When devpts is mounted and the newinstance flag is not passed the code
first checks to see if the system devpts instance has been exported to
userspace. If it has not the system devpts instance is returned
otherwise a fresh instance of devpts is allocated and returned.
If newinstance is passed a fresh devpts instance is always returned.
Combined with the earlier work to cause mounts of devpts to fail
if devpts is mounted over itself, this ensures that the system devpts
is mounted on /dev/pts on all of the systems I have tested.
This has been verified to work properly on openwrt-15.05, centos5,
centos6, centos7, debian-6.0.2, debian-7.9, debian-8.2, ubuntu-14.04.3,
ubuntu-15.10, fedora23, magia-5, mint-17.3, opensuse-42.1, slackware-14.1,
gentoo-20151225 (13.0?), archlinux-2015-12-01
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
fs/devpts/inode.c | 33 ++++++++++++---------------------
1 file changed, 12 insertions(+), 21 deletions(-)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c1c346190f35..e5e533fe824a 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -430,13 +430,6 @@ fail:
}
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-static int compare_init_pts_sb(struct super_block *s, void *p)
-{
- if (devpts_mnt)
- return devpts_mnt->mnt_sb == s;
- return 0;
-}
-
/*
* devpts_mount()
*
@@ -467,28 +460,26 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- struct super_block *s;
+ struct dentry *root;
bool newinstance;
newinstance = parse_newinstance(data);
+ if (flags & MS_KERNMOUNT)
+ newinstance = true;
- /* Require newinstance for all user namespace mounts to ensure
+ /* Force newinstance for all user namespace mounts to ensure
* the mount options are not changed.
*/
- if ((current_user_ns() != &init_user_ns) && !newinstance)
- return ERR_PTR(-EINVAL);
-
- if (newinstance)
- return mount_nodev(fs_type, flags, data, devpts_fill_super);
-
- s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, NULL);
- if (IS_ERR(s))
- return ERR_CAST(s);
+ if (current_user_ns() != &init_user_ns)
+ newinstance = true;
- /* Match mount_single ignore errors on remount */
- devpts_remount(s, &flags, data);
+ root = NULL;
+ if (!newinstance)
+ root = mount_super_once(devpts_mnt->mnt_sb, flags, data);
+ if (IS_ERR_OR_NULL(root))
+ root = mount_nodev(fs_type, flags, data, devpts_fill_super);
- return dget(s->s_root);
+ return root;
}
#else
--
2.8.1
On Fri, Apr 15, 2016 at 8:34 AM, Eric W. Biederman
<[email protected]> wrote:
>
> To recap the situation for those who have not been following closely.
>
> There are programs such as xen-create-image that run as root and setup
> a chroot environment with:
> "mknod dev/ptmx c 5 2"
> "mkdir dev/pts"
> "mount -t devpts none dev/pts"
>
> Which mostly works but stomps the mount options of the system /dev/pts.
> In particular the options of "gid=5,mode=620" are lost resulting in a
> situation where creating a new pty by opening /dev/ptmx results in
> that pty having the wrong permissions.
>
> Some distributions have been working around this problem by continuing
> to install a setuid root pt_chown binary that will be called by glibc
> to fix the permissions.
>
> Maintaining a setuid root pt_chown binary is not too scary until
> multiple instances of devpts are considered at which point it becomes
> possible to trick the setuid root pt_chown binary into operating on the
> wrong files and directories. Leading to all of the things one might
> fear when a setuid root binary goes wrong.
>
> The following patchset digs us out of that hole by carefully devpts and
> /dev/ptmx in a way that does not introduce any userspace regressions,
> while making each mount of devpts distinct (so pt_chown is unnecessary)
> and arranging things so that enough information is available so
> that a secure pt_chown binary is possible to write if that is ever
> needed.
>
> The approach I have chosen to take is to first enhance the /dev/ptmx
> device node to automount /dev/pts/ptmx on top of it. This leads to a
> simple high performance solution that allows applications such as
> xen-create-image (that call "mknod ptmx c 5 2" and mount devpts)
> to continue to run as before even when they are given a non-system
> instance of devpts.
>
> Using automountic bind mounts of /dev/pts/ptmx results in no new
> security cases to consider as this can already be done, and actually
> results in a simplification of the analysis of the code. As now all
> opens of ptmx are of /dev/pts/ptmx. /dev/ptmx is now just a magic
> mountpoint that does the right thing.
And what happens when someone tries to rm /dev/ptmx or unmount their
pts instance or similar? What happens if /dev/ptmx is in a mount that
is set to propagate elsewhere but /dev/pts was replaced by an
unprivileged user? (Can this happen? I'm not sure.)
This seems much weirder than the previous approach. I think I'm
starting to come over to Linus' view -- the magic lookup was fine, and
I still can't think of a case where the permissions matter. If we
care, we can cause the /dev/ptmx magic lookup to fail if the devpts it
finds was created with newinstance. (After all, devpts instances
created with newinstance *never* worked via /dev/ptmx magic.)
--Andy
Andy Lutomirski <[email protected]> writes:
> On Fri, Apr 15, 2016 at 8:34 AM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> To recap the situation for those who have not been following closely.
>>
>> There are programs such as xen-create-image that run as root and setup
>> a chroot environment with:
>> "mknod dev/ptmx c 5 2"
>> "mkdir dev/pts"
>> "mount -t devpts none dev/pts"
>>
>> Which mostly works but stomps the mount options of the system /dev/pts.
>> In particular the options of "gid=5,mode=620" are lost resulting in a
>> situation where creating a new pty by opening /dev/ptmx results in
>> that pty having the wrong permissions.
>>
>> Some distributions have been working around this problem by continuing
>> to install a setuid root pt_chown binary that will be called by glibc
>> to fix the permissions.
>>
>> Maintaining a setuid root pt_chown binary is not too scary until
>> multiple instances of devpts are considered at which point it becomes
>> possible to trick the setuid root pt_chown binary into operating on the
>> wrong files and directories. Leading to all of the things one might
>> fear when a setuid root binary goes wrong.
>>
>> The following patchset digs us out of that hole by carefully devpts and
>> /dev/ptmx in a way that does not introduce any userspace regressions,
>> while making each mount of devpts distinct (so pt_chown is unnecessary)
>> and arranging things so that enough information is available so
>> that a secure pt_chown binary is possible to write if that is ever
>> needed.
>>
>> The approach I have chosen to take is to first enhance the /dev/ptmx
>> device node to automount /dev/pts/ptmx on top of it. This leads to a
>> simple high performance solution that allows applications such as
>> xen-create-image (that call "mknod ptmx c 5 2" and mount devpts)
>> to continue to run as before even when they are given a non-system
>> instance of devpts.
>>
>> Using automountic bind mounts of /dev/pts/ptmx results in no new
>> security cases to consider as this can already be done, and actually
>> results in a simplification of the analysis of the code. As now all
>> opens of ptmx are of /dev/pts/ptmx. /dev/ptmx is now just a magic
>> mountpoint that does the right thing.
>
> And what happens when someone tries to rm /dev/ptmx or unmount their
> pts instance or similar?
Unlink works (that was a trivial extension of the existing semantics).
It really didn't make sense that we honor mounts that can appear and
disappear on their own as much as we honor other mounts.
> What happens if /dev/ptmx is in a mount that
> is set to propagate elsewhere but /dev/pts was replaced by an
> unprivileged user? (Can this happen? I'm not sure.)
Well nothing that currently works will break in such a weird scenario.
So I call what happens in the crazy cases you are imagining well defined
obvious semantics (aka all you need to do is to apply the current
rules). Plus you have to be pretty crazy to do something like that.
Further I am 90% certain that the propogation semantics on the /dev
directory are what define how mounts of /dev/pts and /dev/ptmx
propagate.
> This seems much weirder than the previous approach. I think I'm
> starting to come over to Linus' view -- the magic lookup was fine, and
> I still can't think of a case where the permissions matter. If we
> care, we can cause the /dev/ptmx magic lookup to fail if the devpts it
> finds was created with newinstance. (After all, devpts instances
> created with newinstance *never* worked via /dev/ptmx magic.)
It isn't weirder. The rules are actually just a little simpler.
That is my take away from playing with it and working with it. The code
is actually really boring and just works in practice.
But please take my branch and play with it, and see if you can get it to
do something weird and magical. It is observable enough I don't expect
you can. But please. If I am wrong I will happily scrap this.
Right now this looks from my vanrage point like really really obvious
semantics, that don't cause regressions, and that are easy to understand
and observe.
Eric
It's really too bad we can't just use follow_link :-/
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On Fri, Apr 15, 2016 at 10:35:20AM -0500, Eric W. Biederman wrote:
> +static inline bool is_dev_ptmx(struct inode *inode)
> +{
> + return inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR);
> +}
I'm not sure whether it matters, but I think a FUSE filesystem
should be able to create a fake device that passes this check.
fuse_init_inode() calls init_special_inode() with a user-supplied
rdev for character devices. Is that a problem?
On Fri, Apr 15, 2016 at 8:35 AM, Eric W. Biederman
<[email protected]> wrote:
> The devpts filesystem has a notion of a system or primary instance of
> devpts. To retain the notion of a primary system instance of devpts
> the code needs a way to allow userspace to mount the internally
> mounted instance of devpts when it is not currently mounted by
> userspace. The new helper mount_super_once allows that.
This is where I stopped reading this patch series.
No.
We want to get *rid* of the idiotic "primary instance" crap.
The whole and only point of doing the "which pts filesystem am I
associated with" for ptmx is to stop the idiotic "one devpts is
pecial".
I don't want to see 16 random patches.
I want to see *one* patch that makes /dev/ptmx look up the pts
filesystem, and be done with it.
Plonk.
Linus
So I've looked at the devpts patches some more, and I'm still not happy
with them.
And one thing I really detest about them is that the 16-patch series
didn't really make me warm and fuzzy in general. Some of the patches were
fine, but on the whole it all still looked rather hacky.
So I started looking at the code with the intent of trying to clean things
up _gradually_, knowing roughly where we want to end up, but also trying
to make single patches that look sane on their own, and can be judged on
their own without any other patches or even any semantic arguments.
And this appended patch is I think the first step.
What this does is get rid of the horrible notion of having that
struct inode *ptmx_inode
be the interface between the pty code and devpts. By de-emphasizing the
ptmx inode, a lot of things actually get cleaner, and we will have a much
saner way forward.
The patch itself is actually fairly straightforward, and apart from some
locking cleanups it's pretty mechanical:
- the interfaces that devpts exposes all take "struct pts_fs_info *"
instead of "struct inode *ptmx_inode" now.
NOTE! The "struct pts_fs_info" thing is a completely opaque structure
as far as the pty driver is concerned: it's still declared entirely
internally to devpts. So the pty code can't actually access it in any
way, just pass it as a "cookie" to the devpts code.
- the "look up the pts fs info" is now a single clear operation, that
also does the reference count increment on the pts superblock.
So "devpts_add/del_ref()" is gone, and replaced by a "lookup and get
ref" operation (devpts_get_ref(inode)), along with a "put ref" op
(devpts_put_ref()).
- the pty master "tty->driver_data" field now contains the pts_fs_info,
not the ptmx inode.
- because we don't care about the ptmx inode any more as some kind of
base index, the ref counting can now drop the inode games - it just
gets the ref on the superblock.
- the pts_fs_info now has a back-pointer to the super_block. That's so
that we can easily look up the information we actually need. Although
quite often, the pts fs info was actually all we wanted, and not having
to look it up based on some magical inode makes things more
straightforward.
Now, I haven't actually *tested* the patch very much: it compiles, it
boots, and my (fairly limited) environment still works, but that's by no
means exhaustive. So I may have screwed something up, but most of this was
really fairly straightforward.
But more importantly, I think it all makes sense independently of anything
else. In particular, now that "devpts_get_ref(inode)" operation should
really be the *only* place we need to look up what devpts instance we're
associated with, and we do it exactly once, at ptmx_open() time.
So I think this is a good step forward, while avoiding anything that could
be at all controversial.
Comments about the patch?
Linus
---
drivers/tty/pty.c | 63 ++++++++++++++++++++++-------------------------
fs/devpts/inode.c | 49 ++++++++++++++++++------------------
include/linux/devpts_fs.h | 30 +++++++++++++---------
3 files changed, 73 insertions(+), 69 deletions(-)
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index e16a49b507ef..8906470793b9 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -663,14 +663,14 @@ static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty)
/* this is called once with whichever end is closed last */
static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty)
{
- struct inode *ptmx_inode;
+ struct pts_fs_info *fsi;
if (tty->driver->subtype == PTY_TYPE_MASTER)
- ptmx_inode = tty->driver_data;
+ fsi = tty->driver_data;
else
- ptmx_inode = tty->link->driver_data;
- devpts_kill_index(ptmx_inode, tty->index);
- devpts_del_ref(ptmx_inode);
+ fsi = tty->link->driver_data;
+ devpts_kill_index(fsi, tty->index);
+ devpts_put_ref(fsi);
}
static const struct tty_operations ptm_unix98_ops = {
@@ -720,6 +720,7 @@ static const struct tty_operations pty_unix98_ops = {
static int ptmx_open(struct inode *inode, struct file *filp)
{
+ struct pts_fs_info *fsi;
struct tty_struct *tty;
struct inode *slave_inode;
int retval;
@@ -734,47 +735,41 @@ static int ptmx_open(struct inode *inode, struct file *filp)
if (retval)
return retval;
+ fsi = devpts_get_ref(inode);
+ retval = -ENODEV;
+ if (!fsi)
+ goto out_free_file;
+
/* find a device that is not in use. */
mutex_lock(&devpts_mutex);
- index = devpts_new_index(inode);
- if (index < 0) {
- retval = index;
- mutex_unlock(&devpts_mutex);
- goto err_file;
- }
-
+ index = devpts_new_index(fsi);
mutex_unlock(&devpts_mutex);
- mutex_lock(&tty_mutex);
- tty = tty_init_dev(ptm_driver, index);
+ retval = index;
+ if (index < 0)
+ goto out_put_ref;
- if (IS_ERR(tty)) {
- retval = PTR_ERR(tty);
- goto out;
- }
+ mutex_lock(&tty_mutex);
+ tty = tty_init_dev(ptm_driver, index);
/* The tty returned here is locked so we can safely
drop the mutex */
mutex_unlock(&tty_mutex);
- set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
- tty->driver_data = inode;
+ retval = PTR_ERR(tty);
+ if (IS_ERR(tty))
+ goto out;
/*
- * In the case where all references to ptmx inode are dropped and we
- * still have /dev/tty opened pointing to the master/slave pair (ptmx
- * is closed/released before /dev/tty), we must make sure that the inode
- * is still valid when we call the final pty_unix98_shutdown, thus we
- * hold an additional reference to the ptmx inode. For the same /dev/tty
- * last close case, we also need to make sure the super_block isn't
- * destroyed (devpts instance unmounted), before /dev/tty is closed and
- * on its release devpts_kill_index is called.
+ * From here on out, the tty is "live", and the index and
+ * fsi will be killed/put by the tty_release()
*/
- devpts_add_ref(inode);
+ set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
+ tty->driver_data = fsi;
tty_add_file(tty, filp);
- slave_inode = devpts_pty_new(inode,
+ slave_inode = devpts_pty_new(fsi,
MKDEV(UNIX98_PTY_SLAVE_MAJOR, index), index,
tty->link);
if (IS_ERR(slave_inode)) {
@@ -793,12 +788,14 @@ static int ptmx_open(struct inode *inode, struct file *filp)
return 0;
err_release:
tty_unlock(tty);
+ // This will also put-ref the fsi
tty_release(inode, filp);
return retval;
out:
- mutex_unlock(&tty_mutex);
- devpts_kill_index(inode, index);
-err_file:
+ devpts_kill_index(fsi, index);
+out_put_ref:
+ devpts_put_ref(fsi);
+out_free_file:
tty_free_file(filp);
return retval;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 655f21f99160..61ae12f5670e 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -128,6 +128,7 @@ static const match_table_t tokens = {
struct pts_fs_info {
struct ida allocated_ptys;
struct pts_mount_opts mount_opts;
+ struct super_block *sb;
struct dentry *ptmx_dentry;
};
@@ -358,7 +359,7 @@ static const struct super_operations devpts_sops = {
.show_options = devpts_show_options,
};
-static void *new_pts_fs_info(void)
+static void *new_pts_fs_info(struct super_block *sb)
{
struct pts_fs_info *fsi;
@@ -369,6 +370,7 @@ static void *new_pts_fs_info(void)
ida_init(&fsi->allocated_ptys);
fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+ fsi->sb = sb;
return fsi;
}
@@ -384,7 +386,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &devpts_sops;
s->s_time_gran = 1;
- s->s_fs_info = new_pts_fs_info();
+ s->s_fs_info = new_pts_fs_info(s);
if (!s->s_fs_info)
goto fail;
@@ -524,17 +526,14 @@ static struct file_system_type devpts_fs_type = {
* to the System V naming convention
*/
-int devpts_new_index(struct inode *ptmx_inode)
+int devpts_new_index(struct pts_fs_info *fsi)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi;
int index;
int ida_ret;
- if (!sb)
+ if (!fsi)
return -ENODEV;
- fsi = DEVPTS_SB(sb);
retry:
if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
return -ENOMEM;
@@ -564,11 +563,8 @@ retry:
return index;
}
-void devpts_kill_index(struct inode *ptmx_inode, int idx)
+void devpts_kill_index(struct pts_fs_info *fsi, int idx)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
-
mutex_lock(&allocated_ptys_lock);
ida_remove(&fsi->allocated_ptys, idx);
pty_count--;
@@ -578,21 +574,25 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
/*
* pty code needs to hold extra references in case of last /dev/tty close
*/
-
-void devpts_add_ref(struct inode *ptmx_inode)
+struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+ struct super_block *sb;
+ struct pts_fs_info *fsi;
+
+ sb = pts_sb_from_inode(ptmx_inode);
+ if (!sb)
+ return NULL;
+ fsi = DEVPTS_SB(sb);
+ if (!fsi)
+ return NULL;
atomic_inc(&sb->s_active);
- ihold(ptmx_inode);
+ return fsi;
}
-void devpts_del_ref(struct inode *ptmx_inode)
+void devpts_put_ref(struct pts_fs_info *fsi)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-
- iput(ptmx_inode);
- deactivate_super(sb);
+ deactivate_super(fsi->sb);
}
/**
@@ -604,22 +604,21 @@ void devpts_del_ref(struct inode *ptmx_inode)
*
* The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
*/
-struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
+struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index,
void *priv)
{
struct dentry *dentry;
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+ struct super_block *sb;
struct inode *inode;
struct dentry *root;
- struct pts_fs_info *fsi;
struct pts_mount_opts *opts;
char s[12];
- if (!sb)
+ if (!fsi)
return ERR_PTR(-ENODEV);
+ sb = fsi->sb;
root = sb->s_root;
- fsi = DEVPTS_SB(sb);
opts = &fsi->mount_opts;
inode = new_inode(sb);
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index e0ee0b3000b2..d2f9517ca000 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -15,15 +15,19 @@
#include <linux/errno.h>
+struct pts_fs_info;
+
#ifdef CONFIG_UNIX98_PTYS
-int devpts_new_index(struct inode *ptmx_inode);
-void devpts_kill_index(struct inode *ptmx_inode, int idx);
-void devpts_add_ref(struct inode *ptmx_inode);
-void devpts_del_ref(struct inode *ptmx_inode);
+/* Look up a pts fs info and get a ref to it */
+struct pts_fs_info *devpts_get_ref(struct inode *);
+void devpts_put_ref(struct pts_fs_info *);
+
+int devpts_new_index(struct pts_fs_info *);
+void devpts_kill_index(struct pts_fs_info *, int);
+
/* mknod in devpts */
-struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
- void *priv);
+struct inode *devpts_pty_new(struct pts_fs_info *, dev_t, int, void *);
/* get private structure */
void *devpts_get_priv(struct inode *pts_inode);
/* unlink */
@@ -32,11 +36,15 @@ void devpts_pty_kill(struct inode *inode);
#else
/* Dummy stubs in the no-pty case */
-static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; }
-static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { }
-static inline void devpts_add_ref(struct inode *ptmx_inode) { }
-static inline void devpts_del_ref(struct inode *ptmx_inode) { }
-static inline struct inode *devpts_pty_new(struct inode *ptmx_inode,
+static inline struct pts_fs_info *devpts_get_ref(struct pts_fs_info *)
+{
+ return NULL;
+}
+static inline void devpts_put_ref(struct pts_fs_info *) { }
+
+static inline int devpts_new_index(struct pts_fs_info *) { return -EINVAL; }
+static inline void devpts_kill_index(struct pts_fs_info *, int idx) { }
+static inline struct inode *devpts_pty_new(struct pts_fs_info *pts_fs,
dev_t device, int index, void *priv)
{
return ERR_PTR(-EINVAL);
Linus Torvalds <[email protected]> writes:
> On Fri, Apr 15, 2016 at 8:35 AM, Eric W. Biederman
> <[email protected]> wrote:
>> The devpts filesystem has a notion of a system or primary instance of
>> devpts. To retain the notion of a primary system instance of devpts
>> the code needs a way to allow userspace to mount the internally
>> mounted instance of devpts when it is not currently mounted by
>> userspace. The new helper mount_super_once allows that.
>
> This is where I stopped reading this patch series.
>
> No.
>
> We want to get *rid* of the idiotic "primary instance" crap.
That is actually pretty much the opposite of what you said last time,
but having looked at the cost to maintian a "primary instance" notion
and what will break if we don't I am happy to remove such a notion
from devpts.
> The whole and only point of doing the "which pts filesystem am I
> associated with" for ptmx is to stop the idiotic "one devpts is
> pecial".
>
> I don't want to see 16 random patches.
>
> I want to see *one* patch that makes /dev/ptmx look up the pts
> filesystem, and be done with it.
Now that I know where most of the landmines are in userspace I performed
some limited testing to see to see what the implications are:
Causing every userspace mount of devpts to use mount_nodev means the
following:
- Support for reserving ptys for the system devpts instance using
/proc/sys/kernel/pty/reserve needs to be removed.
- On CentOS6 devpts will wind up mounted twice /dev/pts.
- Open of /dev/ptmx can use devpts_mnt to find the devpts filesystem.
In my cursory testing userspace still boots and works desipite those
changes so I am quite happy to go down this path.
Eric
On April 19, 2016 11:22:24 AM PDT, [email protected] wrote:
>Linus Torvalds <[email protected]> writes:
>
>> On Fri, Apr 15, 2016 at 8:35 AM, Eric W. Biederman
>> <[email protected]> wrote:
>>> The devpts filesystem has a notion of a system or primary instance
>of
>>> devpts. To retain the notion of a primary system instance of devpts
>>> the code needs a way to allow userspace to mount the internally
>>> mounted instance of devpts when it is not currently mounted by
>>> userspace. The new helper mount_super_once allows that.
>>
>> This is where I stopped reading this patch series.
>>
>> No.
>>
>> We want to get *rid* of the idiotic "primary instance" crap.
>
>That is actually pretty much the opposite of what you said last time,
>but having looked at the cost to maintian a "primary instance" notion
>and what will break if we don't I am happy to remove such a notion
>from devpts.
>
>> The whole and only point of doing the "which pts filesystem am I
>> associated with" for ptmx is to stop the idiotic "one devpts is
>> pecial".
>>
>> I don't want to see 16 random patches.
>>
>> I want to see *one* patch that makes /dev/ptmx look up the pts
>> filesystem, and be done with it.
>
>Now that I know where most of the landmines are in userspace I
>performed
>some limited testing to see to see what the implications are:
>
>Causing every userspace mount of devpts to use mount_nodev means the
>following:
>
>- Support for reserving ptys for the system devpts instance using
> /proc/sys/kernel/pty/reserve needs to be removed.
>
>- On CentOS6 devpts will wind up mounted twice /dev/pts.
>
>- Open of /dev/ptmx can use devpts_mnt to find the devpts filesystem.
>
>In my cursory testing userspace still boots and works desipite those
>changes so I am quite happy to go down this path.
>
>Eric
pty capping should probably be a devpts mount option, and perhaps a sufficiently privileged user could be allowed to set another mount option to allow that instance to dip into the reserved pool or exempt it completely from the global limit as set in sysctl.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
Linus Torvalds <[email protected]> writes:
> What this does is get rid of the horrible notion of having that
>
> struct inode *ptmx_inode
>
> be the interface between the pty code and devpts. By de-emphasizing the
> ptmx inode, a lot of things actually get cleaner, and we will have a much
> saner way forward.
I will take a look in a minute. Before I do that I want to mention
why I care about /dev/pts/ptmx.
There is a posix function that is widely used called ptsname. It's
function is to take a master file descriptor and returns the path to the
slave.
All we have in the kernel to support ptsname is an ioctl TIOCGPTN that
returns the pty number in the appropriate instance of devpts.
The only way we have today to query which instance of devpts the pty is
on is through fstat and look st_dev to see if the file is on the correct
filesystem. This works when /dev/pts/ptmx is used and fails when
/dev/ptmx is used.
Does anyone else care?
If no one cares I will stop worrying about it and just get on with
fixing the rest of this mess which there definitely seems to be the will
to do.
Eric
Jann Horn <[email protected]> writes:
> On Fri, Apr 15, 2016 at 10:35:20AM -0500, Eric W. Biederman wrote:
>> +static inline bool is_dev_ptmx(struct inode *inode)
>> +{
>> + return inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR);
>> +}
>
> I'm not sure whether it matters, but I think a FUSE filesystem
> should be able to create a fake device that passes this check.
> fuse_init_inode() calls init_special_inode() with a user-supplied
> rdev for character devices. Is that a problem?
It isn't a problem with fuse, and I don't think it is a real problem
in general, but yes it would be good if that code had honored nodev
mount options.
Eric
On April 19, 2016 11:22:24 AM PDT, [email protected] wrote:
>Linus Torvalds <[email protected]> writes:
>
>> On Fri, Apr 15, 2016 at 8:35 AM, Eric W. Biederman
>> <[email protected]> wrote:
>>> The devpts filesystem has a notion of a system or primary instance
>of
>>> devpts. To retain the notion of a primary system instance of devpts
>>> the code needs a way to allow userspace to mount the internally
>>> mounted instance of devpts when it is not currently mounted by
>>> userspace. The new helper mount_super_once allows that.
>>
>> This is where I stopped reading this patch series.
>>
>> No.
>>
>> We want to get *rid* of the idiotic "primary instance" crap.
>
>That is actually pretty much the opposite of what you said last time,
>but having looked at the cost to maintian a "primary instance" notion
>and what will break if we don't I am happy to remove such a notion
>from devpts.
>
>> The whole and only point of doing the "which pts filesystem am I
>> associated with" for ptmx is to stop the idiotic "one devpts is
>> pecial".
>>
>> I don't want to see 16 random patches.
>>
>> I want to see *one* patch that makes /dev/ptmx look up the pts
>> filesystem, and be done with it.
>
>Now that I know where most of the landmines are in userspace I
>performed
>some limited testing to see to see what the implications are:
>
>Causing every userspace mount of devpts to use mount_nodev means the
>following:
>
>- Support for reserving ptys for the system devpts instance using
> /proc/sys/kernel/pty/reserve needs to be removed.
>
>- On CentOS6 devpts will wind up mounted twice /dev/pts.
>
>- Open of /dev/ptmx can use devpts_mnt to find the devpts filesystem.
>
>In my cursory testing userspace still boots and works desipite those
>changes so I am quite happy to go down this path.
>
>Eric
On a totally different subject: I'm still quite unhappy with the pty number assignment policy. We had to move away from a completely circular policy because of glibc issues in the handling of utmp, which may or may not have been fixed since, but the current policy of always allocating the lowest number is absolutely pessimal for accidental references to a stale pty.
At the time, I suggested that we'd go with a circular policy within the used high water mark, which would still work around the glibc problem; another option would be to quarantine a pty number for a time before reallocation.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
"H. Peter Anvin" <[email protected]> writes:
> It's really too bad we can't just use follow_link :-/
Well follow_link is actually impossible to use as it doesn't exist
anymore. The routine now is get_link. ;-)
That said just to be certain of where everything stands I took a look to
verify that we can't.
I got about half way there.
I tweaked init_special_inode, and d_flags_for_inode to set inode->i_link
to "pts/ptmx", and to set DCACHE_SYMLINK_TYPE (aka made it so
d_is_symlink returned true) on all instances of /dev/ptmx except
/dev/pts/ptmx.
Things sort of worked and things also acted very very weird.
It is tempting because we would not need special vfs helpers, and would
not require much code, but since it did not work cleanly I give that
case a pass.
I think old udev on distro's like centos5 and centos6 are actually
a limit, as udev does something like calling stat on the device node
after creation to ensure everything was created properly.
Eric
"H. Peter Anvin" <[email protected]> writes:
>>- Support for reserving ptys for the system devpts instance using
>> /proc/sys/kernel/pty/reserve needs to be removed.
>>
>>Eric
>
> pty capping should probably be a devpts mount option
There is a max option so pty capping is a per devpts option.
> , and perhaps a
> sufficiently privileged user could be allowed to set another mount
> option to allow that instance to dip into the reserved pool or exempt
> it completely from the global limit as set in sysctl.
I agree that we could keep the reserved pool, and add a new way to
access it. However no piece of existing userspace could use it. So the
simplest thing to do (unless something actually breaks), is to just
remove the reserve pool.
Eric
On April 19, 2016 11:44:40 AM PDT, [email protected] wrote:
>Linus Torvalds <[email protected]> writes:
>
>> What this does is get rid of the horrible notion of having that
>>
>> struct inode *ptmx_inode
>>
>> be the interface between the pty code and devpts. By de-emphasizing
>the
>> ptmx inode, a lot of things actually get cleaner, and we will have a
>much
>> saner way forward.
>
>I will take a look in a minute. Before I do that I want to mention
>why I care about /dev/pts/ptmx.
>
>There is a posix function that is widely used called ptsname. It's
>function is to take a master file descriptor and returns the path to
>the
>slave.
>
>All we have in the kernel to support ptsname is an ioctl TIOCGPTN that
>returns the pty number in the appropriate instance of devpts.
>
>The only way we have today to query which instance of devpts the pty is
>on is through fstat and look st_dev to see if the file is on the
>correct
>filesystem. This works when /dev/pts/ptmx is used and fails when
>/dev/ptmx is used.
>
>Does anyone else care?
>
>If no one cares I will stop worrying about it and just get on with
>fixing the rest of this mess which there definitely seems to be the
>will
>to do.
>
>Eric
We could add another ioctl for that purpose of we need to. Perhaps an ioctl which returns a file descriptor to the slave device?
However, since we are now defining ptmx to explicitly look up pts/ by name it seems like /dev/ptmx -> /dev/pts/# is true by definition. If what you worry about is namespace reshuffling then ptsname() is the wrong interface in the first place since it returns a pathname.
Fwiw, in klibc ptsname() is basically just an sprintf().
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On April 19, 2016 12:03:47 PM PDT, [email protected] wrote:
>"H. Peter Anvin" <[email protected]> writes:
>
>>>- Support for reserving ptys for the system devpts instance using
>>> /proc/sys/kernel/pty/reserve needs to be removed.
>>>
>>>Eric
>>
>> pty capping should probably be a devpts mount option
>
>There is a max option so pty capping is a per devpts option.
>
>> , and perhaps a
>> sufficiently privileged user could be allowed to set another mount
>> option to allow that instance to dip into the reserved pool or exempt
>> it completely from the global limit as set in sysctl.
>
>I agree that we could keep the reserved pool, and add a new way to
>access it. However no piece of existing userspace could use it. So
>the
>simplest thing to do (unless something actually breaks), is to just
>remove the reserve pool.
>
>Eric
Perhaps a (privileged) option to exempt from the global limit, then. Something we can implement if asked for.
However, I wouldn't be 100% that the reserved pool isn't used. Someone added it presumably for a reason. An administrator could say it and we'd have no idea.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On April 19, 2016 12:25:03 PM PDT, "H. Peter Anvin" <[email protected]> wrote:
>On April 19, 2016 12:03:47 PM PDT, [email protected] wrote:
>>"H. Peter Anvin" <[email protected]> writes:
>>
>>>>- Support for reserving ptys for the system devpts instance using
>>>> /proc/sys/kernel/pty/reserve needs to be removed.
>>>>
>>>>Eric
>>>
>>> pty capping should probably be a devpts mount option
>>
>>There is a max option so pty capping is a per devpts option.
>>
>>> , and perhaps a
>>> sufficiently privileged user could be allowed to set another mount
>>> option to allow that instance to dip into the reserved pool or
>exempt
>>> it completely from the global limit as set in sysctl.
>>
>>I agree that we could keep the reserved pool, and add a new way to
>>access it. However no piece of existing userspace could use it. So
>>the
>>simplest thing to do (unless something actually breaks), is to just
>>remove the reserve pool.
>>
>>Eric
>
>Perhaps a (privileged) option to exempt from the global limit, then.
>Something we can implement if asked for.
>
>However, I wouldn't be 100% that the reserved pool isn't used. Someone
>added it presumably for a reason. An administrator could say it and
>we'd have no idea.
... and if I personally was running a container-hosting system, I would *absolutely* set it to make sure the administrator could not get locked out.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
Quoting Eric W. Biederman ([email protected]):
> Linus Torvalds <[email protected]> writes:
>
> > What this does is get rid of the horrible notion of having that
> >
> > struct inode *ptmx_inode
> >
> > be the interface between the pty code and devpts. By de-emphasizing the
> > ptmx inode, a lot of things actually get cleaner, and we will have a much
> > saner way forward.
>
> I will take a look in a minute. Before I do that I want to mention
> why I care about /dev/pts/ptmx.
>
> There is a posix function that is widely used called ptsname. It's
> function is to take a master file descriptor and returns the path to the
> slave.
>
> All we have in the kernel to support ptsname is an ioctl TIOCGPTN that
> returns the pty number in the appropriate instance of devpts.
>
> The only way we have today to query which instance of devpts the pty is
> on is through fstat and look st_dev to see if the file is on the correct
> filesystem. This works when /dev/pts/ptmx is used and fails when
> /dev/ptmx is used.
>
> Does anyone else care?
>
> If no one cares I will stop worrying about it and just get on with
> fixing the rest of this mess which there definitely seems to be the will
> to do.
There seem to be quite a few users of ptsname (as found by
codesearch.debian.net). I'm going to look through those results
a bit more tonight. One common idiom is
blah = ptsname(fd);
slavefd = open(blah, ...);
That's more easily solved with a new helper.
The scariest thing of course is any callers of ptsname which are setuid-root.
short answer: i'm going to do some research to try and answer "who cares".
-serge
"H. Peter Anvin" <[email protected]> writes:
> We could add another ioctl for that purpose of we need to. Perhaps an
> ioctl which returns a file descriptor to the slave device?
>
> However, since we are now defining ptmx to explicitly look up pts/ by
> name it seems like /dev/ptmx -> /dev/pts/# is true by definition. If
> what you worry about is namespace reshuffling then ptsname() is the
> wrong interface in the first place since it returns a pathname.
Good point. Yes if we are not using devpts_mnt (hooray!) there should
be no complications, and the largest check we would need is to verify
that /dev/ptmx is in the current namespace.
> Fwiw, in klibc ptsname() is basically just an sprintf().
The challenge came in operations such as granpt. Where you are passed
in a ptmx file descriptor from who knows where, and you pass it on
to applications such as pt_chown which run with elevatated privileged.
As the information is available of where devpts is mounted in
relationship to /dev/ptmx I have no more concerns about implementing
ptsname. Path pased is also sufficiently backwards compatible it would
not usually be wrong even on existing kernels.
Good enough.
Eric
On 04/19/2016 01:32 PM, Eric W. Biederman wrote:
>
> The challenge came in operations such as granpt. Where you are passed
> in a ptmx file descriptor from who knows where, and you pass it on
> to applications such as pt_chown which run with elevatated privileged.
>
> As the information is available of where devpts is mounted in
> relationship to /dev/ptmx I have no more concerns about implementing
> ptsname. Path pased is also sufficiently backwards compatible it would
> not usually be wrong even on existing kernels.
>
pt_chown is evil. It should have been removed ages ago, and from the
very beginning have failed if run on a devpts filesystem. Unfortunately
the glibc people didn't do so, and that is a major reason we're in the
current mess.
That being said, the ioctl(TIOCOPENSLAVE) idea would deal with that.
-hpa
Linus Torvalds <[email protected]> writes:
> So I've looked at the devpts patches some more, and I'm still not happy
> with them.
>
> And one thing I really detest about them is that the 16-patch series
> didn't really make me warm and fuzzy in general. Some of the patches were
> fine, but on the whole it all still looked rather hacky.
>
> So I started looking at the code with the intent of trying to clean things
> up _gradually_, knowing roughly where we want to end up, but also trying
> to make single patches that look sane on their own, and can be judged on
> their own without any other patches or even any semantic arguments.
>
> And this appended patch is I think the first step.
>
> What this does is get rid of the horrible notion of having that
>
> struct inode *ptmx_inode
>
> be the interface between the pty code and devpts. By de-emphasizing the
> ptmx inode, a lot of things actually get cleaner, and we will have a much
> saner way forward.
>
> The patch itself is actually fairly straightforward, and apart from some
> locking cleanups it's pretty mechanical:
>
> - the interfaces that devpts exposes all take "struct pts_fs_info *"
> instead of "struct inode *ptmx_inode" now.
>
> NOTE! The "struct pts_fs_info" thing is a completely opaque structure
> as far as the pty driver is concerned: it's still declared entirely
> internally to devpts. So the pty code can't actually access it in any
> way, just pass it as a "cookie" to the devpts code.
>
> - the "look up the pts fs info" is now a single clear operation, that
> also does the reference count increment on the pts superblock.
>
> So "devpts_add/del_ref()" is gone, and replaced by a "lookup and get
> ref" operation (devpts_get_ref(inode)), along with a "put ref" op
> (devpts_put_ref()).
>
> - the pty master "tty->driver_data" field now contains the pts_fs_info,
> not the ptmx inode.
>
> - because we don't care about the ptmx inode any more as some kind of
> base index, the ref counting can now drop the inode games - it just
> gets the ref on the superblock.
>
> - the pts_fs_info now has a back-pointer to the super_block. That's so
> that we can easily look up the information we actually need. Although
> quite often, the pts fs info was actually all we wanted, and not having
> to look it up based on some magical inode makes things more
> straightforward.
>
> Now, I haven't actually *tested* the patch very much: it compiles, it
> boots, and my (fairly limited) environment still works, but that's by no
> means exhaustive. So I may have screwed something up, but most of this was
> really fairly straightforward.
>
> But more importantly, I think it all makes sense independently of anything
> else. In particular, now that "devpts_get_ref(inode)" operation should
> really be the *only* place we need to look up what devpts instance we're
> associated with, and we do it exactly once, at ptmx_open() time.
>
> So I think this is a good step forward, while avoiding anything that could
> be at all controversial.
>
> Comments about the patch?
Mostly I think it is six of one half dozen of the other, but given
driver_data is used so few times and that those times are very clear
whether we are accessing a master or a slave worth doing just for
clarity.
I have work inspired by this rolled into my code. I will post shortly
after a little more testing.
Eric
On Tue, Apr 19, 2016 at 11:44 AM, Eric W. Biederman
<[email protected]> wrote:
>
> I will take a look in a minute. Before I do that I want to mention
> why I care about /dev/pts/ptmx.
>
> There is a posix function that is widely used called ptsname. It's
> function is to take a master file descriptor and returns the path to the
> slave.
>
> All we have in the kernel to support ptsname is an ioctl TIOCGPTN that
> returns the pty number in the appropriate instance of devpts.
Don't bother with that completely mis-designed interface.It's crap.
So we'll keep it working for legacy models, but the whole "return an
integer index" is just pure shit. It's not worth worrying about.
We can (and probably should) just introduce a new ioctl or even a
system call that just does the sane thing and returns the pathname
from the kernel.
But for legacy reasons, we will continue to just return that silly
integer, and it will continue to work - if you use /dev/pts/<n>.
And if you mount devpts anywhere else, or have some other setup, that
interface *cannot* work. More importantly, it's not even worth
worrying about.
Linus
On Tue, Apr 19, 2016 at 11:22 AM, Eric W. Biederman
<[email protected]> wrote:
> Linus Torvalds <[email protected]> writes:
>>
>> No.
>>
>> We want to get *rid* of the idiotic "primary instance" crap.
>
> That is actually pretty much the opposite of what you said last time,
> but having looked at the cost to maintian a "primary instance" notion
> and what will break if we don't I am happy to remove such a notion
> from devpts.
No, it's what I said last time too, but there may have been an misunderstanding.
I _violently_ oppose the stupid DEVPTS_MULTIPLE_INSTANCES config option.
But I do _not_ oppose it because I want to have that "primary
instance" cap. Quite the reverse. I oppose it because the *option* is
stupid, and we should not have such an option.
The kernel should always do multiple instances. There should never
_ever_ be a "single instance" option, although obviously if you were
to do a bind-mount of devpts, the bound mount would be the same
instance.
But if you do two separate mounts, they should alwats be separate insnaces.
So what I want to fix is the crazy "that doesn't work" problem with
Ubuntu or CentOS (or whatever distro it was) breaking, because
/dev/ptmx ends up pointing to the *wrong* instance.
This is literally why I think /dev/ptmx should just look up /dev/pts -
simply to get rid of that idiotic special instance. It should not
exist. It should not exist in the kernel, and it shouldn't exist in
user space.
Linus
On Tue, Apr 19, 2016 at 3:06 PM, Eric W. Biederman
<[email protected]> wrote:
>
> I have work inspired by this rolled into my code. I will post shortly
> after a little more testing.
Actually, I have a slightly fixed version in my tree. I've been
running this on my own machines for a few days, just to verify, along
with some testing.
The fixes are some cleanups of the header file (the !UNIX98 section
that nobody uses was bogus), and fixing "devpts_get_ref()" to get the
"struct file *" argument too. The current code doesn't need it, but
the code to actually look up the right pts/ directory from the ptmx
file open needs it because that's where the path is - passing in just
the inode isn't sufficient).
Anyway, I think I'll just merge my branch instead of sending out
another emailed patch, because I don't think that patch is
controversial or unsafe. It doesn't actually change any semantics, and
only does cleanups. If it breaks something due to the rules about
private_data being different for slave and master side pty's, then the
old code was broken too - it used to always put a inode pointer in
there, but they were very different inode pointers.
Linus
On 04/19/2016 04:23 PM, Linus Torvalds wrote:
> On Tue, Apr 19, 2016 at 11:44 AM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> I will take a look in a minute. Before I do that I want to mention
>> why I care about /dev/pts/ptmx.
>>
>> There is a posix function that is widely used called ptsname. It's
>> function is to take a master file descriptor and returns the path to the
>> slave.
>>
>> All we have in the kernel to support ptsname is an ioctl TIOCGPTN that
>> returns the pty number in the appropriate instance of devpts.
>
> Don't bother with that completely mis-designed interface.It's crap.
>
> So we'll keep it working for legacy models, but the whole "return an
> integer index" is just pure shit. It's not worth worrying about.
>
> We can (and probably should) just introduce a new ioctl or even a
> system call that just does the sane thing and returns the pathname
> from the kernel.
>
What do you think of the idea of TIOCPTSOPEN (or whatever) to get a file
descriptor for the slave device given the master device? Then we can
use realpath() or a readlink on /proc/self/fd/# to get the pathname if
needed.
(Incidentally, we added getcwd() as a system call. Should we add
[f]realpath() as a system call too?)
-hpa
On Tue, Apr 19, 2016 at 4:39 PM, H. Peter Anvin <[email protected]> wrote:
>
> What do you think of the idea of TIOCPTSOPEN (or whatever) to get a file
> descriptor for the slave device given the master device? Then we can
> use realpath() or a readlink on /proc/self/fd/# to get the pathname if
> needed.
That sounds reasonable to me.
> (Incidentally, we added getcwd() as a system call. Should we add
> [f]realpath() as a system call too?)
Probably a good idea. It's kind of silly to have to do a sprintf
followed by a readlink().
Linus
On 04/19/2016 04:35 PM, Linus Torvalds wrote:
> On Tue, Apr 19, 2016 at 3:06 PM, Eric W. Biederman
> <[email protected]> wrote:
>>
>> I have work inspired by this rolled into my code. I will post shortly
>> after a little more testing.
>
> Actually, I have a slightly fixed version in my tree. I've been
> running this on my own machines for a few days, just to verify, along
> with some testing.
>
> The fixes are some cleanups of the header file (the !UNIX98 section
> that nobody uses was bogus), and fixing "devpts_get_ref()" to get the
> "struct file *" argument too. The current code doesn't need it, but
> the code to actually look up the right pts/ directory from the ptmx
> file open needs it because that's where the path is - passing in just
> the inode isn't sufficient).
>
> Anyway, I think I'll just merge my branch instead of sending out
> another emailed patch, because I don't think that patch is
> controversial or unsafe. It doesn't actually change any semantics, and
> only does cleanups. If it breaks something due to the rules about
> private_data being different for slave and master side pty's, then the
> old code was broken too - it used to always put a inode pointer in
> there, but they were very different inode pointers.
Yes, the old code was broken. The new code always uses the ptmx inode.
Which is a stopgap measure to prevent the devpts instance from
teardown when the only open filp is /dev/tty.
What I want to do instead (but not for -stable) is bump the inode
reference at controlling tty association instead (either first
qualifying open or ioctl(TIOCSCTTY)). That way it would always be
the slave inode. This method is non-trivial though because it makes
disassociation more complicated.
Regards,
Peter Hurley
On 04/19/2016 05:24 PM, Peter Hurley wrote:
> On 04/19/2016 04:35 PM, Linus Torvalds wrote:
>> On Tue, Apr 19, 2016 at 3:06 PM, Eric W. Biederman
>> <[email protected]> wrote:
>>>
>>> I have work inspired by this rolled into my code. I will post shortly
>>> after a little more testing.
>>
>> Actually, I have a slightly fixed version in my tree. I've been
>> running this on my own machines for a few days, just to verify, along
>> with some testing.
>>
>> The fixes are some cleanups of the header file (the !UNIX98 section
>> that nobody uses was bogus), and fixing "devpts_get_ref()" to get the
>> "struct file *" argument too. The current code doesn't need it, but
>> the code to actually look up the right pts/ directory from the ptmx
>> file open needs it because that's where the path is - passing in just
>> the inode isn't sufficient).
>>
>> Anyway, I think I'll just merge my branch instead of sending out
>> another emailed patch, because I don't think that patch is
>> controversial or unsafe. It doesn't actually change any semantics, and
>> only does cleanups. If it breaks something due to the rules about
>> private_data being different for slave and master side pty's, then the
>> old code was broken too - it used to always put a inode pointer in
>> there, but they were very different inode pointers.
>
> Yes, the old code was broken. The new code always uses the ptmx inode.
In re-reading, I was not as clear as I should have been here.
Yes, the different inodes are stored in their corresponding driver_data
but the ptmx inode is always used for final teardown.
> Which is a stopgap measure to prevent the devpts instance from
> teardown when the only open filp is /dev/tty.
>
> What I want to do instead (but not for -stable) is bump the inode
> reference at controlling tty association instead (either first
> qualifying open or ioctl(TIOCSCTTY)). That way it would always be
> the slave inode. This method is non-trivial though because it makes
> disassociation more complicated.
>
> Regards,
> Peter Hurley
>
On Tue, Apr 19, 2016 at 4:29 PM, Linus Torvalds
<[email protected]> wrote:
>
> I _violently_ oppose the stupid DEVPTS_MULTIPLE_INSTANCES config option.
So just to show what I want to actually happen, here's the hacky patch
on top of my (now merged) cleanup patch that actually does what I want
devpts to do.
I say it's hacky, because the "follow_mount()" thing there really is
pretty hacky. Al - suggestions for how to do this *right*?
But this actually forcibly removes the whole "newinstance" thing, and
makes every pts mount a new instance, and just relies on "ptmx" doing
the right thing.
In other words, with this patch, you can *literally* do just this (as
root, obviously):
mkdir test-dir
cd test-dir
mknod ptmx c 5 2
mkdir pts
mount -t devpts pts pts
and after that it all just works. You can do this:
ls -l pts
which shows just the other ptmx noode (that is unused and pointless -
I'd actually like to just remove it, but whatever), and then you can
do
sleep 100 < ptmx &
sleep 100 < ptmx &
ls -l pts
and you will magically see those new 0/1 entries in that pts
subdirectory.. It's entirely independent of /dev/pts/, and there's no
magic connection or any magic dis-connection. It all JustWorks(tm).
Note how this works even *outside* of /dev. But it works inside of
/dev equally well.
Now, a *real* patch would
- solve that "follow_mount()" issue some other way
- not remove the newinstance code immediately (I did it to show that
even the bootup works with a unmodified distro)
- actually remove the whole "DEVPTS_MULTIPLE_INSTANCES" config option
- I'm not happy with devpts_pty_kill(). I would want to clean that up
a bit somehow. I think this is at least partly what Peter Hurley was
talking about. That thing is not pretty.
so this attached patch is by no means meant to be applied as-is. But
it's meant to show what (a) the new organization allows and (b) what I
was going for.
Linus
On April 19, 2016 6:24:12 PM PDT, Linus Torvalds <[email protected]> wrote:
>On Tue, Apr 19, 2016 at 4:29 PM, Linus Torvalds
><[email protected]> wrote:
>>
>> I _violently_ oppose the stupid DEVPTS_MULTIPLE_INSTANCES config
>option.
>
>So just to show what I want to actually happen, here's the hacky patch
>on top of my (now merged) cleanup patch that actually does what I want
>devpts to do.
>
>I say it's hacky, because the "follow_mount()" thing there really is
>pretty hacky. Al - suggestions for how to do this *right*?
>
>But this actually forcibly removes the whole "newinstance" thing, and
>makes every pts mount a new instance, and just relies on "ptmx" doing
>the right thing.
>
>In other words, with this patch, you can *literally* do just this (as
>root, obviously):
>
> mkdir test-dir
> cd test-dir
>
> mknod ptmx c 5 2
> mkdir pts
> mount -t devpts pts pts
>
>and after that it all just works. You can do this:
>
> ls -l pts
>
>which shows just the other ptmx noode (that is unused and pointless -
>I'd actually like to just remove it, but whatever), and then you can
>do
>
> sleep 100 < ptmx &
> sleep 100 < ptmx &
> ls -l pts
>
>and you will magically see those new 0/1 entries in that pts
>subdirectory.. It's entirely independent of /dev/pts/, and there's no
>magic connection or any magic dis-connection. It all JustWorks(tm).
>
>Note how this works even *outside* of /dev. But it works inside of
>/dev equally well.
>
>Now, a *real* patch would
>
> - solve that "follow_mount()" issue some other way
>
> - not remove the newinstance code immediately (I did it to show that
>even the bootup works with a unmodified distro)
>
> - actually remove the whole "DEVPTS_MULTIPLE_INSTANCES" config option
>
> - I'm not happy with devpts_pty_kill(). I would want to clean that up
>a bit somehow. I think this is at least partly what Peter Hurley was
>talking about. That thing is not pretty.
>
>so this attached patch is by no means meant to be applied as-is. But
>it's meant to show what (a) the new organization allows and (b) what I
>was going for.
>
> Linus
I say let's remove it unless ptmxmode= is specified. That way we don't break people who actually did the symlink thing.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
On Tue, Apr 19, 2016 at 04:39:07PM -0700, H. Peter Anvin wrote:
> On 04/19/2016 04:23 PM, Linus Torvalds wrote:
> > On Tue, Apr 19, 2016 at 11:44 AM, Eric W. Biederman
> > <[email protected]> wrote:
> >>
> >> I will take a look in a minute. Before I do that I want to mention
> >> why I care about /dev/pts/ptmx.
> >>
> >> There is a posix function that is widely used called ptsname. It's
> >> function is to take a master file descriptor and returns the path to the
> >> slave.
> >>
> >> All we have in the kernel to support ptsname is an ioctl TIOCGPTN that
> >> returns the pty number in the appropriate instance of devpts.
> >
> > Don't bother with that completely mis-designed interface.It's crap.
> >
> > So we'll keep it working for legacy models, but the whole "return an
> > integer index" is just pure shit. It's not worth worrying about.
> >
> > We can (and probably should) just introduce a new ioctl or even a
> > system call that just does the sane thing and returns the pathname
> > from the kernel.
> >
>
> What do you think of the idea of TIOCPTSOPEN (or whatever) to get a file
> descriptor for the slave device given the master device? Then we can
That's what i was thinking, since as I said many users just want
blah = ptsname(fd);
slavefd = open(blah, ...);
so they really just want an open fd.
> use realpath() or a readlink on /proc/self/fd/# to get the pathname if
> needed.
>
> (Incidentally, we added getcwd() as a system call. Should we add
> [f]realpath() as a system call too?)
>
> -hpa
The /dev/ptmx device node is changed to lookup the directory entry
"pts" in the same directory as the /dev/ptmx device node was opened
in. If there is a "pts" entry and that entry is a devpts filesystem
/dev/ptmx uses that filesystem. Otherwise the open of /dev/ptmx
fails.
The DEVPTS_MULTIPLE_INSTANCES configuration option is removed,
so that userspace can now safely depend on each mount of devpts
creating a new instance of the filesystem.
Each mount of devpts is now a separate and equal filesystem.
The kernel.pty.reserve sysctl is neutered with no way currently
implemented to be able to use the reserved ptys.
A new vfs helper path_pts is introduced that finds a directory entry
named "pts" in the directory of the passed in path, and changes the
passed in path to point to it. The helper path_pts uses a function
path_parent_directory that was factored out of follow_dotdot.
In the implementation of devpts:
- devpts_mnt is killed as it is no longer meaningful if all
mounts of devpts are equal.
- pts_sb_from_inode is replaced by just inode->i_sb as all
cached inodes in the tty layer are now from the devpts
filesystem.
- devpts_add_ref is rolled into the new function devpts_ptmx.
And the unnecessary inode hold is removed.
- devpts_del_ref is renamed devpts_release and reduced
to just a deacrivate_super.
- The newinstance mount option continues to be accepted but is now ignored.
In devpts_fs.h definitions for when !CONFIG_UNIX98_PTYS are removed
as they are never used.
Documentation/filesystems/devices.txt is updated to describe
the current situation.
This has been verified to work properly on openwrt-15.05, centos5,
centos6, centos7, debian-6.0.2, debian-7.9, debian-8.2,
ubuntu-14.04.3, ubuntu-15.10, fedora23, magia-5, mint-17.3,
opensuse-42.1, slackware-14.1, gentoo-20151225 (13.0?),
archlinux-2015-12-01. With the caveat that on centos6 and on
slackware-14.1 that there wind up being two instances of the devpts
filesystem mounted on /dev/pts, the lower copy does not end up getting
used.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
Documentation/filesystems/devpts.txt | 145 +++----------------------
drivers/tty/Kconfig | 11 --
drivers/tty/pty.c | 41 ++++---
fs/devpts/inode.c | 205 +++++++++--------------------------
fs/namei.c | 58 ++++++++--
include/linux/devpts_fs.h | 31 ++----
include/linux/namei.h | 2 +
7 files changed, 148 insertions(+), 345 deletions(-)
diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt
index 30d2fcb32f72..6a40d7e59247 100644
--- a/Documentation/filesystems/devpts.txt
+++ b/Documentation/filesystems/devpts.txt
@@ -1,141 +1,26 @@
+Each mount of the devpts filesystem is now distinct such that ptys
+and their indicies allocated in one mount are independent from ptys
+and their indicies in all other mounts.
-To support containers, we now allow multiple instances of devpts filesystem,
-such that indices of ptys allocated in one instance are independent of indices
-allocated in other instances of devpts.
+All mounts of the devpts filesystem now create a /dev/pts/ptmx node
+with permissions 0000.
-To preserve backward compatibility, this support for multiple instances is
-enabled only if:
+To retain backwards compatibility the a ptmx device node (aka any node
+created with "mknod name c 5 2") when opened will look for an instance
+of devpts under the name "pts" in the same directory as the ptmx device
+node.
- - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and
- - '-o newinstance' mount option is specified while mounting devpts
-
-IOW, devpts now supports both single-instance and multi-instance semantics.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and
-this referred to as the "legacy" mode. In this mode, the new mount options
-(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message
-on console.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the
-'newinstance' option (as in current start-up scripts) the new mount binds
-to the initial kernel mount of devpts. This mode is referred to as the
-'single-instance' mode and the current, single-instance semantics are
-preserved, i.e PTYs are common across the system.
-
-The only difference between this single-instance mode and the legacy mode
-is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which
-can safely be ignored.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified,
-the mount is considered to be in the multi-instance mode and a new instance
-of the devpts fs is created. Any ptys created in this instance are independent
-of ptys in other instances of devpts. Like in the single-instance mode, the
-/dev/pts/ptmx node is present. To effectively use the multi-instance mode,
-open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or
-bind-mount.
-
-Eg: A container startup script could do the following:
-
- $ chmod 0666 /dev/pts/ptmx
- $ rm /dev/ptmx
- $ ln -s pts/ptmx /dev/ptmx
- $ ns_exec -cm /bin/bash
-
- # We are now in new container
-
- $ umount /dev/pts
- $ mount -t devpts -o newinstance lxcpts /dev/pts
- $ sshd -p 1234
-
-where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs
-/bin/bash in the child process. A pty created by the sshd is not visible in
-the original mount of /dev/pts.
+An an option instead of placing a /dev/ptmx device node at /dev/ptmx
+it is possible to place a symlink to /dev/pts/ptmx at /dev/ptmx or
+to bind mount /dev/ptx/ptmx to /dev/ptmx. If you opt for using
+the devpts filesystem in this manner devpts should be mounted with
+the ptmxmode=0666, or chmod 0666 /dev/pts/ptmx should be called.
Total count of pty pairs in all instances is limited by sysctls:
kernel.pty.max = 4096 - global limit
-kernel.pty.reserve = 1024 - reserve for initial instance
+kernel.pty.reserve = 1024 - reserved and not currently available for use
kernel.pty.nr - current count of ptys
Per-instance limit could be set by adding mount option "max=<count>".
This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve.
In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit.
-
-User-space changes
-------------------
-
-In multi-instance mode (i.e '-o newinstance' mount option is specified at least
-once), following user-space issues should be noted.
-
-1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored
- and no change is needed to system-startup scripts.
-
-2. To effectively use multi-instance mode (i.e -o newinstance is specified)
- administrators or startup scripts should "redirect" open of /dev/ptmx to
- /dev/pts/ptmx using either a bind mount or symlink.
-
- $ mount -t devpts -o newinstance devpts /dev/pts
-
- followed by either
-
- $ rm /dev/ptmx
- $ ln -s pts/ptmx /dev/ptmx
- $ chmod 666 /dev/pts/ptmx
- or
- $ mount -o bind /dev/pts/ptmx /dev/ptmx
-
-3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it
- enables better error-reporting and treats both single-instance and
- multi-instance mounts similarly.
-
- But this method requires that system-startup scripts set the mode of
- /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the
- mode by, either
-
- - adding ptmxmode mount option to devpts entry in /etc/fstab, or
- - using 'chmod 0666 /dev/pts/ptmx'
-
-4. If multi-instance mode mount is needed for containers, but the system
- startup scripts have not yet been updated, container-startup scripts
- should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single-
- instance mounts.
-
- Or, in general, container-startup scripts should use:
-
- mount -t devpts -o newinstance -o ptmxmode=0666 devpts /dev/pts
- if [ ! -L /dev/ptmx ]; then
- mount -o bind /dev/pts/ptmx /dev/ptmx
- fi
-
- When all devpts mounts are multi-instance, /dev/ptmx can permanently be
- a symlink to pts/ptmx and the bind mount can be ignored.
-
-5. A multi-instance mount that is not accompanied by the /dev/ptmx to
- /dev/pts/ptmx redirection would result in an unusable/unreachable pty.
-
- mount -t devpts -o newinstance lxcpts /dev/pts
-
- immediately followed by:
-
- open("/dev/ptmx")
-
- would create a pty, say /dev/pts/7, in the initial kernel mount.
- But /dev/pts/7 would be invisible in the new mount.
-
-6. The permissions for /dev/pts/ptmx node should be specified when mounting
- /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000).
-
- mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts
-
- The permissions can be later be changed as usual with 'chmod'.
-
- chmod 666 /dev/pts/ptmx
-
-7. A mount of devpts without the 'newinstance' option results in binding to
- initial kernel mount. This behavior while preserving legacy semantics,
- does not provide strict isolation in a container environment. i.e by
- mounting devpts without the 'newinstance' option, a container could
- get visibility into the 'host' or root container's devpts.
-
- To workaround this and have strict isolation, all mounts of devpts,
- including the mount in the root container, should use the newinstance
- option.
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index 82c4d2e45319..95103054c0e4 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -120,17 +120,6 @@ config UNIX98_PTYS
All modern Linux systems use the Unix98 ptys. Say Y unless
you're on an embedded system and want to conserve memory.
-config DEVPTS_MULTIPLE_INSTANCES
- bool "Support multiple instances of devpts"
- depends on UNIX98_PTYS
- default n
- ---help---
- Enable support for multiple instances of devpts filesystem.
- If you want to have isolated PTY namespaces (eg: in containers),
- say Y here. Otherwise, say N. If enabled, each mount of devpts
- filesystem with the '-o newinstance' option will create an
- independent PTY namespace.
-
config LEGACY_PTYS
bool "Legacy (BSD) PTY support"
default y
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index e16a49b507ef..6e57eb31e327 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -663,14 +663,14 @@ static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty)
/* this is called once with whichever end is closed last */
static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty)
{
- struct inode *ptmx_inode;
+ struct pts_fs_info *fsi;
if (tty->driver->subtype == PTY_TYPE_MASTER)
- ptmx_inode = tty->driver_data;
+ fsi = tty->driver_data;
else
- ptmx_inode = tty->link->driver_data;
- devpts_kill_index(ptmx_inode, tty->index);
- devpts_del_ref(ptmx_inode);
+ fsi = tty->link->driver_data;
+ devpts_kill_index(fsi, tty->index);
+ devpts_release(fsi);
}
static const struct tty_operations ptm_unix98_ops = {
@@ -721,6 +721,7 @@ static const struct tty_operations pty_unix98_ops = {
static int ptmx_open(struct inode *inode, struct file *filp)
{
struct tty_struct *tty;
+ struct pts_fs_info *fsi;
struct inode *slave_inode;
int retval;
int index;
@@ -734,13 +735,19 @@ static int ptmx_open(struct inode *inode, struct file *filp)
if (retval)
return retval;
+ fsi = devpts_acquire(filp);
+ if (IS_ERR(fsi)) {
+ retval = PTR_ERR(fsi);
+ goto err_file;
+ }
+
/* find a device that is not in use. */
mutex_lock(&devpts_mutex);
- index = devpts_new_index(inode);
+ index = devpts_new_index(fsi);
if (index < 0) {
retval = index;
mutex_unlock(&devpts_mutex);
- goto err_file;
+ goto err_fsi;
}
mutex_unlock(&devpts_mutex);
@@ -758,23 +765,11 @@ static int ptmx_open(struct inode *inode, struct file *filp)
mutex_unlock(&tty_mutex);
set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
- tty->driver_data = inode;
-
- /*
- * In the case where all references to ptmx inode are dropped and we
- * still have /dev/tty opened pointing to the master/slave pair (ptmx
- * is closed/released before /dev/tty), we must make sure that the inode
- * is still valid when we call the final pty_unix98_shutdown, thus we
- * hold an additional reference to the ptmx inode. For the same /dev/tty
- * last close case, we also need to make sure the super_block isn't
- * destroyed (devpts instance unmounted), before /dev/tty is closed and
- * on its release devpts_kill_index is called.
- */
- devpts_add_ref(inode);
+ tty->driver_data = fsi;
tty_add_file(tty, filp);
- slave_inode = devpts_pty_new(inode,
+ slave_inode = devpts_pty_new(fsi,
MKDEV(UNIX98_PTY_SLAVE_MAJOR, index), index,
tty->link);
if (IS_ERR(slave_inode)) {
@@ -797,7 +792,9 @@ err_release:
return retval;
out:
mutex_unlock(&tty_mutex);
- devpts_kill_index(inode, index);
+ devpts_kill_index(fsi, index);
+err_fsi:
+ devpts_release(fsi);
err_file:
tty_free_file(filp);
return retval;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 655f21f99160..ec9653db5452 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = {
static DEFINE_MUTEX(allocated_ptys_lock);
-static struct vfsmount *devpts_mnt;
-
struct pts_mount_opts {
int setuid;
int setgid;
@@ -104,7 +102,6 @@ struct pts_mount_opts {
kgid_t gid;
umode_t mode;
umode_t ptmxmode;
- int newinstance;
int max;
};
@@ -117,17 +114,16 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_mode, "mode=%o"},
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
{Opt_ptmxmode, "ptmxmode=%o"},
{Opt_newinstance, "newinstance"},
{Opt_max, "max=%d"},
-#endif
{Opt_err, NULL}
};
struct pts_fs_info {
struct ida allocated_ptys;
struct pts_mount_opts mount_opts;
+ struct super_block *sb;
struct dentry *ptmx_dentry;
};
@@ -136,30 +132,53 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
return sb->s_fs_info;
}
-static inline struct super_block *pts_sb_from_inode(struct inode *inode)
+struct pts_fs_info *devpts_acquire(struct file *filp)
{
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
- if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
- return inode->i_sb;
-#endif
- if (!devpts_mnt)
- return NULL;
- return devpts_mnt->mnt_sb;
+ struct pts_fs_info *result;
+ struct path path;
+ struct super_block *sb;
+ int err;
+
+ /* Is a devpts filesystem at "pts" in the same directory? */
+ path = filp->f_path;
+ path_get(&path);
+ err = path_pts(&path);
+ if (err) {
+ result = ERR_PTR(err);
+ goto out;
+ }
+
+ /* Is the path the root of a devpts filesystem? */
+ result = ERR_PTR(-ENOENT);
+ sb = path.mnt->mnt_sb;
+ if ((sb->s_magic != DEVPTS_SUPER_MAGIC) ||
+ (path.mnt->mnt_root != sb->s_root))
+ goto out;
+
+ /*
+ * pty code needs to hold extra references in case of last /dev/tty close
+ */
+ atomic_inc(&sb->s_active);
+ result = DEVPTS_SB(sb);
+
+out:
+ path_put(&path);
+ return result;
}
-#define PARSE_MOUNT 0
-#define PARSE_REMOUNT 1
+void devpts_release(struct pts_fs_info *fsi)
+{
+ deactivate_super(fsi->sb);
+}
/*
* parse_mount_options():
* Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value. The exception is
- * 'newinstance' option which can only be set/cleared on a mount (i.e.
- * cannot be changed during remount).
+ * specified in @data, set it to its default value.
*
* Note: @data may be NULL (in which case all options are set to default).
*/
-static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
+static int parse_mount_options(char *data, struct pts_mount_opts *opts)
{
char *p;
kuid_t uid;
@@ -173,10 +192,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
opts->max = NR_UNIX98_PTY_MAX;
- /* newinstance makes sense only on initial mount */
- if (op == PARSE_MOUNT)
- opts->newinstance = 0;
-
while ((p = strsep(&data, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
int token;
@@ -210,16 +225,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return -EINVAL;
opts->mode = option & S_IALLUGO;
break;
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
case Opt_ptmxmode:
if (match_octal(&args[0], &option))
return -EINVAL;
opts->ptmxmode = option & S_IALLUGO;
break;
case Opt_newinstance:
- /* newinstance makes sense only on initial mount */
- if (op == PARSE_MOUNT)
- opts->newinstance = 1;
break;
case Opt_max:
if (match_int(&args[0], &option) ||
@@ -227,7 +238,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return -EINVAL;
opts->max = option;
break;
-#endif
default:
pr_err("called with bogus options\n");
return -EINVAL;
@@ -237,7 +247,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return 0;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
static int mknod_ptmx(struct super_block *sb)
{
int mode;
@@ -304,12 +313,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
}
}
-#else
-static inline void update_ptmx_mode(struct pts_fs_info *fsi)
-{
- return;
-}
-#endif
static int devpts_remount(struct super_block *sb, int *flags, char *data)
{
@@ -318,7 +321,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
struct pts_mount_opts *opts = &fsi->mount_opts;
sync_filesystem(sb);
- err = parse_mount_options(data, PARSE_REMOUNT, opts);
+ err = parse_mount_options(data, opts);
/*
* parse_mount_options() restores options to default values
@@ -343,11 +346,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, opts->gid));
seq_printf(seq, ",mode=%03o", opts->mode);
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
if (opts->max < NR_UNIX98_PTY_MAX)
seq_printf(seq, ",max=%d", opts->max);
-#endif
return 0;
}
@@ -358,7 +359,7 @@ static const struct super_operations devpts_sops = {
.show_options = devpts_show_options,
};
-static void *new_pts_fs_info(void)
+static void *new_pts_fs_info(struct super_block *sb)
{
struct pts_fs_info *fsi;
@@ -369,6 +370,7 @@ static void *new_pts_fs_info(void)
ida_init(&fsi->allocated_ptys);
fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+ fsi->sb = sb;
return fsi;
}
@@ -384,7 +386,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &devpts_sops;
s->s_time_gran = 1;
- s->s_fs_info = new_pts_fs_info();
+ s->s_fs_info = new_pts_fs_info(s);
if (!s->s_fs_info)
goto fail;
@@ -408,40 +410,11 @@ fail:
return -ENOMEM;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-static int compare_init_pts_sb(struct super_block *s, void *p)
-{
- if (devpts_mnt)
- return devpts_mnt->mnt_sb == s;
- return 0;
-}
-
/*
* devpts_mount()
*
- * If the '-o newinstance' mount option was specified, mount a new
- * (private) instance of devpts. PTYs created in this instance are
- * independent of the PTYs in other devpts instances.
- *
- * If the '-o newinstance' option was not specified, mount/remount the
- * initial kernel mount of devpts. This type of mount gives the
- * legacy, single-instance semantics.
- *
- * The 'newinstance' option is needed to support multiple namespace
- * semantics in devpts while preserving backward compatibility of the
- * current 'single-namespace' semantics. i.e all mounts of devpts
- * without the 'newinstance' mount option should bind to the initial
- * kernel mount, like mount_single().
- *
- * Mounts with 'newinstance' option create a new, private namespace.
- *
- * NOTE:
- *
- * For single-mount semantics, devpts cannot use mount_single(),
- * because mount_single()/sget() find and use the super-block from
- * the most recent mount of devpts. But that recent mount may be a
- * 'newinstance' mount and mount_single() would pick the newinstance
- * super-block instead of the initial super-block.
+ * Mount a new (private) instance of devpts. PTYs created in this
+ * instance are independent of the PTYs in other devpts instances.
*/
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
@@ -450,22 +423,11 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
struct pts_mount_opts opts;
struct super_block *s;
- error = parse_mount_options(data, PARSE_MOUNT, &opts);
+ error = parse_mount_options(data, &opts);
if (error)
return ERR_PTR(error);
- /* Require newinstance for all user namespace mounts to ensure
- * the mount options are not changed.
- */
- if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
- return ERR_PTR(-EINVAL);
-
- if (opts.newinstance)
- s = sget(fs_type, NULL, set_anon_super, flags, NULL);
- else
- s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
- NULL);
-
+ s = sget(fs_type, NULL, set_anon_super, flags, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
@@ -489,18 +451,6 @@ out_undo_sget:
return ERR_PTR(error);
}
-#else
-/*
- * This supports only the legacy single-instance semantics (no
- * multiple-instance semantics)
- */
-static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
-{
- return mount_single(fs_type, flags, data, devpts_fill_super);
-}
-#endif
-
static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
@@ -514,9 +464,7 @@ static struct file_system_type devpts_fs_type = {
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
.fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
-#endif
};
/*
@@ -524,24 +472,17 @@ static struct file_system_type devpts_fs_type = {
* to the System V naming convention
*/
-int devpts_new_index(struct inode *ptmx_inode)
+int devpts_new_index(struct pts_fs_info *fsi)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi;
int index;
int ida_ret;
- if (!sb)
- return -ENODEV;
-
- fsi = DEVPTS_SB(sb);
retry:
if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
return -ENOMEM;
mutex_lock(&allocated_ptys_lock);
- if (pty_count >= pty_limit -
- (fsi->mount_opts.newinstance ? pty_reserve : 0)) {
+ if (pty_count >= (pty_limit - pty_reserve)) {
mutex_unlock(&allocated_ptys_lock);
return -ENOSPC;
}
@@ -564,37 +505,14 @@ retry:
return index;
}
-void devpts_kill_index(struct inode *ptmx_inode, int idx)
+void devpts_kill_index(struct pts_fs_info *fsi, int idx)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
-
mutex_lock(&allocated_ptys_lock);
ida_remove(&fsi->allocated_ptys, idx);
pty_count--;
mutex_unlock(&allocated_ptys_lock);
}
-/*
- * pty code needs to hold extra references in case of last /dev/tty close
- */
-
-void devpts_add_ref(struct inode *ptmx_inode)
-{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-
- atomic_inc(&sb->s_active);
- ihold(ptmx_inode);
-}
-
-void devpts_del_ref(struct inode *ptmx_inode)
-{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-
- iput(ptmx_inode);
- deactivate_super(sb);
-}
-
/**
* devpts_pty_new -- create a new inode in /dev/pts/
* @ptmx_inode: inode of the master
@@ -604,22 +522,17 @@ void devpts_del_ref(struct inode *ptmx_inode)
*
* The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
*/
-struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
+struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index,
void *priv)
{
struct dentry *dentry;
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+ struct super_block *sb = fsi->sb;
struct inode *inode;
struct dentry *root;
- struct pts_fs_info *fsi;
struct pts_mount_opts *opts;
char s[12];
- if (!sb)
- return ERR_PTR(-ENODEV);
-
root = sb->s_root;
- fsi = DEVPTS_SB(sb);
opts = &fsi->mount_opts;
inode = new_inode(sb);
@@ -685,7 +598,7 @@ void *devpts_get_priv(struct inode *pts_inode)
*/
void devpts_pty_kill(struct inode *inode)
{
- struct super_block *sb = pts_sb_from_inode(inode);
+ struct super_block *sb = inode->i_sb;
struct dentry *root = sb->s_root;
struct dentry *dentry;
@@ -706,20 +619,8 @@ void devpts_pty_kill(struct inode *inode)
static int __init init_devpts_fs(void)
{
int err = register_filesystem(&devpts_fs_type);
- struct ctl_table_header *table;
-
if (!err) {
- struct vfsmount *mnt;
-
- table = register_sysctl_table(pty_root_table);
- mnt = kern_mount(&devpts_fs_type);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
- unregister_filesystem(&devpts_fs_type);
- unregister_sysctl_table(table);
- } else {
- devpts_mnt = mnt;
- }
+ register_sysctl_table(pty_root_table);
}
return err;
}
diff --git a/fs/namei.c b/fs/namei.c
index 794f81dce766..d745ea969c20 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1415,21 +1415,28 @@ static void follow_mount(struct path *path)
}
}
+static int path_parent_directory(struct path *path)
+{
+ struct dentry *old = path->dentry;
+ /* rare case of legitimate dget_parent()... */
+ path->dentry = dget_parent(path->dentry);
+ dput(old);
+ if (unlikely(!path_connected(path)))
+ return -ENOENT;
+ return 0;
+}
+
static int follow_dotdot(struct nameidata *nd)
{
while(1) {
- struct dentry *old = nd->path.dentry;
-
if (nd->path.dentry == nd->root.dentry &&
nd->path.mnt == nd->root.mnt) {
break;
}
if (nd->path.dentry != nd->path.mnt->mnt_root) {
- /* rare case of legitimate dget_parent()... */
- nd->path.dentry = dget_parent(nd->path.dentry);
- dput(old);
- if (unlikely(!path_connected(&nd->path)))
- return -ENOENT;
+ int ret = path_parent_directory(&nd->path);
+ if (ret)
+ return ret;
break;
}
if (!follow_up(&nd->path))
@@ -2374,6 +2381,43 @@ struct dentry *lookup_one_len_unlocked(const char *name,
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
+#ifdef CONFIG_UNIX98_PTYS
+int path_pts(struct path *path)
+{
+ /* Find "pts" in the same directory as the input path */
+ struct dentry *child, *parent;
+ struct qstr this;
+ int ret;
+
+ ret = path_parent_directory(path);
+ if (ret)
+ return ret;
+
+ parent = path->dentry;
+ if (!d_can_lookup(parent))
+ return -ENOENT;
+
+ this.name = "pts";
+ this.len = 3;
+ this.hash = full_name_hash(this.name, this.len);
+ if (parent->d_flags & DCACHE_OP_HASH) {
+ int err = parent->d_op->d_hash(parent, &this);
+ if (err < 0)
+ return err;
+ }
+ inode_lock(parent->d_inode);
+ child = d_lookup(parent, &this);
+ inode_unlock(parent->d_inode);
+ if (!child)
+ return -ENOENT;
+
+ path->dentry = child;
+ dput(parent);
+ follow_mount(path);
+ return 0;
+}
+#endif
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index e0ee0b3000b2..486f6282b0c6 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -17,36 +17,21 @@
#ifdef CONFIG_UNIX98_PTYS
-int devpts_new_index(struct inode *ptmx_inode);
-void devpts_kill_index(struct inode *ptmx_inode, int idx);
-void devpts_add_ref(struct inode *ptmx_inode);
-void devpts_del_ref(struct inode *ptmx_inode);
+struct pts_fs_info;
+
+struct pts_fs_info *devpts_acquire(struct file *filp);
+void devpts_release(struct pts_fs_info *fsi);
+
+int devpts_new_index(struct pts_fs_info *fsi);
+void devpts_kill_index(struct pts_fs_info *fsi, int idx);
/* mknod in devpts */
-struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
+struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index,
void *priv);
/* get private structure */
void *devpts_get_priv(struct inode *pts_inode);
/* unlink */
void devpts_pty_kill(struct inode *inode);
-#else
-
-/* Dummy stubs in the no-pty case */
-static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; }
-static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { }
-static inline void devpts_add_ref(struct inode *ptmx_inode) { }
-static inline void devpts_del_ref(struct inode *ptmx_inode) { }
-static inline struct inode *devpts_pty_new(struct inode *ptmx_inode,
- dev_t device, int index, void *priv)
-{
- return ERR_PTR(-EINVAL);
-}
-static inline void *devpts_get_priv(struct inode *pts_inode)
-{
- return NULL;
-}
-static inline void devpts_pty_kill(struct inode *inode) { }
-
#endif
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 77d01700daf7..f29abda31e6d 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -45,6 +45,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
#define LOOKUP_ROOT 0x2000
#define LOOKUP_EMPTY 0x4000
+extern int path_pts(struct path *path);
+
extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
--
2.8.1
On Tue, Apr 19, 2016 at 10:04:20PM -0500, Eric W. Biederman wrote:
> +#ifdef CONFIG_UNIX98_PTYS
> +int path_pts(struct path *path)
> +{
> + /* Find "pts" in the same directory as the input path */
> + struct dentry *child, *parent;
> + struct qstr this;
> + int ret;
> +
> + ret = path_parent_directory(path);
> + if (ret)
> + return ret;
> +
> + parent = path->dentry;
> + if (!d_can_lookup(parent))
> + return -ENOENT;
And how, pray tell, would a parent of anything fail to be a directory?
> + this.name = "pts";
> + this.len = 3;
> + this.hash = full_name_hash(this.name, this.len);
> + if (parent->d_flags & DCACHE_OP_HASH) {
> + int err = parent->d_op->d_hash(parent, &this);
> + if (err < 0)
> + return err;
> + }
> + inode_lock(parent->d_inode);
What the hell for? What does that lock on parent change for the
dcache lookup you are doing here?
> + child = d_lookup(parent, &this);
> + inode_unlock(parent->d_inode);
> + if (!child)
> + return -ENOENT;
Take a look at d_hash_and_lookup(), BTW...
> + path->dentry = child;
> + dput(parent);
> + follow_mount(path);
> + return 0;
> +}
> +#endif
> +
> int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
> struct path *path, int *empty)
> {
> diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
> index e0ee0b3000b2..486f6282b0c6 100644
> --- a/include/linux/devpts_fs.h
> +++ b/include/linux/devpts_fs.h
> @@ -17,36 +17,21 @@
>
> #ifdef CONFIG_UNIX98_PTYS
>
> -int devpts_new_index(struct inode *ptmx_inode);
> -void devpts_kill_index(struct inode *ptmx_inode, int idx);
> -void devpts_add_ref(struct inode *ptmx_inode);
> -void devpts_del_ref(struct inode *ptmx_inode);
> +struct pts_fs_info;
> +
> +struct pts_fs_info *devpts_acquire(struct file *filp);
> +void devpts_release(struct pts_fs_info *fsi);
> +
> +int devpts_new_index(struct pts_fs_info *fsi);
> +void devpts_kill_index(struct pts_fs_info *fsi, int idx);
> /* mknod in devpts */
> -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
> +struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index,
> void *priv);
> /* get private structure */
> void *devpts_get_priv(struct inode *pts_inode);
> /* unlink */
> void devpts_pty_kill(struct inode *inode);
>
> -#else
> -
> -/* Dummy stubs in the no-pty case */
> -static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; }
> -static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { }
> -static inline void devpts_add_ref(struct inode *ptmx_inode) { }
> -static inline void devpts_del_ref(struct inode *ptmx_inode) { }
> -static inline struct inode *devpts_pty_new(struct inode *ptmx_inode,
> - dev_t device, int index, void *priv)
> -{
> - return ERR_PTR(-EINVAL);
> -}
> -static inline void *devpts_get_priv(struct inode *pts_inode)
> -{
> - return NULL;
> -}
> -static inline void devpts_pty_kill(struct inode *inode) { }
> -
> #endif
>
>
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index 77d01700daf7..f29abda31e6d 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -45,6 +45,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
> #define LOOKUP_ROOT 0x2000
> #define LOOKUP_EMPTY 0x4000
>
> +extern int path_pts(struct path *path);
> +
> extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
>
> static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
> --
> 2.8.1
>
"H. Peter Anvin" <[email protected]> writes:
> On April 19, 2016 12:25:03 PM PDT, "H. Peter Anvin" <[email protected]> wrote:
>>
>>Perhaps a (privileged) option to exempt from the global limit, then.
>>Something we can implement if asked for.
>>
>>However, I wouldn't be 100% that the reserved pool isn't used. Someone
>>added it presumably for a reason. An administrator could say it and
>>we'd have no idea.
>
> ... and if I personally was running a container-hosting system, I
> would *absolutely* set it to make sure the administrator could not get
> locked out.
That is likely easier done by setting:
echo RIDICULOUSLY_LARGE_NUMBER > /proc/sys/kernel/pty/max
All I am certain about at this point is that no one cares on a day to
day basis or in any kind of ordinary scenario so this is something that
we can get away with changing.
But yes I would not be surprised if we have to come back and implement
something like your suggested extra mount option for devpts, so some
specified instances can dip into the reserved pool.
Eric
Al Viro <[email protected]> writes:
> On Tue, Apr 19, 2016 at 10:04:20PM -0500, Eric W. Biederman wrote:
>> +#ifdef CONFIG_UNIX98_PTYS
>> +int path_pts(struct path *path)
>> +{
>> + /* Find "pts" in the same directory as the input path */
>> + struct dentry *child, *parent;
>> + struct qstr this;
>> + int ret;
>> +
>> + ret = path_parent_directory(path);
>> + if (ret)
>> + return ret;
>> +
>> + parent = path->dentry;
>> + if (!d_can_lookup(parent))
>> + return -ENOENT;
>
> And how, pray tell, would a parent of anything fail to be a directory?
It is to make that function be visually distinct from path_parentat
which does something rather different.
>> + this.name = "pts";
>> + this.len = 3;
>> + this.hash = full_name_hash(this.name, this.len);
>> + if (parent->d_flags & DCACHE_OP_HASH) {
>> + int err = parent->d_op->d_hash(parent, &this);
>> + if (err < 0)
>> + return err;
>> + }
>> + inode_lock(parent->d_inode);
>
> What the hell for? What does that lock on parent change for the
> dcache lookup you are doing here?
Good point. That is overkill. As we know the dentry is a mount point and
must be in the dcache, the customary lock for performing a lookup from
the disk is not necessary.
>> + child = d_lookup(parent, &this);
>> + inode_unlock(parent->d_inode);
>> + if (!child)
>> + return -ENOENT;
>
> Take a look at d_hash_and_lookup(), BTW...
Yes. That does look like a reasonable simplification.
Eric
On Tue, Apr 19, 2016 at 10:43:03PM -0500, Eric W. Biederman wrote:
> >> + if (!d_can_lookup(parent))
> >> + return -ENOENT;
> >
> > And how, pray tell, would a parent of anything fail to be a directory?
>
> It is to make that function be visually distinct from path_parentat
> which does something rather different.
Huh? I'm asking how can that condition ever turn out to be true. Unless
you really advocate something like
if (2 * 17 != 34)
return -234567; // to make it visually distinct from foobar(),
// which doesn't have such a test
your reply doesn't seem to make any sense...
> >> + this.name = "pts";
> >> + this.len = 3;
> >> + this.hash = full_name_hash(this.name, this.len);
> >> + if (parent->d_flags & DCACHE_OP_HASH) {
> >> + int err = parent->d_op->d_hash(parent, &this);
> >> + if (err < 0)
> >> + return err;
> >> + }
> >> + inode_lock(parent->d_inode);
> >
> > What the hell for? What does that lock on parent change for the
> > dcache lookup you are doing here?
>
> Good point. That is overkill. As we know the dentry is a mount point and
> must be in the dcache, the customary lock for performing a lookup from
> the disk is not necessary.
Er... To avoid reader confusion:
a) d_lookup() does *not* do a filesystem lookup
b) it does not need inode_lock()
c) it (and not a "lookup from the disk") is what's actually being
called in the code in question.
Al Viro <[email protected]> writes:
> On Tue, Apr 19, 2016 at 10:43:03PM -0500, Eric W. Biederman wrote:
>> >> + if (!d_can_lookup(parent))
>> >> + return -ENOENT;
>> >
>> > And how, pray tell, would a parent of anything fail to be a directory?
>>
>> It is to make that function be visually distinct from path_parentat
>> which does something rather different.
>
> Huh? I'm asking how can that condition ever turn out to be true. Unless
> you really advocate something like
> if (2 * 17 != 34)
> return -234567; // to make it visually distinct from foobar(),
> // which doesn't have such a test
> your reply doesn't seem to make any sense...
Oh apologies I thought you were asking about the naming of the function,
path_parent_directory. Yes. The d_can_lookup does appear to be redundant.
It definitely looks like bedtime for me.
>> >> + this.name = "pts";
>> >> + this.len = 3;
>> >> + this.hash = full_name_hash(this.name, this.len);
>> >> + if (parent->d_flags & DCACHE_OP_HASH) {
>> >> + int err = parent->d_op->d_hash(parent, &this);
>> >> + if (err < 0)
>> >> + return err;
>> >> + }
>> >> + inode_lock(parent->d_inode);
>> >
>> > What the hell for? What does that lock on parent change for the
>> > dcache lookup you are doing here?
>>
>> Good point. That is overkill. As we know the dentry is a mount point and
>> must be in the dcache, the customary lock for performing a lookup from
>> the disk is not necessary.
>
> Er... To avoid reader confusion:
> a) d_lookup() does *not* do a filesystem lookup
> b) it does not need inode_lock()
> c) it (and not a "lookup from the disk") is what's actually being
> called in the code in question.
And since I was stripping down the ordinary filesystem lookup path to
just the pieces needed I apparently wound up with a few extras.
Do you think it would be possible to guarantee an rcu lookup for the
operations in path_pts? I think needing to perform a follow_mount makes
that impossible to guarantee.
All the caller wants is to find the superblock of the mounted filesystem
and increment sb->s_active.
Eric
On Wed, Apr 20, 2016 at 6:04 AM, Eric W. Biederman
<[email protected]> wrote:
>
> The /dev/ptmx device node is changed to lookup the directory entry
> "pts" in the same directory as the /dev/ptmx device node was opened
> in. If there is a "pts" entry and that entry is a devpts filesystem
> /dev/ptmx uses that filesystem. Otherwise the open of /dev/ptmx
> fails.
>
> The DEVPTS_MULTIPLE_INSTANCES configuration option is removed,
> so that userspace can now safely depend on each mount of devpts
> creating a new instance of the filesystem.
>
> Each mount of devpts is now a separate and equal filesystem.
>
> The kernel.pty.reserve sysctl is neutered with no way currently
> implemented to be able to use the reserved ptys.
I think we could convert this into reserve for init user namespace,
ssh in host will work even if containers eaten all ptys.
>
> A new vfs helper path_pts is introduced that finds a directory entry
> named "pts" in the directory of the passed in path, and changes the
> passed in path to point to it. The helper path_pts uses a function
> path_parent_directory that was factored out of follow_dotdot.
>
> In the implementation of devpts:
> - devpts_mnt is killed as it is no longer meaningful if all
> mounts of devpts are equal.
> - pts_sb_from_inode is replaced by just inode->i_sb as all
> cached inodes in the tty layer are now from the devpts
> filesystem.
> - devpts_add_ref is rolled into the new function devpts_ptmx.
> And the unnecessary inode hold is removed.
> - devpts_del_ref is renamed devpts_release and reduced
> to just a deacrivate_super.
> - The newinstance mount option continues to be accepted but is now ignored.
>
> In devpts_fs.h definitions for when !CONFIG_UNIX98_PTYS are removed
> as they are never used.
>
> Documentation/filesystems/devices.txt is updated to describe
> the current situation.
>
> This has been verified to work properly on openwrt-15.05, centos5,
> centos6, centos7, debian-6.0.2, debian-7.9, debian-8.2,
> ubuntu-14.04.3, ubuntu-15.10, fedora23, magia-5, mint-17.3,
> opensuse-42.1, slackware-14.1, gentoo-20151225 (13.0?),
> archlinux-2015-12-01. With the caveat that on centos6 and on
> slackware-14.1 that there wind up being two instances of the devpts
> filesystem mounted on /dev/pts, the lower copy does not end up getting
> used.
>
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
> Documentation/filesystems/devpts.txt | 145 +++----------------------
> drivers/tty/Kconfig | 11 --
> drivers/tty/pty.c | 41 ++++---
> fs/devpts/inode.c | 205 +++++++++--------------------------
> fs/namei.c | 58 ++++++++--
> include/linux/devpts_fs.h | 31 ++----
> include/linux/namei.h | 2 +
> 7 files changed, 148 insertions(+), 345 deletions(-)
On Tue, Apr 19, 2016 at 9:36 PM, Konstantin Khlebnikov <[email protected]> wrote:
> On Wed, Apr 20, 2016 at 6:04 AM, Eric W. Biederman
>>
>> The kernel.pty.reserve sysctl is neutered with no way currently
>> implemented to be able to use the reserved ptys.
>
> I think we could convert this into reserve for init user namespace,
> ssh in host will work even if containers eaten all ptys.
Yes. That's basically how it effectively worked before (ie everything
but the initial non-newinstance devpts mount would be limited to the
non-reserved numbers).
We required the non-init namespaces to do a newinstance mount, so the
whole test for "newinstance" was effectively the same thing as just
checking for the init namespace from a security standpoint.
And in fact, rewriting it in that form (ie checking for init_ns) would
just make it much more obvious what the intent it.
Linus
On 2016-04-19 23:27, Eric W. Biederman wrote:
> "H. Peter Anvin" <[email protected]> writes:
>
>> On April 19, 2016 12:25:03 PM PDT, "H. Peter Anvin" <[email protected]> wrote:
>>>
>>> Perhaps a (privileged) option to exempt from the global limit, then.
>>> Something we can implement if asked for.
>>>
>>> However, I wouldn't be 100% that the reserved pool isn't used. Someone
>>> added it presumably for a reason. An administrator could say it and
>>> we'd have no idea.
>>
>> ... and if I personally was running a container-hosting system, I
>> would *absolutely* set it to make sure the administrator could not get
>> locked out.
>
> That is likely easier done by setting:
> echo RIDICULOUSLY_LARGE_NUMBER > /proc/sys/kernel/pty/max
This may protect against administrative lockout on a sane system with
responsible users, but it doesn't protect you from lockout due to a DoS
attack, while the reserved pool does (or at least, it makes sure you can
still allocate a few PTY's even when under attack).
Linus Torvalds <[email protected]> writes:
> On Tue, Apr 19, 2016 at 9:36 PM, Konstantin Khlebnikov <[email protected]> wrote:
>> On Wed, Apr 20, 2016 at 6:04 AM, Eric W. Biederman
>>>
>>> The kernel.pty.reserve sysctl is neutered with no way currently
>>> implemented to be able to use the reserved ptys.
>>
>> I think we could convert this into reserve for init user namespace,
>> ssh in host will work even if containers eaten all ptys.
>
> Yes. That's basically how it effectively worked before (ie everything
> but the initial non-newinstance devpts mount would be limited to the
> non-reserved numbers).
>
> We required the non-init namespaces to do a newinstance mount, so the
> whole test for "newinstance" was effectively the same thing as just
> checking for the init namespace from a security standpoint.
>
> And in fact, rewriting it in that form (ie checking for init_ns) would
> just make it much more obvious what the intent it.
How does this sound.
When mounting a devpts filesystem. We look at the caller (aka current)
and if we are in the initial mount namespace set a flag in fsi that
allows that instance of devpts to draw into the reserve pool.
That will still allow crazy pieces of code like xen-create-instance run
by root that mount a devpts filesystem in a chroot environment to draw
into the reserved pool, but any sane users that set up their own mount
namespace won't be able to user the reserve pool.
I believe that will give an almost identical policy to what we have
today, and it certainly makes a good default test for a container. Just
for cleanliness containers (of anyone's definition) almost always use
mount namespaces instead of chroots.
Sigh one last past through all of the distros, to confirm that this
works in practice.
Eric
On Wed, Apr 20, 2016 at 5:55 PM, Eric W. Biederman
<[email protected]> wrote:
> Linus Torvalds <[email protected]> writes:
>
>> On Tue, Apr 19, 2016 at 9:36 PM, Konstantin Khlebnikov <[email protected]> wrote:
>>> On Wed, Apr 20, 2016 at 6:04 AM, Eric W. Biederman
>>>>
>>>> The kernel.pty.reserve sysctl is neutered with no way currently
>>>> implemented to be able to use the reserved ptys.
>>>
>>> I think we could convert this into reserve for init user namespace,
>>> ssh in host will work even if containers eaten all ptys.
>>
>> Yes. That's basically how it effectively worked before (ie everything
>> but the initial non-newinstance devpts mount would be limited to the
>> non-reserved numbers).
>>
>> We required the non-init namespaces to do a newinstance mount, so the
>> whole test for "newinstance" was effectively the same thing as just
>> checking for the init namespace from a security standpoint.
>>
>> And in fact, rewriting it in that form (ie checking for init_ns) would
>> just make it much more obvious what the intent it.
>
> How does this sound.
>
> When mounting a devpts filesystem. We look at the caller (aka current)
> and if we are in the initial mount namespace set a flag in fsi that
> allows that instance of devpts to draw into the reserve pool.
Maybe just check current user namespace when task opens /dev/ptmx?
IIRR now check looks like: count < limit - (newinstance ? reserved : 0).
So, it will be count < limit - (current_in_init_userns ? 0 : newinstance).
>
> That will still allow crazy pieces of code like xen-create-instance run
> by root that mount a devpts filesystem in a chroot environment to draw
> into the reserved pool, but any sane users that set up their own mount
> namespace won't be able to user the reserve pool.
>
> I believe that will give an almost identical policy to what we have
> today, and it certainly makes a good default test for a container. Just
> for cleanliness containers (of anyone's definition) almost always use
> mount namespaces instead of chroots.
>
> Sigh one last past through all of the distros, to confirm that this
> works in practice.
>
> Eric
Konstantin Khlebnikov <[email protected]> writes:
> On Wed, Apr 20, 2016 at 5:55 PM, Eric W. Biederman
> <[email protected]> wrote:
>> Linus Torvalds <[email protected]> writes:
>>
>>> On Tue, Apr 19, 2016 at 9:36 PM, Konstantin Khlebnikov <[email protected]> wrote:
>>>> On Wed, Apr 20, 2016 at 6:04 AM, Eric W. Biederman
>>>>>
>>>>> The kernel.pty.reserve sysctl is neutered with no way currently
>>>>> implemented to be able to use the reserved ptys.
>>>>
>>>> I think we could convert this into reserve for init user namespace,
>>>> ssh in host will work even if containers eaten all ptys.
>>>
>>> Yes. That's basically how it effectively worked before (ie everything
>>> but the initial non-newinstance devpts mount would be limited to the
>>> non-reserved numbers).
>>>
>>> We required the non-init namespaces to do a newinstance mount, so the
>>> whole test for "newinstance" was effectively the same thing as just
>>> checking for the init namespace from a security standpoint.
>>>
>>> And in fact, rewriting it in that form (ie checking for init_ns) would
>>> just make it much more obvious what the intent it.
>>
>> How does this sound.
>>
>> When mounting a devpts filesystem. We look at the caller (aka current)
>> and if we are in the initial mount namespace set a flag in fsi that
>> allows that instance of devpts to draw into the reserve pool.
>
> Maybe just check current user namespace when task opens /dev/ptmx?
>
> IIRR now check looks like: count < limit - (newinstance ? reserved : 0).
> So, it will be count < limit - (current_in_init_userns ? 0 : newinstance).
Looking at current user namespace really is not enough. Lots of
container solutions at least historically (which means deployed right
now) don't use the user namespace.
I can see an argument to make the check: "capable(CAP_SYS_RESOURCE)".
Although for pty applications I don't know if that is particularly
meaningful.
I am a little dubious of making it a check at allocation time rather
than at mount time. The issue is that tty allocation is an unprivileged
operation. I expect applications such as sshd (the one that really
matters) will have droped privileges by the time they allocation a pty.
So I feel much more comfortable with a model where things are arranged
so that applications within access of the devpts filesystem can use it
(and are not limited), and applications not in range can't. Roughly the
authenticate at open time model. Also what devpts implements today.
The is also the question of how things should work if you are running in
a system where every new daemon, and every new login is in it's own
mount namespace. Allowing each of these to have a distinct /tmp
directory. I believe systemd systems are well on their way to doing
that today. As such it does not seem appropriate to check the mount
namespace of the opener of the tty.
Who knows we may not be long until the pty master lives in some very
tight bubble where it can barely do anything (as that is the program
that talks to the network) and user namespaces are used as part of the
enforcement of that.
For all of those reasons, a permission check in devpts_pty_new seems
like the wrong place.
Eric
On April 20, 2016 4:50:53 AM PDT, "Austin S. Hemmelgarn" <[email protected]> wrote:
>On 2016-04-19 23:27, Eric W. Biederman wrote:
>> "H. Peter Anvin" <[email protected]> writes:
>>
>>> On April 19, 2016 12:25:03 PM PDT, "H. Peter Anvin" <[email protected]>
>wrote:
>>>>
>>>> Perhaps a (privileged) option to exempt from the global limit,
>then.
>>>> Something we can implement if asked for.
>>>>
>>>> However, I wouldn't be 100% that the reserved pool isn't used.
>Someone
>>>> added it presumably for a reason. An administrator could say it
>and
>>>> we'd have no idea.
>>>
>>> ... and if I personally was running a container-hosting system, I
>>> would *absolutely* set it to make sure the administrator could not
>get
>>> locked out.
>>
>> That is likely easier done by setting:
>> echo RIDICULOUSLY_LARGE_NUMBER > /proc/sys/kernel/pty/max
>This may protect against administrative lockout on a sane system with
>responsible users, but it doesn't protect you from lockout due to a DoS
>
>attack, while the reserved pool does (or at least, it makes sure you
>can
>still allocate a few PTY's even when under attack).
... or even accidental due to some runaway script. (I may or may not have some first hand knowledge of a similar situation once in college...)
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
The /dev/ptmx device node is changed to lookup the directory entry
"pts" in the same directory as the /dev/ptmx device node was opened
in. If there is a "pts" entry and that entry is a devpts filesystem
/dev/ptmx uses that filesystem. Otherwise the open of /dev/ptmx
fails.
The DEVPTS_MULTIPLE_INSTANCES configuration option is removed,
so that userspace can now safely depend on each mount of devpts
creating a new instance of the filesystem.
Each mount of devpts is now a separate and equal filesystem.
Reserved ttys are now available to all instances of devpts where the
mounter is in the initial mount namespace.
A new vfs helper path_pts is introduced that finds a directory entry
named "pts" in the directory of the passed in path, and changes the
passed in path to point to it. The helper path_pts uses a function
path_parent_directory that was factored out of follow_dotdot.
In the implementation of devpts:
- devpts_mnt is killed as it is no longer meaningful if all
mounts of devpts are equal.
- pts_sb_from_inode is replaced by just inode->i_sb as all
cached inodes in the tty layer are now from the devpts
filesystem.
- devpts_add_ref is rolled into the new function devpts_ptmx.
And the unnecessary inode hold is removed.
- devpts_del_ref is renamed devpts_release and reduced
to just a deacrivate_super.
- The newinstance mount option continues to be accepted but is now ignored.
In devpts_fs.h definitions for when !CONFIG_UNIX98_PTYS are removed
as they are never used.
Documentation/filesystems/devices.txt is updated to describe
the current situation.
This has been verified to work properly on openwrt-15.05, centos5,
centos6, centos7, debian-6.0.2, debian-7.9, debian-8.2,
ubuntu-14.04.3, ubuntu-15.10, fedora23, magia-5, mint-17.3,
opensuse-42.1, slackware-14.1, gentoo-20151225 (13.0?),
archlinux-2015-12-01. With the caveat that on centos6 and on
slackware-14.1 that there wind up being two instances of the devpts
filesystem mounted on /dev/pts, the lower copy does not end up getting
used.
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
Documentation/filesystems/devpts.txt | 145 +++----------------------
drivers/tty/Kconfig | 11 --
drivers/tty/pty.c | 41 ++++---
fs/devpts/inode.c | 203 ++++++++++-------------------------
fs/namei.c | 49 +++++++--
include/linux/devpts_fs.h | 31 ++----
include/linux/namei.h | 2 +
7 files changed, 144 insertions(+), 338 deletions(-)
diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt
index 30d2fcb32f72..9f94fe276dea 100644
--- a/Documentation/filesystems/devpts.txt
+++ b/Documentation/filesystems/devpts.txt
@@ -1,141 +1,26 @@
+Each mount of the devpts filesystem is now distinct such that ptys
+and their indicies allocated in one mount are independent from ptys
+and their indicies in all other mounts.
-To support containers, we now allow multiple instances of devpts filesystem,
-such that indices of ptys allocated in one instance are independent of indices
-allocated in other instances of devpts.
+All mounts of the devpts filesystem now create a /dev/pts/ptmx node
+with permissions 0000.
-To preserve backward compatibility, this support for multiple instances is
-enabled only if:
+To retain backwards compatibility the a ptmx device node (aka any node
+created with "mknod name c 5 2") when opened will look for an instance
+of devpts under the name "pts" in the same directory as the ptmx device
+node.
- - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and
- - '-o newinstance' mount option is specified while mounting devpts
-
-IOW, devpts now supports both single-instance and multi-instance semantics.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and
-this referred to as the "legacy" mode. In this mode, the new mount options
-(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message
-on console.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the
-'newinstance' option (as in current start-up scripts) the new mount binds
-to the initial kernel mount of devpts. This mode is referred to as the
-'single-instance' mode and the current, single-instance semantics are
-preserved, i.e PTYs are common across the system.
-
-The only difference between this single-instance mode and the legacy mode
-is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which
-can safely be ignored.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified,
-the mount is considered to be in the multi-instance mode and a new instance
-of the devpts fs is created. Any ptys created in this instance are independent
-of ptys in other instances of devpts. Like in the single-instance mode, the
-/dev/pts/ptmx node is present. To effectively use the multi-instance mode,
-open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or
-bind-mount.
-
-Eg: A container startup script could do the following:
-
- $ chmod 0666 /dev/pts/ptmx
- $ rm /dev/ptmx
- $ ln -s pts/ptmx /dev/ptmx
- $ ns_exec -cm /bin/bash
-
- # We are now in new container
-
- $ umount /dev/pts
- $ mount -t devpts -o newinstance lxcpts /dev/pts
- $ sshd -p 1234
-
-where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs
-/bin/bash in the child process. A pty created by the sshd is not visible in
-the original mount of /dev/pts.
+As an option instead of placing a /dev/ptmx device node at /dev/ptmx
+it is possible to place a symlink to /dev/pts/ptmx at /dev/ptmx or
+to bind mount /dev/ptx/ptmx to /dev/ptmx. If you opt for using
+the devpts filesystem in this manner devpts should be mounted with
+the ptmxmode=0666, or chmod 0666 /dev/pts/ptmx should be called.
Total count of pty pairs in all instances is limited by sysctls:
kernel.pty.max = 4096 - global limit
-kernel.pty.reserve = 1024 - reserve for initial instance
+kernel.pty.reserve = 1024 - reserved for filesystems mounted from the initial mount namespace
kernel.pty.nr - current count of ptys
Per-instance limit could be set by adding mount option "max=<count>".
This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve.
In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit.
-
-User-space changes
-------------------
-
-In multi-instance mode (i.e '-o newinstance' mount option is specified at least
-once), following user-space issues should be noted.
-
-1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored
- and no change is needed to system-startup scripts.
-
-2. To effectively use multi-instance mode (i.e -o newinstance is specified)
- administrators or startup scripts should "redirect" open of /dev/ptmx to
- /dev/pts/ptmx using either a bind mount or symlink.
-
- $ mount -t devpts -o newinstance devpts /dev/pts
-
- followed by either
-
- $ rm /dev/ptmx
- $ ln -s pts/ptmx /dev/ptmx
- $ chmod 666 /dev/pts/ptmx
- or
- $ mount -o bind /dev/pts/ptmx /dev/ptmx
-
-3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it
- enables better error-reporting and treats both single-instance and
- multi-instance mounts similarly.
-
- But this method requires that system-startup scripts set the mode of
- /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the
- mode by, either
-
- - adding ptmxmode mount option to devpts entry in /etc/fstab, or
- - using 'chmod 0666 /dev/pts/ptmx'
-
-4. If multi-instance mode mount is needed for containers, but the system
- startup scripts have not yet been updated, container-startup scripts
- should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single-
- instance mounts.
-
- Or, in general, container-startup scripts should use:
-
- mount -t devpts -o newinstance -o ptmxmode=0666 devpts /dev/pts
- if [ ! -L /dev/ptmx ]; then
- mount -o bind /dev/pts/ptmx /dev/ptmx
- fi
-
- When all devpts mounts are multi-instance, /dev/ptmx can permanently be
- a symlink to pts/ptmx and the bind mount can be ignored.
-
-5. A multi-instance mount that is not accompanied by the /dev/ptmx to
- /dev/pts/ptmx redirection would result in an unusable/unreachable pty.
-
- mount -t devpts -o newinstance lxcpts /dev/pts
-
- immediately followed by:
-
- open("/dev/ptmx")
-
- would create a pty, say /dev/pts/7, in the initial kernel mount.
- But /dev/pts/7 would be invisible in the new mount.
-
-6. The permissions for /dev/pts/ptmx node should be specified when mounting
- /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000).
-
- mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts
-
- The permissions can be later be changed as usual with 'chmod'.
-
- chmod 666 /dev/pts/ptmx
-
-7. A mount of devpts without the 'newinstance' option results in binding to
- initial kernel mount. This behavior while preserving legacy semantics,
- does not provide strict isolation in a container environment. i.e by
- mounting devpts without the 'newinstance' option, a container could
- get visibility into the 'host' or root container's devpts.
-
- To workaround this and have strict isolation, all mounts of devpts,
- including the mount in the root container, should use the newinstance
- option.
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index 82c4d2e45319..95103054c0e4 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -120,17 +120,6 @@ config UNIX98_PTYS
All modern Linux systems use the Unix98 ptys. Say Y unless
you're on an embedded system and want to conserve memory.
-config DEVPTS_MULTIPLE_INSTANCES
- bool "Support multiple instances of devpts"
- depends on UNIX98_PTYS
- default n
- ---help---
- Enable support for multiple instances of devpts filesystem.
- If you want to have isolated PTY namespaces (eg: in containers),
- say Y here. Otherwise, say N. If enabled, each mount of devpts
- filesystem with the '-o newinstance' option will create an
- independent PTY namespace.
-
config LEGACY_PTYS
bool "Legacy (BSD) PTY support"
default y
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index e16a49b507ef..6e57eb31e327 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -663,14 +663,14 @@ static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty)
/* this is called once with whichever end is closed last */
static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty)
{
- struct inode *ptmx_inode;
+ struct pts_fs_info *fsi;
if (tty->driver->subtype == PTY_TYPE_MASTER)
- ptmx_inode = tty->driver_data;
+ fsi = tty->driver_data;
else
- ptmx_inode = tty->link->driver_data;
- devpts_kill_index(ptmx_inode, tty->index);
- devpts_del_ref(ptmx_inode);
+ fsi = tty->link->driver_data;
+ devpts_kill_index(fsi, tty->index);
+ devpts_release(fsi);
}
static const struct tty_operations ptm_unix98_ops = {
@@ -721,6 +721,7 @@ static const struct tty_operations pty_unix98_ops = {
static int ptmx_open(struct inode *inode, struct file *filp)
{
struct tty_struct *tty;
+ struct pts_fs_info *fsi;
struct inode *slave_inode;
int retval;
int index;
@@ -734,13 +735,19 @@ static int ptmx_open(struct inode *inode, struct file *filp)
if (retval)
return retval;
+ fsi = devpts_acquire(filp);
+ if (IS_ERR(fsi)) {
+ retval = PTR_ERR(fsi);
+ goto err_file;
+ }
+
/* find a device that is not in use. */
mutex_lock(&devpts_mutex);
- index = devpts_new_index(inode);
+ index = devpts_new_index(fsi);
if (index < 0) {
retval = index;
mutex_unlock(&devpts_mutex);
- goto err_file;
+ goto err_fsi;
}
mutex_unlock(&devpts_mutex);
@@ -758,23 +765,11 @@ static int ptmx_open(struct inode *inode, struct file *filp)
mutex_unlock(&tty_mutex);
set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
- tty->driver_data = inode;
-
- /*
- * In the case where all references to ptmx inode are dropped and we
- * still have /dev/tty opened pointing to the master/slave pair (ptmx
- * is closed/released before /dev/tty), we must make sure that the inode
- * is still valid when we call the final pty_unix98_shutdown, thus we
- * hold an additional reference to the ptmx inode. For the same /dev/tty
- * last close case, we also need to make sure the super_block isn't
- * destroyed (devpts instance unmounted), before /dev/tty is closed and
- * on its release devpts_kill_index is called.
- */
- devpts_add_ref(inode);
+ tty->driver_data = fsi;
tty_add_file(tty, filp);
- slave_inode = devpts_pty_new(inode,
+ slave_inode = devpts_pty_new(fsi,
MKDEV(UNIX98_PTY_SLAVE_MAJOR, index), index,
tty->link);
if (IS_ERR(slave_inode)) {
@@ -797,7 +792,9 @@ err_release:
return retval;
out:
mutex_unlock(&tty_mutex);
- devpts_kill_index(inode, index);
+ devpts_kill_index(fsi, index);
+err_fsi:
+ devpts_release(fsi);
err_file:
tty_free_file(filp);
return retval;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 655f21f99160..0b8edbc617e4 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = {
static DEFINE_MUTEX(allocated_ptys_lock);
-static struct vfsmount *devpts_mnt;
-
struct pts_mount_opts {
int setuid;
int setgid;
@@ -104,7 +102,7 @@ struct pts_mount_opts {
kgid_t gid;
umode_t mode;
umode_t ptmxmode;
- int newinstance;
+ int reserve;
int max;
};
@@ -117,17 +115,16 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_mode, "mode=%o"},
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
{Opt_ptmxmode, "ptmxmode=%o"},
{Opt_newinstance, "newinstance"},
{Opt_max, "max=%d"},
-#endif
{Opt_err, NULL}
};
struct pts_fs_info {
struct ida allocated_ptys;
struct pts_mount_opts mount_opts;
+ struct super_block *sb;
struct dentry *ptmx_dentry;
};
@@ -136,15 +133,43 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
return sb->s_fs_info;
}
-static inline struct super_block *pts_sb_from_inode(struct inode *inode)
+struct pts_fs_info *devpts_acquire(struct file *filp)
{
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
- if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
- return inode->i_sb;
-#endif
- if (!devpts_mnt)
- return NULL;
- return devpts_mnt->mnt_sb;
+ struct pts_fs_info *result;
+ struct path path;
+ struct super_block *sb;
+ int err;
+
+ /* Is a devpts filesystem at "pts" in the same directory? */
+ path = filp->f_path;
+ path_get(&path);
+ err = path_pts(&path);
+ if (err) {
+ result = ERR_PTR(err);
+ goto out;
+ }
+
+ /* Is the path the root of a devpts filesystem? */
+ result = ERR_PTR(-ENOENT);
+ sb = path.mnt->mnt_sb;
+ if ((sb->s_magic != DEVPTS_SUPER_MAGIC) ||
+ (path.mnt->mnt_root != sb->s_root))
+ goto out;
+
+ /*
+ * pty code needs to hold extra references in case of last /dev/tty close
+ */
+ atomic_inc(&sb->s_active);
+ result = DEVPTS_SB(sb);
+
+out:
+ path_put(&path);
+ return result;
+}
+
+void devpts_release(struct pts_fs_info *fsi)
+{
+ deactivate_super(fsi->sb);
}
#define PARSE_MOUNT 0
@@ -153,9 +178,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
/*
* parse_mount_options():
* Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value. The exception is
- * 'newinstance' option which can only be set/cleared on a mount (i.e.
- * cannot be changed during remount).
+ * specified in @data, set it to its default value.
*
* Note: @data may be NULL (in which case all options are set to default).
*/
@@ -173,9 +196,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
opts->max = NR_UNIX98_PTY_MAX;
- /* newinstance makes sense only on initial mount */
+ /* Only allow instances mounted from the initial mount
+ * namespace to tap the reserve pool of ptys.
+ */
if (op == PARSE_MOUNT)
- opts->newinstance = 0;
+ opts->reserve =
+ (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns);
while ((p = strsep(&data, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
@@ -210,16 +236,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return -EINVAL;
opts->mode = option & S_IALLUGO;
break;
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
case Opt_ptmxmode:
if (match_octal(&args[0], &option))
return -EINVAL;
opts->ptmxmode = option & S_IALLUGO;
break;
case Opt_newinstance:
- /* newinstance makes sense only on initial mount */
- if (op == PARSE_MOUNT)
- opts->newinstance = 1;
break;
case Opt_max:
if (match_int(&args[0], &option) ||
@@ -227,7 +249,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return -EINVAL;
opts->max = option;
break;
-#endif
default:
pr_err("called with bogus options\n");
return -EINVAL;
@@ -237,7 +258,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
return 0;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
static int mknod_ptmx(struct super_block *sb)
{
int mode;
@@ -304,12 +324,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
}
}
-#else
-static inline void update_ptmx_mode(struct pts_fs_info *fsi)
-{
- return;
-}
-#endif
static int devpts_remount(struct super_block *sb, int *flags, char *data)
{
@@ -343,11 +357,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, opts->gid));
seq_printf(seq, ",mode=%03o", opts->mode);
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
if (opts->max < NR_UNIX98_PTY_MAX)
seq_printf(seq, ",max=%d", opts->max);
-#endif
return 0;
}
@@ -358,7 +370,7 @@ static const struct super_operations devpts_sops = {
.show_options = devpts_show_options,
};
-static void *new_pts_fs_info(void)
+static void *new_pts_fs_info(struct super_block *sb)
{
struct pts_fs_info *fsi;
@@ -369,6 +381,7 @@ static void *new_pts_fs_info(void)
ida_init(&fsi->allocated_ptys);
fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+ fsi->sb = sb;
return fsi;
}
@@ -384,7 +397,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &devpts_sops;
s->s_time_gran = 1;
- s->s_fs_info = new_pts_fs_info();
+ s->s_fs_info = new_pts_fs_info(s);
if (!s->s_fs_info)
goto fail;
@@ -408,40 +421,11 @@ fail:
return -ENOMEM;
}
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-static int compare_init_pts_sb(struct super_block *s, void *p)
-{
- if (devpts_mnt)
- return devpts_mnt->mnt_sb == s;
- return 0;
-}
-
/*
* devpts_mount()
*
- * If the '-o newinstance' mount option was specified, mount a new
- * (private) instance of devpts. PTYs created in this instance are
- * independent of the PTYs in other devpts instances.
- *
- * If the '-o newinstance' option was not specified, mount/remount the
- * initial kernel mount of devpts. This type of mount gives the
- * legacy, single-instance semantics.
- *
- * The 'newinstance' option is needed to support multiple namespace
- * semantics in devpts while preserving backward compatibility of the
- * current 'single-namespace' semantics. i.e all mounts of devpts
- * without the 'newinstance' mount option should bind to the initial
- * kernel mount, like mount_single().
- *
- * Mounts with 'newinstance' option create a new, private namespace.
- *
- * NOTE:
- *
- * For single-mount semantics, devpts cannot use mount_single(),
- * because mount_single()/sget() find and use the super-block from
- * the most recent mount of devpts. But that recent mount may be a
- * 'newinstance' mount and mount_single() would pick the newinstance
- * super-block instead of the initial super-block.
+ * Mount a new (private) instance of devpts. PTYs created in this
+ * instance are independent of the PTYs in other devpts instances.
*/
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
@@ -454,18 +438,7 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
if (error)
return ERR_PTR(error);
- /* Require newinstance for all user namespace mounts to ensure
- * the mount options are not changed.
- */
- if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
- return ERR_PTR(-EINVAL);
-
- if (opts.newinstance)
- s = sget(fs_type, NULL, set_anon_super, flags, NULL);
- else
- s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
- NULL);
-
+ s = sget(fs_type, NULL, set_anon_super, flags, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
@@ -489,18 +462,6 @@ out_undo_sget:
return ERR_PTR(error);
}
-#else
-/*
- * This supports only the legacy single-instance semantics (no
- * multiple-instance semantics)
- */
-static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
-{
- return mount_single(fs_type, flags, data, devpts_fill_super);
-}
-#endif
-
static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
@@ -514,9 +475,7 @@ static struct file_system_type devpts_fs_type = {
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
.fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
-#endif
};
/*
@@ -524,24 +483,18 @@ static struct file_system_type devpts_fs_type = {
* to the System V naming convention
*/
-int devpts_new_index(struct inode *ptmx_inode)
+int devpts_new_index(struct pts_fs_info *fsi)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi;
int index;
int ida_ret;
- if (!sb)
- return -ENODEV;
-
- fsi = DEVPTS_SB(sb);
retry:
if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
return -ENOMEM;
mutex_lock(&allocated_ptys_lock);
- if (pty_count >= pty_limit -
- (fsi->mount_opts.newinstance ? pty_reserve : 0)) {
+ if (pty_count >= (pty_limit -
+ (fsi->mount_opts.reserve ? 0 : pty_reserve))) {
mutex_unlock(&allocated_ptys_lock);
return -ENOSPC;
}
@@ -564,37 +517,14 @@ retry:
return index;
}
-void devpts_kill_index(struct inode *ptmx_inode, int idx)
+void devpts_kill_index(struct pts_fs_info *fsi, int idx)
{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
-
mutex_lock(&allocated_ptys_lock);
ida_remove(&fsi->allocated_ptys, idx);
pty_count--;
mutex_unlock(&allocated_ptys_lock);
}
-/*
- * pty code needs to hold extra references in case of last /dev/tty close
- */
-
-void devpts_add_ref(struct inode *ptmx_inode)
-{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-
- atomic_inc(&sb->s_active);
- ihold(ptmx_inode);
-}
-
-void devpts_del_ref(struct inode *ptmx_inode)
-{
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-
- iput(ptmx_inode);
- deactivate_super(sb);
-}
-
/**
* devpts_pty_new -- create a new inode in /dev/pts/
* @ptmx_inode: inode of the master
@@ -604,22 +534,17 @@ void devpts_del_ref(struct inode *ptmx_inode)
*
* The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
*/
-struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
+struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index,
void *priv)
{
struct dentry *dentry;
- struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+ struct super_block *sb = fsi->sb;
struct inode *inode;
struct dentry *root;
- struct pts_fs_info *fsi;
struct pts_mount_opts *opts;
char s[12];
- if (!sb)
- return ERR_PTR(-ENODEV);
-
root = sb->s_root;
- fsi = DEVPTS_SB(sb);
opts = &fsi->mount_opts;
inode = new_inode(sb);
@@ -685,7 +610,7 @@ void *devpts_get_priv(struct inode *pts_inode)
*/
void devpts_pty_kill(struct inode *inode)
{
- struct super_block *sb = pts_sb_from_inode(inode);
+ struct super_block *sb = inode->i_sb;
struct dentry *root = sb->s_root;
struct dentry *dentry;
@@ -706,20 +631,8 @@ void devpts_pty_kill(struct inode *inode)
static int __init init_devpts_fs(void)
{
int err = register_filesystem(&devpts_fs_type);
- struct ctl_table_header *table;
-
if (!err) {
- struct vfsmount *mnt;
-
- table = register_sysctl_table(pty_root_table);
- mnt = kern_mount(&devpts_fs_type);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
- unregister_filesystem(&devpts_fs_type);
- unregister_sysctl_table(table);
- } else {
- devpts_mnt = mnt;
- }
+ register_sysctl_table(pty_root_table);
}
return err;
}
diff --git a/fs/namei.c b/fs/namei.c
index 794f81dce766..5c0a38971dbd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1415,21 +1415,28 @@ static void follow_mount(struct path *path)
}
}
+static int path_parent_directory(struct path *path)
+{
+ struct dentry *old = path->dentry;
+ /* rare case of legitimate dget_parent()... */
+ path->dentry = dget_parent(path->dentry);
+ dput(old);
+ if (unlikely(!path_connected(path)))
+ return -ENOENT;
+ return 0;
+}
+
static int follow_dotdot(struct nameidata *nd)
{
while(1) {
- struct dentry *old = nd->path.dentry;
-
if (nd->path.dentry == nd->root.dentry &&
nd->path.mnt == nd->root.mnt) {
break;
}
if (nd->path.dentry != nd->path.mnt->mnt_root) {
- /* rare case of legitimate dget_parent()... */
- nd->path.dentry = dget_parent(nd->path.dentry);
- dput(old);
- if (unlikely(!path_connected(&nd->path)))
- return -ENOENT;
+ int ret = path_parent_directory(&nd->path);
+ if (ret)
+ return ret;
break;
}
if (!follow_up(&nd->path))
@@ -2374,6 +2381,34 @@ struct dentry *lookup_one_len_unlocked(const char *name,
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
+#ifdef CONFIG_UNIX98_PTYS
+int path_pts(struct path *path)
+{
+ /* Find something mounted on "pts" in the same directory as
+ * the input path.
+ */
+ struct dentry *child, *parent;
+ struct qstr this;
+ int ret;
+
+ ret = path_parent_directory(path);
+ if (ret)
+ return ret;
+
+ parent = path->dentry;
+ this.name = "pts";
+ this.len = 3;
+ child = d_hash_and_lookup(parent, &this);
+ if (!child)
+ return -ENOENT;
+
+ path->dentry = child;
+ dput(parent);
+ follow_mount(path);
+ return 0;
+}
+#endif
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index e0ee0b3000b2..486f6282b0c6 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -17,36 +17,21 @@
#ifdef CONFIG_UNIX98_PTYS
-int devpts_new_index(struct inode *ptmx_inode);
-void devpts_kill_index(struct inode *ptmx_inode, int idx);
-void devpts_add_ref(struct inode *ptmx_inode);
-void devpts_del_ref(struct inode *ptmx_inode);
+struct pts_fs_info;
+
+struct pts_fs_info *devpts_acquire(struct file *filp);
+void devpts_release(struct pts_fs_info *fsi);
+
+int devpts_new_index(struct pts_fs_info *fsi);
+void devpts_kill_index(struct pts_fs_info *fsi, int idx);
/* mknod in devpts */
-struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
+struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index,
void *priv);
/* get private structure */
void *devpts_get_priv(struct inode *pts_inode);
/* unlink */
void devpts_pty_kill(struct inode *inode);
-#else
-
-/* Dummy stubs in the no-pty case */
-static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; }
-static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { }
-static inline void devpts_add_ref(struct inode *ptmx_inode) { }
-static inline void devpts_del_ref(struct inode *ptmx_inode) { }
-static inline struct inode *devpts_pty_new(struct inode *ptmx_inode,
- dev_t device, int index, void *priv)
-{
- return ERR_PTR(-EINVAL);
-}
-static inline void *devpts_get_priv(struct inode *pts_inode)
-{
- return NULL;
-}
-static inline void devpts_pty_kill(struct inode *inode) { }
-
#endif
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 77d01700daf7..f29abda31e6d 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -45,6 +45,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
#define LOOKUP_ROOT 0x2000
#define LOOKUP_EMPTY 0x4000
+extern int path_pts(struct path *path);
+
extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
--
2.8.1