This patch implement a new syscall sys_socket2(), that accepts an
extra "flags" parameter:
int socket2(int domain, int type, int protocol, int flags);
The flags parameter is used to pass extra flags to the kernel, and is
at the moment used to select the file descriptor allocations inside
the non-sequential area (O_NONSEQFD). The remaining parameters are
exactly the same as the ones of sys_socket().
The sys_accept() system call has been modified to return a file
descriptor inside the non-sequential area, if the listening fd is.
The sys_socketcall() system call has been also changed to support
a new SYS_SOCKET2 indentifier.
Signed-off-by: Davide Libenzi <[email protected]>
- Davide
Index: linux-2.6.mod/fs/9p/trans_fd.c
===================================================================
--- linux-2.6.mod.orig/fs/9p/trans_fd.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/9p/trans_fd.c 2007-06-06 12:48:41.000000000 -0700
@@ -181,7 +181,7 @@
int fd, ret;
csocket->sk->sk_allocation = GFP_NOIO;
- if ((fd = sock_map_fd(csocket)) < 0) {
+ if ((fd = sock_map_fd(csocket, 0)) < 0) {
eprintk(KERN_ERR, "v9fs_socket_open: failed to map fd\n");
ret = fd;
release_csocket:
Index: linux-2.6.mod/include/linux/net.h
===================================================================
--- linux-2.6.mod.orig/include/linux/net.h 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/include/linux/net.h 2007-06-06 13:09:20.000000000 -0700
@@ -43,6 +43,7 @@
#define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */
#define SYS_SENDMSG 16 /* sys_sendmsg(2) */
#define SYS_RECVMSG 17 /* sys_recvmsg(2) */
+#define SYS_SOCKET2 18 /* sys_socket2(2) */
typedef enum {
SS_FREE = 0, /* not allocated */
@@ -190,7 +191,7 @@
size_t len);
extern int sock_recvmsg(struct socket *sock, struct msghdr *msg,
size_t size, int flags);
-extern int sock_map_fd(struct socket *sock);
+extern int sock_map_fd(struct socket *sock, int flags);
extern struct socket *sockfd_lookup(int fd, int *err);
#define sockfd_put(sock) fput(sock->file)
extern int net_ratelimit(void);
Index: linux-2.6.mod/net/sctp/socket.c
===================================================================
--- linux-2.6.mod.orig/net/sctp/socket.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/net/sctp/socket.c 2007-06-06 12:48:41.000000000 -0700
@@ -3605,7 +3605,7 @@
goto out;
/* Map the socket to an unused fd that can be returned to the user. */
- retval = sock_map_fd(newsock);
+ retval = sock_map_fd(newsock, 0);
if (retval < 0) {
sock_release(newsock);
goto out;
Index: linux-2.6.mod/net/socket.c
===================================================================
--- linux-2.6.mod.orig/net/socket.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/net/socket.c 2007-06-06 13:14:08.000000000 -0700
@@ -344,11 +344,11 @@
* but we take care of internal coherence yet.
*/
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
{
int fd;
- fd = get_unused_fd();
+ fd = allocate_fd(flags);
if (likely(fd >= 0)) {
struct file *file = get_empty_filp();
@@ -391,10 +391,10 @@
return 0;
}
-int sock_map_fd(struct socket *sock)
+int sock_map_fd(struct socket *sock, int flags)
{
struct file *newfile;
- int fd = sock_alloc_fd(&newfile);
+ int fd = sock_alloc_fd(&newfile, flags);
if (likely(fd >= 0)) {
int err = sock_attach_fd(sock, newfile);
@@ -1198,7 +1198,7 @@
return __sock_create(family, type, protocol, res, 1);
}
-asmlinkage long sys_socket(int family, int type, int protocol)
+asmlinkage long sys_socket2(int family, int type, int protocol, int flags)
{
int retval;
struct socket *sock;
@@ -1207,7 +1207,7 @@
if (retval < 0)
goto out;
- retval = sock_map_fd(sock);
+ retval = sock_map_fd(sock, flags);
if (retval < 0)
goto out_release;
@@ -1220,6 +1220,11 @@
return retval;
}
+asmlinkage long sys_socket(int family, int type, int protocol)
+{
+ return sys_socket2(family, type, protocol, 0);
+}
+
/*
* Create a pair of connected sockets.
*/
@@ -1248,11 +1253,11 @@
if (err < 0)
goto out_release_both;
- fd1 = sock_alloc_fd(&newfile1);
+ fd1 = sock_alloc_fd(&newfile1, 0);
if (unlikely(fd1 < 0))
goto out_release_both;
- fd2 = sock_alloc_fd(&newfile2);
+ fd2 = sock_alloc_fd(&newfile2, 0);
if (unlikely(fd2 < 0)) {
put_filp(newfile1);
put_unused_fd(fd1);
@@ -1407,7 +1412,8 @@
*/
__module_get(newsock->ops->owner);
- newfd = sock_alloc_fd(&newfile);
+ newfd = sock_alloc_fd(&newfile,
+ fd > current->signal->rlim[RLIMIT_NOFILE].rlim_cur ? O_NONSEQFD: 0);
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
@@ -1983,10 +1989,11 @@
/* Argument list sizes for sys_socketcall */
#define AL(x) ((x) * sizeof(unsigned long))
-static const unsigned char nargs[18]={
+static const unsigned char nargs[]={
AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
- AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)
+ AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
+ AL(4)
};
#undef AL
@@ -2005,7 +2012,7 @@
unsigned long a0, a1;
int err;
- if (call < 1 || call > SYS_RECVMSG)
+ if (call < 1 || call >= ARRAY_SIZE(nargs))
return -EINVAL;
/* copy_from_user should be SMP safe. */
@@ -2082,6 +2089,9 @@
case SYS_RECVMSG:
err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
break;
+ case SYS_SOCKET2:
+ err = sys_socket2(a0, a1, a[2], a[3]);
+ break;
default:
err = -EINVAL;
break;
From: Davide Libenzi <[email protected]>
Date: Wed, 06 Jun 2007 15:30:31 -0700
> This patch implement a new syscall sys_socket2(), that accepts an
> extra "flags" parameter:
>
> int socket2(int domain, int type, int protocol, int flags);
>
> The flags parameter is used to pass extra flags to the kernel, and is
> at the moment used to select the file descriptor allocations inside
> the non-sequential area (O_NONSEQFD). The remaining parameters are
> exactly the same as the ones of sys_socket().
> The sys_accept() system call has been modified to return a file
> descriptor inside the non-sequential area, if the listening fd is.
> The sys_socketcall() system call has been also changed to support
> a new SYS_SOCKET2 indentifier.
>
> Signed-off-by: Davide Libenzi <[email protected]>
Since the valid range of "domain" values is quite small,
we could avoid the new system call by cribbing some of the
upper bits of the 'domain' argument.
Valid existing programs pass in valid 'domain' values and
thus will not set any of the new flags.
Just an idea.
On Wed, 6 Jun 2007, David Miller wrote:
> From: Davide Libenzi <[email protected]>
> Date: Wed, 06 Jun 2007 15:30:31 -0700
>
> > This patch implement a new syscall sys_socket2(), that accepts an
> > extra "flags" parameter:
> >
> > int socket2(int domain, int type, int protocol, int flags);
> >
> > The flags parameter is used to pass extra flags to the kernel, and is
> > at the moment used to select the file descriptor allocations inside
> > the non-sequential area (O_NONSEQFD). The remaining parameters are
> > exactly the same as the ones of sys_socket().
> > The sys_accept() system call has been modified to return a file
> > descriptor inside the non-sequential area, if the listening fd is.
> > The sys_socketcall() system call has been also changed to support
> > a new SYS_SOCKET2 indentifier.
> >
> > Signed-off-by: Davide Libenzi <[email protected]>
>
> Since the valid range of "domain" values is quite small,
> we could avoid the new system call by cribbing some of the
> upper bits of the 'domain' argument.
>
> Valid existing programs pass in valid 'domain' values and
> thus will not set any of the new flags.
>
> Just an idea.
I have no huge preferences. If not the slight one of using the same flags
for open() and socket{2}() (O_NONSEQFD). If we overload socket() we may
need to fight with existing O_* flags. OTOH the current free ones are
pretty high in bits, so it may be OK too.
- Davide
> The sys_accept() system call has been modified to return a file
> descriptor inside the non-sequential area, if the listening fd is.
> The sys_socketcall() system call has been also changed to support
> a new SYS_SOCKET2 indentifier.
This still all seems really really ugly. Is there anything wrong with
throwing all these extra cases out and replacing the entire lot with
prctl(PR_SPARSEFD, 1);
to turn on sparse fd allocation for a process ?
Anyone needing to deal with certain special fds will use dup2() anyway so
a task global switch seems to be cleaner and make the behaviour simply to
flip on, with no extra calls (and you need to submit man pages for them
all too), and also more importantly no new glibc stuff should be needed,
and a process can try to set sparsefd, fail and carry on so its more
portable and back portable.
Alan
From: Davide Libenzi <[email protected]>
Date: Wed, 6 Jun 2007 15:52:31 -0700 (PDT)
> I have no huge preferences. If not the slight one of using the same flags
> for open() and socket{2}() (O_NONSEQFD). If we overload socket() we may
> need to fight with existing O_* flags. OTOH the current free ones are
> pretty high in bits, so it may be OK too.
I see.
To me this tends to give higher weight to just adding the
new socket2() system call.
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
David Miller wrote:
> Since the valid range of "domain" values is quite small,
> we could avoid the new system call by cribbing some of the
> upper bits of the 'domain' argument.
>
> Valid existing programs pass in valid 'domain' values and
> thus will not set any of the new flags.
I can see several problems with that:
- - experimental implementers might choose domain values which definitely
won't collide with others
- - the flags parameter ideally allows using the same values used for
open's mode argument. The lowest value I can see making sense is
O_NONBLOCK (04000).
- - how to recognize kernels without the support? -EAFNOSUPPORT can also
with new kernels mean it's actually the domain which is wrong
- - there might be new flags we want to use over time
I would strongly argue that any change we're doing in this area at
userlevel would involve a new interface. Programs also need new
definitions from headers files. This means a recent enough glibc will
be needed in any case. Unless programs use their own definitions in
which case they might as well use the syscall() function.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGZzvl2ijCOnn/RHQRAgV0AKDBDhqSQ/cs4qGYLKGL4dwzpFZ2zgCgl/qO
oFKnQ2eRuiziRu/N5vwWCeM=
=tttP
-----END PGP SIGNATURE-----
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Alan Cox wrote:
> prctl(PR_SPARSEFD, 1);
>
> to turn on sparse fd allocation for a process ?
Yes, there is. Go back and read the archives. It has been discussed in
depth.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGZzwk2ijCOnn/RHQRAtKGAKCTX5njQnYeyDn4XUGFAZ3Ojai+mwCeN/j0
jibBDSqQpXhR2CwIQNRAnXw=
=sJb0
-----END PGP SIGNATURE-----
From: Ulrich Drepper <[email protected]>
Date: Wed, 06 Jun 2007 15:57:41 -0700
> I would strongly argue that any change we're doing in this area at
> userlevel would involve a new interface. Programs also need new
> definitions from headers files. This means a recent enough glibc will
> be needed in any case. Unless programs use their own definitions in
> which case they might as well use the syscall() function.
To be honest, after reading Alan's response a few moments ago
I'm growing in favor of his suggestions and that all of these
new system calls perhaps really are overkill.
On Wed, 6 Jun 2007, Alan Cox wrote:
> > The sys_accept() system call has been modified to return a file
> > descriptor inside the non-sequential area, if the listening fd is.
> > The sys_socketcall() system call has been also changed to support
> > a new SYS_SOCKET2 indentifier.
>
> This still all seems really really ugly. Is there anything wrong with
> throwing all these extra cases out and replacing the entire lot with
>
> prctl(PR_SPARSEFD, 1);
>
> to turn on sparse fd allocation for a process ?
There was a little discussion where I tried to whisper something similar,
but Linus and Uli shot me :) - with good reasons IMO.
You may link to runtimes that are not non-sequentialfd aware, and will
break them.
> Anyone needing to deal with certain special fds will use dup2() anyway so
> a task global switch seems to be cleaner and make the behaviour simply to
> flip on, with no extra calls (and you need to submit man pages for them
> all too), and also more importantly no new glibc stuff should be needed,
> and a process can try to set sparsefd, fail and carry on so its more
> portable and back portable.
Man pages! Damn, I forgot Michael Kerrisk is already waiting for the other
stuff :(
- Davide
From: Davide Libenzi <[email protected]>
Date: Wed, 6 Jun 2007 16:04:40 -0700 (PDT)
> On Wed, 6 Jun 2007, Alan Cox wrote:
>
> > > The sys_accept() system call has been modified to return a file
> > > descriptor inside the non-sequential area, if the listening fd is.
> > > The sys_socketcall() system call has been also changed to support
> > > a new SYS_SOCKET2 indentifier.
> >
> > This still all seems really really ugly. Is there anything wrong with
> > throwing all these extra cases out and replacing the entire lot with
> >
> > prctl(PR_SPARSEFD, 1);
> >
> > to turn on sparse fd allocation for a process ?
>
> There was a little discussion where I tried to whisper something similar,
> but Linus and Uli shot me :) - with good reasons IMO.
> You may link to runtimes that are not non-sequentialfd aware, and will
> break them.
Thanks for explaining this issue clearly instead of telling people
to "go read the archives" in a condescending manner like someone
else did.
> > prctl(PR_SPARSEFD, 1);
> >
> > to turn on sparse fd allocation for a process ?
>
> There was a little discussion where I tried to whisper something similar,
> but Linus and Uli shot me :) - with good reasons IMO.
> You may link to runtimes that are not non-sequentialfd aware, and will
> break them.
Linking to the correct version of a libary and getting the library
versioning right is not rocket science and isn't a sane excuse. Its no
different to the stdio to large fd migration issues with many Unixen and
they all coped just fine.
Really all this new syscall hackery stuff is just too ugly to live. If
you use the prctl then yes we have a bit of library versioning to worry
about for the odd library that cares but thats a once over thing. The
crappy zillion extra syscalls we have to support for years and years just
to save a little bit of userspace work.
At its most moronic its no different to 32 and 64bit binary linking - and
the gnu tools manage to cope with stopping me linking a 32bit app to a
64bit lib and vice versa just fine, so I'm sure they can cope the same
way with sparse fd safe/non sparese fd safe libraries
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Alan Cox wrote:
> Linking to the correct version of a libary and getting the library
> versioning right is not rocket science and isn't a sane excuse. Its no
> different to the stdio to large fd migration issues with many Unixen and
> they all coped just fine.
This has nothing to do with linking and ABI. The assumptions about
continuous allocation are part of the API. It's required by POSIX and
provided by Unix since the early days. There are entire code bases out
there which depend on this assumption. Linking with code like this,
before or after the new version controlled symbol is introduced, will
break it. Policies or stateful behavior, however yo want to call it, is
just plain wrong for this (and most other things).
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGZ0G92ijCOnn/RHQRAkB9AJ93ol7XV2GiCw+8wgbJ9uMBnHU6dQCgmmAp
9m+WEup3iPkEHH6HIHDa88I=
=Dhto
-----END PGP SIGNATURE-----
On Thu, 7 Jun 2007, Alan Cox wrote:
> > > prctl(PR_SPARSEFD, 1);
> > >
> > > to turn on sparse fd allocation for a process ?
> >
> > There was a little discussion where I tried to whisper something similar,
> > but Linus and Uli shot me :) - with good reasons IMO.
> > You may link to runtimes that are not non-sequentialfd aware, and will
> > break them.
>
> Linking to the correct version of a libary and getting the library
> versioning right is not rocket science and isn't a sane excuse. Its no
> different to the stdio to large fd migration issues with many Unixen and
> they all coped just fine.
I don't think it's a matter of versioning. Many userspace libraries
expects their fds to be compact (for many reasons - they use select, they
use them to index 0-based arrays, etc...), and if the kernel suddendly
starts returning values in the 1<<28 up arena, they sure won't be happy.
So I believe that the correct way is that the caller specifically selects
the feature, leaving the legacy fd allocation as default.
- Davide
On Thursday 07 June 2007, Davide Libenzi wrote:
> The sys_socketcall() system call has been also changed to support
> a new SYS_SOCKET2 indentifier.
I thought the general agreement was that sys_socketcall is a bad
idea to start with. Is there any benefit in adding new calls to
it instead of using a new system call number for sys_socket2 on
all architectures?
Arnd <><
On Thu, 7 Jun 2007, Arnd Bergmann wrote:
> On Thursday 07 June 2007, Davide Libenzi wrote:
> > The sys_socketcall() system call has been also changed to support
> > a new SYS_SOCKET2 indentifier.
>
> I thought the general agreement was that sys_socketcall is a bad
> idea to start with. Is there any benefit in adding new calls to
> it instead of using a new system call number for sys_socket2 on
> all architectures?
Ohh, I didn't know it was flagged as "bad" ;) I actually had it that way,
but then I noticed there was no __NR_socket, so I complied to the
existing way of doing it.
- Davide
> This has nothing to do with linking and ABI. The assumptions about
It may not to someone who has their head in linkers all day and very
defined ideas about linkers but joining to pieces of code together and
figuring out if they can be combined is "linking", regardless of the
whether-they-can-be-joined being down to symbol availability, instruction
set, memory layout or assumptions about behaviour.
> continuous allocation are part of the API. It's required by POSIX and
> provided by Unix since the early days. There are entire code bases out
> there which depend on this assumption. Linking with code like this,
> before or after the new version controlled symbol is introduced, will
> break it.
If your linker is doing its job then you won't be able to link them
together because they have incompatible assumptions. Not exactly rocket
science even if it is done a little differently to the usual symbol
compatibility tests.
> Policies or stateful behavior, however yo want to call it, is
> just plain wrong for this (and most other things).
And the stuff you are trying to put into the kernel is better ? No, its a
bunch of ugly hacks caused by trying to solve the problem in the wrong
way and in the wrong place.
Alan
> I don't think it's a matter of versioning. Many userspace libraries
> expects their fds to be compact (for many reasons - they use select, they
> use them to index 0-based arrays, etc...), and if the kernel suddendly
> starts returning values in the 1<<28 up arena, they sure won't be happy.
> So I believe that the correct way is that the caller specifically selects
> the feature, leaving the legacy fd allocation as default.
I don't understand the connection between this paragraph (with which I
agree) and the urge to add a ton of ugly syscall hacks. "Caller
specifically selects feature" - > prctl(). Libraries get unhappy ->
linker issue.
On Thu, 7 Jun 2007 11:06:23 +0100
Alan Cox <[email protected]> wrote:
> > I don't think it's a matter of versioning. Many userspace libraries
> > expects their fds to be compact (for many reasons - they use select, they
> > use them to index 0-based arrays, etc...), and if the kernel suddendly
> > starts returning values in the 1<<28 up arena, they sure won't be happy.
> > So I believe that the correct way is that the caller specifically selects
> > the feature, leaving the legacy fd allocation as default.
>
> I don't understand the connection between this paragraph (with which I
> agree) and the urge to add a ton of ugly syscall hacks. "Caller
> specifically selects feature" - > prctl(). Libraries get unhappy ->
> linker issue.
>
Alan, prctl() things are usually inherited at fork()/exec() time.
If you fork() from a new application , then exec an old one
(eventually a statically linked program), we have a problem ?
> Alan, prctl() things are usually inherited at fork()/exec() time.
You want them inherited at fork time.
> If you fork() from a new application , then exec an old one
> (eventually a statically linked program), we have a problem ?
Only if you decide not to reset the value. There are lots of things we do
inherit and lots we don't.
Alan
On Jun 07, 2007, at 06:04:32, Alan Cox wrote:
>> continuous allocation are part of the API. It's required by POSIX
>> and provided by Unix since the early days. There are entire code
>> bases out there which depend on this assumption. Linking with
>> code like this, before or after the new version controlled symbol
>> is introduced, will break it.
>
> If your linker is doing its job then you won't be able to link them
> together because they have incompatible assumptions. Not exactly
> rocket science even if it is done a little differently to the usual
> symbol compatibility tests.
No, there is a fundamental problem with "solving" this via linking.
Many programs need the continuous FD allocation space, but then have
tendancies to do things like:
int i;
for (i = 0; i < 1024; i++)
if (i != comm_sock && i != server_sock)
close(i);
Yes I have seen many such programs, it seems to be a pretty standard
idiom.
On the other hand, that makes it completely impossible for libraries
to reliably use FDs behind the application's back; they might get
closed on you at any time without warning. One such library is
glibc; it would really like to be able to use file-descriptors behind
the back of the rest of userspace to implement certain functionality:
http://www.mail-archive.com/[email protected]/
msg163522.html
Likewise there are a massive group of other libraries (especially
user-interface and server related ones) that would really like to
have support for creating file-descriptors without the top-level
application closing them randomly (like several shells seem to).
Cheers,
Kyle Moffett
On Thu, 7 Jun 2007 07:59:47 -0400
Kyle Moffett <[email protected]> wrote:
> Likewise there are a massive group of other libraries (especially
> user-interface and server related ones) that would really like to
> have support for creating file-descriptors without the top-level
> application closing them randomly (like several shells seem to).
>
True, shells are sometimes quite strange.
For example, bash uses file descriptor 255 (FD_CLOEXEC)
When it forks a new process, child gets a file table with 256 slots.
At exec() time, 255 is closed but file table doesnt shrink.
(shrinking is done at fork() time only)
With fdmap, that means each process started by bash uses at least
256 * sizeof(list_head) bytes, ie 4096 bytes on x86_64, even if only three
file-descriptors are opened (0,1,2)
FD_CLOFORK should help here (BTW : current patch from Davide doesnt take this
into account and might need a change in fdmap_top_open_fd())
Davide, are you sure we want FIFO for non sequential allocations ?
This tends to use all the fmap slots, and not very cache friendly
if an app does a lot of [open(),...,close()] things. We already got a
perf drop because of RCUification of file freeing (FIFO mode instead
of LIFO given by kmalloc()/kfree())
If the idea behind this FIFO was security (ie not easy for an app to predict
next glibc file handle), we/glibc might use yet another FD_SECUREMODE flag,
wich ORed with O_NONSEQFD would ask to fdmap_newfd() to take the tail of
fmap->slist, not head.
On Thu, 7 Jun 2007, Alan Cox wrote:
> > I don't think it's a matter of versioning. Many userspace libraries
> > expects their fds to be compact (for many reasons - they use select, they
> > use them to index 0-based arrays, etc...), and if the kernel suddendly
> > starts returning values in the 1<<28 up arena, they sure won't be happy.
> > So I believe that the correct way is that the caller specifically selects
> > the feature, leaving the legacy fd allocation as default.
>
> I don't understand the connection between this paragraph (with which I
> agree) and the urge to add a ton of ugly syscall hacks. "Caller
> specifically selects feature" - > prctl(). Libraries get unhappy ->
> linker issue.
I meant, caller specifically selects the feature, on a per-fd basis. If
you select the task flag runtime, then all the allocated fds will be in
the non-sequential area. Even fds allocated inside the library code, with
libraries not expecting this. I fail to understand how a linker can nicely
solve this. If my app is using, for its own reasons, fds in the
non-sequential area, and library XYZ internally uses fds and does not like
them in that area, I still want to link to that library. As long as I do
not pass my fds to the library ("XYZ internally .."), this must still
work. The contrary is also true. If my app is not non-sequential fd aware,
and my library wants to use them in order to keep them alive from apps
doing the for-each-fd-close loop, with a per-fd policy, you can still mix
them together.
Or are you planning to have two sets of each userspace (userspace is not
only glibc, there's other stuff too) libraries?
- Davide
On Thu, 7 Jun 2007, Eric Dumazet wrote:
> On Thu, 7 Jun 2007 07:59:47 -0400
> Kyle Moffett <[email protected]> wrote:
>
>
> > Likewise there are a massive group of other libraries (especially
> > user-interface and server related ones) that would really like to
> > have support for creating file-descriptors without the top-level
> > application closing them randomly (like several shells seem to).
> >
>
> True, shells are sometimes quite strange.
>
> For example, bash uses file descriptor 255 (FD_CLOEXEC)
>
> When it forks a new process, child gets a file table with 256 slots.
>
> At exec() time, 255 is closed but file table doesnt shrink.
> (shrinking is done at fork() time only)
>
> With fdmap, that means each process started by bash uses at least
> 256 * sizeof(list_head) bytes, ie 4096 bytes on x86_64, even if only three
> file-descriptors are opened (0,1,2)
>
> FD_CLOFORK should help here (BTW : current patch from Davide doesnt take this
> into account and might need a change in fdmap_top_open_fd())
Yes, the CLOFORK flag is there, but it needs to be taken in account in
fdmap_top_open_fd().
> Davide, are you sure we want FIFO for non sequential allocations ?
>
> This tends to use all the fmap slots, and not very cache friendly
> if an app does a lot of [open(),...,close()] things. We already got a
> perf drop because of RCUification of file freeing (FIFO mode instead
> of LIFO given by kmalloc()/kfree())
>
> If the idea behind this FIFO was security (ie not easy for an app to predict
> next glibc file handle), we/glibc might use yet another FD_SECUREMODE flag,
> wich ORed with O_NONSEQFD would ask to fdmap_newfd() to take the tail of
> fmap->slist, not head.
That was the reason, yes. If we agree that the base randomization is
enough, we can use a LIFO.
- Davide
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Alan Cox wrote:
>> continuous allocation are part of the API. It's required by POSIX and
>> provided by Unix since the early days. There are entire code bases out
>> there which depend on this assumption. Linking with code like this,
>> before or after the new version controlled symbol is introduced, will
>> break it.
>
> If your linker is doing its job then you won't be able to link them
> together because they have incompatible assumptions. Not exactly rocket
> science even if it is done a little differently to the usual symbol
> compatibility tests.
No. You still don't appreciate the problem.
Assume I would change code so that newly compiled programs which use
open, socket, etc generate different references which cannot be
satisfied by old runtimes. That and only that is something linkers can
see, linkers deal with symbols and references.
If I would do that and people would recompile their existing code this
*still* does not change the fact that the program behavior would be
changed and the code might break. It can be something as simple as
int fd = open(...);
...
close(fd);
...
close(STDIN_FILENO);
open(...);
The last open is supposed to open a new file with descriptor 0 (i.e.,
STDIN_FILENO). By using non-sequential descriptors this is not guaranteed.
This is nothing you can change with linker magic. It is embedded in the
code. It's behavior *guaranteed* by POSIX and Unix before it for
decades. This isn't anything you can change.
> And the stuff you are trying to put into the kernel is better ? No, its a
> bunch of ugly hacks caused by trying to solve the problem in the wrong
> way and in the wrong place.
Go back 30 years and convince people to not guarantee first-match
allocation of descriptors. Then you can stand up and demand purity.
But just as in many other cases, legacy demands its price. We sometimes
have to pay the price for compatibility and if it isn't high it's worth
it. I don't say the current proposed code is the answer but iff
Davide's unified code does not perform worse than the current code I
don't see the harm since, for instance, extending socket() is in any
case necessary. I mentioned that close_on_exit must be set on open,
else leaks are risked. This will come naturally with a flags parameter
which already takes O_NONSEQFD.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGaBVb2ijCOnn/RHQRAnzGAJwLY3dDc9nl69yFPZfYUr8qbIeXygCfTqMd
u5Jofy3gFB5bWEHdnPtUhJY=
=Jz/g
-----END PGP SIGNATURE-----
On Thu, 07 Jun 2007 07:25:31 -0700
Ulrich Drepper <[email protected]> wrote:
> I don't say the current proposed code is the answer but iff
> Davide's unified code does not perform worse than the current code I
> don't see the harm since, for instance, extending socket() is in any
> case necessary. I mentioned that close_on_exit must be set on open,
> else leaks are risked. This will come naturally with a flags parameter
> which already takes O_NONSEQFD.
Yes, and for completeness :
accept2(int fd, ...)
pipe2(int *fds, int oflags);
eventfd2(int count, int oflags);
signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...
We probably should name those with a better sufix than "2", it is ugly.
On Thu, 7 Jun 2007, Eric Dumazet wrote:
> accept2(int fd, ...)
I don't see any reason to not have it inherit the non-sequential
characteristics of "fd".
> pipe2(int *fds, int oflags);
I think pipe+sys_nonseqfd should be OK for those.
> eventfd2(int count, int oflags);
> signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
> timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...
Those I think we're still in time to change the interface. No?
Even if not, those are not perf-critical, so I think that
syscall+sys_nonseqfd is still fine.
- Davide
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Eric Dumazet wrote:
> eventfd2(int count, int oflags);
>
> signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
>
> timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...
These aren't released yet, so, change them now before it's too late.
And to add to your list:
epoll_create(). Important if you think that's the interface people
should use.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGaE3t2ijCOnn/RHQRAqPGAJ4rhSxvWLvAseBTCZIywIpQ7JCTJACfZSdR
fGkbIAX5l/1zISYQ6rLmIrk=
=Ge8V
-----END PGP SIGNATURE-----
On Thu, 7 Jun 2007, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Eric Dumazet wrote:
> > eventfd2(int count, int oflags);
> >
> > signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
> >
> > timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...
>
> These aren't released yet, so, change them now before it's too late.
Linus, Andrew, would it be OK to change?
> And to add to your list:
>
> epoll_create(). Important if you think that's the interface people
> should use.
Can't we leave it as is, and use sys_nonseqfd() for those? So basically:
pipe(2) -> Use sys_nonseqfd
epoll_create(2) -> Use sys_nonseqfd
accept(2) -> Inherit the "origin" fd characteristics
open(2) -> Add O_NONSEQFD
socket(2) -> Introduce socket2()
- Davide
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Davide Libenzi wrote:
>> And to add to your list:
>>
>> epoll_create(). Important if you think that's the interface people
>> should use.
>
> Can't we leave it as is, and use sys_nonseqfd() for those? So basically:
The problem is O_CLOEXEC. They are all racy and if we add a variant
with flags then we might as well support O_NONSEQFD as well.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGaFTO2ijCOnn/RHQRAm7uAJ9NsP6FegpO3cAnCe83eZ8awtkHawCgzKK4
2OjiNCFJhviQSMOaKHS6cTI=
=v9jU
-----END PGP SIGNATURE-----
Davide Libenzi a ?crit :
> On Thu, 7 Jun 2007, Eric Dumazet wrote:
>
>> accept2(int fd, ...)
>
> I don't see any reason to not have it inherit the non-sequential
> characteristics of "fd".
>
>
>
>> pipe2(int *fds, int oflags);
>
> I think pipe+sys_nonseqfd should be OK for those.
yes, but O_CLOEXEC/O_CLOFORK ?
I was refering to Uli wanting to close races on multi-threading apps doing a
fork() while a thread is doing :
fd = open()
<---- race here if another thread does a fork() ---->
fcntl( CLOEXEC)
>
>
>
>> eventfd2(int count, int oflags);
>> signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
>> timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...
>
> Those I think we're still in time to change the interface. No?
> Even if not, those are not perf-critical, so I think that
> syscall+sys_nonseqfd is still fine.
Same point here, non a nonseqfd problem.
On Thu, 7 Jun 2007, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Davide Libenzi wrote:
> >> And to add to your list:
> >>
> >> epoll_create(). Important if you think that's the interface people
> >> should use.
> >
> > Can't we leave it as is, and use sys_nonseqfd() for those? So basically:
>
> The problem is O_CLOEXEC. They are all racy and if we add a variant
> with flags then we might as well support O_NONSEQFD as well.
Ahh, I forgot the original O_CLOEXEC atomicity. Right.
- Davide
On Thu, 7 Jun 2007, Eric Dumazet wrote:
> Davide, are you sure we want FIFO for non sequential allocations ?
>
> This tends to use all the fmap slots, and not very cache friendly
> if an app does a lot of [open(),...,close()] things. We already got a
> perf drop because of RCUification of file freeing (FIFO mode instead
> of LIFO given by kmalloc()/kfree())
>
> If the idea behind this FIFO was security (ie not easy for an app to predict
> next glibc file handle), we/glibc might use yet another FD_SECUREMODE flag,
> wich ORed with O_NONSEQFD would ask to fdmap_newfd() to take the tail of
> fmap->slist, not head.
Uli, would it be OK to rely only on base randomization and use a LIFO
instead? We have base randomization, plus LIFO does not mean strictly
sequential like legacy allocator, just more compatc and cache friendly.
- Davide
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Davide Libenzi wrote:
> Uli, would it be OK to rely only on base randomization and use a LIFO
> instead? We have base randomization, plus LIFO does not mean strictly
> sequential like legacy allocator, just more compatc and cache friendly.
If FIFO is slowing things down it's certainly OK to you LIFO. If there
is wiggle room (i.e., choose between two descriptors without additional
cost) then taking advantage of this would be of advantage. A policy
which enforces that only the last closed descriptor is not reused
immediately might might be enough. Maybe that's too specific for
people's taste.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGaGRP2ijCOnn/RHQRAv2qAJ0WzyKvOPx01PviCp4L/mUmNaehtQCfdKF5
4Qc7Uj47zY8jdqUZf+Ht3gs=
=jRfN
-----END PGP SIGNATURE-----
On Thu, 7 Jun 2007 11:39:39 -0700 (PDT)
Davide Libenzi <[email protected]> wrote:
> On Thu, 7 Jun 2007, Ulrich Drepper wrote:
>
> > -----BEGIN PGP SIGNED MESSAGE-----
> > Hash: SHA1
> >
> > Eric Dumazet wrote:
> > > eventfd2(int count, int oflags);
> > >
> > > signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
> > >
> > > timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...
> >
> > These aren't released yet, so, change them now before it's too late.
>
> Linus, Andrew, would it be OK to change?
Absolutely. Let's get them into their final form now, rather than letting
some intermediate interface escape out into a major kernel release.
<directs attention to
ftp://ftp.firstfloor.org/pub/ak/x86_64/quilt/patches/fix-eventd-syscall>
Davide Libenzi a ?crit :
> On Thu, 7 Jun 2007, Eric Dumazet wrote:
>
>> Davide, are you sure we want FIFO for non sequential allocations ?
>>
>> This tends to use all the fmap slots, and not very cache friendly
>> if an app does a lot of [open(),...,close()] things. We already got a
>> perf drop because of RCUification of file freeing (FIFO mode instead
>> of LIFO given by kmalloc()/kfree())
>>
>> If the idea behind this FIFO was security (ie not easy for an app to predict
>> next glibc file handle), we/glibc might use yet another FD_SECUREMODE flag,
>> wich ORed with O_NONSEQFD would ask to fdmap_newfd() to take the tail of
>> fmap->slist, not head.
>
> Uli, would it be OK to rely only on base randomization and use a LIFO
> instead? We have base randomization, plus LIFO does not mean strictly
> sequential like legacy allocator, just more compatc and cache friendly.
>
I am afraid randomization wont really work if /sbin/init or /bin/bash for
example uses one (or more) unseq fd :
The 'random base' will be propagated at fork()/exec() time ?
On Wed, 6 Jun 2007, Alan Cox wrote:
>
> This still all seems really really ugly.
I do agree that it's ugly. That many new system calls with new prototypes
and new glibc support is just nasty.
So I don't think this is viable.
> Is there anything wrong with throwing all these extra cases out and
> replacing the entire lot with
>
> prctl(PR_SPARSEFD, 1);
>
> to turn on sparse fd allocation for a process ?
Yes. We really don't want to set global state that affects any random
library thing that runs after it.
HOWEVER.
I think we could introduce a *single* new system call, which does
basically a "run the specified system call with the following flags".
The flags would literally be local to that *one* system call, and one of
the flags could be the semantics for FD allocation.
[ There are a few other cases where such an indirect system call might be
interesting: temporarily unmasking a signal for just the duration of a
single system call is the reason for things like 'pselect()' and
'sigtimedwait()', and similarly the 'access()' system call is basically
a "temporarily run with my real UID, rather than the effective UID
thing, and quite frankly, it might be perfectly valid to want to do an
'open()' with that rule too, because "access()+open()" is racy! ]
So maybe the proper solution to this mess is *not* to add fifteen new
system calls, but to add *one*, which takes a "flags" value to set certain
things:
- FD_NONSEQ: "allocate any new fd's nonsequentially"
- FD_CLOEXEC: "allocate any new fd's as close-on-exec"
Rationale: allow people to open any fd with the flags set a certain
way, regardless of the system call.
- LOOKUP_REALUID/GID: "make the fsuid/fsgid temporarily be my _real_
uid/gid for this single system call"
Rationale: avoid the inevitable races that the fundamentally broken
"access()" system call has!
- LOOKUP_NOFOLLOW: "do not follow any symlink at the end of the path"
LOOKUP_NOATIME: "don't update atime"
Rationale: "open()" already has O_NOFOLLOW/O_NOATIME, and "stat()" has
"lstat()", but a lot of other path-handlign system calls cannot do the
same thing.
- LOOKUP_NOSYMLINKS: "do not allow any symlink traversal at *all*"
LOOKUP_NODOTDOT: "don't traverse a .. upwards"
LOOKUP_NOMOUNT: "don't traverse a mount point"
Rationale: for security-conscious things, quite often it's not the
_last_ symlink you want to avoid, it's any symlinks at all, and
sometimes it's things like guaranteeing that you stay in a certain
directory structure - which means not going outside with ".." or some
magic mount-point.
People currently literally end up traversing things one path component
at a time, doing a "lstat()" on it, and checking. Even if 99%
of the time you probably don't actually ever hit the problem case.
(Eg Apache at some point used to do something like this if you asked
for security, I'm not sure if it still does).
- signal mask for temporarily blocking/unblocking during a single system
call.
- something else? The above are things that I know I _personally_ have
occasionally cursed not having had.
What do people think about that kind of approach? It has the advantage
that it does *not* involve multiple kernel entries (just a single entry to
a small wrapper that sets some process state temporarily), and that it
doesn't have any sticky state that might confuse a library (or a signal
handler: even if you end up doing "prctrl(ON) ; syscall(); prctrl(OFF)", a
signal handler that happens in between the prctrl's would see unexpected
behaviour).
It has the disadvantage that it would need some per-architecture setup to
load the actual real arguments from memory: the system call would probably
look something like
syscall_indirect(unsigned long flags, sigset_t *,
int syscall, unsigned long args[6]);
and the rule would be that it would just load the six system call
registers from that "args[]" array. Always load the full six registers, to
make it simpler and faster, and not having any confusion or ever needing
any wrappers that depend on the number of system calls.
Linus
Linus Torvalds a ?crit :
>
> On Wed, 6 Jun 2007, Alan Cox wrote:
>> This still all seems really really ugly.
>
> I do agree that it's ugly. That many new system calls with new prototypes
> and new glibc support is just nasty.
>
> So I don't think this is viable.
>
>> Is there anything wrong with throwing all these extra cases out and
>> replacing the entire lot with
>>
>> prctl(PR_SPARSEFD, 1);
>>
>> to turn on sparse fd allocation for a process ?
>
> Yes. We really don't want to set global state that affects any random
> library thing that runs after it.
>
> HOWEVER.
>
> I think we could introduce a *single* new system call, which does
> basically a "run the specified system call with the following flags".
>
> The flags would literally be local to that *one* system call, and one of
> the flags could be the semantics for FD allocation.
>
> [ There are a few other cases where such an indirect system call might be
> interesting: temporarily unmasking a signal for just the duration of a
> single system call is the reason for things like 'pselect()' and
> 'sigtimedwait()', and similarly the 'access()' system call is basically
> a "temporarily run with my real UID, rather than the effective UID
> thing, and quite frankly, it might be perfectly valid to want to do an
> 'open()' with that rule too, because "access()+open()" is racy! ]
>
> So maybe the proper solution to this mess is *not* to add fifteen new
> system calls, but to add *one*, which takes a "flags" value to set certain
> things:
>
> - FD_NONSEQ: "allocate any new fd's nonsequentially"
> - FD_CLOEXEC: "allocate any new fd's as close-on-exec"
>
> Rationale: allow people to open any fd with the flags set a certain
> way, regardless of the system call.
>
> - LOOKUP_REALUID/GID: "make the fsuid/fsgid temporarily be my _real_
> uid/gid for this single system call"
>
> Rationale: avoid the inevitable races that the fundamentally broken
> "access()" system call has!
>
> - LOOKUP_NOFOLLOW: "do not follow any symlink at the end of the path"
> LOOKUP_NOATIME: "don't update atime"
>
> Rationale: "open()" already has O_NOFOLLOW/O_NOATIME, and "stat()" has
> "lstat()", but a lot of other path-handlign system calls cannot do the
> same thing.
>
> - LOOKUP_NOSYMLINKS: "do not allow any symlink traversal at *all*"
> LOOKUP_NODOTDOT: "don't traverse a .. upwards"
> LOOKUP_NOMOUNT: "don't traverse a mount point"
>
> Rationale: for security-conscious things, quite often it's not the
> _last_ symlink you want to avoid, it's any symlinks at all, and
> sometimes it's things like guaranteeing that you stay in a certain
> directory structure - which means not going outside with ".." or some
> magic mount-point.
>
> People currently literally end up traversing things one path component
> at a time, doing a "lstat()" on it, and checking. Even if 99%
> of the time you probably don't actually ever hit the problem case.
> (Eg Apache at some point used to do something like this if you asked
> for security, I'm not sure if it still does).
>
> - signal mask for temporarily blocking/unblocking during a single system
> call.
>
> - something else? The above are things that I know I _personally_ have
> occasionally cursed not having had.
>
> What do people think about that kind of approach? It has the advantage
> that it does *not* involve multiple kernel entries (just a single entry to
> a small wrapper that sets some process state temporarily), and that it
> doesn't have any sticky state that might confuse a library (or a signal
> handler: even if you end up doing "prctrl(ON) ; syscall(); prctrl(OFF)", a
> signal handler that happens in between the prctrl's would see unexpected
> behaviour).
>
> It has the disadvantage that it would need some per-architecture setup to
> load the actual real arguments from memory: the system call would probably
> look something like
>
> syscall_indirect(unsigned long flags, sigset_t *,
> int syscall, unsigned long args[6]);
>
> and the rule would be that it would just load the six system call
> registers from that "args[]" array. Always load the full six registers, to
> make it simpler and faster, and not having any confusion or ever needing
> any wrappers that depend on the number of system calls.
This is a nice idea, but 32/64 compat code is going to hate it :)
syscall_indirect() would be writen in assembly for each arch, since there is
no generic syscall table. Thats really a lot of work, especially if we want to
mess with signal mask, umask ...
2007/6/7, Linus Torvalds <[email protected]>:
> syscall_indirect(unsigned long flags, sigset_t *,
> int syscall, unsigned long args[6]);
Would that also be the user interface or all the wrappers would
still be implemented by the glibc?
Also, it sounds like a per-thread prctl, fun with syslets though ... ;-)
--
Guillaume
2007/6/7, Guillaume Chazarain <[email protected]>:
> Also, it sounds like a per-thread prctl, fun with syslets though ... ;-)
Oops, even if the user takes care to reset the prctl after his code (the
library problem), there are still signals that can run with an unexpected
prctl. Sorry for the noise.
--
Guillaume
On Thu, 7 Jun 2007, Eric Dumazet wrote:
>
> This is a nice idea, but 32/64 compat code is going to hate it :)
It should be fairly simple for 32/64-bit compat code too.
The compat code should just call the compat system call
> syscall_indirect() would be writen in assembly for each arch, since there is
> no generic syscall table. Thats really a lot of work, especially if we want to
> mess with signal mask, umask ...
No no no. That would be horribly idiotic.
The thing should be 99% generic code, ie we would have
syscall_indirect(..)
{
long retval;
.. set up signals/flags ..
retval = arch_syscall_indirect(syscall, args);
.. unsetup signals/flags ..
return retval;
}
compat_syscall_indirect(..)
{
int retval;
.. set up signals/flags ..
retval = compat_arch_syscall_indirect(syscall, args);
.. unsetup signals/flags ..
return retval;
}
and the *only* thing that each architecture would need to do is that
(trivial) arch_syscall_indirect() thing (and the compat version, if
applicable). And those literally should be generally pretty damn trivial.
The only _subtle_ issue is any system call that actually uses "pt_regs".
So we'd have to disallow exec/fork/vfork/clone/. They take magic pt_regs
pointers etc. On x86, for example, the following system calls should *not*
be things you can do this with:
asmlinkage int sys_fork(struct pt_regs regs)
asmlinkage int sys_clone(struct pt_regs regs)
asmlinkage int sys_vfork(struct pt_regs regs)
asmlinkage int sys_execve(struct pt_regs regs)
asmlinkage int sys_vm86old(struct pt_regs regs)
asmlinkage int sys_vm86(struct pt_regs regs)
asmlinkage long sys_iopl(unsigned long unused)
because those either take pt_regs, or make pt_regs out of their arguments
(that "unused" is used to do:
volatile struct pt_regs * regs = (struct pt_regs *) &unused;
so there is *some* care needed, but other than taking care of this, the
implementation on x86 should really be totally trivial.
Linus
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Linus Torvalds wrote:
> syscall_indirect(unsigned long flags, sigset_t *,
> int syscall, unsigned long args[6]);
It's a kernel detail, so if this is how you want it, let it be. I can
certainly live with this. The only comment about the comments I would
have are
- - think hard about the additional things you want to set
- - make it a bit expendable so that we don't run out of bits. Maybe
we also have to pass some additional integer values.
At userlevel this is of course something we cannot expose. Here we will
need new interfaces like acceptNG, socketNG, etc which themselves can
call this syscall.
This extension reminds me of something we've talked about several times
in the past (I know that I at least discussed this with Ingo). You
basically implement little scriptlets. In your call they are simply
current->flags_arg = flags;
current->sigmask_arg = sigmask;
r = syscall(nr, ...);
current->flags_arg = 0;
current->sigmask_arg = NULL;
return r;
This could be made more generic in that you can allow the script to do
more. The threadlets etc are not too different from this.
If all this is unwanted then go with what you proposed. Otherwise think
about a more generic approach.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGaHkj2ijCOnn/RHQRAhWoAJ4loMzrYJQDCU4e6jdOfjL4LG/TsACguhUL
ldVvp0PWIazV2iAWraCc+IU=
=VGDR
-----END PGP SIGNATURE-----
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Eric Dumazet wrote:
> I am afraid randomization wont really work if /sbin/init or /bin/bash
> for example uses one (or more) unseq fd :
> The 'random base' will be propagated at fork()/exec() time ?
The base certainly should be reset o fork. Yes, this might expand the
region in which descriptors are allocated due to inherited descriptors.
But I consider this the application's problem and it usually is not
really an issue. Apps have to explicitly request using the new
descriptors and they can use CLOEXEC (CLOFORK) correctly.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGaGgQ2ijCOnn/RHQRAutyAKChp9KT9NVfUTD76GRhyY62GUTtaACglgxi
N/4+vmcUPEYtLmUTYKVjMvg=
=otvw
-----END PGP SIGNATURE-----
On Thu, 7 Jun 2007, Linus Torvalds wrote:
> On Thu, 7 Jun 2007, Eric Dumazet wrote:
> >
> > This is a nice idea, but 32/64 compat code is going to hate it :)
>
> It should be fairly simple for 32/64-bit compat code too.
>
> The compat code should just call the compat system call
>
> > syscall_indirect() would be writen in assembly for each arch, since there is
> > no generic syscall table. Thats really a lot of work, especially if we want to
> > mess with signal mask, umask ...
>
> No no no. That would be horribly idiotic.
>
> The thing should be 99% generic code, ie we would have
>
> syscall_indirect(..)
> {
> long retval;
>
> .. set up signals/flags ..
> retval = arch_syscall_indirect(syscall, args);
> .. unsetup signals/flags ..
> return retval;
> }
>
> compat_syscall_indirect(..)
> {
> int retval;
>
> .. set up signals/flags ..
> retval = compat_arch_syscall_indirect(syscall, args);
> .. unsetup signals/flags ..
> return retval;
> }
>
> and the *only* thing that each architecture would need to do is that
> (trivial) arch_syscall_indirect() thing (and the compat version, if
> applicable). And those literally should be generally pretty damn trivial.
Ok, I like this better honestly. I really did not want to win the Oscar
for the 2007 Syscall Spammer of the year :)
- Davide
On Thu, 7 Jun 2007, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Eric Dumazet wrote:
> > I am afraid randomization wont really work if /sbin/init or /bin/bash
> > for example uses one (or more) unseq fd :
> > The 'random base' will be propagated at fork()/exec() time ?
>
> The base certainly should be reset o fork. Yes, this might expand the
> region in which descriptors are allocated due to inherited descriptors.
> But I consider this the application's problem and it usually is not
> really an issue. Apps have to explicitly request using the new
> descriptors and they can use CLOEXEC (CLOFORK) correctly.
This is kinda fishy. We'll end up with very large map with lots of
unused space in them.
What we can sanily do, is re-random the base if no fds are in there (of
course CLOFORK and CLOEXEC do not count).
- Davide
On Thu, 7 Jun 2007, Eric Dumazet wrote:
> Davide Libenzi a ?crit :
> > On Thu, 7 Jun 2007, Eric Dumazet wrote:
> >
> > > Davide, are you sure we want FIFO for non sequential allocations ?
> > >
> > > This tends to use all the fmap slots, and not very cache friendly
> > > if an app does a lot of [open(),...,close()] things. We already got a perf
> > > drop because of RCUification of file freeing (FIFO mode instead of LIFO
> > > given by kmalloc()/kfree())
> > >
> > > If the idea behind this FIFO was security (ie not easy for an app to
> > > predict next glibc file handle), we/glibc might use yet another
> > > FD_SECUREMODE flag, wich ORed with O_NONSEQFD would ask to fdmap_newfd()
> > > to take the tail of fmap->slist, not head.
> >
> > Uli, would it be OK to rely only on base randomization and use a LIFO
> > instead? We have base randomization, plus LIFO does not mean strictly
> > sequential like legacy allocator, just more compatc and cache friendly.
> >
>
> I am afraid randomization wont really work if /sbin/init or /bin/bash for
> example uses one (or more) unseq fd :
> The 'random base' will be propagated at fork()/exec() time ?
As I said to Uli, we can't move the base while fds are in there. We can
re-randomize it when it's empty. This can also be done (it's a trivial and
fast operation - just set fmap->base to a new value) even every time the
fd count on the map touches zero.
- Davide
On Thu, 7 Jun 2007, Linus Torvalds wrote:
> It has the disadvantage that it would need some per-architecture setup to
> load the actual real arguments from memory: the system call would probably
> look something like
>
> syscall_indirect(unsigned long flags, sigset_t *,
> int syscall, unsigned long args[6]);
>
> and the rule would be that it would just load the six system call
> registers from that "args[]" array. Always load the full six registers, to
> make it simpler and faster, and not having any confusion or ever needing
> any wrappers that depend on the number of system calls.
We'd still need sys_nonseqfd() though, to move/dup legacy fds into the
non-sequential area.
- Davide
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Davide Libenzi wrote:
> What we can sanily do, is re-random the base if no fds are in there (of
> course CLOFORK and CLOEXEC do not count).
With the last comment you mean "count after CLOFORK and CLOEXEC", right?
So the re-basing would be done in two places: after fork and after execve?
That would be enough. Programs should clean after themselves.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGaICk2ijCOnn/RHQRAqq5AJwIDmKdsL71ecQVd6PoDI2oOL/KcwCguj2b
TFqDhpJEDqFx1ypdHITuywA=
=Cbfg
-----END PGP SIGNATURE-----
On Thu, 7 Jun 2007, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Davide Libenzi wrote:
> > What we can sanily do, is re-random the base if no fds are in there (of
> > course CLOFORK and CLOEXEC do not count).
>
> With the last comment you mean "count after CLOFORK and CLOEXEC", right?
> So the re-basing would be done in two places: after fork and after execve?
Yes. Files with the CLOFORK and CLOEXEC flag do not count for fork and
exec copies.
I was also planning on doing it in __put_unused_fd(), every time
fmap->count goes to zero. But get_random_int() is not as cheap as I
thought. If we use a cheaper (although less secure) function to mix pid &
jiffies, we could do it even in there.
- Davide
On Thu, 7 Jun 2007, Davide Libenzi wrote:
>
> We'd still need sys_nonseqfd() though, to move/dup legacy fds into the
> non-sequential area.
Umm. No we don't. Because it's no more than
indirect_syscall(dup, FD_NONSEQ)
isn't it?
Linus
On Thu, 7 Jun 2007, Linus Torvalds wrote:
>
>
> On Thu, 7 Jun 2007, Davide Libenzi wrote:
> >
> > We'd still need sys_nonseqfd() though, to move/dup legacy fds into the
> > non-sequential area.
>
> Umm. No we don't. Because it's no more than
>
> indirect_syscall(dup, FD_NONSEQ)
>
> isn't it?
Hmm, ok. It need some changes since sys_dup() and F_DUPFD uses common code
at the moment, but it'd ok.
Basically, everything that calls get_unused_fd() can get the magic
indirect_syscall() settings. I was just planning to localize the
sequential/non-sequential behaviour just in there.
The sys_dup(), sys_dup2() and F_DUPFD have some custom code, although
sys_dup() should really use get_unused_fd() in any way.
- Davide
On Thu, Jun 07, 2007 at 01:10:23PM -0700, Linus Torvalds wrote:
> What do people think about that kind of approach? It has the advantage
> that it does *not* involve multiple kernel entries (just a single entry to
> a small wrapper that sets some process state temporarily), and that it
> doesn't have any sticky state that might confuse a library (or a signal
> handler: even if you end up doing "prctrl(ON) ; syscall(); prctrl(OFF)", a
> signal handler that happens in between the prctrl's would see unexpected
> behaviour).
First, how does this work in-kernel? Does it set a flag in the thread
struct that magically gets used in the actual syscall? Or do we pass
flags down to the sys_foo() function in some manner?
If the former, there is potential for interaction with asynchronous
code running on behalf of the thread, no? Especially if we ever adopt
one of the syslet schemes.
Second, I think we're likely to run out of flag bits really quickly as
this is a good dumping spot for patching up our many slightly
brain-damaged APIs (be they POSIX or Linux-specific).
Third, can I do sys_indirect(sys_indirect(foo, args), flags1),
flags2)?
Fourth, can we do sys_indirect(foo, args, flags | ASYNC) and get most
of the way to merging this with the syslet proposal?
--
Mathematics is the supreme nostalgia of our time.
On Thu, 7 Jun 2007, Matt Mackall wrote:
>
> First, how does this work in-kernel? Does it set a flag in the thread
> struct that magically gets used in the actual syscall? Or do we pass
> flags down to the sys_foo() function in some manner?
Set a flag in the thread-struct.
In fact, that's how "access()" already works.
And yes, syslets would need to have their own thread-structs and/or
save/restore the thing.
> Second, I think we're likely to run out of flag bits really quickly as
> this is a good dumping spot for patching up our many slightly
> brain-damaged APIs (be they POSIX or Linux-specific).
Well, I do suspect that we'd need to basically make the flags be
per-system call. With "common features" (ie a system call that doesn't
return a file descriptor would re-use the bit for "nonlinear-fd" for
something else, while a system call that doesn't do path lookup would use
all the LOOKUP_xyzzy bits for something else).
I agree that if we kept flags _totally_ separate, we'd run out of them
really quickly. But I don't think we want to ever be in the situation
where _one_ set of system calls would need that many flags. If we get
there, we'd really be much better off with a new system call!
> Third, can I do sys_indirect(sys_indirect(foo, args), flags1), flags2)?
I'd say no.
> Fourth, can we do sys_indirect(foo, args, flags | ASYNC) and get most
> of the way to merging this with the syslet proposal?
I think that may well be a really good idea.
Linus
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Andrew Morton wrote:
> Absolutely. Let's get them into their final form now, rather than letting
> some intermediate interface escape out into a major kernel release.
Even if Linus' redirection is adopted, these interfaces should still be
fixed up. No need to rely on hacks if we can still fix up the interfaces.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGaMU42ijCOnn/RHQRAm0MAJ9pprTVUD/2YvDsL6CPL21WvZSkEACeN3r8
TT8a5FyF2rCct+8gBKyWF/s=
=fs4f
-----END PGP SIGNATURE-----
Davide Libenzi a ?crit :
> On Thu, 7 Jun 2007, Eric Dumazet wrote:
>> I am afraid randomization wont really work if /sbin/init or /bin/bash for
>> example uses one (or more) unseq fd :
>> The 'random base' will be propagated at fork()/exec() time ?
>
> As I said to Uli, we can't move the base while fds are in there. We can
> re-randomize it when it's empty. This can also be done (it's a trivial and
> fast operation - just set fmap->base to a new value) even every time the
> fd count on the map touches zero.
>
Hum, I think it would be better to free fmap if it's empty, instead of change
fmap->base. (Only in fork() after removal of O_CLOFORK file handles, and in
exec() after removal of O_CLOEXEC file handles)
On Thu, 7 Jun 2007, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Andrew Morton wrote:
> > Absolutely. Let's get them into their final form now, rather than letting
> > some intermediate interface escape out into a major kernel release.
>
> Even if Linus' redirection is adopted, these interfaces should still be
> fixed up. No need to rely on hacks if we can still fix up the interfaces.
Indeed. I still think that adding O_NONSEQFD is a good idea, independently
on the other interfaces.
- Davide
On Fri, 8 Jun 2007, Eric Dumazet wrote:
> Davide Libenzi a ?crit :
> > On Thu, 7 Jun 2007, Eric Dumazet wrote:
> > > I am afraid randomization wont really work if /sbin/init or /bin/bash for
> > > example uses one (or more) unseq fd :
> > > The 'random base' will be propagated at fork()/exec() time ?
> >
> > As I said to Uli, we can't move the base while fds are in there. We can
> > re-randomize it when it's empty. This can also be done (it's a trivial and
> > fast operation - just set fmap->base to a new value) even every time the fd
> > count on the map touches zero.
> >
>
> Hum, I think it would be better to free fmap if it's empty, instead of change
> fmap->base. (Only in fork() after removal of O_CLOFORK file handles, and in
> exec() after removal of O_CLOEXEC file handles)
That can be done too. When it'll be re-created will be randomized anyway.
I'll probably be doing it everytime fmap->count touches zero in
__put_unused_fd(), so even if the map is not empty at fork and/or exec
time, the program have other chances of randomize in the middle of its
lifetime.
- Davide
On Thu, Jun 07, 2007 at 03:40:14PM -0700, Davide Libenzi wrote:
> Yes. Files with the CLOFORK and CLOEXEC flag do not count for fork and
> exec copies.
> I was also planning on doing it in __put_unused_fd(), every time
> fmap->count goes to zero. But get_random_int() is not as cheap as I
> thought. If we use a cheaper (although less secure) function to mix pid &
> jiffies, we could do it even in there.
Um, how cheap do you need it? get_random_int() is actually pretty
cheep, since it was designed to be usable by the networking stack for
sequence numbers for TCP packets; and it's not like sys_close() or
sys_open() is a majorly critical path, is it? If the concern is
increasing the potential hold time, I suppose you could have the
exactly two callers of __put_unused_fd() (sys_close() and
put_unused_fd()) call get_random_int() before grabbing the
current->files->file_lock spinlock,
- Ted
> Um, how cheap do you need it? get_random_int() is actually pretty
> cheep, since it was designed to be usable by the networking stack for
> sequence numbers for TCP packets; and it's not like sys_close() or
> sys_open() is a majorly critical path, is it? If the concern is
At this point wouldn't it be cheaper to allocate file handles using a
different algorithm than firing up the RNG - say like in the POSIX
fashion ...
It seems every purpose of this fd stuff is now a failure when
compared with existing behaviours.
On Thu, Jun 07, 2007 at 01:10:23PM -0700, Linus Torvalds wrote:
> HOWEVER.
>
> I think we could introduce a *single* new system call, which does
> basically a "run the specified system call with the following flags".
As long as we are considering indirecting system calls, how about
reviving your proposal to run a system call in a different address
space:
http://www.ussg.iu.edu/hypermail/linux/kernel/0212.3/0502.html
UML could make good use of the ability to remotely manipulate address
spaces. I never liked any of the proposed interfaces very much,
including mm_indirect. However, if my tastebuds are defective, and
indirecting system calls is the way to solve a family of problems,
then we should look at fitting in mm_indirect.
> It has the disadvantage that it would need some per-architecture setup to
> load the actual real arguments from memory: the system call would probably
> look something like
>
> syscall_indirect(unsigned long flags, sigset_t *,
> int syscall, unsigned long args[6]);
mm_indirect would need a file descriptor to the other address space,
which we would presumably get from a new get_mm() system call.
Jeff
--
Work email - jdike at linux dot intel dot com
On Fri, 8 Jun 2007, Theodore Tso wrote:
> On Thu, Jun 07, 2007 at 03:40:14PM -0700, Davide Libenzi wrote:
> > Yes. Files with the CLOFORK and CLOEXEC flag do not count for fork and
> > exec copies.
> > I was also planning on doing it in __put_unused_fd(), every time
> > fmap->count goes to zero. But get_random_int() is not as cheap as I
> > thought. If we use a cheaper (although less secure) function to mix pid &
> > jiffies, we could do it even in there.
>
> Um, how cheap do you need it? get_random_int() is actually pretty
> cheep, since it was designed to be usable by the networking stack for
> sequence numbers for TCP packets; and it's not like sys_close() or
> sys_open() is a majorly critical path, is it? If the concern is
> increasing the potential hold time, I suppose you could have the
> exactly two callers of __put_unused_fd() (sys_close() and
> put_unused_fd()) call get_random_int() before grabbing the
> current->files->file_lock spinlock,
I'm actually using get_random_int() in the slow path (fmap creation time).
It does a few things get_random_int(), and one of those is an MD4 transform.
This does not need to be super secure (the Unix allocator has been
exactly predicatble for years), so maybe a cheaper combination of the
previous base (generated with get_random_int) together with jiffies and
pid is enough. I really would not want to put something like an MD4
transform in that path.
- Davide
On Fri, 8 Jun 2007, Alan Cox wrote:
> > Um, how cheap do you need it? get_random_int() is actually pretty
> > cheep, since it was designed to be usable by the networking stack for
> > sequence numbers for TCP packets; and it's not like sys_close() or
> > sys_open() is a majorly critical path, is it? If the concern is
>
> At this point wouldn't it be cheaper to allocate file handles using a
> different algorithm than firing up the RNG - say like in the POSIX
> fashion ...
The only reason we use a floating base, is because Uli preferred to have
non-exactly predictable fd allocations. There no reason of re-doing the
same POSIX mistake all over again:
Errare humanum est, perseverare autem diabolicum
- Davide
> The only reason we use a floating base, is because Uli preferred to have
> non-exactly predictable fd allocations. There no reason of re-doing the
> same POSIX mistake all over again:
Why does he want an unpredictable algorithm - to make debugging hell
because apps crash only for some patterns and reduce reproducability of
debugging of userspace ?
> Errare humanum est, perseverare autem diabolicum
Speaking latin didn't make the romans too wise either to look at their
history.
Alan
On Fri, 8 Jun 2007, Theodore Tso wrote:
>
> ... and it's not like sys_close() or sys_open() is a majorly critical
> path, is it?
open/close/stat/lstat are _the_ most important system calls, so yes, it's
a majorly critical path. MUCH more so than opening a new TCP connection.
You _may_ open a few hundred TCP connections a second (yeah, yeah, don't
tell me about unrealistic benchmarks that do more), but that's on a server
with good bandwidth etc. open/closes easily happen tens of _thousands_ of
times a second. We're talking sub-microsecond system calls.
Whether get_random_int() is noticeable or not, I dunno. But that path is a
hell of a lot more performance-sensitive than pretty much anything else.
Linus
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Alan Cox wrote:
> Why does he want an unpredictable algorithm
To avoid exactly the kind of problem we have now in future: programs
relying on specific patterns.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD4DBQFGaaNR2ijCOnn/RHQRAtGQAJUWlqUTHuxU7TMi+0tiEldqQPi0AJwIVOAx
lG6JQ9o6YNYVtCFOLyZLsg==
=0Vp+
-----END PGP SIGNATURE-----
On Fri, Jun 08, 2007 at 11:43:29AM -0700, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Alan Cox wrote:
> > Why does he want an unpredictable algorithm
>
> To avoid exactly the kind of problem we have now in future: programs
> relying on specific patterns.
You are making debugging no end of fun...
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Al Viro wrote:
> You are making debugging no end of fun...
We are talking about file descriptors here. If you're using file
descriptors as anything other than tokens you'll find out soon enough
that your code is broken. The new type of file descriptors cannot be
used as indeces and the randomization makes sure that no program by some
fluke happens to work.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGaaZ02ijCOnn/RHQRAkN0AJ9ZDdaoBDKbKMyDfZI7pa3E7w3pXwCeLlbB
9kRy/E09QvPhMOfjeVYPNio=
=Cjh7
-----END PGP SIGNATURE-----
On Fri, 8 Jun 2007, Ulrich Drepper wrote:
>
> We are talking about file descriptors here. If you're using file
> descriptors as anything other than tokens you'll find out soon enough
> that your code is broken. The new type of file descriptors cannot be
> used as indeces and the randomization makes sure that no program by some
> fluke happens to work.
No, Uli.
You need things to be *repeatable* for debugging. No ifs, buts, or maybes
about it.
I think truly random is a bad idea, or at the very least needs some way to
disable it.
Linus
On Fri, 8 Jun 2007, Linus Torvalds wrote:
> On Fri, 8 Jun 2007, Ulrich Drepper wrote:
> >
> > We are talking about file descriptors here. If you're using file
> > descriptors as anything other than tokens you'll find out soon enough
> > that your code is broken. The new type of file descriptors cannot be
> > used as indeces and the randomization makes sure that no program by some
> > fluke happens to work.
>
> No, Uli.
>
> You need things to be *repeatable* for debugging. No ifs, buts, or maybes
> about it.
It all depends on how you use the file descriptor. If you see a file
descriptor as an opaque handle (like it should be, really), that is simply
passed to the OS to use services exposed by the handle, you will be fine
independently from the values handed out by the OS. It was for the exactly
this guarantee that created the problems, with ppl relying on it for
indexing table, closing all files < NR_FILE and so on.
- Davide
On Fri, 8 Jun 2007, Alan Cox wrote:
> > Errare humanum est, perseverare autem diabolicum
>
> Speaking latin didn't make the romans too wise either to look at their
> history.
If you really look at history, they were kinda wise.
- Davide
On Fri, 08 Jun 2007 11:43:29 -0700
Ulrich Drepper <[email protected]> wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Alan Cox wrote:
> > Why does he want an unpredictable algorithm
>
> To avoid exactly the kind of problem we have now in future: programs
> relying on specific patterns.
Which you seem to think is a bad thing, yet is actually a very good thing
because it means that crashes are repeatable and problems are debuggable
from end user reports.
Trying to randomize filehandles for the general case is not a productive
activity. If you want to debug cases write yourself a glibc wrapper that
does annoying things but don't inflict it on people who actually want to
build working, testable, debuggable systems (ie most of us)
Alan
> We are talking about file descriptors here. If you're using file
> descriptors as anything other than tokens you'll find out soon enough
> that your code is broken. The new type of file descriptors cannot be
> used as indeces and the randomization makes sure that no program by some
> fluke happens to work.
If you are building a stable system and you test it and it passes
extensive testing you don't care if it works because of a specific
pattern of accesses since your testing shows that it continues to work.
If you randomize these it becomes fundamentally untestable. There is a
role for this in fuzz testing but there is not a role for it in normal
production behaviour.
Please consign the whole funky file handle farce to the bucket labelled
"dumb ideas". I know its a bit full but there is room in there for more -
unlike the kernel.
Alan
On Fri, 8 Jun 2007, Alan Cox wrote:
> On Fri, 08 Jun 2007 11:43:29 -0700
> Ulrich Drepper <[email protected]> wrote:
>
> > -----BEGIN PGP SIGNED MESSAGE-----
> > Hash: SHA1
> >
> > Alan Cox wrote:
> > > Why does he want an unpredictable algorithm
> >
> > To avoid exactly the kind of problem we have now in future: programs
> > relying on specific patterns.
>
> Which you seem to think is a bad thing, yet is actually a very good thing
> because it means that crashes are repeatable and problems are debuggable
> from end user reports.
>
> Trying to randomize filehandles for the general case is not a productive
> activity. If you want to debug cases write yourself a glibc wrapper that
> does annoying things but don't inflict it on people who actually want to
> build working, testable, debuggable systems (ie most of us)
So, what do you plan to do? Those handle won't be zero-based. Your
"working" system I immagine will do:
bleeh[handle - BASE].duh = ...;
How nice for a working system. If you *store* the handle returned by the
OS, and you *use* the handle to call for OS services, you will be fine
independently from the value handed out by the OS.
- Davide
> So, what do you plan to do? Those handle won't be zero-based. Your
> "working" system I immagine will do:
>
> bleeh[handle - BASE].duh = ...;
>
> How nice for a working system. If you *store* the handle returned by the
> OS, and you *use* the handle to call for OS services, you will be fine
> independently from the value handed out by the OS.
Well there are two ways I'd do this
#1: Throw the whole thing away and accept its not a good idea anyway
#2: If I was really going this way and I wanted to use it for serious
tricks for high performance I/O then I'd provide the handle from
userspace so that the strategy for allocation is controlled by the caller
who is the only one who can make the smart decisions
On Fri, 8 Jun 2007, Alan Cox wrote:
> > So, what do you plan to do? Those handle won't be zero-based. Your
> > "working" system I immagine will do:
> >
> > bleeh[handle - BASE].duh = ...;
> >
> > How nice for a working system. If you *store* the handle returned by the
> > OS, and you *use* the handle to call for OS services, you will be fine
> > independently from the value handed out by the OS.
>
> Well there are two ways I'd do this
>
> #1: Throw the whole thing away and accept its not a good idea anyway
Unfortunately (exactly because of the same guarantees you're asking for
those handles), in order for userspace libraries to reliably internally
use fds to interact with the kernel, you need another kind of allocation
strategy.
> #2: If I was really going this way and I wanted to use it for serious
> tricks for high performance I/O then I'd provide the handle from
> userspace so that the strategy for allocation is controlled by the caller
> who is the only one who can make the smart decisions
It does not work. What if the main application, library A and library B
wants to implement their own allocation strategy?
- Davide
> > #1: Throw the whole thing away and accept its not a good idea anyway
>
> Unfortunately (exactly because of the same guarantees you're asking for
> those handles), in order for userspace libraries to reliably internally
> use fds to interact with the kernel, you need another kind of allocation
> strategy.
Unproven and dubious at best as a claim.
> > #2: If I was really going this way and I wanted to use it for serious
> > tricks for high performance I/O then I'd provide the handle from
> > userspace so that the strategy for allocation is controlled by the caller
> > who is the only one who can make the smart decisions
>
> It does not work. What if the main application, library A and library B
> wants to implement their own allocation strategy?
Its called "discipline". I would suggest that libc contains a default
allocator. You might also want to assign library and application ranges
for clarity.
Alan
On Fri, 8 Jun 2007, Alan Cox wrote:
> > > #1: Throw the whole thing away and accept its not a good idea anyway
> >
> > Unfortunately (exactly because of the same guarantees you're asking for
> > those handles), in order for userspace libraries to reliably internally
> > use fds to interact with the kernel, you need another kind of allocation
> > strategy.
>
> Unproven and dubious at best as a claim.
I really don't mean to be rude and pointing you to read the archives, but
the proof and the reason why claims are valid is inside there.
> > > #2: If I was really going this way and I wanted to use it for serious
> > > tricks for high performance I/O then I'd provide the handle from
> > > userspace so that the strategy for allocation is controlled by the caller
> > > who is the only one who can make the smart decisions
> >
> > It does not work. What if the main application, library A and library B
> > wants to implement their own allocation strategy?
>
> Its called "discipline". I would suggest that libc contains a default
> allocator. You might also want to assign library and application ranges
> for clarity.
That is really nice solution. Each library has to have each own allocator.
Then we'll have what, a committee that assigns fd ranges?
Ranges cannot be implemented with the current fdtable allocator, because
they'll be way far apart and they'll waste space. Multiple separate ranges
will require multiple fdtable structures (one for each range/allocator).
Instead of a two-way switch (legacy and non-sequential) you will have
multiple choices to select. This multiple choices will have to be
replicated all around the code that access directly the fdtables. I did
the fdmap consolidation patch, and I can tell you there are quite a few
places that access fdtables directly.
- Davide
> > Unproven and dubious at best as a claim.
>
> I really don't mean to be rude and pointing you to read the archives, but
> the proof and the reason why claims are valid is inside there.
I've read the archive. I'm totally unconvinced by any of the fd
allocation policy stuff. There are some good arguments about O_CLOEXEC
and threading but not about fd allocation.
> > > It does not work. What if the main application, library A and library B
> > > wants to implement their own allocation strategy?
> >
> > Its called "discipline". I would suggest that libc contains a default
> > allocator. You might also want to assign library and application ranges
> > for clarity.
>
> That is really nice solution. Each library has to have each own allocator.
Are you being deliberately stupid ?
I suggested *libc* contains a default allocator
> Then we'll have what, a committee that assigns fd ranges?
Currently the fd ranges are assigned by a committee called POSIX based on
Unix practice.
> replicated all around the code that access directly the fdtables. I did
> the fdmap consolidation patch, and I can tell you there are quite a few
> places that access fdtables directly.
This is true, but if you are worried about complexity we get back to the
original posix allocator which packs them in tight and produces a most
excellent spread in the general case (whacko apps like bash aside)
On Fri, 8 Jun 2007, Alan Cox wrote:
> > > > It does not work. What if the main application, library A and library B
> > > > wants to implement their own allocation strategy?
> > >
> > > Its called "discipline". I would suggest that libc contains a default
> > > allocator. You might also want to assign library and application ranges
> > > for clarity.
> >
> > That is really nice solution. Each library has to have each own allocator.
>
> Are you being deliberately stupid ?
>
> I suggested *libc* contains a default allocator
"You might also want to assign library and application ranges for clarity."
So let's see. We have the legacy one (1st fdtable), the non-legacy app one
(2nd fdtable) and one for the library (3rd fdtable). And which library?
Each one his own (4th, 5th ... fdtable)? ...
> > replicated all around the code that access directly the fdtables. I did
> > the fdmap consolidation patch, and I can tell you there are quite a few
> > places that access fdtables directly.
>
> This is true, but if you are worried about complexity we get back to the
> original posix allocator which packs them in tight and produces a most
> excellent spread in the general case (whacko apps like bash aside)
... complexity does not come from the allocator, but from the fact that
direct access to the fdtable is done all over the kernel. Code that
assumes there's only *one* fdtable.
- Davide
On Fri, 8 Jun 2007, Davide Libenzi wrote:
>
> On Fri, 8 Jun 2007, Linus Torvalds wrote:
> >
> > You need things to be *repeatable* for debugging. No ifs, buts, or maybes
> > about it.
>
> It all depends on how you use the file descriptor.
Read what I wrote. "for debugging".
If your code is bug-free, and does what you intend it to do, everything is
fine. But you wouldn't be doing debugging then, would you?
For debugging, it does _not_ depend on "how you use the file descriptor".
The whole _point_ is that something does something wrong. Maybe you
_intended_ to use the file descriptor some way, and the bug was that you
didn't.
Linus
On Fri, 8 Jun 2007, Linus Torvalds wrote:
> On Fri, 8 Jun 2007, Davide Libenzi wrote:
> >
> > On Fri, 8 Jun 2007, Linus Torvalds wrote:
> > >
> > > You need things to be *repeatable* for debugging. No ifs, buts, or maybes
> > > about it.
> >
> > It all depends on how you use the file descriptor.
>
> Read what I wrote. "for debugging".
>
> If your code is bug-free, and does what you intend it to do, everything is
> fine. But you wouldn't be doing debugging then, would you?
>
> For debugging, it does _not_ depend on "how you use the file descriptor".
> The whole _point_ is that something does something wrong. Maybe you
> _intended_ to use the file descriptor some way, and the bug was that you
> didn't.
Ok, so what's your idea? Have another POSIX-like allocator somewhere up
there in the fd space, with a fixed and include-defined base?
Or just drop all this altogether?
- Davide
On Fri, Jun 08, 2007 at 05:03:57PM -0700, Linus Torvalds wrote:
>
>
> On Fri, 8 Jun 2007, Davide Libenzi wrote:
> >
> > On Fri, 8 Jun 2007, Linus Torvalds wrote:
> > >
> > > You need things to be *repeatable* for debugging. No ifs, buts, or maybes
> > > about it.
> >
> > It all depends on how you use the file descriptor.
>
> Read what I wrote. "for debugging".
>
> If your code is bug-free, and does what you intend it to do, everything is
> fine. But you wouldn't be doing debugging then, would you?
>
> For debugging, it does _not_ depend on "how you use the file descriptor".
> The whole _point_ is that something does something wrong. Maybe you
> _intended_ to use the file descriptor some way, and the bug was that you
> didn't.
Exactly. Put it another way, randomizer is a stress-tester. Which is
a damn useful debugging tool in its own right. *However*, the main thing
about debugging tools is that you need to be able to turn them on and off
individually; then you get a useful information narrowing the diagnosis
down. If you can't do that, you lose.
Folks, the real question is whether we consider the loops blindly
shooting down the file descriptors as a supportable thing. Correction:
whether the code written in that style *and* *correct* *with* *current*
*semantics* is sufficiently numerous and hard to notice that we need that
sort of kludges. Because what Uli's suggesting is certainly that - a kludge
sufficient for glibc internal needs.
Note that
#define NR_FILES <some constant>
for (i = 0; i < NR_FILES; i++)
close(i);
is simply broken. No ifs, no buts, it's broken on the current kernel. So
that kind of stuff needs fixing anyway; do we have enough left to make it
worth preserving?
I don't know the answer; some data would be much appreciated. If the broken
stuff like above outnumbers the valid uses, we probably need think of some
way to catch that kind of crap anyway... Randomizer for open() et.al.
switchable on per-process basis would be a nice tool to catch some of those,
along with instrumenting the kernel to notice massive close() on files that
hadn't been opened, etc.
As long as all we have is a handwaving about widespread and unspecified
code doing this, this and that, but not that...
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Al Viro wrote:
> Exactly. Put it another way, randomizer is a stress-tester.
... and a security mechanism. And as such it is only useful if it is
used. Probably it should be policy-controlled whether you can turn it off.
> Note that
> #define NR_FILES <some constant>
>
> for (i = 0; i < NR_FILES; i++)
> close(i);
You're confusing the problems. This is not the argument for having a
separate file descriptor set. It is the argument to have hidden file
descriptors. Randomization has nothing whatsoever to do with this example.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGagAg2ijCOnn/RHQRAhV+AJ9qT2epTxDWWS++74f+vrV3NucVHACdGkxm
MULoyE+0NY7dSHcB2epKe7w=
=o3+1
-----END PGP SIGNATURE-----
On Fri, Jun 08, 2007 at 06:19:28PM -0700, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Al Viro wrote:
> > Exactly. Put it another way, randomizer is a stress-tester.
>
> ... and a security mechanism. And as such it is only useful if it is
> used. Probably it should be policy-controlled whether you can turn it off.
Any real-world examples of exploitable holes based on that? Real blue-sky
world, not grsec-advocacy "undiscovered null ptr derefs on every corner and
all of the variety fixable by our magic goo" one, please.
> > Note that
> > #define NR_FILES <some constant>
> >
> > for (i = 0; i < NR_FILES; i++)
> > close(i);
>
> You're confusing the problems.
No, I'm not. The entire argument for having a separate set of descriptors
is based on programs behaving in similar fashion, working correctly now but
limiting what libraries can do with opening files for internal needs.
So here's the question: how widespread it really is, considering that "working
correctly" is not a trivial constraint. Again, I'd love to see real data;
if I need handwaving, I know where to find it.
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Al Viro wrote:
> Any real-world examples of exploitable holes based on that?
Return to libc exploit, calling dup2, where some privileged data is
redirected from the normal file descriptor to one of the attackers
choosing. The latter could be an outgoing socket connection which would
result in leaking the data to the outside.
normal code intruder
so = socket()
fd = open ("local-file")
dup2(so, fd);
write (fd, privileged data)
It's just a little function call. If the arguments of dup2() are known
this is not a big issue to construct.
>> You're confusing the problems.
>
> No, I'm not. The entire argument for having a separate set of descriptors
> is based on programs behaving in similar fashion, working correctly now but
> limiting what libraries can do with opening files for internal needs.
It's completely different.
The reason why runtime libraries cannot keep descriptors open unless it
is explicitly part of the API (e.g., opendir) is that POSIX and Unix
forever guarantee that descriptors are allocated sequentially. Linus
already showed a code sequence:
close(0);
.. something else ..
if (open("myfile", O_RDONLY) < 0)
exit(1);
This occurs in the real world and it is guaranteed to work.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGagv72ijCOnn/RHQRAqxTAJwLhjuFT22SegEVXrbevpsnOkDxLQCgwgza
7ZOScxEm2lgMJNjG9UDAdfo=
=fenl
-----END PGP SIGNATURE-----
Davide Libenzi writes:
> The only reason we use a floating base, is because Uli preferred to have
> non-exactly predictable fd allocations. There no reason of re-doing the
> same POSIX mistake all over again:
Why must everything that makes things a bit simpler and more
predictable for application programmers be called a "mistake"?
Paul.
On Jun 09, 2007, at 01:41:42, Paul Mackerras wrote:
> Davide Libenzi writes:
>
>> The only reason we use a floating base, is because Uli preferred
>> to have non-exactly predictable fd allocations. There no reason of
>> re-doing the same POSIX mistake all over again:
>
> Why must everything that makes things a bit simpler and more
> predictable for application programmers be called a "mistake"?
1) Linear FD allocation makes it IMPOSSIBLE for libraries to
reliably use persistent FDs behind an application's back. For
example, this code sequence should almost always be OK: (except when
calling into a library specifically to open a file or socket)
close(0);
some_library_function();
fd = open("some_file", O_RDWR);
/* fd == 0 here */
2) Another common code example which causes problems:
int i;
for (i = 0; i < NR_OPEN; i++)
if (!fd_is_special_to_us(i))
close(i);
Note that this is conceptually buggy, but occurs in several major C
programming books, most of the major shells, and a lot of other
software to boot.
3) In order to allocate new FDs, the kernel has to scan over a
(potentially very large) bitmap. A process with 100,000 fds (not
terribly uncommon) would have 12.5kbyte of FD bitmap and would trash
the cache every time it tried to allocate an FD.
You can't even hope to use persistent library FDs with these
problems, and they even cause problems with servers using lots of
FDs. Introducing another linear FD space will just make this sort of
stuff happen ALL OVER AGAIN.
If you want to make things reproducible, you could have an option
where the "random" algorithm is just pseudo-linearly allocated FDs
(no bitmap search) XORed with a boot-time constant either randomly-
generated or set in the boot args. That way the people interested in
maximum security or stress testing can use completely randomized
numbers and the people who need reproducibility can get that too,
*without* having the same linear FD allocation problems all over
again. The fundamental problem is that a series of numbers
increasing linearly from X (especially where X == 0) are used as
meaningful data by lazy programmers, instead of just as a cookie. By
nonlinearizing the data, even just a little, that becomes unlikely to
occur. It also forces programmers to use FDs correctly and prevents
problems like this in the future.
Cheers,
Kyle Moffett
On Fri, Jun 08, 2007 at 07:10:03PM -0700, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Al Viro wrote:
> > Any real-world examples of exploitable holes based on that?
>
> Return to libc exploit, calling dup2, where some privileged data is
> redirected from the normal file descriptor to one of the attackers
> choosing. The latter could be an outgoing socket connection which would
> result in leaking the data to the outside.
>
> normal code intruder
>
> so = socket()
>
> fd = open ("local-file")
>
> dup2(so, fd);
>
> write (fd, privileged data)
>
>
> It's just a little function call. If the arguments of dup2() are known
> this is not a big issue to construct.
So which code is supposed to do that open/write in your example? Library?
Unmodified application? Application specifically modified to make *that*
open() randomized?
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Al Viro wrote:
> So which code is supposed to do that open/write in your example? Library?
> Unmodified application? Application specifically modified to make *that*
> open() randomized?
Why should that matter? All of the above. Any piece of code can of
course choose to not be safe but it should have the opportunity to be,
too. Currently that's not the case. With an O_NONSEQFD flag to open it
would be trivial to change any code. Or one can introduce a new set of
userlevel interfaces to create and write new files which then can do
even more.
In whatever way you look at it, there currently is a problem which
cannot be solved except with truly horrible, horrible hack (open N
descriptors and randomly select one to use; yep, horrible, I said so).
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGatS62ijCOnn/RHQRAqcfAJ9vMc6GUFxlgxOZ9rhAcTV9N95kUACguUpq
ER8Y64pIp80NHiMHXwMxxK0=
=ytei
-----END PGP SIGNATURE-----
On Sat, Jun 09, 2007 at 09:26:34AM -0700, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Al Viro wrote:
> > So which code is supposed to do that open/write in your example? Library?
> > Unmodified application? Application specifically modified to make *that*
> > open() randomized?
>
> Why should that matter? All of the above.
I asked for real-world example. Can I have one, please?
> In whatever way you look at it, there currently is a problem which
> cannot be solved except with truly horrible, horrible hack (open N
> descriptors and randomly select one to use; yep, horrible, I said so).
That's simply not true. On the current kernel nothing stops you from e.g.
picking a random number and using F_DUPFD. Voila - there's your randomized
descriptor. Portable to earlier kernels.
Moreover, nonsense^H^H^Hq_fd() can be implemented in userland just fine
if we allow F_DUPFD to arbitrary number - just pass it a random one *or*
base chosen like davedel is doing (constant + 20bit random chosen at start
time).
On Sat, 9 Jun 2007, Paul Mackerras wrote:
> Davide Libenzi writes:
>
> > The only reason we use a floating base, is because Uli preferred to have
> > non-exactly predictable fd allocations. There no reason of re-doing the
> > same POSIX mistake all over again:
>
> Why must everything that makes things a bit simpler and more
> predictable for application programmers be called a "mistake"?
Because if you give guarantees on something, ppl start using such
guarantee in the wrong way. Kyle's email summarizes it.
This should really be treated as an opaque handle, with no assumption on
its value. And if you start handing over values that are not predictable,
the userspace is *forced* to not use any assumption on its values. I never
made any assumption on values returned by APIs returning "handles", and I
never had any problem (or even care) about how those values were
distributed in the N bit space.
- Davide
On Sat, 9 Jun 2007, Al Viro wrote:
> That's simply not true. On the current kernel nothing stops you from e.g.
> picking a random number and using F_DUPFD. Voila - there's your randomized
> descriptor. Portable to earlier kernels.
>
> Moreover, nonsense^H^H^Hq_fd() can be implemented in userland just fine
> if we allow F_DUPFD to arbitrary number - just pass it a random one *or*
> base chosen like davedel is doing (constant + 20bit random chosen at start
> time).
That does not work. Or better, it works but it forces *huge* fdtables to
be created. To do that, you need "start" to be over NR_FILE, and this
creates an fdtable bigger than NR_FILE ((pointer-size + a-few-bits) *
NR_FILE) propagated down to every app you fork.
- Davide
On Sat, 9 Jun 2007, Davide Libenzi wrote:
> On Sat, 9 Jun 2007, Al Viro wrote:
>
> > That's simply not true. On the current kernel nothing stops you from e.g.
> > picking a random number and using F_DUPFD. Voila - there's your randomized
> > descriptor. Portable to earlier kernels.
> >
> > Moreover, nonsense^H^H^Hq_fd() can be implemented in userland just fine
> > if we allow F_DUPFD to arbitrary number - just pass it a random one *or*
> > base chosen like davedel is doing (constant + 20bit random chosen at start
> > time).
>
> That does not work. Or better, it works but it forces *huge* fdtables to
> be created. To do that, you need "start" to be over NR_FILE, and this
> creates an fdtable bigger than NR_FILE ((pointer-size + a-few-bits) *
> NR_FILE) propagated down to every app you fork.
Keep in mind also, that quite a few places in the code, walk through the
whole fdtable memory. So it is not only a problem of wasted RAM.
- Davide
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Al Viro wrote:
> That's simply not true. On the current kernel nothing stops you from e.g.
> picking a random number and using F_DUPFD.
Of course there are things stopping one from doing this (aside from the
kernel not allowing this in the moment at all since the highest fd
number is limited severely):
- - this scheme would only be use if it would be possible to have
completely random descriptor values. But what has been said here
already is that this is too costly. Hence the approach to randomize
only the base value.
- - there are two interface to use: open + fcntl. This is racy. And
don't tell me this doesn't matter.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGat6r2ijCOnn/RHQRAiatAKCs7RLZmpgAU5NyT58c8ueJum4fgwCgjqP0
jPVCCWEdIHVQS05oIjdsZYs=
=7UX+
-----END PGP SIGNATURE-----
On Sat, Jun 09, 2007 at 10:08:59AM -0700, Ulrich Drepper wrote:
> - - there are two interface to use: open + fcntl. This is racy. And
> don't tell me this doesn't matter.
Racy with respect to what? Return-to-libc exploits from another thread?
On Jun 09, 2007, at 13:24:29, Al Viro wrote:
> On Sat, Jun 09, 2007 at 10:08:59AM -0700, Ulrich Drepper wrote:
>> - - there are two interface to use: open + fcntl. This is racy.
>> And don't tell me this doesn't matter.
> Racy with respect to what? Return-to-libc exploits from another
> thread?
How about racy with respect to normal open or fork+exec from another
thread? Specifically there are cases where libc or other libraries
want to create a backend thread dealing with file descriptors in
response to the program's straightforward calls into that library
(Examples include using syslets or event-based polling threads).
SCENARIO 1:
Program Thread: Library Thread:
fd = socket(AF_*, SOCK_*, 0);
fork();
int x = FD_CLOEXEC;
fcntl(fd, F_SETFD, &x);
New Process:
setgroups(...);
seteuid(...);
exec(....);
Whoops!!! Suddenly the user process executed by the (theoretically)
single-threaded program got a handle to a netlink socket affecting
some system resource!!!
SCENARIO 2:
Program Thread: Async libc getpwent()-cache syslet
close(0);
fd = open("/etc/shadow");
open("/dev/null");
code_which_insecurely_reads_from_stdin();
Here we were trying to safely call into code which reads from stdin
and shouldn't be given privileged data, but the syslet makes the
common paradigm 'close(0); open("/dev/null");' horribly insecure.
If you extend all the FD syscalls to all take a "flags" parameter and
add the appropriate flags, then you can pass O_CLOEXEC|O_RANDFD to
whatever syscall you are using and both problems vanish.
Cheers,
Kyle Moffett
On Sat, Jun 09, 2007 at 03:27:43PM -0400, Kyle Moffett wrote:
> On Jun 09, 2007, at 13:24:29, Al Viro wrote:
> >On Sat, Jun 09, 2007 at 10:08:59AM -0700, Ulrich Drepper wrote:
> >>- - there are two interface to use: open + fcntl. This is racy.
> >>And don't tell me this doesn't matter.
> >Racy with respect to what? Return-to-libc exploits from another
> >thread?
>
> How about racy with respect to normal open
How the hell can it be racy wrt normal open()? F_DUPFD is not dup2(),
it's non-overriding.
> or fork+exec from another
> thread? Specifically there are cases where libc or other libraries
> want to create a backend thread dealing with file descriptors in
> response to the program's straightforward calls into that library
> (Examples include using syslets or event-based polling threads).
>
>
> SCENARIO 1:
>
> Program Thread: Library Thread:
> fd = socket(AF_*, SOCK_*, 0);
> fork();
> int x = FD_CLOEXEC;
> fcntl(fd, F_SETFD, &x);
>
> New Process:
> setgroups(...);
> seteuid(...);
> exec(....);
>
> Whoops!!! Suddenly the user process executed by the (theoretically)
> single-threaded program got a handle to a netlink socket affecting
> some system resource!!!
Give me a break. fork(3) is nowhere near plain fork(2); read the nptl
code for details. Getting a low-overhead exclusion into that scheme is not
a rocket science. And lose the bangs, please...
> SCENARIO 2:
>
> Program Thread: Async libc getpwent()-cache syslet
> close(0);
> fd = open("/etc/shadow");
> open("/dev/null");
> code_which_insecurely_reads_from_stdin();
>From what, again? Use of stdio after that is deep in nasal demon land...
On Sat, 9 Jun 2007, Al Viro wrote:
>
> How the hell can it be racy wrt normal open()? F_DUPFD is not dup2(),
> it's non-overriding.
Al, you probably didn't read this thread from the beginning (not in this
particular email thread - an earlier one on the whole feature).
The problem is that a thread wants to open a FD_CLOEXEC file descriptor,
because it is doing something that is thread-local. So it does
fd = some.op.that.returns.an.fd.like.socket();
fcntl(fd, F_SETFD, &FD_CLOEXEC);
but *another* thread does an execve() at the same time, and the fcntl()
never gets to happen!
Which is why you'd like to do the *initial* operation with a flag that
says "please set the FD_CLOEXEC flag on the file descriptor", so that you
*atomically* install the file file descriptor and set the FD_CLOEXEC bit.
It's trivial to do for open(), but there are about a million ways to get a
file descriptor, and open() is just about the *only* one of those that
actually takes a "flags" field that can be used to tell the kernel.
So ignore the fdmapping for now: that's just an extended thing. The
problem is _independent_ of the fdmapping, but it turns out that a lot of
these problems are intertwined, in that you actually want *other* flags
than just "FD_CLOEXEC". For example, one of the flags would be "private fd
space" (which is where fdmap comes in), so that a library can allocate its
own internal file descriptors *without* impacting the caller that depends
on its own file descriptor allocation.
(And dammit, that _is_ a *real*issue*. No races necessary, no NR_OPEN
iterations, no even *halfway* suspect code. It's perfectly fine to do
close(0);
close(1);
close(2);
.. generate filenames, whatever ..
if (open(..) < 0 || open(..) < 0 || open(..) < 0)
die("Couldn't redirect stdin/stdout/stderr");
and there's absolutely nothing wrong with this kind of setup, even if you
could obviously have done it other ways too (ie by using "dup2()" instead
of "close + open"),
And that means that libraries currently MUST NOT open their own file
descriptors, exactly because they mess with the "application file
descriptor namespace", namely the linear POSIX-defined fd allocation
rules!
And no, "dup2(fd, SOME_BIG_FD)" is *not* the answer, exactly because we
end up sucking like mad if you actually were to do it!
So I think both the FD_CLOEXEC _and_ the "private fd space" are real
issues. I don't agree with the "random fd" approach. I'd much rather have
a non-random setup for the nonlinear ones (it just shouldn't be linear).
Linus
On Sat, 9 Jun 2007, Linus Torvalds wrote:
> So I think both the FD_CLOEXEC _and_ the "private fd space" are real
> issues. I don't agree with the "random fd" approach. I'd much rather have
> a non-random setup for the nonlinear ones (it just shouldn't be linear).
That is fine for me. So what about the randomness?
A) Don't do it at all
B) Let userspace select it in some way globally
C) Let userspace select it per-fd (this won't be an O(1) anymore though)
- Davide
On Sat, Jun 09, 2007 at 01:21:24PM -0700, Linus Torvalds wrote:
> Which is why you'd like to do the *initial* operation with a flag that
> says "please set the FD_CLOEXEC flag on the file descriptor", so that you
> *atomically* install the file file descriptor and set the FD_CLOEXEC bit.
>
> It's trivial to do for open(), but there are about a million ways to get a
> file descriptor, and open() is just about the *only* one of those that
> actually takes a "flags" field that can be used to tell the kernel.
Eww... Idea of pipe(2) taking flags as argument...
BTW, you also need that for recvmsg() (SCM_RIGHTS) and fsckloads of
syscalls we've got duplicating open() for no good reason (and no, "BSD
folks did it for sockets, so we'll do it for tons of new subsystems" doesn't
really qualify ;-/).
I don't know if your indirect is a good idea in that respect, actually.
AFAICS, you are suggesting per-syscall meanings of the flags, so it really
smells like YAMultiplexor, free for abuse.
> (And dammit, that _is_ a *real*issue*. No races necessary, no NR_OPEN
> iterations, no even *halfway* suspect code. It's perfectly fine to do
>
> close(0);
> close(1);
> close(2);
> .. generate filenames, whatever ..
> if (open(..) < 0 || open(..) < 0 || open(..) < 0)
> die("Couldn't redirect stdin/stdout/stderr");
>
> and there's absolutely nothing wrong with this kind of setup, even if you
> could obviously have done it other ways too (ie by using "dup2()" instead
> of "close + open"),
Yeah, well - I wouldn't call that perfectly fine, but it's probably too
widespread to kill. Just as use of 0 for NULL ;-)
On Sat, Jun 09, 2007 at 01:31:56PM -0700, Davide Libenzi wrote:
> On Sat, 9 Jun 2007, Linus Torvalds wrote:
>
> > So I think both the FD_CLOEXEC _and_ the "private fd space" are real
> > issues. I don't agree with the "random fd" approach. I'd much rather have
> > a non-random setup for the nonlinear ones (it just shouldn't be linear).
>
> That is fine for me. So what about the randomness?
>
> A) Don't do it at all
>
> B) Let userspace select it in some way globally
>
> C) Let userspace select it per-fd (this won't be an O(1) anymore though)
It should be handled the same way that VMA layout randomness is
handled. There are multiple knobs including ELF markers and /proc
bits.
--
Mathematics is the supreme nostalgia of our time.
On Sat, Jun 09, 2007 at 09:49:07PM +0100, Al Viro wrote:
> On Sat, Jun 09, 2007 at 01:21:24PM -0700, Linus Torvalds wrote:
> > Which is why you'd like to do the *initial* operation with a flag that
> > says "please set the FD_CLOEXEC flag on the file descriptor", so that you
> > *atomically* install the file file descriptor and set the FD_CLOEXEC bit.
> >
> > It's trivial to do for open(), but there are about a million ways to get a
> > file descriptor, and open() is just about the *only* one of those that
> > actually takes a "flags" field that can be used to tell the kernel.
>
> Eww... Idea of pipe(2) taking flags as argument...
>
> BTW, you also need that for recvmsg() (SCM_RIGHTS) and fsckloads of
> syscalls we've got duplicating open() for no good reason (and no, "BSD
> folks did it for sockets, so we'll do it for tons of new subsystems" doesn't
> really qualify ;-/).
>
> I don't know if your indirect is a good idea in that respect, actually.
> AFAICS, you are suggesting per-syscall meanings of the flags, so it really
> smells like YAMultiplexor, free for abuse.
Well, of course it sucks. The question is does it suck less than
adding dozens of new syscalls to patch up problems with POSIX?
I don't think we can get this right in one iteration, so I actually
prefer Linus's approach as less likely to leave numerous vestiges of
incremental improvements around.
On the other hand, we don't want to end up in a future where the bulk
of our calls are through sys_indirect! Given the number of important
APIs that have gotten slight semantic bumps, that's a real
possibility.
--
Mathematics is the supreme nostalgia of our time.
On Sat, 9 Jun 2007, Matt Mackall wrote:
> On Sat, Jun 09, 2007 at 01:31:56PM -0700, Davide Libenzi wrote:
> > On Sat, 9 Jun 2007, Linus Torvalds wrote:
> >
> > > So I think both the FD_CLOEXEC _and_ the "private fd space" are real
> > > issues. I don't agree with the "random fd" approach. I'd much rather have
> > > a non-random setup for the nonlinear ones (it just shouldn't be linear).
> >
> > That is fine for me. So what about the randomness?
> >
> > A) Don't do it at all
> >
> > B) Let userspace select it in some way globally
> >
> > C) Let userspace select it per-fd (this won't be an O(1) anymore though)
>
> It should be handled the same way that VMA layout randomness is
> handled. There are multiple knobs including ELF markers and /proc
> bits.
You mean a global bit controlled by /proc, eventually overridden by an ELF
flag? An maybe a prctl() to give sw configurability?
- Davide
On Sat, 9 Jun 2007, Al Viro wrote:
>
> Eww... Idea of pipe(2) taking flags as argument...
Right. That was one of the patches, and it was one that I said was too
damn ugly to live.
So I instead suggested the alternate approach of adding a single new
system call that runs another system call indirectly, with a set of flags
and/or other behaviour modifications.
And I actually think that's a great idea, because we have *multiple* uses
of this kind of "run system call with special flags" issues:
- the whole "async" thing, and as Matt reminded me, this is the perfect
interface for also saying "run this system call asynchronously"
- things like FD_CLOEXEC, but also things like O_NOFOLLOW and O_NDELAY
(which currently only "open()" supports, even though it makes sense for
a lot of other ops too)
- extended flags like LOOKUP_NOSYMLINK and LOOKUP_NOMOUNT for any system
call that does path lookup (to make it return errors if you *ever*
traverse a symlink or a mount-point respectively: security conscious
programs tend to want this - think about apache that exports per-user
"public_html" stuff, but does *not* want to export /etc, and thus
doesn't like following symlinks).
> I don't know if your indirect is a good idea in that respect, actually.
> AFAICS, you are suggesting per-syscall meanings of the flags, so it really
> smells like YAMultiplexor, free for abuse.
Well, the actual _setup_ would be global (ie we would have no per-system
call actions: we'd just copy the "flags" into the thread
"system_call_flags" thing).
But yes, different system calls could decide to *interpret* the flags
differently. Quite frankly, I'd rather have that kind of overloaded flag
bitmap, than try to create something "extensible".
That said, I don't think there really will be all that many flags, and we
could even just decide that the bits have to be totally unique (and simply
limited to 32 flags). The whole point of the flags, after all, would be
things that *do* make sense for a whole group of system calls: if that's
not true, and it's some single-system-call thing, then you might as well
just always add the new system call itself!
So it makes sense for generic flags (like ASYNC and path lookup rules: not
every system call takes a path, of course, but it's conceptually still
pretty damn generic). but it wouldn't really make sense to do for a very
specific system call - it would be faster and easier to just create the
new system call in the first place.
Linus
On Sat, Jun 09, 2007 at 03:27:43PM -0400, Kyle Moffett wrote:
> SCENARIO 1:
>
> Program Thread: Library Thread:
> fd = socket(AF_*, SOCK_*, 0);
> fork();
> int x = FD_CLOEXEC;
> fcntl(fd, F_SETFD, &x);
BTW, regardless of anything else, in such situation this "library
thread" would be better off by just having independent descriptors.
We _can_ do that just fine.
That is, if library code using such stuff would be OK with being a thread.
Any specific examples one way or another?
On Sat, Jun 09, 2007 at 01:21:24PM -0700, Linus Torvalds wrote:
> (And dammit, that _is_ a *real*issue*. No races necessary, no NR_OPEN
> iterations, no even *halfway* suspect code. It's perfectly fine to do
>
> close(0);
> close(1);
> close(2);
> .. generate filenames, whatever ..
> if (open(..) < 0 || open(..) < 0 || open(..) < 0)
> die("Couldn't redirect stdin/stdout/stderr");
>
> and there's absolutely nothing wrong with this kind of setup, even if you
> could obviously have done it other ways too (ie by using "dup2()" instead
> of "close + open"),
>
> And that means that libraries currently MUST NOT open their own file
> descriptors, exactly because they mess with the "application file
> descriptor namespace", namely the linear POSIX-defined fd allocation
> rules!
Unless it does so in a thread that has unshared its descriptor table.
Which is certainly safer than suggested variants and which might be
natural thing to do anyway (e.g. if we do behind-the-scene getpwent()
caching, etc., we wouldn't be hurt by doing the work asynchronously
wrt the rest of program).
It sure as hell beats the "it's probably going to be safe with expected
use patterns in programs using our library" in the safety department;
that way we *know* that nothing of our stuff will leak anywhere, just as
we know that whatever other threads might be doing, they won't screw with
our descriptors in any way.
That's independent from O_CLOEXEC, etc., and for cases where such technics
is usable we get a bonus of code working with earlier kernels just fine.
Comments?
On Sat, 9 Jun 2007, Linus Torvalds wrote:
> On Sat, 9 Jun 2007, Al Viro wrote:
> >
> > Eww... Idea of pipe(2) taking flags as argument...
>
> Right. That was one of the patches, and it was one that I said was too
> damn ugly to live.
>
> So I instead suggested the alternate approach of adding a single new
> system call that runs another system call indirectly, with a set of flags
> and/or other behaviour modifications.
What if we put a data section in the vdso and we let userspace set the
flags or whatever in there? Syslets could use one of those bits to set the
"threadlet mode" w/out calling a syscall at all.
In that way we have no new syscalls whatsoever. No?
- Davide
On Sat, 9 Jun 2007, Davide Libenzi wrote:
> On Sat, 9 Jun 2007, Linus Torvalds wrote:
>
> > On Sat, 9 Jun 2007, Al Viro wrote:
> > >
> > > Eww... Idea of pipe(2) taking flags as argument...
> >
> > Right. That was one of the patches, and it was one that I said was too
> > damn ugly to live.
> >
> > So I instead suggested the alternate approach of adding a single new
> > system call that runs another system call indirectly, with a set of flags
> > and/or other behaviour modifications.
>
> What if we put a data section in the vdso and we let userspace set the
> flags or whatever in there? Syslets could use one of those bits to set the
> "threadlet mode" w/out calling a syscall at all.
> In that way we have no new syscalls whatsoever. No?
Uhh, never mind :) That won't work in the vdso. Maybe in a TLS area.
- Davide
On Sun, 10 Jun 2007, Al Viro wrote:
> >
> > And that means that libraries currently MUST NOT open their own file
> > descriptors, exactly because they mess with the "application file
> > descriptor namespace", namely the linear POSIX-defined fd allocation
> > rules!
>
> Unless it does so in a thread that has unshared its descriptor table.
Agreed. That was actually part of the reason why I thought clone() was
much better than the pthreads interface.
That said, the Linux !CLONE_FILES does have downsides:
- it is potentially much slower to do than sharing everything (if you
have lots of file descriptors, incrementing the refcounts etc is
actually a real overhead)
- it simply doesn't work, if the library wants to run in the same
execution context, and just wants to open one (or more) file
descriptors for some helper thing.
IOW, the most common case for libraries is not that they get invoced to do
one thing, but that they get loaded and then used over and over and over
again, and the _reason_ for wanting to have a file descriptor open may
well be that the library wants to cache the file descriptor, rather than
having to open a file over and over again!
For example, a library routine that does a full
fd = open();
.. do something with it ..
close(fd);
generally doesn't need any private file descriptors at all (although there
are the threading issues with exec etc) - it will temporarily use a normal
file descriptor, and the caller won't be any wiser. Lots of current
library routines do this all the time.
But let's say that you want to do a library that does name resolution, and
you actually want to create the socket that binds to the DNS server just
once, and then re-use that socket across library calls. It's not that the
library is a thread of its own - it's not - but with the normal linear fd
space it really cannot do this. Sure, it could try to hide it up somewhere
in high fd space, but that would slow down other operations, and there's
no way to guarantee it doesn't clash with some _other_ library doing the
same thing, so it really isn't a good idea.
Now, when you do a DNS query, the setup cost of opening the socket is the
least of your worries, so the above example is not a very good one. I'm
really just giving it as a concrete example of a _conceptual_ problem,
where some other library really had more pressing performance reasons why
they cannot keep re-opening a file descriptor and closing it each time.
So _that_ is the kind of situation where I think "anonymous file
descriptors" make sense.
Linus
On Sat, Jun 09, 2007 at 08:48:39PM -0700, Linus Torvalds wrote:
> Agreed. That was actually part of the reason why I thought clone() was
> much better than the pthreads interface.
>
> That said, the Linux !CLONE_FILES does have downsides:
>
> - it is potentially much slower to do than sharing everything (if you
> have lots of file descriptors, incrementing the refcounts etc is
> actually a real overhead)
Huh? We _skip_ the overhead when descriptor table is not shared. Think
for a minute - we can skip playing with refcount of fget() in the beginning
of syscall if and only if we know that nobody will touch the reference from
the descriptor table. I.e. if descriptor table is not shared. IOW, it's
the other way round - it's _faster_ to not share descriptors.
> - it simply doesn't work, if the library wants to run in the same
> execution context, and just wants to open one (or more) file
> descriptors for some helper thing.
True, but... That really depends on the potential uses. Any real-world
examples (e.g. in related threads)?
On Sun, 10 Jun 2007, Al Viro wrote:
> On Sat, Jun 09, 2007 at 08:48:39PM -0700, Linus Torvalds wrote:
> > Agreed. That was actually part of the reason why I thought clone() was
> > much better than the pthreads interface.
> >
> > That said, the Linux !CLONE_FILES does have downsides:
> >
> > - it is potentially much slower to do than sharing everything (if you
> > have lots of file descriptors, incrementing the refcounts etc is
> > actually a real overhead)
>
> Huh? We _skip_ the overhead when descriptor table is not shared.
Not for clone()/exit(), ie the actual _creation_ of the thread.
Linus
On Sat, Jun 09, 2007 at 09:03:23PM -0700, Linus Torvalds wrote:
>
>
> On Sun, 10 Jun 2007, Al Viro wrote:
>
> > On Sat, Jun 09, 2007 at 08:48:39PM -0700, Linus Torvalds wrote:
> > > Agreed. That was actually part of the reason why I thought clone() was
> > > much better than the pthreads interface.
> > >
> > > That said, the Linux !CLONE_FILES does have downsides:
> > >
> > > - it is potentially much slower to do than sharing everything (if you
> > > have lots of file descriptors, incrementing the refcounts etc is
> > > actually a real overhead)
> >
> > Huh? We _skip_ the overhead when descriptor table is not shared.
>
> Not for clone()/exit(), ie the actual _creation_ of the thread.
Umm... Yes, but that depends on the time when that sucker is started and
the lifetime. IOW, we really need specific examples.
On Sat, 9 Jun 2007, Linus Torvalds wrote:
> IOW, the most common case for libraries is not that they get invoced to do
> one thing, but that they get loaded and then used over and over and over
> again, and the _reason_ for wanting to have a file descriptor open may
> well be that the library wants to cache the file descriptor, rather than
> having to open a file over and over again!
for an example of a library wanting to cache an open fd ... and failing
miserably at protecting itself from the application closing its fd read:
http://bugzilla.padl.com/show_bug.cgi?id=304
http://bugzilla.padl.com/show_bug.cgi?id=305
basically libnss-ldap is trying to use getsockname/getpeername to prove
that an fd belongs to it. the failure modes are quite delightful.
-dean
On Sat, 9 Jun 2007, dean gaudet wrote:
>
> for an example of a library wanting to cache an open fd ... and failing
> miserably at protecting itself from the application closing its fd read:
Heh.
Why the hell doesn't that thing just do an "fstat()" on the thing, and
compare the inode number? Not that I would guarantee that it works either
for a socket, but it would seem to make more sense than what it apparently
does now, and I think it should work at least on Linux.
(Ie linux will give fake "st_ino" numbers to sockets. Whether anybody else
will do that, I have no friggin clue. And you probably shouldn't depend on
it even under Linux, but damn, despite all of those issues, it's likely
better and more logical than what libnss-ldap apparently does now ;)
Doing a "fstat()" call is how you generally test if two fd's point to the
same file. Doing the same thing for sockets would at least be half-way
sane, and even if the OS doesn't give new sockets individual st_ino
numbers, they probably won't be *changing*.
Linus
On Sat, Jun 09, 2007 at 10:06:27PM -0700, Linus Torvalds wrote:
> Why the hell doesn't that thing just do an "fstat()" on the thing, and
> compare the inode number? Not that I would guarantee that it works either
> for a socket, but it would seem to make more sense than what it apparently
> does now, and I think it should work at least on Linux.
It works on Linux, all right.
> (Ie linux will give fake "st_ino" numbers to sockets. Whether anybody else
> will do that, I have no friggin clue. And you probably shouldn't depend on
> it even under Linux, but damn, despite all of those issues, it's likely
> better and more logical than what libnss-ldap apparently does now ;)
On FreeBSD it will simply give you zero st_ino on almost all sockets;
AF_UNIX ones get st_ino invented (and stable). st_dev is NODEV in all
cases...
Linus Torvalds <[email protected]> writes in gmane.linux.kernel:
<...>
> But let's say that you want to do a library that does name resolution, and
> you actually want to create the socket that binds to the DNS server just
> once, and then re-use that socket across library calls. It's not that the
> library is a thread of its own - it's not - but with the normal linear fd
> space it really cannot do this. Sure, it could try to hide it up somewhere
> in high fd space, but that would slow down other operations, and there's
> no way to guarantee it doesn't clash with some _other_ library doing the
> same thing, so it really isn't a good idea.
>
> Now, when you do a DNS query, the setup cost of opening the socket is the
> least of your worries, so the above example is not a very good one. I'm
> really just giving it as a concrete example of a _conceptual_ problem,
> where some other library really had more pressing performance reasons why
> they cannot keep re-opening a file descriptor and closing it each time.
>
> So _that_ is the kind of situation where I think "anonymous file
> descriptors" make sense.
>
> Linus
Real world example is nss_ldap / pam_ldap -- these needs open socket to
ldap server. That socket is cached. And because they can not trust that
application does not have closed file description of them, they check it with
getpeername + getsockname (at least it did when I looked code on
some years ago.)
( opening socket again includes using starttls and authentication .. so it is
quite some overhead )
/ Kari Hurtta
Davide Libenzi writes:
> > Why must everything that makes things a bit simpler and more
> > predictable for application programmers be called a "mistake"?
>
> Because if you give guarantees on something, ppl start using such
> guarantee in the wrong way. Kyle's email summarizes it.
OK, my question/protest was a bit terse.
What I am objecting to is this idea that many kernel developers seem
to have, that if there is some aspect of the kernel/user API that
becomes a bit inconvenient for the kernel to implement, then we can
put the blame on the applications that rely on that aspect, call them
names such as "legacy", "abuser", "conceptually buggy", "broken",
etc., and ultimately justify breaking the ABI -- since it's only those
applications that we have demonised that will be affected, after all.
In a way your response quoted above illustrates this perfectly. If we
give guarantees then people will start relying on them "in the wrong
way" -- meaning, in any way that later becomes inconvenient to us.
What next? Shall we break the guarantee that when read() returns, the
data is already in the user's buffer? I'm sure we could improve
performance if we didn't have to give that guarantee, and after all,
only old, broken, legacy, conceptually buggy programs would be abusing
the interface by relying on that. :)
> This should really be treated as an opaque handle, with no assumption on
> its value.
No. This is an assertion that you have just conjured up to suit
yourself, but it is wrong. It does not reflect either historical UNIX
practice or what is specified in POSIX. Application writers are
perfectly justified in relying on getting the lowest-numbered unused
file descriptor.
If you don't think we should be bound by POSIX, then you are perfectly
free to go off and write your own research kernel with whatever
interface you want, and no programs available to run on it. :)
Paul.
Kyle Moffett writes:
> 1) Linear FD allocation makes it IMPOSSIBLE for libraries to
> reliably use persistent FDs behind an application's back. For
That's not completely true; for example, openlog() opens a file
descriptor for the library's own use, as does sethostent(). I agree
that it creates difficulties if the library implementor wants to use a
file descriptor in a set of functions that didn't previously use one,
but with a bit of assistance from the kernel, that can be solved
without breaking the ABI.
> for (i = 0; i < NR_OPEN; i++)
> if (!fd_is_special_to_us(i))
> close(i);
>
> Note that this is conceptually buggy, but occurs in several major C
> programming books, most of the major shells, and a lot of other
> software to boot.
Buggy in what way? In the use of the NR_OPEN constant?
> 3) In order to allocate new FDs, the kernel has to scan over a
> (potentially very large) bitmap. A process with 100,000 fds (not
> terribly uncommon) would have 12.5kbyte of FD bitmap and would trash
> the cache every time it tried to allocate an FD.
For specialized programs like that we can offer alternative fd
allocation strategies if necessary (although I expect that with
100,000 fds other things will limit performance more than
allocation).
None of those things is an excuse for breaking the ABI, however.
As I said to Davide, I was really protesting about the attitude that
we can just break the ABI however and whenever we like and force
programs to adapt.
Paul.
On Sun, Jun 10, 2007 at 04:26:07PM +1000, Paul Mackerras wrote:
> If you don't think we should be bound by POSIX, then you are perfectly
> free to go off and write your own research kernel with whatever
> interface you want, and no programs available to run on it. :)
This isn't fair to research kernels. Breaking applications is not an
active area of research.
-- wli
Linus Torvalds a ?crit :
> (And dammit, that _is_ a *real*issue*. No races necessary, no NR_OPEN
> iterations, no even *halfway* suspect code. It's perfectly fine to do
>
> close(0);
> close(1);
> close(2);
> .. generate filenames, whatever ..
> if (open(..) < 0 || open(..) < 0 || open(..) < 0)
> die("Couldn't redirect stdin/stdout/stderr");
>
> and there's absolutely nothing wrong with this kind of setup, even if you
> could obviously have done it other ways too (ie by using "dup2()" instead
> of "close + open"),
>
This kind of setup was OK 25 years ago, before multithreading era.
You cannot reasonably expect it to work in a multithreaded program.
Anyway, I would like to give an alternative idea of the double fdmap, and
probably more *secure* .
Current fd API mandates integers (32 bits)
Lot of broken code consider a fd must be >= 0, so we currently are limited to
31 bits.
With NR_OPEN = 1024*1024 = 2^20, that give us 11 bits that we could use as a
signature.
That is, we could use O_NONSEQ as a indication to kernel to give us a
composite fd : 20 low order bits give the slot in file table, then 11 bits can
be use to make sure the fd was not stolen by malicious code.
Legacy app, (without O_NONSEQ in flags) would get POSIX compatables fd in [0,
2^20-1] range, with the lowest available fd.
If O_NONSEQ is given, kernel is free to give an fd in [0, 2^31 - 1], with a
strategy that could be the one Davide gave in its patch (with a list of
available slots). But instead of FIFO, we can use now LIFO, more cache friendly.
In fget()/fget_light()/close(), we can then use 20 bits to select the slot in
the single fdmap.
And 11 bits to check the 'signature'.
So if open( O_NONSEQFD) gave us 0x77000010, we cannot do close(0x10) or
read(0x10, ....)
Storage for these bits is already there in Davide fd_slot structure, where we
currently use one long to store 3 bits 'only'.
This should work even bumping NR_OPEN to say... 8*1024*1024, and 8 bits signature.
> > and there's absolutely nothing wrong with this kind of setup, even if you
> > could obviously have done it other ways too (ie by using "dup2()" instead
> > of "close + open"),
> >
>
> This kind of setup was OK 25 years ago, before multithreading era.
> You cannot reasonably expect it to work in a multithreaded program.
Why not.
When execution begins which is the normal point you do this then you've
got one thread. If you need to do this from a thread after that point
posix provides threaded applications with locking.
Not much else works in a threaded app if you get the locking wrong, and
that is considered the authors job. Why is fd allocation different ?
> Real world example is nss_ldap / pam_ldap -- these needs open socket to
> ldap server. That socket is cached. And because they can not trust that
> application does not have closed file description of them, they check it with
> getpeername + getsockname (at least it did when I looked code on
> some years ago.)
>
> ( opening socket again includes using starttls and authentication .. so it is
> quite some overhead )
And if the fd was closed because of a security transition in the
application hiding it and caching it from the application might then lead
to a security hole.
Alan
On Sun, 10 Jun 2007, Paul Mackerras wrote:
> What I am objecting to is this idea that many kernel developers seem
> to have, that if there is some aspect of the kernel/user API that
> becomes a bit inconvenient for the kernel to implement, then we can
> put the blame on the applications that rely on that aspect, call them
> names such as "legacy", "abuser", "conceptually buggy", "broken",
> etc., and ultimately justify breaking the ABI -- since it's only those
> applications that we have demonised that will be affected, after all.
>
> In a way your response quoted above illustrates this perfectly. If we
> give guarantees then people will start relying on them "in the wrong
> way" -- meaning, in any way that later becomes inconvenient to us.
> What next? Shall we break the guarantee that when read() returns, the
> data is already in the user's buffer? I'm sure we could improve
> performance if we didn't have to give that guarantee, and after all,
> only old, broken, legacy, conceptually buggy programs would be abusing
> the interface by relying on that. :)
This fds will be out of POSIX definition in any case. They have to be
based at very high values in any case in order to be useful, and this
already breaks the POSIX definition of them.
> > This should really be treated as an opaque handle, with no assumption on
> > its value.
>
> No. This is an assertion that you have just conjured up to suit
> yourself, but it is wrong. It does not reflect either historical UNIX
> practice or what is specified in POSIX. Application writers are
> perfectly justified in relying on getting the lowest-numbered unused
> file descriptor.
You seem to deny the fact that libraries cannot reliably use fds allocated
by POSIX rules for internal communication with the kernel. I really don't
know what to say, if not that we must have a different definition of the
word "reliable".
Maybe if you go back in the thread and give your solutions, using POSIX
fds allocation semantics, to the problems that have been exposed, that
might help understanding.
- Davide
On Sun, 10 Jun 2007, Paul Mackerras wrote:
> > for (i = 0; i < NR_OPEN; i++)
> > if (!fd_is_special_to_us(i))
> > close(i);
> >
> > Note that this is conceptually buggy, but occurs in several major C
> > programming books, most of the major shells, and a lot of other
> > software to boot.
>
> Buggy in what way? In the use of the NR_OPEN constant?
That is typical daemonize code done by quite a few apps, and even
described in many unix network programming books. Now say that before that
you loaded a library that had an internal fd allocated to get some sort of
services from the kernel (say an epoll fd). What happens when the library
uses the cached apoll fd after that loop?
- Davide
On Sun, 10 Jun 2007, Al Viro wrote:
>
> On FreeBSD it will simply give you zero st_ino on almost all sockets;
> AF_UNIX ones get st_ino invented (and stable). st_dev is NODEV in all
> cases...
So it will still work better than trying to do a "getsockname()" or
something. If the file descriptor really _has_ been changed, it might give
a false negative (somebody replaced it with *another* socket), but on the
other hand, that's much better than a false positive. And if somebody
replaced the socket with a real file descriptor, it will catch it..
Linus
On Sun, 10 Jun 2007, Eric Dumazet wrote:
> Linus Torvalds a ?crit :
> >
> > close(0);
> > close(1);
> > close(2);
> > .. generate filenames, whatever ..
> > if (open(..) < 0 || open(..) < 0 || open(..) < 0)
> > die("Couldn't redirect stdin/stdout/stderr");
> >
> > and there's absolutely nothing wrong with this kind of setup, even if you
> > could obviously have done it other ways too (ie by using "dup2()" instead of
> > "close + open"),
>
> This kind of setup was OK 25 years ago, before multithreading era.
> You cannot reasonably expect it to work in a multithreaded program.
Who said _anything_ about threading?
The above is a totally non-threaded app. 99% of all applications are like
that.
Threading is for the 1%. But 99% is what we need to make sure works.
Linus
On Sun, 10 Jun 2007, Paul Mackerras wrote:
> > for (i = 0; i < NR_OPEN; i++)
> > if (!fd_is_special_to_us(i))
> > close(i);
> >
> > Note that this is conceptually buggy, but occurs in several major C
> > programming books, most of the major shells, and a lot of other
> > software to boot.
>
> Buggy in what way? In the use of the NR_OPEN constant?
That is buggy because the code assumes it is the only owner of the fd
space. Thing that is not true if runtimes have opened their own file
descriptors to handle their own links to the kernel.
The syslog case is kinda bogus. For certain things, you can afford a retry
policy (assuming somone else did not open another fd over the fd that you
cached - then it's gonna be fun). It is crappy, but it *may* work. Sure
is, does not fit any definition of "reliable" I'm aware of.
Think about a runtime that had an epoll fd open and a few fds dropped into
it to, say, deliver some sort of events to the application. After the
Close Loop of Death, the whole thing is gone.
As is, runtimes cannot reliably use (cached) fds for their own internal
communication with the kernel. That, I think we can agree it is a fact.
We can then say that we do not care if runtimes cannot reliably use fds,
and move on. But it's hard to say that the problem does not exist.
- Davide
On Fri, 2007-06-08 at 20:30 +0100, Alan Cox wrote:
> > To avoid exactly the kind of problem we have now in future: programs
> > relying on specific patterns.
>
> Which you seem to think is a bad thing, yet is actually a very good thing
> because it means that crashes are repeatable and problems are debuggable
> from end user reports.
You can have both.
Look at malloc(): when you write your program you can't really guess
which address will be returned by a malloc() call, but you know that if
you launch it twice and if it has the same input, malloc()'s behavior is
repeatable so it's debuggable.
Xav