This patch plugs the extended fdmap into the kernel. At the moment, this
is done only through sys_dup2() and F_DUPFD.
The base value for the unsequential file descriptor allocation is (at the
moment) set to FD_UNSEQ_BASE (defined in asm-generic/fcntl.h):
#define FD_UNSEQ_BASE (1U << 28)
The sys_dup2() system call (and the F_DUPFD ioclt) understand values from
FD_UNSEQ_BASE up, and use the unsequential fdmap in that case.
In sys_dup2(), if the FD_UNSEQ_ALLOC bit of "newfd" is set, the syscall will
allocate a new file descriptor inside the unsequential fdmap:
#define FD_UNSEQ_ALLOC (1U << 30)
All the functions that deal with fd<->file* has been changed to make them
unsequential fdmap aware.
There is a new kernel function get_unused_fd_unseq() (and its locked version
__get_unused_fd_unseq()), that can be used to allocate file descriptors
inside the unsequential fdmap. At the moment, no one besides sys_dup2() and
F_DUPFD uses it, but it is possible to integrate it in other paths generating
new file descriptors.
It'd be possible to add a new O_UNSEQFD flag to open(2) and make sys_open()
to allocate the new descriptor inside the unsequential map.
So at the moment, to allocate a file descriptor in the unsequential map, you
can do something like:
ufd = dup2(fd, FD_UNSEQ_ALLOC);
close(fd);
and use "ufd" instead of "fd".
Verified and tested on a P4 HT and a dual Opteron using this test program:
http://www.xmailserver.org/extfd-test.c
Signed-off-by: Davide Libenzi <[email protected]>
- Davide
Index: linux-2.6.mod/include/linux/file.h
===================================================================
--- linux-2.6.mod.orig/include/linux/file.h 2007-06-02 15:34:26.000000000 -0700
+++ linux-2.6.mod/include/linux/file.h 2007-06-02 15:36:07.000000000 -0700
@@ -11,6 +11,12 @@
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/types.h>
+#include <linux/fdmap.h>
+
+/*
+ * Initial size for the non sequential file descriptor arena
+ */
+#define FDMAP_UNSEQ_SIZE 64U
/*
* The default fd array needs to be at least BITS_PER_LONG,
@@ -50,9 +56,15 @@
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
int next_fd;
+ int fd_count;
struct embedded_fd_set close_on_exec_init;
struct embedded_fd_set open_fds_init;
struct file * fd_array[NR_OPEN_DEFAULT];
+
+ /*
+ * Used for non-contiguous file descriptor allocations.
+ */
+ struct fd_map *fmap;
};
#define files_fdtable(files) (rcu_dereference((files)->fdt))
@@ -77,22 +89,27 @@
struct kmem_cache;
extern int expand_files(struct files_struct *, int nr);
+extern struct fd_map *files_fdmap_alloc(struct files_struct *files,
+ unsigned int size);
+extern int __get_unused_fd_unseq(struct files_struct *files, int fd,
+ unsigned long flags);
+extern int get_unused_fd_unseq(struct files_struct *files, int fd,
+ unsigned long flags);
extern void free_fdtable_rcu(struct rcu_head *rcu);
extern void __init files_defer_init(void);
+extern struct file *fcheck_files(struct files_struct *files, unsigned int fd);
static inline void free_fdtable(struct fdtable *fdt)
{
call_rcu(&fdt->rcu, free_fdtable_rcu);
}
-static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
+/*
+ * Must be called with files->lock held.
+ */
+static inline struct fd_map *files_fdmap(struct files_struct *files)
{
- struct file * file = NULL;
- struct fdtable *fdt = files_fdtable(files);
-
- if (fd < fdt->max_fds)
- file = rcu_dereference(fdt->fd[fd]);
- return file;
+ return files->fmap ? files->fmap: files_fdmap_alloc(files, 0);
}
/*
Index: linux-2.6.mod/fs/fcntl.c
===================================================================
--- linux-2.6.mod.orig/fs/fcntl.c 2007-06-02 15:34:27.000000000 -0700
+++ linux-2.6.mod/fs/fcntl.c 2007-06-02 15:36:07.000000000 -0700
@@ -28,22 +28,34 @@
struct files_struct *files = current->files;
struct fdtable *fdt;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (flag)
- FD_SET(fd, fdt->close_on_exec);
- else
- FD_CLR(fd, fdt->close_on_exec);
+ if (files->fmap && fdmap_fdof(files->fmap, fd))
+ fdmap_set_fdflags(files->fmap, fd, flag ? 0: FDMAP_F_CLOEXEC,
+ flag ? FDMAP_F_CLOEXEC: 0);
+ else {
+ fdt = files_fdtable(files);
+ if (flag)
+ FD_SET(fd, fdt->close_on_exec);
+ else
+ FD_CLR(fd, fdt->close_on_exec);
+ }
spin_unlock(&files->file_lock);
}
static int get_close_on_exec(unsigned int fd)
{
struct files_struct *files = current->files;
+ struct fd_map *fmap;
struct fdtable *fdt;
int res;
+
rcu_read_lock();
- fdt = files_fdtable(files);
- res = FD_ISSET(fd, fdt->close_on_exec);
+ fmap = rcu_dereference(files->fmap);
+ if (fmap && fdmap_fdof(fmap, fd))
+ res = (fdmap_get_fdflags(fmap, fd) & FDMAP_F_CLOEXEC) != 0;
+ else {
+ fdt = files_fdtable(files);
+ res = FD_ISSET(fd, fdt->close_on_exec);
+ }
rcu_read_unlock();
return res;
}
@@ -62,11 +74,12 @@
int error;
struct fdtable *fdt;
- error = -EINVAL;
if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out;
-
+ return -EINVAL;
repeat:
+ if (files->fd_count >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ return -EMFILE;
+
fdt = files_fdtable(files);
/*
* Someone might have closed fd's in the range
@@ -80,14 +93,13 @@
if (start < fdt->max_fds)
newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
fdt->max_fds, start);
-
- error = -EMFILE;
+
if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out;
+ return -EMFILE;
error = expand_files(files, newfd);
if (error < 0)
- goto out;
+ return error;
/*
* If we needed to expand the fs array we
@@ -103,11 +115,9 @@
*/
if (start <= files->next_fd)
files->next_fd = newfd + 1;
+ files->fd_count++;
- error = newfd;
-
-out:
- return error;
+ return newfd;
}
static int dupfd(struct file *file, unsigned int start)
@@ -117,18 +127,22 @@
int fd;
spin_lock(&files->file_lock);
- fd = locate_fd(files, file, start);
- if (fd >= 0) {
- /* locate_fd() may have expanded fdtable, load the ptr */
- fdt = files_fdtable(files);
- FD_SET(fd, fdt->open_fds);
- FD_CLR(fd, fdt->close_on_exec);
- spin_unlock(&files->file_lock);
+ if (start >= FD_UNSEQ_BASE)
+ fd = __get_unused_fd_unseq(files, FDMAP_HINT_FDUP(start), 0);
+ else {
+ fd = locate_fd(files, file, start);
+ if (fd >= 0) {
+ /* locate_fd() may have expanded fdtable, load the ptr */
+ fdt = files_fdtable(files);
+ FD_SET(fd, fdt->open_fds);
+ FD_CLR(fd, fdt->close_on_exec);
+ }
+ }
+ spin_unlock(&files->file_lock);
+ if (likely(fd >= 0))
fd_install(fd, file);
- } else {
- spin_unlock(&files->file_lock);
+ else
fput(file);
- }
return fd;
}
@@ -136,7 +150,7 @@
asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
{
int err = -EBADF;
- struct file * file, *tofree;
+ struct file * file, *tofree = NULL;
struct files_struct * files = current->files;
struct fdtable *fdt;
@@ -146,32 +160,52 @@
err = newfd;
if (newfd == oldfd)
goto out_unlock;
- err = -EBADF;
- if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out_unlock;
- get_file(file); /* We are now finished with oldfd */
-
- err = expand_files(files, newfd);
- if (err < 0)
- goto out_fput;
-
- /* To avoid races with open() and dup(), we will mark the fd as
- * in-use in the open-file bitmap throughout the entire dup2()
- * process. This is quite safe: do_close() uses the fd array
- * entry, not the bitmap, to decide what work needs to be
- * done. --sct */
- /* Doesn't work. open() might be there first. --AV */
+ if (newfd >= FD_UNSEQ_BASE) {
+ if (!(newfd & FD_UNSEQ_ALLOC)) {
+ if (files->fmap && fdmap_fdof(files->fmap, newfd))
+ tofree = fdmap_file_get(files->fmap, newfd);
+ }
+ if (!tofree) {
+ err = __get_unused_fd_unseq(files,
+ newfd & FD_UNSEQ_ALLOC ? FDMAP_HINT_ANY:
+ FDMAP_HINT_EXACT(newfd), 0);
+ if (err < 0)
+ goto out_unlock;
+ newfd = err;
+ }
+ get_file(file);
+ fdmap_install(files->fmap, newfd, file);
+ } else {
+ err = -EBADF;
+ if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ goto out_unlock;
+ get_file(file); /* We are now finished with oldfd */
+
+ err = expand_files(files, newfd);
+ if (err < 0)
+ goto out_fput;
+
+ /* To avoid races with open() and dup(), we will mark the fd as
+ * in-use in the open-file bitmap throughout the entire dup2()
+ * process. This is quite safe: do_close() uses the fd array
+ * entry, not the bitmap, to decide what work needs to be
+ * done. --sct */
+ /* Doesn't work. open() might be there first. --AV */
- /* Yes. It's a race. In user space. Nothing sane to do */
- err = -EBUSY;
- fdt = files_fdtable(files);
- tofree = fdt->fd[newfd];
- if (!tofree && FD_ISSET(newfd, fdt->open_fds))
- goto out_fput;
-
- rcu_assign_pointer(fdt->fd[newfd], file);
- FD_SET(newfd, fdt->open_fds);
- FD_CLR(newfd, fdt->close_on_exec);
+ /* Yes. It's a race. In user space. Nothing sane to do */
+ err = -EBUSY;
+ fdt = files_fdtable(files);
+ tofree = fdt->fd[newfd];
+ if (FD_ISSET(newfd, fdt->open_fds)) {
+ if (!tofree)
+ goto out_fput;
+ } else
+ files->fd_count++;
+
+ rcu_assign_pointer(fdt->fd[newfd], file);
+ FD_SET(newfd, fdt->open_fds);
+ FD_CLR(newfd, fdt->close_on_exec);
+ }
spin_unlock(&files->file_lock);
if (tofree)
Index: linux-2.6.mod/fs/exec.c
===================================================================
--- linux-2.6.mod.orig/fs/exec.c 2007-06-02 15:34:27.000000000 -0700
+++ linux-2.6.mod/fs/exec.c 2007-06-02 15:36:07.000000000 -0700
@@ -783,6 +783,8 @@
static void flush_old_files(struct files_struct * files)
{
long j = -1;
+ unsigned int start, base;
+ unsigned long fset;
struct fdtable *fdt;
spin_lock(&files->file_lock);
@@ -807,6 +809,18 @@
spin_lock(&files->file_lock);
}
+ for (start = 0;;) {
+ if (!files->fmap)
+ break;
+ if (!fdmap_next_flag_set(files->fmap, FDMAP_BIT_CLOEXEC,
+ &start, &base, &fset))
+ break;
+ spin_unlock(&files->file_lock);
+ for (; fset; base++, fset >>= 1)
+ if (fset & 1)
+ sys_close(base);
+ spin_lock(&files->file_lock);
+ }
spin_unlock(&files->file_lock);
}
Index: linux-2.6.mod/kernel/exit.c
===================================================================
--- linux-2.6.mod.orig/kernel/exit.c 2007-06-02 15:34:27.000000000 -0700
+++ linux-2.6.mod/kernel/exit.c 2007-06-02 15:36:07.000000000 -0700
@@ -417,10 +417,18 @@
EXPORT_SYMBOL(daemonize);
+static int files_fdmap_close(void *priv, struct file *file)
+{
+ filp_close(file, (struct files_struct *) priv);
+ cond_resched();
+ return 0;
+}
+
static void close_files(struct files_struct * files)
{
int i, j;
struct fdtable *fdt;
+ struct file *file;
j = 0;
@@ -438,7 +446,7 @@
set = fdt->open_fds->fds_bits[j++];
while (set) {
if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
+ file = xchg(&fdt->fd[i], NULL);
if (file) {
filp_close(file, files);
cond_resched();
@@ -448,6 +456,8 @@
set >>= 1;
}
}
+ if (files->fmap)
+ fdmap_for_each_file(files->fmap, files_fdmap_close, files);
}
struct files_struct *get_files_struct(struct task_struct *task)
@@ -469,6 +479,8 @@
if (atomic_dec_and_test(&files->count)) {
close_files(files);
+ if (files->fmap)
+ fdmap_free(files->fmap);
/*
* Free the fd and fdset arrays if we expanded them.
* If the fdtable was embedded, pass files for freeing
Index: linux-2.6.mod/fs/open.c
===================================================================
--- linux-2.6.mod.orig/fs/open.c 2007-06-02 15:34:27.000000000 -0700
+++ linux-2.6.mod/fs/open.c 2007-06-02 15:36:07.000000000 -0700
@@ -861,10 +861,12 @@
int fd, error;
struct fdtable *fdt;
- error = -EMFILE;
spin_lock(&files->file_lock);
-
repeat:
+ error = -EMFILE;
+ if (files->fd_count >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ goto out;
+
fdt = files_fdtable(files);
fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
files->next_fd);
@@ -881,18 +883,17 @@
if (error < 0)
goto out;
- if (error) {
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- error = -EMFILE;
+ /*
+ * If we needed to expand the fs array we
+ * might have blocked - try again.
+ */
+ if (error)
goto repeat;
- }
FD_SET(fd, fdt->open_fds);
FD_CLR(fd, fdt->close_on_exec);
files->next_fd = fd + 1;
+ files->fd_count++;
#if 1
/* Sanity check */
if (fdt->fd[fd] != NULL) {
@@ -911,10 +912,17 @@
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
- struct fdtable *fdt = files_fdtable(files);
- __FD_CLR(fd, fdt->open_fds);
- if (fd < files->next_fd)
- files->next_fd = fd;
+ struct fdtable *fdt;
+
+ if (files->fmap && fdmap_fdof(files->fmap, fd)) {
+ fdmap_putfd(files->fmap, fd);
+ } else {
+ fdt = files_fdtable(files);
+ __FD_CLR(fd, fdt->open_fds);
+ if (fd < files->next_fd)
+ files->next_fd = fd;
+ }
+ files->fd_count--;
}
void fastcall put_unused_fd(unsigned int fd)
@@ -927,6 +935,116 @@
EXPORT_SYMBOL(put_unused_fd);
+struct fd_map *files_fdmap_alloc(struct files_struct *files, unsigned int size)
+{
+ struct fd_map *fmap, *ofmap, *nfmap;
+
+ size = max(size, FDMAP_UNSEQ_SIZE);
+ ofmap = files->fmap;
+ if (ofmap)
+ size = max(size, 2 * ofmap->size);
+ spin_unlock(&files->file_lock);
+ fmap = fdmap_alloc(FD_UNSEQ_BASE, size, !ofmap);
+ spin_lock(&files->file_lock);
+ if (fmap) {
+ nfmap = files->fmap;
+ if (nfmap) {
+ if (ofmap == nfmap) {
+ fdmap_copy(fmap, nfmap, NULL, 0);
+ rcu_assign_pointer(files->fmap, fmap);
+ fdmap_free(nfmap);
+ } else {
+ fdmap_free(fmap);
+ fmap = nfmap;
+ }
+ } else
+ rcu_assign_pointer(files->fmap, fmap);
+ }
+ return fmap;
+}
+
+/**
+ * __get_unused_fd_unseq - Allocates a file descriptor inside the unsequential
+ * file descriptor map (locked)
+ *
+ * @files: [in] Pointer the files_struct that hosts the unsequential file
+ * descriptor map
+ * @fd: [in] Hint value for the file descriptor allocation. See function
+ * fdmap_newfd() description for the @fd values documentation
+ * @flags: [in] Flags to be associated with the file descriptor
+ *
+ * Returns the allocated file descriptor, or a negative value in case of error.
+ * This function must be called while holding @files->lock. In case the file
+ * descriptor map should be resized, the held lock will be temporarly released
+ * (and re-acquired).
+ */
+int __get_unused_fd_unseq(struct files_struct *files, int fd,
+ unsigned long flags)
+{
+ int nfd;
+ unsigned long mflags = 0;
+ struct fd_map *fmap;
+
+ /*
+ * Map special open flags parameters to fdmap flags. TODO!!
+ */
+
+repeat:
+ if (unlikely(files->fd_count >=
+ current->signal->rlim[RLIMIT_NOFILE].rlim_cur))
+ return -EMFILE;
+ fmap = files_fdmap(files);
+ if (!fmap)
+ return -ENOMEM;
+ nfd = fdmap_newfd(fmap, fd, mflags);
+ if (unlikely(nfd == -EMFILE)) {
+ unsigned int size = 0, afd = abs(fd);
+
+ if (afd > FD_UNSEQ_BASE) {
+ size = afd - FD_UNSEQ_BASE;
+ size += size / 2;
+ }
+ if (!files_fdmap_alloc(files, size))
+ return -ENOMEM;
+ goto repeat;
+ }
+ files->fd_count++;
+ return nfd;
+}
+
+/**
+ * get_unused_fd_unseq - Allocates a file descriptor inside the unsequential
+ * file descriptor map (unlocked)
+ *
+ * This function is the unlocked counterpart of the __get_unused_fd_unseq()
+ * function.
+ */
+int get_unused_fd_unseq(struct files_struct *files, int fd,
+ unsigned long flags)
+{
+ spin_lock(&files->file_lock);
+ fd = __get_unused_fd_unseq(files, fd, flags);
+ spin_unlock(&files->file_lock);
+ return fd;
+}
+
+struct file *fcheck_files(struct files_struct *files, unsigned int fd)
+{
+ struct file *file = NULL;
+ struct fd_map *fmap;
+ struct fdtable *fdt;
+
+ fmap = rcu_dereference(files->fmap);
+ if (fmap && fdmap_fdof(fmap, fd))
+ file = fdmap_file_get(fmap, fd);
+ else {
+ fdt = files_fdtable(files);
+ if (fd < fdt->max_fds)
+ file = rcu_dereference(fdt->fd[fd]);
+ }
+ return file;
+}
+
/*
* Install a file pointer in the fd array.
*
@@ -945,9 +1063,13 @@
struct files_struct *files = current->files;
struct fdtable *fdt;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- BUG_ON(fdt->fd[fd] != NULL);
- rcu_assign_pointer(fdt->fd[fd], file);
+ if (files->fmap && fdmap_fdof(files->fmap, fd)) {
+ fdmap_install(files->fmap, fd, file);
+ } else {
+ fdt = files_fdtable(files);
+ BUG_ON(fdt->fd[fd] != NULL);
+ rcu_assign_pointer(fdt->fd[fd], file);
+ }
spin_unlock(&files->file_lock);
}
@@ -1053,14 +1175,20 @@
int retval;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (fd >= fdt->max_fds)
- goto out_unlock;
- filp = fdt->fd[fd];
- if (!filp)
- goto out_unlock;
- rcu_assign_pointer(fdt->fd[fd], NULL);
- FD_CLR(fd, fdt->close_on_exec);
+ if (files->fmap && fdmap_fdof(files->fmap, fd)) {
+ filp = fdmap_file_get(files->fmap, fd);
+ if (!filp)
+ goto out_unlock;
+ } else {
+ fdt = files_fdtable(files);
+ if (fd >= fdt->max_fds)
+ goto out_unlock;
+ filp = fdt->fd[fd];
+ if (!filp)
+ goto out_unlock;
+ rcu_assign_pointer(fdt->fd[fd], NULL);
+ FD_CLR(fd, fdt->close_on_exec);
+ }
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
retval = filp_close(filp, files);
Index: linux-2.6.mod/kernel/fork.c
===================================================================
--- linux-2.6.mod.orig/kernel/fork.c 2007-06-02 15:34:27.000000000 -0700
+++ linux-2.6.mod/kernel/fork.c 2007-06-02 15:36:07.000000000 -0700
@@ -641,6 +641,8 @@
spin_lock_init(&newf->file_lock);
newf->next_fd = 0;
+ newf->fd_count = 0;
+ newf->fmap = NULL;
fdt = &newf->fdtab;
fdt->max_fds = NR_OPEN_DEFAULT;
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
@@ -671,6 +673,27 @@
goto out;
spin_lock(&oldf->file_lock);
+repeat:
+ if (oldf->fmap) {
+ struct fd_map *ofmap;
+ unsigned int size, fcount;
+
+ ofmap = oldf->fmap;
+ size = ofmap->size;
+ spin_unlock(&oldf->file_lock);
+ newf->fmap = fdmap_alloc(FD_UNSEQ_BASE, size, 0);
+ if (!newf->fmap)
+ goto out_release;
+ spin_lock(&oldf->file_lock);
+ if (oldf->fmap != ofmap) {
+ fdmap_free(newf->fmap);
+ newf->fmap = NULL;
+ goto repeat;
+ }
+ fdmap_copy(newf->fmap, ofmap, &fcount, 1);
+ newf->fd_count = fcount;
+ }
+
old_fdt = files_fdtable(oldf);
new_fdt = files_fdtable(newf);
open_files = count_open_files(old_fdt);
@@ -709,6 +732,7 @@
struct file *f = *old_fds++;
if (f) {
get_file(f);
+ newf->fd_count++;
} else {
/*
* The fd may be claimed in the fd bitmap but not yet
@@ -739,6 +763,8 @@
return newf;
out_release:
+ if (newf->fmap)
+ fdmap_free(newf->fmap);
kmem_cache_free(files_cachep, newf);
out:
return NULL;
Index: linux-2.6.mod/include/linux/init_task.h
===================================================================
--- linux-2.6.mod.orig/include/linux/init_task.h 2007-06-02 15:34:27.000000000 -0700
+++ linux-2.6.mod/include/linux/init_task.h 2007-06-02 15:36:07.000000000 -0700
@@ -28,7 +28,9 @@
.next_fd = 0, \
.close_on_exec_init = { { 0, } }, \
.open_fds_init = { { 0, } }, \
- .fd_array = { NULL, } \
+ .fd_array = { NULL, }, \
+ .fd_count = 0, \
+ .fmap = NULL \
}
#define INIT_KIOCTX(name, which_mm) \
Index: linux-2.6.mod/kernel/kmod.c
===================================================================
--- linux-2.6.mod.orig/kernel/kmod.c 2007-06-02 15:34:27.000000000 -0700
+++ linux-2.6.mod/kernel/kmod.c 2007-06-02 15:36:07.000000000 -0700
@@ -155,6 +155,7 @@
fdt = files_fdtable(f);
FD_SET(0, fdt->open_fds);
FD_CLR(0, fdt->close_on_exec);
+ f->fd_count++;
spin_unlock(&f->file_lock);
/* and disallow core files too */
Index: linux-2.6.mod/include/asm-generic/fcntl.h
===================================================================
--- linux-2.6.mod.orig/include/asm-generic/fcntl.h 2007-06-02 15:34:27.000000000 -0700
+++ linux-2.6.mod/include/asm-generic/fcntl.h 2007-06-02 15:36:07.000000000 -0700
@@ -3,6 +3,17 @@
#include <linux/types.h>
+/*
+ * Base for non sequential file descriptors
+ */
+#define FD_UNSEQ_BASE (1U << 28)
+
+/*
+ * Speacial value for non sequential file descriptors, to tell to
+ * allocate a new non sequential value
+ */
+#define FD_UNSEQ_ALLOC (1U << 30)
+
/* open/fcntl - O_SYNC is only implemented on blocks devices and on files
located on an ext2 file system */
#define O_ACCMODE 00000003
Index: linux-2.6.mod/fs/proc/base.c
===================================================================
--- linux-2.6.mod.orig/fs/proc/base.c 2007-06-02 15:34:27.000000000 -0700
+++ linux-2.6.mod/fs/proc/base.c 2007-06-02 15:36:07.000000000 -0700
@@ -1384,10 +1384,11 @@
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct task_struct *p = get_proc_task(inode);
- unsigned int fd, tid, ino;
+ unsigned int fd, tid, ino, topfd;
int retval;
struct files_struct * files;
struct fdtable *fdt;
+ struct fd_map *fmap;
retval = -ENOENT;
if (!p)
@@ -1412,9 +1413,14 @@
goto out;
rcu_read_lock();
fdt = files_fdtable(files);
- for (fd = filp->f_pos-2;
- fd < fdt->max_fds;
- fd++, filp->f_pos++) {
+ fmap = rcu_dereference(files->fmap);
+ fd = filp->f_pos - 2;
+ if (fd < fdt->max_fds || !fmap)
+ topfd = fdt->max_fds;
+ else
+ topfd = fdmap_topfd(fmap);
+rescan:
+ for (; fd < topfd; fd++, filp->f_pos++) {
char name[PROC_NUMBUF];
int len;
@@ -1425,13 +1431,19 @@
len = snprintf(name, sizeof(name), "%d", fd);
if (proc_fill_cache(filp, dirent, filldir,
name, len, instantiate,
- p, &fd) < 0) {
- rcu_read_lock();
- break;
- }
+ p, &fd) < 0)
+ goto out_put_files;
rcu_read_lock();
}
+ fmap = rcu_dereference(files->fmap);
+ if (fmap && fd < fdmap_basefd(fmap)) {
+ fd = fdmap_basefd(fmap);
+ filp->f_pos = fd + 2;
+ topfd = fdmap_topfd(fmap);
+ goto rescan;
+ }
rcu_read_unlock();
+out_put_files:
put_files_struct(files);
}
out:
* Davide Libenzi <[email protected]> wrote:
> This patch plugs the extended fdmap into the kernel. At the moment, this
> is done only through sys_dup2() and F_DUPFD.
> The base value for the unsequential file descriptor allocation is (at the
> moment) set to FD_UNSEQ_BASE (defined in asm-generic/fcntl.h):
really nice stuff! :-)
> #define FD_UNSEQ_BASE (1U << 28)
> #define FD_UNSEQ_ALLOC (1U << 30)
i'm wondering, why not use (1 << 30) both as the base and as the flag?
That would make integration of the new fd space 'seemless' in terms of
dup2() use.
> It'd be possible to add a new O_UNSEQFD flag to open(2) and make
> sys_open() to allocate the new descriptor inside the unsequential map.
yeah, please do that now - lets not leave any incomplete areas. We've
too often made the mistake of not pushing through new APIs consistently
enough.
I'd also suggest a new sys_socket2() call that takes a 'flags' parameter
as well - because one primary user of this facility will be networking
servers. (O_UNSEQFD would make sense for it and O_NDELAY - currently
network apps that want to set O_NDELAY need to do it with an extra
fcntl() - while they could already indicate this in the sys_socket()
call, if it were closer to sys_open() semantics)
in any case, your patch is looking really good and already deserves an
ack!
Acked-by: Ingo Molnar <[email protected]>
Ingo
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Davide Libenzi wrote:
> #define FD_UNSEQ_BASE (1U << 28)
I agree with Ingo, no need for a second magic value. Use the same value
as FD_UNSEQ_ALLOC which will just mean this exact value should never be
used as a file descriptor.
If it's not too expensive, I would prefer to see fdmap_newfd to return
more or less random descriptor values. This way nobody can try to rely
on a specific sequence. Plus it might add a tiny bit of extra security.
While dup2() and fcntl() might seem like good candidates to introduce
the new functionality I think we should jump the gun and do it right.
There are both not the best fit, as can be seen by your description.
The parameter is now a hint. Additionally everybody who'd use
dup2/fcntl would have to issue a close syscall right after it. Finally,
I'm worried about accidental use of the new functionality. Not too
likely but it can happen. This will cause hard to debug problems.
I think it's better to have a dedicated interface:
int nonseqfd (int fd, int flags);
The semantics would include returning the new descriptor, closing the
old descriptor (maybe this can be overwritten with a flags bit). I
guess I would also like to see the default of close_on_exit changed but
perhaps the caller can be required to set an appropriate bit in the
flags parameter.
This approach is cleaner, no magic constants exported from the kernel
and it should be more efficient in general. It probably will also spark
reevaluating the choice of the interface for fdmap_newfd. I don't like
overloading a parameter to be used as a flag and a value at the same time.
The flags parameter will also allow to specify the additional
functionality needed. For instance, by default descriptors allocated
this way *should* appear in /proc/self/fd. You mentioned web servers
which don't care about sequential allocation and are slowed down by the
current strict allocation. Those should have the descriptor appear in
/proc/self/fd. On the other hand, uses of the interfaces in, say,
glibc or valgrind should create invisible descriptor. Well, descriptors
visible in perhaps /proc/self/fd-private or so.
> It'd be possible to add a new O_UNSEQFD flag to open(2) and make sys_open()
> to allocate the new descriptor inside the unsequential map.
Again, I agree with Ingo. This should be done. If my CLOEXEC patches
are acceptable the same kind of change for Unix domain sockets can be
applied for non-sequential descriptors.
BTW: those whose perfect knowledge of the English language, I guess
non-sequential is better than unsequential, right? This would require
renaming.
> repeat:
> + if (files->fd_count >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
> + return -EMFILE;
I haven't studied the entire patch completely. So, help me understand
this. ->fd_count is the exact count for the number of descriptors which
is open. This would make sense.
But I hope everybody realizes that this is a change in the ABI. So far
RLIMIT_NOFILE specified the highest file descriptor number which could
be returned by an open call etc.
There is a fine difference. Even if yo have just a couple of
descriptors open, you could not, for instance, dup2 to a descriptor
higher than RLIMIT_NOFILE. With this patch it seems possible, even for
processes not using non-sequential descriptors. This means programs
written on new kernels might not run on older kernels.
I don't say that this is unacceptable. It is a change. If it can be
minimized this would be better but the new RLIMIT_NOFILE semantics is
certainly also correct according to POSIX.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGYwZ82ijCOnn/RHQRAsvGAJ4jXamjOJVq4ImmVYT3/ghBXTB1bgCdGF0O
Y4nv9i7L98UEDEKc09Wt0VU=
=OS4+
-----END PGP SIGNATURE-----
On Sun, 3 Jun 2007, Ingo Molnar wrote:
> > #define FD_UNSEQ_BASE (1U << 28)
> > #define FD_UNSEQ_ALLOC (1U << 30)
>
> i'm wondering, why not use (1 << 30) both as the base and as the flag?
> That would make integration of the new fd space 'seemless' in terms of
> dup2() use.
There are two behaviours to handle. One is the standard dup2() one, where
you pass a "newfd" by saying that you want "oldfd" exactly on top of
"newfd". With the new sys_dup2() you can pass an fd inside the
unsequential area, and have the same behviour of the legacy dup2().
The other one, unique to unsequential fds, is asking the kernel to just
allocate a free one from the unsequential map, whatever it is.
So we need a flag, that is "far enough" from the unsequential map
allocation space. I was actually thinking of moving FD_UNSEQ_BASE down to
(1 << 27) or (1 << 26), and leave some more space for flags.
128M or 64M of descriptors should be enough for the legacy, sequential fd
allocator ;)
> > It'd be possible to add a new O_UNSEQFD flag to open(2) and make
> > sys_open() to allocate the new descriptor inside the unsequential map.
>
> yeah, please do that now - lets not leave any incomplete areas. We've
> too often made the mistake of not pushing through new APIs consistently
> enough.
Will do today.
> I'd also suggest a new sys_socket2() call that takes a 'flags' parameter
> as well - because one primary user of this facility will be networking
> servers. (O_UNSEQFD would make sense for it and O_NDELAY - currently
> network apps that want to set O_NDELAY need to do it with an extra
> fcntl() - while they could already indicate this in the sys_socket()
> call, if it were closer to sys_open() semantics)
Makes sense for me, since sys_socket() is the primary source of fds
wherever you have fd-crowded applications. And those definitely aren't
using select() ;)
I seem to remember, that either you or Uli, talked about task-private fds,
although I do not remember the context. A close-on-fork flag?
- Davide
On Sun, 3 Jun 2007, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Davide Libenzi wrote:
> > #define FD_UNSEQ_BASE (1U << 28)
>
> I agree with Ingo, no need for a second magic value. Use the same value
> as FD_UNSEQ_ALLOC which will just mean this exact value should never be
> used as a file descriptor.
I explained this in my answer to Ingo...
> If it's not too expensive, I would prefer to see fdmap_newfd to return
> more or less random descriptor values. This way nobody can try to rely
> on a specific sequence. Plus it might add a tiny bit of extra security.
Random can be expensive. At the moment is FIFO. I'm missing though how
this can be a security flaw, when the legacy one is exactly predictable.
> While dup2() and fcntl() might seem like good candidates to introduce
> the new functionality I think we should jump the gun and do it right.
> There are both not the best fit, as can be seen by your description.
> The parameter is now a hint. Additionally everybody who'd use
> dup2/fcntl would have to issue a close syscall right after it. Finally,
> I'm worried about accidental use of the new functionality. Not too
> likely but it can happen. This will cause hard to debug problems.
>
> I think it's better to have a dedicated interface:
>
> int nonseqfd (int fd, int flags);
>
> The semantics would include returning the new descriptor, closing the
> old descriptor (maybe this can be overwritten with a flags bit). I
> guess I would also like to see the default of close_on_exit changed but
> perhaps the caller can be required to set an appropriate bit in the
> flags parameter.
>
> This approach is cleaner, no magic constants exported from the kernel
> and it should be more efficient in general. It probably will also spark
> reevaluating the choice of the interface for fdmap_newfd. I don't like
> overloading a parameter to be used as a flag and a value at the same time.
I can do a new syscall, no problem (I actually even slightly prefer). We
cannot break dup2() and F_DUPFD though, so we have to handle those too.
I was just trying to use Linus suggestion of using sus_dup2().
> The flags parameter will also allow to specify the additional
> functionality needed. For instance, by default descriptors allocated
> this way *should* appear in /proc/self/fd. You mentioned web servers
> which don't care about sequential allocation and are slowed down by the
> current strict allocation. Those should have the descriptor appear in
> /proc/self/fd. On the other hand, uses of the interfaces in, say,
> glibc or valgrind should create invisible descriptor. Well, descriptors
> visible in perhaps /proc/self/fd-private or so.
Ohh, my last work yesterday was changing procfs/base.c to have them show
up in there. We have quite a few flags available (31 or 63) to be
assicated with each fd, so I guess we could use one for that.
> BTW: those whose perfect knowledge of the English language, I guess
> non-sequential is better than unsequential, right? This would require
> renaming.
Yeah :D
> > repeat:
> > + if (files->fd_count >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
> > + return -EMFILE;
>
> I haven't studied the entire patch completely. So, help me understand
> this. ->fd_count is the exact count for the number of descriptors which
> is open. This would make sense.
Yes, that's the count of open fds.
> But I hope everybody realizes that this is a change in the ABI. So far
> RLIMIT_NOFILE specified the highest file descriptor number which could
> be returned by an open call etc.
>
> There is a fine difference. Even if yo have just a couple of
> descriptors open, you could not, for instance, dup2 to a descriptor
> higher than RLIMIT_NOFILE. With this patch it seems possible, even for
> processes not using non-sequential descriptors. This means programs
> written on new kernels might not run on older kernels.
>
> I don't say that this is unacceptable. It is a change. If it can be
> minimized this would be better but the new RLIMIT_NOFILE semantics is
> certainly also correct according to POSIX.
If you look a few lines below, there's also (this is inside the lagacy
fd allocator BTW):
if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
goto out;
So the POSIX behaviour does not change. You cannot dup2() to an fd higher
than RLIMIT_NOFILE (when using legacy fd allocation), *and* you cannot
open more than RLIMIT_NOFILE files.
This means that if you have RLIMIT_NOFILE==1000 and you have 999 files
open in the nonsequential area, you can only open one file in the legacy
area.
- Davide
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Davide Libenzi wrote:
>> I agree with Ingo, no need for a second magic value. Use the same value
>> as FD_UNSEQ_ALLOC which will just mean this exact value should never be
>> used as a file descriptor.
>
> I explained this in my answer to Ingo...
And if we have a new syscall we don't need any of that special dup2
behavior you describe. I really don't think this should be added. dup2
should just do what POSIX specifies, nothing more. I would even suggest
to not allow to dup2() to a descriptor > RLIMIT_NOFILE unless it is
already allocated. I.e., don't allow creating arbitrary high descriptors.
This behavior is completely consistent with the current implementation.
No bad surprises. In fact, it eliminates parts of the ABI
incompatibility I talked about.
> Random can be expensive. At the moment is FIFO. I'm missing though how
> this can be a security flaw, when the legacy one is exactly predictable.
It's not an added security issue. It would mean removing a possible
security the current file descriptor allocation has.
If randomizing each allocator is too expensive then randomize at the
very least the number of the first descriptor you give out.
> I can do a new syscall, no problem (I actually even slightly prefer). We
> cannot break dup2() and F_DUPFD though, so we have to handle those too.
There is no requirement to duplicate everything since non-sequential
descriptors are a new feature. Existing program will not be affected.
As said above, I think it is better if dup2() to a high descriptor which
does not exist is not allowed. You have to be able to dup2() over an
existing one but that's it.
For F_DUPFD I also don't see justification for an extension to
non-sequential descriptors. The concept of "greater than" makes not
much sense for non-sequential descriptors. I think I would be more than
happy to restrict the specified lower bound to RLIMIT_NOFILE as it is now.
If latter this turns out to be a big limitation we can still extend the
dup2/fcntl functionality. But removing functionality is harder.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGYxnb2ijCOnn/RHQRAkyoAJ4+L3W/aVYaQ/IG0HPm0tt5PKPcRACgoJYF
bChz20uicqD9tBbDtU1yruM=
=4vNd
-----END PGP SIGNATURE-----
On Sun, 3 Jun 2007, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Davide Libenzi wrote:
> >> I agree with Ingo, no need for a second magic value. Use the same value
> >> as FD_UNSEQ_ALLOC which will just mean this exact value should never be
> >> used as a file descriptor.
> >
> > I explained this in my answer to Ingo...
>
> And if we have a new syscall we don't need any of that special dup2
> behavior you describe. I really don't think this should be added. dup2
> should just do what POSIX specifies, nothing more. I would even suggest
> to not allow to dup2() to a descriptor > RLIMIT_NOFILE unless it is
> already allocated. I.e., don't allow creating arbitrary high descriptors.
>
> This behavior is completely consistent with the current implementation.
> No bad surprises. In fact, it eliminates parts of the ABI
> incompatibility I talked about.
Agreed, a new syscall looks less messy. I'll make sys_dup2() to allow
installing in the non-sequential area, only if there's an fd already
allocated. F_DUPFD will remain unchanged (that is, not allow
non-sequential fds allocations).
> > Random can be expensive. At the moment is FIFO. I'm missing though how
> > this can be a security flaw, when the legacy one is exactly predictable.
>
> It's not an added security issue. It would mean removing a possible
> security the current file descriptor allocation has.
>
> If randomizing each allocator is too expensive then randomize at the
> very least the number of the first descriptor you give out.
Can you tell me how this can be a problem, and in which way making a
random thing would help?
- Davide
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Davide Libenzi wrote:
>> If randomizing each allocator is too expensive then randomize at the
>> very least the number of the first descriptor you give out.
>
> Can you tell me how this can be a problem, and in which way making a
> random thing would help?
In attacking an application every bit of known data can be used in an
exploit. Be it something as simple as having a predetermined value at a
certain point in the program since it loaded a file descriptor into a
register.
But what I'm mostly thinking about is the case where I/O could be
redirected. The intruding program could call dup2() and suddenly the
program wanting to write a password to disk could be directed to send it
over a socket. One could imagine countless such attacks.
I don't say such an attack exists today. But this is no reason to not
implement these extra security measures. The cost of a randomized star
base (offset from 2^30) should be zero.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGYyid2ijCOnn/RHQRAjRoAJ9XsAazZtc9V3AxaPjiNMjK8jPUZgCdG/Eg
KPug5Sq9REHd6H3AR0ax2aU=
=9iUM
-----END PGP SIGNATURE-----
On Sun, 3 Jun 2007, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Davide Libenzi wrote:
> >> If randomizing each allocator is too expensive then randomize at the
> >> very least the number of the first descriptor you give out.
> >
> > Can you tell me how this can be a problem, and in which way making a
> > random thing would help?
>
> In attacking an application every bit of known data can be used in an
> exploit. Be it something as simple as having a predetermined value at a
> certain point in the program since it loaded a file descriptor into a
> register.
>
> But what I'm mostly thinking about is the case where I/O could be
> redirected. The intruding program could call dup2() and suddenly the
> program wanting to write a password to disk could be directed to send it
> over a socket. One could imagine countless such attacks.
>
> I don't say such an attack exists today. But this is no reason to not
> implement these extra security measures. The cost of a randomized star
> base (offset from 2^30) should be zero.
Randomizing the base is not a problem. Should this be always, or flag
driven?
- Davide
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Davide Libenzi wrote:
> Randomizing the base is not a problem. Should this be always, or flag
> driven?
I would say all the time. I don't think it's a problem with
reproducibility in any reasonable code.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGY0oq2ijCOnn/RHQRArmJAJ99Q0dLsPcnQQHjmSpLgv8va/F8/wCgyF06
57LVbqftsMC9x0/CzjWTgkw=
=WXM6
-----END PGP SIGNATURE-----
On Sun, 3 Jun 2007, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Davide Libenzi wrote:
> > Randomizing the base is not a problem. Should this be always, or flag
> > driven?
>
> I would say all the time. I don't think it's a problem with
> reproducibility in any reasonable code.
Agreed. That makes code simpler too.
- Davide
On Sun, 3 Jun 2007, Davide Libenzi wrote:
>
> Agreed, a new syscall looks less messy. I'll make sys_dup2() to allow
> installing in the non-sequential area, only if there's an fd already
> allocated. F_DUPFD will remain unchanged (that is, not allow
> non-sequential fds allocations).
I actually think that a new system call is _hugely_ messy. It means that
anybody who wants to use a new feature needs to have a new glibc. Not
nice, not nice at all. I'd much rather see a totally transparent extension
that doesn't require any new user space what-so-ever to be used...
Linus
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Linus Torvalds wrote:
> I actually think that a new system call is _hugely_ messy. It means that
> anybody who wants to use a new feature needs to have a new glibc.
Strange definition of "messy". Have you looked at the proposed dup2
extension? Magic value, semantics which don't match what the function
does today. That's messy.
The private descriptors are something new, programs have to be adjusted.
The open() extension will allow them to be used naturally without
additional interfaces. This leaves converting normal descriptors into
private ones. If the new syscall isn't yet available it's easy enough
to use the syscall() function. The parameters are simple integer so no
tricks needed.
I don't think the requirement for a new glibc should cloud the judgement
when it comes to cleanliness of the API. The new syscall wins there
clearly, I hope you agree. Even the dup2() approach requires program to
either be compiled with headers which have the definitions (i.e., a new
glibc) or programs have to carry their own definitions of the magic
constants and O_NONSEQ. I really cannot see any advantage of
misappropriating dup2.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)
iD8DBQFGY6GA2ijCOnn/RHQRAtQ1AJ4+dDIgeBKlggt4U+lyZnDfQJZ/VQCfRbG4
0FtGWqlLOWFlspMJ9S3xQ7E=
=jrG1
-----END PGP SIGNATURE-----