This patch provides an improved fdtable allocation scheme, useful for
expanding fdtable file descriptor entries. The main focus is on the fdarray,
as its memory usage grows 128 times faster than that of an fdset.
The allocation algorithm sizes the fdarray in such a way that its memory usage
increases in easy page-sized chunks. Additionally, it tries to account for the
optimal usage of the allocators involved: kmalloc() for sizes less than a
page, and vmalloc() with page granularity for sizes greater than a page.
Namely, the following sizes for the fdarray are considered, and the smallest
that accommodates the requested fd count is chosen:
pagesize / 4
pagesize / 2
pagesize <- memory allocator switch point
pagesize * 2
pagesize * 3
pagesize * 4
...etc...
Unlike the current implementation, this allocation scheme does not require a
loop to compute the optimal fdarray size, and can be done in straightline
code.
Furthermore, since the fdarray overflows the pagesize boundary long before any
of the fdsets do, it makes sense to optimize run-time by allocating both
fdsets
in a single swoop. Even together, they will still be, by far, smaller than the
fdarray.
As long as we're replacing the guts of fs/file.c, it makes sense to tidy up
the code. This work includes:
simplification via refactoring,
elimination of unnecessary code, and
extensive commenting throughout the entire file.
This is the last patch in the series. All the code should now be sparkly
clean.
Signed-off-by: Vadim Lobanov <[email protected]>
diff -Npru old/fs/file.c new/fs/file.c
--- old/fs/file.c 2006-09-28 20:13:13.000000000 -0700
+++ new/fs/file.c 2006-09-28 20:22:48.000000000 -0700
@@ -1,21 +1,18 @@
/*
* linux/fs/file.c
*
+ * Manage the dynamic fd arrays in the process files_struct.
* Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
*
- * Manage the dynamic fd arrays in the process files_struct.
+ * Pagesize-based fdarray/fdset allocation algorithm; major cleanups.
+ * Copyright (C) 2006, Vadim Lobanov
*/
#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/time.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
-#include <linux/bitops.h>
#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/rcupdate.h>
#include <linux/workqueue.h>
struct fdtable_defer {
@@ -26,120 +23,153 @@ struct fdtable_defer {
};
/*
- * We use this list to defer free fdtables that have vmalloced
- * sets/arrays. By keeping a per-cpu list, we avoid having to embed
- * the work_struct in fdtable itself which avoids a 64 byte (i386) increase
in
- * this per-task structure.
+ * We use this list to defer free fdtables that have vmalloced sets/arrays.
By
+ * keeping a per-cpu list, we avoid having to embed the work_struct in the
+ * fdtable itself.
*/
static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
-
-/*
- * Allocate an fd array, using kmalloc or vmalloc.
- * Note: the array isn't cleared at allocation time.
+/**
+ * alloc_fdmem - Allocate space for fdtable dynamic data.
+ * @size: Amount of memory, in bytes, required to hold the data.
*/
-struct file ** alloc_fd_array(int num)
+static inline void * alloc_fdmem(unsigned int size)
{
- struct file **new_fds;
- int size = num * sizeof(struct file *);
-
if (size <= PAGE_SIZE)
- new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
- else
- new_fds = (struct file **) vmalloc(size);
- return new_fds;
+ return kmalloc(size, GFP_KERNEL);
+ else
+ return vmalloc(size);
}
-void free_fd_array(struct file **array, int num)
+/**
+ * free_fdarr - Free the fdarray within the fdtable.
+ * @fdt: The containing fdtable.
+ */
+static inline void free_fdarr(struct fdtable *fdt)
{
- int size = num * sizeof(struct file *);
-
- if (!array) {
- printk (KERN_ERR "free_fd_array: array = 0 (num = %d)\n", num);
- return;
- }
-
- if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! */
- return;
- else if (size <= PAGE_SIZE)
- kfree(array);
+ if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
+ kfree(fdt->fd);
else
- vfree(array);
+ vfree(fdt->fd);
}
-static void __free_fdtable(struct fdtable *fdt)
+/**
+ * free_fdset - Free the fdsets within the fdtable.
+ * @fdt: The containing fdtable.
+ */
+static inline void free_fdset(struct fdtable *fdt)
{
- free_fdset(fdt->open_fds, fdt->max_fds);
- free_fdset(fdt->close_on_exec, fdt->max_fds);
- free_fd_array(fdt->fd, fdt->max_fds);
- kfree(fdt);
+ if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
+ kfree(fdt->open_fds);
+ else
+ vfree(fdt->open_fds);
}
+/**
+ * fdtable_timer - Reschedule deferred fdtable deletion work.
+ * @data: Typecast pointer to the relevant fdtable_defer structure.
+ */
static void fdtable_timer(unsigned long data)
{
- struct fdtable_defer *fddef = (struct fdtable_defer *)data;
+ struct fdtable_defer *fddef;
+ fddef = (struct fdtable_defer *)data;
spin_lock(&fddef->lock);
/*
- * If someone already emptied the queue return.
+ * If there are any fdtables scheduled for deletion, then try to
+ * schedule this work. If we could not schedule, then run this function
+ * again in a little while.
*/
- if (!fddef->next)
- goto out;
- if (!schedule_work(&fddef->wq))
- mod_timer(&fddef->timer, 5);
-out:
+ if (fddef->next)
+ if (!schedule_work(&fddef->wq))
+ mod_timer(&fddef->timer, 5);
spin_unlock(&fddef->lock);
}
-static void free_fdtable_work(struct fdtable_defer *f)
+/**
+ * free_fdtable_work - Free deferred fdtables.
+ * @fddef: Per-cpu area containing a list of deferred fdtables.
+ *
+ * Fdtable structures which contain member data obtained using vmalloc are
not
+ * freed immediately, but are instead deferred for the workqueue context. The
+ * workqueue uses this function to handle the deferred fdtables.
+ */
+static void free_fdtable_work(struct fdtable_defer *fddef)
{
struct fdtable *fdt;
- spin_lock_bh(&f->lock);
- fdt = f->next;
- f->next = NULL;
- spin_unlock_bh(&f->lock);
- while(fdt) {
- struct fdtable *next = fdt->next;
- __free_fdtable(fdt);
+ /*
+ * Grab a linked list of the deferred fdtables. We'll free those, so
+ * set the list as empty before continuing with the real work.
+ */
+ spin_lock_bh(&fddef->lock);
+ fdt = fddef->next;
+ fddef->next = NULL;
+ spin_unlock_bh(&fddef->lock);
+
+ while (fdt) {
+ struct fdtable *next;
+
+ next = fdt->next;
+ /*
+ * Since this fdtable was deferred, we know for a fact that the
+ * fdarray was obtained with vmalloc. The fdset is smaller,
+ * however, so we must check its size to know how to release
+ * it.
+ */
+ vfree(fdt->fd);
+ free_fdset(fdt);
+ kfree(fdt);
fdt = next;
}
}
+/**
+ * free_fdtable_rcu - Free an fdtable or its wrapper files_struct.
+ * @rcu: The RCU head structure embedded within the to-be-freed fdtable.
+ *
+ * In order to correctly free an fdtable that was in use by the system, this
+ * function should be invoked as an RCU callback on the target fdtable. It
must
+ * be used on non-embedded fdtables or embedded fdtables once the wrapper
+ * files_struct is to be discarded; it must not be used on embedded fdtables
+ * where the wrapper files_struct must persist.
+ */
void free_fdtable_rcu(struct rcu_head *rcu)
{
- struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
- int fdset_size, fdarray_size;
- struct fdtable_defer *fddef;
-
- BUG_ON(!fdt);
- fdset_size = fdt->max_fds / 8;
- fdarray_size = fdt->max_fds * sizeof(struct file *);
+ struct fdtable *fdt;
+ fdt = container_of(rcu, struct fdtable, rcu);
if (fdt->max_fds <= NR_OPEN_DEFAULT) {
/*
- * This fdtable is embedded in the files structure and that
- * structure itself is getting destroyed.
+ * This fdtable is embedded within a wrapper files_struct, and
+ * both are now expired. Free the container.
*/
kmem_cache_free(files_cachep,
container_of(fdt, struct files_struct, fdtab));
return;
}
- if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
- kfree(fdt->open_fds);
- kfree(fdt->close_on_exec);
+ if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
+ /*
+ * The fdarray was obtained with kmalloc, and since the fdset
+ * will always be smaller we know it was also obtained with
+ * kmalloc. Thus, we can dispose of the fdtable right now.
+ */
kfree(fdt->fd);
+ kfree(fdt->open_fds);
kfree(fdt);
} else {
+ struct fdtable_defer *fddef;
+
+ /*
+ * The fdset has at least one component obtained with vmalloc.
+ * Hence, we will handle deallocation from the workqueue
+ * context. If we are unable to schedule the work, then we set
+ * a timer to fire and reattempt to schedule later.
+ */
fddef = &get_cpu_var(fdtable_defer_list);
spin_lock(&fddef->lock);
fdt->next = fddef->next;
fddef->next = fdt;
- /*
- * vmallocs are handled from the workqueue context.
- * If the per-cpu workqueue is running, then we
- * defer work scheduling through a timer.
- */
if (!schedule_work(&fddef->wq))
mod_timer(&fddef->timer, 5);
spin_unlock(&fddef->lock);
@@ -147,197 +177,179 @@ void free_fdtable_rcu(struct rcu_head *r
}
}
-/*
- * Expand the fdset in the files_struct. Called with the files spinlock
- * held for write.
+/**
+ * copy_fdtable - Copy fdtable data.
+ * @nfdt: New fdtable to copy data to.
+ * @ofdt: Old fdtable to copy data from.
+ *
+ * Copy fdarray and fdset data from the old fdtable to the new fdtable. If
the
+ * new fdtable supports more file entries, then the extra high-order data
will
+ * be zeroed. The file_lock related to ofdt must be held for write.
*/
-static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
+static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
- int i;
- int count;
-
- BUG_ON(nfdt->max_fds < fdt->max_fds);
- /* Copy the existing tables and install the new pointers */
-
- i = fdt->max_fds / (sizeof(unsigned long) * 8);
- count = (nfdt->max_fds - fdt->max_fds) / 8;
+ unsigned int cpy, set;
+ BUG_ON(nfdt->max_fds < ofdt->max_fds);
/*
- * Don't copy the entire array if the current fdset is
- * not yet initialised.
+ * Don't copy or clear the data if we are creating a new fdtable for
+ * fork().
*/
- if (i) {
- memcpy (nfdt->open_fds, fdt->open_fds,
- fdt->max_fds/8);
- memcpy (nfdt->close_on_exec, fdt->close_on_exec,
- fdt->max_fds/8);
- memset (&nfdt->open_fds->fds_bits[i], 0, count);
- memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
- }
-
- /* Don't copy/clear the array if we are creating a new
- fd array for fork() */
- if (fdt->max_fds) {
- memcpy(nfdt->fd, fdt->fd,
- fdt->max_fds * sizeof(struct file *));
- /* clear the remainder of the array */
- memset(&nfdt->fd[fdt->max_fds], 0,
- (nfdt->max_fds - fdt->max_fds) *
- sizeof(struct file *));
- }
-}
-
-/*
- * Allocate an fdset array, using kmalloc or vmalloc.
- * Note: the array isn't cleared at allocation time.
- */
-fd_set * alloc_fdset(int num)
-{
- fd_set *new_fdset;
- int size = num / 8;
-
- if (size <= PAGE_SIZE)
- new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL);
- else
- new_fdset = (fd_set *) vmalloc(size);
- return new_fdset;
-}
-
-void free_fdset(fd_set *array, int num)
-{
- if (num <= NR_OPEN_DEFAULT) /* Don't free an embedded fdset */
+ if (ofdt->max_fds == 0)
return;
- else if (num <= 8 * PAGE_SIZE)
- kfree(array);
- else
- vfree(array);
-}
-static struct fdtable *alloc_fdtable(int nr)
+ /* Initialize the new fdarray. */
+ cpy = ofdt->max_fds * sizeof(struct file *);
+ set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
+ memcpy(nfdt->fd, ofdt->fd, cpy);
+ memset((char *)(nfdt->fd) + cpy, 0, set);
+
+ /* Initialize the new fdsets. */
+ cpy = ofdt->max_fds / BITS_PER_BYTE;
+ set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
+ memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
+ memset((char *)(nfdt->open_fds) + cpy, 0, set);
+ memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
+ memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
+}
+
+/**
+ * alloc_fdtable - Allocate an appropriately-sized fdtable.
+ * @nr: Requested fd index to be supported.
+ *
+ * Allocate and initialize a new fdtable. The fdtable must be able to support
+ * the requested file descriptor nr within its internal data structures.
+ *
+ * On success, the newly-created fdtable is returned. On allocation failure,
+ * NULL is returned.
+ */
+static struct fdtable * alloc_fdtable(unsigned int nr)
{
- struct fdtable *fdt = NULL;
- int nfds = 0;
- fd_set *new_openset = NULL, *new_execset = NULL;
- struct file **new_fds;
-
- fdt = kzalloc(sizeof(*fdt), GFP_KERNEL);
- if (!fdt)
- goto out;
+ struct fdtable *fdt;
+ char *data;
- nfds = NR_OPEN_DEFAULT;
/*
- * Expand to the max in easy steps, and keep expanding it until
- * we have enough for the requested fd array size.
+ * Figure out how many fds we actually want to support in this fdtable.
+ * Allocation steps are keyed to the size of the fdarray, since it
+ * grows far faster than any of the other dynamic data. We try to fit
+ * the fdarray into page-sized chunks: starting at a quarter of a page;
+ * growing exponentially at first and linearly once the page boundary
+ * is surpassed.
*/
- do {
-#if NR_OPEN_DEFAULT < 256
- if (nfds < 256)
- nfds = 256;
- else
-#endif
- if (nfds < (PAGE_SIZE / sizeof(struct file *)))
- nfds = PAGE_SIZE / sizeof(struct file *);
- else {
- nfds = nfds * 2;
- if (nfds > NR_OPEN)
- nfds = NR_OPEN;
- }
- } while (nfds <= nr);
-
- new_openset = alloc_fdset(nfds);
- new_execset = alloc_fdset(nfds);
- if (!new_openset || !new_execset)
- goto out;
- fdt->open_fds = new_openset;
- fdt->close_on_exec = new_execset;
+ nr /= (PAGE_SIZE / 4 / sizeof(struct file *));
+ if (nr > 1)
+ nr |= 3;
+ nr++;
+ nr *= (PAGE_SIZE / 4 / sizeof(struct file *));
- new_fds = alloc_fd_array(nfds);
- if (!new_fds)
+ /* Create the fdtable itself. */
+ fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+ if (!fdt)
goto out;
- fdt->fd = new_fds;
- fdt->max_fds = nfds;
+ fdt->max_fds = nr;
+ /* Allocate space for the fdarray. */
+ data = alloc_fdmem(nr * sizeof(struct file *));
+ if (!data)
+ goto out_fdt;
+ fdt->fd = (struct file **)data;
+ /* Allocate space for both fdsets together - open_fds is the base. */
+ data = alloc_fdmem(2 * nr / BITS_PER_BYTE);
+ if (!data)
+ goto out_arr;
+ fdt->open_fds = (fd_set *)data;
+ data += nr / BITS_PER_BYTE;
+ fdt->close_on_exec = (fd_set *)data;
+ /* Initialize the rest of the fdtable. */
+ INIT_RCU_HEAD(&fdt->rcu);
+ fdt->next = NULL;
+
return fdt;
-out:
- free_fdset(new_openset, nfds);
- free_fdset(new_execset, nfds);
+
+out_arr:
+ free_fdarr(fdt);
+out_fdt:
kfree(fdt);
+out:
return NULL;
}
-/*
- * Expand the file descriptor table.
- * This function will allocate a new fdtable and both fd array and fdset, of
- * the given size.
- * Return <0 error code on error; 1 on successful completion.
- * The files->file_lock should be held on entry, and will be held on exit.
+/**
+ * expand_files - Accommodate an fd index inside a files structure.
+ * @files: The files structure that must be sized.
+ * @nr: Requested fd index to be supported.
+ *
+ * Make sure that the given files structure can accommodate the provided fd
+ * index within its associated fdtable. If the requested index exceeds the
+ * current capacity and there is room for expansion, a larger fdtable will be
+ * created and installed. The files->file_lock should be held on entry, and
+ * will be held on exit.
+ *
+ * If the current fdtable is sufficient, 0 is returned. If the fdtable was
+ * expanded and execution may have blocked, 1 is returned. On an error
+ * condition, a negative error code is returned.
*/
-static int expand_fdtable(struct files_struct *files, int nr)
+int expand_files(struct files_struct *files, int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{
- struct fdtable *new_fdt, *cur_fdt;
+ struct fdtable *cur_fdt, *new_fdt;
+
+ cur_fdt = files_fdtable(files);
+ /* Do we need to expand? */
+ if (nr < cur_fdt->max_fds)
+ return 0;
+ /* Are we allowed to expand? */
+ if (nr >= NR_OPEN)
+ return -EMFILE;
+ /* Allocate a larger fdtable outside the lock. */
spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr);
spin_lock(&files->file_lock);
if (!new_fdt)
return -ENOMEM;
+ cur_fdt = files_fdtable(files);
/*
- * Check again since another task may have expanded the fd table while
- * we dropped the lock
+ * Check the sizes again since another task may have expanded the
+ * fdtable while we dropped the lock.
*/
- cur_fdt = files_fdtable(files);
- if (nr >= cur_fdt->max_fds) {
- /* Continue as planned */
+ if (nr < cur_fdt->max_fds) {
+ /* Somebody else expanded, so undo our allocation attempt. */
+ free_fdarr(new_fdt);
+ free_fdset(new_fdt);
+ kfree(new_fdt);
+ } else {
+ /* All good. Install the new fdtable and retire the old one. */
copy_fdtable(new_fdt, cur_fdt);
rcu_assign_pointer(files->fdt, new_fdt);
if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
- } else {
- /* Somebody else expanded, so undo our attempt */
- __free_fdtable(new_fdt);
}
return 1;
}
-/*
- * Expand files.
- * This function will expand the file structures, if the requested size
exceeds
- * the current capacity and there is room for expansion.
- * Return <0 error code on error; 0 when nothing done; 1 when files were
- * expanded and execution may have blocked.
- * The files->file_lock should be held on entry, and will be held on exit.
+/**
+ * fdtable_defer_list_init - Initialize the per-cpu fdtable defer list.
+ * @cpu: The cpu for which the defer list should be initialized.
*/
-int expand_files(struct files_struct *files, int nr)
-{
- struct fdtable *fdt;
-
- fdt = files_fdtable(files);
- /* Do we need to expand? */
- if (nr < fdt->max_fds)
- return 0;
- /* Can we expand? */
- if (nr >= NR_OPEN)
- return -EMFILE;
-
- /* All good, so we try */
- return expand_fdtable(files, nr);
-}
-
static void __devinit fdtable_defer_list_init(int cpu)
{
- struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
+ struct fdtable_defer *fddef;
+
+ fddef = &per_cpu(fdtable_defer_list, cpu);
spin_lock_init(&fddef->lock);
INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef);
- init_timer(&fddef->timer);
- fddef->timer.data = (unsigned long)fddef;
- fddef->timer.function = fdtable_timer;
+ setup_timer(&fddef->timer, fdtable_timer, (unsigned long)fddef);
fddef->next = NULL;
}
+/**
+ * files_defer_init - Initialize the fdtable defer lists.
+ */
void __init files_defer_init(void)
{
int i;
+
for_each_possible_cpu(i)
fdtable_defer_list_init(i);
}
diff -Npru old/include/linux/file.h new/include/linux/file.h
--- old/include/linux/file.h 2006-09-28 20:13:13.000000000 -0700
+++ new/include/linux/file.h 2006-09-28 20:22:05.000000000 -0700
@@ -29,8 +29,8 @@ struct embedded_fd_set {
struct fdtable {
unsigned int max_fds;
struct file ** fd; /* current fd array */
- fd_set *close_on_exec;
fd_set *open_fds;
+ fd_set *close_on_exec;
struct rcu_head rcu;
struct fdtable *next;
};
@@ -74,12 +74,6 @@ extern int get_unused_fd(void);
extern void FASTCALL(put_unused_fd(unsigned int fd));
struct kmem_cache;
-extern struct file ** alloc_fd_array(int);
-extern void free_fd_array(struct file **, int);
-
-extern fd_set *alloc_fdset(int);
-extern void free_fdset(fd_set *, int);
-
extern int expand_files(struct files_struct *, int nr);
extern void free_fdtable_rcu(struct rcu_head *rcu);
extern void __init files_defer_init(void);
On Sunday 01 October 2006 23:14, Vadim Lobanov wrote:
> This patch provides an improved fdtable allocation scheme, useful for
> expanding fdtable file descriptor entries. The main focus is on the
> fdarray, as its memory usage grows 128 times faster than that of an fdset.
>
> The allocation algorithm sizes the fdarray in such a way that its memory
> usage increases in easy page-sized chunks. Additionally, it tries to
> account for the optimal usage of the allocators involved: kmalloc() for
> sizes less than a page, and vmalloc() with page granularity for sizes
> greater than a page. Namely, the following sizes for the fdarray are
> considered, and the smallest that accommodates the requested fd count is
> chosen:
> pagesize / 4
> pagesize / 2
> pagesize <- memory allocator switch point
> pagesize * 2
> pagesize * 3
> pagesize * 4
> ...etc...
> Unlike the current implementation, this allocation scheme does not require
> a loop to compute the optimal fdarray size, and can be done in straightline
> code.
>
> Furthermore, since the fdarray overflows the pagesize boundary long before
> any of the fdsets do, it makes sense to optimize run-time by allocating
> both fdsets
> in a single swoop. Even together, they will still be, by far, smaller than
> the fdarray.
>
> As long as we're replacing the guts of fs/file.c, it makes sense to tidy up
> the code. This work includes:
> simplification via refactoring,
> elimination of unnecessary code, and
> extensive commenting throughout the entire file.
> This is the last patch in the series. All the code should now be sparkly
> clean.
>
Vadim, I think your patch is way too complex, and some changes are dubious.
You mix cleanups and changes in the same patch, making hard to match your
patch description and its content.
Current scheme is to allocate power of two sizes, and not 'the smallest that
accommodates the requested fd count'. This is for a good reason, because we
don't want to call vmalloc()/vfree() each time a process opens 512 or 1024
more files (x86_64 or ia32)
I personally prefer that table grows by a two factor, especially when they
are huge. Also, power of two sizes gives less vmalloc space fragmentation
(might be a concern for some people that are LOWMEM tight and that reduce
VMALLOC space to get more LOWMEM)
default __VMALLOC_RESERVE on i386 is 128Mo, but I have some servers where I
use
vmalloc=16M just to give more LOWMEM for kernel use.
diff -Npru old/include/linux/file.h new/include/linux/file.h
--- old/include/linux/file.h????2006-09-28 20:13:13.000000000 -0700
+++ new/include/linux/file.h????2006-09-28 20:22:05.000000000 -0700
@@ -29,8 +29,8 @@ struct embedded_fd_set {
?struct fdtable {
????????unsigned int max_fds;
????????struct file ** fd; ? ? ?/* current fd array */
-???????fd_set *close_on_exec;
????????fd_set *open_fds;
+???????fd_set *close_on_exec;
????????struct rcu_head rcu;
????????struct fdtable *next;
?};
Whats the reason for moving this close_on_exec definition in struct fdtable ?
Eric
Vadim Lobanov <[email protected]> writes:
>
> The allocation algorithm sizes the fdarray in such a way that its memory usage
> increases in easy page-sized chunks. Additionally, it tries to account for the
> optimal usage of the allocators involved: kmalloc() for sizes less than a
> page, and vmalloc() with page granularity for sizes greater than a page.
Best would be to avoid vmalloc() completely because it can be quite
costly
-Andi
> Namely, the following sizes for the fdarray are considered, and the smallest
> that accommodates the requested fd count is chosen:
> pagesize / 4
> pagesize / 2
> pagesize <- memory allocator switch point
> pagesize * 2
> pagesize * 3
> pagesize * 4
> ...etc...
> Unlike the current implementation, this allocation scheme does not require a
> loop to compute the optimal fdarray size, and can be done in straightline
> code.
>
> Furthermore, since the fdarray overflows the pagesize boundary long before any
> of the fdsets do, it makes sense to optimize run-time by allocating both
> fdsets
> in a single swoop. Even together, they will still be, by far, smaller than the
> fdarray.
>
> As long as we're replacing the guts of fs/file.c, it makes sense to tidy up
> the code. This work includes:
> simplification via refactoring,
> elimination of unnecessary code, and
> extensive commenting throughout the entire file.
> This is the last patch in the series. All the code should now be sparkly
> clean.
>
> Signed-off-by: Vadim Lobanov <[email protected]>
>
> diff -Npru old/fs/file.c new/fs/file.c
> --- old/fs/file.c 2006-09-28 20:13:13.000000000 -0700
> +++ new/fs/file.c 2006-09-28 20:22:48.000000000 -0700
> @@ -1,21 +1,18 @@
> /*
> * linux/fs/file.c
> *
> + * Manage the dynamic fd arrays in the process files_struct.
> * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
> *
> - * Manage the dynamic fd arrays in the process files_struct.
> + * Pagesize-based fdarray/fdset allocation algorithm; major cleanups.
> + * Copyright (C) 2006, Vadim Lobanov
> */
>
> #include <linux/fs.h>
> -#include <linux/mm.h>
> -#include <linux/time.h>
> #include <linux/slab.h>
> #include <linux/vmalloc.h>
> #include <linux/file.h>
> -#include <linux/bitops.h>
> #include <linux/interrupt.h>
> -#include <linux/spinlock.h>
> -#include <linux/rcupdate.h>
> #include <linux/workqueue.h>
>
> struct fdtable_defer {
> @@ -26,120 +23,153 @@ struct fdtable_defer {
> };
>
> /*
> - * We use this list to defer free fdtables that have vmalloced
> - * sets/arrays. By keeping a per-cpu list, we avoid having to embed
> - * the work_struct in fdtable itself which avoids a 64 byte (i386) increase
> in
> - * this per-task structure.
> + * We use this list to defer free fdtables that have vmalloced sets/arrays.
> By
> + * keeping a per-cpu list, we avoid having to embed the work_struct in the
> + * fdtable itself.
> */
> static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
>
> -
> -/*
> - * Allocate an fd array, using kmalloc or vmalloc.
> - * Note: the array isn't cleared at allocation time.
> +/**
> + * alloc_fdmem - Allocate space for fdtable dynamic data.
> + * @size: Amount of memory, in bytes, required to hold the data.
> */
> -struct file ** alloc_fd_array(int num)
> +static inline void * alloc_fdmem(unsigned int size)
> {
> - struct file **new_fds;
> - int size = num * sizeof(struct file *);
> -
> if (size <= PAGE_SIZE)
> - new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
> - else
> - new_fds = (struct file **) vmalloc(size);
> - return new_fds;
> + return kmalloc(size, GFP_KERNEL);
> + else
> + return vmalloc(size);
> }
>
> -void free_fd_array(struct file **array, int num)
> +/**
> + * free_fdarr - Free the fdarray within the fdtable.
> + * @fdt: The containing fdtable.
> + */
> +static inline void free_fdarr(struct fdtable *fdt)
> {
> - int size = num * sizeof(struct file *);
> -
> - if (!array) {
> - printk (KERN_ERR "free_fd_array: array = 0 (num = %d)\n", num);
> - return;
> - }
> -
> - if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! */
> - return;
> - else if (size <= PAGE_SIZE)
> - kfree(array);
> + if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
> + kfree(fdt->fd);
> else
> - vfree(array);
> + vfree(fdt->fd);
> }
>
> -static void __free_fdtable(struct fdtable *fdt)
> +/**
> + * free_fdset - Free the fdsets within the fdtable.
> + * @fdt: The containing fdtable.
> + */
> +static inline void free_fdset(struct fdtable *fdt)
> {
> - free_fdset(fdt->open_fds, fdt->max_fds);
> - free_fdset(fdt->close_on_exec, fdt->max_fds);
> - free_fd_array(fdt->fd, fdt->max_fds);
> - kfree(fdt);
> + if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
> + kfree(fdt->open_fds);
> + else
> + vfree(fdt->open_fds);
> }
>
> +/**
> + * fdtable_timer - Reschedule deferred fdtable deletion work.
> + * @data: Typecast pointer to the relevant fdtable_defer structure.
> + */
> static void fdtable_timer(unsigned long data)
> {
> - struct fdtable_defer *fddef = (struct fdtable_defer *)data;
> + struct fdtable_defer *fddef;
>
> + fddef = (struct fdtable_defer *)data;
> spin_lock(&fddef->lock);
> /*
> - * If someone already emptied the queue return.
> + * If there are any fdtables scheduled for deletion, then try to
> + * schedule this work. If we could not schedule, then run this function
> + * again in a little while.
> */
> - if (!fddef->next)
> - goto out;
> - if (!schedule_work(&fddef->wq))
> - mod_timer(&fddef->timer, 5);
> -out:
> + if (fddef->next)
> + if (!schedule_work(&fddef->wq))
> + mod_timer(&fddef->timer, 5);
> spin_unlock(&fddef->lock);
> }
>
> -static void free_fdtable_work(struct fdtable_defer *f)
> +/**
> + * free_fdtable_work - Free deferred fdtables.
> + * @fddef: Per-cpu area containing a list of deferred fdtables.
> + *
> + * Fdtable structures which contain member data obtained using vmalloc are
> not
> + * freed immediately, but are instead deferred for the workqueue context. The
> + * workqueue uses this function to handle the deferred fdtables.
> + */
> +static void free_fdtable_work(struct fdtable_defer *fddef)
> {
> struct fdtable *fdt;
>
> - spin_lock_bh(&f->lock);
> - fdt = f->next;
> - f->next = NULL;
> - spin_unlock_bh(&f->lock);
> - while(fdt) {
> - struct fdtable *next = fdt->next;
> - __free_fdtable(fdt);
> + /*
> + * Grab a linked list of the deferred fdtables. We'll free those, so
> + * set the list as empty before continuing with the real work.
> + */
> + spin_lock_bh(&fddef->lock);
> + fdt = fddef->next;
> + fddef->next = NULL;
> + spin_unlock_bh(&fddef->lock);
> +
> + while (fdt) {
> + struct fdtable *next;
> +
> + next = fdt->next;
> + /*
> + * Since this fdtable was deferred, we know for a fact that the
> + * fdarray was obtained with vmalloc. The fdset is smaller,
> + * however, so we must check its size to know how to release
> + * it.
> + */
> + vfree(fdt->fd);
> + free_fdset(fdt);
> + kfree(fdt);
> fdt = next;
> }
> }
>
> +/**
> + * free_fdtable_rcu - Free an fdtable or its wrapper files_struct.
> + * @rcu: The RCU head structure embedded within the to-be-freed fdtable.
> + *
> + * In order to correctly free an fdtable that was in use by the system, this
> + * function should be invoked as an RCU callback on the target fdtable. It
> must
> + * be used on non-embedded fdtables or embedded fdtables once the wrapper
> + * files_struct is to be discarded; it must not be used on embedded fdtables
> + * where the wrapper files_struct must persist.
> + */
> void free_fdtable_rcu(struct rcu_head *rcu)
> {
> - struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
> - int fdset_size, fdarray_size;
> - struct fdtable_defer *fddef;
> -
> - BUG_ON(!fdt);
> - fdset_size = fdt->max_fds / 8;
> - fdarray_size = fdt->max_fds * sizeof(struct file *);
> + struct fdtable *fdt;
>
> + fdt = container_of(rcu, struct fdtable, rcu);
> if (fdt->max_fds <= NR_OPEN_DEFAULT) {
> /*
> - * This fdtable is embedded in the files structure and that
> - * structure itself is getting destroyed.
> + * This fdtable is embedded within a wrapper files_struct, and
> + * both are now expired. Free the container.
> */
> kmem_cache_free(files_cachep,
> container_of(fdt, struct files_struct, fdtab));
> return;
> }
> - if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
> - kfree(fdt->open_fds);
> - kfree(fdt->close_on_exec);
> + if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
> + /*
> + * The fdarray was obtained with kmalloc, and since the fdset
> + * will always be smaller we know it was also obtained with
> + * kmalloc. Thus, we can dispose of the fdtable right now.
> + */
> kfree(fdt->fd);
> + kfree(fdt->open_fds);
> kfree(fdt);
> } else {
> + struct fdtable_defer *fddef;
> +
> + /*
> + * The fdset has at least one component obtained with vmalloc.
> + * Hence, we will handle deallocation from the workqueue
> + * context. If we are unable to schedule the work, then we set
> + * a timer to fire and reattempt to schedule later.
> + */
> fddef = &get_cpu_var(fdtable_defer_list);
> spin_lock(&fddef->lock);
> fdt->next = fddef->next;
> fddef->next = fdt;
> - /*
> - * vmallocs are handled from the workqueue context.
> - * If the per-cpu workqueue is running, then we
> - * defer work scheduling through a timer.
> - */
> if (!schedule_work(&fddef->wq))
> mod_timer(&fddef->timer, 5);
> spin_unlock(&fddef->lock);
> @@ -147,197 +177,179 @@ void free_fdtable_rcu(struct rcu_head *r
> }
> }
>
> -/*
> - * Expand the fdset in the files_struct. Called with the files spinlock
> - * held for write.
> +/**
> + * copy_fdtable - Copy fdtable data.
> + * @nfdt: New fdtable to copy data to.
> + * @ofdt: Old fdtable to copy data from.
> + *
> + * Copy fdarray and fdset data from the old fdtable to the new fdtable. If
> the
> + * new fdtable supports more file entries, then the extra high-order data
> will
> + * be zeroed. The file_lock related to ofdt must be held for write.
> */
> -static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
> +static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
> {
> - int i;
> - int count;
> -
> - BUG_ON(nfdt->max_fds < fdt->max_fds);
> - /* Copy the existing tables and install the new pointers */
> -
> - i = fdt->max_fds / (sizeof(unsigned long) * 8);
> - count = (nfdt->max_fds - fdt->max_fds) / 8;
> + unsigned int cpy, set;
>
> + BUG_ON(nfdt->max_fds < ofdt->max_fds);
> /*
> - * Don't copy the entire array if the current fdset is
> - * not yet initialised.
> + * Don't copy or clear the data if we are creating a new fdtable for
> + * fork().
> */
> - if (i) {
> - memcpy (nfdt->open_fds, fdt->open_fds,
> - fdt->max_fds/8);
> - memcpy (nfdt->close_on_exec, fdt->close_on_exec,
> - fdt->max_fds/8);
> - memset (&nfdt->open_fds->fds_bits[i], 0, count);
> - memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
> - }
> -
> - /* Don't copy/clear the array if we are creating a new
> - fd array for fork() */
> - if (fdt->max_fds) {
> - memcpy(nfdt->fd, fdt->fd,
> - fdt->max_fds * sizeof(struct file *));
> - /* clear the remainder of the array */
> - memset(&nfdt->fd[fdt->max_fds], 0,
> - (nfdt->max_fds - fdt->max_fds) *
> - sizeof(struct file *));
> - }
> -}
> -
> -/*
> - * Allocate an fdset array, using kmalloc or vmalloc.
> - * Note: the array isn't cleared at allocation time.
> - */
> -fd_set * alloc_fdset(int num)
> -{
> - fd_set *new_fdset;
> - int size = num / 8;
> -
> - if (size <= PAGE_SIZE)
> - new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL);
> - else
> - new_fdset = (fd_set *) vmalloc(size);
> - return new_fdset;
> -}
> -
> -void free_fdset(fd_set *array, int num)
> -{
> - if (num <= NR_OPEN_DEFAULT) /* Don't free an embedded fdset */
> + if (ofdt->max_fds == 0)
> return;
> - else if (num <= 8 * PAGE_SIZE)
> - kfree(array);
> - else
> - vfree(array);
> -}
>
> -static struct fdtable *alloc_fdtable(int nr)
> + /* Initialize the new fdarray. */
> + cpy = ofdt->max_fds * sizeof(struct file *);
> + set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
> + memcpy(nfdt->fd, ofdt->fd, cpy);
> + memset((char *)(nfdt->fd) + cpy, 0, set);
> +
> + /* Initialize the new fdsets. */
> + cpy = ofdt->max_fds / BITS_PER_BYTE;
> + set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
> + memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
> + memset((char *)(nfdt->open_fds) + cpy, 0, set);
> + memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
> + memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
> +}
> +
> +/**
> + * alloc_fdtable - Allocate an appropriately-sized fdtable.
> + * @nr: Requested fd index to be supported.
> + *
> + * Allocate and initialize a new fdtable. The fdtable must be able to support
> + * the requested file descriptor nr within its internal data structures.
> + *
> + * On success, the newly-created fdtable is returned. On allocation failure,
> + * NULL is returned.
> + */
> +static struct fdtable * alloc_fdtable(unsigned int nr)
> {
> - struct fdtable *fdt = NULL;
> - int nfds = 0;
> - fd_set *new_openset = NULL, *new_execset = NULL;
> - struct file **new_fds;
> -
> - fdt = kzalloc(sizeof(*fdt), GFP_KERNEL);
> - if (!fdt)
> - goto out;
> + struct fdtable *fdt;
> + char *data;
>
> - nfds = NR_OPEN_DEFAULT;
> /*
> - * Expand to the max in easy steps, and keep expanding it until
> - * we have enough for the requested fd array size.
> + * Figure out how many fds we actually want to support in this fdtable.
> + * Allocation steps are keyed to the size of the fdarray, since it
> + * grows far faster than any of the other dynamic data. We try to fit
> + * the fdarray into page-sized chunks: starting at a quarter of a page;
> + * growing exponentially at first and linearly once the page boundary
> + * is surpassed.
> */
> - do {
> -#if NR_OPEN_DEFAULT < 256
> - if (nfds < 256)
> - nfds = 256;
> - else
> -#endif
> - if (nfds < (PAGE_SIZE / sizeof(struct file *)))
> - nfds = PAGE_SIZE / sizeof(struct file *);
> - else {
> - nfds = nfds * 2;
> - if (nfds > NR_OPEN)
> - nfds = NR_OPEN;
> - }
> - } while (nfds <= nr);
> -
> - new_openset = alloc_fdset(nfds);
> - new_execset = alloc_fdset(nfds);
> - if (!new_openset || !new_execset)
> - goto out;
> - fdt->open_fds = new_openset;
> - fdt->close_on_exec = new_execset;
> + nr /= (PAGE_SIZE / 4 / sizeof(struct file *));
> + if (nr > 1)
> + nr |= 3;
> + nr++;
> + nr *= (PAGE_SIZE / 4 / sizeof(struct file *));
>
> - new_fds = alloc_fd_array(nfds);
> - if (!new_fds)
> + /* Create the fdtable itself. */
> + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
> + if (!fdt)
> goto out;
> - fdt->fd = new_fds;
> - fdt->max_fds = nfds;
> + fdt->max_fds = nr;
> + /* Allocate space for the fdarray. */
> + data = alloc_fdmem(nr * sizeof(struct file *));
> + if (!data)
> + goto out_fdt;
> + fdt->fd = (struct file **)data;
> + /* Allocate space for both fdsets together - open_fds is the base. */
> + data = alloc_fdmem(2 * nr / BITS_PER_BYTE);
> + if (!data)
> + goto out_arr;
> + fdt->open_fds = (fd_set *)data;
> + data += nr / BITS_PER_BYTE;
> + fdt->close_on_exec = (fd_set *)data;
> + /* Initialize the rest of the fdtable. */
> + INIT_RCU_HEAD(&fdt->rcu);
> + fdt->next = NULL;
> +
> return fdt;
> -out:
> - free_fdset(new_openset, nfds);
> - free_fdset(new_execset, nfds);
> +
> +out_arr:
> + free_fdarr(fdt);
> +out_fdt:
> kfree(fdt);
> +out:
> return NULL;
> }
>
> -/*
> - * Expand the file descriptor table.
> - * This function will allocate a new fdtable and both fd array and fdset, of
> - * the given size.
> - * Return <0 error code on error; 1 on successful completion.
> - * The files->file_lock should be held on entry, and will be held on exit.
> +/**
> + * expand_files - Accommodate an fd index inside a files structure.
> + * @files: The files structure that must be sized.
> + * @nr: Requested fd index to be supported.
> + *
> + * Make sure that the given files structure can accommodate the provided fd
> + * index within its associated fdtable. If the requested index exceeds the
> + * current capacity and there is room for expansion, a larger fdtable will be
> + * created and installed. The files->file_lock should be held on entry, and
> + * will be held on exit.
> + *
> + * If the current fdtable is sufficient, 0 is returned. If the fdtable was
> + * expanded and execution may have blocked, 1 is returned. On an error
> + * condition, a negative error code is returned.
> */
> -static int expand_fdtable(struct files_struct *files, int nr)
> +int expand_files(struct files_struct *files, int nr)
> __releases(files->file_lock)
> __acquires(files->file_lock)
> {
> - struct fdtable *new_fdt, *cur_fdt;
> + struct fdtable *cur_fdt, *new_fdt;
> +
> + cur_fdt = files_fdtable(files);
> + /* Do we need to expand? */
> + if (nr < cur_fdt->max_fds)
> + return 0;
> + /* Are we allowed to expand? */
> + if (nr >= NR_OPEN)
> + return -EMFILE;
>
> + /* Allocate a larger fdtable outside the lock. */
> spin_unlock(&files->file_lock);
> new_fdt = alloc_fdtable(nr);
> spin_lock(&files->file_lock);
> if (!new_fdt)
> return -ENOMEM;
> + cur_fdt = files_fdtable(files);
> /*
> - * Check again since another task may have expanded the fd table while
> - * we dropped the lock
> + * Check the sizes again since another task may have expanded the
> + * fdtable while we dropped the lock.
> */
> - cur_fdt = files_fdtable(files);
> - if (nr >= cur_fdt->max_fds) {
> - /* Continue as planned */
> + if (nr < cur_fdt->max_fds) {
> + /* Somebody else expanded, so undo our allocation attempt. */
> + free_fdarr(new_fdt);
> + free_fdset(new_fdt);
> + kfree(new_fdt);
> + } else {
> + /* All good. Install the new fdtable and retire the old one. */
> copy_fdtable(new_fdt, cur_fdt);
> rcu_assign_pointer(files->fdt, new_fdt);
> if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
> call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> - } else {
> - /* Somebody else expanded, so undo our attempt */
> - __free_fdtable(new_fdt);
> }
> return 1;
> }
>
> -/*
> - * Expand files.
> - * This function will expand the file structures, if the requested size
> exceeds
> - * the current capacity and there is room for expansion.
> - * Return <0 error code on error; 0 when nothing done; 1 when files were
> - * expanded and execution may have blocked.
> - * The files->file_lock should be held on entry, and will be held on exit.
> +/**
> + * fdtable_defer_list_init - Initialize the per-cpu fdtable defer list.
> + * @cpu: The cpu for which the defer list should be initialized.
> */
> -int expand_files(struct files_struct *files, int nr)
> -{
> - struct fdtable *fdt;
> -
> - fdt = files_fdtable(files);
> - /* Do we need to expand? */
> - if (nr < fdt->max_fds)
> - return 0;
> - /* Can we expand? */
> - if (nr >= NR_OPEN)
> - return -EMFILE;
> -
> - /* All good, so we try */
> - return expand_fdtable(files, nr);
> -}
> -
> static void __devinit fdtable_defer_list_init(int cpu)
> {
> - struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
> + struct fdtable_defer *fddef;
> +
> + fddef = &per_cpu(fdtable_defer_list, cpu);
> spin_lock_init(&fddef->lock);
> INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef);
> - init_timer(&fddef->timer);
> - fddef->timer.data = (unsigned long)fddef;
> - fddef->timer.function = fdtable_timer;
> + setup_timer(&fddef->timer, fdtable_timer, (unsigned long)fddef);
> fddef->next = NULL;
> }
>
> +/**
> + * files_defer_init - Initialize the fdtable defer lists.
> + */
> void __init files_defer_init(void)
> {
> int i;
> +
> for_each_possible_cpu(i)
> fdtable_defer_list_init(i);
> }
> diff -Npru old/include/linux/file.h new/include/linux/file.h
> --- old/include/linux/file.h 2006-09-28 20:13:13.000000000 -0700
> +++ new/include/linux/file.h 2006-09-28 20:22:05.000000000 -0700
> @@ -29,8 +29,8 @@ struct embedded_fd_set {
> struct fdtable {
> unsigned int max_fds;
> struct file ** fd; /* current fd array */
> - fd_set *close_on_exec;
> fd_set *open_fds;
> + fd_set *close_on_exec;
> struct rcu_head rcu;
> struct fdtable *next;
> };
> @@ -74,12 +74,6 @@ extern int get_unused_fd(void);
> extern void FASTCALL(put_unused_fd(unsigned int fd));
> struct kmem_cache;
>
> -extern struct file ** alloc_fd_array(int);
On Monday 02 October 2006 01:52, Eric Dumazet wrote:
> On Sunday 01 October 2006 23:14, Vadim Lobanov wrote:
> > This patch provides an improved fdtable allocation scheme, useful for
> > expanding fdtable file descriptor entries. The main focus is on the
> > fdarray, as its memory usage grows 128 times faster than that of an
> > fdset.
> >
> > The allocation algorithm sizes the fdarray in such a way that its memory
> > usage increases in easy page-sized chunks. Additionally, it tries to
> > account for the optimal usage of the allocators involved: kmalloc() for
> > sizes less than a page, and vmalloc() with page granularity for sizes
> > greater than a page. Namely, the following sizes for the fdarray are
> > considered, and the smallest that accommodates the requested fd count is
> > chosen:
> > pagesize / 4
> > pagesize / 2
> > pagesize <- memory allocator switch point
> > pagesize * 2
> > pagesize * 3
> > pagesize * 4
> > ...etc...
> > Unlike the current implementation, this allocation scheme does not
> > require a loop to compute the optimal fdarray size, and can be done in
> > straightline code.
> >
> > Furthermore, since the fdarray overflows the pagesize boundary long
> > before any of the fdsets do, it makes sense to optimize run-time by
> > allocating both fdsets
> > in a single swoop. Even together, they will still be, by far, smaller
> > than the fdarray.
> >
> > As long as we're replacing the guts of fs/file.c, it makes sense to tidy
> > up the code. This work includes:
> > simplification via refactoring,
> > elimination of unnecessary code, and
> > extensive commenting throughout the entire file.
> > This is the last patch in the series. All the code should now be sparkly
> > clean.
>
> Vadim, I think your patch is way too complex, and some changes are dubious.
> You mix cleanups and changes in the same patch, making hard to match your
> patch description and its content.
Sorry; it was simply easier to roll all the really intrusive fs/file.c changes
into a single patch, rather than making several smaller but equally-intrusive
patches in a row. It was a timesaver for me, and I thought it would
ultimately be a timersaver for those trying to follow the changes as well. If
necessary, I can always split up the last patch in the series into multiple
patches -- no problems there. :) Andrew, any preference in this particular
case?
> Current scheme is to allocate power of two sizes, and not 'the smallest
> that accommodates the requested fd count'. This is for a good reason,
> because we don't want to call vmalloc()/vfree() each time a process opens
> 512 or 1024 more files (x86_64 or ia32)
Yep, that is most definitely a consideration. I was balancing it against the
fact that, when the table becomes big, growing it by a power of two
regardless of the size results in massive memory usage deltas. The worry here
is that an application may likely cause the table to grow by a huge amount,
due to the power-of-two increase, and then actually use only a modest number
of further fds, wasting the rest of the allocated table memory.
Which applications open so many file handles so quickly? Do they actually need
the amortized power-of-two table area increase? In those cases, would the
actual process of opening these files take more time than growing the table
in fixed-size steps? Or at least outweigh it enough that it would be more
preferable to try to reduce memory waste instead of improve file open time?
> I personally prefer that table grows by a two factor, especially when they
> are huge. Also, power of two sizes gives less vmalloc space fragmentation
> (might be a concern for some people that are LOWMEM tight and that reduce
> VMALLOC space to get more LOWMEM)
> default __VMALLOC_RESERVE on i386 is 128Mo, but I have some servers where I
> use
> vmalloc=16M just to give more LOWMEM for kernel use.
Is it really true that it will create less fragmentation? It seems to me that
this will only be the true if most of the other heavy users of vmalloc also
tried to use power-of-two allocation sizes.
What do you think of Andi Kleen's follow-up suggestion about eliminating
vmalloc use altogether?
>
> diff -Npru old/include/linux/file.h new/include/linux/file.h
> --- old/include/linux/file.h????2006-09-28 20:13:13.000000000 -0700
> +++ new/include/linux/file.h????2006-09-28 20:22:05.000000000 -0700
> @@ -29,8 +29,8 @@ struct embedded_fd_set {
> ?struct fdtable {
> ????????unsigned int max_fds;
> ????????struct file ** fd; ? ? ?/* current fd array */
> -???????fd_set *close_on_exec;
> ????????fd_set *open_fds;
> +???????fd_set *close_on_exec;
> ????????struct rcu_head rcu;
> ????????struct fdtable *next;
> ?};
>
> Whats the reason for moving this close_on_exec definition in struct fdtable
> ?
Better code readability. The code in fs/file.c initializes all the fields in
the fdtable in the same order as they appear in the struct definition: this
makes it easy to spot any omissions in the field initializations or verify
that all members are, in fact, initialized correctly. (Simply read downwards
in both the header file and the source file.) Since open_fds is the anchor
for the fdset memory area, it is easier to initialize it first, then jump
ahead, and initialize close_on_exec from this offset.
> Eric
Thanks for the input.
-- Vadim Lobanov
On Monday 02 October 2006 03:01, Andi Kleen wrote:
> Vadim Lobanov <[email protected]> writes:
> > The allocation algorithm sizes the fdarray in such a way that its memory
> > usage increases in easy page-sized chunks. Additionally, it tries to
> > account for the optimal usage of the allocators involved: kmalloc() for
> > sizes less than a page, and vmalloc() with page granularity for sizes
> > greater than a page.
>
> Best would be to avoid vmalloc() completely because it can be quite
> costly
It's possible. This switch between kmalloc() and vmalloc() was there in the
original code, and I didn't feel safe ripping it out right now. We can always
explore this approach too, however.
What is the origin and history of this particular code? (It's been there since
at least 2.4.x.) Who put in the switch between the two allocators, and for
what reason? Is that reason still valid?
> -Andi
Thanks for the input.
-- Vadim Lobanov
On Monday 02 October 2006 19:00, Vadim Lobanov wrote:
> On Monday 02 October 2006 01:52, Eric Dumazet wrote:
>
> > Current scheme is to allocate power of two sizes, and not 'the smallest
> > that accommodates the requested fd count'. This is for a good reason,
> > because we don't want to call vmalloc()/vfree() each time a process opens
> > 512 or 1024 more files (x86_64 or ia32)
>
> Yep, that is most definitely a consideration. I was balancing it against
> the fact that, when the table becomes big, growing it by a power of two
> regardless of the size results in massive memory usage deltas. The worry
> here is that an application may likely cause the table to grow by a huge
> amount, due to the power-of-two increase, and then actually use only a
> modest number of further fds, wasting the rest of the allocated table
> memory.
>
> Which applications open so many file handles so quickly? Do they actually
> need the amortized power-of-two table area increase? In those cases, would
> the actual process of opening these files take more time than growing the
> table in fixed-size steps? Or at least outweigh it enough that it would be
> more preferable to try to reduce memory waste instead of improve file open
> time?
I think that for such applications, the 'waste' of ram for fd table is nothing
compared to the ram cost of opened files/sockets/dentries/inodes.
>
> Is it really true that it will create less fragmentation? It seems to me
> that this will only be the true if most of the other heavy users of vmalloc
> also tried to use power-of-two allocation sizes.
I am quite sure that on my machines, big vmalloc users are fdtable most of the
time.
>
> What do you think of Andi Kleen's follow-up suggestion about eliminating
> vmalloc use altogether?
It would be interesting, but would need one indirection level, that could kill
performance of said huge applications... For such applications, the cost of
expanding fdtable is nothing compared to the cost of
open()/close()/poll()/read()/write() calls. This is because fdtable never
shrinks. So adding one indirection (if you use a table of pointers to PAGES
containing 1024 or 512 (struct file *)).
At least, expanding fdtable by 1024 slots would need to reallocate the first
table (adding one void *), and allocating one PAGE only. No need to copy
previous pages, this is a win.
Of course, big fdset still need vmalloc(), or else we cannot use
find_next_zero_bit() anymore... And for such applications, the time and
memory scanned to find a zero bit at open() time is probably the killer
(touching a large part of cpu caches). One million bits is 128 KB...
Eric
On Monday 02 October 2006 19:04, Vadim Lobanov wrote:
> On Monday 02 October 2006 03:01, Andi Kleen wrote:
> > Vadim Lobanov <[email protected]> writes:
> > > The allocation algorithm sizes the fdarray in such a way that its
> > > memory usage increases in easy page-sized chunks. Additionally, it
> > > tries to account for the optimal usage of the allocators involved:
> > > kmalloc() for sizes less than a page, and vmalloc() with page
> > > granularity for sizes greater than a page.
> >
> > Best would be to avoid vmalloc() completely because it can be quite
> > costly
>
> It's possible. This switch between kmalloc() and vmalloc() was there in the
> original code, and I didn't feel safe ripping it out right now. We can
> always explore this approach too, however.
>
> What is the origin and history of this particular code? (It's been there
> since at least 2.4.x.) Who put in the switch between the two allocators,
> and for what reason? Is that reason still valid?
I think Andi was suggesting using a indirection, with a table of pointers to
PAGES, each PAGE containing PAGE_SIZE/sizeof(struct file *) pointers. Kind of
what is doing vmalloc(), but without the need of contiguous virtual memory
space.
You cannot just zap vmalloc() and use one kmalloc(), because some programs
open one million files. That's 8 MByte of memory on x86_64. kmalloc() cannot
cope with that. It cannot even cope with 32KB allocations but just after
reboot...
Eric
On Monday 02 October 2006 10:25, Eric Dumazet wrote:
> On Monday 02 October 2006 19:00, Vadim Lobanov wrote:
> > On Monday 02 October 2006 01:52, Eric Dumazet wrote:
> > > Current scheme is to allocate power of two sizes, and not 'the smallest
> > > that accommodates the requested fd count'. This is for a good reason,
> > > because we don't want to call vmalloc()/vfree() each time a process
> > > opens 512 or 1024 more files (x86_64 or ia32)
> >
> > Yep, that is most definitely a consideration. I was balancing it against
> > the fact that, when the table becomes big, growing it by a power of two
> > regardless of the size results in massive memory usage deltas. The worry
> > here is that an application may likely cause the table to grow by a huge
> > amount, due to the power-of-two increase, and then actually use only a
> > modest number of further fds, wasting the rest of the allocated table
> > memory.
> >
> > Which applications open so many file handles so quickly? Do they actually
> > need the amortized power-of-two table area increase? In those cases,
> > would the actual process of opening these files take more time than
> > growing the table in fixed-size steps? Or at least outweigh it enough
> > that it would be more preferable to try to reduce memory waste instead of
> > improve file open time?
>
> I think that for such applications, the 'waste' of ram for fd table is
> nothing compared to the ram cost of opened files/sockets/dentries/inodes.
>
> > Is it really true that it will create less fragmentation? It seems to me
> > that this will only be the true if most of the other heavy users of
> > vmalloc also tried to use power-of-two allocation sizes.
>
> I am quite sure that on my machines, big vmalloc users are fdtable most of
> the time.
Ok, I think I'm convinced. :) Do you want to dig up some actual data regarding
vmalloc usage on your boxes, at least so it can be saved on the mailing list
archives for future reference?
I'll reroll the last patch in the series soon, after letting it rest for a bit
and seeing if anyone else has any more input on the proposed changes. I'll
tweak it so that it always does power-of-two increases in table size, rather
than switching to linear deltas after a certain point. Sounds good to you?
> > What do you think of Andi Kleen's follow-up suggestion about eliminating
> > vmalloc use altogether?
>
> It would be interesting, but would need one indirection level, that could
> kill performance of said huge applications... For such applications, the
> cost of expanding fdtable is nothing compared to the cost of
> open()/close()/poll()/read()/write() calls. This is because fdtable never
> shrinks. So adding one indirection (if you use a table of pointers to PAGES
> containing 1024 or 512 (struct file *)).
>
> At least, expanding fdtable by 1024 slots would need to reallocate the
> first table (adding one void *), and allocating one PAGE only. No need to
> copy previous pages, this is a win.
>
> Of course, big fdset still need vmalloc(), or else we cannot use
> find_next_zero_bit() anymore... And for such applications, the time and
> memory scanned to find a zero bit at open() time is probably the killer
> (touching a large part of cpu caches). One million bits is 128 KB...
Yep. (We could play really nifty games with the fdtable if we didn't have to
return the lowest available fd for the new file, but alas...)
> Eric
-- Vadim Lobanov