2006-01-04 00:06:23

by Eric Dumazet

Subject: [PATCH] Shrinks sizeof(files_struct) and better layout

--- linux-2.6.15/include/linux/file.h 2006-01-03 04:21:10.000000000 +0100
+++ linux-2.6.15-edum/include/linux/file.h 2006-01-04 00:35:17.000000000 +0100
@@ -17,10 +17,17 @@
*/
#define NR_OPEN_DEFAULT BITS_PER_LONG

+/*
+ * The embedded_fd_set is a small fd_set,
+ * suitable for most tasks (which open <= BITS_PER_LONG files)
+ */
+typedef struct {
+ unsigned long fds_bits[1];
+} embedded_fd_set;
+
struct fdtable {
unsigned int max_fds;
int max_fdset;
- int next_fd;
struct file ** fd; /* current fd array */
fd_set *close_on_exec;
fd_set *open_fds;
@@ -33,13 +40,20 @@
* Open file table structure
*/
struct files_struct {
+ /*
+ * read mostly part
+ */
atomic_t count;
struct fdtable *fdt;
struct fdtable fdtab;
- fd_set close_on_exec_init;
- fd_set open_fds_init;
+ /*
+ * written part on a separate cache line in SMP
+ */
+ spinlock_t file_lock ____cacheline_aligned_in_smp;
+ int next_fd;
+ embedded_fd_set close_on_exec_init;
+ embedded_fd_set open_fds_init;
struct file * fd_array[NR_OPEN_DEFAULT];
- spinlock_t file_lock; /* Protects concurrent writers. Nests inside tsk->alloc_lock */
};

#define files_fdtable(files) (rcu_dereference((files)->fdt))
--- linux-2.6.15/include/linux/init_task.h 2006-01-03 04:21:10.000000000 +0100
+++ linux-2.6.15-edum/include/linux/init_task.h 2006-01-04 00:32:05.000000000 +0100
@@ -7,11 +7,10 @@
#define INIT_FDTABLE \
{ \
.max_fds = NR_OPEN_DEFAULT, \
- .max_fdset = __FD_SETSIZE, \
- .next_fd = 0, \
+ .max_fdset = 8 * sizeof(embedded_fd_set), \
.fd = &init_files.fd_array[0], \
- .close_on_exec = &init_files.close_on_exec_init, \
- .open_fds = &init_files.open_fds_init, \
+ .close_on_exec = (fd_set *)&init_files.close_on_exec_init, \
+ .open_fds = (fd_set *)&init_files.open_fds_init, \
.rcu = RCU_HEAD_INIT, \
.free_files = NULL, \
.next = NULL, \
@@ -20,9 +19,10 @@
#define INIT_FILES \
{ \
.count = ATOMIC_INIT(1), \
- .file_lock = SPIN_LOCK_UNLOCKED, \
.fdt = &init_files.fdtab, \
.fdtab = INIT_FDTABLE, \
+ .file_lock = SPIN_LOCK_UNLOCKED, \
+ .next_fd = 0, \
.close_on_exec_init = { { 0, } }, \
.open_fds_init = { { 0, } }, \
.fd_array = { NULL, } \
--- linux-2.6.15/kernel/fork.c 2006-01-03 04:21:10.000000000 +0100
+++ linux-2.6.15-edum/kernel/fork.c 2006-01-04 00:32:05.000000000 +0100
@@ -581,12 +581,12 @@
atomic_set(&newf->count, 1);

spin_lock_init(&newf->file_lock);
+ newf->next_fd = 0;
fdt = &newf->fdtab;
- fdt->next_fd = 0;
fdt->max_fds = NR_OPEN_DEFAULT;
- fdt->max_fdset = __FD_SETSIZE;
- fdt->close_on_exec = &newf->close_on_exec_init;
- fdt->open_fds = &newf->open_fds_init;
+ fdt->max_fdset = 8 * sizeof(embedded_fd_set);
+ fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+ fdt->open_fds = (fd_set *)&newf->open_fds_init;
fdt->fd = &newf->fd_array[0];
INIT_RCU_HEAD(&fdt->rcu);
fdt->free_files = NULL;
--- linux-2.6.15/fs/open.c 2006-01-03 04:21:10.000000000 +0100
+++ linux-2.6.15-edum/fs/open.c 2006-01-04 00:32:05.000000000 +0100
@@ -928,7 +928,7 @@
fdt = files_fdtable(files);
fd = find_next_zero_bit(fdt->open_fds->fds_bits,
fdt->max_fdset,
- fdt->next_fd);
+ files->next_fd);

/*
* N.B. For clone tasks sharing a files structure, this test
@@ -953,7 +953,7 @@

FD_SET(fd, fdt->open_fds);
FD_CLR(fd, fdt->close_on_exec);
- fdt->next_fd = fd + 1;
+ files->next_fd = fd + 1;
#if 1
/* Sanity check */
if (fdt->fd[fd] != NULL) {
@@ -974,8 +974,8 @@
{
struct fdtable *fdt = files_fdtable(files);
__FD_CLR(fd, fdt->open_fds);
- if (fd < fdt->next_fd)
- fdt->next_fd = fd;
+ if (fd < files->next_fd)
+ files->next_fd = fd;
}

void fastcall put_unused_fd(unsigned int fd)
--- linux-2.6.15/fs/fcntl.c 2006-01-03 04:21:10.000000000 +0100
+++ linux-2.6.15-edum/fs/fcntl.c 2006-01-04 00:32:05.000000000 +0100
@@ -72,8 +72,8 @@
* orig_start..fdt->next_fd
*/
start = orig_start;
- if (start < fdt->next_fd)
- start = fdt->next_fd;
+ if (start < files->next_fd)
+ start = files->next_fd;

newfd = start;
if (start < fdt->max_fdset) {
@@ -101,9 +101,8 @@
* we reacquire the fdtable pointer and use it while holding
* the lock, no one can free it during that time.
*/
- fdt = files_fdtable(files);
- if (start <= fdt->next_fd)
- fdt->next_fd = newfd + 1;
+ if (start <= files->next_fd)
+ files->next_fd = newfd + 1;

error = newfd;

--- linux-2.6.15/fs/file.c 2006-01-03 04:21:10.000000000 +0100
+++ linux-2.6.15-edum/fs/file.c 2006-01-04 00:32:05.000000000 +0100
@@ -125,7 +125,8 @@
kmem_cache_free(files_cachep, fdt->free_files);
return;
}
- if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
+ if (fdt->max_fdset <= 8 * sizeof(embedded_fd_set) &&
+ fdt->max_fds <= NR_OPEN_DEFAULT) {
/*
* The fdtable was embedded
*/
@@ -155,8 +156,9 @@

void free_fdtable(struct fdtable *fdt)
{
- if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
- fdt->max_fds > NR_OPEN_DEFAULT)
+ if (fdt->free_files ||
+ fdt->max_fdset > 8 * sizeof(embedded_fd_set) ||
+ fdt->max_fds > NR_OPEN_DEFAULT)
call_rcu(&fdt->rcu, free_fdtable_rcu);
}

@@ -199,7 +201,6 @@
(nfdt->max_fds - fdt->max_fds) *
sizeof(struct file *));
}
- nfdt->next_fd = fdt->next_fd;
}

/*
@@ -220,11 +221,9 @@

void free_fdset(fd_set *array, int num)
{
- int size = num / 8;
-
- if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */
+ if (num <= 8 * sizeof(embedded_fd_set)) /* Don't free an embedded fdset */
return;
- else if (size <= PAGE_SIZE)
+ else if (num <= 8 * PAGE_SIZE)
kfree(array);
else
vfree(array);
@@ -237,22 +236,17 @@
fd_set *new_openset = NULL, *new_execset = NULL;
struct file **new_fds;

- fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
+ fdt = kzalloc(sizeof(*fdt), GFP_KERNEL);
if (!fdt)
goto out;
- memset(fdt, 0, sizeof(*fdt));

- nfds = __FD_SETSIZE;
+ nfds = 8 * L1_CACHE_BYTES;
/* Expand to the max in easy steps */
- do {
- if (nfds < (PAGE_SIZE * 8))
- nfds = PAGE_SIZE * 8;
- else {
- nfds = nfds * 2;
- if (nfds > NR_OPEN)
- nfds = NR_OPEN;
- }
- } while (nfds <= nr);
+ while (nfds <= nr) {
+ nfds = nfds * 2;
+ if (nfds > NR_OPEN)
+ nfds = NR_OPEN;
+ }

new_openset = alloc_fdset(nfds);
new_execset = alloc_fdset(nfds);
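
A note on the (fd_set *) casts in this patch: they are safe because both
fd_set and embedded_fd_set begin with an unsigned long fds_bits[] array, and
max_fdset is set to 8 * sizeof(embedded_fd_set) so no code ever touches bits
beyond the embedded storage. A standalone userspace sketch (hypothetical
types, just to illustrate the layout compatibility):

#include <stdio.h>

/* stand-in for the kernel's 1024-bit fd_set */
typedef struct {
	unsigned long fds_bits[1024 / (8 * sizeof(unsigned long))];
} big_fd_set;

typedef struct {
	unsigned long fds_bits[1];
} embedded_fd_set;

int main(void)
{
	embedded_fd_set small = { { 0 } };
	big_fd_set *view = (big_fd_set *)&small;

	/* only the first BITS_PER_LONG bits may be touched through 'view';
	 * max_fdset = 8 * sizeof(embedded_fd_set) enforces exactly that */
	view->fds_bits[0] |= 1UL << 2;	/* mark fd 2 (stderr) open */
	printf("bits=%#lx, embedded=%zu bytes, full=%zu bytes\n",
	       small.fds_bits[0], sizeof(small), sizeof(big_fd_set));
	return 0;
}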


Attachments:
shrink-fdset-2.6.15.patch (6.20 kB)

2006-01-04 09:11:29

by Jan Engelhardt

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

> 2) Reduces the size of (files_struct), using a special 32-bit (or 64-bit)
> embedded_fd_set, instead of a 1024-bit fd_set for the close_on_exec_init and
> open_fds_init fields. This saves some RAM (248 bytes per task)


> as most tasks don't open more than 32 files.

How do you know, have you done some empirical testing?



Jan Engelhardt
--

2006-01-04 10:12:11

by Eric Dumazet

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

Jan Engelhardt wrote:
>> 2) Reduces the size of (files_struct), using a special 32-bit (or 64-bit)
>> embedded_fd_set, instead of a 1024-bit fd_set for the close_on_exec_init and
>> open_fds_init fields. This saves some RAM (248 bytes per task)
>
>
>> as most tasks don't open more than 32 files.
>
> How do you know, have you done some empirical testing?
>
20 years working on Unix/Linux machines, yes :)

Just try this script on your Linux machines:

for f in /proc/*/fd; do ls $f|wc -l;done

more than 95% of tasks have fewer than 32 files open concurrently.

(I remember working on AT&T Unix in 1985, with a limit of 20 concurrent files
per process: it was just fine)

Eric

2006-01-04 10:28:17

by folkert

[permalink] [raw]
Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

> >>2) Reduces the size of (files_struct), using a special 32-bit (or 64-bit)
> >>embedded_fd_set, instead of a 1024-bit fd_set for the close_on_exec_init
> >>and
> >>open_fds_init fields. This saves some RAM (248 bytes per task)
> >>as most tasks don't open more than 32 files.
> >How do you know, have you done some empirical testing?
> 20 years working on Unix/Linux machines, yes :)
> Just try this script on your Linux machines:
> for f in /proc/*/fd; do ls $f|wc -l;done
> more than 95% of tasks have fewer than 32 files open concurrently.

0 root@muur:/home/folkert# for f in /proc/*/fd; do ls $f|wc -l;done | awk '{TOT+=$1; N++;} END{ print TOT / N, N; }'
13.7079 291

So on my system (running 291 processes: postfix, mysql, apache,
asterisk, spamassassin, clamav) it is on average 13.7 file handles.
On an idle Veritas NetBackup server (130 processes): 4
On a system running 4 VMware instances (137 processes): 16
On a heavily used mailserver (130 processes, sendmail and MailScanner
package): 6.6


Folkert van Heusden

--
Try MultiTail! Multiple windows with logfiles, filtered with regular
expressions, colored output, etc. etc. http://www.vanheusden.com/multitail/
----------------------------------------------------------------------
Get your PGP/GPG key signed at http://www.biglumber.com!
----------------------------------------------------------------------
Phone: +31-6-41278122, PGP-key: 1F28D8AE, http://www.vanheusden.com

2006-01-04 10:45:11

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

Eric Dumazet <[email protected]> writes:
>
> 1) Reduces the size of (struct fdtable) to exactly 64 bytes on 32-bit
> platforms, lowering kmalloc() allocated space by 50%.

It should probably be a kmem_cache_alloc() instead of a kmalloc()
in the first place anyway. This would reduce fragmentation.

> 2) Reduces the size of (files_struct), using a special 32-bit (or
> 64-bit) embedded_fd_set, instead of a 1024-bit fd_set for the
> close_on_exec_init and open_fds_init fields. This saves some RAM (248
> bytes per task) as most tasks don't open more than 32 files. D-Cache
> footprint for such tasks is also reduced to the minimum.
>
> 3) Reduces the size of the allocated fdset. Currently two full pages are
> allocated, that is 32768 bits on x86 for example, which is way too
> much. The minimum is now L1_CACHE_BYTES.
>
> UP and SMP should benefit from this patch, because most tasks will
> touch only one cache line when they open()/close() stdin/stdout/stderr
> (0/1/2) (next_fd, close_on_exec_init, open_fds_init and fd_array[0..2]
> being in the same cache line).

Looks mostly good to me.

> + * read mostly part
> + */
> atomic_t count;
> struct fdtable *fdt;
> struct fdtable fdtab;
> - fd_set close_on_exec_init;
> - fd_set open_fds_init;
> + /*
> + * written part on a separate cache line in SMP
> + */
> + spinlock_t file_lock ____cacheline_aligned_in_smp;
> + int next_fd;
> + embedded_fd_set close_on_exec_init;
> + embedded_fd_set open_fds_init;

You didn't describe that change, but unless it's clear the separate cache lines
are a win I would not do it and save memory again. Was this split based on
actual measurements or more theoretical considerations?

-Andi

2006-01-04 11:13:33

by Eric Dumazet

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

Andi Kleen wrote:
> Eric Dumazet <[email protected]> writes:
>> 1) Reduces the size of (struct fdtable) to exactly 64 bytes on 32-bit
>> platforms, lowering kmalloc() allocated space by 50%.
>
> It should probably be a kmem_cache_alloc() instead of a kmalloc()
> in the first place anyway. This would reduce fragmentation.

Well in theory yes, if you really expect thousands of tasks running...
But for most machines, the number of concurrent tasks is < 200, and using a
special cache for this is not a win.

>
>> + * read mostly part
>> + */
>> atomic_t count;
>> struct fdtable *fdt;
>> struct fdtable fdtab;
>> - fd_set close_on_exec_init;
>> - fd_set open_fds_init;
>> + /*
>> + * written part on a separate cache line in SMP
>> + */
>> + spinlock_t file_lock ____cacheline_aligned_in_smp;
>> + int next_fd;
>> + embedded_fd_set close_on_exec_init;
>> + embedded_fd_set open_fds_init;
>
> You didn't describe that change, but unless it's clear the separate cache lines
> are a win I would not do it and save memory again. Was this split based on
> actual measurements or more theoretical considerations?

As it is a refinement of a previous patch (integrated in 2.6.15) that put
the spin_lock after the array[] (so clearly using a separate cache line), I
omitted to describe it.

Yes, this part is really important because some multi-threaded benchmarks get
a nice speedup with this separation into two parts.

Threads that are doing read()/write() (reading the first part of files_struct)
are not slowed by others that do open()/close() syscalls (writing the second
part): no false sharing (and no locking, thanks to RCU, of course).

Big Apache 2.0 servers and database servers directly benefit from this.
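
(For illustration, a minimal userspace sketch of this false-sharing effect,
my own and not part of the patch; it assumes 64-byte cache lines. Time the
run with and without the padding between the two fields:

#include <pthread.h>
#include <stdio.h>

struct shared {
	long read_mostly;		/* like files->fdt: read by everyone */
	char pad[64 - sizeof(long)];	/* remove this to share one cache line */
	long written;			/* like files->next_fd: dirtied often */
};

static struct shared s;
static volatile int stop;

static void *writer(void *arg)
{
	(void)arg;
	while (!stop)			/* keeps dirtying the written field */
		*(volatile long *)&s.written += 1;
	return NULL;
}

int main(void)
{
	pthread_t t;
	long i, sum = 0;

	pthread_create(&t, NULL, writer, NULL);
	for (i = 0; i < 500000000L; i++)	/* pure reads, like the read()/write() paths */
		sum += *(volatile long *)&s.read_mostly;
	stop = 1;
	pthread_join(t, NULL);
	printf("sum=%ld written=%ld\n", sum, s.written);
	return 0;
}

Build with gcc -O2 -pthread; the reader loop slows down measurably when the
reader and the writer share one cache line.)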

Note that processes that tend to create/destroy a lot of threads per second
are writing into the 'count' field and might dirty the 'read mostly' part, but
we can expect that well-written high performance programs won't do this.

Eric

2006-01-04 11:15:43

by Andi Kleen

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

On Wednesday 04 January 2006 12:13, Eric Dumazet wrote:
> Andi Kleen wrote:
> > Eric Dumazet <[email protected]> writes:
> >> 1) Reduces the size of (struct fdtable) to exactly 64 bytes on 32-bit
> >> platforms, lowering kmalloc() allocated space by 50%.
> >
> > It should probably be a kmem_cache_alloc() instead of a kmalloc()
> > in the first place anyway. This would reduce fragmentation.
>
> Well in theory yes, if you really expect thousands of tasks running...
> But for most machines, the number of concurrent tasks is < 200, and using a
> special cache for this is not a win.

It is because it avoids fragmentation: objects with similar lifetimes
are clustered together. In general, caches are a win
if the data is nearly a page or more.

>
> >
> >> + * read mostly part
> >> + */
> >> atomic_t count;
> >> struct fdtable *fdt;
> >> struct fdtable fdtab;
> >> - fd_set close_on_exec_init;
> >> - fd_set open_fds_init;
> >> + /*
> >> + * written part on a separate cache line in SMP
> >> + */
> >> + spinlock_t file_lock ____cacheline_aligned_in_smp;
> >> + int next_fd;
> >> + embedded_fd_set close_on_exec_init;
> >> + embedded_fd_set open_fds_init;
> >
> > You didn't describe that change, but unless it's clear the separate cache lines
> > are a win I would not do it and save memory again. Was this split based on
> > actual measurements or more theoretical considerations?
>
> As it is a refinement of a previous patch (integrated in 2.6.15) that put
> the spin_lock after the array[] (so clearly using a separate cache line), I
> omitted to describe it.

Ok, perhaps you should describe that too then

-Andi


2006-01-04 11:19:22

by Eric Dumazet

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

Andi Kleen wrote:
> On Wednesday 04 January 2006 12:13, Eric Dumazet wrote:
>> Andi Kleen wrote:
>>> Eric Dumazet <[email protected]> writes:
>>>> 1) Reduces the size of (struct fdtable) to exactly 64 bytes on 32-bit
>>>> platforms, lowering kmalloc() allocated space by 50%.
>>> It should probably be a kmem_cache_alloc() instead of a kmalloc()
>>> in the first place anyway. This would reduce fragmentation.
>> Well in theory yes, if you really expect thousands of tasks running...
>> But for most machines, the number of concurrent tasks is < 200, and using a
>> special cache for this is not a win.
>
> It is because it avoids fragmentation: objects with similar lifetimes
> are clustered together. In general, caches are a win
> if the data is nearly a page or more.

I don't understand your last sentence. Do you mean 'if the object size is near
PAGE_SIZE'?

Eric

2006-01-04 11:22:17

by Andi Kleen

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

On Wednesday 04 January 2006 12:19, Eric Dumazet wrote:
> Andi Kleen wrote:
> > On Wednesday 04 January 2006 12:13, Eric Dumazet wrote:
> >> Andi Kleen wrote:
> >>> Eric Dumazet <[email protected]> writes:
> >>>> 1) Reduces the size of (struct fdtable) to exactly 64 bytes on 32-bit
> >>>> platforms, lowering kmalloc() allocated space by 50%.
> >>> It should probably be a kmem_cache_alloc() instead of a kmalloc()
> >>> in the first place anyway. This would reduce fragmentation.
> >> Well in theory yes, if you really expect thousands of tasks running...
> >> But for most machines, the number of concurrent tasks is < 200, and using a
> >> special cache for this is not a win.
> >
> > It is because it avoids fragmentation: objects with similar lifetimes
> > are clustered together. In general, caches are a win
> > if the data is nearly a page or more.
>
> I don't understand your last sentence. Do you mean 'if the object size is near
> PAGE_SIZE'?

Total data of all objects together. That's because caches always get their
own pages and cannot share them with other caches. The overhead of the kmem_cache_t
by itself is negligible.

-Andi

2006-01-04 11:42:07

by Eric Dumazet

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

Andi Kleen wrote:
>
> Total data of all objects together. That's because caches always get their
> own pages and cannot share them with other caches.

OK for this part.

> The overhead of the kmem_cache_t by itself is negligible.

This seems to be a common misconception among kernel devs (even the best ones, Andi :) )

On SMP (and/or NUMA) machines, the overhead of kmem_cache_t is *big*.

See enable_cpucache() in mm/slab.c for the 'limit' determination:

if (cachep->objsize > 131072)
limit = 1;
else if (cachep->objsize > PAGE_SIZE)
limit = 8;
else if (cachep->objsize > 1024)
limit = 24;
else if (cachep->objsize > 256)
limit = 54;
else
limit = 120;

On a 64-bit machine, 120*sizeof(void*) = 120*8 = 960 bytes.

So for small objects (<= 256 bytes), you end up with a sizeof(array_cache) of
1024 bytes per cpu.

With 16 CPUs: 16*1024 = 16 Kbytes, plus all the other kmem_cache structures
(if you have a lot of memory nodes, it can be *very* big too).

If you know that no more than 100 objects are used in 99% of setups, then a
dedicated cache is overkill; even losing 100 pages to extreme
fragmentation is better.

The probability that a *lot* of tasks are created at once and killed at once
is close to 0 during a machine's lifetime.


Maybe we can introduce an ultra-basic memory allocator for such objects
(without per-CPU or per-node caches), so that the memory overhead is small.
Hitting a spinlock at thread creation/deletion time is not that time critical.
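
Something like this, maybe (a rough, untested sketch with made-up names;
assumes callers run in process context, so a plain spinlock is enough):

struct simple_pool {
	spinlock_t	lock;
	void		*free_list;	/* objects linked through their first word */
	size_t		obj_size;	/* must be >= sizeof(void *) */
};

static void *simple_alloc(struct simple_pool *pool, gfp_t flags)
{
	void *obj;

	spin_lock(&pool->lock);
	obj = pool->free_list;
	if (obj)
		pool->free_list = *(void **)obj;
	spin_unlock(&pool->lock);
	if (!obj)			/* grow lazily from the generic allocator */
		obj = kmalloc(pool->obj_size, flags);
	return obj;
}

static void simple_free(struct simple_pool *pool, void *obj)
{
	spin_lock(&pool->lock);
	*(void **)obj = pool->free_list;
	pool->free_list = obj;
	spin_unlock(&pool->lock);
}

No per-cpu arrays, no per-node lists: the only per-pool overhead is the
struct itself.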

Eric

2006-01-04 11:45:56

by Andrew Morton

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

Eric Dumazet <[email protected]> wrote:
>
> [PATCH] shrink the fdset and reorder fields to give better multi-threaded
> performance.

Boy, this is going to need some elaborate performance testing...

We can't have that `8 * sizeof(embedded_fd_set)' splattered all over the
tree. This?

fs/file.c | 6 +++---
include/linux/file.h | 5 +++++
include/linux/init_task.h | 2 +-
kernel/fork.c | 2 +-
4 files changed, 10 insertions(+), 5 deletions(-)

diff -puN fs/file.c~shrinks-sizeoffiles_struct-and-better-layout-tidy fs/file.c
--- devel/fs/file.c~shrinks-sizeoffiles_struct-and-better-layout-tidy 2006-01-04 03:45:10.000000000 -0800
+++ devel-akpm/fs/file.c 2006-01-04 03:45:10.000000000 -0800
@@ -125,7 +125,7 @@ static void free_fdtable_rcu(struct rcu_
kmem_cache_free(files_cachep, fdt->free_files);
return;
}
- if (fdt->max_fdset <= 8 * sizeof(embedded_fd_set) &&
+ if (fdt->max_fdset <= EMBEDDED_FD_SET_SIZE &&
fdt->max_fds <= NR_OPEN_DEFAULT) {
/*
* The fdtable was embedded
@@ -157,7 +157,7 @@ static void free_fdtable_rcu(struct rcu_
void free_fdtable(struct fdtable *fdt)
{
if (fdt->free_files ||
- fdt->max_fdset > 8 * sizeof(embedded_fd_set) ||
+ fdt->max_fdset > EMBEDDED_FD_SET_SIZE ||
fdt->max_fds > NR_OPEN_DEFAULT)
call_rcu(&fdt->rcu, free_fdtable_rcu);
}
@@ -221,7 +221,7 @@ fd_set * alloc_fdset(int num)

void free_fdset(fd_set *array, int num)
{
- if (num <= 8 * sizeof(embedded_fd_set)) /* Don't free an embedded fdset */
+ if (num <= EMBEDDED_FD_SET_SIZE) /* Don't free an embedded fdset */
return;
else if (num <= 8 * PAGE_SIZE)
kfree(array);
diff -puN include/linux/file.h~shrinks-sizeoffiles_struct-and-better-layout-tidy include/linux/file.h
--- devel/include/linux/file.h~shrinks-sizeoffiles_struct-and-better-layout-tidy 2006-01-04 03:45:10.000000000 -0800
+++ devel-akpm/include/linux/file.h 2006-01-04 03:45:10.000000000 -0800
@@ -25,6 +25,11 @@ typedef struct {
unsigned long fds_bits[1];
} embedded_fd_set;

+/*
+ * More than this number of fds: we use a separately allocated fd_set
+ */
+#define EMBEDDED_FD_SET_SIZE (8 * sizeof(embedded_fd_set))
+
struct fdtable {
unsigned int max_fds;
int max_fdset;
diff -puN include/linux/init_task.h~shrinks-sizeoffiles_struct-and-better-layout-tidy include/linux/init_task.h
--- devel/include/linux/init_task.h~shrinks-sizeoffiles_struct-and-better-layout-tidy 2006-01-04 03:45:10.000000000 -0800
+++ devel-akpm/include/linux/init_task.h 2006-01-04 03:45:10.000000000 -0800
@@ -7,7 +7,7 @@
#define INIT_FDTABLE \
{ \
.max_fds = NR_OPEN_DEFAULT, \
- .max_fdset = 8 * sizeof(embedded_fd_set), \
+ .max_fdset = EMBEDDED_FD_SET_SIZE, \
.fd = &init_files.fd_array[0], \
.close_on_exec = (fd_set *)&init_files.close_on_exec_init, \
.open_fds = (fd_set *)&init_files.open_fds_init, \
diff -puN kernel/fork.c~shrinks-sizeoffiles_struct-and-better-layout-tidy kernel/fork.c
--- devel/kernel/fork.c~shrinks-sizeoffiles_struct-and-better-layout-tidy 2006-01-04 03:45:10.000000000 -0800
+++ devel-akpm/kernel/fork.c 2006-01-04 03:45:10.000000000 -0800
@@ -584,7 +584,7 @@ static struct files_struct *alloc_files(
newf->next_fd = 0;
fdt = &newf->fdtab;
fdt->max_fds = NR_OPEN_DEFAULT;
- fdt->max_fdset = 8 * sizeof(embedded_fd_set);
+ fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
fdt->open_fds = (fd_set *)&newf->open_fds_init;
fdt->fd = &newf->fd_array[0];
_

2006-01-04 11:58:47

by Andi Kleen

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

On Wednesday 04 January 2006 12:41, Eric Dumazet wrote:

> > The overhead of the kmem_cache_t by itself is negligible.
>
> This seems to be a common misconception among kernel devs (even the best ones, Andi :) )

It used to be true at least at some point :/

>
> On SMP (and/or NUMA) machines, the overhead of kmem_cache_t is *big*.
>
> See enable_cpucache() in mm/slab.c for the 'limit' determination:
>
> if (cachep->objsize > 131072)
> limit = 1;
> else if (cachep->objsize > PAGE_SIZE)
> limit = 8;
> else if (cachep->objsize > 1024)
> limit = 24;
> else if (cachep->objsize > 256)
> limit = 54;
> else
> limit = 120;
>
> On a 64-bit machine, 120*sizeof(void*) = 120*8 = 960 bytes.

> So for small objects (<= 256 bytes), you end up with a sizeof(array_cache) of
> 1024 bytes per cpu.

Hmm - in theory it could be tuned down for SMT siblings which really don't
care about sharing because they have shared caches. But I don't know
how many complications that would add to the slab code.

>
> With 16 CPUs: 16*1024 = 16 Kbytes, plus all the other kmem_cache structures
> (if you have a lot of memory nodes, it can be *very* big too).
>
> If you know that no more than 100 objects are used in 99% of setups, then a
> dedicated cache is overkill; even losing 100 pages to extreme
> fragmentation is better.

A system with 16 memory nodes should have more than 100 processes, but ok.

>
> Maybe we can introduce an ultra-basic memory allocator for such objects
> (without per-CPU or per-node caches), so that the memory overhead is small.
> Hitting a spinlock at thread creation/deletion time is not that time critical.

Might be a good idea, yes. There used to be a "simp" allocator long ago for this,
but it was removed because it had other issues. But this was before even slab
got the per-cpu/node support.

-Andi

2006-01-04 13:14:11

by Eric Dumazet

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

Andrew Morton wrote:
> Eric Dumazet <[email protected]> wrote:
>> [PATCH] shrink the fdset and reorder fields to give better multi-threaded
>> performance.
>
> Boy, this is going to need some elaborate performance testing..
>
> We can't have that `8 * sizeof(embedded_fd_set)' splattered all over the
> tree. This?
>

Fine patch Andrew :)

I was a bit unsure about introducing a XXXX_SIZE macro expressed in bits instead
of bytes (a more natural unit, at least for us humans).

Thank you
Eric



2006-01-04 23:24:43

by Adrian Bunk

Subject: [2.6 patch] Define BITS_PER_BYTE

On Wed, Jan 04, 2006 at 03:45:34AM -0800, Andrew Morton wrote:
>...
> +/*
> + * More than this number of fds: we use a separately allocated fd_set
> + */
> +#define EMBEDDED_FD_SET_SIZE (8 * sizeof(struct embedded_fd_set))
> +
>...

What about applying and using the patch below?

cu
Adrian


<-- snip -->


This can make some arithmetic expressions clearer.


Signed-off-by: Bryan O'Sullivan <[email protected]>
Signed-off-by: Adrian Bunk <[email protected]>

--- a/include/linux/types.h Wed Dec 28 14:19:42 2005 -0800
+++ b/include/linux/types.h Wed Dec 28 14:19:42 2005 -0800
@@ -8,6 +8,8 @@
(((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
#define DECLARE_BITMAP(name,bits) \
unsigned long name[BITS_TO_LONGS(bits)]
+
+#define BITS_PER_BYTE 8
#endif

#include <linux/posix_types.h>
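
With that define, the EMBEDDED_FD_SET_SIZE macro from Andrew's tidy-up above
could then read, for instance:

#define EMBEDDED_FD_SET_SIZE \
	(BITS_PER_BYTE * sizeof(embedded_fd_set))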

2006-01-05 07:04:36

by Jan Engelhardt

Subject: Re: [2.6 patch] Define BITS_PER_BYTE

>What about applying and using the patch below?
>
>cu
>Adrian
>
>This can make some arithmetic expressions clearer.
>
>+
>+#define BITS_PER_BYTE 8

Oh no :( This sounds as uncommon as CHAR_BIT in C.



Jan Engelhardt
--

2006-01-05 15:18:27

by Bryan O'Sullivan

Subject: Re: [2.6 patch] Define BITS_PER_BYTE

On Thu, 2006-01-05 at 08:03 +0100, Jan Engelhardt wrote:

> Oh no :( This sounds as uncommon as CHAR_BIT in C.

CHAR_BIT is completely unclear. BITS_PER_BYTE is self-evident, and
makes it a lot more obvious when you're doing arithmetic that involves
counting bits.

<b

2006-01-05 19:19:59

by H. Peter Anvin

Subject: Re: [2.6 patch] Define BITS_PER_BYTE

Followup to: <[email protected]>
By author: "Bryan O'Sullivan" <[email protected]>
In newsgroup: linux.dev.kernel
>
> On Thu, 2006-01-05 at 08:03 +0100, Jan Engelhardt wrote:
>
> > Oh no :( This sounds as uncommon as CHAR_BIT in C.
>
> CHAR_BIT is completely unclear. BITS_PER_BYTE is self-evident, and
> makes it a lot more obvious when you're doing arithmetic that involves
> counting bits.
>

Tough cookies. The standard name for this define is CHAR_BIT, and
anyone who doesn't know that "char" means byte in C doesn't know the C
language. "char" certainly doesn't mean "character" in this day of
UTF-8 and friends.

-hpa

2006-01-06 03:01:44

by David Lang

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

On Wed, 4 Jan 2006, Eric Dumazet wrote:

> Date: Wed, 04 Jan 2006 12:13:25 +0100
> From: Eric Dumazet <[email protected]>
> To: Andi Kleen <[email protected]>
> Cc: [email protected]
> Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout
>
> Andi Kleen wrote:
>> Eric Dumazet <[email protected]> writes:
>>> 1) Reduces the size of (struct fdtable) to exactly 64 bytes on 32-bit
>>> platforms, lowering kmalloc() allocated space by 50%.
>>
>> It should probably be a kmem_cache_alloc() instead of a kmalloc()
>> in the first place anyway. This would reduce fragmentation.
>
> Well in theory yes, if you really expect thousands of tasks running...
> But for most machines, the number of concurrent tasks is < 200, and using a
> special cache for this is not a win.

Is it enough of a win on machines with thousands of concurrent tasks that
it would be a useful config option?

David Lang

--
There are two ways of constructing a software design. One way is to make it so simple that there are obviously no deficiencies. And the other way is to make it so complicated that there are no obvious deficiencies.
-- C.A.R. Hoare

2006-01-06 06:35:53

by Eric Dumazet

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

David Lang wrote:
> On Wed, 4 Jan 2006, Eric Dumazet wrote:
>> Andi Kleen wrote:
>>> Eric Dumazet <[email protected]> writes:
>>>> 1) Reduces the size of (struct fdtable) to exactly 64 bytes on 32-bit
>>>> platforms, lowering kmalloc() allocated space by 50%.
>>>
>>> It should probably be a kmem_cache_alloc() instead of a kmalloc()
>>> in the first place anyway. This would reduce fragmentation.
>>
>> Well in theory yes, if you really expect thousands of tasks running...
>> But for most machines, the number of concurrent tasks is < 200, and using
>> a special cache for this is not a win.
>
> Is it enough of a win on machines with thousands of concurrent tasks
> that it would be a useful config option?

Well... not if NR_CPUS is big too. (We just saw a thread on lkml about
raising NR_CPUS to 1024 on ia64.)

On a 1024-CPU machine, a kmem cache could use at least 1 MB for its internal
structures, plus 1024 pages (PAGE_SIZE) for holding the caches (one cache per
CPU), if you assume at least one task was created on behalf of each cpu.

If PAGE_SIZE is 64KB, you end up with 65 MB of RAM for the cache. Even with
100,000 tasks running on the machine, it's not a win.

Slab caches are very complex machinery that consumes O(NR_CPUS) RAM.

Eric

2006-01-06 07:26:42

by David Lang

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

On Fri, 6 Jan 2006, Eric Dumazet wrote:
> David Lang wrote:
>> On Wed, 4 Jan 2006, Eric Dumazet wrote:
>>> Andi Kleen wrote:
>>>> Eric Dumazet <[email protected]> writes:
>>>>> 1) Reduces the size of (struct fdtable) to exactly 64 bytes on 32-bit
>>>>> platforms, lowering kmalloc() allocated space by 50%.
>>>>
>>>> It should probably be a kmem_cache_alloc() instead of a kmalloc()
>>>> in the first place anyway. This would reduce fragmentation.
>>>
>>> Well in theory yes, if you really expect thousands of tasks running...
>>> But for most machines, the number of concurrent tasks is < 200, and using a
>>> special cache for this is not a win.
>>
>> Is it enough of a win on machines with thousands of concurrent tasks that
>> it would be a useful config option?
>
> Well... not if NR_CPUS is big too. (We just saw a thread on lkml about
> raising NR_CPUS to 1024 on ia64.)
>
> On a 1024-CPU machine, a kmem cache could use at least 1 MB for its internal
> structures, plus 1024 pages (PAGE_SIZE) for holding the caches (one cache per
> CPU), if you assume at least one task was created on behalf of each cpu.
>
> If PAGE_SIZE is 64KB, you end up with 65 MB of RAM for the cache. Even with
> 100,000 tasks running on the machine, it's not a win.
>
> Slab caches are very complex machinery that consumes O(NR_CPUS) RAM.

Ok, so if you have large numbers of CPUs and large page sizes it's not
useful. However, what about a 2-4 CPU machine with 4k page
sizes and 8-32G of RAM (a not unreasonable Opteron system config) that will
be running 5,000-20,000 processes/threads?

I know people argue that programs that do such things are bad (and I
definitely agree that they aren't optimized), but the reality is that some
workloads are like that. If a machine is being built for such uses,
configuring the kernel to better tolerate such use may be useful.

David Lang


--
There are two ways of constructing a software design. One way is to make it so simple that there are obviously no deficiencies. And the other way is to make it so complicated that there are no obvious deficiencies.
-- C.A.R. Hoare

2006-01-06 07:37:46

by Eric Dumazet

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

David Lang wrote:
> Ok, so if you have large numbers of CPUs and large page sizes it's not
> useful. However, what about a 2-4 CPU machine with 4k page sizes and 8-32G
> of RAM (a not unreasonable Opteron system config) that will be running
> 5,000-20,000 processes/threads?

Don't forget that 'struct files_struct' is shared between threads of one process.

So you may benefit from this 'special cache' only if you plan to run 20,000 processes.

>
> I know people argue that programs that do such things are bad (and I
> definitely agree that they aren't optimized), but the reality is that
> some workloads are like that. If a machine is being built for such uses,
> configuring the kernel to better tolerate such use may be useful.

If 20,000 processes run on a machine, I doubt the sysadmin's main problem is
the placement of 'struct files_struct' in memory :)

Eric

2006-01-06 08:28:48

by David Lang

Subject: Re: [PATCH] Shrinks sizeof(files_struct) and better layout

On Fri, 6 Jan 2006, Eric Dumazet wrote:

> David Lang wrote:
>> Ok, so if you have large numbers of CPUs and large page sizes it's not
>> useful. However, what about a 2-4 CPU machine with 4k page sizes and 8-32G of
>> RAM (a not unreasonable Opteron system config) that will be running
>> 5,000-20,000 processes/threads?
>
> Don't forget that 'struct files_struct' is shared between threads of one process.
>
> So you may benefit from this 'special cache' only if you plan to run 20,000
> processes.

Ok, that's why I was asking

>>
>> I know people argue that programs that do such things are bad (and I
>> definitely agree that they aren't optimized), but the reality is that some
>> workloads are like that. If a machine is being built for such uses,
>> configuring the kernel to better tolerate such use may be useful.
>
> If 20,000 processes run on a machine, I doubt the sysadmin's main problem is
> the placement of 'struct files_struct' in memory :)

I have some boxes that routinely sit with 3,500-4,000 processes (fork
heavy workload, ~1400 forks/sec so far) that topple over when they go much
over 10,000 processes.

I'm only running 32-bit kernels with 1G of RAM available (no highmem). I've
been assuming that RAM was my limiting factor, but it hasn't been
enough of an issue to really track down yet :-)

David Lang

--
There are two ways of constructing a software design. One way is to make it so simple that there are obviously no deficiencies. And the other way is to make it so complicated that there are no obvious deficiencies.
-- C.A.R. Hoare