2015-04-16 12:16:49

by Mateusz Guzik

Subject: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

Hi,

Currently obtaining a new file descriptor results in locking the fdtable
twice - once to reserve a slot and a second time to fill it.

The hack below gets rid of the second lock acquisition.

It gives me a ~30% speedup (~300k ops -> ~400k ops) in a microbenchmark
where 16 threads create a pipe (2 fds) and call 2 * close.

Results fluctuate, and even with the patch they sometimes drop to around
~300k ops. However, without the patch they never get higher.

I can come up with a better benchmark if that's necessary.
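
For reference, the benchmark boils down to roughly this (a sketch, not
the exact code I ran; compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NTHREADS 16
#define SECONDS 10

static volatile int stop;
static unsigned long counts[NTHREADS];

/* each thread: allocate 2 fds with pipe(), close both, repeat */
static void *worker(void *arg)
{
	unsigned long *cnt = arg;
	int fds[2];

	while (!stop) {
		if (pipe(fds))
			abort();
		close(fds[0]);
		close(fds[1]);
		(*cnt)++;
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	unsigned long total = 0;
	int i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, &counts[i]);
	sleep(SECONDS);
	stop = 1;
	for (i = 0; i < NTHREADS; i++) {
		pthread_join(tid[i], NULL);
		total += counts[i];
	}
	printf("%lu ops/s\n", total / SECONDS);
	return 0;
}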

Comments?

==============================================

Subject: [PATCH] fs: use a sequence counter instead of file_lock in fd_install

Because the lock is not held, it is possible that the fdtable will be
reallocated as we fill it.

RCU is used to guarantee the old table is not freed just in case we
happen to write to it (which is harmless).

A sequence counter is used to ensure we updated the right table.

Signed-off-by: Mateusz Guzik <[email protected]>
---
fs/file.c | 24 +++++++++++++++++++-----
include/linux/fdtable.h | 5 +++++
2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 93c5f89..bd1ef4c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
 	cur_fdt = files_fdtable(files);
 	if (nr >= cur_fdt->max_fds) {
 		/* Continue as planned */
+		write_seqcount_begin(&files->fdt_seqcount);
 		copy_fdtable(new_fdt, cur_fdt);
 		rcu_assign_pointer(files->fdt, new_fdt);
+		write_seqcount_end(&files->fdt_seqcount);
 		if (cur_fdt != &files->fdtab)
 			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
 	} else {
@@ -256,6 +258,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	seqcount_init(&newf->fdt_seqcount);
 	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -429,6 +432,7 @@ void exit_files(struct task_struct *tsk)
 
 struct files_struct init_files = {
 	.count		= ATOMIC_INIT(1),
+	.fdt_seqcount	= SEQCNT_ZERO(fdt_seqcount),
 	.fdt		= &init_files.fdtab,
 	.fdtab		= {
 		.max_fds	= NR_OPEN_DEFAULT,
@@ -552,12 +556,22 @@ EXPORT_SYMBOL(put_unused_fd);
 void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
+	unsigned long seq;
 	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	BUG_ON(fdt->fd[fd] != NULL);
-	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+
+	rcu_read_lock();
+	do {
+		seq = read_seqcount_begin(&files->fdt_seqcount);
+		fdt = files_fdtable_seq(files);
+		/*
+		 * Entry in the table can already be equal to file if we
+		 * had to restart and copy_fdtable picked up our update.
+		 */
+		BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
+		rcu_assign_pointer(fdt->fd[fd], file);
+		smp_mb();
+	} while (__read_seqcount_retry(&files->fdt_seqcount, seq));
+	rcu_read_unlock();
 }
 
 void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87b..9e41765 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/seqlock.h>
 
 #include <linux/atomic.h>
 
@@ -47,6 +48,7 @@ struct files_struct {
    * read mostly part
    */
 	atomic_t count;
+	seqcount_t fdt_seqcount;
 	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
 	/*
@@ -69,6 +71,9 @@ struct dentry;
 #define files_fdtable(files) \
 	rcu_dereference_check_fdtable((files), (files)->fdt)
 
+#define files_fdtable_seq(files) \
+	rcu_dereference((files)->fdt)
+
 /*
  * The caller must ensure that fd table isn't shared or hold rcu or file lock
  */
--
1.8.3.1


2015-04-16 17:47:30

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Thu, 2015-04-16 at 14:16 +0200, Mateusz Guzik wrote:
> Hi,
>
> Currently obtaining a new file descriptor results in locking fdtable
> twice - once in order to reserve a slot and second time to fill it.
>
> Hack below gets rid of the second lock usage.
>
> It gives me a ~30% speedup (~300k ops -> ~400k ops) in a microbenchmark
> where 16 threads create a pipe (2 fds) and call 2 * close.
>
> Results are fluctuating and even with the patch sometimes drop to around
> ~300k ops. However, without the patch they never get higher.
>
> I can come up with a better benchmark if that's necessary.

Please push a patch with this test program alone; it will serve as a
baseline.

I discussed this problem with Al at LKS 2014 in Chicago.

I am pleased to see you are working on this!

Please find one comment enclosed.

>
> Comments?
>
> ==============================================
>
> Subject: [PATCH] fs: use a sequence counter instead of file_lock in fd_install
>
> Because the lock is not held, it is possible that fdtable will be
> reallocated as we fill it.
>
> RCU is used to guarantee the old table is not freed just in case we
> happen to write to it (which is harmless).
>
> sequence counter is used to ensure we updated the right table.
>
> Signed-off-by: Mateusz Guzik <[email protected]>
> ---
> fs/file.c | 24 +++++++++++++++++++-----
> include/linux/fdtable.h | 5 +++++
> 2 files changed, 24 insertions(+), 5 deletions(-)


> void fd_install(unsigned int fd, struct file *file)
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index 230f87b..9e41765 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -12,6 +12,7 @@
> #include <linux/types.h>
> #include <linux/init.h>
> #include <linux/fs.h>
> +#include <linux/seqlock.h>
>
> #include <linux/atomic.h>
>
> @@ -47,6 +48,7 @@ struct files_struct {
> * read mostly part
> */
> atomic_t count;
> + seqcount_t fdt_seqcount;


You put fdt_seqcount in the 'read mostly part' of 'struct files_struct';
please move it to the 'written part'.


> struct fdtable __rcu *fdt;
> struct fdtable fdtab;
> /*
> @@ -69,6 +71,9 @@ struct dentry;
> #define files_fdtable(files) \
> rcu_dereference_check_fdtable((files), (files)->fdt)
>
> +#define files_fdtable_seq(files) \
> + rcu_dereference((files)->fdt)
> +
> /*
> * The caller must ensure that fd table isn't shared or hold rcu or file lock
> */

2015-04-16 18:09:59

by Al Viro

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> cur_fdt = files_fdtable(files);
> if (nr >= cur_fdt->max_fds) {
> /* Continue as planned */
> + write_seqcount_begin(&files->fdt_seqcount);
> copy_fdtable(new_fdt, cur_fdt);
> rcu_assign_pointer(files->fdt, new_fdt);
> + write_seqcount_end(&files->fdt_seqcount);
> if (cur_fdt != &files->fdtab)
> call_rcu(&cur_fdt->rcu, free_fdtable_rcu);

Interesting. AFAICS, your test doesn't step anywhere near that path,
does it? So basically you never hit the retries during that...

2015-04-16 20:42:12

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Thu, 2015-04-16 at 19:09 +0100, Al Viro wrote:
> On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > cur_fdt = files_fdtable(files);
> > if (nr >= cur_fdt->max_fds) {
> > /* Continue as planned */
> > + write_seqcount_begin(&files->fdt_seqcount);
> > copy_fdtable(new_fdt, cur_fdt);
> > rcu_assign_pointer(files->fdt, new_fdt);
> > + write_seqcount_end(&files->fdt_seqcount);
> > if (cur_fdt != &files->fdtab)
> > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
>
> Interesting. AFAICS, your test doesn't step anywhere near that path,
> does it? So basically you never hit the retries during that...

Right, but then the table is almost never changed for a given process,
as we only increase it by power of two steps.

(So I scratch my initial comment, fdt_seqcount is really mostly read)


2015-04-16 20:55:50

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Thu, 2015-04-16 at 13:42 -0700, Eric Dumazet wrote:
> On Thu, 2015-04-16 at 19:09 +0100, Al Viro wrote:
> > On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > > cur_fdt = files_fdtable(files);
> > > if (nr >= cur_fdt->max_fds) {
> > > /* Continue as planned */
> > > + write_seqcount_begin(&files->fdt_seqcount);
> > > copy_fdtable(new_fdt, cur_fdt);
> > > rcu_assign_pointer(files->fdt, new_fdt);
> > > + write_seqcount_end(&files->fdt_seqcount);
> > > if (cur_fdt != &files->fdtab)
> > > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> >
> > Interesting. AFAICS, your test doesn't step anywhere near that path,
> > does it? So basically you never hit the retries during that...
>
> Right, but then the table is almost never changed for a given process,
> as we only increase it by power of two steps.
>
> (So I scratch my initial comment, fdt_seqcount is really mostly read)

I tested Mateusz's patch with my opensock program, mimicking a bit more
what a server does (having lots of sockets).

24 threads running, doing close(randomfd())/socket() calls like crazy.

Before patch :

# time ./opensock

real 0m10.863s
user 0m0.954s
sys 2m43.659s


After patch :

# time ./opensock

real 0m9.750s
user 0m0.804s
sys 2m18.034s

So this is an improvement for sure, but not massive.

perf record ./opensock ; perf report

87.60% opensock [kernel.kallsyms] [k] _raw_spin_lock
1.57% opensock [kernel.kallsyms] [k] find_next_zero_bit
0.50% opensock [kernel.kallsyms] [k] memset_erms
0.44% opensock [kernel.kallsyms] [k] __alloc_fd
0.44% opensock [kernel.kallsyms] [k] tcp_close
0.43% opensock [kernel.kallsyms] [k] get_empty_filp
0.43% opensock [kernel.kallsyms] [k] kmem_cache_free
0.40% opensock [kernel.kallsyms] [k] free_block
0.34% opensock [kernel.kallsyms] [k] __close_fd
0.32% opensock [kernel.kallsyms] [k] sk_alloc
0.30% opensock [kernel.kallsyms] [k] _raw_spin_lock_bh
0.24% opensock [kernel.kallsyms] [k] inet_csk_destroy_sock
0.22% opensock [kernel.kallsyms] [k] kmem_cache_alloc
0.22% opensock opensock [.] __pthread_disable_asynccancel
0.21% opensock [kernel.kallsyms] [k] lockref_put_return
0.20% opensock [kernel.kallsyms] [k] filp_close

perf record -g ./opensock ; perf report --stdio

87.80% opensock [kernel.kallsyms] [k] _raw_spin_lock
|
--- _raw_spin_lock
|
|--52.70%-- __close_fd
| sys_close
| system_call_fastpath
| __libc_close
| |
| |--98.97%-- 0x0
| --1.03%-- [...]
|
|--46.41%-- __alloc_fd
| get_unused_fd_flags
| sock_map_fd
| sys_socket
| system_call_fastpath
| __socket
| |
| --100.00%-- 0x0
--0.89%-- [...]

1.54% opensock [kernel.kallsyms] [k] find_next_zero_bit
|




2015-04-16 22:00:39

by Mateusz Guzik

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Thu, Apr 16, 2015 at 01:55:39PM -0700, Eric Dumazet wrote:
> On Thu, 2015-04-16 at 13:42 -0700, Eric Dumazet wrote:
> > On Thu, 2015-04-16 at 19:09 +0100, Al Viro wrote:
> > > On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > > > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > > > cur_fdt = files_fdtable(files);
> > > > if (nr >= cur_fdt->max_fds) {
> > > > /* Continue as planned */
> > > > + write_seqcount_begin(&files->fdt_seqcount);
> > > > copy_fdtable(new_fdt, cur_fdt);
> > > > rcu_assign_pointer(files->fdt, new_fdt);
> > > > + write_seqcount_end(&files->fdt_seqcount);
> > > > if (cur_fdt != &files->fdtab)
> > > > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> > >
> > > Interesting. AFAICS, your test doesn't step anywhere near that path,
> > > does it? So basically you never hit the retries during that...
> >
> > Right, but then the table is almost never changed for a given process,
> > as we only increase it by power of two steps.
> >
> > (So I scratch my initial comment, fdt_seqcount is really mostly read)
>
> I tested Mateusz patch with my opensock program, mimicking a bit more
> what a server does (having lot of sockets)
>
> 24 threads running, doing close(randomfd())/socket() calls like crazy.
>
> Before patch :
>
> # time ./opensock
>
> real 0m10.863s
> user 0m0.954s
> sys 2m43.659s
>
>
> After patch :
>
> # time ./opensock
>
> real 0m9.750s
> user 0m0.804s
> sys 2m18.034s
>
> So this is an improvement for sure, but not massive.
>
> perf record ./opensock ; report
>
> 87.80% opensock [kernel.kallsyms] [k] _raw_spin_lock
> |--52.70%-- __close_fd
> |--46.41%-- __alloc_fd

My crap benchmark is here: http://people.redhat.com/~mguzik/pipebench.c
(compile with -pthread, run with -s 10 -n 16 for a 10 second test with
16 threads)

As noted earlier, it tends to go from roughly 300k ops/s to 400k.

The fundamental problem here seems to be this pesky POSIX requirement of
providing the lowest possible fd on each allocation (as a side note,
Linux breaks this with parallel fd allocs, where one of them backs off
the reservation - not that I believe this causes trouble).

Ideally a process-wide switch could be implemented (e.g.
prctl(SCRATCH_LOWEST_FD_REQ)) which would grant the kernel the freedom
to return any fd it wants, so it would be possible to have fd ranges
per thread and the like.

Having only an O_SCRATCH_POSIX flag passed to syscalls would still leave
close() as a bottleneck.

In the meantime I consider the approach taken in my patch as an ok
temporary improvement.

--
Mateusz Guzik

2015-04-16 22:35:40

by Mateusz Guzik

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Thu, Apr 16, 2015 at 07:09:32PM +0100, Al Viro wrote:
> On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > cur_fdt = files_fdtable(files);
> > if (nr >= cur_fdt->max_fds) {
> > /* Continue as planned */
> > + write_seqcount_begin(&files->fdt_seqcount);
> > copy_fdtable(new_fdt, cur_fdt);
> > rcu_assign_pointer(files->fdt, new_fdt);
> > + write_seqcount_end(&files->fdt_seqcount);
> > if (cur_fdt != &files->fdtab)
> > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
>
> Interesting. AFAICS, your test doesn't step anywhere near that path,
> does it? So basically you never hit the retries during that...

Well, yeah. In fact for non-shared tables one could go a step further
and just plop the pointer in, but I don't know if that makes much sense.
Other processes inspecting the table could get away with a data
dependency barrier. Closing would still take the lock, so you could only
suddenly see a filp installed, but never one going away.

Now, as far as correctness goes, I think there is a bug in the patch
(which does not invalidate the idea though). Chances are I got a fix as
well.

Benchmark prog is here: http://people.redhat.com/~mguzik/pipebench.c
A modified version: http://people.redhat.com/~mguzik/fdi-fail.c

Benchmark is just doing pipe + close in a loop in multiple threads.

The modified version spawns threads, sleeps 100 ms and does dup2(0, 300)
to reallocate the table while other threads continue the work.
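
The gist of it, for reference (a sketch; the real thing is in fdi-fail.c
above):

#include <err.h>
#include <unistd.h>

/* Run in the main thread while the workers loop on pipe() + close().
 * fd 300 is well past the default table size, so dup2() forces a
 * table reallocation (expand_fdtable) mid-run. */
static void force_fdtable_resize(void)
{
	usleep(100 * 1000);		/* let the workers get going */
	if (dup2(0, 300) == -1)
		err(1, "dup2");
	close(300);
}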

This successfully tested retries (along with cases where the installed
file got copied and was encountered during a retry).

However, I see sporadic close failures. I presume this is because of a
missing read barrier after write_seqcount_begin. Adding an smp_mb()
seems to solve the problem, but I could only test on 2 * 16 Intel(R)
Xeon(R) CPU E5-2660 0 @ 2.20GHz.

My memory barrier-fu is rather weak and I'm not that confident in my
crap suspicion here.

Thoughts?

--
Mateusz Guzik

2015-04-16 22:52:33

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Fri, 2015-04-17 at 00:00 +0200, Mateusz Guzik wrote:
> On Thu, Apr 16, 2015 at 01:55:39PM -0700, Eric Dumazet wrote:
> > On Thu, 2015-04-16 at 13:42 -0700, Eric Dumazet wrote:
> > > On Thu, 2015-04-16 at 19:09 +0100, Al Viro wrote:
> > > > On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > > > > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > > > > cur_fdt = files_fdtable(files);
> > > > > if (nr >= cur_fdt->max_fds) {
> > > > > /* Continue as planned */
> > > > > + write_seqcount_begin(&files->fdt_seqcount);
> > > > > copy_fdtable(new_fdt, cur_fdt);
> > > > > rcu_assign_pointer(files->fdt, new_fdt);
> > > > > + write_seqcount_end(&files->fdt_seqcount);
> > > > > if (cur_fdt != &files->fdtab)
> > > > > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> > > >
> > > > Interesting. AFAICS, your test doesn't step anywhere near that path,
> > > > does it? So basically you never hit the retries during that...
> > >
> > > Right, but then the table is almost never changed for a given process,
> > > as we only increase it by power of two steps.
> > >
> > > (So I scratch my initial comment, fdt_seqcount is really mostly read)
> >
> > I tested Mateusz patch with my opensock program, mimicking a bit more
> > what a server does (having lot of sockets)
> >
> > 24 threads running, doing close(randomfd())/socket() calls like crazy.
> >
> > Before patch :
> >
> > # time ./opensock
> >
> > real 0m10.863s
> > user 0m0.954s
> > sys 2m43.659s
> >
> >
> > After patch :
> >
> > # time ./opensock
> >
> > real 0m9.750s
> > user 0m0.804s
> > sys 2m18.034s
> >
> > So this is an improvement for sure, but not massive.
> >
> > perf record ./opensock ; report
> >
> > 87.80% opensock [kernel.kallsyms] [k] _raw_spin_lock
> > |--52.70%-- __close_fd
> > |--46.41%-- __alloc_fd
>
> My crap benchmark is here: http://people.redhat.com/~mguzik/pipebench.c
> (compile with -pthread, run with -s 10 -n 16 for 10 second test + 16
> threads)
>
> As noted earlier it tends to go from rougly 300k ops/s to 400.
>
> The fundamental problem here seems to be this pesky POSIX requirement of
> providing the lowest possible fd on each allocation (as a side note
> Linux breaks this with parallel fd allocs, where one of these backs off
> the reservation, not that I believe this causes trouble).

Note POSIX never talked about multiple threads. The POSIX requirement
came from traditional Linux stdin/stdout/stderr handling and legacy
programs, before dup2() even existed.


>
> Ideally a process-wide switch could be implemented (e.g.
> prctl(SCRATCH_LOWEST_FD_REQ)) which would grant the kernel the freedom
> to return any fd it wants, so it would be possible to have fd ranges
> per thread and the like.

I played months ago with a SOCK_FD_FASTALLOC ;)

The idea was to use a random starting point instead of 0.

But the bottleneck was really the spinlock, not the bit search, unless I
used 10 million fds in the program...
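
The allocation side was something like this (a sketch of the idea from
memory, not the actual patch; the function name is made up and locking
and bookkeeping are elided):

/* pick a random starting point and wrap around once */
static unsigned int fd_search_fastalloc(struct fdtable *fdt)
{
	unsigned int start = prandom_u32() % fdt->max_fds;
	unsigned int fd;

	fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, start);
	if (fd < fdt->max_fds)
		return fd;
	fd = find_next_zero_bit(fdt->open_fds, start, 0);
	return fd < start ? fd : fdt->max_fds;	/* max_fds == table full */
}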

>
> Having only a O_SCRATCH_POSIX flag passed to syscalls would still leave
> close() as a bottleneck.
>
> In the meantime I consider the approach taken in my patch as an ok
> temporary improvement.

Yes, please formally submit this patch.

Note that adding atomic bit operations could eventually allow us to
avoid holding the spinlock at close() time.

2015-04-17 21:47:02

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Thu, 2015-04-16 at 14:16 +0200, Mateusz Guzik wrote:
> Hi,
>
> Currently obtaining a new file descriptor results in locking fdtable
> twice - once in order to reserve a slot and second time to fill it

...


> void __fd_install(struct files_struct *files, unsigned int fd,
> struct file *file)
> {
> + unsigned long seq;

unsigned int seq;

> struct fdtable *fdt;
> - spin_lock(&files->file_lock);
> - fdt = files_fdtable(files);
> - BUG_ON(fdt->fd[fd] != NULL);
> - rcu_assign_pointer(fdt->fd[fd], file);
> - spin_unlock(&files->file_lock);
> +
> + rcu_read_lock();
> + do {
> + seq = read_seqcount_begin(&files->fdt_seqcount);
> + fdt = files_fdtable_seq(files);
> + /*
> + * Entry in the table can already be equal to file if we
> + * had to restart and copy_fdtable picked up our update.
> + */
> + BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
> + rcu_assign_pointer(fdt->fd[fd], file);
> + smp_mb();
> + } while (__read_seqcount_retry(&files->fdt_seqcount, seq));
> + rcu_read_unlock();
> }
>

So one problem here is:

As soon as rcu_assign_pointer(fdt->fd[fd], file) is done, and another
cpu does an expand_fdtable() and releases files->file_lock, a third cpu
can close(fd).

Then yet another cpu can reuse the now-empty [fd] slot and install a new
file in it.

Then this cpu will crash here:

BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));


2015-04-17 22:17:16

by Mateusz Guzik

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Fri, Apr 17, 2015 at 02:46:56PM -0700, Eric Dumazet wrote:
> On Thu, 2015-04-16 at 14:16 +0200, Mateusz Guzik wrote:
> > Hi,
> >
> > Currently obtaining a new file descriptor results in locking fdtable
> > twice - once in order to reserve a slot and second time to fill it
>
> ...
>
>
> > void __fd_install(struct files_struct *files, unsigned int fd,
> > struct file *file)
> > {
> > + unsigned long seq;
>
> unsigned int seq;
>
> > struct fdtable *fdt;
> > - spin_lock(&files->file_lock);
> > - fdt = files_fdtable(files);
> > - BUG_ON(fdt->fd[fd] != NULL);
> > - rcu_assign_pointer(fdt->fd[fd], file);
> > - spin_unlock(&files->file_lock);
> > +
> > + rcu_read_lock();
> > + do {
> > + seq = read_seqcount_begin(&files->fdt_seqcount);
> > + fdt = files_fdtable_seq(files);
> > + /*
> > + * Entry in the table can already be equal to file if we
> > + * had to restart and copy_fdtable picked up our update.
> > + */
> > + BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
> > + rcu_assign_pointer(fdt->fd[fd], file);
> > + smp_mb();
> > + } while (__read_seqcount_retry(&files->fdt_seqcount, seq));
> > + rcu_read_unlock();
> > }
> >
>
> So one problem here is :
>
> As soon as rcu_assign_pointer(fdt->fd[fd], file) is done, and other cpu
> does one expand_fdtable() and releases files->file_lock, another cpu can
> close(fd).
>
> Then another cpu can reuse the [fd] now empty slot and install a new
> file in it.
>
> Then this cpu will crash here :
>
> BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
>

Ouch, this is so obvious now that you mention it. Really stupid
mistake on my side.

I would say this makes the use of seq counter impossible. Even if we
decided to fall back to a lock on retry, we cannot know what to do if
the slot is reserved - it very well could be that something called
close, and something else reserved the slot, so putting the file inside
could be really bad. In fact we would be putting a file for which we
don't have a reference anymore.

However, not all hope is lost and I still think we can speed things up.

A locking primitive which only locks stuff for current cpu and has
another mode where it locks stuff for all cpus would do the trick just
fine. I'm not a linux guy, quick search suggests 'lglock' would do what
I want.

table reallocation is an extremely rare operation, so this should be
fine. It would take the lock 'globally' for given table.

I'll play with this.

--
Mateusz Guzik

2015-04-17 23:03:02

by Al Viro

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:

> I would say this makes the use of seq counter impossible. Even if we
> decided to fall back to a lock on retry, we cannot know what to do if
> the slot is reserved - it very well could be that something called
> close, and something else reserved the slot, so putting the file inside
> could be really bad. In fact we would be putting a file for which we
> don't have a reference anymore.
>
> However, not all hope is lost and I still think we can speed things up.
>
> A locking primitive which only locks stuff for current cpu and has
> another mode where it locks stuff for all cpus would do the trick just
> fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> I want.
>
> table reallocation is an extremely rare operation, so this should be
> fine. It would take the lock 'globally' for given table.

It would also mean percpu_alloc() for each descriptor table...

2015-04-18 19:41:43

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Sat, 2015-04-18 at 00:02 +0100, Al Viro wrote:
> On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
>
> > I would say this makes the use of seq counter impossible. Even if we
> > decided to fall back to a lock on retry, we cannot know what to do if
> > the slot is reserved - it very well could be that something called
> > close, and something else reserved the slot, so putting the file inside
> > could be really bad. In fact we would be putting a file for which we
> > don't have a reference anymore.
> >
> > However, not all hope is lost and I still think we can speed things up.
> >
> > A locking primitive which only locks stuff for current cpu and has
> > another mode where it locks stuff for all cpus would do the trick just
> > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > I want.
> >
> > table reallocation is an extremely rare operation, so this should be
> > fine. It would take the lock 'globally' for given table.
>
> It would also mean percpu_alloc() for each descriptor table...

I would rather use an xchg() instead of rcu_assign_pointer()

old = xchg(&fdt->fd[fd], file);
if (unlikely(old))
filp_close(old, files);

If threads are using close() on random fds, final result is not
guaranteed anyway.

2015-04-20 13:06:47

by Mateusz Guzik

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Sat, Apr 18, 2015 at 12:02:52AM +0100, Al Viro wrote:
> On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
>
> > I would say this makes the use of seq counter impossible. Even if we
> > decided to fall back to a lock on retry, we cannot know what to do if
> > the slot is reserved - it very well could be that something called
> > close, and something else reserved the slot, so putting the file inside
> > could be really bad. In fact we would be putting a file for which we
> > don't have a reference anymore.
> >
> > However, not all hope is lost and I still think we can speed things up.
> >
> > A locking primitive which only locks stuff for current cpu and has
> > another mode where it locks stuff for all cpus would do the trick just
> > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > I want.
> >
> > table reallocation is an extremely rare operation, so this should be
> > fine. It would take the lock 'globally' for given table.
>
> It would also mean percpu_alloc() for each descriptor table...

Well, as noted, I had not checked how it's implemented at the time of
writing that message. I agree embedding something like this into files
struct is a non-starter.

I would say this could work with a small set of locks, selected by
hashing the struct files pointer.

Table resizing is supposed to be extremely rare - most processes should
not need it at all (if they do, the default size is too small and should
be adjusted). Not only that, the lock is only needed if the process in
question is multithreaded.

So I would say this would not contend in real-world workloads, but still
looks crappy.

Unfortunately the whole thing loses the original appeal of a simple hack
with no potential performance drawbacks. Maybe I'll hack it up later and
run some tests anyway.

--
Mateusz Guzik

2015-04-20 13:41:23

by Mateusz Guzik

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Sat, Apr 18, 2015 at 12:41:38PM -0700, Eric Dumazet wrote:
> On Sat, 2015-04-18 at 00:02 +0100, Al Viro wrote:
> > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> >
> > > I would say this makes the use of seq counter impossible. Even if we
> > > decided to fall back to a lock on retry, we cannot know what to do if
> > > the slot is reserved - it very well could be that something called
> > > close, and something else reserved the slot, so putting the file inside
> > > could be really bad. In fact we would be putting a file for which we
> > > don't have a reference anymore.
> > >
> > > However, not all hope is lost and I still think we can speed things up.
> > >
> > > A locking primitive which only locks stuff for current cpu and has
> > > another mode where it locks stuff for all cpus would do the trick just
> > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > I want.
> > >
> > > table reallocation is an extremely rare operation, so this should be
> > > fine. It would take the lock 'globally' for given table.
> >
> > It would also mean percpu_alloc() for each descriptor table...
>
> I would rather use an xchg() instead of rcu_assign_pointer()
>
> old = xchg(&fdt->fd[fd], file);
> if (unlikely(old))
> filp_close(old, files);
>
> If threads are using close() on random fds, final result is not
> guaranteed anyway.
>

Well, I don't see how this could be used to fix the problem.

If you are retrying and see NULL, you don't know whether your previous
update was simply not picked up by memcpy, or whether the fd got closed,
which also unreferenced the file you are installing - you can't tell
which happened.

If you see non-NULL and what you found is not the file you are
installing, you know the file was freed so you can't close the old file.

One could try to introduce an invariant that files installed in a
lockless manner have to start with refcount 1, but you still can't infer
anything from the fact that the counter is 1 when you retry (even if you
take the lock). It could have been duped, or even sent over a unix
socket and closed (although that would surely require a solid pause in
execution), and who knows what else.

In general I would say this approach is too hard to get right to be
worthwhile given the expected speedup.

--
Mateusz Guzik

2015-04-20 13:43:46

by Mateusz Guzik

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Mon, Apr 20, 2015 at 03:06:33PM +0200, Mateusz Guzik wrote:
> On Sat, Apr 18, 2015 at 12:02:52AM +0100, Al Viro wrote:
> > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> >
> > > I would say this makes the use of seq counter impossible. Even if we
> > > decided to fall back to a lock on retry, we cannot know what to do if
> > > the slot is reserved - it very well could be that something called
> > > close, and something else reserved the slot, so putting the file inside
> > > could be really bad. In fact we would be putting a file for which we
> > > don't have a reference anymore.
> > >
> > > However, not all hope is lost and I still think we can speed things up.
> > >
> > > A locking primitive which only locks stuff for current cpu and has
> > > another mode where it locks stuff for all cpus would do the trick just
> > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > I want.
> > >
> > > table reallocation is an extremely rare operation, so this should be
> > > fine. It would take the lock 'globally' for given table.
> >
> > It would also mean percpu_alloc() for each descriptor table...
>
> Well as it was noted I have not checked how it's implemented at the time
> of writing the message. I agree embedding something like this into files
> struct is a non-starter.
>
> I would say this could work with a small set of locks, selected by hashing
> struct files pointer.
>
> Table resizing is supposed to be extremely rare - most processes should
> not need it at all (if they do, the default size is too small and should
> be adjusted). Not only that, the lock is only needed if the process in
> question is multithreaded.
>
> So I would say this would not contend in real-world workloads, but still
> looks crappy.
>
> Unfortunately the whole thing loses original appeal of a simple hack
> with no potential perfomrance drawbacks. Maybe I'll hack it up later and
> run some tests anyway.
>

I just came up with another stupid hack, but this time it could really
work just fine.

Note that the entire issue stems from the fact that the table can be
resized at any moment. If only we had a guarantee the table "stands
still", we would not even need that sequence counter. fd_install could
just plop the file in.

So a stupid hack which comes to mind tells the kernel to make sure the
table is big enough and then never resize it ever again (inherited on
fork, cleared on exec):
prctl(FDTABLE_SIZE_FIXED, BIGNUM);

or

dup2(0, BIGNUM); /* sizes the table appropriately */
close(BIGNUM);
prctl(FDTABLE_SIZE_FIXED);

Thoughts?
--
Mateusz Guzik

2015-04-20 15:11:15

by Mateusz Guzik

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Mon, Apr 20, 2015 at 03:43:26PM +0200, Mateusz Guzik wrote:
> On Mon, Apr 20, 2015 at 03:06:33PM +0200, Mateusz Guzik wrote:
> > On Sat, Apr 18, 2015 at 12:02:52AM +0100, Al Viro wrote:
> > > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> > >
> > > > I would say this makes the use of seq counter impossible. Even if we
> > > > decided to fall back to a lock on retry, we cannot know what to do if
> > > > the slot is reserved - it very well could be that something called
> > > > close, and something else reserved the slot, so putting the file inside
> > > > could be really bad. In fact we would be putting a file for which we
> > > > don't have a reference anymore.
> > > >
> > > > However, not all hope is lost and I still think we can speed things up.
> > > >
> > > > A locking primitive which only locks stuff for current cpu and has
> > > > another mode where it locks stuff for all cpus would do the trick just
> > > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > > I want.
> > > >
> > > > table reallocation is an extremely rare operation, so this should be
> > > > fine. It would take the lock 'globally' for given table.
> > >
> > > It would also mean percpu_alloc() for each descriptor table...
> >
> > Well as it was noted I have not checked how it's implemented at the time
> > of writing the message. I agree embedding something like this into files
> > struct is a non-starter.
> >
> > I would say this could work with a small set of locks, selected by hashing
> > struct files pointer.
> >
> > Table resizing is supposed to be extremely rare - most processes should
> > not need it at all (if they do, the default size is too small and should
> > be adjusted). Not only that, the lock is only needed if the process in
> > question is multithreaded.
> >
> > So I would say this would not contend in real-world workloads, but still
> > looks crappy.
> >
> > Unfortunately the whole thing loses original appeal of a simple hack
> > with no potential perfomrance drawbacks. Maybe I'll hack it up later and
> > run some tests anyway.
> >
>
> I just came up with another stupid hack, but this time it could really
> work just fine.
>
> Note that the entire issue stems from the fact that the table can be
> resized at any moment. If only we had a guarantee the table "stands
> still", we would not even need that sequence couner. fd_install could
> just plop the file in.
>
> So a stupid hack which comes to mind tells the kernel to make sure the
> table is big enough and then never resize it ever again (inherited on
> fork, cleared on exec):
> prctl(FDTABLE_SIZE_FIXED, BIGNUM);
>
> or
>
> dup2(0, BIGNUM); /* sizes the table appropriately */
> close(BIGNUM);
> prctl(FDTABLE_SIZE_FIXED);
>
> Thoughts?

Sorry for the spam, but I came up with another hack. :)

The idea is that we can have a variable which would signify that the
given thread is playing with the fd table in fd_install (kind of a lock
embedded into task_struct). We would also have a flag in files struct
indicating that a thread would like to resize it.

expand_fdtable would set the flag and iterate over all threads waiting
for all of them to have the var set to 0.

fd_install would set the var, test the flag and if needed would just
unset the var and take the spin lock associated with the table.

This way the common case (nobody resizes the table) is lockless.

Resizing operation can get expensive but that should be totally fine.
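
In rough pseudo-C the handshake would look like this (all the names
here - fd_installing, resize_requested, for_each_files_thread - are
made up, and most barriers and error handling are elided):

void __fd_install(struct files_struct *files, unsigned int fd,
		  struct file *file)
{
	current->fd_installing = 1;
	smp_mb();	/* pairs with the flag set on the resize side */
	if (unlikely(files->resize_requested)) {
		/* back off and take the slow, locked path */
		current->fd_installing = 0;
		spin_lock(&files->file_lock);
		rcu_assign_pointer(files_fdtable(files)->fd[fd], file);
		spin_unlock(&files->file_lock);
		return;
	}
	rcu_assign_pointer(files_fdtable(files)->fd[fd], file);
	current->fd_installing = 0;
}

/* resize side, before swapping the tables */
static void wait_for_lockless_installers(struct files_struct *files)
{
	struct task_struct *t;

	files->resize_requested = 1;
	smp_mb();
	for_each_files_thread(files, t)
		while (READ_ONCE(t->fd_installing))
			cpu_relax();
}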

As a hack within a hack, we could abuse rcu's counter to serve as the "lock".

Thoughts?

--
Mateusz Guzik

2015-04-20 16:46:28

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Mon, 2015-04-20 at 15:41 +0200, Mateusz Guzik wrote:
> On Sat, Apr 18, 2015 at 12:41:38PM -0700, Eric Dumazet wrote:
> > On Sat, 2015-04-18 at 00:02 +0100, Al Viro wrote:
> > > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> > >
> > > > I would say this makes the use of seq counter impossible. Even if we
> > > > decided to fall back to a lock on retry, we cannot know what to do if
> > > > the slot is reserved - it very well could be that something called
> > > > close, and something else reserved the slot, so putting the file inside
> > > > could be really bad. In fact we would be putting a file for which we
> > > > don't have a reference anymore.
> > > >
> > > > However, not all hope is lost and I still think we can speed things up.
> > > >
> > > > A locking primitive which only locks stuff for current cpu and has
> > > > another mode where it locks stuff for all cpus would do the trick just
> > > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > > I want.
> > > >
> > > > table reallocation is an extremely rare operation, so this should be
> > > > fine. It would take the lock 'globally' for given table.
> > >
> > > It would also mean percpu_alloc() for each descriptor table...
> >
> > I would rather use an xchg() instead of rcu_assign_pointer()
> >
> > old = xchg(&fdt->fd[fd], file);
> > if (unlikely(old))
> > filp_close(old, files);
> >
> > If threads are using close() on random fds, final result is not
> > guaranteed anyway.
> >
>
> Well I don't see how could this be used to fix the problem.
>
> If you are retrying and see NULL, you don't know whether your previous
> update was not picked up by memcpy OR the fd got closed, which also
> unreferenced the file you are installing. But you can't tell what
> happened.
>
> If you see non-NULL and what you found is not the file you are
> installing, you know the file was freed so you can't close the old file.
>
> One could try to introduce an invariant that files installed in a
> lockless manner have to start with refcount 1, you still can't infer
> anything from the fact that the counter is 1 when you retry (even if you
> take the lock). It could have been duped, or even sent over a unix
> socket and closed (although that awould surely require a solid pause in
> execution) and who knows what else.
>
> In general I would say this approach is too hard to get right to be
> worthwile given expected speedup.
>

Hey, that's because I really meant (during the weekend)

2015-04-20 16:48:29

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Mon, 2015-04-20 at 09:46 -0700, Eric Dumazet wrote:

>
> Hey, that's because I really meant (during the week end)

old = xchg(&fdt->fd[fd], file);
if (unlikely(old && old != file))
filp_close(old, files);

That's going to be hard.

If you thought this problem was trivial, it probably means you missed
something.


2015-04-20 17:15:30

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:

> Sorry for spam but I came up with another hack. :)
>
> The idea is that we can have a variable which would signify the that
> given thread is playing with fd table in fd_install (kind of a lock
> embedded into task_struct). We would also have a flag in files struct
> indicating that a thread would like to resize it.
>
> expand_fdtable would set the flag and iterate over all threads waiting
> for all of them to have the var set to 0.

The opposite: you have to block them in some way and add an rcu_sched()
or something.

Another way would be to expand the table leaving the old one in place,
thanks to a new vrealloc() api. This would require all file tables to
use at least one page.

(Instead of using a new set of pages for the new vmalloc()ed area, reuse
the pages that are already mapped into the previous vmalloc() area.)

Right now, expanding 4M slots to 8M slots is taking a long time and is a
latency killer.

2015-04-20 20:50:01

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Mon, 2015-04-20 at 10:15 -0700, Eric Dumazet wrote:
> On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:
>
> > Sorry for spam but I came up with another hack. :)
> >
> > The idea is that we can have a variable which would signify the that
> > given thread is playing with fd table in fd_install (kind of a lock
> > embedded into task_struct). We would also have a flag in files struct
> > indicating that a thread would like to resize it.
> >
> > expand_fdtable would set the flag and iterate over all threads waiting
> > for all of them to have the var set to 0.
>
> The opposite : you have to block them in some way and add a rcu_sched()
> or something.

Here is the patch I cooked up here but have not yet tested.

diff --git a/fs/file.c b/fs/file.c
index 93c5f89c248b..d72bdcacd4df 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -145,17 +145,23 @@ static int expand_fdtable(struct files_struct *files, int nr)
 {
 	struct fdtable *new_fdt, *cur_fdt;
 
+	files->resize_in_progress++;
 	spin_unlock(&files->file_lock);
 	new_fdt = alloc_fdtable(nr);
+	/* make sure all __fd_install() have seen resize_in_progress */
+	synchronize_sched();
 	spin_lock(&files->file_lock);
-	if (!new_fdt)
+	if (!new_fdt) {
+		files->resize_in_progress--;
 		return -ENOMEM;
+	}
 	/*
 	 * extremely unlikely race - sysctl_nr_open decreased between the check in
 	 * caller and alloc_fdtable(). Cheaper to catch it here...
 	 */
 	if (unlikely(new_fdt->max_fds <= nr)) {
 		__free_fdtable(new_fdt);
+		files->resize_in_progress--;
 		return -EMFILE;
 	}
 	/*
@@ -173,6 +179,9 @@ static int expand_fdtable(struct files_struct *files, int nr)
 		/* Somebody else expanded, so undo our attempt */
 		__free_fdtable(new_fdt);
 	}
+	/* coupled with smp_rmb() in __fd_install() */
+	smp_wmb();
+	files->resize_in_progress--;
 	return 1;
 }
 
@@ -256,6 +265,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->resize_in_progress = 0;
 	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +563,21 @@ void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
 	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
+
+	rcu_read_lock_sched();
+
+	while (unlikely(READ_ONCE(files->resize_in_progress))) {
+		rcu_read_unlock_sched();
+		yield();
+		rcu_read_lock_sched();
+	}
+	/* coupled with smp_wmb() in expand_fdtable() */
+	smp_rmb();
 	fdt = files_fdtable(files);
 	BUG_ON(fdt->fd[fd] != NULL);
 	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+
+	rcu_read_unlock_sched();
 }
 
 void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87bdf5ad..7496a0d73a7c 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -47,6 +47,7 @@ struct files_struct {
    * read mostly part
    */
 	atomic_t count;
+	int resize_in_progress;
 	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
 	/*



2015-04-21 18:05:48

by Eric Dumazet

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Mon, 2015-04-20 at 13:49 -0700, Eric Dumazet wrote:
> On Mon, 2015-04-20 at 10:15 -0700, Eric Dumazet wrote:
> > On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:
> >
> > > Sorry for spam but I came up with another hack. :)
> > >
> > > The idea is that we can have a variable which would signify the that
> > > given thread is playing with fd table in fd_install (kind of a lock
> > > embedded into task_struct). We would also have a flag in files struct
> > > indicating that a thread would like to resize it.
> > >
> > > expand_fdtable would set the flag and iterate over all threads waiting
> > > for all of them to have the var set to 0.
> >
> > The opposite : you have to block them in some way and add a rcu_sched()
> > or something.
>
> Here is the patch I cooked here but not yet tested.

In the following version:

1) I replaced the yield() hack with a proper wait queue.

2) I do not invoke synchronize_sched() for mono-threaded programs.

3) I avoid having multiple threads do a resize where only one wins the
deal.

(Copying/clearing a big amount of memory only to discover another guy
did the same is a big waste of resources.)


This seems to run properly on my hosts.

Your comments/tests are most welcome, thanks!

fs/file.c | 41 +++++++++++++++++++++++++++++++++-----
include/linux/fdtable.h | 3 ++
2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 93c5f89c248b..e0e113a56444 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,9 @@ static int expand_fdtable(struct files_struct *files, int nr)
 
 	spin_unlock(&files->file_lock);
 	new_fdt = alloc_fdtable(nr);
+	/* make sure no __fd_install() are still updating fdt */
+	if (atomic_read(&files->count) > 1)
+		synchronize_sched();
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
 		return -ENOMEM;
@@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
 		if (cur_fdt != &files->fdtab)
 			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
 	} else {
+		WARN_ON_ONCE(1);
 		/* Somebody else expanded, so undo our attempt */
 		__free_fdtable(new_fdt);
 	}
+	/* coupled with smp_rmb() in __fd_install() */
+	smp_wmb();
 	return 1;
 }
 
@@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, int nr)
 static int expand_files(struct files_struct *files, int nr)
 {
 	struct fdtable *fdt;
+	int expanded = 0;
 
+begin:
 	fdt = files_fdtable(files);
 
 	/* Do we need to expand? */
 	if (nr < fdt->max_fds)
-		return 0;
+		return expanded;
 
 	/* Can we expand? */
 	if (nr >= sysctl_nr_open)
 		return -EMFILE;
 
+	while (unlikely(files->resize_in_progress)) {
+		spin_unlock(&files->file_lock);
+		expanded = 1;
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		spin_lock(&files->file_lock);
+		goto begin;
+	}
+
 	/* All good, so we try */
-	return expand_fdtable(files, nr);
+	files->resize_in_progress = true;
+	expanded = expand_fdtable(files, nr);
+	files->resize_in_progress = false;
+	wake_up_all(&files->resize_wait);
+	return expanded;
 }
 
 static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +276,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->resize_in_progress = 0;
+	init_waitqueue_head(&newf->resize_wait);
 	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +575,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
 	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
+
+	rcu_read_lock_sched();
+
+	while (unlikely(files->resize_in_progress)) {
+		rcu_read_unlock_sched();
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		rcu_read_lock_sched();
+	}
+	/* coupled with smp_wmb() in expand_fdtable() */
+	smp_rmb();
+	fdt = READ_ONCE(files->fdt);
 	BUG_ON(fdt->fd[fd] != NULL);
 	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock_sched();
 }
 
 void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87bdf5ad..fbb88740634a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -47,6 +47,9 @@ struct files_struct {
    * read mostly part
    */
 	atomic_t count;
+	bool resize_in_progress;
+	wait_queue_head_t resize_wait;
+
 	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
 	/*



2015-04-21 20:06:46

by Mateusz Guzik

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Tue, Apr 21, 2015 at 11:05:43AM -0700, Eric Dumazet wrote:
> On Mon, 2015-04-20 at 13:49 -0700, Eric Dumazet wrote:
> > On Mon, 2015-04-20 at 10:15 -0700, Eric Dumazet wrote:
> > > On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:
> > >
> > > > Sorry for spam but I came up with another hack. :)
> > > >
> > > > The idea is that we can have a variable which would signify the that
> > > > given thread is playing with fd table in fd_install (kind of a lock
> > > > embedded into task_struct). We would also have a flag in files struct
> > > > indicating that a thread would like to resize it.
> > > >
> > > > expand_fdtable would set the flag and iterate over all threads waiting
> > > > for all of them to have the var set to 0.
> > >
> > > The opposite : you have to block them in some way and add a rcu_sched()
> > > or something.
> >

What I described would block them, although it was a crappy approach
(iterating threads vs cpus). I was wondering if RCU could be abused for
this feature and apparently it can.

> > Here is the patch I cooked here but not yet tested.
>
> In following version :
>
> 1) I replaced the yield() hack by a proper wait queue.
>
> 2) I do not invoke synchronize_sched() for mono threaded programs.
>
> 3) I avoid multiple threads doing a resize and then only one wins the
> deal.
>

One could argue this last bit could be committed separately (a different
logical change).

Having read up on synchronize_sched() and rcu_read_lock_sched(), I
believe the code is correct.

Also see the nits below.

> (copying/clearing big amount of memory only to discover another guy did
> the same is a big waste of resources)
>
>
> This seems to run properly on my hosts.
>
> Your comments/tests are most welcomed, thanks !
>
> fs/file.c | 41 +++++++++++++++++++++++++++++++++-----
> include/linux/fdtable.h | 3 ++
> 2 files changed, 39 insertions(+), 5 deletions(-)
>
> diff --git a/fs/file.c b/fs/file.c
> index 93c5f89c248b..e0e113a56444 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -147,6 +147,9 @@ static int expand_fdtable(struct files_struct *files, int nr)
>
> spin_unlock(&files->file_lock);
> new_fdt = alloc_fdtable(nr);
> + /* make sure no __fd_install() are still updating fdt */
> + if (atomic_read(&files->count) > 1)
> + synchronize_sched();
> spin_lock(&files->file_lock);
> if (!new_fdt)
> return -ENOMEM;
> @@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
> if (cur_fdt != &files->fdtab)
> call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> } else {
> + WARN_ON_ONCE(1);
> /* Somebody else expanded, so undo our attempt */
> __free_fdtable(new_fdt);

The reader may be left confused about why there is a warning while the
comment does not indicate anything is wrong.

> }
> + /* coupled with smp_rmb() in __fd_install() */
> + smp_wmb();
> return 1;
> }
>
> @@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, int nr)
> static int expand_files(struct files_struct *files, int nr)
> {
> struct fdtable *fdt;
> + int expanded = 0;
>
> +begin:
> fdt = files_fdtable(files);
>
> /* Do we need to expand? */
> if (nr < fdt->max_fds)
> - return 0;
> + return expanded;
>
> /* Can we expand? */
> if (nr >= sysctl_nr_open)
> return -EMFILE;
>
> + while (unlikely(files->resize_in_progress)) {
> + spin_unlock(&files->file_lock);
> + expanded = 1;
> + wait_event(files->resize_wait, !files->resize_in_progress);
> + spin_lock(&files->file_lock);
> + goto begin;
> + }

This does not loop anymore, so s/while/if/ ?

> +
> /* All good, so we try */
> - return expand_fdtable(files, nr);
> + files->resize_in_progress = true;
> + expanded = expand_fdtable(files, nr);
> + files->resize_in_progress = false;
> + wake_up_all(&files->resize_wait);
> + return expanded;
> }
>
> static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
> @@ -256,6 +276,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
> atomic_set(&newf->count, 1);
>
> spin_lock_init(&newf->file_lock);
> + newf->resize_in_progress = 0;
> + init_waitqueue_head(&newf->resize_wait);
> newf->next_fd = 0;
> new_fdt = &newf->fdtab;
> new_fdt->max_fds = NR_OPEN_DEFAULT;
> @@ -553,11 +575,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
> struct file *file)
> {
> struct fdtable *fdt;
> - spin_lock(&files->file_lock);
> - fdt = files_fdtable(files);
> +
> + rcu_read_lock_sched();
> +
> + while (unlikely(files->resize_in_progress)) {
> + rcu_read_unlock_sched();
> + wait_event(files->resize_wait, !files->resize_in_progress);
> + rcu_read_lock_sched();
> + }
> + /* coupled with smp_wmb() in expand_fdtable() */
> + smp_rmb();
> + fdt = READ_ONCE(files->fdt);
> BUG_ON(fdt->fd[fd] != NULL);
> rcu_assign_pointer(fdt->fd[fd], file);
> - spin_unlock(&files->file_lock);
> + rcu_read_unlock_sched();
> }
>
> void fd_install(unsigned int fd, struct file *file)
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index 230f87bdf5ad..fbb88740634a 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -47,6 +47,9 @@ struct files_struct {
> * read mostly part
> */
> atomic_t count;
> + bool resize_in_progress;
> + wait_queue_head_t resize_wait;
> +
> struct fdtable __rcu *fdt;
> struct fdtable fdtab;
> /*
>
>
>
>

--
Mateusz Guzik

2015-04-21 20:12:23

by Mateusz Guzik

Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Tue, Apr 21, 2015 at 10:06:24PM +0200, Mateusz Guzik wrote:
> On Tue, Apr 21, 2015 at 11:05:43AM -0700, Eric Dumazet wrote:
> > On Mon, 2015-04-20 at 13:49 -0700, Eric Dumazet wrote:
> > > On Mon, 2015-04-20 at 10:15 -0700, Eric Dumazet wrote:
> > > > On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:
> > > >
> > > > > Sorry for spam but I came up with another hack. :)
> > > > >
> > > > > The idea is that we can have a variable which would signify the that
> > > > > given thread is playing with fd table in fd_install (kind of a lock
> > > > > embedded into task_struct). We would also have a flag in files struct
> > > > > indicating that a thread would like to resize it.
> > > > >
> > > > > expand_fdtable would set the flag and iterate over all threads waiting
> > > > > for all of them to have the var set to 0.
> > > >
> > > > The opposite : you have to block them in some way and add a rcu_sched()
> > > > or something.
> > >
>
> What I described would block them, although it was a crappy approach
> (iterating threads vs cpus). I was wondering if RCU could be abused for
> this feature and apparently it can.
>
> > > Here is the patch I cooked here but not yet tested.
> >
> > In following version :
> >
> > 1) I replaced the yield() hack by a proper wait queue.
> >
> > 2) I do not invoke synchronize_sched() for mono threaded programs.
> >
> > 3) I avoid multiple threads doing a resize and then only one wins the
> > deal.
> >
>
> One could argue this last bit could be committed separately (a different
> logical change).
>
> As I read up about synchronize_sched and rcu_read_lock_sched, the code
> should be correct.
>
> Also see nits below.
>

I'm utter crap with memory barriers, but I believe some are needed now:

in dup_fd:

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f) {
			get_file(f);

at least a data dependency barrier, or maybe smp_rmb for peace of mind

similarly in do_dup2:

	tofree = fdt->fd[fd];
	if (!tofree && fd_is_open(fd, fdt))
		goto Ebusy;
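
For the dup_fd() case that would be something along these lines (sketch,
untested):

	struct file *f = *old_fds++;

	if (f) {
		/* make sure get_file()'s accesses to *f are ordered
		 * after the pointer load */
		smp_read_barrier_depends();	/* or smp_rmb() */
		get_file(f);
	}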

> > (copying/clearing big amount of memory only to discover another guy did
> > the same is a big waste of resources)
> >
> >
> > This seems to run properly on my hosts.
> >
> > Your comments/tests are most welcomed, thanks !
> >
> > fs/file.c | 41 +++++++++++++++++++++++++++++++++-----
> > include/linux/fdtable.h | 3 ++
> > 2 files changed, 39 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/file.c b/fs/file.c
> > index 93c5f89c248b..e0e113a56444 100644
> > --- a/fs/file.c
> > +++ b/fs/file.c
> > @@ -147,6 +147,9 @@ static int expand_fdtable(struct files_struct *files, int nr)
> >
> > spin_unlock(&files->file_lock);
> > new_fdt = alloc_fdtable(nr);
> > + /* make sure no __fd_install() are still updating fdt */
> > + if (atomic_read(&files->count) > 1)
> > + synchronize_sched();
> > spin_lock(&files->file_lock);
> > if (!new_fdt)
> > return -ENOMEM;
> > @@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > if (cur_fdt != &files->fdtab)
> > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> > } else {
> > + WARN_ON_ONCE(1);
> > /* Somebody else expanded, so undo our attempt */
> > __free_fdtable(new_fdt);
>
> The reader may be left confused about why there is a warning when the
> comment does not indicate anything is wrong.
>
> > }
> > + /* coupled with smp_rmb() in __fd_install() */
> > + smp_wmb();
> > return 1;
> > }
> >
> > @@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > static int expand_files(struct files_struct *files, int nr)
> > {
> > struct fdtable *fdt;
> > + int expanded = 0;
> >
> > +begin:
> > fdt = files_fdtable(files);
> >
> > /* Do we need to expand? */
> > if (nr < fdt->max_fds)
> > - return 0;
> > + return expanded;
> >
> > /* Can we expand? */
> > if (nr >= sysctl_nr_open)
> > return -EMFILE;
> >
> > + while (unlikely(files->resize_in_progress)) {
> > + spin_unlock(&files->file_lock);
> > + expanded = 1;
> > + wait_event(files->resize_wait, !files->resize_in_progress);
> > + spin_lock(&files->file_lock);
> > + goto begin;
> > + }
>
> This does not loop anymore, so s/while/if/ ?
>
> > +
> > /* All good, so we try */
> > - return expand_fdtable(files, nr);
> > + files->resize_in_progress = true;
> > + expanded = expand_fdtable(files, nr);
> > + files->resize_in_progress = false;
> > + wake_up_all(&files->resize_wait);
> > + return expanded;
> > }
> >
> > static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
> > @@ -256,6 +276,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
> > atomic_set(&newf->count, 1);
> >
> > spin_lock_init(&newf->file_lock);
> > + newf->resize_in_progress = 0;
> > + init_waitqueue_head(&newf->resize_wait);
> > newf->next_fd = 0;
> > new_fdt = &newf->fdtab;
> > new_fdt->max_fds = NR_OPEN_DEFAULT;
> > @@ -553,11 +575,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
> > struct file *file)
> > {
> > struct fdtable *fdt;
> > - spin_lock(&files->file_lock);
> > - fdt = files_fdtable(files);
> > +
> > + rcu_read_lock_sched();
> > +
> > + while (unlikely(files->resize_in_progress)) {
> > + rcu_read_unlock_sched();
> > + wait_event(files->resize_wait, !files->resize_in_progress);
> > + rcu_read_lock_sched();
> > + }
> > + /* coupled with smp_wmb() in expand_fdtable() */
> > + smp_rmb();
> > + fdt = READ_ONCE(files->fdt);
> > BUG_ON(fdt->fd[fd] != NULL);
> > rcu_assign_pointer(fdt->fd[fd], file);
> > - spin_unlock(&files->file_lock);
> > + rcu_read_unlock_sched();
> > }
> >
> > void fd_install(unsigned int fd, struct file *file)
> > diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> > index 230f87bdf5ad..fbb88740634a 100644
> > --- a/include/linux/fdtable.h
> > +++ b/include/linux/fdtable.h
> > @@ -47,6 +47,9 @@ struct files_struct {
> > * read mostly part
> > */
> > atomic_t count;
> > + bool resize_in_progress;
> > + wait_queue_head_t resize_wait;
> > +
> > struct fdtable __rcu *fdt;
> > struct fdtable fdtab;
> > /*
> >
>
> --
> Mateusz Guzik

--
Mateusz Guzik

2015-04-21 20:57:44

by Eric Dumazet

[permalink] [raw]
Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Tue, 2015-04-21 at 22:06 +0200, Mateusz Guzik wrote:
> On Tue, Apr 21, 2015 at 11:05:43AM -0700, Eric Dumazet wrote:

> > 3) I avoid multiple threads doing a resize and then only one wins the
> > deal.
> >
>
> One could argue this last bit could be committed separately (a different
> logical change).

Not really. The prior code was fine, but the addition of
synchronize_sched() made the overhead much bigger when multiple
threads do this at the same time.
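Put differently: without the gate, N threads hitting a full table would
each allocate a new table, each eat a full grace period, and all but one
would throw their copy away. Condensed from the patch above, the gating
in expand_files() is:

	if (unlikely(files->resize_in_progress)) {
		/* someone else pays for alloc + synchronize_sched() */
		spin_unlock(&files->file_lock);
		wait_event(files->resize_wait, !files->resize_in_progress);
		spin_lock(&files->file_lock);
		goto begin;	/* table may already be big enough */
	}
	files->resize_in_progress = true;
	expanded = expand_fdtable(files, nr);
	files->resize_in_progress = false;
	wake_up_all(&files->resize_wait);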

>
> As I read up about synchronize_sched and rcu_read_lock_sched, the code
> should be correct.

> > spin_lock(&files->file_lock);
> > if (!new_fdt)
> > return -ENOMEM;
> > @@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > if (cur_fdt != &files->fdtab)
> > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> > } else {
> > + WARN_ON_ONCE(1);
> > /* Somebody else expanded, so undo our attempt */
> > __free_fdtable(new_fdt);
>
> The reader may be left confused why there is a warning while the comment
> does not indicate anything is wrong.

My intent is to remove this code completely, but I left this
WARN_ON_ONCE() in for my tests, just to make sure my theory was right ;)

>
> > }
> > + /* coupled with smp_rmb() in __fd_install() */
> > + smp_wmb();
> > return 1;
> > }
> >
> > @@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > static int expand_files(struct files_struct *files, int nr)
> > {
> > struct fdtable *fdt;
> > + int expanded = 0;
> >
> > +begin:
> > fdt = files_fdtable(files);
> >
> > /* Do we need to expand? */
> > if (nr < fdt->max_fds)
> > - return 0;
> > + return expanded;
> >
> > /* Can we expand? */
> > if (nr >= sysctl_nr_open)
> > return -EMFILE;
> >
> > + while (unlikely(files->resize_in_progress)) {
> > + spin_unlock(&files->file_lock);
> > + expanded = 1;
> > + wait_event(files->resize_wait, !files->resize_in_progress);
> > + spin_lock(&files->file_lock);
> > + goto begin;
> > + }
>
> This does not loop anymore, so s/while/if/ ?

You are right, thanks !


2015-04-21 21:06:57

by Eric Dumazet

[permalink] [raw]
Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Tue, 2015-04-21 at 22:12 +0200, Mateusz Guzik wrote:

> in dup_fd:
> for (i = open_files; i != 0; i--) {
> 	struct file *f = *old_fds++;
> 	if (f) {
> 		get_file(f);
>

I see no new requirement here. f is either NULL or not.
Multi-threaded programs never had a guarantee that dup_fd() would catch a
non-NULL pointer here.


> at least a data dependency barrier, or maybe smp_rmb for peace of mind
>
> similarly in do_dup2:
> tofree = fdt->fd[fd];
> if (!tofree && fd_is_open(fd, fdt))
> 	goto Ebusy;

Same here.
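For the record, the invariant this relies on, as I read it:

	/*
	 * A slot changes only under files->file_lock or via
	 * rcu_assign_pointer() in __fd_install(), so a racing reader
	 * sees either NULL ("not installed yet", already handled) or a
	 * fully published file pointer - never a torn value.
	 */
	struct file *f = *old_fds++;
	if (f)
		get_file(f);	/* both outcomes were always possible */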

2015-04-22 04:59:33

by Eric Dumazet

[permalink] [raw]
Subject: [PATCH] fs/file.c: don't acquire files->file_lock in fd_install()

From: Eric Dumazet <[email protected]>

Mateusz Guzik reported :

Currently obtaining a new file descriptor results in locking fdtable
twice - once in order to reserve a slot and second time to fill it.

Holding the spinlock in __fd_install() is needed in case a resize is
done, or to prevent a resize.

Mateusz provided an RFC patch and a micro benchmark :
http://people.redhat.com/~mguzik/pipebench.c

A resize is an unlikely operation in a process lifetime,
as table size is at least doubled at every resize.

We can use RCU instead of the spinlock :

__fd_install() must wait if a resize is in progress.

The resize must block new __fd_install() callers from starting,
and wait until ongoing installs are finished (synchronize_sched())

resize should be attempted by a single thread to not waste resources.

rcu_sched variant is used, as __fd_install() and expand_fdtable() run
from process context.

It gives us a ~30% speedup using pipebench with 16 threads, and a ~10%
speedup with another program using TCP sockets.

Signed-off-by: Eric Dumazet <[email protected]>
Reported-by: Mateusz Guzik <[email protected]>
---
fs/file.c | 66 +++++++++++++++++++++++++++-----------
include/linux/fdtable.h | 3 +
2 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 93c5f89c248b..17cef5e52f16 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)

spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr);
+
+ /* make sure all __fd_install() have seen resize_in_progress
+ * or have finished their rcu_read_lock_sched() section.
+ */
+ if (atomic_read(&files->count) > 1)
+ synchronize_sched();
+
spin_lock(&files->file_lock);
if (!new_fdt)
return -ENOMEM;
@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
__free_fdtable(new_fdt);
return -EMFILE;
}
- /*
- * Check again since another task may have expanded the fd table while
- * we dropped the lock
- */
cur_fdt = files_fdtable(files);
- if (nr >= cur_fdt->max_fds) {
- /* Continue as planned */
- copy_fdtable(new_fdt, cur_fdt);
- rcu_assign_pointer(files->fdt, new_fdt);
- if (cur_fdt != &files->fdtab)
- call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
- } else {
- /* Somebody else expanded, so undo our attempt */
- __free_fdtable(new_fdt);
- }
+ BUG_ON(nr < cur_fdt->max_fds);
+ copy_fdtable(new_fdt, cur_fdt);
+ rcu_assign_pointer(files->fdt, new_fdt);
+ if (cur_fdt != &files->fdtab)
+ call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
+ /* coupled with smp_rmb() in __fd_install() */
+ smp_wmb();
return 1;
}

@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, int nr)
+ __releases(files->file_lock)
+ __acquires(files->file_lock)
{
struct fdtable *fdt;
+ int expanded = 0;

+repeat:
fdt = files_fdtable(files);

/* Do we need to expand? */
if (nr < fdt->max_fds)
- return 0;
+ return expanded;

/* Can we expand? */
if (nr >= sysctl_nr_open)
return -EMFILE;

+ if (unlikely(files->resize_in_progress)) {
+ spin_unlock(&files->file_lock);
+ expanded = 1;
+ wait_event(files->resize_wait, !files->resize_in_progress);
+ spin_lock(&files->file_lock);
+ goto repeat;
+ }
+
/* All good, so we try */
- return expand_fdtable(files, nr);
+ files->resize_in_progress = true;
+ expanded = expand_fdtable(files, nr);
+ files->resize_in_progress = false;
+
+ wake_up_all(&files->resize_wait);
+ return expanded;
}

static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
atomic_set(&newf->count, 1);

spin_lock_init(&newf->file_lock);
+ newf->resize_in_progress = false;
+ init_waitqueue_head(&newf->resize_wait);
newf->next_fd = 0;
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +572,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
struct fdtable *fdt;
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
+
+ rcu_read_lock_sched();
+
+ while (unlikely(files->resize_in_progress)) {
+ rcu_read_unlock_sched();
+ wait_event(files->resize_wait, !files->resize_in_progress);
+ rcu_read_lock_sched();
+ }
+ /* coupled with smp_wmb() in expand_fdtable() */
+ smp_rmb();
+ fdt = READ_ONCE(files->fdt);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
- spin_unlock(&files->file_lock);
+ rcu_read_unlock_sched();
}

void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87bdf5ad..fbb88740634a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -47,6 +47,9 @@ struct files_struct {
* read mostly part
*/
atomic_t count;
+ bool resize_in_progress;
+ wait_queue_head_t resize_wait;
+
struct fdtable __rcu *fdt;
struct fdtable fdtab;
/*

2015-04-22 13:32:20

by Mateusz Guzik

[permalink] [raw]
Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Tue, Apr 21, 2015 at 02:06:53PM -0700, Eric Dumazet wrote:
> On Tue, 2015-04-21 at 22:12 +0200, Mateusz Guzik wrote:
>
> > in dup_fd:
> > for (i = open_files; i != 0; i--) {
> > 	struct file *f = *old_fds++;
> > 	if (f) {
> > 		get_file(f);
> >
>
> I see no new requirement here. f is either NULL or not.
> multi threaded programs never had a guarantee dup_fd() would catch a non
> NULL pointer here.
>

It's not about seeing a NULL f or not, but about using the right address
for the dereference.

If I read memory-barriers.txt right (see 'DATA DEPENDENCY BARRIERS'), it
is possible that CPUs like Alpha will see a non-NULL pointer and then
proceed to dereference *the old* (here: NULL) value.

Hence I suspect this needs smp_read_barrier_depends (along with
ACCESS_ONCE).

Other consumers (e.g. procfs code) use the rcu_dereference macro, which
ends up using the lockless_dereference macro, which in turn does:

#define lockless_dereference(p) \
({ \
	typeof(p) _________p1 = ACCESS_ONCE(p); \
	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
	(_________p1); \
})

That said, memory barriers are not exactly my strong suit, but I do
believe my suspicion here is justified enough to ask someone with solid
memory-barrier-fu to comment.
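Plugged into the dup_fd() loop quoted earlier, the suggested change
would look roughly like this (hypothetical - no such patch was posted
in this thread):

	for (i = open_files; i != 0; i--) {
		/* one load plus smp_read_barrier_depends(), per the
		 * macro above */
		struct file *f = lockless_dereference(*old_fds);

		old_fds++;
		if (f)
			get_file(f);
	}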

>
> > at least a data dependency barrier, or maybe smp_rmb for peace of mind
> >
> > similarly in do_dup2:
> > tofree = fdt->fd[fd];
> > if (!tofree && fd_is_open(fd, fdt))
> > 	goto Ebusy;
>
> Same here.
>
>

--
Mateusz Guzik

2015-04-22 13:55:20

by Eric Dumazet

[permalink] [raw]
Subject: Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

On Wed, 2015-04-22 at 15:31 +0200, Mateusz Guzik wrote:
> On Tue, Apr 21, 2015 at 02:06:53PM -0700, Eric Dumazet wrote:
> > On Tue, 2015-04-21 at 22:12 +0200, Mateusz Guzik wrote:
> >
> > > in dup_fd:
> > > for (i = open_files; i != 0; i--) {
> > > 	struct file *f = *old_fds++;
> > > 	if (f) {
> > > 		get_file(f);
> > >
> >
> > I see no new requirement here. f is either NULL or not.
> > multi threaded programs never had a guarantee dup_fd() would catch a non
> > NULL pointer here.
> >
>
> It's not about seeing a NULL f or not, but about using the right address
> for the dereference.
>
> If I read memory-barriers.txt right (see 'DATA DEPENDENCY BARRIERS'), it
> is possible that CPUs like Alpha will see a non-NULL pointer and then
> proceed to dereference *the old* (here: NULL) value.
>
> Hence I suspect this needs smp_read_barrier_depends (along with
> ACCESS_ONCE).
>
> Other consumers (e.g. procfs code) use the rcu_dereference macro, which
> ends up using the lockless_dereference macro, which in turn does:
>
> #define lockless_dereference(p) \
> ({ \
> 	typeof(p) _________p1 = ACCESS_ONCE(p); \
> 	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
> 	(_________p1); \
> })
>
> That said, memory barriers are not exactly my strong suit, but I do
> believe my suspicion here is justified enough to ask someone with solid
> memory-barrier-fu to comment.


Again, your comment has nothing to do with the patch.

If there is old data, it can only be NULL. And that is fine; the case was
_already_ handled.

It cannot be an 'old' file pointer, because close() takes the spinlock.
spin_unlock() contains a write memory barrier, so the NULL pointer put
by close() would have been committed to memory.

This also works on Alpha CPUs.
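A rough timeline of that argument (my reading, illustrative only):

	/*
	 *   close():                      dup_fd():
	 *     spin_lock(&file_lock);
	 *     fdt->fd[fd] = NULL;
	 *     spin_unlock(&file_lock);      spin_lock(&oldf->file_lock);
	 *       (write barrier: the           (ordered after the unlock)
	 *        NULL is committed)         f = *old_fds;
	 *
	 * f is either the committed NULL or a file pointer that close()
	 * has not yet released - never a dangling pointer to freed
	 * memory, even on Alpha.
	 */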


2015-04-27 19:05:53

by Mateusz Guzik

[permalink] [raw]
Subject: Re: [PATCH] fs/file.c: don't acquire files->file_lock in fd_install()

On Tue, Apr 21, 2015 at 09:59:28PM -0700, Eric Dumazet wrote:
> From: Eric Dumazet <[email protected]>
>
> Mateusz Guzik reported :
>
> Currently obtaining a new file descriptor results in locking fdtable
> twice - once in order to reserve a slot and second time to fill it.
>
> Holding the spinlock in __fd_install() is needed in case a resize is
> done, or to prevent a resize.
>
> Mateusz provided an RFC patch and a micro benchmark :
> http://people.redhat.com/~mguzik/pipebench.c
>
> A resize is an unlikely operation in a process lifetime,
> as table size is at least doubled at every resize.
>
> We can use RCU instead of the spinlock :
>
> __fd_install() must wait if a resize is in progress.
>
> The resize must block new __fd_install() callers from starting,
> and wait until ongoing installs are finished (synchronize_sched())
>
> resize should be attempted by a single thread to not waste resources.
>
> rcu_sched variant is used, as __fd_install() and expand_fdtable() run
> from process context.
>
> It gives us a ~30% speedup using pipebench with 16 threads, and a ~10%
> speedup with another program using TCP sockets.
>
> Signed-off-by: Eric Dumazet <[email protected]>
> Reported-by: Mateusz Guzik <[email protected]>

Reviewed-by: Mateusz Guzik <[email protected]>
Tested-by: Mateusz Guzik <[email protected]>

Thanks,

> ---
> fs/file.c | 66 +++++++++++++++++++++++++++-----------
> include/linux/fdtable.h | 3 +
> 2 files changed, 50 insertions(+), 19 deletions(-)
>
> diff --git a/fs/file.c b/fs/file.c
> index 93c5f89c248b..17cef5e52f16 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)
>
> spin_unlock(&files->file_lock);
> new_fdt = alloc_fdtable(nr);
> +
> + /* make sure all __fd_install() have seen resize_in_progress
> + * or have finished their rcu_read_lock_sched() section.
> + */
> + if (atomic_read(&files->count) > 1)
> + synchronize_sched();
> +
> spin_lock(&files->file_lock);
> if (!new_fdt)
> return -ENOMEM;
> @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
> __free_fdtable(new_fdt);
> return -EMFILE;
> }
> - /*
> - * Check again since another task may have expanded the fd table while
> - * we dropped the lock
> - */
> cur_fdt = files_fdtable(files);
> - if (nr >= cur_fdt->max_fds) {
> - /* Continue as planned */
> - copy_fdtable(new_fdt, cur_fdt);
> - rcu_assign_pointer(files->fdt, new_fdt);
> - if (cur_fdt != &files->fdtab)
> - call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> - } else {
> - /* Somebody else expanded, so undo our attempt */
> - __free_fdtable(new_fdt);
> - }
> + BUG_ON(nr < cur_fdt->max_fds);
> + copy_fdtable(new_fdt, cur_fdt);
> + rcu_assign_pointer(files->fdt, new_fdt);
> + if (cur_fdt != &files->fdtab)
> + call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> + /* coupled with smp_rmb() in __fd_install() */
> + smp_wmb();
> return 1;
> }
>
> @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
> * The files->file_lock should be held on entry, and will be held on exit.
> */
> static int expand_files(struct files_struct *files, int nr)
> + __releases(files->file_lock)
> + __acquires(files->file_lock)
> {
> struct fdtable *fdt;
> + int expanded = 0;
>
> +repeat:
> fdt = files_fdtable(files);
>
> /* Do we need to expand? */
> if (nr < fdt->max_fds)
> - return 0;
> + return expanded;
>
> /* Can we expand? */
> if (nr >= sysctl_nr_open)
> return -EMFILE;
>
> + if (unlikely(files->resize_in_progress)) {
> + spin_unlock(&files->file_lock);
> + expanded = 1;
> + wait_event(files->resize_wait, !files->resize_in_progress);
> + spin_lock(&files->file_lock);
> + goto repeat;
> + }
> +
> /* All good, so we try */
> - return expand_fdtable(files, nr);
> + files->resize_in_progress = true;
> + expanded = expand_fdtable(files, nr);
> + files->resize_in_progress = false;
> +
> + wake_up_all(&files->resize_wait);
> + return expanded;
> }
>
> static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
> @@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
> atomic_set(&newf->count, 1);
>
> spin_lock_init(&newf->file_lock);
> + newf->resize_in_progress = false;
> + init_waitqueue_head(&newf->resize_wait);
> newf->next_fd = 0;
> new_fdt = &newf->fdtab;
> new_fdt->max_fds = NR_OPEN_DEFAULT;
> @@ -553,11 +572,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
> struct file *file)
> {
> struct fdtable *fdt;
> - spin_lock(&files->file_lock);
> - fdt = files_fdtable(files);
> +
> + rcu_read_lock_sched();
> +
> + while (unlikely(files->resize_in_progress)) {
> + rcu_read_unlock_sched();
> + wait_event(files->resize_wait, !files->resize_in_progress);
> + rcu_read_lock_sched();
> + }
> + /* coupled with smp_wmb() in expand_fdtable() */
> + smp_rmb();
> + fdt = READ_ONCE(files->fdt);
> BUG_ON(fdt->fd[fd] != NULL);
> rcu_assign_pointer(fdt->fd[fd], file);
> - spin_unlock(&files->file_lock);
> + rcu_read_unlock_sched();
> }
>
> void fd_install(unsigned int fd, struct file *file)
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index 230f87bdf5ad..fbb88740634a 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -47,6 +47,9 @@ struct files_struct {
> * read mostly part
> */
> atomic_t count;
> + bool resize_in_progress;
> + wait_queue_head_t resize_wait;
> +
> struct fdtable __rcu *fdt;
> struct fdtable fdtab;
> /*
>
>

--
Mateusz Guzik

2015-04-28 16:21:05

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] fs/file.c: don't acquire files->file_lock in fd_install()

On Mon, 2015-04-27 at 21:05 +0200, Mateusz Guzik wrote:
> On Tue, Apr 21, 2015 at 09:59:28PM -0700, Eric Dumazet wrote:
> > From: Eric Dumazet <[email protected]>
> >
> > Mateusz Guzik reported :
> >
> > Currently obtaining a new file descriptor results in locking fdtable
> > twice - once in order to reserve a slot and second time to fill it.
> >
> > Holding the spinlock in __fd_install() is needed in case a resize is
> > done, or to prevent a resize.
> >
> > Mateusz provided an RFC patch and a micro benchmark :
> > http://people.redhat.com/~mguzik/pipebench.c
> >
> > A resize is an unlikely operation in a process lifetime,
> > as table size is at least doubled at every resize.
> >
> > We can use RCU instead of the spinlock :
> >
> > __fd_install() must wait if a resize is in progress.
> >
> > The resize must block new __fd_install() callers from starting,
> > and wait until ongoing installs are finished (synchronize_sched())
> >
> > resize should be attempted by a single thread to not waste resources.
> >
> > rcu_sched variant is used, as __fd_install() and expand_fdtable() run
> > from process context.
> >
> > It gives us a ~30% speedup using pipebench with 16 threads, and a ~10%
> > speedup with another program using TCP sockets.
> >
> > Signed-off-by: Eric Dumazet <[email protected]>
> > Reported-by: Mateusz Guzik <[email protected]>
>
> Reviewed-by: Mateusz Guzik <[email protected]>
> Tested-by: Mateusz Guzik <[email protected]>
>

Thanks Mateusz

I'll send a v2, replacing the READ_ONCE() with the more appropriate
rcu_dereference_sched() :

> > new_fdt->max_fds = NR_OPEN_DEFAULT;
> > @@ -553,11 +572,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
> > struct file *file)
> > {
> > struct fdtable *fdt;
> > - spin_lock(&files->file_lock);
> > - fdt = files_fdtable(files);
> > +
> > + rcu_read_lock_sched();
> > +
> > + while (unlikely(files->resize_in_progress)) {
> > + rcu_read_unlock_sched();
> > + wait_event(files->resize_wait, !files->resize_in_progress);
> > + rcu_read_lock_sched();
> > + }
> > + /* coupled with smp_wmb() in expand_fdtable() */
> > + smp_rmb();

> > + fdt = READ_ONCE(files->fdt);

This should be :
fdt = rcu_dereference_sched(files->fdt);

> > BUG_ON(fdt->fd[fd] != NULL);
> > rcu_assign_pointer(fdt->fd[fd], file);
> > - spin_unlock(&files->file_lock);
> > + rcu_read_unlock_sched();
> > }
> >
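For readers following along: both forms compile to a plain load on most
architectures; the win is documentation and debugging. A sketch of the
difference, as I understand the RCU API:

	fdt = READ_ONCE(files->fdt);	/* no checking at all */

	/*
	 * Folds in the Alpha dependency barrier and, under
	 * CONFIG_PROVE_RCU, warns unless the caller really is inside
	 * an rcu_read_lock_sched() section.
	 */
	fdt = rcu_dereference_sched(files->fdt);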

2015-04-29 04:25:10

by Eric Dumazet

[permalink] [raw]
Subject: [PATCH v2] fs/file.c: don't acquire files->file_lock in fd_install()

From: Eric Dumazet <[email protected]>

Mateusz Guzik reported :

Currently obtaining a new file descriptor results in locking fdtable
twice - once in order to reserve a slot and second time to fill it.

Holding the spinlock in __fd_install() is needed in case a resize is
done, or to prevent a resize.

Mateusz provided an RFC patch and a micro benchmark :
http://people.redhat.com/~mguzik/pipebench.c

A resize is an unlikely operation in a process lifetime,
as table size is at least doubled at every resize.

We can use RCU instead of the spinlock :

__fd_install() must wait if a resize is in progress.

The resize must block new __fd_install() callers from starting,
and wait until ongoing installs are finished (synchronize_sched())

resize should be attempted by a single thread to not waste resources.

rcu_sched variant is used, as __fd_install() and expand_fdtable() run
from process context.

It gives us a ~30% speedup using pipebench with 16 threads, and a ~10%
speedup with another program using TCP sockets.

Signed-off-by: Eric Dumazet <[email protected]>
Reported-by: Mateusz Guzik <[email protected]>
Acked-by: Mateusz Guzik <[email protected]>
Tested-by: Mateusz Guzik <[email protected]>
---
v2: replaced a READ_ONCE() with the more appropriate rcu_dereference_sched()

fs/file.c | 66 +++++++++++++++++++++++++++-----------
include/linux/fdtable.h | 3 +
2 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 93c5f89c248b..17cef5e52f16 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)

spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr);
+
+ /* make sure all __fd_install() have seen resize_in_progress
+ * or have finished their rcu_read_lock_sched() section.
+ */
+ if (atomic_read(&files->count) > 1)
+ synchronize_sched();
+
spin_lock(&files->file_lock);
if (!new_fdt)
return -ENOMEM;
@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
__free_fdtable(new_fdt);
return -EMFILE;
}
- /*
- * Check again since another task may have expanded the fd table while
- * we dropped the lock
- */
cur_fdt = files_fdtable(files);
- if (nr >= cur_fdt->max_fds) {
- /* Continue as planned */
- copy_fdtable(new_fdt, cur_fdt);
- rcu_assign_pointer(files->fdt, new_fdt);
- if (cur_fdt != &files->fdtab)
- call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
- } else {
- /* Somebody else expanded, so undo our attempt */
- __free_fdtable(new_fdt);
- }
+ BUG_ON(nr < cur_fdt->max_fds);
+ copy_fdtable(new_fdt, cur_fdt);
+ rcu_assign_pointer(files->fdt, new_fdt);
+ if (cur_fdt != &files->fdtab)
+ call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
+ /* coupled with smp_rmb() in __fd_install() */
+ smp_wmb();
return 1;
}

@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, int nr)
+ __releases(files->file_lock)
+ __acquires(files->file_lock)
{
struct fdtable *fdt;
+ int expanded = 0;

+repeat:
fdt = files_fdtable(files);

/* Do we need to expand? */
if (nr < fdt->max_fds)
- return 0;
+ return expanded;

/* Can we expand? */
if (nr >= sysctl_nr_open)
return -EMFILE;

+ if (unlikely(files->resize_in_progress)) {
+ spin_unlock(&files->file_lock);
+ expanded = 1;
+ wait_event(files->resize_wait, !files->resize_in_progress);
+ spin_lock(&files->file_lock);
+ goto repeat;
+ }
+
/* All good, so we try */
- return expand_fdtable(files, nr);
+ files->resize_in_progress = true;
+ expanded = expand_fdtable(files, nr);
+ files->resize_in_progress = false;
+
+ wake_up_all(&files->resize_wait);
+ return expanded;
}

static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
atomic_set(&newf->count, 1);

spin_lock_init(&newf->file_lock);
+ newf->resize_in_progress = false;
+ init_waitqueue_head(&newf->resize_wait);
newf->next_fd = 0;
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +572,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
struct fdtable *fdt;
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
+
+ rcu_read_lock_sched();
+
+ while (unlikely(files->resize_in_progress)) {
+ rcu_read_unlock_sched();
+ wait_event(files->resize_wait, !files->resize_in_progress);
+ rcu_read_lock_sched();
+ }
+ /* coupled with smp_wmb() in expand_fdtable() */
+ smp_rmb();
+ fdt = rcu_dereference_sched(files->fdt);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
- spin_unlock(&files->file_lock);
+ rcu_read_unlock_sched();
}

void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87bdf5ad..fbb88740634a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -47,6 +47,9 @@ struct files_struct {
* read mostly part
*/
atomic_t count;
+ bool resize_in_progress;
+ wait_queue_head_t resize_wait;
+
struct fdtable __rcu *fdt;
struct fdtable fdtab;
/*

2015-06-22 02:32:27

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH v2] fs/file.c: don't acquire files->file_lock in fd_install()

On Tue, Apr 28, 2015 at 09:25:03PM -0700, Eric Dumazet wrote:

> @@ -553,11 +572,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
> struct file *file)
> {
> struct fdtable *fdt;
> - spin_lock(&files->file_lock);
> - fdt = files_fdtable(files);
> +
> + rcu_read_lock_sched();
> +
> + while (unlikely(files->resize_in_progress)) {
> + rcu_read_unlock_sched();
> + wait_event(files->resize_wait, !files->resize_in_progress);
> + rcu_read_lock_sched();
> + }
> + /* coupled with smp_wmb() in expand_fdtable() */
> + smp_rmb();
> + fdt = rcu_dereference_sched(files->fdt);
> BUG_ON(fdt->fd[fd] != NULL);
> rcu_assign_pointer(fdt->fd[fd], file);
> - spin_unlock(&files->file_lock);
> + rcu_read_unlock_sched();

Umm... You've taken something that was safe to use in atomic contexts
and turned it into something that might wait for a GFP_KERNEL allocation;
what's to guarantee that no users get broken by that? At the very least,
you want to slap might_sleep() in there - the actual sleep is going to be
very rare, so it would be extremely hard to reproduce and debug.

AFAICS, all current in-tree users should be safe, but fd_install() is exported
and quiet changes of that sort are rather antisocial. Generally I don't give
a damn about out-of-tree code, but this one is over the top.

I _think_ it's otherwise OK, but please, add might_sleep() *AND* a note in
Documentation/filesystems/porting.
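Concretely, the request is for something like this at the top of
__fd_install() - and this is what v3 below ends up doing:

	void __fd_install(struct files_struct *files, unsigned int fd,
			  struct file *file)
	{
		struct fdtable *fdt;

		/*
		 * Document - and, with CONFIG_DEBUG_ATOMIC_SLEEP,
		 * enforce - that callers must be able to schedule,
		 * even though the wait_event() sleep is rare.
		 */
		might_sleep();
		rcu_read_lock_sched();
		/* ... rest as in the patch ... */
	}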

2015-06-23 05:31:36

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH v2] fs/file.c: don't acquire files->file_lock in fd_install()

On Mon, 2015-06-22 at 03:32 +0100, Al Viro wrote:
> On Tue, Apr 28, 2015 at 09:25:03PM -0700, Eric Dumazet wrote:
>
> > @@ -553,11 +572,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
> > struct file *file)
> > {
> > struct fdtable *fdt;
> > - spin_lock(&files->file_lock);
> > - fdt = files_fdtable(files);
> > +
> > + rcu_read_lock_sched();
> > +
> > + while (unlikely(files->resize_in_progress)) {
> > + rcu_read_unlock_sched();
> > + wait_event(files->resize_wait, !files->resize_in_progress);
> > + rcu_read_lock_sched();
> > + }
> > + /* coupled with smp_wmb() in expand_fdtable() */
> > + smp_rmb();
> > + fdt = rcu_dereference_sched(files->fdt);
> > BUG_ON(fdt->fd[fd] != NULL);
> > rcu_assign_pointer(fdt->fd[fd], file);
> > - spin_unlock(&files->file_lock);
> > + rcu_read_unlock_sched();
>
> Umm... You've taken something that was safe to use in atomic contexts
> and turned it into something that might wait for a GFP_KERNEL allocation;
> what's to guarantee that no users get broken by that? At the very least,
> you want to slap might_sleep() in there - the actual sleep is going to be
> very rare, so it would be extremely hard to reproduce and debug.
>
> AFAICS, all current in-tree users should be safe, but fd_install() is exported
> and quiet changes of that sort are rather antisocial. Generally I don't give
> a damn about out-of-tree code, but this one is over the top.
>
> I _think_ it's otherwise OK, but please, add might_sleep() *AND* a note in
> Documentation/filesystems/porting.
>

Good points. I am currently traveling and will address this asap.

Thanks

2015-06-30 13:54:24

by Eric Dumazet

[permalink] [raw]
Subject: [PATCH v3] fs/file.c: don't acquire files->file_lock in fd_install()

From: Eric Dumazet <[email protected]>

Mateusz Guzik reported :

Currently obtaining a new file descriptor results in locking fdtable
twice - once in order to reserve a slot and second time to fill it.

Holding the spinlock in __fd_install() is needed in case a resize is
done, or to prevent a resize.

Mateusz provided an RFC patch and a micro benchmark :
http://people.redhat.com/~mguzik/pipebench.c

A resize is an unlikely operation in a process lifetime,
as table size is at least doubled at every resize.

We can use RCU instead of the spinlock.

__fd_install() must wait if a resize is in progress.

The resize must block new __fd_install() callers from starting,
and wait until ongoing installs are finished (synchronize_sched())

resize should be attempted by a single thread to not waste resources.

rcu_sched variant is used, as __fd_install() and expand_fdtable() run
from process context.

It gives us a ~30% speedup using pipebench on a dual Intel(R) Xeon(R)
CPU E5-2696 v2 @ 2.50GHz

Signed-off-by: Eric Dumazet <[email protected]>
Reported-by: Mateusz Guzik <[email protected]>
Acked-by: Mateusz Guzik <[email protected]>
Tested-by: Mateusz Guzik <[email protected]>
---
v3: added a might_sleep() and a section in
Documentation/filesystems/porting
v2: replaced a READ_ONCE() with the more appropriate rcu_dereference_sched()

Documentation/filesystems/porting | 4 +
fs/file.c | 67 ++++++++++++++++++++--------
include/linux/fdtable.h | 3 +
3 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 68f1c9106573..f24d1b833957 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -500,3 +500,7 @@ in your dentry operations instead.
dentry, it does not get nameidata at all and it gets called only when cookie
is non-NULL. Note that link body isn't available anymore, so if you need it,
store it as cookie.
+--
+[mandatory]
+ __fd_install() & fd_install() can now sleep. Callers should not
+ hold a spinlock or other resources that do not allow a schedule.
diff --git a/fs/file.c b/fs/file.c
index 93c5f89c248b..3d2eb4c542a4 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)

spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr);
+
+ /* make sure all __fd_install() have seen resize_in_progress
+ * or have finished their rcu_read_lock_sched() section.
+ */
+ if (atomic_read(&files->count) > 1)
+ synchronize_sched();
+
spin_lock(&files->file_lock);
if (!new_fdt)
return -ENOMEM;
@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
__free_fdtable(new_fdt);
return -EMFILE;
}
- /*
- * Check again since another task may have expanded the fd table while
- * we dropped the lock
- */
cur_fdt = files_fdtable(files);
- if (nr >= cur_fdt->max_fds) {
- /* Continue as planned */
- copy_fdtable(new_fdt, cur_fdt);
- rcu_assign_pointer(files->fdt, new_fdt);
- if (cur_fdt != &files->fdtab)
- call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
- } else {
- /* Somebody else expanded, so undo our attempt */
- __free_fdtable(new_fdt);
- }
+ BUG_ON(nr < cur_fdt->max_fds);
+ copy_fdtable(new_fdt, cur_fdt);
+ rcu_assign_pointer(files->fdt, new_fdt);
+ if (cur_fdt != &files->fdtab)
+ call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
+ /* coupled with smp_rmb() in __fd_install() */
+ smp_wmb();
return 1;
}

@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, int nr)
+ __releases(files->file_lock)
+ __acquires(files->file_lock)
{
struct fdtable *fdt;
+ int expanded = 0;

+repeat:
fdt = files_fdtable(files);

/* Do we need to expand? */
if (nr < fdt->max_fds)
- return 0;
+ return expanded;

/* Can we expand? */
if (nr >= sysctl_nr_open)
return -EMFILE;

+ if (unlikely(files->resize_in_progress)) {
+ spin_unlock(&files->file_lock);
+ expanded = 1;
+ wait_event(files->resize_wait, !files->resize_in_progress);
+ spin_lock(&files->file_lock);
+ goto repeat;
+ }
+
/* All good, so we try */
- return expand_fdtable(files, nr);
+ files->resize_in_progress = true;
+ expanded = expand_fdtable(files, nr);
+ files->resize_in_progress = false;
+
+ wake_up_all(&files->resize_wait);
+ return expanded;
}

static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
atomic_set(&newf->count, 1);

spin_lock_init(&newf->file_lock);
+ newf->resize_in_progress = false;
+ init_waitqueue_head(&newf->resize_wait);
newf->next_fd = 0;
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
struct fdtable *fdt;
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
+
+ might_sleep();
+ rcu_read_lock_sched();
+
+ while (unlikely(files->resize_in_progress)) {
+ rcu_read_unlock_sched();
+ wait_event(files->resize_wait, !files->resize_in_progress);
+ rcu_read_lock_sched();
+ }
+ /* coupled with smp_wmb() in expand_fdtable() */
+ smp_rmb();
+ fdt = rcu_dereference_sched(files->fdt);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
- spin_unlock(&files->file_lock);
+ rcu_read_unlock_sched();
}

void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87bdf5ad..fbb88740634a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -47,6 +47,9 @@ struct files_struct {
* read mostly part
*/
atomic_t count;
+ bool resize_in_progress;
+ wait_queue_head_t resize_wait;
+
struct fdtable __rcu *fdt;
struct fdtable fdtab;
/*