2011-05-07 04:49:55

by Changli Gao

[permalink] [raw]
Subject: [PATCH] fs: add FD_CLOFORK and O_CLOFORK

If FD_CLOFORK is 1, when a fork occurs, the corresponding file descriptor
will be closed for the child process. IOW, the file descriptor isn't
inheritable.

FD_CLOFORK is used as IBM does.

O_CLOFORK is also added to avoid the additional fcntl(2) after open(2).

Signed-off-by: Changli Gao <[email protected]>
---
fs/fcntl.c | 27 +++++++++++++++++++++++++++
fs/file.c | 22 ++++++++++++++++++++--
include/asm-generic/fcntl.h | 5 +++++
include/linux/fdtable.h | 2 ++
include/linux/file.h | 1 +
5 files changed, 55 insertions(+), 2 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 22764c7..8127744 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -50,6 +50,31 @@ static int get_close_on_exec(unsigned int fd)
return res;
}

+void set_close_on_fork(unsigned int fd, int flag)
+{
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ if (flag)
+ FD_SET(fd, fdt->close_on_fork);
+ else
+ FD_CLR(fd, fdt->close_on_fork);
+ spin_unlock(&files->file_lock);
+}
+
+static int get_close_on_fork(unsigned int fd)
+{
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+ int res;
+ rcu_read_lock();
+ fdt = files_fdtable(files);
+ res = FD_ISSET(fd, fdt->close_on_fork);
+ rcu_read_unlock();
+ return res;
+}
+
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
int err = -EBADF;
@@ -358,10 +383,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
break;
case F_GETFD:
err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
+ err |= get_close_on_fork(fd) ? FD_CLOFORK : 0;
break;
case F_SETFD:
err = 0;
set_close_on_exec(fd, arg & FD_CLOEXEC);
+ set_close_on_fork(fd, arg & FD_CLOFORK);
break;
case F_GETFL:
err = filp->f_flags;
diff --git a/fs/file.c b/fs/file.c
index 0be3447..ef79197 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -133,6 +133,8 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
memset((char *)(nfdt->open_fds) + cpy, 0, set);
memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
+ memcpy(nfdt->close_on_fork, ofdt->close_on_fork, cpy);
+ memset((char *)(nfdt->close_on_fork) + cpy, 0, set);
}

static struct fdtable * alloc_fdtable(unsigned int nr)
@@ -170,12 +172,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
goto out_fdt;
fdt->fd = (struct file **)data;
data = alloc_fdmem(max_t(unsigned int,
- 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
+ 3 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
if (!data)
goto out_arr;
fdt->open_fds = (fd_set *)data;
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = (fd_set *)data;
+ data += nr / BITS_PER_BYTE;
+ fdt->close_on_fork = (fd_set *)data;
fdt->next = NULL;

return fdt;
@@ -303,6 +307,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+ new_fdt->close_on_fork = (fd_set *)&newf->close_on_fork_init;
new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
new_fdt->fd = &newf->fd_array[0];
new_fdt->next = NULL;
@@ -350,11 +355,18 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
old_fdt->open_fds->fds_bits, open_files/8);
memcpy(new_fdt->close_on_exec->fds_bits,
old_fdt->close_on_exec->fds_bits, open_files/8);
+ memcpy(new_fdt->close_on_fork->fds_bits,
+ old_fdt->close_on_fork->fds_bits, open_files/8);

for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
if (f) {
- get_file(f);
+ if (FD_ISSET(open_files - i, new_fdt->close_on_fork)) {
+ FD_CLR(open_files - i, new_fdt->open_fds);
+ f = NULL;
+ } else {
+ get_file(f);
+ }
} else {
/*
* The fd may be claimed in the fd bitmap but not yet
@@ -380,6 +392,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)

memset(&new_fdt->open_fds->fds_bits[start], 0, left);
memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
+ memset(&new_fdt->close_on_fork->fds_bits[start], 0, left);
}

rcu_assign_pointer(newf->fdt, new_fdt);
@@ -416,6 +429,7 @@ struct files_struct init_files = {
.max_fds = NR_OPEN_DEFAULT,
.fd = &init_files.fd_array[0],
.close_on_exec = (fd_set *)&init_files.close_on_exec_init,
+ .close_on_fork = (fd_set *)&init_files.close_on_fork_init,
.open_fds = (fd_set *)&init_files.open_fds_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
@@ -461,6 +475,10 @@ repeat:
FD_SET(fd, fdt->close_on_exec);
else
FD_CLR(fd, fdt->close_on_exec);
+ if (flags & O_CLOFORK)
+ FD_SET(fd, fdt->close_on_fork);
+ else
+ FD_CLR(fd, fdt->close_on_fork);
error = fd;
#if 1
/* Sanity check */
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index 84793c7..2a7c474 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -88,6 +88,10 @@
#define O_NDELAY O_NONBLOCK
#endif

+#ifndef O_CLOFORK
+#define O_CLOFORK 020000000 /* set close_on_fork */
+#endif
+
#define F_DUPFD 0 /* dup */
#define F_GETFD 1 /* get close_on_exec */
#define F_SETFD 2 /* set/clear close_on_exec */
@@ -131,6 +135,7 @@ struct f_owner_ex {

/* for F_[GET|SET]FL */
#define FD_CLOEXEC 1 /* actually anything with low bit set goes */
+#define FD_CLOFORK 2

/* for posix fcntl() and lockf() */
#ifndef F_RDLCK
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 133c0ba..bb9f0be 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -33,6 +33,7 @@ struct fdtable {
unsigned int max_fds;
struct file __rcu **fd; /* current fd array */
fd_set *close_on_exec;
+ fd_set *close_on_fork;
fd_set *open_fds;
struct rcu_head rcu;
struct fdtable *next;
@@ -54,6 +55,7 @@ struct files_struct {
spinlock_t file_lock ____cacheline_aligned_in_smp;
int next_fd;
struct embedded_fd_set close_on_exec_init;
+ struct embedded_fd_set close_on_fork_init;
struct embedded_fd_set open_fds_init;
struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};
diff --git a/include/linux/file.h b/include/linux/file.h
index 21a7995..c592d1f 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -32,6 +32,7 @@ extern struct file *fget_light(unsigned int fd, int *fput_needed);
extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);
extern void set_close_on_exec(unsigned int fd, int flag);
+extern void set_close_on_fork(unsigned int fd, int flag);
extern void put_filp(struct file *);
extern int alloc_fd(unsigned start, unsigned flags);
extern int get_unused_fd(void);


2011-05-07 06:18:40

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] fs: add FD_CLOFORK and O_CLOFORK

Le samedi 07 mai 2011 à 12:49 +0800, Changli Gao a écrit :
> If FD_CLOFORK is 1, when a fork occurs, the corresponding file descriptor
> will be closed for the child process. IOW, the file descriptor isn't
> inheritable.
>
> FD_CLOFORK is used as IBM does.

Is it part of a standard, and what could be the use for such thing ?
Why had we wait 2011 to add it in linux ?

>
> O_CLOFORK is also added to avoid the additional fcntl(2) after open(2).
>
> Signed-off-by: Changli Gao <[email protected]>


Your implementation has some peformance implication.

close_on_exec and close_on_fork bit for a given fd would be on separate
cache lines. So you add a cost on threaded programs for open()/close()

[ Yes, we apparently clear close_on_exec bit in close()... we could let
it untouched and make flush_old_files() aware of that ]


2011-05-07 06:06:44

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] fs: add FD_CLOFORK and O_CLOFORK

Le samedi 07 mai 2011 à 12:49 +0800, Changli Gao a écrit :

>
> for (i = open_files; i != 0; i--) {
> struct file *f = *old_fds++;
> if (f) {
> - get_file(f);
> + if (FD_ISSET(open_files - i, new_fdt->close_on_fork)) {
> + FD_CLR(open_files - i, new_fdt->open_fds);
> + f = NULL;
> + } else {
> + get_file(f);
> + }
> } else {
> /*

You should change the main loop to

for (i = 0; i < open_files; i++) {
struct file *f = *old_fds++;

if (f && FD_ISSET(i, new_fdt->close_on_fork))
f = NULL;
if (f)
get_file(f);
else
FD_CLR(i, new_fdt->open_fds)
rcu_assign_pointer(*new_fds++, f);
}

BTW the rcu_assign_pointer() is not necessary here, since we are the
only thread populating new_fds at this point.

spin_unlock(&oldf->file_lock); and
rcu_assign_pointer(newf->fdt, new_fdt);
make sure once new_fds is visible to other threads, all our memory
writes are committed.

sparse even warns us ;)

fs/file.c:371:3: warning: incorrect type in assignment (different address spaces)
fs/file.c:371:3: expected struct file *<noident>
fs/file.c:371:3: got struct file [noderef] <asn:4>*<noident>


I'll submit a patch in a separate thread

2011-05-07 06:22:59

by Changli Gao

[permalink] [raw]
Subject: Re: [PATCH] fs: add FD_CLOFORK and O_CLOFORK

On Sat, May 7, 2011 at 2:06 PM, Eric Dumazet <[email protected]> wrote:
> Le samedi 07 mai 2011 ? 12:49 +0800, Changli Gao a ?crit :
>
>>
>> ? ? ? for (i = open_files; i != 0; i--) {
>> ? ? ? ? ? ? ? struct file *f = *old_fds++;
>> ? ? ? ? ? ? ? if (f) {
>> - ? ? ? ? ? ? ? ? ? ? get_file(f);
>> + ? ? ? ? ? ? ? ? ? ? if (FD_ISSET(open_files - i, new_fdt->close_on_fork)) {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? FD_CLR(open_files - i, new_fdt->open_fds);
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? f = NULL;
>> + ? ? ? ? ? ? ? ? ? ? } else {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? get_file(f);
>> + ? ? ? ? ? ? ? ? ? ? }
>> ? ? ? ? ? ? ? } else {
>> ? ? ? ? ? ? ? ? ? ? ? /*
>
> You should change the main loop to
>
> for (i = 0; i < open_files; i++) {
> ? ? ? ?struct file *f = *old_fds++;
>
> ? ? ? ?if (f && FD_ISSET(i, new_fdt->close_on_fork))
> ? ? ? ? ? ? ? ?f = NULL;
> ? ? ? ?if (f)
> ? ? ? ? ? ? ? ?get_file(f);
> ? ? ? ?else
> ? ? ? ? ? ? ? ?FD_CLR(i, new_fdt->open_fds)
> ? ? ? ?rcu_assign_pointer(*new_fds++, f);
> }

It seems more clear. Thanks.

BTW: I will work on F_CLOSFD after this patch accepted. It is useful
to close all the opened file descriptors when programing a daemon.

--
Regards,
Changli Gao([email protected])

2011-05-07 06:29:22

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] fs: add FD_CLOFORK and O_CLOFORK

Le samedi 07 mai 2011 à 14:22 +0800, Changli Gao a écrit :

> It seems more clear. Thanks.
>
> BTW: I will work on F_CLOSFD after this patch accepted. It is useful
> to close all the opened file descriptors when programing a daemon.
>

I am sure the following idea already was discussed in the past :

Embed the close_on_exec / close_on_close bits in the low order bits of
file pointers.

Might be interesting to raise it again.



2011-05-07 10:09:46

by Changli Gao

[permalink] [raw]
Subject: Re: [PATCH] fs: add FD_CLOFORK and O_CLOFORK

On Sat, May 7, 2011 at 1:25 PM, Eric Dumazet <[email protected]> wrote:
> Le samedi 07 mai 2011 ? 12:49 +0800, Changli Gao a ?crit :
>> If FD_CLOFORK is 1, when a fork occurs, the corresponding file descriptor
>> will be closed for the child process. IOW, the file descriptor isn't
>> inheritable.
>>
>> FD_CLOFORK is used as IBM does.
>
> Is it part of a standard, and what could be the use for such thing ?
> Why had we wait 2011 to add it in linux ?
>

It isn't part of any standard. It can be used in multi-process
programs, which don't want the child processes inherit some file
descriptors. Here is a basic server program.

serv_sock = socket(...);
bind(serv_sock, ...);
listen(serv_sock, ...);
fcntl(serv_sock, F_SETFD, FD_CLOFORK);
while (1) {
clnt_sock = accept(serv_sock);
switch (fork()) {
case 0:
exit(do_serv(clnt_sock));
default:
break;
}
}

This flag can also been used instead of FD_CLOEXEC in the exec(2)
after fork(2) environment. As no such file descriptors is duplicated
between fork(2) and exec(2), the later close(2) in kernel won't be
needed when exec(2). It can improve the performance.

Thanks.

--
Regards,
Changli Gao([email protected])

2011-05-07 10:10:46

by Changli Gao

[permalink] [raw]
Subject: Re: [PATCH] fs: add FD_CLOFORK and O_CLOFORK

On Sat, May 7, 2011 at 2:29 PM, Eric Dumazet <[email protected]> wrote:
> Le samedi 07 mai 2011 ? 14:22 +0800, Changli Gao a ?crit :
>
>> It seems more clear. Thanks.
>>
>> BTW: I will work on F_CLOSFD after this patch accepted. It is useful
>> to close all the opened file descriptors when programing a daemon.
>>
>
> I am sure the following idea already was discussed in the past :
>
> Embed the close_on_exec / close_on_close bits in the low order bits of
> file pointers.
>
> Might be interesting to raise it again.
>
>

Excellent idea! Thanks.

--
Regards,
Changli Gao([email protected])

2011-05-07 11:42:06

by Geert Uytterhoeven

[permalink] [raw]
Subject: Re: [PATCH] fs: add FD_CLOFORK and O_CLOFORK

On Sat, May 7, 2011 at 08:29, Eric Dumazet <[email protected]> wrote:
> I am sure the following idea already was discussed in the past :
>
> Embed the close_on_exec / close_on_close bits in the low order bits of
> file pointers.

How many low order bits do you expect to have in file pointers?
Are files allocated using kmalloc(), or do they just obey the alignment of the
individual members of a file struct? In the latter case, you'll have onlyv1 low
order bit to play with on m68k.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

2011-05-07 12:32:41

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] fs: add FD_CLOFORK and O_CLOFORK

Le samedi 07 mai 2011 à 13:41 +0200, Geert Uytterhoeven a écrit :
> On Sat, May 7, 2011 at 08:29, Eric Dumazet <[email protected]> wrote:
> > I am sure the following idea already was discussed in the past :
> >
> > Embed the close_on_exec / close_on_close bits in the low order bits of
> > file pointers.
>
> How many low order bits do you expect to have in file pointers?
> Are files allocated using kmalloc(), or do they just obey the alignment of the
> individual members of a file struct? In the latter case, you'll have onlyv1 low
> order bit to play with on m68k.

We should have two bits at a very minimum.

clone_on_exec & clone_on_fork would fit.

filp are allocated with a kmem_cachep with cache line alignement.

filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

So in practice we should have more than two bits ;)