2020-04-20 07:19:16

by Karstens, Nate

[permalink] [raw]
Subject: Implement close-on-fork

Series of 4 patches to implement close-on-fork. Tests have been
published to https://github.com/nkarstens/ltp/tree/close-on-fork.

close-on-fork addresses race conditions in system(), which
(depending on the implementation) is non-atomic in that it
first calls a fork() and then an exec().

This functionality was approved by the Austin Common Standards
Revision Group for inclusion in the next revision of the POSIX
standard (see issue 1318 in the Austin Group Defect Tracker).

[PATCH 1/4] fs: Implement close-on-fork
[PATCH 2/4] fs: Add O_CLOFORK flag for open(2) and dup3(2)
[PATCH 3/4] fs: Add F_DUPFD_CLOFORK to fcntl(2)
[PATCH 4/4] net: Add SOCK_CLOFORK


2020-04-20 07:20:04

by Karstens, Nate

[permalink] [raw]
Subject: [PATCH 1/4] fs: Implement close-on-fork

The close-on-fork flag causes the file descriptor to be closed
atomically in the child process before the child process returns
from fork(). Implement this feature and provide a method to
get/set the close-on-fork flag using fcntl(2).

This functionality was approved by the Austin Common Standards
Revision Group for inclusion in the next revision of the POSIX
standard (see issue 1318 in the Austin Group Defect Tracker).

Co-developed-by: Changli Gao <[email protected]>
Signed-off-by: Changli Gao <[email protected]>
Signed-off-by: Nate Karstens <[email protected]>
---
fs/fcntl.c | 2 ++
fs/file.c | 50 +++++++++++++++++++++++++-
include/linux/fdtable.h | 7 ++++
include/linux/file.h | 2 ++
include/uapi/asm-generic/fcntl.h | 5 +--
tools/include/uapi/asm-generic/fcntl.h | 5 +--
6 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2e4c0fa2074b..23964abf4a1a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -335,10 +335,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
break;
case F_GETFD:
err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
+ err |= get_close_on_fork(fd) ? FD_CLOFORK : 0;
break;
case F_SETFD:
err = 0;
set_close_on_exec(fd, arg & FD_CLOEXEC);
+ set_close_on_fork(fd, arg & FD_CLOFORK);
break;
case F_GETFL:
err = filp->f_flags;
diff --git a/fs/file.c b/fs/file.c
index c8a4e4c86e55..de7260ba718d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -57,6 +57,8 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
memset((char *)nfdt->open_fds + cpy, 0, set);
memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
memset((char *)nfdt->close_on_exec + cpy, 0, set);
+ memcpy(nfdt->close_on_fork, ofdt->close_on_fork, cpy);
+ memset((char *)nfdt->close_on_fork + cpy, 0, set);

cpy = BITBIT_SIZE(count);
set = BITBIT_SIZE(nfdt->max_fds) - cpy;
@@ -118,7 +120,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
fdt->fd = data;

data = kvmalloc(max_t(size_t,
- 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
+ 3 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
GFP_KERNEL_ACCOUNT);
if (!data)
goto out_arr;
@@ -126,6 +128,8 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data;
data += nr / BITS_PER_BYTE;
+ fdt->close_on_fork = data;
+ data += nr / BITS_PER_BYTE;
fdt->full_fds_bits = data;

return fdt;
@@ -236,6 +240,17 @@ static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
__clear_bit(fd, fdt->close_on_exec);
}

+static inline void __set_close_on_fork(unsigned int fd, struct fdtable *fdt)
+{
+ __set_bit(fd, fdt->close_on_fork);
+}
+
+static inline void __clear_close_on_fork(unsigned int fd, struct fdtable *fdt)
+{
+ if (test_bit(fd, fdt->close_on_fork))
+ __clear_bit(fd, fdt->close_on_fork);
+}
+
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd, fdt->open_fds);
@@ -290,6 +305,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = newf->close_on_exec_init;
+ new_fdt->close_on_fork = newf->close_on_fork_init;
new_fdt->open_fds = newf->open_fds_init;
new_fdt->full_fds_bits = newf->full_fds_bits_init;
new_fdt->fd = &newf->fd_array[0];
@@ -337,6 +353,12 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)

for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
+
+ if (test_bit(open_files - i, new_fdt->close_on_fork)) {
+ __clear_bit(open_files - i, new_fdt->open_fds);
+ f = NULL;
+ }
+
if (f) {
get_file(f);
} else {
@@ -453,6 +475,7 @@ struct files_struct init_files = {
.max_fds = NR_OPEN_DEFAULT,
.fd = &init_files.fd_array[0],
.close_on_exec = init_files.close_on_exec_init,
+ .close_on_fork = init_files.close_on_fork_init,
.open_fds = init_files.open_fds_init,
.full_fds_bits = init_files.full_fds_bits_init,
},
@@ -865,6 +888,31 @@ bool get_close_on_exec(unsigned int fd)
return res;
}

+void set_close_on_fork(unsigned int fd, int flag)
+{
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ if (flag)
+ __set_close_on_fork(fd, fdt);
+ else
+ __clear_close_on_fork(fd, fdt);
+ spin_unlock(&files->file_lock);
+}
+
+bool get_close_on_fork(unsigned int fd)
+{
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+ bool res;
+ rcu_read_lock();
+ fdt = files_fdtable(files);
+ res = close_on_fork(fd, fdt);
+ rcu_read_unlock();
+ return res;
+}
+
static int do_dup2(struct files_struct *files,
struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f07c55ea0c22..61c551947fa3 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -27,6 +27,7 @@ struct fdtable {
unsigned int max_fds;
struct file __rcu **fd; /* current fd array */
unsigned long *close_on_exec;
+ unsigned long *close_on_fork;
unsigned long *open_fds;
unsigned long *full_fds_bits;
struct rcu_head rcu;
@@ -37,6 +38,11 @@ static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)
return test_bit(fd, fdt->close_on_exec);
}

+static inline bool close_on_fork(unsigned int fd, const struct fdtable *fdt)
+{
+ return test_bit(fd, fdt->close_on_fork);
+}
+
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
return test_bit(fd, fdt->open_fds);
@@ -61,6 +67,7 @@ struct files_struct {
spinlock_t file_lock ____cacheline_aligned_in_smp;
unsigned int next_fd;
unsigned long close_on_exec_init[1];
+ unsigned long close_on_fork_init[1];
unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
struct file __rcu * fd_array[NR_OPEN_DEFAULT];
diff --git a/include/linux/file.h b/include/linux/file.h
index 142d102f285e..86fbb36b438b 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -85,6 +85,8 @@ extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
extern void set_close_on_exec(unsigned int fd, int flag);
extern bool get_close_on_exec(unsigned int fd);
+extern void set_close_on_fork(unsigned int fd, int flag);
+extern bool get_close_on_fork(unsigned int fd);
extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile);
extern int get_unused_fd_flags(unsigned flags);
extern void put_unused_fd(unsigned int fd);
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 9dc0bf0c5a6e..0cb7199a7743 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -98,8 +98,8 @@
#endif

#define F_DUPFD 0 /* dup */
-#define F_GETFD 1 /* get close_on_exec */
-#define F_SETFD 2 /* set/clear close_on_exec */
+#define F_GETFD 1 /* get close_on_exec & close_on_fork */
+#define F_SETFD 2 /* set/clear close_on_exec & close_on_fork */
#define F_GETFL 3 /* get file->f_flags */
#define F_SETFL 4 /* set file->f_flags */
#ifndef F_GETLK
@@ -160,6 +160,7 @@ struct f_owner_ex {

/* for F_[GET|SET]FL */
#define FD_CLOEXEC 1 /* actually anything with low bit set goes */
+#define FD_CLOFORK 2

/* for posix fcntl() and lockf() */
#ifndef F_RDLCK
diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h
index ac190958c981..e04a00fecb4a 100644
--- a/tools/include/uapi/asm-generic/fcntl.h
+++ b/tools/include/uapi/asm-generic/fcntl.h
@@ -97,8 +97,8 @@
#endif

#define F_DUPFD 0 /* dup */
-#define F_GETFD 1 /* get close_on_exec */
-#define F_SETFD 2 /* set/clear close_on_exec */
+#define F_GETFD 1 /* get close_on_exec & close_on_fork */
+#define F_SETFD 2 /* set/clear close_on_exec & close_on_fork */
#define F_GETFL 3 /* get file->f_flags */
#define F_SETFL 4 /* set file->f_flags */
#ifndef F_GETLK
@@ -159,6 +159,7 @@ struct f_owner_ex {

/* for F_[GET|SET]FL */
#define FD_CLOEXEC 1 /* actually anything with low bit set goes */
+#define FD_CLOFORK 2

/* for posix fcntl() and lockf() */
#ifndef F_RDLCK
--
2.26.1

2020-04-20 10:28:59

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH 1/4] fs: Implement close-on-fork



On 4/20/20 12:15 AM, Nate Karstens wrote:
> The close-on-fork flag causes the file descriptor to be closed
> atomically in the child process before the child process returns
> from fork(). Implement this feature and provide a method to
> get/set the close-on-fork flag using fcntl(2).
>
> This functionality was approved by the Austin Common Standards
> Revision Group for inclusion in the next revision of the POSIX
> standard (see issue 1318 in the Austin Group Defect Tracker).

Oh well... yet another feature slowing down a critical path.

>
> Co-developed-by: Changli Gao <[email protected]>
> Signed-off-by: Changli Gao <[email protected]>
> Signed-off-by: Nate Karstens <[email protected]>
> ---
> fs/fcntl.c | 2 ++
> fs/file.c | 50 +++++++++++++++++++++++++-
> include/linux/fdtable.h | 7 ++++
> include/linux/file.h | 2 ++
> include/uapi/asm-generic/fcntl.h | 5 +--
> tools/include/uapi/asm-generic/fcntl.h | 5 +--
> 6 files changed, 66 insertions(+), 5 deletions(-)
>
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index 2e4c0fa2074b..23964abf4a1a 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -335,10 +335,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
> break;
> case F_GETFD:
> err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
> + err |= get_close_on_fork(fd) ? FD_CLOFORK : 0;
> break;
> case F_SETFD:
> err = 0;
> set_close_on_exec(fd, arg & FD_CLOEXEC);
> + set_close_on_fork(fd, arg & FD_CLOFORK);
> break;
> case F_GETFL:
> err = filp->f_flags;
> diff --git a/fs/file.c b/fs/file.c
> index c8a4e4c86e55..de7260ba718d 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -57,6 +57,8 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
> memset((char *)nfdt->open_fds + cpy, 0, set);
> memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
> memset((char *)nfdt->close_on_exec + cpy, 0, set);
> + memcpy(nfdt->close_on_fork, ofdt->close_on_fork, cpy);
> + memset((char *)nfdt->close_on_fork + cpy, 0, set);
>

I suggest we group the two bits of a file (close_on_exec, close_on_fork) together,
so that we do not have to dirty two separate cache lines.

Otherwise we will add yet another cache line miss at every file opening/closing for processes
with big file tables.

Ie having a _single_ bitmap array, even bit for close_on_exec, odd bit for close_on_fork

static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd * 2, fdt->close_on_fork_exec);
}

static inline void __set_close_on_fork(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd * 2 + 1, fdt->close_on_fork_exec);
}

Also the F_GETFD/F_SETFD implementation must use a single function call,
to not acquire the spinlock twice.


2020-04-22 03:40:11

by Changli Gao

[permalink] [raw]
Subject: Re: [PATCH 1/4] fs: Implement close-on-fork

On Mon, Apr 20, 2020 at 6:25 PM Eric Dumazet <[email protected]> wrote:
>
>
>
> On 4/20/20 12:15 AM, Nate Karstens wrote:
> > The close-on-fork flag causes the file descriptor to be closed
> > atomically in the child process before the child process returns
> > from fork(). Implement this feature and provide a method to
> > get/set the close-on-fork flag using fcntl(2).
> >
> > This functionality was approved by the Austin Common Standards
> > Revision Group for inclusion in the next revision of the POSIX
> > standard (see issue 1318 in the Austin Group Defect Tracker).
>
> Oh well... yet another feature slowing down a critical path.
>
> >
> > Co-developed-by: Changli Gao <[email protected]>
> > Signed-off-by: Changli Gao <[email protected]>
> > Signed-off-by: Nate Karstens <[email protected]>
> > ---
> > fs/fcntl.c | 2 ++
> > fs/file.c | 50 +++++++++++++++++++++++++-
> > include/linux/fdtable.h | 7 ++++
> > include/linux/file.h | 2 ++
> > include/uapi/asm-generic/fcntl.h | 5 +--
> > tools/include/uapi/asm-generic/fcntl.h | 5 +--
> > 6 files changed, 66 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > index 2e4c0fa2074b..23964abf4a1a 100644
> > --- a/fs/fcntl.c
> > +++ b/fs/fcntl.c
> > @@ -335,10 +335,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
> > break;
> > case F_GETFD:
> > err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
> > + err |= get_close_on_fork(fd) ? FD_CLOFORK : 0;
> > break;
> > case F_SETFD:
> > err = 0;
> > set_close_on_exec(fd, arg & FD_CLOEXEC);
> > + set_close_on_fork(fd, arg & FD_CLOFORK);
> > break;
> > case F_GETFL:
> > err = filp->f_flags;
> > diff --git a/fs/file.c b/fs/file.c
> > index c8a4e4c86e55..de7260ba718d 100644
> > --- a/fs/file.c
> > +++ b/fs/file.c
> > @@ -57,6 +57,8 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
> > memset((char *)nfdt->open_fds + cpy, 0, set);
> > memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
> > memset((char *)nfdt->close_on_exec + cpy, 0, set);
> > + memcpy(nfdt->close_on_fork, ofdt->close_on_fork, cpy);
> > + memset((char *)nfdt->close_on_fork + cpy, 0, set);
> >
>
> I suggest we group the two bits of a file (close_on_exec, close_on_fork) together,
> so that we do not have to dirty two separate cache lines.
>
> Otherwise we will add yet another cache line miss at every file opening/closing for processes
> with big file tables.
>
> Ie having a _single_ bitmap array, even bit for close_on_exec, odd bit for close_on_fork
>
> static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
> {
> __set_bit(fd * 2, fdt->close_on_fork_exec);
> }
>
> static inline void __set_close_on_fork(unsigned int fd, struct fdtable *fdt)
> {
> __set_bit(fd * 2 + 1, fdt->close_on_fork_exec);
> }
>
> Also the F_GETFD/F_SETFD implementation must use a single function call,
> to not acquire the spinlock twice.
>

Good suggestions.

At the same time, we'd better extend other syscalls, which set the
FD_CLOEXEC when creating FDs, e.g. open(2), pipe2(2)...


--
Regards,
Changli Gao([email protected])

2020-04-22 03:42:53

by Changli Gao

[permalink] [raw]
Subject: Re: [PATCH 1/4] fs: Implement close-on-fork

On Wed, Apr 22, 2020 at 11:38 AM Changli Gao <[email protected]> wrote:
> At the same time, we'd better extend other syscalls, which set the
> FD_CLOEXEC when creating FDs, e.g. open(2), pipe2(2)...
>

Ignore me, I missed the latter patches.



--
Regards,
Changli Gao([email protected])

2020-04-22 08:37:04

by David Laight

[permalink] [raw]
Subject: RE: [PATCH 1/4] fs: Implement close-on-fork

From: Eric Dumazet
> Sent: 20 April 2020 11:26
> On 4/20/20 12:15 AM, Nate Karstens wrote:
> > The close-on-fork flag causes the file descriptor to be closed
> > atomically in the child process before the child process returns
> > from fork(). Implement this feature and provide a method to
> > get/set the close-on-fork flag using fcntl(2).
> >
> > This functionality was approved by the Austin Common Standards
> > Revision Group for inclusion in the next revision of the POSIX
> > standard (see issue 1318 in the Austin Group Defect Tracker).
>
> Oh well... yet another feature slowing down a critical path.
...
> I suggest we group the two bits of a file (close_on_exec, close_on_fork) together,
> so that we do not have to dirty two separate cache lines.
>
> Otherwise we will add yet another cache line miss at every file opening/closing for processes
> with big file tables.

How about only allocating the 'close on fork' bitmap the first time
a process sets a bit in it?

Off hand I can't imagine the use case.
I thought posix always shared fd tables across fork().

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2020-04-22 14:40:54

by James Bottomley

[permalink] [raw]
Subject: Re: Implement close-on-fork

On Mon, 2020-04-20 at 02:15 -0500, Nate Karstens wrote:
> Series of 4 patches to implement close-on-fork. Tests have been
> published to https://github.com/nkarstens/ltp/tree/close-on-fork.
>
> close-on-fork addresses race conditions in system(), which
> (depending on the implementation) is non-atomic in that it
> first calls a fork() and then an exec().

Why is this a problem? I get that there's a time between fork and exec
when you have open file descriptors, but they should still be running
in the binary context of the programme that called fork, i.e. under
your control. The security problems don't seem to occur until you exec
some random binary, which close on exec covers. So what problem would
close on fork fix?

> This functionality was approved by the Austin Common Standards
> Revision Group for inclusion in the next revision of the POSIX
> standard (see issue 1318 in the Austin Group Defect Tracker).

URL? Does this standard give a reason why the functionality might be
useful.

James

2020-04-22 15:08:09

by Al Viro

[permalink] [raw]
Subject: Re: Implement close-on-fork

On Mon, Apr 20, 2020 at 02:15:44AM -0500, Nate Karstens wrote:
> Series of 4 patches to implement close-on-fork. Tests have been
> published to https://github.com/nkarstens/ltp/tree/close-on-fork.
>
> close-on-fork addresses race conditions in system(), which
> (depending on the implementation) is non-atomic in that it
> first calls a fork() and then an exec().
>
> This functionality was approved by the Austin Common Standards
> Revision Group for inclusion in the next revision of the POSIX
> standard (see issue 1318 in the Austin Group Defect Tracker).

What exactly the reasons are and why would we want to implement that?

Pardon me, but going by the previous history, "The Austin Group Says It's
Good" is more of a source of concern regarding the merits, general sanity
and, most of all, good taste of a proposal.

I'm not saying that it's automatically bad, but you'll have to go much
deeper into the rationale of that change before your proposal is taken
seriously.

2020-04-22 15:20:04

by Matthew Wilcox

[permalink] [raw]
Subject: Re: Implement close-on-fork

On Wed, Apr 22, 2020 at 04:01:07PM +0100, Al Viro wrote:
> On Mon, Apr 20, 2020 at 02:15:44AM -0500, Nate Karstens wrote:
> > Series of 4 patches to implement close-on-fork. Tests have been
> > published to https://github.com/nkarstens/ltp/tree/close-on-fork.
> >
> > close-on-fork addresses race conditions in system(), which
> > (depending on the implementation) is non-atomic in that it
> > first calls a fork() and then an exec().
> >
> > This functionality was approved by the Austin Common Standards
> > Revision Group for inclusion in the next revision of the POSIX
> > standard (see issue 1318 in the Austin Group Defect Tracker).
>
> What exactly the reasons are and why would we want to implement that?
>
> Pardon me, but going by the previous history, "The Austin Group Says It's
> Good" is more of a source of concern regarding the merits, general sanity
> and, most of all, good taste of a proposal.
>
> I'm not saying that it's automatically bad, but you'll have to go much
> deeper into the rationale of that change before your proposal is taken
> seriously.

https://www.mail-archive.com/[email protected]/msg05324.html
might be useful

2020-04-22 15:36:02

by James Bottomley

[permalink] [raw]
Subject: Re: Implement close-on-fork

On Wed, 2020-04-22 at 08:18 -0700, Matthew Wilcox wrote:
> On Wed, Apr 22, 2020 at 04:01:07PM +0100, Al Viro wrote:
> > On Mon, Apr 20, 2020 at 02:15:44AM -0500, Nate Karstens wrote:
> > > Series of 4 patches to implement close-on-fork. Tests have been
> > > published to https://github.com/nkarstens/ltp/tree/close-on-fork.
> > >
> > > close-on-fork addresses race conditions in system(), which
> > > (depending on the implementation) is non-atomic in that it
> > > first calls a fork() and then an exec().
> > >
> > > This functionality was approved by the Austin Common Standards
> > > Revision Group for inclusion in the next revision of the POSIX
> > > standard (see issue 1318 in the Austin Group Defect Tracker).
> >
> > What exactly the reasons are and why would we want to implement
> > that?
> >
> > Pardon me, but going by the previous history, "The Austin Group
> > Says It's Good" is more of a source of concern regarding the
> > merits, general sanity and, most of all, good taste of a proposal.
> >
> > I'm not saying that it's automatically bad, but you'll have to go
> > much deeper into the rationale of that change before your proposal
> > is taken seriously.
>
> https://www.mail-archive.com/[email protected]/msg05324.html
> might be useful

So the problem is an application is written in such a way that the time
window after it forks and before it execs can cause a file descriptor
based resource to be held when the application state thinks it should
have been released because of a mismatch in the expected use count?

Might it not be easier to rewrite the application for this problem
rather than the kernel? Especially as the best justification in the
entire thread seems to be "because solaris had it".

James

2020-04-22 15:38:35

by Karstens, Nate

[permalink] [raw]
Subject: RE: [PATCH 1/4] fs: Implement close-on-fork

Thanks for everyone's feedback so far. There have been a few questions on why this feature is necessary/desired, so I'll describe that here.

We are running Linux on an embedded system. The platform can change the IP address either according to a proprietary negotiation scheme or a manual setting. The application uses netlink to listen for IP address changes; when this occurs the application closes all of its sockets and re-opens them using the new address. A problem can occur if the application is simultaneously fork/exec-ing a new process. The parent process attempts to bind a new socket to a port that it had previously bound to (before the IP address change), only to fail because the child process continues to hold a socket bound to that port.

Our initial solution was to use pthread_atfork() handlers to lock a mutex and wait for the child process to close all of its sockets (as signaled through an eventfd) before the parent attempts to create them again. This doesn't work if the application uses system() anywhere. glibc does not invoke pthread_atfork() handlers; older versions did but this was removed, and the Linux manpage for system(3) notes that "According to POSIX.1, it is unspecified whether handlers registered using pthread_atfork(3) are called during the execution of system(). In the glibc implementation, such handlers are not called."

This issue was discussed in the Austin Group mailing list; the root message is here:

https://www.mail-archive.com/[email protected]/msg05324.html

There was some skepticism about whether our practice of closing/reopening sockets was advisable. Regardless, it does expose what I believe to be something that was overlooked in the forking process model. We posted two solutions to the Austin Group defect tracker:

http://austingroupbugs.net/view.php?id=1317
http://austingroupbugs.net/view.php?id=1318

Ultimately the Austin Group felt that close-on-fork was the preferred approach. I think it's also worth pointing that out Solaris reportedly has this feature (https://www.mail-archive.com/[email protected]/msg05359.html).

Cheers,

Nate

-----Original Message-----
From: Karstens, Nate <[email protected]>
Sent: Monday, April 20, 2020 02:16
To: Alexander Viro <[email protected]>; Jeff Layton <[email protected]>; J. Bruce Fields <[email protected]>; Arnd Bergmann <[email protected]>; Richard Henderson <[email protected]>; Ivan Kokshaysky <[email protected]>; Matt Turner <[email protected]>; James E.J. Bottomley <[email protected]>; Helge Deller <[email protected]>; David S. Miller <[email protected]>; Jakub Kicinski <[email protected]>; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]
Cc: Changli Gao <[email protected]>; Karstens, Nate <[email protected]>
Subject: [PATCH 1/4] fs: Implement close-on-fork

The close-on-fork flag causes the file descriptor to be closed atomically in the child process before the child process returns from fork(). Implement this feature and provide a method to get/set the close-on-fork flag using fcntl(2).

This functionality was approved by the Austin Common Standards Revision Group for inclusion in the next revision of the POSIX standard (see issue 1318 in the Austin Group Defect Tracker).

Co-developed-by: Changli Gao <[email protected]>
Signed-off-by: Changli Gao <[email protected]>
Signed-off-by: Nate Karstens <[email protected]>
---
fs/fcntl.c | 2 ++
fs/file.c | 50 +++++++++++++++++++++++++-
include/linux/fdtable.h | 7 ++++
include/linux/file.h | 2 ++
include/uapi/asm-generic/fcntl.h | 5 +--
tools/include/uapi/asm-generic/fcntl.h | 5 +--
6 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2e4c0fa2074b..23964abf4a1a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -335,10 +335,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
break;
case F_GETFD:
err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
+ err |= get_close_on_fork(fd) ? FD_CLOFORK : 0;
break;
case F_SETFD:
err = 0;
set_close_on_exec(fd, arg & FD_CLOEXEC);
+ set_close_on_fork(fd, arg & FD_CLOFORK);
break;
case F_GETFL:
err = filp->f_flags;
diff --git a/fs/file.c b/fs/file.c
index c8a4e4c86e55..de7260ba718d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -57,6 +57,8 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
memset((char *)nfdt->open_fds + cpy, 0, set);
memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
memset((char *)nfdt->close_on_exec + cpy, 0, set);
+ memcpy(nfdt->close_on_fork, ofdt->close_on_fork, cpy);
+ memset((char *)nfdt->close_on_fork + cpy, 0, set);

cpy = BITBIT_SIZE(count);
set = BITBIT_SIZE(nfdt->max_fds) - cpy; @@ -118,7 +120,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
fdt->fd = data;

data = kvmalloc(max_t(size_t,
- 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
+ 3 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
GFP_KERNEL_ACCOUNT);
if (!data)
goto out_arr;
@@ -126,6 +128,8 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data;
data += nr / BITS_PER_BYTE;
+ fdt->close_on_fork = data;
+ data += nr / BITS_PER_BYTE;
fdt->full_fds_bits = data;

return fdt;
@@ -236,6 +240,17 @@ static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
__clear_bit(fd, fdt->close_on_exec);
}

+static inline void __set_close_on_fork(unsigned int fd, struct fdtable
+*fdt) {
+ __set_bit(fd, fdt->close_on_fork);
+}
+
+static inline void __clear_close_on_fork(unsigned int fd, struct
+fdtable *fdt) {
+ if (test_bit(fd, fdt->close_on_fork))
+ __clear_bit(fd, fdt->close_on_fork);
+}
+
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) {
__set_bit(fd, fdt->open_fds);
@@ -290,6 +305,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = newf->close_on_exec_init;
+ new_fdt->close_on_fork = newf->close_on_fork_init;
new_fdt->open_fds = newf->open_fds_init;
new_fdt->full_fds_bits = newf->full_fds_bits_init;
new_fdt->fd = &newf->fd_array[0];
@@ -337,6 +353,12 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)

for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
+
+ if (test_bit(open_files - i, new_fdt->close_on_fork)) {
+ __clear_bit(open_files - i, new_fdt->open_fds);
+ f = NULL;
+ }
+
if (f) {
get_file(f);
} else {
@@ -453,6 +475,7 @@ struct files_struct init_files = {
.max_fds = NR_OPEN_DEFAULT,
.fd = &init_files.fd_array[0],
.close_on_exec = init_files.close_on_exec_init,
+ .close_on_fork = init_files.close_on_fork_init,
.open_fds = init_files.open_fds_init,
.full_fds_bits = init_files.full_fds_bits_init,
},
@@ -865,6 +888,31 @@ bool get_close_on_exec(unsigned int fd)
return res;
}

+void set_close_on_fork(unsigned int fd, int flag) {
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ if (flag)
+ __set_close_on_fork(fd, fdt);
+ else
+ __clear_close_on_fork(fd, fdt);
+ spin_unlock(&files->file_lock);
+}
+
+bool get_close_on_fork(unsigned int fd) {
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+ bool res;
+ rcu_read_lock();
+ fdt = files_fdtable(files);
+ res = close_on_fork(fd, fdt);
+ rcu_read_unlock();
+ return res;
+}
+
static int do_dup2(struct files_struct *files,
struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index f07c55ea0c22..61c551947fa3 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -27,6 +27,7 @@ struct fdtable {
unsigned int max_fds;
struct file __rcu **fd; /* current fd array */
unsigned long *close_on_exec;
+ unsigned long *close_on_fork;
unsigned long *open_fds;
unsigned long *full_fds_bits;
struct rcu_head rcu;
@@ -37,6 +38,11 @@ static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)
return test_bit(fd, fdt->close_on_exec); }

+static inline bool close_on_fork(unsigned int fd, const struct fdtable
+*fdt) {
+ return test_bit(fd, fdt->close_on_fork); }
+
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) {
return test_bit(fd, fdt->open_fds);
@@ -61,6 +67,7 @@ struct files_struct {
spinlock_t file_lock ____cacheline_aligned_in_smp;
unsigned int next_fd;
unsigned long close_on_exec_init[1];
+ unsigned long close_on_fork_init[1];
unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
struct file __rcu * fd_array[NR_OPEN_DEFAULT]; diff --git a/include/linux/file.h b/include/linux/file.h index 142d102f285e..86fbb36b438b 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -85,6 +85,8 @@ extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd);
+extern void set_close_on_fork(unsigned int fd, int flag); extern bool
+get_close_on_fork(unsigned int fd);
extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile); extern int get_unused_fd_flags(unsigned flags); extern void put_unused_fd(unsigned int fd); diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 9dc0bf0c5a6e..0cb7199a7743 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -98,8 +98,8 @@
#endif

#define F_DUPFD 0 /* dup */
-#define F_GETFD 1 /* get close_on_exec */
-#define F_SETFD 2 /* set/clear close_on_exec */
+#define F_GETFD 1 /* get close_on_exec & close_on_fork */
+#define F_SETFD 2 /* set/clear close_on_exec & close_on_fork */
#define F_GETFL 3 /* get file->f_flags */
#define F_SETFL 4 /* set file->f_flags */
#ifndef F_GETLK
@@ -160,6 +160,7 @@ struct f_owner_ex {

/* for F_[GET|SET]FL */
#define FD_CLOEXEC 1 /* actually anything with low bit set goes */
+#define FD_CLOFORK 2

/* for posix fcntl() and lockf() */
#ifndef F_RDLCK
diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h
index ac190958c981..e04a00fecb4a 100644
--- a/tools/include/uapi/asm-generic/fcntl.h
+++ b/tools/include/uapi/asm-generic/fcntl.h
@@ -97,8 +97,8 @@
#endif

#define F_DUPFD 0 /* dup */
-#define F_GETFD 1 /* get close_on_exec */
-#define F_SETFD 2 /* set/clear close_on_exec */
+#define F_GETFD 1 /* get close_on_exec & close_on_fork */
+#define F_SETFD 2 /* set/clear close_on_exec & close_on_fork */
#define F_GETFL 3 /* get file->f_flags */
#define F_SETFL 4 /* set file->f_flags */
#ifndef F_GETLK
@@ -159,6 +159,7 @@ struct f_owner_ex {

/* for F_[GET|SET]FL */
#define FD_CLOEXEC 1 /* actually anything with low bit set goes */
+#define FD_CLOFORK 2

/* for posix fcntl() and lockf() */
#ifndef F_RDLCK
--
2.26.1

2020-04-22 15:46:00

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/4] fs: Implement close-on-fork

On Wed, Apr 22, 2020 at 03:36:09PM +0000, Karstens, Nate wrote:
> There was some skepticism about whether our practice of
> closing/reopening sockets was advisable. Regardless, it does expose what
> I believe to be something that was overlooked in the forking process
> model. We posted two solutions to the Austin Group defect tracker:

I don't think it was "overlooked" at all. It's not safe to call system()
from a threaded app. That's all. It's right there in the DESCRIPTION:

The system() function need not be thread-safe.
https://pubs.opengroup.org/onlinepubs/9699919799/functions/system.html

> Ultimately the Austin Group felt that close-on-fork
> was the preferred approach. I think it's also worth
pointing out that Solaris reportedly
> (https://www.mail-archive.com/[email protected]/msg05359.html).

I am perplexed that the Austin Group thought this was a good idea.

2020-04-22 16:02:42

by Al Viro

[permalink] [raw]
Subject: Re: Implement close-on-fork

On Wed, Apr 22, 2020 at 08:18:15AM -0700, Matthew Wilcox wrote:
> On Wed, Apr 22, 2020 at 04:01:07PM +0100, Al Viro wrote:
> > On Mon, Apr 20, 2020 at 02:15:44AM -0500, Nate Karstens wrote:
> > > Series of 4 patches to implement close-on-fork. Tests have been
> > > published to https://github.com/nkarstens/ltp/tree/close-on-fork.
> > >
> > > close-on-fork addresses race conditions in system(), which
> > > (depending on the implementation) is non-atomic in that it
> > > first calls a fork() and then an exec().
> > >
> > > This functionality was approved by the Austin Common Standards
> > > Revision Group for inclusion in the next revision of the POSIX
> > > standard (see issue 1318 in the Austin Group Defect Tracker).
> >
> > What exactly the reasons are and why would we want to implement that?
> >
> > Pardon me, but going by the previous history, "The Austin Group Says It's
> > Good" is more of a source of concern regarding the merits, general sanity
> > and, most of all, good taste of a proposal.
> >
> > I'm not saying that it's automatically bad, but you'll have to go much
> > deeper into the rationale of that change before your proposal is taken
> > seriously.
>
> https://www.mail-archive.com/[email protected]/msg05324.html
> might be useful

*snort*

Alan Coopersmith in that thread:
|| https://lwn.net/Articles/785430/ suggests AIX, BSD, & MacOS have also defined
|| it, and though it's been proposed multiple times for Linux, never adopted there.

Now, look at the article in question. You'll see that it should've been
"someone's posting in the end of comments thread under LWN article says that
apparently it exists on AIX, BSD, ..."

The strength of evidence aside, that got me curious; I have checked the
source of FreeBSD, NetBSD and OpenBSD. No such thing exists in either of
their kernels, so at least that part can be considered an urban legend.

As for the original problem... what kind of exclusion is used between
the reaction to netlink notifications (including closing every socket,
etc.) and actual IO done on those sockets?

2020-04-22 16:04:56

by Karstens, Nate

[permalink] [raw]
Subject: RE: [PATCH 1/4] fs: Implement close-on-fork

> It's not safe to call system() from a threaded app. That's all. It's right there in the DESCRIPTION:

That is true, but that description is missing from both the Linux man page and the glibc documentation (https://www.gnu.org/software/libc/manual/html_mono/libc.html#Running-a-Command). It seems like a minor point that won't be noticed until it causes a problem, and problems are rare enough they might go unnoticed for a while. We have removed system() from our application, but we're also concerned that libraries we integrate will use system() without our knowledge.

-----Original Message-----
From: Matthew Wilcox <[email protected]>
Sent: Wednesday, April 22, 2020 10:44
To: Karstens, Nate <[email protected]>
Cc: Alexander Viro <[email protected]>; Jeff Layton <[email protected]>; J. Bruce Fields <[email protected]>; Arnd Bergmann <[email protected]>; Richard Henderson <[email protected]>; Ivan Kokshaysky <[email protected]>; Matt Turner <[email protected]>; James E.J. Bottomley <[email protected]>; Helge Deller <[email protected]>; David S. Miller <[email protected]>; Jakub Kicinski <[email protected]>; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; David Laight <[email protected]>; Changli Gao <[email protected]>
Subject: Re: [PATCH 1/4] fs: Implement close-on-fork

CAUTION - EXTERNAL EMAIL: Do not click any links or open any attachments unless you trust the sender and know the content is safe.


On Wed, Apr 22, 2020 at 03:36:09PM +0000, Karstens, Nate wrote:
> There was some skepticism about whether our practice of
> closing/reopening sockets was advisable. Regardless, it does expose
> what I believe to be something that was overlooked in the forking
> process model. We posted two solutions to the Austin Group defect tracker:

I don't think it was "overlooked" at all. It's not safe to call system() from a threaded app. That's all. It's right there in the DESCRIPTION:

The system() function need not be thread-safe.
https://pubs.opengroup.org/onlinepubs/9699919799/functions/system.html

> Ultimately the Austin Group felt that close-on-fork was the preferred
> approach. I think it's also worth pointing that out Solaris reportedly
> has this feature
> (https://www.mail-archive.com/[email protected]/msg05359.html).

I am perplexed that the Austin Group thought this was a good idea.

________________________________

CONFIDENTIALITY NOTICE: This email and any attachments are for the sole use of the intended recipient(s) and contain information that may be Garmin confidential and/or Garmin legally privileged. If you have received this email in error, please notify the sender by reply email and delete the message. Any disclosure, copying, distribution or use of this communication (including attachments) by someone other than the intended recipient is prohibited. Thank you.

2020-04-22 16:17:52

by Al Viro

[permalink] [raw]
Subject: Re: Implement close-on-fork

On Wed, Apr 22, 2020 at 05:00:32PM +0100, Al Viro wrote:

> *snort*
>
> Alan Coopersmith in that thread:
> || https://lwn.net/Articles/785430/ suggests AIX, BSD, & MacOS have also defined
> || it, and though it's been proposed multiple times for Linux, never adopted there.
>
> Now, look at the article in question. You'll see that it should've been
> "someone's posting in the end of comments thread under LWN article says that
> apparently it exists on AIX, BSD, ..."
>
> The strength of evidence aside, that got me curious; I have checked the
> source of FreeBSD, NetBSD and OpenBSD. No such thing exists in either of
> their kernels, so at least that part can be considered an urban legend.
>
> As for the original problem... what kind of exclusion is used between
> the reaction to netlink notifications (including closing every socket,
> etc.) and actual IO done on those sockets?

Not an idle question, BTW - unlike Solaris we do NOT (and will not) have
close(2) abort IO on the same descriptor from another thread. So if one
thread sits in recvmsg(2) while another does close(2), the socket will
*NOT* actually shut down until recvmsg(2) returns.

2020-04-22 16:35:47

by Bernd Petrovitsch

[permalink] [raw]
Subject: Re: [PATCH 1/4] fs: Implement close-on-fork

On 22/04/2020 16:02, Karstens, Nate wrote:
>> It's not safe to call system() from a threaded app. That's all. It's right there in the DESCRIPTION:
>
> That is true, but that description is missing from both the Linux man page and the glibc documentation (https://www.gnu.org/software/libc/manual/html_mono/libc.html#Running-a-Command). It seems like a minor point that won't be noticed until it causes a problem, and problems are rare enough they might go unnoticed for a while. We have removed system() from our application, but we're also concerned that libraries we integrate will use system() without our knowledge.

Reimplementing system() is trivial.
LD_PRELOAD should take care of all system(3) calls.

I wonder if it has some value to add runtime checking for
"multi-threaded" to such lib functions and error out if
yes.

Apart from that, system() is a PITA even on
single/non-threaded apps.

MfG,
Bernd
--
There is no cloud, just other people computers.
-- https://static.fsf.org/nosvn/stickers/thereisnocloud.svg


Attachments:
pEpkey.asc (2.45 kB)

2020-04-22 16:58:28

by David Laight

[permalink] [raw]
Subject: RE: [PATCH 1/4] fs: Implement close-on-fork

From: Bernd Petrovitsch
> Sent: 22 April 2020 17:32
...
> Apart from that, system() is a PITA even on
> single/non-threaded apps.

Not only that, it is bloody dangerous because (typically)
shell is doing post substitution syntax analysis.

If you need to run an external process you need to generate
an argv[] array containing the parameters.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2020-04-23 12:38:47

by Bernd Petrovitsch

[permalink] [raw]
Subject: Re: [PATCH 1/4] fs: Implement close-on-fork

Hi all!

On 22/04/2020 16:55, David Laight wrote:
> From: Bernd Petrovitsch
>> Sent: 22 April 2020 17:32
> ...
>> Apart from that, system() is a PITA even on
>> single/non-threaded apps.
>
> Not only that, it is bloody dangerous because (typically)
> shell is doing post substitution syntax analysis.

I actually meant exactly that with PITA;-)

> If you need to run an external process you need to generate
>> an argv[] array containing the parameters.

FullACK. That is usually similarly trivial ...

MfG,
Bernd
--
There is no cloud, just other people computers.
-- https://static.fsf.org/nosvn/stickers/thereisnocloud.svg


Attachments:
pEpkey.asc (2.45 kB)

2020-05-01 14:47:53

by Karstens, Nate

[permalink] [raw]
Subject: RE: [PATCH 1/4] fs: Implement close-on-fork

Eric,

Thanks for the suggestion. I looked into it and noticed that do_close_on_exec() appears to have some optimizations as well:

> set = fdt->close_on_exec[i];
> if (!set)
> continue;

If we interleave the close-on-exec and close-on-fork flags then this optimization will have to be removed. Do you have a sense of which optimization provides the most benefit?

I noticed a couple of other issues with the original patch that I will need to investigate or rework:

1) I'm not sure dup_fd() is the best place to check the close-on-fork flag. For example, the ksys_unshare() > unshare_fd() > dup_fd() execution path seems suspect. I will either add a parameter to the function indicating if the flag should be checked or do a separate function, like do_close_on_fork().
2) If the close-on-fork flag is set, then __clear_open_fd() should be called instead of just __clear_bit(). This will ensure that fdt->full_fds_bits is updated.
3) Need to investigate if the close-on-fork (or close-on-exec) flags need to be cleared when the file is closed as part of the close-on-fork execution path.

Others -- I will respond to feedback outside of implementation details in a separate message.

Thanks,

Nate

-----Original Message-----
From: Eric Dumazet <[email protected]>
Sent: Monday, April 20, 2020 05:26
To: Karstens, Nate <[email protected]>; Alexander Viro <[email protected]>; Jeff Layton <[email protected]>; J. Bruce Fields <[email protected]>; Arnd Bergmann <[email protected]>; Richard Henderson <[email protected]>; Ivan Kokshaysky <[email protected]>; Matt Turner <[email protected]>; James E.J. Bottomley <[email protected]>; Helge Deller <[email protected]>; David S. Miller <[email protected]>; Jakub Kicinski <[email protected]>; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]
Cc: Changli Gao <[email protected]>
Subject: Re: [PATCH 1/4] fs: Implement close-on-fork

CAUTION - EXTERNAL EMAIL: Do not click any links or open any attachments unless you trust the sender and know the content is safe.


On 4/20/20 12:15 AM, Nate Karstens wrote:
> The close-on-fork flag causes the file descriptor to be closed
> atomically in the child process before the child process returns from
> fork(). Implement this feature and provide a method to get/set the
> close-on-fork flag using fcntl(2).
>
> This functionality was approved by the Austin Common Standards
> Revision Group for inclusion in the next revision of the POSIX
> standard (see issue 1318 in the Austin Group Defect Tracker).

Oh well... yet another feature slowing down a critical path.

>
> Co-developed-by: Changli Gao <[email protected]>
> Signed-off-by: Changli Gao <[email protected]>
> Signed-off-by: Nate Karstens <[email protected]>
> ---
> fs/fcntl.c | 2 ++
> fs/file.c | 50 +++++++++++++++++++++++++-
> include/linux/fdtable.h | 7 ++++
> include/linux/file.h | 2 ++
> include/uapi/asm-generic/fcntl.h | 5 +--
> tools/include/uapi/asm-generic/fcntl.h | 5 +--
> 6 files changed, 66 insertions(+), 5 deletions(-)
>
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index 2e4c0fa2074b..23964abf4a1a 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -335,10 +335,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
> break;
> case F_GETFD:
> err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
> + err |= get_close_on_fork(fd) ? FD_CLOFORK : 0;
> break;
> case F_SETFD:
> err = 0;
> set_close_on_exec(fd, arg & FD_CLOEXEC);
> + set_close_on_fork(fd, arg & FD_CLOFORK);
> break;
> case F_GETFL:
> err = filp->f_flags;
> diff --git a/fs/file.c b/fs/file.c
> index c8a4e4c86e55..de7260ba718d 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -57,6 +57,8 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
> memset((char *)nfdt->open_fds + cpy, 0, set);
> memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
> memset((char *)nfdt->close_on_exec + cpy, 0, set);
> + memcpy(nfdt->close_on_fork, ofdt->close_on_fork, cpy);
> + memset((char *)nfdt->close_on_fork + cpy, 0, set);
>

I suggest we group the two bits of a file (close_on_exec, close_on_fork) together, so that we do not have to dirty two separate cache lines.

Otherwise we will add yet another cache line miss at every file opening/closing for processes with big file tables.

Ie having a _single_ bitmap array, even bit for close_on_exec, odd bit for close_on_fork

static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) {
__set_bit(fd * 2, fdt->close_on_fork_exec); }

static inline void __set_close_on_fork(unsigned int fd, struct fdtable *fdt) {
__set_bit(fd * 2 + 1, fdt->close_on_fork_exec); }

Also the F_GETFD/F_SETFD implementation must use a single function call, to not acquire the spinlock twice.


2020-05-01 15:26:42

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/4] fs: Implement close-on-fork

On Fri, May 01, 2020 at 02:45:16PM +0000, Karstens, Nate wrote:
> Others -- I will respond to feedback outside of implementation details in a separate message.

FWIW, I'm opposed to the entire feature. Improving the implementation
will not change that.

2020-05-03 13:56:59

by David Laight

[permalink] [raw]
Subject: RE: [PATCH 1/4] fs: Implement close-on-fork

From: Karstens, Nate
> Sent: 01 May 2020 15:45
> Thanks for the suggestion. I looked into it and noticed that do_close_on_exec() appears to have some
> optimizations as well:
>
> > set = fdt->close_on_exec[i];
> > if (!set)
> > continue;
>
> If we interleave the close-on-exec and close-on-fork flags then this optimization will have to be
> removed. Do you have a sense of which optimization provides the most benefit?

Thinks....
A moderate proportion of exec() will have at least one fd with 'close on exec' set.
Very few fork() will have any fd with 'close on fork' set.
The 'close on fork' table shouldn't be copied to the forked process.
The 'close on exec' table is deleted by exec().

So...
On fork() take a copy and clear the 'close_on_fork' bitmap.
For every bit set lookup the fd and close if the live bit is set.
Similarly exec() clears and acts on the 'close on exec' map.

You should be able to use the same 'close the fds in this bitmap'
function for both cases.

So I think you need two bitmaps.
But the code needs to differentiate between requests to set bits
(which need to allocate/extend the bitmap) and ones to clear/read
bits (which do not).

You might even consider putting the 'live' flag into the fd structure
and using the bitmap value as a 'hint' - which might be hashed.

After all, it is likely that the 'close on exec' processing
will be faster overall if it just loops through the open fd and
checks each in turn!
I doubt many processes actually exec with more than an handful
of open files.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2020-05-04 14:59:51

by Karstens, Nate

[permalink] [raw]
Subject: RE: Implement close-on-fork

Thanks everyone for their comments, sorry for the delay in my reply.

> As for the original problem... what kind of exclusion is used between the reaction to netlink notifications (including closing every socket,
> etc.) and actual IO done on those sockets?
> Not an idle question, BTW - unlike Solaris we do NOT (and will not) have
> close(2) abort IO on the same descriptor from another thread. So if one thread sits in recvmsg(2) while another does close(2), the socket will
> *NOT* actually shut down until recvmsg(2) returns.

The netlink notification is received on a separate thread, but handling of that notification (closing and re-opening sockets) and the socket I/O is all done on the same thread. The call to system() happens sometime between when this thread decides to close all of its sockets and when the sockets have been closed. The child process is left with a reference to one or more sockets. The close-on-exec flag is set on the socket, so the period of time is brief, but because system() is not atomic this still leaves a window of opportunity for the failure to occur. The parent process tries to open the socket again but fails because the child process still has an open socket that controls the port.

This phenomenon can really be generalized to any resource that 1) a process needs exclusive access to and 2) the operating system automatically creates a new reference in the child when the process forks.

> Reimplementing system() is trivial.
> LD_PRELOAD should take care of all system(3) calls.

Yes, that would solve the problem for our system. We identified what we believe to be a problem with the POSIX threading model and wanted to work with the community to improve this for others as well. The Austin Group agreed with the premise enough that they were willing to update the POSIX standard.

> I wonder if it has some value to add runtime checking for "multi-threaded" to such lib functions and error out if yes.
> Apart from that, system() is a PITA even on single/non-threaded apps.

That may be, but system() is convenient and there isn't much in the documentation that warns the average developer away from its use. The manpage indicates system() is thread-safe. The manpage is also somewhat contradictory in that it describes the operation as being equivalent to a fork() and an execl(), though it later points out that pthread_atfork() handlers may not be executed.

> FWIW, I'm opposed to the entire feature. Improving the implementation will not change that.

I get it. From our perspective, changing the OS to resolve an issue seems like a drastic step. We tried hard to come up with an alternative (see https://www.mail-archive.com/[email protected]/msg05324.html and https://austingroupbugs.net/view.php?id=1317), but nothing else addresses the underlying issue: there is no way to prevent a fork() from duplicating the resource. The close-on-exec flag partially-addresses this by allowing the parent process to mark a file descriptor as exclusive to itself, but there is still a period of time the failure can occur because the auto-close only occurs during the exec(). Perhaps this would not be an issue with a different process/threading model, but that is another discussion entirely.

Best Regards,

Nate

-----Original Message-----
From: Al Viro <[email protected]> On Behalf Of Al Viro
Sent: Wednesday, April 22, 2020 11:01
To: Matthew Wilcox <[email protected]>
Cc: Karstens, Nate <[email protected]>; Jeff Layton <[email protected]>; J. Bruce Fields <[email protected]>; Arnd Bergmann <[email protected]>; Richard Henderson <[email protected]>; Ivan Kokshaysky <[email protected]>; Matt Turner <[email protected]>; James E.J. Bottomley <[email protected]>; Helge Deller <[email protected]>; David S. Miller <[email protected]>; Jakub Kicinski <[email protected]>; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; Changli Gao <[email protected]>
Subject: Re: Implement close-on-fork

CAUTION - EXTERNAL EMAIL: Do not click any links or open any attachments unless you trust the sender and know the content is safe.


On Wed, Apr 22, 2020 at 08:18:15AM -0700, Matthew Wilcox wrote:
> On Wed, Apr 22, 2020 at 04:01:07PM +0100, Al Viro wrote:
> > On Mon, Apr 20, 2020 at 02:15:44AM -0500, Nate Karstens wrote:
> > > Series of 4 patches to implement close-on-fork. Tests have been
> > > published to https://github.com/nkarstens/ltp/tree/close-on-fork.
> > >
> > > close-on-fork addresses race conditions in system(), which
> > > (depending on the implementation) is non-atomic in that it first
> > > calls a fork() and then an exec().
> > >
> > > This functionality was approved by the Austin Common Standards
> > > Revision Group for inclusion in the next revision of the POSIX
> > > standard (see issue 1318 in the Austin Group Defect Tracker).
> >
> > What exactly the reasons are and why would we want to implement that?
> >
> > Pardon me, but going by the previous history, "The Austin Group Says
> > It's Good" is more of a source of concern regarding the merits,
> > general sanity and, most of all, good taste of a proposal.
> >
> > I'm not saying that it's automatically bad, but you'll have to go
> > much deeper into the rationale of that change before your proposal
> > is taken seriously.
>
> https://www.mail-archive.com/[email protected]/msg05324.htm
> l
> might be useful

*snort*

Alan Coopersmith in that thread:
|| https://lwn.net/Articles/785430/ suggests AIX, BSD, & MacOS have also
|| defined it, and though it's been proposed multiple times for Linux, never adopted there.

Now, look at the article in question. You'll see that it should've been "someone's posting in the end of comments thread under LWN article says that apparently it exists on AIX, BSD, ..."

The strength of evidence aside, that got me curious; I have checked the source of FreeBSD, NetBSD and OpenBSD. No such thing exists in either of their kernels, so at least that part can be considered an urban legend.

As for the original problem... what kind of exclusion is used between the reaction to netlink notifications (including closing every socket,
etc.) and actual IO done on those sockets?


________________________________

CONFIDENTIALITY NOTICE: This email and any attachments are for the sole use of the intended recipient(s) and contain information that may be Garmin confidential and/or Garmin legally privileged. If you have received this email in error, please notify the sender by reply email and delete the message. Any disclosure, copying, distribution or use of this communication (including attachments) by someone other than the intended recipient is prohibited. Thank you.