2020-10-13 16:43:31

by Giuseppe Scrivano

[permalink] [raw]
Subject: [PATCH 0/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

When the new flag is used, close_range will set the close-on-exec bit
for the file descriptors instead of close()-ing them.

It is useful for e.g. container runtimes that want to minimize the
number of syscalls used after a seccomp profile is installed but want
to keep some fds open until the container process is executed.

Giuseppe Scrivano (2):
fs, close_range: add flag CLOSE_RANGE_CLOEXEC
selftests: add tests for CLOSE_RANGE_CLOEXEC

fs/file.c | 56 +++++++++++++------
include/uapi/linux/close_range.h | 3 +
.../testing/selftests/core/close_range_test.c | 44 +++++++++++++++
3 files changed, 86 insertions(+), 17 deletions(-)

--
2.26.2


2020-10-13 16:45:23

by Giuseppe Scrivano

[permalink] [raw]
Subject: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
immediately close the files but it sets the close-on-exec bit.

Signed-off-by: Giuseppe Scrivano <[email protected]>
---
fs/file.c | 56 ++++++++++++++++++++++----------
include/uapi/linux/close_range.h | 3 ++
2 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 21c0893f2f1d..ad4ebee41e09 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd)
}
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */

+static unsigned int __get_max_fds(struct files_struct *cur_fds)
+{
+ unsigned int max_fds;
+
+ rcu_read_lock();
+ /* cap to last valid index into fdtable */
+ max_fds = files_fdtable(cur_fds)->max_fds;
+ rcu_read_unlock();
+ return max_fds;
+}
+
/**
* __close_range() - Close all file descriptors in a given range.
*
@@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
*/
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
{
- unsigned int cur_max;
+ unsigned int cur_max = UINT_MAX;
struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL;

- if (flags & ~CLOSE_RANGE_UNSHARE)
+ if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
return -EINVAL;

if (fd > max_fd)
return -EINVAL;

- rcu_read_lock();
- cur_max = files_fdtable(cur_fds)->max_fds;
- rcu_read_unlock();
-
- /* cap to last valid index into fdtable */
- cur_max--;
-
if (flags & CLOSE_RANGE_UNSHARE) {
int ret;
unsigned int max_unshare_fds = NR_OPEN_MAX;

+ /* cap to last valid index into fdtable */
+ cur_max = __get_max_fds(cur_fds) - 1;
+
/*
* If the requested range is greater than the current maximum,
* we're closing everything so only copy all file descriptors
@@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
swap(cur_fds, fds);
}

- max_fd = min(max_fd, cur_max);
- while (fd <= max_fd) {
- struct file *file;
+ if (flags & CLOSE_RANGE_CLOEXEC) {
+ struct fdtable *fdt;

- file = pick_file(cur_fds, fd++);
- if (!file)
- continue;
+ spin_lock(&cur_fds->file_lock);
+ fdt = files_fdtable(cur_fds);
+ cur_max = fdt->max_fds - 1;
+ max_fd = min(max_fd, cur_max);
+ while (fd <= max_fd)
+ __set_close_on_exec(fd++, fdt);
+ spin_unlock(&cur_fds->file_lock);
+ } else {
+ /* Initialize cur_max if needed. */
+ if (cur_max == UINT_MAX)
+ cur_max = __get_max_fds(cur_fds) - 1;
+ max_fd = min(max_fd, cur_max);
+ while (fd <= max_fd) {
+ struct file *file;

- filp_close(file, cur_fds);
- cond_resched();
+ file = pick_file(cur_fds, fd++);
+ if (!file)
+ continue;
+
+ filp_close(file, cur_fds);
+ cond_resched();
+ }
}

if (fds) {
diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h
index 6928a9fdee3c..2d804281554c 100644
--- a/include/uapi/linux/close_range.h
+++ b/include/uapi/linux/close_range.h
@@ -5,5 +5,8 @@
/* Unshare the file descriptor table before closing file descriptors. */
#define CLOSE_RANGE_UNSHARE (1U << 1)

+/* Set the FD_CLOEXEC bit instead of closing the file descriptor. */
+#define CLOSE_RANGE_CLOEXEC (1U << 2)
+
#endif /* _UAPI_LINUX_CLOSE_RANGE_H */

--
2.26.2

2020-10-13 18:27:22

by Giuseppe Scrivano

[permalink] [raw]
Subject: [PATCH 2/2] selftests: add tests for CLOSE_RANGE_CLOEXEC

Signed-off-by: Giuseppe Scrivano <[email protected]>
---
.../testing/selftests/core/close_range_test.c | 44 +++++++++++++++++++
1 file changed, 44 insertions(+)

diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
index c99b98b0d461..b8789262cd7d 100644
--- a/tools/testing/selftests/core/close_range_test.c
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -23,6 +23,10 @@
#define CLOSE_RANGE_UNSHARE (1U << 1)
#endif

+#ifndef CLOSE_RANGE_CLOEXEC
+#define CLOSE_RANGE_CLOEXEC (1U << 2)
+#endif
+
static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
unsigned int flags)
{
@@ -224,4 +228,44 @@ TEST(close_range_unshare_capped)
EXPECT_EQ(0, WEXITSTATUS(status));
}

+TEST(close_range_cloexec)
+{
+ int i, ret;
+ int open_fds[101];
+
+ for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+ int fd;
+
+ fd = open("/dev/null", O_RDONLY);
+ ASSERT_GE(fd, 0) {
+ if (errno == ENOENT)
+ XFAIL(return, "Skipping test since /dev/null does not exist");
+ }
+
+ open_fds[i] = fd;
+ }
+
+ EXPECT_EQ(-1, sys_close_range(open_fds[0], open_fds[100], -1)) {
+ if (errno == ENOSYS)
+ XFAIL(return, "close_range() syscall not supported");
+ }
+
+ EXPECT_EQ(0, sys_close_range(open_fds[0], open_fds[50], CLOSE_RANGE_CLOEXEC));
+
+ for (i = 0; i <= 50; i++) {
+ int flags = fcntl(open_fds[i], F_GETFD);
+
+ EXPECT_GT(flags, -1);
+ EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
+ }
+
+ for (i = 51; i <= 100; i++) {
+ int flags = fcntl(open_fds[i], F_GETFD);
+
+ EXPECT_GT(flags, -1);
+ EXPECT_EQ(flags & FD_CLOEXEC, 0);
+ }
+}
+
+
TEST_HARNESS_MAIN
--
2.26.2

2020-10-13 20:56:34

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:

Hey Guiseppe,

Thanks for the patch!

> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
> immediately close the files but it sets the close-on-exec bit.

Hm, please expand on the use-cases a little here so people know where
and how this is useful. Keeping the rationale for a change in the commit
log is really important.

>
> Signed-off-by: Giuseppe Scrivano <[email protected]>
> ---

> fs/file.c | 56 ++++++++++++++++++++++----------
> include/uapi/linux/close_range.h | 3 ++
> 2 files changed, 42 insertions(+), 17 deletions(-)
>
> diff --git a/fs/file.c b/fs/file.c
> index 21c0893f2f1d..ad4ebee41e09 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd)
> }
> EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>
> +static unsigned int __get_max_fds(struct files_struct *cur_fds)
> +{
> + unsigned int max_fds;
> +
> + rcu_read_lock();
> + /* cap to last valid index into fdtable */
> + max_fds = files_fdtable(cur_fds)->max_fds;
> + rcu_read_unlock();
> + return max_fds;
> +}
> +
> /**
> * __close_range() - Close all file descriptors in a given range.
> *
> @@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
> */
> int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
> {
> - unsigned int cur_max;
> + unsigned int cur_max = UINT_MAX;
> struct task_struct *me = current;
> struct files_struct *cur_fds = me->files, *fds = NULL;
>
> - if (flags & ~CLOSE_RANGE_UNSHARE)
> + if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
> return -EINVAL;
>
> if (fd > max_fd)
> return -EINVAL;
>
> - rcu_read_lock();
> - cur_max = files_fdtable(cur_fds)->max_fds;
> - rcu_read_unlock();
> -
> - /* cap to last valid index into fdtable */
> - cur_max--;
> -
> if (flags & CLOSE_RANGE_UNSHARE) {
> int ret;
> unsigned int max_unshare_fds = NR_OPEN_MAX;
>
> + /* cap to last valid index into fdtable */
> + cur_max = __get_max_fds(cur_fds) - 1;
> +
> /*
> * If the requested range is greater than the current maximum,
> * we're closing everything so only copy all file descriptors
> @@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
> swap(cur_fds, fds);
> }
>
> - max_fd = min(max_fd, cur_max);
> - while (fd <= max_fd) {
> - struct file *file;
> + if (flags & CLOSE_RANGE_CLOEXEC) {
> + struct fdtable *fdt;
>
> - file = pick_file(cur_fds, fd++);
> - if (!file)
> - continue;
> + spin_lock(&cur_fds->file_lock);
> + fdt = files_fdtable(cur_fds);
> + cur_max = fdt->max_fds - 1;
> + max_fd = min(max_fd, cur_max);
> + while (fd <= max_fd)
> + __set_close_on_exec(fd++, fdt);
> + spin_unlock(&cur_fds->file_lock);
> + } else {
> + /* Initialize cur_max if needed. */
> + if (cur_max == UINT_MAX)
> + cur_max = __get_max_fds(cur_fds) - 1;

The separation between how cur_fd is retrieved in the two branches makes
the code more difficult to follow imho. Unless there's a clear reason
why you've done it that way I would think that something like the patch
I appended below might be a little clearer and easier to maintain(?).

> + max_fd = min(max_fd, cur_max);
> + while (fd <= max_fd) {
> + struct file *file;
>
> - filp_close(file, cur_fds);
> - cond_resched();
> + file = pick_file(cur_fds, fd++);
> + if (!file)
> + continue;
> +
> + filp_close(file, cur_fds);
> + cond_resched();
> + }
> }

I think I don't have quarrels with this patch in principle but I wonder
if something like the following wouldn't be easier to follow:

diff --git a/fs/file.c b/fs/file.c
index 21c0893f2f1d..872a4098c3be 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd)
}
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */

+static inline void __range_cloexec(struct files_struct *cur_fds,
+ unsigned int fd, unsigned max_fd)
+{
+ struct fdtable *fdt;
+ spin_lock(&cur_fds->file_lock);
+ fdt = files_fdtable(cur_fds);
+ while (fd <= max_fd)
+ __set_close_on_exec(fd++, fdt);
+ spin_unlock(&cur_fds->file_lock);
+}
+
+static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
+ unsigned max_fd)
+{
+ while (fd <= max_fd) {
+ struct file *file;
+
+ file = pick_file(cur_fds, fd++);
+ if (!file)
+ continue;
+
+ filp_close(file, cur_fds);
+ cond_resched();
+ }
+}
+
/**
* __close_range() - Close all file descriptors in a given range.
*
@@ -687,7 +713,7 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL;

- if (flags & ~CLOSE_RANGE_UNSHARE)
+ if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
return -EINVAL;

if (fd > max_fd)
@@ -725,16 +751,10 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
}

max_fd = min(max_fd, cur_max);
- while (fd <= max_fd) {
- struct file *file;
-
- file = pick_file(cur_fds, fd++);
- if (!file)
- continue;
-
- filp_close(file, cur_fds);
- cond_resched();
- }
+ if (flags & CLOSE_RANGE_CLOEXEC)
+ __range_cloexec(cur_fds, fd, max_fd);
+ else
+ __range_close(cur_fds, fd, max_fd);

if (fds) {
/*
diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h
index 6928a9fdee3c..2d804281554c 100644
--- a/include/uapi/linux/close_range.h
+++ b/include/uapi/linux/close_range.h
@@ -5,5 +5,8 @@
/* Unshare the file descriptor table before closing file descriptors. */
#define CLOSE_RANGE_UNSHARE (1U << 1)

+/* Set the FD_CLOEXEC bit instead of closing the file descriptor. */
+#define CLOSE_RANGE_CLOEXEC (1U << 2)
+
#endif /* _UAPI_LINUX_CLOSE_RANGE_H */

2020-10-14 06:37:53

by Rasmus Villemoes

[permalink] [raw]
Subject: Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

On 13/10/2020 22.54, Christian Brauner wrote:
> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
>
> Hey Guiseppe,
>
> Thanks for the patch!
>
>> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
>> immediately close the files but it sets the close-on-exec bit.
>
> Hm, please expand on the use-cases a little here so people know where
> and how this is useful. Keeping the rationale for a change in the commit
> log is really important.
>

> I think I don't have quarrels with this patch in principle but I wonder
> if something like the following wouldn't be easier to follow:
>
> diff --git a/fs/file.c b/fs/file.c
> index 21c0893f2f1d..872a4098c3be 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd)
> }
> EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>
> +static inline void __range_cloexec(struct files_struct *cur_fds,
> + unsigned int fd, unsigned max_fd)
> +{
> + struct fdtable *fdt;
> + spin_lock(&cur_fds->file_lock);
> + fdt = files_fdtable(cur_fds);
> + while (fd <= max_fd)
> + __set_close_on_exec(fd++, fdt);

Doesn't that want to be

bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1)

to do word-at-a-time? I assume this would mostly be called with (3, ~0U)
as arguments or something like that.

Rasmus

2020-10-14 06:46:57

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

On Tue, Oct 13, 2020 at 11:04:21PM +0200, Rasmus Villemoes wrote:
> On 13/10/2020 22.54, Christian Brauner wrote:
> > On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> >
> > Hey Guiseppe,
> >
> > Thanks for the patch!
> >
> >> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
> >> immediately close the files but it sets the close-on-exec bit.
> >
> > Hm, please expand on the use-cases a little here so people know where
> > and how this is useful. Keeping the rationale for a change in the commit
> > log is really important.
> >
>
> > I think I don't have quarrels with this patch in principle but I wonder
> > if something like the following wouldn't be easier to follow:
> >
> > diff --git a/fs/file.c b/fs/file.c
> > index 21c0893f2f1d..872a4098c3be 100644
> > --- a/fs/file.c
> > +++ b/fs/file.c
> > @@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd)
> > }
> > EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
> >
> > +static inline void __range_cloexec(struct files_struct *cur_fds,
> > + unsigned int fd, unsigned max_fd)
> > +{
> > + struct fdtable *fdt;
> > + spin_lock(&cur_fds->file_lock);
> > + fdt = files_fdtable(cur_fds);
> > + while (fd <= max_fd)
> > + __set_close_on_exec(fd++, fdt);
>

(I should've warned that I just proposed this as a completely untested
brainstorm.)

> Doesn't that want to be
>
> bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1)
>
> to do word-at-a-time? I assume this would mostly be called with (3, ~0U)
> as arguments or something like that.

Yes, that is the common case.

Thanks Rasmus, I was unaware we had that function.

In that case I think we'd actually need sm like:
spin_lock(&cur_fds->file_lock);
fdt = files_fdtable(cur_fds);
cur_max = files_fdtable(cur_fds)->max_fds - 1;
max_fd = min(max_fd, cur_max);
bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1)

so we retrieve max_fd with the spinlock held, I think.

Christian

2020-10-14 06:55:41

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

On Tue, Oct 13, 2020 at 10:09:25PM +0100, Al Viro wrote:
> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> > + spin_lock(&cur_fds->file_lock);
> > + fdt = files_fdtable(cur_fds);
> > + cur_max = fdt->max_fds - 1;
> > + max_fd = min(max_fd, cur_max);
> > + while (fd <= max_fd)
> > + __set_close_on_exec(fd++, fdt);
> > + spin_unlock(&cur_fds->file_lock);
>
> First of all, this is an atrocious way to set all bits
> in a range. What's more, you don't want to set it for *all*

Hm, good point.

Would it make sense to just use the bitmap_set() proposal since the 3 to
~0 case is most common or to actually iterate based on the open fds?


Christian

2020-10-14 09:24:07

by Rasmus Villemoes

[permalink] [raw]
Subject: Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

On 13/10/2020 23.09, Al Viro wrote:
> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
>> + spin_lock(&cur_fds->file_lock);
>> + fdt = files_fdtable(cur_fds);
>> + cur_max = fdt->max_fds - 1;
>> + max_fd = min(max_fd, cur_max);
>> + while (fd <= max_fd)
>> + __set_close_on_exec(fd++, fdt);
>> + spin_unlock(&cur_fds->file_lock);
>
> First of all, this is an atrocious way to set all bits
> in a range. What's more, you don't want to set it for *all*
> bits - only for the ones present in open bitmap. It's probably
> harmless at the moment, but let's not create interesting surprises
> for the future.

Eh, why not? They can already be set for unallocated slots:

commit 5297908270549b734c7c2556745e2385b6d4941d
Author: Mateusz Guzik <[email protected]>
Date: Tue Oct 3 12:58:14 2017 +0200

vfs: stop clearing close on exec when closing a fd

Codepaths allocating a fd always make sure the bit is set/unset
depending on flags, thus clearing on close is redundant.

And while we're on that subject, yours truly suggested exactly that two
years prior [1], with a follow-up [2] in 2018 to do what wasn't done in
5297908, but (still) seems like obvious micro-optimizations, given that
the close_on_exec bitmap is not maintained as a subset of the open
bitmap. Mind taking a look at [2]?

[1]
https://lore.kernel.org/lkml/[email protected]/t/#u
[2]
https://lore.kernel.org/lkml/[email protected]/

Rasmus

2020-10-14 10:43:03

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> + spin_lock(&cur_fds->file_lock);
> + fdt = files_fdtable(cur_fds);
> + cur_max = fdt->max_fds - 1;
> + max_fd = min(max_fd, cur_max);
> + while (fd <= max_fd)
> + __set_close_on_exec(fd++, fdt);
> + spin_unlock(&cur_fds->file_lock);

First of all, this is an atrocious way to set all bits
in a range. What's more, you don't want to set it for *all*
bits - only for the ones present in open bitmap. It's probably
harmless at the moment, but let's not create interesting surprises
for the future.

2020-10-14 10:43:28

by Giuseppe Scrivano

[permalink] [raw]
Subject: Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

Christian Brauner <[email protected]> writes:

> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
>
> Hey Guiseppe,
>
> Thanks for the patch!
>
>> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
>> immediately close the files but it sets the close-on-exec bit.
>
> Hm, please expand on the use-cases a little here so people know where
> and how this is useful. Keeping the rationale for a change in the commit
> log is really important.
>
>>
>> Signed-off-by: Giuseppe Scrivano <[email protected]>
>> ---
>
>> fs/file.c | 56 ++++++++++++++++++++++----------
>> include/uapi/linux/close_range.h | 3 ++
>> 2 files changed, 42 insertions(+), 17 deletions(-)
>>
>> diff --git a/fs/file.c b/fs/file.c
>> index 21c0893f2f1d..ad4ebee41e09 100644
>> --- a/fs/file.c
>> +++ b/fs/file.c
>> @@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd)
>> }
>> EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>>
>> +static unsigned int __get_max_fds(struct files_struct *cur_fds)
>> +{
>> + unsigned int max_fds;
>> +
>> + rcu_read_lock();
>> + /* cap to last valid index into fdtable */
>> + max_fds = files_fdtable(cur_fds)->max_fds;
>> + rcu_read_unlock();
>> + return max_fds;
>> +}
>> +
>> /**
>> * __close_range() - Close all file descriptors in a given range.
>> *
>> @@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>> */
>> int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>> {
>> - unsigned int cur_max;
>> + unsigned int cur_max = UINT_MAX;
>> struct task_struct *me = current;
>> struct files_struct *cur_fds = me->files, *fds = NULL;
>>
>> - if (flags & ~CLOSE_RANGE_UNSHARE)
>> + if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
>> return -EINVAL;
>>
>> if (fd > max_fd)
>> return -EINVAL;
>>
>> - rcu_read_lock();
>> - cur_max = files_fdtable(cur_fds)->max_fds;
>> - rcu_read_unlock();
>> -
>> - /* cap to last valid index into fdtable */
>> - cur_max--;
>> -
>> if (flags & CLOSE_RANGE_UNSHARE) {
>> int ret;
>> unsigned int max_unshare_fds = NR_OPEN_MAX;
>>
>> + /* cap to last valid index into fdtable */
>> + cur_max = __get_max_fds(cur_fds) - 1;
>> +
>> /*
>> * If the requested range is greater than the current maximum,
>> * we're closing everything so only copy all file descriptors
>> @@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>> swap(cur_fds, fds);
>> }
>>
>> - max_fd = min(max_fd, cur_max);
>> - while (fd <= max_fd) {
>> - struct file *file;
>> + if (flags & CLOSE_RANGE_CLOEXEC) {
>> + struct fdtable *fdt;
>>
>> - file = pick_file(cur_fds, fd++);
>> - if (!file)
>> - continue;
>> + spin_lock(&cur_fds->file_lock);
>> + fdt = files_fdtable(cur_fds);
>> + cur_max = fdt->max_fds - 1;
>> + max_fd = min(max_fd, cur_max);
>> + while (fd <= max_fd)
>> + __set_close_on_exec(fd++, fdt);
>> + spin_unlock(&cur_fds->file_lock);
>> + } else {
>> + /* Initialize cur_max if needed. */
>> + if (cur_max == UINT_MAX)
>> + cur_max = __get_max_fds(cur_fds) - 1;
>
> The separation between how cur_fd is retrieved in the two branches makes
> the code more difficult to follow imho. Unless there's a clear reason
> why you've done it that way I would think that something like the patch
> I appended below might be a little clearer and easier to maintain(?).

Thanks for the review!

I've opted for this version as in the flags=CLOSE_RANGE_CLOEXEC case we
can read max_fds directly from the fds table and avoid doing it from the
RCU critical section as well. I'll change it in favor of more readable
code.

Giuseppe