From: Eric Biggers <[email protected]>
I found that statx() was significantly slower than stat(). As a
microbenchmark, I compared 10,000,000 invocations of fstat() on a tmpfs
file to the same with statx() passed a NULL path:
$ time ./stat_benchmark
real 0m1.464s
user 0m0.275s
sys 0m1.187s
$ time ./statx_benchmark
real 0m5.530s
user 0m0.281s
sys 0m5.247s
statx is expected to be a little slower than stat because struct statx
is larger than struct stat, but not by *that* much. It turns out that
most of the overhead was in copying struct statx to userspace,
apparently mostly in all the stac/clac instructions that got generated
for each __put_user() call. (This was on x86_64, but some other
architectures, e.g. arm64, have something similar now too.)
stat() instead initializes its struct on the stack and copies it to
userspace with a single call to copy_to_user(). This turns out to be
much faster, and changing statx to do this makes it almost as fast as
stat:
$ time ./statx_benchmark
real 0m1.573s
user 0m0.229s
sys 0m1.344s
Signed-off-by: Eric Biggers <[email protected]>
---
fs/stat.c | 72 +++++++++++++++++++++++++++++----------------------------------
1 file changed, 33 insertions(+), 39 deletions(-)
diff --git a/fs/stat.c b/fs/stat.c
index fa0be59340cc..5cc267ec7865 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -509,46 +509,41 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
}
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
-static inline int __put_timestamp(struct timespec *kts,
- struct statx_timestamp __user *uts)
+static inline void init_statx_timestamp(struct statx_timestamp *uts,
+ const struct timespec *kts)
{
- return (__put_user(kts->tv_sec, &uts->tv_sec ) ||
- __put_user(kts->tv_nsec, &uts->tv_nsec ) ||
- __put_user(0, &uts->__reserved ));
+ uts->tv_sec = kts->tv_sec;
+ uts->tv_nsec = kts->tv_nsec;
+ uts->__reserved = 0;
}
-/*
- * Set the statx results.
- */
-static long statx_set_result(struct kstat *stat, struct statx __user *buffer)
+static int cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
- uid_t uid = from_kuid_munged(current_user_ns(), stat->uid);
- gid_t gid = from_kgid_munged(current_user_ns(), stat->gid);
-
- if (__put_user(stat->result_mask, &buffer->stx_mask ) ||
- __put_user(stat->mode, &buffer->stx_mode ) ||
- __clear_user(&buffer->__spare0, sizeof(buffer->__spare0)) ||
- __put_user(stat->nlink, &buffer->stx_nlink ) ||
- __put_user(uid, &buffer->stx_uid ) ||
- __put_user(gid, &buffer->stx_gid ) ||
- __put_user(stat->attributes, &buffer->stx_attributes ) ||
- __put_user(stat->blksize, &buffer->stx_blksize ) ||
- __put_user(MAJOR(stat->rdev), &buffer->stx_rdev_major ) ||
- __put_user(MINOR(stat->rdev), &buffer->stx_rdev_minor ) ||
- __put_user(MAJOR(stat->dev), &buffer->stx_dev_major ) ||
- __put_user(MINOR(stat->dev), &buffer->stx_dev_minor ) ||
- __put_timestamp(&stat->atime, &buffer->stx_atime ) ||
- __put_timestamp(&stat->btime, &buffer->stx_btime ) ||
- __put_timestamp(&stat->ctime, &buffer->stx_ctime ) ||
- __put_timestamp(&stat->mtime, &buffer->stx_mtime ) ||
- __put_user(stat->ino, &buffer->stx_ino ) ||
- __put_user(stat->size, &buffer->stx_size ) ||
- __put_user(stat->blocks, &buffer->stx_blocks ) ||
- __clear_user(&buffer->__spare1, sizeof(buffer->__spare1)) ||
- __clear_user(&buffer->__spare2, sizeof(buffer->__spare2)))
- return -EFAULT;
-
- return 0;
+ struct statx tmp;
+
+ tmp.stx_mask = stat->result_mask;
+ tmp.stx_blksize = stat->blksize;
+ tmp.stx_attributes = stat->attributes;
+ tmp.stx_nlink = stat->nlink;
+ tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid);
+ tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid);
+ tmp.stx_mode = stat->mode;
+ memset(tmp.__spare0, 0, sizeof(tmp.__spare0));
+ tmp.stx_ino = stat->ino;
+ tmp.stx_size = stat->size;
+ tmp.stx_blocks = stat->blocks;
+ memset(tmp.__spare1, 0, sizeof(tmp.__spare1));
+ init_statx_timestamp(&tmp.stx_atime, &stat->atime);
+ init_statx_timestamp(&tmp.stx_btime, &stat->btime);
+ init_statx_timestamp(&tmp.stx_ctime, &stat->ctime);
+ init_statx_timestamp(&tmp.stx_mtime, &stat->mtime);
+ tmp.stx_rdev_major = MAJOR(stat->rdev);
+ tmp.stx_rdev_minor = MINOR(stat->rdev);
+ tmp.stx_dev_major = MAJOR(stat->dev);
+ tmp.stx_dev_minor = MINOR(stat->dev);
+ memset(tmp.__spare2, 0, sizeof(tmp.__spare2));
+
+ return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
/**
@@ -572,8 +567,6 @@ SYSCALL_DEFINE5(statx,
if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
return -EINVAL;
- if (!access_ok(VERIFY_WRITE, buffer, sizeof(*buffer)))
- return -EFAULT;
if (filename)
error = vfs_statx(dfd, filename, flags, &stat, mask);
@@ -581,7 +574,8 @@ SYSCALL_DEFINE5(statx,
error = vfs_statx_fd(dfd, &stat, mask, flags);
if (error)
return error;
- return statx_set_result(&stat, buffer);
+
+ return cp_statx(&stat, buffer);
}
/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
--
2.12.0
On Sat, Mar 11, 2017 at 01:45:55PM -0800, Eric Biggers wrote:
> From: Eric Biggers <[email protected]>
>
> I found that statx() was significantly slower than stat(). As a
> microbenchmark, I compared 10,000,000 invocations of fstat() on a tmpfs
> file to the same with statx() passed a NULL path:
Umm...
> + struct statx tmp;
> +
> + tmp.stx_mask = stat->result_mask;
> + tmp.stx_blksize = stat->blksize;
> + tmp.stx_attributes = stat->attributes;
> + tmp.stx_nlink = stat->nlink;
> + tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid);
> + tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid);
> + tmp.stx_mode = stat->mode;
> + memset(tmp.__spare0, 0, sizeof(tmp.__spare0));
> + tmp.stx_ino = stat->ino;
> + tmp.stx_size = stat->size;
> + tmp.stx_blocks = stat->blocks;
> + memset(tmp.__spare1, 0, sizeof(tmp.__spare1));
> + init_statx_timestamp(&tmp.stx_atime, &stat->atime);
> + init_statx_timestamp(&tmp.stx_btime, &stat->btime);
> + init_statx_timestamp(&tmp.stx_ctime, &stat->ctime);
> + init_statx_timestamp(&tmp.stx_mtime, &stat->mtime);
> + tmp.stx_rdev_major = MAJOR(stat->rdev);
> + tmp.stx_rdev_minor = MINOR(stat->rdev);
> + tmp.stx_dev_major = MAJOR(stat->dev);
> + tmp.stx_dev_minor = MINOR(stat->dev);
> + memset(tmp.__spare2, 0, sizeof(tmp.__spare2));
> +
> + return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
That relies upon there being no padding in the damn structure.
It's true and probably will be true on any target, but
a) it's bloody well worth stating explicitly
and
b) struct statx tmp = {.stx_mask = stat->result_mask};
will get rid of those memset() you've got there by implicit
zeroing of fields missing in partial structure initializer.
Padding is *not* included into that, but you are relying upon
having no padding anyway...
Hi Al,
On Sun, Mar 12, 2017 at 01:24:15AM +0000, Al Viro wrote:
> On Sat, Mar 11, 2017 at 01:45:55PM -0800, Eric Biggers wrote:
> > From: Eric Biggers <[email protected]>
> >
> > I found that statx() was significantly slower than stat(). As a
> > microbenchmark, I compared 10,000,000 invocations of fstat() on a tmpfs
> > file to the same with statx() passed a NULL path:
>
> Umm...
>
Well, it's a silly benchmark, but stat performance is important, and usually
things are cached already so most of the time is just overhead --- which this
measures. And since nothing actually uses statx() yet, you can't do a benchmark
just by running some command like 'git status' or whatever.
> > + tmp.stx_dev_major = MAJOR(stat->dev);
> > + tmp.stx_dev_minor = MINOR(stat->dev);
> > + memset(tmp.__spare2, 0, sizeof(tmp.__spare2));
> > +
> > + return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
>
> That relies upon there being no padding in the damn structure.
> It's true and probably will be true on any target, but
> a) it's bloody well worth stating explicitly
> and
> b) struct statx tmp = {.stx_mask = stat->result_mask};
> will get rid of those memset() you've got there by implicit
> zeroing of fields missing in partial structure initializer.
> Padding is *not* included into that, but you are relying upon
> having no padding anyway...
If padding is a concern at all (AFAICS it's not actually an issue now with
struct statx, but people tend to have different opinions on how careful they
want to be with padding), then I think we'll just have to start by memsetting
the whole struct to 0.
stat() actually does that, except that it's overridden on some architectures,
like x86, to only memset the actual reserved fields. I had kind of assumed that
as long as we're defining a new struct that has the same layout on all
architectures, with no padding, that we'd want to take the opportunity to only
memset the actual reserved fields, to make it a bit faster. And yes, a
designated initializer would make this look a bit nicer, though it might have to
be changed later if any future statx() extensions involve fields that are
unioned or conditionally written.
Note that another approach would be to copy the fields individually, but with
unsafe_put_user() instead of __put_user(). That would avoid the overhead of
user_access_begin()/user_access_end() which was the main performance problem.
However, the infrastructure around this doesn't seem to be ready quite yet:
there aren't good implementations of unsafe_put_user() yet, nor is there an
unsafe_clear_user() yet.
- Eric
On Sat, Mar 11, 2017 at 06:16:55PM -0800, Eric Biggers wrote:
> Hi Al,
>
> On Sun, Mar 12, 2017 at 01:24:15AM +0000, Al Viro wrote:
> > On Sat, Mar 11, 2017 at 01:45:55PM -0800, Eric Biggers wrote:
> > > From: Eric Biggers <[email protected]>
> > >
> > > I found that statx() was significantly slower than stat(). As a
> > > microbenchmark, I compared 10,000,000 invocations of fstat() on a tmpfs
> > > file to the same with statx() passed a NULL path:
> >
> > Umm...
> >
>
> Well, it's a silly benchmark, but stat performance is important, and usually
> things are cached already so most of the time is just overhead --- which this
> measures. And since nothing actually uses statx() yet, you can't do a benchmark
> just by running some command like 'git status' or whatever.
Oh, I agree that multiple __put_user() are wrong; I also agree that bulk copy is
the right approach (when we get the unsafe stuff right, we can revisit that, but
I suspect that on quite a few architectures a bulk copy will still give better
time, no matter what).
> If padding is a concern at all (AFAICS it's not actually an issue now with
> struct statx, but people tend to have different opinions on how careful they
> want to be with padding), then I think we'll just have to start by memsetting
> the whole struct to 0.
My point is simply that it's worth a comment in that code.
On Sun, Mar 12, 2017 at 02:29:27AM +0000, Al Viro wrote:
>
> Oh, I agree that multiple __put_user() are wrong; I also agree that bulk copy is
> the right approach (when we get the unsafe stuff right, we can revisit that, but
> I suspect that on quite a few architectures a bulk copy will still give better
> time, no matter what).
>
> > If padding is a concern at all (AFAICS it's not actually an issue now with
> > struct statx, but people tend to have different opinions on how careful they
> > want to be with padding), then I think we'll just have to start by memsetting
> > the whole struct to 0.
>
> My point is simply that it's worth a comment in that code.
Okay, thanks. I'll add a comment about the padding assumption, and I think I'll
take the suggestion to use a designated initializer. Then at least all *fields*
get initialized by default. And if in the future someone wants to conditionally
initialize fields, then they can use ?: or they can do it after the initializer.
Either way, at least they won't be able to forget to zero some field.
- Eric
On Sat, Mar 11, 2017 at 08:02:06PM -0800, Eric Biggers wrote:
> On Sun, Mar 12, 2017 at 02:29:27AM +0000, Al Viro wrote:
> >
> > Oh, I agree that multiple __put_user() are wrong; I also agree that bulk copy is
> > the right approach (when we get the unsafe stuff right, we can revisit that, but
> > I suspect that on quite a few architectures a bulk copy will still give better
> > time, no matter what).
> >
> > > If padding is a concern at all (AFAICS it's not actually an issue now with
> > > struct statx, but people tend to have different opinions on how careful they
> > > want to be with padding), then I think we'll just have to start by memsetting
> > > the whole struct to 0.
> >
> > My point is simply that it's worth a comment in that code.
>
> Okay, thanks. I'll add a comment about the padding assumption, and I think I'll
> take the suggestion to use a designated initializer. Then at least all *fields*
> get initialized by default. And if in the future someone wants to conditionally
> initialize fields, then they can use ?: or they can do it after the initializer.
> Either way, at least they won't be able to forget to zero some field.
>
> - Eric
Okay, well, I may have changed my mind again... I tried the designated
initializer on x86_64 with gcc 4.8 and 6.3, and also on arm64 with gcc 4.8. In
each case, it was compiled into first zeroing all 256 bytes of the struct, just
like memset(&tmp, 0, sizeof(tmp)). Yes, this was with
CC_OPTIMIZE_FOR_PERFORMANCE=y. So I think we might as well just write the full
memset(), making it completely clear that everything is initialized. (This is
especially useful for people who are auditing code paths like this for
information leaks.) Also, a smart compiler could potentially optimize away
parts of the memset() anyway...
Eric
On Mar 11, 2017, at 11:01 PM, Eric Biggers <[email protected]> wrote:
>
> On Sat, Mar 11, 2017 at 08:02:06PM -0800, Eric Biggers wrote:
>> On Sun, Mar 12, 2017 at 02:29:27AM +0000, Al Viro wrote:
>>>
>>> Oh, I agree that multiple __put_user() are wrong; I also agree that bulk
>>> copy is the right approach (when we get the unsafe stuff right, we can
>>> revisit that, but I suspect that on quite a few architectures a bulk copy
>>> will still give better time, no matter what).
>>>
>>>> If padding is a concern at all (AFAICS it's not actually an issue now
>>>> with struct statx, but people tend to have different opinions on how
>>>> careful they want to be with padding), then I think we'll just have to
>>>> start by memsetting the whole struct to 0.
>>>
>>> My point is simply that it's worth a comment in that code.
>>
>> Okay, thanks. I'll add a comment about the padding assumption, and I think
>> I'll take the suggestion to use a designated initializer. Then at least
>> all *fields* get initialized by default. And if in the future someone
>> wants to conditionally initialize fields, then they can use ?: or they can
>> do it after the initializer. Either way, at least they won't be able to
>> forget to zero some field.
>
> Okay, well, I may have changed my mind again... I tried the designated
> initializer on x86_64 with gcc 4.8 and 6.3, and also on arm64 with gcc 4.8.
> In each case, it was compiled into first zeroing all 256 bytes of the struct,
> just like memset(&tmp, 0, sizeof(tmp)). Yes, this was with
> CC_OPTIMIZE_FOR_PERFORMANCE=y. So I think we might as well just write the
> full memset(), making it completely clear that everything is initialized.
> (This is especially useful for people who are auditing code paths like this
> for information leaks.) Also, a smart compiler could potentially optimize
> away parts of the memset() anyway...
Not that it is a huge deal either way, but I'd think it is harder for the
compiler to optimize across a function call boundary like memset() vs. a
struct initialization in the same function where it can see that all but
a few of the fields are being overwritten immediately before they are used.
I don't think the designated initializer is any less clear to the reader
that the struct is zeroed out compared to using memset(). Possibly the
best compromise is to use a designated initializer that specifies all of
the known fields, and leaves it to the compiler to initialize unset fields
or padding. That avoids double zeroing without any risk of exposing unset
fields to userspace:
/*
 * Fill a struct statx from the kstat and copy it to userspace with a
 * single copy_to_user() call, which is much cheaper than a sequence of
 * per-field __put_user() calls.
 *
 * NOTE: the partial designated initializer implicitly zeroes every
 * field not named below (including the __spare* arrays), but NOT any
 * padding bytes; this therefore relies on struct statx containing no
 * padding, which holds on all supported architectures.
 */
static int cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
	struct statx tmp = {
		.stx_mask = stat->result_mask,
		.stx_blksize = stat->blksize,
		.stx_attributes = stat->attributes,
		.stx_nlink = stat->nlink,
		/* map kernel-internal kuid/kgid to the caller's user namespace */
		.stx_uid = from_kuid_munged(current_user_ns(), stat->uid),
		.stx_gid = from_kgid_munged(current_user_ns(), stat->gid),
		.stx_mode = stat->mode,
		.stx_ino = stat->ino,
		.stx_size = stat->size,
		.stx_blocks = stat->blocks,
		.stx_atime.tv_sec = stat->atime.tv_sec,
		.stx_atime.tv_nsec = stat->atime.tv_nsec,
		.stx_btime.tv_sec = stat->btime.tv_sec,
		.stx_btime.tv_nsec = stat->btime.tv_nsec,
		.stx_ctime.tv_sec = stat->ctime.tv_sec,
		.stx_ctime.tv_nsec = stat->ctime.tv_nsec,
		.stx_mtime.tv_sec = stat->mtime.tv_sec,
		.stx_mtime.tv_nsec = stat->mtime.tv_nsec,
		.stx_rdev_major = MAJOR(stat->rdev),
		.stx_rdev_minor = MINOR(stat->rdev),
		.stx_dev_major = MAJOR(stat->dev),
		.stx_dev_minor = MINOR(stat->dev),
	};

	return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
Cheers, Andreas
On 03/13/2017 05:34 AM, Andreas Dilger wrote:
> Not that it is a huge deal either way, but I'd think it is harder for the
> compiler to optimize across a function call boundary like memset() vs. a
> struct initialization in the same function where it can see that all but
> a few of the fields are being overwritten immediately before they are used.
GCC treats memset as a function call only if options such as
-ffreestanding or -fno-builtin are enabled, or if memset is redefined in
a header file. Does the kernel do this?
> I don't think the designated initializer is any less clear to the reader
> that the struct is zeroed out compared to using memset(). Possibly the
> best compromise is to use a designated initializer that specifies all of
> the known fields, and leaves it to the compiler to initialize unset fields
> or padding.
GCC will not always initialize padding if you specify a designated
initializer because padding values are unspecified and their value does
not matter from a language point of view.
Thanks,
Florian
On Mon, Mar 13, 2017 at 11:27:32AM +0100, Florian Weimer wrote:
> On 03/13/2017 05:34 AM, Andreas Dilger wrote:
> >Not that it is a huge deal either way, but I'd think it is harder for the
> >compiler to optimize across a function call boundary like memset() vs. a
> >struct initialization in the same function where it can see that all but
> >a few of the fields are being overwritten immediately before they are used.
>
> GCC treats memset as a function call only if options such as
> -ffreestanding or -fno-builtin are enabled, or if memset is
> redefined in a header file. Does the kernel do this?
>
No, it does not. So gcc treats memset() as a request to clear memory, not as a
request to call a function called "memset()" specifically. On x86_64 it's
compiling it into a "rep stos" instruction.
- Eric