2002-07-17 05:23:34

by Andrew Morton

[permalink] [raw]
Subject: [patch 13/13] lseek speedup



This is a fairly dopey patch to fix up the i_sem contention in lseek.
Better ideas are welcome, but I'm offline until Monday so don't think
I'm ignoring them...

We need to decide what we really want to lock in there. If multiple
threads are updating f_pos and/or i_size at the same time then they are
going to see strange results whatever the kernel does. I'd maintain
that the only real obligation which the kernel has in this situation is
to not oops, to not munch data and to not return competely outlandish
results.

And the only way we can return outlandish results is on 32-bit SMP if
one CPU reads i_size or f_pos while another CPU is in the middle of
modifying it.

- We only need to take i_sem for reading i_size on 32-bit machines
with SMP or preempt, so add the entertaining down32() and up32()
functions for that.

- Only take i_sem in the case where we actually need to read i_size:
SEEK_END.

So this patch only speeds up SEEK_SET, SEEK_CUR and SEEK_END on
64-bit. SEEK_END on 32-bit will continue to bounce i_sem around.

- Introduce a new spinlock in struct file. Its scope is currently
for providing atomicity to f_pos updates.

This lock could also be used for guarding consistency of the
readahead state. But after a close walkthrough the readahead code I
don't think readahead needs any locking (apart from the 32-bit reader
of i_size!). Any errors in there will be +/- 1 differences in the
numbers, and readahead just fixes things like that up anyway.

- Fiddled with the layout of struct file in an attempt to make it
more cache-friendly.



fs/file_table.c | 1
fs/read_write.c | 86 ++++++++++++++++++++++++++++-------------------------
include/linux/fs.h | 15 ++++++---
3 files changed, 57 insertions(+), 45 deletions(-)

--- 2.5.26/fs/read_write.c~lseek-speedup Tue Jul 16 21:47:22 2002
+++ 2.5.26-akpm/fs/read_write.c Tue Jul 16 21:47:22 2002
@@ -20,53 +20,59 @@ struct file_operations generic_ro_fops =
mmap: generic_file_mmap,
};

+/*
+ * Assume reads and writes of long longs are atomic on 64-bit machines
+ */
+static inline void down32(struct semaphore *sem)
+{
+#if (BITS_PER_LONG != 64) && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT))
+ down(sem);
+#endif
+}
+
+static inline void up32(struct semaphore *sem)
+{
+#if (BITS_PER_LONG != 64) && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT))
+ up(sem);
+#endif
+}
+
loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
- long long retval;
+ loff_t retval;
struct inode *inode = file->f_dentry->d_inode->i_mapping->host;

- down(&inode->i_sem);
- switch (origin) {
- case 2:
- offset += inode->i_size;
- break;
- case 1:
- offset += file->f_pos;
+ if (origin == 2) {
+ down32(&inode->i_sem);
+ offset += inode->i_size;
+ up32(&inode->i_sem);
+ spin_lock(&file->f_lock);
+ } else if (origin == 1) {
+ spin_lock(&file->f_lock);
+ offset += file->f_pos;
+ } else {
+ spin_lock(&file->f_lock);
}
retval = -EINVAL;
- if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+ if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = ++event;
}
retval = offset;
}
- up(&inode->i_sem);
+ spin_unlock(&file->f_lock);
return retval;
}

loff_t remote_llseek(struct file *file, loff_t offset, int origin)
{
- long long retval;
+ loff_t ret;

lock_kernel();
- switch (origin) {
- case 2:
- offset += file->f_dentry->d_inode->i_size;
- break;
- case 1:
- offset += file->f_pos;
- }
- retval = -EINVAL;
- if (offset>=0 && offset<=file->f_dentry->d_inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = ++event;
- }
- retval = offset;
- }
+ ret = generic_file_llseek(file, offset, origin);
unlock_kernel();
- return retval;
+ return ret;
}

loff_t no_llseek(struct file *file, loff_t offset, int origin)
@@ -76,15 +82,18 @@ loff_t no_llseek(struct file *file, loff

loff_t default_llseek(struct file *file, loff_t offset, int origin)
{
+ struct inode *inode = file->f_dentry->d_inode;
long long retval;

- lock_kernel();
- switch (origin) {
- case 2:
- offset += file->f_dentry->d_inode->i_size;
- break;
- case 1:
- offset += file->f_pos;
+ if (origin == 2) {
+ down32(&inode->i_sem);
+ offset += inode->i_size;
+ up32(&inode->i_sem);
+ } else if (origin == 1) {
+ spin_lock(&file->f_lock);
+ offset += file->f_pos;
+ } else {
+ spin_lock(&file->f_lock);
}
retval = -EINVAL;
if (offset >= 0) {
@@ -94,18 +103,15 @@ loff_t default_llseek(struct file *file,
}
retval = offset;
}
- unlock_kernel();
+ spin_unlock(&file->f_lock);
return retval;
}

static inline loff_t llseek(struct file *file, loff_t offset, int origin)
{
- loff_t (*fn)(struct file *, loff_t, int);
-
- fn = default_llseek;
if (file->f_op && file->f_op->llseek)
- fn = file->f_op->llseek;
- return fn(file, offset, origin);
+ return (*file->f_op->llseek)(file, offset, origin);
+ return default_llseek(file, offset, origin);
}

asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
--- 2.5.26/include/linux/fs.h~lseek-speedup Tue Jul 16 21:47:22 2002
+++ 2.5.26-akpm/include/linux/fs.h Tue Jul 16 21:47:22 2002
@@ -475,20 +475,25 @@ struct file_ra_state {
};

struct file {
- struct list_head f_list;
struct dentry *f_dentry;
+ struct list_head f_list;
struct vfsmount *f_vfsmnt;
- struct file_operations *f_op;
- atomic_t f_count;
+
unsigned int f_flags;
mode_t f_mode;
+ struct file_operations *f_op;
+ unsigned long f_version;
+
+ atomic_t f_count;
+ spinlock_t f_lock; /* for f_pos atomicity */
loff_t f_pos;
+
struct fown_struct f_owner;
- unsigned int f_uid, f_gid;
+ unsigned int f_uid;
+ unsigned int f_gid;
int f_error;
struct file_ra_state f_ra;

- unsigned long f_version;

/* needed for tty driver, and maybe others */
void *private_data;
--- 2.5.26/fs/file_table.c~lseek-speedup Tue Jul 16 21:47:22 2002
+++ 2.5.26-akpm/fs/file_table.c Tue Jul 16 21:47:22 2002
@@ -44,6 +44,7 @@ struct file * get_empty_filp(void)
new_one:
memset(f, 0, sizeof(*f));
atomic_set(&f->f_count,1);
+ spin_lock_init(&f->f_lock);
f->f_version = ++event;
f->f_uid = current->fsuid;
f->f_gid = current->fsgid;

.


2002-07-17 09:39:41

by Anton Altaparmakov

[permalink] [raw]
Subject: Re: [patch 13/13] lseek speedup

At 06:31 17/07/02, Andrew Morton wrote:
>This is a fairly dopey patch to fix up the i_sem contention in lseek.
>Better ideas are welcome, but I'm offline until Monday so don't think
>I'm ignoring them...

I am afraid I don't have any better ideas but I don't think your patch is
safe. )-:

>We need to decide what we really want to lock in there. If multiple
>threads are updating f_pos and/or i_size at the same time then they are
>going to see strange results whatever the kernel does. I'd maintain
>that the only real obligation which the kernel has in this situation is
>to not oops, to not munch data and to not return competely outlandish
>results.

That can only be guaranteed by holding i_sem. Since the BKL was taken out
of ->readdir(), i_sem is now used for exclusion between ->readdir() and
->llseek() (well NTFS uses it anyway but many other fs which switched to
generic_file_llseek() do the same). This exclusion is necessary IMHO.

->readdir() may well blow up or at the very least give "completely
outlandish results" if someone modifies f_pos while it is running.
Depending on when exactly it happens, ntfs_readdir() is going to be ok as
it mostly sets f_pos instead of incrementing it but still, a succeeding
->llseek(), may actually end up in a different position to the one
requested because of a concurrent ->readdir().

I know doing the two things at the same time requires a completely crazy
multithreaded application, but then again many application developers fall
into that category. (-;

>And the only way we can return outlandish results is on 32-bit SMP if
>one CPU reads i_size or f_pos while another CPU is in the middle of
>modifying it.

I believe it can also happen if ->llseek() is allowed to run concurrently
with a ->readdir()...

Best regards,

Anton


--
"I've not lost my mind. It's backed up on tape somewhere." - Unknown
--
Anton Altaparmakov <aia21 at cantab.net> (replace at with @)
Linux NTFS Maintainer / IRC: #ntfs on irc.openprojects.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2002-07-17 10:48:35

by Anton Altaparmakov

[permalink] [raw]
Subject: Re: [patch 13/13] lseek speedup

Hi Andrew,

At 06:31 17/07/02, Andrew Morton wrote:
> static inline loff_t llseek(struct file *file, loff_t offset, int origin)
> {
>- loff_t (*fn)(struct file *, loff_t, int);
>-
>- fn = default_llseek;
> if (file->f_op && file->f_op->llseek)
>- fn = file->f_op->llseek;
>- return fn(file, offset, origin);
>+ return (*file->f_op->llseek)(file, offset, origin);
>+ return default_llseek(file, offset, origin);
> }

This one is interesting. I have been wondering for quite a while whether
constructs like the original one actually produce better machine code or
whether they should all be cleaned up just as you do here. I was never
bothered enough about it to try but I have half an hour spare at the moment
so I tried both - using gcc-2.96, RedHat 7.3 latest version, ia32,
compiling for Athlon.

While I cannot speak for other compilers, gcc-2.96 at least generates
better code for the old llseek() than for the new llseek() proposed by the
above patch snippet.

The old code has following advantages:
- Has only one conditional jump (comparing to two in new code.
- Has no unconditional jump (comparing to one in new code).
- Uses 16 bytes less stack space.
- Machine code is shorter.

Here are the relevant different sections in sys_llseek() taken from the gcc
generated assembly files both with and without just the above patch snippet
applied:

----old code----
movl $default_llseek, %esi
movl 16(%ebx), %eax
testl %eax, %eax
je .L1427
movl 4(%eax), %eax
testl %eax, %eax
cmovne %eax, %esi
.L1427:
pushl 48(%esp)
pushl %ecx
pushl %edx
pushl 12(%esp)
call *%esi

----new code----
movl 16(%ebx), %eax
testl %eax, %eax
je .L1428
movl 4(%eax), %eax
testl %eax, %eax
je .L1428
pushl 64(%esp)
pushl %ecx
pushl %edx
pushl %ebx
call *%eax
jmp .L1442
.p2align 4,,7
.L1428:
pushl 64(%esp)
pushl %ecx
pushl %edx
pushl 12(%esp)
call default_llseek
.L1442:

Obviously we are counting a few cycles difference only (although the fewer
jmps could make a bigger difference if the branch prediction of the CPU
doesn't match to the common usage patch and the pipe line is stalled) and
sys_llseek() is not exactly the most performance critical code, however I
thought it is interesting to note that the old code is a "coding style"
which produces better machine code in general with (gcc-2.96)...

And also it provided me with something fun to do in the half hour I am
waiting for my gel to run. (-8 Back to lab work...

Best regards,

Anton


--
"I've not lost my mind. It's backed up on tape somewhere." - Unknown
--
Anton Altaparmakov <aia21 at cantab.net> (replace at with @)
Linux NTFS Maintainer / IRC: #ntfs on irc.openprojects.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2002-07-17 16:08:06

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 13/13] lseek speedup

Anton Altaparmakov wrote:
>
> At 06:31 17/07/02, Andrew Morton wrote:
> >This is a fairly dopey patch to fix up the i_sem contention in lseek.
> >Better ideas are welcome, but I'm offline until Monday so don't think
> >I'm ignoring them...
>
> I am afraid I don't have any better ideas but I don't think your patch is
> safe. )-:

It wasn't a very good idea in the first place. Forgot to take
the new lock over in the updaters of f_pos.

And it's attempting to cater for a buggy application on a 32-bit
machine, SMP, where the fd is shared. It's hard to justify
putting any locking in lseek for that. (Then again, people
should use pread() more..)

Except for readdir(). Now, why are we taking i_sem for lseek/readdir
exclusion and not a per-file lock?

-

2002-07-17 16:17:52

by Anton Altaparmakov

[permalink] [raw]
Subject: Re: [patch 13/13] lseek speedup

Hi Andrew,

At 17:18 17/07/02, Andrew Morton wrote:
>Anton Altaparmakov wrote:
> > At 06:31 17/07/02, Andrew Morton wrote:
> > >This is a fairly dopey patch to fix up the i_sem contention in lseek.
> > >Better ideas are welcome, but I'm offline until Monday so don't think
> > >I'm ignoring them...
> >
> > I am afraid I don't have any better ideas but I don't think your patch is
> > safe. )-:
>
>It wasn't a very good idea in the first place. Forgot to take
>the new lock over in the updaters of f_pos.
>
>And it's attempting to cater for a buggy application on a 32-bit
>machine, SMP, where the fd is shared. It's hard to justify
>putting any locking in lseek for that. (Then again, people
>should use pread() more..)
>
>Except for readdir(). Now, why are we taking i_sem for lseek/readdir
>exclusion and not a per-file lock?

Because it also excludes against directory modifications, etc. Just imagine
what "rm somefile" or "mv somefile otherfile" or "touch newfile" would do
to the directory contents and what a concurrent readdir() would do... A
very loud *BANG* is the only thing that springs to mind...

btw. the directory modification locking rules are written up in
Documentation/filesystems/directory-locking by our very own VFS maintainer
Al Viro himself... (-;

Cheers,

Anton


--
"I've not lost my mind. It's backed up on tape somewhere." - Unknown
--
Anton Altaparmakov <aia21 at cantab.net> (replace at with @)
Linux NTFS Maintainer / IRC: #ntfs on irc.openprojects.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2002-07-17 16:38:49

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 13/13] lseek speedup

Anton Altaparmakov wrote:
>
> > Now, why are we taking i_sem for lseek/readdir
> >exclusion and not a per-file lock?
>
> Because it also excludes against directory modifications, etc. Just imagine
> what "rm somefile" or "mv somefile otherfile" or "touch newfile" would do
> to the directory contents and what a concurrent readdir() would do... A
> very loud *BANG* is the only thing that springs to mind...

That's different. i_size, contents of things, yes - i_sem for
those.

But protection of struct file should not be via any per-inode thing.

> btw. the directory modification locking rules are written up in
> Documentation/filesystems/directory-locking by our very own VFS maintainer
> Al Viro himself... (-;

Doesn't cover lseek...

-

2002-07-17 18:04:18

by Anton Altaparmakov

[permalink] [raw]
Subject: Re: [patch 13/13] lseek speedup

At 17:49 17/07/02, Andrew Morton wrote:
>Anton Altaparmakov wrote:
> >
> > > Now, why are we taking i_sem for lseek/readdir
> > >exclusion and not a per-file lock?
> >
> > Because it also excludes against directory modifications, etc. Just imagine
> > what "rm somefile" or "mv somefile otherfile" or "touch newfile" would do
> > to the directory contents and what a concurrent readdir() would do... A
> > very loud *BANG* is the only thing that springs to mind...
>
>That's different. i_size, contents of things, yes - i_sem for
>those.
>
>But protection of struct file should not be via any per-inode thing.

Oh, I see. But that would mean adding yet another lock to an existing
locking scheme? So both i_sem and the "per file lock" would nead to be held
over readdir(). That's doable but it would have to be a semaphore based
lock due to readdir()...

Perhaps the f_lock from your patch, changed to a semaphore, and acquiring
it in generic_file_llseek&friends, vfs_readdir() (and any other place where
f_pos needs protecting) would be able to do the trick. Would that solve the
lock contention problems you were seing? It would at least off-load i_sem a
bit... but it would only replace one semaphore for another...

> > btw. the directory modification locking rules are written up in
> > Documentation/filesystems/directory-locking by our very own VFS maintainer
> > Al Viro himself... (-;
>
>Doesn't cover lseek...

Hm, true. I hadn't read it for quite a while...

Anton


--
"I've not lost my mind. It's backed up on tape somewhere." - Unknown
--
Anton Altaparmakov <aia21 at cantab.net> (replace at with @)
Linux NTFS Maintainer / IRC: #ntfs on irc.openprojects.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2002-07-17 18:26:29

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [patch 13/13] lseek speedup

On Wed, Jul 17, 2002 at 09:49:40AM -0700, Andrew Morton wrote:
> But protection of struct file should not be via any per-inode thing.

Why not make use of set_64bit instead of using a spinlock? It might
need a companion get_64bit, but at least on 64 bit machines they could
be defined to use a direct load/store.

-ben
--
"You will be reincarnated as a toad; and you will be much happier."

2002-07-22 07:40:30

by Anton Altaparmakov

[permalink] [raw]
Subject: Re: [patch 13/13] lseek speedup

At 08:16 22/07/02, Andrew Morton wrote:
>Anton Altaparmakov wrote:
> > At 17:49 17/07/02, Andrew Morton wrote:
> > >Anton Altaparmakov wrote:
> > > >
> > > > > Now, why are we taking i_sem for lseek/readdir
> > > > >exclusion and not a per-file lock?
> > > >
> > > > Because it also excludes against directory modifications, etc. Just
> imagine
> > > > what "rm somefile" or "mv somefile otherfile" or "touch newfile"
> would do
> > > > to the directory contents and what a concurrent readdir() would do... A
> > > > very loud *BANG* is the only thing that springs to mind...
> > >
> > >That's different. i_size, contents of things, yes - i_sem for
> > >those.
> > >
> > >But protection of struct file should not be via any per-inode thing.
> >
> > Oh, I see. But that would mean adding yet another lock to an existing
> > locking scheme? So both i_sem and the "per file lock" would nead to be held
> > over readdir(). That's doable but it would have to be a semaphore based
> > lock due to readdir()...
>
>Adding a sleeping lock to struct file for this would make sense; using
>i_sem to protect the innards of struct file ain't right.
>
>However I'm not sure that the VFS needs to be serialising lseek
>and readdir at all. The fs can do that if it really needs to.
>
>And does it really need to? Setting aside the non-atomic 64-bit
>read, it may be sufficient for the fs to do what ext2_readdir
>does: read f_pos once on entry, write it once on exit and if
>there was a concurrent lseek then its results get stomped on.
>
>Can you see any problem with that?

No problem. NTFS can very easily be turned into doing that. It would need
to happen to all file systems and perhaps a little documentation in the
porting document to say so for out of kernel fs...

>Anyway. It's not exactly a top-priority thing. I'll park
>it for a while and just stop running that test ;)

Well, I will get ntfs_readdir changed to only read/write f_pos once on
entry/exit respectively anyway...

Anton


--
"I've not lost my mind. It's backed up on tape somewhere." - Unknown
--
Anton Altaparmakov <aia21 at cantab.net> (replace at with @)
Linux NTFS Maintainer / IRC: #ntfs on irc.openprojects.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2002-07-22 07:05:26

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 13/13] lseek speedup

Anton Altaparmakov wrote:
>
> At 17:49 17/07/02, Andrew Morton wrote:
> >Anton Altaparmakov wrote:
> > >
> > > > Now, why are we taking i_sem for lseek/readdir
> > > >exclusion and not a per-file lock?
> > >
> > > Because it also excludes against directory modifications, etc. Just imagine
> > > what "rm somefile" or "mv somefile otherfile" or "touch newfile" would do
> > > to the directory contents and what a concurrent readdir() would do... A
> > > very loud *BANG* is the only thing that springs to mind...
> >
> >That's different. i_size, contents of things, yes - i_sem for
> >those.
> >
> >But protection of struct file should not be via any per-inode thing.
>
> Oh, I see. But that would mean adding yet another lock to an existing
> locking scheme? So both i_sem and the "per file lock" would nead to be held
> over readdir(). That's doable but it would have to be a semaphore based
> lock due to readdir()...

Adding a sleeping lock to struct file for this would make sense; using
i_sem to protect the innards of struct file ain't right.

However I'm not sure that the VFS needs to be serialising lseek
and readdir at all. The fs can do that if it really needs to.

And does it really need to? Setting aside the non-atomic 64-bit
read, it may be sufficient for the fs to do what ext2_readdir
does: read f_pos once on entry, write it once on exit and if
there was a concurrent lseek then its results get stomped on.

Can you see any problem with that?

Anyway. It's not exactly a top-priority thing. I'll park
it for a while and just stop running that test ;)

-