2002-01-30 00:08:13

by Robert Love

[permalink] [raw]
Subject: [PATCH] 2.5: push BKL out of llseek

This patch pushes the BKL out of llseek() and into the individual llseek
methods. For generic_file_llseek, I replaced it with the inode
semaphore. The lock contention is noticeable even on 2-way systems.
Since we simply push the BKL further down the call chain (its the llseek
method's responsibilities now) we aren't doing anything hackish or
unsafe.

I suspect some (Al) may consider this a suboptimal solution, and I
agree. However it is a first step -- tightening the locks -- toward a
better locking scheme, which is hopefully devoid of the BKL.

The best scores from a slew of dbench runs:

(2.5.3-pre6 on 2-way Athlon)
with patch 133.651 165.575 66.9876 37.5297 24.9436
without patch 132.541 160.774 60.1174 33.2065 22.0126

Interestingly, the shorter lock times corresponded to an 8.9% reduction
in scheduling latency (under the above dbench load) with the preemptible
kernel.

Robert Love

diff -urN linux-2.5.3-pre6/Documentation/filesystems/Locking linux/Documentation/filesystems/Locking
--- linux-2.5.3-pre6/Documentation/filesystems/Locking Mon Jan 28 18:30:27 2002
+++ linux/Documentation/filesystems/Locking Tue Jan 29 17:07:37 2002
@@ -219,7 +219,7 @@
locking rules:
All except ->poll() may block.
BKL
-llseek: yes
+llseek: yes (see below)
read: no
write: no
readdir: yes (see below)
@@ -235,6 +235,10 @@
readv: no
writev: no

+->llseek() locking has moved from llseek to the individual llseek
+implementations. If your fs is not using generic_file_llseek, you
+need to acquire and release the BKL in your ->llseek().
+
->open() locking is in-transit: big lock partially moved into the methods.
The only exception is ->open() in the instances of file_operations that never
end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices
diff -urN linux-2.5.3-pre6/fs/block_dev.c linux/fs/block_dev.c
--- linux-2.5.3-pre6/fs/block_dev.c Mon Jan 28 18:30:22 2002
+++ linux/fs/block_dev.c Tue Jan 29 16:49:52 2002
@@ -170,6 +170,8 @@
loff_t size = file->f_dentry->d_inode->i_bdev->bd_inode->i_size;
loff_t retval;

+ lock_kernel();
+
switch (origin) {
case 2:
offset += size;
@@ -186,6 +188,7 @@
}
retval = offset;
}
+ unlock_kernel();
return retval;
}

diff -urN linux-2.5.3-pre6/fs/hfs/file_cap.c linux/fs/hfs/file_cap.c
--- linux-2.5.3-pre6/fs/hfs/file_cap.c Mon Jan 28 18:30:22 2002
+++ linux/fs/hfs/file_cap.c Tue Jan 29 16:49:52 2002
@@ -91,6 +91,8 @@
{
long long retval;

+ lock_kernel();
+
switch (origin) {
case 2:
offset += file->f_dentry->d_inode->i_size;
@@ -106,6 +108,7 @@
}
retval = offset;
}
+ unlock_kernel();
return retval;
}

diff -urN linux-2.5.3-pre6/fs/hfs/file_hdr.c linux/fs/hfs/file_hdr.c
--- linux-2.5.3-pre6/fs/hfs/file_hdr.c Mon Jan 28 18:30:22 2002
+++ linux/fs/hfs/file_hdr.c Tue Jan 29 16:49:52 2002
@@ -347,6 +347,8 @@
{
long long retval;

+ lock_kernel();
+
switch (origin) {
case 2:
offset += file->f_dentry->d_inode->i_size;
@@ -362,6 +364,7 @@
}
retval = offset;
}
+ unlock_kernel();
return retval;
}

diff -urN linux-2.5.3-pre6/fs/hpfs/dir.c linux/fs/hpfs/dir.c
--- linux-2.5.3-pre6/fs/hpfs/dir.c Mon Jan 28 18:30:22 2002
+++ linux/fs/hpfs/dir.c Tue Jan 29 16:49:52 2002
@@ -29,6 +29,9 @@
struct inode *i = filp->f_dentry->d_inode;
struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
struct super_block *s = i->i_sb;
+
+ lock_kernel();
+
/*printk("dir lseek\n");*/
if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
hpfs_lock_inode(i);
@@ -40,10 +43,12 @@
}
hpfs_unlock_inode(i);
ok:
+ unlock_kernel();
return filp->f_pos = new_off;
fail:
hpfs_unlock_inode(i);
/*printk("illegal lseek: %016llx\n", new_off);*/
+ unlock_kernel();
return -ESPIPE;
}

diff -urN linux-2.5.3-pre6/fs/proc/generic.c linux/fs/proc/generic.c
--- linux-2.5.3-pre6/fs/proc/generic.c Mon Jan 28 18:30:22 2002
+++ linux/fs/proc/generic.c Tue Jan 29 16:49:52 2002
@@ -16,6 +16,7 @@
#include <linux/stat.h>
#define __NO_VERSION__
#include <linux/module.h>
+#include <linux/smp_lock.h>
#include <asm/bitops.h>

static ssize_t proc_file_read(struct file * file, char * buf,
@@ -140,22 +141,30 @@
static loff_t
proc_file_lseek(struct file * file, loff_t offset, int orig)
{
+ lock_kernel();
+
switch (orig) {
case 0:
if (offset < 0)
- return -EINVAL;
+ goto out;
file->f_pos = offset;
+ unlock_kernel();
return(file->f_pos);
case 1:
if (offset + file->f_pos < 0)
- return -EINVAL;
+ goto out;
file->f_pos += offset;
+ unlock_kernel();
return(file->f_pos);
case 2:
- return(-EINVAL);
+ goto out;
default:
- return(-EINVAL);
+ goto out;
}
+
+out:
+ unlock_kernel();
+ return -EINVAL;
}

/*
diff -urN linux-2.5.3-pre6/fs/read_write.c linux/fs/read_write.c
--- linux-2.5.3-pre6/fs/read_write.c Mon Jan 28 18:30:22 2002
+++ linux/fs/read_write.c Tue Jan 29 16:49:52 2002
@@ -29,6 +29,8 @@
{
long long retval;

+ down(&file->f_dentry->d_inode->i_sem);
+
switch (origin) {
case 2:
offset += file->f_dentry->d_inode->i_size;
@@ -45,6 +47,7 @@
}
retval = offset;
}
+ up(&file->f_dentry->d_inode->i_sem);
return retval;
}

@@ -57,6 +60,8 @@
{
long long retval;

+ lock_kernel();
+
switch (origin) {
case 2:
offset += file->f_dentry->d_inode->i_size;
@@ -73,6 +78,7 @@
}
retval = offset;
}
+ unlock_kernel();
return retval;
}

@@ -84,9 +90,7 @@
fn = default_llseek;
if (file->f_op && file->f_op->llseek)
fn = file->f_op->llseek;
- lock_kernel();
retval = fn(file, offset, origin);
- unlock_kernel();
return retval;
}

diff -urN linux-2.5.3-pre6/fs/ufs/file.c linux/fs/ufs/file.c
--- linux-2.5.3-pre6/fs/ufs/file.c Mon Jan 28 18:30:22 2002
+++ linux/fs/ufs/file.c Tue Jan 29 16:49:52 2002
@@ -47,6 +47,8 @@
long long retval;
struct inode *inode = file->f_dentry->d_inode;

+ lock_kernel();
+
switch (origin) {
case 2:
offset += inode->i_size;
@@ -64,6 +66,7 @@
}
retval = offset;
}
+ unlock_kernel();
return retval;
}


2002-01-30 00:13:14

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek


On 29 Jan 2002, Robert Love wrote:
>
> This patch pushes the BKL out of llseek() and into the individual llseek
> methods. For generic_file_llseek, I replaced it with the inode
> semaphore.

Thinking about that, that actally sounds like the _right_ thing to do even
from a correctness standpoint - as llseek() looks at the inode size, so we
should have that lock anyway.

So I'd suggest doing the inode semaphore globally, instead of using
kernel_lock at all.

Al?

Linus

2002-01-30 00:36:33

by Robert Love

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On Tue, 2002-01-29 at 19:09, Linus Torvalds wrote:

> Thinking about that, that actally sounds like the _right_ thing to do even
> from a correctness standpoint - as llseek() looks at the inode size, so we
> should have that lock anyway.
>
> So I'd suggest doing the inode semaphore globally, instead of using
> kernel_lock at all.

This was my end result (or some sort of proper finer-grained lock) but I
was unsure of how to deal with the other llseek methods that may be
oddball. I have no reason to suspect using the inode semaphore won't
work.

Another gain from pushing the locks into each method is that we can pick
and choose as-needed. If it turns out inode semaphore is a global
solution, the following patch is sufficient. Otherwise, we could
replace the lock_kernel in each caller with the inode semaphore, as
appropriate. Oh Al ??

Robert Love

--- linux-2.5.3-pre6/fs/read_write.c Mon Jan 28 18:30:22 2002
+++ linux/fs/read_write.c Tue Jan 29 19:29:32 2002
@@ -84,9 +84,9 @@
fn = default_llseek;
if (file->f_op && file->f_op->llseek)
fn = file->f_op->llseek;
- lock_kernel();
+ down(&file->f_dentry->d_inode->i_sem);
retval = fn(file, offset, origin);
- unlock_kernel();
+ up(&file->f_dentry->d_inode->i_sem);
return retval;
}


2002-01-30 00:54:13

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek


On 29 Jan 2002, Robert Love wrote:
>
> Another gain from pushing the locks into each method is that we can pick
> and choose as-needed. If it turns out inode semaphore is a global
> solution, the following patch is sufficient. Otherwise, we could
> replace the lock_kernel in each caller with the inode semaphore, as
> appropriate. Oh Al ??

Doing it in the low-level filesystem would match how we now do it inside
generic_file_write() - ie the locking is done by the low-level filesystem,
but most low-level filesystems choose to use a generic helper function.

And I think your patch is slightly wrong:

> + down(&file->f_dentry->d_inode->i_sem);

That should really be:

file->f_dentry->d_inode->i_mapping->host->i_sem

to get the hosted filesystem case right (ie coda).

Linus

2002-01-30 01:34:14

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

Robert Love wrote:
>
> @@ -84,9 +84,9 @@
> fn = default_llseek;
> if (file->f_op && file->f_op->llseek)
> fn = file->f_op->llseek;
> - lock_kernel();
> + down(&file->f_dentry->d_inode->i_sem);
> retval = fn(file, offset, origin);
> - unlock_kernel();
> + up(&file->f_dentry->d_inode->i_sem);
> return retval;
> }

Just a little word of caution here. Remember the
apache-flock-synchronisation fiasco, where removal
of the BKL halved Apache throughput on 8-way x86.

This was because the BKL removal turned serialisation
on a quick codepath from a spinlock into a schedule().

So... I'd suggest that changes such as this should be
benchmarked in isolation; otherwise we end up spending
quite some time hunting down mysterious reports of
performance regression, and having to rethink stuff.

And llseek is *fast*. If we're seeing significant
lock contention in there then adding a schedule() is
likely to turn Anton into one unhappy dbencher.

-

2002-01-30 02:14:45

by Robert Love

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On Tue, 2002-01-29 at 20:26, Andrew Morton wrote:

> Just a little word of caution here. Remember the
> apache-flock-synchronisation fiasco, where removal
> of the BKL halved Apache throughput on 8-way x86.
>
> This was because the BKL removal turned serialisation
> on a quick codepath from a spinlock into a schedule().

I feared this too, but eventually I decided it was worth it and
benchmarks backed that up. If nothing else this is yet-another-excuse
for locks that can spin-then-sleep.

I posted dbench results, which show a positive gain even on 2-way for
multiple client loads.

Robert Love

2002-01-30 02:17:55

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek


On Tue, 29 Jan 2002, Andrew Morton wrote:
>
> And llseek is *fast*. If we're seeing significant
> lock contention in there then adding a schedule() is
> likely to turn Anton into one unhappy dbencher.

I'd agree, except I doubt there is every much contention on the _same_
file.

The reason llseek() ends up being so clear on the profiles is that it's a
very common system call under certain loads _and_ it uses a shared lock
for everything.

Also note the correctness issue (ie serialization on i_size), although
that is only an issue for SEEK_END (and maybe the lock should only be
gotten for that case. I'd love to hear what Al thinks..

Linus

2002-01-30 02:19:26

by Robert Love

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On Tue, 2002-01-29 at 19:52, Linus Torvalds wrote:

> Doing it in the low-level filesystem would match how we now do it inside
> generic_file_write() - ie the locking is done by the low-level filesystem,
> but most low-level filesystems choose to use a generic helper function.

OK. Hopefully the inode semaphore works ...

> And I think your patch is slightly wrong:
>
> > + down(&file->f_dentry->d_inode->i_sem);
>
> That should really be:
>
> file->f_dentry->d_inode->i_mapping->host->i_sem
>
> to get the hosted filesystem case right (ie coda).

Ahh, learn something ;-)

Robert Love

--- linux-2.5.3-pre6/fs/read_write.c Mon Jan 28 18:30:22 2002
+++ linux/fs/read_write.c Tue Jan 29 19:29:32 2002
@@ -84,9 +84,9 @@
fn = default_llseek;
if (file->f_op && file->f_op->llseek)
fn = file->f_op->llseek;
- lock_kernel();
+ down(&file->f_dentry->d_inode->i_mapping->host->i_sem);
retval = fn(file, offset, origin);
- unlock_kernel();
+ up(&file->f_dentry->d_inode->i_mapping->host->i_sem);
return retval;
}



2002-01-30 02:22:05

by Dave Jones

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On Tue, Jan 29, 2002 at 09:20:10PM -0500, Robert Love wrote:

> I feared this too, but eventually I decided it was worth it and
> benchmarks backed that up. If nothing else this is yet-another-excuse
> for locks that can spin-then-sleep.
> I posted dbench results, which show a positive gain even on 2-way for
> multiple client loads.

did you benchmark with anything other than dbench ?

--
| Dave Jones. http://www.codemonkey.org.uk
| SuSE Labs

2002-01-30 02:27:55

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

Robert Love wrote:
>
> On Tue, 2002-01-29 at 20:26, Andrew Morton wrote:
>
> > Just a little word of caution here. Remember the
> > apache-flock-synchronisation fiasco, where removal
> > of the BKL halved Apache throughput on 8-way x86.
> >
> > This was because the BKL removal turned serialisation
> > on a quick codepath from a spinlock into a schedule().
>
> I feared this too, but eventually I decided it was worth it and
> benchmarks backed that up. If nothing else this is yet-another-excuse
> for locks that can spin-then-sleep.
>
> I posted dbench results, which show a positive gain even on 2-way for
> multiple client loads.
>

But dbench does lots of seeking against *different* files,
so removal of a shared lock will help there.

But an application where multiple CPUs lseek and write
the *same* file could take a hit....

(And where's the locking for (non-atomic) i_size in sys_stat())


-

2002-01-30 02:32:35

by Robert Love

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On Tue, 2002-01-29 at 21:21, Dave Jones wrote:

> did you benchmark with anything other than dbench ?

No, and I really don't want to hear how dbench is a terrible benchmark.
I didn't craft the patch around dbench and I think, here at least,
dbench is an OK benchmark. I ran it numerous times over multiple client
loads.

I think its clear there won't be a negative impact, because:

- acquiring the inode semaphore isn't any heavier (in the acquire
case) than the BKL

- the lock contention on each inode semaphore is relatively
zero

- besides just scaling badly with the using a global lock against
all inodes, we use the BKL which in such workloads is already
highly contested.

That said, I did do some lock profiling and latency tests. Contention
was near-zero, but I only did 2-way testing. Under the preemptible
kernel, while running dbench, scheduling latency improved 8.9%.

Robert Love

2002-01-30 02:50:56

by Nigel Gamble

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On 29 Jan 2002, Robert Love wrote:
> On Tue, 2002-01-29 at 20:26, Andrew Morton wrote:
> > Just a little word of caution here. Remember the
> > apache-flock-synchronisation fiasco, where removal
> > of the BKL halved Apache throughput on 8-way x86.
> >
> > This was because the BKL removal turned serialisation
> > on a quick codepath from a spinlock into a schedule().

Yes, but the other factor to consider here is why did the extra schedule
take place at all? I think this is a actually a scheduler issue, and
I'm hoping that the new scheduler will behave better in this case. A
call to schedule() should not happen unless the woken process has a
higher priority than the process that did the unlock, but the old
scheduler evidently always calculated this to be the case. But we
really want the process that did the unlock to continue running (until
the end of its timeslice, if not preempted or blocked before then), just
as it would when the lock was a spinlock. It would be interesting to
see whether the new scheduler gets this right.

Am I remembering the problem correctly?

Nigel Gamble [email protected]
Mountain View, CA, USA. http://www.nrg.org/

2002-01-30 03:27:00

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

Nigel Gamble wrote:
>
> On 29 Jan 2002, Robert Love wrote:
> > On Tue, 2002-01-29 at 20:26, Andrew Morton wrote:
> > > Just a little word of caution here. Remember the
> > > apache-flock-synchronisation fiasco, where removal
> > > of the BKL halved Apache throughput on 8-way x86.
> > >
> > > This was because the BKL removal turned serialisation
> > > on a quick codepath from a spinlock into a schedule().
>
> Yes, but the other factor to consider here is why did the extra schedule
> take place at all? I think this is a actually a scheduler issue, and
> I'm hoping that the new scheduler will behave better in this case. A
> call to schedule() should not happen unless the woken process has a
> higher priority than the process that did the unlock, but the old
> scheduler evidently always calculated this to be the case. But we
> really want the process that did the unlock to continue running (until
> the end of its timeslice, if not preempted or blocked before then), just
> as it would when the lock was a spinlock. It would be interesting to
> see whether the new scheduler gets this right.
>
> Am I remembering the problem correctly?
>

I don't think so :)

The problem was that the semaphore was highly contended, so the
losing process was explicitly scheduling away.

This doesn't necessarily mean that it was a long-held lock. In
this case, it was a short-held lock, but it was also very *frequently*
being held and released. This is a scenario where a spinlock is
heaps more appropriate than a semaphore.

I don't think we need any locking at all in the default lseek()
path, btw. Apart from the non-atomic i_size thing, which is
only an issue for 32-bit machines.


-

2002-01-30 04:54:37

by Alexander Viro

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek



On Tue, 29 Jan 2002, Linus Torvalds wrote:

>
> On 29 Jan 2002, Robert Love wrote:
> >
> > This patch pushes the BKL out of llseek() and into the individual llseek
> > methods. For generic_file_llseek, I replaced it with the inode
> > semaphore.
>
> Thinking about that, that actally sounds like the _right_ thing to do even
> from a correctness standpoint - as llseek() looks at the inode size, so we
> should have that lock anyway.
>
> So I'd suggest doing the inode semaphore globally, instead of using
> kernel_lock at all.
>
> Al?

It's OK for regular files and directories, but I'm not sure about devices.

So I'd prefer to do it in two stages - shift BKL into ->llseek() and then
see where it can be dropped/replaced with ->i_sem.

2002-01-30 04:55:27

by Anton Blanchard

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek


> This patch pushes the BKL out of llseek() and into the individual llseek
> methods. For generic_file_llseek, I replaced it with the inode
> semaphore. The lock contention is noticeable even on 2-way systems.
> Since we simply push the BKL further down the call chain (its the llseek
> method's responsibilities now) we aren't doing anything hackish or
> unsafe.

A great idea, I wonder why someone didnt think of it before?

http://banyan.dlut.edu.cn/news/112801/0186.html

Wow someone did! And the patch basically matches yours including whitespace.

Robert _please_ attribute your sources more carefully.

Anton

2002-01-30 04:58:07

by Robert Love

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

I've been informed Anton Blanchard did this same thing back in November:

http://banyan.dlut.edu.cn/news/112801/0186.html

So credit should go to him.

Robert Love

2002-01-30 08:02:13

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

>>>>> " " == Alexander Viro <[email protected]> writes:

> So I'd prefer to do it in two stages - shift BKL into
> ->llseek() and then see where it can be dropped/replaced with
> ->i_sem.

Seconded. There are several filesystems out there for which i_sem does
nothing to protect inode->i_size.

Cheers,
Trond

2002-01-30 09:34:56

by Nigel Gamble

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On Tue, 29 Jan 2002, Andrew Morton wrote:
> Nigel Gamble wrote:
> > Am I remembering the problem correctly?
>
> I don't think so :)
>
> The problem was that the semaphore was highly contended, so the
> losing process was explicitly scheduling away.
>
> This doesn't necessarily mean that it was a long-held lock. In
> this case, it was a short-held lock, but it was also very *frequently*
> being held and released. This is a scenario where a spinlock is
> heaps more appropriate than a semaphore.

Oh, well in that case, I agree that a spinlock is more appropriate.

Nigel Gamble [email protected]
Mountain View, CA, USA. http://www.nrg.org/

2002-01-30 10:37:07

by Russell King

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On Tue, Jan 29, 2002 at 09:20:10PM -0500, Robert Love wrote:
> On Tue, 2002-01-29 at 20:26, Andrew Morton wrote:
> > Just a little word of caution here. Remember the
> > apache-flock-synchronisation fiasco, where removal
> > of the BKL halved Apache throughput on 8-way x86.
> >
> > This was because the BKL removal turned serialisation
> > on a quick codepath from a spinlock into a schedule().
>
> I feared this too, but eventually I decided it was worth it and
> benchmarks backed that up. If nothing else this is yet-another-excuse
> for locks that can spin-then-sleep.
>
> I posted dbench results, which show a positive gain even on 2-way for
> multiple client loads.

Luckily, apache on its own doesn't seem to use lseek() when sending the
file - it seems to be an open, mmap, write. (apache 1.3.22)

However, php with apache does do an lseek on the target script. Now
remember the /. effect... You'll be accessing the same file from
several apache or php processes.

--
Russell King ([email protected]) The developer of ARM Linux
http://www.arm.linux.org.uk/personal/aboutme.html

2002-01-30 13:33:52

by Robert Love

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On Wed, 2002-01-30 at 03:00, Trond Myklebust wrote:

>> " " == Alexander Viro <[email protected]> writes:
>
> > So I'd prefer to do it in two stages - shift BKL into
> > ->llseek() and then see where it can be dropped/replaced with
> > ->i_sem.
>
> Seconded. There are several filesystems out there for which i_sem does
> nothing to protect inode->i_size.

Then the first patch is the way to go. We know generic_file_llseek is
safe and perhaps a few other of the llseek methods. The remaining can
explicitly grab the bkl on their own, as Al suggested.

Robert Love

2002-01-30 21:15:58

by Martin Wirth

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

Hi,

This is just a general idea I had a few month ago and might be of some
value
for the replacement of BKL or longheld spinlocks in the future 2.5
developement.

While writing some device driver for a real-time data acquisition I had
a
similar problem. I had to protect some driver data structure that is
heavily accessed from multiple processes for merely reading a few
variables
consistently. But from time to time there are bigger tasks to be done,
where
holding a spinlock is not appropriate.

So I used a combination of a spinlock and a semaphore. You can lock this
combilock for short-term issues in a spin-lock mode:

combi_spin_lock(struct combilock *x)
combi_spin_unlock(struct combilock *x)

and for longer lasting tasks in a semaphore mode by:

combi_mutex_lock(struct combilock *x)
combi_mutex_unlock(struct combilock *x)

If a spin-lock request is blocked by a mutex-lock, the spin-lock
attempt also sleeps i.e. behaves like a semaphore.

This approach is less automatic than a first_spin_then_sleep mutex,
but normally the programmer knows better if he is going to do quick
things, or
maybe maybe unbounded stuff.

Note: For a preemtible kernel this approach could lead to much less
scheduling ping-pong also for UP if a spinlock is replaced by a
combilock
instead of a semaphore.


The code is quite simple and borrowed a bit from the completion handler
stuff
in sched.c. (Of course the owner could be a simple flag, but I had some
later
extension to a priority inheritance scheme in mind).

struct combilock {
wait_queue_head_t wait;
task_t *owner;
};


void combi_spin_lock(struct combilock *x)
{
spin_lock(&x->wait.lock);
if (x->owner) {
DECLARE_WAITQUEUE(wait, current);
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&x->wait.lock);
schedule();
spin_lock(&x->wait.lock);
} while (x->owner);
__remove_wait_queue(&x->wait, &wait);
}
}


void combi_spin_unlock(struct combilock *x)
{
spin_unlock(&x->wait.lock);
}


void combi_mutex_lock(struct combilock *x)
{
spin_lock(&x->wait.lock);
if (x->owner) {
DECLARE_WAITQUEUE(wait, current);
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&x->wait.lock);
schedule();
spin_lock(&x->wait.lock);
} while (x->owner);
__remove_wait_queue(&x->wait, &wait);
} else
x->owner=current;
spin_unlock(&x->wait.lock);
}


void combi_mutex_unlock(struct combilock *x)
{
spin_lock(&x->wait.lock);
x->owner=NULL;
__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1, 0);
spin_unlock(&x->wait.lock);
}


Martin Wirth

2002-01-31 15:39:57

by Martin Wirth

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On 30 Jan 2002, Martin Wirth wrote:
>
>void combi_mutex_lock(struct combilock *x)
.....
> } else <---
> x->owner=current;
> spin_unlock(&x->wait.lock);

Uugh, the else is wrong of course. The owner has to be set in any
case.(Just deleted some debugging code and reformatted a bit to quick
:))

A further note: Although the combilock shares some advantages with a
spin-lock (no unnecessary scheduling for short time locking) it may
behave like a semaphore on entry also if you call combi_spin_lock.
For example

spin_lock(&slock);
combi_spin_lock(&clock);

is a BUG because combi_spin_lock may sleep while holding slock!

Would be nice if there were some comments.

Martin Wirth

2002-01-31 21:07:22

by Nigel Gamble

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

On Thu, 31 Jan 2002, Martin Wirth wrote:
> A further note: Although the combilock shares some advantages with a
> spin-lock (no unnecessary scheduling for short time locking) it may
> behave like a semaphore on entry also if you call combi_spin_lock.
> For example
>
> spin_lock(&slock);
> combi_spin_lock(&clock);
>
> is a BUG because combi_spin_lock may sleep while holding slock!
>
> Would be nice if there were some comments.

Nice work! This could turn out to be a useful tool for those of us
working on reliable low-latency kernels. I certainly agree that it is a
much better solution than adaptive spinlocks (which dynamically decide
when to sleep) as the kernel programmer should always know whether a
spinlock or a sleep lock is more appropriate.

Unfortunately, as you point out, it's not as useful as it may first
appear in the short term, because last time I looked into the problem of
long-held spinlocks they were all nested under other spinlocks and/or
the BKL.

Nigel Gamble [email protected]
Mountain View, CA, USA. http://www.nrg.org/

2002-02-01 19:29:21

by John Hawkes

[permalink] [raw]
Subject: Re: [PATCH] 2.5: push BKL out of llseek

From: "Dave Jones" <[email protected]>
> did you benchmark with anything other than dbench ?

I've done substantial AIM7 benchmarking on a 28p ia64 NUMA system, and
llseek's BKL usage is a significant contributor to poor scaling. For
500 AIM7 "tasks" and ext2 filesystems, waiting on the BKL consumes about
half of the available CPU cycles, and sys_lseek()'s usage is the most
significant cycle waster, followed by ext2_get_block() and
ext2_write_inode(). Anton's llseek patch from last November does make
a measurable improvement in AIM7 throughput.

--
John Hawkes
[email protected]