LinuxLists.cc - [PATCH 1/2] [RFC] block: replace BKL with global mutex

2010-04-14 20:36:56

Subject: [PATCH 1/2] [RFC] block: replace BKL with global mutex

The block layer is one of the remaining users
of the Big Kernel Lock. In there, it is used
mainly for serializing the blkdev_get and
blkdev_put functions and some ioctl implementations
as well as some less common open functions of
related character devices following a previous
pushdown and parts of the blktrace code.

The only one that seems to be a bit nasty is the
blkdev_get function which is actually recursive
and may try to get the BKL twice.

All users except the one in blkdev_get seem to
be outermost locks, meaning we don't rely on
the release-on-sleep semantics to avoid deadlocks.

The ctl_mutex (pktcdvd.ko), raw_mutex (raw.ko),
state_mutex (dasd.ko), reconfig_mutex (md.ko),
and jfs_log_mutex (jfs.ko) may be held when
blkdev_get is called, but as far as I can tell,
these mutexes are never acquired from any of the
functions that get converted in this patch.

In order to get rid of the BKL, this introduces
a new global mutex called blkdev_mutex, which
replaces the BKL in all drivers that directly
interact with the block layer. In case of blkdev_get,
the mutex is moved outside of the function itself
in order to avoid the recursive taking of blkdev_mutex.

Testing so far has shown no problems whatsoever
from this patch, but the usage in blkdev_get
may introduce extra latencies, and I may have
missed corner cases where a block device ioctl
function sleeps for a significant amount of time,
which may be harmful to the performance of other
threads.

Signed-off-by: Arnd Bergmann <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Vivek Goyal <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: FUJITA Tomonori <[email protected]>
Cc: "Martin K. Petersen" <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: John Kacur <[email protected]>
Cc: [email protected]
Cc: [email protected]
---
block/bsg.c | 2 --
block/compat_ioctl.c | 8 ++++----
block/ioctl.c | 24 ++++++++++++------------
drivers/block/DAC960.c | 4 ++--
drivers/block/cciss.c | 4 ++--
drivers/block/paride/pg.c | 4 ++--
drivers/block/paride/pt.c | 16 ++++++++--------
drivers/scsi/sg.c | 22 ++++++++++++++++------
fs/block_dev.c | 20 +++++++++++++-------
include/linux/blkdev.h | 6 ++++++
kernel/trace/blktrace.c | 14 ++++++++------
11 files changed, 73 insertions(+), 51 deletions(-)

diff --git a/block/bsg.c b/block/bsg.c
index 82d5882..7c50a26 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -843,9 +843,7 @@ static int bsg_open(struct inode *inode, struct file *file)
{
struct bsg_device *bd;

- lock_kernel();
bd = bsg_get_device(inode, file);
- unlock_kernel();

if (IS_ERR(bd))
return PTR_ERR(bd);
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f26051f..11cfd3c 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -802,16 +802,16 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return compat_put_u64(arg, bdev->bd_inode->i_size);

case BLKTRACESETUP32:
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
ret = compat_blk_trace_setup(bdev, compat_ptr(arg));
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return ret;
case BLKTRACESTART: /* compatible */
case BLKTRACESTOP: /* compatible */
case BLKTRACETEARDOWN: /* compatible */
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg));
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return ret;
default:
if (disk->fops->compat_ioctl)
diff --git a/block/ioctl.c b/block/ioctl.c
index 8905d2a..272cdce 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -169,9 +169,9 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
return disk->fops->ioctl(bdev, mode, cmd, arg);

if (disk->fops->locked_ioctl) {
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
ret = disk->fops->locked_ioctl(bdev, mode, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return ret;
}

@@ -206,10 +206,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
if (ret != -EINVAL && ret != -ENOTTY)
return ret;

- lock_kernel();
+ mutex_lock(&blkdev_mutex);
fsync_bdev(bdev);
invalidate_bdev(bdev);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return 0;

case BLKROSET:
@@ -221,9 +221,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return -EACCES;
if (get_user(n, (int __user *)(arg)))
return -EFAULT;
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
set_device_ro(bdev, n);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return 0;

case BLKDISCARD: {
@@ -309,14 +309,14 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
bd_release(bdev);
return ret;
case BLKPG:
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
break;
case BLKRRPART:
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
ret = blkdev_reread_part(bdev);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
break;
case BLKGETSIZE:
size = bdev->bd_inode->i_size;
@@ -329,9 +329,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESTOP:
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
break;
default:
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index c5f22bb..8faaa12 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -6620,7 +6620,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
long ErrorCode = 0;
if (!capable(CAP_SYS_ADMIN)) return -EACCES;

- lock_kernel();
+ mutex_lock(&blkdev_mutex);
switch (Request)
{
case DAC960_IOCTL_GET_CONTROLLER_COUNT:
@@ -7051,7 +7051,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
default:
ErrorCode = -ENOTTY;
}
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return ErrorCode;
}

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index eb5ff05..0e21a9d 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1027,9 +1027,9 @@ static int do_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
int ret;
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
ret = cciss_ioctl(bdev, mode, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return ret;
}

diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index c397b3d..81660d0 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -518,7 +518,7 @@ static int pg_open(struct inode *inode, struct file *file)
struct pg *dev = &devices[unit];
int ret = 0;

- lock_kernel();
+ mutex_lock(&blkdev_mutex);
if ((unit >= PG_UNITS) || (!dev->present)) {
ret = -ENODEV;
goto out;
@@ -547,7 +547,7 @@ static int pg_open(struct inode *inode, struct file *file)
file->private_data = dev;

out:
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return ret;
}

diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index bc5825f..05e272e 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -650,9 +650,9 @@ static int pt_open(struct inode *inode, struct file *file)
struct pt_unit *tape = pt + unit;
int err;

- lock_kernel();
+ mutex_lock(&blkdev_mutex);
if (unit >= PT_UNITS || (!tape->present)) {
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return -ENODEV;
}

@@ -681,12 +681,12 @@ static int pt_open(struct inode *inode, struct file *file)
}

file->private_data = tape;
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return 0;

out:
atomic_inc(&tape->available);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return err;
}

@@ -704,15 +704,15 @@ static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
switch (mtop.mt_op) {

case MTREW:
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
pt_rewind(tape);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return 0;

case MTWEOF:
- lock_kernel();
+ mutex_lock(&blkdev_mutex);
pt_write_fm(tape);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return 0;

default:
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index dee1c96..ec5b43e 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -229,7 +229,7 @@ sg_open(struct inode *inode, struct file *filp)
int res;
int retval;

- lock_kernel();
+ mutex_lock(&blkdev_mutex);
nonseekable_open(inode, filp);
SCSI_LOG_TIMEOUT(3, printk("sg_open: dev=%d, flags=0x%x\n", dev, flags));
sdp = sg_get_dev(dev);
@@ -307,7 +307,7 @@ error_out:
sg_put:
if (sdp)
sg_put_dev(sdp);
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return retval;
}

@@ -757,9 +757,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
return 0;
}

-static int
-sg_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd_in, unsigned long arg)
+static long sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
{
void __user *p = (void __user *)arg;
int __user *ip = p;
@@ -1078,6 +1076,17 @@ sg_ioctl(struct inode *inode, struct file *filp,
}
}

+static long sg_unlocked_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
+{
+ int ret;
+
+ mutex_lock(&blkdev_mutex);
+ ret = sg_ioctl(filp, cmd_in, arg);
+ mutex_unlock(&blkdev_mutex);
+
+ return ret;
+}
+
#ifdef CONFIG_COMPAT
static long sg_compat_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
{
@@ -1322,7 +1331,8 @@ static const struct file_operations sg_fops = {
.read = sg_read,
.write = sg_write,
.poll = sg_poll,
- .ioctl = sg_ioctl,
+ .llseek = generic_file_llseek,
+ .unlocked_ioctl = sg_unlocked_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = sg_compat_ioctl,
#endif
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2a6d019..f43d24f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,9 @@
#include <asm/uaccess.h>
#include "internal.h"

+DEFINE_MUTEX(blkdev_mutex);
+EXPORT_SYMBOL_GPL(blkdev_mutex);
+
struct bdev_inode {
struct block_device bdev;
struct inode vfs_inode;
@@ -1191,7 +1194,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
return ret;
}

- lock_kernel();
restart:

ret = -ENXIO;
@@ -1277,7 +1279,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (for_part)
bdev->bd_part_count++;
mutex_unlock(&bdev->bd_mutex);
- unlock_kernel();
return 0;

out_clear:
@@ -1291,7 +1292,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
out_unlock_kernel:
- unlock_kernel();

if (disk)
module_put(disk->fops->owner);
@@ -1303,7 +1303,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)

int blkdev_get(struct block_device *bdev, fmode_t mode)
{
- return __blkdev_get(bdev, mode, 0);
+ int ret;
+ mutex_lock(&blkdev_mutex);
+ ret = __blkdev_get(bdev, mode, 0);
+ mutex_unlock(&blkdev_mutex);
+ return ret;
}
EXPORT_SYMBOL(blkdev_get);

@@ -1357,7 +1361,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
struct block_device *victim = NULL;

mutex_lock_nested(&bdev->bd_mutex, for_part);
- lock_kernel();
if (for_part)
bdev->bd_part_count--;

@@ -1382,7 +1385,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
victim = bdev->bd_contains;
bdev->bd_contains = NULL;
}
- unlock_kernel();
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
if (victim)
@@ -1392,7 +1394,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)

int blkdev_put(struct block_device *bdev, fmode_t mode)
{
- return __blkdev_put(bdev, mode, 0);
+ int ret;
+ mutex_lock(&blkdev_mutex);
+ ret = __blkdev_put(bdev, mode, 0);
+ mutex_unlock(&blkdev_mutex);
+ return ret;
}
EXPORT_SYMBOL(blkdev_put);

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6690e8b..7810a2d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1292,6 +1292,12 @@ struct block_device_operations {

extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
unsigned long);
+
+/*
+ * replaces the big kernel lock in the block subsystem
+ */
+extern struct mutex blkdev_mutex;
+
#else /* CONFIG_BLOCK */
/*
* stubs for when the block layer is configured out
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3bc91a..5012fe1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -305,7 +305,7 @@ static int blk_dropped_open(struct inode *inode, struct file *filp)
{
filp->private_data = inode->i_private;

- return 0;
+ return nonseekable_open(inode, filp);
}

static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -323,13 +323,14 @@ static const struct file_operations blk_dropped_fops = {
.owner = THIS_MODULE,
.open = blk_dropped_open,
.read = blk_dropped_read,
+ .llseek = no_llseek,
};

static int blk_msg_open(struct inode *inode, struct file *filp)
{
filp->private_data = inode->i_private;

- return 0;
+ return nonseekable_open(inode, filp);
}

static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
@@ -362,6 +363,7 @@ static const struct file_operations blk_msg_fops = {
.owner = THIS_MODULE,
.open = blk_msg_open,
.write = blk_msg_write,
+ .llseek = no_llseek,
};

/*
@@ -1581,7 +1583,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct block_device *bdev;
ssize_t ret = -ENXIO;

- lock_kernel();
+ mutex_lock(&blkdev_mutex);
bdev = bdget(part_devt(p));
if (bdev == NULL)
goto out_unlock_kernel;
@@ -1613,7 +1615,7 @@ out_unlock_bdev:
out_bdput:
bdput(bdev);
out_unlock_kernel:
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
return ret;
}

@@ -1643,7 +1645,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,

ret = -ENXIO;

- lock_kernel();
+ mutex_lock(&blkdev_mutex);
p = dev_to_part(dev);
bdev = bdget(part_devt(p));
if (bdev == NULL)
@@ -1683,7 +1685,7 @@ out_unlock_bdev:
out_bdput:
bdput(bdev);
out_unlock_kernel:
- unlock_kernel();
+ mutex_unlock(&blkdev_mutex);
out:
return ret ? ret : count;
}
--
1.7.0.4

2010-04-14 20:37:18

by Arnd Bergmann

[permalink] [raw]

Subject: [PATCH 2/2] [RFC] Remove BKL from fs/locks.c

From: Matthew Wilcox <[email protected]>

I've taken a patch originally written by Matthew Wilcox and
ported it to the current version. It seems that there were
originally concerns that this breaks NFS, but since Trond
has recently removed the BKL from NFS, my naive assumption
would be that it's all good now, despite not having tried to
understand what it does.

Original introduction from Willy:

I've been promising to do this for about seven years now.

It seems to work well enough, but I haven't run any serious stress
tests on it. This implementation uses one spinlock to protect both lock
lists and all the i_flock chains. It doesn't seem worth splitting up
the locking any further.

I had to move one memory allocation out from under the file_lock_lock.
I hope I got that logic right. I'm rather tempted to split out the
find_conflict algorithm from that function into something that can be
called separately for the FL_ACCESS case.

I also have to drop and reacquire the file_lock_lock around the call
to cond_resched(). This was done automatically for us before by the
special BKL semantics.

I had to change vfs_setlease() as it relied on the special BKL ability
to recursively acquire the same lock. The internal caller now calls
__vfs_setlease and the exported interface acquires and releases the
file_lock_lock around calling __vfs_setlease.

I should probably split out the removal of interruptible_sleep_on_locked()
as it's basically unrelated to all this.

Signed-off-by: Arnd Bergmann <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Trond Myklebust <[email protected]>
Cc: "J. Bruce Fields" <[email protected]>
Cc: Miklos Szeredi <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: John Kacur <[email protected]>
Cc: [email protected]
Cc: [email protected]
---
fs/locks.c | 110 ++++++++++++++++++++++++++++++++++++------------------------
1 files changed, 66 insertions(+), 44 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index ab24d49..87f1c60 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -140,9 +140,23 @@ int lease_break_time = 45;
#define for_each_lock(inode, lockp) \
for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)

+/*
+ * Protects the two list heads below, plus the inode->i_flock list
+ */
+static DEFINE_SPINLOCK(file_lock_lock);
static LIST_HEAD(file_lock_list);
static LIST_HEAD(blocked_list);

+static inline void lock_flocks(void)
+{
+ spin_lock(&file_lock_lock);
+}
+
+static inline void unlock_flocks(void)
+{
+ spin_unlock(&file_lock_lock);
+}
+
static struct kmem_cache *filelock_cache __read_mostly;

/* Allocate an empty lock structure. */
@@ -511,9 +525,9 @@ static void __locks_delete_block(struct file_lock *waiter)
*/
static void locks_delete_block(struct file_lock *waiter)
{
- lock_kernel();
+ lock_flocks();
__locks_delete_block(waiter);
- unlock_kernel();
+ unlock_flocks();
}

/* Insert waiter into blocker's block list.
@@ -644,7 +658,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
{
struct file_lock *cfl;

- lock_kernel();
+ lock_flocks();
for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
if (!IS_POSIX(cfl))
continue;
@@ -657,7 +671,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
fl->fl_pid = pid_vnr(cfl->fl_nspid);
} else
fl->fl_type = F_UNLCK;
- unlock_kernel();
+ unlock_flocks();
return;
}
EXPORT_SYMBOL(posix_test_lock);
@@ -730,18 +744,16 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
int error = 0;
int found = 0;

- lock_kernel();
- if (request->fl_flags & FL_ACCESS)
- goto find_conflict;
-
- if (request->fl_type != F_UNLCK) {
- error = -ENOMEM;
+ if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
new_fl = locks_alloc_lock();
- if (new_fl == NULL)
- goto out;
- error = 0;
+ if (!new_fl)
+ return -ENOMEM;
}

+ lock_flocks();
+ if (request->fl_flags & FL_ACCESS)
+ goto find_conflict;
+
for_each_lock(inode, before) {
struct file_lock *fl = *before;
if (IS_POSIX(fl))
@@ -767,8 +779,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
* If a higher-priority process was blocked on the old file lock,
* give it the opportunity to lock the file.
*/
- if (found)
+ if (found) {
+ unlock_flocks();
cond_resched();
+ lock_flocks();
+ }

find_conflict:
for_each_lock(inode, before) {
@@ -794,7 +809,7 @@ find_conflict:
error = 0;

out:
- unlock_kernel();
+ unlock_flocks();
if (new_fl)
locks_free_lock(new_fl);
return error;
@@ -823,7 +838,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
new_fl2 = locks_alloc_lock();
}

- lock_kernel();
+ lock_flocks();
if (request->fl_type != F_UNLCK) {
for_each_lock(inode, before) {
fl = *before;
@@ -991,7 +1006,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
locks_wake_up_blocks(left);
}
out:
- unlock_kernel();
+ unlock_flocks();
/*
* Free any unused locks.
*/
@@ -1066,14 +1081,14 @@ int locks_mandatory_locked(struct inode *inode)
/*
* Search the lock list for this inode for any POSIX locks.
*/
- lock_kernel();
+ lock_flocks();
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (!IS_POSIX(fl))
continue;
if (fl->fl_owner != owner)
break;
}
- unlock_kernel();
+ unlock_flocks();
return fl ? -EAGAIN : 0;
}

@@ -1186,7 +1201,7 @@ int __break_lease(struct inode *inode, unsigned int mode)

new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);

- lock_kernel();
+ lock_flocks();

time_out_leases(inode);

@@ -1247,8 +1262,10 @@ restart:
break_time++;
}
locks_insert_block(flock, new_fl);
+ unlock_flocks();
error = wait_event_interruptible_timeout(new_fl->fl_wait,
!new_fl->fl_next, break_time);
+ lock_flocks();
__locks_delete_block(new_fl);
if (error >= 0) {
if (error == 0)
@@ -1263,7 +1280,7 @@ restart:
}

out:
- unlock_kernel();
+ unlock_flocks();
if (!IS_ERR(new_fl))
locks_free_lock(new_fl);
return error;
@@ -1319,7 +1336,7 @@ int fcntl_getlease(struct file *filp)
struct file_lock *fl;
int type = F_UNLCK;

- lock_kernel();
+ lock_flocks();
time_out_leases(filp->f_path.dentry->d_inode);
for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
fl = fl->fl_next) {
@@ -1328,7 +1345,7 @@ int fcntl_getlease(struct file *filp)
break;
}
}
- unlock_kernel();
+ unlock_flocks();
return type;
}

@@ -1341,7 +1358,7 @@ int fcntl_getlease(struct file *filp)
* The (input) flp->fl_lmops->fl_break function is required
* by break_lease().
*
- * Called with kernel lock held.
+ * Called with file_lock_lock held.
*/
int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
{
@@ -1436,7 +1453,15 @@ out:
}
EXPORT_SYMBOL(generic_setlease);

- /**
+static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
+{
+ if (filp->f_op && filp->f_op->setlease)
+ return filp->f_op->setlease(filp, arg, lease);
+ else
+ return generic_setlease(filp, arg, lease);
+}
+
+/**
* vfs_setlease - sets a lease on an open file
* @filp: file pointer
* @arg: type of lease to obtain
@@ -1467,12 +1492,9 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
{
int error;

- lock_kernel();
- if (filp->f_op && filp->f_op->setlease)
- error = filp->f_op->setlease(filp, arg, lease);
- else
- error = generic_setlease(filp, arg, lease);
- unlock_kernel();
+ lock_flocks();
+ error = __vfs_setlease(filp, arg, lease);
+ unlock_flocks();

return error;
}
@@ -1499,9 +1521,9 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
if (error)
return error;

- lock_kernel();
+ lock_flocks();

- error = vfs_setlease(filp, arg, &flp);
+ error = __vfs_setlease(filp, arg, &flp);
if (error || arg == F_UNLCK)
goto out_unlock;

@@ -1516,7 +1538,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)

error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
out_unlock:
- unlock_kernel();
+ unlock_flocks();
return error;
}

@@ -2020,7 +2042,7 @@ void locks_remove_flock(struct file *filp)
fl.fl_ops->fl_release_private(&fl);
}

- lock_kernel();
+ lock_flocks();
before = &inode->i_flock;

while ((fl = *before) != NULL) {
@@ -2038,7 +2060,7 @@ void locks_remove_flock(struct file *filp)
}
before = &fl->fl_next;
}
- unlock_kernel();
+ unlock_flocks();
}

/**
@@ -2053,12 +2075,12 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
{
int status = 0;

- lock_kernel();
+ lock_flocks();
if (waiter->fl_next)
__locks_delete_block(waiter);
else
status = -ENOENT;
- unlock_kernel();
+ unlock_flocks();
return status;
}

@@ -2172,7 +2194,7 @@ static int locks_show(struct seq_file *f, void *v)

static void *locks_start(struct seq_file *f, loff_t *pos)
{
- lock_kernel();
+ lock_flocks();
f->private = (void *)1;
return seq_list_start(&file_lock_list, *pos);
}
@@ -2184,7 +2206,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)

static void locks_stop(struct seq_file *f, void *v)
{
- unlock_kernel();
+ unlock_flocks();
}

static const struct seq_operations locks_seq_operations = {
@@ -2231,7 +2253,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
{
struct file_lock *fl;
int result = 1;
- lock_kernel();
+ lock_flocks();
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (IS_POSIX(fl)) {
if (fl->fl_type == F_RDLCK)
@@ -2248,7 +2270,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
result = 0;
break;
}
- unlock_kernel();
+ unlock_flocks();
return result;
}

@@ -2271,7 +2293,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
{
struct file_lock *fl;
int result = 1;
- lock_kernel();
+ lock_flocks();
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (IS_POSIX(fl)) {
if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2286,7 +2308,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
result = 0;
break;
}
- unlock_kernel();
+ unlock_flocks();
return result;
}

--
1.7.0.4

2010-04-14 20:52:29

by Trond Myklebust

[permalink] [raw]

Subject: Re: [PATCH 2/2] [RFC] Remove BKL from fs/locks.c

On Wed, 2010-04-14 at 22:36 +0200, Arnd Bergmann wrote:
> From: Matthew Wilcox <[email protected]>
>
> I've taken a patch originally written by Matthew Wilcox and
> ported it to the current version. It seems that there were
> originally concerns that this breaks NFS, but since Trond
> has recently removed the BKL from NFS, my naive assumption
> would be that it's all good now, despite not having tried to
> understand what it does.

Hi Arnd,

We still need to fix up the bits in NFS that dereference inode->i_flock.
On the client side, those are mainly the bits that deal with lock
recovery when the NFS server has rebooted or restarted.

AFAICS, there are two places in the NFSv4 client that need to be changed
to call lock_flocks(): nfs_delegation_claim_locks(), and
nfs4_reclaim_locks(). In both cases, the replacement is trivial.

For NFSv3, I think we are already safe, since AFAICS the host->h_rwsem
already provides exclusion between file locking and lock recovery
attempts. I think we should therefore be able to immediately remove the
BKL in fs/lockd/clntlock.c:reclaimer().

I'm not as sure about how sensitive the NFS server is to the switch from
BKL -> lock_flocks(). Perhaps Bruce can comment...

Cheers
Trond

2010-04-14 21:04:17

by J. Bruce Fields

[permalink] [raw]

Subject: Re: [PATCH 2/2] [RFC] Remove BKL from fs/locks.c

On Wed, Apr 14, 2010 at 04:52:14PM -0400, Trond Myklebust wrote:
> On Wed, 2010-04-14 at 22:36 +0200, Arnd Bergmann wrote:
> > From: Matthew Wilcox <[email protected]>
> >
> > I've taken a patch originally written by Matthew Wilcox and
> > ported it to the current version. It seems that there were
> > originally concerns that this breaks NFS, but since Trond
> > has recently removed the BKL from NFS, my naive assumption
> > would be that it's all good now, despite not having tried to
> > understand what it does.
>
> Hi Arnd,
>
> We still need to fix up the bits in NFS that dereference inode->i_flock.
> On the client side, those are mainly the bits that deal with lock
> recovery when the NFS server has rebooted or restarted.
>
> AFAICS, there are two places in the NFSv4 client that need to be changed
> to call lock_flocks(): nfs_delegation_claim_locks(), and
> nfs4_reclaim_locks(). In both cases, the replacement is trivial.
>
> For NFSv3, I think we are already safe, since AFAICS the host->h_rwsem
> already provides exclusion between file locking and lock recovery
> attempts. I think we should therefore be able to immediately remove the
> BKL in fs/lockd/clntlock.c:reclaimer().
>
> I'm not as sure about how sensitive the NFS server is to the switch from
> BKL -> lock_flocks(). Perhaps Bruce can comment...

There's a callback from the lease code, the implementation of which
sleeps in the NFSv4 server case. I've got patches for the next merge
window which remove that sleep (by just deferring work to a workqueue).

release_lockowner() traverses the lock list. I need to fix that.

The lockd thread is entirely under the BKL--it takes it at the top of
fs/lockd/svc.c:lockd(), and drops it at the bottom. It's possible
there may be code that lockd runs that assumes mutual exclusion with
code in locks.c, but I don't know.

--b.

2010-04-14 22:48:30

by Douglas Gilbert

[permalink] [raw]

Subject: Re: [PATCH 1/2] [RFC] block: replace BKL with global mutex

Arnd Bergmann wrote:
> The block layer is one of the remaining users
> of the Big Kernel Lock. In there, it is used
> mainly for serializing the blkdev_get and
> blkdev_put functions and some ioctl implementations
> as well as some less common open functions of
> related character devices following a previous
> pushdown and parts of the blktrace code.
>
> The only one that seems to be a bit nasty is the
> blkdev_get function which is actually recursive
> and may try to get the BKL twice.
>
> All users except the one in blkdev_get seem to
> be outermost locks, meaning we don't rely on
> the release-on-sleep semantics to avoid deadlocks.
>
> The ctl_mutex (pktcdvd.ko), raw_mutex (raw.ko),
> state_mutex (dasd.ko), reconfig_mutex (md.ko),
> and jfs_log_mutex (jfs.ko) may be held when
> blkdev_get is called, but as far as I can tell,
> these mutexes are never acquired from any of the
> functions that get converted in this patch.
>
> In order to get rid of the BKL, this introduces
> a new global mutex called blkdev_mutex, which
> replaces the BKL in all drivers that directly
> interact with the block layer. In case of blkdev_get,
> the mutex is moved outside of the function itself
> in order to avoid the recursive taking of blkdev_mutex.
>
> Testing so far has shown no problems whatsoever
> from this patch, but the usage in blkdev_get
> may introduce extra latencies, and I may have
> missed corner cases where an block device ioctl
> function sleeps for a significant amount of time,
> which may be harmful to the performance of other
> threads.

<snip>

> diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
> index dee1c96..ec5b43e 100644
> --- a/drivers/scsi/sg.c
> +++ b/drivers/scsi/sg.c
> @@ -229,7 +229,7 @@ sg_open(struct inode *inode, struct file *filp)
> int res;
> int retval;
>
> - lock_kernel();
> + mutex_lock(&blkdev_mutex);
> nonseekable_open(inode, filp);
> SCSI_LOG_TIMEOUT(3, printk("sg_open: dev=%d, flags=0x%x\n", dev, flags));
> sdp = sg_get_dev(dev);
> @@ -307,7 +307,7 @@ error_out:
> sg_put:
> if (sdp)
> sg_put_dev(sdp);
> - unlock_kernel();
> + mutex_unlock(&blkdev_mutex);
> return retval;
> }
>
> @@ -757,9 +757,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
> return 0;
> }
>
> -static int
> -sg_ioctl(struct inode *inode, struct file *filp,
> - unsigned int cmd_in, unsigned long arg)
> +static long sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
> {
> void __user *p = (void __user *)arg;
> int __user *ip = p;
> @@ -1078,6 +1076,17 @@ sg_ioctl(struct inode *inode, struct file *filp,
> }
> }
>
> +static long sg_unlocked_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
> +{
> + int ret;
> +
> + mutex_lock(&blkdev_mutex);
> + ret = sg_ioctl(filp, cmd_in, arg);
> + mutex_unlock(&blkdev_mutex);
> +
> + return ret;
> +}
> +
> #ifdef CONFIG_COMPAT
> static long sg_compat_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
> {
> @@ -1322,7 +1331,8 @@ static const struct file_operations sg_fops = {
> .read = sg_read,
> .write = sg_write,
> .poll = sg_poll,
> - .ioctl = sg_ioctl,
> + .llseek = generic_file_llseek,

The sg driver has no seek semantics on its read() and
write() calls. And sg_open() calls nonseekable_open(). So
.llseek = no_llseek,
seems more appropriate.

> + .unlocked_ioctl = sg_unlocked_ioctl,
> #ifdef CONFIG_COMPAT
> .compat_ioctl = sg_compat_ioctl,
> #endif

And I just checked st.c (SCSI tape driver) and it calls
lock_kernel().

Doug Gilbert

2010-04-15 05:16:57

by Brad Boyer

[permalink] [raw]

Subject: Re: [PATCH 2/2] [RFC] Remove BKL from fs/locks.c

On Wed, Apr 14, 2010 at 10:36:24PM +0200, Arnd Bergmann wrote:
> From: Matthew Wilcox <[email protected]>
>
> I've taken a patch originally written by Matthew Wilcox and
> ported it to the current version. It seems that there were
> originally concerns that this breaks NFS, but since Trond
> has recently removed the BKL from NFS, my naive assumption
> would be that it's all good now, despite not having tried to
> understand what it does.
>
> Original introduction from Willy:
>
> I've been promising to do this for about seven years now.
>
> It seems to work well enough, but I haven't run any serious stress
> tests on it. This implementation uses one spinlock to protect both lock
> lists and all the i_flock chains. It doesn't seem worth splitting up
> the locking any further.
>
> I had to move one memory allocation out from under the file_lock_lock.
> I hope I got that logic right. I'm rather tempted to split out the
> find_conflict algorithm from that function into something that can be
> called separately for the FL_ACCESS case.
>
> I also have to drop and reacquire the file_lock_lock around the call
> to cond_resched(). This was done automatically for us before by the
> special BKL semantics.
>
> I had to change vfs_setlease() as it relied on the special BKL ability
> to recursively acquire the same lock. The internal caller now calls
> __vfs_setlease and the exported interface acquires and releases the
> file_lock_lock around calling __vfs_setlease.
>
> I should probably split out the removal of interruptible_sleep_on_locked()
> as it's basically unrelated to all this.

Don't we need to have access to this new lock from modules? In particular,
nfsd/lockd currently call lock_kernel to be able to safely access i_flock.
This would seem to imply that they would either need new functions inside
locks.c to do the same work or export the new lock functions. It seems
easiest to just do EXPORT_SYMBOL on the new lock/unlock functions added
in this patch, but I do understand if that isn't desirable.

Brad Boyer
[email protected]

2010-04-15 07:12:28

by Arnd Bergmann

[permalink] [raw]

Subject: Re: [PATCH 1/2] [RFC] block: replace BKL with global mutex

On Thursday 15 April 2010 00:48:19 Douglas Gilbert wrote:

> > @@ -1322,7 +1331,8 @@ static const struct file_operations sg_fops = {
> > .read = sg_read,
> > .write = sg_write,
> > .poll = sg_poll,
> > - .ioctl = sg_ioctl,
> > + .llseek = generic_file_llseek,
>
> The sg driver has no seek semantics on its read() and
> write() calls. And sg_open() calls nonseekable_open(). So
> .llseek = no_llseek,
> seems more appropriate.

Ok, I missed the nonseekable_open here and assumed someone
might be calling seek on it. I'll use no_llseek then, or
just leave it alone.

> > + .unlocked_ioctl = sg_unlocked_ioctl,
> > #ifdef CONFIG_COMPAT
> > .compat_ioctl = sg_compat_ioctl,
> > #endif
>
> And I just checked st.c (SCSI tape driver) and it calls
> lock_kernel() .

Ah, good point. So even if the st driver does not need
any locking against the block layer, it might need to
lock its ioctl against sg.

The simplest solution for this would be to let sg
take both blkdev_mutex and the BKL, which of course
feels like a step backwards.

A better way is to get rid of the BKL in sg, which requires
a better understanding of what it's actually protecting.
It only gets it in the open and ioctl functions, which is a
result of the pushdown from the respective file operations.
Chances are that it's not needed at all, but that's really
hard to tell. Can you shed some more light on this?

Arnd

2010-04-15 13:15:59

by Douglas Gilbert

[permalink] [raw]

Subject: Re: [PATCH 1/2] [RFC] block: replace BKL with global mutex

Arnd Bergmann wrote:
> On Thursday 15 April 2010 00:48:19 Douglas Gilbert wrote:
>
>>> @@ -1322,7 +1331,8 @@ static const struct file_operations sg_fops = {
>>> .read = sg_read,
>>> .write = sg_write,
>>> .poll = sg_poll,
>>> - .ioctl = sg_ioctl,
>>> + .llseek = generic_file_llseek,
>> The sg driver has no seek semantics on its read() and
>> write() calls. And sg_open() calls nonseekable_open(). So
>> .llseek = no_llseek,
>> seems more appropriate.
>
> Ok, I missed the nonseekable_open here and assumed someone
> might be calling seek on it. I'll use no_llseek then, or
> just leave it alone.
>
>>> + .unlocked_ioctl = sg_unlocked_ioctl,
>>> #ifdef CONFIG_COMPAT
>>> .compat_ioctl = sg_compat_ioctl,
>>> #endif
>> And I just checked st.c (SCSI tape driver) and it calls
>> lock_kernel() .
>
> Ah, good point. So even if the st driver does not need
> any locking against the block layer, it might need to
> lock its ioctl against sg.

At the level of SCSI commands, tape device state assumptions
made by the st driver could be compromised by SCSI commands
sent by the sg driver. However the BKL was never meant
to address that concern.

From the comment in st_open() [st.c] it would be using
nonseekable_open() as well but there are apps out there
that do lseek()s on its file descriptors. Not sure
how long nonseekable_open() has been in the sg driver
but no-one has complained to me about it.

> The most simple solution for this would be to let sg
> take both blkdev_mutex and the BKL, which of course
> feels like a step backwards.
>
> A better way is to get rid of the BKL in sg, which requires
> a better understanding of what it's actually protecting.
> It only gets it in the open and ioctl functions, which is a
> result of the pushdown from the respective file operations.
> Chances are that it's not needed at all, but that's really
> hard to tell. Can you shed some more light on this?

The BKL is not used to protect any of the internal
objects within the sg driver. From memory it was added
in some large code sweep through, not unlike what you
are proposing now.

So I would not be concerned about any kernel locking
interactions between the sg and st drivers. I have
added Kai Makisara (st maintainer) to the cc list.

Doug Gilbert

2010-04-15 14:29:53

by Arnd Bergmann

[permalink] [raw]

Subject: Re: [PATCH 1/2] [RFC] block: replace BKL with global mutex

On Thursday 15 April 2010, Douglas Gilbert wrote:
> At the level of SCSI commands, tape device state assumptions
> made by the st driver could be compromised by SCSI commands
> sent by the sg driver. However the BKL was never meant
> to address that concern.
>
> From the comment in st_open() [st.c] it would be using
> nonseekable_open() as well but there are apps out there
> that do lseek()s on its file descriptors. Not sure
> how long nonseekable_open() has been in the sg driver
> but no-one has complained to me about it.

It's been there for a long time, at least since the start of
the git history, and it's very likely correct this way.

> > The most simple solution for this would be to let sg
> > take both blkdev_mutex and the BKL, which of course
> > feels like a step backwards.
> >
> > A better way is to get rid of the BKL in sg, which requires
> > a better understanding of what it's actually protecting.
> > It only gets it in the open and ioctl functions, which is a
> > result of the pushdown from the respective file operations.
> > Chances are that it's not needed at all, but that's really
> > hard to tell. Can you shed some more light on this?
>
> The BKL is not used to protect any of the internal
> objects within the sg driver. From memory it was added
> in some large code sweep through, not unlike what you
> are proposing now.

The one in the open function was moved there when the
BKL was moved out from vfs_open(), while the use in ioctl is
implicit by never having been converted to unlocked_ioctl.

I don't see anything that really needs BKL protection in
sg_open, so that can probably just be killed. For sg_ioctl,
at least the blk_trace_* and scsi_ioctl stuff is currently
called with BKL held everywhere else (not in st_ioctl though)
and may still need that.

> So I would not be concerned about any kernel locking
> interactions between the sg and st drivers. I have
> added Kai Makisara (st maintainer) to the cc list.

Ok. I've also checked st.c again and noticed that it
doesn't use the BKL in ioctl() but only in open(),
which is very unlikely to race against anything in sg.c
or the block subsystem.

Arnd

2010-04-15 14:49:16

by Steven Whitehouse

[permalink] [raw]

Subject: Re: [PATCH 2/2] [RFC] Remove BKL from fs/locks.c

Hi,

Some comments...

On Wed, 2010-04-14 at 22:36 +0200, Arnd Bergmann wrote:
> From: Matthew Wilcox <[email protected]>
>
> I've taken a patch originally written by Matthew Wilcox and
> ported it to the current version. It seems that there were
> originally concerns that this breaks NFS, but since Trond
> has recently removed the BKL from NFS, my naive assumption
> would be that it's all good now, despite not having tried to
> understand what it does.
[snip]
> fs/locks.c | 110 ++++++++++++++++++++++++++++++++++++------------------------
> 1 files changed, 66 insertions(+), 44 deletions(-)
>
> diff --git a/fs/locks.c b/fs/locks.c
> index ab24d49..87f1c60 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -140,9 +140,23 @@ int lease_break_time = 45;
> #define for_each_lock(inode, lockp) \
> for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
>
> +/*
> + * Protects the two list heads below, plus the inode->i_flock list
> + */
> +static DEFINE_SPINLOCK(file_lock_lock);
> static LIST_HEAD(file_lock_list);
> static LIST_HEAD(blocked_list);
>
> +static inline void lock_flocks(void)
> +{
> + spin_lock(&file_lock_lock);
> +}
> +
> +static inline void unlock_flocks(void)
> +{
> + spin_unlock(&file_lock_lock);
> +}
> +
> static struct kmem_cache *filelock_cache __read_mostly;
>
Why not just put the spin lock calls inline?

[snip]
> for_each_lock(inode, before) {
> struct file_lock *fl = *before;
> if (IS_POSIX(fl))
> @@ -767,8 +779,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
> * If a higher-priority process was blocked on the old file lock,
> * give it the opportunity to lock the file.
> */
> - if (found)
> + if (found) {
> + unlock_flocks();
> cond_resched();
> + lock_flocks();
> + }
>
Use cond_resched_lock() here perhaps?

[snip]

> @@ -1341,7 +1358,7 @@ int fcntl_getlease(struct file *filp)
> * The (input) flp->fl_lmops->fl_break function is required
> * by break_lease().
> *
> - * Called with kernel lock held.
> + * Called with file_lock_lock held.
> */
> int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
> {
> @@ -1436,7 +1453,15 @@ out:
> }
> EXPORT_SYMBOL(generic_setlease);
>
> - /**
> +static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
> +{
> + if (filp->f_op && filp->f_op->setlease)
> + return filp->f_op->setlease(filp, arg, lease);
> + else
> + return generic_setlease(filp, arg, lease);
> +}
> +
> +/**
> * vfs_setlease - sets a lease on an open file
> * @filp: file pointer
> * @arg: type of lease to obtain
> @@ -1467,12 +1492,9 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
> {
> int error;
>
> - lock_kernel();
> - if (filp->f_op && filp->f_op->setlease)
> - error = filp->f_op->setlease(filp, arg, lease);
> - else
> - error = generic_setlease(filp, arg, lease);
> - unlock_kernel();
> + lock_flocks();
> + error = __vfs_setlease(filp, arg, lease);
> + unlock_flocks();
>
This looks to me like generic_setlease() or whatever fs specific
->setlease() there might be will be called under a spin lock. That
doesn't look right to me.

Rather than adding locking here, why not push the BKL down into
generic_setlease() and ->setlease() first, and then eliminate it on a
case by case basis later on? That is the pattern that has been followed
elsewhere in the kernel.

I might have some further comment on this, but that's as far as I've got
at the moment,

Steve.

2010-04-15 15:18:05

by Arnd Bergmann

[permalink] [raw]

Subject: Re: [PATCH 2/2] [RFC] Remove BKL from fs/locks.c

On Thursday 15 April 2010, Steven Whitehouse wrote:

> Some comments...

I'll wait for Willy to comment on most of these, except

> On Wed, 2010-04-14 at 22:36 +0200, Arnd Bergmann wrote:
> > @@ -1467,12 +1492,9 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
> > {
> > int error;
> >
> > - lock_kernel();
> > - if (filp->f_op && filp->f_op->setlease)
> > - error = filp->f_op->setlease(filp, arg, lease);
> > - else
> > - error = generic_setlease(filp, arg, lease);
> > - unlock_kernel();
> > + lock_flocks();
> > + error = __vfs_setlease(filp, arg, lease);
> > + unlock_flocks();
> >
> This looks to me like generic_setlease() or whatever fs specific
> ->setlease() there might be will be called under a spin lock. That
> doesn't look right to me.
>
> Rather than adding locking here, why not push the BKL down into
> generic_setlease() and ->setlease() first, and then eliminate it on a
> case by case basis later on? That is the pattern that has been followed
> elsewhere in the kernel.

Sounds fair. Besides generic_setlease (which is in this file as well),
the only non-trivial one is cifs_setlease (Cc'ing Steve French now)
and that calls generic_setlease in the end.

If we can show that cifs_setlease does not need locking, the new lock
could be put into generic_setlease directly.

Arnd

2010-04-15 20:11:25

by Kai Mäkisara (Kolumbus)

[permalink] [raw]

Subject: Re: [PATCH 1/2] [RFC] block: replace BKL with global mutex

On Thu, 15 Apr 2010, Arnd Bergmann wrote:

> On Thursday 15 April 2010, Douglas Gilbert wrote:
> > At the level of SCSI commands, tape device state assumptions
> > made by the st driver could be compromised by SCSI commands
> > sent by the sg driver. However the BKL was never meant
> > to address that concern.
> >
...
> > So I would not be concerned about any kernel locking
> > interactions between the sg and st drivers. I have
> > added Kai Makisara (st maintainer) to the cc list.
>
> Ok. I've also checked st.c again and noticed that it
> doesn't use use the BKL in ioctl() but only in open(),
> which is very unlikely to race against anything in sg.c
> or the block subsystem.
>
Using sg with a tape opened by st may lead to incorrect state within st.
However, to prevent this would be far too complicated and so the
coordination responsibility is left to the user. (There are some
cases where use of both sg and st can be justified but then the user
should know what he/she is doing and properly serialize access in user
space.)

BKL does not have any "hidden duties" in open() in st.c. I don't know any
reason why it would be needed, but, because I have not been absolutely
sure, I have not removed it. (The tape devices are not opened often and so
the overhead has been negligible. That is, while BKL has been available.)
If you don't see any reason for BKL in open(), go ahead and remove it.

Kai

2010-04-15 20:36:33

by Arnd Bergmann

[permalink] [raw]

Subject: Re: [PATCH 2/2] [RFC] Remove BKL from fs/locks.c

On Wednesday 14 April 2010 23:04:03 J. Bruce Fields wrote:
> The lockd thread is entirely under the BKL--it takes it at the top of
> fs/lockd/svc.c:lockd(), and drops it at the bottom. It's possible
> there may be code that lockd runs that assumes mutual exclusion with
> code in locks.c, but I don't know.

That one seems interesting as the lockd thread calls lots of sleeping
functions as well as file lock code that takes the BKL again, both
of which of course are not allowed when converting to a spinlock.

I just attempted to blindly convert lockd to taking lock_flocks()
in place of the BKL and releasing it where necessary, but quickly gave
up because this seems rather pointless. It basically never does
anything interesting between places where it would need to drop the
lock.

Arnd

2010-04-15 20:51:56

by Arnd Bergmann

[permalink] [raw]

Subject: [PATCH] scsi/st: remove BKL from open

The st_open function is serialized through the st_dev_arr_lock
and the STp->in_use flag, so there is no race that the BKL
can protect against in the driver itself, and the function
does not access any global state outside of the driver that
might be protected with the BKL.

Signed-off-by: Arnd Bergmann <[email protected]>
---
drivers/scsi/st.c | 9 +--------
1 files changed, 1 insertions(+), 8 deletions(-)

On Thursday 15 April 2010 22:03:24 Kai Makisara wrote:

> BKL does not have any "hidden duties" in open() in st.c. I don't know any
> reason why it would be needed, but, because I have not been absolutely
> sure, I have not removed it. (The tape devices are not opened often and so
> the overhead has been negligible. That is, while BKL has been available.)
> If you don't see any reason for BKL in open(), go ahead and remove it.

Ok, that certainly simplifies the sg.c conversion. Thanks!

Arnd

diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 3ea1a71..b1462e0 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -39,7 +39,6 @@ static const char *verstr = "20081215";
#include <linux/cdev.h>
#include <linux/delay.h>
#include <linux/mutex.h>
-#include <linux/smp_lock.h>

#include <asm/uaccess.h>
#include <asm/dma.h>
@@ -1180,7 +1179,6 @@ static int st_open(struct inode *inode, struct file *filp)
int dev = TAPE_NR(inode);
char *name;

- lock_kernel();
/*
* We really want to do nonseekable_open(inode, filp); here, but some
* versions of tar incorrectly call lseek on tapes and bail out if that
@@ -1188,10 +1186,8 @@ static int st_open(struct inode *inode, struct file *filp)
*/
filp->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);

- if (!(STp = scsi_tape_get(dev))) {
- unlock_kernel();
+ if (!(STp = scsi_tape_get(dev)))
return -ENXIO;
- }

write_lock(&st_dev_arr_lock);
filp->private_data = STp;
@@ -1200,7 +1196,6 @@ static int st_open(struct inode *inode, struct file *filp)
if (STp->in_use) {
write_unlock(&st_dev_arr_lock);
scsi_tape_put(STp);
- unlock_kernel();
DEB( printk(ST_DEB_MSG "%s: Device already in use.\n", name); )
return (-EBUSY);
}
@@ -1249,14 +1244,12 @@ static int st_open(struct inode *inode, struct file *filp)
retval = (-EIO);
goto err_out;
}
- unlock_kernel();
return 0;

err_out:
normalize_buffer(STp->buffer);
STp->in_use = 0;
scsi_tape_put(STp);
- unlock_kernel();
return retval;

}
--
1.7.0.4

2010-04-30 18:54:16

by Frederic Weisbecker

[permalink] [raw]

Subject: Re: [PATCH] scsi/st: remove BKL from open

On Thu, Apr 15, 2010 at 10:51:38PM +0200, Arnd Bergmann wrote:
> The st_open function is serialized through the st_dev_arr_lock
> and the STp->in_use flag, so there is no race that the BKL
> can protect against in the driver itself, and the function
> does not access any global state outside of the driver that
> might be protected with the BKL.
>
> Signed-off-by: Arnd Bergmann <[email protected]>
> ---

Kai, can we get your ack on this?

Thanks.

2010-04-30 19:04:23

by Kai Mäkisara (Kolumbus)

[permalink] [raw]

Subject: Re: [PATCH] scsi/st: remove BKL from open

On Fri, 30 Apr 2010, Frederic Weisbecker wrote:

> On Thu, Apr 15, 2010 at 10:51:38PM +0200, Arnd Bergmann wrote:
> > The st_open function is serialized through the st_dev_arr_lock
> > and the STp->in_use flag, so there is no race that the BKL
> > can protect against in the driver itself, and the function
> > does not access any global state outside of the driver that
> > might be protected with the BKL.
> >
> > Signed-off-by: Arnd Bergmann <[email protected]>
> > ---
>
>
>
> Kai, can we get your ack on this?
>
I am sorry, I have forgotten to send it.

Acked-by: Kai Makisara <[email protected]>

Thanks,
Kai