The code was written with "oh big kernel lock, please protect me from
all the evil" mentality: it does not locks its own data structures, it
just hopes that big kernel lock somehow helps.
It does not. (My fault).
So this uses tx_lock to protect data structures from concurrent use
between ioctl and worker threads.
Signed-off-by: Pavel Machek <[email protected]>
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 34f80fa..81c94a4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -559,74 +557,68 @@ static void do_nbd_request(struct reques
}
}
-static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- struct nbd_device *lo = bdev->bd_disk->private_data;
- struct file *file;
- int error;
- struct request sreq ;
- struct task_struct *thread;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- BUG_ON(lo->magic != LO_MAGIC);
-
- /* Anyone capable of this syscall can do *real bad* things */
- dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
- lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
+/* Must be called with tx_lock held */
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
+ unsigned int cmd, unsigned long arg)
+{
switch (cmd) {
case NBD_DISCONNECT:
printk(KERN_INFO "%s: NBD_DISCONNECT\n", lo->disk->disk_name);
- blk_rq_init(NULL, &sreq);
- sreq.cmd_type = REQ_TYPE_SPECIAL;
- nbd_cmd(&sreq) = NBD_CMD_DISC;
- /*
- * Set these to sane values in case server implementation
- * fails to check the request type first and also to keep
- * debugging output cleaner.
- */
- sreq.sector = 0;
- sreq.nr_sectors = 0;
- if (!lo->sock)
- return -EINVAL;
- mutex_lock(&lo->tx_lock);
- nbd_send_req(lo, &sreq);
- mutex_unlock(&lo->tx_lock);
+ {
+ struct request sreq;
+
+ blk_rq_init(NULL, &sreq);
+ sreq.cmd_type = REQ_TYPE_SPECIAL;
+ nbd_cmd(&sreq) = NBD_CMD_DISC;
+ /*
+ * Set these to sane values in case server implementation
+ * fails to check the request type first and also to keep
+ * debugging output cleaner.
+ */
+ sreq.sector = 0;
+ sreq.nr_sectors = 0;
+ if (!lo->sock)
+ return -EINVAL;
+ nbd_send_req(lo, &sreq);
+ }
return 0;
case NBD_CLEAR_SOCK:
- error = 0;
- mutex_lock(&lo->tx_lock);
- lo->sock = NULL;
- mutex_unlock(&lo->tx_lock);
- file = lo->file;
- lo->file = NULL;
- nbd_clear_que(lo);
- BUG_ON(!list_empty(&lo->queue_head));
- if (file)
- fput(file);
- return error;
- case NBD_SET_SOCK:
- if (lo->file)
- return -EBUSY;
- error = -EINVAL;
- file = fget(arg);
- if (file) {
- struct inode *inode = file->f_path.dentry->d_inode;
- if (S_ISSOCK(inode->i_mode)) {
- lo->file = file;
- lo->sock = SOCKET_I(inode);
- if (max_part > 0)
- bdev->bd_invalidated = 1;
- error = 0;
- } else {
+ {
+ struct file *file;
+
+ lo->sock = NULL;
+ file = lo->file;
+ lo->file = NULL;
+ nbd_clear_que(lo);
+ BUG_ON(!list_empty(&lo->queue_head));
+ if (file)
fput(file);
+ }
+ return 0;
+
+ case NBD_SET_SOCK:
+ {
+ struct file *file;
+ if (lo->file)
+ return -EBUSY;
+ file = fget(arg);
+ if (file) {
+ struct inode *inode = file->f_path.dentry->d_inode;
+ if (S_ISSOCK(inode->i_mode)) {
+ lo->file = file;
+ lo->sock = SOCKET_I(inode);
+ if (max_part > 0)
+ bdev->bd_invalidated = 1;
+ return 0;
+ } else {
+ fput(file);
+ }
}
}
- return error;
+ return -EINVAL;
+
case NBD_SET_BLKSIZE:
lo->blksize = arg;
lo->bytesize &= ~(lo->blksize-1);
@@ -634,47 +626,65 @@ static int nbd_ioctl(struct block_device
set_blocksize(bdev, lo->blksize);
set_capacity(lo->disk, lo->bytesize >> 9);
return 0;
+
case NBD_SET_SIZE:
lo->bytesize = arg & ~(lo->blksize-1);
bdev->bd_inode->i_size = lo->bytesize;
set_blocksize(bdev, lo->blksize);
set_capacity(lo->disk, lo->bytesize >> 9);
return 0;
+
case NBD_SET_TIMEOUT:
lo->xmit_timeout = arg * HZ;
return 0;
+
case NBD_SET_SIZE_BLOCKS:
lo->bytesize = ((u64) arg) * lo->blksize;
bdev->bd_inode->i_size = lo->bytesize;
set_blocksize(bdev, lo->blksize);
set_capacity(lo->disk, lo->bytesize >> 9);
return 0;
+
case NBD_DO_IT:
- if (lo->pid)
- return -EBUSY;
- if (!lo->file)
- return -EINVAL;
- thread = kthread_create(nbd_thread, lo, lo->disk->disk_name);
- if (IS_ERR(thread))
- return PTR_ERR(thread);
- wake_up_process(thread);
- error = nbd_do_it(lo);
- kthread_stop(thread);
- if (error)
- return error;
- sock_shutdown(lo, 1);
- file = lo->file;
- lo->file = NULL;
- nbd_clear_que(lo);
- printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
- if (file)
- fput(file);
- lo->bytesize = 0;
- bdev->bd_inode->i_size = 0;
- set_capacity(lo->disk, 0);
- if (max_part > 0)
- ioctl_by_bdev(bdev, BLKRRPART, 0);
- return lo->harderror;
+ {
+ struct task_struct *thread;
+ struct file *file;
+ int error;
+
+ if (lo->pid)
+ return -EBUSY;
+ if (!lo->file)
+ return -EINVAL;
+
+ mutex_unlock(&lo->tx_lock);
+
+ thread = kthread_create(nbd_thread, lo, lo->disk->disk_name);
+ if (IS_ERR(thread)) {
+ mutex_lock(&lo->tx_lock);
+ return PTR_ERR(thread);
+ }
+ wake_up_process(thread);
+ error = nbd_do_it(lo);
+ kthread_stop(thread);
+
+ mutex_lock(&lo->tx_lock);
+ if (error)
+ return error;
+ sock_shutdown(lo, 0);
+ file = lo->file;
+ lo->file = NULL;
+ nbd_clear_que(lo);
+ printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
+ if (file)
+ fput(file);
+ lo->bytesize = 0;
+ bdev->bd_inode->i_size = 0;
+ set_capacity(lo->disk, 0);
+ if (max_part > 0)
+ ioctl_by_bdev(bdev, BLKRRPART, 0);
+ return lo->harderror;
+ }
+
case NBD_CLEAR_QUE:
/*
* This is for compatibility only. The queue is always cleared
@@ -682,6 +692,7 @@ static int nbd_ioctl(struct block_device
*/
BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
return 0;
+
case NBD_PRINT_DEBUG:
printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n",
bdev->bd_disk->disk_name,
@@ -689,7 +700,29 @@ static int nbd_ioctl(struct block_device
&lo->queue_head);
return 0;
}
- return -EINVAL;
+}
+
+
+static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct nbd_device *lo = bdev->bd_disk->private_data;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ BUG_ON(lo->magic != LO_MAGIC);
+
+ /* Anyone capable of this syscall can do *real bad* things */
+ dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
+ lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
+
+ mutex_lock(&lo->tx_lock);
+ error = __nbd_ioctl(bdev, lo, cmd, arg);
+ mutex_unlock(&lo->tx_lock);
+
+ return error;
}
static struct block_device_operations nbd_fops =
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Fri 2009-01-16 12:55:12, Pavel Machek wrote:
>
> The code was written with "oh big kernel lock, please protect me from
> all the evil" mentality: it does not locks its own data structures, it
> just hopes that big kernel lock somehow helps.
>
> It does not. (My fault).
>
> So this uses tx_lock to protect data structures from concurrent use
> between ioctl and worker threads.
>
> Signed-off-by: Pavel Machek <[email protected]>
Ok, it should return -EINVAL at the end of __nbd_ioctl. But Paul, does
it look ok otherwise?
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Friday 16 January 2009, Pavel Machek wrote:
> The code was written with "oh big kernel lock, please protect me from
> all the evil" mentality: it does not locks its own data structures, it
> just hopes that big kernel lock somehow helps.
If the locking is ok now, can you also move the call to ->ioctl()? instead
of locked_ioctl()? That would also be a good opportunity to move to
compat_ioctl() and kill the handlers in fs/compat_ioctl.c.
Arnd <><
Pavel Machek wrote:
> The code was written with "oh big kernel lock, please protect me from
> all the evil" mentality: it does not locks its own data structures, it
> just hopes that big kernel lock somehow helps.
>
> It does not. (My fault).
>
> So this uses tx_lock to protect data structures from concurrent use
> between ioctl and worker threads.
What is the particular problem that this fixes? I thought we had already
been careful to take tx_lock where necessary to protect data structures.
Perhaps there is something I missed?
--
Paul
On Fri 2009-01-16 10:24:06, Paul Clements wrote:
> Pavel Machek wrote:
>> The code was written with "oh big kernel lock, please protect me from
>> all the evil" mentality: it does not locks its own data structures, it
>> just hopes that big kernel lock somehow helps.
>>
>> It does not. (My fault).
>>
>> So this uses tx_lock to protect data structures from concurrent use
>> between ioctl and worker threads.
>
> What is the particular problem that this fixes? I thought we had already
> been careful to take tx_lock where necessary to protect data structures.
> Perhaps there is something I missed?
for example lo->sock / lo->file are written to without holding any
lock in current code. (lo->xmit_timeout has similar problem, and other
fields, too).
I used tx_lock to protect those fields.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Pavel Machek wrote:
> On Fri 2009-01-16 10:24:06, Paul Clements wrote:
>> Pavel Machek wrote:
>>> The code was written with "oh big kernel lock, please protect me from
>>> all the evil" mentality: it does not locks its own data structures, it
>>> just hopes that big kernel lock somehow helps.
>>>
>>> It does not. (My fault).
>>>
>>> So this uses tx_lock to protect data structures from concurrent use
>>> between ioctl and worker threads.
>> What is the particular problem that this fixes? I thought we had already
>> been careful to take tx_lock where necessary to protect data structures.
>> Perhaps there is something I missed?
>
> for example lo->sock / lo->file are written to without holding any
> lock in current code. (lo->xmit_timeout has similar problem, and other
> fields, too).
lo->sock is only modified under tx_lock (except for SET_SOCK, where the
device is being initialized, in which case it's impossible for any other
thread to be accessing the device)
no one else uses lo->file except for the ioctls
I agree that if you really misuse the ioctls you could potentially get
yourself in trouble with the xmit_timeout (the timer not being deleted
or initialized properly if you hit the correct window). Taking tx_lock
would prevent this.
As for other fields, I assume you're talking about blksize, et al.
Taking tx_lock doesn't prevent you from screwing yourself if you modify
those while the device is active. You'd need to disallow those ioctls
when the device is active (check lo->file). Again, this is only going to
happen if you really misuse the ioctls.
--
Paul
> Pavel Machek wrote:
> >On Fri 2009-01-16 10:24:06, Paul Clements wrote:
> >>Pavel Machek wrote:
> >>>The code was written with "oh big kernel lock, please protect me from
> >>>all the evil" mentality: it does not locks its own data structures, it
> >>>just hopes that big kernel lock somehow helps.
> >>>
> >>>It does not. (My fault).
> >>>
> >>>So this uses tx_lock to protect data structures from concurrent use
> >>>between ioctl and worker threads.
> >>What is the particular problem that this fixes? I thought we had already
> >>been careful to take tx_lock where necessary to protect data structures.
> >> Perhaps there is something I missed?
> >
> >for example lo->sock / lo->file are written to without holding any
> >lock in current code. (lo->xmit_timeout has similar problem, and other
> >fields, too).
>
> lo->sock is only modified under tx_lock (except for SET_SOCK, where the
> device is being initialized, in which case it's impossible for any other
> thread to be accessing the device)
Well, unless the user is evil or confused? :-).
> no one else uses lo->file except for the ioctls
>
> I agree that if you really misuse the ioctls you could potentially get
> yourself in trouble with the xmit_timeout (the timer not being deleted
> or initialized properly if you hit the correct window). Taking tx_lock
> would prevent this.
Good.
> As for other fields, I assume you're talking about blksize, et al.
> Taking tx_lock doesn't prevent you from screwing yourself if you modify
> those while the device is active. You'd need to disallow those ioctls
> when the device is active (check lo->file). Again, this is only going to
> happen if you really misuse the ioctls.
Ok, I'll take a look at the missing checks. I'd really like to make
this "stable" -- no ammount of misuse should crash the kernel.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Pavel Machek wrote:
>> Pavel Machek wrote:
>>> On Fri 2009-01-16 10:24:06, Paul Clements wrote:
>> lo->sock is only modified under tx_lock (except for SET_SOCK, where the
>> device is being initialized, in which case it's impossible for any other
>> thread to be accessing the device)
>
> Well, unless the user is evil or confused? :-).
Even in that case, you're just going to get EBUSY. Nothing bad will
happen. SET_SOCK checks for lo->file, so it cannot be called on an
active nbd device.
>> As for other fields, I assume you're talking about blksize, et al.
>> Taking tx_lock doesn't prevent you from screwing yourself if you modify
>> those while the device is active. You'd need to disallow those ioctls
>> when the device is active (check lo->file). Again, this is only going to
>> happen if you really misuse the ioctls.
>
> Ok, I'll take a look at the missing checks. I'd really like to make
> this "stable" -- no amount of misuse should crash the kernel.
Just to summarize, I don't think we need to hold tx_lock around the
entirety of nbd_ioctl. We do need one extra tx_lock around xmit_timeout
and we do need to check for lo->file and return EBUSY in all of the
SET_*SIZE* ioctls.
Thanks,
Paul
> Pavel Machek wrote:
> >>Pavel Machek wrote:
> >>>On Fri 2009-01-16 10:24:06, Paul Clements wrote:
>
> >>lo->sock is only modified under tx_lock (except for SET_SOCK, where the
> >>device is being initialized, in which case it's impossible for any other
> >>thread to be accessing the device)
> >
> >Well, unless the user is evil or confused? :-).
>
> Even in that case, you're just going to get EBUSY. Nothing bad will
> happen. SET_SOCK checks for lo->file, so it cannot be called on an
> active nbd device.
>
>
> >>As for other fields, I assume you're talking about blksize, et al.
> >>Taking tx_lock doesn't prevent you from screwing yourself if you modify
> >>those while the device is active. You'd need to disallow those ioctls
> >>when the device is active (check lo->file). Again, this is only going to
> >>happen if you really misuse the ioctls.
> >
> >Ok, I'll take a look at the missing checks. I'd really like to make
> >this "stable" -- no amount of misuse should crash the kernel.
>
> Just to summarize, I don't think we need to hold tx_lock around the
> entirety of nbd_ioctl. We do need one extra tx_lock around xmit_timeout
> and we do need to check for lo->file and return EBUSY in all of the
> SET_*SIZE* ioctls.
I could do that but it would be a bit too complex, and still rely on
big kernel lock. Would you agree to patch that added tx_lock around
all of it, and moved ioctl to unlocked ioctl?
You are right that errors were much less severe than I thought in the
beggining.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Pavel Machek wrote:
>> Pavel Machek wrote:
>>>> Pavel Machek wrote:
>>>>> On Fri 2009-01-16 10:24:06, Paul Clements wrote:
>>>> lo->sock is only modified under tx_lock (except for SET_SOCK, where the
>>>> device is being initialized, in which case it's impossible for any other
>>>> thread to be accessing the device)
>>> Well, unless the user is evil or confused? :-).
>> Even in that case, you're just going to get EBUSY. Nothing bad will
>> happen. SET_SOCK checks for lo->file, so it cannot be called on an
>> active nbd device.
>>
>>
>>>> As for other fields, I assume you're talking about blksize, et al.
>>>> Taking tx_lock doesn't prevent you from screwing yourself if you modify
>>>> those while the device is active. You'd need to disallow those ioctls
>>>> when the device is active (check lo->file). Again, this is only going to
>>>> happen if you really misuse the ioctls.
>>> Ok, I'll take a look at the missing checks. I'd really like to make
>>> this "stable" -- no amount of misuse should crash the kernel.
>> Just to summarize, I don't think we need to hold tx_lock around the
>> entirety of nbd_ioctl. We do need one extra tx_lock around xmit_timeout
>> and we do need to check for lo->file and return EBUSY in all of the
>> SET_*SIZE* ioctls.
>
> I could do that but it would be a bit too complex, and still rely on
> big kernel lock. Would you agree to patch that added tx_lock around
> all of it, and moved ioctl to unlocked ioctl?
OK, I can buy the complexity argument. Your patch sounds fine to me.
--
Paul
> Pavel Machek wrote:
> >>Pavel Machek wrote:
> >>>>Pavel Machek wrote:
> >>>>>On Fri 2009-01-16 10:24:06, Paul Clements wrote:
> >>>>lo->sock is only modified under tx_lock (except for SET_SOCK, where the
> >>>>device is being initialized, in which case it's impossible for any
> >>>>other thread to be accessing the device)
> >>>Well, unless the user is evil or confused? :-).
> >>Even in that case, you're just going to get EBUSY. Nothing bad will
> >>happen. SET_SOCK checks for lo->file, so it cannot be called on an
> >>active nbd device.
> >>
> >>
> >>>>As for other fields, I assume you're talking about blksize, et al.
> >>>>Taking tx_lock doesn't prevent you from screwing yourself if you modify
> >>>>those while the device is active. You'd need to disallow those ioctls
> >>>>when the device is active (check lo->file). Again, this is only going
> >>>>to happen if you really misuse the ioctls.
> >>>Ok, I'll take a look at the missing checks. I'd really like to make
> >>>this "stable" -- no amount of misuse should crash the kernel.
> >>Just to summarize, I don't think we need to hold tx_lock around the
> >>entirety of nbd_ioctl. We do need one extra tx_lock around xmit_timeout
> >>and we do need to check for lo->file and return EBUSY in all of the
> >>SET_*SIZE* ioctls.
> >
> >I could do that but it would be a bit too complex, and still rely on
> >big kernel lock. Would you agree to patch that added tx_lock around
> >all of it, and moved ioctl to unlocked ioctl?
>
> OK, I can buy the complexity argument. Your patch sounds fine to me.
Thanks for review!
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html