2002-07-02 18:27:52

by Matthew Wilcox

[permalink] [raw]
Subject: O_DIRECT handling introduced a race in F_SETFL


I was doing some random kernel janitoring, pushing the BKL down a
little bit, when I noticed O_DIRECT introduced a race. If alloc_kiovec
sleeps, we will effectively be unprotected by the BKL. This would
allow two processes sharing an fd both changing the FASYNC flag to
call f_op->fasync() the wrong number of times and potentially leave
the application erroneously believing that fasync is off when it is on,
or vice versa.

My patch below fixes this problem by allocating the O_DIRECT kiovec
before taking the BKL. Any comments? Also, should we be freeing the
kiovec when removing the O_DIRECT attribute from the filp?

diff -urNX dontdiff linux-2.5.24/fs/fcntl.c linux-2.5.24-mm/fs/fcntl.c
--- linux-2.5.24/fs/fcntl.c Sun Jun 9 06:09:49 2002
+++ linux-2.5.24-mm/fs/fcntl.c Tue Jul 2 10:55:29 2002
@@ -235,24 +235,11 @@
if (!(arg & O_APPEND) && IS_APPEND(inode))
return -EPERM;

- /* Did FASYNC state change? */
- if ((arg ^ filp->f_flags) & FASYNC) {
- if (filp->f_op && filp->f_op->fasync) {
- error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
- if (error < 0)
- return error;
- }
- }
-
if (arg & O_DIRECT) {
/*
- * alloc_kiovec() can sleep and we are only serialized by
- * the big kernel lock here, so abuse the i_sem to serialize
- * this case too. We of course wouldn't need to go deep down
- * to the inode layer, we could stay at the file layer, but
- * we don't want to pay for the memory of a semaphore in each
- * file structure too and we use the inode semaphore that we just
- * pay for anyways.
+ * alloc_kiovec() can sleep, so abuse the i_sem to serialize
+ * this case too. Note we have to do this before we take the
+ * BKL otherwise we have a race if it sleeps.
*/
error = 0;
down(&inode->i_sem);
@@ -263,13 +250,26 @@
return error;
}

+ lock_kernel();
+ /* Did FASYNC state change? */
+ if ((arg ^ filp->f_flags) & FASYNC) {
+ if (filp->f_op && filp->f_op->fasync) {
+ error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
+ if (error < 0)
+ goto out;
+ }
+ }
+
/* required for strict SunOS emulation */
if (O_NONBLOCK != O_NDELAY)
if (arg & O_NDELAY)
arg |= O_NONBLOCK;

filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
- return 0;
+ error = 0;
+ out:
+ unlock_kernel();
+ return error;
}

static long do_fcntl(unsigned int fd, unsigned int cmd,
@@ -295,9 +295,7 @@
err = filp->f_flags;
break;
case F_SETFL:
- lock_kernel();
err = setfl(fd, filp, arg);
- unlock_kernel();
break;
case F_GETLK:
err = fcntl_getlk(filp, (struct flock *) arg);

--
Revolutions do not require corporate support.


2002-07-05 14:16:19

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: O_DIRECT handling introduced a race in F_SETFL

On Tue, Jul 02, 2002 at 07:30:19PM +0100, Matthew Wilcox wrote:
>
> I was doing some random kernel janitoring, pushing the BKL down a
> little bit, when I noticed O_DIRECT introduced a race. If alloc_kiovec
> sleeps, we will effectively be unprotected by the BKL. This would
> allow two processes sharing an fd both changing the FASYNC flag to
> call f_op->fasync() the wrong number of times and potentially leave
> the application erroneously believing that fasync is off when it is on,
> or vice versa.
>
> My patch below fixes this problem by allocating the O_DIRECT kiovec
> before taking the BKL. Any comments? Also, should we be freeing the
> kiovec when removing the O_DIRECT attribute from the filp?
>
> diff -urNX dontdiff linux-2.5.24/fs/fcntl.c linux-2.5.24-mm/fs/fcntl.c
> --- linux-2.5.24/fs/fcntl.c Sun Jun 9 06:09:49 2002
> +++ linux-2.5.24-mm/fs/fcntl.c Tue Jul 2 10:55:29 2002
> @@ -235,24 +235,11 @@
> if (!(arg & O_APPEND) && IS_APPEND(inode))
> return -EPERM;
>
> - /* Did FASYNC state change? */
> - if ((arg ^ filp->f_flags) & FASYNC) {
> - if (filp->f_op && filp->f_op->fasync) {
> - error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
> - if (error < 0)
> - return error;
> - }
> - }
> -
> if (arg & O_DIRECT) {
> /*
> - * alloc_kiovec() can sleep and we are only serialized by
> - * the big kernel lock here, so abuse the i_sem to serialize
> - * this case too. We of course wouldn't need to go deep down
> - * to the inode layer, we could stay at the file layer, but
> - * we don't want to pay for the memory of a semaphore in each
> - * file structure too and we use the inode semaphore that we just
> - * pay for anyways.
> + * alloc_kiovec() can sleep, so abuse the i_sem to serialize
> + * this case too. Note we have to do this before we take the
> + * BKL otherwise we have a race if it sleeps.
> */
> error = 0;
> down(&inode->i_sem);
> @@ -263,13 +250,26 @@
> return error;
> }
>
> + lock_kernel();
> + /* Did FASYNC state change? */
> + if ((arg ^ filp->f_flags) & FASYNC) {
> + if (filp->f_op && filp->f_op->fasync) {
> + error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
> + if (error < 0)
> + goto out;
> + }
> + }
> +
> /* required for strict SunOS emulation */
> if (O_NONBLOCK != O_NDELAY)
> if (arg & O_NDELAY)
> arg |= O_NONBLOCK;
>
> filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
> - return 0;
> + error = 0;
> + out:
> + unlock_kernel();
> + return error;
> }
>
> static long do_fcntl(unsigned int fd, unsigned int cmd,
> @@ -295,9 +295,7 @@
> err = filp->f_flags;
> break;
> case F_SETFL:
> - lock_kernel();
> err = setfl(fd, filp, arg);
> - unlock_kernel();
> break;
> case F_GETLK:
> err = fcntl_getlk(filp, (struct flock *) arg);

that's correct. Also moving the if (arg & O_DIRECT) {} right after
setting filp->f_flags would fix the race, that wouldn't require the move
the locking to the setfl, but I guess moving the locking down was your
object since the first place :). Please post it for mainline inclusion
too.

Andrea

2002-07-10 11:27:33

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: O_DIRECT handling introduced a race in F_SETFL

On Fri, Jul 05, 2002 at 04:19:34PM +0200, Andrea Arcangeli wrote:
> On Tue, Jul 02, 2002 at 07:30:19PM +0100, Matthew Wilcox wrote:
> >
> > I was doing some random kernel janitoring, pushing the BKL down a
> > little bit, when I noticed O_DIRECT introduced a race. If alloc_kiovec
> > sleeps, we will effectively be unprotected by the BKL. This would
> > allow two processes sharing an fd both changing the FASYNC flag to
> > call f_op->fasync() the wrong number of times and potentially leave
> > the application erroneously believing that fasync is off when it is on,
> > or vice versa.
> >
> > My patch below fixes this problem by allocating the O_DIRECT kiovec
> > before taking the BKL. Any comments? Also, should we be freeing the
> > kiovec when removing the O_DIRECT attribute from the filp?
> >
> > diff -urNX dontdiff linux-2.5.24/fs/fcntl.c linux-2.5.24-mm/fs/fcntl.c
> > --- linux-2.5.24/fs/fcntl.c Sun Jun 9 06:09:49 2002
> > +++ linux-2.5.24-mm/fs/fcntl.c Tue Jul 2 10:55:29 2002
> > @@ -235,24 +235,11 @@
> > if (!(arg & O_APPEND) && IS_APPEND(inode))
> > return -EPERM;
> >
> > - /* Did FASYNC state change? */
> > - if ((arg ^ filp->f_flags) & FASYNC) {
> > - if (filp->f_op && filp->f_op->fasync) {
> > - error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
> > - if (error < 0)
> > - return error;
> > - }
> > - }
> > -
> > if (arg & O_DIRECT) {
> > /*
> > - * alloc_kiovec() can sleep and we are only serialized by
> > - * the big kernel lock here, so abuse the i_sem to serialize
> > - * this case too. We of course wouldn't need to go deep down
> > - * to the inode layer, we could stay at the file layer, but
> > - * we don't want to pay for the memory of a semaphore in each
> > - * file structure too and we use the inode semaphore that we just
> > - * pay for anyways.
> > + * alloc_kiovec() can sleep, so abuse the i_sem to serialize
> > + * this case too. Note we have to do this before we take the
> > + * BKL otherwise we have a race if it sleeps.
> > */
> > error = 0;
> > down(&inode->i_sem);
> > @@ -263,13 +250,26 @@
> > return error;
> > }
> >
> > + lock_kernel();
> > + /* Did FASYNC state change? */
> > + if ((arg ^ filp->f_flags) & FASYNC) {
> > + if (filp->f_op && filp->f_op->fasync) {
> > + error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
> > + if (error < 0)
> > + goto out;
> > + }
> > + }
> > +
> > /* required for strict SunOS emulation */
> > if (O_NONBLOCK != O_NDELAY)
> > if (arg & O_NDELAY)
> > arg |= O_NONBLOCK;
> >
> > filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
> > - return 0;
> > + error = 0;
> > + out:
> > + unlock_kernel();
> > + return error;
> > }
> >
> > static long do_fcntl(unsigned int fd, unsigned int cmd,
> > @@ -295,9 +295,7 @@
> > err = filp->f_flags;
> > break;
> > case F_SETFL:
> > - lock_kernel();
> > err = setfl(fd, filp, arg);
> > - unlock_kernel();
> > break;
> > case F_GETLK:
> > err = fcntl_getlk(filp, (struct flock *) arg);
>
> that's correct. Also moving the if (arg & O_DIRECT) {} right after
> setting filp->f_flags would fix the race, that wouldn't require the move
> the locking to the setfl, but I guess moving the locking down was your
> object since the first place :). Please post it for mainline inclusion
> too.
>


this one from -aa fixes fasync too. Please review, thanks.

--- odirect/fs/fcntl.c.~1~ Fri Jul 5 12:20:47 2002
+++ odirect/fs/fcntl.c Sat Jul 6 18:53:56 2002
@@ -213,32 +213,29 @@ static int setfl(int fd, struct file * f
if (!(arg & O_APPEND) && IS_APPEND(inode))
return -EPERM;

+ /*
+ * alloc_kiovec() and ->fasync can sleep, so abuse the i_sem
+ * to serialize against parallel setfl on the same filp,
+ * to avoid races with ->f_flags and ->f_iobuf.
+ */
+ down(&inode->i_sem);
/* Did FASYNC state change? */
if ((arg ^ filp->f_flags) & FASYNC) {
if (filp->f_op && filp->f_op->fasync) {
+ lock_kernel();
error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
+ unlock_kernel();
if (error < 0)
- return error;
+ goto out;
}
}

if (arg & O_DIRECT) {
- /*
- * alloc_kiovec() can sleep and we are only serialized by
- * the big kernel lock here, so abuse the i_sem to serialize
- * this case too. We of course wouldn't need to go deep down
- * to the inode layer, we could stay at the file layer, but
- * we don't want to pay for the memory of a semaphore in each
- * file structure too and we use the inode semaphore that we just
- * pay for anyways.
- */
- error = 0;
- down(&inode->i_sem);
- if (!filp->f_iobuf)
+ if (!filp->f_iobuf) {
error = alloc_kiovec(1, &filp->f_iobuf);
- up(&inode->i_sem);
- if (error < 0)
- return error;
+ if (error < 0)
+ goto out;
+ }
}

/* required for strict SunOS emulation */
@@ -247,7 +244,10 @@ static int setfl(int fd, struct file * f
arg |= O_NONBLOCK;

filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
- return 0;
+ error = 0;
+ out:
+ up(&inode->i_sem);
+ return error;
}

static long do_fcntl(unsigned int fd, unsigned int cmd,
@@ -273,9 +273,7 @@ static long do_fcntl(unsigned int fd, un
err = filp->f_flags;
break;
case F_SETFL:
- lock_kernel();
err = setfl(fd, filp, arg);
- unlock_kernel();
break;
case F_GETLK:
err = fcntl_getlk(fd, (struct flock *) arg);

Andrea

2002-07-11 21:22:30

by Marcus Alanen

[permalink] [raw]
Subject: Re: O_DIRECT handling introduced a race in F_SETFL

>this one from -aa fixes fasync too. Please review, thanks.
>
>--- odirect/fs/fcntl.c.~1~ Fri Jul 5 12:20:47 2002
>+++ odirect/fs/fcntl.c Sat Jul 6 18:53:56 2002
>@@ -213,32 +213,29 @@ static int setfl(int fd, struct file * f
> if (!(arg & O_APPEND) && IS_APPEND(inode))
> return -EPERM;
>
>+ /*
>+ * alloc_kiovec() and ->fasync can sleep, so abuse the i_sem
>+ * to serialize against parallel setfl on the same filp,
>+ * to avoid races with ->f_flags and ->f_iobuf.
>+ */
>+ down(&inode->i_sem);
^^^^
> /* Did FASYNC state change? */
> if ((arg ^ filp->f_flags) & FASYNC) {
> if (filp->f_op && filp->f_op->fasync) {
>+ lock_kernel();
> error = filp->f_op->fasync(fd, filp, (arg & FASYNC) !=
> 0);
>+ unlock_kernel();
...

You introduce locking that hasn't been there previously.


To keep things in sync, ioctl.c needs to take i_sem also. In kernel 2.5,
pipe.c needs to move acquisition of i_sem to the surrounding functions.

Patch below. (Trivial documentation update added)

Could somebody please verify this?


Marcus


===== Documentation/filesystems/Locking 1.32 vs edited =====
--- 1.32/Documentation/filesystems/Locking Wed Jul 3 03:22:36 2002
+++ edited/Documentation/filesystems/Locking Thu Jul 11 23:03:52 2002
@@ -343,6 +343,8 @@

->fsync() has i_sem on inode.

+->fasync() has i_sem on inode.
+
--------------------------- dquot_operations -------------------------------
prototypes:
void (*initialize) (struct inode *, short);
===== fs/ioctl.c 1.4 vs edited =====
--- 1.4/fs/ioctl.c Sat May 18 20:50:00 2002
+++ edited/fs/ioctl.c Thu Jul 11 23:49:25 2002
@@ -82,10 +82,13 @@
filp->f_flags &= ~flag;
break;

- case FIOASYNC:
+ case FIOASYNC: {
+ struct inode * inode;
if ((error = get_user(on, (int *)arg)) != 0)
break;
flag = on ? FASYNC : 0;
+ inode = filp->f_dentry->d_inode;
+ down(&inode->i_sem);

/* Did FASYNC state change ? */
if ((flag ^ filp->f_flags) & FASYNC) {
@@ -94,13 +97,16 @@
else error = -ENOTTY;
}
if (error != 0)
- break;
+ goto fioasync_out;

if (on)
filp->f_flags |= FASYNC;
else
filp->f_flags &= ~FASYNC;
+ fioasync_out:
+ up(&inode->i_sem);
break;
+ }

case FIOQSIZE:
if (S_ISDIR(filp->f_dentry->d_inode->i_mode) ||
===== fs/pipe.c 1.14 vs edited =====
--- 1.14/fs/pipe.c Wed Jun 12 03:23:20 2002
+++ edited/fs/pipe.c Thu Jul 11 23:23:58 2002
@@ -329,9 +329,7 @@
struct inode *inode = filp->f_dentry->d_inode;
int retval;

- down(PIPE_SEM(*inode));
retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
- up(PIPE_SEM(*inode));

if (retval < 0)
return retval;
@@ -346,9 +344,7 @@
struct inode *inode = filp->f_dentry->d_inode;
int retval;

- down(PIPE_SEM(*inode));
retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
- up(PIPE_SEM(*inode));

if (retval < 0)
return retval;
@@ -363,15 +359,11 @@
struct inode *inode = filp->f_dentry->d_inode;
int retval;

- down(PIPE_SEM(*inode));
-
retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));

if (retval >= 0)
retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));

- up(PIPE_SEM(*inode));
-
if (retval < 0)
return retval;

@@ -382,14 +374,18 @@
static int
pipe_read_release(struct inode *inode, struct file *filp)
{
+ down(PIPE_SEM(*inode));
pipe_read_fasync(-1, filp, 0);
+ up(PIPE_SEM(*inode));
return pipe_release(inode, 1, 0);
}

static int
pipe_write_release(struct inode *inode, struct file *filp)
{
+ down(PIPE_SEM(*inode));
pipe_write_fasync(-1, filp, 0);
+ up(PIPE_SEM(*inode));
return pipe_release(inode, 0, 1);
}

@@ -398,7 +394,9 @@
{
int decr, decw;

+ down(PIPE_SEM(*inode));
pipe_rdwr_fasync(-1, filp, 0);
+ up(PIPE_SEM(*inode));
decr = (filp->f_mode & FMODE_READ) != 0;
decw = (filp->f_mode & FMODE_WRITE) != 0;
return pipe_release(inode, decr, decw);


--
Marcus Alanen
[email protected]