>From 2.5 proposed diff to fix illegal access to inode in
ext3_journal_stop during writepage:
===== fs/ext3/acl.c 1.6 vs edited =====
--- 1.6/fs/ext3/acl.c Tue Feb 25 16:21:08 2003
+++ edited/fs/ext3/acl.c Thu Mar 20 17:18:19 2003
@@ -420,7 +420,7 @@
return PTR_ERR(handle);
}
error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
}
posix_acl_release(clone);
return error;
@@ -522,7 +522,7 @@
if (IS_ERR(handle))
return PTR_ERR(handle);
error = ext3_set_acl(handle, inode, type, acl);
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
release_and_out:
posix_acl_release(acl);
===== fs/ext3/inode.c 1.63 vs edited =====
--- 1.63/fs/ext3/inode.c Mon Mar 17 06:33:16 2003
+++ edited/fs/ext3/inode.c Thu Mar 20 17:19:55 2003
@@ -235,7 +235,7 @@
clear_inode(inode);
else
ext3_free_inode(handle, inode);
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
unlock_kernel();
return;
no_delete:
@@ -1107,7 +1107,7 @@
}
prepare_write_failed:
if (ret)
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
out:
unlock_kernel();
return ret;
@@ -1176,7 +1176,7 @@
if (!ret)
ret = ret2;
}
- ret2 = ext3_journal_stop(handle, inode);
+ ret2 = ext3_journal_stop(handle, inode->i_sb);
unlock_kernel();
if (!ret)
ret = ret2;
@@ -1299,6 +1299,7 @@
static int ext3_writepage(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
+ struct super_block *sb = inode->i_sb;
struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0, err;
@@ -1374,7 +1375,7 @@
PAGE_CACHE_SIZE, NULL, bput_one);
}
- err = ext3_journal_stop(handle, inode);
+ err = ext3_journal_stop(handle, sb);
if (!ret)
ret = err;
unlock_kernel();
@@ -1479,7 +1480,7 @@
ret = err;
}
}
- err = ext3_journal_stop(handle, inode);
+ err = ext3_journal_stop(handle, inode->i_sb);
if (ret == 0)
ret = err;
unlock_kernel();
@@ -2140,7 +2141,7 @@
if (inode->i_nlink)
ext3_orphan_del(handle, inode);
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
unlock_kernel();
}
@@ -2560,7 +2561,7 @@
rc = ext3_mark_inode_dirty(handle, inode);
if (!error)
error = rc;
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
}
rc = inode_setattr(inode, attr);
@@ -2737,7 +2738,7 @@
current_handle);
ext3_mark_inode_dirty(handle, inode);
}
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
out:
unlock_kernel();
}
@@ -2818,7 +2819,7 @@
err = ext3_mark_inode_dirty(handle, inode);
handle->h_sync = 1;
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
ext3_std_error(inode->i_sb, err);
return err;
===== fs/ext3/ioctl.c 1.7 vs edited =====
--- 1.7/fs/ext3/ioctl.c Mon Mar 17 06:33:16 2003
+++ edited/fs/ext3/ioctl.c Thu Mar 20 17:18:18 2003
@@ -90,7 +90,7 @@
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
if (err)
return err;
@@ -126,7 +126,7 @@
inode->i_generation = generation;
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
- ext3_journal_stop(handle, inode);
+ ext3_journal_stop(handle, inode->i_sb);
return err;
}
#ifdef CONFIG_JBD_DEBUG
===== fs/ext3/namei.c 1.36 vs edited =====
--- 1.36/fs/ext3/namei.c Fri Feb 28 20:13:20 2003
+++ edited/fs/ext3/namei.c Thu Mar 20 17:18:39 2003
@@ -1615,7 +1615,7 @@
inode->i_mapping->a_ops = &ext3_aops;
err = ext3_add_nondir(handle, dentry, inode);
}
- ext3_journal_stop(handle, dir);
+ ext3_journal_stop(handle, dir->i_sb);
unlock_kernel();
return err;
}
@@ -1647,7 +1647,7 @@
#endif
err = ext3_add_nondir(handle, dentry, inode);
}
- ext3_journal_stop(handle, dir);
+ ext3_journal_stop(handle, dir->i_sb);
unlock_kernel();
return err;
}
@@ -1721,7 +1721,7 @@
ext3_mark_inode_dirty(handle, dir);
d_instantiate(dentry, inode);
out_stop:
- ext3_journal_stop(handle, dir);
+ ext3_journal_stop(handle, dir->i_sb);
unlock_kernel();
return err;
}
@@ -1994,7 +1994,7 @@
ext3_mark_inode_dirty(handle, dir);
end_rmdir:
- ext3_journal_stop(handle, dir);
+ ext3_journal_stop(handle, dir->i_sb);
brelse (bh);
unlock_kernel();
return retval;
@@ -2050,7 +2050,7 @@
retval = 0;
end_unlink:
- ext3_journal_stop(handle, dir);
+ ext3_journal_stop(handle, dir->i_sb);
unlock_kernel();
brelse (bh);
return retval;
@@ -2109,7 +2109,7 @@
EXT3_I(inode)->i_disksize = inode->i_size;
err = ext3_add_nondir(handle, dentry, inode);
out_stop:
- ext3_journal_stop(handle, dir);
+ ext3_journal_stop(handle, dir->i_sb);
unlock_kernel();
return err;
}
@@ -2142,7 +2142,7 @@
atomic_inc(&inode->i_count);
err = ext3_add_nondir(handle, dentry, inode);
- ext3_journal_stop(handle, dir);
+ ext3_journal_stop(handle, dir->i_sb);
unlock_kernel();
return err;
}
@@ -2299,7 +2299,7 @@
brelse (dir_bh);
brelse (old_bh);
brelse (new_bh);
- ext3_journal_stop(handle, old_dir);
+ ext3_journal_stop(handle, old_dir->i_sb);
unlock_kernel();
return retval;
}
===== fs/ext3/super.c 1.53 vs edited =====
--- 1.53/fs/ext3/super.c Tue Mar 11 06:34:32 2003
+++ edited/fs/ext3/super.c Thu Mar 20 11:01:35 2003
@@ -1735,7 +1735,7 @@
tid_t target;
lock_kernel();
- sb->s_dirt = 0;
+ sb->s_need_sync_fs = 0;
target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
if (wait)
log_wait_commit(EXT3_SB(sb)->s_journal, target);
===== fs/ext3/xattr.c 1.11 vs edited =====
--- 1.11/fs/ext3/xattr.c Sat Mar 8 22:50:42 2003
+++ edited/fs/ext3/xattr.c Thu Mar 20 17:18:38 2003
@@ -854,7 +854,7 @@
else
error = ext3_xattr_set_handle(handle, inode, name_index, name,
value, value_len, flags);
- error2 = ext3_journal_stop(handle, inode);
+ error2 = ext3_journal_stop(handle, inode->i_sb);
unlock_kernel();
return error ? error : error2;
===== include/linux/ext3_jbd.h 1.9 vs edited =====
--- 1.9/include/linux/ext3_jbd.h Tue Feb 11 03:20:37 2003
+++ edited/include/linux/ext3_jbd.h Thu Mar 20 17:13:18 2003
@@ -217,20 +217,21 @@
* appropriate.
*/
static inline int __ext3_journal_stop(const char *where,
- handle_t *handle, struct inode *inode)
+ handle_t *handle,
+ struct super_block *sb)
{
int err = handle->h_err;
int rc = journal_stop(handle);
- inode->i_sb->s_dirt = 1;
+ sb->s_need_sync_fs = 1;
if (!err)
err = rc;
if (err)
- __ext3_std_error(inode->i_sb, where, err);
+ __ext3_std_error(sb, where, err);
return err;
}
-#define ext3_journal_stop(handle, inode) \
- __ext3_journal_stop(__FUNCTION__, (handle), (inode))
+#define ext3_journal_stop(handle, sb) \
+ __ext3_journal_stop(__FUNCTION__, (handle), (sb))
static inline handle_t *ext3_journal_current_handle(void)
{
"Stephen C. Tweedie" <[email protected]> wrote:
>
> Hi Andrew,
>
> The patch below addresses the problem we were talking about earlier
> where ext3_writepage ends up accessing the inode after the page lock has
> been dropped (and hence at a point where it is possible for the inode to
> have been reclaimed.) Tested minimally (it builds and boots.)
Burton has confirmed that removing the
inode->i_sb->s_dirt = 1;
line makes the oopses go away, so this will fix it.
> It makes ext3_journal_stop take an sb, not an inode, as its final
> parameter.
argh. I wrote and tested a patch too. That patch puts the superblock
pointer into the new journal->j_private and removes the second arg to
ext3_journal_start altogether.
I went that way just to save a little text. Because ext3_journal_start/stop
need to be uninlined - that saves 5.5 kbytes of text.
Which do you think is best? If you're planning on patching 2.4 and if you
want to do that by passing the superblock pointer in, then we should go that
way in 2.5 too, keep things in sync.
> It also sets sb->s_need_sync_fs, not sb->s_dirt, as setting
> s_dirt was only ever a workaround for the lack of a proper sync-fs
> mechanism.
>
> Btw, we clear s_need_sync_fs in sync_filesystems(). Don't we also need
> to do the same in fsync_super()?
The intent of s_need_sync_fs is to avoid livelock in sync_filesystems().
Imagine that two filesytems are being continually dirtied. It would be very,
very easy for a sync_filesystems() caller to never terminate. This is a
repeated problem with lists of dirty objects which need cleaning, in which
only the head-of-list is stable outside the lock.
So sync_filesystems() will tag all the filesystems on the first pass and then
only sync tagged filesytems on the second pass. No livelock. (This still
means that new sync() callers will cause older sync() callers to perform a
second round of syncing. I guess I should slap a mutex around the whole
thing to prevent that).
So s_need_sync_fs is "private to sync_fileystems". I should have commented
that, sorry.
sync_filesystems() will even call call ->sync_fs against non-s_dirt
filesystems, which seems a little odd.
The reason for this is that ext3_write_super() will clear s_dirt and _not_
wait on the writeout. So if sync_filesystems() happened to be called against
a filesystem shortly after its ->write_super() had been called,
sync_filesystems() would incorrectly assume that the filesystem was fully
synced.
I shall comment that, too.
Hi,
On Thu, 2003-03-20 at 21:15, Andrew Morton wrote:
> Burton has confirmed that removing the
>
> inode->i_sb->s_dirt = 1;
>
> line makes the oopses go away, so this will fix it.
Good.
> > It makes ext3_journal_stop take an sb, not an inode, as its final
> > parameter.
>
> argh. I wrote and tested a patch too. That patch puts the superblock
> pointer into the new journal->j_private and removes the second arg to
> ext3_journal_start altogether.
Well, there's still the
if (err)
__ext3_std_error(inode->i_sb, where, err);
case in ext3_journal_stop() to worry about, so we still need it; and I'd
much rather not hack this via j_private, when what we're doing at this
point is most definitely a fs-specific, not journal-related, operation.
> I went that way just to save a little text. Because ext3_journal_start/stop
> need to be uninlined - that saves 5.5 kbytes of text.
Agreed.
> Which do you think is best? If you're planning on patching 2.4 and if you
> want to do that by passing the superblock pointer in, then we should go that
> way in 2.5 too, keep things in sync.
I was wondering why we've never seen this on 2.4, even with slab
poisoning enabled. But I think the vulnerability exists on 2.4 too, so
yes, we ought to keep the two in sync.
> > It also sets sb->s_need_sync_fs, not sb->s_dirt, as setting
> > s_dirt was only ever a workaround for the lack of a proper sync-fs
> > mechanism.
> >
> > Btw, we clear s_need_sync_fs in sync_filesystems(). Don't we also need
> > to do the same in fsync_super()?
>
> The intent of s_need_sync_fs is to avoid livelock in sync_filesystems().
...
Well, the intent of the s_dirt was to force a call to ext3_write_super
when the fs was dirty, back before the days when we had a sync_fs()
method at all. Now that we have the latter, it sounds like we should
actually just drop the line which sets s_dirt in ext3_journal_stop
entirely, because sync will always call the new sync_fs which will do
the commit that we need.
We still have the error handling path in ext3_journal_stop so we can't
avoid having to find the sb, so _some_ rejigging is still needed.
Cheers,
Stephen
"Stephen C. Tweedie" <[email protected]> wrote:
>
> Well, there's still the
>
> if (err)
> __ext3_std_error(inode->i_sb, where, err);
>
> case in ext3_journal_stop() to worry about
We already have that.
int __ext3_journal_stop(const char *where, handle_t *handle)
{
struct super_block *sb;
int err;
int rc;
sb = handle->h_transaction->t_journal->j_private;
err = handle->h_err;
rc = journal_stop(handle);
sb->s_dirt = 1;
if (!err)
err = rc;
if (err)
__ext3_std_error(sb, where, err);
return err;
}
> , so we still need it; and I'd
> much rather not hack this via j_private, when what we're doing at this
> point is most definitely a fs-specific, not journal-related, operation.
I don't think it's a hack at all. ext3 owns the journal and there is plenty
of precedent for putting owner-private things into owned objects.
But I'm not particularly fussed either way - it will only be 100-200 bytes of
code saved.
> I was wondering why we've never seen this on 2.4, even with slab
> poisoning enabled. But I think the vulnerability exists on 2.4 too, so
> yes, we ought to keep the two in sync.
It could be due to differences in inode reclaim. If 2.4 sees an inode with
attached pages it will skip it. 2.5 will instead detach the pages and free
the inode.
And writepage() doesn't get used much in 2.4.
Plus this bug was hit on a preemptible kernel where the timing windows are
much wider.
> Well, the intent of the s_dirt was to force a call to ext3_write_super
> when the fs was dirty, back before the days when we had a sync_fs()
> method at all. Now that we have the latter, it sounds like we should
> actually just drop the line which sets s_dirt in ext3_journal_stop
> entirely, because sync will always call the new sync_fs which will do
> the commit that we need.
OK.
> We still have the error handling path in ext3_journal_stop so we can't
> avoid having to find the sb, so _some_ rejigging is still needed.
That is available from the handle. (And via
ext3_journal_current_handle()->j_private, even).
The journal and the superblock have a definite one-to-one relationship - I think the
backpointer makes sense. But whatever - I'll let you flip that coin.
Hi,On Fri, 2003-03-21 at 00:12, Andrew Morton wrote:
> > Well, there's still the
> > if (err)
> > __ext3_std_error(inode->i_sb, where, err);
> > case in ext3_journal_stop() to worry about
>
> We already have that.
Only if we fix the underlying problem --- I was only pointing out that
even if we drop the setting of s_dirt entirely, which was what we were
trying to fix, we can't avoid having to find the sb.
> But I'm not particularly fussed either way - it will only be 100-200 bytes of
> code saved.
Yep, but there are probably other places we can find where we can avoid
passing the sb around too if we have the back-pointer. I guess it makes
sense to go ahead with that.
> The journal and the superblock have a definite one-to-one relationship - I think the
> backpointer makes sense. But whatever - I'll let you flip that coin.
OK, go for it and I'll merge for 2.4.
Cheers,
Stephen