2002-08-28 15:44:19

by Stephen C. Tweedie

[permalink] [raw]
Subject: [Patch 5/8] 2.4.20-pre4/ext3: Fix O_SYNC for non-data-journaled modes.

ext3 has its own code which marks buffers dirty, in addition to the setting
done by the core generic_commit_write code. However, the core code does

if (!atomic_set_buffer_dirty(bh)) {
__mark_dirty(bh);
buffer_insert_inode_queue(bh, inode);

so if ext3 marks the buffer dirty itself, the core fails to put it on the
per-inode list of dirty buffers. Hence, fsync_inode_buffers() misses it.

The fix is to let ext3 put the buffer on the inode queue manually when
walking the page's buffer lists in its page write code.

--- linux-ext3-2.4merge/fs/ext3/inode.c.=K0006=.orig Tue Aug 27 23:19:57 2002
+++ linux-ext3-2.4merge/fs/ext3/inode.c Tue Aug 27 23:19:57 2002
@@ -949,11 +949,13 @@
}

static int walk_page_buffers( handle_t *handle,
+ struct inode *inode,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
int (*fn)( handle_t *handle,
+ struct inode *inode,
struct buffer_head *bh))
{
struct buffer_head *bh;
@@ -971,7 +973,7 @@
*partial = 1;
continue;
}
- err = (*fn)(handle, bh);
+ err = (*fn)(handle, inode, bh);
if (!ret)
ret = err;
}
@@ -1004,7 +1006,7 @@
* write.
*/

-static int do_journal_get_write_access(handle_t *handle,
+static int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
return ext3_journal_get_write_access(handle, bh);
@@ -1030,7 +1032,7 @@
goto prepare_write_failed;

if (ext3_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page->buffers,
+ ret = walk_page_buffers(handle, inode, page->buffers,
from, to, NULL, do_journal_get_write_access);
if (ret) {
/*
@@ -1051,24 +1053,32 @@
return ret;
}

-static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
+static int journal_dirty_sync_data(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
- return ext3_journal_dirty_data(handle, bh, 0);
+ int ret = ext3_journal_dirty_data(handle, bh, 0);
+ if (bh->b_inode != inode)
+ buffer_insert_inode_data_queue(bh, inode);
+ return ret;
}

/*
* For ext3_writepage(). We also brelse() the buffer to account for
* the bget() which ext3_writepage() performs.
*/
-static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
+static int journal_dirty_async_data(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
int ret = ext3_journal_dirty_data(handle, bh, 1);
+ if (bh->b_inode != inode)
+ buffer_insert_inode_data_queue(bh, inode);
__brelse(bh);
return ret;
}

/* For commit_write() in data=journal mode */
-static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+static int commit_write_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
set_bit(BH_Uptodate, &bh->b_state);
return ext3_journal_dirty_metadata(handle, bh);
@@ -1103,7 +1113,7 @@
int partial = 0;
loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

- ret = walk_page_buffers(handle, page->buffers,
+ ret = walk_page_buffers(handle, inode, page->buffers,
from, to, &partial, commit_write_fn);
if (!partial)
SetPageUptodate(page);
@@ -1113,7 +1123,7 @@
EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
} else {
if (ext3_should_order_data(inode)) {
- ret = walk_page_buffers(handle, page->buffers,
+ ret = walk_page_buffers(handle, inode, page->buffers,
from, to, NULL, journal_dirty_sync_data);
}
/* Be careful here if generic_commit_write becomes a
@@ -1195,7 +1205,8 @@
return generic_block_bmap(mapping,block,ext3_get_block);
}

-static int bget_one(handle_t *handle, struct buffer_head *bh)
+static int bget_one(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
atomic_inc(&bh->b_count);
return 0;
@@ -1294,7 +1305,7 @@
create_empty_buffers(page,
inode->i_dev, inode->i_sb->s_blocksize);
page_buffers = page->buffers;
- walk_page_buffers(handle, page_buffers, 0,
+ walk_page_buffers(handle, inode, page_buffers, 0,
PAGE_CACHE_SIZE, NULL, bget_one);
}

@@ -1312,7 +1323,7 @@

/* And attach them to the current transaction */
if (order_data) {
- err = walk_page_buffers(handle, page_buffers,
+ err = walk_page_buffers(handle, inode, page_buffers,
0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
if (!ret)
ret = err;


2002-08-28 16:15:07

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [Patch 5/8] 2.4.20-pre4/ext3: Fix O_SYNC for non-data-journaled modes.

On Wed, Aug 28, 2002 at 04:45:20PM +0100, Stephen Tweedie wrote:
> ext3 has its own code which marks buffers dirty, in addition to the setting
> done by the core generic_commit_write code. However, the core code does
>
> if (!atomic_set_buffer_dirty(bh)) {
> __mark_dirty(bh);
> buffer_insert_inode_queue(bh, inode);
>
> so if ext3 marks the buffer dirty itself, the core fails to put it on the
> per-inode list of dirty buffers. Hence, fsync_inode_buffers() misses it.
>
> The fix is to let ext3 put the buffer on the inode queue manually when
> walking the page's buffer lists in its page write code.

This patch conflicts with the b_inode as bool patch you recently ACKed..

2002-08-28 16:22:32

by Stephen C. Tweedie

[permalink] [raw]
Subject: Re: [Patch 5/8] 2.4.20-pre4/ext3: Fix O_SYNC for non-data-journaled modes.

Hi,

On Wed, Aug 28, 2002 at 05:18:13PM +0100, Christoph Hellwig wrote:
> On Wed, Aug 28, 2002 at 04:45:20PM +0100, Stephen Tweedie wrote:
> > The fix is to let ext3 put the buffer on the inode queue manually when
> > walking the page's buffer lists in its page write code.
>
> This patch conflicts with the b_inode as bool patch you recently ACKed..

Do you have a pointer to the most recent version of that patch you
were going to submit? I've got one final batch of ext3-related things
to submit, containing no bug-fixes but only tweaks and new features
(eg. allowing you to specify the commit interval per-filesystem). I
can merge the b_inode diff for that batch if you want.

--Stephen

2002-08-28 16:24:29

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [Patch 5/8] 2.4.20-pre4/ext3: Fix O_SYNC for non-data-journaled modes.

On Wed, Aug 28, 2002 at 05:26:42PM +0100, Stephen C. Tweedie wrote:
> Do you have a pointer to the most recent version of that patch you
> were going to submit? I've got one final batch of ext3-related things
> to submit, containing no bug-fixes but only tweaks and new features
> (eg. allowing you to specify the commit interval per-filesystem). I
> can merge the b_inode diff for that batch if you want.

Somewhere on lkml or below:


diff -uNr -Xdontdiff -p linux-2.4.20-pre4/fs/buffer.c linux/fs/buffer.c
--- linux-2.4.20-pre4/fs/buffer.c Tue Aug 13 15:56:00 2002
+++ linux/fs/buffer.c Sun Aug 25 19:28:55 2002
@@ -583,37 +583,29 @@ struct buffer_head * get_hash_table(kdev
return bh;
}

-void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
+void buffer_insert_list(struct buffer_head *bh, struct list_head *list)
{
spin_lock(&lru_list_lock);
- if (bh->b_inode)
+ if (buffer_attached(bh))
list_del(&bh->b_inode_buffers);
- bh->b_inode = inode;
- list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
+ set_buffer_attached(bh);
+ list_add(&bh->b_inode_buffers, list);
spin_unlock(&lru_list_lock);
}

-void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
-{
- spin_lock(&lru_list_lock);
- if (bh->b_inode)
- list_del(&bh->b_inode_buffers);
- bh->b_inode = inode;
- list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
- spin_unlock(&lru_list_lock);
-}
-
-/* The caller must have the lru_list lock before calling the
- remove_inode_queue functions. */
+/*
+ * The caller must have the lru_list lock before calling the
+ * remove_inode_queue functions.
+ */
static void __remove_inode_queue(struct buffer_head *bh)
{
- bh->b_inode = NULL;
list_del(&bh->b_inode_buffers);
+ clear_buffer_attached(bh);
}

static inline void remove_inode_queue(struct buffer_head *bh)
{
- if (bh->b_inode)
+ if (buffer_attached(bh))
__remove_inode_queue(bh);
}

@@ -827,10 +819,10 @@ inline void set_buffer_async_io(struct b
int fsync_buffers_list(struct list_head *list)
{
struct buffer_head *bh;
- struct inode tmp;
+ struct list_head tmp;
int err = 0, err2;

- INIT_LIST_HEAD(&tmp.i_dirty_buffers);
+ INIT_LIST_HEAD(&tmp);

spin_lock(&lru_list_lock);

@@ -838,10 +830,10 @@ int fsync_buffers_list(struct list_head
bh = BH_ENTRY(list->next);
list_del(&bh->b_inode_buffers);
if (!buffer_dirty(bh) && !buffer_locked(bh))
- bh->b_inode = NULL;
+ clear_buffer_attached(bh);
else {
- bh->b_inode = &tmp;
- list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
+ set_buffer_attached(bh);
+ list_add(&bh->b_inode_buffers, &tmp);
if (buffer_dirty(bh)) {
get_bh(bh);
spin_unlock(&lru_list_lock);
@@ -861,8 +853,8 @@ int fsync_buffers_list(struct list_head
}
}

- while (!list_empty(&tmp.i_dirty_buffers)) {
- bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
+ while (!list_empty(&tmp)) {
+ bh = BH_ENTRY(tmp.prev);
remove_inode_queue(bh);
get_bh(bh);
spin_unlock(&lru_list_lock);
@@ -1134,7 +1126,7 @@ struct buffer_head * bread(kdev_t dev, i
*/
static void __put_unused_buffer_head(struct buffer_head * bh)
{
- if (bh->b_inode)
+ if (unlikely(buffer_attached(bh)))
BUG();
if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
kmem_cache_free(bh_cachep, bh);
diff -uNr -Xdontdiff -p linux-2.4.20-pre4/fs/reiserfs/inode.c linux/fs/reiserfs/inode.c
--- linux-2.4.20-pre4/fs/reiserfs/inode.c Tue Aug 13 15:56:01 2002
+++ linux/fs/reiserfs/inode.c Tue Aug 20 11:39:48 2002
@@ -102,9 +102,9 @@ inline void make_le_item_head (struct it
}

static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) {
- struct inode *jinode = &(SB_JOURNAL(inode->i_sb)->j_dummy_inode) ;
+ struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;

- buffer_insert_inode_queue(bh, jinode) ;
+ buffer_insert_list(bh, &j->j_dirty_buffers) ;
}

//
diff -uNr -Xdontdiff -p linux-2.4.20-pre4/fs/reiserfs/journal.c linux/fs/reiserfs/journal.c
--- linux-2.4.20-pre4/fs/reiserfs/journal.c Sat Aug 17 14:54:39 2002
+++ linux/fs/reiserfs/journal.c Tue Aug 20 11:39:48 2002
@@ -1937,7 +1937,7 @@ int journal_init(struct super_block *p_s
memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */

INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ;
- INIT_LIST_HEAD(&(SB_JOURNAL(p_s_sb)->j_dummy_inode.i_dirty_buffers)) ;
+ INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_dirty_buffers) ;
reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap,
SB_BMAP_NR(p_s_sb)) ;
allocate_bitmap_nodes(p_s_sb) ;
@@ -2933,7 +2933,7 @@ printk("journal-2020: do_journal_end: BA
SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ;

/* write any buffers that must hit disk before this commit is done */
- fsync_inode_buffers(&(SB_JOURNAL(p_s_sb)->j_dummy_inode)) ;
+ fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;

/* honor the flush and async wishes from the caller */
if (flush) {
diff -uNr -Xdontdiff -p linux-2.4.20-pre4/include/linux/fs.h linux/include/linux/fs.h
--- linux-2.4.20-pre4/include/linux/fs.h Tue Aug 20 11:37:00 2002
+++ linux/include/linux/fs.h Sun Aug 25 19:20:22 2002
@@ -219,6 +219,7 @@ enum bh_state_bits {
BH_Async, /* 1 if the buffer is under end_buffer_io_async I/O */
BH_Wait_IO, /* 1 if we should write out this buffer */
BH_Launder, /* 1 if we can throttle on this buffer */
+ BH_Attached, /* 1 if b_inode_buffers is linked into a list */
BH_JBD, /* 1 if it has an attached journal_head */

BH_PrivateStart,/* not a state bit, but the first bit available
@@ -266,7 +267,6 @@ struct buffer_head {
unsigned long b_rsector; /* Real buffer location on disk */
wait_queue_head_t b_wait;

- struct inode * b_inode;
struct list_head b_inode_buffers; /* doubly linked list of inode dirty buffers */
};

@@ -1167,8 +1167,18 @@ static inline void mark_buffer_clean(str
extern void FASTCALL(__mark_dirty(struct buffer_head *bh));
extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh));
extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
-extern void FASTCALL(buffer_insert_inode_queue(struct buffer_head *, struct inode *));
-extern void FASTCALL(buffer_insert_inode_data_queue(struct buffer_head *, struct inode *));
+
+extern void FASTCALL(buffer_insert_list(struct buffer_head *, struct list_head *));
+
+static inline void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
+{
+ buffer_insert_list(bh, &inode->i_dirty_buffers);
+}
+
+static inline void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
+{
+ buffer_insert_list(bh, &inode->i_dirty_data_buffers);
+}

static inline int atomic_set_buffer_dirty(struct buffer_head *bh)
{
@@ -1183,6 +1193,21 @@ static inline void mark_buffer_async(str
clear_bit(BH_Async, &bh->b_state);
}

+static inline void set_buffer_attached(struct buffer_head *bh)
+{
+ __set_bit(BH_Attached, &bh->b_state);
+}
+
+static inline void clear_buffer_attached(struct buffer_head *bh)
+{
+ clear_bit(BH_Attached, &bh->b_state);
+}
+
+static inline int buffer_attached(struct buffer_head *bh)
+{
+ return test_bit(BH_Attached, &bh->b_state);
+}
+
/*
* If an error happens during the make_request, this function
* has to be recalled. It marks the buffer as clean and not
diff -uNr -Xdontdiff -p linux-2.4.20-pre4/include/linux/reiserfs_fs_sb.h linux/include/linux/reiserfs_fs_sb.h
--- linux-2.4.20-pre4/include/linux/reiserfs_fs_sb.h Tue Aug 13 15:56:04 2002
+++ linux/include/linux/reiserfs_fs_sb.h Tue Aug 20 15:26:38 2002
@@ -312,7 +312,7 @@ struct reiserfs_journal {
int j_free_bitmap_nodes ;
int j_used_bitmap_nodes ;
struct list_head j_bitmap_nodes ;
- struct inode j_dummy_inode ;
+ struct list_head j_dirty_buffers ;
struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ; /* array of bitmaps to record the deleted blocks */
struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ; /* array of all the journal lists */
struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for real buffer heads in current trans */
diff -uNr -Xdontdiff -p linux-2.4.20-pre4/kernel/ksyms.c linux/kernel/ksyms.c
--- linux-2.4.20-pre4/kernel/ksyms.c Tue Aug 13 15:56:05 2002
+++ linux/kernel/ksyms.c Sun Aug 25 19:32:04 2002
@@ -525,8 +527,7 @@ EXPORT_SYMBOL(get_hash_table);
EXPORT_SYMBOL(get_empty_inode);
EXPORT_SYMBOL(insert_inode_hash);
EXPORT_SYMBOL(remove_inode_hash);
-EXPORT_SYMBOL(buffer_insert_inode_queue);
-EXPORT_SYMBOL(buffer_insert_inode_data_queue);
+EXPORT_SYMBOL(buffer_insert_list);
EXPORT_SYMBOL(make_bad_inode);
EXPORT_SYMBOL(is_bad_inode);
EXPORT_SYMBOL(event);