Hi,
The metadata block reservation counter overflows with data write
on ext4 (indirect block map) when its disk space is almost full.
This overflow triggers following BUG_ON.
Jan 14 09:36:48 TNESG9423 kernel: ------------[ cut here ]------------
Jan 14 09:36:48 TNESG9423 kernel: kernel BUG at fs/ext4/inode.c:2170!
Jan 14 09:36:48 TNESG9423 kernel: invalid opcode: 0000 [#1] SMP
Jan 14 09:36:48 TNESG9423 kernel: last sysfs file: /sys/kernel/mm/ksm/run
Jan 14 09:36:48 TNESG9423 kernel: CPU 0
Jan 14 09:36:48 TNESG9423 kernel: Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4
xt_state nf_conntrack ipt_REJECT bridge stp llc autofs4 sunrpc p4_clockmod freq_table speedstep_lib ipv6 xt_physdev iptable_filter ip_tables nls_utf8 dm_mirror dm_region_hash dm_log dm_mod kvm_intel
kvm uinput ppdev parport_pc parport sg pcspkr i2c_i801 iTCO_wdt iTCO_vendor_support snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer snd soundcore
snd_page_alloc e1000e ext3 jbd sd_mod crc_t10dif sr_mod cdrom pata_via pata_acpi ata_generic ata_piix i915 drm_kms_helper drm i2c_algo_bit i2c_core video output [last unloaded: mperf]
Jan 14 09:36:48 TNESG9423 kernel:
Jan 14 09:36:48 TNESG9423 kernel: Pid: 937, comm: flush-8:0 Not tainted 2.6.37 #1 MS-7264BLM/PC-MJ18ABZR4
Jan 14 09:36:48 TNESG9423 kernel: RIP: 0010:[<ffffffff811c5298>] [<ffffffff811c5298>] ext4_da_block_invalidatepages+0x168/0x180
Jan 14 09:36:48 TNESG9423 kernel: RSP: 0018:ffff88007613f780 EFLAGS: 00010246
Jan 14 09:36:48 TNESG9423 kernel: RAX: 0010000000000024 RBX: 0000000000008cf2 RCX: 000000000000000e
Jan 14 09:36:48 TNESG9423 kernel: RDX: 000000000000000e RSI: 0000000000000001 RDI: ffffea0000a70d30
Jan 14 09:36:48 TNESG9423 kernel: RBP: ffff88007613f850 R08: 0000000000000001 R09: 0000000000000002
Jan 14 09:36:48 TNESG9423 kernel: R10: ffffea0000a70d38 R11: ffff880035f01b58 R12: ffff88007613f7a0
Jan 14 09:36:48 TNESG9423 kernel: R13: ffff880065eecd68 R14: ffff88007613f7b8 R15: ffffea0000a70a58
Jan 14 09:36:48 TNESG9423 kernel: FS: 0000000000000000(0000) GS:ffff88007f400000(0000) knlGS:0000000000000000
Jan 14 09:36:48 TNESG9423 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
Jan 14 09:36:48 TNESG9423 kernel: CR2: 0000003680ae1560 CR3: 000000004da45000 CR4: 00000000000006f0
Jan 14 09:36:48 TNESG9423 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jan 14 09:36:48 TNESG9423 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Jan 14 09:36:48 TNESG9423 kernel: Process flush-8:0 (pid: 937, threadinfo ffff88007613e000, task ffff8800371b54e0)
Jan 14 09:36:48 TNESG9423 kernel: Stack:
Jan 14 09:36:48 TNESG9423 kernel: ffff88007613f7e0 ffffffff814ee3d6 0000000000000008 0000000e7613f7f0
Jan 14 09:36:48 TNESG9423 kernel: 000000000000000e 000000003741a4b9 ffffea0000a70a58 ffffea0000a70a90
Jan 14 09:36:48 TNESG9423 kernel: ffffea0000a70ac8 ffffea0000a70b00 ffffea0000a70b38 ffffea0000a70b70
Jan 14 09:36:48 TNESG9423 kernel: Call Trace:
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff814ee3d6>] ? printk+0x41/0x43
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811c9fe4>] mpage_da_map_and_submit+0x274/0x470
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811ca24d>] mpage_add_bh_to_extent+0x6d/0xf0
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811ca5a0>] write_cache_pages_da+0x2d0/0x4a0
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811caa4c>] ext4_da_writepages+0x2dc/0x650
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81105321>] do_writepages+0x21/0x40
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810fb55b>] __filemap_fdatawrite_range+0x5b/0x60
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810fb833>] filemap_fdatawrite_range+0x13/0x20
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811fb6ce>] jbd2_journal_begin_ordered_truncate+0x8e/0xb0
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811cd02b>] ext4_evict_inode+0x23b/0x3b0
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81167477>] evict+0x27/0xc0
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81167a4b>] iput+0x1bb/0x2a0
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811734a4>] writeback_sb_inodes+0x104/0x180
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81173cfd>] writeback_inodes_wb+0x9d/0x160
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8117404b>] wb_writeback+0x28b/0x400
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810714cc>] ? lock_timer_base+0x3c/0x70
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81072092>] ? del_timer_sync+0x22/0x30
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81174257>] wb_do_writeback+0x97/0x1e0
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81174452>] bdi_writeback_thread+0xb2/0x270
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811743a0>] ? bdi_writeback_thread+0x0/0x270
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811743a0>] ? bdi_writeback_thread+0x0/0x270
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81083356>] kthread+0x96/0xa0
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8100ce84>] kernel_thread_helper+0x4/0x10
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810832c0>] ? kthread+0x0/0xa0
Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8100ce80>] ? kernel_thread_helper+0x0/0x10
Jan 14 09:36:48 TNESG9423 kernel: Code: a8 00 00 00 5b 41 5c 41 5d 41 5e 41 5f c9 c3 0f 1f 40 00 4c 89 e7 48 89 95 40 ff ff ff e8 01 1b f4 ff 48 8b 95 40 ff ff ff eb c9 <0f> 0b eb fe 0f 0b 66 90 eb fc
66 66 66 66 66 2e 0f 1f 84 00 00
Jan 14 09:36:48 TNESG9423 kernel: RIP [<ffffffff811c5298>] ext4_da_block_invalidatepages+0x168/0x180
Jan 14 09:36:48 TNESG9423 kernel: RSP <ffff88007613f780>
Jan 14 09:36:48 TNESG9423 kernel: ---[ end trace 0496eaed3b9ec629 ]---
To fix this, I referred to the patch which is for data blocks
reservation counter (commit: ef627929781c98113e6ae93f159dd3c12a884ad8)
and made a following patch which prints metadata block inconsistency
and corrects it.
My patch is trial, if you have better idea, feel free to fix this bug.
# You can reproduce this problem with attached programs.
# In my environment, this occurs in 1 minute.
Signed-off-by: Akira Fujita <[email protected]>
---
fs/ext4/inode.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff -Nrup -X linux-2.6.37-org/Documentation/dontdiff linux-2.6.37-org/fs/ext4/inode.c linux-2.6.37/fs/ext4/inode.c
--- linux-2.6.37-org/fs/ext4/inode.c 2011-01-17 15:47:59.000000000 +0900
+++ linux-2.6.37/fs/ext4/inode.c 2011-01-17 15:52:25.000000000 +0900
@@ -1127,6 +1127,16 @@ void ext4_da_update_reserve_space(struct
used = ei->i_reserved_data_blocks;
}
+ if (unlikely(ei->i_allocated_meta_blocks >
+ ei->i_reserved_meta_blocks)) {
+ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, "
+ "meta blocks %d with only %d reserved meta blocks\n",
+ __func__, inode->i_ino, ei->i_allocated_meta_blocks,
+ ei->i_reserved_meta_blocks);
+ WARN_ON(1);
+ ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
+ }
+
/* Update per-inode reservations */
ei->i_reserved_data_blocks -= used;
ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
On Mon, 17 Jan 2011 17:29:42 +0900, Akira Fujita <[email protected]> wrote:
> Hi,
>
> The metadata block reservation counter overflows with data write
> on ext4 (indirect block map) when its disk space is almost full.
> This overflow triggers following BUG_ON.
>
> Jan 14 09:36:48 TNESG9423 kernel: ------------[ cut here ]------------
> Jan 14 09:36:48 TNESG9423 kernel: kernel BUG at fs/ext4/inode.c:2170!
> Jan 14 09:36:48 TNESG9423 kernel: invalid opcode: 0000 [#1] SMP
> Jan 14 09:36:48 TNESG9423 kernel: last sysfs file: /sys/kernel/mm/ksm/run
> Jan 14 09:36:48 TNESG9423 kernel: CPU 0
> Jan 14 09:36:48 TNESG9423 kernel: Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4
> xt_state nf_conntrack ipt_REJECT bridge stp llc autofs4 sunrpc p4_clockmod freq_table speedstep_lib ipv6 xt_physdev iptable_filter ip_tables nls_utf8 dm_mirror dm_region_hash dm_log dm_mod kvm_intel
> kvm uinput ppdev parport_pc parport sg pcspkr i2c_i801 iTCO_wdt iTCO_vendor_support snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer snd soundcore
> snd_page_alloc e1000e ext3 jbd sd_mod crc_t10dif sr_mod cdrom pata_via pata_acpi ata_generic ata_piix i915 drm_kms_helper drm i2c_algo_bit i2c_core video output [last unloaded: mperf]
> Jan 14 09:36:48 TNESG9423 kernel:
> Jan 14 09:36:48 TNESG9423 kernel: Pid: 937, comm: flush-8:0 Not tainted 2.6.37 #1 MS-7264BLM/PC-MJ18ABZR4
> Jan 14 09:36:48 TNESG9423 kernel: RIP: 0010:[<ffffffff811c5298>] [<ffffffff811c5298>] ext4_da_block_invalidatepages+0x168/0x180
> Jan 14 09:36:48 TNESG9423 kernel: RSP: 0018:ffff88007613f780 EFLAGS: 00010246
> Jan 14 09:36:48 TNESG9423 kernel: RAX: 0010000000000024 RBX: 0000000000008cf2 RCX: 000000000000000e
> Jan 14 09:36:48 TNESG9423 kernel: RDX: 000000000000000e RSI: 0000000000000001 RDI: ffffea0000a70d30
> Jan 14 09:36:48 TNESG9423 kernel: RBP: ffff88007613f850 R08: 0000000000000001 R09: 0000000000000002
> Jan 14 09:36:48 TNESG9423 kernel: R10: ffffea0000a70d38 R11: ffff880035f01b58 R12: ffff88007613f7a0
> Jan 14 09:36:48 TNESG9423 kernel: R13: ffff880065eecd68 R14: ffff88007613f7b8 R15: ffffea0000a70a58
> Jan 14 09:36:48 TNESG9423 kernel: FS: 0000000000000000(0000) GS:ffff88007f400000(0000) knlGS:0000000000000000
> Jan 14 09:36:48 TNESG9423 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> Jan 14 09:36:48 TNESG9423 kernel: CR2: 0000003680ae1560 CR3: 000000004da45000 CR4: 00000000000006f0
> Jan 14 09:36:48 TNESG9423 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> Jan 14 09:36:48 TNESG9423 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> Jan 14 09:36:48 TNESG9423 kernel: Process flush-8:0 (pid: 937, threadinfo ffff88007613e000, task ffff8800371b54e0)
> Jan 14 09:36:48 TNESG9423 kernel: Stack:
> Jan 14 09:36:48 TNESG9423 kernel: ffff88007613f7e0 ffffffff814ee3d6 0000000000000008 0000000e7613f7f0
> Jan 14 09:36:48 TNESG9423 kernel: 000000000000000e 000000003741a4b9 ffffea0000a70a58 ffffea0000a70a90
> Jan 14 09:36:48 TNESG9423 kernel: ffffea0000a70ac8 ffffea0000a70b00 ffffea0000a70b38 ffffea0000a70b70
> Jan 14 09:36:48 TNESG9423 kernel: Call Trace:
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff814ee3d6>] ? printk+0x41/0x43
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811c9fe4>] mpage_da_map_and_submit+0x274/0x470
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811ca24d>] mpage_add_bh_to_extent+0x6d/0xf0
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811ca5a0>] write_cache_pages_da+0x2d0/0x4a0
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811caa4c>] ext4_da_writepages+0x2dc/0x650
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81105321>] do_writepages+0x21/0x40
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810fb55b>] __filemap_fdatawrite_range+0x5b/0x60
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810fb833>] filemap_fdatawrite_range+0x13/0x20
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811fb6ce>] jbd2_journal_begin_ordered_truncate+0x8e/0xb0
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811cd02b>] ext4_evict_inode+0x23b/0x3b0
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81167477>] evict+0x27/0xc0
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81167a4b>] iput+0x1bb/0x2a0
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811734a4>] writeback_sb_inodes+0x104/0x180
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81173cfd>] writeback_inodes_wb+0x9d/0x160
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8117404b>] wb_writeback+0x28b/0x400
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810714cc>] ? lock_timer_base+0x3c/0x70
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81072092>] ? del_timer_sync+0x22/0x30
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81174257>] wb_do_writeback+0x97/0x1e0
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81174452>] bdi_writeback_thread+0xb2/0x270
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811743a0>] ? bdi_writeback_thread+0x0/0x270
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811743a0>] ? bdi_writeback_thread+0x0/0x270
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81083356>] kthread+0x96/0xa0
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8100ce84>] kernel_thread_helper+0x4/0x10
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810832c0>] ? kthread+0x0/0xa0
> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8100ce80>] ? kernel_thread_helper+0x0/0x10
> Jan 14 09:36:48 TNESG9423 kernel: Code: a8 00 00 00 5b 41 5c 41 5d 41 5e 41 5f c9 c3 0f 1f 40 00 4c 89 e7 48 89 95 40 ff ff ff e8 01 1b f4 ff 48 8b 95 40 ff ff ff eb c9 <0f> 0b eb fe 0f 0b 66 90 eb fc
> 66 66 66 66 66 2e 0f 1f 84 00 00
> Jan 14 09:36:48 TNESG9423 kernel: RIP [<ffffffff811c5298>] ext4_da_block_invalidatepages+0x168/0x180
> Jan 14 09:36:48 TNESG9423 kernel: RSP <ffff88007613f780>
> Jan 14 09:36:48 TNESG9423 kernel: ---[ end trace 0496eaed3b9ec629 ]---
>
> To fix this, I referred to the patch which is for data blocks
> reservation counter (commit: ef627929781c98113e6ae93f159dd3c12a884ad8)
I cant find that commit in linus's, and tytso's tries, can you please
specify where it can be found.
> and made a following patch which prints metadata block inconsistency
> and corrects it.
> My patch is trial, if you have better idea, feel free to fix this bug.
>
> # You can reproduce this problem with attached programs.
> # In my environment, this occurs in 1 minute.
>
> Signed-off-by: Akira Fujita <[email protected]>
> ---
> fs/ext4/inode.c | 10 ++++++++++
> 1 file changed, 10 insertions(+)
> diff -Nrup -X linux-2.6.37-org/Documentation/dontdiff linux-2.6.37-org/fs/ext4/inode.c linux-2.6.37/fs/ext4/inode.c
> --- linux-2.6.37-org/fs/ext4/inode.c 2011-01-17 15:47:59.000000000 +0900
> +++ linux-2.6.37/fs/ext4/inode.c 2011-01-17 15:52:25.000000000 +0900
> @@ -1127,6 +1127,16 @@ void ext4_da_update_reserve_space(struct
> used = ei->i_reserved_data_blocks;
> }
>
> + if (unlikely(ei->i_allocated_meta_blocks >
> + ei->i_reserved_meta_blocks)) {
> + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, "
> + "meta blocks %d with only %d reserved meta blocks\n",
> + __func__, inode->i_ino, ei->i_allocated_meta_blocks,
> + ei->i_reserved_meta_blocks);
> + WARN_ON(1);
> + ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
> + }
> +
The situation where allocated > reservation is a very nasty bug and
means what reservation is broken, so next moment result in user data
drop/corruption, due to ENOSPC, so we definitely have to investigate
the root of cause. BTW i cant reproduce the bug with your test
on 2.6.34.7-63.fc13 which is really old, but still.
Which version should i use to reproduce the issue?
> /* Update per-inode reservations */
> ei->i_reserved_data_blocks -= used;
> ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
Attachment: metadata_blocks_overflow.sh (application/x-shellscript)
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <string.h>
> #include <sys/file.h>
> #include <sys/mman.h>
> #include <sys/uio.h>
> #include <sys/wait.h>
> #include <errno.h>
> #include <malloc.h>
>
> #define BUFSIZE 4096
> #define PATH_MAX 4096
> #define MINIOSIZE 4*1024
> #define MAXIOSIZE 4*1024*1024
>
> #ifndef O_DIRECT
> #define O_DIRECT 040000
> #endif
>
> static int bufsize = 0; /* Buffersize. Default random */
> static int bufsize_random = 1; /* Buffersize random flag. Default random */
> static long long filesize = 0; /* Target file size */
> static long long target_range = 0; /* Target range per process */
> static int files_num = 0;
> static char *filenames;
> static int mmap_write = 0;
> static int o_sync = 0;
> static int o_async = 0;
> static int o_direct = 0;
>
> static void setup(void);
> static void cleanup(void);
>
> void prg_usage()
> {
> fprintf(stderr, "Usage: fs_write [-S 0|1] [-D] [-n num_process]\n");
> fprintf(stderr, " -s file_size file...\n");
> fprintf(stderr, "\n");
> fprintf(stderr, " -S sync mode (0:O_ASYNC 1:O_SYNC)\n");
> fprintf(stderr, " -D Direct IO\n");
> fprintf(stderr, " -n Process count (default is 1)\n");
> fprintf(stderr, " -s File size\n");
> fprintf(stderr, " file... Traget file\n");
>
> exit(1);
> }
>
> int runtest(int fd_w[], int childnum)
> {
> int num = 0;
> int err = 0;
> long long offset = target_range * childnum;
> long long remain = target_range;
> int len = bufsize;
> int write_len = 0;
> int psize = getpagesize();
> int max_page = MAXIOSIZE / psize;
> char *buf_w;
> void *p;
>
> buf_w = (char*)memalign(psize, MAXIOSIZE);
>
> while (remain > 0) {
> if (bufsize_random == 1) {
> len = (rand() % max_page) * psize;
> if (!len)
> len = MINIOSIZE;
> }
> if (len > remain)
> len = remain;
>
> for (num = 0; num < files_num ; num++) {
> memset(buf_w, childnum+num, len);
> if (!mmap_write) {
> if (lseek(fd_w[num], offset, SEEK_SET) < 0) {
> err = errno;
> goto out;
> }
> if ((write_len = write(fd_w[num], buf_w, len)) < 0) {
> err = errno;
> goto out;
> }
> if (len != write_len)
> goto out;
> } else {
> p = mmap(NULL, len, PROT_WRITE, MAP_SHARED,
> fd_w[num], offset);
> if (p == MAP_FAILED) {
> err = errno;
> goto out;
> }
> memcpy(p, buf_w, len);
> if (munmap(p, len) == -1) {
> err = errno;
> goto out;
> }
> }
> }
> offset += len;
> remain -= len;
> }
>
> out:
> if (err == ENOSPC || len != write_len)
> err = 0;
> if (err > 0)
> fprintf(stderr, "%s;\n", strerror(err));
> free(buf_w);
>
> return err;
> }
>
> int child_function(int childnum)
> {
> int num, ret = -1;
> int fd_w[files_num];
> char *filename;
>
> for (num = 0; num < files_num ; num++) {
> fd_w[num] = -1;
> }
> for (num = 0; num < files_num ; num++) {
> filename = filenames + (num*PATH_MAX);
> if ((fd_w[num] = open(filename,
> O_RDWR|o_sync|o_async|o_direct)) < 0) {
> printf("TINFO:fd_w open failed for %s: %s\n",filename, strerror(errno));
> goto out;
> }
> }
>
> ret = runtest(fd_w, childnum);
>
> out:
> for (num = 0; num < files_num ; num++) {
> if (fd_w[num] != -1)
> close(fd_w[num]);
> }
>
> exit(ret);
> }
>
> static void setup(void)
> {
> int num, fd;
> char *filename;
>
> for (num = 0; num < files_num; num++) {
> filename = filenames + (num * PATH_MAX);
> if ((fd = open(filename, O_CREAT|O_EXCL|O_WRONLY|O_TRUNC, 0666)) < 0) {
> printf("TBROK, Couldn't create test file %s: %s\n", filename, strerror(errno));
> cleanup();
> }
>
> if (mmap_write) {
> if (ftruncate(fd, filesize) < 0) {
> printf("TBROK2: Couldn't create test file %s: %s\n", filename, strerror(errno));
> cleanup();
> }
> }
> close(fd);
> }
> }
>
> static void cleanup(void)
> {
> int num;
> for (num = 0; num < files_num; num++)
> unlink(filenames + (num * PATH_MAX));
> free(filenames);
> exit(1);
> }
>
> int main(int argc, char *argv[])
> {
> int numchild = 1; /* Number of children. Default 5 */
> int i, num;
> int err = 0;
> int sync_mode = 0;
> int *cpid;
> int status;
>
> while ((i = getopt(argc, argv, "MS:Dn:s:")) != -1) {
> switch(i) {
> case 'M':
> mmap_write = 1;
> break;
> case 'S':
> if ((sync_mode = atoi(optarg)) < 0) {
> fprintf(stderr, "sync mode must be 0 or 1;\n");
> prg_usage();
> }
> switch (sync_mode) {
> case 0:
> o_async = O_ASYNC;
> break;
> case 1:
> o_sync = O_SYNC;
> break;
> default:
> fprintf(stderr, "sync mode must be 0 or 1;\n");
> prg_usage();
> }
> break;
> case 'D':
> o_direct = O_DIRECT;
> break;
> case 'n':
> if ((numchild = atoi(optarg)) <= 0) {
> fprintf(stderr, "no of children must be > 0;\n");
> prg_usage();
> }
> break;
> case 's':
> filesize = atol(optarg);
> if (filesize <= 0) {
> fprintf(stderr, "filesize must be > 0;\n");
> prg_usage();
> }
> break;
> default:
> prg_usage();
> }
> }
>
> files_num = argc - optind;
> if (files_num <= 0)
> prg_usage();
> if (filesize % numchild != 0 || (filesize / numchild) % BUFSIZE != 0) {
> fprintf(stderr, "filesize must be multiple of 4k*numchild:"
> "filesize=%lld;\n", filesize);
> prg_usage();
> }
>
> if ((filenames = (char *)malloc(files_num * PATH_MAX)) == NULL ||
> (cpid = (int *)malloc(sizeof(int)*numchild)) == NULL) {
> perror("malloc");
> exit(1);
> }
> for(i = optind, num = 0 ; i < argc; i++, num++)
> strcpy((char *)filenames + (num * PATH_MAX), argv[i]);
> target_range = filesize / numchild;
>
>
> setup();
>
> for(i = 0; i < numchild; i++) {
> *(cpid+i) = fork();
> if (*(cpid+i) == 0)
> child_function(i);
> }
> for(i = 0; i < numchild; i++) {
> waitpid(*(cpid+1), &status, 0);
> }
>
> cleanup();
> return err;
> }
(2011/01/17 20:05), Dmitry wrote:
> On Mon, 17 Jan 2011 17:29:42 +0900, Akira Fujita<[email protected]> wrote:
>> Hi,
>>
>> The metadata block reservation counter overflows with data write
>> on ext4 (indirect block map) when its disk space is almost full.
>> This overflow triggers following BUG_ON.
>>
>> Jan 14 09:36:48 TNESG9423 kernel: ------------[ cut here ]------------
>> Jan 14 09:36:48 TNESG9423 kernel: kernel BUG at fs/ext4/inode.c:2170!
>> Jan 14 09:36:48 TNESG9423 kernel: invalid opcode: 0000 [#1] SMP
>> Jan 14 09:36:48 TNESG9423 kernel: last sysfs file: /sys/kernel/mm/ksm/run
>> Jan 14 09:36:48 TNESG9423 kernel: CPU 0
>> Jan 14 09:36:48 TNESG9423 kernel: Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4
>> xt_state nf_conntrack ipt_REJECT bridge stp llc autofs4 sunrpc p4_clockmod freq_table speedstep_lib ipv6 xt_physdev iptable_filter ip_tables nls_utf8 dm_mirror dm_region_hash dm_log dm_mod kvm_intel
>> kvm uinput ppdev parport_pc parport sg pcspkr i2c_i801 iTCO_wdt iTCO_vendor_support snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer snd soundcore
>> snd_page_alloc e1000e ext3 jbd sd_mod crc_t10dif sr_mod cdrom pata_via pata_acpi ata_generic ata_piix i915 drm_kms_helper drm i2c_algo_bit i2c_core video output [last unloaded: mperf]
>> Jan 14 09:36:48 TNESG9423 kernel:
>> Jan 14 09:36:48 TNESG9423 kernel: Pid: 937, comm: flush-8:0 Not tainted 2.6.37 #1 MS-7264BLM/PC-MJ18ABZR4
>> Jan 14 09:36:48 TNESG9423 kernel: RIP: 0010:[<ffffffff811c5298>] [<ffffffff811c5298>] ext4_da_block_invalidatepages+0x168/0x180
>> Jan 14 09:36:48 TNESG9423 kernel: RSP: 0018:ffff88007613f780 EFLAGS: 00010246
>> Jan 14 09:36:48 TNESG9423 kernel: RAX: 0010000000000024 RBX: 0000000000008cf2 RCX: 000000000000000e
>> Jan 14 09:36:48 TNESG9423 kernel: RDX: 000000000000000e RSI: 0000000000000001 RDI: ffffea0000a70d30
>> Jan 14 09:36:48 TNESG9423 kernel: RBP: ffff88007613f850 R08: 0000000000000001 R09: 0000000000000002
>> Jan 14 09:36:48 TNESG9423 kernel: R10: ffffea0000a70d38 R11: ffff880035f01b58 R12: ffff88007613f7a0
>> Jan 14 09:36:48 TNESG9423 kernel: R13: ffff880065eecd68 R14: ffff88007613f7b8 R15: ffffea0000a70a58
>> Jan 14 09:36:48 TNESG9423 kernel: FS: 0000000000000000(0000) GS:ffff88007f400000(0000) knlGS:0000000000000000
>> Jan 14 09:36:48 TNESG9423 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>> Jan 14 09:36:48 TNESG9423 kernel: CR2: 0000003680ae1560 CR3: 000000004da45000 CR4: 00000000000006f0
>> Jan 14 09:36:48 TNESG9423 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> Jan 14 09:36:48 TNESG9423 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
>> Jan 14 09:36:48 TNESG9423 kernel: Process flush-8:0 (pid: 937, threadinfo ffff88007613e000, task ffff8800371b54e0)
>> Jan 14 09:36:48 TNESG9423 kernel: Stack:
>> Jan 14 09:36:48 TNESG9423 kernel: ffff88007613f7e0 ffffffff814ee3d6 0000000000000008 0000000e7613f7f0
>> Jan 14 09:36:48 TNESG9423 kernel: 000000000000000e 000000003741a4b9 ffffea0000a70a58 ffffea0000a70a90
>> Jan 14 09:36:48 TNESG9423 kernel: ffffea0000a70ac8 ffffea0000a70b00 ffffea0000a70b38 ffffea0000a70b70
>> Jan 14 09:36:48 TNESG9423 kernel: Call Trace:
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff814ee3d6>] ? printk+0x41/0x43
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811c9fe4>] mpage_da_map_and_submit+0x274/0x470
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811ca24d>] mpage_add_bh_to_extent+0x6d/0xf0
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811ca5a0>] write_cache_pages_da+0x2d0/0x4a0
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811caa4c>] ext4_da_writepages+0x2dc/0x650
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81105321>] do_writepages+0x21/0x40
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810fb55b>] __filemap_fdatawrite_range+0x5b/0x60
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810fb833>] filemap_fdatawrite_range+0x13/0x20
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811fb6ce>] jbd2_journal_begin_ordered_truncate+0x8e/0xb0
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811cd02b>] ext4_evict_inode+0x23b/0x3b0
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81167477>] evict+0x27/0xc0
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81167a4b>] iput+0x1bb/0x2a0
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811734a4>] writeback_sb_inodes+0x104/0x180
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81173cfd>] writeback_inodes_wb+0x9d/0x160
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8117404b>] wb_writeback+0x28b/0x400
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810714cc>] ? lock_timer_base+0x3c/0x70
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81072092>] ? del_timer_sync+0x22/0x30
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81174257>] wb_do_writeback+0x97/0x1e0
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81174452>] bdi_writeback_thread+0xb2/0x270
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811743a0>] ? bdi_writeback_thread+0x0/0x270
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811743a0>] ? bdi_writeback_thread+0x0/0x270
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81083356>] kthread+0x96/0xa0
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8100ce84>] kernel_thread_helper+0x4/0x10
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810832c0>] ? kthread+0x0/0xa0
>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8100ce80>] ? kernel_thread_helper+0x0/0x10
>> Jan 14 09:36:48 TNESG9423 kernel: Code: a8 00 00 00 5b 41 5c 41 5d 41 5e 41 5f c9 c3 0f 1f 40 00 4c 89 e7 48 89 95 40 ff ff ff e8 01 1b f4 ff 48 8b 95 40 ff ff ff eb c9<0f> 0b eb fe 0f 0b 66 90 eb fc
>> 66 66 66 66 66 2e 0f 1f 84 00 00
>> Jan 14 09:36:48 TNESG9423 kernel: RIP [<ffffffff811c5298>] ext4_da_block_invalidatepages+0x168/0x180
>> Jan 14 09:36:48 TNESG9423 kernel: RSP<ffff88007613f780>
>> Jan 14 09:36:48 TNESG9423 kernel: ---[ end trace 0496eaed3b9ec629 ]---
>>
>> To fix this, I referred to the patch which is for data blocks
>> reservation counter (commit: ef627929781c98113e6ae93f159dd3c12a884ad8)
> I cant find that commit in linus's, and tytso's tries, can you please
> specify where it can be found.
Oops, commit ef627929781c98113e6ae93f159dd3c12a884ad8 is on
http://lkml.indiana.edu/hypermail/linux/kernel/0912.3/01475.html,
but actually it's incorrect. The correct commit is
61820f33e1bcadff14af25a4a894240e7903dd3d in ext4 patch queue.
>> and made a following patch which prints metadata block inconsistency
>> and corrects it.
>> My patch is trial, if you have better idea, feel free to fix this bug.
>>
>> # You can reproduce this problem with attached programs.
>> # In my environment, this occurs in 1 minute.
>>
>> Signed-off-by: Akira Fujita<[email protected]>
>> ---
>> fs/ext4/inode.c | 10 ++++++++++
>> 1 file changed, 10 insertions(+)
>> diff -Nrup -X linux-2.6.37-org/Documentation/dontdiff linux-2.6.37-org/fs/ext4/inode.c linux-2.6.37/fs/ext4/inode.c
>> --- linux-2.6.37-org/fs/ext4/inode.c 2011-01-17 15:47:59.000000000 +0900
>> +++ linux-2.6.37/fs/ext4/inode.c 2011-01-17 15:52:25.000000000 +0900
>> @@ -1127,6 +1127,16 @@ void ext4_da_update_reserve_space(struct
>> used = ei->i_reserved_data_blocks;
>> }
>>
>> + if (unlikely(ei->i_allocated_meta_blocks>
>> + ei->i_reserved_meta_blocks)) {
>> + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, "
>> + "meta blocks %d with only %d reserved meta blocks\n",
>> + __func__, inode->i_ino, ei->i_allocated_meta_blocks,
>> + ei->i_reserved_meta_blocks);
>> + WARN_ON(1);
>> + ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
>> + }
>> +
> The situation where allocated> reservation is a very nasty bug and
> means what reservation is broken, so next moment result in user data
> drop/corruption, due to ENOSPC, so we definitely have to investigate
> the root of cause. BTW i cant reproduce the bug with your test
> on 2.6.34.7-63.fc13 which is really old, but still.
> Which version should i use to reproduce the issue?
My kernel is 2.6.37 + ext4 patch queue (2741742fcfca2e19cd26b1cfe8e43e7b444c8e35).
[akira@bsd086 ~]$ uname -a
Linux bsd086.bsd.tnes.nec.co.jp 2.6.37 #6 SMP Mon Jan 17 11:54:42 JST 2011 x86_64 x86_64 x86_64 GNU/Linux
>> /* Update per-inode reservations */
>> ei->i_reserved_data_blocks -= used;
>> ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
> Attachment: metadata_blocks_overflow.sh (application/x-shellscript)
>> #include<stdio.h>
>> #include<stdlib.h>
>> #include<unistd.h>
>> #include<string.h>
>> #include<sys/file.h>
>> #include<sys/mman.h>
>> #include<sys/uio.h>
>> #include<sys/wait.h>
>> #include<errno.h>
>> #include<malloc.h>
>>
>> #define BUFSIZE 4096
>> #define PATH_MAX 4096
>> #define MINIOSIZE 4*1024
>> #define MAXIOSIZE 4*1024*1024
>>
>> #ifndef O_DIRECT
>> #define O_DIRECT 040000
>> #endif
>>
>> static int bufsize = 0; /* Buffersize. Default random */
>> static int bufsize_random = 1; /* Buffersize random flag. Default random */
>> static long long filesize = 0; /* Target file size */
>> static long long target_range = 0; /* Target range per process */
>> static int files_num = 0;
>> static char *filenames;
>> static int mmap_write = 0;
>> static int o_sync = 0;
>> static int o_async = 0;
>> static int o_direct = 0;
>>
>> static void setup(void);
>> static void cleanup(void);
>>
>> void prg_usage()
>> {
>> fprintf(stderr, "Usage: fs_write [-S 0|1] [-D] [-n num_process]\n");
>> fprintf(stderr, " -s file_size file...\n");
>> fprintf(stderr, "\n");
>> fprintf(stderr, " -S sync mode (0:O_ASYNC 1:O_SYNC)\n");
>> fprintf(stderr, " -D Direct IO\n");
>> fprintf(stderr, " -n Process count (default is 1)\n");
>> fprintf(stderr, " -s File size\n");
>> fprintf(stderr, " file... Traget file\n");
>>
>> exit(1);
>> }
>>
>> int runtest(int fd_w[], int childnum)
>> {
>> int num = 0;
>> int err = 0;
>> long long offset = target_range * childnum;
>> long long remain = target_range;
>> int len = bufsize;
>> int write_len = 0;
>> int psize = getpagesize();
>> int max_page = MAXIOSIZE / psize;
>> char *buf_w;
>> void *p;
>>
>> buf_w = (char*)memalign(psize, MAXIOSIZE);
>>
>> while (remain> 0) {
>> if (bufsize_random == 1) {
>> len = (rand() % max_page) * psize;
>> if (!len)
>> len = MINIOSIZE;
>> }
>> if (len> remain)
>> len = remain;
>>
>> for (num = 0; num< files_num ; num++) {
>> memset(buf_w, childnum+num, len);
>> if (!mmap_write) {
>> if (lseek(fd_w[num], offset, SEEK_SET)< 0) {
>> err = errno;
>> goto out;
>> }
>> if ((write_len = write(fd_w[num], buf_w, len))< 0) {
>> err = errno;
>> goto out;
>> }
>> if (len != write_len)
>> goto out;
>> } else {
>> p = mmap(NULL, len, PROT_WRITE, MAP_SHARED,
>> fd_w[num], offset);
>> if (p == MAP_FAILED) {
>> err = errno;
>> goto out;
>> }
>> memcpy(p, buf_w, len);
>> if (munmap(p, len) == -1) {
>> err = errno;
>> goto out;
>> }
>> }
>> }
>> offset += len;
>> remain -= len;
>> }
>>
>> out:
>> if (err == ENOSPC || len != write_len)
>> err = 0;
>> if (err> 0)
>> fprintf(stderr, "%s;\n", strerror(err));
>> free(buf_w);
>>
>> return err;
>> }
>>
>> int child_function(int childnum)
>> {
>> int num, ret = -1;
>> int fd_w[files_num];
>> char *filename;
>>
>> for (num = 0; num< files_num ; num++) {
>> fd_w[num] = -1;
>> }
>> for (num = 0; num< files_num ; num++) {
>> filename = filenames + (num*PATH_MAX);
>> if ((fd_w[num] = open(filename,
>> O_RDWR|o_sync|o_async|o_direct))< 0) {
>> printf("TINFO:fd_w open failed for %s: %s\n",filename, strerror(errno));
>> goto out;
>> }
>> }
>>
>> ret = runtest(fd_w, childnum);
>>
>> out:
>> for (num = 0; num< files_num ; num++) {
>> if (fd_w[num] != -1)
>> close(fd_w[num]);
>> }
>>
>> exit(ret);
>> }
>>
>> static void setup(void)
>> {
>> int num, fd;
>> char *filename;
>>
>> for (num = 0; num< files_num; num++) {
>> filename = filenames + (num * PATH_MAX);
>> if ((fd = open(filename, O_CREAT|O_EXCL|O_WRONLY|O_TRUNC, 0666))< 0) {
>> printf("TBROK, Couldn't create test file %s: %s\n", filename, strerror(errno));
>> cleanup();
>> }
>>
>> if (mmap_write) {
>> if (ftruncate(fd, filesize)< 0) {
>> printf("TBROK2: Couldn't create test file %s: %s\n", filename, strerror(errno));
>> cleanup();
>> }
>> }
>> close(fd);
>> }
>> }
>>
>> static void cleanup(void)
>> {
>> int num;
>> for (num = 0; num< files_num; num++)
>> unlink(filenames + (num * PATH_MAX));
>> free(filenames);
>> exit(1);
>> }
>>
>> int main(int argc, char *argv[])
>> {
>> int numchild = 1; /* Number of children. Default 5 */
>> int i, num;
>> int err = 0;
>> int sync_mode = 0;
>> int *cpid;
>> int status;
>>
>> while ((i = getopt(argc, argv, "MS:Dn:s:")) != -1) {
>> switch(i) {
>> case 'M':
>> mmap_write = 1;
>> break;
>> case 'S':
>> if ((sync_mode = atoi(optarg))< 0) {
>> fprintf(stderr, "sync mode must be 0 or 1;\n");
>> prg_usage();
>> }
>> switch (sync_mode) {
>> case 0:
>> o_async = O_ASYNC;
>> break;
>> case 1:
>> o_sync = O_SYNC;
>> break;
>> default:
>> fprintf(stderr, "sync mode must be 0 or 1;\n");
>> prg_usage();
>> }
>> break;
>> case 'D':
>> o_direct = O_DIRECT;
>> break;
>> case 'n':
>> if ((numchild = atoi(optarg))<= 0) {
>> fprintf(stderr, "no of children must be> 0;\n");
>> prg_usage();
>> }
>> break;
>> case 's':
>> filesize = atol(optarg);
>> if (filesize<= 0) {
>> fprintf(stderr, "filesize must be> 0;\n");
>> prg_usage();
>> }
>> break;
>> default:
>> prg_usage();
>> }
>> }
>>
>> files_num = argc - optind;
>> if (files_num<= 0)
>> prg_usage();
>> if (filesize % numchild != 0 || (filesize / numchild) % BUFSIZE != 0) {
>> fprintf(stderr, "filesize must be multiple of 4k*numchild:"
>> "filesize=%lld;\n", filesize);
>> prg_usage();
>> }
>>
>> if ((filenames = (char *)malloc(files_num * PATH_MAX)) == NULL ||
>> (cpid = (int *)malloc(sizeof(int)*numchild)) == NULL) {
>> perror("malloc");
>> exit(1);
>> }
>> for(i = optind, num = 0 ; i< argc; i++, num++)
>> strcpy((char *)filenames + (num * PATH_MAX), argv[i]);
>> target_range = filesize / numchild;
>>
>>
>> setup();
>>
>> for(i = 0; i< numchild; i++) {
>> *(cpid+i) = fork();
>> if (*(cpid+i) == 0)
>> child_function(i);
>> }
>> for(i = 0; i< numchild; i++) {
>> waitpid(*(cpid+1),&status, 0);
>> }
>>
>> cleanup();
>> return err;
>> }
>
--
Akira Fujita <[email protected]>
The First Fundamental Software Development Group,
Software Development Division,
NEC Software Tohoku, Ltd.
Hi Dmitry,
Could you reproduce this?
It still occurs on 2.6.38-rc3...
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742185] EXT4-fs (sda9): delayed block allocation failed for inode 13 at logical offset 36019 with max blocks 64 with error -28
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742253] EXT4-fs (sda9): This should not happen!! Data will be lost
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742255]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742298] Total free blocks count 0
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742318] Free/Dirty block details
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742339] free_blocks=0
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742355] dirty_blocks=-4294926403
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742375] Block reservation details
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742395] i_reserved_data_blocks=40781
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742417] i_reserved_meta_blocks=112
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742617] EXT4-fs (sda9): delayed block allocation failed for inode 13 at logical offset 36019 with max blocks 64 with error -28
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742675] EXT4-fs (sda9): This should not happen!! Data will be lost
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742677]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742718] Total free blocks count 0
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742738] Free/Dirty block details
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742757] free_blocks=0
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742772] dirty_blocks=-4294926403
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742792] Block reservation details
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742812] i_reserved_data_blocks=40781
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742833] i_reserved_meta_blocks=112
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742878] ------------[ cut here ]------------
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742902] kernel BUG at fs/ext4/inode.c:2188!
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742926] invalid opcode: 0000 [#1] SMP
Feb 4 09:19:43 TNESG9423 kernel: [ 770.742954] last sysfs file: /sys/devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/sda9/alignment_offset
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] CPU 1
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] Modules linked in: ext4 jbd2 crc16 ip6table_filter ip6_tables sunrpc p4_clockmod freq_table speedstep_lib ipv6 uinput ppdev parport_pc parport pcspkr
i2c_i801 iTCO_wdt iTCO_vendor_support e1000e pata_acpi ata_generic pata_via i915 drm_kms_helper drm i2c_algo_bit i2c_core video [last unloaded: microcode]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] Pid: 2212, comm: flush-8:0 Not tainted 2.6.38-rc3 #1 MS-7264BLM/PC-MJ18ABZR4
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] RIP: 0010:[<ffffffffa02278be>] [<ffffffffa02278be>] ext4_da_block_invalidatepages+0x81/0x119 [ext4]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] RSP: 0018:ffff88007b0d5750 EFLAGS: 00010246
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] RAX: 000000000000000e RBX: 0000000000008cf2 RCX: ffff88007b0d5780
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] RDX: 0000000000000000 RSI: 0100000000000000 RDI: ffffea0000ab3610
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] RBP: ffff88007b0d5820 R08: ffff88007b0d5690 R09: 0000000000000001
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] R10: ffff88002fe49880 R11: ffff88002fe49880 R12: ffff88007b0d5770
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] R13: ffff88007b0d5780 R14: ffff880025c7c1f8 R15: ffffea0000ab3338
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] FS: 0000000000000000(0000) GS:ffff88007f480000(0000) knlGS:0000000000000000
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] CR2: 0000003680ae1560 CR3: 0000000079d19000 CR4: 00000000000006e0
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] Process flush-8:0 (pid: 2212, threadinfo ffff88007b0d4000, task ffff88007b29e900)
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] Stack:
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] 372e30373720205b 00205d3333383234 ffff880000000020 ffff88007b0d57c0
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] 000000000000000e ffff8800000000a7 ffffea0000ab3338 ffffea0000ab3370
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] ffffea0000ab33a8 ffffea0000ab33e0 ffffea0000ab3418 ffffea0000ab3450
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] Call Trace:
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffffa022afb8>] mpage_da_map_and_submit+0x203/0x2f4 [ext4]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffffa022b177>] mpage_add_bh_to_extent+0xce/0xdd [ext4]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffffa022bcf0>] write_cache_pages_da+0x29f/0x3a7 [ext4]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffffa022c10b>] ext4_da_writepages+0x313/0x50c [ext4]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff814e148c>] ? kmemleak_free+0x26/0x45
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff810b8e92>] do_writepages+0x21/0x2a
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff810b1658>] __filemap_fdatawrite_range+0x50/0x52
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff810b1bc4>] filemap_fdatawrite_range+0x13/0x15
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffffa02080dc>] jbd2_journal_begin_ordered_truncate+0x6d/0x90 [jbd2]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffffa0225c98>] ext4_begin_ordered_truncate+0x6d/0x76 [ext4]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffffa022abbc>] ext4_evict_inode+0x81/0x27a [ext4]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff8110057b>] evict+0x24/0x8d
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff81100aed>] iput+0x1e9/0x21a
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff8110a806>] writeback_sb_inodes+0xb3/0x102
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff8110b092>] writeback_inodes_wb+0xfd/0x10f
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff8110b2d3>] wb_writeback+0x22f/0x34a
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff810b8116>] ? determine_dirtyable_memory+0x1a/0x2d
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff8110b5cc>] wb_do_writeback+0x1de/0x1fe
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff810491cd>] ? process_timeout+0x0/0x10
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff8110b6af>] bdi_writeback_thread+0xc3/0x1f8
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff8110b5ec>] ? bdi_writeback_thread+0x0/0x1f8
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff8110b5ec>] ? bdi_writeback_thread+0x0/0x1f8
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff810567dd>] kthread+0x82/0x8a
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff81003894>] kernel_thread_helper+0x4/0x10
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff8105675b>] ? kthread+0x0/0x8a
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] [<ffffffff81003890>] ? kernel_thread_helper+0x0/0x10
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] Code: 89 f6 4c 89 e7 e8 dd 1d e9 e0 85 c0 0f 84 9f 00 00 00 4c 89 e9 31 d2 eb 68 4c 8b 39 49 39 5f 20 77 63 49 8b 37 40 80 e6 01 75 04 <0f> 0b eb fe 49
8b 37 48 83 c1 08 f7 c6 00 20 00 00 74 04 0f 0b
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] RIP [<ffffffffa02278be>] ext4_da_block_invalidatepages+0x81/0x119 [ext4]
Feb 4 09:19:43 TNESG9423 kernel: [ 770.743001] RSP <ffff88007b0d5750>
Feb 4 09:19:43 TNESG9423 kernel: [ 770.833820] ---[ end trace d27d30499e24ac96 ]---
Feb 4 09:19:43 TNESG9423 kernel: [ 770.835481] flush-8:0 used greatest stack depth: 2960 bytes left
(2011/01/18 10:02), Akira Fujita wrote:
>
>
> (2011/01/17 20:05), Dmitry wrote:
>> On Mon, 17 Jan 2011 17:29:42 +0900, Akira Fujita<[email protected]> wrote:
>>> Hi,
>>>
>>> The metadata block reservation counter overflows with data write
>>> on ext4 (indirect block map) when its disk space is almost full.
>>> This overflow triggers following BUG_ON.
>>>
>>> Jan 14 09:36:48 TNESG9423 kernel: ------------[ cut here ]------------
>>> Jan 14 09:36:48 TNESG9423 kernel: kernel BUG at fs/ext4/inode.c:2170!
>>> Jan 14 09:36:48 TNESG9423 kernel: invalid opcode: 0000 [#1] SMP
>>> Jan 14 09:36:48 TNESG9423 kernel: last sysfs file: /sys/kernel/mm/ksm/run
>>> Jan 14 09:36:48 TNESG9423 kernel: CPU 0
>>> Jan 14 09:36:48 TNESG9423 kernel: Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4
>>> xt_state nf_conntrack ipt_REJECT bridge stp llc autofs4 sunrpc p4_clockmod freq_table speedstep_lib ipv6 xt_physdev iptable_filter ip_tables nls_utf8 dm_mirror dm_region_hash dm_log dm_mod kvm_intel
>>> kvm uinput ppdev parport_pc parport sg pcspkr i2c_i801 iTCO_wdt iTCO_vendor_support snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer snd soundcore
>>> snd_page_alloc e1000e ext3 jbd sd_mod crc_t10dif sr_mod cdrom pata_via pata_acpi ata_generic ata_piix i915 drm_kms_helper drm i2c_algo_bit i2c_core video output [last unloaded: mperf]
>>> Jan 14 09:36:48 TNESG9423 kernel:
>>> Jan 14 09:36:48 TNESG9423 kernel: Pid: 937, comm: flush-8:0 Not tainted 2.6.37 #1 MS-7264BLM/PC-MJ18ABZR4
>>> Jan 14 09:36:48 TNESG9423 kernel: RIP: 0010:[<ffffffff811c5298>] [<ffffffff811c5298>] ext4_da_block_invalidatepages+0x168/0x180
>>> Jan 14 09:36:48 TNESG9423 kernel: RSP: 0018:ffff88007613f780 EFLAGS: 00010246
>>> Jan 14 09:36:48 TNESG9423 kernel: RAX: 0010000000000024 RBX: 0000000000008cf2 RCX: 000000000000000e
>>> Jan 14 09:36:48 TNESG9423 kernel: RDX: 000000000000000e RSI: 0000000000000001 RDI: ffffea0000a70d30
>>> Jan 14 09:36:48 TNESG9423 kernel: RBP: ffff88007613f850 R08: 0000000000000001 R09: 0000000000000002
>>> Jan 14 09:36:48 TNESG9423 kernel: R10: ffffea0000a70d38 R11: ffff880035f01b58 R12: ffff88007613f7a0
>>> Jan 14 09:36:48 TNESG9423 kernel: R13: ffff880065eecd68 R14: ffff88007613f7b8 R15: ffffea0000a70a58
>>> Jan 14 09:36:48 TNESG9423 kernel: FS: 0000000000000000(0000) GS:ffff88007f400000(0000) knlGS:0000000000000000
>>> Jan 14 09:36:48 TNESG9423 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>>> Jan 14 09:36:48 TNESG9423 kernel: CR2: 0000003680ae1560 CR3: 000000004da45000 CR4: 00000000000006f0
>>> Jan 14 09:36:48 TNESG9423 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>> Jan 14 09:36:48 TNESG9423 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
>>> Jan 14 09:36:48 TNESG9423 kernel: Process flush-8:0 (pid: 937, threadinfo ffff88007613e000, task ffff8800371b54e0)
>>> Jan 14 09:36:48 TNESG9423 kernel: Stack:
>>> Jan 14 09:36:48 TNESG9423 kernel: ffff88007613f7e0 ffffffff814ee3d6 0000000000000008 0000000e7613f7f0
>>> Jan 14 09:36:48 TNESG9423 kernel: 000000000000000e 000000003741a4b9 ffffea0000a70a58 ffffea0000a70a90
>>> Jan 14 09:36:48 TNESG9423 kernel: ffffea0000a70ac8 ffffea0000a70b00 ffffea0000a70b38 ffffea0000a70b70
>>> Jan 14 09:36:48 TNESG9423 kernel: Call Trace:
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff814ee3d6>] ? printk+0x41/0x43
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811c9fe4>] mpage_da_map_and_submit+0x274/0x470
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811ca24d>] mpage_add_bh_to_extent+0x6d/0xf0
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811ca5a0>] write_cache_pages_da+0x2d0/0x4a0
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811caa4c>] ext4_da_writepages+0x2dc/0x650
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81105321>] do_writepages+0x21/0x40
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810fb55b>] __filemap_fdatawrite_range+0x5b/0x60
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810fb833>] filemap_fdatawrite_range+0x13/0x20
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811fb6ce>] jbd2_journal_begin_ordered_truncate+0x8e/0xb0
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811cd02b>] ext4_evict_inode+0x23b/0x3b0
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81167477>] evict+0x27/0xc0
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81167a4b>] iput+0x1bb/0x2a0
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811734a4>] writeback_sb_inodes+0x104/0x180
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81173cfd>] writeback_inodes_wb+0x9d/0x160
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8117404b>] wb_writeback+0x28b/0x400
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810714cc>] ? lock_timer_base+0x3c/0x70
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81072092>] ? del_timer_sync+0x22/0x30
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81174257>] wb_do_writeback+0x97/0x1e0
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81174452>] bdi_writeback_thread+0xb2/0x270
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811743a0>] ? bdi_writeback_thread+0x0/0x270
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff811743a0>] ? bdi_writeback_thread+0x0/0x270
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff81083356>] kthread+0x96/0xa0
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8100ce84>] kernel_thread_helper+0x4/0x10
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff810832c0>] ? kthread+0x0/0xa0
>>> Jan 14 09:36:48 TNESG9423 kernel: [<ffffffff8100ce80>] ? kernel_thread_helper+0x0/0x10
>>> Jan 14 09:36:48 TNESG9423 kernel: Code: a8 00 00 00 5b 41 5c 41 5d 41 5e 41 5f c9 c3 0f 1f 40 00 4c 89 e7 48 89 95 40 ff ff ff e8 01 1b f4 ff 48 8b 95 40 ff ff ff eb c9<0f> 0b eb fe 0f 0b 66 90 eb fc
>>> 66 66 66 66 66 2e 0f 1f 84 00 00
>>> Jan 14 09:36:48 TNESG9423 kernel: RIP [<ffffffff811c5298>] ext4_da_block_invalidatepages+0x168/0x180
>>> Jan 14 09:36:48 TNESG9423 kernel: RSP<ffff88007613f780>
>>> Jan 14 09:36:48 TNESG9423 kernel: ---[ end trace 0496eaed3b9ec629 ]---
>>>
>>> To fix this, I referred to the patch which is for data blocks
>>> reservation counter (commit: ef627929781c98113e6ae93f159dd3c12a884ad8)
>> I cant find that commit in linus's, and tytso's tries, can you please
>> specify where it can be found.
>
> Oops, commit ef627929781c98113e6ae93f159dd3c12a884ad8 is on
> http://lkml.indiana.edu/hypermail/linux/kernel/0912.3/01475.html,
> but actually it's incorrect. The correct commit is
> 61820f33e1bcadff14af25a4a894240e7903dd3d in ext4 patch queue.
>
>
>>> and made a following patch which prints metadata block inconsistency
>>> and corrects it.
>>> My patch is trial, if you have better idea, feel free to fix this bug.
>>>
>>> # You can reproduce this problem with attached programs.
>>> # In my environment, this occurs in 1 minute.
>>>
>>> Signed-off-by: Akira Fujita<[email protected]>
>>> ---
>>> fs/ext4/inode.c | 10 ++++++++++
>>> 1 file changed, 10 insertions(+)
>>> diff -Nrup -X linux-2.6.37-org/Documentation/dontdiff linux-2.6.37-org/fs/ext4/inode.c linux-2.6.37/fs/ext4/inode.c
>>> --- linux-2.6.37-org/fs/ext4/inode.c 2011-01-17 15:47:59.000000000 +0900
>>> +++ linux-2.6.37/fs/ext4/inode.c 2011-01-17 15:52:25.000000000 +0900
>>> @@ -1127,6 +1127,16 @@ void ext4_da_update_reserve_space(struct
>>> used = ei->i_reserved_data_blocks;
>>> }
>>>
>>> + if (unlikely(ei->i_allocated_meta_blocks>
>>> + ei->i_reserved_meta_blocks)) {
>>> + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, "
>>> + "meta blocks %d with only %d reserved meta blocks\n",
>>> + __func__, inode->i_ino, ei->i_allocated_meta_blocks,
>>> + ei->i_reserved_meta_blocks);
>>> + WARN_ON(1);
>>> + ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
>>> + }
>>> +
>> The situation where allocated> reservation is a very nasty bug and
>> means what reservation is broken, so next moment result in user data
>> drop/corruption, due to ENOSPC, so we definitely have to investigate
>> the root of cause. BTW i cant reproduce the bug with your test
>> on 2.6.34.7-63.fc13 which is really old, but still.
>> Which version should i use to reproduce the issue?
>
> My kernel is 2.6.37 + ext4 patch queue (2741742fcfca2e19cd26b1cfe8e43e7b444c8e35).
> [akira@bsd086 ~]$ uname -a
> Linux bsd086.bsd.tnes.nec.co.jp 2.6.37 #6 SMP Mon Jan 17 11:54:42 JST 2011 x86_64 x86_64 x86_64 GNU/Linux
>
>
>>> /* Update per-inode reservations */
>>> ei->i_reserved_data_blocks -= used;
>>> ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
>> Attachment: metadata_blocks_overflow.sh (application/x-shellscript)
>>> #include<stdio.h>
>>> #include<stdlib.h>
>>> #include<unistd.h>
>>> #include<string.h>
>>> #include<sys/file.h>
>>> #include<sys/mman.h>
>>> #include<sys/uio.h>
>>> #include<sys/wait.h>
>>> #include<errno.h>
>>> #include<malloc.h>
>>>
>>> #define BUFSIZE 4096
>>> #define PATH_MAX 4096
>>> #define MINIOSIZE 4*1024
>>> #define MAXIOSIZE 4*1024*1024
>>>
>>> #ifndef O_DIRECT
>>> #define O_DIRECT 040000
>>> #endif
>>>
>>> static int bufsize = 0; /* Buffersize. Default random */
>>> static int bufsize_random = 1; /* Buffersize random flag. Default random */
>>> static long long filesize = 0; /* Target file size */
>>> static long long target_range = 0; /* Target range per process */
>>> static int files_num = 0;
>>> static char *filenames;
>>> static int mmap_write = 0;
>>> static int o_sync = 0;
>>> static int o_async = 0;
>>> static int o_direct = 0;
>>>
>>> static void setup(void);
>>> static void cleanup(void);
>>>
>>> void prg_usage()
>>> {
>>> fprintf(stderr, "Usage: fs_write [-S 0|1] [-D] [-n num_process]\n");
>>> fprintf(stderr, " -s file_size file...\n");
>>> fprintf(stderr, "\n");
>>> fprintf(stderr, " -S sync mode (0:O_ASYNC 1:O_SYNC)\n");
>>> fprintf(stderr, " -D Direct IO\n");
>>> fprintf(stderr, " -n Process count (default is 1)\n");
>>> fprintf(stderr, " -s File size\n");
>>> fprintf(stderr, " file... Traget file\n");
>>>
>>> exit(1);
>>> }
>>>
>>> int runtest(int fd_w[], int childnum)
>>> {
>>> int num = 0;
>>> int err = 0;
>>> long long offset = target_range * childnum;
>>> long long remain = target_range;
>>> int len = bufsize;
>>> int write_len = 0;
>>> int psize = getpagesize();
>>> int max_page = MAXIOSIZE / psize;
>>> char *buf_w;
>>> void *p;
>>>
>>> buf_w = (char*)memalign(psize, MAXIOSIZE);
>>>
>>> while (remain> 0) {
>>> if (bufsize_random == 1) {
>>> len = (rand() % max_page) * psize;
>>> if (!len)
>>> len = MINIOSIZE;
>>> }
>>> if (len> remain)
>>> len = remain;
>>>
>>> for (num = 0; num< files_num ; num++) {
>>> memset(buf_w, childnum+num, len);
>>> if (!mmap_write) {
>>> if (lseek(fd_w[num], offset, SEEK_SET)< 0) {
>>> err = errno;
>>> goto out;
>>> }
>>> if ((write_len = write(fd_w[num], buf_w, len))< 0) {
>>> err = errno;
>>> goto out;
>>> }
>>> if (len != write_len)
>>> goto out;
>>> } else {
>>> p = mmap(NULL, len, PROT_WRITE, MAP_SHARED,
>>> fd_w[num], offset);
>>> if (p == MAP_FAILED) {
>>> err = errno;
>>> goto out;
>>> }
>>> memcpy(p, buf_w, len);
>>> if (munmap(p, len) == -1) {
>>> err = errno;
>>> goto out;
>>> }
>>> }
>>> }
>>> offset += len;
>>> remain -= len;
>>> }
>>>
>>> out:
>>> if (err == ENOSPC || len != write_len)
>>> err = 0;
>>> if (err> 0)
>>> fprintf(stderr, "%s;\n", strerror(err));
>>> free(buf_w);
>>>
>>> return err;
>>> }
>>>
>>> int child_function(int childnum)
>>> {
>>> int num, ret = -1;
>>> int fd_w[files_num];
>>> char *filename;
>>>
>>> for (num = 0; num< files_num ; num++) {
>>> fd_w[num] = -1;
>>> }
>>> for (num = 0; num< files_num ; num++) {
>>> filename = filenames + (num*PATH_MAX);
>>> if ((fd_w[num] = open(filename,
>>> O_RDWR|o_sync|o_async|o_direct))< 0) {
>>> printf("TINFO:fd_w open failed for %s: %s\n",filename, strerror(errno));
>>> goto out;
>>> }
>>> }
>>>
>>> ret = runtest(fd_w, childnum);
>>>
>>> out:
>>> for (num = 0; num< files_num ; num++) {
>>> if (fd_w[num] != -1)
>>> close(fd_w[num]);
>>> }
>>>
>>> exit(ret);
>>> }
>>>
>>> static void setup(void)
>>> {
>>> int num, fd;
>>> char *filename;
>>>
>>> for (num = 0; num< files_num; num++) {
>>> filename = filenames + (num * PATH_MAX);
>>> if ((fd = open(filename, O_CREAT|O_EXCL|O_WRONLY|O_TRUNC, 0666))< 0) {
>>> printf("TBROK, Couldn't create test file %s: %s\n", filename, strerror(errno));
>>> cleanup();
>>> }
>>>
>>> if (mmap_write) {
>>> if (ftruncate(fd, filesize)< 0) {
>>> printf("TBROK2: Couldn't create test file %s: %s\n", filename, strerror(errno));
>>> cleanup();
>>> }
>>> }
>>> close(fd);
>>> }
>>> }
>>>
>>> static void cleanup(void)
>>> {
>>> int num;
>>> for (num = 0; num< files_num; num++)
>>> unlink(filenames + (num * PATH_MAX));
>>> free(filenames);
>>> exit(1);
>>> }
>>>
>>> int main(int argc, char *argv[])
>>> {
>>> int numchild = 1; /* Number of children. Default 5 */
>>> int i, num;
>>> int err = 0;
>>> int sync_mode = 0;
>>> int *cpid;
>>> int status;
>>>
>>> while ((i = getopt(argc, argv, "MS:Dn:s:")) != -1) {
>>> switch(i) {
>>> case 'M':
>>> mmap_write = 1;
>>> break;
>>> case 'S':
>>> if ((sync_mode = atoi(optarg))< 0) {
>>> fprintf(stderr, "sync mode must be 0 or 1;\n");
>>> prg_usage();
>>> }
>>> switch (sync_mode) {
>>> case 0:
>>> o_async = O_ASYNC;
>>> break;
>>> case 1:
>>> o_sync = O_SYNC;
>>> break;
>>> default:
>>> fprintf(stderr, "sync mode must be 0 or 1;\n");
>>> prg_usage();
>>> }
>>> break;
>>> case 'D':
>>> o_direct = O_DIRECT;
>>> break;
>>> case 'n':
>>> if ((numchild = atoi(optarg))<= 0) {
>>> fprintf(stderr, "no of children must be> 0;\n");
>>> prg_usage();
>>> }
>>> break;
>>> case 's':
>>> filesize = atol(optarg);
>>> if (filesize<= 0) {
>>> fprintf(stderr, "filesize must be> 0;\n");
>>> prg_usage();
>>> }
>>> break;
>>> default:
>>> prg_usage();
>>> }
>>> }
>>>
>>> files_num = argc - optind;
>>> if (files_num<= 0)
>>> prg_usage();
>>> if (filesize % numchild != 0 || (filesize / numchild) % BUFSIZE != 0) {
>>> fprintf(stderr, "filesize must be multiple of 4k*numchild:"
>>> "filesize=%lld;\n", filesize);
>>> prg_usage();
>>> }
>>>
>>> if ((filenames = (char *)malloc(files_num * PATH_MAX)) == NULL ||
>>> (cpid = (int *)malloc(sizeof(int)*numchild)) == NULL) {
>>> perror("malloc");
>>> exit(1);
>>> }
>>> for(i = optind, num = 0 ; i< argc; i++, num++)
>>> strcpy((char *)filenames + (num * PATH_MAX), argv[i]);
>>> target_range = filesize / numchild;
>>>
>>>
>>> setup();
>>>
>>> for(i = 0; i< numchild; i++) {
>>> *(cpid+i) = fork();
>>> if (*(cpid+i) == 0)
>>> child_function(i);
>>> }
>>> for(i = 0; i< numchild; i++) {
>>> waitpid(*(cpid+1),&status, 0);
>>> }
>>>
>>> cleanup();
>>> return err;
>>> }
>>
>
--
Akira Fujita <[email protected]>
The First Fundamental Software Development Group,
Software Development Division,
NEC Software Tohoku, Ltd.