From: Dmitry Subject: Re: [RFC][PATCH] ext4: Fix overflow of metadata block reservation counter Date: Mon, 17 Jan 2011 14:05:45 +0300 Message-ID: <87tyh7x01i.fsf@dmon-lp.sw.ru> References: <4D33FDF6.5040700@rs.jp.nec.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Cc: ext4 development To: Akira Fujita , Theodore Tso Return-path: Received: from mail-ey0-f174.google.com ([209.85.215.174]:51708 "EHLO mail-ey0-f174.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751210Ab1AQLFq (ORCPT ); Mon, 17 Jan 2011 06:05:46 -0500 Received: by eye27 with SMTP id 27so2415812eye.19 for ; Mon, 17 Jan 2011 03:05:44 -0800 (PST) In-Reply-To: <4D33FDF6.5040700@rs.jp.nec.com> Sender: linux-ext4-owner@vger.kernel.org List-ID: On Mon, 17 Jan 2011 17:29:42 +0900, Akira Fujita wrote: > Hi, > > The metadata block reservation counter overflows with data write > on ext4 (indirect block map) when its disk space is almost full. > This overflow triggers following BUG_ON. > > Jan 14 09:36:48 TNESG9423 kernel: ------------[ cut here ]------------ > Jan 14 09:36:48 TNESG9423 kernel: kernel BUG at fs/ext4/inode.c:2170! > Jan 14 09:36:48 TNESG9423 kernel: invalid opcode: 0000 [#1] SMP > Jan 14 09:36:48 TNESG9423 kernel: last sysfs file: /sys/kernel/mm/ksm/run > Jan 14 09:36:48 TNESG9423 kernel: CPU 0 > Jan 14 09:36:48 TNESG9423 kernel: Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 > xt_state nf_conntrack ipt_REJECT bridge stp llc autofs4 sunrpc p4_clockmod freq_table speedstep_lib ipv6 xt_physdev iptable_filter ip_tables nls_utf8 dm_mirror dm_region_hash dm_log dm_mod kvm_intel > kvm uinput ppdev parport_pc parport sg pcspkr i2c_i801 iTCO_wdt iTCO_vendor_support snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer snd soundcore > snd_page_alloc e1000e ext3 jbd sd_mod crc_t10dif sr_mod cdrom pata_via pata_acpi ata_generic ata_piix i915 drm_kms_helper drm i2c_algo_bit i2c_core video output [last unloaded: mperf] > Jan 14 09:36:48 TNESG9423 kernel: > Jan 14 09:36:48 TNESG9423 kernel: Pid: 937, comm: flush-8:0 Not tainted 2.6.37 #1 MS-7264BLM/PC-MJ18ABZR4 > Jan 14 09:36:48 TNESG9423 kernel: RIP: 0010:[] [] ext4_da_block_invalidatepages+0x168/0x180 > Jan 14 09:36:48 TNESG9423 kernel: RSP: 0018:ffff88007613f780 EFLAGS: 00010246 > Jan 14 09:36:48 TNESG9423 kernel: RAX: 0010000000000024 RBX: 0000000000008cf2 RCX: 000000000000000e > Jan 14 09:36:48 TNESG9423 kernel: RDX: 000000000000000e RSI: 0000000000000001 RDI: ffffea0000a70d30 > Jan 14 09:36:48 TNESG9423 kernel: RBP: ffff88007613f850 R08: 0000000000000001 R09: 0000000000000002 > Jan 14 09:36:48 TNESG9423 kernel: R10: ffffea0000a70d38 R11: ffff880035f01b58 R12: ffff88007613f7a0 > Jan 14 09:36:48 TNESG9423 kernel: R13: ffff880065eecd68 R14: ffff88007613f7b8 R15: ffffea0000a70a58 > Jan 14 09:36:48 TNESG9423 kernel: FS: 0000000000000000(0000) GS:ffff88007f400000(0000) knlGS:0000000000000000 > Jan 14 09:36:48 TNESG9423 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > Jan 14 09:36:48 TNESG9423 kernel: CR2: 0000003680ae1560 CR3: 000000004da45000 CR4: 00000000000006f0 > Jan 14 09:36:48 TNESG9423 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > Jan 14 09:36:48 TNESG9423 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > Jan 14 09:36:48 TNESG9423 kernel: Process flush-8:0 (pid: 937, threadinfo ffff88007613e000, task ffff8800371b54e0) > Jan 14 09:36:48 TNESG9423 kernel: Stack: > Jan 14 09:36:48 TNESG9423 kernel: ffff88007613f7e0 ffffffff814ee3d6 0000000000000008 0000000e7613f7f0 > Jan 14 09:36:48 TNESG9423 kernel: 000000000000000e 000000003741a4b9 ffffea0000a70a58 ffffea0000a70a90 > Jan 14 09:36:48 TNESG9423 kernel: ffffea0000a70ac8 ffffea0000a70b00 ffffea0000a70b38 ffffea0000a70b70 > Jan 14 09:36:48 TNESG9423 kernel: Call Trace: > Jan 14 09:36:48 TNESG9423 kernel: [] ? printk+0x41/0x43 > Jan 14 09:36:48 TNESG9423 kernel: [] mpage_da_map_and_submit+0x274/0x470 > Jan 14 09:36:48 TNESG9423 kernel: [] mpage_add_bh_to_extent+0x6d/0xf0 > Jan 14 09:36:48 TNESG9423 kernel: [] write_cache_pages_da+0x2d0/0x4a0 > Jan 14 09:36:48 TNESG9423 kernel: [] ext4_da_writepages+0x2dc/0x650 > Jan 14 09:36:48 TNESG9423 kernel: [] do_writepages+0x21/0x40 > Jan 14 09:36:48 TNESG9423 kernel: [] __filemap_fdatawrite_range+0x5b/0x60 > Jan 14 09:36:48 TNESG9423 kernel: [] filemap_fdatawrite_range+0x13/0x20 > Jan 14 09:36:48 TNESG9423 kernel: [] jbd2_journal_begin_ordered_truncate+0x8e/0xb0 > Jan 14 09:36:48 TNESG9423 kernel: [] ext4_evict_inode+0x23b/0x3b0 > Jan 14 09:36:48 TNESG9423 kernel: [] evict+0x27/0xc0 > Jan 14 09:36:48 TNESG9423 kernel: [] iput+0x1bb/0x2a0 > Jan 14 09:36:48 TNESG9423 kernel: [] writeback_sb_inodes+0x104/0x180 > Jan 14 09:36:48 TNESG9423 kernel: [] writeback_inodes_wb+0x9d/0x160 > Jan 14 09:36:48 TNESG9423 kernel: [] wb_writeback+0x28b/0x400 > Jan 14 09:36:48 TNESG9423 kernel: [] ? lock_timer_base+0x3c/0x70 > Jan 14 09:36:48 TNESG9423 kernel: [] ? del_timer_sync+0x22/0x30 > Jan 14 09:36:48 TNESG9423 kernel: [] wb_do_writeback+0x97/0x1e0 > Jan 14 09:36:48 TNESG9423 kernel: [] bdi_writeback_thread+0xb2/0x270 > Jan 14 09:36:48 TNESG9423 kernel: [] ? bdi_writeback_thread+0x0/0x270 > Jan 14 09:36:48 TNESG9423 kernel: [] ? bdi_writeback_thread+0x0/0x270 > Jan 14 09:36:48 TNESG9423 kernel: [] kthread+0x96/0xa0 > Jan 14 09:36:48 TNESG9423 kernel: [] kernel_thread_helper+0x4/0x10 > Jan 14 09:36:48 TNESG9423 kernel: [] ? kthread+0x0/0xa0 > Jan 14 09:36:48 TNESG9423 kernel: [] ? kernel_thread_helper+0x0/0x10 > Jan 14 09:36:48 TNESG9423 kernel: Code: a8 00 00 00 5b 41 5c 41 5d 41 5e 41 5f c9 c3 0f 1f 40 00 4c 89 e7 48 89 95 40 ff ff ff e8 01 1b f4 ff 48 8b 95 40 ff ff ff eb c9 <0f> 0b eb fe 0f 0b 66 90 eb fc > 66 66 66 66 66 2e 0f 1f 84 00 00 > Jan 14 09:36:48 TNESG9423 kernel: RIP [] ext4_da_block_invalidatepages+0x168/0x180 > Jan 14 09:36:48 TNESG9423 kernel: RSP > Jan 14 09:36:48 TNESG9423 kernel: ---[ end trace 0496eaed3b9ec629 ]--- > > To fix this, I referred to the patch which is for data blocks > reservation counter (commit: ef627929781c98113e6ae93f159dd3c12a884ad8) I cant find that commit in linus's, and tytso's tries, can you please specify where it can be found. > and made a following patch which prints metadata block inconsistency > and corrects it. > My patch is trial, if you have better idea, feel free to fix this bug. > > # You can reproduce this problem with attached programs. > # In my environment, this occurs in 1 minute. > > Signed-off-by: Akira Fujita > --- > fs/ext4/inode.c | 10 ++++++++++ > 1 file changed, 10 insertions(+) > diff -Nrup -X linux-2.6.37-org/Documentation/dontdiff linux-2.6.37-org/fs/ext4/inode.c linux-2.6.37/fs/ext4/inode.c > --- linux-2.6.37-org/fs/ext4/inode.c 2011-01-17 15:47:59.000000000 +0900 > +++ linux-2.6.37/fs/ext4/inode.c 2011-01-17 15:52:25.000000000 +0900 > @@ -1127,6 +1127,16 @@ void ext4_da_update_reserve_space(struct > used = ei->i_reserved_data_blocks; > } > > + if (unlikely(ei->i_allocated_meta_blocks > > + ei->i_reserved_meta_blocks)) { > + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, " > + "meta blocks %d with only %d reserved meta blocks\n", > + __func__, inode->i_ino, ei->i_allocated_meta_blocks, > + ei->i_reserved_meta_blocks); > + WARN_ON(1); > + ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; > + } > + The situation where allocated > reservation is a very nasty bug and means what reservation is broken, so next moment result in user data drop/corruption, due to ENOSPC, so we definitely have to investigate the root of cause. BTW i cant reproduce the bug with your test on 2.6.34.7-63.fc13 which is really old, but still. Which version should i use to reproduce the issue? > /* Update per-inode reservations */ > ei->i_reserved_data_blocks -= used; > ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; Attachment: metadata_blocks_overflow.sh (application/x-shellscript) > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > > #define BUFSIZE 4096 > #define PATH_MAX 4096 > #define MINIOSIZE 4*1024 > #define MAXIOSIZE 4*1024*1024 > > #ifndef O_DIRECT > #define O_DIRECT 040000 > #endif > > static int bufsize = 0; /* Buffersize. Default random */ > static int bufsize_random = 1; /* Buffersize random flag. Default random */ > static long long filesize = 0; /* Target file size */ > static long long target_range = 0; /* Target range per process */ > static int files_num = 0; > static char *filenames; > static int mmap_write = 0; > static int o_sync = 0; > static int o_async = 0; > static int o_direct = 0; > > static void setup(void); > static void cleanup(void); > > void prg_usage() > { > fprintf(stderr, "Usage: fs_write [-S 0|1] [-D] [-n num_process]\n"); > fprintf(stderr, " -s file_size file...\n"); > fprintf(stderr, "\n"); > fprintf(stderr, " -S sync mode (0:O_ASYNC 1:O_SYNC)\n"); > fprintf(stderr, " -D Direct IO\n"); > fprintf(stderr, " -n Process count (default is 1)\n"); > fprintf(stderr, " -s File size\n"); > fprintf(stderr, " file... Traget file\n"); > > exit(1); > } > > int runtest(int fd_w[], int childnum) > { > int num = 0; > int err = 0; > long long offset = target_range * childnum; > long long remain = target_range; > int len = bufsize; > int write_len = 0; > int psize = getpagesize(); > int max_page = MAXIOSIZE / psize; > char *buf_w; > void *p; > > buf_w = (char*)memalign(psize, MAXIOSIZE); > > while (remain > 0) { > if (bufsize_random == 1) { > len = (rand() % max_page) * psize; > if (!len) > len = MINIOSIZE; > } > if (len > remain) > len = remain; > > for (num = 0; num < files_num ; num++) { > memset(buf_w, childnum+num, len); > if (!mmap_write) { > if (lseek(fd_w[num], offset, SEEK_SET) < 0) { > err = errno; > goto out; > } > if ((write_len = write(fd_w[num], buf_w, len)) < 0) { > err = errno; > goto out; > } > if (len != write_len) > goto out; > } else { > p = mmap(NULL, len, PROT_WRITE, MAP_SHARED, > fd_w[num], offset); > if (p == MAP_FAILED) { > err = errno; > goto out; > } > memcpy(p, buf_w, len); > if (munmap(p, len) == -1) { > err = errno; > goto out; > } > } > } > offset += len; > remain -= len; > } > > out: > if (err == ENOSPC || len != write_len) > err = 0; > if (err > 0) > fprintf(stderr, "%s;\n", strerror(err)); > free(buf_w); > > return err; > } > > int child_function(int childnum) > { > int num, ret = -1; > int fd_w[files_num]; > char *filename; > > for (num = 0; num < files_num ; num++) { > fd_w[num] = -1; > } > for (num = 0; num < files_num ; num++) { > filename = filenames + (num*PATH_MAX); > if ((fd_w[num] = open(filename, > O_RDWR|o_sync|o_async|o_direct)) < 0) { > printf("TINFO:fd_w open failed for %s: %s\n",filename, strerror(errno)); > goto out; > } > } > > ret = runtest(fd_w, childnum); > > out: > for (num = 0; num < files_num ; num++) { > if (fd_w[num] != -1) > close(fd_w[num]); > } > > exit(ret); > } > > static void setup(void) > { > int num, fd; > char *filename; > > for (num = 0; num < files_num; num++) { > filename = filenames + (num * PATH_MAX); > if ((fd = open(filename, O_CREAT|O_EXCL|O_WRONLY|O_TRUNC, 0666)) < 0) { > printf("TBROK, Couldn't create test file %s: %s\n", filename, strerror(errno)); > cleanup(); > } > > if (mmap_write) { > if (ftruncate(fd, filesize) < 0) { > printf("TBROK2: Couldn't create test file %s: %s\n", filename, strerror(errno)); > cleanup(); > } > } > close(fd); > } > } > > static void cleanup(void) > { > int num; > for (num = 0; num < files_num; num++) > unlink(filenames + (num * PATH_MAX)); > free(filenames); > exit(1); > } > > int main(int argc, char *argv[]) > { > int numchild = 1; /* Number of children. Default 5 */ > int i, num; > int err = 0; > int sync_mode = 0; > int *cpid; > int status; > > while ((i = getopt(argc, argv, "MS:Dn:s:")) != -1) { > switch(i) { > case 'M': > mmap_write = 1; > break; > case 'S': > if ((sync_mode = atoi(optarg)) < 0) { > fprintf(stderr, "sync mode must be 0 or 1;\n"); > prg_usage(); > } > switch (sync_mode) { > case 0: > o_async = O_ASYNC; > break; > case 1: > o_sync = O_SYNC; > break; > default: > fprintf(stderr, "sync mode must be 0 or 1;\n"); > prg_usage(); > } > break; > case 'D': > o_direct = O_DIRECT; > break; > case 'n': > if ((numchild = atoi(optarg)) <= 0) { > fprintf(stderr, "no of children must be > 0;\n"); > prg_usage(); > } > break; > case 's': > filesize = atol(optarg); > if (filesize <= 0) { > fprintf(stderr, "filesize must be > 0;\n"); > prg_usage(); > } > break; > default: > prg_usage(); > } > } > > files_num = argc - optind; > if (files_num <= 0) > prg_usage(); > if (filesize % numchild != 0 || (filesize / numchild) % BUFSIZE != 0) { > fprintf(stderr, "filesize must be multiple of 4k*numchild:" > "filesize=%lld;\n", filesize); > prg_usage(); > } > > if ((filenames = (char *)malloc(files_num * PATH_MAX)) == NULL || > (cpid = (int *)malloc(sizeof(int)*numchild)) == NULL) { > perror("malloc"); > exit(1); > } > for(i = optind, num = 0 ; i < argc; i++, num++) > strcpy((char *)filenames + (num * PATH_MAX), argv[i]); > target_range = filesize / numchild; > > > setup(); > > for(i = 0; i < numchild; i++) { > *(cpid+i) = fork(); > if (*(cpid+i) == 0) > child_function(i); > } > for(i = 0; i < numchild; i++) { > waitpid(*(cpid+1), &status, 0); > } > > cleanup(); > return err; > }