From: "Darrick J. Wong" Subject: EXT4_IOC_MOVE_EXT file corruption! Date: Mon, 5 Apr 2010 15:02:20 -0700 Message-ID: <20100405220220.GT29604@tux1.beaverton.ibm.com> Reply-To: djwong@us.ibm.com Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Cc: linux-ext4 To: "Theodore Ts'o" Return-path: Received: from e9.ny.us.ibm.com ([32.97.182.139]:40342 "EHLO e9.ny.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756450Ab0DEWCY (ORCPT ); Mon, 5 Apr 2010 18:02:24 -0400 Received: from d01relay03.pok.ibm.com (d01relay03.pok.ibm.com [9.56.227.235]) by e9.ny.us.ibm.com (8.14.3/8.13.1) with ESMTP id o35LonvB011987 for ; Mon, 5 Apr 2010 17:50:49 -0400 Received: from d03av03.boulder.ibm.com (d03av03.boulder.ibm.com [9.17.195.169]) by d01relay03.pok.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id o35M2MoL144812 for ; Mon, 5 Apr 2010 18:02:22 -0400 Received: from d03av03.boulder.ibm.com (loopback [127.0.0.1]) by d03av03.boulder.ibm.com (8.14.3/8.13.1/NCO v10.0 AVout) with ESMTP id o35F2Lm6006072 for ; Mon, 5 Apr 2010 09:02:21 -0600 Content-Disposition: inline Sender: linux-ext4-owner@vger.kernel.org List-ID: Hi all, I wrote a program called e4frag that deliberately tries to fragment an ext4 filesystem via EXT4_IOC_MOVE_EXT so that I could run e4defrag through its paces. While running e4frag and e4defrag concurrently on a kernel source tree, I discovered ongoing file corruption. It appears that if e4frag and e4defrag hit the same file at the same time, the file ends up with a 4K data block from somewhere else. "Somewhere else" seems to be a small chunk of binary gibberish followed by contents from other files(!) Obviously this isn't a good thing to see, since today it's header files but tomorrow it could be the credit card/SSN database. :) Ted asked me to send out a copy of the program ASAP, so the test program source code is at the end of this message. 
To build it, run: $ gcc -o e4frag -O2 -Wall e4frag.c and then to run it: (unpack something in /path/to/files) $ cp -pRdu /path/to/files /path/to/intact_files $ while true; do e4defrag /path/to/files & done $ while true; do ./e4frag -m 500 -s random /path/to/files & done $ while true; do diff -Naurp /path/to/intact_files /path/to/files; done ...and wait for diff to cough up differences. This seems to happen on 2.6.34-rc3, and only if e4frag and e4defrag are running concurrently. Running e4frag or e4defrag in a serial loop doesn't produce this corruption, so I think it's purely a concurrent access problem. On a lark, I ran fsck afterwards: # fsck -C -f -y /dev/sda fsck from util-linux-ng 2.16 e2fsck 1.41.9 (22-Aug-2009) Pass 1: Checking inodes, blocks, and sizes Pass 2: Checking directory structure Pass 3: Checking directory connectivity Pass 4: Checking reference counts Pass 5: Checking group summary information Inode bitmap differences: -534593 -534654 -534744 -534768 -534947 -662276 -662438 -1058789 -1058850 -1059026 -1059219 -1318193 -1583270 -1583378 -1583422 -2234673 -2631973 -3156444 -3156632 -3680888 -3680950 -4204922 -4205252 -4205286 Fix? yes /dev/sda: ***** FILE SYSTEM WAS MODIFIED ***** /dev/sda: 291596/107143168 files (4.6% non-contiguous), 7829819/428544000 blocks Is this a sign that the extent tree is getting corrupted somehow? Ted thought that it might have something to do with an ialloc mutex, I think. --D /* * Try to fragment files. * Copyright (C) 2010 IBM. All rights reserved. * * This program is licensed under the GPLv2. * Signed-off-by: Darrick J. 
Wong */ #define _FILE_OFFSET_BITS 64 #define _XOPEN_SOURCE 600 #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEFAULT_MAX_DONOR_FILES 0 #define STATUS_NEWLINE "\r" #define PROGRAM "e4frag v0.2" struct fragment_context { const char *fpath; off_t max_progress; off_t current_progress; int old_pct; }; struct fragment_profile { const char *name; int (*get_donor_fd)(struct fragment_context *fc, off_t max_files, off_t num_blocks); int (*prepare)(struct fragment_context *fc, off_t max_files, off_t num_blocks); }; static int max_donor_files = DEFAULT_MAX_DONOR_FILES; static struct statvfs statvfsbuf; static char donor_file_template[PATH_MAX]; static off_t donor_files; /* expect as many donor files as blocks */ static struct fragment_profile *profile; static int verbose = 0; /* Shamelessly stolen from e4defrag.c */ struct move_extent { __s32 reserved; /* original file descriptor */ __u32 donor_fd; /* donor file descriptor */ __u64 orig_start; /* logical start offset in block for orig */ __u64 donor_start; /* logical start offset in block for donor */ __u64 len; /* block length to be moved */ __u64 moved_len; /* moved block length */ }; #ifndef EXT4_IOC_MOVE_EXT #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #endif /* end stuff from e4defrag */ void print_status(struct fragment_context *fc, const char *str) { if (!verbose) return; printf("%s: %s\n", fc->fpath, str); fflush(stdout); } void emit_status(struct fragment_context *fc, const char *str) { if (!verbose) return; printf("%s: %s" STATUS_NEWLINE, fc->fpath, str); fflush(stdout); } void inc_status(struct fragment_context *fc) { int pct; fc->current_progress++; pct = 100 * fc->current_progress / fc->max_progress; if (pct != fc->old_pct) { if (verbose) printf("%s: %d%%" STATUS_NEWLINE, fc->fpath, pct); fflush(stdout); fc->old_pct = pct; } } int cleanup_donor_files(struct 
fragment_context *fc, int report_errors) { int ret; char tmp_inode_name[PATH_MAX]; while (donor_files) { snprintf(tmp_inode_name, PATH_MAX, donor_file_template, --donor_files); ret = unlink(tmp_inode_name); if (report_errors && ret) { perror(tmp_inode_name); return ret; } inc_status(fc); } return 0; } off_t calculate_max_files(off_t num_blocks) { off_t x = statvfsbuf.f_bavail / num_blocks; /* Only use user setting if there's space. */ if (max_donor_files > 0 && x > max_donor_files) return max_donor_files; return x; } int generic_frag_file(const char *fpath, const struct stat *sb, struct fragment_profile *fp) { struct fragment_context fc; struct move_extent move_data; off_t num_blocks, block, max_files; int ret, donor_fd, fd; fc.fpath = fpath; fc.max_progress = 0; fc.current_progress = 0; fc.old_pct = -1; /* Screen out non-files or single-block files. */ if (!S_ISREG(sb->st_mode)) return 0; num_blocks = sb->st_size / statvfsbuf.f_bsize; if (sb->st_size % statvfsbuf.f_bsize) num_blocks++; if (num_blocks < 2) return 0; fd = open(fpath, O_RDWR); if (fd < 0) { perror(fpath); ret = -errno; goto out; } /* Kernel can return -ENODATA if we don't sync the source file first. */ emit_status(&fc, "syncing..."); fsync(fd); emit_status(&fc, " "); /* Prepare for donor files */ assert(!donor_files); donor_files = 0; snprintf(donor_file_template, PATH_MAX, "%s.%%lu.defrag", fpath); /* Figure out the maximum donor file count for this file */ max_files = calculate_max_files(num_blocks); ret = fp->prepare(&fc, max_files, num_blocks); if (ret) goto err; /* Start moving blocks */ memset(&move_data, 0, sizeof(move_data)); move_data.len = 1; for (block = num_blocks - 1; block >= 0; block--) { donor_fd = fp->get_donor_fd(&fc, max_files, num_blocks); if (donor_fd < 0) goto err; /* Swap blocks */ /* NB: Source and donor logical block must be the same. 
*/ move_data.donor_fd = donor_fd; move_data.orig_start = move_data.donor_start = block; move_data.moved_len = 0; ret = ioctl(fd, EXT4_IOC_MOVE_EXT, &move_data); if (ret < 0) { perror(fpath); goto err2; } ret = close(donor_fd); if (ret) { perror("closing donor file"); goto err; } inc_status(&fc); } cleanup_donor_files(&fc, 0); print_status(&fc, "Done."); close(fd); return 0; err2: cleanup_donor_files(&fc, 0); close(donor_fd); err: close(fd); out: return ret; } /* * So, to "reverse" the source logical block numbers, create a donor * file for every block and do the swap. Occasionally flush out the * donor files. Iterate the source file's blocks backwards in the * hope of maximizing the amount of extent blocks that must also be * dumped all over the filesystem. */ int reverse_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks) { fc->max_progress = 3 * num_blocks; return 0; } int reverse_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks) { char tmp_inode_name[PATH_MAX]; int donor_fd, ret; /* Clean out donor files */ if (donor_files > max_files) { ret = cleanup_donor_files(fc, 1); if (ret) return ret; } /* Create hidden donor inode */ snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor_files++); donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR); if (donor_fd < 0) { perror(tmp_inode_name); fprintf(stderr, "Is the fragmenter already running?\n"); errno = EBUSY; return -1; } /* Allocate space in the donor file */ ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize); if (ret) { perror(tmp_inode_name); close(donor_fd); return ret; } inc_status(fc); return donor_fd; } /* * So, to "randomize" the source logical block numbers, create a bunch * of donor files. For each block, pick a donor file at random and * swap blocks with it. 
*/ int random_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks) { int donor_fd, ret; char tmp_inode_name[PATH_MAX]; fc->max_progress = num_blocks + (2 * max_files); /* Allocate the donor files */ for (donor_files = 0; donor_files < max_files; donor_files++) { /* Create donor inode */ snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor_files); donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR); if (donor_fd < 0) { perror(tmp_inode_name); fprintf(stderr, "Is a fragmenter already running?\n"); return -1; } /* Allocate space in the donor file */ ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize); if (ret) { perror(tmp_inode_name); close(donor_fd); return -1; } close(donor_fd); inc_status(fc); } return 0; } int random_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks) { char tmp_inode_name[PATH_MAX]; int donor_fd; off_t donor = random() * max_files / RAND_MAX; /* Reopen donor inode */ snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor); donor_fd = open(tmp_inode_name, O_WRONLY, S_IRUSR); if (donor_fd < 0) { perror(tmp_inode_name); errno = EBUSY; return -1; } return donor_fd; } static struct fragment_profile profiles[] = { {"random", random_get_donor_fd, random_prepare}, {"reverse", reverse_get_donor_fd, reverse_prepare}, {NULL}, }; int fragment_file(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) { return generic_frag_file(fpath, sb, profile); } void print_help(char *progname) { printf("Usage: %s [-m max_files] [-s random|reverse] [-v] pathspec [pathspecs...]\n", progname); printf("-m Number of donor files to create while fragmenting. 0 = automatic\n"); printf("-s Set fragmentation strategy. 
(\"reverse\" or \"random\" (default))\n"); printf("-v Print progress indicators.\n"); } int main(int argc, char *argv[]) { struct fragment_profile *fp; struct statfs statfsbuf; struct stat statbuf; int i, ret, opt; profile = profiles; if (argc < 2) { print_help(argv[0]); return 0; } while ((opt = getopt(argc, argv, "vm:s:")) != -1) { switch (opt) { case 'm': max_donor_files = atoi(optarg); break; case 's': fp = profiles; while (fp->name) { if (!strcmp(fp->name, optarg)) { profile = fp; break; } fp++; } if (!fp->name) { print_help(argv[0]); return 1; } break; case 'v': verbose = 1; break; default: print_help(argv[0]); return 1; } } if (verbose) printf(PROGRAM ", strategy \"%s\" max donors %d.\n", profile->name, max_donor_files); for (i = optind; i < argc; i++) { /* ignore files on non-ext4 filesystems */ ret = statfs(argv[i], &statfsbuf); if (ret) { perror(argv[i]); break; } if (statfsbuf.f_type != EXT3_SUPER_MAGIC) { ret = -ENOENT; fprintf(stderr, "%s: Ignoring file on non-ext2/3/4 filesystem.\n", argv[i]); break; } ret = stat(argv[i], &statbuf); if (ret) { perror(argv[i]); break; } ret = statvfs(argv[i], &statvfsbuf); if (ret) { perror(argv[i]); break; } if (S_ISDIR(statbuf.st_mode)) nftw(argv[i], fragment_file, 64, FTW_MOUNT | FTW_PHYS); else fragment_file(argv[i], &statbuf, 0, NULL); } sync(); return 0; }