Hi all,
I wrote a program called e4frag that deliberately tries to fragment an ext4
filesystem via EXT4_IOC_MOVE_EXT so that I could run e4defrag through its
paces. While running e4frag and e4defrag concurrently on a kernel source tree,
I discovered ongoing file corruption. It appears that if e4frag and e4defrag
hit the same file at same time, the file ends up with a 4K data block from
somewhere else. "Somewhere else" seems to be a small chunk of binary gibberish
followed by contents from other files(!) Obviously this isn't a good thing to
see, since today it's header files but tomorrow it could be the credit card/SSN
database. :)
Ted asked me to send out a copy of the program ASAP, so the test program source
code is at the end of this message. To build it, run:
$ gcc -o e4frag -O2 -Wall e4frag.c
and then to run it:
(unpack something in /path/to/files)
$ cp -pRdu /path/to/files /path/to/intact_files
$ while true; do e4defrag /path/to/files & done
$ while true; do ./e4frag -m 500 -s random /path/to/files & done
$ while true; do diff -Naurp /path/to/intact_files /path/to/files; done
...and wait for diff to cough up differences. This seems to happen on
2.6.34-rc3, and only if e4frag and e4defrag are running concurrently. Running
e4frag or e4defrag in a serial loop doesn't produce this corruption, so I think
it's purely a concurrent access problem.
On a lark, I ran fsck afterwards:
# fsck -C -f -y /dev/sda
fsck from util-linux-ng 2.16
e2fsck 1.41.9 (22-Aug-2009)
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information
Inode bitmap differences: -534593 -534654 -534744 -534768 -534947 -662276
-662438 -1058789 -1058850 -1059026 -1059219 -1318193 -1583270 -1583378 -1583422
-2234673 -2631973 -3156444 -3156632 -3680888 -3680950 -4204922 -4205252
-4205286
Fix? yes
/dev/sda: ***** FILE SYSTEM WAS MODIFIED *****
/dev/sda: 291596/107143168 files (4.6% non-contiguous), 7829819/428544000 blocks
Is this a sign that the extent tree is getting corrupted somehow? Ted thought
that it might have something to do with an ialloc mutex, I think.
--D
/*
* Try to fragment files.
* Copyright (C) 2010 IBM. All rights reserved.
*
* This program is licensed under the GPLv2.
* Signed-off-by: Darrick J. Wong <[email protected]>
*/
#define _FILE_OFFSET_BITS 64
#define _XOPEN_SOURCE 600
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <ftw.h>
#include <sys/vfs.h>
#include <sys/statfs.h>
#include <assert.h>
#include <sys/statvfs.h>
#include <errno.h>
#include <linux/magic.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <unistd.h>
#include <stdlib.h>
#include <asm-generic/int-l64.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#define DEFAULT_MAX_DONOR_FILES 0
#define STATUS_NEWLINE "\r"
#define PROGRAM "e4frag v0.2"
struct fragment_context {
const char *fpath;
off_t max_progress;
off_t current_progress;
int old_pct;
};
struct fragment_profile {
const char *name;
int (*get_donor_fd)(struct fragment_context *fc, off_t max_files, off_t num_blocks);
int (*prepare)(struct fragment_context *fc, off_t max_files, off_t num_blocks);
};
static int max_donor_files = DEFAULT_MAX_DONOR_FILES;
static struct statvfs statvfsbuf;
static char donor_file_template[PATH_MAX];
static off_t donor_files; /* expect as many donor files as blocks */
static struct fragment_profile *profile;
static int verbose = 0;
/* Shamelessly stolen from e4defrag.c */
struct move_extent {
__s32 reserved; /* original file descriptor */
__u32 donor_fd; /* donor file descriptor */
__u64 orig_start; /* logical start offset in block for orig */
__u64 donor_start; /* logical start offset in block for donor */
__u64 len; /* block length to be moved */
__u64 moved_len; /* moved block length */
};
#ifndef EXT4_IOC_MOVE_EXT
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#endif
/* end stuff from e4defrag */
void print_status(struct fragment_context *fc, const char *str)
{
if (!verbose)
return;
printf("%s: %s\n", fc->fpath, str);
fflush(stdout);
}
void emit_status(struct fragment_context *fc, const char *str)
{
if (!verbose)
return;
printf("%s: %s" STATUS_NEWLINE, fc->fpath, str);
fflush(stdout);
}
void inc_status(struct fragment_context *fc)
{
int pct;
fc->current_progress++;
pct = 100 * fc->current_progress / fc->max_progress;
if (pct != fc->old_pct) {
if (verbose)
printf("%s: %d%%" STATUS_NEWLINE, fc->fpath, pct);
fflush(stdout);
fc->old_pct = pct;
}
}
int cleanup_donor_files(struct fragment_context *fc, int report_errors)
{
int ret;
char tmp_inode_name[PATH_MAX];
while (donor_files) {
snprintf(tmp_inode_name, PATH_MAX, donor_file_template, --donor_files);
ret = unlink(tmp_inode_name);
if (report_errors && ret) {
perror(tmp_inode_name);
return ret;
}
inc_status(fc);
}
return 0;
}
off_t calculate_max_files(off_t num_blocks)
{
off_t x = statvfsbuf.f_bavail / num_blocks;
/* Only use user setting if there's space. */
if (max_donor_files > 0 && x > max_donor_files)
return max_donor_files;
return x;
}
int generic_frag_file(const char *fpath, const struct stat *sb, struct fragment_profile *fp)
{
struct fragment_context fc;
struct move_extent move_data;
off_t num_blocks, block, max_files;
int ret, donor_fd, fd;
fc.fpath = fpath;
fc.max_progress = 0;
fc.current_progress = 0;
fc.old_pct = -1;
/* Screen out non-files or single-block files. */
if (!S_ISREG(sb->st_mode))
return 0;
num_blocks = sb->st_size / statvfsbuf.f_bsize;
if (sb->st_size % statvfsbuf.f_bsize)
num_blocks++;
if (num_blocks < 2)
return 0;
fd = open(fpath, O_RDWR);
if (fd < 0) {
perror(fpath);
ret = -errno;
goto out;
}
/* Kernel can return -ENODATA if we don't sync the source file first. */
emit_status(&fc, "syncing...");
fsync(fd);
emit_status(&fc, " ");
/* Prepare for donor files */
assert(!donor_files);
donor_files = 0;
snprintf(donor_file_template, PATH_MAX, "%s.%%lu.defrag", fpath);
/* Figure out the maximum donor file count for this file */
max_files = calculate_max_files(num_blocks);
ret = fp->prepare(&fc, max_files, num_blocks);
if (ret)
goto err;
/* Start moving blocks */
memset(&move_data, 0, sizeof(move_data));
move_data.len = 1;
for (block = num_blocks - 1; block >= 0; block--) {
donor_fd = fp->get_donor_fd(&fc, max_files, num_blocks);
if (donor_fd < 0)
goto err;
/* Swap blocks */
/* NB: Source and donor logical block must be the same. */
move_data.donor_fd = donor_fd;
move_data.orig_start = move_data.donor_start = block;
move_data.moved_len = 0;
ret = ioctl(fd, EXT4_IOC_MOVE_EXT, &move_data);
if (ret < 0) {
perror(fpath);
goto err2;
}
ret = close(donor_fd);
if (ret) {
perror("closing donor file");
goto err;
}
inc_status(&fc);
}
cleanup_donor_files(&fc, 0);
print_status(&fc, "Done.");
close(fd);
return 0;
err2:
cleanup_donor_files(&fc, 0);
close(donor_fd);
err:
close(fd);
out:
return ret;
}
/*
* So, to "reverse" the source logical block numbers, create a donor
* file for every block and do the swap. Occasionally flush out the
* donor files. Iterate the source file's blocks backwards in the
* hope of maximizing the amount of extent blocks that must also be
* dumped all over the filesystem.
*/
int reverse_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
fc->max_progress = 3 * num_blocks;
return 0;
}
int reverse_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
char tmp_inode_name[PATH_MAX];
int donor_fd, ret;
/* Clean out donor files */
if (donor_files > max_files) {
ret = cleanup_donor_files(fc, 1);
if (ret)
return ret;
}
/* Create hidden donor inode */
snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor_files++);
donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR);
if (donor_fd < 0) {
perror(tmp_inode_name);
fprintf(stderr, "Is the fragmenter already running?\n");
errno = EBUSY;
return -1;
}
/* Allocate space in the donor file */
ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize);
if (ret) {
perror(tmp_inode_name);
close(donor_fd);
return ret;
}
inc_status(fc);
return donor_fd;
}
/*
* So, to "randomize" the source logical block numbers, create a bunch
* of donor files. For each block, pick a donor file at random and
* swap blocks with it.
*/
int random_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
int donor_fd, ret;
char tmp_inode_name[PATH_MAX];
fc->max_progress = num_blocks + (2 * max_files);
/* Allocate the donor files */
for (donor_files = 0; donor_files < max_files; donor_files++) {
/* Create donor inode */
snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor_files);
donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR);
if (donor_fd < 0) {
perror(tmp_inode_name);
fprintf(stderr, "Is a fragmenter already running?\n");
return -1;
}
/* Allocate space in the donor file */
ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize);
if (ret) {
perror(tmp_inode_name);
close(donor_fd);
return -1;
}
close(donor_fd);
inc_status(fc);
}
return 0;
}
int random_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
char tmp_inode_name[PATH_MAX];
int donor_fd;
off_t donor = random() * max_files / RAND_MAX;
/* Reopen donor inode */
snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor);
donor_fd = open(tmp_inode_name, O_WRONLY, S_IRUSR);
if (donor_fd < 0) {
perror(tmp_inode_name);
errno = EBUSY;
return -1;
}
return donor_fd;
}
static struct fragment_profile profiles[] = {
{"random", random_get_donor_fd, random_prepare},
{"reverse", reverse_get_donor_fd, reverse_prepare},
{NULL},
};
int fragment_file(const char *fpath, const struct stat *sb, int typeflag,
struct FTW *ftwbuf)
{
return generic_frag_file(fpath, sb, profile);
}
void print_help(char *progname)
{
printf("Usage: %s [-m max_files] [-s random|reverse] [-v] pathspec [pathspecs...]\n", progname);
printf("-m Number of donor files to create while fragmenting. 0 = automatic\n");
printf("-s Set fragmentation strategy. (\"reverse\" or \"random\" (default))\n");
printf("-v Print progress indicators.\n");
}
int main(int argc, char *argv[])
{
struct fragment_profile *fp;
struct statfs statfsbuf;
struct stat statbuf;
int i, ret, opt;
profile = profiles;
if (argc < 2) {
print_help(argv[0]);
return 0;
}
while ((opt = getopt(argc, argv, "vm:s:")) != -1) {
switch (opt) {
case 'm':
max_donor_files = atoi(optarg);
break;
case 's':
fp = profiles;
while (fp->name) {
if (!strcmp(fp->name, optarg)) {
profile = fp;
break;
}
fp++;
}
if (!fp->name) {
print_help(argv[0]);
return 1;
}
break;
case 'v':
verbose = 1;
break;
default:
print_help(argv[0]);
return 1;
}
}
if (verbose)
printf(PROGRAM ", strategy \"%s\" max donors %d.\n", profile->name, max_donor_files);
for (i = optind; i < argc; i++) {
/* ignore files on non-ext4 filesystems */
ret = statfs(argv[i], &statfsbuf);
if (ret) {
perror(argv[i]);
break;
}
if (statfsbuf.f_type != EXT3_SUPER_MAGIC) {
ret = -ENOENT;
fprintf(stderr, "%s: Ignoring file on non-ext2/3/4 filesystem.\n", argv[i]);
break;
}
ret = stat(argv[i], &statbuf);
if (ret) {
perror(argv[i]);
break;
}
ret = statvfs(argv[i], &statvfsbuf);
if (ret) {
perror(argv[i]);
break;
}
if (S_ISDIR(statbuf.st_mode))
nftw(argv[i], fragment_file, 64, FTW_MOUNT | FTW_PHYS);
else
fragment_file(argv[i], &statbuf, 0, NULL);
}
sync();
return 0;
}
On Mon, Apr 05, 2010 at 03:02:20PM -0700, Darrick J. Wong wrote:
> Hi all,
>
> I wrote a program called e4frag that deliberately tries to fragment an ext4
> filesystem via EXT4_IOC_MOVE_EXT so that I could run e4defrag through its
> paces. While running e4frag and e4defrag concurrently on a kernel source tree,
> I discovered ongoing file corruption. It appears that if e4frag and e4defrag
> hit the same file at same time, the file ends up with a 4K data block from
> somewhere else. "Somewhere else" seems to be a small chunk of binary gibberish
> followed by contents from other files(!) Obviously this isn't a good thing to
It seems that if you mount the filesystem with -o sync this problem goes away.
--D
> see, since today it's header files but tomorrow it could be the credit card/SSN
> database. :)
>
> Ted asked me to send out a copy of the program ASAP, so the test program source
> code is at the end of this message. To build it, run:
>
> $ gcc -o e4frag -O2 -Wall e4frag.c
>
> and then to run it:
>
> (unpack something in /path/to/files)
> $ cp -pRdu /path/to/files /path/to/intact_files
> $ while true; do e4defrag /path/to/files & done
> $ while true; do ./e4frag -m 500 -s random /path/to/files & done
> $ while true; do diff -Naurp /path/to/intact_files /path/to/files; done
>
> ...and wait for diff to cough up differences. This seems to happen on
> 2.6.34-rc3, and only if e4frag and e4defrag are running concurrently. Running
> e4frag or e4defrag in a serial loop doesn't produce this corruption, so I think
> it's purely a concurrent access problem.
>
> On a lark, I ran fsck afterwards:
>
> # fsck -C -f -y /dev/sda
> fsck from util-linux-ng 2.16
> e2fsck 1.41.9 (22-Aug-2009)
> Pass 1: Checking inodes, blocks, and sizes
> Pass 2: Checking directory structure
> Pass 3: Checking directory connectivity
> Pass 4: Checking reference counts
> Pass 5: Checking group summary information
> Inode bitmap differences: -534593 -534654 -534744 -534768 -534947 -662276
> -662438 -1058789 -1058850 -1059026 -1059219 -1318193 -1583270 -1583378 -1583422
> -2234673 -2631973 -3156444 -3156632 -3680888 -3680950 -4204922 -4205252
> -4205286
> Fix? yes
>
>
> /dev/sda: ***** FILE SYSTEM WAS MODIFIED *****
> /dev/sda: 291596/107143168 files (4.6% non-contiguous), 7829819/428544000 blocks
>
> Is this a sign that the extent tree is getting corrupted somehow? Ted thought
> that it might have something to do with an ialloc mutex, I think.
>
> --D
>
> /*
> * Try to fragment files.
> * Copyright (C) 2010 IBM. All rights reserved.
> *
> * This program is licensed under the GPLv2.
> * Signed-off-by: Darrick J. Wong <[email protected]>
> */
> #define _FILE_OFFSET_BITS 64
> #define _XOPEN_SOURCE 600
> #define _GNU_SOURCE
>
> #include <stdio.h>
> #include <string.h>
> #include <ftw.h>
> #include <sys/vfs.h>
> #include <sys/statfs.h>
> #include <assert.h>
> #include <sys/statvfs.h>
> #include <errno.h>
> #include <linux/magic.h>
> #include <fcntl.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <sys/param.h>
> #include <unistd.h>
> #include <stdlib.h>
> #include <asm-generic/int-l64.h>
> #include <sys/ioctl.h>
> #include <sys/mman.h>
>
> #define DEFAULT_MAX_DONOR_FILES 0
> #define STATUS_NEWLINE "\r"
> #define PROGRAM "e4frag v0.2"
>
> struct fragment_context {
> const char *fpath;
> off_t max_progress;
> off_t current_progress;
> int old_pct;
> };
>
> struct fragment_profile {
> const char *name;
> int (*get_donor_fd)(struct fragment_context *fc, off_t max_files, off_t num_blocks);
> int (*prepare)(struct fragment_context *fc, off_t max_files, off_t num_blocks);
> };
>
> static int max_donor_files = DEFAULT_MAX_DONOR_FILES;
> static struct statvfs statvfsbuf;
> static char donor_file_template[PATH_MAX];
> static off_t donor_files; /* expect as many donor files as blocks */
> static struct fragment_profile *profile;
> static int verbose = 0;
>
> /* Shamelessly stolen from e4defrag.c */
>
> struct move_extent {
> __s32 reserved; /* original file descriptor */
> __u32 donor_fd; /* donor file descriptor */
> __u64 orig_start; /* logical start offset in block for orig */
> __u64 donor_start; /* logical start offset in block for donor */
> __u64 len; /* block length to be moved */
> __u64 moved_len; /* moved block length */
> };
>
> #ifndef EXT4_IOC_MOVE_EXT
> #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
> #endif
>
> /* end stuff from e4defrag */
>
> void print_status(struct fragment_context *fc, const char *str)
> {
> if (!verbose)
> return;
>
> printf("%s: %s\n", fc->fpath, str);
> fflush(stdout);
> }
>
> void emit_status(struct fragment_context *fc, const char *str)
> {
> if (!verbose)
> return;
>
> printf("%s: %s" STATUS_NEWLINE, fc->fpath, str);
> fflush(stdout);
> }
>
> void inc_status(struct fragment_context *fc)
> {
> int pct;
>
> fc->current_progress++;
> pct = 100 * fc->current_progress / fc->max_progress;
> if (pct != fc->old_pct) {
> if (verbose)
> printf("%s: %d%%" STATUS_NEWLINE, fc->fpath, pct);
> fflush(stdout);
> fc->old_pct = pct;
> }
> }
>
> int cleanup_donor_files(struct fragment_context *fc, int report_errors)
> {
> int ret;
> char tmp_inode_name[PATH_MAX];
>
> while (donor_files) {
> snprintf(tmp_inode_name, PATH_MAX, donor_file_template, --donor_files);
> ret = unlink(tmp_inode_name);
> if (report_errors && ret) {
> perror(tmp_inode_name);
> return ret;
> }
> inc_status(fc);
> }
>
> return 0;
> }
>
> off_t calculate_max_files(off_t num_blocks)
> {
> off_t x = statvfsbuf.f_bavail / num_blocks;
>
> /* Only use user setting if there's space. */
> if (max_donor_files > 0 && x > max_donor_files)
> return max_donor_files;
>
> return x;
> }
>
> int generic_frag_file(const char *fpath, const struct stat *sb, struct fragment_profile *fp)
> {
> struct fragment_context fc;
> struct move_extent move_data;
> off_t num_blocks, block, max_files;
> int ret, donor_fd, fd;
>
> fc.fpath = fpath;
> fc.max_progress = 0;
> fc.current_progress = 0;
> fc.old_pct = -1;
>
> /* Screen out non-files or single-block files. */
> if (!S_ISREG(sb->st_mode))
> return 0;
>
> num_blocks = sb->st_size / statvfsbuf.f_bsize;
> if (sb->st_size % statvfsbuf.f_bsize)
> num_blocks++;
>
> if (num_blocks < 2)
> return 0;
>
> fd = open(fpath, O_RDWR);
> if (fd < 0) {
> perror(fpath);
> ret = -errno;
> goto out;
> }
>
> /* Kernel can return -ENODATA if we don't sync the source file first. */
> emit_status(&fc, "syncing...");
> fsync(fd);
> emit_status(&fc, " ");
>
> /* Prepare for donor files */
> assert(!donor_files);
> donor_files = 0;
> snprintf(donor_file_template, PATH_MAX, "%s.%%lu.defrag", fpath);
>
> /* Figure out the maximum donor file count for this file */
> max_files = calculate_max_files(num_blocks);
>
> ret = fp->prepare(&fc, max_files, num_blocks);
> if (ret)
> goto err;
>
> /* Start moving blocks */
> memset(&move_data, 0, sizeof(move_data));
> move_data.len = 1;
> for (block = num_blocks - 1; block >= 0; block--) {
> donor_fd = fp->get_donor_fd(&fc, max_files, num_blocks);
> if (donor_fd < 0)
> goto err;
>
> /* Swap blocks */
> /* NB: Source and donor logical block must be the same. */
> move_data.donor_fd = donor_fd;
> move_data.orig_start = move_data.donor_start = block;
> move_data.moved_len = 0;
> ret = ioctl(fd, EXT4_IOC_MOVE_EXT, &move_data);
> if (ret < 0) {
> perror(fpath);
> goto err2;
> }
>
> ret = close(donor_fd);
> if (ret) {
> perror("closing donor file");
> goto err;
> }
>
> inc_status(&fc);
> }
>
> cleanup_donor_files(&fc, 0);
> print_status(&fc, "Done.");
> close(fd);
> return 0;
>
> err2:
> cleanup_donor_files(&fc, 0);
> close(donor_fd);
> err:
> close(fd);
> out:
> return ret;
> }
>
> /*
> * So, to "reverse" the source logical block numbers, create a donor
> * file for every block and do the swap. Occasionally flush out the
> * donor files. Iterate the source file's blocks backwards in the
> * hope of maximizing the amount of extent blocks that must also be
> * dumped all over the filesystem.
> */
> int reverse_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks)
> {
> fc->max_progress = 3 * num_blocks;
> return 0;
> }
>
> int reverse_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks)
> {
> char tmp_inode_name[PATH_MAX];
> int donor_fd, ret;
>
> /* Clean out donor files */
> if (donor_files > max_files) {
> ret = cleanup_donor_files(fc, 1);
> if (ret)
> return ret;
> }
>
> /* Create hidden donor inode */
> snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor_files++);
> donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR);
> if (donor_fd < 0) {
> perror(tmp_inode_name);
> fprintf(stderr, "Is the fragmenter already running?\n");
> errno = EBUSY;
> return -1;
> }
>
> /* Allocate space in the donor file */
> ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize);
> if (ret) {
> perror(tmp_inode_name);
> close(donor_fd);
> return ret;
> }
>
> inc_status(fc);
>
> return donor_fd;
> }
>
> /*
> * So, to "randomize" the source logical block numbers, create a bunch
> * of donor files. For each block, pick a donor file at random and
> * swap blocks with it.
> */
> int random_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks)
> {
> int donor_fd, ret;
> char tmp_inode_name[PATH_MAX];
>
> fc->max_progress = num_blocks + (2 * max_files);
>
> /* Allocate the donor files */
> for (donor_files = 0; donor_files < max_files; donor_files++) {
> /* Create donor inode */
> snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor_files);
> donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR);
> if (donor_fd < 0) {
> perror(tmp_inode_name);
> fprintf(stderr, "Is a fragmenter already running?\n");
> return -1;
> }
>
> /* Allocate space in the donor file */
> ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize);
> if (ret) {
> perror(tmp_inode_name);
> close(donor_fd);
> return -1;
> }
>
> close(donor_fd);
> inc_status(fc);
> }
>
> return 0;
> }
>
> int random_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks)
> {
> char tmp_inode_name[PATH_MAX];
> int donor_fd;
> off_t donor = random() * max_files / RAND_MAX;
>
> /* Reopen donor inode */
> snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor);
> donor_fd = open(tmp_inode_name, O_WRONLY, S_IRUSR);
> if (donor_fd < 0) {
> perror(tmp_inode_name);
> errno = EBUSY;
> return -1;
> }
>
> return donor_fd;
> }
>
> static struct fragment_profile profiles[] = {
> {"random", random_get_donor_fd, random_prepare},
> {"reverse", reverse_get_donor_fd, reverse_prepare},
> {NULL},
> };
>
> int fragment_file(const char *fpath, const struct stat *sb, int typeflag,
> struct FTW *ftwbuf)
> {
> return generic_frag_file(fpath, sb, profile);
> }
>
> void print_help(char *progname)
> {
> printf("Usage: %s [-m max_files] [-s random|reverse] [-v] pathspec [pathspecs...]\n", progname);
> printf("-m Number of donor files to create while fragmenting. 0 = automatic\n");
> printf("-s Set fragmentation strategy. (\"reverse\" or \"random\" (default))\n");
> printf("-v Print progress indicators.\n");
> }
>
> int main(int argc, char *argv[])
> {
> struct fragment_profile *fp;
> struct statfs statfsbuf;
> struct stat statbuf;
> int i, ret, opt;
>
> profile = profiles;
>
> if (argc < 2) {
> print_help(argv[0]);
> return 0;
> }
>
> while ((opt = getopt(argc, argv, "vm:s:")) != -1) {
> switch (opt) {
> case 'm':
> max_donor_files = atoi(optarg);
> break;
> case 's':
> fp = profiles;
> while (fp->name) {
> if (!strcmp(fp->name, optarg)) {
> profile = fp;
> break;
> }
> fp++;
> }
>
> if (!fp->name) {
> print_help(argv[0]);
> return 1;
> }
> break;
> case 'v':
> verbose = 1;
> break;
> default:
> print_help(argv[0]);
> return 1;
> }
> }
>
> if (verbose)
> printf(PROGRAM ", strategy \"%s\" max donors %d.\n", profile->name, max_donor_files);
>
> for (i = optind; i < argc; i++) {
> /* ignore files on non-ext4 filesystems */
> ret = statfs(argv[i], &statfsbuf);
> if (ret) {
> perror(argv[i]);
> break;
> }
>
> if (statfsbuf.f_type != EXT3_SUPER_MAGIC) {
> ret = -ENOENT;
> fprintf(stderr, "%s: Ignoring file on non-ext2/3/4 filesystem.\n", argv[i]);
> break;
> }
>
> ret = stat(argv[i], &statbuf);
> if (ret) {
> perror(argv[i]);
> break;
> }
>
> ret = statvfs(argv[i], &statvfsbuf);
> if (ret) {
> perror(argv[i]);
> break;
> }
>
> if (S_ISDIR(statbuf.st_mode))
> nftw(argv[i], fragment_file, 64, FTW_MOUNT | FTW_PHYS);
> else
> fragment_file(argv[i], &statbuf, 0, NULL);
> }
>
> sync();
>
> return 0;
> }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Apr 9, 2010 at 12:20 PM, Darrick J. Wong <[email protected]> wrote:
> On Mon, Apr 05, 2010 at 03:02:20PM -0700, Darrick J. Wong wrote:
>> Hi all,
>>
>> I wrote a program called e4frag that deliberately tries to fragment an ext4
>> filesystem via EXT4_IOC_MOVE_EXT so that I could run e4defrag through its
>> paces. ?While running e4frag and e4defrag concurrently on a kernel source tree,
>> I discovered ongoing file corruption. ?It appears that if e4frag and e4defrag
>> hit the same file at same time, the file ends up with a 4K data block from
>> somewhere else. ?"Somewhere else" seems to be a small chunk of binary gibberish
>> followed by contents from other files(!) ?Obviously this isn't a good thing to
>
> It seems that if you mount the filesystem with -o sync this problem goes away.
>
> --D
That implies to me that there is a missing block flush prior to a lock
being released. That should not be too hard to find by code
inspection.
As conceptually interested as I am in ext4_ioc_move_ext, I have not
really gone through the code with any detail. Maybe I'll contribute
by doing that.
Greg
Hi Darrick,
(2010/04/06 7:02), Darrick J. Wong wrote:
> Hi all,
>
> I wrote a program called e4frag that deliberately tries to fragment an ext4
> filesystem via EXT4_IOC_MOVE_EXT so that I could run e4defrag through its
> paces. While running e4frag and e4defrag concurrently on a kernel source tree,
> I discovered ongoing file corruption. It appears that if e4frag and e4defrag
> hit the same file at same time, the file ends up with a 4K data block from
> somewhere else. "Somewhere else" seems to be a small chunk of binary gibberish
> followed by contents from other files(!) Obviously this isn't a good thing to
> see, since today it's header files but tomorrow it could be the credit card/SSN
> database. :)
>
> Ted asked me to send out a copy of the program ASAP, so the test program source
> code is at the end of this message. To build it, run:
>
> $ gcc -o e4frag -O2 -Wall e4frag.c
>
> and then to run it:
>
> (unpack something in /path/to/files)
> $ cp -pRdu /path/to/files /path/to/intact_files
> $ while true; do e4defrag /path/to/files& done
> $ while true; do ./e4frag -m 500 -s random /path/to/files& done
> $ while true; do diff -Naurp /path/to/intact_files /path/to/files; done
>
> ...and wait for diff to cough up differences. This seems to happen on
> 2.6.34-rc3, and only if e4frag and e4defrag are running concurrently. Running
> e4frag or e4defrag in a serial loop doesn't produce this corruption, so I think
> it's purely a concurrent access problem.
I couldn't reproduce this problem, somehow.
My environment is:
Arch: i386
Kernel: 2.6.34-rc3
e2fsprogs: 1.41.11
Mount option: delalloc, data=ordered, async
Block size: 4KB
Partition size: 100GB
Is there any difference in your case?
And how long does this file corruption take to be detected?
I ran below program all day long, but problem did not occur.
---
#!/bin/bash
TARGET="/mnt/mp1/TEST/linux-2.6.34-rc3"
ORIG="/mnt/mp1/TEST/linux-2.6.34-rc3-orig"
cp -pRdu $TARGET $ORIG
while true; do ./e4defrag -v $TARGET & done
while true; do ./e4frag -m 500 -s random $TARGET & done
while true; do diff -Naurp $ORIG $TARGET; done
---
# The OOM killer sometimes runs while running this program
because this is a heavy load for system, though.
Regards,
Akira Fujita
On Thu, Apr 15, 2010 at 05:27:50PM +0900, Akira Fujita wrote:
> Hi Darrick,
>
> (2010/04/06 7:02), Darrick J. Wong wrote:
>> Hi all,
>>
>> I wrote a program called e4frag that deliberately tries to fragment an ext4
>> filesystem via EXT4_IOC_MOVE_EXT so that I could run e4defrag through its
>> paces. While running e4frag and e4defrag concurrently on a kernel source tree,
>> I discovered ongoing file corruption. It appears that if e4frag and e4defrag
>> hit the same file at same time, the file ends up with a 4K data block from
>> somewhere else. "Somewhere else" seems to be a small chunk of binary gibberish
>> followed by contents from other files(!) Obviously this isn't a good thing to
>> see, since today it's header files but tomorrow it could be the credit card/SSN
>> database. :)
>>
>> Ted asked me to send out a copy of the program ASAP, so the test program source
>> code is at the end of this message. To build it, run:
>>
>> $ gcc -o e4frag -O2 -Wall e4frag.c
>>
>> and then to run it:
>>
>> (unpack something in /path/to/files)
>> $ cp -pRdu /path/to/files /path/to/intact_files
>> $ while true; do e4defrag /path/to/files& done
>> $ while true; do ./e4frag -m 500 -s random /path/to/files& done
>> $ while true; do diff -Naurp /path/to/intact_files /path/to/files; done
>>
>> ...and wait for diff to cough up differences. This seems to happen on
>> 2.6.34-rc3, and only if e4frag and e4defrag are running concurrently. Running
>> e4frag or e4defrag in a serial loop doesn't produce this corruption, so I think
>> it's purely a concurrent access problem.
>
> I couldn't reproduce this problem, somehow.
>
> My environment is:
> Arch: i386
> Kernel: 2.6.34-rc3
> e2fsprogs: 1.41.11
> Mount option: delalloc, data=ordered, async
> Block size: 4KB
> Partition size: 100GB
>
> Is there any difference in your case?
> And how long does this file corruption take to be detected?
>
> I ran below program all day long, but problem did not occur.
Hmm. I was running with 2.6.34-rc3 on x86-64, same block size, though with a
2TB mdraid0. It usually took a few hours to reproduce, though I've noticed
that if I kick off at least as many e4defrags and e4frags, it will show up much
sooner. Thank you for trying this out!
> ---
> #!/bin/bash
>
> TARGET="/mnt/mp1/TEST/linux-2.6.34-rc3"
> ORIG="/mnt/mp1/TEST/linux-2.6.34-rc3-orig"
>
> cp -pRdu $TARGET $ORIG
> while true; do ./e4defrag -v $TARGET & done
> while true; do ./e4frag -m 500 -s random $TARGET & done
> while true; do diff -Naurp $ORIG $TARGET; done
> ---
>
> # The OOM killer sometimes runs while running this program
> because this is a heavy load for system, though.
Hmm... I don't ever see the OOM killer.
I've now seen this show up, just once:
[267630.741537] ------------[ cut here ]------------
[267630.746247] kernel BUG at /home/djwong/linux-2.6.34-rc3-fs/fs/ext4/extents.c:1922!
[267630.753903] invalid opcode: 0000 [#3] PREEMPT SMP
[267630.758855] last sysfs file: /sys/devices/virtual/block/md0/md/metadata_version
[267630.766249] CPU 12
[267630.768274] Modules linked in: ext4 mbcache jbd2 crc16 kvm_intel kvm eeprom i2c_dev ipmi_si ipmi_msghandler iptable_filter coretemp hwmon ip_tables x_tables i2c_scmi i2c_i801 cdc_ether usbnet i2c_core mousedev rtc_cmos evdev rtc_core serio_raw rtc_lib shpchp acpi_cpufreq pci_hotplug ioatdma button dca processor af_packet nfs lockd fscache nfs_acl auth_rpcgss sunrpc virtio_pci virtio_ring sd_mod crc_t10dif sg sr_mod cdrom usbhid hid raid10 raid456 async_raid6_recov async_pq raid6_pq async_xor xor async_memcpy async_tx raid1 raid0 multipath linear md_mod dm_mirror dm_region_hash dm_log dm_snapshot dm_mod virtio_net virtio bridge stp mptsas llc mptscsih lpfc mptbase fan ata_piix scsi_transport_fc ehci_hcd uhci_hcd scsi_transport_sas usbcore scsi_tgt fuse libata nls_base scsi_mod bnx2 ther
mal [last unloaded: crc16]
[267630.842501]
[267630.844091] Pid: 5938, comm: e4defrag Tainted: G D W 2.6.34-rc3-fs64 #12 49Y6512 /System x3550 M2 -[7946AC1]-
[267630.854950] RIP: 0010:[<ffffffffa054d8c2>] [<ffffffffa054d8c2>] ext4_ext_walk_space+0x14a/0x24a [ext4]
[267630.864462] RSP: 0018:ffff88011b047d78 EFLAGS: 00010246
[267630.869864] RAX: 0000000000000000 RBX: 0000000000000039 RCX: ffff880107a57228
[267630.877082] RDX: 00000000ffffffc6 RSI: 0000000000000038 RDI: 0000000000000039
[267630.884304] RBP: ffff88011b047df8 R08: 0000000000000001 R09: 00000000000002f4
[267630.891523] R10: 0000000000000000 R11: 0000000000001000 R12: ffff880377b2d800
[267630.898743] R13: ffff88016b305a58 R14: ffff88016b3058e0 R15: ffffffffa054ba01
[267630.905965] FS: 00007fa4dbf766f0(0000) GS:ffff880205e00000(0000) knlGS:0000000000000000
[267630.914137] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[267630.919971] CR2: 000000000211e5f8 CR3: 0000000185793000 CR4: 00000000000006e0
[267630.927193] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[267630.934415] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[267630.941637] Process e4defrag (pid: 5938, threadinfo ffff88011b046000, task ffff8801f1c81460)
[267630.950156] Stack:
[267630.952259] ffffea000591eab0 ffff880107a57228 ffff8801ffffffc6 ffff88011b047e88
[267630.959634] <0> ffff88016b3059d8 00000001ffffffff 0000000007aebdb8 0000000100000038
[267630.967482] <0> ffff880100000002 ffffffff810c8a2a ffff88011b047e28 ffff88016b305a58
[267630.975525] Call Trace:
[267630.978069] [<ffffffff810c8a2a>] ? do_writepages+0x24/0x2d
[267630.983742] [<ffffffffa054db04>] ext4_fiemap+0x142/0x14f [ext4]
[267630.989839] [<ffffffff810c1f33>] ? filemap_fdatawait+0x21/0x23
[267630.995848] [<ffffffff8111388f>] ioctl_fiemap+0x10f/0x16f
[267631.001424] [<ffffffff81113d57>] do_vfs_ioctl+0x259/0x2cb
[267631.007000] [<ffffffff81113e10>] sys_ioctl+0x47/0x6a
[267631.012146] [<ffffffff81009d32>] system_call_fastpath+0x16/0x1b
[267631.018239] Code: c7 eb 22 66 41 81 f8 00 80 41 0f b7 f8 76 06 81 ef 00 80 00 00 01 f7 01 da b8 01 00 00 00 39 fa 0f 42 fa eb 02 31 c0 39 df 77 04 <0f> 0b eb fe 85 c0 75 19 29 df 89 5d b8 89 7d bc 48 c7 45 b0 00
[267631.038581] RIP [<ffffffffa054d8c2>] ext4_ext_walk_space+0x14a/0x24a [ext4]
[267631.045753] RSP <ffff88011b047d78>
[267631.049404] ---[ end trace 0df9444f77ea4b61 ]---
Judging from the source code, some bitmap somewhere is getting corrupted,
because the test that triggers the bug is if end <= beginning while walking the
extent list. I rebooted the box and ran fsck, and fsck complained about bitmap
errors.
--D
On Thu, Apr 15, 2010 at 3:17 PM, Darrick J. Wong <[email protected]> wrote:
> On Thu, Apr 15, 2010 at 05:27:50PM +0900, Akira Fujita wrote:
>> Hi Darrick,
>>
>> (2010/04/06 7:02), Darrick J. Wong wrote:
>>> Hi all,
>>>
>>> I wrote a program called e4frag that deliberately tries to fragment an ext4
>>> filesystem via EXT4_IOC_MOVE_EXT so that I could run e4defrag through its
>>> paces. ?While running e4frag and e4defrag concurrently on a kernel source tree,
>>> I discovered ongoing file corruption. ?It appears that if e4frag and e4defrag
>>> hit the same file at same time, the file ends up with a 4K data block from
>>> somewhere else. ?"Somewhere else" seems to be a small chunk of binary gibberish
>>> followed by contents from other files(!) ?Obviously this isn't a good thing to
>>> see, since today it's header files but tomorrow it could be the credit card/SSN
>>> database. :)
>>>
>>> Ted asked me to send out a copy of the program ASAP, so the test program source
>>> code is at the end of this message. ?To build it, run:
>>>
>>> $ gcc -o e4frag -O2 -Wall e4frag.c
>>>
>>> and then to run it:
>>>
>>> (unpack something in /path/to/files)
>>> $ cp -pRdu /path/to/files /path/to/intact_files
>>> $ while true; do e4defrag /path/to/files& ?done
>>> $ while true; do ./e4frag -m 500 -s random /path/to/files& ?done
>>> $ while true; do diff -Naurp /path/to/intact_files /path/to/files; done
>>>
>>> ...and wait for diff to cough up differences. ?This seems to happen on
>>> 2.6.34-rc3, and only if e4frag and e4defrag are running concurrently. ?Running
>>> e4frag or e4defrag in a serial loop doesn't produce this corruption, so I think
>>> it's purely a concurrent access problem.
>>
>> I couldn't reproduce this problem, somehow.
>>
>> My environment is:
>> Arch: i386
>> Kernel: 2.6.34-rc3
>> e2fsprogs: 1.41.11
>> Mount option: delalloc, data=ordered, async
>> Block size: 4KB
>> Partition size: 100GB
>>
>> Is there any difference in your case?
>> And how long does this file corruption take to be detected?
>>
>> I ran below program all day long, but problem did not occur.
>
> Hmm. ?I was running with 2.6.34-rc3 on x86-64, same block size, though with a
> 2TB mdraid0. ?It usually took a few hours to reproduce, though I've noticed
> that if I kick off at least as many e4defrags and e4frags, it will show up much
> sooner. ?Thank you for trying this out!
If its not reproducible on a simple disk, it could be a bug in the
barrier code for mdraid.
But I think raid0 support for barriers has been around for a long time.
Greg