2007-07-25 05:36:43

by Aneesh Kumar K.V

[permalink] [raw]
Subject: e2fsprogs: Undo I/O Manager and large inode migration support in tune2fs


Hi Ted,

The series is on top of

f1f115a78f5ea599fc5f8815a741d43fedd5840d

Changes from last post include

a) Add O_EXCL flag to tdb_open.
b) Fix the return value when we fail to create the tdb file
c) Add tune2fs -I <new_inode_size> for migrating to large inode size.


2007-07-25 05:37:23

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH 1/3] e2fsprogs: Add undo I/O manager

From: Aneesh Kumar K.V <[email protected]>

This I/O manager saves the contents of the location being overwritten
to a tdb database. This helps in undoing the changes done to the
file system.

The call sequence involve

set_undo_io_backing_manager(unix_io_manager);
set_undo_io_backup_file("/tmp/test.tdb");
retval = ext2fs_open2(dev_name, 0, flags,
superblock, block_size, undo_io_manager,
&current_fs);

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
lib/ext2fs/Makefile.in | 7 +-
lib/ext2fs/ext2_io.h | 5 +
lib/ext2fs/undo_io.c | 500 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 510 insertions(+), 2 deletions(-)
create mode 100644 lib/ext2fs/undo_io.c

diff --git a/lib/ext2fs/Makefile.in b/lib/ext2fs/Makefile.in
index 70e18e7..7afd5eb 100644
--- a/lib/ext2fs/Makefile.in
+++ b/lib/ext2fs/Makefile.in
@@ -66,7 +66,8 @@ OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_OBJS) $(E2IMAGE_LIB_OBJS) \
unix_io.o \
unlink.o \
valid_blk.o \
- version.o
+ version.o \
+ undo_io.o

SRCS= ext2_err.c \
$(srcdir)/alloc.c \
@@ -132,7 +133,8 @@ SRCS= ext2_err.c \
$(srcdir)/tst_bitops.c \
$(srcdir)/tst_byteswap.c \
$(srcdir)/tst_getsize.c \
- $(srcdir)/tst_iscan.c
+ $(srcdir)/tst_iscan.c \
+ $(srcdir)/undo_io.c

HFILES= bitops.h ext2fs.h ext2_io.h ext2_fs.h ext2_ext_attr.h ext3_extents.h \
tdb.h
@@ -573,3 +575,4 @@ tst_iscan.o: $(srcdir)/tst_iscan.c $(srcdir)/ext2_fs.h \
$(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2fs.h \
$(srcdir)/ext2_fs.h $(srcdir)/ext3_extents.h $(top_srcdir)/lib/et/com_err.h \
$(srcdir)/ext2_io.h $(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/bitops.h
+undo_io.o: $(srcdir)/undo_io.c $(srcdir)/ext2_fs.h $(srcdir)/ext2fs.h
diff --git a/lib/ext2fs/ext2_io.h b/lib/ext2fs/ext2_io.h
index eada278..476eb4d 100644
--- a/lib/ext2fs/ext2_io.h
+++ b/lib/ext2fs/ext2_io.h
@@ -96,6 +96,11 @@ extern errcode_t io_channel_write_byte(io_channel channel,
/* unix_io.c */
extern io_manager unix_io_manager;

+/* undo_io.c */
+extern io_manager undo_io_manager;
+extern errcode_t set_undo_io_backing_manager(io_manager manager);
+extern errcode_t set_undo_io_backup_file(char *file_name);
+
/* test_io.c */
extern io_manager test_io_manager, test_io_backing_manager;
extern void (*test_io_cb_read_blk)
diff --git a/lib/ext2fs/undo_io.c b/lib/ext2fs/undo_io.c
new file mode 100644
index 0000000..320428c
--- /dev/null
+++ b/lib/ext2fs/undo_io.c
@@ -0,0 +1,500 @@
+/*
+ * undo_io.c --- This is the undo io manager that copies the old data that
+ * copies the old data being overwritten into a tdb database
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Aneesh Kumar K.V <[email protected]>
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ * %End-Header%
+ */
+
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+
+#include <stdio.h>
+#include <string.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#if HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#include <fcntl.h>
+#include <time.h>
+#ifdef __linux__
+#include <sys/utsname.h>
+#endif
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#if HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+
+#include "tdb.h"
+
+#include "ext2_fs.h"
+#include "ext2fs.h"
+
+/*
+ * For checking structure magic numbers...
+ */
+
+#define EXT2_CHECK_MAGIC(struct, code) \
+ if ((struct)->magic != (code)) return (code)
+
+
+
+struct undo_private_data {
+ int magic;
+ TDB_CONTEXT *tdb;
+ char *tdb_file;
+
+ /* The backing io channel */
+ io_channel real;
+
+ /* to support offset in unix I/O manager */
+ ext2_loff_t offset;
+};
+
+static errcode_t undo_open(const char *name, int flags, io_channel *channel);
+static errcode_t undo_close(io_channel channel);
+static errcode_t undo_set_blksize(io_channel channel, int blksize);
+static errcode_t undo_read_blk(io_channel channel, unsigned long block,
+ int count, void *data);
+static errcode_t undo_write_blk(io_channel channel, unsigned long block,
+ int count, const void *data);
+static errcode_t undo_flush(io_channel channel);
+static errcode_t undo_write_byte(io_channel channel, unsigned long offset,
+ int size, const void *data);
+static errcode_t undo_set_option(io_channel channel, const char *option,
+ const char *arg);
+
+static struct struct_io_manager struct_undo_manager = {
+ EXT2_ET_MAGIC_IO_MANAGER,
+ "Undo I/O Manager",
+ undo_open,
+ undo_close,
+ undo_set_blksize,
+ undo_read_blk,
+ undo_write_blk,
+ undo_flush,
+ undo_write_byte,
+ undo_set_option
+};
+
+io_manager undo_io_manager = &struct_undo_manager;
+static io_manager undo_io_backing_manager ;
+static char *tdb_file ;
+static int tdb_data_size = 0;
+
+errcode_t set_undo_io_backing_manager(io_manager manager)
+{
+ /*
+ * We may want to do some validation later
+ */
+ undo_io_backing_manager = manager;
+ return 0;
+}
+
+errcode_t set_undo_io_backup_file(char *file_name)
+{
+ tdb_file = strdup(file_name);
+
+ if (tdb_file == NULL) {
+ return EXT2_ET_NO_MEMORY;
+ }
+
+ return 0;
+}
+
+
+static errcode_t undo_write_tdb(io_channel channel,
+ unsigned long block, int count)
+
+{
+ int size, loop_count = 0, i;
+ unsigned long block_num, backing_blk_num;
+ errcode_t retval = 0;
+ ext2_loff_t offset;
+ struct undo_private_data *data;
+ TDB_DATA tdb_key, tdb_data;
+ char *read_ptr;
+
+ data = (struct undo_private_data *) channel->private_data;
+
+
+ if (data->tdb == NULL) {
+ /*
+ * Transaction database not initialized
+ */
+ return 0;
+ }
+
+ /*
+ * Set the block size used to read for tdb
+ */
+ if (!tdb_data_size)
+ tdb_data_size = channel->block_size;
+
+ if (count == 1)
+ size = channel->block_size;
+ else {
+ if (count < 0)
+ size = -count;
+ else
+ size = count * channel->block_size;
+ }
+
+ /*
+ * Data is stored in tdb database as blocks of tdb_data_size size
+ * This helps in efficient lookup further.
+ *
+ * We divide the disk to blocks of tdb_data_size.
+ */
+
+ block_num = ((block*channel->block_size)+data->offset)/tdb_data_size;
+
+
+ loop_count = (size + tdb_data_size -1)/tdb_data_size;
+
+ tdb_transaction_start(data->tdb);
+ for (i = 0; i < loop_count; i++) {
+
+ tdb_key.dptr = (unsigned char *)&block_num;
+ tdb_key.dsize = sizeof(block_num);
+
+ /*
+ * Check if we have the record already
+ */
+ if (tdb_exists(data->tdb, tdb_key)) {
+
+ /* Try the next block */
+ block_num++;
+ continue;
+ }
+
+ /*
+ * Read one block using the backing I/O manager
+ * The backing I/O manager block size may be
+ * different from the tdb_data_size.
+ * Also we need to recalcuate the block number with respect
+ * to the backing I/O manager.
+ */
+
+ offset = block_num * tdb_data_size;
+ backing_blk_num = (offset - data->offset) / channel->block_size;
+
+ count = tdb_data_size +
+ ((offset - data->offset) % channel->block_size);
+
+ retval = ext2fs_get_mem(count, &read_ptr);
+ if (retval) {
+ tdb_transaction_cancel(data->tdb);
+ return retval;
+ }
+
+ memset(read_ptr, 0, count);
+
+ retval = io_channel_read_blk(data->real,
+ backing_blk_num,
+ -count, read_ptr);
+ if (retval) {
+ free(read_ptr);
+ tdb_transaction_cancel(data->tdb);
+ return retval;
+ }
+
+
+ tdb_data.dptr = read_ptr +
+ ((offset - data->offset) % channel->block_size);
+
+ tdb_data.dsize = tdb_data_size;
+
+#ifdef DEBUG
+ printf("Printing with key %ld data %x and size %d\n",
+ block_num,
+ tdb_data.dptr, tdb_data_size);
+#endif
+
+ retval = tdb_store(data->tdb, tdb_key, tdb_data, TDB_INSERT);
+ if (retval == -1) {
+ /*
+ * TDB_ERR_EXISTS cannot happen because we
+ * have already verified it doesn't exist
+ */
+ tdb_transaction_cancel(data->tdb);
+ retval = EXT2_ET_TDB_ERR_IO;
+ free(read_ptr);
+ return retval;
+
+ }
+ free(read_ptr);
+ /* Next block */
+ block_num++;
+ }
+
+ tdb_transaction_commit(data->tdb);
+
+
+ return retval;
+}
+
+static TDB_CONTEXT *undo_setup_tdb(char *tdb_file,
+ struct undo_private_data *data)
+{
+ errcode_t retval;
+
+ data->tdb = tdb_open(tdb_file, 0, TDB_CLEAR_IF_FIRST,
+ O_RDWR | O_CREAT | O_TRUNC | O_EXCL, 0600);
+ return data->tdb;
+
+}
+static errcode_t undo_open(const char *name, int flags, io_channel *channel)
+{
+ io_channel io = NULL;
+ struct undo_private_data *data = NULL;
+ errcode_t retval;
+ int open_flags;
+ struct stat st;
+
+ if (name == 0)
+ return EXT2_ET_BAD_DEVICE_NAME;
+ retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
+ if (retval)
+ return retval;
+ memset(io, 0, sizeof(struct struct_io_channel));
+ io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
+ retval = ext2fs_get_mem(sizeof(struct undo_private_data), &data);
+ if (retval)
+ goto cleanup;
+
+ io->manager = undo_io_manager;
+ retval = ext2fs_get_mem(strlen(name)+1, &io->name);
+ if (retval)
+ goto cleanup;
+
+ strcpy(io->name, name);
+ io->private_data = data;
+ io->block_size = 1024;
+ io->read_error = 0;
+ io->write_error = 0;
+ io->refcount = 1;
+
+ memset(data, 0, sizeof(struct undo_private_data));
+ data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
+
+ if (undo_io_backing_manager) {
+ retval = undo_io_backing_manager->open(name, flags,
+ &data->real);
+ if (retval)
+ goto cleanup;
+ } else {
+ data->real = 0;
+ }
+
+ /* setup the tdb file */
+ if (undo_setup_tdb(tdb_file, data) == NULL ) {
+ /*
+ * This retval results in the below
+ * string in com_err
+ * "TDB: Record exists". This helps
+ * in finding out that the error is
+ * with respect to TDB
+ */
+ retval = EXT2_ET_TDB_ERR_EXISTS;
+ goto cleanup;
+ }
+
+ *channel = io;
+ return 0;
+
+cleanup:
+ if (data->real)
+ io_channel_close(data->real);
+
+ if (data)
+ ext2fs_free_mem(&data);
+
+ if (io)
+ ext2fs_free_mem(&io);
+
+ return retval;
+}
+
+static errcode_t undo_close(io_channel channel)
+{
+ struct undo_private_data *data;
+ errcode_t retval = 0;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ if (--channel->refcount > 0)
+ return 0;
+
+ if (data->real)
+ retval = io_channel_close(data->real);
+
+ if (data->tdb)
+ tdb_close(data->tdb);
+
+ ext2fs_free_mem(&channel->private_data);
+ if (channel->name)
+ ext2fs_free_mem(&channel->name);
+ ext2fs_free_mem(&channel);
+
+ return retval;
+}
+
+static errcode_t undo_set_blksize(io_channel channel, int blksize)
+{
+ struct undo_private_data *data;
+ errcode_t retval;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ if (data->real)
+ retval = io_channel_set_blksize(data->real, blksize);
+
+ channel->block_size = blksize;
+ return retval;
+}
+
+
+static errcode_t undo_read_blk(io_channel channel, unsigned long block,
+ int count, void *buf)
+{
+ errcode_t retval;
+ struct undo_private_data *data;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ if (data->real)
+ retval = io_channel_read_blk(data->real, block, count, buf);
+
+ return retval;
+}
+
+static errcode_t undo_write_blk(io_channel channel, unsigned long block,
+ int count, const void *buf)
+{
+ struct undo_private_data *data;
+ errcode_t retval = 0;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ /*
+ * First write the existing content into database
+ */
+ retval = undo_write_tdb(channel, block, count);
+ if (retval)
+ return retval;
+
+
+ if (data->real)
+ retval = io_channel_write_blk(data->real, block, count, buf);
+
+ return retval;
+
+}
+
+static errcode_t undo_write_byte(io_channel channel, unsigned long offset,
+ int size, const void *buf)
+{
+ struct undo_private_data *data;
+ errcode_t retval = 0;
+ ssize_t actual;
+ ext2_loff_t location;
+ unsigned long blk_num, count;;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ location = offset + data->offset;
+ blk_num = location/channel->block_size;
+ /*
+ * the size specified may spread across multiple blocks
+ * also make sure we account for the fact that block start
+ * offset for tdb is different from the backing I/O manager
+ * due to possible different block size
+ */
+ count = (size + (location % channel->block_size) +
+ channel->block_size -1)/channel->block_size;
+
+ retval = undo_write_tdb(channel, blk_num, count);
+ if (retval)
+ return retval;
+
+ if (data->real && data->real->manager->write_byte)
+ retval = io_channel_write_byte(data->real, offset, size, buf);
+
+ return retval;
+}
+
+/*
+ * Flush data buffers to disk.
+ */
+static errcode_t undo_flush(io_channel channel)
+{
+ errcode_t retval = 0;
+ struct undo_private_data *data;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ if (data->real)
+ retval = io_channel_flush(data->real);
+
+ return retval;
+}
+
+static errcode_t undo_set_option(io_channel channel, const char *option,
+ const char *arg)
+{
+ errcode_t retval = 0;
+ struct undo_private_data *data;
+ unsigned long tmp;
+ char *end;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ /*
+ * Need to support offset option to work with
+ * Unix I/O manager
+ */
+ if (data->real && data->real->manager->set_option) {
+
+ retval = data->real->manager->set_option(data->real,
+ option, arg);
+ }
+
+ if (!retval && !strcmp(option, "offset")) {
+ if (!arg)
+ return EXT2_ET_INVALID_ARGUMENT;
+
+ tmp = strtoul(arg, &end, 0);
+ if (*end)
+ return EXT2_ET_INVALID_ARGUMENT;
+ data->offset = tmp;
+ }
+
+ return retval;
+}
--
1.5.3.rc2.22.g69a9b-dirty

2007-07-25 05:37:45

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH 2/3] e2fsprogs: Add undoe2fs

From: Aneesh Kumar K.V <[email protected]>

undoe2fs can be used to replay the transaction saved
in the transaction file using undo I/O Manager

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
misc/Makefile.in | 10 +++++-
misc/undoe2fs.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 85 insertions(+), 2 deletions(-)
create mode 100644 misc/undoe2fs.c

diff --git a/misc/Makefile.in b/misc/Makefile.in
index ccad78c..51bb17a 100644
--- a/misc/Makefile.in
+++ b/misc/Makefile.in
@@ -15,7 +15,7 @@ INSTALL = @[email protected]
@[email protected]_MAN= e2image.8

SPROGS= mke2fs badblocks tune2fs dumpe2fs blkid logsave \
- $(E2IMAGE_PROG) @[email protected]
+ $(E2IMAGE_PROG) @[email protected] undoe2fs
USPROGS= mklost+found filefrag
SMANPAGES= tune2fs.8 mklost+found.8 mke2fs.8 dumpe2fs.8 badblocks.8 \
e2label.8 findfs.8 blkid.8 $(E2IMAGE_MAN) \
@@ -39,6 +39,7 @@ E2IMAGE_OBJS= e2image.o
FSCK_OBJS= fsck.o base_device.o
BLKID_OBJS= blkid.o
FILEFRAG_OBJS= filefrag.o
+UNDOE2FS_OBJS= undoe2fs.o

XTRA_CFLAGS= -I$(srcdir)/../e2fsck -I.

@@ -47,7 +48,7 @@ SRCS= $(srcdir)/tune2fs.c $(srcdir)/mklost+found.c $(srcdir)/mke2fs.c \
$(srcdir)/badblocks.c $(srcdir)/fsck.c $(srcdir)/util.c \
$(srcdir)/uuidgen.c $(srcdir)/blkid.c $(srcdir)/logsave.c \
$(srcdir)/filefrag.c $(srcdir)/base_device.c \
- $(srcdir)/../e2fsck/profile.c
+ $(srcdir)/../e2fsck/profile.c $(srcdir)/undoe2fs.c

LIBS= $(LIBEXT2FS) $(LIBCOM_ERR)
DEPLIBS= $(LIBEXT2FS) $(LIBCOM_ERR)
@@ -108,6 +109,10 @@ e2image: $(E2IMAGE_OBJS) $(DEPLIBS)
@echo " LD [email protected]"
@$(CC) $(ALL_LDFLAGS) -o e2image $(E2IMAGE_OBJS) $(LIBS) $(LIBINTL)

+undoe2fs: $(UNDOE2FS_OBJS) $(DEPLIBS)
+ @echo " LD [email protected]"
+ @$(CC) $(ALL_LDFLAGS) -o undoe2fs $(UNDOE2FS_OBJS) $(LIBS)
+
base_device: base_device.c
@echo " LD [email protected]"
@$(CC) $(ALL_CFLAGS) $(ALL_LDFLAGS) $(srcdir)/base_device.c \
@@ -434,3 +439,4 @@ filefrag.o: $(srcdir)/filefrag.c
base_device.o: $(srcdir)/base_device.c $(srcdir)/fsck.h
profile.o: $(srcdir)/../e2fsck/profile.c $(top_srcdir)/lib/et/com_err.h \
$(srcdir)/../e2fsck/profile.h prof_err.h
+undoe2fs.o: $(srcdir)/undoe2fs.c $(top_srcdir)/lib/ext2fs/tdb.h
diff --git a/misc/undoe2fs.c b/misc/undoe2fs.c
new file mode 100644
index 0000000..d14d44a
--- /dev/null
+++ b/misc/undoe2fs.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright IBM Corporation, 2007
+ * Author Aneesh Kumar K.V <[email protected]>
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ * %End-Header%
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#if HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#include "ext2fs/tdb.h"
+
+void usage(char *prg_name)
+{
+ fprintf(stderr,
+ "Usage: %s <transaction file> <filesystem>\n", prg_name);
+ exit(1);
+
+}
+
+
+main(int argc, char *argv[])
+{
+ TDB_CONTEXT *tdb;
+ TDB_DATA key, data;
+ unsigned long blk_num;
+ unsigned long long int location;
+ int fd, retval;
+
+ if (argc != 3)
+ usage(argv[0]);
+
+ tdb = tdb_open(argv[1], 0, 0, O_RDONLY, 0600);
+
+ if (!tdb) {
+ fprintf(stderr, "Failed tdb_open %s\n", strerror(errno));
+ exit(1);
+ }
+
+ fd = open(argv[2], O_WRONLY);
+ if (fd == -1) {
+ fprintf(stderr, "Failed open %s\n", strerror(errno));
+ exit(1);
+ }
+
+ for (key = tdb_firstkey(tdb); key.dptr; key = tdb_nextkey(tdb, key)) {
+ data = tdb_fetch(tdb, key);
+ if (!data.dptr) {
+ fprintf(stderr,
+ "Failed tdb_fetch %s\n", tdb_errorstr(tdb));
+ exit(1);
+ }
+ blk_num = *(unsigned long *)key.dptr;
+ location = blk_num * data.dsize;
+ printf("Replayed transaction of size %d at location %ld\n",
+ data.dsize, blk_num);
+ retval = lseek(fd, location, SEEK_SET);
+ if (retval == -1) {
+ fprintf(stderr, "Failed lseek %s\n", strerror(errno));
+ exit(1);
+ }
+ retval = write(fd, data.dptr, data.dsize);
+ if (retval == -1) {
+ fprintf(stderr, "Failed write %s\n", strerror(errno));
+ exit(1);
+ }
+ }
+ close(fd);
+ tdb_close(tdb);
+
+}
--
1.5.3.rc2.22.g69a9b-dirty

2007-07-25 05:38:41

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH 3/3] e2fsprogs: Support for large inode migration.

From: Aneesh Kumar K.V <[email protected]>

Add new option -I <inode_size> to tune2fs.
This is used to change the inode size. The size
need to be multiple of 2 and we don't allow to
decrease the inode size.

As a part of increasing the inode size we throw
away the free inodes in the last block group. If
we can't we fail. In such case one can resize the
file system and then try to increase the inode size.


tune2fs use undo I/O manager when migrating to large
inode. This helps in reverting the changes if end results
are not correct.The environment variable TUNE2FS_SCRATCH_DIR
is used to indicate the directory within which the tdb
file need to be created. The file will be named tune2fs-XXXXXX

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
misc/tune2fs.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 266 insertions(+), 3 deletions(-)

diff --git a/misc/tune2fs.c b/misc/tune2fs.c
index 833b994..d8f12a6 100644
--- a/misc/tune2fs.c
+++ b/misc/tune2fs.c
@@ -61,6 +61,7 @@ char * new_label, *new_last_mounted, *new_UUID;
char * io_options;
static int c_flag, C_flag, e_flag, f_flag, g_flag, i_flag, l_flag, L_flag;
static int m_flag, M_flag, r_flag, s_flag = -1, u_flag, U_flag, T_flag;
+static int I_flag;
static time_t last_check_time;
static int print_label;
static int max_mount_count, mount_count, mount_flags;
@@ -71,6 +72,7 @@ static unsigned short errors;
static int open_flag;
static char *features_cmd;
static char *mntopts_cmd;
+static unsigned long int new_inode_size;

int journal_size, journal_flags;
char *journal_device;
@@ -89,7 +91,8 @@ static void usage(void)
"\t[-o [^]mount_options[,...]] [-r reserved_blocks_count]\n"
"\t[-u user] [-C mount_count] [-L volume_label] "
"[-M last_mounted_dir]\n"
- "\t[-O [^]feature[,...]] [-T last_check_time] [-U UUID]"
+ "\t[-O [^]feature[,...]] [-T last_check_time] [-U UUID]\n"
+ "\t[ -I new_inode_size ]"
" device\n"), program_name);
exit (1);
}
@@ -505,7 +508,7 @@ static void parse_tune2fs_options(int argc, char **argv)
struct passwd * pw;

printf("tune2fs %s (%s)\n", E2FSPROGS_VERSION, E2FSPROGS_DATE);
- while ((c = getopt(argc, argv, "c:e:fg:i:jlm:o:r:s:u:C:J:L:M:O:T:U:")) != EOF)
+ while ((c = getopt(argc, argv, "c:e:fg:i:jlm:o:r:s:u:C:J:L:M:O:T:U:I:")) != EOF)
switch (c)
{
case 'c':
@@ -702,6 +705,23 @@ static void parse_tune2fs_options(int argc, char **argv)
open_flag = EXT2_FLAG_RW |
EXT2_FLAG_JOURNAL_DEV_OK;
break;
+ case 'I':
+ new_inode_size = strtoul (optarg, &tmp, 0);
+ if (*tmp) {
+ com_err (program_name, 0,
+ _("bad Inode size - %s"),
+ optarg);
+ usage();
+ }
+ if (!((new_inode_size & (new_inode_size - 1)) == 0)) {
+ com_err (program_name, 0,
+ _("bad Inode size - %s"),
+ optarg);
+ usage();
+ }
+ open_flag = EXT2_FLAG_RW;
+ I_flag = 1;
+ break;
default:
usage();
}
@@ -739,6 +759,215 @@ void do_findfs(int argc, char **argv)
exit(0);
}

+static void *ext2fs_read_next_inode(ext2_filsys fs,
+ void *inode, void *inode_table,
+ int *retval)
+{
+ int blk;
+ static int group = 0, offset = -1;
+ int max_group = fs->super->s_inodes_count/fs->super->s_inodes_per_group;
+ int itable_size = fs->blocksize * fs->inode_blocks_per_group;
+
+ *retval = 0;
+
+ if (offset != -1 && offset < itable_size)
+ goto found;
+
+
+ if (group >= max_group)
+ return NULL;
+
+ blk = fs->group_desc[group].bg_inode_table;
+ *retval = io_channel_read_blk(fs->io, blk,
+ fs->inode_blocks_per_group, inode_table);
+ if (*retval)
+ return NULL;
+ group++;
+ offset = 0;
+
+found:
+ memcpy(inode, inode_table + offset, EXT2_INODE_SIZE(fs->super));
+ offset += EXT2_INODE_SIZE(fs->super);
+ return inode;
+
+}
+
+static int ext2fs_write_itb(ext2_filsys fs, int group,
+ void *inode_table_block, int free_inodes_count)
+{
+ int retval;
+ int blk = fs->group_desc[group].bg_inode_table;
+ retval = io_channel_write_blk(fs->io, blk,
+ fs->inode_blocks_per_group, inode_table_block);
+ if (retval)
+ return retval;
+
+ fs->group_desc[group].bg_free_inodes_count = free_inodes_count;
+
+ return 0;
+}
+
+static int resize_inode(ext2_filsys fs, unsigned long int new_inode_size)
+{
+ ext2_ino_t inode_num;
+ int itable_size, retval;
+ int new_inode_per_group, new_max_inode;
+ int copied_inode_count = 0;
+ void *inode, *buf, *tmp_buf, *inode_table;
+ int free_inodes_count = 0, group_count = 0;
+ int old_size = EXT2_INODE_SIZE(fs->super);
+ int max_group = fs->super->s_inodes_count/fs->super->s_inodes_per_group;
+
+ if (new_inode_size <= fs->super->s_inode_size ) {
+ fprintf(stderr, _("New Inode size too small\n"));
+ return EXT2_ET_INVALID_ARGUMENT;
+ }
+
+ new_inode_per_group = ((fs->inode_blocks_per_group) *
+ fs->blocksize)/new_inode_size;
+
+ new_max_inode = new_inode_per_group * max_group;
+
+ /*
+ * Test whether we can resize the inode
+ */
+ ext2fs_read_inode_bitmap(fs);
+ for (inode_num = new_max_inode+1;
+ inode_num <= fs->super->s_inodes_count; inode_num++) {
+
+ if (ext2fs_test_inode_bitmap(fs->inode_map, inode_num)) {
+ fprintf(stderr, _("Failed for inode %d\n"), inode_num);
+
+ return EXT2_ET_TOOSMALL;
+ }
+ }
+ retval = ext2fs_get_mem(fs->blocksize * fs->inode_blocks_per_group,
+ &inode_table);
+ if (retval) {
+ com_err(__FUNCTION__, retval, "Failed to allocate mem\n");
+ return retval;
+ }
+
+ retval = ext2fs_get_mem(old_size, &inode);
+ if (retval) {
+ com_err(__FUNCTION__, retval, "Failed to allocate mem\n");
+ return retval;
+ }
+
+ /* New inode table block */
+ itable_size = fs->blocksize * fs->inode_blocks_per_group;
+ retval = ext2fs_get_mem(itable_size, &buf);
+ if (retval) {
+ com_err(__FUNCTION__, retval, "Failed to allocate mem\n");
+ return retval;
+ }
+ tmp_buf = buf;
+ inode_num = 0;
+
+ while (ext2fs_read_next_inode(fs, inode, inode_table, &retval)) {
+
+ memcpy(buf, inode, old_size);
+ memset(buf+old_size, 0, new_inode_size - old_size);
+ buf = buf + new_inode_size;
+ copied_inode_count++;
+ inode_num++;
+
+ if (!ext2fs_test_inode_bitmap(fs->inode_map, inode_num)) {
+ free_inodes_count++;
+ }
+
+ if (copied_inode_count < new_inode_per_group) {
+ /*
+ * we can have only new_inode_per_group
+ * in this group
+ */
+ continue;
+ }
+
+ /* Now write the inode table related to the group */
+ retval = ext2fs_write_itb(fs, group_count,
+ tmp_buf, free_inodes_count);
+ if (retval) {
+ com_err(__FUNCTION__, retval,
+ "Failed to write inode table block\n");
+ return retval;
+ }
+ group_count++;
+ if (group_count >= max_group) {
+ /*
+ * We ignore all the following inodes
+ */
+ break;
+ }
+
+ buf = tmp_buf;
+ copied_inode_count = 0;
+ free_inodes_count = 0;
+ }
+
+ if (retval) {
+ /*
+ *ext2fs_read_next_inode returned error
+ */
+ com_err(__FUNCTION__, retval,
+ "Failed to read inode table block\n");
+ return retval;
+ }
+
+ /* Now update all the meta data fields */
+ fs->super->s_inode_size = new_inode_size;
+ fs->super->s_inodes_per_group = new_inode_per_group;
+ fs->super->s_free_inodes_count -= fs->super->s_inodes_count -
+ new_max_inode;
+ fs->super->s_inodes_count = new_max_inode;
+ /*
+ * Mark the inode bitmap dirty so that
+ * the readjusted inode bitmap get written
+ * automatically
+ */
+ ext2fs_mark_ib_dirty(fs);
+ ext2fs_mark_super_dirty(fs);
+
+ return 0;
+}
+static int setup_tdb(void)
+{
+ char *tdb_dir, tdb_file[PATH_MAX];
+#if 0 /* FIXME!! */
+ /*
+ * Configuration via a conf file would be
+ * nice
+ */
+ profile_get_string(profile, "scratch_files",
+ "directory", 0, 0,
+ &tdb_dir);
+#endif
+ tdb_dir = getenv("TUNE2FS_SCRATCH_DIR");
+ if (!tdb_dir) {
+ com_err(__FUNCTION__, 0,
+ _("TUNE2FS_SCRATCH_DIR not configured\n"));
+ return EXT2_ET_INVALID_ARGUMENT;
+ }
+ if (access(tdb_dir, W_OK)) {
+ fprintf(stderr,
+ _("Cannot create file under %s\n"),
+ tdb_dir);
+ return EXT2_ET_INVALID_ARGUMENT;
+
+ }
+
+ sprintf(tdb_file, "%s/tune2fs-XXXXXX", tdb_dir);
+ printf (_("Using tdb file %s\n"), tdb_file);
+
+ if (!access(tdb_file, F_OK)) {
+ fprintf(stderr,
+ _("File exist %s\n"), tdb_file);
+ return EXT2_ET_INVALID_ARGUMENT;
+ }
+
+ set_undo_io_backup_file(tdb_file);
+ return 0;
+}

int main (int argc, char ** argv)
{
@@ -768,7 +997,19 @@ int main (int argc, char ** argv)
io_ptr = test_io_manager;
test_io_backing_manager = unix_io_manager;
#else
- io_ptr = unix_io_manager;
+ if (new_inode_size) {
+ /*
+ * If inode resize is requested use the
+ * Undo I/O manager
+ */
+ io_ptr = undo_io_manager;
+ set_undo_io_backing_manager(unix_io_manager);
+ retval = setup_tdb();
+ if (retval)
+ exit(1);
+ } else {
+ io_ptr = unix_io_manager;
+ }
#endif
retval = ext2fs_open2(device_name, io_options, open_flag,
0, 0, io_ptr, &fs);
@@ -919,6 +1160,28 @@ int main (int argc, char ** argv)
}
ext2fs_mark_super_dirty(fs);
}
+ if (I_flag) {
+ if (mount_flags & EXT2_MF_MOUNTED) {
+ fputs(_("The Inode size may only be "
+ "changed when the filesystem is "
+ "unmounted.\n"), stderr);
+ exit(1);
+ }
+ /*
+ * We want to update group descriptor also
+ * with the new free inode count
+ */
+ fs->flags &= ~EXT2_FLAG_SUPER_ONLY;
+ if (resize_inode(fs, new_inode_size)) {
+
+ fputs(_("Error in resizing the Inode.\n"
+ "Run undoe2fs to undo the "
+ "file system changes. \n"), stderr);
+ } else {
+ printf (_("Setting Inode size %d\n"),
+ new_inode_size);
+ }
+ }

if (l_flag)
list_super (sb);
--
1.5.3.rc2.22.g69a9b-dirty

2007-07-25 14:32:14

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 3/3] e2fsprogs: Support for large inode migration.

On Wed, Jul 25, 2007 at 11:06:28AM +0530, Aneesh Kumar K.V wrote:
> From: Aneesh Kumar K.V <[email protected]>
>
> Add new option -I <inode_size> to tune2fs.
> This is used to change the inode size. The size
> need to be multiple of 2 and we don't allow to
> decrease the inode size.
>
> As a part of increasing the inode size we throw
> away the free inodes in the last block group. If
> we can't we fail. In such case one can resize the
> file system and then try to increase the inode size.

Let me guess, you're testing with a filesystem with two block groups,
right? And to date you've tested *only* by doubling the size of the
inode.

What your patch does is is keep the number of inode blocks per block
group constant, so that the total number of inodes decreases by
whatever factor the inode size is increasing. It's a cheap, dirty way
of doing the resizing, since it avoids needing to either (a) update
directory entries when inode numbers get renumbered, and (b) need to
update inodes when blocks need to get relocated in order to make room
for growing the inode table.

The problem with your patch is:

* By shrinking the number of inodes, it can constrain the
ability of the filesystem to create new files in the future.

* It ruins the inode and block placement algorithms where we
try to keep inodes in the same block group as their parent
directory, and we try to allocate blocks in the same block
group as their containing inode.

* Because when the current patch makes no attempt to relocate
inodes, and when it doubles the inode size, it chops the
number of inodes in half, there must be no inodes in the
last half of the inode table. That is if there are N block
groups, the inode tables in blockgroups N/2 to N-1 must be
empty. But because of the block group spreading algorithm,
where new directories get pushed out to new block groups, in
any real real-life filesystem, the use of block groups is
evenly spread out, which means in practice you won't see
case where the last half of the inodes will not be in use.
Hence, your patch won't actually work in practice.

So unfortunately, the right answer *will* require expanding the inode
tables, and potentially moving blocks out of the way in order to make
room for it. A lot of that machinery is in resize2fs, actually, and
I'm wondering if the right answer is to move resize2fs's functionality
into tune2fs. We will also need this to be able to add the resize
inode after the fact.

That's not going to be a trivial set of changes; if you're looking for
something to test the undo manager, my suggestion would be to wire it
up into mke2fs and/or e2fsck first. Mke2fs might be nice since it
will give us a recovery path in case someone screws up the arguments
to mkfs.

> tune2fs use undo I/O manager when migrating to large
> inode. This helps in reverting the changes if end results
> are not correct.The environment variable TUNE2FS_SCRATCH_DIR
> is used to indicate the directory within which the tdb
> file need to be created. The file will be named tune2fs-XXXXXX

My suggestion would be to use something like /var/lib/e2fsprogs as the
defalut directory. And we should also do some tests to make sure
something sane happens if we run out of room for the undo file.
Presumably the only thing we can do is to abort the run and then back
out the chnages using what was written out to the undo file.

- Ted

2007-07-25 19:46:31

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH 3/3] e2fsprogs: Support for large inode migration.

On Jul 25, 2007 10:32 -0400, Theodore Tso wrote:
> The problem with your patch is:
>
> * By shrinking the number of inodes, it can constrain the
> ability of the filesystem to create new files in the future.
>
> * It ruins the inode and block placement algorithms where we
> try to keep inodes in the same block group as their parent
> directory, and we try to allocate blocks in the same block
> group as their containing inode.
>
> * Because when the current patch makes no attempt to relocate
> inodes, and when it doubles the inode size, it chops the
> number of inodes in half, there must be no inodes in the
> last half of the inode table. That is if there are N block
> groups, the inode tables in blockgroups N/2 to N-1 must be
> empty. But because of the block group spreading algorithm,
> where new directories get pushed out to new block groups, in
> any real real-life filesystem, the use of block groups is
> evenly spread out, which means in practice you won't see
> case where the last half of the inodes will not be in use.
> Hence, your patch won't actually work in practice.

I was just going to write the same things. However, it should be noted
that we DO in fact use only the start of each inode table block in
normal cases. We did a survey of this across some hundreds of filesystems
and because all kernels scan each block group's itable from the start of
the bitmap we only find the beginning of each itable used.

That said, if we were going to follow this approach (which isn't so bad,
IMHO, because most filesystems are far over-provisioned in terms of
inodes) then we shouldn't _require_ that the last inodes are unused, but
rather that < 1/2 of the inodes in each group are unused. Also, we
should still keep the inodes in the same block group, and do renumbering
of the inodes in the directories. At this point, we have a tool that
could also allow changing the total number of inodes in the filesystem,
which is something we've wanted in the past.

> So unfortunately, the right answer *will* require expanding the inode
> tables, and potentially moving blocks out of the way in order to make
> room for it. A lot of that machinery is in resize2fs, actually, and
> I'm wondering if the right answer is to move resize2fs's functionality
> into tune2fs. We will also need this to be able to add the resize
> inode after the fact.

Well, since this isn't exactly a common occurrance, I don't think we
need to push everything into tune2fs. Having a separate resize2fs
seems reasonable (we are resizing the inodes after all), and keeping
so much complexity out of tune2fs helps ensure that we don't introduce
bugs into tune2fs itself (which is a far more used and critical tool IMHO).

> > tune2fs use undo I/O manager when migrating to large
> > inode. This helps in reverting the changes if end results
> > are not correct.The environment variable TUNE2FS_SCRATCH_DIR
> > is used to indicate the directory within which the tdb
> > file need to be created. The file will be named tune2fs-XXXXXX
>
> My suggestion would be to use something like /var/lib/e2fsprogs as the
> defalut directory. And we should also do some tests to make sure
> something sane happens if we run out of room for the undo file.
> Presumably the only thing we can do is to abort the run and then back
> out the chnages using what was written out to the undo file.

I was going to say /var/tmp, since we don't want to start having to
manage old versions of these files, add entries to logrotate, etc.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2007-07-25 19:49:11

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH 3/3] e2fsprogs: Support for large inode migration.

On Jul 25, 2007 11:06 +0530, Aneesh Kumar K.V wrote:
> This helps in reverting the changes if end results
> are not correct.The environment variable TUNE2FS_SCRATCH_DIR
> is used to indicate the directory within which the tdb
> file need to be created. The file will be named tune2fs-XXXXXX

> @@ -702,6 +705,23 @@ static void parse_tune2fs_options(int argc, char **argv)
> break;
> + case 'I':
> + new_inode_size = strtoul (optarg, &tmp, 0);
> + if (!((new_inode_size & (new_inode_size - 1)) == 0)) {
> + com_err (program_name, 0,
> + _("bad Inode size - %s"),
> + optarg);
> + usage();
> + }

Please print a more useful message like "inode size must be a power of two",
as "bad inode size - 200" isn't very useful to the user to figure out what
a good inode size is.

> +static void *ext2fs_read_next_inode(ext2_filsys fs,
> + void *inode, void *inode_table,
> + int *retval)
> +{
> + blk = fs->group_desc[group].bg_inode_table;
> + *retval = io_channel_read_blk(fs->io, blk,
> + fs->inode_blocks_per_group, inode_table);

This isn't doing swabbing of the inodes?

> +}
> +
> +static int ext2fs_write_itb(ext2_filsys fs, int group,
> + void *inode_table_block, int free_inodes_count)
> +{
> + int retval;
> + int blk = fs->group_desc[group].bg_inode_table;
> + retval = io_channel_write_blk(fs->io, blk,
> + fs->inode_blocks_per_group, inode_table_block);

This isn't doing swabbing either?

> +static int resize_inode(ext2_filsys fs, unsigned long int new_inode_size)
> +{
> + ext2_ino_t inode_num;
> + int itable_size, retval;
> + int new_inode_per_group, new_max_inode;
> + int copied_inode_count = 0;
> + void *inode, *buf, *tmp_buf, *inode_table;
> + int free_inodes_count = 0, group_count = 0;
> + int old_size = EXT2_INODE_SIZE(fs->super);
> + int max_group = fs->super->s_inodes_count/fs->super->s_inodes_per_group;
> +
> + if (new_inode_size <= fs->super->s_inode_size ) {
> + fprintf(stderr, _("New Inode size too small\n"));
> + return EXT2_ET_INVALID_ARGUMENT;
> + }

Shouldn't this be verified when the inode size is parsed instead of down here?

> + if (I_flag) {
> + if (mount_flags & EXT2_MF_MOUNTED) {
> + fputs(_("The Inode size may only be "
> + "changed when the filesystem is "
> + "unmounted.\n"), stderr);

Why do you capitalize "Inode" in all of the messages?

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2007-07-26 11:46:05

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH 3/3] e2fsprogs: Support for large inode migration.



Theodore Tso wrote:
> On Wed, Jul 25, 2007 at 11:06:28AM +0530, Aneesh Kumar K.V wrote:
>> From: Aneesh Kumar K.V <[email protected]>
>>
>> Add new option -I <inode_size> to tune2fs.
>> This is used to change the inode size. The size
>> need to be multiple of 2 and we don't allow to
>> decrease the inode size.
>>
>> As a part of increasing the inode size we throw
>> away the free inodes in the last block group. If
>> we can't we fail. In such case one can resize the
>> file system and then try to increase the inode size.
>
> Let me guess, you're testing with a filesystem with two block groups,
> right? And to date you've tested *only* by doubling the size of the
> inode.
>


I tested this with multiple ( 1 and 7 ) groups. But yes all the testing was to change
inode size from 128 to 256.


> What your patch does is is keep the number of inode blocks per block
> group constant, so that the total number of inodes decreases by
> whatever factor the inode size is increasing. It's a cheap, dirty way
> of doing the resizing, since it avoids needing to either (a) update
> directory entries when inode numbers get renumbered, and (b) need to
> update inodes when blocks need to get relocated in order to make room
> for growing the inode table.
>


That is correct. What i was looking at was to get the dynamic inode location
first. That should help us to place large inode any where right ?. But i know
that is a long term goal since there is no patches for dynamic inode location.

I will work at increasing the inode table size as a part of increasing the inode
size.



> The problem with your patch is:
>
> * By shrinking the number of inodes, it can constrain the
> ability of the filesystem to create new files in the future.
>

I explained this in the commit log.


> * It ruins the inode and block placement algorithms where we
> try to keep inodes in the same block group as their parent
> directory, and we try to allocate blocks in the same block
> group as their containing inode.


I missed this in my analysis. So this means we may end up with bad performance
after resizing the inode. I will look at increasing the inode table size as a
part of increasing the inode size.


>
> * Because when the current patch makes no attempt to relocate
> inodes, and when it doubles the inode size, it chops the
> number of inodes in half, there must be no inodes in the
> last half of the inode table. That is if there are N block
> groups, the inode tables in blockgroups N/2 to N-1 must be
> empty. But because of the block group spreading algorithm,
> where new directories get pushed out to new block groups, in
> any real real-life filesystem, the use of block groups is
> evenly spread out, which means in practice you won't see
> case where the last half of the inodes will not be in use.
> Hence, your patch won't actually work in practice.
>
> So unfortunately, the right answer *will* require expanding the inode
> tables, and potentially moving blocks out of the way in order to make
> room for it. A lot of that machinery is in resize2fs, actually, and
> I'm wondering if the right answer is to move resize2fs's functionality
> into tune2fs. We will also need this to be able to add the resize
> inode after the fact.
>
> That's not going to be a trivial set of changes; if you're looking for
> something to test the undo manager, my suggestion would be to wire it
> up into mke2fs and/or e2fsck first. Mke2fs might be nice since it
> will give us a recovery path in case someone screws up the arguments
> to mkfs.
>

I guess Undo I/O manager can go in because I have been using it for
the ext3 -> ext4 inode migration testing and for testing the above patch.


Why would one need to recover on mkfs. He can very well run mkfs again right ?



>> tune2fs use undo I/O manager when migrating to large
>> inode. This helps in reverting the changes if end results
>> are not correct.The environment variable TUNE2FS_SCRATCH_DIR
>> is used to indicate the directory within which the tdb
>> file need to be created. The file will be named tune2fs-XXXXXX
>
> My suggestion would be to use something like /var/lib/e2fsprogs as the
> defalut directory. And we should also do some tests to make sure
> something sane happens if we run out of room for the undo file.
> Presumably the only thing we can do is to abort the run and then back
> out the chnages using what was written out to the undo file.
>

I had a FIXME!! in the code which stated it would be nice to use the conf file
But right now the conffile is e2fsck specific

+ char *tdb_dir, tdb_file[PATH_MAX];
+#if 0 /* FIXME!! */
+ /*
+ * Configuration via a conf file would be
+ * nice
+ */
+ profile_get_string(profile, "scratch_files",
+ "directory", 0, 0,
+ &tdb_dir);
+#endif
+ tdb_dir = getenv("TUNE2FS_SCRATCH_DIR");


-aneesh

2007-07-26 14:59:14

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 3/3] e2fsprogs: Support for large inode migration.

On Wed, Jul 25, 2007 at 01:46:25PM -0600, Andreas Dilger wrote:
> I was just going to write the same things. However, it should be noted
> that we DO in fact use only the start of each inode table block in
> normal cases. We did a survey of this across some hundreds of filesystems
> and because all kernels scan each block group's itable from the start of
> the bitmap we only find the beginning of each itable used.

Yep, granted.

> That said, if we were going to follow this approach (which isn't so bad,
> IMHO, because most filesystems are far over-provisioned in terms of
> inodes) then we shouldn't _require_ that the last inodes are unused, but
> rather that < 1/2 of the inodes in each group are unused. Also, we
> should still keep the inodes in the same block group, and do renumbering
> of the inodes in the directories. At this point, we have a tool that
> could also allow changing the total number of inodes in the filesystem,
> which is something we've wanted in the past.

The problem is without the block relocation code, all we will be able
to do is shrink the total number of inodes in the filesystem --- and
most of the time, there's not a lot of value in shrinking the size of
the inode table. Sure, you get a tiny amount of space back, and maybe
e2fsck times speed up, but that's about it. Most of the time people
who want to change the total number of inodes want to increase it.

> Well, since this isn't exactly a common occurrance, I don't think we
> need to push everything into tune2fs. Having a separate resize2fs
> seems reasonable (we are resizing the inodes after all), and keeping
> so much complexity out of tune2fs helps ensure that we don't introduce
> bugs into tune2fs itself (which is a far more used and critical tool IMHO).

Well, if you look at resize2fs, the major complexity can roughly be
broken down as follows:

* 30% -- Inode mover (and iterating over directory entries)
* 30% -- Block mover (and iterating over inodes to fix up entries)
* 5% -- Moving the inode table
* 10% -- Updating the superblock/block group descriptors
* 25% -- Making sure nothing bad happens if resize2fs crashes in the
middle (except when moving the inode table; then we cross
our fingers and pray)

With the undo I/O manager, the last goes away, and if you need to deal
with iterating over the directory entries, what's left is mostly the
block mover, which really isn't that hard, since the inode mover and
the block mover shares a fair amount of the infrastructure to keep
track what had moved where.

> > My suggestion would be to use something like /var/lib/e2fsprogs as the
> > defalut directory. And we should also do some tests to make sure
> > something sane happens if we run out of room for the undo file.
> > Presumably the only thing we can do is to abort the run and then back
> > out the chnages using what was written out to the undo file.
>
> I was going to say /var/tmp, since we don't want to start having to
> manage old versions of these files, add entries to logrotate, etc.

Well, I wanted them in a separate directory so that we could
automatically find the undo files and deal with them automatically.
For example, e2fsck would be able to deal with recovering from an
interrupted resize2fs or tune2fs operation if the system crashed.

In some cases you wouldn't want to automatically reuse them after a
completed operation (i.e., an e2fsck "undo" file would rarely get
used), so we would need to tag the undo file with what program
generated them, and some kind of temporal identifer (i.e., the
superblock last write/mount time).

I also don't think logrotate entries are that bad of an idea....

- Ted

2007-07-26 16:13:29

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 3/3] e2fsprogs: Support for large inode migration.

On Thu, Jul 26, 2007 at 05:15:30PM +0530, Aneesh Kumar K.V wrote:
> >Let me guess, you're testing with a filesystem with two block groups,
> >right? And to date you've tested *only* by doubling the size of the
> >inode.
>
> I tested this with multiple ( 1 and 7 ) groups. But yes all the
> testing was to change inode size from 128 to 256.

Your patch comments stated "As a part of increasing the inode size we
throw away the free inodes in the last block group." This is *only*
true if the filesystem has two (and exactly two) block groups. Hence
my guess that only tested on filesystems with two block groups.

If you tested with larger numbers of block groups, the filesystem must
have been mostly empty, or at least not a large number of directories,
or else you would have noticed that most of the time that your
algorithm wouldn't work.

> I guess Undo I/O manager can go in because I have been using it for
> the ext3 -> ext4 inode migration testing and for testing the above patch.

Well, I really want a valid use case for undo code, and right now the
above patch is IMHO not suitable for merging into mainline, given all
of the problems that it has.

> Why would one need to recover on mkfs. He can very well run mkfs again
> right ?

The idea is if there is a valid ext2/3 filesystem, and someone runs
mkfs on that filesystem, that it creates an undo file so that if
someone fumble-figured the arguments to mkfs, it becomes possible to
undo it.

In the ideal case the undo file would be copied into blocks that were
not in use by the original file, and placed into the new filesystem,
so that, so that it would be possible to undo the mkfs without needing
a separate file. But I think it would be acceptable to say, "previous
ext3 filesystem detected; to undo the mke2fs operation, please run the
command 'undoe2fs /var/tmp/XXXX /dev/sdYY' in order to recover".


> >Presumably the only thing we can do is to abort the run and then back
> >out the chnages using what was written out to the undo file.
> >
>
> I had a FIXME!! in the code which stated it would be nice to use the
> conf file But right now the conffile is e2fsck specific

Actually, e2fsck and mke2fs have their own config files. An open
issue is whether we should have a separate tune2fs config file, or a
more general e2fsprogs.conf file. Or we could have tune2fs use the
mke2fs.conf file, which is a bit of UI surprise; OTOH, tune2fs and
mke2fs do share code already, so it does make sense from a progamming
point of view. I do consider it a UI botch, though.

Another possibility is to merge e2fsck.conf and mke2fs.conf into a
single config file, with symlinks for backwards compatibility. My
current thinking is to just to add a new e2fsprogs.conf file, and let
tune2fs use it. But I'm certainly willing to hear arguments for other
ways of handling this.

Or maybe we could just simply add a default of /var/lib/e2fsprogs, and
just leave it be for now.

- Ted

2007-07-27 02:59:39

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH 3/3] e2fsprogs: Support for large inode migration.



Theodore Tso wrote:
> On Thu, Jul 26, 2007 at 05:15:30PM +0530, Aneesh Kumar K.V wrote:
>>> Let me guess, you're testing with a filesystem with two block groups,
>>> right? And to date you've tested *only* by doubling the size of the
>>> inode.
>> I tested this with multiple ( 1 and 7 ) groups. But yes all the
>> testing was to change inode size from 128 to 256.
>
> Your patch comments stated "As a part of increasing the inode size we
> throw away the free inodes in the last block group." This is *only*
> true if the filesystem has two (and exactly two) block groups. Hence
> my guess that only tested on filesystems with two block groups.
>

Okey i didn't stress the "last" part in the above sentence. What i wanted
to convey was inodes in the block groups towards the end (well more than one).



> If you tested with larger numbers of block groups, the filesystem must
> have been mostly empty, or at least not a large number of directories,
> or else you would have noticed that most of the time that your
> algorithm wouldn't work.
>
>> I guess Undo I/O manager can go in because I have been using it for
>> the ext3 -> ext4 inode migration testing and for testing the above patch.
>
> Well, I really want a valid use case for undo code, and right now the
> above patch is IMHO not suitable for merging into mainline, given all
> of the problems that it has.
>

What are the issues you see with PATCH 1 and PATCH 2 which implement
Undo I/O Manager and undoe2fs other than it is not hooked into any of
the existing tools. I will try to add it to mke2fs as you suggested. But
should that prevent it from going in ?


-aneesh

2007-07-27 15:34:31

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 3/3] e2fsprogs: Support for large inode migration.

On Fri, Jul 27, 2007 at 08:29:31AM +0530, Aneesh Kumar K.V wrote:
> What are the issues you see with PATCH 1 and PATCH 2 which implement
> Undo I/O Manager and undoe2fs other than it is not hooked into any of
> the existing tools. I will try to add it to mke2fs as you suggested. But
> should that prevent it from going in ?

The main issue is that it's not used by an in-tree caller. If you can
add it to mke2fs, that would be great; otherwise, it's something
that's on my todo list so we can merge the undo manager. The other
possibility is that I might use this as justification for creating a
"pu" (proposed update) branch which has the property that patches on
the pu branch can get removed or modified at any time, and the "pu"
branch can get rewound or rebased at any time.

See the section of text starting at line #68 (and going on to line
#160) in the following git's maintainer notes for a much more
comprehensive description of "maint", "master", "next", and "pu",
which is the direction in which I plan to take the e2fsprogs git tree:

http://git.kernel.org/?p=git/git.git;a=blob;f=MaintNotes;h=8bf0352adf2b4ac775d6100ad937600ecb5be5f2;hb=962753f75390f5c5ea23a2d1e6996f6027003478

But yes, the main reason why I haven't merged the undo manager is
because we don't have an in-tree user of that interface. We don't
even have test cases in the tree, either (also on my todo list, but
feel free to beat me to it).

- Ted

2007-07-27 19:04:06

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH 3/3] e2fsprogs: Support for large inode migration.



Theodore Tso wrote:
> On Fri, Jul 27, 2007 at 08:29:31AM +0530, Aneesh Kumar K.V wrote:
>> What are the issues you see with PATCH 1 and PATCH 2 which implement
>> Undo I/O Manager and undoe2fs other than it is not hooked into any of
>> the existing tools. I will try to add it to mke2fs as you suggested. But
>> should that prevent it from going in ?
>
> The main issue is that it's not used by an in-tree caller. If you can
> add it to mke2fs, that would be great;


diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 0c6d4f3..fab8830 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -1521,6 +1521,92 @@ static void PRS(int argc, char *argv[])
fs_param.s_blocks_count);
}

+static int fileystem_exist(const char *name)
+{
+ int retval;
+ io_channel channel;
+ __u16 s_magic;
+ struct ext2_super_block super;
+ io_manager manager = unix_io_manager;
+
+ retval = manager->open(name, IO_FLAG_EXCLUSIVE, &channel);
+ if (retval) {
+ retval = 0;
+ goto open_err_out;
+ }
+
+ io_channel_set_blksize(channel, SUPERBLOCK_OFFSET);
+ retval = io_channel_read_blk(channel, 1, -SUPERBLOCK_SIZE, &super);
+ if (retval) {
+ retval = 0;
+ goto err_out;
+ }
+
+#if defined(WORDS_BIGENDIAN)
+ s_magic = ext2fs_swab16(super.s_magic);
+#else
+ s_magic = super.s_magic;
+#endif
+
+ if (s_magic == EXT2_SUPER_MAGIC)
+ retval = 1;
+
+err_out:
+ io_channel_close(channel);
+
+open_err_out:
+
+ /*
+ * We don't handle error cases instead we
+ * declare that the file system doesn't exist
+ * and let the rest of mke2fs take care of
+ * error
+ */
+ return retval;
+}
+
+static int mke2fs_setup_tdb(const char *name)
+{
+ char *tdb_dir, tdb_file[PATH_MAX];
+#if 0 /* FIXME!! */
+ /*
+ * Configuration via a conf file would be
+ * nice
+ */
+ profile_get_string(profile, "scratch_files",
+ "directory", 0, 0,
+ &tdb_dir);
+#endif
+ tdb_dir = getenv("MKE2FS_SCRATCH_DIR");
+ if (!tdb_dir) {
+ printf(_("MKE2FS_SCRATCH_DIR not configured\n"));
+ printf(_("Using /var/lib/e2fsprogs\n"));
+ tdb_dir="/var/lib/e2fsprogs";
+ }
+ if (access(tdb_dir, W_OK)) {
+ fprintf(stderr,
+ _("Cannot create file under %s\n"),
+ tdb_dir);
+ return EXT2_ET_INVALID_ARGUMENT;
+
+ }
+
+ /* FIXME!! Should we generate Unique file name ?? */
+ sprintf(tdb_file, "%s/mke2fs-XXXXXX", tdb_dir);
+
+ if (!access(tdb_file, F_OK)) {
+ fprintf(stderr,
+ _("File exist %s\n"), tdb_file);
+ return EXT2_ET_INVALID_ARGUMENT;
+ }
+
+ set_undo_io_backup_file(tdb_file);
+ printf(_("previous filesystem detected; to undo "
+ "the mke2fs operation, please run the "
+ "command \n'undoe2fs %s %s' in order to recover\n\n"),
+ tdb_file, name);
+ return 0;
+}
int main (int argc, char *argv[])
{
errcode_t retval = 0;
@@ -1543,7 +1629,17 @@ int main (int argc, char *argv[])
io_ptr = test_io_manager;
test_io_backing_manager = unix_io_manager;
#else
- io_ptr = unix_io_manager;
+ if (fileystem_exist(device_name)) {
+
+ io_ptr = undo_io_manager;
+ set_undo_io_backing_manager(unix_io_manager);
+ retval = mke2fs_setup_tdb(device_name);
+ if (retval)
+ exit(1);
+
+ } else {
+ io_ptr = unix_io_manager;
+ }
#endif

/*