LinuxLists.cc - [RFC PATCH 1/1] e2fsprogs: Add undo I/O manager.

2007-05-22 10:17:58

Subject: [RFC PATCH 1/1] e2fsprogs: Add undo I/O manager.

This I/O manager saves the contents of the location being overwritten
to a tdb database. This helps in undoing the changes done to the
file system.

The call sequence involve

set_undo_io_backing_manager(unix_io_manager);
set_undo_io_backup_file("/tmp/test.tdb");
retval = ext2fs_open2(dev_name, 0, flags,
superblock, block_size, undo_io_manager,
&current_fs);

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
lib/ext2fs/Makefile.in | 10 +-
lib/ext2fs/ext2_io.h | 5 +
lib/ext2fs/undo_io.c | 533 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 546 insertions(+), 2 deletions(-)
create mode 100644 lib/ext2fs/undo_io.c

diff --git a/lib/ext2fs/Makefile.in b/lib/ext2fs/Makefile.in
index 799659e..6b49a13 100644
--- a/lib/ext2fs/Makefile.in
+++ b/lib/ext2fs/Makefile.in
@@ -66,7 +66,8 @@ OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_OBJS) $(E2IMAGE_LIB_OBJS) \
unix_io.o \
unlink.o \
valid_blk.o \
- version.o
+ version.o \
+ undo_io.o

SRCS= ext2_err.c \
$(srcdir)/alloc.c \
@@ -134,7 +135,8 @@ SRCS= ext2_err.c \
$(srcdir)/tst_bitops.c \
$(srcdir)/tst_byteswap.c \
$(srcdir)/tst_getsize.c \
- $(srcdir)/tst_iscan.c
+ $(srcdir)/tst_iscan.c \
+ $(srcdir)/undo_io.c

HFILES= bitops.h ext2fs.h ext2_io.h ext2_fs.h ext2_ext_attr.h ext3_extents.h
HFILES_IN= ext2_err.h ext2_types.h
@@ -550,3 +552,7 @@ tst_iscan.o: $(srcdir)/tst_iscan.c $(srcdir)/ext2_fs.h \
$(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2fs.h \
$(srcdir)/ext2_fs.h $(top_srcdir)/lib/et/com_err.h $(srcdir)/ext2_io.h \
$(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/bitops.h
+undo_io.o: $(srcdir)/undo_io.c $(srcdir)/ext2_fs.h \
+ $(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2fs.h \
+ $(srcdir)/ext2_fs.h $(top_srcdir)/lib/et/com_err.h $(srcdir)/ext2_io.h \
+ $(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/bitops.h
diff --git a/lib/ext2fs/ext2_io.h b/lib/ext2fs/ext2_io.h
index eada278..476eb4d 100644
--- a/lib/ext2fs/ext2_io.h
+++ b/lib/ext2fs/ext2_io.h
@@ -96,6 +96,11 @@ extern errcode_t io_channel_write_byte(io_channel channel,
/* unix_io.c */
extern io_manager unix_io_manager;

+/* undo_io.c */
+extern io_manager undo_io_manager;
+extern errcode_t set_undo_io_backing_manager(io_manager manager);
+extern errcode_t set_undo_io_backup_file(char *file_name);
+
/* test_io.c */
extern io_manager test_io_manager, test_io_backing_manager;
extern void (*test_io_cb_read_blk)
diff --git a/lib/ext2fs/undo_io.c b/lib/ext2fs/undo_io.c
new file mode 100644
index 0000000..2c8f6d8
--- /dev/null
+++ b/lib/ext2fs/undo_io.c
@@ -0,0 +1,533 @@
+/*
+ * undo_io.c --- This is the undo io manager that copies the old data that
+ * copies the old data being overwritten into a tdb database
+ *
+ * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+ * 2002 by Theodore Ts'o.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ * %End-Header%
+ */
+
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+
+#include <stdio.h>
+#include <string.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#if HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#include <fcntl.h>
+#include <time.h>
+#ifdef __linux__
+#include <sys/utsname.h>
+#endif
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#if HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+
+#include <tdb.h>
+
+#include "ext2_fs.h"
+#include "ext2fs.h"
+
+/*
+ * For checking structure magic numbers...
+ */
+
+#define EXT2_CHECK_MAGIC(struct, code) \
+ if ((struct)->magic != (code)) return (code)
+
+
+struct undo_private_data {
+ int magic;
+ TDB_CONTEXT *tdb;
+ char *tdb_file;
+
+ /* The backing io channel */
+ io_channel real;
+
+ /* device from from which data should be read */
+ int dev;
+
+ /* to support offset in unix I/O manager */
+ ext2_loff_t offset;
+};
+
+static errcode_t undo_open(const char *name, int flags, io_channel *channel);
+static errcode_t undo_close(io_channel channel);
+static errcode_t undo_set_blksize(io_channel channel, int blksize);
+static errcode_t undo_read_blk(io_channel channel, unsigned long block,
+ int count, void *data);
+static errcode_t undo_write_blk(io_channel channel, unsigned long block,
+ int count, const void *data);
+static errcode_t undo_flush(io_channel channel);
+static errcode_t undo_write_byte(io_channel channel, unsigned long offset,
+ int size, const void *data);
+static errcode_t undo_set_option(io_channel channel, const char *option,
+ const char *arg);
+
+static struct struct_io_manager struct_undo_manager = {
+ EXT2_ET_MAGIC_IO_MANAGER,
+ "Undo I/O Manager",
+ undo_open,
+ undo_close,
+ undo_set_blksize,
+ undo_read_blk,
+ undo_write_blk,
+ undo_flush,
+ undo_write_byte,
+ undo_set_option
+};
+
+io_manager undo_io_manager = &struct_undo_manager;
+static io_manager undo_io_backing_manager ;
+static char *tdb_file ;
+
+errcode_t set_undo_io_backing_manager(io_manager manager)
+{
+ /*
+ * We may want to do some validation later
+ */
+ undo_io_backing_manager = manager;
+ return 0;
+}
+
+errcode_t set_undo_io_backup_file(char *file_name)
+{
+ tdb_file = strdup(file_name);
+
+ if (tdb_file == NULL) {
+ return EXT2_ET_NO_MEMORY;
+ }
+
+ return 0;
+}
+
+static errcode_t is_trans_overlapping(TDB_CONTEXT *tdb,
+ ext2_loff_t *location, int *size)
+{
+
+ ext2_loff_t req_loc, cur_loc;
+ int req_size, cur_size;
+ TDB_DATA key, data;
+
+restart:
+ req_loc = *location;
+ req_size = *size;
+
+ /* loop through the existing entries and find if they overlap */
+ for (key = tdb_firstkey(tdb); key.dptr; key = tdb_nextkey(tdb, key)) {
+ data = tdb_fetch(tdb, key);
+ /*
+ * we are not interested in the data
+ */
+ free(data.dptr);
+
+ cur_loc = *(ext2_loff_t *)key.dptr;
+ cur_size = data.dsize;
+
+ if (req_loc < cur_loc) {
+ if (req_loc+req_size > cur_loc) {
+ /* ------------------
+ * | | | |
+ * ------------------
+ * rl cl rs cs
+ */
+ *size = cur_loc - req_loc;
+ goto restart;
+ }
+ continue;
+ }
+
+ if(req_loc >= cur_loc + cur_size)
+ continue;
+
+ if (req_loc + req_size <= cur_loc + cur_size) {
+
+ /* ------------------
+ * | | | |
+ * ------------------
+ * cl rl rs cs
+ */
+ /*
+ * The old entry completely overlap
+ * with the new entry. So no need
+ * to add new entries
+ */
+ return 1;
+ }
+
+ /* ------------------
+ * | | | |
+ * ------------------
+ * cl rl cs rs
+ */
+ /*
+ * We have partial overlapping entry
+ * update location and new size
+ */
+
+ *location = cur_loc + cur_size;
+ *size = req_loc + req_size - (*location);
+ /*
+ * Now we need to check whether the new value of
+ * location and size are overlapping
+ */
+ goto restart;
+ }
+
+ /* Not a overlapping entry */
+ return 0;
+
+}
+
+static errcode_t undo_write_tdb(struct undo_private_data * data,
+ ext2_loff_t location, int size)
+
+{
+ TDB_DATA tdb_key, tdb_data;
+ errcode_t retval = 0;
+ int actual = 0;
+ ext2_loff_t cur_loc ;
+
+ if (data->tdb == NULL) {
+ /*
+ * Transaction database not initialized
+ */
+ return 0;
+ }
+
+ if (is_trans_overlapping(data->tdb, &location, &size)) {
+ /*
+ * Already we have the original contents
+ * in the database. So skip this write
+ */
+ return 0;
+ }
+
+ tdb_key.dptr = (unsigned char *)&location;
+ tdb_key.dsize = sizeof(location);
+
+ retval = ext2fs_get_mem(size, &(tdb_data.dptr));
+ if (retval) {
+ return retval;
+ }
+
+ if ((cur_loc = ext2fs_llseek(data->dev, 0, SEEK_CUR)) == -1) {
+ retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+ goto error_out_lseek;
+ }
+
+ if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+ retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+ goto error_out_lseek;
+ }
+
+ actual = read(data->dev, tdb_data.dptr, size);
+ if (actual != size) {
+ /*
+ * eventhough it is read we still are doing a write
+ * so return error that is related to write
+ */
+ retval = EXT2_ET_SHORT_WRITE;
+ goto error_out;
+ }
+
+ tdb_data.dsize = size;
+
+#ifdef DEBUG
+ printf("Printing with key %lld data %x and size %d\n", location, tdb_data.dptr, size);
+#endif
+ // I have libtdb1. Enable after rebasing to latest e2fsprogs that has this API
+ //tdb_transaction_start(data->tdb);
+
+ retval = tdb_store(data->tdb, tdb_key, tdb_data, TDB_INSERT);
+ if (retval == -1) {
+ /*
+ * TDB_ERR_EXISTS cannot happen because we have already
+ * verified it doesn't exist
+ */
+
+ /* FIXME!! Not sure what should be the error */
+ //tdb_transaction_cancel(data->tdb);
+ retval = EXT2_ET_SHORT_WRITE;
+ goto error_out;
+
+ }
+ //tdb_transaction_commit(data->tdb);
+
+error_out:
+ /* Move the location back */
+ if (ext2fs_llseek(data->dev, cur_loc, SEEK_SET) != cur_loc) {
+ retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+ goto error_out;
+ }
+
+error_out_lseek:
+ free(tdb_data.dptr);
+ return retval;
+
+}
+
+static TDB_CONTEXT *undo_setup_tdb(char *tdb_file,
+ struct undo_private_data *data)
+{
+ errcode_t retval;
+
+ data->tdb = tdb_open(tdb_file, 0, TDB_CLEAR_IF_FIRST,
+ O_RDWR | O_CREAT | O_TRUNC, 0600);
+ return data->tdb;
+
+}
+static errcode_t undo_open(const char *name, int flags, io_channel *channel)
+{
+ io_channel io = NULL;
+ struct undo_private_data *data = NULL;
+ errcode_t retval;
+ int open_flags;
+ struct stat st;
+
+ if (name == 0)
+ return EXT2_ET_BAD_DEVICE_NAME;
+ retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
+ if (retval)
+ return retval;
+ memset(io, 0, sizeof(struct struct_io_channel));
+ io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
+ retval = ext2fs_get_mem(sizeof(struct undo_private_data), &data);
+ if (retval)
+ goto cleanup;
+
+ io->manager = undo_io_manager;
+ retval = ext2fs_get_mem(strlen(name)+1, &io->name);
+ if (retval)
+ goto cleanup;
+
+ strcpy(io->name, name);
+ io->private_data = data;
+ io->block_size = 1024;
+ io->read_error = 0;
+ io->write_error = 0;
+ io->refcount = 1;
+
+ memset(data, 0, sizeof(struct undo_private_data));
+ data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
+
+ if (undo_io_backing_manager) {
+ retval = undo_io_backing_manager->open(name, flags,
+ &data->real);
+ if (retval)
+ goto cleanup;
+ } else {
+ data->real = 0;
+ }
+
+ /* setup the tdb file */
+ if (undo_setup_tdb(tdb_file, data) == NULL )
+ goto cleanup;
+
+#ifdef HAVE_OPEN64
+ data->dev = open64(io->name, O_RDONLY);
+#else
+ data->dev = open(io->name, O_RDONLY);
+#endif
+ if (data->dev < 0) {
+ retval = errno;
+ goto cleanup;
+ }
+
+ *channel = io;
+ return 0;
+
+cleanup:
+ if (data->real)
+ io_channel_close(data->real);
+
+ if (data)
+ ext2fs_free_mem(&data);
+
+ if (io)
+ ext2fs_free_mem(&io);
+
+ return retval;
+}
+
+static errcode_t undo_close(io_channel channel)
+{
+ struct undo_private_data *data;
+ errcode_t retval = 0;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ if (--channel->refcount > 0)
+ return 0;
+
+ if (data->real)
+ retval = io_channel_close(data->real);
+
+ if (data->tdb)
+ tdb_close(data->tdb);
+
+ close(data->dev);
+
+ ext2fs_free_mem(&channel->private_data);
+ if (channel->name)
+ ext2fs_free_mem(&channel->name);
+ ext2fs_free_mem(&channel);
+
+ return retval;
+}
+
+static errcode_t undo_set_blksize(io_channel channel, int blksize)
+{
+ struct undo_private_data *data;
+ errcode_t retval;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ if (data->real)
+ retval = io_channel_set_blksize(data->real, blksize);
+
+ channel->block_size = blksize;
+ return 0;
+}
+
+
+static errcode_t undo_read_blk(io_channel channel, unsigned long block,
+ int count, void *buf)
+{
+ errcode_t retval;
+ struct undo_private_data *data;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ if (data->real)
+ retval = io_channel_read_blk(data->real, block, count, buf);
+
+ return retval;
+}
+
+static errcode_t undo_write_blk(io_channel channel, unsigned long block,
+ int count, const void *buf)
+{
+ struct undo_private_data *data;
+ errcode_t retval = 0;
+ ssize_t size;
+ ext2_loff_t location;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+
+ if (count == 1)
+ size = channel->block_size;
+ else {
+ if (count < 0)
+ size = -count;
+ else
+ size = count * channel->block_size;
+ }
+
+ location = ((ext2_loff_t) block * channel->block_size) + data->offset;
+
+ /*
+ * First write the existing content into database
+ */
+ retval = undo_write_tdb(data, location, size);
+ if (retval)
+ return retval;
+
+
+ if (data->real)
+ retval = io_channel_write_blk(data->real, block, count, buf);
+
+ return retval;
+
+}
+
+static errcode_t undo_write_byte(io_channel channel, unsigned long offset,
+ int size, const void *buf)
+{
+ struct undo_private_data *data;
+ errcode_t retval = 0;
+ ssize_t actual;
+ ext2_loff_t location;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ location = offset + data->offset;
+ retval = undo_write_tdb(data, location, size);
+ if (retval)
+ return retval;
+
+ if (data->real && data->real->manager->write_byte)
+ retval = io_channel_write_byte(data->real, offset, size, buf);
+
+ return retval;
+}
+
+/*
+ * Flush data buffers to disk.
+ */
+static errcode_t undo_flush(io_channel channel)
+{
+ errcode_t retval = 0;
+ struct undo_private_data *data;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ if (data->real)
+ retval = io_channel_flush(data->real);
+
+ return retval;
+}
+
+static errcode_t undo_set_option(io_channel channel, const char *option,
+ const char *arg)
+{
+ struct undo_private_data *data;
+ unsigned long tmp;
+ char *end;
+
+ EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+ data = (struct undo_private_data *) channel->private_data;
+ EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+ /*
+ * Need to support offset option to work with
+ * Unix I/O manager
+ */
+ if (!strcmp(option, "offset")) {
+ if (!arg)
+ return EXT2_ET_INVALID_ARGUMENT;
+
+ tmp = strtoul(arg, &end, 0);
+ if (*end)
+ return EXT2_ET_INVALID_ARGUMENT;
+ data->offset = tmp;
+ return 0;
+ }
+}
--
1.5.2.14.g45bde-dirty

2007-06-03 23:17:26

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [RFC PATCH 1/1] e2fsprogs: Add undo I/O manager.

On Tue, May 22, 2007 at 03:47:33PM +0530, Aneesh Kumar K.V wrote:
> This I/O manager saves the contents of the location being overwritten
> to a tdb database. This helps in undoing the changes done to the
> file system.
>
> + /* loop through the existing entries and find if they overlap */
> + for (key = tdb_firstkey(tdb); key.dptr; key = tdb_nextkey(tdb, key)) {
> + data = tdb_fetch(tdb, key);
> + /*
> + * we are not interested in the data
> + */
> + free(data.dptr);

Youch. This is going to be painful; it means that for every single
write, we need to iterate over every single entry in the TDB. This
doesn't scale terribly well. :-(

I suspect you will do much better by using a different encoding for
the tdb database; namely, one where you don't use an extent-based
encoding, but rather something which is strictly block based. So that
means a separate entry in the tdb database for each block entry. That
will be slightly more inefficient in terms of overhead stored in the
tdb database, yes, but as a percentage of the blocksize (typically
4k), the overhead isn't that great, and the performance improvement
will be huge.

The downside of doing this is that you need to take into account
changes in blocksize via set_blksize() (this is rare; it's only done
by mke2fs, and then only to zap various MD superblocks, et. al), and
if the size is negative (in which case it represents the size in
bytes, which could be more or less than a blocksize, and not
necessarily a multiple of a blocksize).

But that's not too bad, since now you just have to do is figure out
which block(s) need to be saved, and then check to see if a record
already exists in the tdb; if it does, the original value has been
saved, and you don't have to do anything; if it doesn't then you just
save the entire block. The undo manager need to save the first
blocksize specified, and backup the original data in the tdb file in
terms of that first blocksize, regardless of any later changes in the
blocksize (which as I said is rare).

> + if (req_loc < cur_loc) {
> + if (req_loc+req_size > cur_loc) {
> + /* ------------------
> + * | | | |
> + * ------------------
> + * rl cl rs cs
> + */

This looks like notes to yourself; but it's not obvious what's going
on here. Fortunately with the above suggested change in algorith,
most of this will go away, but please make your comments more obvious.

> + // I have libtdb1. Enable after rebasing to latest e2fsprogs that has this API
> + //tdb_transaction_start(data->tdb);

What version of e2fsprogs are you developing against?

> + /* FIXME!! Not sure what should be the error */
> + //tdb_transaction_cancel(data->tdb);
> + retval = EXT2_ET_SHORT_WRITE;
> + goto error_out;

I would add a new error code in lib/ext2fs/ext2_err.et.in; that way we
can have a very specific error code explaining that we have a failure
to save the original version of the block.

> + actual = read(data->dev, tdb_data.dptr, size);
> + if (actual != size) {

Instead of using ext2fs_llseek() and read(), please use the underlying
io_manager. That way the undo manager might have a chance to work on
some other io manager other than unix_io. Yes, that means that you
might in some cases need to save and restore the current blksize. But
that's not going to be the common case, and it will make the code much
more general.

> +error_out:
> + /* Move the location back */
> + if (ext2fs_llseek(data->dev, cur_loc, SEEK_SET) != cur_loc) {
> + retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> + goto error_out;
> + }

Why do you need to move the location back? As far as I know nothing
should be depending on current offset of the file descriptor.

- Ted

2007-06-06 10:03:53

by Aneesh Kumar K.V

[permalink] [raw]

Subject: Re: [RFC PATCH 1/1] e2fsprogs: Add undo I/O manager.

Theodore Tso wrote:
> On Tue, May 22, 2007 at 03:47:33PM +0530, Aneesh Kumar K.V wrote:
>> This I/O manager saves the contents of the location being overwritten
>> to a tdb database. This helps in undoing the changes done to the
>> file system.
>>
>> + /* loop through the existing entries and find if they overlap */
>> + for (key = tdb_firstkey(tdb); key.dptr; key = tdb_nextkey(tdb, key)) {
>> + data = tdb_fetch(tdb, key);
>> + /*
>> + * we are not interested in the data
>> + */
>> + free(data.dptr);
>
> Youch. This is going to be painful; it means that for every single
> write, we need to iterate over every single entry in the TDB. This
> doesn't scale terribly well. :-(
>
> I suspect you will do much better by using a different encoding for
> the tdb database; namely, one where you don't use an extent-based
> encoding, but rather something which is strictly block based. So that
> means a separate entry in the tdb database for each block entry. That
> will be slightly more inefficient in terms of overhead stored in the
> tdb database, yes, but as a percentage of the blocksize (typically
> 4k), the overhead isn't that great, and the performance improvement
> will be huge.
>
> The downside of doing this is that you need to take into account
> changes in blocksize via set_blksize() (this is rare; it's only done
> by mke2fs, and then only to zap various MD superblocks, et. al), and
> if the size is negative (in which case it represents the size in
> bytes, which could be more or less than a blocksize, and not
> necessarily a multiple of a blocksize).
>

If we allow to change the block size in between that would mean the
records that we store in the tdb database will be of variable size (
different block sizes). That would also add all the code/complexity that
i have in is_trans_overlapping. So if we are looking at avoiding the
above for() loop then we should have constant block size (4K ?). But in
your above statement, you are counting overhead as a percentage of
blocksize. So how do we handle this ?

Now with constant block size The write_blk gets complex because of there
won't be a 1:1 mapping between the block number we need to use in
tdb_database and the backing I/O manager.

> But that's not too bad, since now you just have to do is figure out
> which block(s) need to be saved, and then check to see if a record
> already exists in the tdb; if it does, the original value has been
> saved, and you don't have to do anything; if it doesn't then you just
> save the entire block. The undo manager need to save the first
> blocksize specified, and backup the original data in the tdb file in
> terms of that first blocksize, regardless of any later changes in the
> blocksize (which as I said is rare).
>

>> + if (req_loc < cur_loc) {
>> + if (req_loc+req_size > cur_loc) {
>> + /* ------------------
>> + * | | | |
>> + * ------------------
>> + * rl cl rs cs
>> + */
>
> This looks like notes to yourself; but it's not obvious what's going
> on here. Fortunately with the above suggested change in algorith,
> most of this will go away, but please make your comments more obvious.
>

rl -> req_loc
cl -> cur_loc
rs -> req_size
cs -> cur_size

>> + // I have libtdb1. Enable after rebasing to latest e2fsprogs that has this API
>> + //tdb_transaction_start(data->tdb);
>
> What version of e2fsprogs are you developing against?
>

Right now i am manually linking it against libtdb.

dpkg --search /usr/lib/libtdb.so
tdb-dev: /usr/lib/libtdb.so

>> + /* FIXME!! Not sure what should be the error */
>> + //tdb_transaction_cancel(data->tdb);
>> + retval = EXT2_ET_SHORT_WRITE;
>> + goto error_out;
>
> I would add a new error code in lib/ext2fs/ext2_err.et.in; that way we
> can have a very specific error code explaining that we have a failure
> to save the original version of the block.
>

Will do this

>> + actual = read(data->dev, tdb_data.dptr, size);
>> + if (actual != size) {
>
> Instead of using ext2fs_llseek() and read(), please use the underlying
> io_manager. That way the undo manager might have a chance to work on
> some other io manager other than unix_io. Yes, that means that you
> might in some cases need to save and restore the current blksize. But
> that's not going to be the common case, and it will make the code much
> more general.
>
>> +error_out:
>> + /* Move the location back */
>> + if (ext2fs_llseek(data->dev, cur_loc, SEEK_SET) != cur_loc) {
>> + retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
>> + goto error_out;
>> + }
>
> Why do you need to move the location back? As far as I know nothing
> should be depending on current offset of the file descriptor.
>
>

No specific reason.

-aneesh

2007-06-06 12:02:34

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [RFC PATCH 1/1] e2fsprogs: Add undo I/O manager.

On Wed, Jun 06, 2007 at 03:32:27PM +0530, Aneesh Kumar K.V wrote:
>
> If we allow to change the block size in between that would mean the
> records that we store in the tdb database will be of variable size (
> different block sizes). That would also add all the code/complexity that
> i have in is_trans_overlapping. So if we are looking at avoiding the
> above for() loop then we should have constant block size (4K ?). But in
> your above statement, you are counting overhead as a percentage of
> blocksize. So how do we handle this ?

As I suggested in my previous mail message, the block size rarely
changes (mke2fs being the primary counter-example, and then only in a
fairly restricted case). So as far as the tdb is concerned, we have
to use a constant blocksize (the first one which is used when writing
to the i/o block). So the undo manager would save away the blocksize
the first time it was written to --- and yes, we would have to store
that information in the tdb file so the restore program knows what
block size is used, but that's easy; just write out the blocksize that
out as an ascii number (to avoid byte swapping issues) with the key
"blocksize" :-).

Once the block size has been established, all writes to the tdb file
would be in terms of that block size. For correctness' sake, the undo
manager would have to pass any blocksize changes to the underlying i/o
manager, and keep track of what the "current" blocksize, and if the
"current" blocksize is different from the blocksized used to store the
original contents of the filesystem in the tdb file, it would have to
translate the block numbers appropriately. So for example if in the
tdb file everything is stored in terms of 4k blocks, and the blocksize
changes to 512, and a write to sector 23 is attempted, 23 in 512 byte
sectors is equivalent to block #5 assuming a 4k blocksize, and the
undo manager would check to see if block #5 had been saved, and if it
hadn't, it would temporarily set the blocksize of the underlying i/o
manager to 4k, read block #5, save it to the tdb, and then restore the
blocksize back to the current 512 byte blocksize.

If the tdb blocksize is 512, and the block size gets switched to 4k,
then we have to do the reverse; we would need to take the 4k block
number, translate it to 512 byte sector numbers, and then save out 8
entries in the tdb file to correspond to each 4k block that the first
time each 4k block is written.

Makes sense? As I said, in practice this happens but rarely, so this
is just for correctness's sake, so it doesn't have to be efficient.
If necessary we can modify mke2fs to make sure the first write is the
native filesystem blocksize write, so that the undo manager will be
efficient if used with mke2fs.

> >This looks like notes to yourself; but it's not obvious what's going
> >on here. Fortunately with the above suggested change in algorith,
> >most of this will go away, but please make your comments more obvious.
> >
>
> rl -> req_loc
> cl -> cur_loc
> rs -> req_size
> cs -> cur_size

Yes, I could figure it out, but the point I was trying to make was
that obscure comments don't improve code readability. When writing
comments, it's polite to think in terms of a future reader of the
code. (As E.B. White wrote in the introduction of the classic, 'The
Elements of Style' by Oliver Strunk, "All through 'The Elements of
Style', one finds evidence of the author's deep sympathy for the
reader." This general philosophy applies just as much to code as it
does to English prose.)

So if it really is necessary to document the various sizes, then do
something like this:

/* req_loc < cur_loc < req_lock+req_size < cur_loc+cur+size */

Or perhaps something more explicit in english text explaining the case
that you're dealing with --- but only if it's adding something beyond
the C code. As others have said, this is not particularly useful:

/* Add two to the block count */
block_count += 2;

> >What version of e2fsprogs are you developing against?
>
> Right now i am manually linking it against libtdb.
>
> dpkg --search /usr/lib/libtdb.so
> tdb-dev: /usr/lib/libtdb.so

Any particular reason you're not using the development version from
Mercurial for your development? In general it's good practice to send
patches against the latest develoment tip. What caught my eye of that
particular comment was that it was pretty much saying that you weren't
doing that....

Thanks, regards,

- Ted

2007-06-07 11:42:22

by Aneesh Kumar K.V

[permalink] [raw]

Subject: Re: [RFC PATCH 1/1] e2fsprogs: Add undo I/O manager.

Theodore Tso wrote:
> On Wed, Jun 06, 2007 at 03:32:27PM +0530, Aneesh Kumar K.V wrote:
>> If we allow to change the block size in between that would mean the
>> records that we store in the tdb database will be of variable size (
>> different block sizes). That would also add all the code/complexity that
>> i have in is_trans_overlapping. So if we are looking at avoiding the
>> above for() loop then we should have constant block size (4K ?). But in
>> your above statement, you are counting overhead as a percentage of
>> blocksize. So how do we handle this ?
>
> As I suggested in my previous mail message, the block size rarely
> changes (mke2fs being the primary counter-example, and then only in a
> fairly restricted case). So as far as the tdb is concerned, we have
> to use a constant blocksize (the first one which is used when writing
> to the i/o block). So the undo manager would save away the blocksize
> the first time it was written to --- and yes, we would have to store
> that information in the tdb file so the restore program knows what
> block size is used, but that's easy; just write out the blocksize that
> out as an ascii number (to avoid byte swapping issues) with the key
> "blocksize" :-).
>
>

I don't think we need to store the blocksize because we can use the
data.dsize that is returned from tdb_fetch as the block size. For the
replay below is what i have right now.

for (key = tdb_firstkey(tdb); key.dptr; key = tdb_nextkey(tdb, key)) {
data = tdb_fetch(tdb, key);
blk_num = *(unsigned long *)key.dptr;
location = blk_num * data.dsize;
printf("Replayed transaction of size %d at location %ld\n", data.dsize,
blk_num);
lseek(fd, location, SEEK_SET);
write(fd, data.dptr, data.dsize);
}

>
>>> What version of e2fsprogs are you developing against?
>> Right now i am manually linking it against libtdb.
>>
>> dpkg --search /usr/lib/libtdb.so
>> tdb-dev: /usr/lib/libtdb.so
>
> Any particular reason you're not using the development version from
> Mercurial for your development? In general it's good practice to send
> patches against the latest develoment tip. What caught my eye of that
> particular comment was that it was pretty much saying that you weren't
> doing that....
>

Nothing particular. In the beginning i have imported the extent based
patches into a git repository and i continued using the same. Once we
all agree with the approach followed by the code i will port the same to
the code found in mercurial.

-aneesh