LinuxLists.cc - [RFC][PATCH] fiemap support for ext3

2008-04-18 21:18:41

Subject: [RFC][PATCH] fiemap support for ext3

Hello,

Here is my patch for fiemap support on ext3. The main reason for doing this is
because it will make it easier for application developers who are wanting to
take advantage of fiemap on extent based fs's to be able to use the same
interface for ext3 as well without having to fallback onto something like
fibmap. Fibmap also means you are calling ext3_get_block for _every_ block in
the file, which is ineffecient when ext3_get_blocks can map multiple contiguous
blocks all at once, reducing the number of times you have to call
ext3_get_blocks. Tested this with sandeens fiemap test program and verified it
with filefrag. Thanks much,

Signed-off-by: Josef Bacik <[email protected]>

Index: linux-2.6/fs/ext3/file.c
===================================================================
--- linux-2.6.orig/fs/ext3/file.c
+++ linux-2.6/fs/ext3/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext3_file_
.removexattr = generic_removexattr,
#endif
.permission = ext3_permission,
+ .fiemap = ext3_fiemap,
};

Index: linux-2.6/fs/ext3/inode.c
===================================================================
--- linux-2.6.orig/fs/ext3/inode.c
+++ linux-2.6/fs/ext3/inode.c
@@ -36,6 +36,7 @@
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/fiemap.h>
#include "xattr.h"
#include "acl.h"

@@ -981,6 +982,157 @@ out:
return ret;
}

+int ext3_fiemap(struct inode *inode, unsigned long arg)
+{
+ struct fiemap *fiemap_s;
+ struct fiemap_extent fiemap_e;
+ struct buffer_head tmp;
+ unsigned int start_blk;
+ unsigned int num_of_extents;
+ long length;
+ char *cur_ext_ptr = (char *)(arg + sizeof(struct fiemap));
+ int ret, hole = 0;
+
+ fiemap_s = kmalloc(sizeof(struct fiemap), GFP_KERNEL);
+ if (!fiemap_s)
+ return -ENOMEM;
+
+ if (copy_from_user(fiemap_s, (struct fiemap __user *)arg,
+ sizeof(struct fiemap))) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ /*
+ * if fm_start is in the middle of the current block, get the next
+ * block so we don't end up returning a start thats before the given
+ * fm_start
+ */
+ start_blk = (fiemap_s->fm_start + (1 << inode->i_blkbits) - 1) >>
+ inode->i_blkbits;
+ num_of_extents = fiemap_s->fm_extent_count;
+
+ if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC)
+ ext3_force_commit(inode->i_sb);
+
+ /* guard against change */
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
+
+ /*
+ * we want the comparisons to be unsigned, in case somebody passes -1,
+ * meaning they want they want the entire file, but the result has to be
+ * signed so we can handle the case where we get more blocks than the
+ * size of the file
+ */
+ length = (long)min((unsigned long)fiemap_s->fm_length,
+ (unsigned long)i_size_read(inode));
+
+ fiemap_s->fm_start = start_blk << inode->i_blkbits;
+ fiemap_s->fm_length = 0;
+
+ memset(&fiemap_e, 0, sizeof(struct fiemap_extent));
+ do {
+ /*
+ * we set this to length because we want ext3_get_block to
+ * find as many contiguous blocks as it can
+ */
+ memset(&tmp, 0, sizeof(struct buffer_head));
+ tmp.b_size = length;
+
+ ret = ext3_get_block(inode, start_blk, &tmp, 0);
+ if (ret)
+ break;
+
+ /*
+ * Hole, we're going to have to walk the inodes blocks until
+ * find data
+ */
+ if (!tmp.b_blocknr) {
+
+ if (!hole) {
+ hole = 1;
+ fiemap_e.fe_flags |= FIEMAP_EXTENT_HOLE;
+ fiemap_e.fe_offset = start_blk <<
+ inode->i_blkbits;
+ }
+
+ fiemap_e.fe_length += 1 << inode->i_blkbits;
+ length -= 1 << inode->i_blkbits;
+ start_blk++;
+ } else {
+ if (hole &&
+ !copy_to_user(cur_ext_ptr, &fiemap_e,
+ sizeof(struct fiemap_extent))) {
+ cur_ext_ptr += sizeof(struct fiemap_extent);
+ fiemap_s->fm_extent_count++;
+ fiemap_s->fm_length += fiemap_e.fe_length;
+
+ hole = 0;
+ num_of_extents--;
+ memset(&fiemap_e, 0,
+ sizeof(struct fiemap_extent));
+
+ if (!num_of_extents || length <= 0)
+ break;
+ } else if (hole) {
+ /* copy_to_user failed */
+ ret = -EFAULT;
+ break;
+ }
+
+ length -= tmp.b_size;
+ start_blk += tmp.b_size >> inode->i_blkbits;
+
+ fiemap_e.fe_offset = tmp.b_blocknr <<
+ inode->i_blkbits;
+ fiemap_e.fe_length = tmp.b_size;
+
+ if (length <= 0)
+ fiemap_e.fe_flags |= FIEMAP_EXTENT_LAST;
+
+ if (!copy_to_user(cur_ext_ptr, &fiemap_e,
+ sizeof(struct fiemap_extent))) {
+ cur_ext_ptr += sizeof(struct fiemap_extent);
+ num_of_extents--;
+ } else {
+ ret = -EFAULT;
+ break;
+ }
+
+ fiemap_s->fm_extent_count++;
+ fiemap_s->fm_length += fiemap_e.fe_length;
+
+ memset(&fiemap_e, 0, sizeof(struct fiemap_extent));
+ }
+ } while (length > 0 && num_of_extents);
+
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+
+ /* hole at the end of the file */
+ if (hole && !copy_to_user(cur_ext_ptr, &fiemap_e,
+ sizeof(struct fiemap_extent))) {
+ fiemap_s->fm_extent_count++;
+ fiemap_s->fm_length += fiemap_e.fe_length;
+
+ if (length <= 0)
+ fiemap_e.fe_flags |= FIEMAP_EXTENT_LAST;
+
+ } else if (hole) {
+ /* copy to user failed */
+ ret = -EFAULT;
+ }
+
+ if (!ret) {
+ if (copy_to_user((char *)arg, fiemap_s,
+ sizeof(struct fiemap)))
+ ret = -EFAULT;
+ }
+
+out_free:
+ kfree(fiemap_s);
+ return ret;
+}
+
/*
* `handle' can be NULL if create is zero
*/
Index: linux-2.6/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.orig/include/linux/ext3_fs.h
+++ linux-2.6/include/linux/ext3_fs.h
@@ -836,6 +836,7 @@ extern void ext3_truncate (struct inode
extern void ext3_set_inode_flags(struct inode *);
extern void ext3_get_inode_flags(struct ext3_inode_info *);
extern void ext3_set_aops(struct inode *inode);
+extern int ext3_fiemap(struct inode *inode, unsigned long arg);

/* ioctl.c */
extern int ext3_ioctl (struct inode *, struct file *, unsigned int,

2008-04-21 22:08:56

by Andreas Dilger

[permalink] [raw]

Subject: Re: [RFC][PATCH] fiemap support for ext3

On Apr 18, 2008 17:09 -0400, Josef Bacik wrote:
> Here is my patch for fiemap support on ext3. The main reason for doing this is
> because it will make it easier for application developers who are wanting to
> take advantage of fiemap on extent based fs's to be able to use the same
> interface for ext3 as well without having to fallback onto something like
> fibmap. Fibmap also means you are calling ext3_get_block for _every_ block in
> the file, which is ineffecient when ext3_get_blocks can map multiple contiguous
> blocks all at once, reducing the number of times you have to call
> ext3_get_blocks. Tested this with sandeens fiemap test program and verified it
> with filefrag. Thanks much,
>
> Signed-off-by: Josef Bacik <[email protected]>

Josef, thanks for doing this work. Having more than a single filesystem
implement FIEMAP (especially a block-mapped one) is very useful. Did you
look at all at making a "generic_fiemap()" function? It seems very little
of ext3_fiemap() is ext3 specific, only the call to ext3_force_commit()
(which could just be a sync on the inode), ext3_block_map() (generic for
all block-based filesystems), and truncate_mutex (would i_sem be enough?).

> +int ext3_fiemap(struct inode *inode, unsigned long arg)
> +{
> + /*
> + * if fm_start is in the middle of the current block, get the next
> + * block so we don't end up returning a start thats before the given
> + * fm_start
> + */
> + start_blk = (fiemap_s->fm_start + (1 << inode->i_blkbits) - 1) >>
> + inode->i_blkbits;

Hmm, I'd think that if someone is requesting the mapping for bytes [50-5000]
they wouldn't be very happy with the mapping returned being [4096-8191],
because it is missing part of the requested range. Instead, the fm_start
should be rounded down to the start of the first block and up to the end
of the last block to return [0-8191] (fm_start = 0, fm_length = 8192).

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-04-21 22:16:27

by Eric Sandeen

[permalink] [raw]

Subject: Re: [RFC][PATCH] fiemap support for ext3

Andreas Dilger wrote:
> On Apr 18, 2008 17:09 -0400, Josef Bacik wrote:
>> Here is my patch for fiemap support on ext3. The main reason for doing this is
>> because it will make it easier for application developers who are wanting to
>> take advantage of fiemap on extent based fs's to be able to use the same
>> interface for ext3 as well without having to fallback onto something like
>> fibmap. Fibmap also means you are calling ext3_get_block for _every_ block in
>> the file, which is ineffecient when ext3_get_blocks can map multiple contiguous
>> blocks all at once, reducing the number of times you have to call
>> ext3_get_blocks. Tested this with sandeens fiemap test program and verified it
>> with filefrag. Thanks much,
>>
>> Signed-off-by: Josef Bacik <[email protected]>
>
> Josef, thanks for doing this work. Having more than a single filesystem
> implement FIEMAP (especially a block-mapped one) is very useful.

I have an xfs patch too if anyone wants it ;)

So hopefully we can roll out at least 3 fs's when it goes upstream.

> Did you
> look at all at making a "generic_fiemap()" function? It seems very little
> of ext3_fiemap() is ext3 specific, only the call to ext3_force_commit()
> (which could just be a sync on the inode), ext3_block_map() (generic for
> all block-based filesystems), and truncate_mutex (would i_sem be enough?).

Yep, I agree, it'd be good if ! ->fiemap then go the generic route.

Although my only question/worry is do all filesystems behave sanely in
the face of large b_size for getblocks? All that can handle direct IO
do anyway.

>> +int ext3_fiemap(struct inode *inode, unsigned long arg)
>> +{
>> + /*
>> + * if fm_start is in the middle of the current block, get the next
>> + * block so we don't end up returning a start thats before the given
>> + * fm_start
>> + */
>> + start_blk = (fiemap_s->fm_start + (1 << inode->i_blkbits) - 1) >>
>> + inode->i_blkbits;
>
> Hmm, I'd think that if someone is requesting the mapping for bytes [50-5000]
> they wouldn't be very happy with the mapping returned being [4096-8191],
> because it is missing part of the requested range. Instead, the fm_start
> should be rounded down to the start of the first block and up to the end
> of the last block to return [0-8191] (fm_start = 0, fm_length = 8192).

In fact that should be part of the interface definition, right. Should
the returned mapping start at the beginning of the block that contains
the requsted offset, or at the requested offset itself? I'd vote for
the former.

At some point I should probably write some QA for this thing to test
various file layouts and make sure we get the "right" answers on all
filesystems...

-Eric

2008-04-22 00:26:16

by Josef Bacik

[permalink] [raw]

Subject: Re: [RFC][PATCH] fiemap support for ext3

On Mon, Apr 21, 2008 at 04:08:51PM -0600, Andreas Dilger wrote:
> On Apr 18, 2008 17:09 -0400, Josef Bacik wrote:
> > Here is my patch for fiemap support on ext3. The main reason for doing this is
> > because it will make it easier for application developers who are wanting to
> > take advantage of fiemap on extent based fs's to be able to use the same
> > interface for ext3 as well without having to fallback onto something like
> > fibmap. Fibmap also means you are calling ext3_get_block for _every_ block in
> > the file, which is ineffecient when ext3_get_blocks can map multiple contiguous
> > blocks all at once, reducing the number of times you have to call
> > ext3_get_blocks. Tested this with sandeens fiemap test program and verified it
> > with filefrag. Thanks much,
> >
> > Signed-off-by: Josef Bacik <[email protected]>
>
> Josef, thanks for doing this work. Having more than a single filesystem
> implement FIEMAP (especially a block-mapped one) is very useful. Did you
> look at all at making a "generic_fiemap()" function? It seems very little
> of ext3_fiemap() is ext3 specific, only the call to ext3_force_commit()
> (which could just be a sync on the inode), ext3_block_map() (generic for
> all block-based filesystems), and truncate_mutex (would i_sem be enough?).
>

Like Eric points out I wasn't sure if other fs's would handle the case where
b_size > blocksize properly and map contiguous blocks together until it hit an
unallocated block. If that is a safe assumption to take then I'll do it
generically. I thought i_sem would be a bit heavy handed, but if you would
prefer it that way I can change it.

> > +int ext3_fiemap(struct inode *inode, unsigned long arg)
> > +{
> > + /*
> > + * if fm_start is in the middle of the current block, get the next
> > + * block so we don't end up returning a start thats before the given
> > + * fm_start
> > + */
> > + start_blk = (fiemap_s->fm_start + (1 << inode->i_blkbits) - 1) >>
> > + inode->i_blkbits;
>
> Hmm, I'd think that if someone is requesting the mapping for bytes [50-5000]
> they wouldn't be very happy with the mapping returned being [4096-8191],
> because it is missing part of the requested range. Instead, the fm_start
> should be rounded down to the start of the first block and up to the end
> of the last block to return [0-8191] (fm_start = 0, fm_length = 8192).
>

Ok I will change it, just wasn't sure if returning a start value before the
given start value would be appreciated or not. I guess we can say what it will
do in the man page :). Thanks much,

Josef

2008-04-22 02:33:42

by Andreas Dilger

[permalink] [raw]

Subject: Re: [RFC][PATCH] fiemap support for ext3

On Apr 21, 2008 17:16 -0500, Eric Sandeen wrote:
> Andreas Dilger wrote:
> > Josef, thanks for doing this work. Having more than a single filesystem
> > implement FIEMAP (especially a block-mapped one) is very useful.
>
> I have an xfs patch too if anyone wants it ;)

Please, do send it on to Dave Chinner. He was one of the main contributors
to the FIEMAP specification. He could also give another opinion on whether
the NUM_EXTENTS should return an "extent" for a hole or not.

> So hopefully we can roll out at least 3 fs's when it goes upstream.
>
> > Did you
> > look at all at making a "generic_fiemap()" function? It seems very little
> > of ext3_fiemap() is ext3 specific, only the call to ext3_force_commit()
> > (which could just be a sync on the inode), ext3_block_map() (generic for
> > all block-based filesystems), and truncate_mutex (would i_sem be enough?).
>
> Yep, I agree, it'd be good if ! ->fiemap then go the generic route.
>
> Although my only question/worry is do all filesystems behave sanely in
> the face of large b_size for getblocks? All that can handle direct IO
> do anyway.
>
> >> +int ext3_fiemap(struct inode *inode, unsigned long arg)
> >> +{
> >> + /*
> >> + * if fm_start is in the middle of the current block, get the next
> >> + * block so we don't end up returning a start thats before the given
> >> + * fm_start
> >> + */
> >> + start_blk = (fiemap_s->fm_start + (1 << inode->i_blkbits) - 1) >>
> >> + inode->i_blkbits;
> >
> > Hmm, I'd think that if someone is requesting the mapping for bytes [50-5000]
> > they wouldn't be very happy with the mapping returned being [4096-8191],
> > because it is missing part of the requested range. Instead, the fm_start
> > should be rounded down to the start of the first block and up to the end
> > of the last block to return [0-8191] (fm_start = 0, fm_length = 8192).
>
> In fact that should be part of the interface definition, right. Should
> the returned mapping start at the beginning of the block that contains
> the requsted offset, or at the requested offset itself? I'd vote for
> the former.
>
> At some point I should probably write some QA for this thing to test
> various file layouts and make sure we get the "right" answers on all
> filesystems...
>
> -Eric

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.