2017-08-29 14:29:46

by Andreas Gruenbacher

[permalink] [raw]
Subject: [PATCH 0/4] ext4: SEEK_HOLE / SEEK_DATA via iomap

Add IOMAP_REPORT support to ext4, including support for inline data
which iomap couldn't report so far. Switch to iomap_seek_{hole,data} on
ext4.

This patch set seems to be working correctly on ext4 as well as xfs
under xfstests. Jan Kara has pointed out an issue that I can't
reproduce though, so please review carefully.

Thanks,
Andreas

Andreas Gruenbacher (3):
iomap: Switch from blkno to disk offset
iomap: Add IOMAP_F_DATA_INLINE flag
ext4: Add iomap support for inline data

Christoph Hellwig (1):
ext4: Switch to iomap for SEEK_HOLE / SEEK_DATA

fs/buffer.c | 4 +-
fs/dax.c | 2 +-
fs/ext2/inode.c | 4 +-
fs/ext4/Kconfig | 1 +
fs/ext4/ext4.h | 7 +-
fs/ext4/file.c | 271 +++-----------------------------------------------
fs/ext4/inline.c | 33 ++++++
fs/ext4/inode.c | 114 +++++++--------------
fs/iomap.c | 13 +--
fs/nfsd/blocklayout.c | 4 +-
fs/xfs/xfs_iomap.c | 6 +-
include/linux/iomap.h | 15 +--
12 files changed, 115 insertions(+), 359 deletions(-)

--
2.13.3


2017-08-29 14:29:39

by Andreas Gruenbacher

[permalink] [raw]
Subject: [PATCH 1/4] iomap: Switch from blkno to disk offset

Replace iomap->blkno, the sector number, with iomap->addr, the disk
offset in bytes. For invalid disk offsets, use the special value
IOMAP_NULL_ADDR instead of IOMAP_NULL_BLOCK. This allows to use iomap
for mappings which are not block aligned, such as inline data on ext4
and stuffed inodes on gfs2.

Tested on xfs and ext4 with xfstests; seems to be working correctly.

Signed-off-by: Andreas Gruenbacher <[email protected]>
---
fs/buffer.c | 4 ++--
fs/dax.c | 2 +-
fs/ext2/inode.c | 4 ++--
fs/ext4/inode.c | 4 ++--
fs/iomap.c | 11 +++++------
fs/nfsd/blocklayout.c | 4 ++--
fs/xfs/xfs_iomap.c | 6 +++---
include/linux/iomap.h | 10 +++++-----
8 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 5715dac7821f..b9d20076bb8f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1979,8 +1979,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
case IOMAP_MAPPED:
if (offset >= i_size_read(inode))
set_buffer_new(bh);
- bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
- ((offset - iomap->offset) >> inode->i_blkbits);
+ bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
+ inode->i_blkbits;
set_buffer_mapped(bh);
break;
}
diff --git a/fs/dax.c b/fs/dax.c
index 865d42c63e23..539aaa0b9c45 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -987,7 +987,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);

static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
- return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
+ return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
}

static loff_t
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 30163d007b2f..e7d8925d0140 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -824,11 +824,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,

if (ret == 0) {
iomap->type = IOMAP_HOLE;
- iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->addr = IOMAP_NULL_ADDR;
iomap->length = 1 << blkbits;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->blkno = (sector_t)bno << (blkbits - 9);
+ iomap->addr = (u64)bno << blkbits;
iomap->length = (u64)ret << blkbits;
iomap->flags |= IOMAP_F_MERGED;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c774bdc22759..429f0afde9f2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3483,7 +3483,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,

if (ret == 0) {
iomap->type = IOMAP_HOLE;
- iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->addr = IOMAP_NULL_ADDR;
iomap->length = (u64)map.m_len << blkbits;
} else {
if (map.m_flags & EXT4_MAP_MAPPED) {
@@ -3494,7 +3494,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
WARN_ON_ONCE(1);
return -EIO;
}
- iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9);
+ iomap->addr = (u64)map.m_pblk << blkbits;
iomap->length = (u64)map.m_len << blkbits;
}

diff --git a/fs/iomap.c b/fs/iomap.c
index 59cc98ad7577..7bc1797c2292 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -350,8 +350,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
struct iomap *iomap)
{
- sector_t sector = iomap->blkno +
- (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
+ sector_t sector = (iomap->addr +
+ (pos & PAGE_MASK) - iomap->offset) >> 9;

return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
offset, bytes);
@@ -512,9 +512,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
flags |= FIEMAP_EXTENT_SHARED;

return fiemap_fill_next_extent(fi, iomap->offset,
- iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
+ iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
iomap->length, flags);
-
}

static loff_t
@@ -807,7 +806,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
bio = bio_alloc(GFP_KERNEL, 1);
bio->bi_bdev = iomap->bdev;
bio->bi_iter.bi_sector =
- iomap->blkno + ((pos - iomap->offset) >> 9);
+ (iomap->addr + pos - iomap->offset) >> 9;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;

@@ -886,7 +885,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
bio = bio_alloc(GFP_KERNEL, nr_pages);
bio->bi_bdev = iomap->bdev;
bio->bi_iter.bi_sector =
- iomap->blkno + ((pos - iomap->offset) >> 9);
+ (iomap->addr + pos - iomap->offset) >> 9;
bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c862c2489df0..2d1d37b27dc7 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -65,7 +65,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
bex->es = PNFS_BLOCK_READ_DATA;
else
bex->es = PNFS_BLOCK_READWRITE_DATA;
- bex->soff = (iomap.blkno << 9);
+ bex->soff = iomap.addr;
break;
case IOMAP_UNWRITTEN:
if (seg->iomode & IOMODE_RW) {
@@ -78,7 +78,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
}

bex->es = PNFS_BLOCK_INVALID_DATA;
- bex->soff = (iomap.blkno << 9);
+ bex->soff = iomap.addr;
break;
}
/*FALLTHRU*/
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 813394c62849..1bcc5d0373a5 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -54,13 +54,13 @@ xfs_bmbt_to_iomap(
struct xfs_mount *mp = ip->i_mount;

if (imap->br_startblock == HOLESTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else if (imap->br_startblock == DELAYSTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
- iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+ iomap->addr = (u64)xfs_fsb_to_db(ip, imap->br_startblock) << 9;
if (imap->br_state == XFS_EXT_UNWRITTEN)
iomap->type = IOMAP_UNWRITTEN;
else
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f64dc6ce5161..7b8a615fa021 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -15,8 +15,8 @@ struct vm_fault;
*/
#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
-#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
-#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
+#define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */
+#define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */

/*
* Flags for all iomap mappings:
@@ -30,12 +30,12 @@ struct vm_fault;
#define IOMAP_F_SHARED 0x20 /* block shared with another file */

/*
- * Magic value for blkno:
+ * Magic value for addr:
*/
-#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
+#define IOMAP_NULL_ADDR -1ULL /* addr is not valid */

struct iomap {
- sector_t blkno; /* 1st sector of mapping, 512b units */
+ u64 addr; /* disk offset of mapping, bytes */
loff_t offset; /* file offset of mapping, bytes */
u64 length; /* length of mapping, bytes */
u16 type; /* type of mapping */
--
2.13.3

2017-08-29 14:29:40

by Andreas Gruenbacher

[permalink] [raw]
Subject: [PATCH 2/4] iomap: Add IOMAP_F_DATA_INLINE flag

Add a new IOMAP_F_DATA_INLINE flag to indicate that a mapping is in a
disk area that contains data as well as metadata. In iomap_fiemap, map
this flag to FIEMAP_EXTENT_DATA_INLINE.

Signed-off-by: Andreas Gruenbacher <[email protected]>
---
fs/iomap.c | 2 ++
include/linux/iomap.h | 5 +++--
2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 7bc1797c2292..f0a263482a0e 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -510,6 +510,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
flags |= FIEMAP_EXTENT_MERGED;
if (iomap->flags & IOMAP_F_SHARED)
flags |= FIEMAP_EXTENT_SHARED;
+ if (iomap->flags & IOMAP_F_DATA_INLINE)
+ flags |= FIEMAP_EXTENT_DATA_INLINE;

return fiemap_fill_next_extent(fi, iomap->offset,
iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 7b8a615fa021..2b0790dbd6ea 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -26,8 +26,9 @@ struct vm_fault;
/*
* Flags that only need to be reported for IOMAP_REPORT requests:
*/
-#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */
-#define IOMAP_F_SHARED 0x20 /* block shared with another file */
+#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */
+#define IOMAP_F_SHARED 0x20 /* block shared with another file */
+#define IOMAP_F_DATA_INLINE 0x40 /* data inline in the inode */

/*
* Magic value for addr:
--
2.13.3

2017-08-29 14:29:41

by Andreas Gruenbacher

[permalink] [raw]
Subject: [PATCH 3/4] ext4: Add iomap support for inline data

Report inline data as an IOMAP_F_DATA_INLINE mapping. This allows to
switch to iomap_seek_hole and iomap_seek_data in ext4_llseek, and will
make switching to iomap_fiemap in ext4_fiemap easier as well.

Signed-off-by: Andreas Gruenbacher <[email protected]>
---
fs/ext4/ext4.h | 4 ++++
fs/ext4/inline.c | 33 +++++++++++++++++++++++++++++++++
fs/ext4/inode.c | 10 ++++++++--
3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a2bb7d2870e4..017e55942a49 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3048,6 +3048,10 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
extern int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
int *has_inline, __u64 start, __u64 len);
+
+struct iomap;
+extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);
+
extern int ext4_try_to_evict_inline_data(handle_t *handle,
struct inode *inode,
int needed);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 28c5c3abddb3..21a078fef80f 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -12,6 +12,7 @@
* GNU General Public License for more details.
*/

+#include <linux/iomap.h>
#include <linux/fiemap.h>

#include "ext4_jbd2.h"
@@ -1827,6 +1828,38 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
return ret;
}

+int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap)
+{
+ __u64 addr;
+ int error = -ENOENT;
+ struct ext4_iloc iloc;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode))
+ goto out;
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ goto out;
+
+ addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+ addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
+ addr += offsetof(struct ext4_inode, i_block);
+
+ brelse(iloc.bh);
+
+ iomap->addr = addr;
+ iomap->offset = 0;
+ iomap->length = min_t(loff_t, ext4_get_inline_size(inode),
+ i_size_read(inode));
+ iomap->type = 0;
+ iomap->flags = IOMAP_F_DATA_INLINE;
+
+out:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return error;
+}
+
int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
int *has_inline, __u64 start, __u64 len)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 429f0afde9f2..ab6ab835255e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3411,8 +3411,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
struct ext4_map_blocks map;
int ret;

- if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
- return -ERANGE;
+ if ((flags & IOMAP_REPORT) && ext4_has_inline_data(inode)) {
+ ret = ext4_inline_data_iomap(inode, iomap);
+ if (ret != -ENOENT) {
+ if (ret == 0 && offset >= iomap->length)
+ ret = -ENOENT;
+ return ret;
+ }
+ }

map.m_lblk = first_block;
map.m_len = last_block - first_block + 1;
--
2.13.3

2017-08-29 14:29:42

by Andreas Gruenbacher

[permalink] [raw]
Subject: [PATCH 4/4] ext4: Switch to iomap for SEEK_HOLE / SEEK_DATA

From: Christoph Hellwig <[email protected]>

Switch to the iomap_seek_hole and iomap_seek_data helpers for
implementing lseek SEEK_HOLE / SEEK_DATA, and remove all the code that
isn't needed any more.

Note that with this patch, ext4 will now always depend on the iomap code
instead of only when CONFIG_DAX is enabled, and it requires adding a
call into the extent status tree for iomap_begin as well to properly
deal with delalloc extents.

Signed-off-by: Christoph Hellwig <[email protected]>
[Minor fixes and cleanups by Andreas]
Signed-off-by: Andreas Gruenbacher <[email protected]>
---
fs/ext4/Kconfig | 1 +
fs/ext4/ext4.h | 3 -
fs/ext4/file.c | 271 +++-----------------------------------------------------
fs/ext4/inode.c | 100 ++++++---------------
4 files changed, 43 insertions(+), 332 deletions(-)

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index e38039fd96ff..73b850f5659c 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,6 +37,7 @@ config EXT4_FS
select CRC16
select CRYPTO
select CRYPTO_CRC32C
+ select FS_IOMAP
help
This is the next generation of the ext3 filesystem.

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 017e55942a49..f295fce7a9c8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2515,9 +2515,6 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
-extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
- unsigned int map_len,
- struct extent_status *result);

/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0d7cf0cc9b87..bc7a901c8b3f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -20,6 +20,7 @@

#include <linux/time.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
@@ -455,256 +456,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
}

/*
- * Here we use ext4_map_blocks() to get a block mapping for a extent-based
- * file rather than ext4_ext_walk_space() because we can introduce
- * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same
- * function. When extent status tree has been fully implemented, it will
- * track all extent status for a file and we can directly use it to
- * retrieve the offset for SEEK_DATA/SEEK_HOLE.
- */
-
-/*
- * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to
- * lookup page cache to check whether or not there has some data between
- * [startoff, endoff] because, if this range contains an unwritten extent,
- * we determine this extent as a data or a hole according to whether the
- * page cache has data or not.
- */
-static int ext4_find_unwritten_pgoff(struct inode *inode,
- int whence,
- ext4_lblk_t end_blk,
- loff_t *offset)
-{
- struct pagevec pvec;
- unsigned int blkbits;
- pgoff_t index;
- pgoff_t end;
- loff_t endoff;
- loff_t startoff;
- loff_t lastoff;
- int found = 0;
-
- blkbits = inode->i_sb->s_blocksize_bits;
- startoff = *offset;
- lastoff = startoff;
- endoff = (loff_t)end_blk << blkbits;
-
- index = startoff >> PAGE_SHIFT;
- end = (endoff - 1) >> PAGE_SHIFT;
-
- pagevec_init(&pvec, 0);
- do {
- int i, num;
- unsigned long nr_pages;
-
- num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
- (pgoff_t)num);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- /*
- * If current offset is smaller than the page offset,
- * there is a hole at this offset.
- */
- if (whence == SEEK_HOLE && lastoff < endoff &&
- lastoff < page_offset(pvec.pages[i])) {
- found = 1;
- *offset = lastoff;
- goto out;
- }
-
- if (page->index > end)
- goto out;
-
- lock_page(page);
-
- if (unlikely(page->mapping != inode->i_mapping)) {
- unlock_page(page);
- continue;
- }
-
- if (!page_has_buffers(page)) {
- unlock_page(page);
- continue;
- }
-
- if (page_has_buffers(page)) {
- lastoff = page_offset(page);
- bh = head = page_buffers(page);
- do {
- if (lastoff + bh->b_size <= startoff)
- goto next;
- if (buffer_uptodate(bh) ||
- buffer_unwritten(bh)) {
- if (whence == SEEK_DATA)
- found = 1;
- } else {
- if (whence == SEEK_HOLE)
- found = 1;
- }
- if (found) {
- *offset = max_t(loff_t,
- startoff, lastoff);
- unlock_page(page);
- goto out;
- }
-next:
- lastoff += bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
- }
-
- lastoff = page_offset(page) + PAGE_SIZE;
- unlock_page(page);
- }
-
- /* The no. of pages is less than our desired, we are done. */
- if (nr_pages < num)
- break;
-
- index = pvec.pages[i - 1]->index + 1;
- pagevec_release(&pvec);
- } while (index <= end);
-
- if (whence == SEEK_HOLE && lastoff < endoff) {
- found = 1;
- *offset = lastoff;
- }
-out:
- pagevec_release(&pvec);
- return found;
-}
-
-/*
- * ext4_seek_data() retrieves the offset for SEEK_DATA.
- */
-static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
-{
- struct inode *inode = file->f_mapping->host;
- struct extent_status es;
- ext4_lblk_t start, last, end;
- loff_t dataoff, isize;
- int blkbits;
- int ret;
-
- inode_lock(inode);
-
- isize = i_size_read(inode);
- if (offset >= isize) {
- inode_unlock(inode);
- return -ENXIO;
- }
-
- blkbits = inode->i_sb->s_blocksize_bits;
- start = offset >> blkbits;
- last = start;
- end = isize >> blkbits;
- dataoff = offset;
-
- do {
- ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
- if (ret <= 0) {
- /* No extent found -> no data */
- if (ret == 0)
- ret = -ENXIO;
- inode_unlock(inode);
- return ret;
- }
-
- last = es.es_lblk;
- if (last != start)
- dataoff = (loff_t)last << blkbits;
- if (!ext4_es_is_unwritten(&es))
- break;
-
- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
- es.es_lblk + es.es_len, &dataoff))
- break;
- last += es.es_len;
- dataoff = (loff_t)last << blkbits;
- cond_resched();
- } while (last <= end);
-
- inode_unlock(inode);
-
- if (dataoff > isize)
- return -ENXIO;
-
- return vfs_setpos(file, dataoff, maxsize);
-}
-
-/*
- * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
- */
-static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
-{
- struct inode *inode = file->f_mapping->host;
- struct extent_status es;
- ext4_lblk_t start, last, end;
- loff_t holeoff, isize;
- int blkbits;
- int ret;
-
- inode_lock(inode);
-
- isize = i_size_read(inode);
- if (offset >= isize) {
- inode_unlock(inode);
- return -ENXIO;
- }
-
- blkbits = inode->i_sb->s_blocksize_bits;
- start = offset >> blkbits;
- last = start;
- end = isize >> blkbits;
- holeoff = offset;
-
- do {
- ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
- if (ret < 0) {
- inode_unlock(inode);
- return ret;
- }
- /* Found a hole? */
- if (ret == 0 || es.es_lblk > last) {
- if (last != start)
- holeoff = (loff_t)last << blkbits;
- break;
- }
- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (ext4_es_is_unwritten(&es) &&
- ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
- last + es.es_len, &holeoff))
- break;
-
- last += es.es_len;
- holeoff = (loff_t)last << blkbits;
- cond_resched();
- } while (last <= end);
-
- inode_unlock(inode);
-
- if (holeoff > isize)
- holeoff = isize;
-
- return vfs_setpos(file, holeoff, maxsize);
-}
-
-/*
* ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
* by calling generic_file_llseek_size() with the appropriate maxbytes
* value for each.
@@ -720,18 +471,24 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
maxbytes = inode->i_sb->s_maxbytes;

switch (whence) {
- case SEEK_SET:
- case SEEK_CUR:
- case SEEK_END:
+ default:
return generic_file_llseek_size(file, offset, whence,
maxbytes, i_size_read(inode));
- case SEEK_DATA:
- return ext4_seek_data(file, offset, maxbytes);
case SEEK_HOLE:
- return ext4_seek_hole(file, offset, maxbytes);
+ inode_lock_shared(inode);
+ offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+ inode_unlock_shared(inode);
+ break;
+ case SEEK_DATA:
+ inode_lock_shared(inode);
+ offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+ inode_unlock_shared(inode);
+ break;
}

- return -EINVAL;
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, maxbytes);
}

const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ab6ab835255e..f5895ea6ac70 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3400,7 +3400,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}

-#ifdef CONFIG_FS_DAX
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
@@ -3409,6 +3408,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned long first_block = offset >> blkbits;
unsigned long last_block = (offset + length - 1) >> blkbits;
struct ext4_map_blocks map;
+ bool delalloc = false;
int ret;

if ((flags & IOMAP_REPORT) && ext4_has_inline_data(inode)) {
@@ -3425,6 +3425,29 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,

if (!(flags & IOMAP_WRITE)) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (!ret) {
+ struct extent_status es = {};
+
+ ext4_es_find_delayed_extent_range(inode, map.m_lblk,
+ map.m_lblk + map.m_len - 1, &es);
+ /* Is delalloc data before next block in extent tree? */
+ if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
+ ext4_lblk_t offs = 0;
+
+ if (es.es_lblk < map.m_lblk)
+ offs = map.m_lblk - es.es_lblk;
+ map.m_lblk = es.es_lblk + offs;
+ map.m_pblk = ext4_es_pblock(&es) + offs;
+ map.m_len = es.es_len - offs;
+ if (ext4_es_is_unwritten(&es))
+ map.m_flags |= EXT4_MAP_UNWRITTEN;
+ if (ext4_es_is_delayed(&es))
+ delalloc = true;
+ ret = 1;
+ }
+ }
} else {
int dio_credits;
handle_t *handle;
@@ -3486,11 +3509,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
else
iomap->dax_dev = NULL;
iomap->offset = first_block << blkbits;
+ iomap->length = (u64)map.m_len << blkbits;

if (ret == 0) {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
- iomap->length = (u64)map.m_len << blkbits;
+ } else if (delalloc) {
+ iomap->type = IOMAP_DELALLOC;
+ iomap->addr = IOMAP_NULL_ADDR;
} else {
if (map.m_flags & EXT4_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
@@ -3501,7 +3527,6 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return -EIO;
}
iomap->addr = (u64)map.m_pblk << blkbits;
- iomap->length = (u64)map.m_len << blkbits;
}

if (map.m_flags & EXT4_MAP_NEW)
@@ -3567,8 +3592,6 @@ const struct iomap_ops ext4_iomap_ops = {
.iomap_end = ext4_iomap_end,
};

-#endif
-
static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
@@ -6132,70 +6155,3 @@ int ext4_filemap_fault(struct vm_fault *vmf)

return err;
}
-
-/*
- * Find the first extent at or after @lblk in an inode that is not a hole.
- * Search for @map_len blocks at most. The extent is returned in @result.
- *
- * The function returns 1 if we found an extent. The function returns 0 in
- * case there is no extent at or after @lblk and in that case also sets
- * @result->es_len to 0. In case of error, the error code is returned.
- */
-int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
- unsigned int map_len, struct extent_status *result)
-{
- struct ext4_map_blocks map;
- struct extent_status es = {};
- int ret;
-
- map.m_lblk = lblk;
- map.m_len = map_len;
-
- /*
- * For non-extent based files this loop may iterate several times since
- * we do not determine full hole size.
- */
- while (map.m_len > 0) {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
- /* There's extent covering m_lblk? Just return it. */
- if (ret > 0) {
- int status;
-
- ext4_es_store_pblock(result, map.m_pblk);
- result->es_lblk = map.m_lblk;
- result->es_len = map.m_len;
- if (map.m_flags & EXT4_MAP_UNWRITTEN)
- status = EXTENT_STATUS_UNWRITTEN;
- else
- status = EXTENT_STATUS_WRITTEN;
- ext4_es_store_status(result, status);
- return 1;
- }
- ext4_es_find_delayed_extent_range(inode, map.m_lblk,
- map.m_lblk + map.m_len - 1,
- &es);
- /* Is delalloc data before next block in extent tree? */
- if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
- ext4_lblk_t offset = 0;
-
- if (es.es_lblk < lblk)
- offset = lblk - es.es_lblk;
- result->es_lblk = es.es_lblk + offset;
- ext4_es_store_pblock(result,
- ext4_es_pblock(&es) + offset);
- result->es_len = es.es_len - offset;
- ext4_es_store_status(result, ext4_es_status(&es));
-
- return 1;
- }
- /* There's a hole at m_lblk, advance us after it */
- map.m_lblk += map.m_len;
- map_len -= map.m_len;
- map.m_len = map_len;
- cond_resched();
- }
- result->es_len = 0;
- return 0;
-}
--
2.13.3

2017-08-29 16:11:15

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 1/4] iomap: Switch from blkno to disk offset

On Tue, Aug 29, 2017 at 04:29:39PM +0200, Andreas Gruenbacher wrote:
> Replace iomap->blkno, the sector number, with iomap->addr, the disk
> offset in bytes. For invalid disk offsets, use the special value
> IOMAP_NULL_ADDR instead of IOMAP_NULL_BLOCK. This allows to use iomap
> for mappings which are not block aligned, such as inline data on ext4
> and stuffed inodes on gfs2.
>
> Tested on xfs and ext4 with xfstests; seems to be working correctly.
>
> Signed-off-by: Andreas Gruenbacher <[email protected]>
> ---
> fs/buffer.c | 4 ++--
> fs/dax.c | 2 +-
> fs/ext2/inode.c | 4 ++--
> fs/ext4/inode.c | 4 ++--
> fs/iomap.c | 11 +++++------
> fs/nfsd/blocklayout.c | 4 ++--
> fs/xfs/xfs_iomap.c | 6 +++---
> include/linux/iomap.h | 10 +++++-----
> 8 files changed, 22 insertions(+), 23 deletions(-)
>
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 5715dac7821f..b9d20076bb8f 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -1979,8 +1979,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
> case IOMAP_MAPPED:
> if (offset >= i_size_read(inode))
> set_buffer_new(bh);
> - bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
> - ((offset - iomap->offset) >> inode->i_blkbits);
> + bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
> + inode->i_blkbits;
> set_buffer_mapped(bh);
> break;
> }
> diff --git a/fs/dax.c b/fs/dax.c
> index 865d42c63e23..539aaa0b9c45 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -987,7 +987,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
>
> static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
> {
> - return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
> + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
> }
>
> static loff_t
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 30163d007b2f..e7d8925d0140 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -824,11 +824,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>
> if (ret == 0) {
> iomap->type = IOMAP_HOLE;
> - iomap->blkno = IOMAP_NULL_BLOCK;
> + iomap->addr = IOMAP_NULL_ADDR;
> iomap->length = 1 << blkbits;
> } else {
> iomap->type = IOMAP_MAPPED;
> - iomap->blkno = (sector_t)bno << (blkbits - 9);
> + iomap->addr = (u64)bno << blkbits;
> iomap->length = (u64)ret << blkbits;
> iomap->flags |= IOMAP_F_MERGED;
> }
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index c774bdc22759..429f0afde9f2 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3483,7 +3483,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>
> if (ret == 0) {
> iomap->type = IOMAP_HOLE;
> - iomap->blkno = IOMAP_NULL_BLOCK;
> + iomap->addr = IOMAP_NULL_ADDR;
> iomap->length = (u64)map.m_len << blkbits;
> } else {
> if (map.m_flags & EXT4_MAP_MAPPED) {
> @@ -3494,7 +3494,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> WARN_ON_ONCE(1);
> return -EIO;
> }
> - iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9);
> + iomap->addr = (u64)map.m_pblk << blkbits;
> iomap->length = (u64)map.m_len << blkbits;
> }
>
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 59cc98ad7577..7bc1797c2292 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -350,8 +350,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
> static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
> struct iomap *iomap)
> {
> - sector_t sector = iomap->blkno +
> - (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
> + sector_t sector = (iomap->addr +
> + (pos & PAGE_MASK) - iomap->offset) >> 9;
>
> return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
> offset, bytes);
> @@ -512,9 +512,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
> flags |= FIEMAP_EXTENT_SHARED;
>
> return fiemap_fill_next_extent(fi, iomap->offset,
> - iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
> + iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
> iomap->length, flags);
> -
> }
>
> static loff_t
> @@ -807,7 +806,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
> bio = bio_alloc(GFP_KERNEL, 1);
> bio->bi_bdev = iomap->bdev;
> bio->bi_iter.bi_sector =
> - iomap->blkno + ((pos - iomap->offset) >> 9);
> + (iomap->addr + pos - iomap->offset) >> 9;
> bio->bi_private = dio;
> bio->bi_end_io = iomap_dio_bio_end_io;
>
> @@ -886,7 +885,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
> bio = bio_alloc(GFP_KERNEL, nr_pages);
> bio->bi_bdev = iomap->bdev;
> bio->bi_iter.bi_sector =
> - iomap->blkno + ((pos - iomap->offset) >> 9);
> + (iomap->addr + pos - iomap->offset) >> 9;
> bio->bi_write_hint = dio->iocb->ki_hint;
> bio->bi_private = dio;
> bio->bi_end_io = iomap_dio_bio_end_io;
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> index c862c2489df0..2d1d37b27dc7 100644
> --- a/fs/nfsd/blocklayout.c
> +++ b/fs/nfsd/blocklayout.c
> @@ -65,7 +65,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
> bex->es = PNFS_BLOCK_READ_DATA;
> else
> bex->es = PNFS_BLOCK_READWRITE_DATA;
> - bex->soff = (iomap.blkno << 9);
> + bex->soff = iomap.addr;
> break;
> case IOMAP_UNWRITTEN:
> if (seg->iomode & IOMODE_RW) {
> @@ -78,7 +78,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
> }
>
> bex->es = PNFS_BLOCK_INVALID_DATA;
> - bex->soff = (iomap.blkno << 9);
> + bex->soff = iomap.addr;
> break;
> }
> /*FALLTHRU*/
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 813394c62849..1bcc5d0373a5 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -54,13 +54,13 @@ xfs_bmbt_to_iomap(
> struct xfs_mount *mp = ip->i_mount;
>
> if (imap->br_startblock == HOLESTARTBLOCK) {
> - iomap->blkno = IOMAP_NULL_BLOCK;
> + iomap->addr = IOMAP_NULL_ADDR;
> iomap->type = IOMAP_HOLE;
> } else if (imap->br_startblock == DELAYSTARTBLOCK) {
> - iomap->blkno = IOMAP_NULL_BLOCK;
> + iomap->addr = IOMAP_NULL_ADDR;
> iomap->type = IOMAP_DELALLOC;
> } else {
> - iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
> + iomap->addr = (u64)xfs_fsb_to_db(ip, imap->br_startblock) << 9;

iomap->addr = BBTOB(xfs_fsb_to_db(...));

Same machine-level transformation, but please help us maintain
consistent unit conversion code; the 'BBTOB' name makes the units
analysis of a given piece of code more straightforward.

With that fixed,
Reviewed-by: Darrick J. Wong <[email protected]>

for the iomap and XFS parts.

--D

> if (imap->br_state == XFS_EXT_UNWRITTEN)
> iomap->type = IOMAP_UNWRITTEN;
> else
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index f64dc6ce5161..7b8a615fa021 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -15,8 +15,8 @@ struct vm_fault;
> */
> #define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
> #define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
> -#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
> -#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
> +#define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */
> +#define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */
>
> /*
> * Flags for all iomap mappings:
> @@ -30,12 +30,12 @@ struct vm_fault;
> #define IOMAP_F_SHARED 0x20 /* block shared with another file */
>
> /*
> - * Magic value for blkno:
> + * Magic value for addr:
> */
> -#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
> +#define IOMAP_NULL_ADDR -1ULL /* addr is not valid */
>
> struct iomap {
> - sector_t blkno; /* 1st sector of mapping, 512b units */
> + u64 addr; /* disk offset of mapping, bytes */
> loff_t offset; /* file offset of mapping, bytes */
> u64 length; /* length of mapping, bytes */
> u16 type; /* type of mapping */
> --
> 2.13.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2017-08-30 12:47:24

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 3/4] ext4: Add iomap support for inline data

On Tue 29-08-17 16:29:41, Andreas Gruenbacher wrote:
> Report inline data as an IOMAP_F_DATA_INLINE mapping. This allows to
> switch to iomap_seek_hole and iomap_seek_data in ext4_llseek, and will
> make switching to iomap_fiemap in ext4_fiemap easier as well.
>
> Signed-off-by: Andreas Gruenbacher <[email protected]>

Just one nit and one comment below. Feel free to add:

Reviewed-by: Jan Kara <[email protected]>


> ---
> fs/ext4/ext4.h | 4 ++++
> fs/ext4/inline.c | 33 +++++++++++++++++++++++++++++++++
> fs/ext4/inode.c | 10 ++++++++--
> 3 files changed, 45 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index a2bb7d2870e4..017e55942a49 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3048,6 +3048,10 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
> extern int ext4_inline_data_fiemap(struct inode *inode,
> struct fiemap_extent_info *fieinfo,
> int *has_inline, __u64 start, __u64 len);
> +
> +struct iomap;
> +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);
> +
> extern int ext4_try_to_evict_inline_data(handle_t *handle,
> struct inode *inode,
> int needed);
> diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
> index 28c5c3abddb3..21a078fef80f 100644
> --- a/fs/ext4/inline.c
> +++ b/fs/ext4/inline.c
> @@ -12,6 +12,7 @@
> * GNU General Public License for more details.
> */
>
> +#include <linux/iomap.h>
> #include <linux/fiemap.h>
>
> #include "ext4_jbd2.h"
> @@ -1827,6 +1828,38 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
> return ret;
> }
>
> +int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap)
> +{
> + __u64 addr;
> + int error = -ENOENT;
> + struct ext4_iloc iloc;
> +
> + down_read(&EXT4_I(inode)->xattr_sem);
> + if (!ext4_has_inline_data(inode))
> + goto out;

Hum, ENOENT looks rather arbitrary for a lost race with inode conversion.
Maybe EAGAIN would be better? We use it in some other place as well to
indicate that inode has been converted...

> +
> + error = ext4_get_inode_loc(inode, &iloc);
> + if (error)
> + goto out;

Using ext4_get_inode_loc() just to get block + offset of the inode is a bit
overkill since it will also allocate buffer_head, load the block with inode
from disk, etc. But since this is not performance critical and the buffer
is likely to be in cache anyway, I can live with this I guess.

Honza

--
Jan Kara <[email protected]>
SUSE Labs, CR

2017-08-30 13:03:33

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 3/4] ext4: Add iomap support for inline data

On Tue 29-08-17 16:29:41, Andreas Gruenbacher wrote:
> Report inline data as an IOMAP_F_DATA_INLINE mapping. This allows to
> switch to iomap_seek_hole and iomap_seek_data in ext4_llseek, and will
> make switching to iomap_fiemap in ext4_fiemap easier as well.
>
> Signed-off-by: Andreas Gruenbacher <[email protected]>

...

> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 429f0afde9f2..ab6ab835255e 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3411,8 +3411,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> struct ext4_map_blocks map;
> int ret;
>
> - if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
> - return -ERANGE;
> + if ((flags & IOMAP_REPORT) && ext4_has_inline_data(inode)) {
> + ret = ext4_inline_data_iomap(inode, iomap);
> + if (ret != -ENOENT) {
> + if (ret == 0 && offset >= iomap->length)
> + ret = -ENOENT;
> + return ret;
> + }
> + }

One more thing: Please keep that WARN_ON and bail out for !IOMAP_REPORT
cases. Thanks!

Honza

--
Jan Kara <[email protected]>
SUSE Labs, CR

2017-08-30 14:59:21

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 4/4] ext4: Switch to iomap for SEEK_HOLE / SEEK_DATA

On Tue 29-08-17 16:29:42, Andreas Gruenbacher wrote:
> From: Christoph Hellwig <[email protected]>
>
> Switch to the iomap_seek_hole and iomap_seek_data helpers for
> implementing lseek SEEK_HOLE / SEEK_DATA, and remove all the code that
> isn't needed any more.
>
> Note that with this patch, ext4 will now always depend on the iomap code
> instead of only when CONFIG_DAX is enabled, and it requires adding a
> call into the extent status tree for iomap_begin as well to properly
> deal with delalloc extents.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> [Minor fixes and cleanups by Andreas]
> Signed-off-by: Andreas Gruenbacher <[email protected]>

...

> @@ -3425,6 +3425,29 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>
> if (!(flags & IOMAP_WRITE)) {
> ret = ext4_map_blocks(NULL, inode, &map, 0);
> + if (ret < 0)
> + return ret;

One general question about IOMAP_REPORT: When there is unwritten extent, we
use page_cache_seek_hole_data() to determine what is actually a hole and
what is data. We could use exactly the same function to determine what is a
hole and what is data in an IOMAP_HOLE extent, couldn't we? Now I
understand that a filesystem is supposed to return IOMAP_DELALLOC extent if
there is delayed allocation data covering queried offset so probably it's
not a great idea to use that however it still seems a bit like an
unnecessary duplication...

> + if (!ret) {

Please go to this branch only for IOMAP_REPORT case to avoid unnecessary
overhead for DAX mappings...

> + struct extent_status es = {};
> +
> + ext4_es_find_delayed_extent_range(inode, map.m_lblk,
> + map.m_lblk + map.m_len - 1, &es);
> + /* Is delalloc data before next block in extent tree? */
> + if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {

And this is still wrong - I have actually verified that with attached patch
that disables caching of extents (so it is equivalent to *very* aggresive
slab reclaim happening). With that patch applied you'll get:

kvm0:~ # rm /mnt/file; xfs_io -f -c "pwrite 4096 4096" -c "seek -a 0"
/mnt/file
wrote 4096/4096 bytes at offset 4096
4 KiB, 1 ops; 0.0000 sec (16.693 MiB/sec and 4273.5043 ops/sec)
Whence Result
DATA 0
HOLE 8192

Which is obviously wrong - hole should be 0, data should be 4096.

And the reason why this is wrong is that when we are asked to map things at
first_block and there is a hole at that offset, we need to just truncate
the hole extent returned by ext4_map_blocks() by possibly following
delalloc allocation. But we still need to return that there *is a hole* at
first_block. But your patch seems to try to return the delalloc extent
instead which is wrong.

> + ext4_lblk_t offs = 0;
> +
> + if (es.es_lblk < map.m_lblk)
> + offs = map.m_lblk - es.es_lblk;
> + map.m_lblk = es.es_lblk + offs;
> + map.m_pblk = ext4_es_pblock(&es) + offs;
> + map.m_len = es.es_len - offs;
> + if (ext4_es_is_unwritten(&es))
> + map.m_flags |= EXT4_MAP_UNWRITTEN;
> + if (ext4_es_is_delayed(&es))
> + delalloc = true;
> + ret = 1;
> + }
> + }
> } else {
> int dio_credits;
> handle_t *handle;

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR


Attachments:
(No filename) (3.14 kB)
extent_status_disable.diff (467.00 B)
Download all attachments

2017-08-30 15:00:03

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 1/4] iomap: Switch from blkno to disk offset

On Tue 29-08-17 16:29:39, Andreas Gruenbacher wrote:
> Replace iomap->blkno, the sector number, with iomap->addr, the disk
> offset in bytes. For invalid disk offsets, use the special value
> IOMAP_NULL_ADDR instead of IOMAP_NULL_BLOCK. This allows to use iomap
> for mappings which are not block aligned, such as inline data on ext4
> and stuffed inodes on gfs2.
>
> Tested on xfs and ext4 with xfstests; seems to be working correctly.
>
> Signed-off-by: Andreas Gruenbacher <[email protected]>

The patch looks good to me. You can add:

Reviewed-by: Jan Kara <[email protected]>

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2017-08-30 15:04:00

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 2/4] iomap: Add IOMAP_F_DATA_INLINE flag

On Tue 29-08-17 16:29:40, Andreas Gruenbacher wrote:
> Add a new IOMAP_F_DATA_INLINE flag to indicate that a mapping is in a
> disk area that contains data as well as metadata. In iomap_fiemap, map
> this flag to FIEMAP_EXTENT_DATA_INLINE.
>
> Signed-off-by: Andreas Gruenbacher <[email protected]>

Looks good to me. You can add:

Reviewed-by: Jan Kara <[email protected]>

Honza

> ---
> fs/iomap.c | 2 ++
> include/linux/iomap.h | 5 +++--
> 2 files changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 7bc1797c2292..f0a263482a0e 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -510,6 +510,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
> flags |= FIEMAP_EXTENT_MERGED;
> if (iomap->flags & IOMAP_F_SHARED)
> flags |= FIEMAP_EXTENT_SHARED;
> + if (iomap->flags & IOMAP_F_DATA_INLINE)
> + flags |= FIEMAP_EXTENT_DATA_INLINE;
>
> return fiemap_fill_next_extent(fi, iomap->offset,
> iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 7b8a615fa021..2b0790dbd6ea 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -26,8 +26,9 @@ struct vm_fault;
> /*
> * Flags that only need to be reported for IOMAP_REPORT requests:
> */
> -#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */
> -#define IOMAP_F_SHARED 0x20 /* block shared with another file */
> +#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */
> +#define IOMAP_F_SHARED 0x20 /* block shared with another file */
> +#define IOMAP_F_DATA_INLINE 0x40 /* data inline in the inode */
>
> /*
> * Magic value for addr:
> --
> 2.13.3
>
--
Jan Kara <[email protected]>
SUSE Labs, CR

2017-08-31 21:32:35

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 1/4] iomap: Switch from blkno to disk offset

On Tue, Aug 29, 2017 at 09:11:15AM -0700, Darrick J. Wong wrote:
> On Tue, Aug 29, 2017 at 04:29:39PM +0200, Andreas Gruenbacher wrote:
> > Replace iomap->blkno, the sector number, with iomap->addr, the disk
> > offset in bytes. For invalid disk offsets, use the special value
> > IOMAP_NULL_ADDR instead of IOMAP_NULL_BLOCK. This allows to use iomap
> > for mappings which are not block aligned, such as inline data on ext4
> > and stuffed inodes on gfs2.
> >
> > Tested on xfs and ext4 with xfstests; seems to be working correctly.
> >
> > Signed-off-by: Andreas Gruenbacher <[email protected]>
> > ---
> > fs/buffer.c | 4 ++--
> > fs/dax.c | 2 +-
> > fs/ext2/inode.c | 4 ++--
> > fs/ext4/inode.c | 4 ++--
> > fs/iomap.c | 11 +++++------
> > fs/nfsd/blocklayout.c | 4 ++--
> > fs/xfs/xfs_iomap.c | 6 +++---
> > include/linux/iomap.h | 10 +++++-----
> > 8 files changed, 22 insertions(+), 23 deletions(-)
> >
> > diff --git a/fs/buffer.c b/fs/buffer.c
> > index 5715dac7821f..b9d20076bb8f 100644
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -1979,8 +1979,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
> > case IOMAP_MAPPED:
> > if (offset >= i_size_read(inode))
> > set_buffer_new(bh);
> > - bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
> > - ((offset - iomap->offset) >> inode->i_blkbits);
> > + bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
> > + inode->i_blkbits;
> > set_buffer_mapped(bh);
> > break;
> > }
> > diff --git a/fs/dax.c b/fs/dax.c
> > index 865d42c63e23..539aaa0b9c45 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -987,7 +987,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
> >
> > static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
> > {
> > - return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
> > + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
> > }
> >
> > static loff_t
> > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> > index 30163d007b2f..e7d8925d0140 100644
> > --- a/fs/ext2/inode.c
> > +++ b/fs/ext2/inode.c
> > @@ -824,11 +824,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> >
> > if (ret == 0) {
> > iomap->type = IOMAP_HOLE;
> > - iomap->blkno = IOMAP_NULL_BLOCK;
> > + iomap->addr = IOMAP_NULL_ADDR;
> > iomap->length = 1 << blkbits;
> > } else {
> > iomap->type = IOMAP_MAPPED;
> > - iomap->blkno = (sector_t)bno << (blkbits - 9);
> > + iomap->addr = (u64)bno << blkbits;
> > iomap->length = (u64)ret << blkbits;
> > iomap->flags |= IOMAP_F_MERGED;
> > }
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index c774bdc22759..429f0afde9f2 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -3483,7 +3483,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> >
> > if (ret == 0) {
> > iomap->type = IOMAP_HOLE;
> > - iomap->blkno = IOMAP_NULL_BLOCK;
> > + iomap->addr = IOMAP_NULL_ADDR;
> > iomap->length = (u64)map.m_len << blkbits;
> > } else {
> > if (map.m_flags & EXT4_MAP_MAPPED) {
> > @@ -3494,7 +3494,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > WARN_ON_ONCE(1);
> > return -EIO;
> > }
> > - iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9);
> > + iomap->addr = (u64)map.m_pblk << blkbits;
> > iomap->length = (u64)map.m_len << blkbits;
> > }
> >
> > diff --git a/fs/iomap.c b/fs/iomap.c
> > index 59cc98ad7577..7bc1797c2292 100644
> > --- a/fs/iomap.c
> > +++ b/fs/iomap.c
> > @@ -350,8 +350,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
> > static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
> > struct iomap *iomap)
> > {
> > - sector_t sector = iomap->blkno +
> > - (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
> > + sector_t sector = (iomap->addr +
> > + (pos & PAGE_MASK) - iomap->offset) >> 9;
> >
> > return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
> > offset, bytes);
> > @@ -512,9 +512,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
> > flags |= FIEMAP_EXTENT_SHARED;
> >
> > return fiemap_fill_next_extent(fi, iomap->offset,
> > - iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
> > + iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
> > iomap->length, flags);
> > -
> > }
> >
> > static loff_t
> > @@ -807,7 +806,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
> > bio = bio_alloc(GFP_KERNEL, 1);
> > bio->bi_bdev = iomap->bdev;
> > bio->bi_iter.bi_sector =
> > - iomap->blkno + ((pos - iomap->offset) >> 9);
> > + (iomap->addr + pos - iomap->offset) >> 9;
> > bio->bi_private = dio;
> > bio->bi_end_io = iomap_dio_bio_end_io;
> >
> > @@ -886,7 +885,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
> > bio = bio_alloc(GFP_KERNEL, nr_pages);
> > bio->bi_bdev = iomap->bdev;
> > bio->bi_iter.bi_sector =
> > - iomap->blkno + ((pos - iomap->offset) >> 9);
> > + (iomap->addr + pos - iomap->offset) >> 9;
> > bio->bi_write_hint = dio->iocb->ki_hint;
> > bio->bi_private = dio;
> > bio->bi_end_io = iomap_dio_bio_end_io;
> > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> > index c862c2489df0..2d1d37b27dc7 100644
> > --- a/fs/nfsd/blocklayout.c
> > +++ b/fs/nfsd/blocklayout.c
> > @@ -65,7 +65,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
> > bex->es = PNFS_BLOCK_READ_DATA;
> > else
> > bex->es = PNFS_BLOCK_READWRITE_DATA;
> > - bex->soff = (iomap.blkno << 9);
> > + bex->soff = iomap.addr;
> > break;
> > case IOMAP_UNWRITTEN:
> > if (seg->iomode & IOMODE_RW) {
> > @@ -78,7 +78,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
> > }
> >
> > bex->es = PNFS_BLOCK_INVALID_DATA;
> > - bex->soff = (iomap.blkno << 9);
> > + bex->soff = iomap.addr;
> > break;
> > }
> > /*FALLTHRU*/
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index 813394c62849..1bcc5d0373a5 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> > @@ -54,13 +54,13 @@ xfs_bmbt_to_iomap(
> > struct xfs_mount *mp = ip->i_mount;
> >
> > if (imap->br_startblock == HOLESTARTBLOCK) {
> > - iomap->blkno = IOMAP_NULL_BLOCK;
> > + iomap->addr = IOMAP_NULL_ADDR;
> > iomap->type = IOMAP_HOLE;
> > } else if (imap->br_startblock == DELAYSTARTBLOCK) {
> > - iomap->blkno = IOMAP_NULL_BLOCK;
> > + iomap->addr = IOMAP_NULL_ADDR;
> > iomap->type = IOMAP_DELALLOC;
> > } else {
> > - iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
> > + iomap->addr = (u64)xfs_fsb_to_db(ip, imap->br_startblock) << 9;
>
> iomap->addr = BBTOB(xfs_fsb_to_db(...));
>
> Same machine-level transformation, but please help us maintain
> consistent unit conversion code; the 'BBTOB' name makes the units
> analysis of a given piece of code more straightforward.
>
> With that fixed,
> Reviewed-by: Darrick J. Wong <[email protected]>
>
> for the iomap and XFS parts.

FWIW I still want the BBTOB thing fixed in whatever patch lands
upstream, but I tested the first two patches in this series and nothing
blew up so I guess everything works ok.

--D

>
> --D
>
> > if (imap->br_state == XFS_EXT_UNWRITTEN)
> > iomap->type = IOMAP_UNWRITTEN;
> > else
> > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > index f64dc6ce5161..7b8a615fa021 100644
> > --- a/include/linux/iomap.h
> > +++ b/include/linux/iomap.h
> > @@ -15,8 +15,8 @@ struct vm_fault;
> > */
> > #define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
> > #define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
> > -#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
> > -#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
> > +#define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */
> > +#define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */
> >
> > /*
> > * Flags for all iomap mappings:
> > @@ -30,12 +30,12 @@ struct vm_fault;
> > #define IOMAP_F_SHARED 0x20 /* block shared with another file */
> >
> > /*
> > - * Magic value for blkno:
> > + * Magic value for addr:
> > */
> > -#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
> > +#define IOMAP_NULL_ADDR -1ULL /* addr is not valid */
> >
> > struct iomap {
> > - sector_t blkno; /* 1st sector of mapping, 512b units */
> > + u64 addr; /* disk offset of mapping, bytes */
> > loff_t offset; /* file offset of mapping, bytes */
> > u64 length; /* length of mapping, bytes */
> > u16 type; /* type of mapping */
> > --
> > 2.13.3
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html

2017-08-31 21:37:45

by Andreas Grünbacher

[permalink] [raw]
Subject: Re: [PATCH 1/4] iomap: Switch from blkno to disk offset

2017-08-31 23:32 GMT+02:00 Darrick J. Wong <[email protected]>:
> FWIW I still want the BBTOB thing fixed in whatever patch lands
> upstream, but I tested the first two patches in this series and nothing
> blew up so I guess everything works ok.

Sure, I'll repost the series once I have the ext4 part working
properly. Thanks for the review and testing.

Andreas

2017-09-14 09:17:24

by Andreas Gruenbacher

[permalink] [raw]
Subject: Re: [PATCH 4/4] ext4: Switch to iomap for SEEK_HOLE / SEEK_DATA

Jan,

On Wed, Aug 30, 2017 at 4:59 PM, Jan Kara <[email protected]> wrote:
> On Tue 29-08-17 16:29:42, Andreas Gruenbacher wrote:
>> From: Christoph Hellwig <[email protected]>
>>
>> Switch to the iomap_seek_hole and iomap_seek_data helpers for
>> implementing lseek SEEK_HOLE / SEEK_DATA, and remove all the code that
>> isn't needed any more.
>>
>> Note that with this patch, ext4 will now always depend on the iomap code
>> instead of only when CONFIG_DAX is enabled, and it requires adding a
>> call into the extent status tree for iomap_begin as well to properly
>> deal with delalloc extents.
>>
>> Signed-off-by: Christoph Hellwig <[email protected]>
>> [Minor fixes and cleanups by Andreas]
>> Signed-off-by: Andreas Gruenbacher <[email protected]>
>
> ...
>
>> @@ -3425,6 +3425,29 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>>
>> if (!(flags & IOMAP_WRITE)) {
>> ret = ext4_map_blocks(NULL, inode, &map, 0);
>> + if (ret < 0)
>> + return ret;
>
> One general question about IOMAP_REPORT: When there is unwritten extent, we
> use page_cache_seek_hole_data() to determine what is actually a hole and
> what is data. We could use exactly the same function to determine what is a
> hole and what is data in an IOMAP_HOLE extent, couldn't we? Now I
> understand that a filesystem is supposed to return IOMAP_DELALLOC extent if
> there is delayed allocation data covering queried offset so probably it's
> not a great idea to use that however it still seems a bit like an
> unnecessary duplication...

There is no point in scanning the page cache on filesystems that don't
support delayed allocation, so it does make sense to distinguish the
two types of mappings.

>> + if (!ret) {
>
> Please go to this branch only for IOMAP_REPORT case to avoid unnecessary
> overhead for DAX mappings...
>
>> + struct extent_status es = {};
>> +
>> + ext4_es_find_delayed_extent_range(inode, map.m_lblk,
>> + map.m_lblk + map.m_len - 1, &es);
>> + /* Is delalloc data before next block in extent tree? */
>> + if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
>
> And this is still wrong - I have actually verified that with attached patch
> that disables caching of extents (so it is equivalent to *very* aggresive
> slab reclaim happening). With that patch applied you'll get:
>
> kvm0:~ # rm /mnt/file; xfs_io -f -c "pwrite 4096 4096" -c "seek -a 0"
> /mnt/file
> wrote 4096/4096 bytes at offset 4096
> 4 KiB, 1 ops; 0.0000 sec (16.693 MiB/sec and 4273.5043 ops/sec)
> Whence Result
> DATA 0
> HOLE 8192
>
> Which is obviously wrong - hole should be 0, data should be 4096.
>
> And the reason why this is wrong is that when we are asked to map things at
> first_block and there is a hole at that offset, we need to just truncate
> the hole extent returned by ext4_map_blocks() by possibly following
> delalloc allocation. But we still need to return that there *is a hole* at
> first_block. But your patch seems to try to return the delalloc extent
> instead which is wrong.

Got it, thanks for the debug patch. I'll send a fixed version of the series.

>> + ext4_lblk_t offs = 0;
>> +
>> + if (es.es_lblk < map.m_lblk)
>> + offs = map.m_lblk - es.es_lblk;
>> + map.m_lblk = es.es_lblk + offs;
>> + map.m_pblk = ext4_es_pblock(&es) + offs;
>> + map.m_len = es.es_len - offs;
>> + if (ext4_es_is_unwritten(&es))
>> + map.m_flags |= EXT4_MAP_UNWRITTEN;
>> + if (ext4_es_is_delayed(&es))
>> + delalloc = true;
>> + ret = 1;
>> + }
>> + }
>> } else {
>> int dio_credits;
>> handle_t *handle;

Thanks,
Andreas