2009-01-07 13:12:54

by Ryusuke Konishi

[permalink] [raw]
Subject: [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite

Chris Mason told me that page_mkwrite() has some works to do.

On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> nilfs_page_mkwrite doesn't seem to dirty the page? block_page_mkwrite
> does more, including checks against i_size and others.

This will insert i_size check, and hole block allocation code in the
nilfs_page_mkwrite.

Previously, the hole block allocation was delayed until just before
writing for mmapped pages. This accompanies removal of the delayed
allocation code.

Cc: Chris Mason <[email protected]>
Signed-off-by: Ryusuke Konishi <[email protected]>
---
fs/nilfs2/file.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
fs/nilfs2/segment.c | 42 ++++--------------------------------
2 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7ddd42e..146dc23 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -75,11 +75,66 @@ nilfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,

static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
- if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
- return -EPERM;
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct nilfs_transaction_info ti;
+ struct buffer_head *bh, *head;
+ int fully_mapped = 1;
+ int ret = -EINVAL;
+
+ /*
+ * use i_alloc_sem to stop truncate operations on the inode
+ */
+ down_read(&inode->i_alloc_sem);
+
+ if (page->mapping != mapping ||
+ page_offset(page) >= i_size_read(inode) || !PageUptodate(page))
+ goto failed;
+
+ if (PageMappedToDisk(page))
+ goto mapped;
+
+ spin_lock(&mapping->private_lock); /* exclude try_to_free_buffers() */
+ if (page_has_buffers(page)) {
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_mapped(bh)) {
+ fully_mapped = 0;
+ break;
+ }
+ } while (bh = bh->b_this_page, bh != head);
+
+ if (fully_mapped) {
+ spin_unlock(&mapping->private_lock);
+ SetPageMappedToDisk(page);
+ goto mapped;
+ }
+ }
+ spin_unlock(&mapping->private_lock);
+
+ /*
+ * Fill hole blocks. This accompanies a disk capacity check.
+ */
+ ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
+ if (unlikely(ret))
+ goto failed;
+
+ ret = block_page_mkwrite(vma, page, nilfs_get_block);
+ if (unlikely(ret)) {
+ nilfs_transaction_abort(inode->i_sb);
+ goto failed;
+ }
+ nilfs_transaction_commit(inode->i_sb);
+
+ mapped:
+ up_read(&inode->i_alloc_sem);
SetPageChecked(page);
wait_on_page_writeback(page);
return 0;
+
+ failed:
+ up_read(&inode->i_alloc_sem);
+ return ret;
}

struct vm_operations_struct nilfs_file_vm_ops = {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 182fb26..b5d28d2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -667,42 +667,6 @@ struct nilfs_sc_operations nilfs_sc_dsync_ops = {
.write_node_binfo = NULL,
};

-static int nilfs_prepare_data_page(struct inode *inode, struct page *page)
-{
- int err = 0;
-
- lock_page(page);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
- if (!PageMappedToDisk(page)) {
- struct buffer_head *bh, *head;
- sector_t blkoff
- = page->index << (PAGE_SHIFT - inode->i_blkbits);
-
- int non_mapped = 0;
-
- bh = head = page_buffers(page);
- do {
- if (!buffer_mapped(bh)) {
- if (!buffer_dirty(bh)) {
- non_mapped++;
- continue;
- }
- err = nilfs_get_block(inode, blkoff, bh, 1);
- if (unlikely(err))
- goto out_unlock;
- }
- } while (blkoff++, (bh = bh->b_this_page) != head);
- if (!non_mapped)
- SetPageMappedToDisk(page);
- }
-
- out_unlock:
- unlock_page(page);
- return err;
-}
-
static int nilfs_lookup_dirty_data_buffers(struct inode *inode,
struct list_head *listp,
struct nilfs_sc_info *sci)
@@ -727,7 +691,11 @@ static int nilfs_lookup_dirty_data_buffers(struct inode *inode,
struct page *page = pvec.pages[i];

if (mapping->host) {
- err = nilfs_prepare_data_page(inode, page);
+ lock_page(page);
+ if (!page_has_buffers(page))
+ create_empty_buffers(page,
+ 1 << inode->i_blkbits, 0);
+ unlock_page(page);
if (unlikely(err))
break;
}
--
1.5.6.5


2009-01-07 14:43:30

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite

On Wed, 2009-01-07 at 22:06 +0900, Ryusuke Konishi wrote:
> Chris Mason told me that page_mkwrite() has some works to do.
>
> On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> > nilfs_page_mkwrite doesn't seem to dirty the page? block_page_mkwrite
> > does more, including checks against i_size and others.
>
> This will insert i_size check, and hole block allocation code in the
> nilfs_page_mkwrite.
>
> Previously, the hole block allocation was delayed until just before
> writing for mmapped pages. This accompanies removal of the delayed
> allocation code.
>
> Cc: Chris Mason <[email protected]>
> Signed-off-by: Ryusuke Konishi <[email protected]>
> ---
> fs/nilfs2/file.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
> fs/nilfs2/segment.c | 42 ++++--------------------------------
> 2 files changed, 62 insertions(+), 39 deletions(-)
>
> diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
> index 7ddd42e..146dc23 100644
> --- a/fs/nilfs2/file.c
> +++ b/fs/nilfs2/file.c
> @@ -75,11 +75,66 @@ nilfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
>
> static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> {
> - if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
> - return -EPERM;
> + struct inode *inode = vma->vm_file->f_dentry->d_inode;
> + struct address_space *mapping = inode->i_mapping;
> + struct nilfs_transaction_info ti;
> + struct buffer_head *bh, *head;
> + int fully_mapped = 1;
> + int ret = -EINVAL;
> +
> + /*
> + * use i_alloc_sem to stop truncate operations on the inode
> + */
> + down_read(&inode->i_alloc_sem);

I'm not 100% sure this is safe. It seems likely the direct io paths
could trigger a mkwrite with the i_alloc_sem already held?

If I'm reading the code correctly you're:

* grab i_alloc sem to protect against truncate
* check to see if the page is mapped already (no holes)

* if there are holes, lock the page, start transaction and do the full
block_page_mkwrite

* drop i_alloc_sem

Would this work instead?:

* lock the page, check to see if it is mapped

* if there are holes, drop the lock, start transaction and do the full
block_page_mkwrite

-chris

2009-01-07 17:38:40

by Ryusuke Konishi

[permalink] [raw]
Subject: Re: [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite

On Wed, 07 Jan 2009 09:43:02 -0500, Chris Mason wrote:
> On Wed, 2009-01-07 at 22:06 +0900, Ryusuke Konishi wrote:
> > Chris Mason told me that page_mkwrite() has some works to do.
> >
> > On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> > > nilfs_page_mkwrite doesn't seem to dirty the page? block_page_mkwrite
> > > does more, including checks against i_size and others.
> >
> > This will insert i_size check, and hole block allocation code in the
> > nilfs_page_mkwrite.
> >
> > Previously, the hole block allocation was delayed until just before
> > writing for mmapped pages. This accompanies removal of the delayed
> > allocation code.
<snip>
> > static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> > {
> > - if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
> > - return -EPERM;
> > + struct inode *inode = vma->vm_file->f_dentry->d_inode;
> > + struct address_space *mapping = inode->i_mapping;
> > + struct nilfs_transaction_info ti;
> > + struct buffer_head *bh, *head;
> > + int fully_mapped = 1;
> > + int ret = -EINVAL;
> > +
> > + /*
> > + * use i_alloc_sem to stop truncate operations on the inode
> > + */
> > + down_read(&inode->i_alloc_sem);
>
> I'm not 100% sure this is safe. It seems likely the direct io paths
> could trigger a mkwrite with the i_alloc_sem already held?

vmtruncate() can change i_size outside the page lock, and it even
calls unmap_mapping_range(). Does it have any problems?

> If I'm reading the code correctly you're:
>
> * grab i_alloc sem to protect against truncate
> * check to see if the page is mapped already (no holes)
>
> * if there are holes, lock the page, start transaction and do the full
> block_page_mkwrite
>
> * drop i_alloc_sem

Yes, it is.

> Would this work instead?:
>
> * lock the page, check to see if it is mapped
>
> * if there are holes, drop the lock, start transaction and do the full
> block_page_mkwrite
>
> -chris

If we don't need care for vmtruncate(), I think it's better, too.

Regards,
Ryusuke

2009-01-07 17:47:54

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite

On Thu, 2009-01-08 at 02:33 +0900, Ryusuke Konishi wrote:
> On Wed, 07 Jan 2009 09:43:02 -0500, Chris Mason wrote:
> > On Wed, 2009-01-07 at 22:06 +0900, Ryusuke Konishi wrote:
> > > Chris Mason told me that page_mkwrite() has some works to do.
> > >
> > > On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> > > > nilfs_page_mkwrite doesn't seem to dirty the page? block_page_mkwrite
> > > > does more, including checks against i_size and others.
> > >
> > > This will insert i_size check, and hole block allocation code in the
> > > nilfs_page_mkwrite.
> > >
> > > Previously, the hole block allocation was delayed until just before
> > > writing for mmapped pages. This accompanies removal of the delayed
> > > allocation code.
> <snip>
> > > static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> > > {
> > > - if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
> > > - return -EPERM;
> > > + struct inode *inode = vma->vm_file->f_dentry->d_inode;
> > > + struct address_space *mapping = inode->i_mapping;
> > > + struct nilfs_transaction_info ti;
> > > + struct buffer_head *bh, *head;
> > > + int fully_mapped = 1;
> > > + int ret = -EINVAL;
> > > +
> > > + /*
> > > + * use i_alloc_sem to stop truncate operations on the inode
> > > + */
> > > + down_read(&inode->i_alloc_sem);
> >
> > I'm not 100% sure this is safe. It seems likely the direct io paths
> > could trigger a mkwrite with the i_alloc_sem already held?
>
> vmtruncate() can change i_size outside the page lock, and it even
> calls unmap_mapping_range(). Does it have any problems?
>

You're right, i_size can change without you knowing about it, but before
vmtruncate does anything to the page itself, it locks the page.

So, we tend to do a somewhat racey check against i_size. If the page is
outside i_size, we happily ignore it. If it is inside i_size, we assume
it is part of the file. Most of the code that does this checks i_size
once after locking the page and saves that value, so if i_size changes
later on the code at least does consistent operations based on a single
value of i_size.

If the page straddles i_size and someone extends the file, they are
expected to lock the page as well (see block_truncate_page and the
places it gets called).

-chris

2009-01-08 08:04:21

by Ryusuke Konishi

[permalink] [raw]
Subject: Re: [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite

On Wed, 07 Jan 2009 12:47:22 -0500, Chris Mason wrote:
> On Thu, 2009-01-08 at 02:33 +0900, Ryusuke Konishi wrote:
> > On Wed, 07 Jan 2009 09:43:02 -0500, Chris Mason wrote:
> > > On Wed, 2009-01-07 at 22:06 +0900, Ryusuke Konishi wrote:
> > > > Chris Mason told me that page_mkwrite() has some works to do.
> > > >
> > > > On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> > > > > nilfs_page_mkwrite doesn't seem to dirty the page? block_page_mkwrite
> > > > > does more, including checks against i_size and others.
> > > >
> > > > This will insert i_size check, and hole block allocation code in the
> > > > nilfs_page_mkwrite.
> > > >
> > > > Previously, the hole block allocation was delayed until just before
> > > > writing for mmapped pages. This accompanies removal of the delayed
> > > > allocation code.
> > <snip>
> > > > + /*
> > > > + * use i_alloc_sem to stop truncate operations on the inode
> > > > + */
> > > > + down_read(&inode->i_alloc_sem);
> > >
> > > I'm not 100% sure this is safe. It seems likely the direct io paths
> > > could trigger a mkwrite with the i_alloc_sem already held?
> >
> > vmtruncate() can change i_size outside the page lock, and it even
> > calls unmap_mapping_range(). Does it have any problems?
>
> You're right, i_size can change without you knowing about it, but before
> vmtruncate does anything to the page itself, it locks the page.
>
> So, we tend to do a somewhat racey check against i_size. If the page is
> outside i_size, we happily ignore it. If it is inside i_size, we assume
> it is part of the file. Most of the code that does this checks i_size
> once after locking the page and saves that value, so if i_size changes
> later on the code at least does consistent operations based on a single
> value of i_size.
>
> If the page straddles i_size and someone extends the file, they are
> expected to lock the page as well (see block_truncate_page and the
> places it gets called).
>
> -chris

I got it, using page locking seems OK even for vmtruncate().

I don't know if the mkwrite is actually called from the direct io
path, but no doubt about i_alloc_sem is used in it. To make this
safer and avoid confusion, I took your advice.

The revised patch is attached below.

Ryusuke
--
nilfs2: insert checks and hole block allocation in page_mkwrite (rev 2)

Chris Mason told me that page_mkwrite() has some works to do.

On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> nilfs_page_mkwrite doesn't seem to dirty the page? block_page_mkwrite
> does more, including checks against i_size and others.

This will insert i_size check, a disk capacity check, and hole block
allocation code in the nilfs_page_mkwrite.

Previously, the hole block allocation was delayed until just before
writing for mmapped pages. This accompanies removal of the delayed
allocation code.

This revision uses a page lock instead of i_alloc_sem for safety.

Cc: Chris Mason <[email protected]>
Signed-off-by: Ryusuke Konishi <[email protected]>
---
fs/nilfs2/file.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++-
fs/nilfs2/segment.c | 44 ++++----------------------------------
2 files changed, 61 insertions(+), 41 deletions(-)

diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7ddd42e..c22f97c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -75,8 +75,62 @@ nilfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,

static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
- if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
- return -EPERM;
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct nilfs_transaction_info ti;
+ int ret;
+
+ if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
+ return -ENOSPC;
+
+ lock_page(page);
+ if (page->mapping != inode->i_mapping ||
+ page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
+ unlock_page(page);
+ return -EINVAL;
+ }
+
+ /*
+ * check to see if the page is mapped already (no holes)
+ */
+ if (PageMappedToDisk(page)) {
+ unlock_page(page);
+ goto mapped;
+ }
+ if (page_has_buffers(page)) {
+ struct buffer_head *bh, *head;
+ int fully_mapped = 1;
+
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_mapped(bh)) {
+ fully_mapped = 0;
+ break;
+ }
+ } while (bh = bh->b_this_page, bh != head);
+
+ if (fully_mapped) {
+ SetPageMappedToDisk(page);
+ unlock_page(page);
+ goto mapped;
+ }
+ }
+ unlock_page(page);
+
+ /*
+ * fill hole blocks
+ */
+ ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
+ if (unlikely(ret))
+ return ret;
+
+ ret = block_page_mkwrite(vma, page, nilfs_get_block);
+ if (unlikely(ret)) {
+ nilfs_transaction_abort(inode->i_sb);
+ return ret;
+ }
+ nilfs_transaction_commit(inode->i_sb);
+
+ mapped:
SetPageChecked(page);
wait_on_page_writeback(page);
return 0;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 182fb26..b4b96ac 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -667,42 +667,6 @@ struct nilfs_sc_operations nilfs_sc_dsync_ops = {
.write_node_binfo = NULL,
};

-static int nilfs_prepare_data_page(struct inode *inode, struct page *page)
-{
- int err = 0;
-
- lock_page(page);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
- if (!PageMappedToDisk(page)) {
- struct buffer_head *bh, *head;
- sector_t blkoff
- = page->index << (PAGE_SHIFT - inode->i_blkbits);
-
- int non_mapped = 0;
-
- bh = head = page_buffers(page);
- do {
- if (!buffer_mapped(bh)) {
- if (!buffer_dirty(bh)) {
- non_mapped++;
- continue;
- }
- err = nilfs_get_block(inode, blkoff, bh, 1);
- if (unlikely(err))
- goto out_unlock;
- }
- } while (blkoff++, (bh = bh->b_this_page) != head);
- if (!non_mapped)
- SetPageMappedToDisk(page);
- }
-
- out_unlock:
- unlock_page(page);
- return err;
-}
-
static int nilfs_lookup_dirty_data_buffers(struct inode *inode,
struct list_head *listp,
struct nilfs_sc_info *sci)
@@ -727,9 +691,11 @@ static int nilfs_lookup_dirty_data_buffers(struct inode *inode,
struct page *page = pvec.pages[i];

if (mapping->host) {
- err = nilfs_prepare_data_page(inode, page);
- if (unlikely(err))
- break;
+ lock_page(page);
+ if (!page_has_buffers(page))
+ create_empty_buffers(page,
+ 1 << inode->i_blkbits, 0);
+ unlock_page(page);
}

bh = head = page_buffers(page);
--
1.5.6.5