readpage(), prepare_write(), and commit_write() callers are updated to
understand the special return code AOP_TRUNCATED_PAGE in the style of
writepage() and WRITEPAGE_ACTIVATE. AOP_TRUNCATED_PAGE tells the caller that
the callee has unlocked the page and that the operation should be tried again
with a new page. OCFS2 uses this to detect and work around a lock inversion in
its aop methods. There should be no change in behaviour for methods that don't
return AOP_TRUNCATED_PAGE.
WRITEPAGE_ACTIVATE is also prepended with AOP_ for consistency and they are
made enums so that kerneldoc can be used to document their semantics.
Signed-off-by: Zach Brown <[email protected]>
---
Andrew, this is against -mm instead of mainline so that it catches the reiser4
use of WRITEPAGE_ACTIVATE. Also, right now the OCFS2 git repository doesn't
have the code that uses this, but it will once this is in -mm.
drivers/block/loop.c | 23 +++++++++++---
drivers/block/rd.c | 4 +-
fs/mpage.c | 2 -
fs/reiser4/entd.c | 2 -
include/linux/fs.h | 31 +++++++++++++++++++
include/linux/writeback.h | 6 ---
mm/filemap.c | 73 +++++++++++++++++++++++++++++++---------------
mm/readahead.c | 15 +++++----
mm/shmem.c | 2 -
mm/vmscan.c | 2 -
10 files changed, 114 insertions(+), 46 deletions(-)
Index: 2.6.14-rc5-mm1-aop-truncated-page/drivers/block/loop.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/drivers/block/loop.c 2005-10-27 11:09:50.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/drivers/block/loop.c 2005-10-27 11:14:56.000000000 -0700
@@ -213,7 +213,7 @@
struct address_space_operations *aops = mapping->a_ops;
pgoff_t index;
unsigned offset, bv_offs;
- int len, ret = 0;
+ int len, ret;
down(&mapping->host->i_sem);
index = pos >> PAGE_CACHE_SHIFT;
@@ -232,9 +232,15 @@
page = grab_cache_page(mapping, index);
if (unlikely(!page))
goto fail;
- if (unlikely(aops->prepare_write(file, page, offset,
- offset + size)))
+ ret = aops->prepare_write(file, page, offset,
+ offset + size);
+ if (unlikely(ret)) {
+ if (ret == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ continue;
+ }
goto unlock;
+ }
transfer_result = lo_do_transfer(lo, WRITE, page, offset,
bvec->bv_page, bv_offs, size, IV);
if (unlikely(transfer_result)) {
@@ -251,9 +257,15 @@
kunmap_atomic(kaddr, KM_USER0);
}
flush_dcache_page(page);
- if (unlikely(aops->commit_write(file, page, offset,
- offset + size)))
+ ret = aops->commit_write(file, page, offset,
+ offset + size);
+ if (unlikely(ret)) {
+ if (ret == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ continue;
+ }
goto unlock;
+ }
if (unlikely(transfer_result))
goto unlock;
bv_offs += size;
@@ -264,6 +276,7 @@
unlock_page(page);
page_cache_release(page);
}
+ ret = 0;
out:
up(&mapping->host->i_sem);
return ret;
Index: 2.6.14-rc5-mm1-aop-truncated-page/drivers/block/rd.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/drivers/block/rd.c 2005-10-27 11:09:21.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/drivers/block/rd.c 2005-10-27 12:01:40.000000000 -0700
@@ -154,7 +154,7 @@
/*
* ->writepage to the the blockdev's mapping has to redirty the page so that the
- * VM doesn't go and steal it. We return WRITEPAGE_ACTIVATE so that the VM
+ * VM doesn't go and steal it. We return AOP_WRITEPAGE_ACTIVATE so that the VM
* won't try to (pointlessly) write the page again for a while.
*
* Really, these pages should not be on the LRU at all.
@@ -165,7 +165,7 @@
make_page_uptodate(page);
SetPageDirty(page);
if (wbc->for_reclaim)
- return WRITEPAGE_ACTIVATE;
+ return AOP_WRITEPAGE_ACTIVATE;
unlock_page(page);
return 0;
}
Index: 2.6.14-rc5-mm1-aop-truncated-page/fs/mpage.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/fs/mpage.c 2005-10-27 11:10:45.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/fs/mpage.c 2005-10-27 12:02:32.000000000 -0700
@@ -721,7 +721,7 @@
&last_block_in_bio, &ret, wbc,
page->mapping->a_ops->writepage);
}
- if (unlikely(ret == WRITEPAGE_ACTIVATE))
+ if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
unlock_page(page);
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
Index: 2.6.14-rc5-mm1-aop-truncated-page/fs/reiser4/entd.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/fs/reiser4/entd.c 2005-10-27 11:11:26.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/fs/reiser4/entd.c 2005-10-27 12:03:40.000000000 -0700
@@ -390,7 +390,7 @@
return 1;
}
lock_page(page);
- return WRITEPAGE_ACTIVATE;
+ return AOP_WRITEPAGE_ACTIVATE;
}
void ent_writes_page(struct super_block *sb, struct page *page)
Index: 2.6.14-rc5-mm1-aop-truncated-page/include/linux/fs.h
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/include/linux/fs.h 2005-10-27 11:11:26.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/include/linux/fs.h 2005-10-27 12:12:39.078455633 -0700
@@ -292,6 +292,37 @@
*/
#include <linux/quota.h>
+/**
+ * enum positive_aop_returns - aop return codes with specific semantics
+ *
+ * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
+ * completed, that the page is still locked, and
+ * should be considered active. The VM uses this hint
+ * to return the page to the active list -- it won't
+ * be a candidate for writeback again in the near
+ * future. Other callers must be careful to unlock
+ * the page if they get this return. Returned by
+ * writepage();
+ *
+ * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
+ * unlocked it and the page might have been truncated.
+ * The caller should back up to acquiring a new page and
+ * trying again. The aop will be taking reasonable
+ * precautions not to livelock. If the caller held a page
+ * reference, it should drop it before retrying. Returned
+ * by readpage(), prepare_write(), and commit_write().
+ *
+ * address_space_operation functions return these large constants to indicate
+ * special semantics to the caller. These are much larger than the bytes in a
+ * page to allow for functions that return the number of bytes operated on in a
+ * given page.
+ */
+
+enum positive_aop_returns {
+ AOP_WRITEPAGE_ACTIVATE = 0x80000,
+ AOP_TRUNCATED_PAGE = 0x80001,
+};
+
/*
* oh the beauties of C type declarations.
*/
Index: 2.6.14-rc5-mm1-aop-truncated-page/include/linux/writeback.h
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/include/linux/writeback.h 2005-10-27 11:10:45.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/include/linux/writeback.h 2005-10-27 11:14:42.879021785 -0700
@@ -60,12 +60,6 @@
};
/*
- * ->writepage() return values (make these much larger than a pagesize, in
- * case some fs is returning number-of-bytes-written from writepage)
- */
-#define WRITEPAGE_ACTIVATE 0x80000 /* IO was not started: activate page */
-
-/*
* fs/fs-writeback.c
*/
void writeback_inodes(struct writeback_control *wbc);
Index: 2.6.14-rc5-mm1-aop-truncated-page/mm/filemap.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/mm/filemap.c 2005-10-27 11:11:26.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/mm/filemap.c 2005-10-27 11:56:32.000000000 -0700
@@ -853,8 +853,13 @@
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
- if (unlikely(error))
+ if (unlikely(error)) {
+ if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto find_page;
+ }
goto readpage_error;
+ }
if (!PageUptodate(page)) {
lock_page(page);
@@ -1174,26 +1179,24 @@
{
struct address_space *mapping = file->f_mapping;
struct page *page;
- int error;
+ int ret;
- page = page_cache_alloc_cold(mapping);
- if (!page)
- return -ENOMEM;
+ do {
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ return -ENOMEM;
+
+ ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ if (ret == 0)
+ ret = mapping->a_ops->readpage(file, page);
+ else if (ret == -EEXIST)
+ ret = 0; /* losing race to add is OK */
- error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
- if (!error) {
- error = mapping->a_ops->readpage(file, page);
page_cache_release(page);
- return error;
- }
- /*
- * We arrive here in the unlikely event that someone
- * raced with us and added our page to the cache first
- * or we are out of memory for radix-tree nodes.
- */
- page_cache_release(page);
- return error == -EEXIST ? 0 : error;
+ } while (ret == AOP_TRUNCATED_PAGE);
+
+ return ret;
}
#define MMAP_LOTSAMISS (100)
@@ -1353,10 +1356,14 @@
goto success;
}
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1380,10 +1387,14 @@
goto success;
}
ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1466,10 +1477,14 @@
goto success;
}
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1492,10 +1507,14 @@
}
ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1956,12 +1975,16 @@
status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
loff_t isize = i_size_read(inode);
+
+ if (status != AOP_TRUNCATED_PAGE)
+ unlock_page(page);
+ page_cache_release(page);
+ if (status == AOP_TRUNCATED_PAGE)
+ continue;
/*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again.
*/
- unlock_page(page);
- page_cache_release(page);
if (pos + bytes > isize)
vmtruncate(inode, isize);
break;
@@ -1974,6 +1997,10 @@
cur_iov, iov_base, bytes);
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
+ if (status == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ continue;
+ }
if (likely(copied > 0)) {
if (!status)
status = copied;
Index: 2.6.14-rc5-mm1-aop-truncated-page/mm/readahead.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/mm/readahead.c 2005-10-27 11:11:26.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/mm/readahead.c 2005-10-27 11:58:59.000000000 -0700
@@ -159,7 +159,7 @@
{
unsigned page_idx;
struct pagevec lru_pvec;
- int ret = 0;
+ int ret;
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -172,14 +172,17 @@
list_del(&page->lru);
if (!add_to_page_cache(page, mapping,
page->index, GFP_KERNEL)) {
- mapping->a_ops->readpage(filp, page);
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
- } else {
- page_cache_release(page);
+ ret = mapping->a_ops->readpage(filp, page);
+ if (ret != AOP_TRUNCATED_PAGE) {
+ if (!pagevec_add(&lru_pvec, page))
+ __pagevec_lru_add(&lru_pvec);
+ continue;
+ } /* else fall through to release */
}
+ page_cache_release(page);
}
pagevec_lru_add(&lru_pvec);
+ ret = 0;
out:
return ret;
}
Index: 2.6.14-rc5-mm1-aop-truncated-page/mm/shmem.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/mm/shmem.c 2005-10-27 11:11:26.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/mm/shmem.c 2005-10-27 12:04:18.000000000 -0700
@@ -855,7 +855,7 @@
swap_free(swap);
redirty:
set_page_dirty(page);
- return WRITEPAGE_ACTIVATE; /* Return with the page locked */
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
}
#ifdef CONFIG_NUMA
Index: 2.6.14-rc5-mm1-aop-truncated-page/mm/vmscan.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/mm/vmscan.c 2005-10-27 11:11:26.000000000 -0700
+++ 2.6.14-rc5-mm1-aop-truncated-page/mm/vmscan.c 2005-10-27 12:12:45.000000000 -0700
@@ -355,7 +355,7 @@
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
- if (res == WRITEPAGE_ACTIVATE) {
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
Zach Brown <[email protected]> wrote:
>
> readpage(), prepare_write(), and commit_write() callers are updated to
> understand the special return code AOP_TRUNCATED_PAGE in the style of
> writepage() and WRITEPAGE_ACTIVATE. AOP_TRUNCATED_PAGE tells the caller that
> the callee has unlocked the page and that the operation should be tried again
> with a new page. OCFS2 uses this to detect and work around a lock inversion in
> its aop methods. There should be no change in behaviour for methods that don't
> return AOP_TRUNCATED_PAGE.
>
> WRITEPAGE_ACTIVATE is also prepended with AOP_ for consistency and they are
> made enums so that kerneldoc can be used to document their semantics.
Looks sane to me. Can you carry this in the ocfs2 tree?
On Thu, Nov 03, 2005 at 12:45:36PM +1100, Andrew Morton wrote:
> Zach Brown <[email protected]> wrote:
> > readpage(), prepare_write(), and commit_write() callers are updated to
> > understand the special return code AOP_TRUNCATED_PAGE in the style of
> > writepage() and WRITEPAGE_ACTIVATE. AOP_TRUNCATED_PAGE tells the caller that
> > the callee has unlocked the page and that the operation should be tried again
>
> Looks sane to me. Can you carry this in the ocfs2 tree?
No problem. Give us a day or two to merge the changes to our
main trees.
Joel
--
"Baby, even the losers
Get luck sometimes.
Even the losers
Keep a little bit of pride."
Joel Becker
Principal Software Developer
Oracle
E-mail: [email protected]
Phone: (650) 506-8127
On Wed, Nov 02, 2005 at 11:43:12PM -0800, Joel Becker wrote:
> > Looks sane to me. Can you carry this in the ocfs2 tree?
>
> No problem. Give us a day or two to merge the changes to our
> main trees.
I think I disagree with Andew here. Having a core patch separate
from a new drivers/filesystem/etc.. is always a good idea. It makes
reviewing a lot easier and allows independant handling, e.g. merging it
earlier than the new driver for some reason - as happened for example
with the clear_inode changes we needed earlier for ocfs or the pagevec
exports that came in via the reiser4 patches but were needed in mainline
for cifs now.
Christoph Hellwig <[email protected]> wrote:
>
> On Wed, Nov 02, 2005 at 11:43:12PM -0800, Joel Becker wrote:
> > > Looks sane to me. Can you carry this in the ocfs2 tree?
> >
> > No problem. Give us a day or two to merge the changes to our
> > main trees.
>
> I think I disagree with Andew here. Having a core patch separate
> from a new drivers/filesystem/etc.. is always a good idea. It makes
> reviewing a lot easier and allows independant handling, e.g. merging it
> earlier than the new driver for some reason - as happened for example
> with the clear_inode changes we needed earlier for ocfs or the pagevec
> exports that came in via the reiser4 patches but were needed in mainline
> for cifs now.
Yes, that's better from a code staging/reviewing pov.
But there's a practical problem: if I put the patch in -mm for testing, and
the ocfs2 guys put the patch into their git tree for their testing, I get
rejects. So I drop the core patch from -mm again. If the ocfs guys _dont_
put the patch into their tree, they need to add this additional patch for
their testing. It's all screwed up.
So for both -mm and for the ocfs2 team, leaving the patch in the ocfs2 git
tree is the most convenient place for it.
Obviously, merging it into Linus's tree will fix up everyone's patching
problems, but it has no users at this time...
> So for both -mm and for the ocfs2 team, leaving the patch in the ocfs2 git
> tree is the most convenient place for it.
Well, I don't think it matters much either way because we mostly test by
checking out of svn and buliding against -mm or patched distro kernels. I'd
leave the details up to you and Joel (who maintains our ocfs2 git) -- whatever
is easiest for you guys.
> Obviously, merging it into Linus's tree will fix up everyone's patching
> problems, but it has no users at this time...
So, speaking of which, are there any barriers to merging OCFS2 now? I think
Christoph's concerns (silly /proc files, vma walking, endian stuff) have been
addressed.
- z
> > Obviously, merging it into Linus's tree will fix up everyone's patching
> > problems, but it has no users at this time...
>
> So, speaking of which, are there any barriers to merging OCFS2 now? I think
> Christoph's concerns (silly /proc files, vma walking, endian stuff) have been
> addressed.
and context dependent symlinks are gone too ;) to please mighty Al :)
On Fri, Nov 04, 2005 at 10:31:35AM -0800, Zach Brown wrote:
> > Obviously, merging it into Linus's tree will fix up everyone's patching
> > problems, but it has no users at this time...
>
> So, speaking of which, are there any barriers to merging OCFS2 now? I think
> Christoph's concerns (silly /proc files, vma walking, endian stuff) have been
> addressed.
I think it's in pretty good shape now. The stuct typedefs should go before
we put it in mainline so we don't need to do a major sweep through everything
just after it appeared in SCM history, but that's just a minor thingy.
Else I'd say it looks good unless we see problems with the AOP_TRUNCATED_PAGE
patch we should send it to linus after the time for the big core merges is
over (aka in about a week or two)
On Fri, Nov 04, 2005 at 08:59:14PM +0000, Christoph Hellwig wrote:
> I think it's in pretty good shape now. The stuct typedefs should go before
> we put it in mainline so we don't need to do a major sweep through everything
> just after it appeared in SCM history, but that's just a minor thingy.
> Else I'd say it looks good unless we see problems with the AOP_TRUNCATED_PAGE
> patch we should send it to linus after the time for the big core merges is
> over (aka in about a week or two)
Ok, the OCFS2 git trees have the latest code, including the
AOP_TRUNCATED_PAGE patch, the OCFS2 locking changes, and the removal of
the typedefs. Let us know if you see anything else.
Andrew, the AOP_TRUNCATED_PAGE patch is also broken out at
http://oss.oracle.com/projects/ocfs2/dist/files/patches/2.6.14/2005-11-08/broken-out/linux-2.6.14-aop-truncated-page-1.patch.
If you pull it via the OCFS2 git tree, you'll still need
http://oss.oracle.com/projects/ocfs2/dist/files/patches/2.6.14/2005-11-08/broken-out/linux-2.6.14-mm1-aop-truncated-page-reiser4.patch
for reiser4.
Thanks everyone for the comments and the help.
Joel
--
"There is no more evil thing on earth than race prejudice, none at
all. I write deliberately -- it is the worst single thing in life
now. It justifies and holds together more baseness, cruelty and
abomination than any other sort of error in the world."
- H. G. Wells
Joel Becker
Principal Software Developer
Oracle
E-mail: [email protected]
Phone: (650) 506-8127