2005-10-17 22:21:09

by Zach Brown

Subject: [RFC] page lock ordering and OCFS2


I sent an earlier version of this patch to linux-fsdevel and was met with
deafening silence. I'm resending the commentary from that first mail and am
including a new version of the patch. This time it has much clearer naming
and some kerneldoc blurbs. Here goes...

--

In recent weeks we've been reworking the locking in OCFS2 to simplify things
and make it behave more like a "local" Linux file system. We've run into an
ordering inversion between a page's PG_locked and OCFS2's DLM locks that
protect page cache contents. I'm including a patch at the end of this mail
that I think is a clean way to give the file system a chance to get the
ordering right, but we're open to any and all suggestions. We want to do the
cleanest thing.

OCFS2 maintains page cache coherence between nodes by requiring that a node
hold a valid lock while there are active pages in the page cache. The page
cache is invalidated before a node releases a lock so that another node can
acquire it. While this invalidation is happening new locks can not be acquired
on that node. This is equivalent to a DLM processing thread acquiring
PG_locked during truncation while holding a DLM lock. Normal file system user
tasks come to the a_ops with PG_locked acquired by their callers before they
have a chance to get DLM locks.

We talked a little about modifying the invalidation path to avoid waiting for
pages that are held by an OCFS2 path that is waiting for a DLM lock. It seems
awfully hard to get right without some very nasty code. The truncation on lock
removal has to write back dirty pages so that other nodes can read them -- it
can't simply skip these pages if they happened to be locked.

So we aimed right for the lock ordering inversion problem with the appended
patch. It gives file systems a callback that is invoked before acquiring page
locks on pages that are going to be handed to the file system's a_ops methods.
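
To make the calling convention concrete, the pattern the patch asks of
callers looks like this (condensed from the loop.c hunk below, with error
handling trimmed):

	if (mapping_guard_page_locks(mapping, 1))
		goto fail;
	page = grab_cache_page(mapping, index);
	aops->prepare_write(file, page, offset, offset + size);
	...
	unlock_page(page);
	page_cache_release(page);
	mapping_page_locks_done(mapping, 1);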

So, what do people think about this? Is it reasonable to patch the core to
help OCFS2 with this ordering inversion?

- z

Index: 2.6.14-rc4-guard-page-locks/drivers/block/loop.c
===================================================================
--- 2.6.14-rc4-guard-page-locks.orig/drivers/block/loop.c 2005-10-14 16:47:25.000000000 -0700
+++ 2.6.14-rc4-guard-page-locks/drivers/block/loop.c 2005-10-14 17:01:42.000000000 -0700
@@ -229,9 +229,13 @@
size = PAGE_CACHE_SIZE - offset;
if (size > len)
size = len;
+ if (mapping_guard_page_locks(mapping, 1))
+ goto fail;
page = grab_cache_page(mapping, index);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ mapping_page_locks_done(mapping, 1);
goto fail;
+ }
if (unlikely(aops->prepare_write(file, page, offset,
offset + size)))
goto unlock;
@@ -263,6 +267,7 @@
pos += size;
unlock_page(page);
page_cache_release(page);
+ mapping_page_locks_done(mapping, 1);
}
out:
up(&mapping->host->i_sem);
@@ -270,6 +275,7 @@
unlock:
unlock_page(page);
page_cache_release(page);
+ mapping_page_locks_done(mapping, 1);
fail:
ret = -1;
goto out;
Index: 2.6.14-rc4-guard-page-locks/fs/buffer.c
===================================================================
--- 2.6.14-rc4-guard-page-locks.orig/fs/buffer.c 2005-10-14 16:48:12.000000000 -0700
+++ 2.6.14-rc4-guard-page-locks/fs/buffer.c 2005-10-14 16:55:34.000000000 -0700
@@ -2174,16 +2174,24 @@
if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
offset++;
}
+
+ err = mapping_guard_page_locks(mapping, 1);
+ if (err)
+ goto out;
+
index = size >> PAGE_CACHE_SHIFT;
- err = -ENOMEM;
page = grab_cache_page(mapping, index);
- if (!page)
+ if (!page) {
+ err = -ENOMEM;
+ mapping_page_locks_done(mapping, 1);
goto out;
+ }
err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
if (!err) {
err = mapping->a_ops->commit_write(NULL, page, offset, offset);
}
unlock_page(page);
+ mapping_page_locks_done(mapping, 1);
page_cache_release(page);
if (err > 0)
err = 0;
@@ -2572,10 +2580,16 @@
if ((offset & (blocksize - 1)) == 0)
goto out;

- ret = -ENOMEM;
+ ret = mapping_guard_page_locks(mapping, 1);
+ if (ret)
+ goto out;
+
page = grab_cache_page(mapping, index);
- if (!page)
+ if (!page) {
+ mapping_page_locks_done(mapping, 1);
+ ret = -ENOMEM;
goto out;
+ }

to = (offset + blocksize) & ~(blocksize - 1);
ret = a_ops->prepare_write(NULL, page, offset, to);
@@ -2588,6 +2602,7 @@
}
unlock_page(page);
page_cache_release(page);
+ mapping_page_locks_done(mapping, 1);
out:
return ret;
}
Index: 2.6.14-rc4-guard-page-locks/include/linux/fs.h
===================================================================
--- 2.6.14-rc4-guard-page-locks.orig/include/linux/fs.h 2005-10-14 16:48:15.000000000 -0700
+++ 2.6.14-rc4-guard-page-locks/include/linux/fs.h 2005-10-17 14:54:05.841846285 -0700
@@ -325,6 +325,10 @@
loff_t offset, unsigned long nr_segs);
struct page* (*get_xip_page)(struct address_space *, sector_t,
int);
+
+ /* see mapping_guard_page_locks() */
+ int (*guard_page_locks)(struct address_space *, int write);
+ void (*page_locks_done)(struct address_space *, int write);
};

struct backing_dev_info;
Index: 2.6.14-rc4-guard-page-locks/include/linux/pagemap.h
===================================================================
--- 2.6.14-rc4-guard-page-locks.orig/include/linux/pagemap.h 2005-10-14 16:48:15.000000000 -0700
+++ 2.6.14-rc4-guard-page-locks/include/linux/pagemap.h 2005-10-17 15:05:46.980757058 -0700
@@ -143,6 +143,54 @@
return ret;
}

+/**
+ * mapping_guard_page_locks - call before handing locked pages to a_ops
+ * @mapping: mapping whose aops will be called
+ * @write: whether or not the pages will be written to
+ *
+ * Call this before acquiring page locks and handing the locked pages to the
+ * following address_space_operations: readpage(), readpages(),
+ * prepare_write(), and commit_write(). This must be called before the page
+ * locks are acquired. It must be called only once for a given series of page
+ * locks on a mapping. A mapping_guard_page_locks() call must be completed
+ * with a call to mapping_page_locks_done() before another
+ * mapping_guard_page_locks() call can be made -- it isn't recursive.
+ *
+ * Multiple pages may be locked and multiple a_ops methods may be called within
+ * a given mapping_guard_page_locks() and mapping_page_locks_done() pair.
+ *
+ * This may return -errno and must be considered fatal. Page locks should not
+ * be acquired and handed to a_ops calls if this fails.
+ *
+ * This exists because some file systems have lock ordering which requires that
+ * internal locks be acquired before page locks.
+ *
+ * This call must be ordered under i_sem.
+ */
+static inline int mapping_guard_page_locks(struct address_space *mapping, int write)
+{
+ might_sleep();
+ if (mapping->a_ops->guard_page_locks)
+ return mapping->a_ops->guard_page_locks(mapping, write);
+ else
+ return 0;
+}
+
+/**
+ * mapping_page_locks_done - called once locked pages are handed to a_ops
+ * @mapping: mapping whose aops will be called
+ * @write: matches paired mapping_guard_page_locks() call
+ *
+ * see mapping_guard_page_locks(). This must be called once all the pages
+ * that were locked after mapping_guard_page_locks() have been handed to
+ * the a_ops methods.
+ */
+static inline void mapping_page_locks_done(struct address_space *mapping, int write)
+{
+ if (mapping->a_ops->page_locks_done)
+ mapping->a_ops->page_locks_done(mapping, write);
+}
+
/*
* Return byte-offset into filesystem object for page.
*/
Index: 2.6.14-rc4-guard-page-locks/mm/filemap.c
===================================================================
--- 2.6.14-rc4-guard-page-locks.orig/mm/filemap.c 2005-10-14 16:48:15.000000000 -0700
+++ 2.6.14-rc4-guard-page-locks/mm/filemap.c 2005-10-14 16:55:34.000000000 -0700
@@ -811,12 +811,17 @@
goto out;

page_not_up_to_date:
+ error = mapping_guard_page_locks(mapping, 0);
+ if (error)
+ goto readpage_error;
+
/* Get exclusive access to the page ... */
lock_page(page);

/* Did it get unhashed before we got the lock? */
if (!page->mapping) {
unlock_page(page);
+ mapping_page_locks_done(mapping, 0);
page_cache_release(page);
continue;
}
@@ -824,13 +829,14 @@
/* Did somebody else fill it already? */
if (PageUptodate(page)) {
unlock_page(page);
+ mapping_page_locks_done(mapping, 0);
goto page_ok;
}

readpage:
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
-
+ mapping_page_locks_done(mapping, 0);
if (unlikely(error))
goto readpage_error;

@@ -890,21 +896,35 @@
* Ok, it wasn't cached, so we need to create a new
* page..
*/
+ error = mapping_guard_page_locks(mapping, 0);
+ if (error) {
+ desc->error = error;
+ goto out;
+ }
+
if (!cached_page) {
cached_page = page_cache_alloc_cold(mapping);
if (!cached_page) {
+ mapping_page_locks_done(mapping, 0);
desc->error = -ENOMEM;
goto out;
}
}
+
error = add_to_page_cache_lru(cached_page, mapping,
index, GFP_KERNEL);
if (error) {
+ mapping_page_locks_done(mapping, 0);
+ if (mapping->a_ops->guard_page_locks) {
+ page_cache_release(cached_page);
+ cached_page = NULL;
+ }
if (error == -EEXIST)
goto find_page;
desc->error = error;
goto out;
}
+
page = cached_page;
cached_page = NULL;
goto readpage;
@@ -1151,27 +1171,37 @@
static int fastcall page_cache_read(struct file * file, unsigned long offset)
{
struct address_space *mapping = file->f_mapping;
- struct page *page;
+ struct page *page;
int error;

+ error = mapping_guard_page_locks(mapping, 0);
+ if (error)
+ return error;
+
page = page_cache_alloc_cold(mapping);
- if (!page)
- return -ENOMEM;
+ if (!page) {
+ error = -ENOMEM;
+ goto out;
+ }

error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
- if (!error) {
- error = mapping->a_ops->readpage(file, page);
- page_cache_release(page);
- return error;
+ if (error) {
+ if (error == -EEXIST) {
+ /*
+ * We arrive here in the unlikely event that someone
+ * raced with us and added our page to the cache first.
+ */
+ error = 0;
+ }
+ goto out;
}

- /*
- * We arrive here in the unlikely event that someone
- * raced with us and added our page to the cache first
- * or we are out of memory for radix-tree nodes.
- */
- page_cache_release(page);
- return error == -EEXIST ? 0 : error;
+ error = mapping->a_ops->readpage(file, page);
+out:
+ if (page)
+ page_cache_release(page);
+ mapping_page_locks_done(mapping, 0);
+ return error;
}

#define MMAP_LOTSAMISS (100)
@@ -1316,38 +1346,52 @@
majmin = VM_FAULT_MAJOR;
inc_page_state(pgmajfault);
}
+
+ if (mapping_guard_page_locks(mapping, 0))
+ goto retry;
+
lock_page(page);

/* Did it get unhashed while we waited for it? */
if (!page->mapping) {
unlock_page(page);
+ mapping_page_locks_done(mapping, 0);
page_cache_release(page);
goto retry_all;
}

/* Did somebody else get it up-to-date? */
if (PageUptodate(page)) {
+ mapping_page_locks_done(mapping, 0);
unlock_page(page);
goto success;
}

if (!mapping->a_ops->readpage(file, page)) {
+ mapping_page_locks_done(mapping, 0);
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
}

+ mapping_page_locks_done(mapping, 0);
+
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
+retry:
+ if (mapping_guard_page_locks(mapping, 0))
+ goto error;
+
lock_page(page);

/* Somebody truncated the page on us? */
if (!page->mapping) {
unlock_page(page);
+ mapping_page_locks_done(mapping, 0);
page_cache_release(page);
goto retry_all;
}
@@ -1355,10 +1399,12 @@
/* Somebody else successfully read it in? */
if (PageUptodate(page)) {
unlock_page(page);
+ mapping_page_locks_done(mapping, 0);
goto success;
}
ClearPageError(page);
if (!mapping->a_ops->readpage(file, page)) {
+ mapping_page_locks_done(mapping, 0);
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
@@ -1368,6 +1414,7 @@
* Things didn't work out. Return zero to tell the
* mm layer so, possibly freeing the page cache page first.
*/
+error:
page_cache_release(page);
return NULL;
}
@@ -1430,52 +1477,69 @@
return NULL;

page_not_uptodate:
+ if (mapping_guard_page_locks(mapping, 0))
+ goto retry;
+
lock_page(page);

/* Did it get unhashed while we waited for it? */
if (!page->mapping) {
unlock_page(page);
+ mapping_page_locks_done(mapping, 0);
goto err;
}

/* Did somebody else get it up-to-date? */
if (PageUptodate(page)) {
unlock_page(page);
+ mapping_page_locks_done(mapping, 0);
goto success;
}

if (!mapping->a_ops->readpage(file, page)) {
+ mapping_page_locks_done(mapping, 0);
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
}

+ mapping_page_locks_done(mapping, 0);
+
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
+retry:
+ if (mapping_guard_page_locks(mapping, 0))
+ goto err;
+
lock_page(page);

/* Somebody truncated the page on us? */
if (!page->mapping) {
unlock_page(page);
+ mapping_page_locks_done(mapping, 0);
goto err;
}
/* Somebody else successfully read it in? */
if (PageUptodate(page)) {
unlock_page(page);
+ mapping_page_locks_done(mapping, 0);
goto success;
}

ClearPageError(page);
if (!mapping->a_ops->readpage(file, page)) {
+ mapping_page_locks_done(mapping, 0);
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
}

+ mapping_page_locks_done(mapping, 0);
+
/*
* Things didn't work out. Return zero to tell the
* mm layer so, possibly freeing the page cache page first.
@@ -1889,6 +1953,7 @@
const struct iovec *cur_iov = iov; /* current iovec */
size_t iov_base = 0; /* offset in the current iovec */
char __user *buf;
+ int guarding = 0;

pagevec_init(&lru_pvec, 0);

@@ -1925,6 +1990,11 @@
maxlen = bytes;
fault_in_pages_readable(buf, maxlen);

+ status = mapping_guard_page_locks(mapping, 1);
+ if (status)
+ break;
+ guarding = 1;
+
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
status = -ENOMEM;
@@ -1976,6 +2046,8 @@
if (status >= 0)
status = -EFAULT;
unlock_page(page);
+ mapping_page_locks_done(mapping, 1);
+ guarding = 0;
mark_page_accessed(page);
page_cache_release(page);
if (status < 0)
@@ -1987,6 +2059,8 @@

if (cached_page)
page_cache_release(cached_page);
+ if (guarding)
+ mapping_page_locks_done(mapping, 1);

/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
Index: 2.6.14-rc4-guard-page-locks/mm/readahead.c
===================================================================
--- 2.6.14-rc4-guard-page-locks.orig/mm/readahead.c 2005-10-14 16:48:15.000000000 -0700
+++ 2.6.14-rc4-guard-page-locks/mm/readahead.c 2005-10-14 16:55:34.000000000 -0700
@@ -269,6 +269,10 @@

end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

+ ret = mapping_guard_page_locks(mapping, 0);
+ if (ret)
+ goto out;
+
/*
* Preallocate as many pages as we will need.
*/
@@ -302,6 +306,7 @@
if (ret)
read_pages(mapping, filp, &page_pool, ret);
BUG_ON(!list_empty(&page_pool));
+ mapping_page_locks_done(mapping, 0);
out:
return ret;
}


2005-10-17 23:18:26

by Andrew Morton

Subject: Re: [RFC] page lock ordering and OCFS2

Zach Brown <[email protected]> wrote:
>
>
> I sent an earlier version of this patch to linux-fsdevel and was met with
> deafening silence.

Maybe because nobody understood your description ;)

> I'm resending the commentary from that first mail and am
> including a new version of the patch. This time it has much clearer naming
> and some kerneldoc blurbs. Here goes...
>
> --
>
> In recent weeks we've been reworking the locking in OCFS2 to simplify things
> and make it behave more like a "local" Linux file system. We've run into an
> ordering inversion between a page's PG_locked and OCFS2's DLM locks that
> protect page cache contents. I'm including a patch at the end of this mail
> that I think is a clean way to give the file system a chance to get the
> ordering right, but we're open to any and all suggestions. We want to do the
> cleanest thing.

The patch is of course pretty unwelcome: lots of weird stuff in the core
VFS which everyone has to maintain but probably will not test.

So I think we need a better understanding of what the locking inversion
problem is, so we can perhaps find a better solution. Bear in mind that
ext3 has (rare, unsolved) lock inversion problems in this area as well, so
commonality will be sought for.

> OCFS2 maintains page cache coherence between nodes by requiring that a node
> hold a valid lock while there are active pages in the page cache.

"active pages in the page cache" means present pagecache pages in the node
which holds pages in its pagecache, yes?

> The page
> cache is invalidated before a node releases a lock so that another node can
> acquire it. While this invalidation is happening new locks can not be acquired
> on that node. This is equivalent to a DLM processing thread acquiring
> PG_locked during truncation while holding a DLM lock. Normal file system user
> tasks come to the a_ops with PG_locked acquired by their callers before they
> have a chance to get DLM locks.

So where is the lock inversion?

Perhaps if you were to cook up one of those little threadA/threadB ascii
diagrams we could see where the inversion occurs?

2005-10-17 23:38:00

by Badari Pulavarty

Subject: Re: [RFC] page lock ordering and OCFS2

On Mon, 2005-10-17 at 15:20 -0700, Zach Brown wrote:
> I sent an earlier version of this patch to linux-fsdevel and was met with
> deafening silence. I'm resending the commentary from that first mail and am
> including a new version of the patch. This time it has much clearer naming
> and some kerneldoc blurbs. Here goes...
>
> --
>
> In recent weeks we've been reworking the locking in OCFS2 to simplify things
> and make it behave more like a "local" Linux file system. We've run into an
> ordering inversion between a page's PG_locked and OCFS2's DLM locks that
> protect page cache contents. I'm including a patch at the end of this mail
> that I think is a clean way to give the file system a chance to get the
> ordering right, but we're open to any and all suggestions. We want to do the
> cleanest thing.
>
> OCFS2 maintains page cache coherence between nodes by requiring that a node
> hold a valid lock while there are active pages in the page cache. The page
> cache is invalidated before a node releases a lock so that another node can
> acquire it. While this invalidation is happening new locks can not be acquired
> on that node. This is equivalent to a DLM processing thread acquiring
> PG_locked during truncation while holding a DLM lock. Normal file system user
> tasks come to the a_ops with PG_locked acquired by their callers before they
> have a chance to get DLM locks.
>
> We talked a little about modifying the invalidation path to avoid waiting for
> pages that are held by an OCFS2 path that is waiting for a DLM lock. It seems
> awfully hard to get right without some very nasty code. The truncation on lock
> removal has to write back dirty pages so that other nodes can read them -- it
> can't simply skip these pages if they happened to be locked.
>
> So we aimed right for the lock ordering inversion problem with the appended
> patch. It gives file systems a callback that is tried before page locks are
> acquired that are going to be passed in to the file system's a_ops methods.
>
> So, what do people think about this? Is it reasonable to patch the core to
> help OCFS2 with this ordering inversion?

Sorry for the "deafening silence" on fs-devel. I was trying to see
what exactly I need and how to combine with what you are trying to
do, before I reply. Unfortunately, I don't understand your lock
inversion problem well enough :(

I was looking at ext3 page lock vs. transaction ordering. I wanted
a way to reverse the ordering to support writepages() cleanly.
I guess what you are proposing could be used (in a weird way) to do
that, except that I need the support in different places (callers
of writepage, writepages). BTW, I wasn't comfortable touching VFS
locking just for ext3's purposes :(

Thanks,
Badari

2005-10-18 00:41:54

by Zach Brown

Subject: Re: [RFC] page lock ordering and OCFS2

Andrew Morton wrote:
> Zach Brown <[email protected]> wrote:
>
>>
>>I sent an earlier version of this patch to linux-fsdevel and was met with
>>deafening silence.
>
>
> Maybe because nobody understood your description ;)

I have no trouble believing that :) Adding the DLM to the already
complicated FS universe isn't exactly a barrel of laughs.

> The patch is of course pretty unwelcome: lots of weird stuff in the core
> VFS which everyone has to maintain but probably will not test.

Yeah, believe me, we want minimal impact. With this patch we were
considering the whole kernel once OCFS2 was merged. We think we *could*
probably handle this internally in OCFS2, it would just be a bit of a
mess. It could conceivably involve ocfs2-local truncation loops that
check special lists of page pointers, that kind of thing. It's unclear
to me which is the lesser evil.

> So I think we need a better understanding of what the locking inversion
> problem is, so we can perhaps find a better solution. Bear in mind that
> ext3 has (rare, unsolved) lock inversion problems in this area as well, so
> commonality will be sought for.

This hits almost immediately with one node spinning with appending
writes (>> from the shell, say) and another node sitting in tail -f. We
don't have the luxury of leaving it unsolved :(

I'd love to learn more about the ext3 case. Maybe I'll bug Andreas :)

>>OCFS2 maintains page cache coherence between nodes by requiring that a node
>>hold a valid lock while there are active pages in the page cache.
>
> "active pages in the page cache" means present pagecache pages in the node
> which holds pages in its pagecache, yes?

Yeah, a node associates a held DLM lock with the pages it has in its
page cache. Other nodes might be doing the same thing, say, if they all
have read locks and all have cached reads in their page caches. We can
illustrate the mechanics a little more by starting with that scenario
and adding another node who wants to write to this file (these
associations and locks are per-inode). This new node will try to
acquire a write lock on the inode. Before its write lock is granted it
will ask the other nodes to all drop their read locks. As part of
dropping their locks they will invalidate their page caches. In the
future those nodes will have to read the pages back from shared disk,
after having gotten the right to do so from the DLM..

I guess it's important to realize that DLM locks can remain valid
between system calls. They're pinned by references *during* the system
calls, but they remain valid and held on the node between system calls
as long as another node doesn't send network messages asking them to be
dropped.

>> The page
>>cache is invalidated before a node releases a lock so that another node can
>>acquire it. While this invalidation is happening new locks can not be acquired
>>on that node. This is equivalent to a DLM processing thread acquiring
>>PG_locked during truncation while holding a DLM lock. Normal file system user
>>tasks come to the a_ops with PG_locked acquired by their callers before they
>>have a chance to get DLM locks.
>
>
> So where is the lock inversion?
>
> Perhaps if you were to cook up one of those little threadA/threadB ascii
> diagrams we could see where the inversion occurs?

Yeah, let me give that a try. I'll try to trim it down to the relevant
bits. First let's start with a totally fresh node and have a read get a
read DLM lock and populate the page cache on this node:

sys_read
generic_file_aio_read
ocfs2_readpage
ocfs2_data_lock
block_read_full_page
ocfs2_data_unlock

So it was only allowed to proceed past ocfs2_data_lock() towards
block_read_full_page() once the DLM granted it a read lock. As it calls
ocfs2_data_unlock() it is only dropping this caller's local reference on
the lock. The lock still exists on that node and is still valid and
holding data in the page cache until it gets a network message saying
that another node, who is probably going to be writing, would like the
lock dropped.

DLM kernel threads respond to the network messages and truncate the page
cache. While the thread is busy with this inode's lock, other paths on
that node won't be able to get locks. Say one of those messages arrives.
While a local DLM thread is invalidating the page cache another user
thread tries to read:

user thread dlm thread


kthread
...
ocfs2_data_convert_worker
truncate_inode_pages
sys_read
generic_file_aio_read
* gets page lock
ocfs2_readpage
ocfs2_data_lock
(stuck waiting for dlm)
lock_page
(stuck waiting for page)


The user task holds a page lock while waiting for the DLM to allow it to
proceed. The DLM thread is preventing lock granting progress while
waiting for the page lock that the user task holds.

I don't know how far to go in explaining what leads up to laying out the
locking like this. It is typical (and OCFS2 used to do this) to wait
for the DLM locks up in file->{read,write} and pin them for the duration
of the IO. This avoids the page lock and DLM lock inversion problem,
but it suffers from a host of other problems -- most fatally needing
that vma walking to govern holding multiple DLM locks during an IO.

Hmm, I managed to dig up a console log of a sysrq-T of one of the dlm
threads stuck in this state.. maybe it clarifies a little. I can't seem
to find the matching read state, but it should be more comfortable..
ocfs2_readpage really is the first significant ocfs2 call in that path
and the first thing it does is ask the DLM for a read lock.

[<c035a46e>] __wait_on_bit_lock+0x5e/0x70
[<c0147934>] __lock_page+0x84/0x90
[<c0154e47>] truncate_inode_pages_range+0x207/0x320
[<c0154f91>] truncate_inode_pages+0x31/0x40
[<d0f55978>] ocfs2_data_convert_worker+0x148/0x260 [ocfs2]
[<d0f55540>] ocfs2_generic_unblock_lock+0x90/0x380 [ocfs2]
[<d0f55b11>] ocfs2_unblock_data+0x81/0x360 [ocfs2]
[<d0f5674d>] ocfs2_process_blocked_lock+0x9d/0x3f0 [ocfs2]
[<d0f8b089>] ocfs2_vote_thread_do_work+0xa9/0x270 [ocfs2]
[<d0f8b36f>] ocfs2_vote_thread+0x5f/0x170 [ocfs2]
[<c0137c9a>] kthread+0xba/0xc0

I guess all that really illustrates is lots of funky DLM internals and
then truncate_inode_pages(). :)

Does this help?

- z

2005-10-18 01:25:09

by Andrew Morton

Subject: Re: [RFC] page lock ordering and OCFS2

Zach Brown <[email protected]> wrote:
>
> > So where is the lock inversion?
> >
> > Perhaps if you were to cook up one of those little threadA/threadB ascii
> > diagrams we could see where the inversion occurs?
>
> Yeah, let me give that a try. I'll try to trim it down to the relevant
> bits. First let's start with a totally fresh node and have a read get a
> read DLM lock and populate the page cache on this node:
>
> sys_read
> generic_file_aio_read
> ocfs2_readpage
> ocfs2_data_lock
> block_read_full_page
> ocfs2_data_unlock
>
> So it was only allowed to proceed past ocfs2_data_lock() towards
> block_read_full_page() once the DLM granted it a read lock. As it calls
> ocfs2_data_unlock() it is only dropping this caller's local reference on
> the lock. The lock still exists on that node and is still valid and
> holding data in the page cache until it gets a network message saying
> that another node, who is probably going to be writing, would like the
> lock dropped.
>
> DLM kernel threads respond to the network messages and truncate the page
> cache. While the thread is busy with this inode's lock, other paths on
> that node won't be able to get locks. Say one of those messages arrives.
> While a local DLM thread is invalidating the page cache another user
> thread tries to read:
>
> user thread dlm thread
>
>
> kthread
> ...
> ocfs2_data_convert_worker

I assume there's an ocfs2_data_lock
hereabouts?

> truncate_inode_pages
> sys_read
> generic_file_aio_read
> * gets page lock
> ocfs2_readpage
> ocfs2_data_lock
> (stuck waiting for dlm)
> lock_page
> (stuck waiting for page)
>

Why does ocfs2_readpage() need to take ocfs2_data_lock? (Is
ocfs2_data_lock a range-based read-lock thing, or what?)

> The user task holds a page lock while waiting for the DLM to allow it to
> proceed. The DLM thread is preventing lock granting progress while
> waiting for the page lock that the user task holds.
>
> I don't know how far to go in explaining what leads up to laying out the
> locking like this. It is typical (and OCFS2 used to do this) to wait
> for the DLM locks up in file->{read,write} and pin them for the duration
> of the IO. This avoids the page lock and DLM lock inversion problem,
> but it suffers from a host of other problems -- most fatally needing
> that vma walking to govern holding multiple DLM locks during an IO.

Oh.

Have you considered using invalidate_inode_pages() instead of
truncate_inode_pages()? If that leaves any pages behind, drop the read
lock, sleep a bit, try again - something klunky like that might get you out
of trouble, dunno.
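
Roughly, and I haven't thought it through (drop_lock()/retake_lock() are
made-up names for however ocfs2 would release and reacquire its DLM lock):

	for (;;) {
		invalidate_inode_pages(mapping);
		if (!mapping->nrpages)		/* everything went away */
			break;
		drop_lock(inode);		/* hypothetical */
		msleep(10);
		retake_lock(inode);		/* hypothetical */
	}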


2005-10-18 08:24:07

by Anton Altaparmakov

Subject: Re: [RFC] page lock ordering and OCFS2

On Mon, 17 Oct 2005, Andrew Morton wrote:
> Zach Brown <[email protected]> wrote:
> >
> > > So where is the lock inversion?
> > >
> > > Perhaps if you were to cook up one of those little threadA/threadB ascii
> > > diagrams we could see where the inversion occurs?
> >
> > Yeah, let me give that a try. I'll try to trim it down to the relevant
> > bits. First let's start with a totally fresh node and have a read get a
> > read DLM lock and populate the page cache on this node:
> >
> > sys_read
> > generic_file_aio_read
> > ocfs2_readpage
> > ocfs2_data_lock
> > block_read_full_page
> > ocfs2_data_unlock
> >
> > So it was only allowed to proceed past ocfs2_data_lock() towards
> > block_read_full_page() once the DLM granted it a read lock. As it calls
> > ocfs2_data_unlock() it is only dropping this caller's local reference on
> > the lock. The lock still exists on that node and is still valid and
> > holding data in the page cache until it gets a network message saying
> > that another node, who is probably going to be writing, would like the
> > lock dropped.
> >
> > DLM kernel threads respond to the network messages and truncate the page
> > cache. While the thread is busy with this inode's lock, other paths on
> > that node won't be able to get locks. Say one of those messages arrives.
> > While a local DLM thread is invalidating the page cache another user
> > thread tries to read:
> >
> > user thread dlm thread
> >
> >
> > kthread
> > ...
> > ocfs2_data_convert_worker
>
> I assume there's an ocfs2_data_lock
> hereabouts?
>
> > truncate_inode_pages
> > sys_read
> > generic_file_aio_read
> > * gets page lock
> > ocfs2_readpage
> > ocfs2_data_lock
> > (stuck waiting for dlm)
> > lock_page
> > (stuck waiting for page)
> >
>
> Why does ocfs2_readpage() need to take ocfs2_data_lock? (Is
> ocfs2_data_lock a range-based read-lock thing, or what?)

If I get this right, then they need it to guarantee that no one is writing
to that file offset on a different node at the same time; otherwise this
readpage will see stale data.

What I would ask is why does the above dlm thread need to hold the
data_lock during truncate_inode_pages? Just take it _after_
truncate_inode_pages() and check that no one instantiated any pages in
between truncate_inode_pages and the data_lock acquisition. If there are no
pages, all is great; if there are some, drop the data_lock, cond_resched(),
and repeat truncate_inode_pages, etc. Eventually it will succeed. And no
need for the nasty VFS patch you are proposing...
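
In code, something like this (an untested sketch; data_lock()/data_unlock()
stand in for the real ocfs2 calls):

	for (;;) {
		truncate_inode_pages(mapping, 0);
		data_lock(inode);		/* hypothetical stand-in */
		if (!mapping->nrpages)		/* nothing snuck back in */
			break;
		data_unlock(inode);		/* hypothetical stand-in */
		cond_resched();
	}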

> > The user task holds a page lock while waiting for the DLM to allow it to
> > proceed. The DLM thread is preventing lock granting progress while
> > waiting for the page lock that the user task holds.
> >
> > I don't know how far to go in explaining what leads up to laying out the
> > locking like this. It is typical (and OCFS2 used to do this) to wait
> > for the DLM locks up in file->{read,write} and pin them for the duration
> > of the IO. This avoids the page lock and DLM lock inversion problem,
> > but it suffers from a host of other problems -- most fatally needing
> > that vma walking to govern holding multiple DLM locks during an IO.
>
> Oh.
>
> Have you considered using invalidate_inode_pages() instead of
> truncate_inode_pages()? If that leaves any pages behind, drop the read
> lock, sleep a bit, try again - something klunky like that might get you out
> of trouble, dunno.

Or my suggestion above. I think mine has a higher chance of needing less
taking/dropping of the data_lock, as at the end of the truncate there will be
no pages left, unless there is an overeager read process at work on that
mapping at the same time.

Best regards,

Anton
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2005-10-18 17:14:55

by Zach Brown

Subject: Re: [RFC] page lock ordering and OCFS2

Andrew Morton wrote:

>>user thread dlm thread
>>
>>
>> kthread
>> ...
>> ocfs2_data_convert_worker
>
>
> I assume there's an ocfs2_data_lock
> hereabouts?

No. The thread is working behind the scenes to bring the lock to a
state where others can get the lock again. It doesn't explicitly hold
the lock like regular fs paths do, but no one else can get the lock
until it finishes its work.

A wild over-simplification of the logic goes something like this:

dlm_lock(lock) {
wait_event(, !lock->blocked);
atomic_inc(&lock->ref);
}

dlm_unlock(lock) {
if (atomic_dec_and_test(&lock->ref)) {
wake_worker_thread()
}
}

incoming_drop_message() {
lock->blocked = 1;
wake_worker_thread();
}

worker_thread() {
wait_event(, atomic_read(&lock->ref) == 0);
truncate_inode_pages();
lock->blocked = 0;
}

It's much worse than that, but that's the basic idea for our problem
here. A network message comes in which forbids additional acquiry of
the lock until all the current holders have released it and the
worker thread has truncated the page cache.

So in our inversion ocfs2_readpage() holds a page lock while waiting in
dlm_lock() for ->blocked to be clear, but worker_thread() can't clear
->blocked until it locks and truncates the page that ocfs2_readpage() holds.

> Have you considered using invalidate_inode_pages() instead of
> truncate_inode_pages()? If that leaves any pages behind, drop the read
> lock, sleep a bit, try again - something klunky like that might get you out
> of trouble, dunno.

Yeah, that doesn't work because the locked pages aren't making forward
progress. In the read case it would be fine to just skip them; they're
about to be re-read once ocfs2_readpage() gets its DLM lock. But we'd
have to strongly identify locked pages waiting in ocfs2_readpage(). And
in the write case the DLM thread is responsible for writing those pages back
before releasing the lock so that other nodes can read the pages in.

We think we could do this sort of thing with a specific truncation loop,
but that's the nasty code I wasn't sure would be any more acceptable
than this nasty core patch. The truncation loop would need a way to
identify locked pages that it can just ignore or initiate writeback on
because it'd know that it's just another ocfs2 path that holds the page
lock while waiting for a dlm lock. I wonder if we could just do it with
PG_fs_misc. I can give it a try if that doesn't send shivers down
everyone's spine.

- z

2005-10-18 17:26:05

by Zach Brown

Subject: Re: [RFC] page lock ordering and OCFS2

Anton Altaparmakov wrote:

> What I would ask is why does the above dlm thread need to hold the
> data_lock during truncate_inode_pages?

I hope the mail I just sent made that a little more clear.

> and repeat truncate_inode_pages, etc. Eventually it will succeed. And no
> need for nasty VFS patch you are proposing...

Yeah, this also came to me this morning in the shower :) There are some
hard cases because these are actually read-write locks, but it might be
doable. We're discussing it.

> no pages left, unless there is an overeager read process at work on that
> mapping at the same time.

I fear that it'll be pretty easy to get bad capture effects, but maybe
that's ok. We'll see.

- z

2005-10-21 17:45:47

by Zach Brown

Subject: Re: [RFC] page lock ordering and OCFS2


> We think we could do this sort of thing with a specific truncation loop,
> but that's the nasty code I wasn't sure would be any more acceptable
> than this nasty core patch.

OK, I'm appending a patch which attempts to solve this problem in OCFS2 code
instead of up in the VFS. It's against a branch in OCFS2 svn, but one which
isn't incomprehensibly different from what is in -mm.

The idea is to stop the DLM thread from deadlocking on locked pages (which are
held by other tasks waiting for the DLM) by tagging them with PG_fs_misc. The
DLM thread knows that such pages are locked by callers who are waiting for the
thread, so it can assume ownership of the page lock, of a sort. The DLM
thread writes back dirty data in the page manually so it doesn't unlock the
caller's page. It can basically ignore clean pages because they're going to be
reread by the lock holder once it gets the DLM lock. Comments in the patch
above ocfs2_data_convert_worker, get_lock_or_fs_misc,
ocfs2_write_dirty_fs_misc_page, and ocfs2_set_fs_misc lay this out.

It introduces block_read_full_page() and truncate_inode_pages() derivatives
which understand the PG_fs_misc special case. It needs a few export patches to
the core, but the real burden is on OCFS2 to keep these derivatives up to date.
Maybe after they stabilize for a while, and say another clustered file system
has similar problems, we could roll some core helpers out of them. But for now
it illustrates what the solution would look like in OCFS2. I'm sure I missed
some corner cases, but this passes the most basic functional testing.

The specific exports it needs from 2.6.14-rc4-mm1 are:

$ grep '+EXPORT' patches/*.patch
patches/add-wake_up_page_all.patch:+EXPORT_SYMBOL(__wake_up_bit_all);
patches/add-wake_up_page_all.patch:+EXPORT_SYMBOL(wake_up_page_all);
patches/export-pagevec-helpers.patch:+EXPORT_SYMBOL_GPL(pagevec_lookup);
patches/export-page_waitqueue.patch:+EXPORT_SYMBOL_GPL(page_waitqueue);
patches/export-truncate_complete_pate.patch:+EXPORT_SYMBOL(truncate_complete_page);
patches/export-wake_up_page.patch:+EXPORT_SYMBOL(wake_up_page);

Note that wake_up_page_all() is just a variant that provides 0 nr_exclusive to
__wake_up_bit():

-void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+static inline int __wake_up_bit_nr(wait_queue_head_t *wq, void *word, int bit,
+ int nr_exclusive)
{
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
if (waitqueue_active(wq))
- __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, &key);
+ __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE,
+ nr_exclusive, &key);
+}
+
+void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+{
+ __wake_up_bit_nr(wq, word, bit, 1);
}
EXPORT_SYMBOL(__wake_up_bit);

+void fastcall __wake_up_bit_all(wait_queue_head_t *wq, void *word, int bit)
+{
+ __wake_up_bit_nr(wq, word, bit, 0);
+}
+EXPORT_SYMBOL(__wake_up_bit_all);

Is this preferable to the core changes and is it something that's mergeable?
We'd love to come to a solution that won't be a barrier to merging so we can
get on with it. I can send that exporting series if we decide this is the
right thing.

- z

Index: fs/ocfs2/dlmglue.h
===================================================================
--- fs/ocfs2/dlmglue.h (revision 2660)
+++ fs/ocfs2/dlmglue.h (working copy)
@@ -54,6 +54,8 @@ int ocfs2_create_new_inode_locks(struct
int ocfs2_drop_inode_locks(struct inode *inode);
int ocfs2_data_lock(struct inode *inode,
int write);
+int ocfs2_data_lock_holding_page(struct inode *inode, int write,
+ struct page *page);
void ocfs2_data_unlock(struct inode *inode,
int write);
int ocfs2_rw_lock(struct inode *inode, int write);
@@ -70,6 +72,11 @@ int ocfs2_meta_lock_full(struct inode *i
/* 99% of the time we don't want to supply any additional flags --
* those are for very specific cases only. */
#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
+int ocfs2_meta_lock_holding_page(struct inode *inode,
+ ocfs2_journal_handle *handle,
+ struct buffer_head **ret_bh,
+ int ex,
+ struct page *page);
void ocfs2_meta_unlock(struct inode *inode,
int ex);
int ocfs2_super_lock(ocfs2_super *osb,
Index: fs/ocfs2/aops.c
===================================================================
--- fs/ocfs2/aops.c (revision 2661)
+++ fs/ocfs2/aops.c (working copy)
@@ -130,8 +130,8 @@ bail:
return err;
}

-static int ocfs2_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
int err = 0;
u64 p_blkno, past_eof;
@@ -198,7 +198,7 @@ static int ocfs2_readpage(struct file *f

mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));

- ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ ret = ocfs2_meta_lock_holding_page(inode, NULL, NULL, 0, page);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -226,7 +226,7 @@ static int ocfs2_readpage(struct file *f
goto out_alloc;
}

- ret = ocfs2_data_lock(inode, 0);
+ ret = ocfs2_data_lock_holding_page(inode, 0, page);
if (ret < 0) {
mlog_errno(ret);
goto out_alloc;
@@ -283,7 +283,7 @@ int ocfs2_prepare_write(struct file *fil

mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);

- ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ ret = ocfs2_meta_lock_holding_page(inode, NULL, NULL, 0, page);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -397,13 +397,14 @@ static int ocfs2_commit_write(struct fil
locklevel = 1;
}

- ret = ocfs2_meta_lock(inode, NULL, &di_bh, locklevel);
+ ret = ocfs2_meta_lock_holding_page(inode, NULL, &di_bh, locklevel,
+ page);
if (ret < 0) {
mlog_errno(ret);
goto out;
}

- ret = ocfs2_data_lock(inode, 1);
+ ret = ocfs2_data_lock_holding_page(inode, 1, page);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock_meta;
Index: fs/ocfs2/aops.h
===================================================================
--- fs/ocfs2/aops.h (revision 2660)
+++ fs/ocfs2/aops.h (working copy)
@@ -22,6 +22,8 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H

+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ocfs2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to);

Index: fs/ocfs2/dlmglue.c
===================================================================
--- fs/ocfs2/dlmglue.c (revision 2660)
+++ fs/ocfs2/dlmglue.c (working copy)
@@ -30,6 +30,8 @@
#include <linux/smp_lock.h>
#include <linux/crc32.h>
#include <linux/kthread.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>

#include <cluster/heartbeat.h>
#include <cluster/nodemanager.h>
@@ -43,6 +45,7 @@
#include "ocfs2.h"

#include "alloc.h"
+#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
@@ -113,9 +116,6 @@ static struct ocfs2_lock_res_ops ocfs2_i
.unblock = ocfs2_unblock_meta,
};

-static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
- int blocking);
-
static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
.ast = ocfs2_inode_ast_func,
.bast = ocfs2_inode_bast_func,
@@ -1154,6 +1154,43 @@ out:
return status;
}

+/*
+ * The use of PG_fs_misc here is working around a lock ordering inversion.
+ * We're indicating to the DLM vote thread (the thread which invalidates the
+ * page cache of inodes who are releasing their locks to other nodes) that
+ * we're about to sleep on the DLM while holding a page lock. See
+ * ocfs2_data_convert_worker().
+ */
+static void ocfs2_set_fs_misc(struct page *page)
+{
+ BUG_ON(PageFsMisc(page));
+ SetPageFsMisc(page);
+ mb();
+ wake_up_page_all(page, PG_locked);
+}
+
+static void ocfs2_clear_fs_misc(struct page *page)
+{
+ wait_event(*page_waitqueue(page), PageFsMisc(page));
+ ClearPageFsMisc(page);
+ smp_mb__after_clear_bit();
+ /* we'll also be unlocking the page so we don't need to wake
+ * the page waitqueue here */
+}
+
+int ocfs2_data_lock_holding_page(struct inode *inode,
+ int write,
+ struct page *page)
+{
+ int ret;
+
+ ocfs2_set_fs_misc(page);
+ ret = ocfs2_data_lock(inode, write);
+ ocfs2_clear_fs_misc(page);
+
+ return ret;
+}
+
static void ocfs2_vote_on_unlock(ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
@@ -1597,6 +1634,21 @@ bail:
return status;
}

+int ocfs2_meta_lock_holding_page(struct inode *inode,
+ ocfs2_journal_handle *handle,
+ struct buffer_head **ret_bh,
+ int ex,
+ struct page *page)
+{
+ int ret;
+
+ ocfs2_set_fs_misc(page);
+ ret = ocfs2_meta_lock(inode, handle, ret_bh, ex);
+ ocfs2_clear_fs_misc(page);
+
+ return ret;
+}
+
void ocfs2_meta_unlock(struct inode *inode,
int ex)
{
@@ -2297,32 +2349,209 @@ leave:
return ret;
}

+/*
+ * this is a hand-rolled block_read_full_page() which takes our specific
+ * situation into account -- it doesn't unlock the page nor change its state
+ * bits in the radix tree. This page has already had its dirty bit cleared by
+ * the caller so we want to clear the dirty state in the buffers, too.
+ *
+ * It doesn't have to zero pages that straddle i_size because we can't dirty
+ * pages for writeback from mmap. ocfs2_prepare_write() has already made sure
+ * that partially written pages contain zeros.
+ *
+ * Truncate can't race with this -- it'll get stuck waiting for a DLM lock.
+ */
+static void ocfs2_write_dirty_fs_misc_page(struct page *page)
+{
+ struct buffer_head *bh, *head;
+ struct inode *inode = page->mapping->host;
+ sector_t block;
+
+ block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ BUG_ON(!page_has_buffers(page));
+ head = page_buffers(page);
+
+ bh = head;
+ do {
+ if (!buffer_mapped(bh)) {
+ int err = ocfs2_get_block(inode, block, bh, 1);
+ BUG_ON(err);
+ if (buffer_new(bh)) {
+ clear_buffer_new(bh);
+ unmap_underlying_metadata(bh->b_bdev,
+ bh->b_blocknr);
+ }
+ }
+ bh = bh->b_this_page;
+ block++;
+ } while (bh != head);
+
+ /* start io */
+ do {
+ lock_buffer(bh);
+ BUG_ON(buffer_jbd(bh)); /* not sure.. */
+ /* clear dirty to match having cleared dirty on the page
+ * in the caller */
+ clear_buffer_dirty(bh);
+ bh->b_end_io = end_buffer_write_sync;
+ get_bh(bh);
+ submit_bh(WRITE, bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ block_sync_page(page);
+
+ /* wait on it */
+ do {
+ wait_on_buffer(bh);
+ } while ((bh = bh->b_this_page) != head);
+}
+
+
+/*
+ * This returns when either we get the page lock or we clear PG_fs_misc on
+ * behalf of an ocfs2 thread that is heading to block in the DLM. If we have
+ * the page lock we just unlock it as usual. If we've cleared the misc bit
+ * then that ocfs2 path is going to wait for us to set it again when we're
+ * done. It's like that ocfs2 path has transferred ownership of the page lock
+ * to us temporarily.
+ */
+static inline int get_lock_or_fs_misc(struct page *page, int *have_fs_misc)
+{
+ if (!TestSetPageLocked(page)) {
+ *have_fs_misc = 0;
+ return 1;
+ }
+
+ if (TestClearPageFsMisc(page)) {
+ *have_fs_misc = 1;
+ return 1;
+ }
+
+ return 0;
+}
+
+static void ocfs2_lock_page_or_fs_misc(struct page *page, int *have_fs_misc)
+{
+ wait_event(*page_waitqueue(page), get_lock_or_fs_misc(page,
+ have_fs_misc));
+}
+
+static void ocfs2_return_fs_misc(struct page *page)
+{
+ BUG_ON(PageFsMisc(page));
+ SetPageFsMisc(page);
+ mb();
+ wake_up_page_all(page, PG_fs_misc);
+}
+
+/*
+ * This is called in the context of a DLM helper thread that is operating on a
+ * lock that is converting between two modes. While it is converting there are
+ * no local holders of the lock and local lock acquiry is blocking waiting for
+ * it to finish. If a write lock is being dropped so that another node can
+ * acquire a read lock then dirty data has to be written so that other nodes
+ * who later acquire the read lock can read in data. If a read lock is being
+ * dropped so that another node can acquire a write lock then the page cache
+ * has to be emptied entirely so that read requests aren't satisfied from the
+ * cache once the other node's write starts.
+ *
+ * There is a lock ordering inversion here. Some FS paths will be holding a
+ * page lock while they wait for a DLM lock. This path will be blocking local
+ * DLM lock acquiry while it is doing its page cache work which involves
+ * waiting for locked pages. This is worked around by having those FS paths
+ * set PG_fs_misc on the pages they hold the lock on before they block waiting
+ * for the DLM. When this path sees a page like this we use PG_fs_misc to sort
+ * of transfer ownership of the page lock to us temporarily. We leave clean
+ * pages alone -- they're not uptodate and their owner is going to be reading
+ * into them once they get their DLM locks. We have to write back dirty pages,
+ * though, so that other nodes can read them.
+ *
+ * XXX I think these pagevec loops are safe as index approaches ~0 because
+ * radix_tree_gang_lookup() returns if the index wraps, but I might not be
+ * reading the code right.
+ */
static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
int blocking)
{
- struct inode *inode;
- struct address_space *mapping;
+ struct inode *inode = ocfs2_lock_res_inode(lockres);
+ struct address_space *mapping = inode->i_mapping;
+ int old_level = lockres->l_level;
+ pgoff_t index;
+ struct pagevec pvec;
+ unsigned i;

mlog_entry_void();

- inode = ocfs2_lock_res_inode(lockres);
- mapping = inode->i_mapping;
+ /* we're betting that we don't need to call sync_mapping_buffers(),
+ * having never called mark_buffer_dirty_inode() */
+ BUG_ON(!mapping->assoc_mapping && !list_empty(&mapping->private_list));
+
+ /* first drop all mappings of our pages. When we have shared
+ * write this will introduce dirty pages that we'll want to
+ * writeback below */
+ if (blocking == LKM_EXMODE)
+ unmap_mapping_range(mapping, 0, 0, 0);

- if (filemap_fdatawrite(mapping)) {
- mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!",
- OCFS2_I(inode)->ip_blkno);
+ blk_run_address_space(mapping);
+
+ /* start writeback on dirty pages if we're converting from a lock that
+ * could have dirtied pages. */
+ pagevec_init(&pvec, 0);
+ index = 0;
+ while (old_level == LKM_EXMODE &&
+ pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
+ PAGEVEC_SIZE)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ int have_fs_misc;
+ struct page *page = pvec.pages[i];
+
+ ocfs2_lock_page_or_fs_misc(page, &have_fs_misc);
+
+ if (have_fs_misc) {
+ /* we can't wait on or unlock this page, but we
+ * need to write it out */
+ if (test_clear_page_dirty(page))
+ ocfs2_write_dirty_fs_misc_page(page);
+ ocfs2_return_fs_misc(page);
+ continue;
+ }
+
+ if (PageDirty(page))
+ write_one_page(page, 0);
+ else
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
}
- sync_mapping_buffers(mapping);
- if (blocking == LKM_EXMODE) {
- truncate_inode_pages(mapping, 0);
- unmap_mapping_range(mapping, 0, 0, 0);
- } else {
- /* We only need to wait on the I/O if we're not also
- * truncating pages because truncate_inode_pages waits
- * for us above. We don't truncate pages if we're
- * blocking anything < EXMODE because we want to keep
- * them around in that case. */
+
+ /* wait for io */
+ if (old_level == LKM_EXMODE)
filemap_fdatawait(mapping);
+
+ /* now remove all pages if we're blocking exclusive and can't have any
+ * remaining in our page cache */
+ index = 0;
+ while (blocking == LKM_EXMODE &&
+ pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ int have_fs_misc;
+ index = page->index + 1;
+
+ ocfs2_lock_page_or_fs_misc(page, &have_fs_misc);
+
+ if (have_fs_misc) {
+ ocfs2_return_fs_misc(page);
+ continue;
+ }
+
+ truncate_complete_page(mapping, page);
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
}

mlog_exit_void();

2005-10-21 17:57:35

by Christoph Hellwig

Subject: Re: [RFC] page lock ordering and OCFS2

On Fri, Oct 21, 2005 at 10:43:24AM -0700, Zach Brown wrote:
> It introduces block_read_full_page() and truncate_inode_pages() derivatives
> which understand the PG_fs_misc special case. It needs a few export patches to
> the core, but the real burden is on OCFS2 to keep these derivatives up to date.

The way you do it looks nice, but the exports are a big no-no. That
stuff is far too internal to be exported. Either we can get Andrew to
agree on moving those bits into the codepath for all filesystems, or
we need to do some hackery where every function gets renamed to __function
with an int cluster_aware argument and we have two functions inlining them,
one normal and one for cluster filesystems.

2005-10-21 18:03:36

by Andrew Morton

Subject: Re: [RFC] page lock ordering and OCFS2

Zach Brown <[email protected]> wrote:
>
> The specific exports it needs from 2.6.14-rc4-mm1 are:
>
> $ grep '+EXPORT' patches/*.patch
> patches/add-wake_up_page_all.patch:+EXPORT_SYMBOL(__wake_up_bit_all);
> patches/add-wake_up_page_all.patch:+EXPORT_SYMBOL(wake_up_page_all);
> patches/export-pagevec-helpers.patch:+EXPORT_SYMBOL_GPL(pagevec_lookup);
> patches/export-page_waitqueue.patch:+EXPORT_SYMBOL_GPL(page_waitqueue);
> patches/export-truncate_complete_pate.patch:+EXPORT_SYMBOL(truncate_complete_page);
> patches/export-wake_up_page.patch:+EXPORT_SYMBOL(wake_up_page);

Exporting page_waitqueue seems wrong. Might be better to add a core
function to do the wait_event(*page_waitqueue(page), PageFsMisc(page)); and
export that.
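
Something like this, say (the helper name is invented here):

	void wait_on_page_fs_misc(struct page *page)
	{
		wait_event(*page_waitqueue(page), PageFsMisc(page));
	}
	EXPORT_SYMBOL_GPL(wait_on_page_fs_misc);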

How did you come up with this mix of GPL and non-GPL?

> Note that wake_up_page_all() is just a variant that provides 0 nr_exclusive to
> __wake_up_bit():
>
> -void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
> +static inline int __wake_up_bit_nr(wait_queue_head_t *wq, void *word, int bit,
> + int nr_exclusive)
> {
> struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
> if (waitqueue_active(wq))
> - __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, &key);
> + __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE,
> + nr_exclusive, &key);
> +}
> +
> +void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
> +{
> + __wake_up_bit_nr(wq, word, bit, 1);
> }
> EXPORT_SYMBOL(__wake_up_bit);
>
> +void fastcall __wake_up_bit_all(wait_queue_head_t *wq, void *word, int bit)
> +{
> + __wake_up_bit_nr(wq, word, bit, 0);
> +}
> +EXPORT_SYMBOL(__wake_up_bit_all);
>
> Is this preferable to the core changes and is it something that's mergeable?
> We'd love to come to a solution that won't be a barrier to merging so we can
> get on with it. I can send that exporting series if we decide this is the
> right thing.

The above looks sane enough. Please run it by Bill?

2005-10-21 20:33:13

by Zach Brown

Subject: Re: [RFC] page lock ordering and OCFS2


>> patches/add-wake_up_page_all.patch:+EXPORT_SYMBOL(__wake_up_bit_all);
>> patches/add-wake_up_page_all.patch:+EXPORT_SYMBOL(wake_up_page_all);
>> patches/export-pagevec-helpers.patch:+EXPORT_SYMBOL_GPL(pagevec_lookup);
>> patches/export-page_waitqueue.patch:+EXPORT_SYMBOL_GPL(page_waitqueue);
>> patches/export-truncate_complete_pate.patch:+EXPORT_SYMBOL(truncate_complete_page);
>> patches/export-wake_up_page.patch:+EXPORT_SYMBOL(wake_up_page);
>
>
> Exporting page_waitqueue seems wrong. Might be better to add a core
> function to do the wait_event(*page_waitqueue(page), PageFsMisc(page)); and
> export that.

Sure thing.

> How did you come up with this mix of GPL and non-GPL?

Carelessness. I'll aim for _GPL for anything that still needs to be exported.

> The above looks sane enough. Please run it by Bill?

OK.

- z

2005-10-21 20:36:15

by Zach Brown

Subject: Re: [RFC] page lock ordering and OCFS2

Christoph Hellwig wrote:
> On Fri, Oct 21, 2005 at 10:43:24AM -0700, Zach Brown wrote:
>
>>It introduces block_read_full_page() and truncate_inode_pages() derivatives
>>which understand the PG_fs_misc special case. It needs a few export patches to
>>the core, but the real burden is on OCFS2 to keep these derivatives up to date.
>
> The way you do it looks nice, but the exports are a big no-way. That
> stuff is far too internal to be exported. Either we can get Andrew to
> agree on moving those bits into the codepath for all filesystems or
> we need to do some hackery where every function gets renamed to __function
> with an argument int cluster_aware, and we have two functions inlining it,
> one normal and one for cluster filesystems.

Yeah, I can certainly appreciate that line of reasoning. I'm happy to do that
work, but it'd be nice to get some assurance that it won't be wasted effort.
Andrew, is this a reasonable direction to take things in? We'd avoid the
exports by introducing some wrappers and helpers to the core that OCFS2 would
call..

- z

2005-10-21 21:04:21

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC] page lock ordering and OCFS2

Zach Brown <[email protected]> wrote:
>
> Christoph Hellwig wrote:
> > On Fri, Oct 21, 2005 at 10:43:24AM -0700, Zach Brown wrote:
> >
> >>It introduces block_read_full_page() and truncate_inode_pages() derivatives
> >>which understand the PG_fs_misc special case. It needs a few export patches to
> >>the core, but the real burden is on OCFS2 to keep these derivatives up to date.
> >
> > The way you do it looks nice, but the exports are a big no-way. That
> > stuff is far too internal to be exported. Either we can get Andrew to
> > agree on moving those bits into the codepath for all filesystems or
> > we need to do some hackery where every function gets renamed to __function
> > with an argument int cluster_aware, and we have two functions inlining it,
> > one normal and one for cluster filesystems.
>
> Yeah, I can certainly appreciate that line of reasoning. I'm happy to do that
> work, but it'd be nice to get some assurance that it won't be wasted effort.
> Andrew, is this a reasonable direction to take things in? We'd avoid the
> exports by introducing some wrappers and helpers to the core that OCFS2 would
> call..

It depends on what the patch ends up looking like I guess.

All those games with PG_fs_misc look awfully similar to lock_page() - I'd
have thought there's some room for rationalising code in there.

The overall approach would be to avoid adding overhead and complexity for
other filesystems and to only export symbols which constitute a sensible
API: avoid exporting weirdo internal helpers which we might change in the
future (but we don't care about external modules, so why does this matter?
Because some of them are GPL?)

2005-10-21 21:58:39

by Zach Brown

[permalink] [raw]
Subject: Re: [RFC] page lock ordering and OCFS2


> All those games with PG_fs_misc look awfully similar to lock_page() - I'd
> have thought there's some room for rationalising code in there.

Yeah, it's very much like a secondary, restricted page lock. Instead of
sleeping when you can't get the page lock, you can also try acquiring this
secondary page lock bit, which only lets you do *very* specific things. Maybe
it's not *that* weird; I just didn't think the core would be interested in
supporting that notion. I can try rolling a patch to see what a more sensible
API would look like.
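
In code, the secondary lock amounts to something like this -- a sketch assuming the usual test-and-set accessors exist for PG_fs_misc; the two function names are illustrative:

/* try to take the secondary lock; returns nonzero on success */
static int fs_misc_trylock_page(struct page *page)
{
	return !TestSetPageFsMisc(page);
}

static void fs_misc_unlock_page(struct page *page)
{
	ClearPageFsMisc(page);
	smp_mb__after_clear_bit();
	wake_up_page(page, PG_fs_misc);
}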

> The overall approach would be to avoid adding overhead and complexity for
> other filesystems and to only export symbols which constitute a sensible
> API:

*nod*

- z

2005-10-25 00:04:34

by Zach Brown

[permalink] [raw]
Subject: Re: [RFC] page lock ordering and OCFS2

Christoph Hellwig wrote:
>
> The way you do it looks nice, but the exports are a big no-way. That
> stuff is far too internal to be exported.

Yeah, understood.

So here's an entirely different approach. We add a return value to readpage,
prepare_write, and commit_write in the style of WRITEPAGE_ACTIVATE. It tells
the callers that the locked page they handed to the op has been unlocked, and
that they should back off and try again. This lets the ocfs2 methods notice
when they're about to sleep on the DLM and unlock the page before doing so.

This compiles but hasn't been tested.
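
Roughly what an ocfs2 ->readpage() could do with the new return code -- a sketch of the idea only; ocfs2_try_dlm_lock(), ocfs2_wait_for_dlm_lock() and ocfs2_drop_dlm_lock() are made-up stand-ins for a nonblocking cluster lock attempt and its blocking counterpart, and the rest just follows the usual buffer-head read path:

static int ocfs2_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int ret;

	ret = ocfs2_try_dlm_lock(inode);	/* must not sleep */
	if (ret == -EAGAIN) {
		/*
		 * Sleeping on the DLM with PG_locked held is the
		 * inversion we're avoiding: drop the page lock, take
		 * the DLM lock the slow way, and make the caller retry.
		 */
		unlock_page(page);
		ocfs2_wait_for_dlm_lock(inode);
		ocfs2_drop_dlm_lock(inode);
		return AOP_TRUNCATED_PAGE;
	}
	if (ret) {
		unlock_page(page);
		return ret;
	}

	/* readpage convention: the read path unlocks the page */
	ret = block_read_full_page(page, ocfs2_get_block);
	ocfs2_drop_dlm_lock(inode);
	return ret;
}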

Index: 2.6.14-rc5-mm1-aop-truncated-page/include/linux/fs.h
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/include/linux/fs.h
+++ 2.6.14-rc5-mm1-aop-truncated-page/include/linux/fs.h
@@ -292,6 +292,30 @@ struct iattr {
*/
#include <linux/quota.h>

+/**
+ * enum positive_aop_returns - aop return codes with specific semantics
+ *
+ * @WRITEPAGE_ACTIVATE: IO was not started: activate page. Returned by
+ * writepage();
+ *
+ * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
+ * unlocked it and the page might have been truncated.
+ * The caller should back up to acquiring a new page and
+ * trying again. The aop will be taking reasonable
+ * precautions not to livelock. If the caller held a page
+ * reference, it should drop it before retrying. Returned
+ * by readpage(), prepare_write(), and commit_write().
+ *
+ * address_space_operation functions return these large constants to indicate
+ * special semantics to the caller. These are much larger than the byte
+ * count of a page, to allow for functions that return the number of bytes
+ * operated on in a given page.
+ */
+enum positive_aop_returns {
+ WRITEPAGE_ACTIVATE = 0x80000,
+ AOP_TRUNCATED_PAGE = 0x80001,
+};
+
/*
* oh the beauties of C type declarations.
*/
Index: 2.6.14-rc5-mm1-aop-truncated-page/include/linux/writeback.h
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/include/linux/writeback.h
+++ 2.6.14-rc5-mm1-aop-truncated-page/include/linux/writeback.h
@@ -60,12 +60,6 @@ struct writeback_control {
};

/*
- * ->writepage() return values (make these much larger than a pagesize, in
- * case some fs is returning number-of-bytes-written from writepage)
- */
-#define WRITEPAGE_ACTIVATE 0x80000 /* IO was not started: activate page */
-
-/*
* fs/fs-writeback.c
*/
void writeback_inodes(struct writeback_control *wbc);
Index: 2.6.14-rc5-mm1-aop-truncated-page/drivers/block/loop.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/drivers/block/loop.c
+++ 2.6.14-rc5-mm1-aop-truncated-page/drivers/block/loop.c
@@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_d
struct address_space_operations *aops = mapping->a_ops;
pgoff_t index;
unsigned offset, bv_offs;
- int len, ret = 0;
+ int len, ret;

down(&mapping->host->i_sem);
index = pos >> PAGE_CACHE_SHIFT;
@@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_d
page = grab_cache_page(mapping, index);
if (unlikely(!page))
goto fail;
- if (unlikely(aops->prepare_write(file, page, offset,
- offset + size)))
+ ret = aops->prepare_write(file, page, offset,
+ offset + size);
+ if (unlikely(ret)) {
+ if (ret == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ continue;
+ }
goto unlock;
+ }
transfer_result = lo_do_transfer(lo, WRITE, page, offset,
bvec->bv_page, bv_offs, size, IV);
if (unlikely(transfer_result)) {
@@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_d
kunmap_atomic(kaddr, KM_USER0);
}
flush_dcache_page(page);
- if (unlikely(aops->commit_write(file, page, offset,
- offset + size)))
+ ret = aops->commit_write(file, page, offset,
+ offset + size);
+ if (unlikely(ret)) {
+ if (ret == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ continue;
+ }
goto unlock;
+ }
if (unlikely(transfer_result))
goto unlock;
bv_offs += size;
@@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_d
unlock_page(page);
page_cache_release(page);
}
+ ret = 0;
out:
up(&mapping->host->i_sem);
return ret;
Index: 2.6.14-rc5-mm1-aop-truncated-page/mm/filemap.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/mm/filemap.c
+++ 2.6.14-rc5-mm1-aop-truncated-page/mm/filemap.c
@@ -853,8 +853,13 @@ readpage:
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);

- if (unlikely(error))
+ if (unlikely(error)) {
+ if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto find_page;
+ }
goto readpage_error;
+ }

if (!PageUptodate(page)) {
lock_page(page);
@@ -1174,26 +1179,24 @@ static int fastcall page_cache_read(stru
{
struct address_space *mapping = file->f_mapping;
struct page *page;
- int error;
+ int ret;

- page = page_cache_alloc_cold(mapping);
- if (!page)
- return -ENOMEM;
+ do {
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ return -ENOMEM;
+
+ ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ if (ret == 0)
+ ret = mapping->a_ops->readpage(file, page);
+ else if (ret == -EEXIST)
+ ret = 0; /* losing race to add is OK */

- error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
- if (!error) {
- error = mapping->a_ops->readpage(file, page);
page_cache_release(page);
- return error;
- }

- /*
- * We arrive here in the unlikely event that someone
- * raced with us and added our page to the cache first
- * or we are out of memory for radix-tree nodes.
- */
- page_cache_release(page);
- return error == -EEXIST ? 0 : error;
+ } while (ret == AOP_TRUNCATED_PAGE);
+
+ return ret;
}

#define MMAP_LOTSAMISS (100)
@@ -1353,10 +1356,14 @@ page_not_uptodate:
goto success;
}

- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}

/*
@@ -1380,10 +1387,14 @@ page_not_uptodate:
goto success;
}
ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}

/*
@@ -1466,10 +1477,14 @@ page_not_uptodate:
goto success;
}

- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}

/*
@@ -1492,10 +1507,14 @@ page_not_uptodate:
}

ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}

/*
@@ -1956,12 +1975,16 @@ generic_file_buffered_write(struct kiocb
status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
loff_t isize = i_size_read(inode);
+
+ if (status != AOP_TRUNCATED_PAGE)
+ unlock_page(page);
+ page_cache_release(page);
+ if (status == AOP_TRUNCATED_PAGE)
+ continue;
/*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again.
*/
- unlock_page(page);
- page_cache_release(page);
if (pos + bytes > isize)
vmtruncate(inode, isize);
break;
@@ -1975,6 +1998,10 @@ generic_file_buffered_write(struct kiocb
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (likely(copied > 0)) {
+ if (status == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ continue;
+ }
if (!status)
status = copied;

Index: 2.6.14-rc5-mm1-aop-truncated-page/mm/readahead.c
===================================================================
--- 2.6.14-rc5-mm1-aop-truncated-page.orig/mm/readahead.c
+++ 2.6.14-rc5-mm1-aop-truncated-page/mm/readahead.c
@@ -159,7 +159,7 @@ static int read_pages(struct address_spa
{
unsigned page_idx;
struct pagevec lru_pvec;
- int ret = 0;
+ int ret;

if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -172,14 +172,17 @@ static int read_pages(struct address_spa
list_del(&page->lru);
if (!add_to_page_cache(page, mapping,
page->index, GFP_KERNEL)) {
- mapping->a_ops->readpage(filp, page);
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
- } else {
- page_cache_release(page);
+ ret = mapping->a_ops->readpage(filp, page);
+ if (ret != AOP_TRUNCATED_PAGE) {
+ if (!pagevec_add(&lru_pvec, page))
+ __pagevec_lru_add(&lru_pvec);
+ continue;
+ }
}
+ page_cache_release(page);
}
pagevec_lru_add(&lru_pvec);
+ ret = 0;
out:
return ret;
}