2005-02-02 15:14:36

by Anton Altaparmakov

Subject: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

Hi,

On Thu, 27 Jan 2005, Andrew Morton wrote:
> Anton Altaparmakov <[email protected]> wrote:
> > What would you propose can I do to perform the required zeroing in a
> > deadlock safe manner whilst also ensuring that it cannot happen that a
> > concurrent ->readpage() will cause me to miss a page and thus end up
> > with non-initialized/random data on disk for that page?
>
> The only thing I can think of is to lock all the pages.
[snip discussion of how this can be done safely - for details see
Andrew's post with the subject "Re: Advice sought on how to lock multiple
pages in ->prepare_write and ->writepage"]

Below is a patch which adds a function,
mm/filemap.c::find_or_create_pages(), that locks a range of page cache
pages. Please see the function description in the patch for details.

Nathan, would this be useful to you?

Andrew, would this be acceptable? And does it look right from a safety
point of view? I have added the try locks as you suggested.

Note, I have only compile-tested this function, as the NTFS code
implementing hole filling is nowhere near ready to even test yet...
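For illustration, the intended calling pattern from a hypothetical
->prepare_write() that needs to instantiate a multi-page hole would be
something like this (sketch only and just as untested; NR_BLOCK_PAGES and
first_index_of_block() are made up):

        struct page *pages[NR_BLOCK_PAGES];
        pgoff_t start = first_index_of_block(page->index);
        int nr, err;

        /* @page is the locked page ->prepare_write() was given; no wbc. */
        err = find_or_create_pages(page->mapping, start, NR_BLOCK_PAGES,
                        pages, mapping_gfp_mask(page->mapping), page, NULL);
        if (unlikely(err))
                return err;
        /* ... zero the parts of the hole not being written to ... */
        for (nr = 0; nr < NR_BLOCK_PAGES; nr++) {
                if (pages[nr] != page) {
                        unlock_page(pages[nr]);
                        page_cache_release(pages[nr]);
                }
        }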

Best regards,

Anton
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

--- ntfs-2.6-devel/mm/filemap.c.old 2005-02-01 16:19:13.000000000 +0000
+++ ntfs-2.6-devel/mm/filemap.c 2005-02-02 15:06:57.158759886 +0000
@@ -586,6 +586,123 @@
 EXPORT_SYMBOL(find_or_create_page);
 
 /**
+ * find_or_create_pages - locate and/or add a number of page cache pages
+ * @mapping: address space in which to locate the pages
+ * @start: starting page index
+ * @nr_pages: number of pages to locate
+ * @pages: where the resulting pages are placed
+ * @gfp_mask: page allocation mode (for new pages)
+ * @locked_page: already locked page in the range of pages to lock
+ * @wbc: writeback control of @locked_page if applicable
+ *
+ * find_or_create_pages() locates @nr_pages starting at page index @start in
+ * the page cache described by the address space @mapping. The pages are
+ * placed in @pages. If any pages are not present in the page cache, new pages
+ * are allocated using @gfp_mask and are added to the page cache of @mapping as
+ * well as to the VM's LRU list. The returned pages are locked and have their
+ * reference count incremented. The locking is done in ascending page index
+ * order.
+ *
+ * @locked_page is a locked page in the requested range of pages. A reference
+ * is not acquired for @locked_page as the caller is assumed to hold one
+ * already. (Callers of ->prepare_write and ->writepage take a reference on
+ * the page.)
+ *
+ * @wbc is the writeback control of @locked_page if called from ->writepage and
+ * NULL otherwise (called from ->prepare_write()).
+ *
+ * Note, find_or_create_pages() may sleep, even if @gfp_mask specifies an
+ * atomic allocation!
+ *
+ * Return 0 on success and -errno on error. Possible errors are:
+ *      -ENOMEM         memory exhaustion
+ *      -EDEADLK        could not safely acquire required locks
+ *      -ESTALE         @locked_page was truncated by a racing process
+ */
+int find_or_create_pages(struct address_space *mapping, pgoff_t start,
+                const unsigned int nr_pages, struct page **pages,
+                const unsigned int gfp_mask, struct page *locked_page,
+                const struct writeback_control *wbc)
+{
+        struct page *cached_page = NULL;
+        struct inode *inode = mapping->host;
+        pgoff_t lp_idx = locked_page->index;
+        int err, nr;
+
+        if (lp_idx != start) {
+                if (wbc && wbc->for_reclaim) {
+                        if (!down_read_trylock(&inode->i_sb->s_umount))
+                                return -EDEADLK;
+                        if (down_trylock(&inode->i_sem)) {
+                                up_read(&inode->i_sb->s_umount);
+                                return -EDEADLK;
+                        }
+                }
+                unlock_page(locked_page);
+        }
+        err = nr = 0;
+        while (nr < nr_pages) {
+                if (start == lp_idx) {
+                        pages[nr] = locked_page;
+                        if (nr) {
+                                lock_page(locked_page);
+                                if (wbc) {
+                                        if (wbc->for_reclaim) {
+                                                up(&inode->i_sem);
+                                                up_read(&inode->i_sb->s_umount);
+                                        }
+                                        /* Was the page truncated under us? */
+                                        if (page_mapping(locked_page) !=
+                                                        mapping) {
+                                                err = -ESTALE;
+                                                goto err_out_locked;
+                                        }
+                                }
+                        }
+                } else {
+                        pages[nr] = find_lock_page(mapping, start);
+                        if (!pages[nr]) {
+                                if (!cached_page) {
+                                        cached_page = alloc_page(gfp_mask);
+                                        if (!unlikely(cached_page))
+                                                goto err_out;
+                                }
+                                err = add_to_page_cache_lru(cached_page,
+                                                mapping, start, gfp_mask);
+                                if (unlikely(err)) {
+                                        if (err == -EEXIST)
+                                                continue;
+                                        goto err_out;
+                                }
+                                pages[nr] = cached_page;
+                                cached_page = NULL;
+                        }
+                }
+                nr++;
+                start++;
+        }
+out:
+        if (unlikely(cached_page))
+                page_cache_release(cached_page);
+        return err;
+err_out:
+        err = -ENOMEM;
+        /* Relock the original page if it was unlocked and not yet relocked. */
+        if (!nr || pages[nr - 1]->index < lp_idx)
+                lock_page(locked_page);
+err_out_locked:
+        while (nr > 0) {
+                if (pages[--nr] != locked_page) {
+                        unlock_page(pages[nr]);
+                        page_cache_release(pages[nr]);
+                }
+        }
+        goto out;
+}
+
+EXPORT_SYMBOL(find_or_create_pages);
+
+/**
  * find_get_pages - gang pagecache lookup
  * @mapping: The address_space to search
  * @start: The starting page index
--- ntfs-2.6-devel/include/linux/pagemap.h.old 2005-02-02 13:49:14.742362485 +0000
+++ ntfs-2.6-devel/include/linux/pagemap.h 2005-02-02 13:51:28.214387025 +0000
@@ -70,6 +70,10 @@
 				unsigned long index);
 extern struct page * find_or_create_page(struct address_space *mapping,
 				unsigned long index, unsigned int gfp_mask);
+extern int find_or_create_pages(struct address_space *mapping, pgoff_t start,
+		const unsigned int nr_pages, struct page **pages,
+		const unsigned int gfp_mask, struct page *locked_page,
+		const struct writeback_control *wbc);
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,


2005-02-02 15:43:55

by Matthew Wilcox

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

On Wed, Feb 02, 2005 at 03:12:50PM +0000, Anton Altaparmakov wrote:

I think the below loop would be clearer as a for loop ...

	err = 0;
	for (nr = 0; nr < nr_pages; nr++, start++) {
		if (start == lp_idx) {
			pages[nr] = locked_page;
			if (!nr)
				continue;
			lock_page(locked_page);
			if (!wbc)
				continue;
			if (wbc->for_reclaim) {
				up(&inode->i_sem);
				up_read(&inode->i_sb->s_umount);
			}
			/* Was the page truncated under us? */
			if (page_mapping(locked_page) != mapping) {
				err = -ESTALE;
				goto err_out_locked;
			}
		} else {
			pages[nr] = find_lock_page(mapping, start);
			if (pages[nr])
				continue;
			if (!cached_page) {
				cached_page = alloc_page(gfp_mask);
				if (unlikely(!cached_page))
					goto err_out;
			}
			err = add_to_page_cache_lru(cached_page,
					mapping, start, gfp_mask);
			if (unlikely(err)) {
				if (err == -EEXIST)
					continue;
				goto err_out;
			}
			pages[nr] = cached_page;
			cached_page = NULL;
		}
	}

The above fixes two bugs in the below:
- if (!unlikely(cached_page)) should be if (unlikely(!cached_page))
- The -EEXIST case after add_to_page_cache_lru() would result in
an infinite loop in the original as nr wasn't being incremented.

> +	err = nr = 0;
> +	while (nr < nr_pages) {
> +		if (start == lp_idx) {
> +			pages[nr] = locked_page;
> +			if (nr) {
> +				lock_page(locked_page);
> +				if (wbc) {
> +					if (wbc->for_reclaim) {
> +						up(&inode->i_sem);
> +						up_read(&inode->i_sb->s_umount);
> +					}
> +					/* Was the page truncated under us? */
> +					if (page_mapping(locked_page) !=
> +							mapping) {
> +						err = -ESTALE;
> +						goto err_out_locked;
> +					}
> +				}
> +			}
> +		} else {
> +			pages[nr] = find_lock_page(mapping, start);
> +			if (!pages[nr]) {
> +				if (!cached_page) {
> +					cached_page = alloc_page(gfp_mask);
> +					if (!unlikely(cached_page))
> +						goto err_out;
> +				}
> +				err = add_to_page_cache_lru(cached_page,
> +						mapping, start, gfp_mask);
> +				if (unlikely(err)) {
> +					if (err == -EEXIST)
> +						continue;
> +					goto err_out;
> +				}
> +				pages[nr] = cached_page;
> +				cached_page = NULL;
> +			}
> +		}
> +		nr++;
> +		start++;
> +	}

--
"Next the statesmen will invent cheap lies, putting the blame upon
the nation that is attacked, and every man will be glad of those
conscience-soothing falsities, and will diligently study them, and refuse
to examine any refutations of them; and thus he will by and by convince
himself that the war is just, and will thank God for the better sleep
he enjoys after this process of grotesque self-deception." -- Mark Twain

2005-02-02 16:00:15

by Anton Altaparmakov

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

Hi Matthew,

On Wed, 2005-02-02 at 15:43 +0000, Matthew Wilcox wrote:
> On Wed, Feb 02, 2005 at 03:12:50PM +0000, Anton Altaparmakov wrote:
>
> I think the below loop would be clearer as a for loop ...
>
> 	err = 0;
> 	for (nr = 0; nr < nr_pages; nr++, start++) {
> 		if (start == lp_idx) {
> 			pages[nr] = locked_page;
> 			if (!nr)
> 				continue;
> 			lock_page(locked_page);
> 			if (!wbc)
> 				continue;
> 			if (wbc->for_reclaim) {
> 				up(&inode->i_sem);
> 				up_read(&inode->i_sb->s_umount);
> 			}
> 			/* Was the page truncated under us? */
> 			if (page_mapping(locked_page) != mapping) {
> 				err = -ESTALE;
> 				goto err_out_locked;
> 			}
> 		} else {
> 			pages[nr] = find_lock_page(mapping, start);
> 			if (pages[nr])
> 				continue;
> 			if (!cached_page) {
> 				cached_page = alloc_page(gfp_mask);
> 				if (unlikely(!cached_page))
> 					goto err_out;
> 			}
> 			err = add_to_page_cache_lru(cached_page,
> 					mapping, start, gfp_mask);
> 			if (unlikely(err)) {
> 				if (err == -EEXIST)
> 					continue;
> 				goto err_out;
> 			}
> 			pages[nr] = cached_page;
> 			cached_page = NULL;
> 		}
> 	}
>
> The above fixes two bugs in the below:
> - if (!unlikely(cached_page)) should be if (unlikely(!cached_page))

Ah, oops. Thanks! Well spotted! I did say it was only compile-tested...
(-;

> - The -EEXIST case after add_to_page_cache_lru() would result in
> an infinite loop in the original as nr wasn't being incremented.

That was exactly what was meant to happen. It is not a bug. It is a
feature. This is why it is a while loop instead of a for loop. I need
to have @nr and @start incremented only if the code reaches the end of
the loop.

The -EEXIST case needs to repeat for the same @nr and @start. It
basically means that someone else allocated the page with index @start
and added it to the page cache in between us running find_lock_page()
and add_to_page_cache_lru(). So what we want to do is to run
find_lock_page() again which should then find and lock the page that the
other process created.

Of course what could happen is that between us getting the -EEXIST and
us repeating the find_lock_page() the page is freed again so the
find_lock_page() fails again. Perhaps this time we will succeed with
add_to_page_cache_lru() and if not we repeat again. Eventually either
find_lock_page() or add_to_page_cache_lru() will succeed, so in practice
it will never be an endless loop.

If the while loop is changed to a for loop, the "continue;" on -EEXIST
would need to be changed to "goto repeat;" and a label "repeat:" would
need to be placed at the beginning of the loop. I considered this but
decided the while loop looks nicer. (-:
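For concreteness, the for loop variant would have to end up looking
something like this (just as compile-tested-only as the original):

	err = 0;
	for (nr = 0; nr < nr_pages; nr++, start++) {
repeat:
		if (start == lp_idx) {
			pages[nr] = locked_page;
			if (!nr)
				continue;
			lock_page(locked_page);
			if (!wbc)
				continue;
			if (wbc->for_reclaim) {
				up(&inode->i_sem);
				up_read(&inode->i_sb->s_umount);
			}
			/* Was the page truncated under us? */
			if (page_mapping(locked_page) != mapping) {
				err = -ESTALE;
				goto err_out_locked;
			}
		} else {
			pages[nr] = find_lock_page(mapping, start);
			if (pages[nr])
				continue;
			if (!cached_page) {
				cached_page = alloc_page(gfp_mask);
				if (unlikely(!cached_page))
					goto err_out;
			}
			err = add_to_page_cache_lru(cached_page,
					mapping, start, gfp_mask);
			if (unlikely(err)) {
				if (err == -EEXIST)
					/* Retry the same page index. */
					goto repeat;
				goto err_out;
			}
			pages[nr] = cached_page;
			cached_page = NULL;
		}
	}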

Thanks for the review!

> > +	err = nr = 0;
> > +	while (nr < nr_pages) {
> > +		if (start == lp_idx) {
> > +			pages[nr] = locked_page;
> > +			if (nr) {
> > +				lock_page(locked_page);
> > +				if (wbc) {
> > +					if (wbc->for_reclaim) {
> > +						up(&inode->i_sem);
> > +						up_read(&inode->i_sb->s_umount);
> > +					}
> > +					/* Was the page truncated under us? */
> > +					if (page_mapping(locked_page) !=
> > +							mapping) {
> > +						err = -ESTALE;
> > +						goto err_out_locked;
> > +					}
> > +				}
> > +			}
> > +		} else {
> > +			pages[nr] = find_lock_page(mapping, start);
> > +			if (!pages[nr]) {
> > +				if (!cached_page) {
> > +					cached_page = alloc_page(gfp_mask);
> > +					if (!unlikely(cached_page))
> > +						goto err_out;
> > +				}
> > +				err = add_to_page_cache_lru(cached_page,
> > +						mapping, start, gfp_mask);
> > +				if (unlikely(err)) {
> > +					if (err == -EEXIST)
> > +						continue;
> > +					goto err_out;
> > +				}
> > +				pages[nr] = cached_page;
> > +				cached_page = NULL;
> > +			}
> > +		}
> > +		nr++;
> > +		start++;
> > +	}

Best regards,

Anton
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2005-02-02 22:39:00

by Andrew Morton

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

Anton Altaparmakov <[email protected]> wrote:
>
> Below is a patch which adds a function,
> mm/filemap.c::find_or_create_pages(), that locks a range of page cache
> pages. Please see the function description in the patch for details.

This isn't very nice, is it, really? Kind of a square peg in a round hole.

If you took the approach of defining a custom file_operations.write() then
I'd imagine that the write() side of things would fall out fairly neatly:
no need for s_umount and i_sem needs to be taken anyway. No trylocking.

And for the vmscan->writepage() side of things I wonder if it would be
possible to overload the mapping's ->nopage handler. If the target page
lies in a hole, go off and allocate all the necessary pagecache pages, zero
them, mark them dirty?
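Something along these lines, perhaps (completely untested; the ntfs_*
helpers are invented for illustration):

static struct page *ntfs_file_nopage(struct vm_area_struct *area,
		unsigned long address, int *type)
{
	/*
	 * If the faulting address lies in a hole, instantiate and zero
	 * all of the pagecache pages needed to fill out the hole and
	 * mark them dirty, then fall through to the common fault code.
	 */
	if (ntfs_address_in_hole(area, address))
		ntfs_instantiate_hole(area, address);
	return filemap_nopage(area, address, type);
}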

2005-02-03 10:39:41

by Anton Altaparmakov

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

On Wed, 2005-02-02 at 14:34 -0800, Andrew Morton wrote:
> Anton Altaparmakov <[email protected]> wrote:
> >
> > Below is a patch which adds a function,
> > mm/filemap.c::find_or_create_pages(), that locks a range of page cache
> > pages. Please see the function description in the patch for details.
>
> This isn't very nice, is it, really? Kind of a square peg in a round hole.

Only followed your advice. (-; But yes, it is not very nice at all.

> If you took the approach of defining a custom file_operations.write() then
> I'd imagine that the write() side of things would fall out fairly neatly:
> no need for s_umount and i_sem needs to be taken anyway. No trylocking.

But the write() side of things doesn't need s_umount or trylocking with
the proposed find_or_create_pages(), either...

Unfortunately it is not possible to do this since removing
->{prepare,commit}_write() from NTFS would mean that we cannot use loop
devices on NTFS any more and this is a really important feature for
several Linux distributions (e.g. TopologiLinux) which install Linux on
a loopback mounted NTFS file which they then use to place an ext3 (or
whatever) fs on and use that as the root fs...

So we definitely need full blown prepare/commit write. (Unless we
modify the loop device driver not to use ->{prepare,commit}_write
first.)

Any ideas how to solve that one?

> And for the vmscan->writepage() side of things I wonder if it would be
> possible to overload the mapping's ->nopage handler. If the target page
> lies in a hole, go off and allocate all the necessary pagecache pages, zero
> them, mark them dirty?

I guess it would be possible but ->nopage is used for the read case and
why would we want to then cause writes/allocations? Example: I create a
sparse file of 2TiB size and put some data in relevant places. Then an
applications mmap()s it and does loads of reads on the mmap()ped file
and perhaps a write here or there. Do we really want that to start
allocating and filling in all read holes? That seems worse than having
a square peg for a round hole that is hidden away in a single function.

There is nothing in the proposed find_or_create_pages() that means it
needs to go into mm/filemap.c. It could easily be a private function in
fs/ntfs/aops.c. I just thought that other filesystems that want to
support writing with large block sizes might find it useful, and having a
shared copy in mm/filemap.c would be better in that case. But if it is too
ugly to go in mm/filemap.c then that is fine, too.

At the moment I cannot see a way to solve my problem without the
proposed find_or_create_pages(). )-:

Best regards,

Anton
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2005-02-03 10:53:48

by Andrew Morton

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

Anton Altaparmakov <[email protected]> wrote:
>
> On Wed, 2005-02-02 at 14:34 -0800, Andrew Morton wrote:
> > Anton Altaparmakov <[email protected]> wrote:
> > >
> > > Below is a patch which adds a function,
> > > mm/filemap.c::find_or_create_pages(), that locks a range of page cache
> > > pages. Please see the function description in the patch for details.
> >
> > This isn't very nice, is it, really? Kind of a square peg in a round hole.
>
> Only followed your advice. (-; But yes, it is not very nice at all.
>
> > If you took the approach of defining a custom file_operations.write() then
> > I'd imagine that the write() side of things would fall out fairly neatly:
> > no need for s_umount and i_sem needs to be taken anyway. No trylocking.
>
> But the write() side of things doesn't need s_umount or trylocking with
> the proposed find_or_create_pages(), either...

i_sem nests outside lock_page, normally. I guess that can be avoided though.

> Unfortunately it is not possible to do this since removing
> ->{prepare,commit}_write() from NTFS would mean that we cannot use loop
> devices on NTFS any more and this is a really important feature for
> several Linux distributions (e.g. TopologiLinux) which install Linux on
> a loopback mounted NTFS file which they then use to place an ext3 (or
> whatever) fs on and use that as the root fs...
>
> So we definitely need full blown prepare/commit write. (Unless we
> modify the loop device driver not to use ->{prepare,commit}_write
> first.)
>
> Any ideas how to solve that one?

I did a patch which switched loop to use the file_operations.read/write
about a year ago. Forget what happened to it. It always seemed the right
thing to do..

> > And for the vmscan->writepage() side of things I wonder if it would be
> > possible to overload the mapping's ->nopage handler. If the target page
> > lies in a hole, go off and allocate all the necessary pagecache pages, zero
> > them, mark them dirty?
>
> I guess it would be possible but ->nopage is used for the read case and
> why would we want to then cause writes/allocations?

yup, we'd need to create a new handler for writes, or pass `write_access'
into ->nopage. I think others (dwdm2?) have seen a need for that.
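e.g. something like (hypothetical, not what is in the tree):

	struct page *(*nopage)(struct vm_area_struct *area,
			unsigned long address, int *type, int write_access);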

> At the moment I cannot see a way to solve my problem without the
> proposed find_or_create_pages(). )-:

Unpleasant, isn't it.

I guess the path of least resistance is to do it within ntfs for now.

2005-02-03 11:29:00

by Anton Altaparmakov

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

On Thu, 2005-02-03 at 02:47 -0800, Andrew Morton wrote:
> Anton Altaparmakov <[email protected]> wrote:
> > On Wed, 2005-02-02 at 14:34 -0800, Andrew Morton wrote:
> > > Anton Altaparmakov <[email protected]> wrote:
> > > >
> > > > Below is a patch which adds a function,
> > > > mm/filemap.c::find_or_create_pages(), that locks a range of page cache
> > > > pages. Please see the function description in the patch for details.
> > >
> > > This isn't very nice, is it, really? Kind of a square peg in a round hole.
> >
> > Only followed your advice. (-; But yes, it is not very nice at all.
> >
> > > If you took the approach of defining a custom file_operations.write() then
> > > I'd imagine that the write() side of things would fall out fairly neatly:
> > > no need for s_umount and i_sem needs to be taken anyway. No trylocking.
> >
> > But the write() side of things doesn't need s_umount or trylocking with
> > the proposed find_or_create_pages(), either...
>
> i_sem nests outside lock_page, normally. I guess that can be avoided though.

I meant that the write() side of things (i.e. ->{prepare,commit}_write)
already has i_sem held on entry.

> > Unfortunately it is not possible to do this since removing
> > ->{prepare,commit}_write() from NTFS would mean that we cannot use loop
> > devices on NTFS any more and this is a really important feature for
> > several Linux distributions (e.g. TopologiLinux) which install Linux on
> > a loopback mounted NTFS file which they then use to place an ext3 (or
> > whatever) fs on and use that as the root fs...
> >
> > So we definitely need full blown prepare/commit write. (Unless we
> > modify the loop device driver not to use ->{prepare,commit}_write
> > first.)
> >
> > Any ideas how to solve that one?
>
> I did a patch which switched loop to use the file_operations.read/write
> about a year ago. Forget what happened to it. It always seemed the right
> thing to do..

Yes, I remember seeing something like that on LKML. I guess it would
enable readahead on the loop devices. Whether this is a good or bad
thing is, I suppose, entirely dependent on the usage scenario.

> > > And for the vmscan->writepage() side of things I wonder if it would be
> > > possible to overload the mapping's ->nopage handler. If the target page
> > > lies in a hole, go off and allocate all the necessary pagecache pages, zero
> > > them, mark them dirty?
> >
> > I guess it would be possible but ->nopage is used for the read case and
> > why would we want to then cause writes/allocations?
>
> yup, we'd need to create a new handler for writes, or pass `write_access'
> into ->nopage. I think others (dwdm2?) have seen a need for that.

That would work as long as all writable mappings are actually written to
everywhere. Otherwise you still get the case where reading the whole
mmap()ped area but writing only a small part of it would still instantiate
all of it on disk. As far as I understand it, there is no way to hook into
the mmap system such that we have a hook whenever a mmap()ped page gets
written to for the first time. (I may well be wrong on that one so please
correct me if that is the case.)

> > At the moment I cannot see a way to solve my problem without the
> > proposed find_or_create_pages(). )-:
>
> Unpleasant, isn't it.
>
> I guess the path of least resistance is to do it within ntfs for now.

Ok, I will do that. ntfs_find_or_create_pages() will be hitting
fs/ntfs/aops.c soon...

As always, thanks a lot for your help!

Best regards,

Anton
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2005-02-03 19:26:48

by Bryan Henderson

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages - nopage alternative

>> > > And for the vmscan->writepage() side of things I wonder if it would be
>> > > possible to overload the mapping's ->nopage handler. If the target page
>> > > lies in a hole, go off and allocate all the necessary pagecache pages, zero
>> > > them, mark them dirty?
>> >
>> > I guess it would be possible but ->nopage is used for the read case and
>> > why would we want to then cause writes/allocations?
>>
>> yup, we'd need to create a new handler for writes, or pass `write_access'
>> into ->nopage. I think others (dwdm2?) have seen a need for that.
>
>That would work as long as all writable mappings are actually written to
>everywhere. Otherwise you still get the case where reading the whole
>mmap()ped area but writing only a small part of it would still instantiate
>all of it on disk. As far as I understand it, there is no way to hook into
>the mmap system such that we have a hook whenever a mmap()ped page gets
>written to for the first time. (I may well be wrong on that one so please
>correct me if that is the case.)

I think the point is that we can't have a "handler for writes," because
the writes are being done by simple CPU Store instructions in a user
program. The handler we're talking about is just for page faults. Other
operating systems approach this by actually _having_ a handler for a CPU
store instruction, in the form of a page protection fault handler -- the
nopage routine adds the page to the user's address space, but write
protects it. The first time the user tries to store into it, the
filesystem driver gets a chance to do what's necessary to support a dirty
cache page -- allocate a block, add additional dirty pages to the cache,
etc. It would be wonderful to have that in Linux. I saw hints of such
code in a Linux kernel once (a "write_protect" address space operation or
something like that); I don't know what happened to it.
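In Linux terms, such a hook would presumably end up looking something like
this (hypothetical; nothing like it is in mainline today):

	/*
	 * Hypothetical: called from the protection fault path on the
	 * first store to a clean, write-protected page of a shared
	 * writable mapping, before the pte is made writable. Returning
	 * nonzero would fail the fault.
	 */
	int (*page_becoming_writable)(struct vm_area_struct *vma,
			struct page *page);

The filesystem would do its block allocation and add/zero/dirty any extra
cache pages in there, then return 0 to let the store proceed.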

Short of that, I don't see any way to avoid sometimes filling in holes due
to reads. It's not a huge problem, though -- it requires someone to do a
shared writable mmap and then read lots of holes and not write to them,
which is a pretty rare situation for a normal file.

I didn't follow how the helper function solves this problem. If it's
something involving adding the required extra pages to the cache at
pageout time, then that's not going to work -- you can't make adding pages
to the cache a prerequisite for cleaning a page -- that would be Deadlock
City.

My large-block filesystem driver does the nopage thing, and does in fact
fill in files unnecessarily in this scenario. :-( The driver for the
same filesystems on AIX does not, though. It has the write protection
thing.

--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems

2005-02-03 19:19:20

by Bryan Henderson

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages - loop device

>I did a patch which switched loop to use the file_operations.read/write
>about a year ago. Forget what happened to it. It always seemed the right
>thing to do..

This is unquestionably the right thing to do (at least compared to what we
have now). The loop device driver has no business assuming that the
underlying filesystem uses the generic routines. I always assumed it was
a simple design error that it did. (Such errors are easy to make because
prepare_write and commit_write are declared as address space operations,
when they're really private to the buffer cache and generic writer).

--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems

2005-02-04 15:48:30

by Anton Altaparmakov

[permalink] [raw]
Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages - nopage alternative

On Thu, 2005-02-03 at 11:23 -0800, Bryan Henderson wrote:
> >> > > And for the vmscan->writepage() side of things I wonder if it would be
> >> > > possible to overload the mapping's ->nopage handler. If the target page
> >> > > lies in a hole, go off and allocate all the necessary pagecache pages, zero
> >> > > them, mark them dirty?
> >> >
> >> > I guess it would be possible but ->nopage is used for the read case and
> >> > why would we want to then cause writes/allocations?
> >>
> >> yup, we'd need to create a new handler for writes, or pass `write_access'
> >> into ->nopage. I think others (dwdm2?) have seen a need for that.
> >
> >That would work as long as all writable mappings are actually written to
> >everywhere. Otherwise you still get the case where reading the whole
> >mmap()ped area but writing only a small part of it would still instantiate
> >all of it on disk. As far as I understand it, there is no way to hook into
> >the mmap system such that we have a hook whenever a mmap()ped page gets
> >written to for the first time. (I may well be wrong on that one so please
> >correct me if that is the case.)
>
> I think the point is that we can't have a "handler for writes," because
> the writes are being done by simple CPU Store instructions in a user
> program. The handler we're talking about is just for page faults. Other

That was my understanding.

> operating systems approach this by actually _having_ a handler for a CPU
> store instruction, in the form of a page protection fault handler -- the
> nopage routine adds the page to the user's address space, but write
> protects it. The first time the user tries to store into it, the
> filesystem driver gets a chance to do what's necessary to support a dirty
> cache page -- allocate a block, add additional dirty pages to the cache,
> etc. It would be wonderful to have that in Linux.

It would. This would certainly solve this problem.

> Short of that, I don't see any way to avoid sometimes filling in holes due
> to reads. It's not a huge problem, though -- it requires someone to do a
> shared writable mmap and then read lots of holes and not write to them,
> which is a pretty rare situation for a normal file.
>
> I didn't follow how the helper function solves this problem. If it's
> something involving adding the required extra pages to the cache at
> pageout time, then that's not going to work -- you can't make adding pages
> to the cache a prerequisite for cleaning a page -- that would be Deadlock
> City.

Ouch. Yes, it would rather, wouldn't it. How very annoying of it. I
guess the ->nopage way is the only sane way to deal with this then. I
suppose it is also possible to do it via writepage: scan for the other
pages making up the hole, lock any that are present in the page cache,
and go direct to disk (using bio or bh) for the rest, whilst holding an
exclusive lock so no new pages can be added to the page cache with stale
data. Or actually we wouldn't even care if stale pages are added, as
they would still be cleared in readpage(). Pages that are found, locked,
and uptodate simply need to be marked dirty and released again, and if
not uptodate they need to be cleared first. This might actually turn out
the easiest for ntfs at least, and it should avoid the deadlocks that
would be caused by trying to add new pages to the page cache. It's maybe
not as clean as a write on a read-only page causing a fault handler to be
triggered, but it should work I think. Comments?
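Roughly, I am thinking of something like this (hypothetical sketch;
ntfs_zero_page() is an invented helper and all error handling is omitted):

	pgoff_t idx;

	/*
	 * In ntfs_writepage(): @page lies in a hole spanning the page
	 * indices [start, end). An exclusive lock is held so the hole
	 * cannot be instantiated under us.
	 */
	for (idx = start; idx < end; idx++) {
		struct page *p;

		if (idx == page->index)
			continue;
		p = find_lock_page(mapping, idx);
		if (!p)
			continue;	/* Not cached; zeroed via bio below. */
		if (!PageUptodate(p))
			ntfs_zero_page(p);	/* Clear it first. */
		set_page_dirty(p);
		unlock_page(p);
		page_cache_release(p);
	}
	/* Now write @page, and zeroes for all pages not in the cache,
	 * direct to disk via bio whilst still holding the lock. */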

> My large-block filesystem driver does the nopage thing, and does in fact
> fill in files unnecessarily in this scenario. :-( The driver for the
> same filesystems on AIX does not, though. It has the write protection
> thing.

Is your driver's source available to look at?

Best regards,

Anton
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2005-02-04 17:18:58

by Hugh Dickins

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages - nopage alternative

On Fri, 4 Feb 2005, Anton Altaparmakov wrote:
> On Thu, 2005-02-03 at 11:23 -0800, Bryan Henderson wrote:
> >
> > I think the point is that we can't have a "handler for writes," because
> > the writes are being done by simple CPU Store instructions in a user
> > program. The handler we're talking about is just for page faults. Other
>
> That was my understanding.
>
> > operating systems approach this by actually _having_ a handler for a CPU
> > store instruction, in the form of a page protection fault handler -- the
> > nopage routine adds the page to the user's address space, but write
> > protects it. The first time the user tries to store into it, the
> > filesystem driver gets a chance to do what's necessary to support a dirty
> > cache page -- allocate a block, add additional dirty pages to the cache,
> > etc. It would be wonderful to have that in Linux.
>
> It would. This would certainly solve this problem.

Isn't this exactly what David Howells' page_mkwrite stuff in -mm's
add-page-becoming-writable-notification.patch is designed for?

Though it looks a little broken to me as it stands (beyond the two
fixup patches already there). I've not found time to double-check
or test, apologies in advance if I'm libelling, but...

(a) I thought the prot bits do_nopage gives a pte in a shared writable
mapping include write permission, even when it's a read fault:
that can't be allowed if there's a page_mkwrite.

(b) I don't understand how do_wp_page's "reuse" logic for whether it
can just go ahead and use the existing anonymous page, would have
any relevance to calling page_mkwrite on a shared writable page,
which must be used and not COWed however many references there are.

Didn't look further, think you should take a look at page_mkwrite,
but don't expect the implementation to be correct yet.
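For reference, a filesystem would wire the new hook up roughly like this,
assuming it keeps the (vma, page) signature it has in -mm (the myfs_*
names are made up):

	static struct vm_operations_struct myfs_file_vm_ops = {
		.nopage		= filemap_nopage,
		.populate	= filemap_populate,
		.page_mkwrite	= myfs_page_mkwrite,
	};

	/* int myfs_page_mkwrite(struct vm_area_struct *vma,
	 *			 struct page *page); */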

Hugh

2005-02-06 19:42:40

by Anton Altaparmakov

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

On Thu, 3 Feb 2005, Andrew Morton wrote:
> I did a patch which switched loop to use the file_operations.read/write
> about a year ago. Forget what happened to it. It always seemed the right
> thing to do..

How did you implement the write? At the moment the loop driver gets hold
of both source and destination pages (the latter via grab_cache_page() and
aops->prepare_write()) and copies/transforms directly from the source to
the destination page (and then calls commit_write() on the destination
page). Did you allocate a buffer for each request, copy/transform to the
buffer, and then submit the buffer via file_operations->write? That would
clearly not be very efficient, but given that fops->write() is not atomic I
don't see how that could be optimised further...

Perhaps the loop driver should work as is when
aops->{prepare,commit}_write() are not NULL and should fall back to
a buffered fops->write() otherwise?

Or have I missed some way in which the fops->write() case can be
optimized?

Best regards,

Anton
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

2005-02-06 20:43:03

by Andrew Morton

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

Anton Altaparmakov <[email protected]> wrote:
>
> On Thu, 3 Feb 2005, Andrew Morton wrote:
> > I did a patch which switched loop to use the file_operations.read/write
> > about a year ago. Forget what happened to it. It always seemed the right
> > thing to do..
>
> How did you implement the write?

It was Urban, actually. Memory fails me.

http://marc.theaimsgroup.com/?l=linux-fsdevel&m=102133217310200&w=2

It won't work properly for crypto transformations - iirc we decided it
would need to perform a copy.

2005-02-16 21:58:43

by Anton Altaparmakov

Subject: Re: RFC: [PATCH-2.6] Add helper function to lock multiple page cache pages.

Hi Andrew,

On Sun, 6 Feb 2005, Andrew Morton wrote:
> Anton Altaparmakov <[email protected]> wrote:
> > On Thu, 3 Feb 2005, Andrew Morton wrote:
> > > I did a patch which switched loop to use the file_operations.read/write
> > > about a year ago. Forget what happened to it. It always seemed the right
> > > thing to do..
> >
> > How did you implement the write?
>
> It was Urban, actually. Memory fails me.
>
> http://marc.theaimsgroup.com/?l=linux-fsdevel&m=102133217310200&w=2
>
> It won't work properly for crypto transformations - iirc we decided it
> would need to perform a copy.

That is the conclusion I came to as well. I whipped up a patch today that
implements fallback to file_operations->write in the case that
aops->{prepare,commit}_write are not present on the backing filesystem.

The fallback happens in two different ways:

- For normal loop devices, i.e. ones which do not do transformations on the
data but simply pass it along, we just call fops->write. This should be
pretty much just as fast as using aops->{prepare,commit}_write directly.

- For all other loop devices (e.g. xor and cryptoloop), i.e. all the ones
which may be doing transformations on the data, we allocate and map a page
(once for each bio), then for each bio vec we copy the bio vec page data
to our mapped page, apply the loop transformation, and use fops->write to
write out the transformed data from our page. Once all bio vecs from the
bio are done, we unmap and free the page.

This approach has the absolute minimum of overhead I could come up with,
and for performance-hungry people, as you can see, I have left the address
space operations method in place for filesystems which implement
aops->{prepare,commit}_write.

I have tested this patch with normal loop devices using
aops->{prepare,commit}_write on the backing filesystem, with normal loop
devices using the fops->write code path, and with cryptoloop devices using
the double buffering + fops->write code path.

It all seemed to work fine for me.

Andrew, please consider this patch for your -mm tree and please apply it
to mainline when/as you see fit. Thanks!

Best regards,

Anton
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/

--- loop-write.diff ---

loop: Implement fallback to file operations->write in the loop driver for
backing filesystems which do not implement address space
operations->{prepare,commit}_write.

Signed-off-by: Anton Altaparmakov <[email protected]>

drivers/block/loop.c | 159 ++++++++++++++++++++++++++++++++++++++++++---------
include/linux/loop.h | 5 +
2 files changed, 136 insertions(+), 28 deletions(-)

--- linus-2.6/include/linux/loop.h 2005-02-16 11:04:41.439418700 +0000
+++ linux-2.6/include/linux/loop.h 2005-02-16 11:08:07.535781915 +0000
@@ -71,7 +71,10 @@ struct loop_device {
 /*
  * Loop flags
  */
-#define LO_FLAGS_READ_ONLY	1
+enum {
+	LO_FLAGS_READ_ONLY	= 1,
+	LO_FLAGS_USE_AOPS	= 2,
+};
 
 #include <asm/posix_types.h>	/* for __kernel_old_dev_t */
 #include <asm/types.h>		/* for __u64 */
--- linus-2.6/drivers/block/loop.c	2005-02-16 10:44:02.994427343 +0000
+++ linux-2.6/drivers/block/loop.c	2005-02-16 21:08:25.235894884 +0000
@@ -39,6 +39,11 @@
  * Support up to 256 loop devices
  * Heinz Mauelshagen <[email protected]>, Feb 2002
  *
+ * Support for falling back on the write file operation when the address space
+ * operations prepare_write and/or commit_write are not available on the
+ * backing filesystem.
+ * Anton Altaparmakov, 16 Feb 2005
+ *
  * Still To Fix:
  * - Advisory locking is ignored here.
  * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
@@ -67,6 +72,8 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>		/* for invalidate_bdev() */
 #include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
 
 #include <asm/uaccess.h>
 
@@ -127,7 +134,7 @@ static int transfer_xor(struct loop_devi
 
 static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
 {
-	if (info->lo_encrypt_key_size <= 0)
+	if (unlikely(info->lo_encrypt_key_size <= 0))
 		return -EINVAL;
 	return 0;
 }
@@ -173,7 +180,7 @@ figure_loop_size(struct loop_device *lo)
 	loff_t size = get_loop_size(lo, lo->lo_backing_file);
 	sector_t x = (sector_t)size;
 
-	if ((loff_t)x != size)
+	if (unlikely((loff_t)x != size))
 		return -EFBIG;
 
 	set_capacity(disks[lo->lo_number], x);
@@ -186,23 +193,27 @@ lo_do_transfer(struct loop_device *lo, i
 	       struct page *lpage, unsigned loffs,
 	       int size, sector_t rblock)
 {
-	if (!lo->transfer)
+	if (unlikely(!lo->transfer))
 		return 0;
 
 	return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
 }
 
-static int
-do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos)
+/**
+ * do_lo_send_aops - helper for writing data to a loop device
+ *
+ * This is the fast version for backing filesystems which implement the address
+ * space operations prepare_write and commit_write.
+ */
+static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
+		int bsize, loff_t pos, struct page *page)
 {
 	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
 	struct address_space *mapping = file->f_mapping;
 	struct address_space_operations *aops = mapping->a_ops;
-	struct page *page;
 	pgoff_t index;
-	unsigned size, offset, bv_offs;
-	int len;
-	int ret = 0;
+	unsigned offset, bv_offs;
+	int len, ret = 0;
 
 	down(&mapping->host->i_sem);
 	index = pos >> PAGE_CACHE_SHIFT;
@@ -211,23 +222,22 @@ do_lo_send(struct loop_device *lo, struc
 	len = bvec->bv_len;
 	while (len > 0) {
 		sector_t IV;
+		unsigned size;
 		int transfer_result;
 
 		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
-
 		size = PAGE_CACHE_SIZE - offset;
 		if (size > len)
 			size = len;
-
 		page = grab_cache_page(mapping, index);
-		if (!page)
+		if (unlikely(!page))
 			goto fail;
-		if (aops->prepare_write(file, page, offset, offset+size))
+		if (unlikely(aops->prepare_write(file, page, offset,
+				offset + size)))
 			goto unlock;
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
-						 bvec->bv_page, bv_offs,
-						 size, IV);
-		if (transfer_result) {
+				bvec->bv_page, bv_offs, size, IV);
+		if (unlikely(transfer_result)) {
 			char *kaddr;
 
 			/*
@@ -241,9 +251,10 @@ do_lo_send(struct loop_device *lo, struc
 			kunmap_atomic(kaddr, KM_USER0);
 		}
 		flush_dcache_page(page);
-		if (aops->commit_write(file, page, offset, offset+size))
+		if (unlikely(aops->commit_write(file, page, offset,
+				offset + size)))
 			goto unlock;
-		if (transfer_result)
+		if (unlikely(transfer_result))
 			goto unlock;
 		bv_offs += size;
 		len -= size;
@@ -253,32 +264,125 @@ do_lo_send(struct loop_device *lo, struc
 		unlock_page(page);
 		page_cache_release(page);
 	}
-	up(&mapping->host->i_sem);
 out:
+	up(&mapping->host->i_sem);
 	return ret;
-
 unlock:
 	unlock_page(page);
 	page_cache_release(page);
 fail:
-	up(&mapping->host->i_sem);
 	ret = -1;
 	goto out;
 }
 
-static int
-lo_send(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
+/**
+ * __do_lo_send_write - helper for writing data to a loop device
+ *
+ * This helper just factors out common code between do_lo_send_direct_write()
+ * and do_lo_send_write().
+ */
+static inline int __do_lo_send_write(struct file *file,
+		u8 __user *buf, const int len, loff_t pos)
 {
+	ssize_t bw;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(get_ds());
+	bw = file->f_op->write(file, buf, len, &pos);
+	set_fs(old_fs);
+	if (likely(bw == len))
+		return 0;
+	printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
+			(unsigned long long)pos, len);
+	if (bw >= 0)
+		bw = -EIO;
+	return bw;
+}
+
+/**
+ * do_lo_send_direct_write - helper for writing data to a loop device
+ *
+ * This is the fast, non-transforming version for backing filesystems which do
+ * not implement the address space operations prepare_write and commit_write.
+ * It uses the write file operation which should be present on all writeable
+ * filesystems.
+ */
+static int do_lo_send_direct_write(struct loop_device *lo,
+		struct bio_vec *bvec, int bsize, loff_t pos, struct page *page)
+{
+	ssize_t bw = __do_lo_send_write(lo->lo_backing_file,
+			(u8 __user *)kmap(bvec->bv_page) + bvec->bv_offset,
+			bvec->bv_len, pos);
+	kunmap(bvec->bv_page);
+	cond_resched();
+	return bw;
+}
+
+/**
+ * do_lo_send_write - helper for writing data to a loop device
+ *
+ * This is the slow, transforming version for filesystems which do not
+ * implement the address space operations prepare_write and commit_write. It
+ * uses the write file operation which should be present on all writeable
+ * filesystems.
+ *
+ * Using fops->write is slower than using aops->{prepare,commit}_write in the
+ * transforming case because we need to double buffer the data as we cannot do
+ * the transformations in place as we do not have direct access to the
+ * destination pages of the backing file.
+ */
+static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
+		int bsize, loff_t pos, struct page *page)
+{
+	int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page,
+			bvec->bv_offset, bvec->bv_len, pos >> 9);
+	if (likely(!ret))
+		return __do_lo_send_write(lo->lo_backing_file,
+				(u8 __user *)page_address(page), bvec->bv_len,
+				pos);
+	printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
+			"length %i.\n", (unsigned long long)pos, bvec->bv_len);
+	if (ret > 0)
+		ret = -EIO;
+	return ret;
+}
+
+static int lo_send(struct loop_device *lo, struct bio *bio, int bsize,
+		loff_t pos)
+{
+	int (*do_lo_send)(struct loop_device *, struct bio_vec *, int, loff_t,
+			struct page *page);
 	struct bio_vec *bvec;
+	struct page *page = NULL;
 	int i, ret = 0;
 
+	do_lo_send = do_lo_send_aops;
+	if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
+		do_lo_send = do_lo_send_direct_write;
+		if (lo->transfer != transfer_none) {
+			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+			if (unlikely(!page))
+				goto fail;
+			kmap(page);
+			do_lo_send = do_lo_send_write;
+		}
+	}
 	bio_for_each_segment(bvec, bio, i) {
-		ret = do_lo_send(lo, bvec, bsize, pos);
+		ret = do_lo_send(lo, bvec, bsize, pos, page);
 		if (ret < 0)
 			break;
		pos += bvec->bv_len;
 	}
+	if (page) {
+		kunmap(page);
+		__free_page(page);
+	}
+out:
 	return ret;
+fail:
+	printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
+	ret = -ENOMEM;
+	goto out;
 }
 
 struct lo_read_data {
@@ -584,7 +688,7 @@ static int loop_change_fd(struct loop_de
 
 	/* the loop device has to be read-only */
 	error = -EINVAL;
-	if (lo->lo_flags != LO_FLAGS_READ_ONLY)
+	if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
 		goto out;
 
 	error = -EBADF;
@@ -683,8 +787,9 @@ static int loop_set_fd(struct loop_devic
 	 */
 	if (!file->f_op->sendfile)
 		goto out_putf;
-
-	if (!aops->prepare_write || !aops->commit_write)
+	if (aops->prepare_write && aops->commit_write)
+		lo_flags |= LO_FLAGS_USE_AOPS;
+	if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
 		lo_flags |= LO_FLAGS_READ_ONLY;
 
 	lo_blocksize = inode->i_blksize;