The patch goes through the various places that were calling
lru_cache_add() against bulk pages and batches those LRU additions up.
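In outline, each converted call site collects the newly-added pages in
an on-stack pagevec and drains it in batches, rather than calling
lru_cache_add() once per page. A minimal sketch of the pattern, using
the pagevec calls exactly as they appear in the patch below (the
enclosing function and its arguments are illustrative only, and the
actual I/O submission is omitted):

#include <linux/pagemap.h>
#include <linux/pagevec.h>

static void add_pages_batched(struct address_space *mapping,
				struct list_head *pages, unsigned nr_pages)
{
	struct pagevec lru_pvec;
	unsigned i;

	pagevec_init(&lru_pvec);
	for (i = 0; i < nr_pages; i++) {
		struct page *page = list_entry(pages->prev, struct page, list);

		list_del(&page->list);
		if (!add_to_page_cache(page, mapping, page->index)) {
			/* batch the LRU insertion instead of calling lru_cache_add() */
			if (!pagevec_add(&lru_pvec, page))
				__pagevec_lru_add(&lru_pvec);	/* pagevec full: drain it */
		} else {
			page_cache_release(page);	/* already in pagecache: drop our ref */
		}
	}
	pagevec_lru_add(&lru_pvec);	/* drain whatever is left over */
}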
Also. This whole patch series improves the behaviour of the system
under heavy writeback load. There is a reduction in page allocation
failures, some reduction in loss of interactivity due to page
allocators getting stuck on writeback from the VM. (This is still bad
though).
I think it's due to the change here in mpage_writepages(). That
function was originally refiling written-back pages unconditionally to
the head of the inactive list, the theory being that they should be
moved out of the way of page allocators, who would otherwise end up
waiting on them.
It appears that this simply had the effect of pushing dirty, unwritten
data closer to the tail of the inactive list, making things worse.
So instead, if the caller is (typically) balance_dirty_pages(), the
pages are left where they are on the LRU.
If the caller is PF_MEMALLOC then the pages *have* to be refiled. This
is because VM writeback is clustered along mapping->dirty_pages, and
it's almost certain that the pages being written are near the tail of
the LRU. If they were left there, page allocators would block on them
too soon, and the write would effectively become synchronous.
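For clarity, the test which the mpage_writepages() hunk below adds is
equivalent to something like the helper here; the helper itself is
illustrative only, the patch simply open-codes the condition:

#include <linux/sched.h>
#include <linux/mm.h>

static inline int should_refile_written_page(struct page *page)
{
	/*
	 * Only VM-initiated (PF_MEMALLOC) writeback moves the page off the
	 * tail of the inactive list; balance_dirty_pages() leaves it alone.
	 */
	return (current->flags & PF_MEMALLOC) &&
			!PageActive(page) && PageLRU(page);
}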
fs/mpage.c | 14 ++++++++++---
include/linux/pagemap.h | 2 +
mm/filemap.c | 50 +++++++++++++++++++++++++++++++++++-------------
mm/readahead.c | 13 ++++++++++--
mm/shmem.c | 2 -
mm/swap_state.c | 6 +++--
6 files changed, 66 insertions(+), 21 deletions(-)
--- 2.5.31/fs/mpage.c~batched-lru-add Sun Aug 11 00:20:33 2002
+++ 2.5.31-akpm/fs/mpage.c Sun Aug 11 00:20:57 2002
@@ -263,18 +263,25 @@ mpage_readpages(struct address_space *ma
struct bio *bio = NULL;
unsigned page_idx;
sector_t last_block_in_bio = 0;
+ struct pagevec lru_pvec;
+ pagevec_init(&lru_pvec);
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_entry(pages->prev, struct page, list);
prefetchw(&page->flags);
list_del(&page->list);
- if (!add_to_page_cache(page, mapping, page->index))
+ if (!add_to_page_cache(page, mapping, page->index)) {
bio = do_mpage_readpage(bio, page,
nr_pages - page_idx,
&last_block_in_bio, get_block);
- page_cache_release(page);
+ if (!pagevec_add(&lru_pvec, page))
+ __pagevec_lru_add(&lru_pvec);
+ } else {
+ page_cache_release(page);
+ }
}
+ pagevec_lru_add(&lru_pvec);
BUG_ON(!list_empty(pages));
if (bio)
mpage_bio_submit(READ, bio);
@@ -566,7 +573,8 @@ mpage_writepages(struct address_space *m
bio = mpage_writepage(bio, page, get_block,
&last_block_in_bio, &ret);
}
- if (!PageActive(page) && PageLRU(page)) {
+ if ((current->flags & PF_MEMALLOC) &&
+ !PageActive(page) && PageLRU(page)) {
if (!pagevec_add(&pvec, page))
pagevec_deactivate_inactive(&pvec);
page = NULL;
--- 2.5.31/mm/filemap.c~batched-lru-add Sun Aug 11 00:20:33 2002
+++ 2.5.31-akpm/mm/filemap.c Sun Aug 11 00:21:02 2002
@@ -21,6 +21,7 @@
#include <linux/iobuf.h>
#include <linux/hash.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/security.h>
/*
* This is needed for the following functions:
@@ -530,27 +531,37 @@ int filemap_fdatawait(struct address_spa
* In the case of swapcache, try_to_swap_out() has already locked the page, so
* SetPageLocked() is ugly-but-OK there too. The required page state has been
* set up by swap_out_add_to_swap_cache().
+ *
+ * This function does not add the page to the LRU. The caller must do that.
*/
int add_to_page_cache(struct page *page,
- struct address_space *mapping, unsigned long offset)
+ struct address_space *mapping, pgoff_t offset)
{
int error;
+ page_cache_get(page);
write_lock(&mapping->page_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (!error) {
SetPageLocked(page);
ClearPageDirty(page);
___add_to_page_cache(page, mapping, offset);
- page_cache_get(page);
+ } else {
+ page_cache_release(page);
}
write_unlock(&mapping->page_lock);
- /* Anon pages are already on the LRU */
- if (!error && !PageSwapCache(page))
- lru_cache_add(page);
return error;
}
+int add_to_page_cache_lru(struct page *page,
+ struct address_space *mapping, pgoff_t offset)
+{
+ int ret = add_to_page_cache(page, mapping, offset);
+ if (ret == 0)
+ lru_cache_add(page);
+ return ret;
+}
+
/*
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
@@ -566,7 +577,7 @@ static int page_cache_read(struct file *
if (!page)
return -ENOMEM;
- error = add_to_page_cache(page, mapping, offset);
+ error = add_to_page_cache_lru(page, mapping, offset);
if (!error) {
error = mapping->a_ops->readpage(file, page);
page_cache_release(page);
@@ -797,7 +808,7 @@ repeat:
if (!cached_page)
return NULL;
}
- err = add_to_page_cache(cached_page, mapping, index);
+ err = add_to_page_cache_lru(cached_page, mapping, index);
if (!err) {
page = cached_page;
cached_page = NULL;
@@ -830,7 +841,7 @@ grab_cache_page_nowait(struct address_sp
return NULL;
}
page = alloc_pages(mapping->gfp_mask & ~__GFP_FS, 0);
- if (page && add_to_page_cache(page, mapping, index)) {
+ if (page && add_to_page_cache_lru(page, mapping, index)) {
page_cache_release(page);
page = NULL;
}
@@ -994,7 +1005,7 @@ no_cached_page:
break;
}
}
- error = add_to_page_cache(cached_page, mapping, index);
+ error = add_to_page_cache_lru(cached_page, mapping, index);
if (error) {
if (error == -EEXIST)
goto find_page;
@@ -1704,7 +1715,7 @@ repeat:
if (!cached_page)
return ERR_PTR(-ENOMEM);
}
- err = add_to_page_cache(cached_page, mapping, index);
+ err = add_to_page_cache_lru(cached_page, mapping, index);
if (err == -EEXIST)
goto repeat;
if (err < 0) {
@@ -1764,8 +1775,14 @@ retry:
return page;
}
-static inline struct page * __grab_cache_page(struct address_space *mapping,
- unsigned long index, struct page **cached_page)
+/*
+ * If the page was newly created, increment its refcount and add it to the
+ * caller's lru-buffering pagevec. This function is specifically for
+ * generic_file_write().
+ */
+static inline struct page *
+__grab_cache_page(struct address_space *mapping, unsigned long index,
+ struct page **cached_page, struct pagevec *lru_pvec)
{
int err;
struct page *page;
@@ -1782,6 +1799,9 @@ repeat:
goto repeat;
if (err == 0) {
page = *cached_page;
+ page_cache_get(page);
+ if (!pagevec_add(lru_pvec, page))
+ __pagevec_lru_add(lru_pvec);
*cached_page = NULL;
}
}
@@ -1829,6 +1849,7 @@ generic_file_write(struct file *file, co
int err;
unsigned bytes;
time_t time_now;
+ struct pagevec lru_pvec;
if (unlikely((ssize_t) count < 0))
return -EINVAL;
@@ -1836,6 +1857,8 @@ generic_file_write(struct file *file, co
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
+ pagevec_init(&lru_pvec);
+
down(&inode->i_sem);
pos = *ppos;
if (unlikely(pos < 0)) {
@@ -1976,7 +1999,7 @@ generic_file_write(struct file *file, co
__get_user(dummy, buf+bytes-1);
}
- page = __grab_cache_page(mapping, index, &cached_page);
+ page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
if (!page) {
status = -ENOMEM;
break;
@@ -2038,6 +2061,7 @@ generic_file_write(struct file *file, co
out_status:
err = written ? written : status;
out:
+ pagevec_lru_add(&lru_pvec);
up(&inode->i_sem);
return err;
}
--- 2.5.31/mm/readahead.c~batched-lru-add Sun Aug 11 00:20:33 2002
+++ 2.5.31-akpm/mm/readahead.c Sun Aug 11 00:20:33 2002
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
struct backing_dev_info default_backing_dev_info = {
.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
@@ -36,6 +37,9 @@ read_pages(struct file *file, struct add
struct list_head *pages, unsigned nr_pages)
{
unsigned page_idx;
+ struct pagevec lru_pvec;
+
+ pagevec_init(&lru_pvec);
if (mapping->a_ops->readpages)
return mapping->a_ops->readpages(mapping, pages, nr_pages);
@@ -43,10 +47,15 @@ read_pages(struct file *file, struct add
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_entry(pages->prev, struct page, list);
list_del(&page->list);
- if (!add_to_page_cache(page, mapping, page->index))
+ if (!add_to_page_cache(page, mapping, page->index)) {
+ if (!pagevec_add(&lru_pvec, page))
+ __pagevec_lru_add(&lru_pvec);
mapping->a_ops->readpage(file, page);
- page_cache_release(page);
+ } else {
+ page_cache_release(page);
+ }
}
+ pagevec_lru_add(&lru_pvec);
return 0;
}
--- 2.5.31/mm/swap_state.c~batched-lru-add Sun Aug 11 00:20:33 2002
+++ 2.5.31-akpm/mm/swap_state.c Sun Aug 11 00:21:01 2002
@@ -72,6 +72,9 @@ int add_to_swap_cache(struct page *page,
return -ENOENT;
}
error = add_to_page_cache(page, &swapper_space, entry.val);
+ /*
+ * Anon pages are already on the LRU, we don't run lru_cache_add here.
+ */
if (error != 0) {
swap_free(entry);
if (error == -EEXIST)
@@ -276,8 +279,7 @@ int move_from_swap_cache(struct page *pa
SetPageDirty(page);
___add_to_page_cache(page, mapping, index);
/* fix that up */
- list_del(&page->list);
- list_add(&page->list, &mapping->dirty_pages);
+ list_move(&page->list, &mapping->dirty_pages);
write_unlock(&mapping->page_lock);
write_unlock(&swapper_space.page_lock);
--- 2.5.31/mm/shmem.c~batched-lru-add Sun Aug 11 00:20:33 2002
+++ 2.5.31-akpm/mm/shmem.c Sun Aug 11 00:20:33 2002
@@ -668,7 +668,7 @@ repeat:
page = page_cache_alloc(mapping);
if (!page)
goto no_mem;
- error = add_to_page_cache(page, mapping, idx);
+ error = add_to_page_cache_lru(page, mapping, idx);
if (error < 0) {
page_cache_release(page);
goto no_mem;
--- 2.5.31/include/linux/pagemap.h~batched-lru-add Sun Aug 11 00:20:33 2002
+++ 2.5.31-akpm/include/linux/pagemap.h Sun Aug 11 00:21:02 2002
@@ -58,6 +58,8 @@ extern struct page * read_cache_page(str
extern int add_to_page_cache(struct page *page,
struct address_space *mapping, unsigned long index);
+extern int add_to_page_cache_lru(struct page *page,
+ struct address_space *mapping, unsigned long index);
extern void remove_from_page_cache(struct page *page);
extern void __remove_from_page_cache(struct page *page);
On Sun, 11 Aug 2002, Andrew Morton wrote:
> Also. This whole patch series improves the behaviour of the system
> under heavy writeback load. There is a reduction in page allocation
> failures, some reduction in loss of interactivity due to page
> allocators getting stuck on writeback from the VM. (This is still bad
> though).
> It appears that this simply had the effect of pushing dirty, unwritten
> data closer to the tail of the inactive list, making things worse.
This nicely points out the weak points in having the VM block
on individual pages ... the fact that there are so damn many
of those pages.
Every time the VM is shifting workloads we can run into the
problem of having a significant part (or even the whole) of
the inactive list full of dirty pages. With the inactive list
being 1/3rd of RAM, you could easily run into 200 MB of
dirty pages.
If you insist on deferring the "waiting on IO" for these
pages too much, you'll end up blocking in __get_request_wait
instead, until you've scheduled all 200 MB of dirty pages
for IO.
By the time you've gotten out of __get_request_wait and
scheduled the last page for IO, you can be pretty sure the
first 120 MB of dirty pages have already been written to disk
and would have been reclaimable for many seconds.
I really need to look at fixing this thing right ...
kind regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
Rik van Riel wrote:
>
> On Sun, 11 Aug 2002, Andrew Morton wrote:
>
> > Also. This whole patch series improves the behaviour of the system
> > under heavy writeback load. There is a reduction in page allocation
> > failures, some reduction in loss of interactivity due to page
> > allocators getting stuck on writeback from the VM. (This is still bad
> > though).
>
> > It appears that this simply had the effect of pushing dirty, unwritten
> > data closer to the tail of the inactive list, making things worse.
>
> This nicely points out the weak points in having the VM block
> on individual pages ... the fact that there are so damn many
> of those pages.
>
> Every time the VM is shifting workloads we can run into the
> problem of having a significant part (or even the whole) of
> the inactive list full of dirty pages and with the inactive
> list being 1/3rd of RAM you could easily run into 200 MB of
> dirty pages.
Well let's have a statement of principles:
- Light writeback should occur via pdflush.
- Heavy writeback should be performed by the caller of write().
- If dirty pages reach the tail of the LRU, we've goofed. Because
the wrong process is being penalised.
I suspect the problem is a form of priority inversion. The
heavy write() caller has filled the request queue and is nicely
asleep. But some dirty page has reached the tail of the LRU
and some poor memory allocator tries to write it out, and gets
to go to sleep for ages because the "bad" task has gummed up
the request queue.
And it only takes one dirty block! Any LRU page which is dirty
against a blocked queue is like a hand grenade floating
down a stream [1]. If some innocent task tries to write that
page it gets DoSed via the request queue.
I did some work to try to fix this. See
http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.29/throttling-fairness.patch
The idea here is to split up the request queue in a new way:
when the heavy writer is writing back data (via balance_dirty_pages),
he goes to sleep whenever fewer than 50% of the requests are
available. But all other writers can use 100% of the requests.
This is designed to prevent the priority inversion: the caller
of shrink_cache can perform a non-blocking write without getting
penalised by the write() caller's consumption of all requests.
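A rough sketch of that policy, with made-up names and structure (this
is not the actual patch, nor the 2.5 block layer's request allocation
code):

/* Illustrative only: this struct and helper do not exist in the kernel. */
struct request_pool {
	int nr_requests;	/* total size of the queue's request pool */
	int nr_free;		/* requests currently unallocated */
};

/* "heavy_writer": the caller is being throttled in balance_dirty_pages() */
static int may_take_request(struct request_pool *pool, int heavy_writer)
{
	if (heavy_writer)
		return pool->nr_free > pool->nr_requests / 2;	/* sleeps below 50% free */
	return pool->nr_free > 0;	/* everyone else may use 100% of the requests */
}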
The patch helps, and at the end of the day, it may be the right
thing to do. But first I want to find out exactly why all this
dirty data is reaching the tail of the LRU, and see if there's
some way to make the correct process write it out.
[1] OK, hand grenades don't float. I was thinking of something
else, but this is a family mailing list.
On Sun, 11 Aug 2002, Andrew Morton wrote:
> And it only takes one dirty block! Any LRU page which is dirty
> against a blocked queue is like a hand grenade floating
> down a stream [1]. If some innocent task tries to write that
> page it gets DoSed via the request queue.
This is exactly why we shouldn't wait on dirty pages in
the pageout path.
Of course we need to wait and we should stall before
the system gets overloaded so we don't "run into a wall",
but waiting on any random _single_ dirty page just doesn't
make sense.
Then again, that will probably still not fix the problem
that we're not keeping the disk busy, so we won't get full
writeout speed and will end up stalling...
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
Rik van Riel wrote:
>
> On Sun, 11 Aug 2002, Andrew Morton wrote:
>
> > And it only takes one dirty block! Any LRU page which is dirty
> > against a blocked queue is like a hand grenade floating
> > down a stream [1]. If some innocent task tries to write that
> > page it gets DoSed via the request queue.
>
> This is exactly why we shouldn't wait on dirty pages in
> the pageout path.
It's not the wait-on-writeback which is the problem. It's
the writeout. Perhaps that's what you meant.