2011-06-09 22:31:22

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 0/7] tmpfs: simplify by splice instead of readpage

Here's a second patchset for mmotm, based on 3.0-rc2 plus the 14
in mm tmpfs and trunc changes, which were preparation for this.

Add shmem_file_splice_read(), remove shmem_readpage(), and
simplify: before advancing to the interesting radix_tree
conversion in the final patchset, to follow in a few days.

1/7 tmpfs: clone shmem_file_splice_read
2/7 tmpfs: refine shmem_file_splice_read
3/7 tmpfs: pass gfp to shmem_getpage_gfp
4/7 tmpfs: remove shmem_readpage
5/7 tmpfs: simplify prealloc_page
6/7 tmpfs: simplify filepage/swappage
7/7 tmpfs: simplify unuse and writepage

fs/splice.c | 4
include/linux/splice.h | 2
mm/shmem.c | 549 ++++++++++++++++++++-------------------
3 files changed, 299 insertions(+), 256 deletions(-)

Hugh


2011-06-09 22:32:37

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 1/7] tmpfs: clone shmem_file_splice_read

Copy __generic_file_splice_read() and generic_file_splice_read()
from fs/splice.c to shmem_file_splice_read() in mm/shmem.c. Make
page_cache_pipe_buf_ops and spd_release_page() accessible to it.

Signed-off-by: Hugh Dickins <[email protected]>
Cc: Jens Axboe <[email protected]>
---
fs/splice.c | 4
include/linux/splice.h | 2
mm/shmem.c | 218 ++++++++++++++++++++++++++++++++++++++-
3 files changed, 221 insertions(+), 3 deletions(-)

--- linux.orig/fs/splice.c 2011-06-09 11:37:57.548770334 -0700
+++ linux/fs/splice.c 2011-06-09 11:38:05.228808404 -0700
@@ -132,7 +132,7 @@ error:
return err;
}

-static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
+const struct pipe_buf_operations page_cache_pipe_buf_ops = {
.can_merge = 0,
.map = generic_pipe_buf_map,
.unmap = generic_pipe_buf_unmap,
@@ -264,7 +264,7 @@ ssize_t splice_to_pipe(struct pipe_inode
return ret;
}

-static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
+void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
page_cache_release(spd->pages[i]);
}
--- linux.orig/include/linux/splice.h 2011-06-09 11:37:57.548770334 -0700
+++ linux/include/linux/splice.h 2011-06-09 11:38:05.228808404 -0700
@@ -88,5 +88,7 @@ extern ssize_t splice_direct_to_actor(st
extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *);
extern void splice_shrink_spd(struct pipe_inode_info *,
struct splice_pipe_desc *);
+extern void spd_release_page(struct splice_pipe_desc *, unsigned int);

+extern const struct pipe_buf_operations page_cache_pipe_buf_ops;
#endif
--- linux.orig/mm/shmem.c 2011-06-09 11:37:57.552770354 -0700
+++ linux/mm/shmem.c 2011-06-09 11:38:05.232808448 -0700
@@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt;
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
+#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
@@ -1844,6 +1845,221 @@ static ssize_t shmem_file_aio_read(struc
return retval;
}

+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct address_space *mapping = in->f_mapping;
+ unsigned int loff, nr_pages, req_pages;
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
+ struct page *page;
+ pgoff_t index, end_index;
+ loff_t isize, left;
+ int error, page_nr;
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .flags = flags,
+ .ops = &page_cache_pipe_buf_ops,
+ .spd_release = spd_release_page,
+ };
+
+ isize = i_size_read(in->f_mapping->host);
+ if (unlikely(*ppos >= isize))
+ return 0;
+
+ left = isize - *ppos;
+ if (unlikely(left < len))
+ len = left;
+
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ loff = *ppos & ~PAGE_CACHE_MASK;
+ req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ nr_pages = min(req_pages, pipe->buffers);
+
+ /*
+ * Lookup the (hopefully) full range of pages we need.
+ */
+ spd.nr_pages = find_get_pages_contig(mapping, index,
+ nr_pages, spd.pages);
+ index += spd.nr_pages;
+
+ /*
+ * If find_get_pages_contig() returned fewer pages than we needed,
+ * readahead/allocate the rest and fill in the holes.
+ */
+ if (spd.nr_pages < nr_pages)
+ page_cache_sync_readahead(mapping, &in->f_ra, in,
+ index, req_pages - spd.nr_pages);
+
+ error = 0;
+ while (spd.nr_pages < nr_pages) {
+ /*
+ * Page could be there, find_get_pages_contig() breaks on
+ * the first hole.
+ */
+ page = find_get_page(mapping, index);
+ if (!page) {
+ /*
+ * page didn't exist, allocate one.
+ */
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ break;
+
+ error = add_to_page_cache_lru(page, mapping, index,
+ GFP_KERNEL);
+ if (unlikely(error)) {
+ page_cache_release(page);
+ if (error == -EEXIST)
+ continue;
+ break;
+ }
+ /*
+ * add_to_page_cache() locks the page, unlock it
+ * to avoid convoluting the logic below even more.
+ */
+ unlock_page(page);
+ }
+
+ spd.pages[spd.nr_pages++] = page;
+ index++;
+ }
+
+ /*
+ * Now loop over the map and see if we need to start IO on any
+ * pages, fill in the partial map, etc.
+ */
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ nr_pages = spd.nr_pages;
+ spd.nr_pages = 0;
+ for (page_nr = 0; page_nr < nr_pages; page_nr++) {
+ unsigned int this_len;
+
+ if (!len)
+ break;
+
+ /*
+ * this_len is the max we'll use from this page
+ */
+ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+ page = spd.pages[page_nr];
+
+ if (PageReadahead(page))
+ page_cache_async_readahead(mapping, &in->f_ra, in,
+ page, index, req_pages - page_nr);
+
+ /*
+ * If the page isn't uptodate, we may need to start io on it
+ */
+ if (!PageUptodate(page)) {
+ lock_page(page);
+
+ /*
+ * Page was truncated, or invalidated by the
+ * filesystem. Redo the find/create, but this time the
+ * page is kept locked, so there's no chance of another
+ * race with truncate/invalidate.
+ */
+ if (!page->mapping) {
+ unlock_page(page);
+ page = find_or_create_page(mapping, index,
+ mapping_gfp_mask(mapping));
+
+ if (!page) {
+ error = -ENOMEM;
+ break;
+ }
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
+ }
+ /*
+ * page was already under io and is now done, great
+ */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ goto fill_it;
+ }
+
+ /*
+ * need to read in the page
+ */
+ error = mapping->a_ops->readpage(in, page);
+ if (unlikely(error)) {
+ /*
+ * We really should re-lookup the page here,
+ * but it complicates things a lot. Instead
+ * lets just do what we already stored, and
+ * we'll get it the next time we are called.
+ */
+ if (error == AOP_TRUNCATED_PAGE)
+ error = 0;
+
+ break;
+ }
+ }
+fill_it:
+ /*
+ * i_size must be checked after PageUptodate.
+ */
+ isize = i_size_read(mapping->host);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index))
+ break;
+
+ /*
+ * if this is the last page, see if we need to shrink
+ * the length and stop
+ */
+ if (end_index == index) {
+ unsigned int plen;
+
+ /*
+ * max good bytes in this page
+ */
+ plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (plen <= loff)
+ break;
+
+ /*
+ * force quit after adding this page
+ */
+ this_len = min(this_len, plen - loff);
+ len = this_len;
+ }
+
+ spd.partial[page_nr].offset = loff;
+ spd.partial[page_nr].len = this_len;
+ len -= this_len;
+ loff = 0;
+ spd.nr_pages++;
+ index++;
+ }
+
+ /*
+ * Release any pages at the end, if we quit early. 'page_nr' is how far
+ * we got, 'nr_pages' is how many pages are in the map.
+ */
+ while (page_nr < nr_pages)
+ page_cache_release(spd.pages[page_nr++]);
+ in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+
+ if (spd.nr_pages)
+ error = splice_to_pipe(pipe, &spd);
+
+ splice_shrink_spd(pipe, &spd);
+
+ if (error > 0) {
+ *ppos += error;
+ file_accessed(in);
+ }
+ return error;
+}
+
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -2699,7 +2915,7 @@ static const struct file_operations shme
.aio_read = shmem_file_aio_read,
.aio_write = generic_file_aio_write,
.fsync = noop_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = shmem_file_splice_read,
.splice_write = generic_file_splice_write,
#endif
};

2011-06-09 22:33:38

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 2/7] tmpfs: refine shmem_file_splice_read

Tidy up shmem_file_splice_read():

Remove readahead: okay, we could implement shmem readahead on swap,
but have never done so before, swap being the slow exceptional path.

Use shmem_getpage() instead of find_or_create_page() plus ->readpage().

Remove several comments: sorry, I found them more distracting than
helpful, and this will not be the reference version of splice_read().

Signed-off-by: Hugh Dickins <[email protected]>
---
mm/shmem.c | 138 +++++++--------------------------------------------
1 file changed, 19 insertions(+), 119 deletions(-)

--- linux.orig/mm/shmem.c 2011-06-09 11:38:05.232808448 -0700
+++ linux/mm/shmem.c 2011-06-09 11:38:13.436849182 -0700
@@ -1850,6 +1850,7 @@ static ssize_t shmem_file_splice_read(st
unsigned int flags)
{
struct address_space *mapping = in->f_mapping;
+ struct inode *inode = mapping->host;
unsigned int loff, nr_pages, req_pages;
struct page *pages[PIPE_DEF_BUFFERS];
struct partial_page partial[PIPE_DEF_BUFFERS];
@@ -1865,7 +1866,7 @@ static ssize_t shmem_file_splice_read(st
.spd_release = spd_release_page,
};

- isize = i_size_read(in->f_mapping->host);
+ isize = i_size_read(inode);
if (unlikely(*ppos >= isize))
return 0;

@@ -1881,153 +1882,57 @@ static ssize_t shmem_file_splice_read(st
req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
nr_pages = min(req_pages, pipe->buffers);

- /*
- * Lookup the (hopefully) full range of pages we need.
- */
spd.nr_pages = find_get_pages_contig(mapping, index,
nr_pages, spd.pages);
index += spd.nr_pages;
-
- /*
- * If find_get_pages_contig() returned fewer pages than we needed,
- * readahead/allocate the rest and fill in the holes.
- */
- if (spd.nr_pages < nr_pages)
- page_cache_sync_readahead(mapping, &in->f_ra, in,
- index, req_pages - spd.nr_pages);
-
error = 0;
- while (spd.nr_pages < nr_pages) {
- /*
- * Page could be there, find_get_pages_contig() breaks on
- * the first hole.
- */
- page = find_get_page(mapping, index);
- if (!page) {
- /*
- * page didn't exist, allocate one.
- */
- page = page_cache_alloc_cold(mapping);
- if (!page)
- break;
-
- error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL);
- if (unlikely(error)) {
- page_cache_release(page);
- if (error == -EEXIST)
- continue;
- break;
- }
- /*
- * add_to_page_cache() locks the page, unlock it
- * to avoid convoluting the logic below even more.
- */
- unlock_page(page);
- }

+ while (spd.nr_pages < nr_pages) {
+ page = NULL;
+ error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+ if (error)
+ break;
+ unlock_page(page);
spd.pages[spd.nr_pages++] = page;
index++;
}

- /*
- * Now loop over the map and see if we need to start IO on any
- * pages, fill in the partial map, etc.
- */
index = *ppos >> PAGE_CACHE_SHIFT;
nr_pages = spd.nr_pages;
spd.nr_pages = 0;
+
for (page_nr = 0; page_nr < nr_pages; page_nr++) {
unsigned int this_len;

if (!len)
break;

- /*
- * this_len is the max we'll use from this page
- */
this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
page = spd.pages[page_nr];

- if (PageReadahead(page))
- page_cache_async_readahead(mapping, &in->f_ra, in,
- page, index, req_pages - page_nr);
-
- /*
- * If the page isn't uptodate, we may need to start io on it
- */
- if (!PageUptodate(page)) {
- lock_page(page);
-
- /*
- * Page was truncated, or invalidated by the
- * filesystem. Redo the find/create, but this time the
- * page is kept locked, so there's no chance of another
- * race with truncate/invalidate.
- */
- if (!page->mapping) {
- unlock_page(page);
- page = find_or_create_page(mapping, index,
- mapping_gfp_mask(mapping));
-
- if (!page) {
- error = -ENOMEM;
- break;
- }
- page_cache_release(spd.pages[page_nr]);
- spd.pages[page_nr] = page;
- }
- /*
- * page was already under io and is now done, great
- */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto fill_it;
- }
-
- /*
- * need to read in the page
- */
- error = mapping->a_ops->readpage(in, page);
- if (unlikely(error)) {
- /*
- * We really should re-lookup the page here,
- * but it complicates things a lot. Instead
- * lets just do what we already stored, and
- * we'll get it the next time we are called.
- */
- if (error == AOP_TRUNCATED_PAGE)
- error = 0;
-
+ if (!PageUptodate(page) || page->mapping != mapping) {
+ page = NULL;
+ error = shmem_getpage(inode, index, &page,
+ SGP_CACHE, NULL);
+ if (error)
break;
- }
+ unlock_page(page);
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
}
-fill_it:
- /*
- * i_size must be checked after PageUptodate.
- */
- isize = i_size_read(mapping->host);
+
+ isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(!isize || index > end_index))
break;

- /*
- * if this is the last page, see if we need to shrink
- * the length and stop
- */
if (end_index == index) {
unsigned int plen;

- /*
- * max good bytes in this page
- */
plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (plen <= loff)
break;

- /*
- * force quit after adding this page
- */
this_len = min(this_len, plen - loff);
len = this_len;
}
@@ -2040,13 +1945,8 @@ fill_it:
index++;
}

- /*
- * Release any pages at the end, if we quit early. 'page_nr' is how far
- * we got, 'nr_pages' is how many pages are in the map.
- */
while (page_nr < nr_pages)
page_cache_release(spd.pages[page_nr++]);
- in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;

if (spd.nr_pages)
error = splice_to_pipe(pipe, &spd);

2011-06-09 22:34:48

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 3/7] tmpfs: pass gfp to shmem_getpage_gfp

Make shmem_getpage() a wrapper, passing mapping_gfp_mask() down to
shmem_getpage_gfp(), which in turn passes gfp down to shmem_swp_alloc().

Change shmem_read_mapping_page_gfp() to use shmem_getpage_gfp() in the
CONFIG_SHMEM case; but leave tiny !SHMEM using read_cache_page_gfp().

Add a BUG_ON() in case anyone happens to call this on a non-shmem mapping;
though we might later want to let that case route to read_cache_page_gfp().

It annoys me to have these two almost-redundant args, gfp and fault_type:
I can't find a better way; but initialize fault_type only in shmem_fault().

Note that before, read_cache_page_gfp() was allocating i915_gem's pages
with __GFP_NORETRY as intended; but the corresponding swap vector pages
got allocated without it, leaving a small possibility of OOM.

Signed-off-by: Hugh Dickins <[email protected]>
---
mm/shmem.c | 67 +++++++++++++++++++++++++++++++++------------------
1 file changed, 44 insertions(+), 23 deletions(-)

--- linux.orig/mm/shmem.c 2011-06-09 11:38:13.436849182 -0700
+++ linux/mm/shmem.c 2011-06-09 11:38:22.912896122 -0700
@@ -127,8 +127,15 @@ static unsigned long shmem_default_max_i
}
#endif

-static int shmem_getpage(struct inode *inode, unsigned long idx,
- struct page **pagep, enum sgp_type sgp, int *type);
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
+
+static inline int shmem_getpage(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, int *fault_type)
+{
+ return shmem_getpage_gfp(inode, index, pagep, sgp,
+ mapping_gfp_mask(inode->i_mapping), fault_type);
+}

static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
{
@@ -404,10 +411,12 @@ static void shmem_swp_set(struct shmem_i
* @info: info structure for the inode
* @index: index of the page to find
* @sgp: check and recheck i_size? skip allocation?
+ * @gfp: gfp mask to use for any page allocation
*
* If the entry does not exist, allocate it.
*/
-static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
+static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info,
+ unsigned long index, enum sgp_type sgp, gfp_t gfp)
{
struct inode *inode = &info->vfs_inode;
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@ -435,7 +444,7 @@ static swp_entry_t *shmem_swp_alloc(stru
}

spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
+ page = shmem_dir_alloc(gfp);
spin_lock(&info->lock);

if (!page) {
@@ -1225,14 +1234,14 @@ static inline struct mempolicy *shmem_ge
#endif

/*
- * shmem_getpage - either get the page from swap or allocate a new one
+ * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache
*/
-static int shmem_getpage(struct inode *inode, unsigned long idx,
- struct page **pagep, enum sgp_type sgp, int *type)
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1242,15 +1251,11 @@ static int shmem_getpage(struct inode *i
struct page *prealloc_page = NULL;
swp_entry_t *entry;
swp_entry_t swap;
- gfp_t gfp;
int error;

if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;

- if (type)
- *type = 0;
-
/*
* Normally, filepage is NULL on entry, and either found
* uptodate immediately, or allocated and zeroed, or read
@@ -1264,13 +1269,12 @@ repeat:
filepage = find_lock_page(mapping, idx);
if (filepage && PageUptodate(filepage))
goto done;
- gfp = mapping_gfp_mask(mapping);
if (!filepage) {
/*
* Try to preload while we can wait, to not make a habit of
* draining atomic reserves; but don't latch on to this cpu.
*/
- error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
if (error)
goto failed;
radix_tree_preload_end();
@@ -1290,7 +1294,7 @@ repeat:

spin_lock(&info->lock);
shmem_recalc_inode(inode);
- entry = shmem_swp_alloc(info, idx, sgp);
+ entry = shmem_swp_alloc(info, idx, sgp, gfp);
if (IS_ERR(entry)) {
spin_unlock(&info->lock);
error = PTR_ERR(entry);
@@ -1305,12 +1309,12 @@ repeat:
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
/* here we actually do the io */
- if (type)
- *type |= VM_FAULT_MAJOR;
+ if (fault_type)
+ *fault_type |= VM_FAULT_MAJOR;
swappage = shmem_swapin(swap, gfp, info, idx);
if (!swappage) {
spin_lock(&info->lock);
- entry = shmem_swp_alloc(info, idx, sgp);
+ entry = shmem_swp_alloc(info, idx, sgp, gfp);
if (IS_ERR(entry))
error = PTR_ERR(entry);
else {
@@ -1461,7 +1465,7 @@ repeat:
SetPageSwapBacked(filepage);
}

- entry = shmem_swp_alloc(info, idx, sgp);
+ entry = shmem_swp_alloc(info, idx, sgp, gfp);
if (IS_ERR(entry))
error = PTR_ERR(entry);
else {
@@ -1539,7 +1543,7 @@ static int shmem_fault(struct vm_area_st
{
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
int error;
- int ret;
+ int ret = VM_FAULT_LOCKED;

if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
@@ -1547,11 +1551,12 @@ static int shmem_fault(struct vm_area_st
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
+
if (ret & VM_FAULT_MAJOR) {
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
}
- return ret | VM_FAULT_LOCKED;
+ return ret;
}

#ifdef CONFIG_NUMA
@@ -3162,13 +3167,29 @@ int shmem_zero_setup(struct vm_area_stru
* suit tmpfs, since it may have pages in swapcache, and needs to find those
* for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
*
- * Provide a stub for those callers to start using now, then later
- * flesh it out to call shmem_getpage() with additional gfp mask, when
- * shmem_file_splice_read() is added and shmem_readpage() is removed.
+ * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
+ * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
*/
struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp)
{
+#ifdef CONFIG_SHMEM
+ struct inode *inode = mapping->host;
+ struct page *page = NULL;
+ int error;
+
+ BUG_ON(mapping->a_ops != &shmem_aops);
+ error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+ if (error)
+ page = ERR_PTR(error);
+ else
+ unlock_page(page);
+ return page;
+#else
+ /*
+ * The tiny !SHMEM case uses ramfs without swap
+ */
return read_cache_page_gfp(mapping, index, gfp);
+#endif
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);

2011-06-09 22:35:48

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 4/7] tmpfs: remove shmem_readpage

Remove that pernicious shmem_readpage() at last: the things we needed
it for (splice, loop, sendfile, i915 GEM) are now fully taken care of
by shmem_file_splice_read() and shmem_read_mapping_page_gfp().

This removal clears the way for a simpler shmem_getpage_gfp(), since
page is never passed in; but leave most of that cleanup until after.

sys_readahead() and sys_fadvise(POSIX_FADV_WILLNEED) will now EINVAL,
instead of unexpectedly trying to read ahead on tmpfs: if that proves
to be an issue for someone, then we can either arrange for them to
return success instead, or try to implement async readahead on tmpfs.

Signed-off-by: Hugh Dickins <[email protected]>
---
mm/shmem.c | 40 ++++++----------------------------------
1 file changed, 6 insertions(+), 34 deletions(-)

--- linux.orig/mm/shmem.c 2011-06-09 11:38:22.912896122 -0700
+++ linux/mm/shmem.c 2011-06-09 11:39:32.361240481 -0700
@@ -1246,7 +1246,7 @@ static int shmem_getpage_gfp(struct inod
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
- struct page *filepage = *pagep;
+ struct page *filepage;
struct page *swappage;
struct page *prealloc_page = NULL;
swp_entry_t *entry;
@@ -1255,18 +1255,8 @@ static int shmem_getpage_gfp(struct inod

if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
-
- /*
- * Normally, filepage is NULL on entry, and either found
- * uptodate immediately, or allocated and zeroed, or read
- * in under swappage, which is then assigned to filepage.
- * But shmem_readpage (required for splice) passes in a locked
- * filepage, which may be found not uptodate by other callers
- * too, and may need to be copied from the swappage read in.
- */
repeat:
- if (!filepage)
- filepage = find_lock_page(mapping, idx);
+ filepage = find_lock_page(mapping, idx);
if (filepage && PageUptodate(filepage))
goto done;
if (!filepage) {
@@ -1513,8 +1503,7 @@ nospace:
* Perhaps the page was brought in from swap between find_lock_page
* and taking info->lock? We allow for that at add_to_page_cache_lru,
* but must also avoid reporting a spurious ENOSPC while working on a
- * full tmpfs. (When filepage has been passed in to shmem_getpage, it
- * is already in page cache, which prevents this race from occurring.)
+ * full tmpfs.
*/
if (!filepage) {
struct page *page = find_get_page(mapping, idx);
@@ -1527,7 +1516,7 @@ nospace:
spin_unlock(&info->lock);
error = -ENOSPC;
failed:
- if (*pagep != filepage) {
+ if (filepage) {
unlock_page(filepage);
page_cache_release(filepage);
}
@@ -1673,19 +1662,6 @@ static struct inode *shmem_get_inode(str
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_symlink_inline_operations;

-/*
- * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
- * but providing them allows a tmpfs file to be used for splice, sendfile, and
- * below the loop driver, in the generic fashion that many filesystems support.
- */
-static int shmem_readpage(struct file *file, struct page *page)
-{
- struct inode *inode = page->mapping->host;
- int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
- unlock_page(page);
- return error;
-}
-
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1693,7 +1669,6 @@ shmem_write_begin(struct file *file, str
{
struct inode *inode = mapping->host;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- *pagep = NULL;
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}

@@ -1893,7 +1868,6 @@ static ssize_t shmem_file_splice_read(st
error = 0;

while (spd.nr_pages < nr_pages) {
- page = NULL;
error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
if (error)
break;
@@ -1916,7 +1890,6 @@ static ssize_t shmem_file_splice_read(st
page = spd.pages[page_nr];

if (!PageUptodate(page) || page->mapping != mapping) {
- page = NULL;
error = shmem_getpage(inode, index, &page,
SGP_CACHE, NULL);
if (error)
@@ -2125,7 +2098,7 @@ static int shmem_symlink(struct inode *d
int error;
int len;
struct inode *inode;
- struct page *page = NULL;
+ struct page *page;
char *kaddr;
struct shmem_inode_info *info;

@@ -2803,7 +2776,6 @@ static const struct address_space_operat
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
- .readpage = shmem_readpage,
.write_begin = shmem_write_begin,
.write_end = shmem_write_end,
#endif
@@ -3175,7 +3147,7 @@ struct page *shmem_read_mapping_page_gfp
{
#ifdef CONFIG_SHMEM
struct inode *inode = mapping->host;
- struct page *page = NULL;
+ struct page *page;
int error;

BUG_ON(mapping->a_ops != &shmem_aops);

2011-06-09 22:39:17

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 5/7] tmpfs: simplify prealloc_page

The prealloc_page handling in shmem_getpage_gfp() is unnecessarily
complicated: first simplify that before going on to filepage/swappage.

That's right, don't report ENOMEM when the preallocation fails: we may
or may not need the page. But simply report ENOMEM once we find we do
need it, instead of dropping lock, repeating allocation, unwinding on
failure etc. And leave the out label on the fast path, don't goto.

Fix something that looks like a bug but turns out not to be: set
PageSwapBacked on prealloc_page before its mem_cgroup_cache_charge(),
as the removed case was doing. That's important before adding to LRU
(determines which LRU the page goes on), and does affect which path it
takes through memcontrol.c, but in the end MEM_CGROUP_CHANGE_TYPE_
SHMEM is handled no differently from CACHE.

Signed-off-by: Hugh Dickins <[email protected]>
Cc: Shaohua Li <[email protected]>
Cc: "Zhang, Yanmin" <[email protected]>
Cc: Tim Chen <[email protected]>
---
mm/shmem.c | 59 ++++++++++++---------------------------------------
1 file changed, 15 insertions(+), 44 deletions(-)

--- linux.orig/mm/shmem.c 2011-06-09 11:39:32.361240481 -0700
+++ linux/mm/shmem.c 2011-06-09 11:39:42.845292474 -0700
@@ -1269,9 +1269,9 @@ repeat:
goto failed;
radix_tree_preload_end();
if (sgp != SGP_READ && !prealloc_page) {
- /* We don't care if this fails */
prealloc_page = shmem_alloc_page(gfp, info, idx);
if (prealloc_page) {
+ SetPageSwapBacked(prealloc_page);
if (mem_cgroup_cache_charge(prealloc_page,
current->mm, GFP_KERNEL)) {
page_cache_release(prealloc_page);
@@ -1403,7 +1403,8 @@ repeat:
goto repeat;
}
spin_unlock(&info->lock);
- } else {
+
+ } else if (prealloc_page) {
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
@@ -1419,41 +1420,8 @@ repeat:
if (!filepage) {
int ret;

- if (!prealloc_page) {
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(gfp, info, idx);
- if (!filepage) {
- spin_lock(&info->lock);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- spin_unlock(&info->lock);
- error = -ENOMEM;
- goto failed;
- }
- SetPageSwapBacked(filepage);
-
- /*
- * Precharge page while we can wait, compensate
- * after
- */
- error = mem_cgroup_cache_charge(filepage,
- current->mm, GFP_KERNEL);
- if (error) {
- page_cache_release(filepage);
- spin_lock(&info->lock);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- spin_unlock(&info->lock);
- filepage = NULL;
- goto failed;
- }
-
- spin_lock(&info->lock);
- } else {
- filepage = prealloc_page;
- prealloc_page = NULL;
- SetPageSwapBacked(filepage);
- }
+ filepage = prealloc_page;
+ prealloc_page = NULL;

entry = shmem_swp_alloc(info, idx, sgp, gfp);
if (IS_ERR(entry))
@@ -1492,11 +1460,19 @@ repeat:
SetPageUptodate(filepage);
if (sgp == SGP_DIRTY)
set_page_dirty(filepage);
+ } else {
+ error = -ENOMEM;
+ goto out;
}
done:
*pagep = filepage;
error = 0;
- goto out;
+out:
+ if (prealloc_page) {
+ mem_cgroup_uncharge_cache_page(prealloc_page);
+ page_cache_release(prealloc_page);
+ }
+ return error;

nospace:
/*
@@ -1520,12 +1496,7 @@ failed:
unlock_page(filepage);
page_cache_release(filepage);
}
-out:
- if (prealloc_page) {
- mem_cgroup_uncharge_cache_page(prealloc_page);
- page_cache_release(prealloc_page);
- }
- return error;
+ goto out;
}

static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)

2011-06-09 22:40:22

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 6/7] tmpfs: simplify filepage/swappage

We can now simplify shmem_getpage_gfp(): there is no longer a dilemma
of filepage passed in via shmem_readpage(), then swappage found, which
must then be copied over to it.

Although at first it's tempting to replace the **pagep arg by returning
struct page *, that makes a mess of IS_ERR_OR_NULL(page)s in all the
callers, so leave as is.

Insert BUG_ON(!PageUptodate) when we find and lock page: some of the
complication came from uninitialized pages inserted into filecache
prior to readpage; but now we're in control, and only release pagelock
on filecache once it's uptodate (if an error occurs in reading back
from swap, the page remains in swapcache, never moved to filecache).

Signed-off-by: Hugh Dickins <[email protected]>
---
mm/shmem.c | 237 +++++++++++++++++++++++----------------------------
1 file changed, 108 insertions(+), 129 deletions(-)

--- linux.orig/mm/shmem.c 2011-06-09 11:39:42.845292474 -0700
+++ linux/mm/shmem.c 2011-06-09 11:39:50.369329884 -0700
@@ -1246,41 +1246,47 @@ static int shmem_getpage_gfp(struct inod
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
- struct page *filepage;
- struct page *swappage;
+ struct page *page;
struct page *prealloc_page = NULL;
swp_entry_t *entry;
swp_entry_t swap;
int error;
+ int ret;

if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
repeat:
- filepage = find_lock_page(mapping, idx);
- if (filepage && PageUptodate(filepage))
- goto done;
- if (!filepage) {
+ page = find_lock_page(mapping, idx);
+ if (page) {
/*
- * Try to preload while we can wait, to not make a habit of
- * draining atomic reserves; but don't latch on to this cpu.
+ * Once we can get the page lock, it must be uptodate:
+ * if there were an error in reading back from swap,
+ * the page would not be inserted into the filecache.
*/
- error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
- if (error)
- goto failed;
- radix_tree_preload_end();
- if (sgp != SGP_READ && !prealloc_page) {
- prealloc_page = shmem_alloc_page(gfp, info, idx);
- if (prealloc_page) {
- SetPageSwapBacked(prealloc_page);
- if (mem_cgroup_cache_charge(prealloc_page,
- current->mm, GFP_KERNEL)) {
- page_cache_release(prealloc_page);
- prealloc_page = NULL;
- }
+ BUG_ON(!PageUptodate(page));
+ goto done;
+ }
+
+ /*
+ * Try to preload while we can wait, to not make a habit of
+ * draining atomic reserves; but don't latch on to this cpu.
+ */
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ if (error)
+ goto out;
+ radix_tree_preload_end();
+
+ if (sgp != SGP_READ && !prealloc_page) {
+ prealloc_page = shmem_alloc_page(gfp, info, idx);
+ if (prealloc_page) {
+ SetPageSwapBacked(prealloc_page);
+ if (mem_cgroup_cache_charge(prealloc_page,
+ current->mm, GFP_KERNEL)) {
+ page_cache_release(prealloc_page);
+ prealloc_page = NULL;
}
}
}
- error = 0;

spin_lock(&info->lock);
shmem_recalc_inode(inode);
@@ -1288,21 +1294,21 @@ repeat:
if (IS_ERR(entry)) {
spin_unlock(&info->lock);
error = PTR_ERR(entry);
- goto failed;
+ goto out;
}
swap = *entry;

if (swap.val) {
/* Look it up and read it in.. */
- swappage = lookup_swap_cache(swap);
- if (!swappage) {
+ page = lookup_swap_cache(swap);
+ if (!page) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
/* here we actually do the io */
if (fault_type)
*fault_type |= VM_FAULT_MAJOR;
- swappage = shmem_swapin(swap, gfp, info, idx);
- if (!swappage) {
+ page = shmem_swapin(swap, gfp, info, idx);
+ if (!page) {
spin_lock(&info->lock);
entry = shmem_swp_alloc(info, idx, sgp, gfp);
if (IS_ERR(entry))
@@ -1314,62 +1320,42 @@ repeat:
}
spin_unlock(&info->lock);
if (error)
- goto failed;
+ goto out;
goto repeat;
}
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
+ wait_on_page_locked(page);
+ page_cache_release(page);
goto repeat;
}

/* We have to do this with page locked to prevent races */
- if (!trylock_page(swappage)) {
+ if (!trylock_page(page)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
+ wait_on_page_locked(page);
+ page_cache_release(page);
goto repeat;
}
- if (PageWriteback(swappage)) {
+ if (PageWriteback(page)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- wait_on_page_writeback(swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
+ wait_on_page_writeback(page);
+ unlock_page(page);
+ page_cache_release(page);
goto repeat;
}
- if (!PageUptodate(swappage)) {
+ if (!PageUptodate(page)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- unlock_page(swappage);
- page_cache_release(swappage);
+ unlock_page(page);
+ page_cache_release(page);
error = -EIO;
- goto failed;
+ goto out;
}

- if (filepage) {
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(swappage);
- spin_unlock(&info->lock);
- copy_highpage(filepage, swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
- set_page_dirty(filepage);
- swap_free(swap);
- } else if (!(error = add_to_page_cache_locked(swappage, mapping,
- idx, GFP_NOWAIT))) {
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(swappage);
- spin_unlock(&info->lock);
- filepage = swappage;
- set_page_dirty(filepage);
- swap_free(swap);
- } else {
+ error = add_to_page_cache_locked(page, mapping,
+ idx, GFP_NOWAIT);
+ if (error) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
if (error == -ENOMEM) {
@@ -1378,28 +1364,33 @@ repeat:
* call memcg's OOM if needed.
*/
error = mem_cgroup_shmem_charge_fallback(
- swappage,
- current->mm,
- gfp);
+ page, current->mm, gfp);
if (error) {
- unlock_page(swappage);
- page_cache_release(swappage);
- goto failed;
+ unlock_page(page);
+ page_cache_release(page);
+ goto out;
}
}
- unlock_page(swappage);
- page_cache_release(swappage);
+ unlock_page(page);
+ page_cache_release(page);
goto repeat;
}
- } else if (sgp == SGP_READ && !filepage) {
+
+ info->flags |= SHMEM_PAGEIN;
+ shmem_swp_set(info, entry, 0);
+ shmem_swp_unmap(entry);
+ delete_from_swap_cache(page);
+ spin_unlock(&info->lock);
+ set_page_dirty(page);
+ swap_free(swap);
+
+ } else if (sgp == SGP_READ) {
shmem_swp_unmap(entry);
- filepage = find_get_page(mapping, idx);
- if (filepage &&
- (!PageUptodate(filepage) || !trylock_page(filepage))) {
+ page = find_get_page(mapping, idx);
+ if (page && !trylock_page(page)) {
spin_unlock(&info->lock);
- wait_on_page_locked(filepage);
- page_cache_release(filepage);
- filepage = NULL;
+ wait_on_page_locked(page);
+ page_cache_release(page);
goto repeat;
}
spin_unlock(&info->lock);
@@ -1417,55 +1408,51 @@ repeat:
} else if (shmem_acct_block(info->flags))
goto nospace;

- if (!filepage) {
- int ret;
-
- filepage = prealloc_page;
- prealloc_page = NULL;
+ page = prealloc_page;
+ prealloc_page = NULL;

- entry = shmem_swp_alloc(info, idx, sgp, gfp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- swap = *entry;
- shmem_swp_unmap(entry);
- }
- ret = error || swap.val;
- if (ret)
- mem_cgroup_uncharge_cache_page(filepage);
- else
- ret = add_to_page_cache_lru(filepage, mapping,
+ entry = shmem_swp_alloc(info, idx, sgp, gfp);
+ if (IS_ERR(entry))
+ error = PTR_ERR(entry);
+ else {
+ swap = *entry;
+ shmem_swp_unmap(entry);
+ }
+ ret = error || swap.val;
+ if (ret)
+ mem_cgroup_uncharge_cache_page(page);
+ else
+ ret = add_to_page_cache_lru(page, mapping,
idx, GFP_NOWAIT);
- /*
- * At add_to_page_cache_lru() failure, uncharge will
- * be done automatically.
- */
- if (ret) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- spin_unlock(&info->lock);
- page_cache_release(filepage);
- filepage = NULL;
- if (error)
- goto failed;
- goto repeat;
- }
- info->flags |= SHMEM_PAGEIN;
+ /*
+ * At add_to_page_cache_lru() failure,
+ * uncharge will be done automatically.
+ */
+ if (ret) {
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ spin_unlock(&info->lock);
+ page_cache_release(page);
+ if (error)
+ goto out;
+ goto repeat;
}

+ info->flags |= SHMEM_PAGEIN;
info->alloced++;
spin_unlock(&info->lock);
- clear_highpage(filepage);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
+ clear_highpage(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
if (sgp == SGP_DIRTY)
- set_page_dirty(filepage);
+ set_page_dirty(page);
+
} else {
error = -ENOMEM;
goto out;
}
done:
- *pagep = filepage;
+ *pagep = page;
error = 0;
out:
if (prealloc_page) {
@@ -1481,21 +1468,13 @@ nospace:
* but must also avoid reporting a spurious ENOSPC while working on a
* full tmpfs.
*/
- if (!filepage) {
- struct page *page = find_get_page(mapping, idx);
- if (page) {
- spin_unlock(&info->lock);
- page_cache_release(page);
- goto repeat;
- }
- }
+ page = find_get_page(mapping, idx);
spin_unlock(&info->lock);
- error = -ENOSPC;
-failed:
- if (filepage) {
- unlock_page(filepage);
- page_cache_release(filepage);
+ if (page) {
+ page_cache_release(page);
+ goto repeat;
}
+ error = -ENOSPC;
goto out;
}

2011-06-09 22:42:10

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 7/7] tmpfs: simplify unuse and writepage

shmem_unuse_inode() and shmem_writepage() contain a little code to
cope with pages inserted independently into the filecache, probably
by a filesystem stacked on top of tmpfs, then fed to its ->readpage()
or ->writepage().

Unionfs was indeed experimenting with working in that way three years
ago, but I find no current examples: nowadays the stacking filesystems
use vfs interfaces to the lower filesystem.

It's now illegal: remove most of that code, adding some WARN_ON_ONCEs.

Signed-off-by: Hugh Dickins <[email protected]>
Cc: Erez Zadok <[email protected]>
---
mm/shmem.c | 50 ++++++++++++++++----------------------------------
1 file changed, 16 insertions(+), 34 deletions(-)

--- linux.orig/mm/shmem.c 2011-06-09 11:39:50.369329884 -0700
+++ linux/mm/shmem.c 2011-06-09 11:40:02.761391246 -0700
@@ -972,20 +972,7 @@ found:
error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
/* which does mem_cgroup_uncharge_cache_page on error */

- if (error == -EEXIST) {
- struct page *filepage = find_get_page(mapping, idx);
- error = 1;
- if (filepage) {
- /*
- * There might be a more uptodate page coming down
- * from a stacked writepage: forget our swappage if so.
- */
- if (PageUptodate(filepage))
- error = 0;
- page_cache_release(filepage);
- }
- }
- if (!error) {
+ if (error != -ENOMEM) {
delete_from_swap_cache(page);
set_page_dirty(page);
info->flags |= SHMEM_PAGEIN;
@@ -1072,16 +1059,17 @@ static int shmem_writepage(struct page *
/*
* shmem_backing_dev_info's capabilities prevent regular writeback or
* sync from ever calling shmem_writepage; but a stacking filesystem
- * may use the ->writepage of its underlying filesystem, in which case
+ * might use ->writepage of its underlying filesystem, in which case
* tmpfs should write out to swap only in response to memory pressure,
- * and not for the writeback threads or sync. However, in those cases,
- * we do still want to check if there's a redundant swappage to be
- * discarded.
+ * and not for the writeback threads or sync.
*/
- if (wbc->for_reclaim)
- swap = get_swap_page();
- else
- swap.val = 0;
+ if (!wbc->for_reclaim) {
+ WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
+ goto redirty;
+ }
+ swap = get_swap_page();
+ if (!swap.val)
+ goto redirty;

/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
@@ -1092,15 +1080,12 @@ static int shmem_writepage(struct page *
* we've taken the spinlock, because shmem_unuse_inode() will
* prune a !swapped inode from the swaplist under both locks.
*/
- if (swap.val) {
- mutex_lock(&shmem_swaplist_mutex);
- if (list_empty(&info->swaplist))
- list_add_tail(&info->swaplist, &shmem_swaplist);
- }
+ mutex_lock(&shmem_swaplist_mutex);
+ if (list_empty(&info->swaplist))
+ list_add_tail(&info->swaplist, &shmem_swaplist);

spin_lock(&info->lock);
- if (swap.val)
- mutex_unlock(&shmem_swaplist_mutex);
+ mutex_unlock(&shmem_swaplist_mutex);

if (index >= info->next_index) {
BUG_ON(!(info->flags & SHMEM_TRUNCATE));
@@ -1108,16 +1093,13 @@ static int shmem_writepage(struct page *
}
entry = shmem_swp_entry(info, index, NULL);
if (entry->val) {
- /*
- * The more uptodate page coming down from a stacked
- * writepage should replace our old swappage.
- */
+ WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
free_swap_and_cache(*entry);
shmem_swp_set(info, entry, 0);
}
shmem_recalc_inode(inode);

- if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+ if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
delete_from_page_cache(page);
shmem_swp_set(info, entry, swap.val);
shmem_swp_unmap(entry);

2011-06-10 02:02:09

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH 5/7] tmpfs: simplify prealloc_page

On Fri, 2011-06-10 at 06:39 +0800, Hugh Dickins wrote:
> The prealloc_page handling in shmem_getpage_gfp() is unnecessarily
> complicated: first simplify that before going on to filepage/swappage.
>
> That's right, don't report ENOMEM when the preallocation fails: we may
> or may not need the page. But simply report ENOMEM once we find we do
> need it, instead of dropping lock, repeating allocation, unwinding on
> failure etc. And leave the out label on the fast path, don't goto.
>
> Fix something that looks like a bug but turns out not to be: set
> PageSwapBacked on prealloc_page before its mem_cgroup_cache_charge(),
> as the removed case was doing. That's important before adding to LRU
> (determines which LRU the page goes on), and does affect which path it
> takes through memcontrol.c, but in the end MEM_CGROUP_CHANGE_TYPE_
> SHMEM is handled no differently from CACHE.
>
> Signed-off-by: Hugh Dickins <[email protected]>
> Cc: Shaohua Li <[email protected]>
> Cc: "Zhang, Yanmin" <[email protected]>
> Cc: Tim Chen <[email protected]>
> ---
> mm/shmem.c | 59 ++++++++++++---------------------------------------
> 1 file changed, 15 insertions(+), 44 deletions(-)
>
> --- linux.orig/mm/shmem.c 2011-06-09 11:39:32.361240481 -0700
> +++ linux/mm/shmem.c 2011-06-09 11:39:42.845292474 -0700
> @@ -1269,9 +1269,9 @@ repeat:
> goto failed;
> radix_tree_preload_end();
> if (sgp != SGP_READ && !prealloc_page) {
> - /* We don't care if this fails */
> prealloc_page = shmem_alloc_page(gfp, info, idx);
> if (prealloc_page) {
> + SetPageSwapBacked(prealloc_page);
> if (mem_cgroup_cache_charge(prealloc_page,
> current->mm, GFP_KERNEL)) {
> page_cache_release(prealloc_page);
> @@ -1403,7 +1403,8 @@ repeat:
> goto repeat;
> }
> spin_unlock(&info->lock);
> - } else {
> +
> + } else if (prealloc_page) {
> shmem_swp_unmap(entry);
> sbinfo = SHMEM_SB(inode->i_sb);
> if (sbinfo->max_blocks) {
> @@ -1419,41 +1420,8 @@ repeat:
> if (!filepage) {
> int ret;
>
> - if (!prealloc_page) {
> - spin_unlock(&info->lock);
> - filepage = shmem_alloc_page(gfp, info, idx);
> - if (!filepage) {
> - spin_lock(&info->lock);
> - shmem_unacct_blocks(info->flags, 1);
> - shmem_free_blocks(inode, 1);
> - spin_unlock(&info->lock);
> - error = -ENOMEM;
> - goto failed;
> - }
> - SetPageSwapBacked(filepage);
> -
> - /*
> - * Precharge page while we can wait, compensate
> - * after
> - */
> - error = mem_cgroup_cache_charge(filepage,
> - current->mm, GFP_KERNEL);
> - if (error) {
> - page_cache_release(filepage);
> - spin_lock(&info->lock);
> - shmem_unacct_blocks(info->flags, 1);
> - shmem_free_blocks(inode, 1);
> - spin_unlock(&info->lock);
> - filepage = NULL;
> - goto failed;
> - }
> -
> - spin_lock(&info->lock);
> - } else {
> - filepage = prealloc_page;
> - prealloc_page = NULL;
> - SetPageSwapBacked(filepage);
> - }
> + filepage = prealloc_page;
> + prealloc_page = NULL;
>
> entry = shmem_swp_alloc(info, idx, sgp, gfp);
> if (IS_ERR(entry))
> @@ -1492,11 +1460,19 @@ repeat:
> SetPageUptodate(filepage);
> if (sgp == SGP_DIRTY)
> set_page_dirty(filepage);
> + } else {
Looks like the info->lock unlock is missed here.
Otherwise looks good to me.

Thanks,
Shaohua

2011-06-10 06:40:59

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 5/7] tmpfs: simplify prealloc_page

On Fri, 10 Jun 2011, Shaohua Li wrote:
> On Fri, 2011-06-10 at 06:39 +0800, Hugh Dickins wrote:
> > @@ -1492,11 +1460,19 @@ repeat:
> > SetPageUptodate(filepage);
> > if (sgp == SGP_DIRTY)
> > set_page_dirty(filepage);
> > + } else {
> Looks info->lock unlock is missed here.
> Otherwise looks good to me.

Many thanks for catching that! Replacements for this patch,
and the next which then rejects, follow as replies.

Hugh

2011-06-10 06:44:47

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH v2 5/7] tmpfs: simplify prealloc_page

The prealloc_page handling in shmem_getpage_gfp() is unnecessarily
complicated: first simplify that before going on to filepage/swappage.

That's right, don't report ENOMEM when the preallocation fails: we may
or may not need the page. But simply report ENOMEM once we find we do
need it, instead of dropping lock, repeating allocation, unwinding on
failure etc. And leave the out label on the fast path, don't goto.

Fix something that looks like a bug but turns out not to be: set
PageSwapBacked on prealloc_page before its mem_cgroup_cache_charge(),
as the removed case was doing. That's important before adding to LRU
(determines which LRU the page goes on), and does affect which path it
takes through memcontrol.c, but in the end MEM_CGROUP_CHANGE_TYPE_
SHMEM is handled no differently from CACHE.

Signed-off-by: Hugh Dickins <[email protected]>
Acked-by: Shaohua Li <[email protected]>
Cc: "Zhang, Yanmin" <[email protected]>
Cc: Tim Chen <[email protected]>
---
mm/shmem.c | 60 +++++++++++++--------------------------------------
1 file changed, 16 insertions(+), 44 deletions(-)

--- linux.orig/mm/shmem.c 2011-06-09 11:39:32.361240481 -0700
+++ linux/mm/shmem.c 2011-06-09 23:17:08.364060453 -0700
@@ -1269,9 +1269,9 @@ repeat:
goto failed;
radix_tree_preload_end();
if (sgp != SGP_READ && !prealloc_page) {
- /* We don't care if this fails */
prealloc_page = shmem_alloc_page(gfp, info, idx);
if (prealloc_page) {
+ SetPageSwapBacked(prealloc_page);
if (mem_cgroup_cache_charge(prealloc_page,
current->mm, GFP_KERNEL)) {
page_cache_release(prealloc_page);
@@ -1403,7 +1403,8 @@ repeat:
goto repeat;
}
spin_unlock(&info->lock);
- } else {
+
+ } else if (prealloc_page) {
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
@@ -1419,41 +1420,8 @@ repeat:
if (!filepage) {
int ret;

- if (!prealloc_page) {
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(gfp, info, idx);
- if (!filepage) {
- spin_lock(&info->lock);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- spin_unlock(&info->lock);
- error = -ENOMEM;
- goto failed;
- }
- SetPageSwapBacked(filepage);
-
- /*
- * Precharge page while we can wait, compensate
- * after
- */
- error = mem_cgroup_cache_charge(filepage,
- current->mm, GFP_KERNEL);
- if (error) {
- page_cache_release(filepage);
- spin_lock(&info->lock);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- spin_unlock(&info->lock);
- filepage = NULL;
- goto failed;
- }
-
- spin_lock(&info->lock);
- } else {
- filepage = prealloc_page;
- prealloc_page = NULL;
- SetPageSwapBacked(filepage);
- }
+ filepage = prealloc_page;
+ prealloc_page = NULL;

entry = shmem_swp_alloc(info, idx, sgp, gfp);
if (IS_ERR(entry))
@@ -1492,11 +1460,20 @@ repeat:
SetPageUptodate(filepage);
if (sgp == SGP_DIRTY)
set_page_dirty(filepage);
+ } else {
+ spin_unlock(&info->lock);
+ error = -ENOMEM;
+ goto out;
}
done:
*pagep = filepage;
error = 0;
- goto out;
+out:
+ if (prealloc_page) {
+ mem_cgroup_uncharge_cache_page(prealloc_page);
+ page_cache_release(prealloc_page);
+ }
+ return error;

nospace:
/*
@@ -1520,12 +1497,7 @@ failed:
unlock_page(filepage);
page_cache_release(filepage);
}
-out:
- if (prealloc_page) {
- mem_cgroup_uncharge_cache_page(prealloc_page);
- page_cache_release(prealloc_page);
- }
- return error;
+ goto out;
}

static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)

2011-06-10 06:47:03

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH v2 6/7] tmpfs: simplify filepage/swappage

We can now simplify shmem_getpage_gfp(): there is no longer a dilemma
of filepage passed in via shmem_readpage(), then swappage found, which
must then be copied over to it.

Although at first it's tempting to replace the **pagep arg by returning
struct page *, that makes a mess of IS_ERR_OR_NULL(page)s in all the
callers, so leave as is.

Insert BUG_ON(!PageUptodate) when we find and lock page: some of the
complication came from uninitialized pages inserted into filecache
prior to readpage; but now we're in control, and only release pagelock
on filecache once it's uptodate (if an error occurs in reading back
from swap, the page remains in swapcache, never moved to filecache).

Signed-off-by: Hugh Dickins <[email protected]>
---
mm/shmem.c | 237 +++++++++++++++++++++++----------------------------
1 file changed, 108 insertions(+), 129 deletions(-)

--- linux.orig/mm/shmem.c 2011-06-09 23:17:08.000000000 -0700
+++ linux/mm/shmem.c 2011-06-09 23:31:44.056402814 -0700
@@ -1246,41 +1246,47 @@ static int shmem_getpage_gfp(struct inod
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
- struct page *filepage;
- struct page *swappage;
+ struct page *page;
struct page *prealloc_page = NULL;
swp_entry_t *entry;
swp_entry_t swap;
int error;
+ int ret;

if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
repeat:
- filepage = find_lock_page(mapping, idx);
- if (filepage && PageUptodate(filepage))
- goto done;
- if (!filepage) {
+ page = find_lock_page(mapping, idx);
+ if (page) {
/*
- * Try to preload while we can wait, to not make a habit of
- * draining atomic reserves; but don't latch on to this cpu.
+ * Once we can get the page lock, it must be uptodate:
+ * if there were an error in reading back from swap,
+ * the page would not be inserted into the filecache.
*/
- error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
- if (error)
- goto failed;
- radix_tree_preload_end();
- if (sgp != SGP_READ && !prealloc_page) {
- prealloc_page = shmem_alloc_page(gfp, info, idx);
- if (prealloc_page) {
- SetPageSwapBacked(prealloc_page);
- if (mem_cgroup_cache_charge(prealloc_page,
- current->mm, GFP_KERNEL)) {
- page_cache_release(prealloc_page);
- prealloc_page = NULL;
- }
+ BUG_ON(!PageUptodate(page));
+ goto done;
+ }
+
+ /*
+ * Try to preload while we can wait, to not make a habit of
+ * draining atomic reserves; but don't latch on to this cpu.
+ */
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ if (error)
+ goto out;
+ radix_tree_preload_end();
+
+ if (sgp != SGP_READ && !prealloc_page) {
+ prealloc_page = shmem_alloc_page(gfp, info, idx);
+ if (prealloc_page) {
+ SetPageSwapBacked(prealloc_page);
+ if (mem_cgroup_cache_charge(prealloc_page,
+ current->mm, GFP_KERNEL)) {
+ page_cache_release(prealloc_page);
+ prealloc_page = NULL;
}
}
}
- error = 0;

spin_lock(&info->lock);
shmem_recalc_inode(inode);
@@ -1288,21 +1294,21 @@ repeat:
if (IS_ERR(entry)) {
spin_unlock(&info->lock);
error = PTR_ERR(entry);
- goto failed;
+ goto out;
}
swap = *entry;

if (swap.val) {
/* Look it up and read it in.. */
- swappage = lookup_swap_cache(swap);
- if (!swappage) {
+ page = lookup_swap_cache(swap);
+ if (!page) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
/* here we actually do the io */
if (fault_type)
*fault_type |= VM_FAULT_MAJOR;
- swappage = shmem_swapin(swap, gfp, info, idx);
- if (!swappage) {
+ page = shmem_swapin(swap, gfp, info, idx);
+ if (!page) {
spin_lock(&info->lock);
entry = shmem_swp_alloc(info, idx, sgp, gfp);
if (IS_ERR(entry))
@@ -1314,62 +1320,42 @@ repeat:
}
spin_unlock(&info->lock);
if (error)
- goto failed;
+ goto out;
goto repeat;
}
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
+ wait_on_page_locked(page);
+ page_cache_release(page);
goto repeat;
}

/* We have to do this with page locked to prevent races */
- if (!trylock_page(swappage)) {
+ if (!trylock_page(page)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
+ wait_on_page_locked(page);
+ page_cache_release(page);
goto repeat;
}
- if (PageWriteback(swappage)) {
+ if (PageWriteback(page)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- wait_on_page_writeback(swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
+ wait_on_page_writeback(page);
+ unlock_page(page);
+ page_cache_release(page);
goto repeat;
}
- if (!PageUptodate(swappage)) {
+ if (!PageUptodate(page)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- unlock_page(swappage);
- page_cache_release(swappage);
+ unlock_page(page);
+ page_cache_release(page);
error = -EIO;
- goto failed;
+ goto out;
}

- if (filepage) {
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(swappage);
- spin_unlock(&info->lock);
- copy_highpage(filepage, swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
- set_page_dirty(filepage);
- swap_free(swap);
- } else if (!(error = add_to_page_cache_locked(swappage, mapping,
- idx, GFP_NOWAIT))) {
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(swappage);
- spin_unlock(&info->lock);
- filepage = swappage;
- set_page_dirty(filepage);
- swap_free(swap);
- } else {
+ error = add_to_page_cache_locked(page, mapping,
+ idx, GFP_NOWAIT);
+ if (error) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
if (error == -ENOMEM) {
@@ -1378,28 +1364,33 @@ repeat:
* call memcg's OOM if needed.
*/
error = mem_cgroup_shmem_charge_fallback(
- swappage,
- current->mm,
- gfp);
+ page, current->mm, gfp);
if (error) {
- unlock_page(swappage);
- page_cache_release(swappage);
- goto failed;
+ unlock_page(page);
+ page_cache_release(page);
+ goto out;
}
}
- unlock_page(swappage);
- page_cache_release(swappage);
+ unlock_page(page);
+ page_cache_release(page);
goto repeat;
}
- } else if (sgp == SGP_READ && !filepage) {
+
+ info->flags |= SHMEM_PAGEIN;
+ shmem_swp_set(info, entry, 0);
+ shmem_swp_unmap(entry);
+ delete_from_swap_cache(page);
+ spin_unlock(&info->lock);
+ set_page_dirty(page);
+ swap_free(swap);
+
+ } else if (sgp == SGP_READ) {
shmem_swp_unmap(entry);
- filepage = find_get_page(mapping, idx);
- if (filepage &&
- (!PageUptodate(filepage) || !trylock_page(filepage))) {
+ page = find_get_page(mapping, idx);
+ if (page && !trylock_page(page)) {
spin_unlock(&info->lock);
- wait_on_page_locked(filepage);
- page_cache_release(filepage);
- filepage = NULL;
+ wait_on_page_locked(page);
+ page_cache_release(page);
goto repeat;
}
spin_unlock(&info->lock);
@@ -1417,56 +1408,52 @@ repeat:
} else if (shmem_acct_block(info->flags))
goto nospace;

- if (!filepage) {
- int ret;
-
- filepage = prealloc_page;
- prealloc_page = NULL;
+ page = prealloc_page;
+ prealloc_page = NULL;

- entry = shmem_swp_alloc(info, idx, sgp, gfp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- swap = *entry;
- shmem_swp_unmap(entry);
- }
- ret = error || swap.val;
- if (ret)
- mem_cgroup_uncharge_cache_page(filepage);
- else
- ret = add_to_page_cache_lru(filepage, mapping,
+ entry = shmem_swp_alloc(info, idx, sgp, gfp);
+ if (IS_ERR(entry))
+ error = PTR_ERR(entry);
+ else {
+ swap = *entry;
+ shmem_swp_unmap(entry);
+ }
+ ret = error || swap.val;
+ if (ret)
+ mem_cgroup_uncharge_cache_page(page);
+ else
+ ret = add_to_page_cache_lru(page, mapping,
idx, GFP_NOWAIT);
- /*
- * At add_to_page_cache_lru() failure, uncharge will
- * be done automatically.
- */
- if (ret) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- spin_unlock(&info->lock);
- page_cache_release(filepage);
- filepage = NULL;
- if (error)
- goto failed;
- goto repeat;
- }
- info->flags |= SHMEM_PAGEIN;
+ /*
+ * At add_to_page_cache_lru() failure,
+ * uncharge will be done automatically.
+ */
+ if (ret) {
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ spin_unlock(&info->lock);
+ page_cache_release(page);
+ if (error)
+ goto out;
+ goto repeat;
}

+ info->flags |= SHMEM_PAGEIN;
info->alloced++;
spin_unlock(&info->lock);
- clear_highpage(filepage);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
+ clear_highpage(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
if (sgp == SGP_DIRTY)
- set_page_dirty(filepage);
+ set_page_dirty(page);
+
} else {
spin_unlock(&info->lock);
error = -ENOMEM;
goto out;
}
done:
- *pagep = filepage;
+ *pagep = page;
error = 0;
out:
if (prealloc_page) {
@@ -1482,21 +1469,13 @@ nospace:
* but must also avoid reporting a spurious ENOSPC while working on a
* full tmpfs.
*/
- if (!filepage) {
- struct page *page = find_get_page(mapping, idx);
- if (page) {
- spin_unlock(&info->lock);
- page_cache_release(page);
- goto repeat;
- }
- }
+ page = find_get_page(mapping, idx);
spin_unlock(&info->lock);
- error = -ENOSPC;
-failed:
- if (filepage) {
- unlock_page(filepage);
- page_cache_release(filepage);
+ if (page) {
+ page_cache_release(page);
+ goto repeat;
}
+ error = -ENOSPC;
goto out;
}

2011-06-10 09:19:46

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH 1/7] tmpfs: clone shmem_file_splice_read

On 2011-06-10 00:32, Hugh Dickins wrote:
> Copy __generic_file_splice_read() and generic_file_splice_read()
> from fs/splice.c to shmem_file_splice_read() in mm/shmem.c. Make
> page_cache_pipe_buf_ops and spd_release_page() accessible to it.

That's a lot of fairly complicated and convoluted code to have
duplicated. Yes, I know I know, it's already largely duplicated from the
normal file read, but still... Really no easy way to share this?

--
Jens Axboe

2011-06-10 20:01:35

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 1/7] tmpfs: clone shmem_file_splice_read

On Fri, 10 Jun 2011, Jens Axboe wrote:
> On 2011-06-10 00:32, Hugh Dickins wrote:
> > Copy __generic_file_splice_read() and generic_file_splice_read()
> > from fs/splice.c to shmem_file_splice_read() in mm/shmem.c. Make
> > page_cache_pipe_buf_ops and spd_release_page() accessible to it.
>
> That's a lot of fairly complicated and convoluted code to have
> duplicated. Yes, I know I know, it's already largely duplicated from the
> normal file read, but still... Really no easy way to share this?

I hadn't thought in that direction at all, to be honest: I imagined you
had already factored out what could be factored out, splice_to_pipe()
and the spd helpers. I just copied the code over in this patch, then
trimmed it down for tmpfs in the next.

(I didn't Cc you on that second patch because it was local to shmem.c:
maybe you looked it up on the list, maybe you didn't - here it is below.
It would be very unfair to claim that it halves the size, since I took
out some comments; compiled size goes down from 1219 to 897 bytes.)

What tmpfs wants to avoid is outsiders inserting pages into its filecache
then calling ->readpage on them (because tmpfs may already have those
pages in swapcache - though I've another reason to avoid it coming soon).
Since the interesting uses of tmpfs go through its ->splice_read nowadays,
I just switched that over not to use ->readpage.

You ask, no easy way to share this? I guess we could make a little
library-like function of the isize, this_len code at fill_it; but
that doesn't really seem worth much.

I could drop shmem.c's find_get_pages_contig() and combine the other
two loops to make just a single shmem_getpage()ing loop; or even
resort to using default_file_splice_read() with its page allocation
and copying. But crippling the shmem splice just seems a retrograde
step, which might be noticed by sendfile users: I don't think that's
what you intend.

Ever since birth, shmem had to have its own file_read and file_write:
your splice work with Nick's write_begin changes let it use generic
splice for writing, but I do need to avoid readpage in reading.

If I thought I had my finger on the pulse of what modern filesystems
want, I might propose a readpage replacement for all; but, frankly,
the notion that I have my finger on any pulse at all is laughable ;)

Any suggestions?

Hugh

[PATCH 2/7] tmpfs: refine shmem_file_splice_read

Tidy up shmem_file_splice_read():

Remove readahead: okay, we could implement shmem readahead on swap,
but have never done so before, swap being the slow exceptional path.

Use shmem_getpage() instead of find_or_create_page() plus ->readpage().

Remove several comments: sorry, I found them more distracting than
helpful, and this will not be the reference version of splice_read().

Signed-off-by: Hugh Dickins <[email protected]>
---
mm/shmem.c | 138 +++++++--------------------------------------------
1 file changed, 19 insertions(+), 119 deletions(-)

--- linux.orig/mm/shmem.c 2011-06-09 11:38:05.232808448 -0700
+++ linux/mm/shmem.c 2011-06-09 11:38:13.436849182 -0700
@@ -1850,6 +1850,7 @@ static ssize_t shmem_file_splice_read(st
unsigned int flags)
{
struct address_space *mapping = in->f_mapping;
+ struct inode *inode = mapping->host;
unsigned int loff, nr_pages, req_pages;
struct page *pages[PIPE_DEF_BUFFERS];
struct partial_page partial[PIPE_DEF_BUFFERS];
@@ -1865,7 +1866,7 @@ static ssize_t shmem_file_splice_read(st
.spd_release = spd_release_page,
};

- isize = i_size_read(in->f_mapping->host);
+ isize = i_size_read(inode);
if (unlikely(*ppos >= isize))
return 0;

@@ -1881,153 +1882,57 @@ static ssize_t shmem_file_splice_read(st
req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
nr_pages = min(req_pages, pipe->buffers);

- /*
- * Lookup the (hopefully) full range of pages we need.
- */
spd.nr_pages = find_get_pages_contig(mapping, index,
nr_pages, spd.pages);
index += spd.nr_pages;
-
- /*
- * If find_get_pages_contig() returned fewer pages than we needed,
- * readahead/allocate the rest and fill in the holes.
- */
- if (spd.nr_pages < nr_pages)
- page_cache_sync_readahead(mapping, &in->f_ra, in,
- index, req_pages - spd.nr_pages);
-
error = 0;
- while (spd.nr_pages < nr_pages) {
- /*
- * Page could be there, find_get_pages_contig() breaks on
- * the first hole.
- */
- page = find_get_page(mapping, index);
- if (!page) {
- /*
- * page didn't exist, allocate one.
- */
- page = page_cache_alloc_cold(mapping);
- if (!page)
- break;
-
- error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL);
- if (unlikely(error)) {
- page_cache_release(page);
- if (error == -EEXIST)
- continue;
- break;
- }
- /*
- * add_to_page_cache() locks the page, unlock it
- * to avoid convoluting the logic below even more.
- */
- unlock_page(page);
- }

+ while (spd.nr_pages < nr_pages) {
+ page = NULL;
+ error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+ if (error)
+ break;
+ unlock_page(page);
spd.pages[spd.nr_pages++] = page;
index++;
}

- /*
- * Now loop over the map and see if we need to start IO on any
- * pages, fill in the partial map, etc.
- */
index = *ppos >> PAGE_CACHE_SHIFT;
nr_pages = spd.nr_pages;
spd.nr_pages = 0;
+
for (page_nr = 0; page_nr < nr_pages; page_nr++) {
unsigned int this_len;

if (!len)
break;

- /*
- * this_len is the max we'll use from this page
- */
this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
page = spd.pages[page_nr];

- if (PageReadahead(page))
- page_cache_async_readahead(mapping, &in->f_ra, in,
- page, index, req_pages - page_nr);
-
- /*
- * If the page isn't uptodate, we may need to start io on it
- */
- if (!PageUptodate(page)) {
- lock_page(page);
-
- /*
- * Page was truncated, or invalidated by the
- * filesystem. Redo the find/create, but this time the
- * page is kept locked, so there's no chance of another
- * race with truncate/invalidate.
- */
- if (!page->mapping) {
- unlock_page(page);
- page = find_or_create_page(mapping, index,
- mapping_gfp_mask(mapping));
-
- if (!page) {
- error = -ENOMEM;
- break;
- }
- page_cache_release(spd.pages[page_nr]);
- spd.pages[page_nr] = page;
- }
- /*
- * page was already under io and is now done, great
- */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto fill_it;
- }
-
- /*
- * need to read in the page
- */
- error = mapping->a_ops->readpage(in, page);
- if (unlikely(error)) {
- /*
- * We really should re-lookup the page here,
- * but it complicates things a lot. Instead
- * lets just do what we already stored, and
- * we'll get it the next time we are called.
- */
- if (error == AOP_TRUNCATED_PAGE)
- error = 0;
-
+ if (!PageUptodate(page) || page->mapping != mapping) {
+ page = NULL;
+ error = shmem_getpage(inode, index, &page,
+ SGP_CACHE, NULL);
+ if (error)
break;
- }
+ unlock_page(page);
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
}
-fill_it:
- /*
- * i_size must be checked after PageUptodate.
- */
- isize = i_size_read(mapping->host);
+
+ isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(!isize || index > end_index))
break;

- /*
- * if this is the last page, see if we need to shrink
- * the length and stop
- */
if (end_index == index) {
unsigned int plen;

- /*
- * max good bytes in this page
- */
plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (plen <= loff)
break;

- /*
- * force quit after adding this page
- */
this_len = min(this_len, plen - loff);
len = this_len;
}
@@ -2040,13 +1945,8 @@ fill_it:
index++;
}

- /*
- * Release any pages at the end, if we quit early. 'page_nr' is how far
- * we got, 'nr_pages' is how many pages are in the map.
- */
while (page_nr < nr_pages)
page_cache_release(spd.pages[page_nr++]);
- in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;

if (spd.nr_pages)
error = splice_to_pipe(pipe, &spd);