[permalink] [raw]

Subject: [PATCH v2 3/3] NFS: Fix a memory leak in nfs_readdir

We need to ensure that the entries in the nfs_cache_array get cleared
when the page is removed from the page cache. To do so, we use the
releasepage address_space operation (which also requires us to set
the PG_private flag).

Change nfs_readdir_clear_array to use kmap_atomic(), so that the
function can be safely called from all direct reclaim contexts.

Finally, modify the cache_page_release helper to call
nfs_readdir_clear_array directly when dealing with an anonymous
page from 'uncached_readdir'.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 22 +++++++++++++++++-----
fs/nfs/inode.c | 1 +
include/linux/nfs_fs.h | 1 +
3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3ec3f1c..4c6319e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -58,6 +58,7 @@ static int nfs_rename(struct inode *, struct dentry *,
static int nfs_fsync_dir(struct file *, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
static int nfs_readdir_clear_array(struct page*, gfp_t);
+static void nfs_readdir_invalidatepage(struct page*, unsigned long);

const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -85,6 +86,7 @@ const struct inode_operations nfs_dir_inode_operations = {

const struct address_space_operations nfs_dir_addr_space_ops = {
.releasepage = nfs_readdir_clear_array,
+ .invalidatepage = nfs_readdir_invalidatepage,
};

#ifdef CONFIG_NFS_V3
@@ -216,15 +218,22 @@ void nfs_readdir_release_array(struct page *page)
static
int nfs_readdir_clear_array(struct page *page, gfp_t mask)
{
- struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ struct nfs_cache_array *array;
int i;

- if (IS_ERR(array))
- return PTR_ERR(array);
+ array = kmap_atomic(page, KM_USER0);
for (i = 0; i < array->size; i++)
kfree(array->array[i].string.name);
- nfs_readdir_release_array(page);
- return 0;
+ kunmap_atomic(array, KM_USER0);
+ ClearPageUptodate(page);
+ ClearPagePrivate(page);
+ return 1;
+}
+
+static
+void nfs_readdir_invalidatepage(struct page *page, unsigned long offset)
+{
+ nfs_readdir_clear_array(page, 0);
}

/*
@@ -624,6 +633,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
if (ret < 0)
goto error;
SetPageUptodate(page);
+ SetPagePrivate(page);

if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
/* Should never happen */
@@ -639,6 +649,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
static
void cache_page_release(nfs_readdir_descriptor_t *desc)
{
+ if (!desc->page->mapping)
+ nfs_readdir_clear_array(desc->page, GFP_KERNEL);
unlock_page(desc->page);
page_cache_release(desc->page);
desc->page = NULL;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 314f571..0018e07 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
inode->i_fop = &nfs_dir_operations;
+ inode->i_data.a_ops = &nfs_dir_addr_space_ops;
if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
/* Deal with crossing mountpoints */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c66fdb7..b5d3ab0 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -401,6 +401,7 @@ extern const struct inode_operations nfs3_file_inode_operations;
#endif /* CONFIG_NFS_V3 */
extern const struct file_operations nfs_file_operations;
extern const struct address_space_operations nfs_file_aops;
+extern const struct address_space_operations nfs_dir_addr_space_ops;

static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
{
--
1.7.3.2

2010-12-01 15:37:43

by Myklebust, Trond

[permalink] [raw]

Subject: [PATCH v2 1/3] NFS: Ensure we use the correct cookie in nfs_readdir_xdr_filler

We need to use the cookie from the previous array entry, not the
actual cookie that we are searching for (except for the case of
uncached_readdir).

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/dir.c | 10 ++++++++--
1 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f0a384e..e03537f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -178,6 +178,7 @@ typedef struct {
struct page *page;
unsigned long page_index;
u64 *dir_cookie;
+ u64 last_cookie;
loff_t current_index;
decode_dirent_t decode;

@@ -344,6 +345,8 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
else
status = nfs_readdir_search_for_cookie(array, desc);

+ if (status == -EAGAIN)
+ desc->last_cookie = array->last_cookie;
nfs_readdir_release_array(desc->page);
out:
return status;
@@ -563,7 +566,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
unsigned int array_size = ARRAY_SIZE(pages);

entry.prev_cookie = 0;
- entry.cookie = *desc->dir_cookie;
+ entry.cookie = desc->last_cookie;
entry.eof = 0;
entry.fh = nfs_alloc_fhandle();
entry.fattr = nfs_alloc_fattr();
@@ -672,8 +675,10 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
{
int res;

- if (desc->page_index == 0)
+ if (desc->page_index == 0) {
desc->current_index = 0;
+ desc->last_cookie = 0;
+ }
while (1) {
res = find_cache_page(desc);
if (res != -EAGAIN)
@@ -764,6 +769,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
}

desc->page_index = 0;
+ desc->last_cookie = *desc->dir_cookie;
desc->page = page;

status = nfs_readdir_xdr_to_array(desc, page, inode);
--
1.7.3.2

2010-12-01 16:18:06

2010-12-06 17:00:32

by Myklebust, Trond

[permalink] [raw]

Subject: [PATCH v4 1/3] NFS: Ensure we use the correct cookie in nfs_readdir_xdr_filler

2010-12-06 17:00:07

by Myklebust, Trond

[permalink] [raw]

Subject: [PATCH v4 2/3] Call the filesystem back whenever a page is removed from the page cache

From: Linus Torvalds <[email protected]>

NFS needs to be able to release objects that are stored in the page
cache once the page itself is no longer visible from the page cache.

This patch adds a callback to the address space operations that allows
filesystems to perform page cleanups once the page has been removed
from the page cache.

Original patch by: Linus Torvalds <[email protected]>
[trondmy: cover the cases of invalidate_inode_pages2() and
truncate_inode_pages()]
Signed-off-by: Trond Myklebust <[email protected]>
---
Documentation/filesystems/Locking | 7 ++++++-
Documentation/filesystems/vfs.txt | 7 +++++++
include/linux/fs.h | 1 +
mm/filemap.c | 5 +++++
mm/truncate.c | 4 ++++
mm/vmscan.c | 7 +++++++
6 files changed, 30 insertions(+), 1 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index a91f308..b6426f1 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -173,12 +173,13 @@ prototypes:
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
+ void (*freepage)(struct page *);
int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*launder_page) (struct page *);

locking rules:
- All except set_page_dirty may block
+ All except set_page_dirty and freepage may block

BKL PageLocked(page) i_mutex
writepage: no yes, unlocks (see below)
@@ -193,6 +194,7 @@ perform_write: no n/a yes
bmap: no
invalidatepage: no yes
releasepage: no yes
+freepage: no yes
direct_IO: no
launder_page: no yes

@@ -288,6 +290,9 @@ buffers from the page in preparation for freeing it. It returns zero to
indicate that the buffers are (or may be) freeable. If ->releasepage is zero,
the kernel assumes that the fs has no private interest in the buffers.

+ ->freepage() is called when the kernel is done dropping the page
+from the page cache.
+
->launder_page() may be called prior to releasing a page if
it is still found to be dirty. It returns zero if the page was successfully
cleaned, or an error value if not. Note that in order to prevent the page
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index ed7e5ef..3b14a55 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -534,6 +534,7 @@ struct address_space_operations {
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
+ void (*freepage)(struct page *);
ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
struct page* (*get_xip_page)(struct address_space *, sector_t,
@@ -679,6 +680,12 @@ struct address_space_operations {
need to ensure this. Possibly it can clear the PageUptodate
bit if it cannot free private data yet.

+ freepage: freepage is called once the page is no longer visible in
+ the page cache in order to allow the cleanup of any private
+ data. Since it may be called by the memory reclaimer, it
+ should not assume that the original address_space mapping still
+ exists, and it should not block.
+
direct_IO: called by the generic read/write routines to perform
direct_IO - that is IO requests which bypass the page cache
and transfer data directly between the storage and the
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c9e06cc..090f0ea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -602,6 +602,7 @@ struct address_space_operations {
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, gfp_t);
+ void (*freepage)(struct page *);
ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*get_xip_mem)(struct address_space *, pgoff_t, int,
diff --git a/mm/filemap.c b/mm/filemap.c
index ea89840..6b9aee2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -143,13 +143,18 @@ void __remove_from_page_cache(struct page *page)
void remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ void (*freepage)(struct page *);

BUG_ON(!PageLocked(page));

+ freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage)
+ freepage(page);
}
EXPORT_SYMBOL(remove_from_page_cache);

diff --git a/mm/truncate.c b/mm/truncate.c
index ba887bf..3c2d5dd 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (mapping->a_ops->freepage)
+ mapping->a_ops->freepage(page);
+
page_cache_release(page); /* pagecache ref */
return 1;
failed:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d31d7ce..9ca587c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -494,9 +494,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
spin_unlock_irq(&mapping->tree_lock);
swapcache_free(swap, page);
} else {
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage != NULL)
+ freepage(page);
}

return 1;
--
1.7.3.2

2010-12-07 07:08:35

by Nick Piggin

[permalink] [raw]

Subject: Re: [PATCH v4 2/3] Call the filesystem back whenever a page is removed from the page cache

On Mon, Dec 06, 2010 at 11:59:07AM -0500, Trond Myklebust wrote:
> From: Linus Torvalds <[email protected]>
>
> NFS needs to be able to release objects that are stored in the page
> cache once the page itself is no longer visible from the page cache.
>
> This patch adds a callback to the address space operations that allows
> filesystems to perform page cleanups once the page has been removed
> from the page cache.
>
> Original patch by: Linus Torvalds <[email protected]>
> [trondmy: cover the cases of invalidate_inode_pages2() and
> truncate_inode_pages()]
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> Documentation/filesystems/Locking | 7 ++++++-
> Documentation/filesystems/vfs.txt | 7 +++++++
> include/linux/fs.h | 1 +
> mm/filemap.c | 5 +++++
> mm/truncate.c | 4 ++++
> mm/vmscan.c | 7 +++++++
> 6 files changed, 30 insertions(+), 1 deletions(-)
>
> diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
> index a91f308..b6426f1 100644
> --- a/Documentation/filesystems/Locking
> +++ b/Documentation/filesystems/Locking
> @@ -173,12 +173,13 @@ prototypes:
> sector_t (*bmap)(struct address_space *, sector_t);
> int (*invalidatepage) (struct page *, unsigned long);
> int (*releasepage) (struct page *, int);
> + void (*freepage)(struct page *);
> int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
> loff_t offset, unsigned long nr_segs);
> int (*launder_page) (struct page *);
>
> locking rules:
> - All except set_page_dirty may block
> + All except set_page_dirty and freepage may block
>
> BKL PageLocked(page) i_mutex
> writepage: no yes, unlocks (see below)
> @@ -193,6 +194,7 @@ perform_write: no n/a yes
> bmap: no
> invalidatepage: no yes
> releasepage: no yes
> +freepage: no yes
> direct_IO: no
> launder_page: no yes
>
> @@ -288,6 +290,9 @@ buffers from the page in preparation for freeing it. It returns zero to
> indicate that the buffers are (or may be) freeable. If ->releasepage is zero,
> the kernel assumes that the fs has no private interest in the buffers.
>
> + ->freepage() is called when the kernel is done dropping the page
> +from the page cache.
> +
> ->launder_page() may be called prior to releasing a page if
> it is still found to be dirty. It returns zero if the page was successfully
> cleaned, or an error value if not. Note that in order to prevent the page
> diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
> index ed7e5ef..3b14a55 100644
> --- a/Documentation/filesystems/vfs.txt
> +++ b/Documentation/filesystems/vfs.txt
> @@ -534,6 +534,7 @@ struct address_space_operations {
> sector_t (*bmap)(struct address_space *, sector_t);
> int (*invalidatepage) (struct page *, unsigned long);
> int (*releasepage) (struct page *, int);
> + void (*freepage)(struct page *);
> ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
> loff_t offset, unsigned long nr_segs);
> struct page* (*get_xip_page)(struct address_space *, sector_t,
> @@ -679,6 +680,12 @@ struct address_space_operations {
> need to ensure this. Possibly it can clear the PageUptodate
> bit if it cannot free private data yet.
>
> + freepage: freepage is called once the page is no longer visible in
> + the page cache in order to allow the cleanup of any private
> + data. Since it may be called by the memory reclaimer, it
> + should not assume that the original address_space mapping still
> + exists, and it should not block.

Of course we still have bugs in this regard, without inode RCU and
filesystem deregistration RCU, but when those things are implemented
for RCU path-walk, this section should be updated somewhat, and we'll
have to look at RCU protecting the final mapping manipulations after
a page is removed from pagecache.

But I'll help work on that after RCU inodes / filesystems are merged.

> +
> direct_IO: called by the generic read/write routines to perform
> direct_IO - that is IO requests which bypass the page cache
> and transfer data directly between the storage and the
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c9e06cc..090f0ea 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -602,6 +602,7 @@ struct address_space_operations {
> sector_t (*bmap)(struct address_space *, sector_t);
> void (*invalidatepage) (struct page *, unsigned long);
> int (*releasepage) (struct page *, gfp_t);
> + void (*freepage)(struct page *);
> ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
> loff_t offset, unsigned long nr_segs);
> int (*get_xip_mem)(struct address_space *, pgoff_t, int,
> diff --git a/mm/filemap.c b/mm/filemap.c
> index ea89840..6b9aee2 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -143,13 +143,18 @@ void __remove_from_page_cache(struct page *page)
> void remove_from_page_cache(struct page *page)
> {
> struct address_space *mapping = page->mapping;
> + void (*freepage)(struct page *);
>
> BUG_ON(!PageLocked(page));
>
> + freepage = mapping->a_ops->freepage;
> spin_lock_irq(&mapping->tree_lock);
> __remove_from_page_cache(page);
> spin_unlock_irq(&mapping->tree_lock);
> mem_cgroup_uncharge_cache_page(page);
> +
> + if (freepage)
> + freepage(page);
> }
> EXPORT_SYMBOL(remove_from_page_cache);
>
> diff --git a/mm/truncate.c b/mm/truncate.c
> index ba887bf..3c2d5dd 100644
> --- a/mm/truncate.c
> +++ b/mm/truncate.c
> @@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
> __remove_from_page_cache(page);
> spin_unlock_irq(&mapping->tree_lock);
> mem_cgroup_uncharge_cache_page(page);
> +
> + if (mapping->a_ops->freepage)
> + mapping->a_ops->freepage(page);
> +
> page_cache_release(page); /* pagecache ref */
> return 1;
> failed:

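The generic parts of the code look OK to me, but why do the call sites
differ in when they load the freepage function pointer? In
remove_from_page_cache() it is read before the page is removed, whereas
here it is read from mapping->a_ops only after the removal.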