Hi,
These patches implement swap files on NFS, but lay the foundation to
allow swap files on any non block device backed file.
As is, these patches allow for swapfiles to be used on NFS mounts. However
some extra work is needed to make this safe. It is not very hard to deadlock
a kernel with only these patches.
In the next VM deadlock avoidance series I will include a patch to remedy
this.
Add support for non block device backed swap files.
A new address_space_operations method is added:
int swapfile(struct address_space *, int)
When this method is found during sys_swapon() and returns no error, the
swapper_space.a_ops will proxy to sis->swap_file->f_mapping->a_ops.
The swapfile method will be used to communicate to the address_space that the
VM relies on it, and the address_space should take adequate measures (like
reserving memory for mempools or the like).
Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/fs.h | 1
include/linux/swap.h | 4 +++
init/Kconfig | 5 ++++
mm/page_io.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
mm/swap_state.c | 6 +++++
mm/swapfile.c | 27 ++++++++++++++++++++++
6 files changed, 102 insertions(+), 1 deletion(-)
Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h
+++ linux-2.6/include/linux/swap.h
@@ -115,6 +115,7 @@ enum {
SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
SWP_ACTIVE = (SWP_USED | SWP_WRITEOK),
+ SWP_FILE = (1 << 2), /* file swap area */
/* add others here before... */
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
};
@@ -212,6 +213,9 @@ extern void swap_unplug_io_fn(struct bac
/* linux/mm/page_io.c */
extern int swap_readpage(struct file *, struct page *);
extern int swap_writepage(struct page *page, struct writeback_control *wbc);
+extern void swap_sync_page(struct page *page);
+extern int swap_set_page_dirty(struct page *page);
+extern int swap_releasepage(struct page *page, gfp_t gfp_mask);
extern int rw_swap_page_sync(int, swp_entry_t, struct page *);
/* linux/mm/swap_state.c */
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -100,6 +100,11 @@ config SWAP
used to provide more virtual memory than the actual RAM present
in your computer. If unsure say Y.
+config SWAP_FILE
+ bool "Support for paging to/from non block device files"
+ depends on SWAP
+ default n
+
config SYSVIPC
bool "System V IPC"
---help---
Index: linux-2.6/mm/page_io.c
===================================================================
--- linux-2.6.orig/mm/page_io.c
+++ linux-2.6/mm/page_io.c
@@ -17,6 +17,7 @@
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
+#include <linux/buffer_head.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
@@ -91,6 +92,14 @@ int swap_writepage(struct page *page, st
unlock_page(page);
goto out;
}
+#ifdef CONFIG_SWAP_FILE
+ {
+ struct swap_info_struct *sis = page_swap_info(page);
+ if (sis->flags & SWP_FILE)
+ return sis->swap_file->f_mapping->
+ a_ops->writepage(page, wbc);
+ }
+#endif
bio = get_swap_bio(GFP_NOIO, page_private(page), page,
end_swap_bio_write);
if (bio == NULL) {
@@ -116,6 +125,14 @@ int swap_readpage(struct file *file, str
BUG_ON(!PageLocked(page));
ClearPageUptodate(page);
+#ifdef CONFIG_SWAP_FILE
+ {
+ struct swap_info_struct *sis = page_swap_info(page);
+ if (sis->flags & SWP_FILE)
+ return sis->swap_file->f_mapping->
+ a_ops->readpage(sis->swap_file, page);
+ }
+#endif
bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
end_swap_bio_read);
if (bio == NULL) {
@@ -129,6 +146,49 @@ out:
return ret;
}
+#ifdef CONFIG_SWAP_FILE
+void swap_sync_page(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ if (sis->flags & SWP_FILE) {
+ const struct address_space_operations * a_ops =
+ sis->swap_file->f_mapping->a_ops;
+ if (a_ops->sync_page)
+ a_ops->sync_page(page);
+ } else
+ block_sync_page(page);
+}
+
+int swap_set_page_dirty(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ if (sis->flags & SWP_FILE) {
+ const struct address_space_operations * a_ops =
+ sis->swap_file->f_mapping->a_ops;
+ if (a_ops->set_page_dirty)
+ return a_ops->set_page_dirty(page);
+ return __set_page_dirty_buffers(page);
+ }
+
+ return __set_page_dirty_nobuffers(page);
+}
+
+int swap_releasepage(struct page *page, gfp_t gfp_mask)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+ const struct address_space_operations * a_ops =
+ sis->swap_file->f_mapping->a_ops;
+
+ if ((sis->flags & SWP_FILE) && a_ops->releasepage)
+ return a_ops->releasepage(page, gfp_mask);
+
+ BUG();
+ return 0;
+}
+#endif
+
#ifdef CONFIG_SOFTWARE_SUSPEND
/*
* A scruffy utility function to read or write an arbitrary swap page
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c
+++ linux-2.6/mm/swap_state.c
@@ -26,8 +26,14 @@
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
+#ifdef CONFIG_SWAP_FILE
+ .sync_page = swap_sync_page,
+ .set_page_dirty = swap_set_page_dirty,
+ .releasepage = swap_releasepage,
+#else
.sync_page = block_sync_page,
.set_page_dirty = __set_page_dirty_nobuffers,
+#endif
.migratepage = migrate_page,
};
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c
+++ linux-2.6/mm/swapfile.c
@@ -411,7 +411,12 @@ void free_swap_and_cache(swp_entry_t ent
if (page) {
int one_user;
+#ifdef CONFIG_SWAP_FILE
+ if (PagePrivate(page))
+ page_mapping(page)->a_ops->releasepage(page, 0);
+#else
BUG_ON(PagePrivate(page));
+#endif
one_user = (page_count(page) == 2);
/* Only cache user (+us), or swap space full? Free it! */
/* Also recheck PageSwapCache after page is locked (above) */
@@ -943,6 +948,13 @@ static void destroy_swap_extents(struct
list_del(&se->list);
kfree(se);
}
+#ifdef CONFIG_SWAP_FILE
+ if (sis->flags & SWP_FILE) {
+ sis->flags &= ~SWP_FILE;
+ sis->swap_file->f_mapping->a_ops->
+ swapfile(sis->swap_file->f_mapping, 0);
+ }
+#endif
}
/*
@@ -1035,6 +1047,19 @@ static int setup_swap_extents(struct swa
goto done;
}
+#ifdef CONFIG_SWAP_FILE
+ if (sis->swap_file->f_mapping->a_ops->swapfile) {
+ ret = sis->swap_file->f_mapping->a_ops->
+ swapfile(sis->swap_file->f_mapping, 1);
+ if (!ret) {
+ sis->flags |= SWP_FILE;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ *span = sis->pages;
+ }
+ goto done;
+ }
+#endif
+
blkbits = inode->i_blkbits;
blocks_per_page = PAGE_SIZE >> blkbits;
@@ -1591,7 +1616,7 @@ asmlinkage long sys_swapon(const char __
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
- p->flags = SWP_ACTIVE;
+ p->flags |= SWP_WRITEOK;
nr_swap_pages += nr_good_pages;
total_swap_pages += nr_good_pages;
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -382,6 +382,7 @@ struct address_space_operations {
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct address_space *,
struct page *, struct page *);
+ int (*swapfile)(struct address_space *, int);
};
struct backing_dev_info;
Teach the NFS client how to treat PG_swapcache pages.
Replace all occurrences of page->index and page->mapping in the NFS client
with the new page_file_index() and page_file_mapping() functions.
Signed-off-by: Peter Zijlstra <[email protected]>
---
fs/nfs/dir.c | 4 ++--
fs/nfs/file.c | 6 +++---
fs/nfs/pagelist.c | 8 ++++----
fs/nfs/read.c | 10 +++++-----
fs/nfs/write.c | 34 +++++++++++++++++-----------------
5 files changed, 31 insertions(+), 31 deletions(-)
Index: linux-2.6/fs/nfs/file.c
===================================================================
--- linux-2.6.orig/fs/nfs/file.c
+++ linux-2.6/fs/nfs/file.c
@@ -303,16 +303,16 @@ static int nfs_commit_write(struct file
static void nfs_invalidate_page(struct page *page, unsigned long offset)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
/* Cancel any unstarted writes on this page */
if (offset == 0)
- nfs_sync_inode_wait(inode, page->index, 1, FLUSH_INVALIDATE);
+ nfs_sync_inode_wait(inode, page_file_index(page), 1, FLUSH_INVALIDATE);
}
static int nfs_release_page(struct page *page, gfp_t gfp)
{
- return !nfs_wb_page(page->mapping->host, page);
+ return !nfs_wb_page(page_file_mapping(page)->host, page);
}
const struct address_space_operations nfs_file_aops = {
Index: linux-2.6/fs/nfs/pagelist.c
===================================================================
--- linux-2.6.orig/fs/nfs/pagelist.c
+++ linux-2.6/fs/nfs/pagelist.c
@@ -82,11 +82,11 @@ nfs_create_request(struct nfs_open_conte
* update_nfs_request below if the region is not locked. */
req->wb_page = page;
atomic_set(&req->wb_complete, 0);
- req->wb_index = page->index;
+ req->wb_index = page_file_index(page);
page_cache_get(page);
BUG_ON(PagePrivate(page));
BUG_ON(!PageLocked(page));
- BUG_ON(page->mapping->host != inode);
+ BUG_ON(page_file_mapping(page)->host != inode);
req->wb_offset = offset;
req->wb_pgbase = offset;
req->wb_bytes = count;
@@ -271,7 +271,7 @@ nfs_coalesce_requests(struct list_head *
* nfs_scan_lock_dirty - Scan the radix tree for dirty requests
* @nfsi: NFS inode
* @dst: Destination list
- * @idx_start: lower bound of page->index to scan
+ * @idx_start: lower bound of page_file_index(page) to scan
* @npages: idx_start + npages sets the upper bound to scan.
*
* Moves elements from one of the inode request lists.
@@ -328,7 +328,7 @@ out:
* @nfsi: NFS inode
* @head: One of the NFS inode request lists
* @dst: Destination list
- * @idx_start: lower bound of page->index to scan
+ * @idx_start: lower bound of page_file_index(page) to scan
* @npages: idx_start + npages sets the upper bound to scan.
*
* Moves elements from one of the inode request lists.
Index: linux-2.6/fs/nfs/read.c
===================================================================
--- linux-2.6.orig/fs/nfs/read.c
+++ linux-2.6/fs/nfs/read.c
@@ -84,9 +84,9 @@ unsigned int nfs_page_length(struct inod
if (i_size <= 0)
return 0;
idx = (i_size - 1) >> PAGE_CACHE_SHIFT;
- if (page->index > idx)
+ if (page_file_index(page) > idx)
return 0;
- if (page->index != idx)
+ if (page_file_index(page) != idx)
return PAGE_CACHE_SIZE;
return 1 + ((i_size - 1) & (PAGE_CACHE_SIZE - 1));
}
@@ -586,11 +586,11 @@ int nfs_readpage_result(struct rpc_task
int nfs_readpage(struct file *file, struct page *page)
{
struct nfs_open_context *ctx;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
int error;
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
- page, PAGE_CACHE_SIZE, page->index);
+ page, PAGE_CACHE_SIZE, page_file_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
nfs_add_stats(inode, NFSIOS_READPAGES, 1);
@@ -638,7 +638,7 @@ static int
readpage_async_filler(void *data, struct page *page)
{
struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *new;
unsigned int len;
Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -152,13 +152,13 @@ void nfs_writedata_release(void *wdata)
/* Adjust the file length if we're writing beyond the end */
static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
loff_t end, i_size = i_size_read(inode);
unsigned long end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
- if (i_size > 0 && page->index < end_index)
+ if (i_size > 0 && page_file_index(page) < end_index)
return;
- end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
+ end = page_offset(page) + ((loff_t)offset+count);
if (i_size >= end)
return;
nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
@@ -181,11 +181,11 @@ static void nfs_mark_uptodate(struct pag
return;
}
- end_offs = i_size_read(page->mapping->host) - 1;
+ end_offs = i_size_read(page_file_mapping(page)->host) - 1;
if (end_offs < 0)
return;
/* Is this the last page? */
- if (page->index != (unsigned long)(end_offs >> PAGE_CACHE_SHIFT))
+ if (page_file_index(page) != (unsigned long)(end_offs >> PAGE_CACHE_SHIFT))
return;
/* This is the last page: set PG_uptodate if we cover the entire
* extent of the data, then zero the rest of the page.
@@ -300,7 +300,7 @@ static int wb_priority(struct writeback_
int nfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct nfs_open_context *ctx;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
unsigned long end_index;
unsigned offset = PAGE_CACHE_SIZE;
loff_t i_size = i_size_read(inode);
@@ -327,14 +327,14 @@ int nfs_writepage(struct page *page, str
nfs_wb_page_priority(inode, page, priority);
/* easy case */
- if (page->index < end_index)
+ if (page_file_index(page) < end_index)
goto do_it;
/* things got complicated... */
offset = i_size & (PAGE_CACHE_SIZE-1);
/* OK, are we completely out? */
err = 0; /* potential race with truncate - ignore */
- if (page->index >= end_index+1 || !offset)
+ if (page_file_index(page) >= end_index+1 || !offset)
goto out;
do_it:
ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE);
@@ -606,7 +606,7 @@ static void nfs_cancel_commit_list(struc
* nfs_scan_dirty - Scan an inode for dirty requests
* @inode: NFS inode to scan
* @dst: destination list
- * @idx_start: lower bound of page->index to scan.
+ * @idx_start: lower bound of page_file_index(page) to scan.
* @npages: idx_start + npages sets the upper bound to scan.
*
* Moves requests from the inode's dirty page list.
@@ -632,7 +632,7 @@ nfs_scan_dirty(struct inode *inode, stru
* nfs_scan_commit - Scan an inode for commit requests
* @inode: NFS inode to scan
* @dst: destination list
- * @idx_start: lower bound of page->index to scan.
+ * @idx_start: lower bound of page_file_index(page) to scan.
* @npages: idx_start + npages sets the upper bound to scan.
*
* Moves requests from the inode's 'commit' request list.
@@ -713,14 +713,14 @@ static struct nfs_page * nfs_update_requ
end = offset + bytes;
- if (nfs_wait_on_write_congestion(page->mapping, server->flags & NFS_MOUNT_INTR))
+ if (nfs_wait_on_write_congestion(page_file_mapping(page), server->flags & NFS_MOUNT_INTR))
return ERR_PTR(-ERESTARTSYS);
for (;;) {
/* Loop over all inode entries and see if we find
* A request for the page we wish to update
*/
spin_lock(&nfsi->req_lock);
- req = _nfs_find_request(inode, page->index);
+ req = _nfs_find_request(inode, page_file_index(page));
if (req) {
if (!nfs_lock_request_dontget(req)) {
int error;
@@ -791,7 +791,7 @@ static struct nfs_page * nfs_update_requ
int nfs_flush_incompatible(struct file *file, struct page *page)
{
struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *req;
int status = 0;
/*
@@ -802,7 +802,7 @@ int nfs_flush_incompatible(struct file *
* Also do the same if we find a request from an existing
* dropped page.
*/
- req = nfs_find_request(inode, page->index);
+ req = nfs_find_request(inode, page_file_index(page));
if (req) {
if (req->wb_page != page || ctx != req->wb_context)
status = nfs_wb_page(inode, page);
@@ -821,7 +821,7 @@ int nfs_updatepage(struct file *file, st
unsigned int offset, unsigned int count)
{
struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *req;
int status = 0;
@@ -854,12 +854,12 @@ int nfs_updatepage(struct file *file, st
offset = 0;
if (unlikely(end_offs < 0)) {
/* Do nothing */
- } else if (page->index == end_index) {
+ } else if (page_file_index(page) == end_index) {
unsigned int pglen;
pglen = (unsigned int)(end_offs & (PAGE_CACHE_SIZE-1)) + 1;
if (count < pglen)
count = pglen;
- } else if (page->index < end_index)
+ } else if (page_file_index(page) < end_index)
count = PAGE_CACHE_SIZE;
}
Index: linux-2.6/fs/nfs/dir.c
===================================================================
--- linux-2.6.orig/fs/nfs/dir.c
+++ linux-2.6/fs/nfs/dir.c
@@ -177,7 +177,7 @@ int nfs_readdir_filler(nfs_readdir_descr
dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
__FUNCTION__, (long long)desc->entry->cookie,
- page->index);
+ page_file_index(page));
again:
timestamp = jiffies;
@@ -201,7 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descr
* Note: assumes we have exclusive access to this mapping either
* through inode->i_mutex or some other mechanism.
*/
- if (page->index == 0)
+ if (page_file_index(page) == 0)
invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1);
unlock_page(page);
return 0;
In order to teach filesystems to handle swap cache pages, two new page
functions are introduced:
pgoff_t page_file_index(struct page *);
struct address_space *page_file_mapping(struct page *);
page_file_index - gives the offset of this page in the file in PAGE_CACHE_SIZE
blocks. For regular page cache pages this is page->index; for PG_swapcache
pages it is the swap offset stored in page_private().
page_file_mapping - gives the mapping backing the actual page; that is, for
swap cache pages it will give swap_file->f_mapping.
page_offset() is modified to use page_file_index(), so that it will give the
expected result, even for PG_swapcache pages.
Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/mm.h | 30 ++++++++++++++++++++++++++++++
include/linux/pagemap.h | 2 +-
include/linux/swap.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/swapops.h | 44 --------------------------------------------
4 files changed, 79 insertions(+), 45 deletions(-)
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -15,6 +15,7 @@
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/debug_locks.h>
+#include <linux/swap.h>
struct mempolicy;
struct anon_vma;
@@ -579,6 +580,22 @@ static inline struct address_space *page
return mapping;
}
+static inline
+struct swap_info_struct * page_swap_info(struct page *page)
+{
+ swp_entry_t swap = { .val = page_private(page) };
+ BUG_ON(!PageSwapCache(page));
+ return get_swap_info_struct(swp_type(swap));
+}
+
+static inline
+struct address_space *page_file_mapping(struct page *page)
+{
+ if (unlikely(PageSwapCache(page)))
+ return page_swap_info(page)->swap_file->f_mapping;
+ return page->mapping;
+}
+
static inline int PageAnon(struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
@@ -596,6 +613,19 @@ static inline pgoff_t page_index(struct
}
/*
+ * Return the file index of the page. Regular pagecache pages use ->index
+ * whereas swapcache pages use swp_offset(->private)
+ */
+static inline pgoff_t page_file_index(struct page *page)
+{
+ if (unlikely(PageSwapCache(page))) {
+ swp_entry_t swap = { .val = page_private(page) };
+ return swp_offset(swap);
+ }
+ return page->index;
+}
+
+/*
* The atomic page->_mapcount, like _count, starts from -1:
* so that transitions both from it and to it can be tracked,
* using atomic_inc_and_test and atomic_add_negative(-1).
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -118,7 +118,7 @@ extern void __remove_from_page_cache(str
*/
static inline loff_t page_offset(struct page *page)
{
- return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
+ return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
}
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h
+++ linux-2.6/include/linux/swap.h
@@ -75,6 +75,50 @@ typedef struct {
} swp_entry_t;
/*
+ * swapcache pages are stored in the swapper_space radix tree. We want to
+ * get good packing density in that tree, so the index should be dense in
+ * the low-order bits.
+ *
+ * We arrange the `type' and `offset' fields so that `type' is at the five
+ * high-order bits of the swp_entry_t and `offset' is right-aligned in the
+ * remaining bits.
+ *
+ * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
+ */
+#define SWP_TYPE_SHIFT(e) (sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
+#define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1)
+
+/*
+ * Store a type+offset into a swp_entry_t in an arch-independent format
+ */
+static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
+{
+ swp_entry_t ret;
+
+ ret.val = (type << SWP_TYPE_SHIFT(ret)) |
+ (offset & SWP_OFFSET_MASK(ret));
+ return ret;
+}
+
+/*
+ * Extract the `type' field from a swp_entry_t. The swp_entry_t is in
+ * arch-independent format
+ */
+static inline unsigned swp_type(swp_entry_t entry)
+{
+ return (entry.val >> SWP_TYPE_SHIFT(entry));
+}
+
+/*
+ * Extract the `offset' field from a swp_entry_t. The swp_entry_t is in
+ * arch-independent format
+ */
+static inline pgoff_t swp_offset(swp_entry_t entry)
+{
+ return entry.val & SWP_OFFSET_MASK(entry);
+}
+
+/*
* current->reclaim_state points to one of these when a task is running
* memory reclaim
*/
@@ -322,6 +366,10 @@ static inline int valid_swaphandles(swp_
return 0;
}
+static inline struct swap_info_struct *get_swap_info_struct(unsigned type)
+{
+ return NULL;
+}
#define can_share_swap_page(p) (page_mapcount(p) == 1)
static inline int move_to_swap_cache(struct page *page, swp_entry_t entry)
Index: linux-2.6/include/linux/swapops.h
===================================================================
--- linux-2.6.orig/include/linux/swapops.h
+++ linux-2.6/include/linux/swapops.h
@@ -1,48 +1,4 @@
/*
- * swapcache pages are stored in the swapper_space radix tree. We want to
- * get good packing density in that tree, so the index should be dense in
- * the low-order bits.
- *
- * We arrange the `type' and `offset' fields so that `type' is at the five
- * high-order bits of the swp_entry_t and `offset' is right-aligned in the
- * remaining bits.
- *
- * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
- */
-#define SWP_TYPE_SHIFT(e) (sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
-#define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1)
-
-/*
- * Store a type+offset into a swp_entry_t in an arch-independent format
- */
-static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
-{
- swp_entry_t ret;
-
- ret.val = (type << SWP_TYPE_SHIFT(ret)) |
- (offset & SWP_OFFSET_MASK(ret));
- return ret;
-}
-
-/*
- * Extract the `type' field from a swp_entry_t. The swp_entry_t is in
- * arch-independent format
- */
-static inline unsigned swp_type(swp_entry_t entry)
-{
- return (entry.val >> SWP_TYPE_SHIFT(entry));
-}
-
-/*
- * Extract the `offset' field from a swp_entry_t. The swp_entry_t is in
- * arch-independent format
- */
-static inline pgoff_t swp_offset(swp_entry_t entry)
-{
- return entry.val & SWP_OFFSET_MASK(entry);
-}
-
-/*
* Convert the arch-dependent pte representation of a swp_entry_t into an
* arch-independent swp_entry_t.
*/
Now that 'include/linux/mm.h' includes 'include/linux/swap.h', the global
remove_mapping() definition clashes with the arch/um one.
Rename the arch/um one.
Signed-off-by: Peter Zijlstra <[email protected]>
---
arch/um/kernel/physmem.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
Index: linux-2.6/arch/um/kernel/physmem.c
===================================================================
--- linux-2.6.orig/arch/um/kernel/physmem.c
+++ linux-2.6/arch/um/kernel/physmem.c
@@ -160,7 +160,7 @@ int physmem_subst_mapping(void *virt, in
static int physmem_fd = -1;
-static void remove_mapping(struct phys_desc *desc)
+static void um_remove_mapping(struct phys_desc *desc)
{
void *virt = desc->virt;
int err;
@@ -184,7 +184,7 @@ int physmem_remove_mapping(void *virt)
if(desc == NULL)
return(0);
- remove_mapping(desc);
+ um_remove_mapping(desc);
return(1);
}
@@ -205,7 +205,7 @@ void physmem_forget_descriptor(int fd)
page = list_entry(ele, struct phys_desc, list);
offset = page->offset;
addr = page->virt;
- remove_mapping(page);
+ um_remove_mapping(page);
err = os_seek_file(fd, offset);
if(err)
panic("physmem_forget_descriptor - failed to seek "
Add a comment explaining the use of PG_private in the NFS client.
Signed-off-by: Peter Zijlstra <[email protected]>
---
fs/nfs/write.c | 5 +++++
1 file changed, 5 insertions(+)
Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -424,6 +424,11 @@ static int nfs_inode_add_request(struct
if (nfs_have_delegation(inode, FMODE_WRITE))
nfsi->change_attr++;
}
+ /*
+ * The PG_private bit is unfortunately needed if we want to fix the
+ * hole in the mmap semantics. If we do not set it, then the VM will
+ * fail to call the "releasepage" address ops.
+ */
SetPagePrivate(req->wb_page);
nfsi->npages++;
atomic_inc(&req->wb_count);
Now that NFS can handle swap cache pages, add a swapfile method to allow
swapping over NFS.
NOTE: this dummy method is obviously not enough to make it safe.
A more complete version of the nfs_swapfile() function will be present
in the next VM deadlock avoidance patches.
Signed-off-by: Peter Zijlstra <[email protected]>
---
fs/nfs/file.c | 6 ++++++
1 file changed, 6 insertions(+)
Index: linux-2.6/fs/nfs/file.c
===================================================================
--- linux-2.6.orig/fs/nfs/file.c
+++ linux-2.6/fs/nfs/file.c
@@ -315,6 +315,11 @@ static int nfs_release_page(struct page
return !nfs_wb_page(page_file_mapping(page)->host, page);
}
+static int nfs_swapfile(struct address_space *mapping, int enable)
+{
+ return 0;
+}
+
const struct address_space_operations nfs_file_aops = {
.readpage = nfs_readpage,
.readpages = nfs_readpages,
@@ -328,6 +333,7 @@ const struct address_space_operations nf
#ifdef CONFIG_NFS_DIRECTIO
.direct_IO = nfs_direct_IO,
#endif
+ .swapfile = nfs_swapfile,
};
/*
Make sure we clear PG_writeback after we clear PG_private, otherwise
weird and wonderful stuff will happen.
Also, teach try_to_release_page() about PG_swapcache pages.
Signed-off-by: Peter Zijlstra <[email protected]>
---
fs/buffer.c | 2 +-
fs/nfs/write.c | 5 ++---
2 files changed, 3 insertions(+), 4 deletions(-)
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -1567,7 +1567,7 @@ static void discard_buffer(struct buffer
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = page_mapping(page);
BUG_ON(!PageLocked(page));
if (PageWriteback(page))
Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -902,7 +902,6 @@ done:
static void nfs_writepage_release(struct nfs_page *req)
{
- end_page_writeback(req->wb_page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (!PageError(req->wb_page)) {
@@ -922,6 +921,7 @@ out:
#else
nfs_inode_remove_request(req);
#endif
+ end_page_writeback(req->wb_page);
nfs_clear_page_writeback(req);
}
@@ -1222,12 +1222,10 @@ static void nfs_writeback_done_full(stru
ClearPageUptodate(page);
SetPageError(page);
req->wb_context->error = task->tk_status;
- end_page_writeback(page);
nfs_inode_remove_request(req);
dprintk(", error = %d\n", task->tk_status);
goto next;
}
- end_page_writeback(page);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
@@ -1242,6 +1240,7 @@ static void nfs_writeback_done_full(stru
nfs_inode_remove_request(req);
#endif
next:
+ end_page_writeback(page);
nfs_clear_page_writeback(req);
}
}
On Fri, 2006-08-25 at 17:37 +0200, Peter Zijlstra wrote:
> Teach the NFS client how to treat PG_swapcache pages.
>
> Replace all occurrences of page->index and page->mapping in the NFS client
> with the new page_file_index() and page_file_mapping() functions.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
> ---
> fs/nfs/dir.c | 4 ++--
> fs/nfs/file.c | 6 +++---
> fs/nfs/pagelist.c | 8 ++++----
> fs/nfs/read.c | 10 +++++-----
> fs/nfs/write.c | 34 +++++++++++++++++-----------------
> 5 files changed, 31 insertions(+), 31 deletions(-)
<snip>
> @@ -821,7 +821,7 @@ int nfs_updatepage(struct file *file, st
> unsigned int offset, unsigned int count)
> {
> struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
> - struct inode *inode = page->mapping->host;
> + struct inode *inode = page_file_mapping(page)->host;
> struct nfs_page *req;
> int status = 0;
>
> @@ -854,12 +854,12 @@ int nfs_updatepage(struct file *file, st
> offset = 0;
> if (unlikely(end_offs < 0)) {
> /* Do nothing */
> - } else if (page->index == end_index) {
> + } else if (page_file_index(page) == end_index) {
Is this necessary? When will we ever call nfs_updatepage() with a swap
page? AFAICS, the swap stuff always uses page dirtying and (ugh)
writepage().
> unsigned int pglen;
> pglen = (unsigned int)(end_offs & (PAGE_CACHE_SIZE-1)) + 1;
> if (count < pglen)
> count = pglen;
> - } else if (page->index < end_index)
> + } else if (page_file_index(page) < end_index)
> count = PAGE_CACHE_SIZE;
> }
>
> Index: linux-2.6/fs/nfs/dir.c
> ===================================================================
> --- linux-2.6.orig/fs/nfs/dir.c
> +++ linux-2.6/fs/nfs/dir.c
> @@ -177,7 +177,7 @@ int nfs_readdir_filler(nfs_readdir_descr
>
> dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
> __FUNCTION__, (long long)desc->entry->cookie,
> - page->index);
> + page_file_index(page));
>
> again:
> timestamp = jiffies;
> @@ -201,7 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descr
> * Note: assumes we have exclusive access to this mapping either
> * through inode->i_mutex or some other mechanism.
> */
> - if (page->index == 0)
> + if (page_file_index(page) == 0)
> invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1);
> unlock_page(page);
> return 0;
Why are we worried about the possibility of NFS readdir pages being swap
pages?
Cheers,
Trond
On Fri, 2006-08-25 at 18:36 +0200, Peter Zijlstra wrote:
> Make sure we clear PG_writeback after we clear PG_private, otherwise
> weird and wonderful stuff will happen.
>
NACK.
Look carefully at the case of unstable writes: your patch does nothing
to guarantee that PG_writeback is cleared after PG_private for that
case.
Anyhow, you don't explain exactly what is wrong with clearing
PG_writeback before PG_private.
Cheers,
Trond
On Fri, 2006-08-25 at 16:03 -0400, Trond Myklebust wrote:
> On Fri, 2006-08-25 at 17:37 +0200, Peter Zijlstra wrote:
> > Teach the NFS client how to treat PG_swapcache pages.
> >
> > Replace all occurrences of page->index and page->mapping in the NFS client
> > with the new page_file_index() and page_file_mapping() functions.
> >
> > Signed-off-by: Peter Zijlstra <[email protected]>
> > ---
> > fs/nfs/dir.c | 4 ++--
> > fs/nfs/file.c | 6 +++---
> > fs/nfs/pagelist.c | 8 ++++----
> > fs/nfs/read.c | 10 +++++-----
> > fs/nfs/write.c | 34 +++++++++++++++++-----------------
> > 5 files changed, 31 insertions(+), 31 deletions(-)
>
> <snip>
>
> > @@ -821,7 +821,7 @@ int nfs_updatepage(struct file *file, st
> > unsigned int offset, unsigned int count)
> > {
> > struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
> > - struct inode *inode = page->mapping->host;
> > + struct inode *inode = page_file_mapping(page)->host;
> > struct nfs_page *req;
> > int status = 0;
> >
> > @@ -854,12 +854,12 @@ int nfs_updatepage(struct file *file, st
> > offset = 0;
> > if (unlikely(end_offs < 0)) {
> > /* Do nothing */
> > - } else if (page->index == end_index) {
> > + } else if (page_file_index(page) == end_index) {
>
> Is this necessary? When will we ever call nfs_updatepage() with a swap
> page? AFAICS, the swap stuff always uses page dirtying and (ugh)
> writepage().
Yes, swap uses writepage(). Nikita Danilov had a patch that did cluster
pageout using writepages(); however, that tended to deadlock even on
local disk.
> > unsigned int pglen;
> > pglen = (unsigned int)(end_offs & (PAGE_CACHE_SIZE-1)) + 1;
> > if (count < pglen)
> > count = pglen;
> > - } else if (page->index < end_index)
> > + } else if (page_file_index(page) < end_index)
> > count = PAGE_CACHE_SIZE;
> > }
> >
> > Index: linux-2.6/fs/nfs/dir.c
> > ===================================================================
> > --- linux-2.6.orig/fs/nfs/dir.c
> > +++ linux-2.6/fs/nfs/dir.c
> > @@ -177,7 +177,7 @@ int nfs_readdir_filler(nfs_readdir_descr
> >
> > dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
> > __FUNCTION__, (long long)desc->entry->cookie,
> > - page->index);
> > + page_file_index(page));
> >
> > again:
> > timestamp = jiffies;
> > @@ -201,7 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descr
> > * Note: assumes we have exclusive access to this mapping either
> > * through inode->i_mutex or some other mechanism.
> > */
> > - if (page->index == 0)
> > + if (page_file_index(page) == 0)
> > invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1);
> > unlock_page(page);
> > return 0;
>
> Why are we worried about the possibility of NFS readdir pages being swap
> pages?
Indiscriminate search and replace followed by a manual check for
correctness. They might not be needed, but they're not wrong either.
Would you prefer I take them out?
On Fri, 2006-08-25 at 22:20 +0200, Peter Zijlstra wrote:
> Indiscriminate search and replace followed by a manual check for
> correctness. They might not be needed, but they're not wrong either.
>
> Would you prefer I take them out?
It won't give us any massive performance optimisations, but it is nice
to be able to avoid that call to test_bit() whenever possible.
Cheers,
Trond
On Fri, 2006-08-25 at 16:11 -0400, Trond Myklebust wrote:
> On Fri, 2006-08-25 at 18:36 +0200, Peter Zijlstra wrote:
> > Make sure we clear PG_writeback after we clear PG_private, otherwise
> > weird and wonderful stuff will happen.
> >
> NACK.
>
> Look carefully at the case of unstable writes: your patch does nothing
> to guarantee that PG_writeback is cleared after PG_private for that
> case.
Ah, right. Thanks for pointing this out.
> Anyhow, you don't explain exactly what is wrong with clearing
> PG_writeback before PG_private.
Yes, this was a rather hasty patch; I was mortified to find that I had
missed a few changes and that my patch-set would crash the instant
someone tried it.
The VM doesn't really like PG_private set on PG_swapcache pages, I guess
I'll have to rectify that and leave the NFS behaviour as is.
Will correct this in the next round.
Thanks for the feedback,
Peter
On Fri, 2006-08-25 at 22:44 +0200, Peter Zijlstra wrote:
> The VM doesn't really like PG_private set on PG_swapcache pages, I guess
> I'll have to rectify that and leave the NFS behaviour as is.
You might want to consider disabling NFS data cache revalidation on swap
files since it doesn't really make sense to have other clients change
the file while you are using it.
If you do, you could also skip setting PG_private on swap pages, since
there ought to be no further races with invalidate_inode_pages2() to
deal with.
Cheers,
Trond
Hi!
> Now that NFS can handle swap cache pages, add a swapfile method to allow
> swapping over NFS.
>
> NOTE: this dummy method is obviously not enough to make it safe.
> A more complete version of the nfs_swapfile() function will be present
> in the next VM deadlock avoidance patches.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
We probably do not want to enable functionality before it is safe...
Also swsusp interactions will be interesting. (Rafael is working on
swapfile support these days).
Pavel
--
Thanks for all the (sleeping) penguins.
On Sat, 2006-08-26 at 14:36 +0000, Pavel Machek wrote:
> Hi!
>
> > Now that NFS can handle swap cache pages, add a swapfile method to allow
> > swapping over NFS.
> >
> > NOTE: this dummy method is obviously not enough to make it safe.
> > A more complete version of the nfs_swapfile() function will be present
> > in the next VM deadlock avoidance patches.
> >
> > Signed-off-by: Peter Zijlstra <[email protected]>
>
> We probably do not want to enable functionality before it is safe...
:-) Probably not, no, but some people might want to live on the edge.
> Also swsusp interactions will be interesting. (Rafael is working on
> swapfile support these days).
Yes, I've considered this, and this was one of the motivators to keep
the functionality under its own config option, so that it might be
mutually exclusive with swsusp to swapfile.
Pavel Machek wrote:
> Hi!
>
>> Now that NFS can handle swap cache pages, add a swapfile method to allow
>> swapping over NFS.
>>
>> NOTE: this dummy method is obviously not enough to make it safe.
>> A more complete version of the nfs_swapfile() function will be present
>> in the next VM deadlock avoidance patches.
>>
>> Signed-off-by: Peter Zijlstra <[email protected]>
>
> We probably do not want to enable functionality before it is safe...
OTOH, if we never enable this, what motivation do we have to
make it safe? :)
Scratching an itch works, so maybe we ought to create an itch?
--
What is important? What you want to be true, or what is true?
On Fri, Aug 25, 2006 at 05:37:40PM +0200, Peter Zijlstra wrote:
> Now that 'include/linux/mm.h' includes 'include/linux/swap.h', the global
> remove_mapping() definition clashes with the arch/um one.
>
> Rename the arch/um one.
If you tested the UML build -
Acked-by: Jeff Dike <[email protected]>