2008-02-20 15:34:36

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 22/28] mm: add support for non block device backed swap files

New addres_space_operations methods are added:
int swapfile(struct address_space *, int);
int swap_out(struct file *, struct page *, struct writeback_control *);
int swap_in(struct file *, struct page *);

When during sys_swapon() the swapfile() method is found and returns no error
the swapper_space.a_ops will proxy to sis->swap_file->f_mapping->a_ops, and
make use of swap_{out,in}() to write/read swapcache pages.

The swapfile method will be used to communicate to the address_space that the
VM relies on it, and the address_space should take adequate measures (like
reserving memory for mempools or the like).

This new interface can be used to obviate the need for ->bmap in the swapfile
code. A filesystem would need to load (and maybe even allocate) the full block
map for a file into memory and pin it there on ->swapfile(,1) so that
->swap_{out,in}() have instant access to it. It can be released on
->swapfile(,0).

The reason to provide ->swap_{out,in}() over using {write,read}page() is to
1) make a distinction between swapcache and pagecache pages, and
2) to provide a struct file * for credential context (normally not needed
in the context of writepage, as the page content is normally dirtied
using either of the following interfaces:
write_{begin,end}()
{prepare,commit}_write()
page_mkwrite()
which do have the file context.

Signed-off-by: Peter Zijlstra <[email protected]>
---
Documentation/filesystems/Locking | 19 +++++++++++++
Documentation/filesystems/vfs.txt | 17 ++++++++++++
include/linux/buffer_head.h | 2 -
include/linux/fs.h | 8 +++++
include/linux/swap.h | 4 ++
mm/page_io.c | 52 ++++++++++++++++++++++++++++++++++++++
mm/swap_state.c | 4 +-
mm/swapfile.c | 26 ++++++++++++++++++-
8 files changed, 128 insertions(+), 4 deletions(-)

Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h
+++ linux-2.6/include/linux/swap.h
@@ -120,6 +120,7 @@ enum {
SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
SWP_ACTIVE = (SWP_USED | SWP_WRITEOK),
+ SWP_FILE = (1 << 2), /* file swap area */
/* add others here before... */
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
};
@@ -217,6 +218,8 @@ extern void swap_unplug_io_fn(struct bac
/* linux/mm/page_io.c */
extern int swap_readpage(struct file *, struct page *);
extern int swap_writepage(struct page *page, struct writeback_control *wbc);
+extern void swap_sync_page(struct page *page);
+extern int swap_set_page_dirty(struct page *page);
extern void end_swap_bio_read(struct bio *bio, int err);

/* linux/mm/swap_state.c */
@@ -250,6 +253,7 @@ extern unsigned int count_swap_pages(int
extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
extern sector_t swapdev_block(int, pgoff_t);
extern struct swap_info_struct *get_swap_info_struct(unsigned);
+extern struct swap_info_struct *page_swap_info(struct page *);
extern int can_share_swap_page(struct page *);
extern int remove_exclusive_swap_page(struct page *);
struct backing_dev_info;
Index: linux-2.6/mm/page_io.c
===================================================================
--- linux-2.6.orig/mm/page_io.c
+++ linux-2.6/mm/page_io.c
@@ -17,6 +17,7 @@
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
+#include <linux/buffer_head.h>
#include <asm/pgtable.h>

static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
@@ -97,11 +98,21 @@ int swap_writepage(struct page *page, st
{
struct bio *bio;
int ret = 0, rw = WRITE;
+ struct swap_info_struct *sis = page_swap_info(page);

if (remove_exclusive_swap_page(page)) {
unlock_page(page);
goto out;
}
+
+ if (sis->flags & SWP_FILE) {
+ ret = sis->swap_file->f_mapping->
+ a_ops->swap_out(sis->swap_file, page, wbc);
+ if (!ret)
+ count_vm_event(PSWPOUT);
+ return ret;
+ }
+
bio = get_swap_bio(GFP_NOIO, page_private(page), page,
end_swap_bio_write);
if (bio == NULL) {
@@ -120,13 +131,54 @@ out:
return ret;
}

+void swap_sync_page(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ if (sis->flags & SWP_FILE) {
+ const struct address_space_operations *a_ops =
+ sis->swap_file->f_mapping->a_ops;
+ if (a_ops->sync_page)
+ a_ops->sync_page(page);
+ } else
+ block_sync_page(page);
+}
+
+int swap_set_page_dirty(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ if (sis->flags & SWP_FILE) {
+ const struct address_space_operations *a_ops =
+ sis->swap_file->f_mapping->a_ops;
+ int (*spd)(struct page *) = a_ops->set_page_dirty;
+#ifdef CONFIG_BLOCK
+ if (!spd)
+ spd = __set_page_dirty_buffers;
+#endif
+ return (*spd)(page);
+ }
+
+ return __set_page_dirty_nobuffers(page);
+}
+
int swap_readpage(struct file *file, struct page *page)
{
struct bio *bio;
int ret = 0;
+ struct swap_info_struct *sis = page_swap_info(page);

BUG_ON(!PageLocked(page));
BUG_ON(PageUptodate(page));
+
+ if (sis->flags & SWP_FILE) {
+ ret = sis->swap_file->f_mapping->
+ a_ops->swap_in(sis->swap_file, page);
+ if (!ret)
+ count_vm_event(PSWPIN);
+ return ret;
+ }
+
bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
end_swap_bio_read);
if (bio == NULL) {
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c
+++ linux-2.6/mm/swap_state.c
@@ -27,8 +27,8 @@
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .sync_page = block_sync_page,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .sync_page = swap_sync_page,
+ .set_page_dirty = swap_set_page_dirty,
.migratepage = migrate_page,
};

Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c
+++ linux-2.6/mm/swapfile.c
@@ -1012,6 +1012,12 @@ static void destroy_swap_extents(struct
list_del(&se->list);
kfree(se);
}
+
+ if (sis->flags & SWP_FILE) {
+ sis->flags &= ~SWP_FILE;
+ sis->swap_file->f_mapping->a_ops->
+ swapfile(sis->swap_file->f_mapping, 0);
+ }
}

/*
@@ -1104,6 +1110,17 @@ static int setup_swap_extents(struct swa
goto done;
}

+ if (sis->swap_file->f_mapping->a_ops->swapfile) {
+ ret = sis->swap_file->f_mapping->a_ops->
+ swapfile(sis->swap_file->f_mapping, 1);
+ if (!ret) {
+ sis->flags |= SWP_FILE;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ *span = sis->pages;
+ }
+ goto done;
+ }
+
blkbits = inode->i_blkbits;
blocks_per_page = PAGE_SIZE >> blkbits;

@@ -1668,7 +1685,7 @@ asmlinkage long sys_swapon(const char __

mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
- p->flags = SWP_ACTIVE;
+ p->flags |= SWP_WRITEOK;
nr_swap_pages += nr_good_pages;
total_swap_pages += nr_good_pages;

@@ -1793,6 +1810,13 @@ get_swap_info_struct(unsigned type)
return &swap_info[type];
}

+struct swap_info_struct *page_swap_info(struct page *page)
+{
+ swp_entry_t swap = { .val = page_private(page) };
+ BUG_ON(!PageSwapCache(page));
+ return &swap_info[swp_type(swap)];
+}
+
/*
* swap_lock prevents swap_map being freed. Don't grab an extra
* reference on the swaphandle, it doesn't matter if it becomes unused.
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -481,6 +481,14 @@ struct address_space_operations {
int (*migratepage) (struct address_space *,
struct page *, struct page *);
int (*launder_page) (struct page *);
+
+ /*
+ * swapfile support
+ */
+ int (*swapfile)(struct address_space *, int);
+ int (*swap_out)(struct file *file, struct page *page,
+ struct writeback_control *wbc);
+ int (*swap_in)(struct file *file, struct page *page);
};

/*
Index: linux-2.6/Documentation/filesystems/Locking
===================================================================
--- linux-2.6.orig/Documentation/filesystems/Locking
+++ linux-2.6/Documentation/filesystems/Locking
@@ -171,6 +171,9 @@ prototypes:
int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*launder_page) (struct page *);
+ int (*swapfile) (struct address_space *, int);
+ int (*swap_out) (struct file *, struct page *, struct writeback_control *);
+ int (*swap_in) (struct file *, struct page *);

locking rules:
All except set_page_dirty may block
@@ -192,6 +195,9 @@ invalidatepage: no yes
releasepage: no yes
direct_IO: no
launder_page: no yes
+swapfile no
+swap_out no yes, unlocks
+swap_in no yes, unlocks

->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
may be called from the request handler (/dev/loop).
@@ -291,6 +297,19 @@ cleaned, or an error value if not. Note
getting mapped back in and redirtied, it needs to be kept locked
across the entire operation.

+ ->swapfile() will be called with a non zero argument on address spaces
+backing non block device backed swapfiles. A return value of zero indicates
+success. In which case this address space can be used for backing swapspace.
+The swapspace operations will be proxied to the address space operations.
+Swapoff will call this method with a zero argument to release the address
+space.
+
+ ->swap_out() when swapfile() returned success, this method is used to
+write the swap page.
+
+ ->swap_in() when swapfile() returned success, this method is used to
+read the swap page.
+
Note: currently almost all instances of address_space methods are
using BKL for internal serialization and that's one of the worst sources
of contention. Normally they are calling library functions (in fs/buffer.c)
Index: linux-2.6/include/linux/buffer_head.h
===================================================================
--- linux-2.6.orig/include/linux/buffer_head.h
+++ linux-2.6/include/linux/buffer_head.h
@@ -339,7 +339,7 @@ static inline void invalidate_inode_buff
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
static inline void invalidate_bdev(struct block_device *bdev) {}
-
+static inline void block_sync_page(struct page *) { }

#endif /* CONFIG_BLOCK */
#endif /* _LINUX_BUFFER_HEAD_H */
Index: linux-2.6/Documentation/filesystems/vfs.txt
===================================================================
--- linux-2.6.orig/Documentation/filesystems/vfs.txt
+++ linux-2.6/Documentation/filesystems/vfs.txt
@@ -543,6 +543,10 @@ struct address_space_operations {
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct page *, struct page *);
int (*launder_page) (struct page *);
+ int (*swapfile)(struct address_space *, int);
+ int (*swap_out)(struct file *file, struct page *page,
+ struct writeback_control *wbc);
+ int (*swap_in)(struct file *file, struct page *page);
};

writepage: called by the VM to write a dirty page to backing store.
@@ -728,6 +732,19 @@ struct address_space_operations {
prevent redirtying the page, it is kept locked during the whole
operation.

+ swapfile: Called with a non-zero argument when swapon is used on a file. A
+ return value of zero indicates success. In which case this
+ address_space can be used to back swapspace. The swapspace operations
+ will be proxied to this address space's ->swap_{out,in} methods.
+ Swapoff will call this method with a zero argument to release the
+ address space.
+
+ swap_out: Called to write a swapcache page to a backing store, similar to
+ writepage.
+
+ swap_in: Called to read a swapcache page from a backing store, similar to
+ readpage.
+
The File Object
===============


--


2008-02-20 16:41:40

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH 22/28] mm: add support for non block device backed swap files

On Wed, 20 Feb 2008 15:46:32 +0100 Peter Zijlstra wrote:

> Signed-off-by: Peter Zijlstra <[email protected]>
> ---
> Documentation/filesystems/Locking | 19 +++++++++++++
> Documentation/filesystems/vfs.txt | 17 ++++++++++++
> include/linux/buffer_head.h | 2 -
> include/linux/fs.h | 8 +++++
> include/linux/swap.h | 4 ++
> mm/page_io.c | 52 ++++++++++++++++++++++++++++++++++++++
> mm/swap_state.c | 4 +-
> mm/swapfile.c | 26 ++++++++++++++++++-
> 8 files changed, 128 insertions(+), 4 deletions(-)

> Index: linux-2.6/Documentation/filesystems/Locking
> ===================================================================
> --- linux-2.6.orig/Documentation/filesystems/Locking
> +++ linux-2.6/Documentation/filesystems/Locking

> @@ -291,6 +297,19 @@ cleaned, or an error value if not. Note
> getting mapped back in and redirtied, it needs to be kept locked
> across the entire operation.
>
> + ->swapfile() will be called with a non zero argument on address spaces

non-zero

> +backing non block device backed swapfiles. A return value of zero indicates
> +success. In which case this address space can be used for backing swapspace.

success, in which case

> +The swapspace operations will be proxied to the address space operations.
> +Swapoff will call this method with a zero argument to release the address
> +space.
> +
> + ->swap_out() when swapfile() returned success, this method is used to
> +write the swap page.
> +
> + ->swap_in() when swapfile() returned success, this method is used to
> +read the swap page.
> +
> Note: currently almost all instances of address_space methods are
> using BKL for internal serialization and that's one of the worst sources
> of contention. Normally they are calling library functions (in fs/buffer.c)

> Index: linux-2.6/Documentation/filesystems/vfs.txt
> ===================================================================
> --- linux-2.6.orig/Documentation/filesystems/vfs.txt
> +++ linux-2.6/Documentation/filesystems/vfs.txt
> @@ -728,6 +732,19 @@ struct address_space_operations {
> prevent redirtying the page, it is kept locked during the whole
> operation.
>
> + swapfile: Called with a non-zero argument when swapon is used on a file. A
> + return value of zero indicates success. In which case this

success, in which case this

> + address_space can be used to back swapspace. The swapspace operations
> + will be proxied to this address space's ->swap_{out,in} methods.
> + Swapoff will call this method with a zero argument to release the
> + address space.
> +
> + swap_out: Called to write a swapcache page to a backing store, similar to
> + writepage.
> +
> + swap_in: Called to read a swapcache page from a backing store, similar to
> + readpage.
> +
> The File Object
> ===============

---
~Randy

2008-02-20 16:47:28

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 22/28] mm: add support for non block device backed swap files


On Wed, 2008-02-20 at 08:30 -0800, Randy Dunlap wrote:
> On Wed, 20 Feb 2008 15:46:32 +0100 Peter Zijlstra wrote:

< grammar mistakes >

Thanks Randy!

2008-02-26 12:45:58

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 22/28] mm: add support for non block device backed swap files

Starting review in the middle, because this is the part I'm most
familiar with.

> New addres_space_operations methods are added:
> int swapfile(struct address_space *, int);

Separate ->swapon() and ->swapoff() methods would be so much cleaner IMO.

Also is there a reason why 'struct file *' cannot be supplied to these
functions?

[snip]

> +int swap_set_page_dirty(struct page *page)
> +{
> + struct swap_info_struct *sis = page_swap_info(page);
> +
> + if (sis->flags & SWP_FILE) {
> + const struct address_space_operations *a_ops =
> + sis->swap_file->f_mapping->a_ops;
> + int (*spd)(struct page *) = a_ops->set_page_dirty;
> +#ifdef CONFIG_BLOCK
> + if (!spd)
> + spd = __set_page_dirty_buffers;
> +#endif

This ifdef is not really needed. Just require ->set_page_dirty() be
filled in by filesystems which want swapfiles (and others too, in the
longer term, the fallback is just historical crud).

Here's an incremental patch addressing these issues and beautifying
the new code.

Signed-off-by: Miklos Szeredi <[email protected]>

Index: linux/mm/page_io.c
===================================================================
--- linux.orig/mm/page_io.c 2008-02-26 11:15:58.000000000 +0100
+++ linux/mm/page_io.c 2008-02-26 13:40:55.000000000 +0100
@@ -106,8 +106,10 @@ int swap_writepage(struct page *page, st
}

if (sis->flags & SWP_FILE) {
- ret = sis->swap_file->f_mapping->
- a_ops->swap_out(sis->swap_file, page, wbc);
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ ret = mapping->a_ops->swap_out(swap_file, page, wbc);
if (!ret)
count_vm_event(PSWPOUT);
return ret;
@@ -136,12 +138,13 @@ void swap_sync_page(struct page *page)
struct swap_info_struct *sis = page_swap_info(page);

if (sis->flags & SWP_FILE) {
- const struct address_space_operations *a_ops =
- sis->swap_file->f_mapping->a_ops;
- if (a_ops->sync_page)
- a_ops->sync_page(page);
- } else
+ struct address_space *mapping = sis->swap_file->f_mapping;
+
+ if (mapping->a_ops->sync_page)
+ mapping->a_ops->sync_page(page);
+ } else {
block_sync_page(page);
+ }
}

int swap_set_page_dirty(struct page *page)
@@ -149,17 +152,12 @@ int swap_set_page_dirty(struct page *pag
struct swap_info_struct *sis = page_swap_info(page);

if (sis->flags & SWP_FILE) {
- const struct address_space_operations *a_ops =
- sis->swap_file->f_mapping->a_ops;
- int (*spd)(struct page *) = a_ops->set_page_dirty;
-#ifdef CONFIG_BLOCK
- if (!spd)
- spd = __set_page_dirty_buffers;
-#endif
- return (*spd)(page);
- }
+ struct address_space *mapping = sis->swap_file->f_mapping;

- return __set_page_dirty_nobuffers(page);
+ return mapping->a_ops->set_page_dirty(page);
+ } else {
+ return __set_page_dirty_nobuffers(page);
+ }
}

int swap_readpage(struct file *file, struct page *page)
@@ -172,8 +170,10 @@ int swap_readpage(struct file *file, str
BUG_ON(PageUptodate(page));

if (sis->flags & SWP_FILE) {
- ret = sis->swap_file->f_mapping->
- a_ops->swap_in(sis->swap_file, page);
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ ret = mapping->a_ops->swap_in(swap_file, page);
if (!ret)
count_vm_event(PSWPIN);
return ret;
Index: linux/include/linux/fs.h
===================================================================
--- linux.orig/include/linux/fs.h 2008-02-26 11:15:58.000000000 +0100
+++ linux/include/linux/fs.h 2008-02-26 13:29:40.000000000 +0100
@@ -485,7 +485,8 @@ struct address_space_operations {
/*
* swapfile support
*/
- int (*swapfile)(struct address_space *, int);
+ int (*swapon)(struct file *file);
+ int (*swapoff)(struct file *file);
int (*swap_out)(struct file *file, struct page *page,
struct writeback_control *wbc);
int (*swap_in)(struct file *file, struct page *page);
Index: linux/mm/swapfile.c
===================================================================
--- linux.orig/mm/swapfile.c 2008-02-26 12:43:57.000000000 +0100
+++ linux/mm/swapfile.c 2008-02-26 13:34:57.000000000 +0100
@@ -1014,9 +1014,11 @@ static void destroy_swap_extents(struct
}

if (sis->flags & SWP_FILE) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
sis->flags &= ~SWP_FILE;
- sis->swap_file->f_mapping->a_ops->
- swapfile(sis->swap_file->f_mapping, 0);
+ mapping->a_ops->swapoff(swap_file);
}
}

@@ -1092,7 +1094,9 @@ add_swap_extent(struct swap_info_struct
*/
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
- struct inode *inode;
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
unsigned blocks_per_page;
unsigned long page_no;
unsigned blkbits;
@@ -1103,16 +1107,14 @@ static int setup_swap_extents(struct swa
int nr_extents = 0;
int ret;

- inode = sis->swap_file->f_mapping->host;
if (S_ISBLK(inode->i_mode)) {
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
goto done;
}

- if (sis->swap_file->f_mapping->a_ops->swapfile) {
- ret = sis->swap_file->f_mapping->a_ops->
- swapfile(sis->swap_file->f_mapping, 1);
+ if (mapping->a_ops->swapon) {
+ ret = mapping->a_ops->swapon(swap_file);
if (!ret) {
sis->flags |= SWP_FILE;
ret = add_swap_extent(sis, 0, sis->max, 0);


2008-02-26 12:58:34

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 22/28] mm: add support for non block device backed swap files


On Tue, 2008-02-26 at 13:45 +0100, Miklos Szeredi wrote:
> Starting review in the middle, because this is the part I'm most
> familiar with.
>
> > New addres_space_operations methods are added:
> > int swapfile(struct address_space *, int);
>
> Separate ->swapon() and ->swapoff() methods would be so much cleaner IMO.

I'm ok with that, but its a_ops bloat, do we care about that? I guess
since it has limited instances - typically one per filesystem - there is
no issue here.

> Also is there a reason why 'struct file *' cannot be supplied to these
> functions?

No real reason here. I guess its cleaner indeed. Thanks.

> > +int swap_set_page_dirty(struct page *page)
> > +{
> > + struct swap_info_struct *sis = page_swap_info(page);
> > +
> > + if (sis->flags & SWP_FILE) {
> > + const struct address_space_operations *a_ops =
> > + sis->swap_file->f_mapping->a_ops;
> > + int (*spd)(struct page *) = a_ops->set_page_dirty;
> > +#ifdef CONFIG_BLOCK
> > + if (!spd)
> > + spd = __set_page_dirty_buffers;
> > +#endif
>
> This ifdef is not really needed. Just require ->set_page_dirty() be
> filled in by filesystems which want swapfiles (and others too, in the
> longer term, the fallback is just historical crud).

Agreed. This is a good motivation to clean up that stuff.

> Here's an incremental patch addressing these issues and beautifying
> the new code.

Thanks, I'll fold it into the patch and update the documentation. I'll
put your creds in akpm style.