Date: 2009-10-15 00:44:50
From: Hugh Dickins
Subject: [PATCH 0/9] swap_info and swap_map patches

Here's a series of nine patches around the swap_info_struct: against
2.6.32-rc4, but intended for mmotm, which is currently similar here.

They start out with some old and not very important cleanups, but get
around to solving the swap count overflow problem: our handling of counts
above 32765 has depended on hoping that an overflow won't coincide with
other races.

That problem exists in theory today (when pid_max is raised from its
default), though never reported in practice; but the motivation for
solving it now comes from the impending KSM swapping patches - it
becomes very easy for anyone to overflow the maximum that way.

But most people will never have a swap count overflow in their life:
the benefit for them is that the vmalloc'ed swap_map halves in size.
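(With 4K pages, for example, a 4GB swap area has a million page slots:
its swap_map shrinks from 2MB to 1MB of vmalloc'ed memory.)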

This is all internal housekeeping: no change to actual swapping and
page reclaim.

 include/linux/swap.h |  66 ++-
 mm/memory.c          |  19
 mm/page_io.c         |  19
 mm/rmap.c            |   6
 mm/shmem.c           |  11
 mm/swapfile.c        | 834 +++++++++++++++++++++++++----------------
 6 files changed, 599 insertions(+), 356 deletions(-)

Hugh


Date: 2009-10-15 00:46:57
From: Hugh Dickins
Subject: [PATCH 1/9] swap_info: private to swapfile.c

The swap_info_struct is mostly private to mm/swapfile.c, with only
one other in-tree user: get_swap_bio(). Adjust its interface to
map_swap_page(), so that we can then remove get_swap_info_struct().

But there is a popular user out-of-tree, TuxOnIce: so leave the
declaration of swap_info_struct in linux/swap.h.
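
Condensed from the get_swap_bio() hunk below, the shape of the change
at that one call site:

	/* before: caller looks up the swap_info_struct itself */
	sis = get_swap_info_struct(swp_type(entry));
	bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
				(PAGE_SIZE >> 9);
	bio->bi_bdev = sis->bdev;

	/* after: pass the swp_entry_t, get the bdev back by reference */
	bio->bi_sector = map_swap_page(entry, &bio->bi_bdev);
	bio->bi_sector <<= PAGE_SHIFT - 9;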

Signed-off-by: Hugh Dickins <[email protected]>
---

 include/linux/swap.h |  3 +--
 mm/page_io.c         | 19 +++++++------------
 mm/swapfile.c        | 29 +++++++++++++++++------------
 3 files changed, 25 insertions(+), 26 deletions(-)

--- 2.6.32-rc4/include/linux/swap.h 2009-09-28 00:28:39.000000000 +0100
+++ si1/include/linux/swap.h 2009-10-14 21:25:58.000000000 +0100
@@ -317,9 +317,8 @@ extern void swapcache_free(swp_entry_t,
extern int free_swap_and_cache(swp_entry_t);
extern int swap_type_of(dev_t, sector_t, struct block_device **);
extern unsigned int count_swap_pages(int, int);
-extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
+extern sector_t map_swap_page(swp_entry_t, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
-extern struct swap_info_struct *get_swap_info_struct(unsigned);
extern int reuse_swap_page(struct page *);
extern int try_to_free_swap(struct page *);
struct backing_dev_info;
--- 2.6.32-rc4/mm/page_io.c 2009-09-09 23:13:59.000000000 +0100
+++ si1/mm/page_io.c 2009-10-14 21:25:58.000000000 +0100
@@ -19,20 +19,17 @@
#include <linux/writeback.h>
#include <asm/pgtable.h>

-static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
+static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
{
struct bio *bio;

bio = bio_alloc(gfp_flags, 1);
if (bio) {
- struct swap_info_struct *sis;
- swp_entry_t entry = { .val = index, };
-
- sis = get_swap_info_struct(swp_type(entry));
- bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
- (PAGE_SIZE >> 9);
- bio->bi_bdev = sis->bdev;
+ swp_entry_t entry;
+ entry.val = page_private(page);
+ bio->bi_sector = map_swap_page(entry, &bio->bi_bdev);
+ bio->bi_sector <<= PAGE_SHIFT - 9;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
@@ -102,8 +99,7 @@ int swap_writepage(struct page *page, st
unlock_page(page);
goto out;
}
- bio = get_swap_bio(GFP_NOIO, page_private(page), page,
- end_swap_bio_write);
+ bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
@@ -127,8 +123,7 @@ int swap_readpage(struct page *page)

VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageUptodate(page));
- bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
- end_swap_bio_read);
+ bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
ret = -ENOMEM;
--- 2.6.32-rc4/mm/swapfile.c 2009-10-05 04:33:14.000000000 +0100
+++ si1/mm/swapfile.c 2009-10-14 21:25:58.000000000 +0100
@@ -1284,12 +1284,22 @@ static void drain_mmlist(void)

/*
* Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset `offset'.
+ * corresponds to page offset `offset'. Note that the type of this function
+ * is sector_t, but it returns page offset into the bdev, not sector offset.
*/
-sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
+sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev)
{
- struct swap_extent *se = sis->curr_swap_extent;
- struct swap_extent *start_se = se;
+ struct swap_info_struct *sis;
+ struct swap_extent *start_se;
+ struct swap_extent *se;
+ pgoff_t offset;
+
+ sis = swap_info + swp_type(entry);
+ *bdev = sis->bdev;
+
+ offset = swp_offset(entry);
+ start_se = sis->curr_swap_extent;
+ se = start_se;

for ( ; ; ) {
struct list_head *lh;
@@ -1315,12 +1325,14 @@ sector_t map_swap_page(struct swap_info_
sector_t swapdev_block(int swap_type, pgoff_t offset)
{
struct swap_info_struct *sis;
+ struct block_device *bdev;

if (swap_type >= nr_swapfiles)
return 0;

sis = swap_info + swap_type;
- return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
+ return (sis->flags & SWP_WRITEOK) ?
+ map_swap_page(swp_entry(swap_type, offset), &bdev) : 0;
}
#endif /* CONFIG_HIBERNATION */

@@ -2160,13 +2172,6 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_CACHE);
}

-
-struct swap_info_struct *
-get_swap_info_struct(unsigned type)
-{
- return &swap_info[type];
-}
-
/*
* swap_lock prevents swap_map being freed. Don't grab an extra
* reference on the swaphandle, it doesn't matter if it becomes unused.

Date: 2009-10-15 00:48:41
From: Hugh Dickins
Subject: [PATCH 2/9] swap_info: change to array of pointers

The swap_info_struct is only 76 or 104 bytes, but it does seem wrong
to reserve an array of about 30 of them in bss, when most people will
want only one. Change swap_info[] to an array of pointers.

That does need a "type" field in the structure: pack it as a char with
next type and short prio (aha, char is unsigned by default on PowerPC).
Use the (admittedly peculiar) name "type" throughout for this index.

/proc/swaps does not take swap_lock: I wouldn't want it to, but do take
care with barriers when adding a new item to the array (never removed).
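
The barrier pairing, condensed from the swapon() and swap_start()/
swap_next() hunks below:

	/* swapon(), under swap_lock: publish the entry before the count */
	swap_info[type] = p;
	smp_wmb();		/* paired with smp_rmb() in the readers */
	nr_swapfiles++;

	/* /proc/swaps iterators, which take no swap_lock */
	for (type = 0; type < nr_swapfiles; type++) {
		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
		si = swap_info[type];
		...
	}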

Signed-off-by: Hugh Dickins <[email protected]>
---

 include/linux/swap.h |   7 -
 mm/swapfile.c        | 204 ++++++++++++++++++++++-------------------
 2 files changed, 117 insertions(+), 94 deletions(-)

--- si1/include/linux/swap.h 2009-10-14 21:25:58.000000000 +0100
+++ si2/include/linux/swap.h 2009-10-14 21:26:09.000000000 +0100
@@ -159,9 +159,10 @@ enum {
* The in-memory structure used to track swap areas.
*/
struct swap_info_struct {
- unsigned long flags;
- int prio; /* swap priority */
- int next; /* next entry on swap list */
+ unsigned long flags; /* SWP_USED etc: see above */
+ signed short prio; /* swap priority of this type */
+ signed char type; /* strange name for an index */
+ signed char next; /* next type on the swap list */
struct file *swap_file;
struct block_device *bdev;
struct list_head extent_list;
--- si1/mm/swapfile.c 2009-10-14 21:25:58.000000000 +0100
+++ si2/mm/swapfile.c 2009-10-14 21:26:09.000000000 +0100
@@ -49,7 +49,7 @@ static const char Unused_offset[] = "Unu

static struct swap_list_t swap_list = {-1, -1};

-static struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

@@ -79,12 +79,11 @@ static inline unsigned short encode_swap
return ret;
}

-/* returnes 1 if swap entry is freed */
+/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
- int type = si - swap_info;
- swp_entry_t entry = swp_entry(type, offset);
+ swp_entry_t entry = swp_entry(si->type, offset);
struct page *page;
int ret = 0;

@@ -120,7 +119,7 @@ void swap_unplug_io_fn(struct backing_de
down_read(&swap_unplug_sem);
entry.val = page_private(page);
if (PageSwapCache(page)) {
- struct block_device *bdev = swap_info[swp_type(entry)].bdev;
+ struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
struct backing_dev_info *bdi;

/*
@@ -467,10 +466,10 @@ swp_entry_t get_swap_page(void)
nr_swap_pages--;

for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
- si = swap_info + type;
+ si = swap_info[type];
next = si->next;
if (next < 0 ||
- (!wrapped && si->prio != swap_info[next].prio)) {
+ (!wrapped && si->prio != swap_info[next]->prio)) {
next = swap_list.head;
wrapped++;
}
@@ -503,8 +502,8 @@ swp_entry_t get_swap_page_of_type(int ty
pgoff_t offset;

spin_lock(&swap_lock);
- si = swap_info + type;
- if (si->flags & SWP_WRITEOK) {
+ si = swap_info[type];
+ if (si && (si->flags & SWP_WRITEOK)) {
nr_swap_pages--;
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, SWAP_MAP);
@@ -528,7 +527,7 @@ static struct swap_info_struct * swap_in
type = swp_type(entry);
if (type >= nr_swapfiles)
goto bad_nofile;
- p = & swap_info[type];
+ p = swap_info[type];
if (!(p->flags & SWP_USED))
goto bad_device;
offset = swp_offset(entry);
@@ -581,8 +580,9 @@ static int swap_entry_free(struct swap_i
p->lowest_bit = offset;
if (offset > p->highest_bit)
p->highest_bit = offset;
- if (p->prio > swap_info[swap_list.next].prio)
- swap_list.next = p - swap_info;
+ if (swap_list.next >= 0 &&
+ p->prio > swap_info[swap_list.next]->prio)
+ swap_list.next = p->type;
nr_swap_pages++;
p->inuse_pages--;
}
@@ -741,14 +741,14 @@ int free_swap_and_cache(swp_entry_t entr
int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
struct block_device *bdev = NULL;
- int i;
+ int type;

if (device)
bdev = bdget(device);

spin_lock(&swap_lock);
- for (i = 0; i < nr_swapfiles; i++) {
- struct swap_info_struct *sis = swap_info + i;
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *sis = swap_info[type];

if (!(sis->flags & SWP_WRITEOK))
continue;
@@ -758,7 +758,7 @@ int swap_type_of(dev_t device, sector_t
*bdev_p = bdgrab(sis->bdev);

spin_unlock(&swap_lock);
- return i;
+ return type;
}
if (bdev == sis->bdev) {
struct swap_extent *se;
@@ -771,7 +771,7 @@ int swap_type_of(dev_t device, sector_t

spin_unlock(&swap_lock);
bdput(bdev);
- return i;
+ return type;
}
}
}
@@ -792,15 +792,17 @@ unsigned int count_swap_pages(int type,
{
unsigned int n = 0;

- if (type < nr_swapfiles) {
- spin_lock(&swap_lock);
- if (swap_info[type].flags & SWP_WRITEOK) {
- n = swap_info[type].pages;
+ spin_lock(&swap_lock);
+ if ((unsigned int)type < nr_swapfiles) {
+ struct swap_info_struct *sis = swap_info[type];
+
+ if (sis->flags & SWP_WRITEOK) {
+ n = sis->pages;
if (free)
- n -= swap_info[type].inuse_pages;
+ n -= sis->inuse_pages;
}
- spin_unlock(&swap_lock);
}
+ spin_unlock(&swap_lock);
return n;
}
#endif
@@ -1024,7 +1026,7 @@ static unsigned int find_next_to_unuse(s
*/
static int try_to_unuse(unsigned int type)
{
- struct swap_info_struct * si = &swap_info[type];
+ struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
unsigned short *swap_map;
unsigned short swcount;
@@ -1271,10 +1273,10 @@ retry:
static void drain_mmlist(void)
{
struct list_head *p, *next;
- unsigned int i;
+ unsigned int type;

- for (i = 0; i < nr_swapfiles; i++)
- if (swap_info[i].inuse_pages)
+ for (type = 0; type < nr_swapfiles; type++)
+ if (swap_info[type]->inuse_pages)
return;
spin_lock(&mmlist_lock);
list_for_each_safe(p, next, &init_mm.mmlist)
@@ -1294,7 +1296,7 @@ sector_t map_swap_page(swp_entry_t entry
struct swap_extent *se;
pgoff_t offset;

- sis = swap_info + swp_type(entry);
+ sis = swap_info[swp_type(entry)];
*bdev = sis->bdev;

offset = swp_offset(entry);
@@ -1322,17 +1324,15 @@ sector_t map_swap_page(swp_entry_t entry
* Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
* corresponding to given index in swap_info (swap type).
*/
-sector_t swapdev_block(int swap_type, pgoff_t offset)
+sector_t swapdev_block(int type, pgoff_t offset)
{
- struct swap_info_struct *sis;
struct block_device *bdev;

- if (swap_type >= nr_swapfiles)
+ if ((unsigned int)type >= nr_swapfiles)
return 0;
-
- sis = swap_info + swap_type;
- return (sis->flags & SWP_WRITEOK) ?
- map_swap_page(swp_entry(swap_type, offset), &bdev) : 0;
+ if (!(swap_info[type]->flags & SWP_WRITEOK))
+ return 0;
+ return map_swap_page(swp_entry(type, offset), &bdev);
}
#endif /* CONFIG_HIBERNATION */

@@ -1548,8 +1548,8 @@ SYSCALL_DEFINE1(swapoff, const char __us
mapping = victim->f_mapping;
prev = -1;
spin_lock(&swap_lock);
- for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
- p = swap_info + type;
+ for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+ p = swap_info[type];
if (p->flags & SWP_WRITEOK) {
if (p->swap_file->f_mapping == mapping)
break;
@@ -1568,18 +1568,17 @@ SYSCALL_DEFINE1(swapoff, const char __us
spin_unlock(&swap_lock);
goto out_dput;
}
- if (prev < 0) {
+ if (prev < 0)
swap_list.head = p->next;
- } else {
- swap_info[prev].next = p->next;
- }
+ else
+ swap_info[prev]->next = p->next;
if (type == swap_list.next) {
/* just pick something that's safe... */
swap_list.next = swap_list.head;
}
if (p->prio < 0) {
- for (i = p->next; i >= 0; i = swap_info[i].next)
- swap_info[i].prio = p->prio--;
+ for (i = p->next; i >= 0; i = swap_info[i]->next)
+ swap_info[i]->prio = p->prio--;
least_priority++;
}
nr_swap_pages -= p->pages;
@@ -1597,16 +1596,16 @@ SYSCALL_DEFINE1(swapoff, const char __us
if (p->prio < 0)
p->prio = --least_priority;
prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
- if (p->prio >= swap_info[i].prio)
+ for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+ if (p->prio >= swap_info[i]->prio)
break;
prev = i;
}
p->next = i;
if (prev < 0)
- swap_list.head = swap_list.next = p - swap_info;
+ swap_list.head = swap_list.next = type;
else
- swap_info[prev].next = p - swap_info;
+ swap_info[prev]->next = type;
nr_swap_pages += p->pages;
total_swap_pages += p->pages;
p->flags |= SWP_WRITEOK;
@@ -1666,8 +1665,8 @@ out:
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
- struct swap_info_struct *ptr = swap_info;
- int i;
+ struct swap_info_struct *si;
+ int type;
loff_t l = *pos;

mutex_lock(&swapon_mutex);
@@ -1675,11 +1674,13 @@ static void *swap_start(struct seq_file
if (!l)
return SEQ_START_TOKEN;

- for (i = 0; i < nr_swapfiles; i++, ptr++) {
- if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+ for (type = 0; type < nr_swapfiles; type++) {
+ smp_rmb(); /* read nr_swapfiles before swap_info[type] */
+ si = swap_info[type];
+ if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
if (!--l)
- return ptr;
+ return si;
}

return NULL;
@@ -1687,21 +1688,21 @@ static void *swap_start(struct seq_file

static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
- struct swap_info_struct *ptr;
- struct swap_info_struct *endptr = swap_info + nr_swapfiles;
+ struct swap_info_struct *si = v;
+ int type;

if (v == SEQ_START_TOKEN)
- ptr = swap_info;
- else {
- ptr = v;
- ptr++;
- }
+ type = 0;
+ else
+ type = si->type + 1;

- for (; ptr < endptr; ptr++) {
- if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+ for (; type < nr_swapfiles; type++) {
+ smp_rmb(); /* read nr_swapfiles before swap_info[type] */
+ si = swap_info[type];
+ if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
++*pos;
- return ptr;
+ return si;
}

return NULL;
@@ -1714,24 +1715,24 @@ static void swap_stop(struct seq_file *s

static int swap_show(struct seq_file *swap, void *v)
{
- struct swap_info_struct *ptr = v;
+ struct swap_info_struct *si = v;
struct file *file;
int len;

- if (ptr == SEQ_START_TOKEN) {
+ if (si == SEQ_START_TOKEN) {
seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
return 0;
}

- file = ptr->swap_file;
+ file = si->swap_file;
len = seq_path(swap, &file->f_path, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
"partition" : "file\t",
- ptr->pages << (PAGE_SHIFT - 10),
- ptr->inuse_pages << (PAGE_SHIFT - 10),
- ptr->prio);
+ si->pages << (PAGE_SHIFT - 10),
+ si->inuse_pages << (PAGE_SHIFT - 10),
+ si->prio);
return 0;
}

@@ -1799,23 +1800,45 @@ SYSCALL_DEFINE2(swapon, const char __use

if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
spin_lock(&swap_lock);
- p = swap_info;
- for (type = 0 ; type < nr_swapfiles ; type++,p++)
- if (!(p->flags & SWP_USED))
+ for (type = 0; type < nr_swapfiles; type++) {
+ if (!(swap_info[type]->flags & SWP_USED))
break;
+ }
error = -EPERM;
if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
+ kfree(p);
goto out;
}
- if (type >= nr_swapfiles)
- nr_swapfiles = type+1;
- memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->extent_list);
+ if (type >= nr_swapfiles) {
+ p->type = type;
+ swap_info[type] = p;
+ /*
+ * Write swap_info[type] before nr_swapfiles, in case a
+ * racing procfs swap_start() or swap_next() is reading them.
+ * (We never shrink nr_swapfiles, we never free this entry.)
+ */
+ smp_wmb();
+ nr_swapfiles++;
+ } else {
+ kfree(p);
+ p = swap_info[type];
+ /*
+ * Do not memset this entry: a racing procfs swap_next()
+ * would be relying on p->type to remain valid.
+ */
+ }
p->flags = SWP_USED;
p->next = -1;
spin_unlock(&swap_lock);
+
name = getname(specialfile);
error = PTR_ERR(name);
if (IS_ERR(name)) {
@@ -1835,7 +1858,7 @@ SYSCALL_DEFINE2(swapon, const char __use

error = -EBUSY;
for (i = 0; i < nr_swapfiles; i++) {
- struct swap_info_struct *q = &swap_info[i];
+ struct swap_info_struct *q = swap_info[i];

if (i == type || !q->swap_file)
continue;
@@ -1910,6 +1933,7 @@ SYSCALL_DEFINE2(swapon, const char __use

p->lowest_bit = 1;
p->cluster_next = 1;
+ p->cluster_nr = 0;

/*
* Find out how many pages are allowed for a single swap
@@ -2016,18 +2040,16 @@ SYSCALL_DEFINE2(swapon, const char __use

/* insert swap space into swap_list: */
prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
- if (p->prio >= swap_info[i].prio) {
+ for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+ if (p->prio >= swap_info[i]->prio)
break;
- }
prev = i;
}
p->next = i;
- if (prev < 0) {
- swap_list.head = swap_list.next = p - swap_info;
- } else {
- swap_info[prev].next = p - swap_info;
- }
+ if (prev < 0)
+ swap_list.head = swap_list.next = type;
+ else
+ swap_info[prev]->next = type;
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
error = 0;
@@ -2064,15 +2086,15 @@ out:

void si_swapinfo(struct sysinfo *val)
{
- unsigned int i;
+ unsigned int type;
unsigned long nr_to_be_unused = 0;

spin_lock(&swap_lock);
- for (i = 0; i < nr_swapfiles; i++) {
- if (!(swap_info[i].flags & SWP_USED) ||
- (swap_info[i].flags & SWP_WRITEOK))
- continue;
- nr_to_be_unused += swap_info[i].inuse_pages;
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *si = swap_info[type];
+
+ if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
+ nr_to_be_unused += si->inuse_pages;
}
val->freeswap = nr_swap_pages + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -2105,7 +2127,7 @@ static int __swap_duplicate(swp_entry_t
type = swp_type(entry);
if (type >= nr_swapfiles)
goto bad_file;
- p = type + swap_info;
+ p = swap_info[type];
offset = swp_offset(entry);

spin_lock(&swap_lock);
@@ -2187,7 +2209,7 @@ int valid_swaphandles(swp_entry_t entry,
if (!our_page_cluster) /* no readahead */
return 0;

- si = &swap_info[swp_type(entry)];
+ si = swap_info[swp_type(entry)];
target = swp_offset(entry);
base = (target >> our_page_cluster) << our_page_cluster;
end = base + (1 << our_page_cluster);

Date: 2009-10-15 00:49:52
From: Hugh Dickins
Subject: [PATCH 3/9] swap_info: include first_swap_extent

Make better use of the space by folding first swap_extent into its
swap_info_struct, instead of just the list_head: swap partitions need
only that one, and for others it's used as a circular list anyway.
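
Condensed from the map_swap_page() hunk below: the extent walk no longer
steps over a bare list_head when it wraps, since the head of the list is
now itself a valid extent:

	/* before: skip the list_head when wrapping around */
	lh = se->list.next;
	if (lh == &sis->extent_list)
		lh = lh->next;
	se = list_entry(lh, struct swap_extent, list);

	/* after: every node, first_swap_extent included, is a real extent */
	lh = se->list.next;
	se = list_entry(lh, struct swap_extent, list);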

Signed-off-by: Hugh Dickins <[email protected]>
---

 include/linux/swap.h |  2 -
 mm/swapfile.c        | 72 +++++++++++++++++++++--------------------
 2 files changed, 38 insertions(+), 36 deletions(-)

--- si2/include/linux/swap.h 2009-10-14 21:26:09.000000000 +0100
+++ si3/include/linux/swap.h 2009-10-14 21:26:22.000000000 +0100
@@ -165,7 +165,7 @@ struct swap_info_struct {
signed char next; /* next type on the swap list */
struct file *swap_file;
struct block_device *bdev;
- struct list_head extent_list;
+ struct swap_extent first_swap_extent;
struct swap_extent *curr_swap_extent;
unsigned short *swap_map;
unsigned int lowest_bit;
--- si2/mm/swapfile.c 2009-10-14 21:26:09.000000000 +0100
+++ si3/mm/swapfile.c 2009-10-14 21:26:22.000000000 +0100
@@ -145,23 +145,28 @@ void swap_unplug_io_fn(struct backing_de
static int discard_swap(struct swap_info_struct *si)
{
struct swap_extent *se;
+ sector_t start_block;
+ sector_t nr_blocks;
int err = 0;

- list_for_each_entry(se, &si->extent_list, list) {
- sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
- sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
-
- if (se->start_page == 0) {
- /* Do not discard the swap header page! */
- start_block += 1 << (PAGE_SHIFT - 9);
- nr_blocks -= 1 << (PAGE_SHIFT - 9);
- if (!nr_blocks)
- continue;
- }
+ /* Do not discard the swap header page! */
+ se = &si->first_swap_extent;
+ start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
+ nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
+ if (nr_blocks) {
+ err = blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+ if (err)
+ return err;
+ cond_resched();
+ }
+
+ list_for_each_entry(se, &si->first_swap_extent.list, list) {
+ start_block = se->start_block << (PAGE_SHIFT - 9);
+ nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL,
- DISCARD_FL_BARRIER);
+ nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
if (err)
break;

@@ -200,14 +205,11 @@ static void discard_swap_cluster(struct
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO,
- DISCARD_FL_BARRIER))
+ nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
break;
}

lh = se->list.next;
- if (lh == &si->extent_list)
- lh = lh->next;
se = list_entry(lh, struct swap_extent, list);
}
}
@@ -761,10 +763,8 @@ int swap_type_of(dev_t device, sector_t
return type;
}
if (bdev == sis->bdev) {
- struct swap_extent *se;
+ struct swap_extent *se = &sis->first_swap_extent;

- se = list_entry(sis->extent_list.next,
- struct swap_extent, list);
if (se->start_block == offset) {
if (bdev_p)
*bdev_p = bdgrab(sis->bdev);
@@ -1311,8 +1311,6 @@ sector_t map_swap_page(swp_entry_t entry
return se->start_block + (offset - se->start_page);
}
lh = se->list.next;
- if (lh == &sis->extent_list)
- lh = lh->next;
se = list_entry(lh, struct swap_extent, list);
sis->curr_swap_extent = se;
BUG_ON(se == start_se); /* It *must* be present */
@@ -1341,10 +1339,10 @@ sector_t swapdev_block(int type, pgoff_t
*/
static void destroy_swap_extents(struct swap_info_struct *sis)
{
- while (!list_empty(&sis->extent_list)) {
+ while (!list_empty(&sis->first_swap_extent.list)) {
struct swap_extent *se;

- se = list_entry(sis->extent_list.next,
+ se = list_entry(sis->first_swap_extent.list.next,
struct swap_extent, list);
list_del(&se->list);
kfree(se);
@@ -1365,8 +1363,16 @@ add_swap_extent(struct swap_info_struct
struct swap_extent *new_se;
struct list_head *lh;

- lh = sis->extent_list.prev; /* The highest page extent */
- if (lh != &sis->extent_list) {
+ if (start_page == 0) {
+ se = &sis->first_swap_extent;
+ sis->curr_swap_extent = se;
+ INIT_LIST_HEAD(&se->list);
+ se->start_page = 0;
+ se->nr_pages = nr_pages;
+ se->start_block = start_block;
+ return 1;
+ } else {
+ lh = sis->first_swap_extent.list.prev; /* Highest extent */
se = list_entry(lh, struct swap_extent, list);
BUG_ON(se->start_page + se->nr_pages != start_page);
if (se->start_block + se->nr_pages == start_block) {
@@ -1386,7 +1392,7 @@ add_swap_extent(struct swap_info_struct
new_se->nr_pages = nr_pages;
new_se->start_block = start_block;

- list_add_tail(&new_se->list, &sis->extent_list);
+ list_add_tail(&new_se->list, &sis->first_swap_extent.list);
return 1;
}

@@ -1438,7 +1444,7 @@ static int setup_swap_extents(struct swa
if (S_ISBLK(inode->i_mode)) {
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
- goto done;
+ goto out;
}

blkbits = inode->i_blkbits;
@@ -1509,15 +1515,12 @@ reprobe:
sis->max = page_no;
sis->pages = page_no - 1;
sis->highest_bit = page_no - 1;
-done:
- sis->curr_swap_extent = list_entry(sis->extent_list.prev,
- struct swap_extent, list);
- goto out;
+out:
+ return ret;
bad_bmap:
printk(KERN_ERR "swapon: swapfile has holes\n");
ret = -EINVAL;
-out:
- return ret;
+ goto out;
}

SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
@@ -1816,7 +1819,6 @@ SYSCALL_DEFINE2(swapon, const char __use
kfree(p);
goto out;
}
- INIT_LIST_HEAD(&p->extent_list);
if (type >= nr_swapfiles) {
p->type = type;
swap_info[type] = p;

Date: 2009-10-15 00:51:33
From: Hugh Dickins
Subject: [PATCH 4/9] swap_info: miscellaneous minor cleanups

Move CONFIG_HIBERNATION's swapdev_block() into the main CONFIG_HIBERNATION
block, remove extraneous whitespace and return, fix typo in a comment.

Signed-off-by: Hugh Dickins <[email protected]>
---

 mm/swapfile.c | 51 ++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 27 deletions(-)

--- si3/mm/swapfile.c 2009-10-14 21:26:22.000000000 +0100
+++ si4/mm/swapfile.c 2009-10-14 21:26:32.000000000 +0100
@@ -519,9 +519,9 @@ swp_entry_t get_swap_page_of_type(int ty
return (swp_entry_t) {0};
}

-static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
unsigned long offset, type;

if (!entry.val)
@@ -599,7 +599,7 @@ static int swap_entry_free(struct swap_i
*/
void swap_free(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;

p = swap_info_get(entry);
if (p) {
@@ -629,7 +629,6 @@ void swapcache_free(swp_entry_t entry, s
}
spin_unlock(&swap_lock);
}
- return;
}

/*
@@ -783,6 +782,21 @@ int swap_type_of(dev_t device, sector_t
}

/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int type, pgoff_t offset)
+{
+ struct block_device *bdev;
+
+ if ((unsigned int)type >= nr_swapfiles)
+ return 0;
+ if (!(swap_info[type]->flags & SWP_WRITEOK))
+ return 0;
+ return map_swap_page(swp_entry(type, offset), &bdev);
+}
+
+/*
* Return either the total number of swap pages of given type, or the number
* of free pages of that type (depending on @free)
*
@@ -805,7 +819,7 @@ unsigned int count_swap_pages(int type,
spin_unlock(&swap_lock);
return n;
}
-#endif
+#endif /* CONFIG_HIBERNATION */

/*
* No need to decide whether this PTE shares the swap entry with others,
@@ -1317,23 +1331,6 @@ sector_t map_swap_page(swp_entry_t entry
}
}

-#ifdef CONFIG_HIBERNATION
-/*
- * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
- * corresponding to given index in swap_info (swap type).
- */
-sector_t swapdev_block(int type, pgoff_t offset)
-{
- struct block_device *bdev;
-
- if ((unsigned int)type >= nr_swapfiles)
- return 0;
- if (!(swap_info[type]->flags & SWP_WRITEOK))
- return 0;
- return map_swap_page(swp_entry(type, offset), &bdev);
-}
-#endif /* CONFIG_HIBERNATION */
-
/*
* Free all of a swapdev's extent information
*/
@@ -1525,12 +1522,12 @@ bad_bmap:

SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
- struct swap_info_struct * p = NULL;
+ struct swap_info_struct *p = NULL;
unsigned short *swap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
- char * pathname;
+ char *pathname;
int i, type, prev;
int err;

@@ -1782,7 +1779,7 @@ late_initcall(max_swapfiles_check);
*/
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
char *name = NULL;
struct block_device *bdev = NULL;
struct file *swap_file = NULL;
@@ -2117,7 +2114,7 @@ void si_swapinfo(struct sysinfo *val)
*/
static int __swap_duplicate(swp_entry_t entry, bool cache)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
unsigned long offset, type;
int result = -EINVAL;
int count;
@@ -2186,7 +2183,7 @@ void swap_duplicate(swp_entry_t entry)
/*
* @entry: swap entry for which we allocate swap cache.
*
- * Called when allocating swap cache for exising swap entry,
+ * Called when allocating swap cache for existing swap entry,
* This can return error codes. Returns 0 at success.
* -EBUSY means there is a swap cache.
* Note: return code is different from swap_duplicate().

Date: 2009-10-15 00:53:08
From: Hugh Dickins
Subject: [PATCH 5/9] swap_info: SWAP_HAS_CACHE cleanups

Though swap_count() is useful, I'm finding that swap_has_cache() and
encode_swapmap() obscure what happens in the swap_map entry, just at
those points where I need to understand it. Remove them, and pass
more usable "usage" values to scan_swap_map(), swap_entry_free() and
__swap_duplicate(), instead of the SWAP_MAP and SWAP_CACHE enum.
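
Condensed from the hunks below: the swap_map entry is now read and
written directly, with SWAP_HAS_CACHE as a plain flag bit,

	count = p->swap_map[offset];
	has_cache = count & SWAP_HAS_CACHE;	/* cache flag, top bit */
	count &= ~SWAP_HAS_CACHE;		/* reference count below */
	...
	p->swap_map[offset] = count | has_cache;

and callers say directly what reference they are taking:

	scan_swap_map(si, SWAP_HAS_CACHE);	/* swap-out via vmscan */
	scan_swap_map(si, 1);			/* suspend: map reference */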

Signed-off-by: Hugh Dickins <[email protected]>
---

 include/linux/swap.h |   2
 mm/swapfile.c        | 155 ++++++++++++++++-------------------
 2 files changed, 65 insertions(+), 92 deletions(-)

--- si4/include/linux/swap.h 2009-10-14 21:26:22.000000000 +0100
+++ si5/include/linux/swap.h 2009-10-14 21:26:42.000000000 +0100
@@ -154,7 +154,7 @@ enum {
#define SWAP_MAP_MAX 0x7ffe
#define SWAP_MAP_BAD 0x7fff
#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */
-#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE)
+
/*
* The in-memory structure used to track swap areas.
*/
--- si4/mm/swapfile.c 2009-10-14 21:26:32.000000000 +0100
+++ si5/mm/swapfile.c 2009-10-14 21:26:42.000000000 +0100
@@ -53,30 +53,9 @@ static struct swap_info_struct *swap_inf

static DEFINE_MUTEX(swapon_mutex);

-/* For reference count accounting in swap_map */
-/* enum for swap_map[] handling. internal use only */
-enum {
- SWAP_MAP = 0, /* ops for reference from swap users */
- SWAP_CACHE, /* ops for reference from swap cache */
-};
-
static inline int swap_count(unsigned short ent)
{
- return ent & SWAP_COUNT_MASK;
-}
-
-static inline bool swap_has_cache(unsigned short ent)
-{
- return !!(ent & SWAP_HAS_CACHE);
-}
-
-static inline unsigned short encode_swapmap(int count, bool has_cache)
-{
- unsigned short ret = count;
-
- if (has_cache)
- return SWAP_HAS_CACHE | ret;
- return ret;
+ return ent & ~SWAP_HAS_CACHE;
}

/* returns 1 if swap entry is freed */
@@ -224,7 +203,7 @@ static int wait_for_discard(void *word)
#define LATENCY_LIMIT 256

static inline unsigned long scan_swap_map(struct swap_info_struct *si,
- int cache)
+ unsigned short usage)
{
unsigned long offset;
unsigned long scan_base;
@@ -355,10 +334,7 @@ checks:
si->lowest_bit = si->max;
si->highest_bit = 0;
}
- if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
- si->swap_map[offset] = encode_swapmap(0, true);
- else /* at suspend */
- si->swap_map[offset] = encode_swapmap(1, false);
+ si->swap_map[offset] = usage;
si->cluster_next = offset + 1;
si->flags -= SWP_SCANNING;

@@ -483,7 +459,7 @@ swp_entry_t get_swap_page(void)

swap_list.next = next;
/* This is called for allocating swap entry for cache */
- offset = scan_swap_map(si, SWAP_CACHE);
+ offset = scan_swap_map(si, SWAP_HAS_CACHE);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
@@ -508,7 +484,7 @@ swp_entry_t get_swap_page_of_type(int ty
if (si && (si->flags & SWP_WRITEOK)) {
nr_swap_pages--;
/* This is called for allocating swap entry, not cache */
- offset = scan_swap_map(si, SWAP_MAP);
+ offset = scan_swap_map(si, 1);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
@@ -555,29 +531,31 @@ out:
return NULL;
}

-static int swap_entry_free(struct swap_info_struct *p,
- swp_entry_t ent, int cache)
+static unsigned short swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned short usage)
{
- unsigned long offset = swp_offset(ent);
- int count = swap_count(p->swap_map[offset]);
- bool has_cache;
+ unsigned long offset = swp_offset(entry);
+ unsigned short count;
+ unsigned short has_cache;

- has_cache = swap_has_cache(p->swap_map[offset]);
+ count = p->swap_map[offset];
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;

- if (cache == SWAP_MAP) { /* dropping usage count of swap */
- if (count < SWAP_MAP_MAX) {
- count--;
- p->swap_map[offset] = encode_swapmap(count, has_cache);
- }
- } else { /* dropping swap cache flag */
+ if (usage == SWAP_HAS_CACHE) {
VM_BUG_ON(!has_cache);
- p->swap_map[offset] = encode_swapmap(count, false);
+ has_cache = 0;
+ } else if (count < SWAP_MAP_MAX)
+ count--;
+
+ if (!count)
+ mem_cgroup_uncharge_swap(entry);
+
+ usage = count | has_cache;
+ p->swap_map[offset] = usage;

- }
- /* return code. */
- count = p->swap_map[offset];
/* free if no reference */
- if (!count) {
+ if (!usage) {
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -588,9 +566,8 @@ static int swap_entry_free(struct swap_i
nr_swap_pages++;
p->inuse_pages--;
}
- if (!swap_count(count))
- mem_cgroup_uncharge_swap(ent);
- return count;
+
+ return usage;
}

/*
@@ -603,7 +580,7 @@ void swap_free(swp_entry_t entry)

p = swap_info_get(entry);
if (p) {
- swap_entry_free(p, entry, SWAP_MAP);
+ swap_entry_free(p, entry, 1);
spin_unlock(&swap_lock);
}
}
@@ -614,19 +591,13 @@ void swap_free(swp_entry_t entry)
void swapcache_free(swp_entry_t entry, struct page *page)
{
struct swap_info_struct *p;
- int ret;
+ unsigned short count;

p = swap_info_get(entry);
if (p) {
- ret = swap_entry_free(p, entry, SWAP_CACHE);
- if (page) {
- bool swapout;
- if (ret)
- swapout = true; /* the end of swap out */
- else
- swapout = false; /* no more swap users! */
- mem_cgroup_uncharge_swapcache(page, entry, swapout);
- }
+ count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
+ if (page)
+ mem_cgroup_uncharge_swapcache(page, entry, count != 0);
spin_unlock(&swap_lock);
}
}
@@ -705,7 +676,7 @@ int free_swap_and_cache(swp_entry_t entr

p = swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
+ if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
page = find_get_page(&swapper_space, entry.val);
if (page && !trylock_page(page)) {
page_cache_release(page);
@@ -1213,7 +1184,7 @@ static int try_to_unuse(unsigned int typ

if (swap_count(*swap_map) == SWAP_MAP_MAX) {
spin_lock(&swap_lock);
- *swap_map = encode_swapmap(0, true);
+ *swap_map = SWAP_HAS_CACHE;
spin_unlock(&swap_lock);
reset_overflow = 1;
}
@@ -2112,16 +2083,16 @@ void si_swapinfo(struct sysinfo *val)
* - swap-cache reference is requested but there is already one. -> EEXIST
* - swap-cache reference is requested but the entry is not used. -> ENOENT
*/
-static int __swap_duplicate(swp_entry_t entry, bool cache)
+static int __swap_duplicate(swp_entry_t entry, unsigned short usage)
{
struct swap_info_struct *p;
unsigned long offset, type;
- int result = -EINVAL;
- int count;
- bool has_cache;
+ unsigned short count;
+ unsigned short has_cache;
+ int err = -EINVAL;

if (non_swap_entry(entry))
- return -EINVAL;
+ goto out;

type = swp_type(entry);
if (type >= nr_swapfiles)
@@ -2130,54 +2101,56 @@ static int __swap_duplicate(swp_entry_t
offset = swp_offset(entry);

spin_lock(&swap_lock);
-
if (unlikely(offset >= p->max))
goto unlock_out;

- count = swap_count(p->swap_map[offset]);
- has_cache = swap_has_cache(p->swap_map[offset]);
+ count = p->swap_map[offset];
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;
+ err = 0;

- if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
+ if (usage == SWAP_HAS_CACHE) {

/* set SWAP_HAS_CACHE if there is no cache and entry is used */
- if (!has_cache && count) {
- p->swap_map[offset] = encode_swapmap(count, true);
- result = 0;
- } else if (has_cache) /* someone added cache */
- result = -EEXIST;
- else if (!count) /* no users */
- result = -ENOENT;
+ if (!has_cache && count)
+ has_cache = SWAP_HAS_CACHE;
+ else if (has_cache) /* someone else added cache */
+ err = -EEXIST;
+ else /* no users remaining */
+ err = -ENOENT;

} else if (count || has_cache) {
- if (count < SWAP_MAP_MAX - 1) {
- p->swap_map[offset] = encode_swapmap(count + 1,
- has_cache);
- result = 0;
- } else if (count <= SWAP_MAP_MAX) {
+
+ if (count < SWAP_MAP_MAX - 1)
+ count++;
+ else if (count <= SWAP_MAP_MAX) {
if (swap_overflow++ < 5)
printk(KERN_WARNING
"swap_dup: swap entry overflow\n");
- p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
- has_cache);
- result = 0;
- }
+ count = SWAP_MAP_MAX;
+ } else
+ err = -EINVAL;
} else
- result = -ENOENT; /* unused swap entry */
+ err = -ENOENT; /* unused swap entry */
+
+ p->swap_map[offset] = count | has_cache;
+
unlock_out:
spin_unlock(&swap_lock);
out:
- return result;
+ return err;

bad_file:
printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
}
+
/*
* increase reference count of swap entry by 1.
*/
void swap_duplicate(swp_entry_t entry)
{
- __swap_duplicate(entry, SWAP_MAP);
+ __swap_duplicate(entry, 1);
}

/*
@@ -2190,7 +2163,7 @@ void swap_duplicate(swp_entry_t entry)
*/
int swapcache_prepare(swp_entry_t entry)
{
- return __swap_duplicate(entry, SWAP_CACHE);
+ return __swap_duplicate(entry, SWAP_HAS_CACHE);
}

/*

Date: 2009-10-15 00:54:31
From: Hugh Dickins
Subject: [PATCH 6/9] swap_info: swap_map of chars not shorts

Halve the vmalloc'ed swap_map array from unsigned shorts to unsigned
chars: it's still very unusual to reach a swap count of 126, and the
next patch allows it to be extended indefinitely.
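
The arithmetic of the change, condensed from the hunks below: one byte
per page slot instead of two, with the count limits shrunk to fit:

	#define SWAP_MAP_MAX	0x7e	/* 126 references, was 0x7ffe */
	#define SWAP_MAP_BAD	0x7f
	#define SWAP_HAS_CACHE	0x80	/* cache flag, still the top bit */

	swap_map = vmalloc(maxpages);	/* was maxpages * sizeof(short) */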

Signed-off-by: Hugh Dickins <[email protected]>
---

 include/linux/swap.h |  8 ++++----
 mm/swapfile.c        | 40 +++++++++++++++++++++-----------------
 2 files changed, 27 insertions(+), 21 deletions(-)

--- si5/include/linux/swap.h 2009-10-14 21:26:42.000000000 +0100
+++ si6/include/linux/swap.h 2009-10-14 21:26:49.000000000 +0100
@@ -151,9 +151,9 @@ enum {

#define SWAP_CLUSTER_MAX 32

-#define SWAP_MAP_MAX 0x7ffe
-#define SWAP_MAP_BAD 0x7fff
-#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */
+#define SWAP_MAP_MAX 0x7e
+#define SWAP_MAP_BAD 0x7f
+#define SWAP_HAS_CACHE 0x80 /* There is a swap cache of entry. */

/*
* The in-memory structure used to track swap areas.
@@ -167,7 +167,7 @@ struct swap_info_struct {
struct block_device *bdev;
struct swap_extent first_swap_extent;
struct swap_extent *curr_swap_extent;
- unsigned short *swap_map;
+ unsigned char *swap_map;
unsigned int lowest_bit;
unsigned int highest_bit;
unsigned int lowest_alloc; /* while preparing discard cluster */
--- si5/mm/swapfile.c 2009-10-14 21:26:42.000000000 +0100
+++ si6/mm/swapfile.c 2009-10-14 21:26:49.000000000 +0100
@@ -53,7 +53,7 @@ static struct swap_info_struct *swap_inf

static DEFINE_MUTEX(swapon_mutex);

-static inline int swap_count(unsigned short ent)
+static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE;
}
@@ -203,7 +203,7 @@ static int wait_for_discard(void *word)
#define LATENCY_LIMIT 256

static inline unsigned long scan_swap_map(struct swap_info_struct *si,
- unsigned short usage)
+ unsigned char usage)
{
unsigned long offset;
unsigned long scan_base;
@@ -531,12 +531,12 @@ out:
return NULL;
}

-static unsigned short swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned short usage)
+static unsigned char swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
{
unsigned long offset = swp_offset(entry);
- unsigned short count;
- unsigned short has_cache;
+ unsigned char count;
+ unsigned char has_cache;

count = p->swap_map[offset];
has_cache = count & SWAP_HAS_CACHE;
@@ -591,7 +591,7 @@ void swap_free(swp_entry_t entry)
void swapcache_free(swp_entry_t entry, struct page *page)
{
struct swap_info_struct *p;
- unsigned short count;
+ unsigned char count;

p = swap_info_get(entry);
if (p) {
@@ -975,7 +975,7 @@ static unsigned int find_next_to_unuse(s
{
unsigned int max = si->max;
unsigned int i = prev;
- int count;
+ unsigned char count;

/*
* No need for swap_lock here: we're just looking
@@ -1013,8 +1013,8 @@ static int try_to_unuse(unsigned int typ
{
struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
- unsigned short *swap_map;
- unsigned short swcount;
+ unsigned char *swap_map;
+ unsigned char swcount;
struct page *page;
swp_entry_t entry;
unsigned int i = 0;
@@ -1175,6 +1175,12 @@ static int try_to_unuse(unsigned int typ
* If that's wrong, then we should worry more about
* exit_mmap() and do_munmap() cases described above:
* we might be resetting SWAP_MAP_MAX too early here.
+ *
+ * Yes, that's wrong: though very unlikely, swap count 0x7ffe
+ * could surely occur if pid_max raised from PID_MAX_DEFAULT;
+ * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
+ * much easier to reach. But the next patch will fix that.
+ *
* We know "Undead"s can happen, they're okay, so don't
* report them; but do report if we reset SWAP_MAP_MAX.
*/
@@ -1494,7 +1500,7 @@ bad_bmap:
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
- unsigned short *swap_map;
+ unsigned char *swap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
@@ -1764,7 +1770,7 @@ SYSCALL_DEFINE2(swapon, const char __use
sector_t span;
unsigned long maxpages = 1;
unsigned long swapfilepages;
- unsigned short *swap_map = NULL;
+ unsigned char *swap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
int did_down = 0;
@@ -1939,13 +1945,13 @@ SYSCALL_DEFINE2(swapon, const char __use
goto bad_swap;

/* OK, set up the swap map and apply the bad block list */
- swap_map = vmalloc(maxpages * sizeof(short));
+ swap_map = vmalloc(maxpages);
if (!swap_map) {
error = -ENOMEM;
goto bad_swap;
}

- memset(swap_map, 0, maxpages * sizeof(short));
+ memset(swap_map, 0, maxpages);
for (i = 0; i < swap_header->info.nr_badpages; i++) {
int page_nr = swap_header->info.badpages[i];
if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
@@ -2083,12 +2089,12 @@ void si_swapinfo(struct sysinfo *val)
* - swap-cache reference is requested but there is already one. -> EEXIST
* - swap-cache reference is requested but the entry is not used. -> ENOENT
*/
-static int __swap_duplicate(swp_entry_t entry, unsigned short usage)
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
unsigned long offset, type;
- unsigned short count;
- unsigned short has_cache;
+ unsigned char count;
+ unsigned char has_cache;
int err = -EINVAL;

if (non_swap_entry(entry))

Date: 2009-10-15 00:56:55
From: Hugh Dickins
Subject: [PATCH 7/9] swap_info: swap count continuations

Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).

swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (and its high bit is kept for a cache flag)

We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.

Swapping of KSM pages has not yet been enabled; but it is implemented,
and makes it very easy for a user to overflow the maximum swap count:
possible with ordinary process pages, but unlikely, even when pid_max
has been raised from PID_MAX_DEFAULT.

This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
map page, and this is used to hold the continuation counts for that entry
and its neighbours. These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to
a continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.
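
To see the arithmetic in isolation, here is a user-space toy model (an
illustration only, not the kernel code: the kernel keeps the low digit
in the vmalloc'ed swap_map, and the higher digits in the pages chained
from it, all manipulated under swap_lock).  The count behaves like a
multi-digit number, low digit base SWAP_MAP_MAX+1, continuation digits
base SWAP_CONT_MAX+1:

	#include <assert.h>
	#include <stdio.h>

	#define SWAP_MAP_MAX	0x3e	/* 62: max count in swap_map byte */
	#define SWAP_CONT_MAX	0x7f	/* 127: max count per continuation */
	#define COUNT_CONTINUED	0x80	/* high bit: a further digit follows */

	#define NDIGITS 4
	static unsigned char digit[NDIGITS];	/* [0] models swap_map[offset] */

	static void count_inc(void)	/* one swap_duplicate() */
	{
		int i;

		if ((digit[0] & ~COUNT_CONTINUED) < SWAP_MAP_MAX) {
			digit[0]++;
			return;
		}
		digit[0] = COUNT_CONTINUED;	/* wraps to 0, flag kept */
		for (i = 1; i < NDIGITS; i++) {
			if ((digit[i] & ~COUNT_CONTINUED) < SWAP_CONT_MAX) {
				digit[i]++;	/* in a continuation page */
				return;
			}
			digit[i] = COUNT_CONTINUED;	/* carry onward */
		}
		assert(0);	/* kernel: add_swap_count_continuation() */
	}

	static unsigned long count_total(void)	/* recover the full count */
	{
		unsigned long total = digit[0] & ~COUNT_CONTINUED;
		unsigned long weight = SWAP_MAP_MAX + 1;
		int i;

		for (i = 1; i < NDIGITS; i++) {
			total += (digit[i] & ~COUNT_CONTINUED) * weight;
			weight *= SWAP_CONT_MAX + 1;
		}
		return total;
	}

	int main(void)
	{
		unsigned long n;

		for (n = 1; n <= 100000; n++) {
			count_inc();
			assert(count_total() == n);
		}
		printf("100000 = %02x %02x %02x\n",
			digit[0], digit[1], digit[2]);
		return 0;
	}

Each continuation page appends another base-128 digit, so the count can
be extended indefinitely.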

Signed-off-by: Hugh Dickins <[email protected]>
---

 include/linux/swap.h |  22 ++
 mm/memory.c          |  19 ++
 mm/rmap.c            |   6
 mm/swapfile.c        | 304 +++++++++++++++++++++++++++++++++--------
 4 files changed, 287 insertions(+), 64 deletions(-)

--- si6/include/linux/swap.h 2009-10-14 21:26:49.000000000 +0100
+++ si7/include/linux/swap.h 2009-10-14 21:26:57.000000000 +0100
@@ -145,15 +145,18 @@ enum {
SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
+ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
/* add others here before... */
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
};

#define SWAP_CLUSTER_MAX 32

-#define SWAP_MAP_MAX 0x7e
-#define SWAP_MAP_BAD 0x7f
-#define SWAP_HAS_CACHE 0x80 /* There is a swap cache of entry. */
+#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
+#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
+#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
+#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */
+#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */

/*
* The in-memory structure used to track swap areas.
@@ -310,9 +313,10 @@ extern long total_swap_pages;
extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(void);
extern swp_entry_t get_swap_page_of_type(int);
-extern void swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
+extern int add_swap_count_continuation(swp_entry_t, gfp_t);
+extern int swap_duplicate(swp_entry_t);
+extern int swapcache_prepare(swp_entry_t);
extern void swap_free(swp_entry_t);
extern void swapcache_free(swp_entry_t, struct page *page);
extern int free_swap_and_cache(swp_entry_t);
@@ -384,8 +388,14 @@ static inline void show_swap_cache_info(
#define free_swap_and_cache(swp) is_migration_entry(swp)
#define swapcache_prepare(swp) is_migration_entry(swp)

-static inline void swap_duplicate(swp_entry_t swp)
+static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
+ return 0;
+}
+
+static inline int swap_duplicate(swp_entry_t swp)
+{
+ return 0;
}

static inline void swap_free(swp_entry_t swp)
--- si6/mm/memory.c 2009-09-28 00:28:41.000000000 +0100
+++ si7/mm/memory.c 2009-10-14 21:26:57.000000000 +0100
@@ -572,7 +572,7 @@ out:
* covered by this vma.
*/

-static inline void
+static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
if (!pte_file(pte)) {
swp_entry_t entry = pte_to_swp_entry(pte);

- swap_duplicate(entry);
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, s

out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -645,6 +648,7 @@ static int copy_pte_range(struct mm_stru
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
int rss[2];
+ swp_entry_t entry = (swp_entry_t){0};

again:
rss[1] = rss[0] = 0;
@@ -671,7 +675,10 @@ again:
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+ vma, addr, rss);
+ if (entry.val)
+ break;
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

@@ -681,6 +688,12 @@ again:
add_mm_rss(dst_mm, rss[0], rss[1]);
pte_unmap_unlock(dst_pte - 1, dst_ptl);
cond_resched();
+
+ if (entry.val) {
+ if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+ return -ENOMEM;
+ progress = 0;
+ }
if (addr != end)
goto again;
return 0;
--- si6/mm/rmap.c 2009-10-05 04:33:14.000000000 +0100
+++ si7/mm/rmap.c 2009-10-14 21:26:57.000000000 +0100
@@ -822,7 +822,11 @@ static int try_to_unmap_one(struct page
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- swap_duplicate(entry);
+ if (swap_duplicate(entry) < 0) {
+ set_pte_at(mm, address, pte, pteval);
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
--- si6/mm/swapfile.c 2009-10-14 21:26:49.000000000 +0100
+++ si7/mm/swapfile.c 2009-10-14 21:26:57.000000000 +0100
@@ -35,11 +35,14 @@
#include <linux/swapops.h>
#include <linux/page_cgroup.h>

+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+ unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
-static int swap_overflow;
static int least_priority;

static const char Bad_file[] = "Bad swap file entry ";
@@ -55,7 +58,7 @@ static DEFINE_MUTEX(swapon_mutex);

static inline unsigned char swap_count(unsigned char ent)
{
- return ent & ~SWAP_HAS_CACHE;
+ return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}

/* returns 1 if swap entry is freed */
@@ -545,8 +548,15 @@ static unsigned char swap_entry_free(str
if (usage == SWAP_HAS_CACHE) {
VM_BUG_ON(!has_cache);
has_cache = 0;
- } else if (count < SWAP_MAP_MAX)
- count--;
+ } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+ if (count == COUNT_CONTINUED) {
+ if (swap_count_continued(p, offset, count))
+ count = SWAP_MAP_MAX | COUNT_CONTINUED;
+ else
+ count = SWAP_MAP_MAX;
+ } else
+ count--;
+ }

if (!count)
mem_cgroup_uncharge_swap(entry);
@@ -604,6 +614,8 @@ void swapcache_free(swp_entry_t entry, s

/*
* How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
*/
static inline int page_swapcount(struct page *page)
{
@@ -1019,7 +1031,6 @@ static int try_to_unuse(unsigned int typ
swp_entry_t entry;
unsigned int i = 0;
int retval = 0;
- int reset_overflow = 0;
int shmem;

/*
@@ -1034,8 +1045,7 @@ static int try_to_unuse(unsigned int typ
* together, child after parent. If we race with dup_mmap(), we
* prefer to resolve parent before child, lest we miss entries
* duplicated after we scanned child: using last mm would invert
- * that. Though it's only a serious concern when an overflowed
- * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+ * that.
*/
start_mm = &init_mm;
atomic_inc(&init_mm.mm_users);
@@ -1166,36 +1176,6 @@ static int try_to_unuse(unsigned int typ
}

/*
- * How could swap count reach 0x7ffe ?
- * There's no way to repeat a swap page within an mm
- * (except in shmem, where it's the shared object which takes
- * the reference count)?
- * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
- * short is too small....)
- * If that's wrong, then we should worry more about
- * exit_mmap() and do_munmap() cases described above:
- * we might be resetting SWAP_MAP_MAX too early here.
- *
- * Yes, that's wrong: though very unlikely, swap count 0x7ffe
- * could surely occur if pid_max raised from PID_MAX_DEFAULT;
- * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
- * much easier to reach. But the next patch will fix that.
- *
- * We know "Undead"s can happen, they're okay, so don't
- * report them; but do report if we reset SWAP_MAP_MAX.
- */
- /* We might release the lock_page() in unuse_mm(). */
- if (!PageSwapCache(page) || page_private(page) != entry.val)
- goto retry;
-
- if (swap_count(*swap_map) == SWAP_MAP_MAX) {
- spin_lock(&swap_lock);
- *swap_map = SWAP_HAS_CACHE;
- spin_unlock(&swap_lock);
- reset_overflow = 1;
- }
-
- /*
* If a reference remains (rare), we would like to leave
* the page in the swap cache; but try_to_unmap could
* then re-duplicate the entry once we drop page lock,
@@ -1236,7 +1216,6 @@ static int try_to_unuse(unsigned int typ
* mark page dirty so shrink_page_list will preserve it.
*/
SetPageDirty(page);
-retry:
unlock_page(page);
page_cache_release(page);

@@ -1248,10 +1227,6 @@ retry:
}

mmput(start_mm);
- if (reset_overflow) {
- printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
- swap_overflow = 0;
- }
return retval;
}

@@ -1595,6 +1570,9 @@ SYSCALL_DEFINE1(swapoff, const char __us
up_write(&swap_unplug_sem);

destroy_swap_extents(p);
+ if (p->flags & SWP_CONTINUED)
+ free_swap_count_continuations(p);
+
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
drain_mmlist();
@@ -2080,14 +2058,13 @@ void si_swapinfo(struct sysinfo *val)
/*
* Verify that a swap entry is valid and increment its swap map count.
*
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
* Returns error code in following case.
* - success -> 0
* - swp_entry is invalid -> EINVAL
* - swp_entry is migration entry -> EINVAL
* - swap-cache reference is requested but there is already one. -> EEXIST
* - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
*/
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
@@ -2127,15 +2104,14 @@ static int __swap_duplicate(swp_entry_t

} else if (count || has_cache) {

- if (count < SWAP_MAP_MAX - 1)
- count++;
- else if (count <= SWAP_MAP_MAX) {
- if (swap_overflow++ < 5)
- printk(KERN_WARNING
- "swap_dup: swap entry overflow\n");
- count = SWAP_MAP_MAX;
- } else
+ if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+ count += usage;
+ else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
err = -EINVAL;
+ else if (swap_count_continued(p, offset, count))
+ count = COUNT_CONTINUED;
+ else
+ err = -ENOMEM;
} else
err = -ENOENT; /* unused swap entry */

@@ -2154,9 +2130,13 @@ bad_file:
/*
* increase reference count of swap entry by 1.
*/
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
{
- __swap_duplicate(entry, 1);
+ int err = 0;
+
+ while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+ err = add_swap_count_continuation(entry, GFP_ATOMIC);
+ return err;
}

/*
@@ -2223,3 +2203,219 @@ int valid_swaphandles(swp_entry_t entry,
*offset = ++toff;
return nr_pages? ++nr_pages: 0;
}
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+ struct swap_info_struct *si;
+ struct page *head;
+ struct page *page;
+ struct page *list_page;
+ pgoff_t offset;
+ unsigned char count;
+
+ /*
+ * When debugging, it's easier to use __GFP_ZERO here; but it's better
+ * for latency not to zero a page while GFP_ATOMIC and holding locks.
+ */
+ page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+ si = swap_info_get(entry);
+ if (!si) {
+ /*
+ * An acceptable race has occurred since the failing
+ * __swap_duplicate(): the swap entry has been freed,
+ * perhaps even the whole swap_map cleared for swapoff.
+ */
+ goto outer;
+ }
+
+ offset = swp_offset(entry);
+ count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+ if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+ /*
+ * The higher the swap count, the more likely it is that tasks
+ * will race to add swap count continuation: we need to avoid
+ * over-provisioning.
+ */
+ goto out;
+ }
+
+ if (!page) {
+ spin_unlock(&swap_lock);
+ return -ENOMEM;
+ }
+
+ /*
+ * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+ * no architecture is using highmem pages for kernel pagetables: so it
+ * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+ */
+ head = vmalloc_to_page(si->swap_map + offset);
+ offset &= ~PAGE_MASK;
+
+ /*
+ * Page allocation does not initialize the page's lru field,
+ * but it does always reset its private field.
+ */
+ if (!page_private(head)) {
+ BUG_ON(count & COUNT_CONTINUED);
+ INIT_LIST_HEAD(&head->lru);
+ set_page_private(head, SWP_CONTINUED);
+ si->flags |= SWP_CONTINUED;
+ }
+
+ list_for_each_entry(list_page, &head->lru, lru) {
+ unsigned char *map;
+
+ /*
+ * If the previous map said no continuation, but we've found
+ * a continuation page, free our allocation and use this one.
+ */
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ map = kmap_atomic(list_page, KM_USER0) + offset;
+ count = *map;
+ kunmap_atomic(map, KM_USER0);
+
+ /*
+ * If this continuation count now has some space in it,
+ * free our allocation and use this one.
+ */
+ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+ goto out;
+ }
+
+ list_add_tail(&page->lru, &head->lru);
+ page = NULL; /* now it's attached, don't free it */
+out:
+ spin_unlock(&swap_lock);
+outer:
+ if (page)
+ __free_page(page);
+ return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+ pgoff_t offset, unsigned char count)
+{
+ struct page *head;
+ struct page *page;
+ unsigned char *map;
+
+ head = vmalloc_to_page(si->swap_map + offset);
+ if (page_private(head) != SWP_CONTINUED) {
+ BUG_ON(count & COUNT_CONTINUED);
+ return false; /* need to add count continuation */
+ }
+
+ offset &= ~PAGE_MASK;
+ page = list_entry(head->lru.next, struct page, lru);
+ map = kmap_atomic(page, KM_USER0) + offset;
+
+ if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
+ goto init_map; /* jump over SWAP_CONT_MAX checks */
+
+ if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+ /*
+ * Think of how you add 1 to 999
+ */
+ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.next, struct page, lru);
+ BUG_ON(page == head);
+ map = kmap_atomic(page, KM_USER0) + offset;
+ }
+ if (*map == SWAP_CONT_MAX) {
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.next, struct page, lru);
+ if (page == head)
+ return false; /* add count continuation */
+ map = kmap_atomic(page, KM_USER0) + offset;
+init_map: *map = 0; /* we didn't zero the page */
+ }
+ *map += 1;
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.prev, struct page, lru);
+ while (page != head) {
+ map = kmap_atomic(page, KM_USER0) + offset;
+ *map = COUNT_CONTINUED;
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.prev, struct page, lru);
+ }
+ return true; /* incremented */
+
+ } else { /* decrementing */
+ /*
+ * Think of how you subtract 1 from 1000
+ */
+ BUG_ON(count != COUNT_CONTINUED);
+ while (*map == COUNT_CONTINUED) {
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.next, struct page, lru);
+ BUG_ON(page == head);
+ map = kmap_atomic(page, KM_USER0) + offset;
+ }
+ BUG_ON(*map == 0);
+ *map -= 1;
+ if (*map == 0)
+ count = 0;
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.prev, struct page, lru);
+ while (page != head) {
+ map = kmap_atomic(page, KM_USER0) + offset;
+ *map = SWAP_CONT_MAX | count;
+ count = COUNT_CONTINUED;
+ kunmap_atomic(map, KM_USER0);
+ page = list_entry(page->lru.prev, struct page, lru);
+ }
+ return count == COUNT_CONTINUED;
+ }
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+ pgoff_t offset;
+
+ for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+ struct page *head;
+ head = vmalloc_to_page(si->swap_map + offset);
+ if (page_private(head)) {
+ struct list_head *this, *next;
+ list_for_each_safe(this, next, &head->lru) {
+ struct page *page;
+ page = list_entry(this, struct page, lru);
+ list_del(this);
+ __free_page(page);
+ }
+ }
+ }
+}
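
To make the carry/borrow arithmetic above concrete: a swap count behaves
like a little-endian multi-digit number, whose low digit lives in
swap_map (base SWAP_MAP_MAX+1) and whose higher digits live one per
continuation page (base SWAP_CONT_MAX+1), with COUNT_CONTINUED flagging
"more digits follow". A minimal userspace model of the increment path
(illustrative only: the names mirror the kernel's, but this is not the
kernel code, which must also juggle the flag bits and kmap the pages):

	#include <assert.h>

	#define SWAP_MAP_MAX	0x3e	/* max in the first "digit" */
	#define SWAP_CONT_MAX	0x7f	/* max in each continuation "digit" */
	#define NDIGITS		4	/* model three continuation pages */

	/* d[0] models swap_map's count; d[1..] model continuation bytes */
	static void count_inc(unsigned char *d)
	{
		int i;

		if (d[0] < SWAP_MAP_MAX) {
			d[0]++;			/* the common, fast case */
			return;
		}
		d[0] = 0;		/* kernel: count = COUNT_CONTINUED */
		for (i = 1; i < NDIGITS; i++) {
			if (d[i] < SWAP_CONT_MAX) {
				d[i]++;		/* think of adding 1 to 999 */
				return;
			}
			d[i] = 0;	/* carry into the next page */
		}
		assert(0);	/* kernel: add_swap_count_continuation() */
	}

	int main(void)
	{
		unsigned char d[NDIGITS] = { 0 };
		int n;

		for (n = 0; n < 200; n++)
			count_inc(d);
		/* 200 == 3 * (SWAP_MAP_MAX+1) + 11 */
		assert(d[0] == 11 && d[1] == 3);
		return 0;
	}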

2009-10-15 00:58:09

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 8/9] swap_info: note SWAP_MAP_SHMEM

While we're fiddling with the swap_map values, let's assign a particular
value to shmem/tmpfs swap pages: their swap counts are never incremented,
and it helps swapoff's try_to_unuse() a little if it can immediately
distinguish those pages from process pages.

Since we've no use for SWAP_MAP_BAD | COUNT_CONTINUED,
we might as well use that 0xbf value for SWAP_MAP_SHMEM.
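
For reference, the resulting swap_map byte encodings, and one way to
assert the 0xbf relationship (an illustrative check, not in the patch):

	/*
	 * swap_map byte values at this point in the series:
	 *   0x01..0x3e	count of 1..SWAP_MAP_MAX
	 *   0x3f	SWAP_MAP_BAD
	 *   0x40	SWAP_HAS_CACHE, or'ed into the count
	 *   0x80	COUNT_CONTINUED, or'ed in on count overflow
	 *   0xbf	SWAP_MAP_SHMEM == SWAP_MAP_BAD | COUNT_CONTINUED
	 */
	BUILD_BUG_ON(SWAP_MAP_SHMEM != (SWAP_MAP_BAD | COUNT_CONTINUED));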

Signed-off-by: Hugh Dickins <[email protected]>
---

include/linux/swap.h | 6 +++++
mm/shmem.c | 11 +++++++--
mm/swapfile.c | 47 +++++++++++++++++++++++------------------
3 files changed, 42 insertions(+), 22 deletions(-)

--- si7/include/linux/swap.h 2009-10-14 21:26:57.000000000 +0100
+++ si8/include/linux/swap.h 2009-10-14 21:27:07.000000000 +0100
@@ -157,6 +157,7 @@ enum {
#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */
#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */
+#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */

/*
* The in-memory structure used to track swap areas.
@@ -315,6 +316,7 @@ extern swp_entry_t get_swap_page(void);
extern swp_entry_t get_swap_page_of_type(int);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
+extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t);
extern void swap_free(swp_entry_t);
@@ -393,6 +395,10 @@ static inline int add_swap_count_continu
return 0;
}

+static inline void swap_shmem_alloc(swp_entry_t swp)
+{
+}
+
static inline int swap_duplicate(swp_entry_t swp)
{
return 0;
--- si7/mm/shmem.c 2009-09-28 00:28:41.000000000 +0100
+++ si8/mm/shmem.c 2009-10-14 21:27:07.000000000 +0100
@@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struc
goto out;
}
mutex_unlock(&shmem_swaplist_mutex);
-out: return found; /* 0 or 1 or -ENOMEM */
+ /*
+ * Can some race bring us here? We've been holding page lock,
+ * so I think not; but would rather try again later than BUG()
+ */
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ return (found < 0) ? found : 0;
}

/*
@@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *
else
inode = NULL;
spin_unlock(&info->lock);
- swap_duplicate(swap);
+ swap_shmem_alloc(swap);
BUG_ON(page_mapped(page));
page_cache_release(page); /* pagecache ref */
swap_writepage(page, wbc);
--- si7/mm/swapfile.c 2009-10-14 21:26:57.000000000 +0100
+++ si8/mm/swapfile.c 2009-10-14 21:27:07.000000000 +0100
@@ -548,6 +548,12 @@ static unsigned char swap_entry_free(str
if (usage == SWAP_HAS_CACHE) {
VM_BUG_ON(!has_cache);
has_cache = 0;
+ } else if (count == SWAP_MAP_SHMEM) {
+ /*
+ * Or we could insist on shmem.c using a special
+ * swap_shmem_free() and free_shmem_swap_and_cache()...
+ */
+ count = 0;
} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
if (count == COUNT_CONTINUED) {
if (swap_count_continued(p, offset, count))
@@ -1031,7 +1037,6 @@ static int try_to_unuse(unsigned int typ
swp_entry_t entry;
unsigned int i = 0;
int retval = 0;
- int shmem;

/*
* When searching mms for an entry, a good strategy is to
@@ -1107,17 +1112,18 @@ static int try_to_unuse(unsigned int typ

/*
* Remove all references to entry.
- * Whenever we reach init_mm, there's no address space
- * to search, but use it as a reminder to search shmem.
*/
- shmem = 0;
swcount = *swap_map;
- if (swap_count(swcount)) {
- if (start_mm == &init_mm)
- shmem = shmem_unuse(entry, page);
- else
- retval = unuse_mm(start_mm, entry, page);
+ if (swap_count(swcount) == SWAP_MAP_SHMEM) {
+ retval = shmem_unuse(entry, page);
+ /* page has already been unlocked and released */
+ if (retval < 0)
+ break;
+ continue;
}
+ if (swap_count(swcount) && start_mm != &init_mm)
+ retval = unuse_mm(start_mm, entry, page);
+
if (swap_count(*swap_map)) {
int set_start_mm = (*swap_map >= swcount);
struct list_head *p = &start_mm->mmlist;
@@ -1128,7 +1134,7 @@ static int try_to_unuse(unsigned int typ
atomic_inc(&new_start_mm->mm_users);
atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
- while (swap_count(*swap_map) && !retval && !shmem &&
+ while (swap_count(*swap_map) && !retval &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1142,10 +1148,9 @@ static int try_to_unuse(unsigned int typ
swcount = *swap_map;
if (!swap_count(swcount)) /* any usage ? */
;
- else if (mm == &init_mm) {
+ else if (mm == &init_mm)
set_start_mm = 1;
- shmem = shmem_unuse(entry, page);
- } else
+ else
retval = unuse_mm(mm, entry, page);

if (set_start_mm &&
@@ -1162,13 +1167,6 @@ static int try_to_unuse(unsigned int typ
mmput(start_mm);
start_mm = new_start_mm;
}
- if (shmem) {
- /* page has already been unlocked and released */
- if (shmem > 0)
- continue;
- retval = shmem;
- break;
- }
if (retval) {
unlock_page(page);
page_cache_release(page);
@@ -2128,6 +2126,15 @@ bad_file:
}

/*
+ * Help swapoff by noting that swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ */
+void swap_shmem_alloc(swp_entry_t entry)
+{
+ __swap_duplicate(entry, SWAP_MAP_SHMEM);
+}
+
+/*
* increase reference count of swap entry by 1.
*/
int swap_duplicate(swp_entry_t entry)

2009-10-15 00:59:30

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 9/9] swap_info: reorder its fields

Reorder (and comment) the fields of swap_info_struct, to make better
use of its cachelines: it's good for swap_duplicate() in particular
if unsigned int max and swap_map are near the start.

Signed-off-by: Hugh Dickins <[email protected]>
---

include/linux/swap.h | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)

--- si8/include/linux/swap.h 2009-10-14 21:27:07.000000000 +0100
+++ si9/include/linux/swap.h 2009-10-14 21:27:14.000000000 +0100
@@ -167,21 +167,21 @@ struct swap_info_struct {
signed short prio; /* swap priority of this type */
signed char type; /* strange name for an index */
signed char next; /* next type on the swap list */
- struct file *swap_file;
- struct block_device *bdev;
- struct swap_extent first_swap_extent;
- struct swap_extent *curr_swap_extent;
- unsigned char *swap_map;
- unsigned int lowest_bit;
- unsigned int highest_bit;
+ unsigned int max; /* extent of the swap_map */
+ unsigned char *swap_map; /* vmalloc'ed array of usage counts */
+ unsigned int lowest_bit; /* index of first free in swap_map */
+ unsigned int highest_bit; /* index of last free in swap_map */
+ unsigned int pages; /* total of usable pages of swap */
+ unsigned int inuse_pages; /* number of those currently in use */
+ unsigned int cluster_next; /* likely index for next allocation */
+ unsigned int cluster_nr; /* countdown to next cluster search */
unsigned int lowest_alloc; /* while preparing discard cluster */
unsigned int highest_alloc; /* while preparing discard cluster */
- unsigned int cluster_next;
- unsigned int cluster_nr;
- unsigned int pages;
- unsigned int max;
- unsigned int inuse_pages;
- unsigned int old_block_size;
+ struct swap_extent *curr_swap_extent;
+ struct swap_extent first_swap_extent;
+ struct block_device *bdev; /* swap device or bdev of swap file */
+ struct file *swap_file; /* seldom referenced */
+ unsigned int old_block_size; /* seldom referenced */
};

struct swap_list_t {
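
To see where the reorder puts the hot fields, a small userspace check
(using a trimmed stand-in struct, since the real one needs kernel
headers; offsets assume a 64-bit build with 64-byte cachelines):

	#include <stdio.h>
	#include <stddef.h>

	/* trimmed stand-in for the reordered swap_info_struct */
	struct swap_info_model {
		unsigned long	flags;
		signed short	prio;
		signed char	type;
		signed char	next;
		unsigned int	max;
		unsigned char	*swap_map;
		unsigned int	lowest_bit;
		unsigned int	highest_bit;
	};

	int main(void)
	{
		printf("max      at offset %zu\n",
		       offsetof(struct swap_info_model, max));
		printf("swap_map at offset %zu\n",
		       offsetof(struct swap_info_model, swap_map));
		/* both land well inside the first 64-byte cacheline */
		return 0;
	}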

2009-10-15 02:14:15

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 2/9] swap_info: change to array of pointers

On Thu, 15 Oct 2009 01:48:01 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> The swap_info_struct is only 76 or 104 bytes, but it does seem wrong
> to reserve an array of about 30 of them in bss, when most people will
> want only one. Change swap_info[] to an array of pointers.
>
> That does need a "type" field in the structure: pack it as a char with
> next type and short prio (aha, char is unsigned by default on PowerPC).
> Use the (admittedly peculiar) name "type" throughout for this index.
>
> /proc/swaps does not take swap_lock: I wouldn't want it to, but do take
> care with barriers when adding a new item to the array (never removed).
>
> Signed-off-by: Hugh Dickins <[email protected]>
> ---
>
> include/linux/swap.h | 7 -
> mm/swapfile.c | 204 ++++++++++++++++++++++-------------------
> 2 files changed, 117 insertions(+), 94 deletions(-)
>
> --- si1/include/linux/swap.h 2009-10-14 21:25:58.000000000 +0100
> +++ si2/include/linux/swap.h 2009-10-14 21:26:09.000000000 +0100
> @@ -159,9 +159,10 @@ enum {
> * The in-memory structure used to track swap areas.
> */
> struct swap_info_struct {
> - unsigned long flags;
> - int prio; /* swap priority */
> - int next; /* next entry on swap list */
> + unsigned long flags; /* SWP_USED etc: see above */
> + signed short prio; /* swap priority of this type */
> + signed char type; /* strange name for an index */
> + signed char next; /* next type on the swap list */
> struct file *swap_file;
> struct block_device *bdev;
> struct list_head extent_list;
> --- si1/mm/swapfile.c 2009-10-14 21:25:58.000000000 +0100
> +++ si2/mm/swapfile.c 2009-10-14 21:26:09.000000000 +0100
> @@ -49,7 +49,7 @@ static const char Unused_offset[] = "Unu
>
> static struct swap_list_t swap_list = {-1, -1};
>
> -static struct swap_info_struct swap_info[MAX_SWAPFILES];
> +static struct swap_info_struct *swap_info[MAX_SWAPFILES];
>

Could you add a comment like this?
==
nr_swapfiles is never decreased.
A swap_info[type] pointer, once it has become valid, never becomes invalid.

	for (i = 0; i < nr_swapfiles; i++) {
		smp_rmb();
		sis = swap_info[i];
		....
	}

Then we can execute the above without checking whether sis is valid.
smp_rmb() is required when we run this loop without swap_lock;
swapon_mutex will be no help.

Whether sis is in use can be detected from sis->flags.
==
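
In sketch form, the ordering contract being requested here (condensed
from this patch's own hunks, quoted further below; illustrative only):

	/* writer: swapon(), under swap_lock */
	swap_info[type] = p;	/* publish the new entry first */
	smp_wmb();		/* pairs with smp_rmb() in the readers */
	nr_swapfiles++;

	/* readers: procfs swap_start()/swap_next(), without swap_lock */
	for (type = 0; type < nr_swapfiles; type++) {
		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
		sis = swap_info[type];
		if (!(sis->flags & SWP_USED) || !sis->swap_map)
			continue;
		/* sis is never stale here: entries are never removed */
	}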

I don't trust my eyes very much... but

Reviewed-by: KAMEZAWA Hiroyuki <[email protected]>


> static DEFINE_MUTEX(swapon_mutex);
>
> @@ -79,12 +79,11 @@ static inline unsigned short encode_swap
> return ret;
> }
>
> -/* returnes 1 if swap entry is freed */
> +/* returns 1 if swap entry is freed */
> static int
> __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
> {
> - int type = si - swap_info;
> - swp_entry_t entry = swp_entry(type, offset);
> + swp_entry_t entry = swp_entry(si->type, offset);
> struct page *page;
> int ret = 0;
>
> @@ -120,7 +119,7 @@ void swap_unplug_io_fn(struct backing_de
> down_read(&swap_unplug_sem);
> entry.val = page_private(page);
> if (PageSwapCache(page)) {
> - struct block_device *bdev = swap_info[swp_type(entry)].bdev;
> + struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
> struct backing_dev_info *bdi;
>
> /*
> @@ -467,10 +466,10 @@ swp_entry_t get_swap_page(void)
> nr_swap_pages--;
>
> for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
> - si = swap_info + type;
> + si = swap_info[type];
> next = si->next;
> if (next < 0 ||
> - (!wrapped && si->prio != swap_info[next].prio)) {
> + (!wrapped && si->prio != swap_info[next]->prio)) {
> next = swap_list.head;
> wrapped++;
> }
> @@ -503,8 +502,8 @@ swp_entry_t get_swap_page_of_type(int ty
> pgoff_t offset;
>
> spin_lock(&swap_lock);
> - si = swap_info + type;
> - if (si->flags & SWP_WRITEOK) {
> + si = swap_info[type];
> + if (si && (si->flags & SWP_WRITEOK)) {
> nr_swap_pages--;
> /* This is called for allocating swap entry, not cache */
> offset = scan_swap_map(si, SWAP_MAP);
> @@ -528,7 +527,7 @@ static struct swap_info_struct * swap_in
> type = swp_type(entry);
> if (type >= nr_swapfiles)
> goto bad_nofile;
> - p = & swap_info[type];
> + p = swap_info[type];
> if (!(p->flags & SWP_USED))
> goto bad_device;
> offset = swp_offset(entry);
> @@ -581,8 +580,9 @@ static int swap_entry_free(struct swap_i
> p->lowest_bit = offset;
> if (offset > p->highest_bit)
> p->highest_bit = offset;
> - if (p->prio > swap_info[swap_list.next].prio)
> - swap_list.next = p - swap_info;
> + if (swap_list.next >= 0 &&
> + p->prio > swap_info[swap_list.next]->prio)
> + swap_list.next = p->type;
> nr_swap_pages++;
> p->inuse_pages--;
> }
> @@ -741,14 +741,14 @@ int free_swap_and_cache(swp_entry_t entr
> int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
> {
> struct block_device *bdev = NULL;
> - int i;
> + int type;
>
> if (device)
> bdev = bdget(device);
>
> spin_lock(&swap_lock);
> - for (i = 0; i < nr_swapfiles; i++) {
> - struct swap_info_struct *sis = swap_info + i;
> + for (type = 0; type < nr_swapfiles; type++) {
> + struct swap_info_struct *sis = swap_info[type];
>
> if (!(sis->flags & SWP_WRITEOK))
> continue;
> @@ -758,7 +758,7 @@ int swap_type_of(dev_t device, sector_t
> *bdev_p = bdgrab(sis->bdev);
>
> spin_unlock(&swap_lock);
> - return i;
> + return type;
> }
> if (bdev == sis->bdev) {
> struct swap_extent *se;
> @@ -771,7 +771,7 @@ int swap_type_of(dev_t device, sector_t
>
> spin_unlock(&swap_lock);
> bdput(bdev);
> - return i;
> + return type;
> }
> }
> }
> @@ -792,15 +792,17 @@ unsigned int count_swap_pages(int type,
> {
> unsigned int n = 0;
>
> - if (type < nr_swapfiles) {
> - spin_lock(&swap_lock);
> - if (swap_info[type].flags & SWP_WRITEOK) {
> - n = swap_info[type].pages;
> + spin_lock(&swap_lock);
> + if ((unsigned int)type < nr_swapfiles) {
> + struct swap_info_struct *sis = swap_info[type];
> +
> + if (sis->flags & SWP_WRITEOK) {
> + n = sis->pages;
> if (free)
> - n -= swap_info[type].inuse_pages;
> + n -= sis->inuse_pages;
> }
> - spin_unlock(&swap_lock);
> }
> + spin_unlock(&swap_lock);
> return n;
> }
> #endif
> @@ -1024,7 +1026,7 @@ static unsigned int find_next_to_unuse(s
> */
> static int try_to_unuse(unsigned int type)
> {
> - struct swap_info_struct * si = &swap_info[type];
> + struct swap_info_struct *si = swap_info[type];
> struct mm_struct *start_mm;
> unsigned short *swap_map;
> unsigned short swcount;
> @@ -1271,10 +1273,10 @@ retry:
> static void drain_mmlist(void)
> {
> struct list_head *p, *next;
> - unsigned int i;
> + unsigned int type;
>
> - for (i = 0; i < nr_swapfiles; i++)
> - if (swap_info[i].inuse_pages)
> + for (type = 0; type < nr_swapfiles; type++)
> + if (swap_info[type]->inuse_pages)
> return;
> spin_lock(&mmlist_lock);
> list_for_each_safe(p, next, &init_mm.mmlist)
> @@ -1294,7 +1296,7 @@ sector_t map_swap_page(swp_entry_t entry
> struct swap_extent *se;
> pgoff_t offset;
>
> - sis = swap_info + swp_type(entry);
> + sis = swap_info[swp_type(entry)];
> *bdev = sis->bdev;
>
> offset = swp_offset(entry);
> @@ -1322,17 +1324,15 @@ sector_t map_swap_page(swp_entry_t entry
> * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
> * corresponding to given index in swap_info (swap type).
> */
> -sector_t swapdev_block(int swap_type, pgoff_t offset)
> +sector_t swapdev_block(int type, pgoff_t offset)
> {
> - struct swap_info_struct *sis;
> struct block_device *bdev;
>
> - if (swap_type >= nr_swapfiles)
> + if ((unsigned int)type >= nr_swapfiles)
> return 0;
> -
> - sis = swap_info + swap_type;
> - return (sis->flags & SWP_WRITEOK) ?
> - map_swap_page(swp_entry(swap_type, offset), &bdev) : 0;
> + if (!(swap_info[type]->flags & SWP_WRITEOK))
> + return 0;
> + return map_swap_page(swp_entry(type, offset), &bdev);
> }
> #endif /* CONFIG_HIBERNATION */
>
> @@ -1548,8 +1548,8 @@ SYSCALL_DEFINE1(swapoff, const char __us
> mapping = victim->f_mapping;
> prev = -1;
> spin_lock(&swap_lock);
> - for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
> - p = swap_info + type;
> + for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
> + p = swap_info[type];
> if (p->flags & SWP_WRITEOK) {
> if (p->swap_file->f_mapping == mapping)
> break;
> @@ -1568,18 +1568,17 @@ SYSCALL_DEFINE1(swapoff, const char __us
> spin_unlock(&swap_lock);
> goto out_dput;
> }
> - if (prev < 0) {
> + if (prev < 0)
> swap_list.head = p->next;
> - } else {
> - swap_info[prev].next = p->next;
> - }
> + else
> + swap_info[prev]->next = p->next;
> if (type == swap_list.next) {
> /* just pick something that's safe... */
> swap_list.next = swap_list.head;
> }
> if (p->prio < 0) {
> - for (i = p->next; i >= 0; i = swap_info[i].next)
> - swap_info[i].prio = p->prio--;
> + for (i = p->next; i >= 0; i = swap_info[i]->next)
> + swap_info[i]->prio = p->prio--;
> least_priority++;
> }
> nr_swap_pages -= p->pages;
> @@ -1597,16 +1596,16 @@ SYSCALL_DEFINE1(swapoff, const char __us
> if (p->prio < 0)
> p->prio = --least_priority;
> prev = -1;
> - for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
> - if (p->prio >= swap_info[i].prio)
> + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
> + if (p->prio >= swap_info[i]->prio)
> break;
> prev = i;
> }
> p->next = i;
> if (prev < 0)
> - swap_list.head = swap_list.next = p - swap_info;
> + swap_list.head = swap_list.next = type;
> else
> - swap_info[prev].next = p - swap_info;
> + swap_info[prev]->next = type;
> nr_swap_pages += p->pages;
> total_swap_pages += p->pages;
> p->flags |= SWP_WRITEOK;
> @@ -1666,8 +1665,8 @@ out:
> /* iterator */
> static void *swap_start(struct seq_file *swap, loff_t *pos)
> {
> - struct swap_info_struct *ptr = swap_info;
> - int i;
> + struct swap_info_struct *si;
> + int type;
> loff_t l = *pos;
>
> mutex_lock(&swapon_mutex);
> @@ -1675,11 +1674,13 @@ static void *swap_start(struct seq_file
> if (!l)
> return SEQ_START_TOKEN;
>
> - for (i = 0; i < nr_swapfiles; i++, ptr++) {
> - if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
> + for (type = 0; type < nr_swapfiles; type++) {
> + smp_rmb(); /* read nr_swapfiles before swap_info[type] */
> + si = swap_info[type];

if (!si)? (don't we need a NULL check here?)

> + if (!(si->flags & SWP_USED) || !si->swap_map)
> continue;
> if (!--l)
> - return ptr;
> + return si;
> }
>
> return NULL;
> @@ -1687,21 +1688,21 @@ static void *swap_start(struct seq_file
>
> static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
> {
> - struct swap_info_struct *ptr;
> - struct swap_info_struct *endptr = swap_info + nr_swapfiles;
> + struct swap_info_struct *si = v;
> + int type;
>
> if (v == SEQ_START_TOKEN)
> - ptr = swap_info;
> - else {
> - ptr = v;
> - ptr++;
> - }
> + type = 0;
> + else
> + type = si->type + 1;
>
> - for (; ptr < endptr; ptr++) {
> - if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
> + for (; type < nr_swapfiles; type++) {
> + smp_rmb(); /* read nr_swapfiles before swap_info[type] */
> + si = swap_info[type];
> + if (!(si->flags & SWP_USED) || !si->swap_map)
> continue;
> ++*pos;
> - return ptr;
> + return si;
> }
>
> return NULL;
> @@ -1714,24 +1715,24 @@ static void swap_stop(struct seq_file *s
>
> static int swap_show(struct seq_file *swap, void *v)
> {
> - struct swap_info_struct *ptr = v;
> + struct swap_info_struct *si = v;
> struct file *file;
> int len;
>
> - if (ptr == SEQ_START_TOKEN) {
> + if (si == SEQ_START_TOKEN) {
> seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
> return 0;
> }
>
> - file = ptr->swap_file;
> + file = si->swap_file;
> len = seq_path(swap, &file->f_path, " \t\n\\");
> seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
> len < 40 ? 40 - len : 1, " ",
> S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
> "partition" : "file\t",
> - ptr->pages << (PAGE_SHIFT - 10),
> - ptr->inuse_pages << (PAGE_SHIFT - 10),
> - ptr->prio);
> + si->pages << (PAGE_SHIFT - 10),
> + si->inuse_pages << (PAGE_SHIFT - 10),
> + si->prio);
> return 0;
> }
>
> @@ -1799,23 +1800,45 @@ SYSCALL_DEFINE2(swapon, const char __use
>
> if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> +
> + p = kzalloc(sizeof(*p), GFP_KERNEL);
> + if (!p)
> + return -ENOMEM;
> +
> spin_lock(&swap_lock);
> - p = swap_info;
> - for (type = 0 ; type < nr_swapfiles ; type++,p++)
> - if (!(p->flags & SWP_USED))
> + for (type = 0; type < nr_swapfiles; type++) {
> + if (!(swap_info[type]->flags & SWP_USED))
> break;
> + }
> error = -EPERM;
> if (type >= MAX_SWAPFILES) {
> spin_unlock(&swap_lock);
> + kfree(p);
> goto out;
> }
> - if (type >= nr_swapfiles)
> - nr_swapfiles = type+1;
> - memset(p, 0, sizeof(*p));
> INIT_LIST_HEAD(&p->extent_list);
> + if (type >= nr_swapfiles) {
> + p->type = type;
> + swap_info[type] = p;
> + /*
> + * Write swap_info[type] before nr_swapfiles, in case a
> + * racing procfs swap_start() or swap_next() is reading them.
> + * (We never shrink nr_swapfiles, we never free this entry.)
> + */
> + smp_wmb();
> + nr_swapfiles++;
> + } else {
> + kfree(p);
> + p = swap_info[type];
> + /*
> + * Do not memset this entry: a racing procfs swap_next()
> + * would be relying on p->type to remain valid.
> + */
> + }
> p->flags = SWP_USED;
> p->next = -1;
> spin_unlock(&swap_lock);
> +
> name = getname(specialfile);
> error = PTR_ERR(name);
> if (IS_ERR(name)) {
> @@ -1835,7 +1858,7 @@ SYSCALL_DEFINE2(swapon, const char __use
>
> error = -EBUSY;
> for (i = 0; i < nr_swapfiles; i++) {
> - struct swap_info_struct *q = &swap_info[i];
> + struct swap_info_struct *q = swap_info[i];
>
> if (i == type || !q->swap_file)
> continue;
> @@ -1910,6 +1933,7 @@ SYSCALL_DEFINE2(swapon, const char __use
>
> p->lowest_bit = 1;
> p->cluster_next = 1;
> + p->cluster_nr = 0;
>
> /*
> * Find out how many pages are allowed for a single swap
> @@ -2016,18 +2040,16 @@ SYSCALL_DEFINE2(swapon, const char __use
>
> /* insert swap space into swap_list: */
> prev = -1;
> - for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
> - if (p->prio >= swap_info[i].prio) {
> + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
> + if (p->prio >= swap_info[i]->prio)
> break;
> - }
> prev = i;
> }
> p->next = i;
> - if (prev < 0) {
> - swap_list.head = swap_list.next = p - swap_info;
> - } else {
> - swap_info[prev].next = p - swap_info;
> - }
> + if (prev < 0)
> + swap_list.head = swap_list.next = type;
> + else
> + swap_info[prev]->next = type;
> spin_unlock(&swap_lock);
> mutex_unlock(&swapon_mutex);
> error = 0;
> @@ -2064,15 +2086,15 @@ out:
>
> void si_swapinfo(struct sysinfo *val)
> {
> - unsigned int i;
> + unsigned int type;
> unsigned long nr_to_be_unused = 0;
>
> spin_lock(&swap_lock);
> - for (i = 0; i < nr_swapfiles; i++) {
> - if (!(swap_info[i].flags & SWP_USED) ||
> - (swap_info[i].flags & SWP_WRITEOK))
> - continue;
> - nr_to_be_unused += swap_info[i].inuse_pages;
> + for (type = 0; type < nr_swapfiles; type++) {
> + struct swap_info_struct *si = swap_info[type];
> +
> + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
> + nr_to_be_unused += si->inuse_pages;
> }
> val->freeswap = nr_swap_pages + nr_to_be_unused;
> val->totalswap = total_swap_pages + nr_to_be_unused;
> @@ -2105,7 +2127,7 @@ static int __swap_duplicate(swp_entry_t
> type = swp_type(entry);
> if (type >= nr_swapfiles)
> goto bad_file;
> - p = type + swap_info;
> + p = swap_info[type];
> offset = swp_offset(entry);
>
> spin_lock(&swap_lock);
> @@ -2187,7 +2209,7 @@ int valid_swaphandles(swp_entry_t entry,
> if (!our_page_cluster) /* no readahead */
> return 0;
>
> - si = &swap_info[swp_type(entry)];
> + si = swap_info[swp_type(entry)];
> target = swp_offset(entry);
> base = (target >> our_page_cluster) << our_page_cluster;
> end = base + (1 << our_page_cluster);

2009-10-15 02:22:36

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 4/9] swap_info: miscellaneous minor cleanups

On Thu, 15 Oct 2009 01:50:54 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> Move CONFIG_MIGRATION's swapdev_block() into the main CONFIG_MIGRATION
> block, remove extraneous whitespace and return, fix typo in a comment.
>
CONFIG_HIBERNATION?


> Signed-off-by: Hugh Dickins <[email protected]>

Reviewed-by: KAMEZAWA Hiroyuki <[email protected]>

> ---
>
> mm/swapfile.c | 51 ++++++++++++++++++++++--------------------------
> 1 file changed, 24 insertions(+), 27 deletions(-)
>
> --- si3/mm/swapfile.c 2009-10-14 21:26:22.000000000 +0100
> +++ si4/mm/swapfile.c 2009-10-14 21:26:32.000000000 +0100
> @@ -519,9 +519,9 @@ swp_entry_t get_swap_page_of_type(int ty
> return (swp_entry_t) {0};
> }
>
> -static struct swap_info_struct * swap_info_get(swp_entry_t entry)
> +static struct swap_info_struct *swap_info_get(swp_entry_t entry)
> {
> - struct swap_info_struct * p;
> + struct swap_info_struct *p;
> unsigned long offset, type;
>
> if (!entry.val)
> @@ -599,7 +599,7 @@ static int swap_entry_free(struct swap_i
> */
> void swap_free(swp_entry_t entry)
> {
> - struct swap_info_struct * p;
> + struct swap_info_struct *p;
>
> p = swap_info_get(entry);
> if (p) {
> @@ -629,7 +629,6 @@ void swapcache_free(swp_entry_t entry, s
> }
> spin_unlock(&swap_lock);
> }
> - return;
> }
>
> /*
> @@ -783,6 +782,21 @@ int swap_type_of(dev_t device, sector_t
> }
>
> /*
> + * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
> + * corresponding to given index in swap_info (swap type).
> + */
> +sector_t swapdev_block(int type, pgoff_t offset)
> +{
> + struct block_device *bdev;
> +
> + if ((unsigned int)type >= nr_swapfiles)
> + return 0;
> + if (!(swap_info[type]->flags & SWP_WRITEOK))
> + return 0;
> + return map_swap_page(swp_entry(type, offset), &bdev);
> +}
> +
> +/*
> * Return either the total number of swap pages of given type, or the number
> * of free pages of that type (depending on @free)
> *
> @@ -805,7 +819,7 @@ unsigned int count_swap_pages(int type,
> spin_unlock(&swap_lock);
> return n;
> }
> -#endif
> +#endif /* CONFIG_HIBERNATION */
>
> /*
> * No need to decide whether this PTE shares the swap entry with others,
> @@ -1317,23 +1331,6 @@ sector_t map_swap_page(swp_entry_t entry
> }
> }
>
> -#ifdef CONFIG_HIBERNATION
> -/*
> - * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
> - * corresponding to given index in swap_info (swap type).
> - */
> -sector_t swapdev_block(int type, pgoff_t offset)
> -{
> - struct block_device *bdev;
> -
> - if ((unsigned int)type >= nr_swapfiles)
> - return 0;
> - if (!(swap_info[type]->flags & SWP_WRITEOK))
> - return 0;
> - return map_swap_page(swp_entry(type, offset), &bdev);
> -}
> -#endif /* CONFIG_HIBERNATION */
> -
> /*
> * Free all of a swapdev's extent information
> */
> @@ -1525,12 +1522,12 @@ bad_bmap:
>
> SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> {
> - struct swap_info_struct * p = NULL;
> + struct swap_info_struct *p = NULL;
> unsigned short *swap_map;
> struct file *swap_file, *victim;
> struct address_space *mapping;
> struct inode *inode;
> - char * pathname;
> + char *pathname;
> int i, type, prev;
> int err;
>
> @@ -1782,7 +1779,7 @@ late_initcall(max_swapfiles_check);
> */
> SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> {
> - struct swap_info_struct * p;
> + struct swap_info_struct *p;
> char *name = NULL;
> struct block_device *bdev = NULL;
> struct file *swap_file = NULL;
> @@ -2117,7 +2114,7 @@ void si_swapinfo(struct sysinfo *val)
> */
> static int __swap_duplicate(swp_entry_t entry, bool cache)
> {
> - struct swap_info_struct * p;
> + struct swap_info_struct *p;
> unsigned long offset, type;
> int result = -EINVAL;
> int count;
> @@ -2186,7 +2183,7 @@ void swap_duplicate(swp_entry_t entry)
> /*
> * @entry: swap entry for which we allocate swap cache.
> *
> - * Called when allocating swap cache for exising swap entry,
> + * Called when allocating swap cache for existing swap entry,
> * This can return error codes. Returns 0 at success.
> * -EBUSY means there is a swap cache.
> * Note: return code is different from swap_duplicate().
>

2009-10-15 02:40:42

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 5/9] swap_info: SWAP_HAS_CACHE cleanups

On Thu, 15 Oct 2009 01:52:27 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> Though swap_count() is useful, I'm finding that swap_has_cache() and
> encode_swapmap() obscure what happens in the swap_map entry, just at
> those points where I need to understand it. Remove them, and pass
> more usable "usage" values to scan_swap_map(), swap_entry_free() and
> __swap_duplicate(), instead of the SWAP_MAP and SWAP_CACHE enum.
>
> Signed-off-by: Hugh Dickins <[email protected]>

I have no objections to the above.
I'll test later; most likely no trouble.

Reviewed-by: KAMEZAWA Hiroyuki <[email protected]>


> ---
>
> include/linux/swap.h | 2
> mm/swapfile.c | 155 ++++++++++++++++-------------------------
> 2 files changed, 65 insertions(+), 92 deletions(-)
>
> --- si4/include/linux/swap.h 2009-10-14 21:26:22.000000000 +0100
> +++ si5/include/linux/swap.h 2009-10-14 21:26:42.000000000 +0100
> @@ -154,7 +154,7 @@ enum {
> #define SWAP_MAP_MAX 0x7ffe
> #define SWAP_MAP_BAD 0x7fff
> #define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */
> -#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE)
> +
> /*
> * The in-memory structure used to track swap areas.
> */
> --- si4/mm/swapfile.c 2009-10-14 21:26:32.000000000 +0100
> +++ si5/mm/swapfile.c 2009-10-14 21:26:42.000000000 +0100
> @@ -53,30 +53,9 @@ static struct swap_info_struct *swap_inf
>
> static DEFINE_MUTEX(swapon_mutex);
>
> -/* For reference count accounting in swap_map */
> -/* enum for swap_map[] handling. internal use only */
> -enum {
> - SWAP_MAP = 0, /* ops for reference from swap users */
> - SWAP_CACHE, /* ops for reference from swap cache */
> -};
> -
> static inline int swap_count(unsigned short ent)
> {
> - return ent & SWAP_COUNT_MASK;
> -}
> -
> -static inline bool swap_has_cache(unsigned short ent)
> -{
> - return !!(ent & SWAP_HAS_CACHE);
> -}
> -
> -static inline unsigned short encode_swapmap(int count, bool has_cache)
> -{
> - unsigned short ret = count;
> -
> - if (has_cache)
> - return SWAP_HAS_CACHE | ret;
> - return ret;
> + return ent & ~SWAP_HAS_CACHE;
> }
>
> /* returns 1 if swap entry is freed */
> @@ -224,7 +203,7 @@ static int wait_for_discard(void *word)
> #define LATENCY_LIMIT 256
>
> static inline unsigned long scan_swap_map(struct swap_info_struct *si,
> - int cache)
> + unsigned short usage)
> {
> unsigned long offset;
> unsigned long scan_base;
> @@ -355,10 +334,7 @@ checks:
> si->lowest_bit = si->max;
> si->highest_bit = 0;
> }
> - if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
> - si->swap_map[offset] = encode_swapmap(0, true);
> - else /* at suspend */
> - si->swap_map[offset] = encode_swapmap(1, false);
> + si->swap_map[offset] = usage;
> si->cluster_next = offset + 1;
> si->flags -= SWP_SCANNING;
>
> @@ -483,7 +459,7 @@ swp_entry_t get_swap_page(void)
>
> swap_list.next = next;
> /* This is called for allocating swap entry for cache */
> - offset = scan_swap_map(si, SWAP_CACHE);
> + offset = scan_swap_map(si, SWAP_HAS_CACHE);
> if (offset) {
> spin_unlock(&swap_lock);
> return swp_entry(type, offset);
> @@ -508,7 +484,7 @@ swp_entry_t get_swap_page_of_type(int ty
> if (si && (si->flags & SWP_WRITEOK)) {
> nr_swap_pages--;
> /* This is called for allocating swap entry, not cache */
> - offset = scan_swap_map(si, SWAP_MAP);
> + offset = scan_swap_map(si, 1);
> if (offset) {
> spin_unlock(&swap_lock);
> return swp_entry(type, offset);
> @@ -555,29 +531,31 @@ out:
> return NULL;
> }
>
> -static int swap_entry_free(struct swap_info_struct *p,
> - swp_entry_t ent, int cache)
> +static unsigned short swap_entry_free(struct swap_info_struct *p,
> + swp_entry_t entry, unsigned short usage)
> {
> - unsigned long offset = swp_offset(ent);
> - int count = swap_count(p->swap_map[offset]);
> - bool has_cache;
> + unsigned long offset = swp_offset(entry);
> + unsigned short count;
> + unsigned short has_cache;
>
> - has_cache = swap_has_cache(p->swap_map[offset]);
> + count = p->swap_map[offset];
> + has_cache = count & SWAP_HAS_CACHE;
> + count &= ~SWAP_HAS_CACHE;
>
> - if (cache == SWAP_MAP) { /* dropping usage count of swap */
> - if (count < SWAP_MAP_MAX) {
> - count--;
> - p->swap_map[offset] = encode_swapmap(count, has_cache);
> - }
> - } else { /* dropping swap cache flag */
> + if (usage == SWAP_HAS_CACHE) {
> VM_BUG_ON(!has_cache);
> - p->swap_map[offset] = encode_swapmap(count, false);
> + has_cache = 0;
> + } else if (count < SWAP_MAP_MAX)
> + count--;
> +
> + if (!count)
> + mem_cgroup_uncharge_swap(entry);
> +
> + usage = count | has_cache;
> + p->swap_map[offset] = usage;
>
> - }
> - /* return code. */
> - count = p->swap_map[offset];
> /* free if no reference */
> - if (!count) {
> + if (!usage) {
> if (offset < p->lowest_bit)
> p->lowest_bit = offset;
> if (offset > p->highest_bit)
> @@ -588,9 +566,8 @@ static int swap_entry_free(struct swap_i
> nr_swap_pages++;
> p->inuse_pages--;
> }
> - if (!swap_count(count))
> - mem_cgroup_uncharge_swap(ent);
> - return count;
> +
> + return usage;
> }
>
> /*
> @@ -603,7 +580,7 @@ void swap_free(swp_entry_t entry)
>
> p = swap_info_get(entry);
> if (p) {
> - swap_entry_free(p, entry, SWAP_MAP);
> + swap_entry_free(p, entry, 1);
> spin_unlock(&swap_lock);
> }
> }
> @@ -614,19 +591,13 @@ void swap_free(swp_entry_t entry)
> void swapcache_free(swp_entry_t entry, struct page *page)
> {
> struct swap_info_struct *p;
> - int ret;
> + unsigned short count;
>
> p = swap_info_get(entry);
> if (p) {
> - ret = swap_entry_free(p, entry, SWAP_CACHE);
> - if (page) {
> - bool swapout;
> - if (ret)
> - swapout = true; /* the end of swap out */
> - else
> - swapout = false; /* no more swap users! */
> - mem_cgroup_uncharge_swapcache(page, entry, swapout);
> - }
> + count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
> + if (page)
> + mem_cgroup_uncharge_swapcache(page, entry, count != 0);
> spin_unlock(&swap_lock);
> }
> }
> @@ -705,7 +676,7 @@ int free_swap_and_cache(swp_entry_t entr
>
> p = swap_info_get(entry);
> if (p) {
> - if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
> + if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
> page = find_get_page(&swapper_space, entry.val);
> if (page && !trylock_page(page)) {
> page_cache_release(page);
> @@ -1213,7 +1184,7 @@ static int try_to_unuse(unsigned int typ
>
> if (swap_count(*swap_map) == SWAP_MAP_MAX) {
> spin_lock(&swap_lock);
> - *swap_map = encode_swapmap(0, true);
> + *swap_map = SWAP_HAS_CACHE;
> spin_unlock(&swap_lock);
> reset_overflow = 1;
> }
> @@ -2112,16 +2083,16 @@ void si_swapinfo(struct sysinfo *val)
> * - swap-cache reference is requested but there is already one. -> EEXIST
> * - swap-cache reference is requested but the entry is not used. -> ENOENT
> */
> -static int __swap_duplicate(swp_entry_t entry, bool cache)
> +static int __swap_duplicate(swp_entry_t entry, unsigned short usage)
> {
> struct swap_info_struct *p;
> unsigned long offset, type;
> - int result = -EINVAL;
> - int count;
> - bool has_cache;
> + unsigned short count;
> + unsigned short has_cache;
> + int err = -EINVAL;
>
> if (non_swap_entry(entry))
> - return -EINVAL;
> + goto out;
>
> type = swp_type(entry);
> if (type >= nr_swapfiles)
> @@ -2130,54 +2101,56 @@ static int __swap_duplicate(swp_entry_t
> offset = swp_offset(entry);
>
> spin_lock(&swap_lock);
> -
> if (unlikely(offset >= p->max))
> goto unlock_out;
>
> - count = swap_count(p->swap_map[offset]);
> - has_cache = swap_has_cache(p->swap_map[offset]);
> + count = p->swap_map[offset];
> + has_cache = count & SWAP_HAS_CACHE;
> + count &= ~SWAP_HAS_CACHE;
> + err = 0;
>
> - if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
> + if (usage == SWAP_HAS_CACHE) {
>
> /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> - if (!has_cache && count) {
> - p->swap_map[offset] = encode_swapmap(count, true);
> - result = 0;
> - } else if (has_cache) /* someone added cache */
> - result = -EEXIST;
> - else if (!count) /* no users */
> - result = -ENOENT;
> + if (!has_cache && count)
> + has_cache = SWAP_HAS_CACHE;
> + else if (has_cache) /* someone else added cache */
> + err = -EEXIST;
> + else /* no users remaining */
> + err = -ENOENT;
>
> } else if (count || has_cache) {
> - if (count < SWAP_MAP_MAX - 1) {
> - p->swap_map[offset] = encode_swapmap(count + 1,
> - has_cache);
> - result = 0;
> - } else if (count <= SWAP_MAP_MAX) {
> +
> + if (count < SWAP_MAP_MAX - 1)
> + count++;
> + else if (count <= SWAP_MAP_MAX) {
> if (swap_overflow++ < 5)
> printk(KERN_WARNING
> "swap_dup: swap entry overflow\n");
> - p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
> - has_cache);
> - result = 0;
> - }
> + count = SWAP_MAP_MAX;
> + } else
> + err = -EINVAL;
> } else
> - result = -ENOENT; /* unused swap entry */
> + err = -ENOENT; /* unused swap entry */
> +
> + p->swap_map[offset] = count | has_cache;
> +
> unlock_out:
> spin_unlock(&swap_lock);
> out:
> - return result;
> + return err;
>
> bad_file:
> printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
> goto out;
> }
> +
> /*
> * increase reference count of swap entry by 1.
> */
> void swap_duplicate(swp_entry_t entry)
> {
> - __swap_duplicate(entry, SWAP_MAP);
> + __swap_duplicate(entry, 1);
> }
>
> /*
> @@ -2190,7 +2163,7 @@ void swap_duplicate(swp_entry_t entry)
> */
> int swapcache_prepare(swp_entry_t entry)
> {
> - return __swap_duplicate(entry, SWAP_CACHE);
> + return __swap_duplicate(entry, SWAP_HAS_CACHE);
> }
>
> /*

2009-10-15 02:47:37

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 6/9] swap_info: swap_map of chars not shorts

On Thu, 15 Oct 2009 01:53:52 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> Halve the vmalloc'ed swap_map array from unsigned shorts to unsigned
> chars: it's still very unusual to reach a swap count of 126, and the
> next patch allows it to be extended indefinitely.
>
> Signed-off-by: Hugh Dickins <[email protected]>

> ---
>
> include/linux/swap.h | 8 ++++----
> mm/swapfile.c | 40 +++++++++++++++++++++++-----------------
> 2 files changed, 27 insertions(+), 21 deletions(-)
>
> --- si5/include/linux/swap.h 2009-10-14 21:26:42.000000000 +0100
> +++ si6/include/linux/swap.h 2009-10-14 21:26:49.000000000 +0100
> @@ -151,9 +151,9 @@ enum {
>
> #define SWAP_CLUSTER_MAX 32
>
> -#define SWAP_MAP_MAX 0x7ffe
> -#define SWAP_MAP_BAD 0x7fff
> -#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */
> +#define SWAP_MAP_MAX 0x7e
> +#define SWAP_MAP_BAD 0x7f
> +#define SWAP_HAS_CACHE 0x80 /* There is a swap cache of entry. */
>
> /*
> * The in-memory structure used to track swap areas.
> @@ -167,7 +167,7 @@ struct swap_info_struct {
> struct block_device *bdev;
> struct swap_extent first_swap_extent;
> struct swap_extent *curr_swap_extent;
> - unsigned short *swap_map;
> + unsigned char *swap_map;
> unsigned int lowest_bit;
> unsigned int highest_bit;
> unsigned int lowest_alloc; /* while preparing discard cluster */
> --- si5/mm/swapfile.c 2009-10-14 21:26:42.000000000 +0100
> +++ si6/mm/swapfile.c 2009-10-14 21:26:49.000000000 +0100
> @@ -53,7 +53,7 @@ static struct swap_info_struct *swap_inf
>
> static DEFINE_MUTEX(swapon_mutex);
>
> -static inline int swap_count(unsigned short ent)
> +static inline unsigned char swap_count(unsigned char ent)
> {
> return ent & ~SWAP_HAS_CACHE;
> }
> @@ -203,7 +203,7 @@ static int wait_for_discard(void *word)
> #define LATENCY_LIMIT 256
>
> static inline unsigned long scan_swap_map(struct swap_info_struct *si,
> - unsigned short usage)
> + unsigned char usage)
> {
> unsigned long offset;
> unsigned long scan_base;
> @@ -531,12 +531,12 @@ out:
> return NULL;
> }
>
> -static unsigned short swap_entry_free(struct swap_info_struct *p,
> - swp_entry_t entry, unsigned short usage)
> +static unsigned char swap_entry_free(struct swap_info_struct *p,
> + swp_entry_t entry, unsigned char usage)
> {
> unsigned long offset = swp_offset(entry);
> - unsigned short count;
> - unsigned short has_cache;
> + unsigned char count;
> + unsigned char has_cache;
>
> count = p->swap_map[offset];
> has_cache = count & SWAP_HAS_CACHE;
> @@ -591,7 +591,7 @@ void swap_free(swp_entry_t entry)
> void swapcache_free(swp_entry_t entry, struct page *page)
> {
> struct swap_info_struct *p;
> - unsigned short count;
> + unsigned char count;
>
> p = swap_info_get(entry);
> if (p) {
> @@ -975,7 +975,7 @@ static unsigned int find_next_to_unuse(s
> {
> unsigned int max = si->max;
> unsigned int i = prev;
> - int count;
> + unsigned char count;
>
> /*
> * No need for swap_lock here: we're just looking
> @@ -1013,8 +1013,8 @@ static int try_to_unuse(unsigned int typ
> {
> struct swap_info_struct *si = swap_info[type];
> struct mm_struct *start_mm;
> - unsigned short *swap_map;
> - unsigned short swcount;
> + unsigned char *swap_map;
> + unsigned char swcount;
> struct page *page;
> swp_entry_t entry;
> unsigned int i = 0;
> @@ -1175,6 +1175,12 @@ static int try_to_unuse(unsigned int typ
> * If that's wrong, then we should worry more about
> * exit_mmap() and do_munmap() cases described above:
> * we might be resetting SWAP_MAP_MAX too early here.
> + *
> + * Yes, that's wrong: though very unlikely, swap count 0x7ffe
> + * could surely occur if pid_max raised from PID_MAX_DEFAULT;

Just a nitpick.

Hmm, logically, our max count is 0x7e after this patch. Then how about
not mentioning 0x7ffe and PID_MAX? Something like:

Yes, that's wrong: we now use 0x7e as SWAP_MAP_MAX, which is very easy
to overflow. The next patch will...


Reviewed-by: KAMEZAWA Hiroyuki <[email protected]>

> + * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
> + * much easier to reach. But the next patch will fix that.
> + *
> * We know "Undead"s can happen, they're okay, so don't
> * report them; but do report if we reset SWAP_MAP_MAX.
> */
> @@ -1494,7 +1500,7 @@ bad_bmap:
> SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> {
> struct swap_info_struct *p = NULL;
> - unsigned short *swap_map;
> + unsigned char *swap_map;
> struct file *swap_file, *victim;
> struct address_space *mapping;
> struct inode *inode;
> @@ -1764,7 +1770,7 @@ SYSCALL_DEFINE2(swapon, const char __use
> sector_t span;
> unsigned long maxpages = 1;
> unsigned long swapfilepages;
> - unsigned short *swap_map = NULL;
> + unsigned char *swap_map = NULL;
> struct page *page = NULL;
> struct inode *inode = NULL;
> int did_down = 0;
> @@ -1939,13 +1945,13 @@ SYSCALL_DEFINE2(swapon, const char __use
> goto bad_swap;
>
> /* OK, set up the swap map and apply the bad block list */
> - swap_map = vmalloc(maxpages * sizeof(short));
> + swap_map = vmalloc(maxpages);
> if (!swap_map) {
> error = -ENOMEM;
> goto bad_swap;
> }
>
> - memset(swap_map, 0, maxpages * sizeof(short));
> + memset(swap_map, 0, maxpages);
> for (i = 0; i < swap_header->info.nr_badpages; i++) {
> int page_nr = swap_header->info.badpages[i];
> if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
> @@ -2083,12 +2089,12 @@ void si_swapinfo(struct sysinfo *val)
> * - swap-cache reference is requested but there is already one. -> EEXIST
> * - swap-cache reference is requested but the entry is not used. -> ENOENT
> */
> -static int __swap_duplicate(swp_entry_t entry, unsigned short usage)
> +static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> {
> struct swap_info_struct *p;
> unsigned long offset, type;
> - unsigned short count;
> - unsigned short has_cache;
> + unsigned char count;
> + unsigned char has_cache;
> int err = -EINVAL;
>
> if (non_swap_entry(entry))
>

2009-10-15 03:33:34

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 7/9] swap_info: swap count continuations

On Thu, 15 Oct 2009 01:56:01 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> Swap is duplicated (reference count incremented by one) whenever the same
> swap page is inserted into another mm (when forking finds a swap entry in
> place of a pte, or when reclaim unmaps a pte to insert the swap entry).
>
> swap_info_struct's vmalloc'ed swap_map is the array of these reference
> counts: but what happens when the unsigned short (or unsigned char since
> the preceding patch) is full? (and its high bit is kept for a cache flag)
>
> We then lose track of it, never freeing, leaving it in use until swapoff:
> at which point we _hope_ that a single pass will have found all instances,
> assume there are no more, and will lose user data if we're wrong.
>
> Swapping of KSM pages has not yet been enabled; but it is implemented,
> and makes it very easy for a user to overflow the maximum swap count:
> possible with ordinary process pages, but unlikely, even when pid_max
> has been raised from PID_MAX_DEFAULT.
>
> This patch implements swap count continuations: when the count overflows,
> a continuation page is allocated and linked to the original vmalloc'ed
> map page, and this used to hold the continuation counts for that entry
> and its neighbours. These continuation pages are seldom referenced:
> the common paths all work on the original swap_map, only referring to
> a continuation page when the low "digit" of a count is incremented or
> decremented through SWAP_MAP_MAX.
>
> Signed-off-by: Hugh Dickins <[email protected]>


Hmm... maybe I don't understand the benefit of this style of data structure.

Do we need such a fine-grained chain?
Would an array of "unsigned long" counters be bad? (too big?)

==
#define EXTENSION_MAP_ENTRIES	(PAGE_SIZE / sizeof(long))
#define EXTENSION_OFFSET_INDEX(offset) \
	(((offset) & ~PAGE_MASK) / EXTENSION_MAP_ENTRIES)
#define EXTENSION_OFFSET_POS(offset) \
	((offset) & (EXTENSION_MAP_ENTRIES - 1))

struct swapcount_extension_array {
	unsigned long *map[EXTEND_MAP_SIZE];
};

At adding a continuation:

int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct page *page, *mappage;
	unsigned long *newmap, *map;
	struct swapcount_extension_array *array;

	newmap = (unsigned long *)__get_free_page(gfp_mask);
	si = swap_info_get(entry);
	array = kmalloc(sizeof(*array), gfp_mask);

	....
	(If overflow)
	page = vmalloc_to_page(si->swap_map + offset);
	if (!PagePrivate(page))
		set_page_private(page, (unsigned long)array);
	else
		kfree(array);

	index = EXTENSION_OFFSET_INDEX(offset);
	pos = EXTENSION_OFFSET_POS(offset);

	array = (void *)page_private(page);
	if (!array->map[index])
		array->map[index] = newmap;
	else
		free_page((unsigned long)newmap);
	map = array->map[index];
	map[pos] += 1;
	mappage = virt_to_page(map);
	get_page(mappage);	/* increment page->count of the map page */
==

Hmm? Maybe I just don't like chains...

Regards,
-Kame


> ---
>
> include/linux/swap.h | 22 ++
> mm/memory.c | 19 ++
> mm/rmap.c | 6
> mm/swapfile.c | 304 +++++++++++++++++++++++++++++++++--------
> 4 files changed, 287 insertions(+), 64 deletions(-)
>
> --- si6/include/linux/swap.h 2009-10-14 21:26:49.000000000 +0100
> +++ si7/include/linux/swap.h 2009-10-14 21:26:57.000000000 +0100
> @@ -145,15 +145,18 @@ enum {
> SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
> SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
> SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
> + SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
> /* add others here before... */
> SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
> };
>
> #define SWAP_CLUSTER_MAX 32
>
> -#define SWAP_MAP_MAX 0x7e
> -#define SWAP_MAP_BAD 0x7f
> -#define SWAP_HAS_CACHE 0x80 /* There is a swap cache of entry. */
> +#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
> +#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
> +#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
> +#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */
> +#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */
>
> /*
> * The in-memory structure used to track swap areas.
> @@ -310,9 +313,10 @@ extern long total_swap_pages;
> extern void si_swapinfo(struct sysinfo *);
> extern swp_entry_t get_swap_page(void);
> extern swp_entry_t get_swap_page_of_type(int);
> -extern void swap_duplicate(swp_entry_t);
> -extern int swapcache_prepare(swp_entry_t);
> extern int valid_swaphandles(swp_entry_t, unsigned long *);
> +extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> +extern int swap_duplicate(swp_entry_t);
> +extern int swapcache_prepare(swp_entry_t);
> extern void swap_free(swp_entry_t);
> extern void swapcache_free(swp_entry_t, struct page *page);
> extern int free_swap_and_cache(swp_entry_t);
> @@ -384,8 +388,14 @@ static inline void show_swap_cache_info(
> #define free_swap_and_cache(swp) is_migration_entry(swp)
> #define swapcache_prepare(swp) is_migration_entry(swp)
>
> -static inline void swap_duplicate(swp_entry_t swp)
> +static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
> {
> + return 0;
> +}
> +
> +static inline int swap_duplicate(swp_entry_t swp)
> +{
> + return 0;
> }
>
> static inline void swap_free(swp_entry_t swp)
> --- si6/mm/memory.c 2009-09-28 00:28:41.000000000 +0100
> +++ si7/mm/memory.c 2009-10-14 21:26:57.000000000 +0100
> @@ -572,7 +572,7 @@ out:
> * covered by this vma.
> */
>
> -static inline void
> +static inline unsigned long
> copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
> unsigned long addr, int *rss)
> @@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> if (!pte_file(pte)) {
> swp_entry_t entry = pte_to_swp_entry(pte);
>
> - swap_duplicate(entry);
> + if (swap_duplicate(entry) < 0)
> + return entry.val;
> +
> /* make sure dst_mm is on swapoff's mmlist. */
> if (unlikely(list_empty(&dst_mm->mmlist))) {
> spin_lock(&mmlist_lock);
> @@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, s
>
> out_set_pte:
> set_pte_at(dst_mm, addr, dst_pte, pte);
> + return 0;
> }
>
> static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> @@ -645,6 +648,7 @@ static int copy_pte_range(struct mm_stru
> spinlock_t *src_ptl, *dst_ptl;
> int progress = 0;
> int rss[2];
> + swp_entry_t entry = (swp_entry_t){0};
>
> again:
> rss[1] = rss[0] = 0;
> @@ -671,7 +675,10 @@ again:
> progress++;
> continue;
> }
> - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
> + entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
> + vma, addr, rss);
> + if (entry.val)
> + break;
> progress += 8;
> } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
>
> @@ -681,6 +688,12 @@ again:
> add_mm_rss(dst_mm, rss[0], rss[1]);
> pte_unmap_unlock(dst_pte - 1, dst_ptl);
> cond_resched();
> +
> + if (entry.val) {
> + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
> + return -ENOMEM;
> + progress = 0;
> + }
> if (addr != end)
> goto again;
> return 0;
> --- si6/mm/rmap.c 2009-10-05 04:33:14.000000000 +0100
> +++ si7/mm/rmap.c 2009-10-14 21:26:57.000000000 +0100
> @@ -822,7 +822,11 @@ static int try_to_unmap_one(struct page
> * Store the swap location in the pte.
> * See handle_pte_fault() ...
> */
> - swap_duplicate(entry);
> + if (swap_duplicate(entry) < 0) {
> + set_pte_at(mm, address, pte, pteval);
> + ret = SWAP_FAIL;
> + goto out_unmap;
> + }
> if (list_empty(&mm->mmlist)) {
> spin_lock(&mmlist_lock);
> if (list_empty(&mm->mmlist))
> --- si6/mm/swapfile.c 2009-10-14 21:26:49.000000000 +0100
> +++ si7/mm/swapfile.c 2009-10-14 21:26:57.000000000 +0100
> @@ -35,11 +35,14 @@
> #include <linux/swapops.h>
> #include <linux/page_cgroup.h>
>
> +static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
> + unsigned char);
> +static void free_swap_count_continuations(struct swap_info_struct *);
> +
> static DEFINE_SPINLOCK(swap_lock);
> static unsigned int nr_swapfiles;
> long nr_swap_pages;
> long total_swap_pages;
> -static int swap_overflow;
> static int least_priority;
>
> static const char Bad_file[] = "Bad swap file entry ";
> @@ -55,7 +58,7 @@ static DEFINE_MUTEX(swapon_mutex);
>
> static inline unsigned char swap_count(unsigned char ent)
> {
> - return ent & ~SWAP_HAS_CACHE;
> + return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
> }
>
> /* returns 1 if swap entry is freed */
> @@ -545,8 +548,15 @@ static unsigned char swap_entry_free(str
> if (usage == SWAP_HAS_CACHE) {
> VM_BUG_ON(!has_cache);
> has_cache = 0;
> - } else if (count < SWAP_MAP_MAX)
> - count--;
> + } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
> + if (count == COUNT_CONTINUED) {
> + if (swap_count_continued(p, offset, count))
> + count = SWAP_MAP_MAX | COUNT_CONTINUED;
> + else
> + count = SWAP_MAP_MAX;
> + } else
> + count--;
> + }
>
> if (!count)
> mem_cgroup_uncharge_swap(entry);
> @@ -604,6 +614,8 @@ void swapcache_free(swp_entry_t entry, s
>
> /*
> * How many references to page are currently swapped out?
> + * This does not give an exact answer when swap count is continued,
> + * but does include the high COUNT_CONTINUED flag to allow for that.
> */
> static inline int page_swapcount(struct page *page)
> {
> @@ -1019,7 +1031,6 @@ static int try_to_unuse(unsigned int typ
> swp_entry_t entry;
> unsigned int i = 0;
> int retval = 0;
> - int reset_overflow = 0;
> int shmem;
>
> /*
> @@ -1034,8 +1045,7 @@ static int try_to_unuse(unsigned int typ
> * together, child after parent. If we race with dup_mmap(), we
> * prefer to resolve parent before child, lest we miss entries
> * duplicated after we scanned child: using last mm would invert
> - * that. Though it's only a serious concern when an overflowed
> - * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
> + * that.
> */
> start_mm = &init_mm;
> atomic_inc(&init_mm.mm_users);
> @@ -1166,36 +1176,6 @@ static int try_to_unuse(unsigned int typ
> }
>
> /*
> - * How could swap count reach 0x7ffe ?
> - * There's no way to repeat a swap page within an mm
> - * (except in shmem, where it's the shared object which takes
> - * the reference count)?
> - * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
> - * short is too small....)
> - * If that's wrong, then we should worry more about
> - * exit_mmap() and do_munmap() cases described above:
> - * we might be resetting SWAP_MAP_MAX too early here.
> - *
> - * Yes, that's wrong: though very unlikely, swap count 0x7ffe
> - * could surely occur if pid_max raised from PID_MAX_DEFAULT;
> - * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
> - * much easier to reach. But the next patch will fix that.
> - *
> - * We know "Undead"s can happen, they're okay, so don't
> - * report them; but do report if we reset SWAP_MAP_MAX.
> - */
> - /* We might release the lock_page() in unuse_mm(). */
> - if (!PageSwapCache(page) || page_private(page) != entry.val)
> - goto retry;
> -
> - if (swap_count(*swap_map) == SWAP_MAP_MAX) {
> - spin_lock(&swap_lock);
> - *swap_map = SWAP_HAS_CACHE;
> - spin_unlock(&swap_lock);
> - reset_overflow = 1;
> - }
> -
> - /*
> * If a reference remains (rare), we would like to leave
> * the page in the swap cache; but try_to_unmap could
> * then re-duplicate the entry once we drop page lock,
> @@ -1236,7 +1216,6 @@ static int try_to_unuse(unsigned int typ
> * mark page dirty so shrink_page_list will preserve it.
> */
> SetPageDirty(page);
> -retry:
> unlock_page(page);
> page_cache_release(page);
>
> @@ -1248,10 +1227,6 @@ retry:
> }
>
> mmput(start_mm);
> - if (reset_overflow) {
> - printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
> - swap_overflow = 0;
> - }
> return retval;
> }
>
> @@ -1595,6 +1570,9 @@ SYSCALL_DEFINE1(swapoff, const char __us
> up_write(&swap_unplug_sem);
>
> destroy_swap_extents(p);
> + if (p->flags & SWP_CONTINUED)
> + free_swap_count_continuations(p);
> +
> mutex_lock(&swapon_mutex);
> spin_lock(&swap_lock);
> drain_mmlist();
> @@ -2080,14 +2058,13 @@ void si_swapinfo(struct sysinfo *val)
> /*
> * Verify that a swap entry is valid and increment its swap map count.
> *
> - * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
> - * "permanent", but will be reclaimed by the next swapoff.
> * Returns error code in following case.
> * - success -> 0
> * - swp_entry is invalid -> EINVAL
> * - swp_entry is migration entry -> EINVAL
> * - swap-cache reference is requested but there is already one. -> EEXIST
> * - swap-cache reference is requested but the entry is not used. -> ENOENT
> + * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
> */
> static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> {
> @@ -2127,15 +2104,14 @@ static int __swap_duplicate(swp_entry_t
>
> } else if (count || has_cache) {
>
> - if (count < SWAP_MAP_MAX - 1)
> - count++;
> - else if (count <= SWAP_MAP_MAX) {
> - if (swap_overflow++ < 5)
> - printk(KERN_WARNING
> - "swap_dup: swap entry overflow\n");
> - count = SWAP_MAP_MAX;
> - } else
> + if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
> + count += usage;
> + else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
> err = -EINVAL;
> + else if (swap_count_continued(p, offset, count))
> + count = COUNT_CONTINUED;
> + else
> + err = -ENOMEM;
> } else
> err = -ENOENT; /* unused swap entry */
>
> @@ -2154,9 +2130,13 @@ bad_file:
> /*
> * increase reference count of swap entry by 1.
> */
> -void swap_duplicate(swp_entry_t entry)
> +int swap_duplicate(swp_entry_t entry)
> {
> - __swap_duplicate(entry, 1);
> + int err = 0;
> +
> + while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
> + err = add_swap_count_continuation(entry, GFP_ATOMIC);
> + return err;
> }
>
> /*
> @@ -2223,3 +2203,219 @@ int valid_swaphandles(swp_entry_t entry,
> *offset = ++toff;
> return nr_pages? ++nr_pages: 0;
> }
> +
> +/*
> + * add_swap_count_continuation - called when a swap count is duplicated
> + * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
> + * page of the original vmalloc'ed swap_map, to hold the continuation count
> + * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
> + * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
> + *
> + * These continuation pages are seldom referenced: the common paths all work
> + * on the original swap_map, only referring to a continuation page when the
> + * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
> + *
> + * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
> + * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
> + * can be called after dropping locks.
> + */
> +int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
> +{
> + struct swap_info_struct *si;
> + struct page *head;
> + struct page *page;
> + struct page *list_page;
> + pgoff_t offset;
> + unsigned char count;
> +
> + /*
> + * When debugging, it's easier to use __GFP_ZERO here; but it's better
> + * for latency not to zero a page while GFP_ATOMIC and holding locks.
> + */
> + page = alloc_page(gfp_mask | __GFP_HIGHMEM);
> +
> + si = swap_info_get(entry);
> + if (!si) {
> + /*
> + * An acceptable race has occurred since the failing
> + * __swap_duplicate(): the swap entry has been freed,
> + * perhaps even the whole swap_map cleared for swapoff.
> + */
> + goto outer;
> + }
> +
> + offset = swp_offset(entry);
> + count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
> +
> + if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
> + /*
> + * The higher the swap count, the more likely it is that tasks
> + * will race to add swap count continuation: we need to avoid
> + * over-provisioning.
> + */
> + goto out;
> + }
> +
> + if (!page) {
> + spin_unlock(&swap_lock);
> + return -ENOMEM;
> + }
> +
> + /*
> + * We are fortunate that although vmalloc_to_page uses pte_offset_map,
> + * no architecture is using highmem pages for kernel pagetables: so it
> + * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
> + */
> + head = vmalloc_to_page(si->swap_map + offset);
> + offset &= ~PAGE_MASK;
> +
> + /*
> + * Page allocation does not initialize the page's lru field,
> + * but it does always reset its private field.
> + */
> + if (!page_private(head)) {
> + BUG_ON(count & COUNT_CONTINUED);
> + INIT_LIST_HEAD(&head->lru);
> + set_page_private(head, SWP_CONTINUED);
> + si->flags |= SWP_CONTINUED;
> + }
> +
> + list_for_each_entry(list_page, &head->lru, lru) {
> + unsigned char *map;
> +
> + /*
> + * If the previous map said no continuation, but we've found
> + * a continuation page, free our allocation and use this one.
> + */
> + if (!(count & COUNT_CONTINUED))
> + goto out;
> +
> + map = kmap_atomic(list_page, KM_USER0) + offset;
> + count = *map;
> + kunmap_atomic(map, KM_USER0);
> +
> + /*
> + * If this continuation count now has some space in it,
> + * free our allocation and use this one.
> + */
> + if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
> + goto out;
> + }
> +
> + list_add_tail(&page->lru, &head->lru);
> + page = NULL; /* now it's attached, don't free it */
> +out:
> + spin_unlock(&swap_lock);
> +outer:
> + if (page)
> + __free_page(page);
> + return 0;
> +}
> +
> +/*
> + * swap_count_continued - when the original swap_map count is incremented
> + * from SWAP_MAP_MAX, check if there is already a continuation page to carry
> + * into, carry if so, or else fail until a new continuation page is allocated;
> + * when the original swap_map count is decremented from 0 with continuation,
> + * borrow from the continuation and report whether it still holds more.
> + * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
> + */
> +static bool swap_count_continued(struct swap_info_struct *si,
> + pgoff_t offset, unsigned char count)
> +{
> + struct page *head;
> + struct page *page;
> + unsigned char *map;
> +
> + head = vmalloc_to_page(si->swap_map + offset);
> + if (page_private(head) != SWP_CONTINUED) {
> + BUG_ON(count & COUNT_CONTINUED);
> + return false; /* need to add count continuation */
> + }
> +
> + offset &= ~PAGE_MASK;
> + page = list_entry(head->lru.next, struct page, lru);
> + map = kmap_atomic(page, KM_USER0) + offset;
> +
> + if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
> + goto init_map; /* jump over SWAP_CONT_MAX checks */
> +
> + if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
> + /*
> + * Think of how you add 1 to 999
> + */
> + while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
> + kunmap_atomic(map, KM_USER0);
> + page = list_entry(page->lru.next, struct page, lru);
> + BUG_ON(page == head);
> + map = kmap_atomic(page, KM_USER0) + offset;
> + }
> + if (*map == SWAP_CONT_MAX) {
> + kunmap_atomic(map, KM_USER0);
> + page = list_entry(page->lru.next, struct page, lru);
> + if (page == head)
> + return false; /* add count continuation */
> + map = kmap_atomic(page, KM_USER0) + offset;
> +init_map: *map = 0; /* we didn't zero the page */
> + }
> + *map += 1;
> + kunmap_atomic(map, KM_USER0);
> + page = list_entry(page->lru.prev, struct page, lru);
> + while (page != head) {
> + map = kmap_atomic(page, KM_USER0) + offset;
> + *map = COUNT_CONTINUED;
> + kunmap_atomic(map, KM_USER0);
> + page = list_entry(page->lru.prev, struct page, lru);
> + }
> + return true; /* incremented */
> +
> + } else { /* decrementing */
> + /*
> + * Think of how you subtract 1 from 1000
> + */
> + BUG_ON(count != COUNT_CONTINUED);
> + while (*map == COUNT_CONTINUED) {
> + kunmap_atomic(map, KM_USER0);
> + page = list_entry(page->lru.next, struct page, lru);
> + BUG_ON(page == head);
> + map = kmap_atomic(page, KM_USER0) + offset;
> + }
> + BUG_ON(*map == 0);
> + *map -= 1;
> + if (*map == 0)
> + count = 0;
> + kunmap_atomic(map, KM_USER0);
> + page = list_entry(page->lru.prev, struct page, lru);
> + while (page != head) {
> + map = kmap_atomic(page, KM_USER0) + offset;
> + *map = SWAP_CONT_MAX | count;
> + count = COUNT_CONTINUED;
> + kunmap_atomic(map, KM_USER0);
> + page = list_entry(page->lru.prev, struct page, lru);
> + }
> + return count == COUNT_CONTINUED;
> + }
> +}
> +
> +/*
> + * free_swap_count_continuations - swapoff free all the continuation pages
> + * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
> + */
> +static void free_swap_count_continuations(struct swap_info_struct *si)
> +{
> + pgoff_t offset;
> +
> + for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
> + struct page *head;
> + head = vmalloc_to_page(si->swap_map + offset);
> + if (page_private(head)) {
> + struct list_head *this, *next;
> + list_for_each_safe(this, next, &head->lru) {
> + struct page *page;
> + page = list_entry(this, struct page, lru);
> + list_del(this);
> + __free_page(page);
> + }
> + }
> + }
> +}
> --

2009-10-15 03:35:22

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 8/9] swap_info: note SWAP_MAP_SHMEM

On Thu, 15 Oct 2009 01:57:28 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> While we're fiddling with the swap_map values, let's assign a particular
> value to shmem/tmpfs swap pages: their swap counts are never incremented,
> and it helps swapoff's try_to_unuse() a little if it can immediately
> distinguish those pages from process pages.
>
> Since we've no use for SWAP_MAP_BAD | COUNT_CONTINUED,
> we might as well use that 0xbf value for SWAP_MAP_SHMEM.
>
> Signed-off-by: Hugh Dickins <[email protected]>

I welcome this!

Reviewed-by: KAMEZAWA Hiroyuki <[email protected]>

> ---
>
> include/linux/swap.h | 6 +++++
> mm/shmem.c | 11 +++++++--
> mm/swapfile.c | 47 +++++++++++++++++++++++------------------
> 3 files changed, 42 insertions(+), 22 deletions(-)
>
> --- si7/include/linux/swap.h 2009-10-14 21:26:57.000000000 +0100
> +++ si8/include/linux/swap.h 2009-10-14 21:27:07.000000000 +0100
> @@ -157,6 +157,7 @@ enum {
> #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
> #define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */
> #define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */
> +#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */
>
> /*
> * The in-memory structure used to track swap areas.
> @@ -315,6 +316,7 @@ extern swp_entry_t get_swap_page(void);
> extern swp_entry_t get_swap_page_of_type(int);
> extern int valid_swaphandles(swp_entry_t, unsigned long *);
> extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> +extern void swap_shmem_alloc(swp_entry_t);
> extern int swap_duplicate(swp_entry_t);
> extern int swapcache_prepare(swp_entry_t);
> extern void swap_free(swp_entry_t);
> @@ -393,6 +395,10 @@ static inline int add_swap_count_continu
> return 0;
> }
>
> +static inline void swap_shmem_alloc(swp_entry_t swp)
> +{
> +}
> +
> static inline int swap_duplicate(swp_entry_t swp)
> {
> return 0;
> --- si7/mm/shmem.c 2009-09-28 00:28:41.000000000 +0100
> +++ si8/mm/shmem.c 2009-10-14 21:27:07.000000000 +0100
> @@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struc
> goto out;
> }
> mutex_unlock(&shmem_swaplist_mutex);
> -out: return found; /* 0 or 1 or -ENOMEM */
> + /*
> + * Can some race bring us here? We've been holding page lock,
> + * so I think not; but would rather try again later than BUG()
> + */
> + unlock_page(page);
> + page_cache_release(page);
> +out:
> + return (found < 0) ? found : 0;
> }
>
> /*
> @@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *
> else
> inode = NULL;
> spin_unlock(&info->lock);
> - swap_duplicate(swap);
> + swap_shmem_alloc(swap);
> BUG_ON(page_mapped(page));
> page_cache_release(page); /* pagecache ref */
> swap_writepage(page, wbc);
> --- si7/mm/swapfile.c 2009-10-14 21:26:57.000000000 +0100
> +++ si8/mm/swapfile.c 2009-10-14 21:27:07.000000000 +0100
> @@ -548,6 +548,12 @@ static unsigned char swap_entry_free(str
> if (usage == SWAP_HAS_CACHE) {
> VM_BUG_ON(!has_cache);
> has_cache = 0;
> + } else if (count == SWAP_MAP_SHMEM) {
> + /*
> + * Or we could insist on shmem.c using a special
> + * swap_shmem_free() and free_shmem_swap_and_cache()...
> + */
> + count = 0;
> } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
> if (count == COUNT_CONTINUED) {
> if (swap_count_continued(p, offset, count))
> @@ -1031,7 +1037,6 @@ static int try_to_unuse(unsigned int typ
> swp_entry_t entry;
> unsigned int i = 0;
> int retval = 0;
> - int shmem;
>
> /*
> * When searching mms for an entry, a good strategy is to
> @@ -1107,17 +1112,18 @@ static int try_to_unuse(unsigned int typ
>
> /*
> * Remove all references to entry.
> - * Whenever we reach init_mm, there's no address space
> - * to search, but use it as a reminder to search shmem.
> */
> - shmem = 0;
> swcount = *swap_map;
> - if (swap_count(swcount)) {
> - if (start_mm == &init_mm)
> - shmem = shmem_unuse(entry, page);
> - else
> - retval = unuse_mm(start_mm, entry, page);
> + if (swap_count(swcount) == SWAP_MAP_SHMEM) {
> + retval = shmem_unuse(entry, page);
> + /* page has already been unlocked and released */
> + if (retval < 0)
> + break;
> + continue;
> }
> + if (swap_count(swcount) && start_mm != &init_mm)
> + retval = unuse_mm(start_mm, entry, page);
> +
> if (swap_count(*swap_map)) {
> int set_start_mm = (*swap_map >= swcount);
> struct list_head *p = &start_mm->mmlist;
> @@ -1128,7 +1134,7 @@ static int try_to_unuse(unsigned int typ
> atomic_inc(&new_start_mm->mm_users);
> atomic_inc(&prev_mm->mm_users);
> spin_lock(&mmlist_lock);
> - while (swap_count(*swap_map) && !retval && !shmem &&
> + while (swap_count(*swap_map) && !retval &&
> (p = p->next) != &start_mm->mmlist) {
> mm = list_entry(p, struct mm_struct, mmlist);
> if (!atomic_inc_not_zero(&mm->mm_users))
> @@ -1142,10 +1148,9 @@ static int try_to_unuse(unsigned int typ
> swcount = *swap_map;
> if (!swap_count(swcount)) /* any usage ? */
> ;
> - else if (mm == &init_mm) {
> + else if (mm == &init_mm)
> set_start_mm = 1;
> - shmem = shmem_unuse(entry, page);
> - } else
> + else
> retval = unuse_mm(mm, entry, page);
>
> if (set_start_mm &&
> @@ -1162,13 +1167,6 @@ static int try_to_unuse(unsigned int typ
> mmput(start_mm);
> start_mm = new_start_mm;
> }
> - if (shmem) {
> - /* page has already been unlocked and released */
> - if (shmem > 0)
> - continue;
> - retval = shmem;
> - break;
> - }
> if (retval) {
> unlock_page(page);
> page_cache_release(page);
> @@ -2128,6 +2126,15 @@ bad_file:
> }
>
> /*
> + * Help swapoff by noting that swap entry belongs to shmem/tmpfs
> + * (in which case its reference count is never incremented).
> + */
> +void swap_shmem_alloc(swp_entry_t entry)
> +{
> + __swap_duplicate(entry, SWAP_MAP_SHMEM);
> +}
> +
> +/*
> * increase reference count of swap entry by 1.
> */
> int swap_duplicate(swp_entry_t entry)
>
> --

2009-10-15 14:58:19

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH 1/9] swap_info: private to swapfile.c

Hugh Dickins wrote:
> The swap_info_struct is mostly private to mm/swapfile.c, with only
> one other in-tree user: get_swap_bio(). Adjust its interface to
> map_swap_page(), so that we can then remove get_swap_info_struct().
>
> But there is a popular user out-of-tree, TuxOnIce: so leave the
> declaration of swap_info_struct in linux/swap.h.
>
> Signed-off-by: Hugh Dickins <[email protected]>

Reviewed-by: Rik van Riel <[email protected]>

--
All rights reversed.

2009-10-15 15:04:13

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH 2/9] swap_info: change to array of pointers

Hugh Dickins wrote:
> The swap_info_struct is only 76 or 104 bytes, but it does seem wrong
> to reserve an array of about 30 of them in bss, when most people will
> want only one. Change swap_info[] to an array of pointers.
>
> That does need a "type" field in the structure: pack it as a char with
> next type and short prio (aha, char is unsigned by default on PowerPC).
> Use the (admittedly peculiar) name "type" throughout for this index.
>
> /proc/swaps does not take swap_lock: I wouldn't want it to, but do take
> care with barriers when adding a new item to the array (never removed).
>
> Signed-off-by: Hugh Dickins <[email protected]>

Acked-by: Rik van Riel <[email protected]>

--
All rights reversed.

2009-10-15 19:46:08

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 7/9] swap_info: swap count continuations

On Thu, 15 Oct 2009 12:30:24 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

> Is an array of "unsigned long" counters bad? (too big?)

metoo!

2009-10-15 21:18:47

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH 7/9] swap_info: swap count continuations

On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:

> Hmm...maybe I don't understand the benefit of this style of data structure.
>
> Do we need a fine-grained chain?
> Is an array of "unsigned long" counters bad? (too big?)
>

I'm wondering if flex_array can be used for this purpose, which can store
up to 261632 elements of size unsigned long with 4K pages, or whether
finding the first available bit or weight would be too expensive.

2009-10-15 22:02:09

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 4/9] swap_info: miscellaneous minor cleanups

On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> On Thu, 15 Oct 2009 01:50:54 +0100 (BST)
> Hugh Dickins <[email protected]> wrote:
>
> > Move CONFIG_MIGRATION's swapdev_block() into the main CONFIG_MIGRATION
> > block, remove extraneous whitespace and return, fix typo in a comment.
> >
> CONFIG_HIBERNATION ?

Absolutely! Thanks a lot for spotting that (and for reviewing all these,
so quickly). It seems I'm confused by birds migrating for the winter :)
A 4/9 v2, correcting just that comment, will follow when I've gone
through your other suggestions.

Hugh

2009-10-15 22:08:58

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 5/9] swap_info: SWAP_HAS_CACHE cleanups

On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> On Thu, 15 Oct 2009 01:52:27 +0100 (BST)
> Hugh Dickins <[email protected]> wrote:
>
> > Though swap_count() is useful, I'm finding that swap_has_cache() and
> > encode_swapmap() obscure what happens in the swap_map entry, just at
> > those points where I need to understand it. Remove them, and pass
> > more usable "usage" values to scan_swap_map(), swap_entry_free() and
> > __swap_duplicate(), instead of the SWAP_MAP and SWAP_CACHE enum.
> >
> > Signed-off-by: Hugh Dickins <[email protected]>
>
> I have no objections to the above.

Phew! Thanks. I particularly had this one in mind when I Cc'ed you
on them all, because we do have a clash of styles or habits there.

My view is, the next time you or someone else is at work in there,
okay to reintroduce such things if they make it easier for you to
work on the code; but for me they made it harder.

> I'll test, later. maybe no troubles.

Thanks, yes, testing is the most important.

Hugh

2009-10-15 22:17:59

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 6/9] swap_info: swap_map of chars not shorts

On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> On Thu, 15 Oct 2009 01:53:52 +0100 (BST)
> Hugh Dickins <[email protected]> wrote:
> > @@ -1175,6 +1175,12 @@ static int try_to_unuse(unsigned int typ
> > * If that's wrong, then we should worry more about
> > * exit_mmap() and do_munmap() cases described above:
> > * we might be resetting SWAP_MAP_MAX too early here.
> > + *
> > + * Yes, that's wrong: though very unlikely, swap count 0x7ffe
> > + * could surely occur if pid_max raised from PID_MAX_DEFAULT;
>
> Just a nitpick.
>
> Hmm, logically, our MAX COUNT is 0x7e after this patch. Then, how about not
> mentioning 0x7ffe and PID_MAX? as..
>
> Yes, that's wrong: we now use SWAP_MAP_MAX as 0x7e, very easy to overflow.
> next patch will...

Perhaps we're reading it differently: I was there inserting a comment
on what was already said above (with no wish to change that existing
comment), then going on (immediately below) to mention how this patch
is now lowering SWAP_MAP_MAX to 0x7e, making the situation even worse,
but no worries because the next patch fixes it.

If you are seeing a nit there, I'm afraid it's one too small for my
eye! And the lifetime of this comment, in Linus's git history, will
be (I'm guessing) a fraction of a second - becoming a non-issue, it
rightly gets deleted in the next patch.

Hugh

>
> Reviewed-by: KAMEZAWA Hiroyuki <[email protected]>
>
> > + * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
> > + * much easier to reach. But the next patch will fix that.

2009-10-15 22:24:03

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 8/9] swap_info: note SWAP_MAP_SHMEM

On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> On Thu, 15 Oct 2009 01:57:28 +0100 (BST)
> Hugh Dickins <[email protected]> wrote:
>
> > While we're fiddling with the swap_map values, let's assign a particular
> > value to shmem/tmpfs swap pages: their swap counts are never incremented,
> > and it helps swapoff's try_to_unuse() a little if it can immediately
> > distinguish those pages from process pages.
> >
> > Since we've no use for SWAP_MAP_BAD | COUNT_CONTINUED,
> > we might as well use that 0xbf value for SWAP_MAP_SHMEM.
> >
> > Signed-off-by: Hugh Dickins <[email protected]>
>
> I welcome this!

Ah, I did wonder whether you might find some memcg use for it too:
I'm guessing your welcome means that you do have some such in mind.

(By the way, there's no particular need to use that 0xbf value:
during most of my testing I was using SWAP_MAP_SHMEM 0x3e and
SWAP_MAP_MAX 0x3d; but then noticed that 0xbf just happened to be
free, and also happened to sail through the tests in the right way.
But if it ever becomes a nuisance there, no problem to move it.)

Hugh

2009-10-15 22:41:58

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 2/9] swap_info: change to array of pointers

On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> On Thu, 15 Oct 2009 01:48:01 +0100 (BST)
> Hugh Dickins <[email protected]> wrote:
> > --- si1/mm/swapfile.c 2009-10-14 21:25:58.000000000 +0100
> > +++ si2/mm/swapfile.c 2009-10-14 21:26:09.000000000 +0100
> > @@ -49,7 +49,7 @@ static const char Unused_offset[] = "Unu
> >
> > static struct swap_list_t swap_list = {-1, -1};
> >
> > -static struct swap_info_struct swap_info[MAX_SWAPFILES];
> > +static struct swap_info_struct *swap_info[MAX_SWAPFILES];
> >
>
> Could you add some comment like this ?
> ==
> nr_swapfiles is never decreased.
> swap_info[type] pointer will never become invalid once it has turned valid.
>
>
> for (i = 0; i < nr_swapfiles; i++) {
> smp_rmb();
> sis = swap_info[i];
> ....
> }
> Then, we can execute the above without checking whether sis is valid.
> smp_rmb() is required when we do the above loop without swap_lock().

I do describe this (too briefly?) in the comment on smp_wmb() where
swap_info[type] is set and nr_swapfiles raised, in swapon (see below).
And make a quick same-line comment on the corresponding smp_rmb()s.

Those seem more useful to me than such a comment on the
static struct swap_info_struct *swap_info[MAX_SWAPFILES];

I was about to add (now, in writing this mail) that /proc/swaps is
the only thing that reads them without swap_lock; but that's not
true, of course, swap_duplicate and swap_free (or their helpers)
make preliminary checks without swap_lock - but the difference
there is that (unless the pagetable has become corrupted) they're
dealing with a swap entry which was previously valid, so can by
this time rely upon swap_info[type] and nr_swapfiles to be safe.

> swapon_mutex() will be no help.
>
> Whether sis is used or not can be detected by sis->flags.
>
> > @@ -1675,11 +1674,13 @@ static void *swap_start(struct seq_file
> > if (!l)
> > return SEQ_START_TOKEN;
> >
> > - for (i = 0; i < nr_swapfiles; i++, ptr++) {
> > - if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
> > + for (type = 0; type < nr_swapfiles; type++) {
> > + smp_rmb(); /* read nr_swapfiles before swap_info[type] */
> > + si = swap_info[type];
>
> if (!si) ?
>
> > + if (!(si->flags & SWP_USED) || !si->swap_map)
> > continue;
> > if (!--l)
> > - return ptr;
> > + return si;
> > }
...
> > static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
> > {
> > - struct swap_info_struct *ptr;
> > - struct swap_info_struct *endptr = swap_info + nr_swapfiles;
> > + struct swap_info_struct *si = v;
> > + int type;
> >
> > if (v == SEQ_START_TOKEN)
> > - ptr = swap_info;
> > - else {
> > - ptr = v;
> > - ptr++;
> > - }
> > + type = 0;
> > + else
> > + type = si->type + 1;
> >
> > - for (; ptr < endptr; ptr++) {
> > - if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
> > + for (; type < nr_swapfiles; type++) {
> > + smp_rmb(); /* read nr_swapfiles before swap_info[type] */
> > + si = swap_info[type];
> > + if (!(si->flags & SWP_USED) || !si->swap_map)
...
> > @@ -1799,23 +1800,45 @@ SYSCALL_DEFINE2(swapon, const char __use
...
> > - if (type >= nr_swapfiles)
> > - nr_swapfiles = type+1;
> > - memset(p, 0, sizeof(*p));
> > INIT_LIST_HEAD(&p->extent_list);
> > + if (type >= nr_swapfiles) {
> > + p->type = type;
> > + swap_info[type] = p;
> > + /*
> > + * Write swap_info[type] before nr_swapfiles, in case a
> > + * racing procfs swap_start() or swap_next() is reading them.
> > + * (We never shrink nr_swapfiles, we never free this entry.)
> > + */
> > + smp_wmb();
> > + nr_swapfiles++;
> > + } else {
> > + kfree(p);
> > + p = swap_info[type];
> > + /*
> > + * Do not memset this entry: a racing procfs swap_next()
> > + * would be relying on p->type to remain valid.
> > + */
> > + }
...

2009-10-15 23:04:53

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 2/9] swap_info: change to array of pointers

On Thu, 15 Oct 2009, Hugh Dickins wrote:
> On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> > On Thu, 15 Oct 2009 01:48:01 +0100 (BST)
> > Hugh Dickins <[email protected]> wrote:
> > > @@ -1675,11 +1674,13 @@ static void *swap_start(struct seq_file
> > > if (!l)
> > > return SEQ_START_TOKEN;
> > >
> > > - for (i = 0; i < nr_swapfiles; i++, ptr++) {
> > > - if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
> > > + for (type = 0; type < nr_swapfiles; type++) {
> > > + smp_rmb(); /* read nr_swapfiles before swap_info[type] */
> > > + si = swap_info[type];
> >
> > if (!si) ?

Re-reading, I see that I missed your interjection there.

Precisely because we read swap_info[type] after reading nr_swapfiles,
with smp_rmb() here to enforce that, and smp_wmb() where they're set
in swapon, there is no way for si to be seen as NULL here. Is there?

Or are you asking for a further comment here on why that's so?
I think I'd rather just switch to taking swap_lock in swap_start()
and swap_next(), than be adding comments on why we don't need it.
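
In code, the pairing amounts to just this (a sketch condensed from
the two hunks quoted above, SWP_USED checks and the rest elided):

	/* swapon: publish the pointer before raising the count */
	swap_info[type] = p;
	smp_wmb();
	nr_swapfiles++;

	/* swap_start()/swap_next(): read the count before the pointer */
	for (type = 0; type < nr_swapfiles; type++) {
		smp_rmb();
		si = swap_info[type];	/* cannot be NULL: published
					   before the nr_swapfiles which
					   admitted us to this iteration */
		...
	}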

Hugh

2009-10-15 23:10:45

by Nigel Cunningham

[permalink] [raw]
Subject: Re: [PATCH 1/9] swap_info: private to swapfile.c

Hi Hugh.

Hugh Dickins wrote:
> The swap_info_struct is mostly private to mm/swapfile.c, with only
> one other in-tree user: get_swap_bio(). Adjust its interface to
> map_swap_page(), so that we can then remove get_swap_info_struct().
>
> But there is a popular user out-of-tree, TuxOnIce: so leave the
> declaration of swap_info_struct in linux/swap.h.

Sorry for the delay in replying.

I don't mind if you don't leave swap_info_struct in
include/linux/swap.h. I'm currently reworking my swap support anyway,
adding support for honouring the priority field. I've also recently
learned that under some circumstances, allocating all available swap can
take quite a while (I have a user who is hibernating with 32GB of RAM!),
so I've been thinking about what I can do to optimise that.

Regards,

Nigel

2009-10-15 23:49:29

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 2/9] swap_info: change to array of pointers

On Thu, 15 Oct 2009 23:41:19 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> > On Thu, 15 Oct 2009 01:48:01 +0100 (BST)
> > Hugh Dickins <[email protected]> wrote:
> > > --- si1/mm/swapfile.c 2009-10-14 21:25:58.000000000 +0100
> > > +++ si2/mm/swapfile.c 2009-10-14 21:26:09.000000000 +0100
> > > @@ -49,7 +49,7 @@ static const char Unused_offset[] = "Unu
> > >
> > > static struct swap_list_t swap_list = {-1, -1};
> > >
> > > -static struct swap_info_struct swap_info[MAX_SWAPFILES];
> > > +static struct swap_info_struct *swap_info[MAX_SWAPFILES];
> > >
> >
> > Could you add some comment like this ?
> > ==
> > nr_swapfiles is never decreased.
> > swap_info[type] pointer will never become invalid once it has turned valid.
> >
> >
> > for (i = 0; i < nr_swapfiles; i++) {
> > smp_rmb();
> > sis = swap_info[i];
> > ....
> > }
> > Then, we can execute the above without checking whether sis is valid.
> > smp_rmb() is required when we do the above loop without swap_lock().
>
> I do describe this (too briefly?) in the comment on smp_wmb() where
> swap_info[type] is set and nr_swapfiles raised, in swapon (see below).
> And make a quick same-line comment on the corresponding smp_rmb()s.
>
> Those seem more useful to me than such a comment on the
> static struct swap_info_struct *swap_info[MAX_SWAPFILES];
>
yes.



> I was about to add (now, in writing this mail) that /proc/swaps is
> the only thing that reads them without swap_lock; but that's not
> true, of course, swap_duplicate and swap_free (or their helpers)
> make preliminary checks without swap_lock - but the difference
> there is that (unless the pagetable has become corrupted) they're
> dealing with a swap entry which was previously valid, so can by
> this time rely upon swap_info[type] and nr_swapfiles to be safe.
>
Ah, a fact which confused me very much at first look was

- nr_swapfiles never decreases
- then, swap_info[type] will never be freed once allocated

I hope this can be commented on this declaration:

static struct swap_info_struct *swap_info[MAX_SWAPFILES];
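
e.g. (just a sketch of the comment I have in mind):

	/*
	 * nr_swapfiles never decreases, and swap_info[type] is never
	 * freed or invalidated once it has been allocated: so readers
	 * may rely on both without swap_lock, given smp_rmb().
	 */
	static struct swap_info_struct *swap_info[MAX_SWAPFILES];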

Regards,
-Kame

> > swapon_mutex() will be no help.
> >
> > Whether sis is used or not can be detected by sis->flags.
> >
> > > @@ -1675,11 +1674,13 @@ static void *swap_start(struct seq_file
> > > if (!l)
> > > return SEQ_START_TOKEN;
> > >
> > > - for (i = 0; i < nr_swapfiles; i++, ptr++) {
> > > - if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
> > > + for (type = 0; type < nr_swapfiles; type++) {
> > > + smp_rmb(); /* read nr_swapfiles before swap_info[type] */
> > > + si = swap_info[type];
> >
> > if (!si) ?
> >
> > > + if (!(si->flags & SWP_USED) || !si->swap_map)
> > > continue;
> > > if (!--l)
> > > - return ptr;
> > > + return si;
> > > }
> ...
> > > static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
> > > {
> > > - struct swap_info_struct *ptr;
> > > - struct swap_info_struct *endptr = swap_info + nr_swapfiles;
> > > + struct swap_info_struct *si = v;
> > > + int type;
> > >
> > > if (v == SEQ_START_TOKEN)
> > > - ptr = swap_info;
> > > - else {
> > > - ptr = v;
> > > - ptr++;
> > > - }
> > > + type = 0;
> > > + else
> > > + type = si->type + 1;
> > >
> > > - for (; ptr < endptr; ptr++) {
> > > - if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
> > > + for (; type < nr_swapfiles; type++) {
> > > + smp_rmb(); /* read nr_swapfiles before swap_info[type] */
> > > + si = swap_info[type];
> > > + if (!(si->flags & SWP_USED) || !si->swap_map)
> ...
> > > @@ -1799,23 +1800,45 @@ SYSCALL_DEFINE2(swapon, const char __use
> ...
> > > - if (type >= nr_swapfiles)
> > > - nr_swapfiles = type+1;
> > > - memset(p, 0, sizeof(*p));
> > > INIT_LIST_HEAD(&p->extent_list);
> > > + if (type >= nr_swapfiles) {
> > > + p->type = type;
> > > + swap_info[type] = p;
> > > + /*
> > > + * Write swap_info[type] before nr_swapfiles, in case a
> > > + * racing procfs swap_start() or swap_next() is reading them.
> > > + * (We never shrink nr_swapfiles, we never free this entry.)
> > > + */
> > > + smp_wmb();
> > > + nr_swapfiles++;
> > > + } else {
> > > + kfree(p);
> > > + p = swap_info[type];
> > > + /*
> > > + * Do not memset this entry: a racing procfs swap_next()
> > > + * would be relying on p->type to remain valid.
> > > + */
> > > + }
> ...
>

2009-10-15 23:50:53

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 2/9] swap_info: change to array of pointers

On Fri, 16 Oct 2009 00:04:14 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> On Thu, 15 Oct 2009, Hugh Dickins wrote:
> > On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> > > On Thu, 15 Oct 2009 01:48:01 +0100 (BST)
> > > Hugh Dickins <[email protected]> wrote:
> > > > @@ -1675,11 +1674,13 @@ static void *swap_start(struct seq_file
> > > > if (!l)
> > > > return SEQ_START_TOKEN;
> > > >
> > > > - for (i = 0; i < nr_swapfiles; i++, ptr++) {
> > > > - if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
> > > > + for (type = 0; type < nr_swapfiles; type++) {
> > > > + smp_rmb(); /* read nr_swapfiles before swap_info[type] */
> > > > + si = swap_info[type];
> > >
> > > if (!si) ?
>
> Re-reading, I see that I missed your interjection there.
>
> Precisely because we read swap_info[type] after reading nr_swapfiles,
> with smp_rmb() here to enforce that, and smp_wmb() where they're set
> in swapon, there is no way for si to be seen as NULL here. Is there?
>
Ah, sorry, this is my mistake. I didn't understand that "nr_swapfiles never
decreases and swap_info[] will never be invalidated."

> Or are you asking for a further comment here on why that's so?
No.

> I think I'd rather just switch to taking swap_lock in swap_start()
> and swap_next(), than be adding comments on why we don't need it.
>

Hmm, maybe.

Thanks,
-Kame

> Hugh
>

2009-10-15 23:55:16

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 6/9] swap_info: swap_map of chars not shorts

On Thu, 15 Oct 2009 23:17:20 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> > On Thu, 15 Oct 2009 01:53:52 +0100 (BST)
> > Hugh Dickins <[email protected]> wrote:
> > > @@ -1175,6 +1175,12 @@ static int try_to_unuse(unsigned int typ
> > > * If that's wrong, then we should worry more about
> > > * exit_mmap() and do_munmap() cases described above:
> > > * we might be resetting SWAP_MAP_MAX too early here.
> > > + *
> > > + * Yes, that's wrong: though very unlikely, swap count 0x7ffe
> > > + * could surely occur if pid_max raised from PID_MAX_DEFAULT;
> >
> > Just a nitpick.
> >
> > Hmm, logically, our MAX COUNT is 0x7e after this patch. Then, how about not
> > mentioning 0x7ffe and PID_MAX? as..
> >
> > Yes, that's wrong: we now use SWAP_MAP_MAX as 0x7e, very easy to overflow.
> > next patch will...
>
> Perhaps we're reading it differently: I was there inserting a comment
> on what was already said above (with no wish to change that existing
> comment), then going on (immediately below) to mention how this patch
> is now lowering SWAP_MAP_MAX to 0x7e, making the situation even worse,
> but no worries because the next patch fixes it.
>
yes.

> If you are seeing a nit there, I'm afraid it's one too small for my
> eye!

I don't think it's very troublesome, but these days people seem to love
"bisect"; then comments for the change and comments for the code should be divided, IMHO.

> And the lifetime of this comment, in Linus's git history, will
> be (I'm guessing) a fraction of a second - becoming a non-issue, it
> rightly gets deleted in the next patch.

ya, thanks.

Regards,
-Kame

2009-10-15 23:54:17

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 7/9] swap_info: swap count continuations

On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> On Thu, 15 Oct 2009 01:56:01 +0100 (BST)
> Hugh Dickins <[email protected]> wrote:
>
> > This patch implements swap count continuations: when the count overflows,
> > a continuation page is allocated and linked to the original vmalloc'ed
> > map page, and this used to hold the continuation counts for that entry
> > and its neighbours. These continuation pages are seldom referenced:
> > the common paths all work on the original swap_map, only referring to
> > a continuation page when the low "digit" of a count is incremented or
> > decremented through SWAP_MAP_MAX.
>
> Hmm...maybe I don't understand the benefit of this style of data structure.

I can see that what I have there is not entirely transparent!

>
> Do we need a fine-grained chain?
> Is an array of "unsigned long" counters bad? (too big?)

I'll admit that that design just happens to be what first sprang
to my mind. It was only later, while implementing it, that I
wondered, hey, wouldn't it be a lot simpler just to have an
extension array of full counts?

It seemed to me (I'm not certain) that the char arrays I was
implementing were better suited to (use less memory in) a "normal"
workload in which the basic swap_map counts might overflow (but
I wonder how normal is any workload in which they overflow).
Whereas the array of full counts would be better suited to an
"aberrant" workload in which a mischievous user is actually
trying to maximize those counts. I decided to carry on with
the better solution for the (more) normal workload, the solution
less likely to gobble up more memory there than we've used before.

While I agree that the full count implementation would be simpler
and more obviously correct, I thought it was still going to involve
a linked list of pages (but "parallel" rather than "serial": each
of the pages assigned to one range of the base page).

Looking at what you propose below, maybe I'm not getting the details
right, but it looks as if you're having to do an order 2 or order 3
page allocation? Attempted with GFP_ATOMIC? I'd much rather stick
with order 0 pages, even if we do have to chain them to the base.

(Order 3 on 64-bit? A side issue which deterred me from the full
count approach, was the argumentation we'd get into over how big a
full count needs to be. I think, for so long as we have atomic_t
page count and page mapcount, an int is big enough for swap count.
But switching them to atomic_long_t may already be overdue.
Anyway, I liked how the char continuations avoided that issue.)

I'm reluctant to depart from what I have, now that it's tested;
but yes, we could perfectly well replace it by a different design,
it is very self-contained. The demands on this code are unusually
simple: it only has to manage counting up and counting down;
so it is very easily tested.

(The part I found difficult was getting rid of the __GFP_ZERO
I was allocating with originally.)

Hugh

>
> ==
> #define EXTENTION_OFFSET_INDEX(offset) ((offset) & PAGE_MASK)
> #define EXTENTION_OFFSET_MASK (~(PAGE_SIZE/sizeof(long) - 1))
> struct swapcount_extention_array {
> unsigned long *map[EXTEND_MAP_SIZE];
> };
>
> At adding continuation.
>
> int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
> {
> struct page *page;
> unsigned long *newmap, *map;
> struct swapcount_extention_array *array;
>
> newmap = __get_free_page(mask);
> si = swap_info_get(entry);
> array = kmalloc(sizeof(struct swapcount_extention_array), mask);
>
> ....
> (If overflow)
> page = vmalloc_to_page(si->swap_map + offset);
> if (!PagePrivate(page)) {
> page->private = array;
> } else
> kfree(array);
>
> index = EXTENTION_OFFSET_INDEX(offset);
> pos = EXTENTION_OFFSET_MASK(offset);
>
> array = page->private;
> if (!array->map[index]) {
> array->map[index] = newmap;
> } else
> free_page(newmap);
> map = array->map[index];
> map[pos] += 1;
> mappage = virt_to_page(map);
> get_page(mappage); /* increment page->count of array */
> ==
>
> Hmm? maybe I just don't like chain...
>
> Regards,
> -Kame

2009-10-16 00:07:49

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 8/9] swap_info: note SWAP_MAP_SHMEM

On Thu, 15 Oct 2009 23:23:24 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> > On Thu, 15 Oct 2009 01:57:28 +0100 (BST)
> > Hugh Dickins <[email protected]> wrote:
> >
> > > While we're fiddling with the swap_map values, let's assign a particular
> > > value to shmem/tmpfs swap pages: their swap counts are never incremented,
> > > and it helps swapoff's try_to_unuse() a little if it can immediately
> > > distinguish those pages from process pages.
> > >
> > > Since we've no use for SWAP_MAP_BAD | COUNT_CONTINUED,
> > > we might as well use that 0xbf value for SWAP_MAP_SHMEM.
> > >
> > > Signed-off-by: Hugh Dickins <[email protected]>
> >
> > I welcome this!
>
> Ah, I did wonder whether you might find some memcg use for it too:
> I'm guessing your welcome means that you do have some such in mind.
>
yes, I'm thinking about whether I can use this on memcg for simplifying memcg's
hooks for shmem. It's complicated ;)
I have to test memcg+shmem carefully again after this patch but I think
there will be no trouble, now.

> (By the way, there's no particular need to use that 0xbf value:
> during most of my testing I was using SWAP_MAP_SHMEM 0x3e and
> SWAP_MAP_MAX 0x3d; but then noticed that 0xbf just happened to be
> free, and also happened to sail through the tests in the right way.
> But if it ever becomes a nuisance there, no problem to move it.)
>

Hmm. I myself have no trouble whatever free value is used.
Let me clarify..

xx00 0000
xx11 1110 - swap count max
01xx xxxx - swap has cache
1xxx xxxx - swap count has continuation
1x11 1111 - swap for shmem

seems not very bad.

Regards,
-Kame

2009-10-16 00:21:57

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 7/9] swap_info: swap count continuations

On Thu, 15 Oct 2009, David Rientjes wrote:
> On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> > Hmm...maybe I don't understand the benefit of this style of data structure.
> >
> > Do we need a fine-grained chain?
> > Is an array of "unsigned long" counters bad? (too big?)
>
> I'm wondering if flex_array can be used for this purpose, which can store
> up to 261632 elements of size unsigned long with 4K pages, or whether
> finding the first available bit or weight would be too expensive.

When flex_arrays were first mooted, I did briefly wonder if we could
use them instead of vmalloc for the swap_map; but no, their interface
would slow down scan_swap_map() unacceptably.

Extensions of the swap_map are a different matter, they are seldom
referenced, and referenced just an item at a time: much better suited
to a flex_array. And looking at Jon's Doc, I see they're good for
sparse arrays, that would suit swap_map extensions very well.

However... that limit of 261632 elements rules them out here (or can
we have a flex_array of flex_arrays?), and the lack of support for
__GFP_HIGHMEM is disappointing - the current implementation of swap
count continuations does use highmem (though perhaps these pages
are so rarely needed that it actually doesn't matter).

It seems that the flex_array is a solution in search of a problem,
and that the swap_map extension is not the right problem for it.

Hugh

2009-10-16 00:29:06

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 1/9] swap_info: private to swapfile.c

On Fri, 16 Oct 2009, Nigel Cunningham wrote:
> Hugh Dickins wrote:
> > The swap_info_struct is mostly private to mm/swapfile.c, with only
> > one other in-tree user: get_swap_bio(). Adjust its interface to
> > map_swap_page(), so that we can then remove get_swap_info_struct().
> >
> > But there is a popular user out-of-tree, TuxOnIce: so leave the
> > declaration of swap_info_struct in linux/swap.h.
>
> Sorry for the delay in replying.

Delay? Not at all! Just leave the delays to me ;)

>
> I don't mind if you don't leave swap_info_struct in
> include/linux/swap.h.

Okay, thanks for the info, that's good. I won't take it out of swap.h
at this point, I'm finished in that area for now; but it's useful to
know that later on we can do so.

> I'm currently reworking my swap support anyway,

There should be better ways to interface to it than get_swap_info_struct().

> adding support for honouring the priority field. I've also recently
> learned that under some circumstances, allocating all available swap can
> take quite a while (I have a user who is hibernating with 32GB of RAM!),
> so I've been thinking about what I can do to optimise that.

Have fun!

Hugh

2009-10-16 00:41:39

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 4/9 v2] swap_info: miscellaneous minor cleanups

Move CONFIG_HIBERNATION's swapdev_block() into the main CONFIG_HIBERNATION
block, remove extraneous whitespace and return, fix typo in a comment.

Signed-off-by: Hugh Dickins <[email protected]>
---
v2 fixes mistaken description above, thank you KAMEZAWA-san.

mm/swapfile.c | 51 ++++++++++++++++++++++--------------------------
1 file changed, 24 insertions(+), 27 deletions(-)

--- si3/mm/swapfile.c 2009-10-14 21:26:22.000000000 +0100
+++ si4/mm/swapfile.c 2009-10-14 21:26:32.000000000 +0100
@@ -519,9 +519,9 @@ swp_entry_t get_swap_page_of_type(int ty
return (swp_entry_t) {0};
}

-static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
unsigned long offset, type;

if (!entry.val)
@@ -599,7 +599,7 @@ static int swap_entry_free(struct swap_i
*/
void swap_free(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;

p = swap_info_get(entry);
if (p) {
@@ -629,7 +629,6 @@ void swapcache_free(swp_entry_t entry, s
}
spin_unlock(&swap_lock);
}
- return;
}

/*
@@ -783,6 +782,21 @@ int swap_type_of(dev_t device, sector_t
}

/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int type, pgoff_t offset)
+{
+ struct block_device *bdev;
+
+ if ((unsigned int)type >= nr_swapfiles)
+ return 0;
+ if (!(swap_info[type]->flags & SWP_WRITEOK))
+ return 0;
+ return map_swap_page(swp_entry(type, offset), &bdev);
+}
+
+/*
* Return either the total number of swap pages of given type, or the number
* of free pages of that type (depending on @free)
*
@@ -805,7 +819,7 @@ unsigned int count_swap_pages(int type,
spin_unlock(&swap_lock);
return n;
}
-#endif
+#endif /* CONFIG_HIBERNATION */

/*
* No need to decide whether this PTE shares the swap entry with others,
@@ -1317,23 +1331,6 @@ sector_t map_swap_page(swp_entry_t entry
}
}

-#ifdef CONFIG_HIBERNATION
-/*
- * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
- * corresponding to given index in swap_info (swap type).
- */
-sector_t swapdev_block(int type, pgoff_t offset)
-{
- struct block_device *bdev;
-
- if ((unsigned int)type >= nr_swapfiles)
- return 0;
- if (!(swap_info[type]->flags & SWP_WRITEOK))
- return 0;
- return map_swap_page(swp_entry(type, offset), &bdev);
-}
-#endif /* CONFIG_HIBERNATION */
-
/*
* Free all of a swapdev's extent information
*/
@@ -1525,12 +1522,12 @@ bad_bmap:

SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
- struct swap_info_struct * p = NULL;
+ struct swap_info_struct *p = NULL;
unsigned short *swap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
- char * pathname;
+ char *pathname;
int i, type, prev;
int err;

@@ -1782,7 +1779,7 @@ late_initcall(max_swapfiles_check);
*/
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
char *name = NULL;
struct block_device *bdev = NULL;
struct file *swap_file = NULL;
@@ -2117,7 +2114,7 @@ void si_swapinfo(struct sysinfo *val)
*/
static int __swap_duplicate(swp_entry_t entry, bool cache)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
unsigned long offset, type;
int result = -EINVAL;
int count;
@@ -2186,7 +2183,7 @@ void swap_duplicate(swp_entry_t entry)
/*
* @entry: swap entry for which we allocate swap cache.
*
- * Called when allocating swap cache for exising swap entry,
+ * Called when allocating swap cache for existing swap entry,
* This can return error codes. Returns 0 at success.
* -EBUSY means there is a swap cache.
* Note: return code is different from swap_duplicate().

2009-10-16 01:32:56

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 7/9] swap_info: swap count continuations

On Fri, 16 Oct 2009 00:53:36 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> On Thu, 15 Oct 2009, KAMEZAWA Hiroyuki wrote:
> > On Thu, 15 Oct 2009 01:56:01 +0100 (BST)
> > Hugh Dickins <[email protected]> wrote:
> >
> > > This patch implements swap count continuations: when the count overflows,
> > > a continuation page is allocated and linked to the original vmalloc'ed
> > > map page, and this used to hold the continuation counts for that entry
> > > and its neighbours. These continuation pages are seldom referenced:
> > > the common paths all work on the original swap_map, only referring to
> > > a continuation page when the low "digit" of a count is incremented or
> > > decremented through SWAP_MAP_MAX.
> >
> > Hmm...maybe I don't understand the benefit of this style of data structure.
>
> I can see that what I have there is not entirely transparent!
>
> >
> > Do we need a fine-grained chain?
> > Is an array of "unsigned long" counters bad? (too big?)
>
> I'll admit that that design just happens to be what first sprang
> to my mind. It was only later, while implementing it, that I
> wondered, hey, wouldn't it be a lot simpler just to have an
> extension array of full counts?
>
> It seemed to me (I'm not certain) that the char arrays I was
> implementing were better suited to (use less memory in) a "normal"
> workload in which the basic swap_map counts might overflow (but
> I wonder how normal is any workload in which they overflow).
> Whereas the array of full counts would be better suited to an
> "aberrant" workload in which a mischievous user is actually
> trying to maximize those counts. I decided to carry on with
> the better solution for the (more) normal workload, the solution
> less likely to gobble up more memory there than we've used before.
>
> While I agree that the full count implementation would be simpler
> and more obviously correct, I thought it was still going to involve
> a linked list of pages (but "parallel" rather than "serial": each
> of the pages assigned to one range of the base page).
>
> Looking at what you propose below, maybe I'm not getting the details
> right, but it looks as if you're having to do an order 2 or order 3
> page allocation? Attempted with GFP_ATOMIC? I'd much rather stick
> with order 0 pages, even if we do have to chain them to the base.
>
One order-0 allocation per array entry.

1st level map 2nd level map

map -> array[0] -> map => PAGE_SIZE map.
[1] -> map => PAGE_SIZE map.
...
[7] -> map == NULL if not used.
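
i.e. something like this (a sketch; 8 pointers cover PAGE_SIZE
counts of unsigned long on 64-bit):

	struct swapcount_extention_array {
		unsigned long *map[8];	/* each an order-0 page of counts,
					   NULL until that range overflows */
	};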


> (Order 3 on 64-bit? A side issue which deterred me from the full
> count approach, was the argumentation we'd get into over how big a
> full count needs to be. I think, for so long as we have atomic_t
> page count and page mapcount, an int is big enough for swap count.
I see.

> But switching them to atomic_long_t may already be overdue.
> Anyway, I liked how the char continuations avoided that issue.)
>
My concern is that a small number of swap_map[] entries with very high
refcounts can consume too many pages.

If an entry is shared by 65535, 65535/128 = 512 pages will be used.
(I'm sorry if I don't understand the implementation correctly.)


> I'm reluctant to depart from what I have, now that it's tested;
> but yes, we could perfectly well replace it by a different design,
> it is very self-contained. The demands on this code are unusually
> simple: it only has to manage counting up and counting down;
> so it is very easily tested.
>
Okay, let's start with this.



Thanks,
-Kame

2009-10-16 02:25:40

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 7/9] swap_info: swap count continuations

On Fri, 16 Oct 2009, KAMEZAWA Hiroyuki wrote:
> >
> > My concern is that a small number of swap_map[] entries with very high
> > refcounts can consume too many pages.
> >
> > If an entry is shared by 65535, 65535/128 = 512 pages will be used.
> > (I'm sorry if I don't understand the implementation correctly.)

Ah, you're thinking it's additive: perhaps because I use the name
"continuation", which may give that impression - maybe there's a
better name I can give it.

No, it's multiplicative - just like 999 is almost a thousand, not 27.

If an entry is shared by 65535, then it needs its original swap_map
page (0 to 0x3e) and a continuation page (0 to 0x7f) and another
continuation page (0 to 0x7f): if I've got my arithmetic right,
those three pages can hold a shared count up to 1032191, for
every one of that group of PAGE_SIZE neighbouring pages.
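
Spelt out as a throwaway check (using the patch's SWAP_MAP_MAX 0x3e
and SWAP_CONT_MAX 0x7f; each wrap of the base digit stands for 63):

	static unsigned long max_swap_count(int continuation_pages)
	{
		unsigned long max = 0x3e;	/* SWAP_MAP_MAX */
		unsigned long unit = 0x3f;	/* one wrap of base digit */

		while (continuation_pages--) {
			max += unit * 0x7f;	/* SWAP_CONT_MAX */
			unit *= 0x80;		/* next digit: base 128 */
		}
		return max;	/* 0: 62;  1: 8063;  2: 1032191 */
	}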

Hugh

2009-10-16 04:09:29

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 7/9] swap_info: swap count continuations

On Fri, 16 Oct 2009 03:24:57 +0100 (BST)
Hugh Dickins <[email protected]> wrote:

> On Fri, 16 Oct 2009, KAMEZAWA Hiroyuki wrote:
> > >
> > My concern is that a small number of swap_map[] entries with very high
> > refcounts can consume too many pages.
> >
> > If an entry is shared by 65535, 65535/128 = 512 pages will be used.
> > (I'm sorry if I don't understand the implementation correctly.)
>
> Ah, you're thinking it's additive: perhaps because I use the name
> "continuation", which may give that impression - maybe there's a
> better name I can give it.
>
> No, it's multiplicative - just like 999 is almost a thousand, not 27.
>
> If an entry is shared by 65535, then it needs its original swap_map
> page (0 to 0x3e) and a continuation page (0 to 0x7f) and another
> continuation page (0 to 0x7f): if I've got my arithmetic right,
> those three pages can hold a shared count up to 1032191, for
> every one of that group of PAGE_SIZE neighbouring pages.
>
Ah, okay, I see. Thank you for the explanation.

Regards,
-Kame

2009-10-16 04:51:20

by Nitin Gupta

[permalink] [raw]
Subject: Re: [PATCH 7/9] swap_info: swap count continuations

On 10/15/2009 06:26 AM, Hugh Dickins wrote:
> Swap is duplicated (reference count incremented by one) whenever the same
> swap page is inserted into another mm (when forking finds a swap entry in
> place of a pte, or when reclaim unmaps a pte to insert the swap entry).
>
> swap_info_struct's vmalloc'ed swap_map is the array of these reference
> counts: but what happens when the unsigned short (or unsigned char since
> the preceding patch) is full? (and its high bit is kept for a cache flag)
>
> We then lose track of it, never freeing, leaving it in use until swapoff:
> at which point we _hope_ that a single pass will have found all instances,
> assume there are no more, and will lose user data if we're wrong.
>
> Swapping of KSM pages has not yet been enabled; but it is implemented,
> and makes it very easy for a user to overflow the maximum swap count:
> possible with ordinary process pages, but unlikely, even when pid_max
> has been raised from PID_MAX_DEFAULT.
>
> This patch implements swap count continuations: when the count overflows,
> a continuation page is allocated and linked to the original vmalloc'ed
> map page, and this used to hold the continuation counts for that entry
> and its neighbours. These continuation pages are seldom referenced:
> the common paths all work on the original swap_map, only referring to
> a continuation page when the low "digit" of a count is incremented or
> decremented through SWAP_MAP_MAX.
>


I think the patch can be simplified a lot if we have just 2 levels (hard-coded)
of swap_map, each level holding a 16-bit count -- a combined 32-bit count should
be sufficient for about anything. Saving 1 byte in the level-1 swap_map and then
allowing arbitrary levels of swap_map doesn't look like it's worth the complexity.

Nitin
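
Reduced to a single continuation digit, the carry described in the quoted
patch description above might look like this in plain C - a sketch of the
digit arithmetic only, whereas the patch itself chains whole continuation
pages and walks them under lock:

#define SWAP_MAP_MAX	0x3e	/* low digit limit in the base map */
#define SWAP_CONT_MAX	0x7f	/* limit of a continuation digit */
#define COUNT_CONTINUED	0x80	/* flag: this entry has a continuation */

/*
 * Increment one swap count: the base map holds the low digit (base 0x3f),
 * the continuation byte the next digit (base 0x80).  Returns 0 when yet
 * another continuation level would be needed.
 */
static int swap_count_inc(unsigned char *base, unsigned char *cont)
{
	unsigned char low = *base & ~COUNT_CONTINUED;

	if (low < SWAP_MAP_MAX) {
		*base = (*base & COUNT_CONTINUED) | (low + 1);	/* fast path */
		return 1;
	}
	if (*cont == SWAP_CONT_MAX)
		return 0;		/* next digit full too */
	(*cont)++;			/* carry into the continuation digit */
	*base = COUNT_CONTINUED;	/* low digit wraps to 0, flag stays set */
	return 1;
}

Decrementing borrows in the opposite direction; the common fast path never
touches the continuation at all.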

2009-10-16 07:57:38

by Daisuke Nishimura

[permalink] [raw]
Subject: [PATCH] mm: call pte_unmap() against a proper pte (Re: [PATCH 7/9] swap_info: swap count continuations)

Hi.

> @@ -645,6 +648,7 @@ static int copy_pte_range(struct mm_stru
> spinlock_t *src_ptl, *dst_ptl;
> int progress = 0;
> int rss[2];
> + swp_entry_t entry = (swp_entry_t){0};
>
> again:
> rss[1] = rss[0] = 0;
> @@ -671,7 +675,10 @@ again:
> progress++;
> continue;
> }
> - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
> + entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
> + vma, addr, rss);
> + if (entry.val)
> + break;
> progress += 8;
> } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
>
It isn't the fault of this patch alone, but I think breaking the loop without
incrementing dst_pte (and src_pte) would be bad behavior, because we do
pte_unmap(dst_pte - 1) later.
(the current copy_pte_range() already does this though... and it is only
problematic when we break in the first iteration, IIUC.)
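
A userspace analogue makes the failure easy to see (hypothetical code, only
mimicking the loop shape, not taken from the kernel):

#include <stdio.h>

int main(void)
{
	int page_table[4] = { 0, 1, 2, 3 };
	int *pte = page_table;	/* stands in for pte_offset_map_lock() */
	int i = 0;

	do {
		if (page_table[i] == 0)
			break;	/* break in the very first iteration */
	} while (pte++, ++i < 4);

	/* The cleanup does unmap(pte - 1): but pte was never incremented,
	 * so pte - 1 would point before page_table - an invalid unmap. */
	printf("pte advanced by %td entries\n", pte - page_table);
	return 0;
}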

> @@ -681,6 +688,12 @@ again:
> add_mm_rss(dst_mm, rss[0], rss[1]);
> pte_unmap_unlock(dst_pte - 1, dst_ptl);
> cond_resched();
> +
> + if (entry.val) {
> + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
> + return -ENOMEM;
> + progress = 0;
> + }
> if (addr != end)
> goto again;
> return 0;

I've searched for other places where we break a similar loop and do pte_unmap(pte - 1).
Current copy_pte_range() and apply_to_pte_range() have the same problem.

How about a patch like this?
===
From: Daisuke Nishimura <[email protected]>

There are some places where we do something like:

	pte = pte_map();
	do {
		(break in some conditions)
	} while (pte++, ...);
	pte_unmap(pte - 1);

But if the loop breaks in the first iteration, pte_unmap() unmaps an invalid pte.

This patch fixes that problem.

Signed-off-by: Daisuke Nishimura <[email protected]>
---
mm/memory.c | 11 +++++++----
1 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 72a2494..492de38 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -641,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
+ pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
@@ -654,6 +655,8 @@ again:
src_pte = pte_offset_map_nested(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ orig_src_pte = src_pte;
+ orig_dst_pte = dst_pte;
arch_enter_lazy_mmu_mode();

do {
@@ -677,9 +680,9 @@ again:

arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
- pte_unmap_nested(src_pte - 1);
+ pte_unmap_nested(orig_src_pte);
add_mm_rss(dst_mm, rss[0], rss[1]);
- pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
if (addr != end)
goto again;
@@ -1822,10 +1825,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
token = pmd_pgtable(*pmd);

do {
- err = fn(pte, token, addr, data);
+ err = fn(pte++, token, addr, data);
if (err)
break;
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (addr += PAGE_SIZE, addr != end);

arch_leave_lazy_mmu_mode();

2009-10-16 08:04:42

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH] mm: call pte_unmap() against a proper pte (Re: [PATCH 7/9] swap_info: swap count continuations)

On Fri, 16 Oct 2009 15:30:56 +0900
Daisuke Nishimura <[email protected]> wrote:

> Hi.
>
> > @@ -645,6 +648,7 @@ static int copy_pte_range(struct mm_stru
> > spinlock_t *src_ptl, *dst_ptl;
> > int progress = 0;
> > int rss[2];
> > + swp_entry_t entry = (swp_entry_t){0};
> >
> > again:
> > rss[1] = rss[0] = 0;
> > @@ -671,7 +675,10 @@ again:
> > progress++;
> > continue;
> > }
> > - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
> > + entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
> > + vma, addr, rss);
> > + if (entry.val)
> > + break;
> > progress += 8;
> > } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
> >
> It isn't the fault of this patch alone, but I think breaking the loop without
> incrementing dst_pte (and src_pte) would be bad behavior, because we do
> pte_unmap(dst_pte - 1) later.
> (the current copy_pte_range() already does this though... and it is only
> problematic when we break in the first iteration, IIUC.)
>

Oh, yes. Nice catch!

> > @@ -681,6 +688,12 @@ again:
> > add_mm_rss(dst_mm, rss[0], rss[1]);
> > pte_unmap_unlock(dst_pte - 1, dst_ptl);
> > cond_resched();
> > +
> > + if (entry.val) {
> > + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
> > + return -ENOMEM;
> > + progress = 0;
> > + }
> > if (addr != end)
> > goto again;
> > return 0;
>
> I've searched for other places where we break a similar loop and do pte_unmap(pte - 1).
> Current copy_pte_range() and apply_to_pte_range() have the same problem.
>

> How about a patch like this?
> ===
> From: Daisuke Nishimura <[email protected]>
>
> There are some places where we do something like:
>
> 	pte = pte_map();
> 	do {
> 		(break in some conditions)
> 	} while (pte++, ...);
> 	pte_unmap(pte - 1);
>
> But if the loop breaks in the first iteration, pte_unmap() unmaps an invalid pte.
>
> This patch fixes that problem.
>
> Signed-off-by: Daisuke Nishimura <[email protected]>

Seems correct.

Reviewed-by: KAMEZAWA Hiroyuki <[email protected]>

> ---
> mm/memory.c | 11 +++++++----
> 1 files changed, 7 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 72a2494..492de38 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -641,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
> unsigned long addr, unsigned long end)
> {
> + pte_t *orig_src_pte, *orig_dst_pte;
> pte_t *src_pte, *dst_pte;
> spinlock_t *src_ptl, *dst_ptl;
> int progress = 0;
> @@ -654,6 +655,8 @@ again:
> src_pte = pte_offset_map_nested(src_pmd, addr);
> src_ptl = pte_lockptr(src_mm, src_pmd);
> spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
> + orig_src_pte = src_pte;
> + orig_dst_pte = dst_pte;
> arch_enter_lazy_mmu_mode();
>
> do {
> @@ -677,9 +680,9 @@ again:
>
> arch_leave_lazy_mmu_mode();
> spin_unlock(src_ptl);
> - pte_unmap_nested(src_pte - 1);
> + pte_unmap_nested(orig_src_pte);
> add_mm_rss(dst_mm, rss[0], rss[1]);
> - pte_unmap_unlock(dst_pte - 1, dst_ptl);
> + pte_unmap_unlock(orig_dst_pte, dst_ptl);
> cond_resched();
> if (addr != end)
> goto again;
> @@ -1822,10 +1825,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
> token = pmd_pgtable(*pmd);
>
> do {
> - err = fn(pte, token, addr, data);
> + err = fn(pte++, token, addr, data);
> if (err)
> break;
> - } while (pte++, addr += PAGE_SIZE, addr != end);
> + } while (addr += PAGE_SIZE, addr != end);
>
> arch_leave_lazy_mmu_mode();
>
>

2009-10-17 22:44:12

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH] mm: call pte_unmap() against a proper pte (Re: [PATCH 7/9] swap_info: swap count continuations)



>----Original Message----
>From: [email protected]
>Date: 16/10/2009 7:30
>To: "Hugh Dickins" <[email protected]>
>Cc: "Andrew Morton" <[email protected]>, "Nitin Gupta" <[email protected]>,
> "KAMEZAWA Hiroyuki" <[email protected]>, <[email protected]>,
> <[email protected]>, <[email protected]>, "Daisuke Nishimura" <[email protected]>
>Subj: [PATCH] mm: call pte_unmap() against a proper pte (Re: [PATCH 7/9] swap_info: swap count continuations)
>
>Hi.
>
>> @@ -645,6 +648,7 @@ static int copy_pte_range(struct mm_stru
>> spinlock_t *src_ptl, *dst_ptl;
>> int progress = 0;
>> int rss[2];
>> + swp_entry_t entry = (swp_entry_t){0};
>>
>> again:
>> rss[1] = rss[0] = 0;
>> @@ -671,7 +675,10 @@ again:
>> progress++;
>> continue;
>> }
>> - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
>> + entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
>> + vma, addr, rss);
>> + if (entry.val)
>> + break;
>> progress += 8;
>> } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
>>
>It isn't the fault of this patch alone, but I think breaking the loop without
>incrementing dst_pte (and src_pte) would be bad behavior, because we do
>pte_unmap(dst_pte - 1) later.
>(the current copy_pte_range() already does this though... and it is only
>problematic when we break in the first iteration, IIUC.)

Good catch, thanks a lot for finding that. I believe this is entirely a
fault in my 7/9: the existing code takes care not to break before it has
made some progress (in part because of this unmap issue).

>
>> @@ -681,6 +688,12 @@ again:
>> add_mm_rss(dst_mm, rss[0], rss[1]);
>> pte_unmap_unlock(dst_pte - 1, dst_ptl);
>> cond_resched();
>> +
>> + if (entry.val) {
>> + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
>> + return -ENOMEM;
>> + progress = 0;
>> + }
>> if (addr != end)
>> goto again;
>> return 0;
>
>I've searched for other places where we break a similar loop and do
>pte_unmap(pte - 1).
>Current copy_pte_range() and apply_to_pte_range() have the same problem.

And thank you for taking the trouble to look further afield: yes,
apply_to_pte_range() was already wrong.

>
>How about a patch like this?
>===
>From: Daisuke Nishimura <[email protected]>
>
>There are some places where we do something like:
>
>	pte = pte_map();
>	do {
>		(break in some conditions)
>	} while (pte++, ...);
>	pte_unmap(pte - 1);
>
>But if the loop breaks in the first iteration, pte_unmap() unmaps an invalid pte.
>
>This patch fixes that problem.
>
>Signed-off-by: Daisuke Nishimura <[email protected]>

Acked-by: Hugh Dickins <[email protected]>




