2009-06-02 03:06:14

by Kamezawa Hiroyuki

Subject: [PATCH 0/4] memcg fix swap accounting (2/Jun)

This is an updated series of the memcg swap accounting fixes:
http://marc.info/?l=linux-mm&m=124348659700540&w=2

Now in mmotm as
mm-add-swap-cache-interface-for-swap-reference.patch
mm-modify-swap_map-and-add-swap_has_cache-flag.patch
mm-reuse-unused-swap-entry-if-necessary.patch
memcg-fix-swap-accounting.patch

No logic changes, but some coding style problems that were pointed out have been fixed.

Thanks,
-Kame


2009-06-02 03:08:42

by Kamezawa Hiroyuki

Subject: [PATCH 1/4] add swap cache interface for swap reference

Replacement for mm-add-swap-cache-interface-for-swap-reference.patch
in mmotm, with no changes.
==
From: KAMEZAWA Hiroyuki <[email protected]>

In a following patch, swap cache usage will be recorded in swap_map[].
This patch makes the interface changes necessary for that.

Two interfaces:
 - swapcache_prepare()
 - swapcache_free()
are added for taking/releasing the swap-cache reference count on existing
swap entries. The implementation itself is not changed by this patch.
With the addition of swapcache_free(), memcg's hook code is moved into
swapcache_free(). This is better than using scattered hooks.
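
As a rough illustration of how the two new calls are meant to pair up, here
is a minimal userspace-style sketch. It is illustrative only: the types and
helper bodies (including add_to_swap_cache_sketch()) are stand-ins assumed
for the example, not the kernel implementation; the real wiring is in the
mm/swap_state.c hunks below.

/* Illustrative sketch only: stand-in types and bodies, not kernel code. */
#include <stdio.h>
#include <stdbool.h>

typedef struct { unsigned long val; } swp_entry_t;
struct page { int dummy; };

/* Take a swap-cache reference on an existing entry (for now this just wraps swap_duplicate()). */
static int swapcache_prepare(swp_entry_t entry) { (void)entry; return 1; }

/* Drop a swap-cache reference; when a page is passed, the memcg hook runs here too. */
static void swapcache_free(swp_entry_t entry, struct page *page) { (void)entry; (void)page; }

/* Stand-in for inserting the page into the swap cache (pretend it fails). */
static bool add_to_swap_cache_sketch(struct page *p, swp_entry_t e) { (void)p; (void)e; return false; }

int main(void)
{
        struct page new_page = { 0 };
        swp_entry_t entry = { 1 };

        /* Pattern from read_swap_cache_async(): prepare, try to add, undo on failure. */
        if (swapcache_prepare(entry)) {
                if (!add_to_swap_cache_sketch(&new_page, entry))
                        swapcache_free(entry, NULL);    /* insertion failed: drop the cache ref */
        }
        puts("sketch done");
        return 0;
}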

Changelog: v1->v2
- fixed shmem_writepage() error path.

Reviewed-by: Daisuke Nishimura <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/swap.h | 7 +++++++
mm/shmem.c | 2 +-
mm/swap_state.c | 11 +++++------
mm/swapfile.c | 19 +++++++++++++++++++
mm/vmscan.c | 3 +--
5 files changed, 33 insertions(+), 9 deletions(-)

Index: mmotm-2.6.30-May28/include/linux/swap.h
===================================================================
--- mmotm-2.6.30-May28.orig/include/linux/swap.h
+++ mmotm-2.6.30-May28/include/linux/swap.h
@@ -282,8 +282,10 @@ extern void si_swapinfo(struct sysinfo *
extern swp_entry_t get_swap_page(void);
extern swp_entry_t get_swap_page_of_type(int);
extern int swap_duplicate(swp_entry_t);
+extern int swapcache_prepare(swp_entry_t);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
extern void swap_free(swp_entry_t);
+extern void swapcache_free(swp_entry_t, struct page *page);
extern int free_swap_and_cache(swp_entry_t);
extern int swap_type_of(dev_t, sector_t, struct block_device **);
extern unsigned int count_swap_pages(int, int);
@@ -352,11 +354,16 @@ static inline void show_swap_cache_info(

#define free_swap_and_cache(swp) is_migration_entry(swp)
#define swap_duplicate(swp) is_migration_entry(swp)
+#define swapcache_prepare(swp) is_migration_entry(swp)

static inline void swap_free(swp_entry_t swp)
{
}

+static inline void swapcache_free(swp_entry_t swp, struct page *page)
+{
+}
+
static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
{
Index: mmotm-2.6.30-May28/mm/swap_state.c
===================================================================
--- mmotm-2.6.30-May28.orig/mm/swap_state.c
+++ mmotm-2.6.30-May28/mm/swap_state.c
@@ -162,11 +162,11 @@ int add_to_swap(struct page *page)
return 1;
case -EEXIST:
/* Raced with "speculative" read_swap_cache_async */
- swap_free(entry);
+ swapcache_free(entry, NULL);
continue;
default:
/* -ENOMEM radix-tree allocation failure */
- swap_free(entry);
+ swapcache_free(entry, NULL);
return 0;
}
}
@@ -188,8 +188,7 @@ void delete_from_swap_cache(struct page
__delete_from_swap_cache(page);
spin_unlock_irq(&swapper_space.tree_lock);

- mem_cgroup_uncharge_swapcache(page, entry);
- swap_free(entry);
+ swapcache_free(entry, page);
page_cache_release(page);
}

@@ -293,7 +292,7 @@ struct page *read_swap_cache_async(swp_e
/*
* Swap entry may have been freed since our caller observed it.
*/
- if (!swap_duplicate(entry))
+ if (!swapcache_prepare(entry))
break;

/*
@@ -317,7 +316,7 @@ struct page *read_swap_cache_async(swp_e
}
ClearPageSwapBacked(new_page);
__clear_page_locked(new_page);
- swap_free(entry);
+ swapcache_free(entry, NULL);
} while (err != -ENOMEM);

if (new_page)
Index: mmotm-2.6.30-May28/mm/swapfile.c
===================================================================
--- mmotm-2.6.30-May28.orig/mm/swapfile.c
+++ mmotm-2.6.30-May28/mm/swapfile.c
@@ -510,6 +510,16 @@ void swap_free(swp_entry_t entry)
}

/*
+ * Called after dropping swapcache to decrease refcnt to swap entries.
+ */
+void swapcache_free(swp_entry_t entry, struct page *page)
+{
+ if (page)
+ mem_cgroup_uncharge_swapcache(page, entry);
+ return swap_free(entry);
+}
+
+/*
* How many references to page are currently swapped out?
*/
static inline int page_swapcount(struct page *page)
@@ -1979,6 +1989,15 @@ bad_file:
goto out;
}

+/*
+ * Called when allocating swap cache for exising swap entry,
+ */
+int swapcache_prepare(swp_entry_t entry)
+{
+ return swap_duplicate(entry);
+}
+
+
struct swap_info_struct *
get_swap_info_struct(unsigned type)
{
Index: mmotm-2.6.30-May28/mm/vmscan.c
===================================================================
--- mmotm-2.6.30-May28.orig/mm/vmscan.c
+++ mmotm-2.6.30-May28/mm/vmscan.c
@@ -477,8 +477,7 @@ static int __remove_mapping(struct addre
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
spin_unlock_irq(&mapping->tree_lock);
- mem_cgroup_uncharge_swapcache(page, swap);
- swap_free(swap);
+ swapcache_free(swap, page);
} else {
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
Index: mmotm-2.6.30-May28/mm/shmem.c
===================================================================
--- mmotm-2.6.30-May28.orig/mm/shmem.c
+++ mmotm-2.6.30-May28/mm/shmem.c
@@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *
shmem_swp_unmap(entry);
unlock:
spin_unlock(&info->lock);
- swap_free(swap);
+ swapcache_free(swap, NULL);
redirty:
set_page_dirty(page);
if (wbc->for_reclaim)

2009-06-02 03:11:01

by Kamezawa Hiroyuki

Subject: [PATCH 2/4] modify swap map and add swap has cache flag


Replacement for
mm-modify-swap_map-and-add-swap_has_cache-flag.patch in mmotm.
Several style fixes are done... it should be easier to read now.

==
From: KAMEZAWA Hiroyuki <[email protected]>

This is part of a patch set for fixing memcg's swap accounting leak. But,
IMHO, it is not a bad patch even without memcg.

Currently, references to swap are counted in swap_map[], an array of
unsigned short. There are two kinds of references to swap:
 - references from swap entries (swap users)
 - the reference from the swap cache
So,
 - if there is a swap cache and the swap refcnt is 1, only the swap cache
   holds the entry.
   (*) swapcount(entry) == 1 && find_get_page(swapper_space, entry) != NULL

This counting logic has worked well for a long time. But considering that
we cannot tell from swap_map[] whether a _real_ reference exists or not,
the current use of the counter is not very good.

This patch adds a SWAP_HAS_CACHE flag and records whether a swap entry has
a swap cache or not. This removes the "-1" magic used in swapfile.c and
helps avoid unnecessary find_get_page() calls.
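
To make the new encoding concrete, here is a minimal standalone sketch of
the swap_map[] layout: the constants and helpers mirror SWAP_HAS_CACHE,
swap_count(), swap_has_cache() and encode_swapmap() from the diff below, and
the small main() is added only for demonstration.

/* Standalone sketch of the swap_map[] encoding introduced by this patch. */
#include <stdio.h>
#include <stdbool.h>

#define SWAP_MAP_MAX    0x7ffe
#define SWAP_MAP_BAD    0x7fff
#define SWAP_HAS_CACHE  0x8000  /* there is a swap cache of this entry */
#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE)

static inline int swap_count(unsigned short ent)
{
        return ent & SWAP_COUNT_MASK;           /* low 15 bits: reference count */
}

static inline bool swap_has_cache(unsigned short ent)
{
        return !!(ent & SWAP_HAS_CACHE);        /* top bit: swap-cache flag */
}

static inline unsigned short encode_swapmap(int count, bool has_cache)
{
        unsigned short ret = count;

        if (has_cache)
                return SWAP_HAS_CACHE | ret;
        return ret;
}

int main(void)
{
        unsigned short ent = encode_swapmap(2, true);

        /* prints: count=2 has_cache=1 */
        printf("count=%d has_cache=%d\n", swap_count(ent), swap_has_cache(ent));
        return 0;
}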

Changelog: v2->v3
- modified names/types of functions and their arguments.
Changelog: v1->v2
- fixed swapcache_prepare()'s return code.
- changed swap_duplicate() to be a void function.
- fixed racy case in swapoff().

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/swap.h | 14 ++-
mm/swap_state.c | 5 -
mm/swapfile.c | 216 +++++++++++++++++++++++++++++++++++++--------------
3 files changed, 173 insertions(+), 62 deletions(-)

Index: mmotm-2.6.30-May28/include/linux/swap.h
===================================================================
--- mmotm-2.6.30-May28.orig/include/linux/swap.h
+++ mmotm-2.6.30-May28/include/linux/swap.h
@@ -129,9 +129,10 @@ enum {

#define SWAP_CLUSTER_MAX 32

-#define SWAP_MAP_MAX 0x7fff
-#define SWAP_MAP_BAD 0x8000
-
+#define SWAP_MAP_MAX 0x7ffe
+#define SWAP_MAP_BAD 0x7fff
+#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */
+#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE)
/*
* The in-memory structure used to track swap areas.
*/
@@ -281,7 +282,7 @@ extern long total_swap_pages;
extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(void);
extern swp_entry_t get_swap_page_of_type(int);
-extern int swap_duplicate(swp_entry_t);
+extern void swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
extern void swap_free(swp_entry_t);
@@ -353,9 +354,12 @@ static inline void show_swap_cache_info(
}

#define free_swap_and_cache(swp) is_migration_entry(swp)
-#define swap_duplicate(swp) is_migration_entry(swp)
#define swapcache_prepare(swp) is_migration_entry(swp)

+static inline void swap_duplicate(swp_entry_t swp)
+{
+}
+
static inline void swap_free(swp_entry_t swp)
{
}
Index: mmotm-2.6.30-May28/mm/swapfile.c
===================================================================
--- mmotm-2.6.30-May28.orig/mm/swapfile.c
+++ mmotm-2.6.30-May28/mm/swapfile.c
@@ -53,6 +53,33 @@ static struct swap_info_struct swap_info

static DEFINE_MUTEX(swapon_mutex);

+/* For reference count accounting in swap_map */
+/* enum for swap_map[] handling. internal use only */
+enum {
+ SWAP_MAP = 0, /* ops for reference from swap users */
+ SWAP_CACHE, /* ops for reference from swap cache */
+};
+
+static inline int swap_count(unsigned short ent)
+{
+ return ent & SWAP_COUNT_MASK;
+}
+
+static inline bool swap_has_cache(unsigned short ent)
+{
+ return !!(ent & SWAP_HAS_CACHE);
+}
+
+static inline unsigned short encode_swapmap(int count, bool has_cache)
+{
+ unsigned short ret = count;
+
+ if (has_cache)
+ return SWAP_HAS_CACHE | ret;
+ return ret;
+}
+
+
/*
* We need this because the bdev->unplug_fn can sleep and we cannot
* hold swap_lock while calling the unplug_fn. And swap_lock
@@ -167,7 +194,8 @@ static int wait_for_discard(void *word)
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256

-static inline unsigned long scan_swap_map(struct swap_info_struct *si)
+static inline unsigned long scan_swap_map(struct swap_info_struct *si,
+ int cache)
{
unsigned long offset;
unsigned long scan_base;
@@ -285,7 +313,10 @@ checks:
si->lowest_bit = si->max;
si->highest_bit = 0;
}
- si->swap_map[offset] = 1;
+ if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
+ si->swap_map[offset] = encode_swapmap(0, true);
+ else /* at suspend */
+ si->swap_map[offset] = encode_swapmap(1, false);
si->cluster_next = offset + 1;
si->flags -= SWP_SCANNING;

@@ -401,7 +432,8 @@ swp_entry_t get_swap_page(void)
continue;

swap_list.next = next;
- offset = scan_swap_map(si);
+ /* This is called for allocating swap entry for cache */
+ offset = scan_swap_map(si, SWAP_CACHE);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
@@ -415,6 +447,7 @@ noswap:
return (swp_entry_t) {0};
}

+/* The only caller of this function is now susupend routine */
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si;
@@ -424,7 +457,8 @@ swp_entry_t get_swap_page_of_type(int ty
si = swap_info + type;
if (si->flags & SWP_WRITEOK) {
nr_swap_pages--;
- offset = scan_swap_map(si);
+ /* This is called for allocating swap entry, not cache */
+ offset = scan_swap_map(si, SWAP_MAP);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
@@ -471,25 +505,38 @@ out:
return NULL;
}

-static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
+static int swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t ent, int cache)
{
unsigned long offset = swp_offset(ent);
- int count = p->swap_map[offset];
+ int count = swap_count(p->swap_map[offset]);
+ bool has_cache;

- if (count < SWAP_MAP_MAX) {
- count--;
- p->swap_map[offset] = count;
- if (!count) {
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit)
- p->highest_bit = offset;
- if (p->prio > swap_info[swap_list.next].prio)
- swap_list.next = p - swap_info;
- nr_swap_pages++;
- p->inuse_pages--;
- mem_cgroup_uncharge_swap(ent);
- }
+ has_cache = swap_has_cache(p->swap_map[offset]);
+
+ if (cache == SWAP_MAP) { /* dropping usage count of swap */
+ if (count < SWAP_MAP_MAX) {
+ count--;
+ p->swap_map[offset] = encode_swapmap(count, has_cache);
+ }
+ } else { /* dropping swap cache flag */
+ VM_BUG_ON(!has_cache);
+ p->swap_map[offset] = encode_swapmap(count, false);
+
+ }
+ /* return code. */
+ count = p->swap_map[offset];
+ /* free if no reference */
+ if (!count) {
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit)
+ p->highest_bit = offset;
+ if (p->prio > swap_info[swap_list.next].prio)
+ swap_list.next = p - swap_info;
+ nr_swap_pages++;
+ p->inuse_pages--;
+ mem_cgroup_uncharge_swap(ent);
}
return count;
}
@@ -504,7 +551,7 @@ void swap_free(swp_entry_t entry)

p = swap_info_get(entry);
if (p) {
- swap_entry_free(p, entry);
+ swap_entry_free(p, entry, SWAP_MAP);
spin_unlock(&swap_lock);
}
}
@@ -514,9 +561,16 @@ void swap_free(swp_entry_t entry)
*/
void swapcache_free(swp_entry_t entry, struct page *page)
{
+ struct swap_info_struct *p;
+
if (page)
mem_cgroup_uncharge_swapcache(page, entry);
- return swap_free(entry);
+ p = swap_info_get(entry);
+ if (p) {
+ swap_entry_free(p, entry, SWAP_CACHE);
+ spin_unlock(&swap_lock);
+ }
+ return;
}

/*
@@ -531,8 +585,7 @@ static inline int page_swapcount(struct
entry.val = page_private(page);
p = swap_info_get(entry);
if (p) {
- /* Subtract the 1 for the swap cache itself */
- count = p->swap_map[swp_offset(entry)] - 1;
+ count = swap_count(p->swap_map[swp_offset(entry)]);
spin_unlock(&swap_lock);
}
return count;
@@ -594,7 +647,7 @@ int free_swap_and_cache(swp_entry_t entr

p = swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, entry) == 1) {
+ if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
page = find_get_page(&swapper_space, entry.val);
if (page && !trylock_page(page)) {
page_cache_release(page);
@@ -901,7 +954,7 @@ static unsigned int find_next_to_unuse(s
i = 1;
}
count = si->swap_map[i];
- if (count && count != SWAP_MAP_BAD)
+ if (count && swap_count(count) != SWAP_MAP_BAD)
break;
}
return i;
@@ -1005,13 +1058,13 @@ static int try_to_unuse(unsigned int typ
*/
shmem = 0;
swcount = *swap_map;
- if (swcount > 1) {
+ if (swap_count(swcount)) {
if (start_mm == &init_mm)
shmem = shmem_unuse(entry, page);
else
retval = unuse_mm(start_mm, entry, page);
}
- if (*swap_map > 1) {
+ if (swap_count(*swap_map)) {
int set_start_mm = (*swap_map >= swcount);
struct list_head *p = &start_mm->mmlist;
struct mm_struct *new_start_mm = start_mm;
@@ -1021,7 +1074,7 @@ static int try_to_unuse(unsigned int typ
atomic_inc(&new_start_mm->mm_users);
atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
- while (*swap_map > 1 && !retval && !shmem &&
+ while (swap_count(*swap_map) && !retval && !shmem &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1033,14 +1086,16 @@ static int try_to_unuse(unsigned int typ
cond_resched();

swcount = *swap_map;
- if (swcount <= 1)
+ if (!swap_count(swcount)) /* any usage ? */
;
else if (mm == &init_mm) {
set_start_mm = 1;
shmem = shmem_unuse(entry, page);
} else
retval = unuse_mm(mm, entry, page);
- if (set_start_mm && *swap_map < swcount) {
+
+ if (set_start_mm &&
+ swap_count(*swap_map) < swcount) {
mmput(new_start_mm);
atomic_inc(&mm->mm_users);
new_start_mm = mm;
@@ -1067,21 +1122,25 @@ static int try_to_unuse(unsigned int typ
}

/*
- * How could swap count reach 0x7fff when the maximum
- * pid is 0x7fff, and there's no way to repeat a swap
- * page within an mm (except in shmem, where it's the
- * shared object which takes the reference count)?
- * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
- *
+ * How could swap count reach 0x7ffe ?
+ * There's no way to repeat a swap page within an mm
+ * (except in shmem, where it's the shared object which takes
+ * the reference count)?
+ * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
+ * short is too small....)
* If that's wrong, then we should worry more about
* exit_mmap() and do_munmap() cases described above:
* we might be resetting SWAP_MAP_MAX too early here.
* We know "Undead"s can happen, they're okay, so don't
* report them; but do report if we reset SWAP_MAP_MAX.
*/
- if (*swap_map == SWAP_MAP_MAX) {
+ /* We might release the lock_page() in unuse_mm(). */
+ if (!PageSwapCache(page) || page_private(page) != entry.val)
+ goto retry;
+
+ if (swap_count(*swap_map) == SWAP_MAP_MAX) {
spin_lock(&swap_lock);
- *swap_map = 1;
+ *swap_map = encode_swapmap(0, true);
spin_unlock(&swap_lock);
reset_overflow = 1;
}
@@ -1099,7 +1158,8 @@ static int try_to_unuse(unsigned int typ
* pages would be incorrect if swap supported "shared
* private" pages, but they are handled by tmpfs files.
*/
- if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
+ if (swap_count(*swap_map) &&
+ PageDirty(page) && PageSwapCache(page)) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
@@ -1126,6 +1186,7 @@ static int try_to_unuse(unsigned int typ
* mark page dirty so shrink_page_list will preserve it.
*/
SetPageDirty(page);
+retry:
unlock_page(page);
page_cache_release(page);

@@ -1952,15 +2013,23 @@ void si_swapinfo(struct sysinfo *val)
*
* Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
* "permanent", but will be reclaimed by the next swapoff.
+ * Returns error code in following case.
+ * - success -> 0
+ * - swp_entry is invalid -> EINVAL
+ * - swp_entry is migration entry -> EINVAL
+ * - swap-cache reference is requested but there is already one. -> EEXIST
+ * - swap-cache reference is requested but the entry is not used. -> ENOENT
*/
-int swap_duplicate(swp_entry_t entry)
+static int __swap_duplicate(swp_entry_t entry, bool cache)
{
struct swap_info_struct * p;
unsigned long offset, type;
- int result = 0;
+ int result = -EINVAL;
+ int count;
+ bool has_cache;

if (is_migration_entry(entry))
- return 1;
+ return -EINVAL;

type = swp_type(entry);
if (type >= nr_swapfiles)
@@ -1969,17 +2038,40 @@ int swap_duplicate(swp_entry_t entry)
offset = swp_offset(entry);

spin_lock(&swap_lock);
- if (offset < p->max && p->swap_map[offset]) {
- if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
- p->swap_map[offset]++;
- result = 1;
- } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
+
+ if (unlikely(offset >= p->max))
+ goto unlock_out;
+
+ count = swap_count(p->swap_map[offset]);
+ has_cache = swap_has_cache(p->swap_map[offset]);
+
+ if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
+
+ /* set SWAP_HAS_CACHE if there is no cache and entry is used */
+ if (!has_cache && count) {
+ p->swap_map[offset] = encode_swapmap(count, true);
+ result = 0;
+ } else if (has_cache) /* someone added cache */
+ result = -EEXIST;
+ else if (!count) /* no users */
+ result = -ENOENT;
+
+ } else if (count || has_cache) {
+ if (count < SWAP_MAP_MAX - 1) {
+ p->swap_map[offset] = encode_swapmap(count + 1,
+ has_cache);
+ result = 0;
+ } else if (count <= SWAP_MAP_MAX) {
if (swap_overflow++ < 5)
- printk(KERN_WARNING "swap_dup: swap entry overflow\n");
- p->swap_map[offset] = SWAP_MAP_MAX;
- result = 1;
- }
- }
+ printk(KERN_WARNING
+ "swap_dup: swap entry overflow\n");
+ p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
+ has_cache);
+ result = 0;
+ }
+ } else
+ result = -ENOENT; /* unused swap entry */
+unlock_out:
spin_unlock(&swap_lock);
out:
return result;
@@ -1988,13 +2080,25 @@ bad_file:
printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
}
+/*
+ * increase reference count of swap entry by 1.
+ */
+void swap_duplicate(swp_entry_t entry)
+{
+ __swap_duplicate(entry, SWAP_MAP);
+}

/*
+ * @entry: swap entry for which we allocate swap cache.
+ *
* Called when allocating swap cache for exising swap entry,
+ * This can return error codes. Returns 0 at success.
+ * -EBUSY means there is a swap cache.
+ * Note: return code is different from swap_duplicate().
*/
int swapcache_prepare(swp_entry_t entry)
{
- return swap_duplicate(entry);
+ return __swap_duplicate(entry, SWAP_CACHE);
}


@@ -2035,7 +2139,7 @@ int valid_swaphandles(swp_entry_t entry,
/* Don't read in free or bad pages */
if (!si->swap_map[toff])
break;
- if (si->swap_map[toff] == SWAP_MAP_BAD)
+ if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
}
/* Count contiguous allocated slots below our target */
@@ -2043,7 +2147,7 @@ int valid_swaphandles(swp_entry_t entry,
/* Don't read in free or bad pages */
if (!si->swap_map[toff])
break;
- if (si->swap_map[toff] == SWAP_MAP_BAD)
+ if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
}
spin_unlock(&swap_lock);
Index: mmotm-2.6.30-May28/mm/swap_state.c
===================================================================
--- mmotm-2.6.30-May28.orig/mm/swap_state.c
+++ mmotm-2.6.30-May28/mm/swap_state.c
@@ -292,7 +292,10 @@ struct page *read_swap_cache_async(swp_e
/*
* Swap entry may have been freed since our caller observed it.
*/
- if (!swapcache_prepare(entry))
+ err = swapcache_prepare(entry);
+ if (err == -EEXIST) /* seems racy */
+ continue;
+ if (err) /* swp entry is obsolete ? */
break;

/*

2009-06-02 03:13:43

by Kamezawa Hiroyuki

Subject: [PATCH 3/4] reuse unused swap entry if necessary


This is a replacement for
mm-reuse-unused-swap-entry-if-necessary.patch in mmotm.
A function is renamed and comments are added.

==
From: KAMEZAWA Hiroyuki <[email protected]>

Via swap_map, we can now tell that a swap entry is used only as SwapCache,
without looking up the swap cache.

This gives us a chance to reuse swap-cache-only swap entries in
get_swap_pages().

This patch tries to free swap-cache-only swap entries when swap space is
running short.
Note: we hit the following path when the swap_cluster code cannot find
a free cluster, so vm_swap_full() is not the only condition for allowing
the kernel to reclaim unused swap.
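
As a rough, userspace-style sketch of the decision this patch adds to
scan_swap_map() (the helper bodies below are stand-ins assumed for the
example; the real code also handles swap_lock and the checks/scan labels,
see the diff below):

/* Simplified sketch only -- not the kernel code itself. */
#include <stdio.h>
#include <stdbool.h>

#define SWAP_HAS_CACHE  0x8000

/* Stand-in: pretend swap space is running short. */
static bool vm_swap_full_sketch(void) { return true; }

/* Stand-in: pretend the swap cache page was dropped (returns 1 if the entry was freed). */
static int try_to_reclaim_swap_sketch(unsigned long offset) { (void)offset; return 1; }

/*
 * A slot whose swap_map value is exactly SWAP_HAS_CACHE (count == 0, only
 * the swap cache holds it) may be reclaimed and reused when swap is short.
 */
static bool try_reuse_cache_only_slot(const unsigned short *swap_map, unsigned long offset)
{
        if (!(vm_swap_full_sketch() && swap_map[offset] == SWAP_HAS_CACHE))
                return false;
        return try_to_reclaim_swap_sketch(offset) == 1;
}

int main(void)
{
        unsigned short swap_map[4] = { 0, SWAP_HAS_CACHE, 3, SWAP_HAS_CACHE | 1 };

        /* Only offset 1 qualifies: cache bit set and count == 0. */
        printf("reuse offset 1: %d\n", try_reuse_cache_only_slot(swap_map, 1));
        printf("reuse offset 3: %d\n", try_reuse_cache_only_slot(swap_map, 3));
        return 0;
}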

Acked-by: Balbir Singh <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/swapfile.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 47 insertions(+)

Index: mmotm-2.6.30-May28/mm/swapfile.c
===================================================================
--- mmotm-2.6.30-May28.orig/mm/swapfile.c
+++ mmotm-2.6.30-May28/mm/swapfile.c
@@ -79,6 +79,32 @@ static inline unsigned short encode_swap
return ret;
}

+/* returnes 1 if swap entry is freed */
+static int
+__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+{
+ int type = si - swap_info;
+ swp_entry_t entry = swp_entry(type, offset);
+ struct page *page;
+ int ret = 0;
+
+ page = find_get_page(&swapper_space, entry.val);
+ if (!page)
+ return 0;
+ /*
+ * This function is called from scan_swap_map() and it's called
+ * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
+ * We have to use trylock for avoiding deadlock. This is a special
+ * case and you should use try_to_free_swap() with explicit lock_page()
+ * in usual operations.
+ */
+ if (trylock_page(page)) {
+ ret = try_to_free_swap(page);
+ unlock_page(page);
+ }
+ page_cache_release(page);
+ return ret;
+}

/*
* We need this because the bdev->unplug_fn can sleep and we cannot
@@ -301,6 +327,19 @@ checks:
goto no_page;
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
+
+ /* reuse swap entry of cache-only swap if not busy. */
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ int swap_was_freed;
+ spin_unlock(&swap_lock);
+ swap_was_freed = __try_to_reclaim_swap(si, offset);
+ spin_lock(&swap_lock);
+ /* entry was freed successfully, try to use this again */
+ if (swap_was_freed)
+ goto checks;
+ goto scan; /* check next one */
+ }
+
if (si->swap_map[offset])
goto scan;

@@ -382,6 +421,10 @@ scan:
spin_lock(&swap_lock);
goto checks;
}
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ spin_lock(&swap_lock);
+ goto checks;
+ }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
@@ -393,6 +436,10 @@ scan:
spin_lock(&swap_lock);
goto checks;
}
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ spin_lock(&swap_lock);
+ goto checks;
+ }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;

2009-06-02 03:16:17

by Kamezawa Hiroyuki

Subject: [PATCH 4/4] memcg fix swap accounting

This is a replacement for
memcg-fix-swap-accounting.patch in mmotm.
Adjusted to the style changes in 2/4 and 3/4.

==
From: KAMEZAWA Hiroyuki <[email protected]>

This patch fixes mis-accounting of swap usage in memcg.

In the current implementation, memcg's swap account is uncharged only when
the swap entry is completely freed. But there are several cases where the
swap entry cannot be freed cleanly. To handle them, this patch changes memcg
to uncharge the swap account when the swap entry has no references other
than the cache.

With this, memcg's swap entry accounting can be fully synchronous with the
application's behavior.
This patch also changes memcg's hooks for swap-out: if
delete_from_swap_cache() is called but there is no swap reference left, no
charge to swap is made (the charge for mem+swap stays attached to the page
itself if it is mapped).
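
A minimal sketch of the new hook semantics, with stand-in helpers (the real
logic is in the mm/swapfile.c and mm/memcontrol.c hunks below): the value
returned by swap_entry_free() decides whether this is a real swap-out or
just the drop of an unused swap cache entry, and only the former records a
charge to the swap account.

/* Illustrative sketch only -- stand-in helpers, not kernel code. */
#include <stdio.h>
#include <stdbool.h>

/* Stand-in: remaining reference count after the swap-cache bit is dropped. */
static int swap_entry_free_sketch(void) { return 1; }

/* Stand-in for mem_cgroup_uncharge_swapcache(page, ent, swapout). */
static void uncharge_swapcache_sketch(bool swapout)
{
        if (swapout)
                puts("real swap-out: record the charge in the swap account");
        else
                puts("unused swap cache dropped: no charge to swap at all");
}

int main(void)
{
        int remaining = swap_entry_free_sketch();

        /* swapout == true only if somebody still references the swap entry. */
        uncharge_swapcache_sketch(remaining != 0);
        return 0;
}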

Acked-by: Balbir Singh <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/swap.h | 5 +++--
mm/memcontrol.c | 17 ++++++++++++-----
mm/swapfile.c | 16 ++++++++++++----
3 files changed, 27 insertions(+), 11 deletions(-)

Index: mmotm-2.6.30-May28/include/linux/swap.h
===================================================================
--- mmotm-2.6.30-May28.orig/include/linux/swap.h
+++ mmotm-2.6.30-May28/include/linux/swap.h
@@ -319,10 +319,11 @@ static inline void disable_swap_token(vo
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent);
+extern void
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
#else
static inline void
-mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
{
}
#endif
Index: mmotm-2.6.30-May28/mm/memcontrol.c
===================================================================
--- mmotm-2.6.30-May28.orig/mm/memcontrol.c
+++ mmotm-2.6.30-May28/mm/memcontrol.c
@@ -189,6 +189,7 @@ enum charge_type {
MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
+ MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
NR_CHARGE_TYPE,
};

@@ -1493,6 +1494,7 @@ __mem_cgroup_uncharge_common(struct page

switch (ctype) {
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+ case MEM_CGROUP_CHARGE_TYPE_DROP:
if (page_mapped(page))
goto unlock_out;
break;
@@ -1556,18 +1558,23 @@ void mem_cgroup_uncharge_cache_page(stru
* called after __delete_from_swap_cache() and drop "page" account.
* memcg information is recorded to swap_cgroup of "ent"
*/
-void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+void
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
{
struct mem_cgroup *memcg;
+ int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
+
+ if (!swapout) /* this was a swap cache but the swap is unused ! */
+ ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
+
+ memcg = __mem_cgroup_uncharge_common(page, ctype);

- memcg = __mem_cgroup_uncharge_common(page,
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
/* record memcg information */
- if (do_swap_account && memcg) {
+ if (do_swap_account && swapout && memcg) {
swap_cgroup_record(ent, css_id(&memcg->css));
mem_cgroup_get(memcg);
}
- if (memcg)
+ if (swapout && memcg)
css_put(&memcg->css);
}
#endif
Index: mmotm-2.6.30-May28/mm/swapfile.c
===================================================================
--- mmotm-2.6.30-May28.orig/mm/swapfile.c
+++ mmotm-2.6.30-May28/mm/swapfile.c
@@ -583,8 +583,9 @@ static int swap_entry_free(struct swap_i
swap_list.next = p - swap_info;
nr_swap_pages++;
p->inuse_pages--;
- mem_cgroup_uncharge_swap(ent);
}
+ if (!swap_count(count))
+ mem_cgroup_uncharge_swap(ent);
return count;
}

@@ -609,12 +610,19 @@ void swap_free(swp_entry_t entry)
void swapcache_free(swp_entry_t entry, struct page *page)
{
struct swap_info_struct *p;
+ int ret;

- if (page)
- mem_cgroup_uncharge_swapcache(page, entry);
p = swap_info_get(entry);
if (p) {
- swap_entry_free(p, entry, SWAP_CACHE);
+ ret = swap_entry_free(p, entry, SWAP_CACHE);
+ if (page) {
+ bool swapout;
+ if (ret)
+ swapout = true; /* the end of swap out */
+ else
+ swapout = false; /* no more swap users! */
+ mem_cgroup_uncharge_swapcache(page, entry, swapout);
+ }
spin_unlock(&swap_lock);
}
return;

2009-06-04 06:23:39

by Daisuke Nishimura

Subject: Re: [PATCH 3/4] reuse unused swap entry if necessary

On Tue, 2 Jun 2009 12:12:02 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
>
> This is a replacement for
> mm-reuse-unused-swap-entry-if-necessary.patch in mmotm.
> function is renamed and comments are added.
>
> ==
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Now, we can know the swap is just used as SwapCache via swap_map,
> without looking up swap cache.
>
> Then, we have a chance to reuse swap-cache-only swap entries in
> get_swap_pages().
>
> This patch tries to free swap-cache-only swap entries if swap is
> not enough.
> Note: We hit following path when swap_cluster code cannot find
> a free cluster. Then, vm_swap_full() is not only condition to allow
> the kernel to reclaim unused swap.
>
> Acked-by: Balbir Singh <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>

I've confirmed that the growth of swap and swapcache usage stops at some
threshold in my test; before this patch, some programs in the same test had
been OOM-killed after a long time because of a shortage of swap space.

This has already been merged into mm, but anyway:

Tested-by: Daisuke Nishimura <[email protected]>

Thanks,
Daisuke Nishimura.

> ---
> mm/swapfile.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 47 insertions(+)
>
> Index: mmotm-2.6.30-May28/mm/swapfile.c
> ===================================================================
> --- mmotm-2.6.30-May28.orig/mm/swapfile.c
> +++ mmotm-2.6.30-May28/mm/swapfile.c
> @@ -79,6 +79,32 @@ static inline unsigned short encode_swap
> return ret;
> }
>
> +/* returnes 1 if swap entry is freed */
> +static int
> +__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
> +{
> + int type = si - swap_info;
> + swp_entry_t entry = swp_entry(type, offset);
> + struct page *page;
> + int ret = 0;
> +
> + page = find_get_page(&swapper_space, entry.val);
> + if (!page)
> + return 0;
> + /*
> + * This function is called from scan_swap_map() and it's called
> + * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
> + * We have to use trylock for avoiding deadlock. This is a special
> + * case and you should use try_to_free_swap() with explicit lock_page()
> + * in usual operations.
> + */
> + if (trylock_page(page)) {
> + ret = try_to_free_swap(page);
> + unlock_page(page);
> + }
> + page_cache_release(page);
> + return ret;
> +}
>
> /*
> * We need this because the bdev->unplug_fn can sleep and we cannot
> @@ -301,6 +327,19 @@ checks:
> goto no_page;
> if (offset > si->highest_bit)
> scan_base = offset = si->lowest_bit;
> +
> + /* reuse swap entry of cache-only swap if not busy. */
> + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
> + int swap_was_freed;
> + spin_unlock(&swap_lock);
> + swap_was_freed = __try_to_reclaim_swap(si, offset);
> + spin_lock(&swap_lock);
> + /* entry was freed successfully, try to use this again */
> + if (swap_was_freed)
> + goto checks;
> + goto scan; /* check next one */
> + }
> +
> if (si->swap_map[offset])
> goto scan;
>
> @@ -382,6 +421,10 @@ scan:
> spin_lock(&swap_lock);
> goto checks;
> }
> + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
> + spin_lock(&swap_lock);
> + goto checks;
> + }
> if (unlikely(--latency_ration < 0)) {
> cond_resched();
> latency_ration = LATENCY_LIMIT;
> @@ -393,6 +436,10 @@ scan:
> spin_lock(&swap_lock);
> goto checks;
> }
> + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
> + spin_lock(&swap_lock);
> + goto checks;
> + }
> if (unlikely(--latency_ration < 0)) {
> cond_resched();
> latency_ration = LATENCY_LIMIT;
>

2009-06-04 07:01:08

by Kamezawa Hiroyuki

Subject: Re: [PATCH 3/4] reuse unused swap entry if necessary

On Thu, 4 Jun 2009 14:57:33 +0900
Daisuke Nishimura <[email protected]> wrote:

> On Tue, 2 Jun 2009 12:12:02 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> >
> > This is a replacement for
> > mm-reuse-unused-swap-entry-if-necessary.patch in mmotm.
> > function is renamed and comments are added.
> >
> > ==
> > From: KAMEZAWA Hiroyuki <[email protected]>
> >
> > Now, we can know the swap is just used as SwapCache via swap_map,
> > without looking up swap cache.
> >
> > Then, we have a chance to reuse swap-cache-only swap entries in
> > get_swap_pages().
> >
> > This patch tries to free swap-cache-only swap entries if swap is
> > not enough.
> > Note: We hit following path when swap_cluster code cannot find
> > a free cluster. Then, vm_swap_full() is not only condition to allow
> > the kernel to reclaim unused swap.
> >
> > Acked-by: Balbir Singh <[email protected]>
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
> I've confirmed that usage increase of swap and swapcache stopped
> at some threshold in my test, in which , before this patch, some programs
> had been oom-killed after a long time because of shortage of swap space.
>
> This has been merged to mm already though:
>
> Tested-by: Daisuke Nishimura <[email protected]>
>
Wonderful :) Thank you for your long efforts and patience.

Regards,
-Kame


> Thanks,
> Daisuke Nishimura.
>
> > ---
> > mm/swapfile.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 47 insertions(+)
> >
> > Index: mmotm-2.6.30-May28/mm/swapfile.c
> > ===================================================================
> > --- mmotm-2.6.30-May28.orig/mm/swapfile.c
> > +++ mmotm-2.6.30-May28/mm/swapfile.c
> > @@ -79,6 +79,32 @@ static inline unsigned short encode_swap
> > return ret;
> > }
> >
> > +/* returnes 1 if swap entry is freed */
> > +static int
> > +__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
> > +{
> > + int type = si - swap_info;
> > + swp_entry_t entry = swp_entry(type, offset);
> > + struct page *page;
> > + int ret = 0;
> > +
> > + page = find_get_page(&swapper_space, entry.val);
> > + if (!page)
> > + return 0;
> > + /*
> > + * This function is called from scan_swap_map() and it's called
> > + * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
> > + * We have to use trylock for avoiding deadlock. This is a special
> > + * case and you should use try_to_free_swap() with explicit lock_page()
> > + * in usual operations.
> > + */
> > + if (trylock_page(page)) {
> > + ret = try_to_free_swap(page);
> > + unlock_page(page);
> > + }
> > + page_cache_release(page);
> > + return ret;
> > +}
> >
> > /*
> > * We need this because the bdev->unplug_fn can sleep and we cannot
> > @@ -301,6 +327,19 @@ checks:
> > goto no_page;
> > if (offset > si->highest_bit)
> > scan_base = offset = si->lowest_bit;
> > +
> > + /* reuse swap entry of cache-only swap if not busy. */
> > + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
> > + int swap_was_freed;
> > + spin_unlock(&swap_lock);
> > + swap_was_freed = __try_to_reclaim_swap(si, offset);
> > + spin_lock(&swap_lock);
> > + /* entry was freed successfully, try to use this again */
> > + if (swap_was_freed)
> > + goto checks;
> > + goto scan; /* check next one */
> > + }
> > +
> > if (si->swap_map[offset])
> > goto scan;
> >
> > @@ -382,6 +421,10 @@ scan:
> > spin_lock(&swap_lock);
> > goto checks;
> > }
> > + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
> > + spin_lock(&swap_lock);
> > + goto checks;
> > + }
> > if (unlikely(--latency_ration < 0)) {
> > cond_resched();
> > latency_ration = LATENCY_LIMIT;
> > @@ -393,6 +436,10 @@ scan:
> > spin_lock(&swap_lock);
> > goto checks;
> > }
> > + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
> > + spin_lock(&swap_lock);
> > + goto checks;
> > + }
> > if (unlikely(--latency_ration < 0)) {
> > cond_resched();
> > latency_ration = LATENCY_LIMIT;
> >
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>