Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1757062Ab1E0Tt2 (ORCPT ); Fri, 27 May 2011 15:49:28 -0400 Received: from rcsinet10.oracle.com ([148.87.113.121]:20900 "EHLO rcsinet10.oracle.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756936Ab1E0TtY (ORCPT ); Fri, 27 May 2011 15:49:24 -0400 Date: Fri, 27 May 2011 12:48:45 -0700 From: Dan Magenheimer To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, jeremy@goop.org, hughd@google.com, ngupta@vflare.org, konrad.wilk@oracle.com, JBeulich@novell.com, kurt.hackel@oracle.com, npiggin@suse.de, akpm@linux-foundation.org, riel@redhat.com, hannes@cmpxchg.org, matthew@wil.cx, chris.mason@oracle.com, dan.magenheimer@oracle.com Subject: [PATCH V4 2/4] mm: frontswap: core code Message-ID: <20110527194845.GA27141@ca-server1.us.oracle.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.5.16 (2007-06-11) X-Source-IP: acsinet21.oracle.com [141.146.126.237] X-Auth-Type: Internal IP X-CT-RefId: str=0001.0A090202.4DE00026.0242:SCFMA922111,ss=1,fgs=0 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 12346 Lines: 443 [PATCH V4 2/4] mm: frontswap: core code Core frontswap interface between swap subsystem hooks and frontswap backend via frontswap_ops. Credits: Frontswap_ops design derived from Jeremy Fitzhardinge design for tmem; sysfs code modelled after mm/ksm.c Signed-off-by: Dan Magenheimer Diffstat: include/linux/frontswap.h | 86 +++++ mm/frontswap.c | 331 +++++++++++++++++++++ 2 files changed, 417 insertions(+) --- linux-2.6.39/include/linux/frontswap.h 1969-12-31 17:00:00.000000000 -0700 +++ linux-2.6.39-frontswap/include/linux/frontswap.h 2011-05-26 15:37:25.198065401 -0600 @@ -0,0 +1,86 @@ +#ifndef _LINUX_FRONTSWAP_H +#define _LINUX_FRONTSWAP_H + +#include +#include + +struct frontswap_ops { + void (*init)(unsigned); + int (*put_page)(unsigned, pgoff_t, struct page *); + int (*get_page)(unsigned, pgoff_t, struct page *); + void (*flush_page)(unsigned, pgoff_t); + void (*flush_area)(unsigned); +}; + +extern int frontswap_enabled; +extern struct frontswap_ops + frontswap_register_ops(struct frontswap_ops *ops); +extern void frontswap_shrink(unsigned long); +extern unsigned long frontswap_curr_pages(void); + +extern void frontswap_init(unsigned type); +extern int __frontswap_put_page(struct page *page); +extern int __frontswap_get_page(struct page *page); +extern void __frontswap_flush_page(unsigned, pgoff_t); +extern void __frontswap_flush_area(unsigned); + +#ifndef CONFIG_FRONTSWAP +/* all inline routines become no-ops and all externs are ignored */ +#define frontswap_enabled (0) +#endif + +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset) +{ + int ret = 0; + + if (frontswap_enabled && sis->frontswap_map) + ret = test_bit(offset % BITS_PER_LONG, + &sis->frontswap_map[offset/BITS_PER_LONG]); + return ret; +} + +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) +{ + if (frontswap_enabled && sis->frontswap_map) + set_bit(offset % BITS_PER_LONG, + &sis->frontswap_map[offset/BITS_PER_LONG]); +} + +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +{ + if (frontswap_enabled && sis->frontswap_map) + clear_bit(offset % BITS_PER_LONG, + &sis->frontswap_map[offset/BITS_PER_LONG]); +} + +static inline int frontswap_put_page(struct page *page) +{ + int ret = -1; + + if (frontswap_enabled) + ret = __frontswap_put_page(page); + return ret; +} + +static inline int frontswap_get_page(struct page *page) +{ + int ret = -1; + + if (frontswap_enabled) + ret = __frontswap_get_page(page); + return ret; +} + +static inline void frontswap_flush_page(unsigned type, pgoff_t offset) +{ + if (frontswap_enabled) + __frontswap_flush_page(type, offset); +} + +static inline void frontswap_flush_area(unsigned type) +{ + if (frontswap_enabled) + __frontswap_flush_area(type); +} + +#endif /* _LINUX_FRONTSWAP_H */ --- linux-2.6.39/mm/frontswap.c 1969-12-31 17:00:00.000000000 -0700 +++ linux-2.6.39-frontswap/mm/frontswap.c 2011-05-26 15:37:25.254816281 -0600 @@ -0,0 +1,331 @@ +/* + * Frontswap frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of frontswap. See + * Documentation/vm/frontswap.txt for more information. + * + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * frontswap_ops is set by frontswap_register_ops to contain the pointers + * to the frontswap "backend" implementation functions. + */ +static struct frontswap_ops frontswap_ops; + +/* + * This global enablement flag reduces overhead on systems where frontswap_ops + * has not been registered, so is preferred to the slower alternative: a + * function call that checks a non-global. + */ +int frontswap_enabled; +EXPORT_SYMBOL(frontswap_enabled); + +/* useful stats available in /sys/kernel/mm/frontswap */ +static unsigned long frontswap_gets; +static unsigned long frontswap_succ_puts; +static unsigned long frontswap_failed_puts; +static unsigned long frontswap_flushes; + +/* + * register operations for frontswap, returning previous thus allowing + * detection of multiple backends and possible nesting + */ +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) +{ + struct frontswap_ops old = frontswap_ops; + + frontswap_ops = *ops; + frontswap_enabled = 1; + return old; +} +EXPORT_SYMBOL(frontswap_register_ops); + +/* Called when a swap device is swapon'd */ +void frontswap_init(unsigned type) +{ + if (frontswap_enabled) + (*frontswap_ops.init)(type); +} +EXPORT_SYMBOL(frontswap_init); + +/* + * "Put" data from a page to frontswap and associate it with the page's + * swaptype and offset. Page must be locked and in the swap cache. + * If frontswap already contains a page with matching swaptype and + * offset, the frontswap implmentation may either overwrite the data + * and return success or flush the page from frontswap and return failure + */ +int __frontswap_put_page(struct page *page) +{ + int ret = -1, dup = 0; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + if (frontswap_test(sis, offset)) + dup = 1; + ret = (*frontswap_ops.put_page)(type, offset, page); + if (ret == 0) { + frontswap_set(sis, offset); + frontswap_succ_puts++; + if (!dup) + sis->frontswap_pages++; + } else if (dup) { + /* + failed dup always results in automatic flush of + the (older) page from frontswap + */ + frontswap_clear(sis, offset); + sis->frontswap_pages--; + frontswap_failed_puts++; + } else + frontswap_failed_puts++; + return ret; +} + +/* + * "Get" data from frontswap associated with swaptype and offset that were + * specified when the data was put to frontswap and use it to fill the + * specified page with data. Page must be locked and in the swap cache + */ +int __frontswap_get_page(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + if (frontswap_test(sis, offset)) + ret = (*frontswap_ops.get_page)(type, offset, page); + if (ret == 0) + frontswap_gets++; + return ret; +} + +/* + * Flush any data from frontswap associated with the specified swaptype + * and offset so that a subsequent "get" will fail. + */ +void __frontswap_flush_page(unsigned type, pgoff_t offset) +{ + struct swap_info_struct *sis = swap_info[type]; + + if (frontswap_test(sis, offset)) { + (*frontswap_ops.flush_page)(type, offset); + sis->frontswap_pages--; + frontswap_clear(sis, offset); + frontswap_flushes++; + } +} + +/* + * Flush all data from frontswap associated with all offsets for the + * specified swaptype. + */ +void __frontswap_flush_area(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + (*frontswap_ops.flush_area)(type); + sis->frontswap_pages = 0; + memset(sis->frontswap_map, 0, sis->max / sizeof(long)); +} + +/* + * Frontswap, like a true swap device, may unnecessarily retain pages + * under certain circumstances; "shrink" frontswap is essentially a + * "partial swapoff" and works by calling try_to_unuse to attempt to + * unuse enough frontswap pages to attempt to -- subject to memory + * constraints -- reduce the number of pages in frontswap + */ +void frontswap_shrink(unsigned long target_pages) +{ + int wrapped = 0; + bool locked = false; + + for (wrapped = 0; wrapped <= 3; wrapped++) { + + struct swap_info_struct *si = NULL; + unsigned long total_pages = 0, total_pages_to_unuse; + unsigned long pages = 0, unuse_pages = 0; + int type; + + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change + * so restart scan from swap_list.head each time + */ + spin_lock(&swap_lock); + locked = true; + total_pages = 0; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + total_pages += si->frontswap_pages; + } + if (total_pages <= target_pages) + goto out; + total_pages_to_unuse = total_pages - target_pages; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + if (total_pages_to_unuse < si->frontswap_pages) + pages = unuse_pages = total_pages_to_unuse; + else { + pages = si->frontswap_pages; + unuse_pages = 0; /* unuse all */ + } + if (security_vm_enough_memory_kern(pages)) + continue; + vm_unacct_memory(pages); + break; + } + if (type < 0) + goto out; + locked = false; + spin_unlock(&swap_lock); + try_to_unuse(type, true, unuse_pages); + } + +out: + if (locked) + spin_unlock(&swap_lock); + return; +} +EXPORT_SYMBOL(frontswap_shrink); + +/* + * count and return the number of pages frontswap pages across all + * swap devices. This is exported so that a kernel module can + * determine current usage without reading sysfs. + */ +unsigned long frontswap_curr_pages(void) +{ + int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + spin_lock(&swap_lock); + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + totalpages += si->frontswap_pages; + } + spin_unlock(&swap_lock); + return totalpages; +} +EXPORT_SYMBOL(frontswap_curr_pages); + +#ifdef CONFIG_SYSFS + +/* see Documentation/ABI/xxx/sysfs-kernel-mm-frontswap */ + +#define FRONTSWAP_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define FRONTSWAP_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t curr_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", frontswap_curr_pages()); +} + +static ssize_t curr_pages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long target_pages; + int err; + + err = strict_strtoul(buf, 10, &target_pages); + if (err) + return -EINVAL; + + frontswap_shrink(target_pages); + + return count; +} +FRONTSWAP_ATTR(curr_pages); + +static ssize_t succ_puts_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", frontswap_succ_puts); +} +FRONTSWAP_ATTR_RO(succ_puts); + +static ssize_t failed_puts_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", frontswap_failed_puts); +} +FRONTSWAP_ATTR_RO(failed_puts); + +static ssize_t gets_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", frontswap_gets); +} +FRONTSWAP_ATTR_RO(gets); + +static ssize_t flushes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", frontswap_flushes); +} +FRONTSWAP_ATTR_RO(flushes); + +static struct attribute *frontswap_attrs[] = { + &curr_pages_attr.attr, + &succ_puts_attr.attr, + &failed_puts_attr.attr, + &gets_attr.attr, + &flushes_attr.attr, + NULL, +}; + +static struct attribute_group frontswap_attr_group = { + .attrs = frontswap_attrs, + .name = "frontswap", +}; + +#endif /* CONFIG_SYSFS */ + +static int __init init_frontswap(void) +{ +#ifdef CONFIG_SYSFS + int err; + + err = sysfs_create_group(mm_kobj, &frontswap_attr_group); +#endif /* CONFIG_SYSFS */ + return 0; +} + +static void __exit exit_frontswap(void) +{ + frontswap_shrink(0UL); +} + +module_init(init_frontswap); +module_exit(exit_frontswap); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/