Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1757480Ab1FGRyb (ORCPT ); Tue, 7 Jun 2011 13:54:31 -0400 Received: from rcsinet10.oracle.com ([148.87.113.121]:36891 "EHLO rcsinet10.oracle.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756843Ab1FGRy3 (ORCPT ); Tue, 7 Jun 2011 13:54:29 -0400 Date: Tue, 7 Jun 2011 13:53:20 -0400 From: Konrad Rzeszutek Wilk To: Dan Magenheimer Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org, jeremy@goop.org, hughd@google.com, ngupta@vflare.org, JBeulich@novell.com, kurt.hackel@oracle.com, npiggin@suse.de, akpm@linux-foundation.org, riel@redhat.com, hannes@cmpxchg.org, matthew@wil.cx, chris.mason@oracle.com Subject: Re: [PATCH V4 2/4] mm: frontswap: core code Message-ID: <20110607175320.GA32207@dumpdata.com> References: <20110527194845.GA27141@ca-server1.us.oracle.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20110527194845.GA27141@ca-server1.us.oracle.com> User-Agent: Mutt/1.5.21 (2010-09-15) X-Source-IP: acsinet22.oracle.com [141.146.126.238] X-Auth-Type: Internal IP X-CT-RefId: str=0001.0A090207.4DEE65A6.01A7:SCFMA922111,ss=1,fgs=0 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 14028 Lines: 476 On Fri, May 27, 2011 at 12:48:45PM -0700, Dan Magenheimer wrote: > [PATCH V4 2/4] mm: frontswap: core code > > Core frontswap interface between swap subsystem hooks and > frontswap backend via frontswap_ops. > > Credits: Frontswap_ops design derived from Jeremy Fitzhardinge > design for tmem; sysfs code modelled after mm/ksm.c > > Signed-off-by: Dan Magenheimer > > Diffstat: > include/linux/frontswap.h | 86 +++++ > mm/frontswap.c | 331 +++++++++++++++++++++ > 2 files changed, 417 insertions(+) > > --- linux-2.6.39/include/linux/frontswap.h 1969-12-31 17:00:00.000000000 -0700 > +++ linux-2.6.39-frontswap/include/linux/frontswap.h 2011-05-26 15:37:25.198065401 -0600 > @@ -0,0 +1,86 @@ > +#ifndef _LINUX_FRONTSWAP_H > +#define _LINUX_FRONTSWAP_H > + > +#include > +#include > + > +struct frontswap_ops { > + void (*init)(unsigned); > + int (*put_page)(unsigned, pgoff_t, struct page *); > + int (*get_page)(unsigned, pgoff_t, struct page *); > + void (*flush_page)(unsigned, pgoff_t); > + void (*flush_area)(unsigned); > +}; > + > +extern int frontswap_enabled; > +extern struct frontswap_ops > + frontswap_register_ops(struct frontswap_ops *ops); > +extern void frontswap_shrink(unsigned long); > +extern unsigned long frontswap_curr_pages(void); > + > +extern void frontswap_init(unsigned type); > +extern int __frontswap_put_page(struct page *page); > +extern int __frontswap_get_page(struct page *page); > +extern void __frontswap_flush_page(unsigned, pgoff_t); > +extern void __frontswap_flush_area(unsigned); > + > +#ifndef CONFIG_FRONTSWAP > +/* all inline routines become no-ops and all externs are ignored */ > +#define frontswap_enabled (0) > +#endif > + > +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset) > +{ > + int ret = 0; > + > + if (frontswap_enabled && sis->frontswap_map) > + ret = test_bit(offset % BITS_PER_LONG, > + &sis->frontswap_map[offset/BITS_PER_LONG]); > + return ret; > +} > + > +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) > +{ > + if (frontswap_enabled && sis->frontswap_map) > + set_bit(offset % BITS_PER_LONG, > + &sis->frontswap_map[offset/BITS_PER_LONG]); > +} > + > +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) > +{ > + if (frontswap_enabled && sis->frontswap_map) > + clear_bit(offset % BITS_PER_LONG, > + &sis->frontswap_map[offset/BITS_PER_LONG]); > +} > + > +static inline int frontswap_put_page(struct page *page) > +{ > + int ret = -1; > + > + if (frontswap_enabled) > + ret = __frontswap_put_page(page); > + return ret; > +} > + > +static inline int frontswap_get_page(struct page *page) > +{ > + int ret = -1; > + > + if (frontswap_enabled) > + ret = __frontswap_get_page(page); > + return ret; > +} > + > +static inline void frontswap_flush_page(unsigned type, pgoff_t offset) > +{ > + if (frontswap_enabled) > + __frontswap_flush_page(type, offset); > +} > + > +static inline void frontswap_flush_area(unsigned type) > +{ > + if (frontswap_enabled) > + __frontswap_flush_area(type); > +} > + > +#endif /* _LINUX_FRONTSWAP_H */ > --- linux-2.6.39/mm/frontswap.c 1969-12-31 17:00:00.000000000 -0700 > +++ linux-2.6.39-frontswap/mm/frontswap.c 2011-05-26 15:37:25.254816281 -0600 > @@ -0,0 +1,331 @@ > +/* > + * Frontswap frontend > + * > + * This code provides the generic "frontend" layer to call a matching > + * "backend" driver implementation of frontswap. See > + * Documentation/vm/frontswap.txt for more information. > + * > + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. > + * Author: Dan Magenheimer > + * > + * This work is licensed under the terms of the GNU GPL, version 2. > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +/* > + * frontswap_ops is set by frontswap_register_ops to contain the pointers > + * to the frontswap "backend" implementation functions. > + */ > +static struct frontswap_ops frontswap_ops; > + > +/* > + * This global enablement flag reduces overhead on systems where frontswap_ops > + * has not been registered, so is preferred to the slower alternative: a > + * function call that checks a non-global. > + */ > +int frontswap_enabled; > +EXPORT_SYMBOL(frontswap_enabled); > + > +/* useful stats available in /sys/kernel/mm/frontswap */ > +static unsigned long frontswap_gets; > +static unsigned long frontswap_succ_puts; > +static unsigned long frontswap_failed_puts; > +static unsigned long frontswap_flushes; > + > +/* > + * register operations for frontswap, returning previous thus allowing > + * detection of multiple backends and possible nesting > + */ > +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) > +{ > + struct frontswap_ops old = frontswap_ops; > + > + frontswap_ops = *ops; > + frontswap_enabled = 1; > + return old; > +} > +EXPORT_SYMBOL(frontswap_register_ops); > + > +/* Called when a swap device is swapon'd */ > +void frontswap_init(unsigned type) > +{ > + if (frontswap_enabled) > + (*frontswap_ops.init)(type); > +} > +EXPORT_SYMBOL(frontswap_init); > + > +/* > + * "Put" data from a page to frontswap and associate it with the page's > + * swaptype and offset. Page must be locked and in the swap cache. > + * If frontswap already contains a page with matching swaptype and > + * offset, the frontswap implmentation may either overwrite the data > + * and return success or flush the page from frontswap and return failure > + */ > +int __frontswap_put_page(struct page *page) > +{ > + int ret = -1, dup = 0; > + swp_entry_t entry = { .val = page_private(page), }; > + int type = swp_type(entry); > + struct swap_info_struct *sis = swap_info[type]; There is no danger that swap_info[type] returns NULL? Should you get the get the swap_lock lock? Or is that not neccessary b/c the caller already holds the lock? If so it might be better to name the function __frontswap_put_page_locked(..)? > + pgoff_t offset = swp_offset(entry); > + > + BUG_ON(!PageLocked(page)); > + if (frontswap_test(sis, offset)) > + dup = 1; > + ret = (*frontswap_ops.put_page)(type, offset, page); > + if (ret == 0) { > + frontswap_set(sis, offset); > + frontswap_succ_puts++; > + if (!dup) > + sis->frontswap_pages++; > + } else if (dup) { > + /* > + failed dup always results in automatic flush of > + the (older) page from frontswap > + */ > + frontswap_clear(sis, offset); > + sis->frontswap_pages--; > + frontswap_failed_puts++; > + } else > + frontswap_failed_puts++; > + return ret; > +} > + > +/* > + * "Get" data from frontswap associated with swaptype and offset that were > + * specified when the data was put to frontswap and use it to fill the > + * specified page with data. Page must be locked and in the swap cache > + */ > +int __frontswap_get_page(struct page *page) > +{ > + int ret = -1; > + swp_entry_t entry = { .val = page_private(page), }; > + int type = swp_type(entry); > + struct swap_info_struct *sis = swap_info[type]; > + pgoff_t offset = swp_offset(entry); > + > + BUG_ON(!PageLocked(page)); Looks like swap_readpage already does this? > + if (frontswap_test(sis, offset)) So no checking if sis == NULL b/c the caller already does that? You might want to stick a comment here. > + ret = (*frontswap_ops.get_page)(type, offset, page); > + if (ret == 0) > + frontswap_gets++; > + return ret; > +} > + > +/* > + * Flush any data from frontswap associated with the specified swaptype > + * and offset so that a subsequent "get" will fail. > + */ > +void __frontswap_flush_page(unsigned type, pgoff_t offset) > +{ > + struct swap_info_struct *sis = swap_info[type]; > + > + if (frontswap_test(sis, offset)) { > + (*frontswap_ops.flush_page)(type, offset); > + sis->frontswap_pages--; > + frontswap_clear(sis, offset); > + frontswap_flushes++; > + } > +} > + > +/* > + * Flush all data from frontswap associated with all offsets for the > + * specified swaptype. > + */ > +void __frontswap_flush_area(unsigned type) > +{ > + struct swap_info_struct *sis = swap_info[type]; > + > + (*frontswap_ops.flush_area)(type); > + sis->frontswap_pages = 0; > + memset(sis->frontswap_map, 0, sis->max / sizeof(long)); > +} > + > +/* > + * Frontswap, like a true swap device, may unnecessarily retain pages > + * under certain circumstances; "shrink" frontswap is essentially a > + * "partial swapoff" and works by calling try_to_unuse to attempt to > + * unuse enough frontswap pages to attempt to -- subject to memory > + * constraints -- reduce the number of pages in frontswap > + */ > +void frontswap_shrink(unsigned long target_pages) > +{ > + int wrapped = 0; > + bool locked = false; > + > + for (wrapped = 0; wrapped <= 3; wrapped++) { 3? Is this value special? You might want to provide a comment of why '3' is _the_ choice. > + > + struct swap_info_struct *si = NULL; > + unsigned long total_pages = 0, total_pages_to_unuse; > + unsigned long pages = 0, unuse_pages = 0; > + int type; > + > + /* > + * we don't want to hold swap_lock while doing a very > + * lengthy try_to_unuse, but swap_list may change > + * so restart scan from swap_list.head each time > + */ > + spin_lock(&swap_lock); > + locked = true; > + total_pages = 0; > + for (type = swap_list.head; type >= 0; type = si->next) { > + si = swap_info[type]; > + total_pages += si->frontswap_pages; > + } > + if (total_pages <= target_pages) > + goto out; > + total_pages_to_unuse = total_pages - target_pages; > + for (type = swap_list.head; type >= 0; type = si->next) { > + si = swap_info[type]; > + if (total_pages_to_unuse < si->frontswap_pages) > + pages = unuse_pages = total_pages_to_unuse; > + else { > + pages = si->frontswap_pages; unused_pages sounds better than 'pages'. > + unuse_pages = 0; /* unuse all */ pages_unuse? pages_to_unuse? Yeah, that sounds better. > + } > + if (security_vm_enough_memory_kern(pages)) > + continue; > + vm_unacct_memory(pages); > + break; > + } > + if (type < 0) > + goto out; > + locked = false; > + spin_unlock(&swap_lock); > + try_to_unuse(type, true, unuse_pages); > + } > + > +out: > + if (locked) > + spin_unlock(&swap_lock); > + return; > +} > +EXPORT_SYMBOL(frontswap_shrink); > + > +/* > + * count and return the number of pages frontswap pages across all > + * swap devices. This is exported so that a kernel module can > + * determine current usage without reading sysfs. > + */ > +unsigned long frontswap_curr_pages(void) > +{ > + int type; > + unsigned long totalpages = 0; > + struct swap_info_struct *si = NULL; > + > + spin_lock(&swap_lock); > + for (type = swap_list.head; type >= 0; type = si->next) { > + si = swap_info[type]; > + totalpages += si->frontswap_pages; Uh, you don't need to check 'si' for NULL? > + } > + spin_unlock(&swap_lock); > + return totalpages; > +} > +EXPORT_SYMBOL(frontswap_curr_pages); > + > +#ifdef CONFIG_SYSFS > + > +/* see Documentation/ABI/xxx/sysfs-kernel-mm-frontswap */ > + > +#define FRONTSWAP_ATTR_RO(_name) \ > + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) > +#define FRONTSWAP_ATTR(_name) \ > + static struct kobj_attribute _name##_attr = \ > + __ATTR(_name, 0644, _name##_show, _name##_store) > + > +static ssize_t curr_pages_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + return sprintf(buf, "%lu\n", frontswap_curr_pages()); > +} > + > +static ssize_t curr_pages_store(struct kobject *kobj, > + struct kobj_attribute *attr, > + const char *buf, size_t count) > +{ > + unsigned long target_pages; > + int err; > + > + err = strict_strtoul(buf, 10, &target_pages); > + if (err) > + return -EINVAL; How about just "if (strict..) return -EINVAL" ) and ditch the 'int err'? > + > + frontswap_shrink(target_pages); > + > + return count; > +} > +FRONTSWAP_ATTR(curr_pages); > + > +static ssize_t succ_puts_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + return sprintf(buf, "%lu\n", frontswap_succ_puts); > +} > +FRONTSWAP_ATTR_RO(succ_puts); > + > +static ssize_t failed_puts_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + return sprintf(buf, "%lu\n", frontswap_failed_puts); > +} > +FRONTSWAP_ATTR_RO(failed_puts); > + > +static ssize_t gets_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + return sprintf(buf, "%lu\n", frontswap_gets); > +} > +FRONTSWAP_ATTR_RO(gets); > + > +static ssize_t flushes_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + return sprintf(buf, "%lu\n", frontswap_flushes); > +} > +FRONTSWAP_ATTR_RO(flushes); > + > +static struct attribute *frontswap_attrs[] = { > + &curr_pages_attr.attr, > + &succ_puts_attr.attr, > + &failed_puts_attr.attr, > + &gets_attr.attr, > + &flushes_attr.attr, > + NULL, > +}; > + > +static struct attribute_group frontswap_attr_group = { > + .attrs = frontswap_attrs, > + .name = "frontswap", > +}; > + > +#endif /* CONFIG_SYSFS */ > + > +static int __init init_frontswap(void) > +{ > +#ifdef CONFIG_SYSFS > + int err; > + > + err = sysfs_create_group(mm_kobj, &frontswap_attr_group); So shouldn't you return 'err'? > +#endif /* CONFIG_SYSFS */ > + return 0; > +} > + > +static void __exit exit_frontswap(void) > +{ > + frontswap_shrink(0UL); > +} > + > +module_init(init_frontswap); > +module_exit(exit_frontswap); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/