Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1760730AbYAJPFy (ORCPT ); Thu, 10 Jan 2008 10:05:54 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1759254AbYAJPBG (ORCPT ); Thu, 10 Jan 2008 10:01:06 -0500 Received: from filer.fsl.cs.sunysb.edu ([130.245.126.2]:57054 "EHLO filer.fsl.cs.sunysb.edu" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1758803AbYAJPAy (ORCPT ); Thu, 10 Jan 2008 10:00:54 -0500 From: Erez Zadok To: torvalds@linux-foundation.org, akpm@linux-foundation.org, hch@infradead.org, viro@ftp.linux.org.uk Cc: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org, Erez Zadok Subject: [PATCH 18/29] Unionfs: address-space operations Date: Thu, 10 Jan 2008 09:59:37 -0500 Message-Id: <1199977198166-git-send-email-ezk@cs.sunysb.edu> X-Mailer: git-send-email 1.5.2.2 X-MailKey: Erez_Zadok In-Reply-To: <11999771882152-git-send-email-ezk@cs.sunysb.edu> References: <11999771882152-git-send-email-ezk@cs.sunysb.edu> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 11035 Lines: 365 Includes writepage, writepages, readpage, prepare_write, and commit_write. Signed-off-by: Erez Zadok --- fs/unionfs/mmap.c | 343 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 343 insertions(+), 0 deletions(-) create mode 100644 fs/unionfs/mmap.c diff --git a/fs/unionfs/mmap.c b/fs/unionfs/mmap.c new file mode 100644 index 0000000..ad770ac --- /dev/null +++ b/fs/unionfs/mmap.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2003-2007 Erez Zadok + * Copyright (c) 2003-2006 Charles P. Wright + * Copyright (c) 2005-2007 Josef 'Jeff' Sipek + * Copyright (c) 2005-2006 Junjiro Okajima + * Copyright (c) 2006 Shaya Potter + * Copyright (c) 2005 Arun M. Krishnakumar + * Copyright (c) 2004-2006 David P. 
Quigley
+ * Copyright (c) 2003-2004 Mohammad Nayyer Zubair
+ * Copyright (c) 2003 Puja Gupta
+ * Copyright (c) 2003 Harikesavan Krishnan
+ * Copyright (c) 2003-2007 Stony Brook University
+ * Copyright (c) 2003-2007 The Research Foundation of SUNY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "union.h"
+
+/*
+ * unionfs_writepage - push one of our pages down to the lower inode
+ * @page: our page; BUG_ON() fires if it is not up to date
+ * @wbc: writeback control; only ->for_reclaim is inspected here, the rest
+ *       is handed to the lower ->writepage
+ *
+ * Copies @page into the page with the same index in the lower inode's
+ * mapping, dirties that lower page and, unless we were called for reclaim,
+ * calls the lower ->writepage on it.  @page is unlocked on every path.
+ *
+ * Returns 0 on success (this includes the cases where the lower page could
+ * not be obtained and @page was redirtied, where we were called for
+ * reclaim, and where the lower ->writepage returned
+ * AOP_WRITEPAGE_ACTIVATE), or the negative error returned by the lower
+ * ->writepage.
+ */
+static int unionfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int err = -EIO;
+	struct inode *inode;
+	struct inode *lower_inode;
+	struct page *lower_page;
+	struct address_space *lower_mapping; /* lower inode mapping */
+	gfp_t mask;
+
+	BUG_ON(!PageUptodate(page));
+	inode = page->mapping->host;
+	/*
+	 * If no lower inode, nothing to do.  The lower_inodes test must be
+	 * negated: bailing out when the array *is* present would turn the
+	 * rest of this function into dead code.
+	 */
+	if (!inode || !UNIONFS_I(inode) || !UNIONFS_I(inode)->lower_inodes) {
+		err = 0;
+		goto out;
+	}
+	lower_inode = unionfs_lower_inode(inode);
+	lower_mapping = lower_inode->i_mapping;
+
+	/*
+	 * find lower page (returns a locked page)
+	 *
+	 * We turn off __GFP_FS while we look for or create a new lower
+	 * page.  This prevents a recursion into the file system code, which
+	 * under memory pressure conditions could lead to a deadlock.  This
+	 * is similar to how the loop driver behaves (see loop_set_fd in
+	 * drivers/block/loop.c).  If we can't find the lower page, we
+	 * redirty our page and return "success" so that the VM will call us
+	 * again in the (hopefully near) future.
+	 */
+	mask = mapping_gfp_mask(lower_mapping) & ~(__GFP_FS);
+	lower_page = find_or_create_page(lower_mapping, page->index, mask);
+	if (!lower_page) {
+		err = 0;
+		set_page_dirty(page);
+		goto out;
+	}
+
+	/* copy page data from our upper page to the lower page */
+	copy_highpage(lower_page, page);
+	flush_dcache_page(lower_page);
+	SetPageUptodate(lower_page);
+	set_page_dirty(lower_page);
+
+	/*
+	 * Call lower writepage (expects locked page).  However, if we are
+	 * called with wbc->for_reclaim, then the VFS/VM just wants to
+	 * reclaim our page.  Therefore, we don't need to call the lower
+	 * ->writepage: just copy our data to the lower page (already done
+	 * above), then mark the lower page dirty and unlock it, and return
+	 * success.
+	 */
+	if (wbc->for_reclaim) {
+		unlock_page(lower_page);
+		err = 0;	/* err still holds its -EIO initializer here */
+		goto out_release;
+	}
+
+	BUG_ON(!lower_mapping->a_ops->writepage);
+	wait_on_page_writeback(lower_page); /* prevent multiple writers */
+	clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */
+	err = lower_mapping->a_ops->writepage(lower_page, wbc);
+	if (err < 0)
+		goto out_release;
+
+	/*
+	 * Lower file systems such as ramfs and tmpfs, may return
+	 * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to (pointlessly)
+	 * write the page again for a while.  But those lower file systems
+	 * also set the page dirty bit back again.  Since we successfully
+	 * copied our page data to the lower page, then the VM will come
+	 * back to the lower page (directly) and try to flush it.  So we can
+	 * save the VM the hassle of coming back to our page and trying to
+	 * flush too.  Therefore, we don't re-dirty our own page, and we
+	 * never return AOP_WRITEPAGE_ACTIVATE back to the VM (we consider
+	 * this a success).
+	 *
+	 * We also unlock the lower page if the lower ->writepage returned
+	 * AOP_WRITEPAGE_ACTIVATE.  (This "anomalous" behaviour may be
+	 * addressed in future shmem/VM code.)
+	 */
+	if (err == AOP_WRITEPAGE_ACTIVATE) {
+		err = 0;
+		unlock_page(lower_page);
+	}
+
+	/* all is well */
+
+	/* lower mtimes have changed: update ours */
+	unionfs_copy_attr_times(inode);
+
+out_release:
+	/* b/c find_or_create_page increased refcnt */
+	page_cache_release(lower_page);
+out:
+	/*
+	 * We unlock our page unconditionally, because we never return
+	 * AOP_WRITEPAGE_ACTIVATE.
+	 */
+	unlock_page(page);
+	return err;
+}
+
+/*
+ * unionfs_writepages - write back dirty pages of @mapping
+ *
+ * Does nothing (and returns 0) when the inode has neither a valid start
+ * nor a valid end branch, or has no lower inode.  Otherwise defers to
+ * generic_writepages() and, if that succeeds, refreshes our attribute
+ * times.  Returns the generic_writepages() result.
+ */
+static int unionfs_writepages(struct address_space *mapping,
+			      struct writeback_control *wbc)
+{
+	int err = 0;
+	struct inode *lower_inode;
+	struct inode *inode;
+
+	inode = mapping->host;
+	if (ibstart(inode) < 0 && ibend(inode) < 0)
+		goto out;
+	lower_inode = unionfs_lower_inode(inode);
+	if (!lower_inode)
+		goto out;
+
+	err = generic_writepages(mapping, wbc);
+	if (!err)
+		unionfs_copy_attr_times(inode);
+out:
+	return err;
+}
+
+/*
+ * unionfs_readpage - fill @page with data read from the lower file
+ *
+ * Readpage expects a locked page, and must unlock it.
+ *
+ * Reads up to PAGE_CACHE_SIZE bytes from the lower file at the page's file
+ * offset with vfs_read() and zero-fills whatever a short read left
+ * untouched.  The page is marked up to date on success and has its
+ * up-to-date flag cleared on failure.
+ *
+ * Returns 0 on success, -ENOENT if @file has no unionfs file info, or the
+ * negative error from revalidation or vfs_read().
+ */
+static int unionfs_readpage(struct file *file, struct page *page)
+{
+	int err;
+	struct file *lower_file;
+	struct inode *inode;
+	mm_segment_t old_fs;
+	char *page_data = NULL;
+	mode_t orig_mode;
+
+	unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT);
+	err = unionfs_file_revalidate(file, false);
+	if (unlikely(err))
+		goto out;
+	unionfs_check_file(file);
+
+	if (!UNIONFS_F(file)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	lower_file = unionfs_lower_file(file);
+	/* FIXME: is this assertion right here? */
+	BUG_ON(lower_file == NULL);
+
+	inode = file->f_path.dentry->d_inode;
+
+	page_data = (char *) kmap(page);
+	/*
+	 * Use vfs_read because some lower file systems don't have a
+	 * readpage method, and some file systems (esp. distributed ones)
+	 * don't like their pages to be accessed directly.  Using vfs_read
+	 * may be a little slower, but a lot safer, as the VFS does a lot of
+	 * the necessary magic for us.
+	 */
+	lower_file->f_pos = page_offset(page);
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	/*
+	 * generic_file_splice_write may call us on a file not opened for
+	 * reading, so temporarily allow reading.
+	 */
+	orig_mode = lower_file->f_mode;
+	lower_file->f_mode |= FMODE_READ;
+	err = vfs_read(lower_file, page_data, PAGE_CACHE_SIZE,
+		       &lower_file->f_pos);
+	lower_file->f_mode = orig_mode;
+	set_fs(old_fs);
+	/* short read: zero the remainder of the page */
+	if (err >= 0 && err < PAGE_CACHE_SIZE)
+		memset(page_data + err, 0, PAGE_CACHE_SIZE - err);
+	kunmap(page);
+
+	if (err < 0)
+		goto out;
+	err = 0;
+
+	/* if vfs_read succeeded above, sync up our times */
+	unionfs_copy_attr_times(inode);
+
+	flush_dcache_page(page);
+
+	/*
+	 * we have to unlock our page, b/c we _might_ have gotten a locked
+	 * page.  but we no longer have to wakeup on our page here, b/c
+	 * UnlockPage does it
+	 */
+out:
+	if (err == 0)
+		SetPageUptodate(page);
+	else
+		ClearPageUptodate(page);
+
+	unlock_page(page);
+	unionfs_check_file(file);
+	unionfs_read_unlock(file->f_path.dentry->d_sb);
+
+	return err;
+}
+
+/*
+ * unionfs_prepare_write - get @file ready for a write to @page
+ *
+ * Only syncs our attribute times from the lower inode and revalidates the
+ * file for writing; @page, @from and @to are not used.  Returns the result
+ * of unionfs_file_revalidate().
+ */
+static int unionfs_prepare_write(struct file *file, struct page *page,
+				 unsigned from, unsigned to)
+{
+	int err;
+
+	unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT);
+	/*
+	 * This is the only place where we unconditionally copy the lower
+	 * attribute times before calling unionfs_file_revalidate.  The
+	 * reason is that our ->write calls do_sync_write which in turn will
+	 * call our ->prepare_write and then ->commit_write.  Before our
+	 * ->write is called, the lower mtimes are in sync, but by the time
+	 * the VFS calls our ->commit_write, the lower mtimes have changed.
+	 * Therefore, the only reasonable time for us to sync up from the
+	 * changed lower mtimes, and avoid an invariant violation warning,
+	 * is here, in ->prepare_write.
+	 */
+	unionfs_copy_attr_times(file->f_path.dentry->d_inode);
+	err = unionfs_file_revalidate(file, true);
+	unionfs_check_file(file);
+	unionfs_read_unlock(file->f_path.dentry->d_sb);
+
+	return err;
+}
+
+/*
+ * unionfs_commit_write - write bytes [@from, @to) of @page to the lower file
+ *
+ * The data is pushed down with vfs_write() at file offset
+ * page_offset(@page) + @from.  On success our inode's size and times are
+ * copied up from the lower inode and our inode is marked dirty.  On
+ * failure the page's up-to-date flag is cleared.
+ *
+ * Returns the vfs_write() result (number of bytes written) on success, or
+ * the negative error from revalidation or vfs_write().
+ */
+static int unionfs_commit_write(struct file *file, struct page *page,
+				unsigned from, unsigned to)
+{
+	int err = -ENOMEM;
+	struct inode *inode, *lower_inode;
+	struct file *lower_file = NULL;
+	unsigned bytes = to - from;
+	char *page_data = NULL;
+	mm_segment_t old_fs;
+
+	BUG_ON(file == NULL);
+
+	unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT);
+	err = unionfs_file_revalidate(file, true);
+	if (unlikely(err))
+		goto out;
+	unionfs_check_file(file);
+
+	inode = page->mapping->host;
+
+	if (UNIONFS_F(file) != NULL)
+		lower_file = unionfs_lower_file(file);
+
+	/* FIXME: is this assertion right here? */
+	BUG_ON(lower_file == NULL);
+
+	page_data = (char *)kmap(page);
+	lower_file->f_pos = page_offset(page) + from;
+
+	/*
+	 * We use vfs_write instead of copying page data and the
+	 * prepare_write/commit_write combo because file system's like
+	 * GFS/OCFS2 don't like things touching those directly,
+	 * calling the underlying write op, while a little bit slower, will
+	 * call all the FS specific code as well
+	 */
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	err = vfs_write(lower_file, page_data + from, bytes,
+			&lower_file->f_pos);
+	set_fs(old_fs);
+
+	kunmap(page);
+
+	if (err < 0)
+		goto out;
+
+	/* if vfs_write succeeded above, sync up our times/sizes */
+	lower_inode = lower_file->f_path.dentry->d_inode;
+	if (!lower_inode)
+		lower_inode = unionfs_lower_inode(inode);
+	BUG_ON(!lower_inode);
+	fsstack_copy_inode_size(inode, lower_inode);
+	unionfs_copy_attr_times(inode);
+	mark_inode_dirty_sync(inode);
+
+out:
+	if (err < 0)
+		ClearPageUptodate(page);
+
+	unionfs_check_file(file);
+	unionfs_read_unlock(file->f_path.dentry->d_sb);
+	return err;		/* assume all is ok */
+}
+
+/*
+ * Although unionfs isn't a block-based file system, it may stack on one.
+ * ->bmap is needed, for example, to swapon(2) files.
+ *
+ * Forwards the request to the lower inode's ->bmap and returns its answer
+ * unchanged.  The result is kept in a sector_t (not an int) so that large
+ * block numbers are not truncated.  Returns 0 when there is no lower inode
+ * or the lower mapping has no ->bmap; a negative errno cannot be used here
+ * because, converted to the unsigned sector_t return type, it would look
+ * like a valid (huge) block number.
+ */
+sector_t unionfs_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t lower_block = 0;
+	struct inode *inode, *lower_inode;
+	sector_t (*bmap)(struct address_space *, sector_t);
+
+	inode = (struct inode *)mapping->host;
+	lower_inode = unionfs_lower_inode(inode);
+	if (!lower_inode)
+		goto out;
+	bmap = lower_inode->i_mapping->a_ops->bmap;
+	if (bmap)
+		lower_block = bmap(lower_inode->i_mapping, block);
+out:
+	return lower_block;
+}
+
+
+/* address-space operations installed on unionfs inode mappings */
+struct address_space_operations unionfs_aops = {
+	.writepage	= unionfs_writepage,
+	.writepages	= unionfs_writepages,
+	.readpage	= unionfs_readpage,
+	.prepare_write	= unionfs_prepare_write,
+	.commit_write	= unionfs_commit_write,
+	.bmap		= unionfs_bmap,
+};
-- 
1.5.2.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/