Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S267979AbUIGMNS (ORCPT ); Tue, 7 Sep 2004 08:13:18 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S267976AbUIGMLp (ORCPT ); Tue, 7 Sep 2004 08:11:45 -0400 Received: from mail.fh-wedel.de ([213.39.232.194]:2729 "EHLO mail.fh-wedel.de") by vger.kernel.org with ESMTP id S267958AbUIGMJV (ORCPT ); Tue, 7 Sep 2004 08:09:21 -0400 Date: Tue, 7 Sep 2004 14:09:08 +0200 From: =?iso-8859-1?Q?J=F6rn?= Engel To: Andrew Morton Cc: linux-kernel@vger.kernel.org, Linus Torvalds Subject: [PATCH 1/4] copyfile: generic_sendpage Message-ID: <20040907120908.GB26630@wohnheim.fh-wedel.de> Mime-Version: 1.0 Content-Type: text/plain; charset=iso-8859-1 Content-Disposition: inline Content-Transfer-Encoding: 8bit User-Agent: Mutt/1.3.28i Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 10036 Lines: 336 Set Linus on Cc:, since you quoted him. On Sat, 4 September 2004 15:39:02 -0700, Andrew Morton wrote: > > I discussed file->file sendfile with Linus a while back and he said > > > I think it was about doing a 2GB file-file sendfile, and see your system > > grind to a halt without being able to kill it. > > > > That said, we have some of the same problems with just regular read/write > > too. sendfile just makes it easier. > > > > We should probably make read/write be interruptible by _fatal_ signals. > > It would require a new task state, though (TASK_KILLABLE or something, and > > it would show up as a normal 'D' state). > > I don't know how much of a problem this is in practice - there are all > sorts of nasty things which unprivileged apps can do to the system by > overloading filesystems. Although most of them can be killed off by the > sysadmin. My solution is simple. Run sendfile() in a loop of 4k chunks and check for reschedule and signal after each chunk. Not sure if the reschedule is needed but it shouldn't hurt much either. The other three patches were updated accorting to Christophs comments. J?rn -- But this is not to say that the main benefit of Linux and other GPL software is lower-cost. Control is the main benefit--cost is secondary. -- Bruce Perens o Add sendfile() support for file targets to normal mm/filemap.c. o Have ext[23] use that support. Signed-off-by: J?rn Engel --- fs/ext2/file.c | 1 fs/ext3/file.c | 1 include/linux/fs.h | 1 mm/filemap.c | 216 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 219 insertions(+) --- linux-2.6.8cow/include/linux/fs.h~generic_sendpage 2004-09-04 22:59:13.000000000 +0200 +++ linux-2.6.8cow/include/linux/fs.h 2004-09-05 11:58:21.000000000 +0200 @@ -1417,6 +1417,7 @@ ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos); extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *); +extern ssize_t generic_file_sendpage(struct file *, struct page *, int, size_t, loff_t *, int); extern void do_generic_mapping_read(struct address_space *mapping, struct file_ra_state *, struct file *, loff_t *, read_descriptor_t *, read_actor_t); --- linux-2.6.8cow/mm/filemap.c~generic_sendpage 2004-09-04 22:59:14.000000000 +0200 +++ linux-2.6.8cow/mm/filemap.c 2004-09-04 23:04:38.000000000 +0200 @@ -985,6 +985,31 @@ return written; } +/* FIXME: It would be as simple as this, if we had a (void __user*) to write. + * We already have a kernel buffer, so it should be even simpler, right? ;) + * + * Yes, sorta. After duplicating the complete path of generic_file_write(), + * at least some special cases could be removed, so the copy is simpler than + * the original. But it remains a copy, so overall complexity increases. + */ +static ssize_t +generic_kernel_file_write(struct file *, const char *, size_t, loff_t *); + +ssize_t generic_file_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) +{ + ssize_t ret; + char *kaddr; + + kaddr = kmap(page); + ret = generic_kernel_file_write(file, kaddr + offset, size, ppos); + kunmap(page); + + return ret; +} + +EXPORT_SYMBOL(generic_file_sendpage); + ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, size_t count, read_actor_t actor, void *target) { @@ -1642,6 +1667,19 @@ return bytes - left; } +static inline size_t +filemap_copy_from_kernel(struct page *page, unsigned long offset, + const char *buf, unsigned bytes) +{ + char *kaddr; + + kaddr = kmap(page); + memcpy(kaddr + offset, buf, bytes); + kunmap(page); + + return bytes; +} + static size_t __filemap_copy_from_user_iovec(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) @@ -2016,6 +2054,155 @@ EXPORT_SYMBOL(generic_file_aio_write_nolock); +/* + * TODO: + * This largely tries to copy generic_file_aio_write_nolock(), although it + * doesn't have to be nearly as generic. A real cleanup should either + * merge this into generic_file_aio_write_nolock() as well or keep it special + * and remove as much code as possible. + */ +static ssize_t +generic_kernel_file_aio_write_nolock(struct kiocb *iocb, const struct iovec*iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_mapping; + struct address_space_operations *a_ops = mapping->a_ops; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + struct inode *inode = mapping->host; + long status = 0; + loff_t pos; + struct page *page; + struct page *cached_page = NULL; + const int isblk = S_ISBLK(inode->i_mode); + ssize_t written; + ssize_t err; + size_t bytes; + struct pagevec lru_pvec; + const struct iovec *cur_iov = iov; /* current iovec */ + size_t iov_base = 0; /* offset in the current iovec */ + unsigned long seg; + char *buf; + + ocount = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + } + + count = ocount; + pos = *ppos; + pagevec_init(&lru_pvec, 0); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + written = 0; + + err = generic_write_checks(file, &pos, &count, isblk); + if (err) + goto out; + + + if (count == 0) + goto out; + + remove_suid(file->f_dentry); + inode_update_time(inode, 1); + + /* There is no sane reason to use O_DIRECT */ + BUG_ON(file->f_flags & O_DIRECT); + + buf = (char *)iov->iov_base; + do { + unsigned long index; + unsigned long offset; + size_t copied; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + if (!page) { + status = -ENOMEM; + break; + } + + status = a_ops->prepare_write(file, page, offset, offset+bytes); + if (unlikely(status)) { + loff_t isize = i_size_read(inode); + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. + */ + unlock_page(page); + page_cache_release(page); + if (pos + bytes > isize) + vmtruncate(inode, isize); + break; + } + + BUG_ON(nr_segs != 1); + copied = filemap_copy_from_kernel(page, offset, buf, bytes); + + flush_dcache_page(page); + status = a_ops->commit_write(file, page, offset, offset+bytes); + if (likely(copied > 0)) { + if (!status) + status = copied; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + if (unlikely(nr_segs > 1)) + filemap_set_next_iovec(&cur_iov, + &iov_base, status); + } + } + if (unlikely(copied != bytes)) + if (status >= 0) + status = -EFAULT; + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + if (status < 0) + break; + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (count); + *ppos = pos; + + if (cached_page) + page_cache_release(cached_page); + + /* + * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + */ + if (status >= 0) { + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) + status = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + } + + err = written ? written : status; +out: + pagevec_lru_add(&lru_pvec); + current->backing_dev_info = 0; + return err; +} + ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) @@ -2030,6 +2217,20 @@ return ret; } +static ssize_t +generic_kernel_file_write_nolock(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + ret = generic_kernel_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} + EXPORT_SYMBOL(generic_file_write_nolock); ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, @@ -2068,6 +2269,21 @@ EXPORT_SYMBOL(generic_file_write); +static ssize_t generic_kernel_file_write(struct file *file, const char *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_mapping->host; + ssize_t err; + struct iovec local_iov = {.iov_base = (void __user *)buf, + .iov_len = count }; + + down(&inode->i_sem); + err = generic_kernel_file_write_nolock(file, &local_iov, 1, ppos); + up(&inode->i_sem); + + return err; +} + ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { --- linux-2.6.8cow/fs/ext2/file.c~generic_sendpage 2004-06-16 07:19:13.000000000 +0200 +++ linux-2.6.8cow/fs/ext2/file.c 2004-09-04 23:04:38.000000000 +0200 @@ -53,6 +53,7 @@ .readv = generic_file_readv, .writev = generic_file_writev, .sendfile = generic_file_sendfile, + .sendpage = generic_file_sendpage, }; struct inode_operations ext2_file_inode_operations = { --- linux-2.6.8cow/fs/ext3/file.c~generic_sendpage 2004-06-16 07:20:04.000000000 +0200 +++ linux-2.6.8cow/fs/ext3/file.c 2004-09-04 23:04:38.000000000 +0200 @@ -127,6 +127,7 @@ .release = ext3_release_file, .fsync = ext3_sync_file, .sendfile = generic_file_sendfile, + .sendpage = generic_file_sendpage, }; struct inode_operations ext3_file_inode_operations = { - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/