2006-03-30 10:06:27

by Jens Axboe

[permalink] [raw]
Subject: [PATCH] splice support #2

Hi,

This patch should resolve all issues mentioned so far. I'd still like to
implement the page moving, but that should just be a separate patch.
After this round of patching, I thought I'd toss a fresh version out
there for all to look at.

Signed-off-by: Jens Axboe <[email protected]>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 326595f..ce3ef4f 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -312,3 +312,4 @@ ENTRY(sys_call_table)
.long sys_unshare /* 310 */
.long sys_set_robust_list
.long sys_get_robust_list
+ .long sys_splice
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 0e3eda9..14480d9 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1605,5 +1605,6 @@ sys_call_table:
data8 sys_ni_syscall // reserved for pselect
data8 sys_ni_syscall // 1295 reserved for ppoll
data8 sys_unshare
+ date8 sys_splice

.org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
diff --git a/fs/Makefile b/fs/Makefile
index 080b386..f3a4f70 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
- ioprio.o pnode.o drop_caches.o
+ ioprio.o pnode.o drop_caches.o splice.o

obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_EPOLL) += eventpoll.o
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 509ccec..23e2c7c 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -53,6 +53,8 @@ const struct file_operations ext2_file_o
.readv = generic_file_readv,
.writev = generic_file_writev,
.sendfile = generic_file_sendfile,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
};

#ifdef CONFIG_EXT2_FS_XIP
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 783a796..1efefb6 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -119,6 +119,8 @@ const struct file_operations ext3_file_o
.release = ext3_release_file,
.fsync = ext3_sync_file,
.sendfile = generic_file_sendfile,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
};

struct inode_operations ext3_file_inode_operations = {
diff --git a/fs/pipe.c b/fs/pipe.c
index e2f4f1d..aef9ad7 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -15,6 +15,7 @@ #include <linux/mount.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
+#include <linux/pagemap.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -94,11 +95,20 @@ static void anon_pipe_buf_release(struct
{
struct page *page = buf->page;

- if (info->tmp_page) {
- __free_page(page);
+ /*
+ * If nobody else uses this page, and we don't already have a
+ * temporary page, let's keep track of it as a one-deep
+ * allocation cache
+ */
+ if (page_count(page) == 1 && !info->tmp_page) {
+ info->tmp_page = page;
return;
}
- info->tmp_page = page;
+
+ /*
+ * Otherwise just release our reference to it
+ */
+ page_cache_release(page);
}

static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf)
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 010094d..cf6e1cf 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1576,6 +1576,8 @@ const struct file_operations reiserfs_fi
.sendfile = generic_file_sendfile,
.aio_read = generic_file_aio_read,
.aio_write = reiserfs_aio_write,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
};

struct inode_operations reiserfs_file_inode_operations = {
diff --git a/fs/splice.c b/fs/splice.c
new file mode 100644
index 0000000..b63db3f
--- /dev/null
+++ b/fs/splice.c
@@ -0,0 +1,602 @@
+/*
+ * "splice": joining two ropes together by interweaving their strands.
+ *
+ * This is the "extended pipe" functionality, where a pipe is used as
+ * an arbitrary in-memory buffer. Think of a pipe as a small kernel
+ * buffer that you can use to transfer data from one end to the other.
+ *
+ * The traditional unix read/write is extended with a "splice()" operation
+ * that transfers data buffers to or from a pipe buffer.
+ *
+ * Named by Larry McVoy, original implementation from Linus, extended by
+ * Jens to support splicing to files and fixing the initial implementation
+ * bugs.
+ *
+ * Copyright (C) 2005 Jens Axboe <[email protected]>
+ * Copyright (C) 2005 Linus Torvalds <[email protected]>
+ *
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mm_inline.h>
+
+/*
+ * Passed to the actors
+ */
+struct splice_desc {
+ unsigned int len, total_len; /* current and remaining length */
+ unsigned int flags; /* splice flags */
+ struct file *file; /* file to read/write */
+ loff_t pos; /* file position */
+};
+
+static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
+ struct pipe_buffer *buf)
+{
+ page_cache_release(buf->page);
+ buf->page = NULL;
+}
+
+static void *page_cache_pipe_buf_map(struct file *file,
+ struct pipe_inode_info *info,
+ struct pipe_buffer *buf)
+{
+ struct page *page = buf->page;
+
+ lock_page(page);
+
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ return NULL;
+ }
+
+ return kmap(buf->page);
+}
+
+static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
+ struct pipe_buffer *buf)
+{
+ unlock_page(buf->page);
+ kunmap(buf->page);
+}
+
+static struct pipe_buf_operations page_cache_pipe_buf_ops = {
+ .can_merge = 0,
+ .map = page_cache_pipe_buf_map,
+ .unmap = page_cache_pipe_buf_unmap,
+ .release = page_cache_pipe_buf_release,
+};
+
+static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
+ int nr_pages, unsigned long offset,
+ unsigned long len)
+{
+ struct pipe_inode_info *info;
+ int ret, do_wakeup, i;
+
+ ret = 0;
+ do_wakeup = 0;
+ i = 0;
+
+ mutex_lock(PIPE_MUTEX(*inode));
+
+ info = inode->i_pipe;
+ for (;;) {
+ int bufs;
+
+ if (!PIPE_READERS(*inode)) {
+ send_sig(SIGPIPE, current, 0);
+ if (!ret)
+ ret = -EPIPE;
+ break;
+ }
+
+ bufs = info->nrbufs;
+ if (bufs < PIPE_BUFFERS) {
+ int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
+ struct pipe_buffer *buf = info->bufs + newbuf;
+ struct page *page = pages[i++];
+ unsigned long this_len;
+
+ this_len = PAGE_CACHE_SIZE - offset;
+ if (this_len > len)
+ this_len = len;
+
+ buf->page = page;
+ buf->offset = offset;
+ buf->len = this_len;
+ buf->ops = &page_cache_pipe_buf_ops;
+ info->nrbufs = ++bufs;
+ do_wakeup = 1;
+
+ ret += this_len;
+ len -= this_len;
+ offset = 0;
+ if (!--nr_pages)
+ break;
+ if (!len)
+ break;
+ if (bufs < PIPE_BUFFERS)
+ continue;
+
+ break;
+ }
+
+ if (signal_pending(current)) {
+ if (!ret)
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ if (do_wakeup) {
+ wake_up_interruptible_sync(PIPE_WAIT(*inode));
+ kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
+ POLL_IN);
+ do_wakeup = 0;
+ }
+
+ PIPE_WAITING_WRITERS(*inode)++;
+ pipe_wait(inode);
+ PIPE_WAITING_WRITERS(*inode)--;
+ }
+
+ mutex_unlock(PIPE_MUTEX(*inode));
+
+ if (do_wakeup) {
+ wake_up_interruptible(PIPE_WAIT(*inode));
+ kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+ }
+
+ while (i < nr_pages)
+ page_cache_release(pages[i++]);
+
+ return ret;
+}
+
+static int __generic_file_splice_read(struct file *in, struct inode *pipe,
+ size_t len)
+{
+ struct address_space *mapping = in->f_mapping;
+ unsigned int offset, nr_pages;
+ struct page *pages[PIPE_BUFFERS];
+ struct page *page;
+ pgoff_t index, pidx;
+ int i;
+
+ index = in->f_pos >> PAGE_CACHE_SHIFT;
+ offset = in->f_pos & ~PAGE_CACHE_MASK;
+ nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+ if (nr_pages > PIPE_BUFFERS)
+ nr_pages = PIPE_BUFFERS;
+
+ /*
+ * initiate read-ahead on this page range
+ */
+ do_page_cache_readahead(mapping, in, index, nr_pages);
+
+ /*
+ * Get as many pages from the page cache as possible..
+ * Start IO on the page cache entries we create (we
+ * can assume that any pre-existing ones we find have
+ * already had IO started on them).
+ */
+ i = find_get_pages(mapping, index, nr_pages, pages);
+
+ /*
+ * common case - we found all pages and they are contiguous,
+ * kick them off
+ */
+ if (i && (pages[i - 1]->index == index + i - 1))
+ goto splice_them;
+
+ memset(&pages[i], 0, (nr_pages - i) * sizeof(struct page *));
+
+ /*
+ * find_get_pages() may not return consecutive pages, so loop
+ * over the array moving pages and filling the rest, if need be.
+ */
+ for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
+ if (!pages[i]) {
+ int error;
+fill_page:
+ /*
+ * no page there, look one up / create it
+ */
+ page = find_or_create_page(mapping, pidx,
+ mapping_gfp_mask(mapping));
+ if (!page)
+ break;
+
+ if (PageUptodate(page))
+ unlock_page(page);
+ else {
+ error = mapping->a_ops->readpage(in, page);
+
+ if (unlikely(error)) {
+ page_cache_release(page);
+ break;
+ }
+ }
+ pages[i] = page;
+ } else if (pages[i]->index != pidx) {
+ page = pages[i];
+ /*
+ * page isn't in the right spot, move it and jump
+ * back to filling this one. we know that ->index
+ * is larger than pidx
+ */
+ pages[i + page->index - pidx] = page;
+ pages[i] = NULL;
+ goto fill_page;
+ }
+ }
+
+ if (!i)
+ return 0;
+
+ /*
+ * Now we splice them into the pipe..
+ */
+splice_them:
+ return move_to_pipe(pipe, pages, i, offset, len);
+}
+
+ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
+ size_t len, unsigned int flags)
+{
+ ssize_t spliced;
+ int ret;
+
+ ret = 0;
+ spliced = 0;
+ while (len) {
+ ret = __generic_file_splice_read(in, pipe, len);
+
+ if (ret <= 0)
+ break;
+
+ in->f_pos += ret;
+ len -= ret;
+ spliced += ret;
+ }
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+
+/*
+ * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
+ */
+static int pipe_to_sendpage(struct pipe_inode_info *info,
+ struct pipe_buffer *buf, struct splice_desc *sd)
+{
+ struct file *file = sd->file;
+ loff_t pos = sd->pos;
+ unsigned int offset;
+ ssize_t ret;
+
+ /*
+ * sub-optimal, but we are limited by the pipe ->map. we don't
+ * need a kmap'ed buffer here, we just want to make sure we
+ * have the page pinned if the pipe page originates from the
+ * page cache
+ */
+ if (!buf->ops->map(file, info, buf))
+ return -EIO;
+
+ offset = pos & ~PAGE_CACHE_MASK;
+
+ ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
+ sd->len < sd->total_len);
+
+ buf->ops->unmap(info, buf);
+ if (ret == sd->len)
+ return 0;
+
+ return -EIO;
+}
+
+/*
+ * This is a little more tricky than the file -> pipe splicing. There are
+ * basically three cases:
+ *
+ * - Destination page already exists in the address space and there
+ * are users of it. For that case we have no other option that
+ * copying the data. Tough luck.
+ * - Destination page already exists in the address space, but there
+ * are no users of it. Make sure it's uptodate, then drop it. Fall
+ * through to last case.
+ * - Destination page does not exist, we can add the pipe page to
+ * the page cache and avoid the copy.
+ *
+ * For now we just do the slower thing and always copy pages over, it's
+ * easier than migrating pages from the pipe to the target file. For the
+ * case of doing file | file splicing, the migrate approach had some LRU
+ * nastiness...
+ */
+static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
+ struct splice_desc *sd)
+{
+ struct file *file = sd->file;
+ struct address_space *mapping = file->f_mapping;
+ unsigned int offset;
+ struct page *page;
+ char *src, *dst;
+ pgoff_t index;
+ int ret;
+
+ /*
+ * after this, page will be locked and unmapped
+ */
+ src = buf->ops->map(file, info, buf);
+ if (!src)
+ return -EIO;
+
+ index = sd->pos >> PAGE_CACHE_SHIFT;
+ offset = sd->pos & ~PAGE_CACHE_MASK;
+
+find_page:
+ ret = -ENOMEM;
+ page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
+ if (!page)
+ goto out;
+
+ /*
+ * If the page is uptodate, it is also locked. If it isn't
+ * uptodate, we can mark it uptodate if we are filling the
+ * full page. Otherwise we need to read it in first...
+ */
+ if (!PageUptodate(page)) {
+ if (sd->len < PAGE_CACHE_SIZE) {
+ ret = mapping->a_ops->readpage(file, page);
+ if (unlikely(ret))
+ goto out;
+
+ lock_page(page);
+
+ if (!PageUptodate(page)) {
+ /*
+ * page got invalidated, repeat
+ */
+ if (page->mapping == NULL) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto find_page;
+ }
+ ret = -EIO;
+ goto out;
+ }
+ } else {
+ WARN_ON(!PageLocked(page));
+ SetPageUptodate(page);
+ }
+ }
+
+ ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
+ if (ret)
+ goto out;
+
+ dst = kmap_atomic(page, KM_USER0);
+ memcpy(dst + offset, src + buf->offset, sd->len);
+ flush_dcache_page(page);
+ kunmap_atomic(dst, KM_USER0);
+
+ ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
+ if (ret < 0)
+ goto out;
+
+ set_page_dirty(page);
+ ret = write_one_page(page, 0);
+out:
+ if (ret < 0)
+ unlock_page(page);
+ page_cache_release(page);
+ buf->ops->unmap(info, buf);
+ return ret;
+}
+
+typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
+ struct splice_desc *);
+
+static ssize_t move_from_pipe(struct inode *inode, struct file *out,
+ size_t len, unsigned int flags,
+ splice_actor *actor)
+{
+ struct pipe_inode_info *info;
+ int ret, do_wakeup, err;
+ struct splice_desc sd;
+
+ ret = 0;
+ do_wakeup = 0;
+
+ sd.total_len = len;
+ sd.flags = flags;
+ sd.file = out;
+ sd.pos = out->f_pos;
+
+ mutex_lock(PIPE_MUTEX(*inode));
+
+ info = inode->i_pipe;
+ for (;;) {
+ int bufs = info->nrbufs;
+
+ if (bufs) {
+ int curbuf = info->curbuf;
+ struct pipe_buffer *buf = info->bufs + curbuf;
+ struct pipe_buf_operations *ops = buf->ops;
+
+ sd.len = buf->len;
+ if (sd.len > sd.total_len)
+ sd.len = sd.total_len;
+
+ err = actor(info, buf, &sd);
+ if (err) {
+ if (!ret)
+ ret = err;
+
+ break;
+ }
+
+ ret += sd.len;
+ buf->offset += sd.len;
+ buf->len -= sd.len;
+ if (!buf->len) {
+ buf->ops = NULL;
+ ops->release(info, buf);
+ curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
+ info->curbuf = curbuf;
+ info->nrbufs = --bufs;
+ do_wakeup = 1;
+ }
+
+ sd.pos += sd.len;
+ sd.total_len -= sd.len;
+ if (!sd.total_len)
+ break;
+ }
+
+ if (bufs)
+ continue;
+ if (!PIPE_WRITERS(*inode))
+ break;
+ if (!PIPE_WAITING_WRITERS(*inode)) {
+ if (ret)
+ break;
+ }
+
+ if (signal_pending(current)) {
+ if (!ret)
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ if (do_wakeup) {
+ wake_up_interruptible_sync(PIPE_WAIT(*inode));
+ kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
+ do_wakeup = 0;
+ }
+
+ pipe_wait(inode);
+ }
+
+ mutex_unlock(PIPE_MUTEX(*inode));
+
+ if (do_wakeup) {
+ wake_up_interruptible(PIPE_WAIT(*inode));
+ kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+ }
+
+ mutex_lock(&out->f_mapping->host->i_mutex);
+ out->f_pos = sd.pos;
+ mutex_unlock(&out->f_mapping->host->i_mutex);
+ return ret;
+
+}
+
+ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
+ size_t len, unsigned int flags)
+{
+ return move_from_pipe(inode, out, len, flags, pipe_to_file);
+}
+
+ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
+ size_t len, unsigned int flags)
+{
+ return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
+}
+
+static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
+ unsigned int flags)
+{
+ loff_t pos;
+ int ret;
+
+ if (!out->f_op || !out->f_op->splice_write)
+ return -EINVAL;
+
+ if (!(out->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ pos = out->f_pos;
+ ret = rw_verify_area(WRITE, out, &pos, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ return out->f_op->splice_write(pipe, out, len, flags);
+}
+
+static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
+ unsigned int flags)
+{
+ loff_t pos, isize, left;
+ int ret;
+
+ if (!in->f_op || !in->f_op->splice_read)
+ return -EINVAL;
+
+ if (!(in->f_mode & FMODE_READ))
+ return -EBADF;
+
+ pos = in->f_pos;
+ ret = rw_verify_area(READ, in, &pos, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ isize = i_size_read(in->f_mapping->host);
+ if (unlikely(in->f_pos >= isize))
+ return 0;
+
+ left = isize - in->f_pos;
+ if (left < len)
+ len = left;
+
+ return in->f_op->splice_read(in, pipe, len, flags);
+}
+
+static long do_splice(struct file *in, struct file *out, size_t len,
+ unsigned int flags)
+{
+ struct inode *pipe;
+
+ pipe = in->f_dentry->d_inode;
+ if (pipe->i_pipe)
+ return do_splice_from(pipe, out, len, flags);
+
+ pipe = out->f_dentry->d_inode;
+ if (pipe->i_pipe)
+ return do_splice_to(in, pipe, len, flags);
+
+ return -EINVAL;
+}
+
+asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
+{
+ long error;
+ struct file *in, *out;
+ int fput_in, fput_out;
+
+ if (unlikely(!len))
+ return 0;
+
+ error = -EBADF;
+ in = fget_light(fdin, &fput_in);
+ if (in) {
+ if (in->f_mode & FMODE_READ) {
+ out = fget_light(fdout, &fput_out);
+ if (out) {
+ if (out->f_mode & FMODE_WRITE)
+ error = do_splice(in, out, len, flags);
+ fput_light(out, fput_out);
+ }
+ }
+
+ fput_light(in, fput_in);
+ }
+
+ return error;
+}
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 014e356..789e9bd 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -318,8 +318,9 @@ #define __NR_ppoll 309
#define __NR_unshare 310
#define __NR_set_robust_list 311
#define __NR_get_robust_list 312
+#define __NR_sys_splice 313

-#define NR_syscalls 313
+#define NR_syscalls 314

/*
* user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 019956c..36070c1 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -285,12 +285,13 @@ #define __NR_fchmodat 1292
#define __NR_faccessat 1293
/* 1294, 1295 reserved for pselect/ppoll */
#define __NR_unshare 1296
+#define __NR_splice 1297

#ifdef __KERNEL__

#include <linux/config.h>

-#define NR_syscalls 273 /* length of syscall table */
+#define NR_syscalls 274 /* length of syscall table */

#define __ARCH_WANT_SYS_RT_SIGACTION

diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index 1e99074..536ba08 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -301,8 +301,9 @@ #define __NR_spu_create 279
#define __NR_pselect6 280
#define __NR_ppoll 281
#define __NR_unshare 282
+#define __NR_splice 283

-#define __NR_syscalls 283
+#define __NR_syscalls 284

#ifdef __KERNEL__
#define __NR__exit __NR_exit
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index fcc5163..f21ff2c 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -609,8 +609,10 @@ #define __NR_set_robust_list 273
__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
#define __NR_get_robust_list 274
__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
+#define __NR_splice 275
+__SYSCALL(__NR_splice, sys_splice)

-#define __NR_syscall_max __NR_get_robust_list
+#define __NR_syscall_max __NR_splice

#ifndef __NO_STUBS

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 408fe89..20fa5f6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1032,6 +1032,8 @@ struct file_operations {
int (*check_flags)(int);
int (*dir_notify)(struct file *filp, unsigned long arg);
int (*flock) (struct file *, int, struct file_lock *);
+ ssize_t (*splice_write)(struct inode *, struct file *, size_t, unsigned int);
+ ssize_t (*splice_read)(struct file *, struct inode *, size_t, unsigned int);
};

struct inode_operations {
@@ -1609,6 +1611,8 @@ extern ssize_t generic_file_sendfile(str
extern void do_generic_mapping_read(struct address_space *mapping,
struct file_ra_state *, struct file *,
loff_t *, read_descriptor_t *, read_actor_t);
+extern ssize_t generic_file_splice_read(struct file *, struct inode *, size_t, unsigned int);
+extern ssize_t generic_file_splice_write(struct inode *, struct file *, size_t, unsigned int);
extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e487e3b..e78ffc7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -569,5 +569,7 @@ asmlinkage long compat_sys_newfstatat(un
asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
int flags, int mode);
asmlinkage long sys_unshare(unsigned long unshare_flags);
+asmlinkage long sys_splice(int fdin, int fdout, size_t len,
+ unsigned int flags);

#endif
diff --git a/net/socket.c b/net/socket.c
index fcd77ea..b13042f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -119,7 +119,10 @@ static ssize_t sock_writev(struct file *
static ssize_t sock_sendpage(struct file *file, struct page *page,
int offset, size_t size, loff_t *ppos, int more);

+extern ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
+ size_t len, unsigned int flags);

+
/*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
* in the operation structures but are done directly via the socketcall() multiplexor.
@@ -141,7 +144,8 @@ #endif
.fasync = sock_fasync,
.readv = sock_readv,
.writev = sock_writev,
- .sendpage = sock_sendpage
+ .sendpage = sock_sendpage,
+ .splice_write = generic_splice_sendpage,
};

/*

--
Jens Axboe


2006-03-30 10:17:05

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

Jens Axboe <[email protected]> wrote:
>
> Hi,
>
> This patch should resolve all issues mentioned so far. I'd still like to
> implement the page moving, but that should just be a separate patch.
> After this round of patching, I thought I'd toss a fresh version out
> there for all to look at.
>
> Signed-off-by: Jens Axboe <[email protected]>
>
> ...
>
> --- a/arch/ia64/kernel/entry.S
> +++ b/arch/ia64/kernel/entry.S
> @@ -1605,5 +1605,6 @@ sys_call_table:
> data8 sys_ni_syscall // reserved for pselect
> data8 sys_ni_syscall // 1295 reserved for ppoll
> data8 sys_unshare
> + date8 sys_splice

typo

> +static void *page_cache_pipe_buf_map(struct file *file,
> + struct pipe_inode_info *info,
> + struct pipe_buffer *buf)
> +{
> + struct page *page = buf->page;
> +
> + lock_page(page);
> +
> + if (!PageUptodate(page)) {

|| page->mapping == NULL

> + struct page *pages[PIPE_BUFFERS];
> + struct page *page;
> + pgoff_t index, pidx;
> + int i;
> +
> + index = in->f_pos >> PAGE_CACHE_SHIFT;
> + offset = in->f_pos & ~PAGE_CACHE_MASK;
> + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> +
> + if (nr_pages > PIPE_BUFFERS)
> + nr_pages = PIPE_BUFFERS;
> +
> + /*
> + * initiate read-ahead on this page range
> + */
> + do_page_cache_readahead(mapping, in, index, nr_pages);
> +
> + /*
> + * Get as many pages from the page cache as possible..
> + * Start IO on the page cache entries we create (we
> + * can assume that any pre-existing ones we find have
> + * already had IO started on them).
> + */
> + i = find_get_pages(mapping, index, nr_pages, pages);
> +
> + /*
> + * common case - we found all pages and they are contiguous,
> + * kick them off
> + */
> + if (i && (pages[i - 1]->index == index + i - 1))
> + goto splice_them;
> +
> + memset(&pages[i], 0, (nr_pages - i) * sizeof(struct page *));
> +
> + /*
> + * find_get_pages() may not return consecutive pages, so loop
> + * over the array moving pages and filling the rest, if need be.
> + */
> + for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
> + if (!pages[i]) {
> + int error;
> +fill_page:
> + /*
> + * no page there, look one up / create it
> + */
> + page = find_or_create_page(mapping, pidx,
> + mapping_gfp_mask(mapping));
> + if (!page)
> + break;
> +
> + if (PageUptodate(page))
> + unlock_page(page);
> + else {
> + error = mapping->a_ops->readpage(in, page);
> +
> + if (unlikely(error)) {
> + page_cache_release(page);
> + break;
> + }
> + }
> + pages[i] = page;
> + } else if (pages[i]->index != pidx) {
> + page = pages[i];
> + /*
> + * page isn't in the right spot, move it and jump
> + * back to filling this one. we know that ->index
> + * is larger than pidx
> + */
> + pages[i + page->index - pidx] = page;
> + pages[i] = NULL;
> + goto fill_page;
> + }
> + }
> +
> + if (!i)
> + return 0;
> +
> + /*
> + * Now we splice them into the pipe..
> + */
> +splice_them:
> + return move_to_pipe(pipe, pages, i, offset, len);
> +}

OK, and this returns the number of bytes transferred all the way up to
userspace.

And, like read(), if we hit an IO error or ENOMEM partway through we'll
just return a short read which is indistinguishable from EOF.

But if we do get an IO error, we'll detect it in page_cache_pipe_buf_map().
Does the code still follow these read() return value semantics in that
case?

> ...

argh, am all reviewed out, and infiniband isn't compiling. Will look later.

2006-03-30 10:24:13

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30 2006, Andrew Morton wrote:
> Jens Axboe <[email protected]> wrote:
> >
> > Hi,
> >
> > This patch should resolve all issues mentioned so far. I'd still like to
> > implement the page moving, but that should just be a separate patch.
> > After this round of patching, I thought I'd toss a fresh version out
> > there for all to look at.
> >
> > Signed-off-by: Jens Axboe <[email protected]>
> >
> > ...
> >
> > --- a/arch/ia64/kernel/entry.S
> > +++ b/arch/ia64/kernel/entry.S
> > @@ -1605,5 +1605,6 @@ sys_call_table:
> > data8 sys_ni_syscall // reserved for pselect
> > data8 sys_ni_syscall // 1295 reserved for ppoll
> > data8 sys_unshare
> > + date8 sys_splice
>
> typo

Oops, fixed.

> > +static void *page_cache_pipe_buf_map(struct file *file,
> > + struct pipe_inode_info *info,
> > + struct pipe_buffer *buf)
> > +{
> > + struct page *page = buf->page;
> > +
> > + lock_page(page);
> > +
> > + if (!PageUptodate(page)) {
>
> || page->mapping == NULL

Fixed

> > + struct page *pages[PIPE_BUFFERS];
> > + struct page *page;
> > + pgoff_t index, pidx;
> > + int i;
> > +
> > + index = in->f_pos >> PAGE_CACHE_SHIFT;
> > + offset = in->f_pos & ~PAGE_CACHE_MASK;
> > + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> > +
> > + if (nr_pages > PIPE_BUFFERS)
> > + nr_pages = PIPE_BUFFERS;
> > +
> > + /*
> > + * initiate read-ahead on this page range
> > + */
> > + do_page_cache_readahead(mapping, in, index, nr_pages);
> > +
> > + /*
> > + * Get as many pages from the page cache as possible..
> > + * Start IO on the page cache entries we create (we
> > + * can assume that any pre-existing ones we find have
> > + * already had IO started on them).
> > + */
> > + i = find_get_pages(mapping, index, nr_pages, pages);
> > +
> > + /*
> > + * common case - we found all pages and they are contiguous,
> > + * kick them off
> > + */
> > + if (i && (pages[i - 1]->index == index + i - 1))
> > + goto splice_them;
> > +
> > + memset(&pages[i], 0, (nr_pages - i) * sizeof(struct page *));
> > +
> > + /*
> > + * find_get_pages() may not return consecutive pages, so loop
> > + * over the array moving pages and filling the rest, if need be.
> > + */
> > + for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
> > + if (!pages[i]) {
> > + int error;
> > +fill_page:
> > + /*
> > + * no page there, look one up / create it
> > + */
> > + page = find_or_create_page(mapping, pidx,
> > + mapping_gfp_mask(mapping));
> > + if (!page)
> > + break;
> > +
> > + if (PageUptodate(page))
> > + unlock_page(page);
> > + else {
> > + error = mapping->a_ops->readpage(in, page);
> > +
> > + if (unlikely(error)) {
> > + page_cache_release(page);
> > + break;
> > + }
> > + }
> > + pages[i] = page;
> > + } else if (pages[i]->index != pidx) {
> > + page = pages[i];
> > + /*
> > + * page isn't in the right spot, move it and jump
> > + * back to filling this one. we know that ->index
> > + * is larger than pidx
> > + */
> > + pages[i + page->index - pidx] = page;
> > + pages[i] = NULL;
> > + goto fill_page;
> > + }
> > + }
> > +
> > + if (!i)
> > + return 0;
> > +
> > + /*
> > + * Now we splice them into the pipe..
> > + */
> > +splice_them:
> > + return move_to_pipe(pipe, pages, i, offset, len);
> > +}
>
> OK, and this returns the number of bytes transferred all the way up to
> userspace.
>
> And, like read(), if we hit an IO error or ENOMEM partway through we'll
> just return a short read which is indistinguishable from EOF.
>
> But if we do get an IO error, we'll detect it in page_cache_pipe_buf_map().
> Does the code still follow these read() return value semantics in that
> case?

Yes, see:

err = actor(info, buf, &sd);
if (err) {
if (!ret)
ret = err;

break;
}

so if we get -EIO from the actor, then we return bytes transferred _or_
-EIO in 0.

> argh, am all reviewed out, and infiniband isn't compiling. Will look later.

I don't blame you, you've been (as always) a huge help! Thanks a lot.

--
Jens Axboe

2006-03-30 11:17:04

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

Jens Axboe <[email protected]> wrote:
>
> > > +static void *page_cache_pipe_buf_map(struct file *file,
> > > + struct pipe_inode_info *info,
> > > + struct pipe_buffer *buf)
> > > +{
> > > + struct page *page = buf->page;
> > > +
> > > + lock_page(page);
> > > +
> > > + if (!PageUptodate(page)) {
> >
> > || page->mapping == NULL
>
> Fixed

Actually that wasn't right. If the page was truncated then we shouldn't
return -EIO to userspace. We should return zero, as this is the first
page.

Which means either returning an ERR_PTR here for -EIO or re-checking i_size
in the caller. Maybe the latter?

2006-03-30 11:55:18

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30 2006, Andrew Morton wrote:
> Jens Axboe <[email protected]> wrote:
> >
> > > > +static void *page_cache_pipe_buf_map(struct file *file,
> > > > + struct pipe_inode_info *info,
> > > > + struct pipe_buffer *buf)
> > > > +{
> > > > + struct page *page = buf->page;
> > > > +
> > > > + lock_page(page);
> > > > +
> > > > + if (!PageUptodate(page)) {
> > >
> > > || page->mapping == NULL
> >
> > Fixed
>
> Actually that wasn't right. If the page was truncated then we shouldn't
> return -EIO to userspace. We should return zero, as this is the first
> page.

_if_ this is the first page, then agree.

> Which means either returning an ERR_PTR here for -EIO or re-checking i_size
> in the caller. Maybe the latter?

The latter sounds better, I'll add that.

--
Jens Axboe

2006-03-30 12:03:29

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] splice support #2


* Jens Axboe <[email protected]> wrote:

> Hi,
>
> This patch should resolve all issues mentioned so far. I'd still like
> to implement the page moving, but that should just be a separate
> patch.

neat stuff. One question: why do we require fdin or fdout to be a pipe?
Is there any fundamental problem with implementing what Larry's original
paper described too: straight pagecache -> socket transfers? Without a
pipe intermediary forced inbetween. It only adds unnecessary overhead.

Ingo

2006-03-30 12:05:09

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30 2006, Ingo Molnar wrote:
>
> * Jens Axboe <[email protected]> wrote:
>
> > Hi,
> >
> > This patch should resolve all issues mentioned so far. I'd still like
> > to implement the page moving, but that should just be a separate
> > patch.
>
> neat stuff. One question: why do we require fdin or fdout to be a pipe?
> Is there any fundamental problem with implementing what Larry's original
> paper described too: straight pagecache -> socket transfers? Without a
> pipe intermediary forced inbetween. It only adds unnecessary overhead.

No, not a fundamental problem. I think I even hid that in some comment
in there, at least if it's decipharable by someone else than myself...
Basically I think it would be nice in the future to tidy this a little
bit and separate the actual container from the pipe itself - and have
the pipe just fill/use the same container.

--
Jens Axboe

2006-03-30 12:13:01

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] splice support #2


* Jens Axboe <[email protected]> wrote:

> On Thu, Mar 30 2006, Ingo Molnar wrote:
> >
> > * Jens Axboe <[email protected]> wrote:
> >
> > > Hi,
> > >
> > > This patch should resolve all issues mentioned so far. I'd still like
> > > to implement the page moving, but that should just be a separate
> > > patch.
> >
> > neat stuff. One question: why do we require fdin or fdout to be a pipe?
> > Is there any fundamental problem with implementing what Larry's original
> > paper described too: straight pagecache -> socket transfers? Without a
> > pipe intermediary forced inbetween. It only adds unnecessary overhead.
>
> No, not a fundamental problem. I think I even hid that in some comment
> in there, at least if it's decipharable by someone else than myself...
> Basically I think it would be nice in the future to tidy this a little
> bit and separate the actual container from the pipe itself - and have
> the pipe just fill/use the same container.

why is there a container needed at all? If i splice pagecache->socket,
we can use sendpage to send it off immediately. There is no need for any
container - both the pagecache and sendpage use struct page, and when we
iterate to create a container we might as well ->sendpage() those pages
off immediately instead.

I agree with the purpose of making sys_splice() generic and in
particular usable in scripts/shells where pipes are commonly used, but
we should also fulfill the original promise (outlined 15 years ago or
so) and not limit this to pipes. That way i could improve TUX to make
use of it for example ;)

Ingo

2006-03-30 12:16:32

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30 2006, Ingo Molnar wrote:
>
> * Jens Axboe <[email protected]> wrote:
>
> > On Thu, Mar 30 2006, Ingo Molnar wrote:
> > >
> > > * Jens Axboe <[email protected]> wrote:
> > >
> > > > Hi,
> > > >
> > > > This patch should resolve all issues mentioned so far. I'd still like
> > > > to implement the page moving, but that should just be a separate
> > > > patch.
> > >
> > > neat stuff. One question: why do we require fdin or fdout to be a pipe?
> > > Is there any fundamental problem with implementing what Larry's original
> > > paper described too: straight pagecache -> socket transfers? Without a
> > > pipe intermediary forced inbetween. It only adds unnecessary overhead.
> >
> > No, not a fundamental problem. I think I even hid that in some comment
> > in there, at least if it's decipharable by someone else than myself...
> > Basically I think it would be nice in the future to tidy this a little
> > bit and separate the actual container from the pipe itself - and have
> > the pipe just fill/use the same container.
>
> why is there a container needed at all? If i splice pagecache->socket,
> we can use sendpage to send it off immediately. There is no need for any
> container - both the pagecache and sendpage use struct page, and when we
> iterate to create a container we might as well ->sendpage() those pages
> off immediately instead.
>
> I agree with the purpose of making sys_splice() generic and in
> particular usable in scripts/shells where pipes are commonly used, but
> we should also fulfill the original promise (outlined 15 years ago or
> so) and not limit this to pipes. That way i could improve TUX to make
> use of it for example ;)

There's absolutely no reason why we can't add fd -> fd splicing as well,
so no worries. Right now we just require a pipe transport. It's
extendable :-)

--
Jens Axboe

2006-03-30 12:30:05

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30 2006, Jens Axboe wrote:
> On Thu, Mar 30 2006, Andrew Morton wrote:
> > Jens Axboe <[email protected]> wrote:
> > >
> > > > > +static void *page_cache_pipe_buf_map(struct file *file,
> > > > > + struct pipe_inode_info *info,
> > > > > + struct pipe_buffer *buf)
> > > > > +{
> > > > > + struct page *page = buf->page;
> > > > > +
> > > > > + lock_page(page);
> > > > > +
> > > > > + if (!PageUptodate(page)) {
> > > >
> > > > || page->mapping == NULL
> > >
> > > Fixed
> >
> > Actually that wasn't right. If the page was truncated then we shouldn't
> > return -EIO to userspace. We should return zero, as this is the first
> > page.
>
> _if_ this is the first page, then agree.
>
> > Which means either returning an ERR_PTR here for -EIO or re-checking i_size
> > in the caller. Maybe the latter?
>
> The latter sounds better, I'll add that.

Actually that's a little hard, since you don't know what state buf->page
is in - it might be page cache, it might not be. So I opted for the
ERR_PTR approach, just returning -ENODATA on !page->mapping.

--
Jens Axboe

2006-03-30 12:30:43

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30 2006, Jens Axboe wrote:
> On Thu, Mar 30 2006, Jens Axboe wrote:
> > On Thu, Mar 30 2006, Andrew Morton wrote:
> > > Jens Axboe <[email protected]> wrote:
> > > >
> > > > > > +static void *page_cache_pipe_buf_map(struct file *file,
> > > > > > + struct pipe_inode_info *info,
> > > > > > + struct pipe_buffer *buf)
> > > > > > +{
> > > > > > + struct page *page = buf->page;
> > > > > > +
> > > > > > + lock_page(page);
> > > > > > +
> > > > > > + if (!PageUptodate(page)) {
> > > > >
> > > > > || page->mapping == NULL
> > > >
> > > > Fixed
> > >
> > > Actually that wasn't right. If the page was truncated then we shouldn't
> > > return -EIO to userspace. We should return zero, as this is the first
> > > page.
> >
> > _if_ this is the first page, then agree.
> >
> > > Which means either returning an ERR_PTR here for -EIO or re-checking i_size
> > > in the caller. Maybe the latter?
> >
> > The latter sounds better, I'll add that.
>
> Actually that's a little hard, since you don't know what state buf->page
> is in - it might be page cache, it might not be. So I opted for the
> ERR_PTR approach, just returning -ENODATA on !page->mapping.

Ala

diff --git a/fs/splice.c b/fs/splice.c
index 7927dff..68b98bb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -47,9 +47,14 @@ static void *page_cache_pipe_buf_map(str

lock_page(page);

- if (!PageUptodate(page) || !page->mapping) {
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ return ERR_PTR(-EIO);
+ }
+
+ if (!page->mapping) {
unlock_page(page);
- return NULL;
+ return ERR_PTR(-ENODATA);
}

return kmap(buf->page);
@@ -279,6 +293,7 @@ static int pipe_to_sendpage(struct pipe_
loff_t pos = sd->pos;
unsigned int offset;
ssize_t ret;
+ void *ptr;

/*
* sub-optimal, but we are limited by the pipe ->map. we don't
@@ -286,8 +301,9 @@ static int pipe_to_sendpage(struct pipe_
* have the page pinned if the pipe page originates from the
* page cache
*/
- if (!buf->ops->map(file, info, buf))
- return -EIO;
+ ptr = buf->ops->map(file, info, buf);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);

offset = pos & ~PAGE_CACHE_MASK;

@@ -334,8 +350,8 @@ static int pipe_to_file(struct pipe_inod
* after this, page will be locked and unmapped
*/
src = buf->ops->map(file, info, buf);
- if (!src)
- return -EIO;
+ if (IS_ERR(src))
+ return PTR_ERR(src);

index = sd->pos >> PAGE_CACHE_SHIFT;
offset = sd->pos & ~PAGE_CACHE_MASK;
@@ -436,7 +452,7 @@ static ssize_t move_from_pipe(struct ino

err = actor(info, buf, &sd);
if (err) {
- if (!ret)
+ if (!ret && err != -ENODATA)
ret = err;

break;

--
Jens Axboe

2006-03-30 12:41:59

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30 2006, Ingo Molnar wrote:
>
> * Jens Axboe <[email protected]> wrote:
>
> > > I agree with the purpose of making sys_splice() generic and in
> > > particular usable in scripts/shells where pipes are commonly used, but
> > > we should also fulfill the original promise (outlined 15 years ago or
> > > so) and not limit this to pipes. That way i could improve TUX to make
> > > use of it for example ;)
> >
> > There's absolutely no reason why we can't add fd -> fd splicing as
> > well, so no worries. Right now we just require a pipe transport. It's
> > extendable :-)
>
> ok :) I think this code should be merged into v2.6.17 - it's very clean
> and unintrusive.

I hope so, too. I'll post a hopefully finished patch real soon, and then
the move pages addon afterwards (which works now).

--
Jens Axboe

2006-03-30 12:40:36

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] splice support #2


* Jens Axboe <[email protected]> wrote:

> > I agree with the purpose of making sys_splice() generic and in
> > particular usable in scripts/shells where pipes are commonly used, but
> > we should also fulfill the original promise (outlined 15 years ago or
> > so) and not limit this to pipes. That way i could improve TUX to make
> > use of it for example ;)
>
> There's absolutely no reason why we can't add fd -> fd splicing as
> well, so no worries. Right now we just require a pipe transport. It's
> extendable :-)

ok :) I think this code should be merged into v2.6.17 - it's very clean
and unintrusive.

Ingo

2006-03-30 12:44:41

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] splice support #2


* Jens Axboe <[email protected]> wrote:

> On Thu, Mar 30 2006, Ingo Molnar wrote:
> >
> > * Jens Axboe <[email protected]> wrote:
> >
> > > > I agree with the purpose of making sys_splice() generic and in
> > > > particular usable in scripts/shells where pipes are commonly used, but
> > > > we should also fulfill the original promise (outlined 15 years ago or
> > > > so) and not limit this to pipes. That way i could improve TUX to make
> > > > use of it for example ;)
> > >
> > > There's absolutely no reason why we can't add fd -> fd splicing as
> > > well, so no worries. Right now we just require a pipe transport. It's
> > > extendable :-)
> >
> > ok :) I think this code should be merged into v2.6.17 - it's very clean
> > and unintrusive.
>
> I hope so, too. I'll post a hopefully finished patch real soon, and
> then the move pages addon afterwards (which works now).

what does the move pages addon do? pagecache->pagecache transfer?

Ingo

2006-03-30 13:02:56

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30 2006, Ingo Molnar wrote:
>
> * Jens Axboe <[email protected]> wrote:
>
> > On Thu, Mar 30 2006, Ingo Molnar wrote:
> > >
> > > * Jens Axboe <[email protected]> wrote:
> > >
> > > > > I agree with the purpose of making sys_splice() generic and in
> > > > > particular usable in scripts/shells where pipes are commonly used, but
> > > > > we should also fulfill the original promise (outlined 15 years ago or
> > > > > so) and not limit this to pipes. That way i could improve TUX to make
> > > > > use of it for example ;)
> > > >
> > > > There's absolutely no reason why we can't add fd -> fd splicing as
> > > > well, so no worries. Right now we just require a pipe transport. It's
> > > > extendable :-)
> > >
> > > ok :) I think this code should be merged into v2.6.17 - it's very clean
> > > and unintrusive.
> >
> > I hope so, too. I'll post a hopefully finished patch real soon, and
> > then the move pages addon afterwards (which works now).
>
> what does the move pages addon do? pagecache->pagecache transfer?

Precisely, if you do file1 | file2 splicing, you can move the pages to
file2's address space. So say I do splice-cp file1 file2, I would only
have file2 contents in page cache aftwards. And I'd save a memcpy of all
the data.

--
Jens Axboe

2006-03-30 14:20:12

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30, 2006 at 02:38:05PM +0200, Ingo Molnar wrote:
> ok :) I think this code should be merged into v2.6.17 - it's very clean
> and unintrusive.

no. there's a reason we have a staging area in -mm. Especially for things
introducing a new user ABI. Give it a few weeks exposure in which a few
things will change for sure instead of rushing it into a release.

2006-03-30 17:03:01

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] splice support #2



On Thu, 30 Mar 2006, Jens Axboe wrote:
> On Thu, Mar 30 2006, Ingo Molnar wrote:
> >
> > neat stuff. One question: why do we require fdin or fdout to be a pipe?
> > Is there any fundamental problem with implementing what Larry's original
> > paper described too: straight pagecache -> socket transfers? Without a
> > pipe intermediary forced inbetween. It only adds unnecessary overhead.
>
> No, not a fundamental problem. I think I even hid that in some comment
> in there, at least if it's decipharable by someone else than myself...

Actually, there _is_ a fundamental problem. Two of them, in fact.

The reason it goes through a pipe is two-fold:

- the pipe _is_ the buffer. The reason sendfile() sucks is that sendfile
cannot work with <n> different buffer representations. sendfile() only
works with _one_ buffer representation, namely the "page cache of the
file".

By using the page cache directly, sendfile() doesn't need any extra
buffering, but that's also why sendfile() fundamentally _cannot_ work
with anything else. You cannot do "sendfile" between two sockets to
forward data from one place to another, for example. You cannot do
sendfile from a streaming device.

The pipe is just the standard in-kernel buffer between two arbitrary
points. Think of it as a scatter-gather list with a wait-queue. That's
what a pipe _is_. Trying to get rid of the pipe totally misses the
whole point of splice().

Now, we could have a splice call that has an _implicit_ pipe, ie if
neither side is a pipe, we could create a temporary pipe and thus
allow what looks like a direct splice. But the pipe should still be
there.

- The pipe is the buffer #2: it's what allows you to do _other_ things
with splice that are simply impossible to do with sendfile. Notably,
splice allows very naturally the "readv/writev" scatter-gather
behaviour of _mixing_ streams. If you're a web-server, with splice you
can do

write(pipefd, header, header_len);
splice(file, pipefd, file_len);
splice(pipefd, socket, total_len);

(this is all conceptual pseudo-code, of course), and this very
naturally has none of the issues that sendfile() has with plugging etc.
There's never any "send header separately and do extra work to make
sure it is in the same packet as the start of the data".

So having a separate buffer even when you _do_ have a buffer like the
page cache is still something you want to do.

So there.

Linus

2006-03-30 17:17:34

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] splice support #2



On Thu, 30 Mar 2006, Linus Torvalds wrote:
>
> Actually, there _is_ a fundamental problem. Two of them, in fact.

Actually, four.

The third reason the pipe buffer is so useful is that it's literally a
stream with no position.

That may sound bad, but it's actually a huge deal. It's why standard unix
pipelines are so powerful. You don't pass around "this file, this offset,
this length" - you pass around a simple fd, and you can feed that fd data
without ever having to worry about what the reader of the data does. The
reader cannot seek around to places that you didn't want him to see, and
the reader cannot get confused about where the end is.

The 4th reason is "tee". Again, you _could_ perhaps do "tee" without the
pipe, but it would be a total nightmare. Now, tee isn't that common, but
it does happen, and in particular it happens a lot with certain streaming
content.

Doing a "tee" with regular pipes is not that common: you usually just use
it for debugging or logging (ie you have a pipeline you want to debug, and
inserting "tee" in the middle is a good way to keep the same pipeline, but
also being able to look at the intermediate data when something went
wrong).

However, one reason "tee" _isn't_ that common with regular pipe usage is
that normal programs never need to do that anyway: all the pipe data
always goes through user space, so you can trivially do a "tee" inside of
the application itself without any external support. You just log the data
as you receive it.

But with splice(), the whole _point_ of the system call is that at least a
portion of the data never hits a user space buffer at all. Which means
that suddenly "tee" becomes much more important, because it's the _only_
way to insert a point where you can do logging/debugging of the data.

Now, I didn't do the "tee()" system call in my initial example thing, and
Jens didn't add it either, but I described it back in Jan-2005 with the
original description. It really is very fundamental if you ever want to
have a "plugin" kind of model, where you plug in different users to the
same data stream.

The canonical example is getting video input from an mpeg encoder, and
_both_ saving it to a file and sending it on in real-time to the app that
shows it in a window. Again, having the pipe is what allows this.

Linus

2006-03-30 19:19:39

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

Jens Axboe <[email protected]> wrote:
>
> On Thu, Mar 30 2006, Jens Axboe wrote:
> > On Thu, Mar 30 2006, Andrew Morton wrote:
> > > Jens Axboe <[email protected]> wrote:
> > > >
> > > > > > +static void *page_cache_pipe_buf_map(struct file *file,
> > > > > > + struct pipe_inode_info *info,
> > > > > > + struct pipe_buffer *buf)
> > > > > > +{
> > > > > > + struct page *page = buf->page;
> > > > > > +
> > > > > > + lock_page(page);
> > > > > > +
> > > > > > + if (!PageUptodate(page)) {
> > > > >
> > > > > || page->mapping == NULL
> > > >
> > > > Fixed
> > >
> > > Actually that wasn't right. If the page was truncated then we shouldn't
> > > return -EIO to userspace. We should return zero, as this is the first
> > > page.
> >
> > _if_ this is the first page, then agree.
> >
> > > Which means either returning an ERR_PTR here for -EIO or re-checking i_size
> > > in the caller. Maybe the latter?
> >
> > The latter sounds better, I'll add that.
>
> Actually that's a little hard, since you don't know what state buf->page
> is in - it might be page cache, it might not be. So I opted for the
> ERR_PTR approach, just returning -ENODATA on !page->mapping.
>

That's good, because the recheck-i_size idea was racy too. If another
thread were to do a quick truncate-then-extend then there's a window where
we'd see a truncated page, return NULL from page_cache_pipe_buf_map() and
then by the time i_size is re-checked, i_size has grown again so we end up
deciding it must have been an IO error.

2006-03-30 20:48:20

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH] splice support #2


Let me pull back from the details a bit, to note a code pattern I'm
beginning to see:

if (src_fd is a file &&
dest_fd is a socket)
sendfile()
else
hand code fd->fd data move

with splice this becomes

if (special case fd combination #1)
sendfile()
else (special case fd combination #2)
splice()
else
hand code fd->fd data move

Creating a syscall for each fd->fd data move case seems wasteful. I
would rather that the kernel Does The Right Thing so the app doesn't
have to support all these special cases. Handling the implicit buffer
case in the kernel, when needed, means that the app is future-proofed:
when another fd->fd optimization is implemented, the app automatically
takes advantage of it.

Jeff


2006-03-30 21:17:38

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] splice support #2



On Thu, 30 Mar 2006, Jeff Garzik wrote:
>
> with splice this becomes
>
> if (special case fd combination #1)
> sendfile()
> else (special case fd combination #2)
> splice()
> else
> hand code fd->fd data move

No, it really should be splice for all combinations (possibly with a
manual read/write fallback for stuff where it just hasn't been done).

The fact that we don't have splice for certain fd combinations is really
just a result of it not being done yet. One of the reasons I wanted to
merge asap was that the original example patch was done over a year ago,
and not a lot happened, so I'm hoping that the fact that the core code is
now in the base tree is going to make people do the necessary splice input
functions for different file types.

For filesystems, splice support tends to be really easy (both read and
write). For other things, it depends a bit. But unlike sendfile(), it
really is quite possible to splice _from_ a socket too, not just _to_ a
socket. But no, that case hasn't been written yet.

(In contrast, with "sendfile()", you just fundamentally couldn't do it).

> Creating a syscall for each fd->fd data move case seems wasteful. I would
> rather that the kernel Does The Right Thing so the app doesn't have to support
> all these special cases. Handling the implicit buffer case in the kernel,
> when needed, means that the app is future-proofed: when another fd->fd
> optimization is implemented, the app automatically takes advantage of it.

splice() really can handle any fd->fd move.

The reason you want to have a pipe in the middle is that you have to
handle partial moves _some_ way. And the pipe being the buffer really does
allow that, and also handles the case of "what happens when we received
more data than we could write".

So the way to do copies is

int pipefd[2];
unsigned long copied = 0;

if (pipe(pipefd) < 0)
error

for (;;) {
int nr = splice(in, pipefd[1], MAX_INT, 0);
if (nr <= 0)
break;
do {
int ret = splice(pipefd[0], out, nr, 0);
if (ret <= 0) {
error: couldn't write 'nr' bytes
break;
}

nr -= ret;
} while (nr);
}
close(pipefd[0]);
close(pipefd[1]);

which may _seem_ very complicated and the extra pipe seems "useless", but
it's (a) actually pretty efficient and (b) allows error recovery.

That (b) is the really important part. I can pretty much guarantee that
without the "useless" pipe, you simply couldn't do it.

In particular, what happens when you try to connect two streaming devices,
but the destination stops accepting data? You cannot put the received data
"back" into the streaming source any way - so if you actually want to be
able to handle error recovery, you _have_ to get access to the source
buffers.

Also, for signal handling, you need to have some way to keep the pipe
around for several iterations on the sender side, while still returning to
user space to do the signal handler.

And guess what? That's exactly what you get with that "useless" pipe. For
error handling, you can decide to throw the extra data that the recipient
didn't want away (just close the pipe), and maybe that's going to be what
most people do. But you could also decide to just do a "read()" on the
pipefd, to just read the data into user space for debugging and/or error
recovery..

Similarly, for signals, the pipe _is_ that buffer that you need that is
consistent across multiple invocations.

So that pipe is not at all unnecessary, and in fact, it's critical. It may
look more complex than sendfile(), but it's more complex exactly because
it can handle cases that sendfile never could, and just punted on (for
regular files, you never have the issue of half-way buffers, since you
just re-read them. Which is why sendfile() could do with it's simple
interface, but is also why sendfile() was never good for anything else).

Linus

2006-03-31 00:03:52

by George Spelvin

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

The Lord High Penguin spake:
> In particular, what happens when you try to connect two streaming devices,
> but the destination stops accepting data? You cannot put the received
> data "back" into the streaming source any way - so if you actually want
> to be able to handle error recovery, you _have_ to get access to the
> source buffers.

Ding! Ding! Ding! The light dawns.

Oh my, that *is* clever. And I remember wrestling with the problem before.
If you're reading from a seekable fd (a la sendfile()), you can always
back up the read pointer, but if you're reading from a non-seekable
fd, what *do* you do if there's a write problem? The pipe gives you
a solution.


However, that gets me thinking... what about a pipe lets it be used as
a splice() source? The answer is that, like seekable files, it's peekable.
You can "peek at" an efficiently-sized chunk of data before deciding
to remove it from the source. You don't have to read(buf)/write(buf),
you can buf=peek()/len=write(buf)/accept(len).

Network sockets have too many features (what with urgent data pointers
and crap) to make that easy, and some character devices have
side effects from read() that can't be hidden at all. So you need
an explicit buffer.


Next, what about a pipe makes it an acceptable splice() destination?
The answer is that space is reservable. You can know that a write()
won't fail before read()ing the source. But a great many devices with
non-synchronous write semantics behave the same way. Certainly it's
not hard to make a non-O_SYNC file system behave that way.


Give all that, it it necessary to use the fallback of a pipe all
the time? It seems that doing without the pipe would be
feasible a large fraction of the time And would a

bytes_written = splice2(src, pipefd[1], pipefd[0], dst, &bytes_buffered);

system call be worth it to reduce overhead? (I'm assuming that
bytes_read = bytes_written + bytes_buffered, but you can use any two
of the three in the interface.) Note that it could bypass the pipe
entirely for those cases where it's possible.


Finally, has anyone thought about the semantics of tee(2)?
It seems that it could be used on block files to get a second
handle that does NOT share the file pointer. But if used on
a pipe, does the write block if EITHER reader is behind?
So one reader could block the other? I can't see any other way
to implement it, but I would like to be sure those semantics are
expected.

2006-03-31 02:31:42

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

Linus Torvalds wrote:

> splice() really can handle any fd->fd move.
>
> The reason you want to have a pipe in the middle is that you have to
> handle partial moves _some_ way. And the pipe being the buffer really does
> allow that, and also handles the case of "what happens when we received
> more data than we could write".
>
> So the way to do copies is
>
> int pipefd[2];
> unsigned long copied = 0;
>
> if (pipe(pipefd) < 0)
> error
>
> for (;;) {
> int nr = splice(in, pipefd[1], MAX_INT, 0);
> if (nr <= 0)
> break;
> do {
> int ret = splice(pipefd[0], out, nr, 0);
> if (ret <= 0) {
> error: couldn't write 'nr' bytes
> break;
> }
>
> nr -= ret;
> } while (nr);
> }
> close(pipefd[0]);
> close(pipefd[1]);
>

I think it makes sense to have a 64-bit length. It just seems
cleaner because it is in userspace units of file offset. Also,
might you be able to do a single file-sized file<->file splice,
and have it do a remote copy on a suitable network fs, or a
whole-file COW on some local fs (maybe not, as splice doesn't
deal with metadata... I don't know the tricky details of fses).

But your argument against a 64-bit length seemed to involve
limiting the usage that sys_splice would see. Make it 64-bit
instead and someone might come up with something that you
hadn't thought of. Is there any downside?

No offsets :(
Don't they only increase flexibility? Or would you prefer to
add a new sys_psplice for that?

--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com

2006-03-31 02:44:08

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

Nick Piggin <[email protected]> wrote:
>
> Also,
> might you be able to do a single file-sized file<->file splice,
> and have it do a remote copy on a suitable network fs

splice() may not be suitable for such filesystems. Quoting Robert Peterson
from earlier this week:


> The problem is that
> loop.c circumvents proper locking by going directly to prepare_write
> rather than following the normal process. If I added some kind of
> library callbacks to allow cluster locking, it would break the
> normal locking sequence of an ordinary write.
>
> The normal sequence of an ordinary write typically looks like:
>
> 1. write
> 2. (Take care of cluster locking if necessary)
> 3. generic_file_write_nolock
> 4. generic_file_aio_write_nolock
> 5. generic_file_buffered_write
> 6. a_ops->prepare_write
> etc.
>
> Right now, loop.c is circumventing step 2 (optional cluster locking
> if needed by the underlying fs) by bypassing the "write" op. If I
> added callbacks to prepare_write at (7) as you suggest, it would either
> (a) introduce deadlocks conflicting with step 2 above or
> (b) require the underlying fs to use its own versions of 3,4,5, and 6,
> thus bypassing a great deal of vfs, which is not good for anyone.
>
> Some could argue that loop.c should always use write rather than
> prepare_write/commit_write, but most fs's don't have special
> locking needs and therefore using them is a performance boost.
>

2006-03-31 02:52:19

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

Andrew Morton <[email protected]> wrote:
>
> splice() may not be suitable for such filesystems.

OK, splice() cuts in at the file_operations level, so sych a clustered
filesystem _could_ implement it, but none of the code we have there will be
usable by it. If the operations in splice.c were to operate at the
file_operations level (->read, ->write) then probably they could be used
thusly.

2006-03-31 06:06:19

by tridge

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

Linus and Jens,

Stephen Rothwell just pointed me at the new splice() interface. Looks
really nice!

One comment though. Could we add a off_t to splice to make it more
like pwrite() ? Otherwise threads could get painful with race
conditions between the seek and the splice() call.

Either that or add psplice() like this:

ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);

where ofs sets the offset to read from fdin.

Cheers, Tridge

2006-03-31 06:35:54

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30, 2006 at 06:51:33PM -0800, Andrew Morton wrote:
> Andrew Morton <[email protected]> wrote:
> >
> > splice() may not be suitable for such filesystems.
>
> OK, splice() cuts in at the file_operations level, so sych a clustered
> filesystem _could_ implement it, but none of the code we have there will be
> usable by it. If the operations in splice.c were to operate at the
> file_operations level (->read, ->write) then probably they could be used
> thusly.

Of course the code would be useable, same as current generic_file_* code
is useable with small wrappers.

2006-03-31 06:59:23

by Antonio Vargas

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On 3/31/06, [email protected] <[email protected]> wrote:
> Linus and Jens,
>
> Stephen Rothwell just pointed me at the new splice() interface. Looks
> really nice!
>
> One comment though. Could we add a off_t to splice to make it more
> like pwrite() ? Otherwise threads could get painful with race
> conditions between the seek and the splice() call.
>
> Either that or add psplice() like this:
>
> ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);
>
> where ofs sets the offset to read from fdin.
>
> Cheers, Tridge

Maybe "ssize_t psplice(int fdin, int fdout, size_t len, off_t
fdin_ofs, unsigned flags);"
would be better at documenting that the offset is for the input? Or
maybe adding also a fdout_ofs for when fdout is also a file...

--
Greetz, Antonio Vargas aka winden of network

http://wind.codepixel.com/
[email protected]
[email protected]

Every day, every year
you have to work
you have to study
you have to scene.

2006-03-31 07:12:08

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] splice support #2


* Linus Torvalds <[email protected]> wrote:

> In particular, what happens when you try to connect two streaming
> devices, but the destination stops accepting data? You cannot put the
> received data "back" into the streaming source any way - so if you
> actually want to be able to handle error recovery, you _have_ to get
> access to the source buffers.

i'd rather implement this error case as an exception mechanism, instead
of a forced intermediary buffer mechanism.

We should extend the userspace API so that it is prepared to receive
'excess data' via a separate 'flush excess data to' file descriptor:

sys_splice(fd_in, fd_out, fd_flush, size,
max_flush_size, *bytes_flushed)

Note1: fd_flush can be a pipe too! This would avoid copies in the
exception case - if the exception case is expected to be common.

Note2: max_flush_size serves as hint and as a natural 'buffering limit'
for the kernel-internal loops. I believe it's more natural than
the implicit 'pipe buffering limit' we currently have.
max_flush_size == 0 would say to the kernel: 'use whatever
buffering is natural or necessary'. E.g. if fd_flush is a pipe,
it would automatically set the buffering size to the flush-pipe's
internal buffering limit.

Note3: we could even eliminate the "*bytes_flushed" parameter from
the syscall: as fd_flush's seek offset gives userspace an idea
about how much data was written to it.

Note4: if the user messes up fd_flush so that the kernel's "excess data"
transfer into fd_flush failes then that's 'tough luck' and flush
data may be lost. Users can use pipes [if the exception case is
common and they want to optimize that codepath] or can pre-write
their files if they need a 100% guarantee.
In fact, the kernel doesnt even have to _look up_ fd_flush in the
common case. It's the application's responsibility to make sure
the exception case will work. This means that the _only_ overhead
from this exception mechanism are the 2-3 extra parameters to
sys_splice(). That's _much_ faster.

Just look at the beauty of this generalization. fd_flush can be
_anything_. It could be a pipe. It could be a temporary file in /tmp. It
could be a file over the network. fd_flush could be mmap()-ed to
user-space! Or it could even be -1 if the user is not interested in the
error case for the streaming data. (For example a good portion of video
and audio playback applications are not interested in the fd_out error
case at all: such data can easily lose 'value' if it gets delayed by
more than a few milliseconds and the right answer is to skip the frame
or display an error message, ignoring the lost data.)

But for heaven's sake: do not slow down the 99.9999999999% fastpath by
forcing a pipe inbetween on the ABI level! I really have nothing against
making sys_splice() generic and i agree that a very good first step to
achieve that is to include pipes in the implementation, but i dont think
pipes are (or should be) all that critical and fundamental to the splice
data-streaming concept itself, as you are suggesting.

> Also, for signal handling, you need to have some way to keep the pipe
> around for several iterations on the sender side, while still
> returning to user space to do the signal handler.

i believe the signal case is naturally handled by the fd_flush approach
too - in fact it can also acts as a nice tester for the exception
handling mechanism.

If the application in question expects to get many signals then it can
use a pipe as fd_flush. (But signal-heavy apps are quite rare: most
performance-critical apps avoid them for the fastpath like the plague,
on modern CPUs it's more expensive to receive and handle a single signal
than to create and tear down a completely new thread (!))

Ingo

2006-03-31 07:36:56

by tridge

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

Antonio,

> Maybe "ssize_t psplice(int fdin, int fdout, size_t len, off_t
> fdin_ofs, unsigned flags);"

yep, thats clearer

I think ideally splice() would just be changed to always take a off_t
as the interface is so new hopefully no apps depend on it yet. Then we
don't need psplice().

A while ago we got bitten with tdb in Samba and systems that only have
write(), and no pwrite(). If a process forks then accesses the db in
both parent and child the shared seek pointer semantics of inherited
file descriptors causes races with seek/write. It would be nice if
splice() was immune from this from the start.

If people want it both ways, then an alternative would be to have a
SPLICE_FLAG_OFS_CURRENT flag to splice(), which would tell splice() to
ignore the off_t and use the current seek ptr. I guess that would save
some book keeping for some apps that know they don't have any seek
races and want to always write at the current position. I wouldn't
need this, but some people might find it convenient.

> would be better at documenting that the offset is for the input? Or
> maybe adding also a fdout_ofs for when fdout is also a file...

I thought about that, but Stephen convinced me it was a bad idea, as
the semantics with whats in the pipe buffer at any one time gets too
messy if the output is a socket. I guess you could return -EINVAL if
you used the fdout_ofs and its not a seekable file, but that seems to
be overloading the system call too much for my liking.

Cheers, Tridge

2006-03-31 07:42:18

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

Andrew Morton wrote:
> Andrew Morton <[email protected]> wrote:
>
>>splice() may not be suitable for such filesystems.
>

Well OK, but my point about making the syscall API use 64-bit
size is not completely invalid. Unless there is some downside
to it.

--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com

2006-03-31 09:56:28

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, Mar 30 2006, Linus Torvalds wrote:
>
>
> On Thu, 30 Mar 2006, Jens Axboe wrote:
> > On Thu, Mar 30 2006, Ingo Molnar wrote:
> > >
> > > neat stuff. One question: why do we require fdin or fdout to be a pipe?
> > > Is there any fundamental problem with implementing what Larry's original
> > > paper described too: straight pagecache -> socket transfers? Without a
> > > pipe intermediary forced inbetween. It only adds unnecessary overhead.
> >
> > No, not a fundamental problem. I think I even hid that in some comment
> > in there, at least if it's decipharable by someone else than myself...
>
> Actually, there _is_ a fundamental problem. Two of them, in fact.
>
> The reason it goes through a pipe is two-fold:
>
> - the pipe _is_ the buffer. The reason sendfile() sucks is that sendfile
> cannot work with <n> different buffer representations. sendfile() only
> works with _one_ buffer representation, namely the "page cache of the
> file".
>
> By using the page cache directly, sendfile() doesn't need any extra
> buffering, but that's also why sendfile() fundamentally _cannot_ work
> with anything else. You cannot do "sendfile" between two sockets to
> forward data from one place to another, for example. You cannot do
> sendfile from a streaming device.
>
> The pipe is just the standard in-kernel buffer between two arbitrary
> points. Think of it as a scatter-gather list with a wait-queue. That's
> what a pipe _is_. Trying to get rid of the pipe totally misses the
> whole point of splice().
>
> Now, we could have a splice call that has an _implicit_ pipe, ie if
> neither side is a pipe, we could create a temporary pipe and thus
> allow what looks like a direct splice. But the pipe should still be
> there.
>
> - The pipe is the buffer #2: it's what allows you to do _other_ things
> with splice that are simply impossible to do with sendfile. Notably,
> splice allows very naturally the "readv/writev" scatter-gather
> behaviour of _mixing_ streams. If you're a web-server, with splice you
> can do
>
> write(pipefd, header, header_len);
> splice(file, pipefd, file_len);
> splice(pipefd, socket, total_len);
>
> (this is all conceptual pseudo-code, of course), and this very
> naturally has none of the issues that sendfile() has with plugging etc.
> There's never any "send header separately and do extra work to make
> sure it is in the same packet as the start of the data".
>
> So having a separate buffer even when you _do_ have a buffer like the
> page cache is still something you want to do.
>
> So there.

My point was mainly that the buffer itself need not necessarily be a
pipe, it could be implemented with a pipe just using the same buffer
type. But I guess it doesn't make much sense, the pipe has nice
advantages in itself.

--
Jens Axboe

2006-03-31 09:57:15

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Fri, Mar 31 2006, [email protected] wrote:
> Linus and Jens,
>
> Stephen Rothwell just pointed me at the new splice() interface. Looks
> really nice!
>
> One comment though. Could we add a off_t to splice to make it more
> like pwrite() ? Otherwise threads could get painful with race
> conditions between the seek and the splice() call.
>
> Either that or add psplice() like this:
>
> ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);
>
> where ofs sets the offset to read from fdin.

I definitely see some valid reasons for adding a file offset instead of
using ->f_pos, I'll leave that decision up to Linus though. Linus?

--
Jens Axboe

2006-03-31 12:20:48

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] splice support #2


* Linus Torvalds <[email protected]> wrote:

> - The pipe is the buffer #2: it's what allows you to do _other_ things
> with splice that are simply impossible to do with sendfile. Notably,
> splice allows very naturally the "readv/writev" scatter-gather
> behaviour of _mixing_ streams. If you're a web-server, with splice you
> can do
>
> write(pipefd, header, header_len);
> splice(file, pipefd, file_len);
> splice(pipefd, socket, total_len);
>
> (this is all conceptual pseudo-code, of course), and this very
> naturally has none of the issues that sendfile() has with plugging etc.
> There's never any "send header separately and do extra work to make
> sure it is in the same packet as the start of the data".

with pipe-based buffering this approach has still the very same problems
that sendfile() has with packet boundaries, because it's not enough to
have "large enough" buffering (like a pipe has), the pipe also has to be
drained, and the networking layer has to know the precise boundary of
data.

the right solution to the packet boundary problem is to pass in a proper
"does userspace expect more data right now" flag, or to let userspace
'flush' the socket independently - which is independent of the
pipe-in-slice issue. This solution already exists: the MSG_MORE flag.

Ingo

2006-03-31 12:23:31

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Fri, Mar 31 2006, Ingo Molnar wrote:
>
> * Linus Torvalds <[email protected]> wrote:
>
> > - The pipe is the buffer #2: it's what allows you to do _other_ things
> > with splice that are simply impossible to do with sendfile. Notably,
> > splice allows very naturally the "readv/writev" scatter-gather
> > behaviour of _mixing_ streams. If you're a web-server, with splice you
> > can do
> >
> > write(pipefd, header, header_len);
> > splice(file, pipefd, file_len);
> > splice(pipefd, socket, total_len);
> >
> > (this is all conceptual pseudo-code, of course), and this very
> > naturally has none of the issues that sendfile() has with plugging etc.
> > There's never any "send header separately and do extra work to make
> > sure it is in the same packet as the start of the data".
>
> with pipe-based buffering this approach has still the very same problems
> that sendfile() has with packet boundaries, because it's not enough to
> have "large enough" buffering (like a pipe has), the pipe also has to be
> drained, and the networking layer has to know the precise boundary of
> data.
>
> the right solution to the packet boundary problem is to pass in a proper
> "does userspace expect more data right now" flag, or to let userspace
> 'flush' the socket independently - which is independent of the
> pipe-in-slice issue. This solution already exists: the MSG_MORE flag.

We can add a SPLICE_F_MORE flag for this, right now splice doesn't set
the MSG_MORE flag for the end of the pipe.

--
Jens Axboe

2006-03-31 12:26:19

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Fri, Mar 31 2006, Jens Axboe wrote:
> On Fri, Mar 31 2006, Ingo Molnar wrote:
> >
> > * Linus Torvalds <[email protected]> wrote:
> >
> > > - The pipe is the buffer #2: it's what allows you to do _other_ things
> > > with splice that are simply impossible to do with sendfile. Notably,
> > > splice allows very naturally the "readv/writev" scatter-gather
> > > behaviour of _mixing_ streams. If you're a web-server, with splice you
> > > can do
> > >
> > > write(pipefd, header, header_len);
> > > splice(file, pipefd, file_len);
> > > splice(pipefd, socket, total_len);
> > >
> > > (this is all conceptual pseudo-code, of course), and this very
> > > naturally has none of the issues that sendfile() has with plugging etc.
> > > There's never any "send header separately and do extra work to make
> > > sure it is in the same packet as the start of the data".
> >
> > with pipe-based buffering this approach has still the very same problems
> > that sendfile() has with packet boundaries, because it's not enough to
> > have "large enough" buffering (like a pipe has), the pipe also has to be
> > drained, and the networking layer has to know the precise boundary of
> > data.
> >
> > the right solution to the packet boundary problem is to pass in a proper
> > "does userspace expect more data right now" flag, or to let userspace
> > 'flush' the socket independently - which is independent of the
> > pipe-in-slice issue. This solution already exists: the MSG_MORE flag.
>
> We can add a SPLICE_F_MORE flag for this, right now splice doesn't set
> the MSG_MORE flag for the end of the pipe.

Ala

diff --git a/fs/splice.c b/fs/splice.c
index d8787c1..8a9ef67 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -343,15 +343,16 @@ static int pipe_to_sendpage(struct pipe_
loff_t pos = sd->pos;
unsigned int offset;
ssize_t ret;
+ int more;

ret = buf->ops->map(file, info, buf);
if (unlikely(ret))
return ret;

offset = pos & ~PAGE_CACHE_MASK;
+ more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

- ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
- sd->len < sd->total_len);
+ ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more);

buf->ops->unmap(info, buf);
if (ret == sd->len)
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index bdf9d57..b003e3d 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -59,5 +59,6 @@ void free_pipe_info(struct inode* inode)
* add the splice flags here.
*/
#define SPLICE_F_MOVE (0x01) /* move pages instead of copying */
+#define SPLICE_F_MORE (0x02) /* expect more data */

#endif

--
Jens Axboe

2006-03-31 12:29:59

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] splice support #2


* Linus Torvalds <[email protected]> wrote:

> The reason it goes through a pipe is two-fold:
>
> - the pipe _is_ the buffer. The reason sendfile() sucks is that sendfile
> cannot work with <n> different buffer representations. sendfile() only
> works with _one_ buffer representation, namely the "page cache of the
> file".

The main problem of sendfile() is that it exposes an incoming-fd
'offset', which pretty much hardcodes the assumption that the incoming
file is a pagecache-backed object. I believe the flush_fd solution to
sys_splice() would solve that range of problems, and thus slice could be
used for offset-less file objects too.

all the remaining problems of sendfile() just derive from this basic
pagecache assumption that is hardcoded in the ABI. Once this one is
overcome (and i believe fd_flush overcomes it), there's no reason why we
couldnt implement the complete range of sys_splice() transfers.

Ingo

2006-03-31 12:47:19

by Bernd Petrovitsch

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Thu, 2006-03-30 at 13:16 -0800, Linus Torvalds wrote:
[...]
> splice() really can handle any fd->fd move.
>
> The reason you want to have a pipe in the middle is that you have to
> handle partial moves _some_ way. And the pipe being the buffer really does
> allow that, and also handles the case of "what happens when we received
> more data than we could write".
>
> So the way to do copies is
>
> int pipefd[2];
> unsigned long copied = 0;
>
> if (pipe(pipefd) < 0)
> error
>
> for (;;) {
> int nr = splice(in, pipefd[1], MAX_INT, 0);
> if (nr <= 0)
> break;
> do {
> int ret = splice(pipefd[0], out, nr, 0);
> if (ret <= 0) {
> error: couldn't write 'nr' bytes
> break;
> }
>
> nr -= ret;
> } while (nr);
> }
> close(pipefd[0]);
> close(pipefd[1]);
>
> which may _seem_ very complicated and the extra pipe seems "useless", but
> it's (a) actually pretty efficient and (b) allows error recovery.
>
> That (b) is the really important part. I can pretty much guarantee that
> without the "useless" pipe, you simply couldn't do it.
>
> In particular, what happens when you try to connect two streaming devices,
> but the destination stops accepting data? You cannot put the received data
> "back" into the streaming source any way - so if you actually want to be
> able to handle error recovery, you _have_ to get access to the source
> buffers.
>
> Also, for signal handling, you need to have some way to keep the pipe
> around for several iterations on the sender side, while still returning to
> user space to do the signal handler.
>
> And guess what? That's exactly what you get with that "useless" pipe. For
> error handling, you can decide to throw the extra data that the recipient
> didn't want away (just close the pipe), and maybe that's going to be what
> most people do. But you could also decide to just do a "read()" on the
> pipefd, to just read the data into user space for debugging and/or error
> recovery..
>
> Similarly, for signals, the pipe _is_ that buffer that you need that is
> consistent across multiple invocations.
>
> So that pipe is not at all unnecessary, and in fact, it's critical. It may

IOW the "pipe" above is in fact just a "handle" of a kernel-internal
buffer (of one page) and the two fd's can be used to tell sys-calls to
read from or write into the buffer. But it is much faster because it
avoids real copying of the data into a user-space buffer and out of the
user-space buffer since the kernel can manipulate the underlying data
directly as a typical implementation of the above code does (i.e. use
read(2) and write(2) instead of the two splice(2) calls).
Uuuuuh, real zero-copy seems possible.

Bernd
--
Firmix Software GmbH http://www.firmix.at/
mobil: +43 664 4416156 fax: +43 1 7890849-55
Embedded Linux Development and Services

2006-03-31 12:50:23

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] splice support #2


* Jens Axboe <[email protected]> wrote:

> > > with pipe-based buffering this approach has still the very same problems
> > > that sendfile() has with packet boundaries, because it's not enough to
> > > have "large enough" buffering (like a pipe has), the pipe also has to be
> > > drained, and the networking layer has to know the precise boundary of
> > > data.
> > >
> > > the right solution to the packet boundary problem is to pass in a proper
> > > "does userspace expect more data right now" flag, or to let userspace
> > > 'flush' the socket independently - which is independent of the
> > > pipe-in-slice issue. This solution already exists: the MSG_MORE flag.
> >
> > We can add a SPLICE_F_MORE flag for this, right now splice doesn't set
> > the MSG_MORE flag for the end of the pipe.
>
> Ala

> #define SPLICE_F_MOVE (0x01) /* move pages instead of copying */
> +#define SPLICE_F_MORE (0x02) /* expect more data */

ok, nice - something like this should work. The direction of the flag is
a philosophical question i guess: i believe in Linux we prefer to
default to "buffering enabled", i.e. the default flag should be "expect
more data". So maybe it would be better to pass in PLICE_F_END, to
indicate end-of-data. [it doesnt mean 'permanent end', so all the files
still remain open: this could be something like a HTTP 1.1 pipelined
request.]

furthermore, the internal implementation should also get smarter and do
a flush-socket if it would e.g. block on a pagecache page. [we often
prefer a partial packet in such cases instead of having a half-built
packet hang around.]

btw., that 'data boundary' detail is likely lost with the pipe
intermediary solution: there is no direct connection between 'input
file' and 'output socket', so a 'flush now' event doesnt get propagated
in a natural way. (unless we extend pipes with 'data boundary' markers,
or force their flushing, which looks a whole lot of complexity for such
a simple thing.)

straight pagecache->socket splicing on the other hand preserves 'data
boundary' markers in a natural way for two reasons: 1) the input and
output objects are known to the kernel at once 2) because there is no
internal buffering to begin with.

Ingo

2006-03-31 18:18:21

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Fri, Mar 31 2006, Ingo Molnar wrote:
>
> * Jens Axboe <[email protected]> wrote:
>
> > > > with pipe-based buffering this approach has still the very same problems
> > > > that sendfile() has with packet boundaries, because it's not enough to
> > > > have "large enough" buffering (like a pipe has), the pipe also has to be
> > > > drained, and the networking layer has to know the precise boundary of
> > > > data.
> > > >
> > > > the right solution to the packet boundary problem is to pass in a proper
> > > > "does userspace expect more data right now" flag, or to let userspace
> > > > 'flush' the socket independently - which is independent of the
> > > > pipe-in-slice issue. This solution already exists: the MSG_MORE flag.
> > >
> > > We can add a SPLICE_F_MORE flag for this, right now splice doesn't set
> > > the MSG_MORE flag for the end of the pipe.
> >
> > Ala
>
> > #define SPLICE_F_MOVE (0x01) /* move pages instead of copying */
> > +#define SPLICE_F_MORE (0x02) /* expect more data */
>
> ok, nice - something like this should work. The direction of the flag is
> a philosophical question i guess: i believe in Linux we prefer to
> default to "buffering enabled", i.e. the default flag should be "expect
> more data". So maybe it would be better to pass in PLICE_F_END, to
> indicate end-of-data. [it doesnt mean 'permanent end', so all the files
> still remain open: this could be something like a HTTP 1.1 pipelined
> request.]

Hmm I do prefer a MORE as to an END, as it will always give predictable
results. The MORE is a performance hit, where as a missing END wont
necessarily be detected until you start looking at latencies.

> furthermore, the internal implementation should also get smarter and do
> a flush-socket if it would e.g. block on a pagecache page. [we often
> prefer a partial packet in such cases instead of having a half-built
> packet hang around.]

That's a really good idea! Wont be too much trouble to add, probably the
easiest is to add a 'wait' variable to the ->map() function or something
along those lines.

> btw., that 'data boundary' detail is likely lost with the pipe
> intermediary solution: there is no direct connection between 'input
> file' and 'output socket', so a 'flush now' event doesnt get propagated
> in a natural way. (unless we extend pipes with 'data boundary' markers,
> or force their flushing, which looks a whole lot of complexity for such
> a simple thing.)

Probably not.

--
Jens Axboe

2006-03-31 19:11:59

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] splice support #2



On Fri, 31 Mar 2006, Jens Axboe wrote:
> >
> > ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);
>
> I definitely see some valid reasons for adding a file offset instead of
> using ->f_pos, I'll leave that decision up to Linus though. Linus?

I think a file offset is fine, the one thing holding me back was just the
interface. One file offset per fd? Or just have the rule that the file
offset is for the "non-pipe" device?

Linus

2006-03-31 19:40:10

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] splice support #2

On Fri, Mar 31 2006, Linus Torvalds wrote:
>
>
> On Fri, 31 Mar 2006, Jens Axboe wrote:
> > >
> > > ssize_t psplice(int fdin, int fdout, size_t len, off_t ofs, unsigned flags);
> >
> > I definitely see some valid reasons for adding a file offset instead of
> > using ->f_pos, I'll leave that decision up to Linus though. Linus?
>
> I think a file offset is fine, the one thing holding me back was just the
> interface. One file offset per fd? Or just have the rule that the file
> offset is for the "non-pipe" device?

Intuitively, I'd expect the offset to be tied to the non-pipe if I were
to eg see this for the first time. So my vote would go for that.

--
Jens Axboe

2006-03-31 20:39:12

by Hua Zhong

[permalink] [raw]
Subject: RE: [PATCH] splice support #2

Hi Linus,

> The 4th reason is "tee". Again, you _could_ perhaps do "tee"
> without the pipe, but it would be a total nightmare. Now, tee
> isn't that common, but it does happen, and in particular it
> happens a lot with certain streaming content.

If I understand correctly:

splice is one fd in, one fd out
tee is one fd in, two fd out (and I'd assume the "one fd in" would always be
a pipe)

How about one fd in, N fd out? Do you then stack the tee calls using
temporary pipes?

i.e., if N=3, then we'd have:

pipe(fd_tmp_pipe);
tee(fd_in, fd_out1, fd_tmp_pipe[0];
tee(fd_tmp_pipe[1], fd_out2, fd_out3);

Basically, N-2 temporary pipes would be required.

Is this the intention?

Hua


2006-03-31 20:49:53

by Linus Torvalds

[permalink] [raw]
Subject: RE: [PATCH] splice support #2



On Fri, 31 Mar 2006, Hua Zhong wrote:
>
> If I understand correctly:
>
> splice is one fd in, one fd out

Yes, and one of the fd's have to be a pipe.

> tee is one fd in, two fd out (and I'd assume the "one fd in" would always be
> a pipe)

Actually, all three of them would have to be pipes. The tee() thing has to
push to both sources in a "synchronized" manner, so it can't just take
arbitrary file descriptors that it doesn't know the exact buffering rules
for.

(Otherwise you wouldn't need "tee()" at all - you could have a "splice()"
that just takes several output fd's).

> How about one fd in, N fd out? Do you then stack the tee calls using
> temporary pipes?

I didn't write the tee() logic, but making it 1:N instead of 1:2 is not
conceptually a problem at all. The exact system call interface might limit
it some way, of course.

Linus