2009-04-30 12:11:58

by Max Kellermann

[permalink] [raw]
Subject: [splice PATCH 1/3] splice: use "long" for tee() return values

do_tee() and other internal functions related to that have a "long"
return value. Internally, some of them work with an "int ret".
Convert them to "long".

Signed-off-by: Max Kellermann <[email protected]>
---

fs/splice.c | 11 ++++++-----
1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 666953d..f07e304 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1589,12 +1589,13 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
/*
* Link contents of ipipe to opipe.
*/
-static int link_pipe(struct pipe_inode_info *ipipe,
- struct pipe_inode_info *opipe,
- size_t len, unsigned int flags)
+static long link_pipe(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
- int ret = 0, i = 0, nbuf;
+ long ret = 0;
+ int i = 0, nbuf;

/*
* Potential ABBA deadlock, work around it by ordering lock
@@ -1679,7 +1680,7 @@ static long do_tee(struct file *in, struct file *out, size_t len,
{
struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
- int ret = -EINVAL;
+ long ret = -EINVAL;

/*
* Duplicate the contents of ipipe to opipe without actually


2009-04-30 12:11:42

by Max Kellermann

[permalink] [raw]
Subject: [splice PATCH 2/3] tee: don't return 0 when another task drains/fills a pipe

Quoting the tee() manual page:

"A return value of 0 means that there was no data to transfer, and it
would not make sense to block, because there are no writers connected
to the write end of the pipe"

There is however a race condition in the tee() implementation, which
violates this definition:

- do_tee() ensures that ipipe is readable and opipe is writable by
calling link_ipipe_prep() and link_opipe_prep()
- these two functions unlock the pipe after they have waited
- during this unlocked phase, there is a short window where other
tasks may drain the input pipe or fill the output pipe
- do_tee() now calls link_pipe(), which re-locks both pipes
- link_pipe() sees that it is unable to read ("i >= ipipe->nrbufs ||
opipe->nrbufs >= PIPE_BUFFERS") and breaks from the loop
- link_pipe() returns 0

Although there may be writers connected to the input pipe, tee() now
returns 0, and the caller (spuriously) assumes this is the end of the
stream.

This patch wraps the link_[io]pipe_prep() invocation in a loop within
link_pipe(), and loops until the result is reliable.

Signed-off-by: Max Kellermann <[email protected]>
---

fs/splice.c | 47 ++++++++++++++++++++++++++++++++++-------------
1 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index f07e304..96135eb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1594,15 +1594,41 @@ static long link_pipe(struct pipe_inode_info *ipipe,
size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
- long ret = 0;
+ long ret;
int i = 0, nbuf;

- /*
- * Potential ABBA deadlock, work around it by ordering lock
- * grabbing by pipe info address. Otherwise two different processes
- * could deadlock (one doing tee from A -> B, the other from B -> A).
- */
- pipe_double_lock(ipipe, opipe);
+ while (1) {
+ /* wait for ipipe to become ready to read */
+ ret = link_ipipe_prep(ipipe, flags);
+ if (ret)
+ return ret;
+
+ /* wait for opipe to become ready to write */
+ ret = link_opipe_prep(opipe, flags);
+ if (ret)
+ return ret;
+
+ /*
+ * Potential ABBA deadlock, work around it by ordering
+ * lock grabbing by inode address. Otherwise two
+ * different processes could deadlock (one doing tee
+ * from A -> B, the other from B -> A).
+ */
+ pipe_double_lock(ipipe, opipe);
+
+ /* see if the tee() is still possible */
+ if ((ipipe->nrbufs > 0 || ipipe->writers == 0) &&
+ opipe->nrbufs < PIPE_BUFFERS)
+ /* yes, it is - keep the locks and end this
+ loop */
+ break;
+
+ /* no - someone has drained ipipe or has filled opipe
+ between link_[io]pipe_prep()'s lock and our lock.
+ Drop both locks and wait again. */
+ pipe_unlock(ipipe);
+ pipe_unlock(opipe);
+ }

do {
if (!opipe->readers) {
@@ -1691,12 +1717,7 @@ static long do_tee(struct file *in, struct file *out, size_t len,
* Keep going, unless we encounter an error. The ipipe/opipe
* ordering doesn't really matter.
*/
- ret = link_ipipe_prep(ipipe, flags);
- if (!ret) {
- ret = link_opipe_prep(opipe, flags);
- if (!ret)
- ret = link_pipe(ipipe, opipe, len, flags);
- }
+ ret = link_pipe(ipipe, opipe, len, flags);
}

return ret;

2009-04-30 12:12:21

by Max Kellermann

[permalink] [raw]
Subject: [splice PATCH 3/3] splice: added support for pipe-to-pipe splice()

This patch enables the splice() system call to copy buffers from one
pipe to another. This obvious and trivial use case for splice() was
not supported until now.

It reuses the functions link_ipipe_prep() and link_opipe_prep() from
the tee() system call implementation.

Signed-off-by: Max Kellermann <[email protected]>
---

fs/splice.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 166 insertions(+), 0 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 96135eb..f69a88f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -902,6 +902,156 @@ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,

EXPORT_SYMBOL(generic_splice_sendpage);

+static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags);
+static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags);
+
+/**
+ * Returns the nth pipe buffer after the current one.
+ *
+ * @i the buffer index, relative to the current one
+ */
+static inline struct pipe_buffer *
+pipe_buffer_at(struct pipe_inode_info *pipe, unsigned i)
+{
+ BUG_ON(i >= PIPE_BUFFERS);
+
+ return pipe->bufs + ((pipe->curbuf + i) & (PIPE_BUFFERS - 1));
+}
+
+/**
+ * Splice pages from one pipe to another.
+ *
+ * @ipipe the input pipe
+ * @opipe the output pipe
+ * @len the maximum number of bytes to move
+ * @flags splice modifier flags
+ */
+static long do_splice_pipes(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags)
+{
+ struct pipe_buffer *ibuf, *obuf;
+ long ret;
+ int do_wakeup = 0;
+
+ if (ipipe == opipe)
+ /* cannot splice a pipe to itself */
+ return -EINVAL;
+
+ while (1) {
+ /* wait for ipipe to become ready to read */
+ ret = link_ipipe_prep(ipipe, flags);
+ if (ret)
+ return ret;
+
+ /* wait for opipe to become ready to write */
+ ret = link_opipe_prep(opipe, flags);
+ if (ret)
+ return ret;
+
+ /* lock both pipes */
+ pipe_double_lock(ipipe, opipe);
+
+ /* see if the splice() is still possible */
+ if ((ipipe->nrbufs > 0 || ipipe->writers == 0) &&
+ opipe->nrbufs < PIPE_BUFFERS)
+ /* yes, it is - keep the locks and end this
+ loop */
+ break;
+
+ /* no - someone has drained ipipe or has filled opipe
+ between link_[io]pipe_prep()'s lock and our lock.
+ Drop both locks and wait again. */
+ pipe_unlock(ipipe);
+ pipe_unlock(opipe);
+ }
+
+ do {
+ if (opipe->readers == 0) {
+ /* nobody's reading from the output pipe: send
+ SIGPIPE */
+ send_sig(SIGPIPE, current, 0);
+ if (!ret)
+ ret = -EPIPE;
+ break;
+ }
+
+ /*
+ * If we have iterated all input buffers or ran out of
+ * output room, break.
+ */
+ if (ipipe->nrbufs == 0 || opipe->nrbufs >= PIPE_BUFFERS)
+ break;
+
+ /* now do the real thing: move a buffer (or a part of
+ it) from ipipe to opipe */
+
+ ibuf = pipe_buffer_at(ipipe, 0);
+ obuf = pipe_buffer_at(opipe, opipe->nrbufs);
+ *obuf = *ibuf;
+
+ /*
+ * Don't inherit the gift flag, we need to
+ * prevent multiple steals of this page.
+ */
+ obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
+ /* increase the reference counter */
+ obuf->ops->get(opipe, obuf);
+
+ if (obuf->len > len) {
+ /* partial move */
+
+ obuf->len = len;
+
+ /* remove the portion from ibuf */
+ ibuf->offset += len;
+ ibuf->len -= len;
+ } else {
+ /* full move: remove buffer from the input
+ pipe */
+
+ ibuf->ops->release(ipipe, ibuf);
+ ibuf->ops = NULL;
+
+ ipipe->curbuf = (ipipe->curbuf + 1) &
+ (PIPE_BUFFERS - 1);
+ ipipe->nrbufs--;
+
+ do_wakeup = 1;
+ }
+
+ opipe->nrbufs++;
+ ret += obuf->len;
+ len -= obuf->len;
+ } while (len > 0);
+
+ pipe_unlock(ipipe);
+ pipe_unlock(opipe);
+
+ if (do_wakeup) {
+ /* at least one buffer was removed from the
+ input pipe: wake up potential writers */
+ smp_mb();
+ if (waitqueue_active(&ipipe->wait))
+ wake_up_interruptible(&ipipe->wait);
+ kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);
+ }
+
+ /*
+ * If we put data in the output pipe, wakeup any potential
+ * readers.
+ */
+ if (ret > 0) {
+ smp_mb();
+ if (waitqueue_active(&opipe->wait))
+ wake_up_interruptible(&opipe->wait);
+ kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+ }
+
+ return ret;
+}
+
/*
* Attempt to initiate a splice from pipe to file.
*/
@@ -1138,8 +1288,24 @@ static long do_splice(struct file *in, loff_t __user *off_in,

pipe = pipe_info(in->f_path.dentry->d_inode);
if (pipe) {
+ struct pipe_inode_info *out_pipe;
+
if (off_in)
return -ESPIPE;
+
+ out_pipe = pipe_info(out->f_path.dentry->d_inode);
+ if (out_pipe) {
+ /* special case: a splice between two pipes */
+
+ if (unlikely(!(out->f_mode & FMODE_WRITE)))
+ return -EBADF;
+
+ if (off_out)
+ return -ESPIPE;
+
+ return do_splice_pipes(pipe, out_pipe, len, flags);
+ }
+
if (off_out) {
if (out->f_op->llseek == no_llseek)
return -EINVAL;

2009-04-30 13:35:38

by Vegard Nossum

[permalink] [raw]
Subject: Re: [splice PATCH 3/3] splice: added support for pipe-to-pipe splice()

2009/4/30 Max Kellermann <[email protected]>:
> This patch enables the splice() system call to copy buffers from one
> pipe to another.  This obvious and trivial use case for splice() was
> not supported until now.
>
> It reuses the functions link_ipipe_prep() and link_opipe_prep() from
> the tee() system call implementation.
>
> Signed-off-by: Max Kellermann <[email protected]>
> ---
>
>  fs/splice.c |  166 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 166 insertions(+), 0 deletions(-)
>
> diff --git a/fs/splice.c b/fs/splice.c
> index 96135eb..f69a88f 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -902,6 +902,156 @@ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
>
>  EXPORT_SYMBOL(generic_splice_sendpage);
>
> +static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags);
> +static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags);
> +
> +/**
> + * Returns the nth pipe buffer after the current one.
> + *
> + * @i the buffer index, relative to the current one
> + */
> +static inline struct pipe_buffer *
> +pipe_buffer_at(struct pipe_inode_info *pipe, unsigned i)
> +{
> +       BUG_ON(i >= PIPE_BUFFERS);
> +
> +       return pipe->bufs + ((pipe->curbuf + i) & (PIPE_BUFFERS - 1));
> +}
> +
> +/**
> + * Splice pages from one pipe to another.
> + *
> + * @ipipe the input pipe
> + * @opipe the output pipe
> + * @len the maximum number of bytes to move
> + * @flags splice modifier flags
> + */
> +static long do_splice_pipes(struct pipe_inode_info *ipipe,
> +                           struct pipe_inode_info *opipe,
> +                           size_t len, unsigned int flags)
> +{
> +       struct pipe_buffer *ibuf, *obuf;
> +       long ret;
> +       int do_wakeup = 0;
> +
> +       if (ipipe == opipe)
> +               /* cannot splice a pipe to itself */
> +               return -EINVAL;
> +

What happens if you splice two pipes, in separate threads, to each other? :-)


Vegard

2009-04-30 13:51:57

by Max Kellermann

[permalink] [raw]
Subject: Re: [splice PATCH 3/3] splice: added support for pipe-to-pipe splice()

On 2009/04/30 15:35, Vegard Nossum <[email protected]> wrote:
> What happens if you splice two pipes, in separate threads, to each
> other? :-)

As a demonstration, here's an experiment with older technology, when
pipes were still made of wood (pre-splice):

mkfifo /tmp/foo
cat /tmp/foo >/tmp/foo

You may launch the real thing now with a simple

echo Hello World >/tmp/foo

Warning: with only one pipe, you're likely to bend the pipe too far,
and it may break (SIGPIPE).

Max