LinuxLists.cc - [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

2023-06-29 16:01:48

Subject: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

Splicing data from, say, a file into a pipe currently leaves the source
pages in the pipe after splice() returns - but this means that those pages
can be subsequently modified by shared-writable mmap(), write(),
fallocate(), etc. before they're consumed.

Fix this by stealing the pages in splice() before they're added to the pipe
if no one else is using them or has them mapped and copying them otherwise.

Reported-by: Matt Whitlock <[email protected]>
Link: https://lore.kernel.org/r/[email protected]/
Signed-off-by: David Howells <[email protected]>
cc: Matthew Wilcox <[email protected]>
cc: Dave Chinner <[email protected]>
cc: Christoph Hellwig <[email protected]>
cc: Jens Axboe <[email protected]>
cc: [email protected]
---
mm/filemap.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++---
mm/internal.h | 4 +--
mm/shmem.c | 8 +++--
3 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 9e44a49bbd74..a002df515966 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2838,15 +2838,87 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
}
EXPORT_SYMBOL(generic_file_read_iter);

+static inline void copy_folio_to_folio(struct folio *src, size_t src_offset,
+ struct folio *dst, size_t dst_offset,
+ size_t size)
+{
+ void *p, *q;
+
+ while (size > 0) {
+ size_t part = min3(PAGE_SIZE - src_offset % PAGE_SIZE,
+ PAGE_SIZE - dst_offset % PAGE_SIZE,
+ size);
+
+ p = kmap_local_folio(src, src_offset);
+ q = kmap_local_folio(dst, dst_offset);
+ memcpy(q, p, part);
+ kunmap_local(p);
+ kunmap_local(q);
+ src_offset += part;
+ dst_offset += part;
+ size -= part;
+ }
+}
+
/*
- * Splice subpages from a folio into a pipe.
+ * Splice data from a folio into a pipe. The folio is stolen if no one else is
+ * using it and copied otherwise. We can't put the folio into the pipe still
+ * attached to the pagecache as that allows someone to modify it after the
+ * splice.
*/
-size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
- struct folio *folio, loff_t fpos, size_t size)
+ssize_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size)
{
+ struct address_space *mapping;
+ struct folio *copy = NULL;
struct page *page;
+ unsigned int flags = 0;
+ ssize_t ret;
size_t spliced = 0, offset = offset_in_folio(folio, fpos);

+ folio_lock(folio);
+
+ mapping = folio_mapping(folio);
+ ret = -ENODATA;
+ if (!folio->mapping)
+ goto err_unlock; /* Truncated */
+ ret = -EIO;
+ if (!folio_test_uptodate(folio))
+ goto err_unlock;
+
+ /*
+ * At least for ext2 with nobh option, we need to wait on writeback
+ * completing on this folio, since we'll remove it from the pagecache.
+ * Otherwise truncate wont wait on the folio, allowing the disk blocks
+ * to be reused by someone else before we actually wrote our data to
+ * them. fs corruption ensues.
+ */
+ folio_wait_writeback(folio);
+
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL))
+ goto need_copy;
+
+ /* If we succeed in removing the mapping, set LRU flag and add it. */
+ if (remove_mapping(mapping, folio)) {
+ folio_unlock(folio);
+ flags = PIPE_BUF_FLAG_LRU;
+ goto add_to_pipe;
+ }
+
+need_copy:
+ folio_unlock(folio);
+
+ copy = folio_alloc(GFP_KERNEL, 0);
+ if (!copy)
+ return -ENOMEM;
+
+ size = min(size, PAGE_SIZE - offset % PAGE_SIZE);
+ copy_folio_to_folio(folio, offset, copy, 0, size);
+ folio = copy;
+ offset = 0;
+
+add_to_pipe:
page = folio_page(folio, offset / PAGE_SIZE);
size = min(size, folio_size(folio) - offset);
offset %= PAGE_SIZE;
@@ -2861,6 +2933,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
.page = page,
.offset = offset,
.len = part,
+ .flags = flags,
};
folio_get(folio);
pipe->head++;
@@ -2869,7 +2942,13 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
offset = 0;
}

+ if (copy)
+ folio_put(copy);
return spliced;
+
+err_unlock:
+ folio_unlock(folio);
+ return ret;
}

/**
@@ -2947,7 +3026,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,

for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- size_t n;
+ ssize_t n;

if (folio_pos(folio) >= end_offset)
goto out;
@@ -2963,8 +3042,11 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,

n = min_t(loff_t, len, isize - *ppos);
n = splice_folio_into_pipe(pipe, folio, *ppos, n);
- if (!n)
+ if (n <= 0) {
+ if (n < 0)
+ error = n;
goto out;
+ }
len -= n;
total_spliced += n;
*ppos += n;
diff --git a/mm/internal.h b/mm/internal.h
index a7d9e980429a..ae395e0f31d5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -881,8 +881,8 @@ struct migration_target_control {
/*
* mm/filemap.c
*/
-size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
- struct folio *folio, loff_t fpos, size_t size);
+ssize_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size);

/*
* mm/vmalloc.c
diff --git a/mm/shmem.c b/mm/shmem.c
index 2f2e0e618072..969931b0f00e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2783,7 +2783,8 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
struct inode *inode = file_inode(in);
struct address_space *mapping = inode->i_mapping;
struct folio *folio = NULL;
- size_t total_spliced = 0, used, npages, n, part;
+ ssize_t n;
+ size_t total_spliced = 0, used, npages, part;
loff_t isize;
int error = 0;

@@ -2844,8 +2845,11 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
n = splice_zeropage_into_pipe(pipe, *ppos, len);
}

- if (!n)
+ if (n <= 0) {
+ if (n < 0)
+ error = n;
break;
+ }
len -= n;
total_spliced += n;
*ppos += n;

2023-07-19 10:28:44

by Miklos Szeredi

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Thu, 29 Jun 2023 at 17:56, David Howells <[email protected]> wrote:
>
> Splicing data from, say, a file into a pipe currently leaves the source
> pages in the pipe after splice() returns - but this means that those pages
> can be subsequently modified by shared-writable mmap(), write(),
> fallocate(), etc. before they're consumed.

What is this trying to fix? The above behavior is well known, so
it's not likely to be a problem.

Besides, removing spliced pages from the cache is basically guaranteed
to result in a performance regression for any application using
splice.

Thanks,
Miklos

2023-07-19 18:41:16

by Matt Whitlock

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wednesday, 19 July 2023 06:17:51 EDT, Miklos Szeredi wrote:
> On Thu, 29 Jun 2023 at 17:56, David Howells <[email protected]> wrote:
>>
>> Splicing data from, say, a file into a pipe currently leaves the source
>> pages in the pipe after splice() returns - but this means that those pages
>> can be subsequently modified by shared-writable mmap(), write(),
>> fallocate(), etc. before they're consumed.
>
> What is this trying to fix? The above behavior is well known, so
> it's not likely to be a problem.

Respectfully, it's not well-known, as it's not documented. If the splice(2)
man page had mentioned that pages can be mutated after they're already
ostensibly at rest in the output pipe buffer, then my nightly backups
wouldn't have been incurring corruption silently for many months.

2023-07-19 20:03:05

by Miklos Szeredi

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wed, 19 Jul 2023 at 19:59, Matt Whitlock <[email protected]> wrote:
>
> On Wednesday, 19 July 2023 06:17:51 EDT, Miklos Szeredi wrote:
> > On Thu, 29 Jun 2023 at 17:56, David Howells <[email protected]> wrote:
> >>
> >> Splicing data from, say, a file into a pipe currently leaves the source
> >> pages in the pipe after splice() returns - but this means that those pages
> >> can be subsequently modified by shared-writable mmap(), write(),
> >> fallocate(), etc. before they're consumed.
> >
> > What is this trying to fix? The above behavior is well known, so
> > it's not likely to be a problem.
>
> Respectfully, it's not well-known, as it's not documented. If the splice(2)
> man page had mentioned that pages can be mutated after they're already
> ostensibly at rest in the output pipe buffer, then my nightly backups
> wouldn't have been incurring corruption silently for many months.

splice(2):

Though we talk of copying, actual copies are generally avoided.
The kernel does this by implementing a pipe buffer as a set of
refer‐
ence-counted pointers to pages of kernel memory. The
kernel creates "copies" of pages in a buffer by creating new pointers
(for the
output buffer) referring to the pages, and increasing the
reference counts for the pages: only pointers are copied, not the
pages of the
buffer.

While not explicitly stating that the contents of the pages can change
after being spliced, this can easily be inferred from the above
semantics.

Thanks,
Miklos

2023-07-19 20:03:22

by Matthew Wilcox

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wed, Jul 19, 2023 at 09:35:33PM +0200, Miklos Szeredi wrote:
> On Wed, 19 Jul 2023 at 19:59, Matt Whitlock <[email protected]> wrote:
> >
> > On Wednesday, 19 July 2023 06:17:51 EDT, Miklos Szeredi wrote:
> > > On Thu, 29 Jun 2023 at 17:56, David Howells <[email protected]> wrote:
> > >>
> > >> Splicing data from, say, a file into a pipe currently leaves the source
> > >> pages in the pipe after splice() returns - but this means that those pages
> > >> can be subsequently modified by shared-writable mmap(), write(),
> > >> fallocate(), etc. before they're consumed.
> > >
> > > What is this trying to fix? The above behavior is well known, so
> > > it's not likely to be a problem.
> >
> > Respectfully, it's not well-known, as it's not documented. If the splice(2)
> > man page had mentioned that pages can be mutated after they're already
> > ostensibly at rest in the output pipe buffer, then my nightly backups
> > wouldn't have been incurring corruption silently for many months.
>
> splice(2):
>
> Though we talk of copying, actual copies are generally avoided.
> The kernel does this by implementing a pipe buffer as a set of
> refer‐
> ence-counted pointers to pages of kernel memory. The
> kernel creates "copies" of pages in a buffer by creating new pointers
> (for the
> output buffer) referring to the pages, and increasing the
> reference counts for the pages: only pointers are copied, not the
> pages of the
> buffer.
>
> While not explicitly stating that the contents of the pages can change
> after being spliced, this can easily be inferred from the above
> semantics.

So what's the API that provides the semantics of _copying_? And, frankly,
this is a "you're holding it wrong" kind of argument. It only makes
sense if you're read the implementation, which is at best level 2:

https://ozlabs.org/~rusty/index.cgi/tech/2008-03-30.html

and worst a level -5:

https://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html

2023-07-19 20:28:05

by Miklos Szeredi

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wed, 19 Jul 2023 at 21:44, Matthew Wilcox <[email protected]> wrote:
>
> On Wed, Jul 19, 2023 at 09:35:33PM +0200, Miklos Szeredi wrote:
> > On Wed, 19 Jul 2023 at 19:59, Matt Whitlock <[email protected]> wrote:
> > >
> > > On Wednesday, 19 July 2023 06:17:51 EDT, Miklos Szeredi wrote:
> > > > On Thu, 29 Jun 2023 at 17:56, David Howells <[email protected]> wrote:
> > > >>
> > > >> Splicing data from, say, a file into a pipe currently leaves the source
> > > >> pages in the pipe after splice() returns - but this means that those pages
> > > >> can be subsequently modified by shared-writable mmap(), write(),
> > > >> fallocate(), etc. before they're consumed.
> > > >
> > > > What is this trying to fix? The above behavior is well known, so
> > > > it's not likely to be a problem.
> > >
> > > Respectfully, it's not well-known, as it's not documented. If the splice(2)
> > > man page had mentioned that pages can be mutated after they're already
> > > ostensibly at rest in the output pipe buffer, then my nightly backups
> > > wouldn't have been incurring corruption silently for many months.
> >
> > splice(2):
> >
> > Though we talk of copying, actual copies are generally avoided.
> > The kernel does this by implementing a pipe buffer as a set of
> > refer‐
> > ence-counted pointers to pages of kernel memory. The
> > kernel creates "copies" of pages in a buffer by creating new pointers
> > (for the
> > output buffer) referring to the pages, and increasing the
> > reference counts for the pages: only pointers are copied, not the
> > pages of the
> > buffer.
> >
> > While not explicitly stating that the contents of the pages can change
> > after being spliced, this can easily be inferred from the above
> > semantics.
>
> So what's the API that provides the semantics of _copying_?

What's your definition of copying?

Thanks,
Miklos

2023-07-19 20:51:57

by Linus Torvalds

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wed, 19 Jul 2023 at 12:44, Matthew Wilcox <[email protected]> wrote:
>
> So what's the API that provides the semantics of _copying_?

It's called "read()" and "write()".

Seriously.

The *ONLY* reason for splice() existing is for zero-copy. If you don't
want zero-copy (aka "copy by reference"), don't use splice.

Stop arguing against it. If you don't want zero-copy, you use read()
and write(). It really is that simple.

And no, we don't start some kind of crazy "versioned zero-copy with
COW". That's a fundamental mistake. It's a mistake that has been done
- several times - and made perhaps most famous by Hurd, that made that
a big thing.

And yes, this has been documented *forever*. It may not have been
documented on the first line, because IT WAS SO OBVIOUS. The whole
reason splice() is fast is because it avoids the actual copy, and does
a copy-by-reference.

That's still a copy. But a copy-by-reference is a special thing. If
you don't know what copy-by-reference is, or don't want it, don't use
splice().

I don't know how many different ways I can say the same thing.

IF YOU DON'T WANT ZERO-COPY, DON'T USE SPLICE.

IF YOU DON'T UNDERSTAND THE DIFFERENCE BETWEEN COPY-BY-VALUE AND
COPY-BY-REFERENCE, DON'T USE SPLICE.

IF YOU DON'T UNDERSTAND THE *POINT* OF SPLICE, DON'T USE SPLICE.

It's kind of a bit like pointers in C: if you don't understand
pointers but use them anyway, you're going to have a hard time. That's
not the fault of the pointers. Pointers are very very powerful. But if
you are used to languages that only do copy-by-value, you are going to
think they are bad things.

Linus

2023-07-19 21:50:26

by Matthew Wilcox

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wed, Jul 19, 2023 at 09:56:44PM +0200, Miklos Szeredi wrote:
> On Wed, 19 Jul 2023 at 21:44, Matthew Wilcox <[email protected]> wrote:
> > So what's the API that provides the semantics of _copying_?
>
> What's your definition of copying?

Future modifications to the pagecache do not affect the data after the
syscall has returned success. Modifications to the pagecache while
the syscall is in progress may or may not affect the data received at
the destination.

2023-07-19 21:57:45

by Matt Whitlock

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wednesday, 19 July 2023 16:16:07 EDT, Linus Torvalds wrote:
> The *ONLY* reason for splice() existing is for zero-copy.

The very first sentence of splice(2) reads: "splice() moves data between
two file descriptors without copying between kernel address space and user
address space." Thus, it is not unreasonable to believe that the point of
splice is to avoid copying between user-space and kernel-space.

If you use read() and write(), then you're making two copies. If you use
splice(), then you're making one copy (or zero, but that's an optimization
that should be invisible to the user).

> And no, we don't start some kind of crazy "versioned zero-copy with
> COW". That's a fundamental mistake.

Agreed. splice() should steal the reference if it can, copy the page data
if it must. Note that, even in the slow case where the page data must be
copied, this still gives a better-than-50% speedup over read()+write()
since an entire copy (and one syscall) is elided.

> IF YOU DON'T UNDERSTAND THE *POINT* OF SPLICE, DON'T USE SPLICE.

Thanks for being so condescending. Your reputation is deserved.

2023-07-19 23:31:48

by Linus Torvalds

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wed, 19 Jul 2023 at 14:02, Matt Whitlock <[email protected]> wrote:
>
> On Wednesday, 19 July 2023 16:16:07 EDT, Linus Torvalds wrote:
> > The *ONLY* reason for splice() existing is for zero-copy.
>
> The very first sentence of splice(2) reads: "splice() moves data between
> two file descriptors without copying between kernel address space and user
> address space." Thus, it is not unreasonable to believe that the point of
> splice is to avoid copying between user-space and kernel-space.

I'm not at all opposed to clarifying the documentation.

But I *am* very much against changing existing semantics. People rely
on it. The networking layer knows about it. The whole design is all
around "copy by reference".

And changing existing semantics would not only slow things down, it
wouldn't even *fix* anything that got this wrong. They'd still be
broken on old kernels.

When documentation and reality collide, documentation loses. That's
how this works.

> If you use read() and write(), then you're making two copies. If you use
> splice(), then you're making one copy (or zero, but that's an optimization
> that should be invisible to the user).

No. It really isn't.

It is an optimization that is INHERENT IN THE INTERFACE and has been
there literally since day #1. It was *never* invisible. It was the
*point*.

You getting this use case wrong is not an excuse to change reality. It
is, at most, a good reason to clarify the documentation.

The "without copying between kernel address space and user address
space" is about the ability to not copy AT ALL, and yes, let's by all
means clarify that part.

Really. If you cannot understand this fact, and the fact that you
simply misunderstood how splice() worked, I can't really help you
about that.

I repeat: if you want a stable copy of some file data, you *have* to
copy the file data. There's no magic. There's no difference between
"copy to user space" or "copy in kernel space". So you had better just
use "read()".

If you want to avoid the copy, you use one of the interfaces that are
about references to the data. splice() is not the only such interface.
mmap() acts the same way (on input).

You really should see splice() into a pipe as a way to 'mmap the data
without allocating user space backing store".

Of course, splice() *out* of a pipe is different too. It's the same
system call, but "splice into pipe" and "splice out of pipe" are
actually very different animals.

So splicing into a pipe is kind of like a small temporary mmap without
the TLB flush or VM allocation overhead.

But splicing out of the pipe is more akin to "map this buffer into
your own buffers as long as you don't modify it", so it basically say
"you can take just a reference to this page" (complexity addition:
SPLICE_F_GIFT and buffer stealing).

And all of this is literally designed to be able to do zero-copy from
multiple sources to multiple destinations. Not "sendpage()", which
could only do file->network, but a more generic ability like having
data that is sourced from (say) a TV capture card, and is transferred
to the network or maybe to another accelerator for encoding.

That's why the "pipe" part exists. It's the buffer in between
arbitrary end points. It's the replacement for a user buffer. But it's
also literally designed to be all about copy-by-reference.

Really.

So stop arguing. You misused splice(), because you didn't understand
it, and you got burnt. You don't like that. I get it. But that doesn't
make splice() wrong. That only made your use of it buggy.

So splice() is for zero-copy. It expects that you either stabilized
the data somehow (maybe those files are never modified, or maybe you
have other synchronization) or that you simply don't care whether it's
stable, and if the file changes, maybe the data you send changed too.

If you want "one-copy", what you can do is:

- mmap() the file data (zero copy, not stable yet)

- use "write()" to write the data to the network. This will copy it
to the skbs before the write() call returns and that copy makes it
stable.

Alternatively, if you want to be more than a bit odd, you _can_ do the
zero-copy on the write side, by doing

- read the file data (one copy, now it's stable)

- vmsplice() to the kernel buffer (zero copy)

- splice() to the network (zero copy at least for the good cases)

and now you just need to make sure that you don't re-use the user
buffer until the network data has actually been sent. Which makes this
second alternative much more complicated than the first one, and I'm
absolutely not recommending it, but I'm mentioning it as a
possibility.

Honestly, the read/vmsplice/splice model is kind of crazy, but there
might be real reasons to do it that odd way, and the buffer handling
in user space is manageable (you might, for example, just decide to
"munmap()" the buffer after sending it out).

For an example of "why would you ever do that", you might have content
conversion issues between the read/vmsplice, or need to generate a
checksum on the stable data or whatever. So it's a *valid* use of
splice(), but it's certainly a bit odd.

Linus

2023-07-20 00:05:29

by Linus Torvalds

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wed, 19 Jul 2023 at 16:20, Linus Torvalds
<[email protected]> wrote:
>
> If you want "one-copy", what you can do is:
>
> - mmap() the file data (zero copy, not stable yet)
>
> - use "write()" to write the data to the network. This will copy it
> to the skbs before the write() call returns and that copy makes it
> stable.
>
> Alternatively, if you want to be more than a bit odd, you _can_ do the
> zero-copy on the write side, by doing
>
> - read the file data (one copy, now it's stable)
>
> - vmsplice() to the kernel buffer (zero copy)
>
> - splice() to the network (zero copy at least for the good cases)

Actually, I guess technically there's a third way:

- mmap the input (zero copy)

- write() to a pipe (one copy)

- splice() to the network (zero copy)

which doesn't seem to really have any sane use cases, but who knows...
It avoids the user buffer management of the vmsplice() model, and
while you cannot do anything to the data in user space *before* it is
stable (because it only becomes stable as it is copied to the pipe
buffers by the 'write()' system call), you could use "tee()" to
duplicate the now stable stream and perhaps log it or create a
checksum after-the-fact.

Another use-case would be if you want to send the *same* stable stream
to two different network connections, while still only having one
copy. You can't do that with plain splice() - because the data isn't
guaranteed to be stable, and the two network connections might see
different streams. You can't do that with the 'mmap and then
write-to-socket' approach, because the two writes not only copy twice,
they might copy different data.

And while you *can* do it with the "read+vmsplice()" approach, maybe
the "write to pipe() in order to avoid any user space buffer issues"
model is better. And "tee()" avoids the overhead of doing multiple
vmsplice() calls on the same buffer.

I dunno.

What I *am* trying to say is that "splice()" is actually kind of
designed for people to do these kinds of combinations. But very very
few people actually do it.

For example, the "tee()" system call exists, but it is crazy hard to
use, I'm not sure it has ever actually been used for anything real.

Linus

2023-07-20 00:17:36

by Linus Torvalds

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wed, 19 Jul 2023 at 16:41, Matt Whitlock <[email protected]> wrote:
>
> Then that is my request. This entire complaint/discussion/argument would
> have been avoided if splice(2) had contained a sentence like this one from
> sendfile(2):
>
> "If out_fd refers to a socket or pipe with zero-copy support, callers must
> ensure the transferred portions of the file referred to by in_fd remain
> unmodified until the reader on the other end of out_fd has consumed the
> transferred data."
>
> That is a clear warning of the perils of the implementation under the hood,
> and it could/should be copied, more or less verbatim, to splice(2).

Ack. Internally in the kernel, the two really have always been more or
less of intermingled.

In fact, I think splice()/sendfile()/tee() could - and maybe should -
actually be a single man-page to make it clear that they are all
facets of the same thing.

The issues with TCP_CORK exist for splice too, for example, for
exactly the same reasons.

And while SPLICE_F_MORE exists, it only deals with multiple splice()
calls, it doesn't deal with the "I wrote a header before I even
started using splice()" case that is the one that is mentioned for
sendfile().

Or course, technically TCP_CORK exists for plain write() use as well,
but there the portable and historical fix is simply to use writev()
and send it all in one go.

So it's hopefully only when you use sendfile() and splice() that you
end up with "oh, but I have multiple different *kinds* of sources, and
I want to cork things until I've dealt with them all".

Linus

2023-07-20 00:18:25

by Matt Whitlock

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Wednesday, 19 July 2023 19:20:32 EDT, Linus Torvalds wrote:
>> On Wednesday, 19 July 2023 16:16:07 EDT, Linus Torvalds wrote:
>>> The *ONLY* reason for splice() existing is for zero-copy.
>>
>> The very first sentence of splice(2) reads: "splice() moves data between
>> two file descriptors without copying between kernel address space and user
>> address space." Thus, it is not unreasonable to believe that the point of
>> splice is to avoid copying between user-space and kernel-space.
>
> I'm not at all opposed to clarifying the documentation.

Then that is my request. This entire complaint/discussion/argument would
have been avoided if splice(2) had contained a sentence like this one from
sendfile(2):

"If out_fd refers to a socket or pipe with zero-copy support, callers must
ensure the transferred portions of the file referred to by in_fd remain
unmodified until the reader on the other end of out_fd has consumed the
transferred data."

That is a clear warning of the perils of the implementation under the hood,
and it could/should be copied, more or less verbatim, to splice(2).

2023-07-24 09:49:53

by David Howells

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

Linus Torvalds <[email protected]> wrote:

> > So what's the API that provides the semantics of _copying_?
>
> It's called "read()" and "write()".

What about copy_file_range()? That seems to fall back to splicing if not
directly implemented by the filesystem. It looks like the manpage for that
needs updating too - or should that actually copy?

David

2023-07-24 14:35:21

by Miklos Szeredi

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

On Mon, 24 Jul 2023 at 11:44, David Howells <[email protected]> wrote:
>
> Linus Torvalds <[email protected]> wrote:
>
> > > So what's the API that provides the semantics of _copying_?
> >
> > It's called "read()" and "write()".
>
> What about copy_file_range()? That seems to fall back to splicing if not
> directly implemented by the filesystem. It looks like the manpage for that
> needs updating too - or should that actually copy?

Both source and destination of copy_file_range() are regular files and
do_splice_direct() is basically equivalent to write(dest, mmap of
source), no refd buffers remain beyond the end of the syscall. What
is it that should be updated in the manpage?

Thanks,
Miklos

2023-07-24 16:45:47

by David Howells

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] splice: Fix corruption of spliced data after splice() returns

Miklos Szeredi <[email protected]> wrote:

> Both source and destination of copy_file_range() are regular files and

Ah - the check is in generic_file_rw_checks(). Okay, nevermind. (Though it
looks like it might allow this to be used with procfiles and suchlike, but
anyone who tries that had probably better know what they're doing).

David