2023-08-10 13:33:25

by Zheng Yejian

[permalink] [raw]
Subject: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
there are more data being read out than expected.

The root cause is that in tracing_splice_read_pipe(), an entry is found
outside locks, it may be read by multiple readers or consumed by other
reader as starting printing it.

To fix it, change to find entry after holding locks.

Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer")
Signed-off-by: Zheng Yejian <[email protected]>
---
kernel/trace/trace.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..f169d33b948f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
if (ret <= 0)
goto out_err;

- if (!iter->ent && !trace_find_next_entry_inc(iter)) {
+ trace_event_read_lock();
+ trace_access_lock(iter->cpu_file);
+
+ if (!trace_find_next_entry_inc(iter)) {
+ trace_access_unlock(iter->cpu_file);
+ trace_event_read_unlock();
ret = -EFAULT;
goto out_err;
}

- trace_event_read_lock();
- trace_access_lock(iter->cpu_file);
-
/* Fill as many pages as possible. */
for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
spd.pages[i] = alloc_page(GFP_KERNEL);
--
2.25.1



2023-08-11 12:16:08

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On Thu, 10 Aug 2023 20:39:05 +0800
Zheng Yejian <[email protected]> wrote:

> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
> there are more data being read out than expected.
>
> The root cause is that in tracing_splice_read_pipe(), an entry is found
> outside locks, it may be read by multiple readers or consumed by other
> reader as starting printing it.
>
> To fix it, change to find entry after holding locks.
>
> Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer")
> Signed-off-by: Zheng Yejian <[email protected]>
> ---
> kernel/trace/trace.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index b8870078ef58..f169d33b948f 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
> if (ret <= 0)
> goto out_err;
>
> - if (!iter->ent && !trace_find_next_entry_inc(iter)) {
> + trace_event_read_lock();
> + trace_access_lock(iter->cpu_file);
> +
> + if (!trace_find_next_entry_inc(iter)) {

It seems you skips '!iter->ent' check. Is there any reason for this change?

Thank you,

> + trace_access_unlock(iter->cpu_file);
> + trace_event_read_unlock();
> ret = -EFAULT;
> goto out_err;
> }
>
> - trace_event_read_lock();
> - trace_access_lock(iter->cpu_file);
> -
> /* Fill as many pages as possible. */
> for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
> spd.pages[i] = alloc_page(GFP_KERNEL);
> --
> 2.25.1
>


--
Masami Hiramatsu (Google) <[email protected]>

2023-08-11 13:30:49

by Zheng Yejian

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On 2023/8/11 19:42, Masami Hiramatsu (Google) wrote:
> On Thu, 10 Aug 2023 20:39:05 +0800
> Zheng Yejian <[email protected]> wrote:
>
>> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
>> there are more data being read out than expected.
>>
>> The root cause is that in tracing_splice_read_pipe(), an entry is found
>> outside locks, it may be read by multiple readers or consumed by other
>> reader as starting printing it.
>>
>> To fix it, change to find entry after holding locks.
>>
>> Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer")
>> Signed-off-by: Zheng Yejian <[email protected]>
>> ---
>> kernel/trace/trace.c | 10 ++++++----
>> 1 file changed, 6 insertions(+), 4 deletions(-)
>>
>> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
>> index b8870078ef58..f169d33b948f 100644
>> --- a/kernel/trace/trace.c
>> +++ b/kernel/trace/trace.c
>> @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
>> if (ret <= 0)
>> goto out_err;
>>
>> - if (!iter->ent && !trace_find_next_entry_inc(iter)) {
>> + trace_event_read_lock();
>> + trace_access_lock(iter->cpu_file);
>> +
>> + if (!trace_find_next_entry_inc(iter)) {
>
> It seems you skips '!iter->ent' check. Is there any reason for this change?

IIUC, 'iter->ent' may be the entry that was found but not consumed
in last call tracing_splice_read_pipe(), and in this call, 'iter->ent'
may have being consumed, so we may should find a new 'iter->ent' before
printing it in tracing_fill_pipe_page(), see following reduced codes:

tracing_splice_read_pipe() {
if (!iter->ent && !trace_find_next_entry_inc(iter)) { // 1. find
entry here
... ...
}
tracing_fill_pipe_page() {
for (;;) {
... ...
ret = print_trace_line(iter); // 2. print entry
... ...
if (!trace_find_next_entry_inc()) { // 3. find next entry
... ...
break;
}
}

--

Thanks,
Zheng Yejian

>
> Thank you,
>
>> + trace_access_unlock(iter->cpu_file);
>> + trace_event_read_unlock();
>> ret = -EFAULT;
>> goto out_err;
>> }
>>
>> - trace_event_read_lock();
>> - trace_access_lock(iter->cpu_file);
>> -
>> /* Fill as many pages as possible. */
>> for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
>> spd.pages[i] = alloc_page(GFP_KERNEL);
>> --
>> 2.25.1
>>
>
>


2023-08-11 20:17:49

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On Thu, 10 Aug 2023 20:39:05 +0800
Zheng Yejian <[email protected]> wrote:

> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
> there are more data being read out than expected.

Honestly the real fix is to prevent that use case. We should probably have
access to trace_pipe lock all the per_cpu trace_pipes too.

-- Steve

2023-08-11 21:01:44

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On Fri, 11 Aug 2023 20:37:07 +0800
Zheng Yejian <[email protected]> wrote:

> On 2023/8/11 19:42, Masami Hiramatsu (Google) wrote:
> > On Thu, 10 Aug 2023 20:39:05 +0800
> > Zheng Yejian <[email protected]> wrote:
> >
> >> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
> >> there are more data being read out than expected.
> >>
> >> The root cause is that in tracing_splice_read_pipe(), an entry is found
> >> outside locks, it may be read by multiple readers or consumed by other
> >> reader as starting printing it.
> >>
> >> To fix it, change to find entry after holding locks.
> >>
> >> Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer")
> >> Signed-off-by: Zheng Yejian <[email protected]>
> >> ---
> >> kernel/trace/trace.c | 10 ++++++----
> >> 1 file changed, 6 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> >> index b8870078ef58..f169d33b948f 100644
> >> --- a/kernel/trace/trace.c
> >> +++ b/kernel/trace/trace.c
> >> @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
> >> if (ret <= 0)
> >> goto out_err;
> >>
> >> - if (!iter->ent && !trace_find_next_entry_inc(iter)) {
> >> + trace_event_read_lock();
> >> + trace_access_lock(iter->cpu_file);
> >> +
> >> + if (!trace_find_next_entry_inc(iter)) {
> >
> > It seems you skips '!iter->ent' check. Is there any reason for this change?
>
> IIUC, 'iter->ent' may be the entry that was found but not consumed
> in last call tracing_splice_read_pipe(), and in this call, 'iter->ent'
> may have being consumed, so we may should find a new 'iter->ent' before
> printing it in tracing_fill_pipe_page(), see following reduced codes:

And if it wasn't consumed? We just lost it?

>
> tracing_splice_read_pipe() {
> if (!iter->ent && !trace_find_next_entry_inc(iter)) { // 1. find
> entry here
> ... ...
> }
> tracing_fill_pipe_page() {
> for (;;) {
> ... ...
> ret = print_trace_line(iter); // 2. print entry
> ... ...

You missed:

count = trace_seq_used(&iter->seq) - save_len;
if (rem < count) {
rem = 0;
iter->seq.seq.len = save_len;

Where the above just threw away what was printed in the above
"print_trace_line()", and it never went to console.

break;
}

-- Steve


> if (!trace_find_next_entry_inc()) { // 3. find next entry
> ... ...
> break;
> }
> }
>
> --
>
> Thanks,
> Zheng Yejian
>
> >
> > Thank you,
> >
> >> + trace_access_unlock(iter->cpu_file);
> >> + trace_event_read_unlock();
> >> ret = -EFAULT;
> >> goto out_err;
> >> }
> >>
> >> - trace_event_read_lock();
> >> - trace_access_lock(iter->cpu_file);
> >> -
> >> /* Fill as many pages as possible. */
> >> for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
> >> spd.pages[i] = alloc_page(GFP_KERNEL);
> >> --
> >> 2.25.1
> >>
> >
> >


2023-08-12 02:49:28

by Zheng Yejian

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On 2023/8/12 03:24, Steven Rostedt wrote:
> On Fri, 11 Aug 2023 20:37:07 +0800
> Zheng Yejian <[email protected]> wrote:
>
>> On 2023/8/11 19:42, Masami Hiramatsu (Google) wrote:
>>> On Thu, 10 Aug 2023 20:39:05 +0800
>>> Zheng Yejian <[email protected]> wrote:
>>>
>>>> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
>>>> there are more data being read out than expected.
>>>>
>>>> The root cause is that in tracing_splice_read_pipe(), an entry is found
>>>> outside locks, it may be read by multiple readers or consumed by other
>>>> reader as starting printing it.
>>>>
>>>> To fix it, change to find entry after holding locks.
>>>>
>>>> Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer")
>>>> Signed-off-by: Zheng Yejian <[email protected]>
>>>> ---
>>>> kernel/trace/trace.c | 10 ++++++----
>>>> 1 file changed, 6 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
>>>> index b8870078ef58..f169d33b948f 100644
>>>> --- a/kernel/trace/trace.c
>>>> +++ b/kernel/trace/trace.c
>>>> @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
>>>> if (ret <= 0)
>>>> goto out_err;
>>>>
>>>> - if (!iter->ent && !trace_find_next_entry_inc(iter)) {
>>>> + trace_event_read_lock();
>>>> + trace_access_lock(iter->cpu_file);
>>>> +
>>>> + if (!trace_find_next_entry_inc(iter)) {
>>>
>>> It seems you skips '!iter->ent' check. Is there any reason for this change?
>>
>> IIUC, 'iter->ent' may be the entry that was found but not consumed
>> in last call tracing_splice_read_pipe(), and in this call, 'iter->ent'
>> may have being consumed, so we may should find a new 'iter->ent' before
>> printing it in tracing_fill_pipe_page(), see following reduced codes:
>
> And if it wasn't consumed? We just lost it?

If 'iter->ent' was not consumed, trace_find_next_entry_inc() will find
it again, will it?

-- Zheng Yejian

>
>>
>> tracing_splice_read_pipe() {
>> if (!iter->ent && !trace_find_next_entry_inc(iter)) { // 1. find
>> entry here
>> ... ...
>> }
>> tracing_fill_pipe_page() {
>> for (;;) {
>> ... ...
>> ret = print_trace_line(iter); // 2. print entry
>> ... ...
>
> You missed:
>
> count = trace_seq_used(&iter->seq) - save_len;
> if (rem < count) {
> rem = 0;
> iter->seq.seq.len = save_len;
>
> Where the above just threw away what was printed in the above
> "print_trace_line()", and it never went to console.
>
> break;
> }
>

Thanks for pointing this out!

-- Zheng Yejian

> -- Steve
>
>
>> if (!trace_find_next_entry_inc()) { // 3. find next entry
>> ... ...
>> break;
>> }
>> }
>>
>> --
>>
>> Thanks,
>> Zheng Yejian
>>
>>>
>>> Thank you,
>>>
>>>> + trace_access_unlock(iter->cpu_file);
>>>> + trace_event_read_unlock();
>>>> ret = -EFAULT;
>>>> goto out_err;
>>>> }
>>>>
>>>> - trace_event_read_lock();
>>>> - trace_access_lock(iter->cpu_file);
>>>> -
>>>> /* Fill as many pages as possible. */
>>>> for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
>>>> spd.pages[i] = alloc_page(GFP_KERNEL);
>>>> --
>>>> 2.25.1
>>>>
>>>
>>>
>
>


2023-08-12 05:39:25

by Zheng Yejian

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On 2023/8/12 03:25, Steven Rostedt wrote:
> On Thu, 10 Aug 2023 20:39:05 +0800
> Zheng Yejian <[email protected]> wrote:
>
>> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
>> there are more data being read out than expected.

Sorry, I didn't make clear here. It not just read more but also lost
some data. My case is that, for example:
1) Inject 3 events into ring_buffer: event1, event2, event3;
2) Concurrently splice_read through trace_pipes;
3) Then actually read out: event1, event3, event3. No event2, but 2
event3.

>
> Honestly the real fix is to prevent that use case. We should probably have
> access to trace_pipe lock all the per_cpu trace_pipes too.

Yes, we could do that, but would it seem not that effective?
because per_cpu trace_pipe only read its own ring_buffer and not race
with ring_buffers in other cpus.

>
> -- Steve
>


2023-08-12 08:18:02

by Zheng Yejian

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On 2023/8/12 03:25, Steven Rostedt wrote:
> On Thu, 10 Aug 2023 20:39:05 +0800
> Zheng Yejian <[email protected]> wrote:
>
>> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
>> there are more data being read out than expected.
>
> Honestly the real fix is to prevent that use case. We should probably have
> access to trace_pipe lock all the per_cpu trace_pipes too.
>
> -- Steve
>

Hi~

Reproduction testcase is show as below, it can always reproduce the
issue in v5.10, and after this patch, the testcase passed.

In v5.10, when run `cat trace_pipe > /tmp/myfile &`, it call
sendfile() to transmit data from trace_pipe into /tmp/myfile. And in
kernel, .splice_read() of trace_pipe is called then the issue is
reproduced.

However in the newest v6.5, this reproduction case didn't run into the
.splice_read() of trace_pipe, because after commit 97ef77c52b78 ("fs:
check FMODE_LSEEK to control internal pipe splicing"), non-seekable
trace_pipe cannot be sendfile-ed.

``` repro.sh
#!/bin/bash


do_test()
{
local trace_dir=/sys/kernel/tracing
local trace=${trace_dir}/trace
local old_trace_lines
local new_trace_lines
local tempfiles
local testlog="trace pipe concurrency issue"
local pipe_pids
local i
local write_cnt=1000
local read_cnt=0
local nr_cpu=`nproc`

# 1. At first, clear all ring buffer
echo > ${trace}

# 2. Count how many lines in trace file now
old_trace_lines=`cat ${trace} | wc -l`

# 3. Close water mark so that reader can read as event comes
echo 0 > ${trace_dir}/buffer_percent

# 4. Read percpu trace_pipes into local file on background.
# Splice read must be used under command 'cat' so that the racy
# issue can be reproduced !!!
i=0
while [ ${i} -lt ${nr_cpu} ]; do
tempfiles[${i}]=/tmp/percpu_trace_pipe_${i}
cat ${trace_dir}/per_cpu/cpu${i}/trace_pipe >
${tempfiles[${i}]} &
pipe_pids[${i}]=$!
let i=i+1
done

# 5. Read main trace_pipe into local file on background.
# The same, splice read must be used to reproduce the issue !!!
tempfiles[${i}]=/tmp/main_trace_pipe
cat ${trace_dir}/trace_pipe > ${tempfiles[${i}]} &
pipe_pids[${i}]=$!

echo "Take a break, let readers run."
sleep 3

# 6. Write events into ring buffer through trace_marker, so that
# hungry readers start racing these events.
i=0
while [ ${i} -lt ${write_cnt} ]; do
echo "${testlog} <${i}>" > ${trace_dir}/trace_marker
let i=i+1
done

# 7. Wait until all events being consumed
new_trace_lines=`cat ${trace} | wc -l`
while [ "${new_trace_lines}" != "${old_trace_lines}" ]; do
new_trace_lines=`cat ${trace} | wc -l`
sleep 1
done
echo "All written events have been consumed."

# 8. Kill all readers and count the events readed
i=0
while [ ${i} -lt ${#pipe_pids[*]} ]; do
local num

kill -9 ${pipe_pids[${i}]}
wait ${pipe_pids[${i}]}
num=`cat ${tempfiles[${i}]} | grep "${testlog}" | wc -l`
let read_cnt=read_cnt+num
let i=i+1
done

# 9. Expect to read events as much as write
if [ "${read_cnt}" != "${write_cnt}" ]; then
echo "Test fail: write ${write_cnt} but read
${read_cnt} !!!"
return 1
fi

# 10. Clean temp files if test success
i=0
while [ ${i} -lt ${#tempfiles[*]} ]; do
rm ${tempfiles[${i}]}
let i=i+1
done
return 0
}

do_test
```

-- Zheng Yejian

2023-08-12 21:33:21

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On Sat, 12 Aug 2023 10:22:43 +0800
Zheng Yejian <[email protected]> wrote:

> > And if it wasn't consumed? We just lost it?
>
> If 'iter->ent' was not consumed, trace_find_next_entry_inc() will find
> it again, will it?

No it will not, because it did the "inc". That means the next time it
is called, it will find the next item to read, thinking it already
"consumed" the last one.

-- Steve

2023-08-12 21:33:23

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On Sat, 12 Aug 2023 09:45:52 +0800
Zheng Yejian <[email protected]> wrote:

> On 2023/8/12 03:25, Steven Rostedt wrote:
> > On Thu, 10 Aug 2023 20:39:05 +0800
> > Zheng Yejian <[email protected]> wrote:
> >
> >> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
> >> there are more data being read out than expected.
>
> Sorry, I didn't make clear here. It not just read more but also lost
> some data. My case is that, for example:
> 1) Inject 3 events into ring_buffer: event1, event2, event3;
> 2) Concurrently splice_read through trace_pipes;
> 3) Then actually read out: event1, event3, event3. No event2, but 2
> event3.
>
> >
> > Honestly the real fix is to prevent that use case. We should probably have
> > access to trace_pipe lock all the per_cpu trace_pipes too.
>
> Yes, we could do that, but would it seem not that effective?
> because per_cpu trace_pipe only read its own ring_buffer and not race
> with ring_buffers in other cpus.

I think Steve said that only one of below is usable.

- Read trace_pipe

or

- Read per_cpu/cpu*/trace_pipe concurrently

And I think this makes sence, especially if you use splice (this *moves*
the page from the ring_buffer to other pipe).

Thank you,


>
> >
> > -- Steve
> >
>


--
Masami Hiramatsu (Google) <[email protected]>

2023-08-13 02:17:48

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On Sat, 12 Aug 2023 15:38:12 +0800
Zheng Yejian <[email protected]> wrote:

> Reproduction testcase is show as below, it can always reproduce the
> issue in v5.10, and after this patch, the testcase passed.
>
> In v5.10, when run `cat trace_pipe > /tmp/myfile &`, it call
> sendfile() to transmit data from trace_pipe into /tmp/myfile. And in
> kernel, .splice_read() of trace_pipe is called then the issue is
> reproduced.
>
> However in the newest v6.5, this reproduction case didn't run into the
> .splice_read() of trace_pipe, because after commit 97ef77c52b78 ("fs:
> check FMODE_LSEEK to control internal pipe splicing"), non-seekable
> trace_pipe cannot be sendfile-ed.

So the test case cannot be run because the "sendfile" on the
trace_pipe now fails?

I'm not sure if this is considered a user space regression or not, but
I figured I'd let the interested parties know.

I don't know what tools out there records trace_pipe via sendfile, but
there might be some.

-- Steve

2023-08-13 17:27:04

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On Sat, 12 Aug 2023 at 18:13, Steven Rostedt <[email protected]> wrote:
>
> So the test case cannot be run because the "sendfile" on the
> trace_pipe now fails?

So sendfile() has always required a seekable source, because it -
intentionally - doesn't do the "splice to pipe and then splice from
pipe to destination".

It just does a "splice from source, splice result to destination".

That sounds like "obviously the right thing to do", but it means that
there is now no buffer with any extended lifetime between the two
operations. And that's a big deal.

Without that buffer (ie pipe) in the middle, if the splice to
destination fails - or is partial - and the system call is interrupted
by a signal, then the source splice data is now gone, gone, gone.

So it only works if the source can then re-create the data - ie if the
source is seekable.

In that case, if the destination cannot accept all the data, the
sendfile can just go back and read again from the last successfully
written position.

And if you are a data stream that can't seek, then that "from last
successfully written position" doesn't work, and any partial writes
will just have dropped the data.

This seekability test *used* to be to test that the source was either
a regular file or a block device.

Now it literally checks "can I seek".

It looks like the trace_pipe fiel is *claiming* to be a regular file -
so sendfile() used to be ok with it - but cannot actually seek - so
sendfile would silently drop data.

Now sendfile says "I'm sorry, Dave, I'm afraid I can't do that" rather
than silently causing data loss.

Now, this is not a user-visible regression in this case, because "cat"
will always fall back on just regular read/write when sendfile fails.

So all that changed for 'trace_pipe' is that a buggy splice now no
longer triggers the bug that it used to (which the patch in question
was also trying to fix).

End result: things in many ways work better this way.

So it really looks like it never really worked before either. There
was *both* the dropped data bug because "trace_pipe" lied about being
a regular file, *and* apparently this trace_pipe race bug that Zheng
Yejian tried to fix.

Of course, some destinations always accept a full write, so maybe we
could relax the "source must be seekable" to be "source must be
seekable, _OR_ destination must be something that never returns
partial writes".

So sendfile to a POSIX-compliant regular file could always work
(ignoring filesystem full situations, and at that point I think we can
say "yeah, we're done, no recovering from that in sendfile()").

So if somebody really *really* want sendfile to work for this case, then you

(a) do need to fix the race in tracing_splice_read_pipe (which you
should do anyway, since you can obviously always use splice() itself,
not sendfile()).

AND

(b) figure out when 'splice_write()' will always succeed fully and
convince people that we can do that "extended version" of sendfile().

Hmm?

Linus

2023-08-15 00:41:29

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On Sun, 13 Aug 2023 09:41:33 -0700
Linus Torvalds <[email protected]> wrote:

> So if somebody really *really* want sendfile to work for this case, then you
>
> (a) do need to fix the race in tracing_splice_read_pipe (which you
> should do anyway, since you can obviously always use splice() itself,
> not sendfile()).
>
> AND
>
> (b) figure out when 'splice_write()' will always succeed fully and
> convince people that we can do that "extended version" of sendfile().
>

No big deal. I really don't care about splicing trace_pipe anyway. That was
added by others, but the trace_pipe_raw is what really should be used.

Zheng's race needs to be fixed regardless, but I just wanted to make sure
this wasn't some kind of hidden regression, as there were patches to
trace_pipe to make sendfile get fixed in the past.

a29054d9478d0 ("tracing: Fix crash from reading trace_pipe with sendfile")

But that too was triggered by cat. If cat no longer uses sendfile for
trace_pipe, I'm fine with it.

-- Steve

2023-08-20 08:06:06

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] tracing: Fix race when concurrently splice_read trace_pipe

On Sat, 12 Aug 2023 10:22:43 +0800
Zheng Yejian <[email protected]> wrote:

>
> >>>> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> >>>> index b8870078ef58..f169d33b948f 100644
> >>>> --- a/kernel/trace/trace.c
> >>>> +++ b/kernel/trace/trace.c
> >>>> @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
> >>>> if (ret <= 0)
> >>>> goto out_err;
> >>>>
> >>>> - if (!iter->ent && !trace_find_next_entry_inc(iter)) {
> >>>> + trace_event_read_lock();
> >>>> + trace_access_lock(iter->cpu_file);
> >>>> +
> >>>> + if (!trace_find_next_entry_inc(iter)) {
> >>>
> >>> It seems you skips '!iter->ent' check. Is there any reason for this change?
> >>
> >> IIUC, 'iter->ent' may be the entry that was found but not consumed
> >> in last call tracing_splice_read_pipe(), and in this call, 'iter->ent'
> >> may have being consumed, so we may should find a new 'iter->ent' before
> >> printing it in tracing_fill_pipe_page(), see following reduced codes:
> >
> > And if it wasn't consumed? We just lost it?
>
> If 'iter->ent' was not consumed, trace_find_next_entry_inc() will find
> it again, will it?
>
> -- Zheng Yejian
>
> >
> >>
> >> tracing_splice_read_pipe() {
> >> if (!iter->ent && !trace_find_next_entry_inc(iter)) { // 1. find
> >> entry here
> >> ... ...
> >> }
> >> tracing_fill_pipe_page() {
> >> for (;;) {
> >> ... ...
> >> ret = print_trace_line(iter); // 2. print entry
> >> ... ...
> >
> > You missed:
> >
> > count = trace_seq_used(&iter->seq) - save_len;
> > if (rem < count) {
> > rem = 0;
> > iter->seq.seq.len = save_len;
> >
> > Where the above just threw away what was printed in the above
> > "print_trace_line()", and it never went to console.
> >
> > break;
> > }
> >
>
> Thanks for pointing this out!

Just to get this moving again, I believe we should add a ref count to
trace_pipe and the per_cpu trace_pipes, where if they are opened, nothing else can read it.

Opening trace_pipe locks all per_cpu ref counts, if any of them are open,
then the trace_pipe open will fail (and releases any ref counts it had
taken).

Opening a per_cpu trace_pipe will up the ref count for just that CPU
buffer. This will allow multiple tasks to read different per_cpu trace_pipe
files, but will prevent the main trace_pipe file from being opened.

Does that work for this?

-- Steve

2023-08-20 17:14:43

by Zheng Yejian

[permalink] [raw]
Subject: [RFC PATCH] tracing: Introduce pipe_cpumask to avoid race on trace_pipes

There is race issue when concurrently splice_read main trace_pipe and
per_cpu trace_pipes which will result in data read out being different
from what actually writen.

As suggested by Steven:
> I believe we should add a ref count to trace_pipe and the per_cpu
> trace_pipes, where if they are opened, nothing else can read it.
>
> Opening trace_pipe locks all per_cpu ref counts, if any of them are
> open, then the trace_pipe open will fail (and releases any ref counts
> it had taken).
>
> Opening a per_cpu trace_pipe will up the ref count for just that
> CPU buffer. This will allow multiple tasks to read different per_cpu
> trace_pipe files, but will prevent the main trace_pipe file from
> being opened.

But because we only need to know whether per_cpu trace_pipe is open or
not, using a cpumask instead of using ref count may be easier.

After this patch, users will find that:
- Main trace_pipe can be opened by only one user, and if it is
opened, all per_cpu trace_pipes cannot be opened;
- Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
trace_pipe can only be opened by one user. And if one of them is
opened, main trace_pipe cannot be opened.

Suggested-by: Steven Rostedt (Google) <[email protected]>
Signed-off-by: Zheng Yejian <[email protected]>
---

> Does that work for this?

Hi, Steven, how about using a cpumask instead of ref count?
This patch will also prevent main trace_pipe or anyone per_cpu trace_pipe
from being opened by multiple people at the same time.

kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++++++++++++------
kernel/trace/trace.h | 2 ++
2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..73f6f4d10d43 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6705,24 +6705,54 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,

#endif

+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpumask_empty(tr->pipe_cpumask)) {
+ cpumask_setall(tr->pipe_cpumask);
+ return 0;
+ }
+ } else {
+ if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ return 0;
+ }
+ }
+ return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ cpumask_clear(tr->pipe_cpumask);
+ } else {
+ WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ }
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
int ret;
+ int cpu = tracing_get_cpu(inode);

ret = tracing_check_open_get_tr(tr);
if (ret)
return ret;

mutex_lock(&trace_types_lock);
+ ret = open_pipe_on_cpu(tr, cpu);
+ if (ret)
+ goto fail_pipe_on_cpu;

/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
- __trace_array_put(tr);
- goto out;
+ goto fail_alloc_iter;
}

trace_seq_init(&iter->seq);
@@ -6745,7 +6775,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)

iter->tr = tr;
iter->array_buffer = &tr->array_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
+ iter->cpu_file = cpu;
mutex_init(&iter->mutex);
filp->private_data = iter;

@@ -6755,12 +6785,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);

tr->trace_ref++;
-out:
+
mutex_unlock(&trace_types_lock);
return ret;

fail:
kfree(iter);
+fail_alloc_iter:
+ close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
@@ -6777,7 +6810,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)

if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
-
+ close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);

free_cpumask_var(iter->started);
@@ -9441,6 +9474,9 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;

+ if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;

cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9482,6 +9518,7 @@ static struct trace_array *trace_array_create(const char *name)
out_free_tr:
ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -9584,6 +9621,7 @@ static int __remove_instance(struct trace_array *tr)
}
kfree(tr->topts);

+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -10381,12 +10419,14 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;

+ if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ goto out_free_savedcmd;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- goto out_free_savedcmd;
+ goto out_free_pipe_cpumask;
}
-
if (global_trace.buffer_disabled)
tracing_off();

@@ -10439,6 +10479,8 @@ __init static int tracer_alloc_buffers(void)

return 0;

+out_free_pipe_cpumask:
+ free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e1edc2197fc8..53ac0f7780c2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,8 @@ struct trace_array {
struct list_head events;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ /* one per_cpu trace_pipe can be opened by only one user */
+ cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
--
2.25.1