A data race exists between perf_event_set_output() and perf_mmap_close().
The scenario is as follows:
CPU1 CPU2
perf_mmap_close(event2)
if (atomic_dec_and_test(&event2->rb->mmap_count) // mmap_count 1 -> 0
detach_rest = true;
ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
perf_event_set_output(event1, event2)
if (!detach_rest)
goto out_put;
list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
ring_buffer_attach(event, NULL)
// because event1 has not been added to event2->rb->event_list,
// event1->rb is not set to NULL in these loops
ring_buffer_attach(event1, event2->rb)
list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
The above data race leaves event1->rb non-NULL while event1->rb->mmap_count is 0.
If perf_mmap() is then invoked on event1's fd, the kernel spins forever in the following perf_mmap() loop:
again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
<SNIP>
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
* Raced against perf_mmap_close() through
* perf_event_set_output(). Try again, hope for better
* luck.
*/
mutex_unlock(&event->mmap_mutex);
goto again;
}
<SNIP>
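For reference, a rough userspace sketch of the racing call sequence (illustrative
only: the perf_event_open() setup, error handling, and the repeated attempts needed
to actually hit the window are omitted; struct race_ctx and the helper names are
made up for this sketch):

  #include <linux/perf_event.h>
  #include <pthread.h>
  #include <stddef.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>

  struct race_ctx {
	int event1_fd, event2_fd;      /* fds from perf_event_open() */
	void *event2_map;              /* mapping of event2's ring buffer */
	size_t map_len;
  };

  /* models CPU2: last munmap() -> perf_mmap_close(event2), mmap_count 1 -> 0 */
  static void *unmap_thread(void *arg)
  {
	struct race_ctx *ctx = arg;

	munmap(ctx->event2_map, ctx->map_len);
	return NULL;
  }

  /* models CPU1: redirect event1's output into event2's ring buffer */
  static void *set_output_thread(void *arg)
  {
	struct race_ctx *ctx = arg;

	ioctl(ctx->event1_fd, PERF_EVENT_IOC_SET_OUTPUT, ctx->event2_fd);

	/* if the race is lost, this later mmap() spins in perf_mmap() */
	mmap(NULL, ctx->map_len, PROT_READ | PROT_WRITE, MAP_SHARED,
	     ctx->event1_fd, 0);
	return NULL;
  }

  static void race_once(struct race_ctx *ctx)
  {
	pthread_t t1, t2;

	pthread_create(&t1, NULL, unmap_thread, ctx);
	pthread_create(&t2, NULL, set_output_thread, ctx);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
  }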
This problem has been triggered by syzkaller and causes a soft lockup. The call stack is as follows:
[ 3666.984385][ C2] watchdog: BUG: soft lockup - CPU#2 stuck for 23s! [syz-executor.2:32404]
[ 3666.986137][ C2] Modules linked in:
[ 3666.989581][ C2] CPU: 2 PID: 32404 Comm: syz-executor.2 Not tainted 5.10.0+ #4
[ 3666.990697][ C2] Hardware name: linux,dummy-virt (DT)
[ 3666.992270][ C2] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
[ 3666.993787][ C2] pc : __kasan_check_write+0x0/0x40
[ 3666.994841][ C2] lr : perf_mmap+0x3c8/0xf80
[ 3666.995661][ C2] sp : ffff00001011f8f0
[ 3666.996598][ C2] x29: ffff00001011f8f0 x28: ffff0000cf644868
[ 3666.998488][ C2] x27: ffff000012cad2c0 x26: 0000000000000000
[ 3666.999888][ C2] x25: 0000000000000001 x24: ffff000012cad298
[ 3667.003511][ C2] x23: 0000000000000000 x22: ffff000012cad000
[ 3667.005504][ C2] x21: ffff0000cf644818 x20: ffff0000cf6d2400
[ 3667.006891][ C2] x19: ffff0000cf6d24c0 x18: 0000000000000000
[ 3667.008295][ C2] x17: 0000000000000000 x16: 0000000000000000
[ 3667.009528][ C2] x15: 0000000000000000 x14: 0000000000000000
[ 3667.010658][ C2] x13: 0000000000000000 x12: ffff800002023f17
[ 3667.012169][ C2] x11: 1fffe00002023f16 x10: ffff800002023f16
[ 3667.013780][ C2] x9 : dfffa00000000000 x8 : ffff00001011f8b7
[ 3667.015265][ C2] x7 : 0000000000000001 x6 : ffff800002023f16
[ 3667.016683][ C2] x5 : ffff0000c0f36400 x4 : 0000000000000000
[ 3667.018078][ C2] x3 : ffffa00010000000 x2 : ffffa000119a0000
[ 3667.019343][ C2] x1 : 0000000000000004 x0 : ffff0000cf6d24c0
[ 3667.021276][ C2] Call trace:
[ 3667.022598][ C2] __kasan_check_write+0x0/0x40
[ 3667.023666][ C2] __mmap_region+0x7a4/0xc90
[ 3667.024679][ C2] __do_mmap_mm+0x600/0xa20
[ 3667.025700][ C2] do_mmap+0x114/0x384
[ 3667.026583][ C2] vm_mmap_pgoff+0x138/0x230
[ 3667.027532][ C2] ksys_mmap_pgoff+0x1d8/0x570
[ 3667.028537][ C2] __arm64_sys_mmap+0xa4/0xd0
[ 3667.029597][ C2] el0_svc_common.constprop.0+0xf4/0x414
[ 3667.030682][ C2] do_el0_svc+0x50/0x11c
[ 3667.031545][ C2] el0_svc+0x20/0x30
[ 3667.032368][ C2] el0_sync_handler+0xe4/0x1e0
[ 3667.033305][ C2] el0_sync+0x148/0x180
perf_mmap_close() and ring_buffer_attach() involve complicated lock recursion,
so the fix is made in ring_buffer_attach() itself: after the new event is added
to rb->event_list, check whether rb->mmap_count is 0. If it is, another CPU has
run perf_mmap_close() on the ring buffer in the meantime, so remove the event
from rb->event_list again and set event->rb to NULL.
Signed-off-by: Yang Jihong <[email protected]>
---
Changes since v1:
- Modify commit message description.
kernel/events/core.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80782cddb1da..c67c070f7b39 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5900,6 +5900,7 @@ static void ring_buffer_attach(struct perf_event *event,
struct perf_buffer *rb)
{
struct perf_buffer *old_rb = NULL;
+ struct perf_buffer *new_rb = rb;
unsigned long flags;
WARN_ON_ONCE(event->parent);
@@ -5928,6 +5929,20 @@ static void ring_buffer_attach(struct perf_event *event,
spin_lock_irqsave(&rb->event_lock, flags);
list_add_rcu(&event->rb_entry, &rb->event_list);
+
+ /*
+ * When perf_mmap_close traverses rb->event_list during
+ * detach all other events, new event may not be added to
+ * rb->event_list, let's check again, if rb->mmap_count is 0,
+ * it indicates that perf_mmap_close is executed.
+ * Manually delete event from rb->event_list and
+ * set event->rb to null.
+ */
+ if (!atomic_read(&rb->mmap_count)) {
+ list_del_rcu(&event->rb_entry);
+ new_rb = NULL;
+ }
+
spin_unlock_irqrestore(&rb->event_lock, flags);
}
@@ -5944,7 +5959,7 @@ static void ring_buffer_attach(struct perf_event *event,
if (has_aux(event))
perf_event_stop(event, 0);
- rcu_assign_pointer(event->rb, rb);
+ rcu_assign_pointer(event->rb, new_rb);
if (old_rb) {
ring_buffer_put(old_rb);
@@ -11883,6 +11898,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
goto unlock;
}
+ /*
+ * If rb->mmap_count is 0, perf_mmap_close is being executed,
+ * the ring buffer is about to be unmapped and cannot be attached.
+ */
+ if (rb && !atomic_read(&rb->mmap_count))
+ goto unlock;
+
ring_buffer_attach(event, rb);
ret = 0;
--
2.30.GIT
On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
> Data race exists between perf_event_set_output and perf_mmap_close.
> The scenario is as follows:
>
> CPU1 CPU2
> perf_mmap_close(event2)
> if (atomic_dec_and_test(&event2->rb->mmap_count) // mmap_count 1 -> 0
> detach_rest = true;
> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
> perf_event_set_output(event1, event2)
> if (!detach_rest)
> goto out_put;
> list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
> ring_buffer_attach(event, NULL)
> // because event1 has not been added to event2->rb->event_list,
> // event1->rb is not set to NULL in these loops
>
> ring_buffer_attach(event1, event2->rb)
> list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>
> The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
> If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
>
> again:
> mutex_lock(&event->mmap_mutex);
> if (event->rb) {
> <SNIP>
> if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
> /*
> * Raced against perf_mmap_close() through
> * perf_event_set_output(). Try again, hope for better
> * luck.
> */
> mutex_unlock(&event->mmap_mutex);
> goto again;
> }
> <SNIP>
Too tired, must look again tomorrow, little feedback below.
> kernel/events/core.c | 24 +++++++++++++++++++++++-
> 1 file changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 80782cddb1da..c67c070f7b39 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5900,6 +5900,7 @@ static void ring_buffer_attach(struct perf_event *event,
> struct perf_buffer *rb)
> {
> struct perf_buffer *old_rb = NULL;
> + struct perf_buffer *new_rb = rb;
> unsigned long flags;
>
> WARN_ON_ONCE(event->parent);
> @@ -5928,6 +5929,20 @@ static void ring_buffer_attach(struct perf_event *event,
>
> spin_lock_irqsave(&rb->event_lock, flags);
> list_add_rcu(&event->rb_entry, &rb->event_list);
> +
> + /*
> + * When perf_mmap_close traverses rb->event_list during
> + * detach all other events, new event may not be added to
> + * rb->event_list, let's check again, if rb->mmap_count is 0,
> + * it indicates that perf_mmap_close is executed.
> + * Manually delete event from rb->event_list and
> + * set event->rb to null.
> + */
> + if (!atomic_read(&rb->mmap_count)) {
> + list_del_rcu(&event->rb_entry);
> + new_rb = NULL;
> + }
> +
> spin_unlock_irqrestore(&rb->event_lock, flags);
> }
>
> @@ -5944,7 +5959,7 @@ static void ring_buffer_attach(struct perf_event *event,
> if (has_aux(event))
> perf_event_stop(event, 0);
>
> - rcu_assign_pointer(event->rb, rb);
> + rcu_assign_pointer(event->rb, new_rb);
>
> if (old_rb) {
> ring_buffer_put(old_rb);
I'm confused by the above hunks; the below will avoid calling
ring_buffer_attach() when !rb->mmap_count, so how can the above ever
execute?
> @@ -11883,6 +11898,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
> goto unlock;
> }
>
> + /*
> + * If rb->mmap_count is 0, perf_mmap_close is being executed,
> + * the ring buffer is about to be unmapped and cannot be attached.
> + */
> + if (rb && !atomic_read(&rb->mmap_count))
> + goto unlock;
> +
> ring_buffer_attach(event, rb);
>
> ret = 0;
This is wrong I think, it'll leak ring_buffer_get().
Hello,
On 2022/7/4 23:26, Peter Zijlstra wrote:
> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
>> Data race exists between perf_event_set_output and perf_mmap_close.
>> The scenario is as follows:
>>
>> CPU1 CPU2
>> perf_mmap_close(event2)
>> if (atomic_dec_and_test(&event2->rb->mmap_count) // mmap_count 1 -> 0
>> detach_rest = true;
>> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>> perf_event_set_output(event1, event2)
>> if (!detach_rest)
>> goto out_put;
>> list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>> ring_buffer_attach(event, NULL)
>> // because event1 has not been added to event2->rb->event_list,
>> // event1->rb is not set to NULL in these loops
>>
>> ring_buffer_attach(event1, event2->rb)
>> list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>>
>> The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
>> If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
>>
>> again:
>> mutex_lock(&event->mmap_mutex);
>> if (event->rb) {
>> <SNIP>
>> if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>> /*
>> * Raced against perf_mmap_close() through
>> * perf_event_set_output(). Try again, hope for better
>> * luck.
>> */
>> mutex_unlock(&event->mmap_mutex);
>> goto again;
>> }
>> <SNIP>
>
> Too tired, must look again tomorrow, little feeback below.
Thanks for reviewing this patch. perf_mmap_close(), perf_event_set_output(),
and perf_mmap() involve complex data races and lock relationships, which is
why this simple fix is proposed.
>
>> kernel/events/core.c | 24 +++++++++++++++++++++++-
>> 1 file changed, 23 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 80782cddb1da..c67c070f7b39 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -5900,6 +5900,7 @@ static void ring_buffer_attach(struct perf_event *event,
>> struct perf_buffer *rb)
>> {
>> struct perf_buffer *old_rb = NULL;
>> + struct perf_buffer *new_rb = rb;
>> unsigned long flags;
>>
>> WARN_ON_ONCE(event->parent);
>> @@ -5928,6 +5929,20 @@ static void ring_buffer_attach(struct perf_event *event,
>>
>> spin_lock_irqsave(&rb->event_lock, flags);
>> list_add_rcu(&event->rb_entry, &rb->event_list);
>> +
>> + /*
>> + * When perf_mmap_close traverses rb->event_list during
>> + * detach all other events, new event may not be added to
>> + * rb->event_list, let's check again, if rb->mmap_count is 0,
>> + * it indicates that perf_mmap_close is executed.
>> + * Manually delete event from rb->event_list and
>> + * set event->rb to null.
>> + */
>> + if (!atomic_read(&rb->mmap_count)) {
>> + list_del_rcu(&event->rb_entry);
>> + new_rb = NULL;
>> + }
>> +
>> spin_unlock_irqrestore(&rb->event_lock, flags);
>> }
>>
>> @@ -5944,7 +5959,7 @@ static void ring_buffer_attach(struct perf_event *event,
>> if (has_aux(event))
>> perf_event_stop(event, 0);
>>
>> - rcu_assign_pointer(event->rb, rb);
>> + rcu_assign_pointer(event->rb, new_rb);
>>
>> if (old_rb) {
>> ring_buffer_put(old_rb);
>
> I'm confused by the above hunks; the below will avoid calling
> ring_buffer_attach() when !rb->mmap_count, so how can the above ever
> execute?
If I understand your comment correctly: since this patch checks
!atomic_read(&rb->mmap_count) before perf_event_set_output() calls
ring_buffer_attach(event, rb), the additional check in ring_buffer_attach()
should never need to fire. Is that what you mean?
However, because there is no lock serializing ioctl(event1,
PERF_EVENT_IOC_SET_OUTPUT, event2) against perf_mmap_close(event2), the two
can run in parallel.
The following scenario is possible:

CPU1                                              CPU2

                                                  perf_mmap_close(event2)
                                                    ...
ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
  perf_event_set_output(event1, event2)
    ...
    if (rb && !atomic_read(&rb->mmap_count))
      goto unlock;
    // here rb->mmap_count is still 1, so keep going
    ...
                                                    if (atomic_dec_and_test(&event2->rb->mmap_count)) // mmap_count 1 -> 0
                                                      detach_rest = true;
                                                    ...
                                                    list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
                                                      ring_buffer_attach(event, NULL)
                                                      // because event1 has not been added to event2->rb->event_list,
                                                      // event1->rb is not set to NULL in these loops
    ...
    ring_buffer_attach(event1, rb)
      ...
      list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
      ...

In this case, the problem described above arises.
>
>> @@ -11883,6 +11898,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
>> goto unlock;
>> }
>>
>> + /*
>> + * If rb->mmap_count is 0, perf_mmap_close is being executed,
>> + * the ring buffer is about to be unmapped and cannot be attached.
>> + */
>> + if (rb && !atomic_read(&rb->mmap_count))
>> + goto unlock;
>> +
>> ring_buffer_attach(event, rb);
>>
>> ret = 0;
>
> This is wrong I think, it'll leak ring_buffer_get().
Yes, ring_buffer_put(rb) needs to be added before the goto unlock.
I'll fix it in the next version.
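Something like the following (sketch only, untested), folding the check into
the existing if (output_event) block so the reference taken by
ring_buffer_get() is dropped on the bail-out path:

	if (output_event) {
		/* get the rb we want to redirect to */
		rb = ring_buffer_get(output_event);
		if (!rb)
			goto unlock;

		/*
		 * Raced against perf_mmap_close(); drop the reference
		 * taken by ring_buffer_get() before bailing out.
		 */
		if (!atomic_read(&rb->mmap_count)) {
			ring_buffer_put(rb);
			goto unlock;
		}
	}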
Thanks,
Yang
>
>
> .
>
On Mon, Jul 04, 2022 at 05:26:04PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
> > Data race exists between perf_event_set_output and perf_mmap_close.
> > The scenario is as follows:
> >
> > CPU1 CPU2
> > perf_mmap_close(event2)
> > if (atomic_dec_and_test(&event2->rb->mmap_count) // mmap_count 1 -> 0
> > detach_rest = true;
> > ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
> > perf_event_set_output(event1, event2)
> > if (!detach_rest)
> > goto out_put;
> > list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
> > ring_buffer_attach(event, NULL)
> > // because event1 has not been added to event2->rb->event_list,
> > // event1->rb is not set to NULL in these loops
> >
> > ring_buffer_attach(event1, event2->rb)
> > list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
> >
> > The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
> > If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
> >
> > again:
> > mutex_lock(&event->mmap_mutex);
> > if (event->rb) {
> > <SNIP>
> > if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
> > /*
> > * Raced against perf_mmap_close() through
> > * perf_event_set_output(). Try again, hope for better
> > * luck.
> > */
> > mutex_unlock(&event->mmap_mutex);
> > goto again;
> > }
> > <SNIP>
>
> Too tired, must look again tomorrow, little feeback below.
With brain more awake I ended up with the below. Does that work?
---
kernel/events/core.c | 45 +++++++++++++++++++++++++++++++--------------
1 file changed, 31 insertions(+), 14 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d8c335a07db..c9d32d4d2e20 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6262,10 +6262,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Raced against perf_mmap_close(); remove the
+ * event and try again.
*/
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
goto again;
}
@@ -11840,14 +11840,25 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
struct perf_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -11885,8 +11896,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
goto unlock;
@@ -11896,6 +11914,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -11903,20 +11927,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
}
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
Hello,
On 2022/7/5 21:07, Peter Zijlstra wrote:
> On Mon, Jul 04, 2022 at 05:26:04PM +0200, Peter Zijlstra wrote:
>> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
>>> Data race exists between perf_event_set_output and perf_mmap_close.
>>> The scenario is as follows:
>>>
>>> CPU1 CPU2
>>> perf_mmap_close(event2)
>>> if (atomic_dec_and_test(&event2->rb->mmap_count) // mmap_count 1 -> 0
>>> detach_rest = true;
>>> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>>> perf_event_set_output(event1, event2)
>>> if (!detach_rest)
>>> goto out_put;
>>> list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>>> ring_buffer_attach(event, NULL)
>>> // because event1 has not been added to event2->rb->event_list,
>>> // event1->rb is not set to NULL in these loops
>>>
>>> ring_buffer_attach(event1, event2->rb)
>>> list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>>>
>>> The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
>>> If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
>>>
>>> again:
>>> mutex_lock(&event->mmap_mutex);
>>> if (event->rb) {
>>> <SNIP>
>>> if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>>> /*
>>> * Raced against perf_mmap_close() through
>>> * perf_event_set_output(). Try again, hope for better
>>> * luck.
>>> */
>>> mutex_unlock(&event->mmap_mutex);
>>> goto again;
>>> }
>>> <SNIP>
>>
>> Too tired, must look again tomorrow, little feeback below.
>
> With brain more awake I ended up with the below. Does that work?
Yes, I applied the patch on kernel version 5.10 and on mainline,
and it fixes the problem.
Tested-by: Yang Jihong <[email protected]>
Thanks,
Yang
Hello,
On 2022/7/6 20:29, Yang Jihong wrote:
> Hello,
>
> On 2022/7/5 21:07, Peter Zijlstra wrote:
>> On Mon, Jul 04, 2022 at 05:26:04PM +0200, Peter Zijlstra wrote:
>>> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
>>>> Data race exists between perf_event_set_output and perf_mmap_close.
>>>> The scenario is as follows:
>>>>
>>>> CPU1                                              CPU2
>>>>
>>>>                                                   perf_mmap_close(event2)
>>>>                                                     if (atomic_dec_and_test(&event2->rb->mmap_count)) // mmap_count 1 -> 0
>>>>                                                       detach_rest = true;
>>>>
>>>> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>>>>   perf_event_set_output(event1, event2)
>>>>
>>>>                                                     if (!detach_rest)
>>>>                                                       goto out_put;
>>>>
>>>>                                                     list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>>>>                                                       ring_buffer_attach(event, NULL)
>>>>                                                       // because event1 has not been added to event2->rb->event_list,
>>>>                                                       // event1->rb is not set to NULL in these loops
>>>>
>>>>   ring_buffer_attach(event1, event2->rb)
>>>>     list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>>>>
>>>> The above data race causes a problem, that is, event1->rb is not
>>>> NULL, but event1->rb->mmap_count is 0.
>>>> If the perf_mmap interface is invoked for the fd of event1, the
>>>> kernel keeps in the perf_mmap infinite loop:
>>>>
>>>> again:
>>>> mutex_lock(&event->mmap_mutex);
>>>> if (event->rb) {
>>>> <SNIP>
>>>> if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>>>> /*
>>>> * Raced against perf_mmap_close() through
>>>> * perf_event_set_output(). Try again, hope
>>>> for better
>>>> * luck.
>>>> */
>>>> mutex_unlock(&event->mmap_mutex);
>>>> goto again;
>>>> }
>>>> <SNIP>
>>>
>>> Too tired, must look again tomorrow, little feeback below.
>>
>> With brain more awake I ended up with the below. Does that work?
I have verified that this patch solves the problem.
Should I submit it, or will you?
Thanks,
Yang
>
> Yes, I apply the patch on kernel versions 5.10 and mainline,
> and it could fixed the problem.
>
> Tested-by: Yang Jihong <[email protected]>
>
> Thanks,
> Yang
> .
The following commit has been merged into the perf/urgent branch of tip:
Commit-ID: 68e3c69803dada336893640110cb87221bb01dcf
Gitweb: https://git.kernel.org/tip/68e3c69803dada336893640110cb87221bb01dcf
Author: Peter Zijlstra <[email protected]>
AuthorDate: Tue, 05 Jul 2022 15:07:26 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Wed, 13 Jul 2022 11:29:12 +02:00
perf/core: Fix data race between perf_event_set_output() and perf_mmap_close()
Yang Jihong reported a race between perf_event_set_output() and
perf_mmap_close():
CPU1 CPU2
perf_mmap_close(e2)
if (atomic_dec_and_test(&e2->rb->mmap_count)) // 1 - > 0
detach_rest = true
ioctl(e1, IOC_SET_OUTPUT, e2)
perf_event_set_output(e1, e2)
...
list_for_each_entry_rcu(e, &e2->rb->event_list, rb_entry)
ring_buffer_attach(e, NULL);
// e1 isn't yet added and
// therefore not detached
ring_buffer_attach(e1, e2->rb)
list_add_rcu(&e1->rb_entry,
&e2->rb->event_list)
After this; e1 is attached to an unmapped rb and a subsequent
perf_mmap() will loop forever more:
again:
mutex_lock(&e->mmap_mutex);
if (event->rb) {
...
if (!atomic_inc_not_zero(&e->rb->mmap_count)) {
...
mutex_unlock(&e->mmap_mutex);
goto again;
}
}
The loop in perf_mmap_close() holds e2->mmap_mutex, while the attach
in perf_event_set_output() holds e1->mmap_mutex. As such there is no
serialization to avoid this race.
Change perf_event_set_output() to take both e1->mmap_mutex and
e2->mmap_mutex to alleviate that problem. Additionally, have the loop
in perf_mmap() detach the rb directly, this avoids having to wait for
the concurrent perf_mmap_close() to get around to doing it to make
progress.
Fixes: 9bb5d40cd93c ("perf: Fix mmap() accounting hole")
Reported-by: Yang Jihong <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: Yang Jihong <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/events/core.c | 45 +++++++++++++++++++++++++++++--------------
1 file changed, 31 insertions(+), 14 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80782cd..d2b3549 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6253,10 +6253,10 @@ again:
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Raced against perf_mmap_close(); remove the
+ * event and try again.
*/
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
goto again;
}
@@ -11825,14 +11825,25 @@ err_size:
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
struct perf_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -11870,8 +11881,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
goto unlock;
@@ -11881,6 +11899,12 @@ set:
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -11888,20 +11912,13 @@ set:
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
}
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;