Hi,
After discussions with Sebastian, here is an updated version of
https://lore.kernel.org/all/[email protected]/
Changes since v2:
* Use rcuwait instead of waitqueue (more RT-friendly when preemption is
disabled)
* Add a few comments about RCU-ordering expectations
Thanks.
Frederic Weisbecker (4):
task_work: s/task_work_cancel()/task_work_cancel_func()/
task_work: Introduce task_work_cancel() again
perf: Fix event leak upon exit
perf: Fix event leak upon exec and file release
include/linux/perf_event.h | 1 +
include/linux/task_work.h | 3 ++-
kernel/events/core.c | 45 ++++++++++++++++++++++++++++++++------
kernel/irq/manage.c | 2 +-
kernel/task_work.c | 34 +++++++++++++++++++++++-----
security/keys/keyctl.c | 2 +-
6 files changed, 72 insertions(+), 15 deletions(-)
--
2.44.0
A proper task_work_cancel() API that cancels one specific callback, rather
than *any* callback pointing to a given function, is going to be needed for
freeing perf events. Rename the existing function-matching helper to
task_work_cancel_func() to prepare for that.
Signed-off-by: Frederic Weisbecker <[email protected]>
---
include/linux/task_work.h | 2 +-
kernel/irq/manage.c | 2 +-
kernel/task_work.c | 10 +++++-----
security/keys/keyctl.c | 2 +-
4 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 795ef5a68429..23ab01ae185e 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -30,7 +30,7 @@ int task_work_add(struct task_struct *task, struct callback_head *twork,
struct callback_head *task_work_cancel_match(struct task_struct *task,
bool (*match)(struct callback_head *, void *data), void *data);
-struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
+struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t);
void task_work_run(void);
static inline void exit_task_work(struct task_struct *task)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bf9ae8a8686f..ab767e62b19a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1333,7 +1333,7 @@ static int irq_thread(void *data)
* synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the
* oneshot mask bit can be set.
*/
- task_work_cancel(current, irq_thread_dtor);
+ task_work_cancel_func(current, irq_thread_dtor);
return 0;
}
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 95a7e1b7f1da..54ac24059daa 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -120,9 +120,9 @@ static bool task_work_func_match(struct callback_head *cb, void *data)
}
/**
- * task_work_cancel - cancel a pending work added by task_work_add()
- * @task: the task which should execute the work
- * @func: identifies the work to remove
+ * task_work_cancel_func - cancel a pending work matching a function added by task_work_add()
+ * @task: the task which should execute the func's work
+ * @func: identifies the func to match with a work to remove
*
* Find the last queued pending work with ->func == @func and remove
* it from queue.
@@ -131,7 +131,7 @@ static bool task_work_func_match(struct callback_head *cb, void *data)
* The found work or NULL if not found.
*/
struct callback_head *
-task_work_cancel(struct task_struct *task, task_work_func_t func)
+task_work_cancel_func(struct task_struct *task, task_work_func_t func)
{
return task_work_cancel_match(task, task_work_func_match, func);
}
@@ -168,7 +168,7 @@ void task_work_run(void)
if (!work)
break;
/*
- * Synchronize with task_work_cancel(). It can not remove
+ * Synchronize with task_work_cancel_match(). It can not remove
* the first entry == work, cmpxchg(task_works) must fail.
* But it can remove another entry from the ->next list.
*/
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 10ba439968f7..3aff32a2bcf3 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1693,7 +1693,7 @@ long keyctl_session_to_parent(void)
goto unlock;
/* cancel an already pending keyring replacement */
- oldwork = task_work_cancel(parent, key_change_session_keyring);
+ oldwork = task_work_cancel_func(parent, key_change_session_keyring);
/* the replacement session keyring is applied just prior to userspace
* restarting */
--
2.44.0
Re-introduce task_work_cancel(), this time to cancel one specific callback
and not *any* callback pointing to a given function. This is going to be
needed for freeing perf events.
Signed-off-by: Frederic Weisbecker <[email protected]>
---
include/linux/task_work.h | 1 +
kernel/task_work.c | 24 ++++++++++++++++++++++++
2 files changed, 25 insertions(+)
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 23ab01ae185e..26b8a47f41fc 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -31,6 +31,7 @@ int task_work_add(struct task_struct *task, struct callback_head *twork,
struct callback_head *task_work_cancel_match(struct task_struct *task,
bool (*match)(struct callback_head *, void *data), void *data);
struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t);
+bool task_work_cancel(struct task_struct *task, struct callback_head *cb);
void task_work_run(void);
static inline void exit_task_work(struct task_struct *task)
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 54ac24059daa..2134ac8057a9 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -136,6 +136,30 @@ task_work_cancel_func(struct task_struct *task, task_work_func_t func)
return task_work_cancel_match(task, task_work_func_match, func);
}
+static bool task_work_match(struct callback_head *cb, void *data)
+{
+ return cb == data;
+}
+
+/**
+ * task_work_cancel - cancel a pending work added by task_work_add()
+ * @task: the task which should execute the work
+ * @cb: the callback to remove if queued
+ *
+ * Remove a callback from a task's queue if queued.
+ *
+ * RETURNS:
+ * True if the callback was queued and got cancelled, false otherwise.
+ */
+bool task_work_cancel(struct task_struct *task, struct callback_head *cb)
+{
+ struct callback_head *ret;
+
+ ret = task_work_cancel_match(task, task_work_match, cb);
+
+ return ret == cb;
+}
+
/**
* task_work_run - execute the works added by task_work_add()
*
--
2.44.0
When a task is scheduled out, pending sigtrap deliveries are deferred
to the target task upon resume to userspace via task_work.
However, failures while adding an event's callback to the task_work
engine are ignored. And since the last call for events exit happens
after task work is eventually closed, there is a small window during
which a pending sigtrap can be queued though ignored, leaking the event
refcount addition, such as in the following scenario:
TASK A
-----
do_exit()
exit_task_work(tsk);
<IRQ>
perf_event_overflow()
event->pending_sigtrap = pending_id;
irq_work_queue(&event->pending_irq);
</IRQ>
=========> PREEMPTION: TASK A -> TASK B
event_sched_out()
event->pending_sigtrap = 0;
atomic_long_inc_not_zero(&event->refcount)
// FAILS: task work has exited
task_work_add(&event->pending_task)
[...]
<IRQ WORK>
perf_pending_irq()
// early return: event->oncpu = -1
</IRQ WORK>
[...]
=========> TASK B -> TASK A
perf_event_exit_task(tsk)
perf_event_exit_event()
free_event()
WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1)
// leak event due to unexpected refcount == 2
As a result the event is never released while the task exits.
Fix this with appropriate task_work_add()'s error handling.
Fixes: 517e6a301f34 ("perf: Fix perf_pending_task() UaF")
Signed-off-by: Frederic Weisbecker <[email protected]>
---
kernel/events/core.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f..c1632e69c69d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2289,10 +2289,11 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
event->pending_sigtrap = 0;
if (state != PERF_EVENT_STATE_OFF &&
!event->pending_work) {
- event->pending_work = 1;
- dec = false;
- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
- task_work_add(current, &event->pending_task, TWA_RESUME);
+ if (task_work_add(current, &event->pending_task, TWA_RESUME) >= 0) {
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+ dec = false;
+ event->pending_work = 1;
+ }
}
if (dec)
local_dec(&event->ctx->nr_pending);
--
2.44.0
The perf pending task work is never waited for upon the matching event's
release. In the case of a child event, released via free_event()
directly, this can potentially result in a leaked event, such as in the
following scenario that doesn't even require a weak IRQ work
implementation to trigger:
schedule()
prepare_task_switch()
=======> <NMI>
perf_event_overflow()
event->pending_sigtrap = ...
irq_work_queue(&event->pending_irq)
<======= </NMI>
perf_event_task_sched_out()
event_sched_out()
event->pending_sigtrap = 0;
atomic_long_inc_not_zero(&event->refcount)
task_work_add(&event->pending_task)
finish_lock_switch()
=======> <IRQ>
perf_pending_irq()
//do nothing, rely on pending task work
<======= </IRQ>
begin_new_exec()
perf_event_exit_task()
perf_event_exit_event()
// If is child event
free_event()
WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1)
// event is leaked
Similar scenarios can also happen with perf_event_remove_on_exec() or
simply against concurrent perf_event_release().
Fix this by synchronizing against the possibly remaining pending task
work while freeing the event, just like is done with remaining pending
IRQ work. This means that the pending task callback neither needs to nor
should hold a reference to the event, which would prevent it from ever
being freed.
Fixes: 517e6a301f34 ("perf: Fix perf_pending_task() UaF")
Signed-off-by: Frederic Weisbecker <[email protected]>
---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 38 ++++++++++++++++++++++++++++++++++----
2 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a..89ae41bb5f70 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -786,6 +786,7 @@ struct perf_event {
struct irq_work pending_irq;
struct callback_head pending_task;
unsigned int pending_work;
+ struct rcuwait pending_work_wait;
atomic_t event_limit;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c1632e69c69d..4b99ab7024a4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2290,7 +2290,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
if (state != PERF_EVENT_STATE_OFF &&
!event->pending_work) {
if (task_work_add(current, &event->pending_task, TWA_RESUME) >= 0) {
- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
dec = false;
event->pending_work = 1;
}
@@ -5188,9 +5187,35 @@ static bool exclusive_event_installable(struct perf_event *event,
static void perf_addr_filters_splice(struct perf_event *event,
struct list_head *head);
+static void perf_pending_task_sync(struct perf_event *event)
+{
+ struct callback_head *head = &event->pending_task;
+
+ if (!event->pending_work)
+ return;
+ /*
+ * If the task is queued to the current task's queue, we
+ * obviously can't wait for it to complete. Simply cancel it.
+ */
+ if (task_work_cancel(current, head)) {
+ event->pending_work = 0;
+ local_dec(&event->ctx->nr_pending);
+ return;
+ }
+
+ /*
+ * All accesses related to the event are within the same
+ * non-preemptible section in perf_pending_task(). The RCU
+ * grace period before the event is freed will make sure all
+ * those accesses are complete by then.
+ */
+ rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
+}
+
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending_irq);
+ perf_pending_task_sync(event);
unaccount_event(event);
@@ -6808,24 +6833,28 @@ static void perf_pending_task(struct callback_head *head)
struct perf_event *event = container_of(head, struct perf_event, pending_task);
int rctx;
+ /*
+ * All accesses to the event must belong to the same implicit RCU read-side
+ * critical section as the ->pending_work reset. See comment in
+ * perf_pending_task_sync().
+ */
+ preempt_disable_notrace();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
- preempt_disable_notrace();
rctx = perf_swevent_get_recursion_context();
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
local_dec(&event->ctx->nr_pending);
+ rcuwait_wake_up(&event->pending_work_wait);
}
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
preempt_enable_notrace();
-
- put_event(event);
}
#ifdef CONFIG_GUEST_PERF_EVENTS
@@ -11933,6 +11962,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending_irq, perf_pending_irq);
init_task_work(&event->pending_task, perf_pending_task);
+ rcuwait_init(&event->pending_work_wait);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
--
2.44.0
On Wed, May 15, 2024 at 04:43:10PM +0200, Frederic Weisbecker wrote:
> When a task is scheduled out, pending sigtrap deliveries are deferred
> to the target task upon resume to userspace via task_work.
>
> However failures while adding en event's callback to the task_work
> engine are ignored. And since the last call for events exit happen
> after task work is eventually closed, there is a small window during
> which pending sigtrap can be queued though ignored, leaking the event
> refcount addition such as in the following scenario:
>
> TASK A
> -----
>
> do_exit()
> exit_task_work(tsk);
>
> <IRQ>
> perf_event_overflow()
> event->pending_sigtrap = pending_id;
> irq_work_queue(&event->pending_irq);
> </IRQ>
> =========> PREEMPTION: TASK A -> TASK B
> event_sched_out()
> event->pending_sigtrap = 0;
> atomic_long_inc_not_zero(&event->refcount)
> // FAILS: task work has exited
> task_work_add(&event->pending_task)
> [...]
> <IRQ WORK>
> perf_pending_irq()
> // early return: event->oncpu = -1
> </IRQ WORK>
> [...]
> =========> TASK B -> TASK A
> perf_event_exit_task(tsk)
> perf_event_exit_event()
> free_event()
> WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1)
> // leak event due to unexpected refcount == 2
>
> As a result the event is never released while the task exits.
Urgh...
>
> Fix this with appropriate task_work_add()'s error handling.
>
> Fixes: 517e6a301f34 ("perf: Fix perf_pending_task() UaF")
> Signed-off-by: Frederic Weisbecker <[email protected]>
> ---
> kernel/events/core.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 724e6d7e128f..c1632e69c69d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2289,10 +2289,11 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> event->pending_sigtrap = 0;
> if (state != PERF_EVENT_STATE_OFF &&
> !event->pending_work) {
> - event->pending_work = 1;
> - dec = false;
> - WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> - task_work_add(current, &event->pending_task, TWA_RESUME);
> + if (task_work_add(current, &event->pending_task, TWA_RESUME) >= 0) {
AFAICT the thing is a return 0 on success -Efoo on fail, no? That is,
should this not simply be '== 0' ?
> + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> + dec = false;
> + event->pending_work = 1;
> + }
Also, do we want to write it like so and save an indent?
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2288,11 +2288,11 @@ event_sched_out(struct perf_event *event
event->pending_sigtrap = 0;
if (state != PERF_EVENT_STATE_OFF &&
- !event->pending_work) {
+ !event->pending_work &&
+ !task_work_add(current, &event->pending_task, TWA_RESUME)) {
event->pending_work = 1;
dec = false;
WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
- task_work_add(current, &event->pending_task, TWA_RESUME);
}
if (dec)
local_dec(&event->ctx->nr_pending);
On Wed, May 15, 2024 at 04:43:11PM +0200, Frederic Weisbecker wrote:
> The perf pending task work is never waited upon the matching event
> release. In the case of a child event, released via free_event()
> directly, this can potentially result in a leaked event, such as in the
> following scenario that doesn't even require a weak IRQ work
> implementation to trigger:
>
> schedule()
> prepare_task_switch()
> =======> <NMI>
> perf_event_overflow()
> event->pending_sigtrap = ...
> irq_work_queue(&event->pending_irq)
> <======= </NMI>
> perf_event_task_sched_out()
> event_sched_out()
> event->pending_sigtrap = 0;
> atomic_long_inc_not_zero(&event->refcount)
> task_work_add(&event->pending_task)
> finish_lock_switch()
> =======> <IRQ>
> perf_pending_irq()
> //do nothing, rely on pending task work
> <======= </IRQ>
>
> begin_new_exec()
> perf_event_exit_task()
> perf_event_exit_event()
> // If is child event
> free_event()
> WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1)
> // event is leaked
>
> Similar scenarios can also happen with perf_event_remove_on_exec() or
> simply against concurrent perf_event_release().
>
> Fix this with synchonizing against the possibly remaining pending task
> work while freeing the event, just like is done with remaining pending
> IRQ work. This means that the pending task callback neither need nor
> should hold a reference to the event, preventing it from ever beeing
> freed.
>
> Fixes: 517e6a301f34 ("perf: Fix perf_pending_task() UaF")
> Signed-off-by: Frederic Weisbecker <[email protected]>
Yeah, I suppose this'll do. Thanks!
On Thu, May 16, 2024 at 11:05:29AM +0200, Peter Zijlstra wrote:
> On Wed, May 15, 2024 at 04:43:10PM +0200, Frederic Weisbecker wrote:
> > When a task is scheduled out, pending sigtrap deliveries are deferred
> > to the target task upon resume to userspace via task_work.
> >
> > However failures while adding en event's callback to the task_work
> > engine are ignored. And since the last call for events exit happen
> > after task work is eventually closed, there is a small window during
> > which pending sigtrap can be queued though ignored, leaking the event
> > refcount addition such as in the following scenario:
> >
> > TASK A
> > -----
> >
> > do_exit()
> > exit_task_work(tsk);
> >
> > <IRQ>
> > perf_event_overflow()
> > event->pending_sigtrap = pending_id;
> > irq_work_queue(&event->pending_irq);
> > </IRQ>
> > =========> PREEMPTION: TASK A -> TASK B
> > event_sched_out()
> > event->pending_sigtrap = 0;
> > atomic_long_inc_not_zero(&event->refcount)
> > // FAILS: task work has exited
> > task_work_add(&event->pending_task)
> > [...]
> > <IRQ WORK>
> > perf_pending_irq()
> > // early return: event->oncpu = -1
> > </IRQ WORK>
> > [...]
> > =========> TASK B -> TASK A
> > perf_event_exit_task(tsk)
> > perf_event_exit_event()
> > free_event()
> > WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1)
> > // leak event due to unexpected refcount == 2
> >
> > As a result the event is never released while the task exits.
>
> Urgh...
>
> >
> > Fix this with appropriate task_work_add()'s error handling.
> >
> > Fixes: 517e6a301f34 ("perf: Fix perf_pending_task() UaF")
> > Signed-off-by: Frederic Weisbecker <[email protected]>
> > ---
> > kernel/events/core.c | 9 +++++----
> > 1 file changed, 5 insertions(+), 4 deletions(-)
> >
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 724e6d7e128f..c1632e69c69d 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -2289,10 +2289,11 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> > event->pending_sigtrap = 0;
> > if (state != PERF_EVENT_STATE_OFF &&
> > !event->pending_work) {
> > - event->pending_work = 1;
> > - dec = false;
> > - WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> > - task_work_add(current, &event->pending_task, TWA_RESUME);
> > + if (task_work_add(current, &event->pending_task, TWA_RESUME) >= 0) {
>
> AFAICT the thing is a return 0 on success -Efoo on fail, no? That is,
> should this not simply be '== 0' ?
Right.
>
> > + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> > + dec = false;
> > + event->pending_work = 1;
> > + }
>
> Also, do we want to write it like so and save an indent?
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2288,11 +2288,11 @@ event_sched_out(struct perf_event *event
>
> event->pending_sigtrap = 0;
> if (state != PERF_EVENT_STATE_OFF &&
> - !event->pending_work) {
> + !event->pending_work &&
> + !task_work_add(current, &event->pending_task, TWA_RESUME)) {
> event->pending_work = 1;
> dec = false;
> WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> - task_work_add(current, &event->pending_task, TWA_RESUME);
> }
> if (dec)
> local_dec(&event->ctx->nr_pending);
Looks good, I'm resending this one patch.
Thanks.