Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::1:20 as permitted sender) client-ip=2620:137:e000::1:20;
Date:   Tue, 27 Sep 2022 23:45:09 +0200
From:   Marco Elver <elver@google.com>
To:     Peter Zijlstra <peterz@infradead.org>
Cc:     Ingo Molnar <mingo@redhat.com>,
        Arnaldo Carvalho de Melo <acme@kernel.org>,
        Mark Rutland <mark.rutland@arm.com>,
        Alexander Shishkin <alexander.shishkin@linux.intel.com>,
        Jiri Olsa <jolsa@kernel.org>,
        Namhyung Kim <namhyung@kernel.org>,
        linux-perf-users@vger.kernel.org, linux-kernel@vger.kernel.org,
        kasan-dev@googlegroups.com, Dmitry Vyukov <dvyukov@google.com>
Subject: Re: [PATCH] perf: Fix missing SIGTRAPs due to pending_disable abuse
Message-ID: <YzNu5bgASbuVi0S3@elver.google.com>
References: <20220927121322.1236730-1-elver@google.com>
 <YzM/BUsBnX18NoOG@hirez.programming.kicks-ass.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <YzM/BUsBnX18NoOG@hirez.programming.kicks-ass.net>
User-Agent: Mutt/2.2.7 (2022-08-07)
Precedence: bulk

On Tue, Sep 27, 2022 at 08:20PM +0200, Peter Zijlstra wrote:
> On Tue, Sep 27, 2022 at 02:13:22PM +0200, Marco Elver wrote:
> > Due to the implementation of how SIGTRAP are delivered if
> > perf_event_attr::sigtrap is set, we've noticed 3 issues:
> > 
> > 	1. Missing SIGTRAP due to a race with event_sched_out() (more
> > 	   details below).
> > 
> > 	2. Hardware PMU events being disabled due to returning 1 from
> > 	   perf_event_overflow(). The only way to re-enable the event is
> > 	   for user space to first "properly" disable the event and then
> > 	   re-enable it.
> > 
> > 	3. The inability to automatically disable an event after a
> > 	   specified number of overflows via PERF_EVENT_IOC_REFRESH.
> > 
> > The worst of the 3 issues is problem (1), which occurs when a
> > pending_disable is "consumed" by a racing event_sched_out(), observed as
> > follows:
> > 
> > 		CPU0			| 	CPU1
> > 	--------------------------------+---------------------------
> > 	__perf_event_overflow()		|
> > 	 perf_event_disable_inatomic()	|
> > 	  pending_disable = CPU0	| ...
> > 	  				| _perf_event_enable()
> > 					|  event_function_call()
> > 					|   task_function_call()
> > 					|    /* sends IPI to CPU0 */
> > 	<IPI>				| ...
> > 	 __perf_event_enable()		+---------------------------
> > 	  ctx_resched()
> > 	   task_ctx_sched_out()
> > 	    ctx_sched_out()
> > 	     group_sched_out()
> > 	      event_sched_out()
> > 	       pending_disable = -1
> > 	</IPI>
> > 	<IRQ-work>
> > 	 perf_pending_event()
> > 	  perf_pending_event_disable()
> > 	   /* Fails to send SIGTRAP because no pending_disable! */
> > 	</IRQ-work>
> > 
> > In the above case, not only is that particular SIGTRAP missed, but also
> > all future SIGTRAPs because 'event_limit' is not reset back to 1.
> > 
> > To fix, rework pending delivery of SIGTRAP via IRQ-work by introduction
> > of a separate 'pending_sigtrap', no longer using 'event_limit' and
> > 'pending_disable' for its delivery.
> > 
> > During testing, this also revealed several more possible races between
> > reschedules and pending IRQ work; see code comments for details.
> 
> Perhaps use task_work_add() for this case? That runs on the
> return-to-user path, so then it doesn't matter how many reschedules
> happen in between.

Hmm, I tried the below (on top of this patch), but then all the tests
fail (including tools/testing/selftests/perf_events/sigtrap_threads.c)
because of lots of missing SIGTRAP. (The missing SIGTRAP happen with or
without the kernel/entry/ change.)

So something is wrong with task_work, and the irq_work solution thus far
is more robust (ran many hours of tests and fuzzing without failure).

Thoughts?

------ >8 ------

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dff3430844a2..928fb9e2b655 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -743,7 +743,7 @@ struct perf_event {
 	int				pending_sigtrap;
 	unsigned long			pending_addr;	/* SIGTRAP */
 	struct irq_work			pending;
-	struct irq_work			pending_resched;
+	struct callback_head		pending_twork;
 
 	atomic_t			event_limit;
 
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 063068a9ea9b..7cacaefc97fe 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -162,12 +162,12 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 		if (ti_work & _TIF_PATCH_PENDING)
 			klp_update_patch_state(current);
 
-		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
-			arch_do_signal_or_restart(regs);
-
 		if (ti_work & _TIF_NOTIFY_RESUME)
 			resume_user_mode_work(regs);
 
+		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+			arch_do_signal_or_restart(regs);
+
 		/* Architecture specific TIF work */
 		arch_exit_to_user_mode_work(regs, ti_work);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 007a87c1599c..7f93dd91d572 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -17,6 +17,7 @@
 #include <linux/poll.h>
 #include <linux/slab.h>
 #include <linux/hash.h>
+#include <linux/task_work.h>
 #include <linux/tick.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
@@ -2527,14 +2528,6 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
-	if (event->pending_sigtrap) {
-		/*
-		 * The task and event might have been moved to another CPU:
-		 * queue another IRQ work. See perf_pending_event_sigtrap().
-		 */
-		WARN_ON_ONCE(!irq_work_queue(&event->pending_resched));
-	}
-
 out:
 	perf_pmu_enable(event->pmu);
 
@@ -4942,11 +4935,13 @@ static bool exclusive_event_installable(struct perf_event *event,
 
 static void perf_addr_filters_splice(struct perf_event *event,
 				       struct list_head *head);
+static void perf_pending_event_task_work(struct callback_head *work);
 
 static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending);
-	irq_work_sync(&event->pending_resched);
+	if (event->hw.target)
+		task_work_cancel(event->hw.target, perf_pending_event_task_work);
 
 	unaccount_event(event);
 
@@ -6438,15 +6433,7 @@ void perf_event_wakeup(struct perf_event *event)
 static void perf_sigtrap(struct perf_event *event)
 {
 	/*
-	 * We'd expect this to only occur if the irq_work is delayed and either
-	 * ctx->task or current has changed in the meantime. This can be the
-	 * case on architectures that do not implement arch_irq_work_raise().
-	 */
-	if (WARN_ON_ONCE(event->ctx->task != current))
-		return;
-
-	/*
-	 * perf_pending_event() can race with the task exiting.
+	 * Can be called while the task is exiting.
 	 */
 	if (current->flags & PF_EXITING)
 		return;
@@ -6455,35 +6442,22 @@ static void perf_sigtrap(struct perf_event *event)
 		      event->attr.type, event->attr.sig_data);
 }
 
-static void perf_pending_event_sigtrap(struct perf_event *event)
+static void perf_pending_event_task_work(struct callback_head *work)
 {
-	if (!event->pending_sigtrap)
-		return;
+	struct perf_event *event = container_of(work, struct perf_event, pending_twork);
+	int rctx;
 
-	/*
-	 * If we're racing with disabling of the event, consume pending_sigtrap
-	 * and don't send the SIGTRAP. This avoids potentially delaying a signal
-	 * indefinitely (oncpu mismatch) until the event is enabled again, which
-	 * could happen after already returning to user space; in that case the
-	 * signal would erroneously become asynchronous.
-	 */
-	if (event->state == PERF_EVENT_STATE_OFF) {
+	preempt_disable_notrace();
+	rctx = perf_swevent_get_recursion_context();
+
+	if (event->pending_sigtrap) {
 		event->pending_sigtrap = 0;
-		return;
+		perf_sigtrap(event);
 	}
 
-	/*
-	 * Only process this pending SIGTRAP if this IRQ work is running on the
-	 * right CPU: the scheduler is able to run before the IRQ work, which
-	 * moved the task to another CPU. In event_sched_in() another IRQ work
-	 * is scheduled, so that the signal is not lost; given the kernel has
-	 * not yet returned to user space, the signal remains synchronous.
-	 */
-	if (READ_ONCE(event->oncpu) != smp_processor_id())
-		return;
-
-	event->pending_sigtrap = 0;
-	perf_sigtrap(event);
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
+	preempt_enable_notrace();
 }
 
 static void perf_pending_event_disable(struct perf_event *event)
@@ -6533,7 +6507,6 @@ static void perf_pending_event(struct irq_work *entry)
 	 * and we won't recurse 'further'.
 	 */
 
-	perf_pending_event_sigtrap(event);
 	perf_pending_event_disable(event);
 
 	if (event->pending_wakeup) {
@@ -6545,26 +6518,6 @@ static void perf_pending_event(struct irq_work *entry)
 		perf_swevent_put_recursion_context(rctx);
 }
 
-/*
- * If handling of a pending action must occur before returning to user space,
- * and it is possible to reschedule an event (to another CPU) with pending
- * actions, where the moved-from CPU may not yet have run event->pending (and
- * irq_work_queue() would fail on reuse), we'll use a separate IRQ work that
- * runs perf_pending_event_resched().
- */
-static void perf_pending_event_resched(struct irq_work *entry)
-{
-	struct perf_event *event = container_of(entry, struct perf_event, pending_resched);
-	int rctx;
-
-	rctx = perf_swevent_get_recursion_context();
-
-	perf_pending_event_sigtrap(event);
-
-	if (rctx >= 0)
-		perf_swevent_put_recursion_context(rctx);
-}
-
 #ifdef CONFIG_GUEST_PERF_EVENTS
 struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 
@@ -9274,7 +9227,7 @@ static int __perf_event_overflow(struct perf_event *event,
 		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
 		event->pending_sigtrap = 1;
 		event->pending_addr = data->addr;
-		irq_work_queue(&event->pending);
+		task_work_add(current, &event->pending_twork, TWA_RESUME);
 	}
 
 	READ_ONCE(event->overflow_handler)(event, data, regs);
@@ -11599,7 +11552,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	init_waitqueue_head(&event->waitq);
 	event->pending_disable = -1;
 	init_irq_work(&event->pending, perf_pending_event);
-	init_irq_work(&event->pending_resched, perf_pending_event_resched);
+	init_task_work(&event->pending_twork, perf_pending_event_task_work);
 
 	mutex_init(&event->mmap_mutex);
 	raw_spin_lock_init(&event->addr_filters.lock);