The error cleanup sequence in perf_event_alloc() is a subset of the
existing _free_event() function (it must of course be).
Split this out into __free_event() and simplify the error path.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/perf_event.h | 1
kernel/events/core.c | 129 ++++++++++++++++++++++-----------------------
2 files changed, 66 insertions(+), 64 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -634,6 +634,7 @@ struct swevent_hlist {
#define PERF_ATTACH_ITRACE 0x10
#define PERF_ATTACH_SCHED_CB 0x20
#define PERF_ATTACH_CHILD 0x40
+#define PERF_ATTACH_EXCLUSIVE 0x80
struct bpf_prog;
struct perf_cgroup;
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5094,6 +5094,8 @@ static int exclusive_event_init(struct p
return -EBUSY;
}
+ event->attach_state |= PERF_ATTACH_EXCLUSIVE;
+
return 0;
}
@@ -5101,14 +5103,13 @@ static void exclusive_event_destroy(stru
{
struct pmu *pmu = event->pmu;
- if (!is_exclusive_pmu(pmu))
- return;
-
/* see comment in exclusive_event_init() */
if (event->attach_state & PERF_ATTACH_TASK)
atomic_dec(&pmu->exclusive_cnt);
else
atomic_inc(&pmu->exclusive_cnt);
+
+ event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
}
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
@@ -5143,38 +5144,22 @@ static bool exclusive_event_installable(
static void perf_addr_filters_splice(struct perf_event *event,
struct list_head *head);
-static void _free_event(struct perf_event *event)
+/* vs perf_event_alloc() error */
+static void __free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending_irq);
-
- unaccount_event(event);
-
- security_perf_event_free(event);
-
- if (event->rb) {
- /*
- * Can happen when we close an event with re-directed output.
- *
- * Since we have a 0 refcount, perf_mmap_close() will skip
- * over us; possibly making our ring_buffer_put() the last.
- */
- mutex_lock(&event->mmap_mutex);
- ring_buffer_attach(event, NULL);
- mutex_unlock(&event->mmap_mutex);
- }
-
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
-
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
put_callchain_buffers();
}
- perf_event_free_bpf_prog(event);
- perf_addr_filters_splice(event, NULL);
kfree(event->addr_filter_ranges);
+ if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
+ exclusive_event_destroy(event);
+
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
if (event->destroy)
event->destroy(event);
@@ -5185,22 +5170,56 @@ static void _free_event(struct perf_even
if (event->hw.target)
put_task_struct(event->hw.target);
- if (event->pmu_ctx)
+ if (event->pmu_ctx) {
+ /*
+ * put_pmu_ctx() needs an event->ctx reference, because of
+ * epc->ctx.
+ */
+ WARN_ON_ONCE(!event->ctx);
+ WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
+ }
/*
- * perf_event_free_task() relies on put_ctx() being 'last', in particular
- * all task references must be cleaned up.
+ * perf_event_free_task() relies on put_ctx() being 'last', in
+ * particular all task references must be cleaned up.
*/
if (event->ctx)
put_ctx(event->ctx);
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
+ if (event->pmu)
+ module_put(event->pmu->module);
call_rcu(&event->rcu_head, free_event_rcu);
}
+/* vs perf_event_alloc() success */
+static void _free_event(struct perf_event *event)
+{
+ irq_work_sync(&event->pending_irq);
+
+ unaccount_event(event);
+
+ security_perf_event_free(event);
+
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_addr_filters_splice(event, NULL);
+
+ __free_event(event);
+}
+
/*
* Used to free events which have a known refcount of 1, such as in error paths
* where the event isn't exposed yet and inherited events.
@@ -11591,8 +11610,10 @@ static int perf_try_init_event(struct pm
event->destroy(event);
}
- if (ret)
+ if (ret) {
+ event->pmu = NULL;
module_put(pmu->module);
+ }
return ret;
}
@@ -11918,7 +11939,7 @@ perf_event_alloc(struct perf_event_attr
* See perf_output_read().
*/
if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
- goto err_ns;
+ goto err;
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
@@ -11926,7 +11947,7 @@ perf_event_alloc(struct perf_event_attr
pmu = perf_init_event(event);
if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
- goto err_ns;
+ goto err;
}
/*
@@ -11936,24 +11957,24 @@ perf_event_alloc(struct perf_event_attr
*/
if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
err = -EINVAL;
- goto err_pmu;
+ goto err;
}
if (event->attr.aux_output &&
!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
err = -EOPNOTSUPP;
- goto err_pmu;
+ goto err;
}
if (cgroup_fd != -1) {
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
if (err)
- goto err_pmu;
+ goto err;
}
err = exclusive_event_init(event);
if (err)
- goto err_pmu;
+ goto err;
if (has_addr_filter(event)) {
event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
@@ -11961,7 +11982,7 @@ perf_event_alloc(struct perf_event_attr
GFP_KERNEL);
if (!event->addr_filter_ranges) {
err = -ENOMEM;
- goto err_per_task;
+ goto err;
}
/*
@@ -11986,41 +12007,21 @@ perf_event_alloc(struct perf_event_attr
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers(attr->sample_max_stack);
if (err)
- goto err_addr_filters;
+ goto err;
}
}
err = security_perf_event_alloc(event);
if (err)
- goto err_callchain_buffer;
+ goto err;
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
-err_callchain_buffer:
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-err_addr_filters:
- kfree(event->addr_filter_ranges);
-
-err_per_task:
- exclusive_event_destroy(event);
-
-err_pmu:
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
- if (event->destroy)
- event->destroy(event);
- module_put(pmu->module);
-err_ns:
- if (event->hw.target)
- put_task_struct(event->hw.target);
- call_rcu(&event->rcu_head, free_event_rcu);
-
+err:
+ __free_event(event);
return ERR_PTR(err);
}
On Thu, Nov 02, 2023 at 04:09:20PM +0100, Peter Zijlstra wrote:
SNIP
> @@ -11936,24 +11957,24 @@ perf_event_alloc(struct perf_event_attr
> */
> if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
> err = -EINVAL;
> - goto err_pmu;
> + goto err;
> }
>
> if (event->attr.aux_output &&
> !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
> err = -EOPNOTSUPP;
> - goto err_pmu;
> + goto err;
> }
>
> if (cgroup_fd != -1) {
> err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
> if (err)
> - goto err_pmu;
> + goto err;
> }
>
> err = exclusive_event_init(event);
> if (err)
> - goto err_pmu;
> + goto err;
>
> if (has_addr_filter(event)) {
> event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
> @@ -11961,7 +11982,7 @@ perf_event_alloc(struct perf_event_attr
> GFP_KERNEL);
> if (!event->addr_filter_ranges) {
> err = -ENOMEM;
> - goto err_per_task;
> + goto err;
> }
>
> /*
> @@ -11986,41 +12007,21 @@ perf_event_alloc(struct perf_event_attr
> if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
> err = get_callchain_buffers(attr->sample_max_stack);
> if (err)
> - goto err_addr_filters;
> + goto err;
> }
> }
>
> err = security_perf_event_alloc(event);
> if (err)
> - goto err_callchain_buffer;
> + goto err;
>
> /* symmetric to unaccount_event() in _free_event() */
> account_event(event);
>
> return event;
>
> -err_callchain_buffer:
> - if (!event->parent) {
> - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> - put_callchain_buffers();
> - }
hum, so this is now called all the time via __free_event, but it should
be called only if we passed get_callchain_buffers call.. this could screw
up nr_callchain_events number eventually no?
jirka
> -err_addr_filters:
> - kfree(event->addr_filter_ranges);
> -
> -err_per_task:
> - exclusive_event_destroy(event);
> -
> -err_pmu:
> - if (is_cgroup_event(event))
> - perf_detach_cgroup(event);
> - if (event->destroy)
> - event->destroy(event);
> - module_put(pmu->module);
> -err_ns:
> - if (event->hw.target)
> - put_task_struct(event->hw.target);
> - call_rcu(&event->rcu_head, free_event_rcu);
> -
> +err:
> + __free_event(event);
> return ERR_PTR(err);
> }
>
>
>
Hi Jiri and Peter,
On Fri, Nov 3, 2023 at 5:38 AM Jiri Olsa <[email protected]> wrote:
>
> On Thu, Nov 02, 2023 at 04:09:20PM +0100, Peter Zijlstra wrote:
>
> SNIP
>
> > @@ -11936,24 +11957,24 @@ perf_event_alloc(struct perf_event_attr
> > */
> > if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
> > err = -EINVAL;
> > - goto err_pmu;
> > + goto err;
> > }
> >
> > if (event->attr.aux_output &&
> > !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
> > err = -EOPNOTSUPP;
> > - goto err_pmu;
> > + goto err;
> > }
> >
> > if (cgroup_fd != -1) {
> > err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
> > if (err)
> > - goto err_pmu;
> > + goto err;
> > }
> >
> > err = exclusive_event_init(event);
> > if (err)
> > - goto err_pmu;
> > + goto err;
> >
> > if (has_addr_filter(event)) {
> > event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
> > @@ -11961,7 +11982,7 @@ perf_event_alloc(struct perf_event_attr
> > GFP_KERNEL);
> > if (!event->addr_filter_ranges) {
> > err = -ENOMEM;
> > - goto err_per_task;
> > + goto err;
> > }
> >
> > /*
> > @@ -11986,41 +12007,21 @@ perf_event_alloc(struct perf_event_attr
> > if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
> > err = get_callchain_buffers(attr->sample_max_stack);
> > if (err)
> > - goto err_addr_filters;
> > + goto err;
> > }
> > }
> >
> > err = security_perf_event_alloc(event);
> > if (err)
> > - goto err_callchain_buffer;
> > + goto err;
> >
> > /* symmetric to unaccount_event() in _free_event() */
> > account_event(event);
> >
> > return event;
> >
> > -err_callchain_buffer:
> > - if (!event->parent) {
> > - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> > - put_callchain_buffers();
> > - }
>
> hum, so this is now called all the time via __free_event, but it should
> be called only if we passed get_callchain_buffers call.. this could screw
> up nr_callchain_events number eventually no?
Looks like so.
>
> jirka
>
> > -err_addr_filters:
> > - kfree(event->addr_filter_ranges);
> > -
> > -err_per_task:
> > - exclusive_event_destroy(event);
> > -
> > -err_pmu:
> > - if (is_cgroup_event(event))
> > - perf_detach_cgroup(event);
> > - if (event->destroy)
> > - event->destroy(event);
> > - module_put(pmu->module);
I'm afraid of this part. If it failed at perf_init_event(), it might
call event->destroy() already. I saw you cleared event->pmu
in the failure path. Maybe we need the same thing for the
event->destroy.
Thanks,
Namhyung
> > -err_ns:
> > - if (event->hw.target)
> > - put_task_struct(event->hw.target);
> > - call_rcu(&event->rcu_head, free_event_rcu);
> > -
> > +err:
> > + __free_event(event);
> > return ERR_PTR(err);
> > }
> >
> >
> >
On Fri, Nov 03, 2023 at 01:38:05PM +0100, Jiri Olsa wrote:
> > -err_callchain_buffer:
> > - if (!event->parent) {
> > - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> > - put_callchain_buffers();
> > - }
>
> hum, so this is now called all the time via __free_event, but it should
> be called only if we passed get_callchain_buffers call.. this could screw
> up nr_callchain_events number eventually no?
Yes, good catch, thanks!
Something like the below should handle that, no?
---
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -628,14 +628,15 @@ struct swevent_hlist {
struct rcu_head rcu_head;
};
-#define PERF_ATTACH_CONTEXT 0x01
-#define PERF_ATTACH_GROUP 0x02
-#define PERF_ATTACH_TASK 0x04
-#define PERF_ATTACH_TASK_DATA 0x08
-#define PERF_ATTACH_ITRACE 0x10
-#define PERF_ATTACH_SCHED_CB 0x20
-#define PERF_ATTACH_CHILD 0x40
-#define PERF_ATTACH_EXCLUSIVE 0x80
+#define PERF_ATTACH_CONTEXT 0x0001
+#define PERF_ATTACH_GROUP 0x0002
+#define PERF_ATTACH_TASK 0x0004
+#define PERF_ATTACH_TASK_DATA 0x0008
+#define PERF_ATTACH_ITRACE 0x0010
+#define PERF_ATTACH_SCHED_CB 0x0020
+#define PERF_ATTACH_CHILD 0x0040
+#define PERF_ATTACH_EXCLUSIVE 0x0080
+#define PERF_ATTACH_CALLCHAIN 0x0100
struct bpf_prog;
struct perf_cgroup;
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5166,10 +5166,8 @@ static void perf_addr_filters_splice(str
/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
+ if (event->attach_state & PERF_ATTACH_CALLCHAIN)
+ put_callchain_buffers();
kfree(event->addr_filter_ranges);
@@ -12065,6 +12063,7 @@ perf_event_alloc(struct perf_event_attr
err = get_callchain_buffers(attr->sample_max_stack);
if (err)
goto err;
+ event->attach_state |= PERF_ATTACH_CALLCHAIN;
}
}
On Fri, Nov 03, 2023 at 12:50:19PM -0700, Namhyung Kim wrote:
> > > -err_pmu:
> > > - if (is_cgroup_event(event))
> > > - perf_detach_cgroup(event);
> > > - if (event->destroy)
> > > - event->destroy(event);
> > > - module_put(pmu->module);
>
> I'm afraid of this part. If it failed at perf_init_event(), it might
> call event->destroy() already. I saw you cleared event->pmu
> in the failure path. Maybe we need the same thing for the
> event->destroy.
In that case event->destroy will not yet have been set, no?
And once it is set, we should call it.
On Wed, Nov 15, 2023 at 1:58 AM Peter Zijlstra <[email protected]> wrote:
>
> On Fri, Nov 03, 2023 at 12:50:19PM -0700, Namhyung Kim wrote:
>
> > > > -err_pmu:
> > > > - if (is_cgroup_event(event))
> > > > - perf_detach_cgroup(event);
> > > > - if (event->destroy)
> > > > - event->destroy(event);
> > > > - module_put(pmu->module);
> >
> > I'm afraid of this part. If it failed at perf_init_event(), it might
> > call event->destroy() already. I saw you cleared event->pmu
> > in the failure path. Maybe we need the same thing for the
> > event->destroy.
>
> In that case event->destroy will not yet have been set, no?
In perf_try_init_event(), it calls event->destroy() if set when it
has EXTENDED_REGS or NO_EXCLUDE capabilities and returns an error.
Thanks,
Namhyung