LinuxLists.cc - [RFC PATCH 3/8] perf: Init/fini PMU specific data

2019-11-28 15:19:59

Subject: [RFC PATCH 3/8] perf: Init/fini PMU specific data

From: Kan Liang <[email protected]>

For per-process event, only allocate the space for current task.
The space will be allocated by the first user.

For system-wide event, allocation for all the existing tasks and
upcoming tasks are required. Add variable nr_task_data_events to track
the number of system-wide event.
In perf_event_alloc(), the space for all the existing tasks will be
allocated.
The space for new tasks will be allocated in perf_event_fork().

The allocation may be failed. For per-process event, it error out.
For system-wide event, it doesn't error out, a debug message will be
dumped to system log instead. LBR callstack may be cutoff for the task
which doesn't have the space allocated.

Reviewed-by: Alexey Budankov <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
---
kernel/events/core.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 209 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b4976e0..e519720 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -387,6 +387,10 @@ static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;

+/* Track the number of system-wide event which requires pmu specific data */
+static atomic_t nr_task_data_events;
+static DEFINE_RAW_SPINLOCK(task_data_events_lock);
+
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
@@ -4552,6 +4556,104 @@ init_task_ctx_data_rcu(struct task_struct *task, size_t ctx_size, gfp_t flags)
return ret;
}

+static int
+init_task_ctx_data(struct task_struct *task, size_t ctx_size)
+{
+ struct perf_ctx_data *ctx_data;
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&task->perf_ctx_data_lock, flags);
+
+ ret = __init_task_ctx_data_rcu(task, ctx_size, GFP_KERNEL);
+ if (ret)
+ goto unlock;
+
+ ctx_data = task->perf_ctx_data;
+ /* System-wide event is active as well */
+ if ((ctx_data->refcount == 1) && atomic_read(&nr_task_data_events))
+ ctx_data->refcount++;
+
+unlock:
+ raw_spin_unlock_irqrestore(&task->perf_ctx_data_lock, flags);
+ return ret;
+}
+
+static int
+init_system_wide_ctx_data(size_t ctx_size)
+{
+ struct task_struct *g, *p;
+ int failed_alloc = 0;
+ unsigned long flags;
+
+ /*
+ * Allocate perf_ctx_data for all existing threads by the first event.
+ *
+ * The perf_ctx_data for new thread will be allocated in
+ * perf_event_fork(). The perf_event_fork() is called after the thread
+ * is added into the tasklist. It guarantees that any new threads will
+ * not be missed.
+ */
+ raw_spin_lock_irqsave(&task_data_events_lock, flags);
+ if (atomic_inc_return(&nr_task_data_events) > 1)
+ goto unlock;
+
+ read_lock(&tasklist_lock);
+
+ for_each_process_thread(g, p) {
+ /*
+ * The PMU specific data may already be allocated by
+ * per-process event. Need to update refcounter.
+ * init_task_ctx_data_rcu() is called here.
+ * Do a quick allocation in first round with GFP_ATOMIC.
+ */
+ if (init_task_ctx_data_rcu(p, ctx_size, GFP_ATOMIC))
+ failed_alloc++;
+ }
+
+ /*
+ * Failed to allocate the ctx data for some tasks.
+ * Repeat the allocation.
+ */
+ if (!failed_alloc)
+ goto tasklist_unlock;
+
+ failed_alloc = 0;
+ for_each_process_thread(g, p) {
+ /*
+ * Doesn't need to update refcounter for the task which
+ * is monitored by per-process event, or new created
+ * PMU specific data. They are done in first round allocation.
+ * alloc_task_ctx_data_rcu() is called here.
+ */
+ if (alloc_task_ctx_data_rcu(p, ctx_size, GFP_KERNEL) &&
+ ((++failed_alloc) == 1)) {
+ printk(KERN_DEBUG
+ "Failed to allocate space for LBR callstack for some tasks. "
+ "Their LBR callstack may be cutoff.\n");
+ }
+ }
+
+tasklist_unlock:
+ read_unlock(&tasklist_lock);
+unlock:
+ raw_spin_unlock_irqrestore(&task_data_events_lock, flags);
+
+ return 0;
+}
+
+static int
+init_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+ size_t ctx_size = event->pmu->task_ctx_size;
+
+ if (task)
+ return init_task_ctx_data(task, ctx_size);
+ else
+ return init_system_wide_ctx_data(ctx_size);
+}
+
static void
free_perf_ctx_data(struct rcu_head *rcu_head)
{
@@ -4594,6 +4696,40 @@ fini_task_ctx_data_rcu(struct task_struct *task, bool force)
raw_spin_unlock_irqrestore(&task->perf_ctx_data_lock, flags);
}

+static void fini_task_ctx_data(struct task_struct *task)
+{
+ fini_task_ctx_data_rcu(task, false);
+}
+
+static void fini_system_wide_ctx_data(void)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task_data_events_lock, flags);
+ if (!atomic_dec_and_test(&nr_task_data_events))
+ goto unlock;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p)
+ fini_task_ctx_data_rcu(p, false);
+
+ read_unlock(&tasklist_lock);
+
+unlock:
+ raw_spin_unlock_irqrestore(&task_data_events_lock, flags);
+}
+
+static void fini_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+
+ if (task)
+ fini_task_ctx_data(task);
+ else
+ fini_system_wide_ctx_data();
+}
+
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
@@ -4625,6 +4761,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_dec(&nr_bpf_events);
+ if (event->attach_state & PERF_ATTACH_TASK_DATA)
+ fini_perf_ctx_data(event);

if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7451,10 +7589,72 @@ static void perf_event_task(struct task_struct *task,
task_ctx);
}

+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void perf_event_alloc_task_data(struct task_struct *child,
+ struct task_struct *parent)
+{
+ struct perf_ctx_data *ctx_data;
+ size_t ctx_size = 0;
+ unsigned long flags;
+
+ if (!atomic_read(&nr_task_data_events))
+ return;
+
+ rcu_read_lock();
+ ctx_data = rcu_dereference(parent->perf_ctx_data);
+ if (ctx_data)
+ ctx_size = ctx_data->data_size;
+ rcu_read_unlock();
+
+ if (!ctx_size)
+ return;
+
+ /*
+ * The refcount of a new task may not be updated correctly for some
+ * rare case as below.
+ * CPU A CPU B
+ * perf_event_alloc_task_data(): init_system_wide_ctx_data():
+ * inc(&nr_task_data_events)
+ * read(nr_task_data_events)
+ * alloc_task_ctx_data_rcu()
+ * init_task_ctx_data_rcu()
+ * The refcount of new task is double count.
+ * Lock is required to prevent the case by serializing the allocation.
+ */
+ raw_spin_lock_irqsave(&task_data_events_lock, flags);
+
+ /*
+ * System-wide event may be unaccount when attaching the perf_ctx_data.
+ * For example,
+ * CPU A CPU B
+ * perf_event_alloc_task_data(): fini_system_wide_ctx_data():
+ * read(nr_task_data_events)
+ * fini_task_ctx_data_rcu()
+ * alloc_task_ctx_data_rcu()
+ *
+ * The perf_ctx_data may never be freed until the task is terminated.
+ */
+ if (unlikely(!atomic_read(&nr_task_data_events)))
+ goto unlock;
+
+ if (alloc_task_ctx_data_rcu(child, ctx_size, GFP_KERNEL)) {
+ printk_once(KERN_DEBUG
+ "LBR callstack may be cutoff for task %s (%d) ctx_size %zu\n",
+ child->comm, task_pid_nr(child), ctx_size);
+ }
+
+unlock:
+ raw_spin_unlock_irqrestore(&task_data_events_lock, flags);
+}
+
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
perf_event_namespaces(task);
+ perf_event_alloc_task_data(task, current);
}

/*
@@ -10980,11 +11180,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (err)
goto err_callchain_buffer;

+ if ((event->attach_state & PERF_ATTACH_TASK_DATA) &&
+ init_perf_ctx_data(event))
+ goto err_task_ctx_data;
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);

return event;

+err_task_ctx_data:
+ if (!event->parent && (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+ put_callchain_buffers();
err_callchain_buffer:
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
@@ -12046,6 +12253,8 @@ void perf_event_exit_task(struct task_struct *child)
* At this point we need to send EXIT events to cpu contexts.
*/
perf_event_task(child, NULL, 0);
+
+ fini_task_ctx_data_rcu(child, true);
}

static void perf_free_event(struct perf_event *event,
--
2.7.4

2019-12-02 12:45:05

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

On Thu, Nov 28, 2019 at 07:14:26AM -0800, [email protected] wrote:
> +static int
> +init_system_wide_ctx_data(size_t ctx_size)
> +{
> + struct task_struct *g, *p;
> + int failed_alloc = 0;
> + unsigned long flags;
> +
> + /*
> + * Allocate perf_ctx_data for all existing threads by the first event.
> + *
> + * The perf_ctx_data for new thread will be allocated in
> + * perf_event_fork(). The perf_event_fork() is called after the thread
> + * is added into the tasklist. It guarantees that any new threads will
> + * not be missed.
> + */
> + raw_spin_lock_irqsave(&task_data_events_lock, flags);
> + if (atomic_inc_return(&nr_task_data_events) > 1)
> + goto unlock;
> +
> + read_lock(&tasklist_lock);
> +
> + for_each_process_thread(g, p) {
> + /*
> + * The PMU specific data may already be allocated by
> + * per-process event. Need to update refcounter.
> + * init_task_ctx_data_rcu() is called here.
> + * Do a quick allocation in first round with GFP_ATOMIC.
> + */
> + if (init_task_ctx_data_rcu(p, ctx_size, GFP_ATOMIC))
> + failed_alloc++;
> + }

This is atricous crap. Also it is completely broken for -RT.

2019-12-02 15:06:32

by Andi Kleen

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

>
> This is atricous crap. Also it is completely broken for -RT.

Well can you please suggest how you would implement it instead?

-Andi

2019-12-02 16:25:23

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

On Mon, Dec 02, 2019 at 06:59:57AM -0800, Andi Kleen wrote:
> >
> > This is atricous crap. Also it is completely broken for -RT.
>
> Well can you please suggest how you would implement it instead?

I don't think that is on me; at best I get to explain why it is
completely unacceptible to have O(nr_tasks) and allocations under a
raw_spinlock_t, but I was thinking you'd already know that.

2019-12-02 16:41:39

by Alexey Budankov

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

On 02.12.2019 17:59, Andi Kleen wrote:
>>
>> This is atricous crap. Also it is completely broken for -RT.
>
> Well can you please suggest how you would implement it instead?

FWIW,
An alternative could probably be to make task_ctx_data allocations
on the nearest context switch in, and obvious drawback is slowdown on
this critical path, but it could be amortized by static branches.

Thanks,
Alexey

2019-12-02 17:44:09

by Alexey Budankov

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

On 02.12.2019 19:43, Peter Zijlstra wrote:
> On Mon, Dec 02, 2019 at 07:38:00PM +0300, Alexey Budankov wrote:
>>
>> On 02.12.2019 17:59, Andi Kleen wrote:
>>>>
>>>> This is atricous crap. Also it is completely broken for -RT.
>>>
>>> Well can you please suggest how you would implement it instead?
>>
>> FWIW,
>> An alternative could probably be to make task_ctx_data allocations
>> on the nearest context switch in, and obvious drawback is slowdown on
>> this critical path, but it could be amortized by static branches.
>
> Context switch is under a raw_spinlock_t too.

Indeed, under rq->lock (some of runqueue locks, I suppose), but
as far locking order is not violated a thread shouldn't deadlock.
On the other side it could probably hurt concurrency and
it is more preferable to have task_ctx_data memory pre-allocated
by the time it is assigned on a context switch in.

What if we would create some pool of preallocated task_ctx_data
objects on the first system wide perf_event_open() syscall
and after that:
- already existing threads would take task_ctx_data objects from
the pool without additional locking on the nearest
context switch in;
- newly created threads would allocate task_ctx_data themselves,
atomically checking some global state, possibly at PMU object
- task_ctx_data deallocation would be performed by threads
themselves, at some safe points in time;

Thanks,
Alexey

2019-12-02 19:18:00

by Andi Kleen

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

On Mon, Dec 02, 2019 at 05:21:52PM +0100, Peter Zijlstra wrote:
> On Mon, Dec 02, 2019 at 06:59:57AM -0800, Andi Kleen wrote:
> > >
> > > This is atricous crap. Also it is completely broken for -RT.
> >
> > Well can you please suggest how you would implement it instead?
>
> I don't think that is on me; at best I get to explain why it is

Normally code review is expected to be constructive.

> completely unacceptible to have O(nr_tasks) and allocations under a
> raw_spinlock_t, but I was thinking you'd already know that.

Ok if that's the only problem then a lock breaker + retry
if rescheduling is needed + some limit against live lock
should be sufficient.

-Andi

2019-12-02 20:15:35

by Liang, Kan

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

On 12/2/2019 2:15 PM, Andi Kleen wrote:
> On Mon, Dec 02, 2019 at 05:21:52PM +0100, Peter Zijlstra wrote:
>> On Mon, Dec 02, 2019 at 06:59:57AM -0800, Andi Kleen wrote:
>>>>
>>>> This is atricous crap. Also it is completely broken for -RT.
>>>
>>> Well can you please suggest how you would implement it instead?
>>
>> I don't think that is on me; at best I get to explain why it is
>
> Normally code review is expected to be constructive.
>
>> completely unacceptible to have O(nr_tasks) and allocations under a
>> raw_spinlock_t, but I was thinking you'd already know that.
>
> Ok if that's the only problem then a lock breaker + retry
> if rescheduling is needed + some limit against live lock
> should be sufficient.
>

OK. I will move the allocation out of critical sections.
Here is some pseudo code.

if (atomic_read(&nr_task_data_events))
return;

//get current number of threads
read_lock(&tasklist_lock);
for_each_process_thread(g, p)
num_thread++;
read_unlock(&tasklist_lock);

//allocate the space for them
for (i = 0; i < num_thread; i++)
data[i] = kzalloc(ctx_size, flags);
i = 0;

/*
* Assign the space to tasks
* There may be some new threads created when we allocate space.
* new_task will track its number.
*/
raw_spin_lock_irqsave(&task_data_events_lock, flags);

if (atomic_inc_return(&nr_task_data_events) > 1)
goto unlock;

for_each_process_thread(g, p) {
if (i < num_thread)
p->perf_ctx_data = data[i++];
else
new_task++;
}
raw_spin_unlock_irqrestore(&task_data_events_lock, flags);

if (i < num_thread)
goto end;

/*
* Try again to allocate the space for the task created when
* we first allocate space.
* We don't need to worry about the task created after
* atomic_inc_return(). It will be handled in perf_event_fork().
* Retry one is enough.
*/
for (i = 0; i < new_task; i++)
data[i] = kzalloc(ctx_size, flags);

raw_spin_lock_irqsave(&task_data_events_lock, flags);

for_each_process_thread(g, p) {
if (i < unallocated)
p->perf_ctx_data = data[i++];
else
WARN_ON
}
raw_spin_unlock_irqrestore(&task_data_events_lock, flags);

unlock:
raw_spin_unlock_irqrestore(&task_data_events_lock, flags);

end:
free unused data[]

Thanks,
Kan

2019-12-02 20:21:14

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

On Mon, Dec 02, 2019 at 07:38:00PM +0300, Alexey Budankov wrote:
>
> On 02.12.2019 17:59, Andi Kleen wrote:
> >>
> >> This is atricous crap. Also it is completely broken for -RT.
> >
> > Well can you please suggest how you would implement it instead?
>
> FWIW,
> An alternative could probably be to make task_ctx_data allocations
> on the nearest context switch in, and obvious drawback is slowdown on
> this critical path, but it could be amortized by static branches.

Context switch is under a raw_spinlock_t too.

2019-12-02 20:29:59

by Andi Kleen

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

Looks reasonable to me.

> //get current number of threads
> read_lock(&tasklist_lock);
> for_each_process_thread(g, p)
> num_thread++;
> read_unlock(&tasklist_lock);

I'm sure we have that count somewhere.

>
> //allocate the space for them
> for (i = 0; i < num_thread; i++)
> data[i] = kzalloc(ctx_size, flags);
> i = 0;
>
> /*
> * Assign the space to tasks
> * There may be some new threads created when we allocate space.
> * new_task will track its number.
> */
> raw_spin_lock_irqsave(&task_data_events_lock, flags);
>
> if (atomic_inc_return(&nr_task_data_events) > 1)
> goto unlock;
>
> for_each_process_thread(g, p) {
> if (i < num_thread)
> p->perf_ctx_data = data[i++];
> else
> new_task++;
> }
> raw_spin_unlock_irqrestore(&task_data_events_lock, flags);

Is that lock taken in the context switch?

If not could be a normal spinlock, thus be more RT friendly.

-Andi

2019-12-02 20:45:47

by Liang, Kan

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

On 12/2/2019 3:25 PM, Andi Kleen wrote:
>
> Looks reasonable to me.
>
>> //get current number of threads
>> read_lock(&tasklist_lock);
>> for_each_process_thread(g, p)
>> num_thread++;
>> read_unlock(&tasklist_lock);
>
> I'm sure we have that count somewhere.
>

It looks like we can get the number from global variable "nr_threads"
I will use it in v2.

>>
>> //allocate the space for them
>> for (i = 0; i < num_thread; i++)
>> data[i] = kzalloc(ctx_size, flags);
>> i = 0;
>>
>> /*
>> * Assign the space to tasks
>> * There may be some new threads created when we allocate space.
>> * new_task will track its number.
>> */
>> raw_spin_lock_irqsave(&task_data_events_lock, flags);
>>
>> if (atomic_inc_return(&nr_task_data_events) > 1)
>> goto unlock;
>>
>> for_each_process_thread(g, p) {
>> if (i < num_thread)
>> p->perf_ctx_data = data[i++];
>> else
>> new_task++;
>> }
>> raw_spin_unlock_irqrestore(&task_data_events_lock, flags);
>
> Is that lock taken in the context switch? >
> If not could be a normal spinlock, thus be more RT friendly.
>

It's not in context switch. I will use the normal spinlock to instead.

Thanks,
Kan

2019-12-04 13:13:21

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [RFC PATCH 3/8] perf: Init/fini PMU specific data

On Mon, Dec 02, 2019 at 03:44:34PM -0500, Liang, Kan wrote:

> It's not in context switch. I will use the normal spinlock to instead.

Mutex would make even more sense. And we already have a per-task
perf_event_mutex.

Also, I don't think you need tasklist_lock here, if you set the state
before the iteration, any new clone()s will observe the state and
allocate the storage themselves. Then all you need is RCU iteration of
the tasklist.