2022-12-12 13:04:59

by Like Xu

[permalink] [raw]
Subject: [PATCH RFC 1/8] perf/core: Add *group_leader to perf_event_create_kernel_counter()

From: Like Xu <[email protected]>

Like syscall users, kernel-space perf_event creators may also want to use
the group counter abstraction to gain PMU functionality. In-kernel counter
groups behave much like normal 'single' counters, while following the usual
group semantics.

No functional changes at this time. As an example, KVM could create an
Intel slot event as the group leader, with other topdown metric events as
group members, to emulate the MSR_PERF_METRICS PMU capability for guests.

Cc: Peter Zijlstra <[email protected]>
Cc: Marc Zyngier <[email protected]>
Cc: Fenghua Yu <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Like Xu <[email protected]>
---
arch/arm64/kvm/pmu-emul.c | 4 ++--
arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 ++--
arch/x86/kvm/pmu.c | 2 +-
arch/x86/kvm/vmx/pmu_intel.c | 2 +-
include/linux/perf_event.h | 1 +
kernel/events/core.c | 4 +++-
kernel/events/hw_breakpoint.c | 4 ++--
kernel/events/hw_breakpoint_test.c | 2 +-
kernel/watchdog_hld.c | 2 +-
9 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 24908400e190..11c3386bc86b 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -624,7 +624,7 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)

attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc));

- event = perf_event_create_kernel_counter(&attr, -1, current,
+ event = perf_event_create_kernel_counter(&attr, -1, current, NULL,
kvm_pmu_perf_overflow, pmc);

if (IS_ERR(event)) {
@@ -713,7 +713,7 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void)
attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
attr.sample_period = GENMASK(63, 0);

- event = perf_event_create_kernel_counter(&attr, -1, current,
+ event = perf_event_create_kernel_counter(&attr, -1, current, NULL,
kvm_pmu_perf_overflow, &attr);

if (IS_ERR(event)) {
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index d961ae3ed96e..43e54bb200cd 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -952,12 +952,12 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
u64 tmp;

miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (IS_ERR(miss_event))
goto out;

hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (IS_ERR(hit_event))
goto out_miss;

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index eb594620dd75..f6c8180241d7 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -204,7 +204,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
attr.precise_ip = 3;
}

- event = perf_event_create_kernel_counter(&attr, -1, current,
+ event = perf_event_create_kernel_counter(&attr, -1, current, NULL,
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index f951dc756456..b746381307c7 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -299,7 +299,7 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
}

event = perf_event_create_kernel_counter(&attr, -1,
- current, NULL, NULL);
+ current, NULL, NULL, NULL);
if (IS_ERR(event)) {
pr_debug_ratelimited("%s: failed %ld\n",
__func__, PTR_ERR(event));
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0031f7b4d9ab..5f34e1d0bff8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1023,6 +1023,7 @@ extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
int cpu,
struct task_struct *task,
+ struct perf_event *group_leader,
perf_overflow_handler_t callback,
void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7f04f995c975..f671b1a9a691 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12674,12 +12674,14 @@ SYSCALL_DEFINE5(perf_event_open,
* @attr: attributes of the counter to create
* @cpu: cpu in which the counter is bound
* @task: task to profile (NULL for percpu)
+ * @group_leader: event group leader
* @overflow_handler: callback to trigger when we hit the event
* @context: context data could be used in overflow_handler callback
*/
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
struct task_struct *task,
+ struct perf_event *group_leader,
perf_overflow_handler_t overflow_handler,
void *context)
{
@@ -12694,7 +12696,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
if (attr->aux_output)
return ERR_PTR(-EINVAL);

- event = perf_event_alloc(attr, cpu, task, NULL, NULL,
+ event = perf_event_alloc(attr, cpu, task, group_leader, NULL,
overflow_handler, context, -1);
if (IS_ERR(event)) {
err = PTR_ERR(event);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c3797701339c..65b5b1421e62 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -771,7 +771,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
void *context,
struct task_struct *tsk)
{
- return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
+ return perf_event_create_kernel_counter(attr, -1, tsk, NULL, triggered,
context);
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
@@ -881,7 +881,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,

cpus_read_lock();
for_each_online_cpu(cpu) {
- bp = perf_event_create_kernel_counter(attr, cpu, NULL,
+ bp = perf_event_create_kernel_counter(attr, cpu, NULL, NULL,
triggered, context);
if (IS_ERR(bp)) {
err = PTR_ERR(bp);
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
index c57610f52bb4..b3597df12284 100644
--- a/kernel/events/hw_breakpoint_test.c
+++ b/kernel/events/hw_breakpoint_test.c
@@ -39,7 +39,7 @@ static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int
attr.bp_addr = (unsigned long)&break_vars[idx];
attr.bp_len = HW_BREAKPOINT_LEN_1;
attr.bp_type = HW_BREAKPOINT_RW;
- return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL, NULL);
}

static void unregister_test_bp(struct perf_event **bp)
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..bb755dadba54 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -173,7 +173,7 @@ static int hardlockup_detector_event_create(void)
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);

/* Try to register using hardware perf events */
- evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+ evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, NULL,
watchdog_overflow_callback, NULL);
if (IS_ERR(evt)) {
pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
--
2.38.2


2022-12-12 14:15:50

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH RFC 1/8] perf/core: Add *group_leader to perf_event_create_kernel_counter()

On Mon, 12 Dec 2022 12:58:37 +0000,
Like Xu <[email protected]> wrote:
>
> From: Like Xu <[email protected]>
>
> Like syscalls users, kernel-space perf_event creators may also use group
> counters abstraction to gain pmu functionalities, and an in-kernel counter
> groups behave much like normal 'single' counters, following the group
> semantics-based behavior.
>
> No functional changes at this time. An example will be that KVM creates
> Intel slot event as group leader and other topdown metric events to emulate
> MSR_PERF_METRICS pmu capability for guests.
>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Marc Zyngier <[email protected]>
> Cc: Fenghua Yu <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: Like Xu <[email protected]>
> ---
> arch/arm64/kvm/pmu-emul.c | 4 ++--
> arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 ++--
> arch/x86/kvm/pmu.c | 2 +-
> arch/x86/kvm/vmx/pmu_intel.c | 2 +-
> include/linux/perf_event.h | 1 +
> kernel/events/core.c | 4 +++-
> kernel/events/hw_breakpoint.c | 4 ++--
> kernel/events/hw_breakpoint_test.c | 2 +-
> kernel/watchdog_hld.c | 2 +-
> 9 files changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
> index 24908400e190..11c3386bc86b 100644
> --- a/arch/arm64/kvm/pmu-emul.c
> +++ b/arch/arm64/kvm/pmu-emul.c
> @@ -624,7 +624,7 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
>
> attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc));
>
> - event = perf_event_create_kernel_counter(&attr, -1, current,
> + event = perf_event_create_kernel_counter(&attr, -1, current, NULL,
> kvm_pmu_perf_overflow, pmc);

Wouldn't it be better to have a separate helper that takes the group leader
as a parameter, and reimplement perf_event_create_kernel_counter() in terms
of this helper?

M.

--
Without deviation from the norm, progress is not possible.

2022-12-14 04:09:52

by Ravi Bangoria

[permalink] [raw]
Subject: Re: [PATCH RFC 1/8] perf/core: Add *group_leader to perf_event_create_kernel_counter()

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 7f04f995c975..f671b1a9a691 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -12674,12 +12674,14 @@ SYSCALL_DEFINE5(perf_event_open,
> * @attr: attributes of the counter to create
> * @cpu: cpu in which the counter is bound
> * @task: task to profile (NULL for percpu)
> + * @group_leader: event group leader
> * @overflow_handler: callback to trigger when we hit the event
> * @context: context data could be used in overflow_handler callback
> */
> struct perf_event *
> perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
> struct task_struct *task,
> + struct perf_event *group_leader,
> perf_overflow_handler_t overflow_handler,
> void *context)
> {
> @@ -12694,7 +12696,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
> if (attr->aux_output)
> return ERR_PTR(-EINVAL);
>
> - event = perf_event_alloc(attr, cpu, task, NULL, NULL,
> + event = perf_event_alloc(attr, cpu, task, group_leader, NULL,
> overflow_handler, context, -1);

Grouping involves a lot of complexity. Setting group_leader won't be sufficient.
Please see the perf_event_open() syscall code for more detail.

Thanks,
Ravi

2022-12-15 13:52:16

by Like Xu

[permalink] [raw]
Subject: Re: [PATCH RFC 1/8] perf/core: Add *group_leader to perf_event_create_kernel_counter()

On 14/12/2022 11:52 am, Ravi Bangoria wrote:
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 7f04f995c975..f671b1a9a691 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -12674,12 +12674,14 @@ SYSCALL_DEFINE5(perf_event_open,
>> * @attr: attributes of the counter to create
>> * @cpu: cpu in which the counter is bound
>> * @task: task to profile (NULL for percpu)
>> + * @group_leader: event group leader
>> * @overflow_handler: callback to trigger when we hit the event
>> * @context: context data could be used in overflow_handler callback
>> */
>> struct perf_event *
>> perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
>> struct task_struct *task,
>> + struct perf_event *group_leader,
>> perf_overflow_handler_t overflow_handler,
>> void *context)
>> {
>> @@ -12694,7 +12696,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
>> if (attr->aux_output)
>> return ERR_PTR(-EINVAL);
>>
>> - event = perf_event_alloc(attr, cpu, task, NULL, NULL,
>> + event = perf_event_alloc(attr, cpu, task, group_leader, NULL,
>> overflow_handler, context, -1);
>
> Grouping involves lot of complexities. Setting group_leader won't be sufficient.
> Please see perf_event_open() syscall code for more detail.
>
> Thanks,
> Ravi

This is the main reason why the RFC tag was added. More detailed professional
reviews are encouraged.

AFAICT, there are a number of code gaps to close before grouped events can be
supported in the kernel, but there are also opportunities, as there may be
other new use cases bringing innovation.

I have to confirm this idea with the maintainers first. The alternative is to
create yet another special perf_event, similar to PMC_IDX_FIXED_VLBR, which
schedules the perf_metrics MSR for KVM.

PeterZ, any input?

2022-12-15 13:56:58

by Like Xu

[permalink] [raw]
Subject: Re: [PATCH RFC 1/8] perf/core: Add *group_leader to perf_event_create_kernel_counter()

On 12/12/2022 9:23 pm, Marc Zyngier wrote:
> On Mon, 12 Dec 2022 12:58:37 +0000,
> Like Xu <[email protected]> wrote:
>>
>> From: Like Xu <[email protected]>
>>
>> Like syscalls users, kernel-space perf_event creators may also use group
>> counters abstraction to gain pmu functionalities, and an in-kernel counter
>> groups behave much like normal 'single' counters, following the group
>> semantics-based behavior.
>>
>> No functional changes at this time. An example will be that KVM creates
>> Intel slot event as group leader and other topdown metric events to emulate
>> MSR_PERF_METRICS pmu capability for guests.
>>
>> Cc: Peter Zijlstra <[email protected]>
>> Cc: Marc Zyngier <[email protected]>
>> Cc: Fenghua Yu <[email protected]>
>> Cc: [email protected]
>> Cc: [email protected]
>> Signed-off-by: Like Xu <[email protected]>
>> ---
>> arch/arm64/kvm/pmu-emul.c | 4 ++--
>> arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 ++--
>> arch/x86/kvm/pmu.c | 2 +-
>> arch/x86/kvm/vmx/pmu_intel.c | 2 +-
>> include/linux/perf_event.h | 1 +
>> kernel/events/core.c | 4 +++-
>> kernel/events/hw_breakpoint.c | 4 ++--
>> kernel/events/hw_breakpoint_test.c | 2 +-
>> kernel/watchdog_hld.c | 2 +-
>> 9 files changed, 14 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
>> index 24908400e190..11c3386bc86b 100644
>> --- a/arch/arm64/kvm/pmu-emul.c
>> +++ b/arch/arm64/kvm/pmu-emul.c
>> @@ -624,7 +624,7 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
>>
>> attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc));
>>
>> - event = perf_event_create_kernel_counter(&attr, -1, current,
>> + event = perf_event_create_kernel_counter(&attr, -1, current, NULL,
>> kvm_pmu_perf_overflow, pmc);
>
> Wouldn't it be better to have a separate helper that takes the group leader
> as a parameter, and reimplement perf_event_create_kernel_counter() in term
> of this helper?
>
> M.
>

Applied. It makes the changes more concise; thank you.