2023-05-04 11:03:47

by Ravi Bangoria

[permalink] [raw]
Subject: [PATCH v4 1/4] perf/core: Rework forwarding of {task|cpu}-clock events

Currently, PERF_TYPE_SOFTWARE is treated specially since task-clock and
cpu-clock events are interfaced through it but internally gets forwarded
to their own pmus.

Rework this by overwriting event->attr.type in perf_swevent_init() which
will cause perf_init_event() to retry with updated type and event will
automatically get forwarded to right pmu. With the change, SW pmu no
longer needs to be treated specially and can be included in 'pmu_idr'
list.

Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Ravi Bangoria <[email protected]>
---
include/linux/perf_event.h | 10 +++++
kernel/events/core.c | 77 ++++++++++++++++++++------------------
2 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d5628a7b5eaa..5c8a748f51ac 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -295,6 +295,8 @@ struct perf_event_pmu_context;

struct perf_output_handle;

+#define PMU_NULL_DEV ((void *)(~0))
+
/**
* struct pmu - generic performance monitoring unit
*/
@@ -827,6 +829,14 @@ struct perf_event {
void *security;
#endif
struct list_head sb_list;
+
+ /*
+ * Certain events gets forwarded to another pmu internally by over-
+ * writing kernel copy of event->attr.type without user being aware
+ * of it. event->orig_type contains original 'type' requested by
+ * user.
+ */
+ __u32 orig_type;
#endif /* CONFIG_PERF_EVENTS */
};

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 435815d3be3f..0695bb9fbbb6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6647,7 +6647,7 @@ static void perf_sigtrap(struct perf_event *event)
return;

send_sig_perf((void __user *)event->pending_addr,
- event->attr.type, event->attr.sig_data);
+ event->orig_type, event->attr.sig_data);
}

/*
@@ -9951,6 +9951,9 @@ static void sw_perf_event_destroy(struct perf_event *event)
swevent_hlist_put();
}

+static struct pmu perf_cpu_clock; /* fwd declaration */
+static struct pmu perf_task_clock;
+
static int perf_swevent_init(struct perf_event *event)
{
u64 event_id = event->attr.config;
@@ -9966,7 +9969,10 @@ static int perf_swevent_init(struct perf_event *event)

switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
+ event->attr.type = perf_cpu_clock.type;
+ return -ENOENT;
case PERF_COUNT_SW_TASK_CLOCK:
+ event->attr.type = perf_task_clock.type;
return -ENOENT;

default:
@@ -11086,7 +11092,7 @@ static void cpu_clock_event_read(struct perf_event *event)

static int cpu_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_cpu_clock.type)
return -ENOENT;

if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
@@ -11107,6 +11113,7 @@ static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,

.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,

.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
@@ -11167,7 +11174,7 @@ static void task_clock_event_read(struct perf_event *event)

static int task_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_task_clock.type)
return -ENOENT;

if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
@@ -11188,6 +11195,7 @@ static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,

.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,

.event_init = task_clock_event_init,
.add = task_clock_event_add,
@@ -11415,31 +11423,31 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto unlock;

pmu->type = -1;
- if (!name)
- goto skip_type;
+ if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;

- if (type != PERF_TYPE_SOFTWARE) {
- if (type >= 0)
- max = type;
+ if (type >= 0)
+ max = type;

- ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
- if (ret < 0)
- goto free_pdc;
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ if (ret < 0)
+ goto free_pdc;

- WARN_ON(type >= 0 && ret != type);
+ WARN_ON(type >= 0 && ret != type);

- type = ret;
- }
+ type = ret;
pmu->type = type;

- if (pmu_bus_running) {
+ if (pmu_bus_running && !pmu->dev) {
ret = pmu_dev_alloc(pmu);
if (ret)
goto free_idr;
}

-skip_type:
ret = -ENOMEM;
pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
if (!pmu->cpu_pmu_context)
@@ -11481,16 +11489,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;

- /*
- * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
- * since these cannot be in the IDR. This way the linear search
- * is fast, provided a valid software event is provided.
- */
- if (type == PERF_TYPE_SOFTWARE || !name)
- list_add_rcu(&pmu->entry, &pmus);
- else
- list_add_tail_rcu(&pmu->entry, &pmus);
-
+ list_add_rcu(&pmu->entry, &pmus);
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
@@ -11499,12 +11498,13 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
return ret;

free_dev:
- device_del(pmu->dev);
- put_device(pmu->dev);
+ if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }

free_idr:
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
+ idr_remove(&pmu_idr, pmu->type);

free_pdc:
free_percpu(pmu->pmu_disable_count);
@@ -11525,9 +11525,8 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();

free_percpu(pmu->pmu_disable_count);
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
- if (pmu_bus_running) {
+ idr_remove(&pmu_idr, pmu->type);
+ if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
if (pmu->nr_addr_filters)
device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
device_del(pmu->dev);
@@ -11601,6 +11600,12 @@ static struct pmu *perf_init_event(struct perf_event *event)

idx = srcu_read_lock(&pmus_srcu);

+ /*
+ * Save original type before calling pmu->event_init() since certain
+ * pmus overwrites event->attr.type to forward event to another pmu.
+ */
+ event->orig_type = event->attr.type;
+
/* Try parent's PMU first: */
if (event->parent && event->parent->pmu) {
pmu = event->parent->pmu;
@@ -13640,8 +13645,8 @@ void __init perf_event_init(void)
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
- perf_pmu_register(&perf_cpu_clock, NULL, -1);
- perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
+ perf_pmu_register(&perf_task_clock, "task_clock", -1);
perf_tp_register();
perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
@@ -13684,7 +13689,7 @@ static int __init perf_event_sysfs_init(void)
goto unlock;

list_for_each_entry(pmu, &pmus, entry) {
- if (!pmu->name || pmu->type < 0)
+ if (pmu->dev)
continue;

ret = pmu_dev_alloc(pmu);
--
2.40.0


Subject: [tip: perf/core] perf/core: Rework forwarding of {task|cpu}-clock events

The following commit has been merged into the perf/core branch of tip:

Commit-ID: 0d6d062ca27ec7ef547712d34dcfcfb952bcef53
Gitweb: https://git.kernel.org/tip/0d6d062ca27ec7ef547712d34dcfcfb952bcef53
Author: Ravi Bangoria <[email protected]>
AuthorDate: Thu, 04 May 2023 16:30:00 +05:30
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Mon, 08 May 2023 10:58:30 +02:00

perf/core: Rework forwarding of {task|cpu}-clock events

Currently, PERF_TYPE_SOFTWARE is treated specially since task-clock and
cpu-clock events are interfaced through it but internally gets forwarded
to their own pmus.

Rework this by overwriting event->attr.type in perf_swevent_init() which
will cause perf_init_event() to retry with updated type and event will
automatically get forwarded to right pmu. With the change, SW pmu no
longer needs to be treated specially and can be included in 'pmu_idr'
list.

Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Ravi Bangoria <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/linux/perf_event.h | 10 +++++-
kernel/events/core.c | 77 +++++++++++++++++++------------------
2 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d5628a7..bf4f346 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -295,6 +295,8 @@ struct perf_event_pmu_context;

struct perf_output_handle;

+#define PMU_NULL_DEV ((void *)(~0UL))
+
/**
* struct pmu - generic performance monitoring unit
*/
@@ -827,6 +829,14 @@ struct perf_event {
void *security;
#endif
struct list_head sb_list;
+
+ /*
+ * Certain events gets forwarded to another pmu internally by over-
+ * writing kernel copy of event->attr.type without user being aware
+ * of it. event->orig_type contains original 'type' requested by
+ * user.
+ */
+ __u32 orig_type;
#endif /* CONFIG_PERF_EVENTS */
};

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 68baa81..c01bbe9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6647,7 +6647,7 @@ static void perf_sigtrap(struct perf_event *event)
return;

send_sig_perf((void __user *)event->pending_addr,
- event->attr.type, event->attr.sig_data);
+ event->orig_type, event->attr.sig_data);
}

/*
@@ -9951,6 +9951,9 @@ static void sw_perf_event_destroy(struct perf_event *event)
swevent_hlist_put();
}

+static struct pmu perf_cpu_clock; /* fwd declaration */
+static struct pmu perf_task_clock;
+
static int perf_swevent_init(struct perf_event *event)
{
u64 event_id = event->attr.config;
@@ -9966,7 +9969,10 @@ static int perf_swevent_init(struct perf_event *event)

switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
+ event->attr.type = perf_cpu_clock.type;
+ return -ENOENT;
case PERF_COUNT_SW_TASK_CLOCK:
+ event->attr.type = perf_task_clock.type;
return -ENOENT;

default:
@@ -11086,7 +11092,7 @@ static void cpu_clock_event_read(struct perf_event *event)

static int cpu_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_cpu_clock.type)
return -ENOENT;

if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
@@ -11107,6 +11113,7 @@ static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,

.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,

.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
@@ -11167,7 +11174,7 @@ static void task_clock_event_read(struct perf_event *event)

static int task_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_task_clock.type)
return -ENOENT;

if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
@@ -11188,6 +11195,7 @@ static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,

.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,

.event_init = task_clock_event_init,
.add = task_clock_event_add,
@@ -11415,31 +11423,31 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto unlock;

pmu->type = -1;
- if (!name)
- goto skip_type;
+ if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;

- if (type != PERF_TYPE_SOFTWARE) {
- if (type >= 0)
- max = type;
+ if (type >= 0)
+ max = type;

- ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
- if (ret < 0)
- goto free_pdc;
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ if (ret < 0)
+ goto free_pdc;

- WARN_ON(type >= 0 && ret != type);
+ WARN_ON(type >= 0 && ret != type);

- type = ret;
- }
+ type = ret;
pmu->type = type;

- if (pmu_bus_running) {
+ if (pmu_bus_running && !pmu->dev) {
ret = pmu_dev_alloc(pmu);
if (ret)
goto free_idr;
}

-skip_type:
ret = -ENOMEM;
pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
if (!pmu->cpu_pmu_context)
@@ -11481,16 +11489,7 @@ skip_type:
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;

- /*
- * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
- * since these cannot be in the IDR. This way the linear search
- * is fast, provided a valid software event is provided.
- */
- if (type == PERF_TYPE_SOFTWARE || !name)
- list_add_rcu(&pmu->entry, &pmus);
- else
- list_add_tail_rcu(&pmu->entry, &pmus);
-
+ list_add_rcu(&pmu->entry, &pmus);
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
@@ -11499,12 +11498,13 @@ unlock:
return ret;

free_dev:
- device_del(pmu->dev);
- put_device(pmu->dev);
+ if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }

free_idr:
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
+ idr_remove(&pmu_idr, pmu->type);

free_pdc:
free_percpu(pmu->pmu_disable_count);
@@ -11525,9 +11525,8 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();

free_percpu(pmu->pmu_disable_count);
- if (pmu->type != PERF_TYPE_SOFTWARE)
- idr_remove(&pmu_idr, pmu->type);
- if (pmu_bus_running) {
+ idr_remove(&pmu_idr, pmu->type);
+ if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
if (pmu->nr_addr_filters)
device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
device_del(pmu->dev);
@@ -11601,6 +11600,12 @@ static struct pmu *perf_init_event(struct perf_event *event)

idx = srcu_read_lock(&pmus_srcu);

+ /*
+ * Save original type before calling pmu->event_init() since certain
+ * pmus overwrites event->attr.type to forward event to another pmu.
+ */
+ event->orig_type = event->attr.type;
+
/* Try parent's PMU first: */
if (event->parent && event->parent->pmu) {
pmu = event->parent->pmu;
@@ -13640,8 +13645,8 @@ void __init perf_event_init(void)
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
- perf_pmu_register(&perf_cpu_clock, NULL, -1);
- perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
+ perf_pmu_register(&perf_task_clock, "task_clock", -1);
perf_tp_register();
perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
@@ -13684,7 +13689,7 @@ static int __init perf_event_sysfs_init(void)
goto unlock;

list_for_each_entry(pmu, &pmus, entry) {
- if (!pmu->name || pmu->type < 0)
+ if (pmu->dev)
continue;

ret = pmu_dev_alloc(pmu);

2024-02-20 08:47:00

by Pengfei Xu

[permalink] [raw]
Subject: Re: [PATCH v4 1/4] perf/core: Rework forwarding of {task|cpu}-clock events

Hi Ravi Bangoria,

On 2023-05-04 at 16:30:00 +0530, Ravi Bangoria wrote:
> Currently, PERF_TYPE_SOFTWARE is treated specially since task-clock and
> cpu-clock events are interfaced through it but internally gets forwarded
> to their own pmus.
>
> Rework this by overwriting event->attr.type in perf_swevent_init() which
> will cause perf_init_event() to retry with updated type and event will
> automatically get forwarded to right pmu. With the change, SW pmu no
> longer needs to be treated specially and can be included in 'pmu_idr'
> list.
>
> Suggested-by: Peter Zijlstra <[email protected]>
> Signed-off-by: Ravi Bangoria <[email protected]>
> ---
> include/linux/perf_event.h | 10 +++++
> kernel/events/core.c | 77 ++++++++++++++++++++------------------
> 2 files changed, 51 insertions(+), 36 deletions(-)

Greeting!
There is task hung in perf_tp_event_init in v6.8-rc4 in guest.

All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240219_055325_perf_tp_event_init
Syzkaller reproduced code: https://github.com/xupengfe/syzkaller_logs/blob/main/240219_055325_perf_tp_event_init/repro.c
Syzkaller reproduced syscall steps: https://github.com/xupengfe/syzkaller_logs/blob/main/240219_055325_perf_tp_event_init/repro.prog
Kconfig(make olddefconfig): https://github.com/xupengfe/syzkaller_logs/blob/main/240219_055325_perf_tp_event_init/kconfig_origin
Bisect info: https://github.com/xupengfe/syzkaller_logs/blob/main/240219_055325_perf_tp_event_init/bisect_info.log
v6.8-rc4 issue dmesg: https://github.com/xupengfe/syzkaller_logs/blob/main/240219_055325_perf_tp_event_init/841c35169323cd833294798e58b9bf63fa4fa1de_dmesg.log
v6.8-rc4 bzImage: https://github.com/xupengfe/syzkaller_logs/raw/main/240219_055325_perf_tp_event_init/bzimage_v6.8-rc4.tar.gz

Bisected and found bad commit:
0d6d062ca27e perf/core: Rework forwarding of {task|cpu}-clock events

It could be reproduced in 3600s.
After reverted the above commit on top of v6.8-rc4 kernel, this issue was gone.

Syzkaller repro.report:
"
clocksource: Long readout interval, skipping watchdog check: cs_nsec: 2247936849 wd_nsec: 2247937071
INFO: task syz-executor703:819 blocked for more than 143 seconds.
Not tainted 6.8.0-rc4-4e6c5b0f4ced+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz-executor703 state:D stack:0 pid:819 tgid:819 ppid:760 flags:0x00000004
Call Trace:
<TASK>
context_switch kernel/sched/core.c:5400 [inline]
__schedule+0xa7d/0x2810 kernel/sched/core.c:6727
__schedule_loop kernel/sched/core.c:6802 [inline]
schedule+0xd0/0x290 kernel/sched/core.c:6817
schedule_preempt_disabled+0x1c/0x30 kernel/sched/core.c:6874
__mutex_lock_common kernel/locking/mutex.c:684 [inline]
__mutex_lock+0xf63/0x1b50 kernel/locking/mutex.c:752
mutex_lock_nested+0x1f/0x30 kernel/locking/mutex.c:804
perf_trace_init+0x5c/0x310 kernel/trace/trace_event_perf.c:221
perf_tp_event_init+0xaf/0x130 kernel/events/core.c:10106
perf_try_init_event+0x13f/0x5a0 kernel/events/core.c:11672
perf_init_event kernel/events/core.c:11742 [inline]
perf_event_alloc kernel/events/core.c:12022 [inline]
perf_event_alloc+0x1069/0x3e40 kernel/events/core.c:11888
__do_sys_perf_event_open+0x48e/0x2c40 kernel/events/core.c:12529
__se_sys_perf_event_open kernel/events/core.c:12420 [inline]
__x64_sys_perf_event_open+0xc7/0x160 kernel/events/core.c:12420
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x74/0x150 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x6e/0x76
RIP: 0033:0x45c36d
RSP: 002b:00007fffec7b5478 EFLAGS: 00000246 ORIG_RAX: 000000000000012a
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000000000045c36d
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000020000080
RBP: 00000000000b8995 R08: 0000000000000000 R09: 000000000000031a
R10: 00000000ffffffff R11: 0000000000000246 R12: 00000000ffffffff
R13: 0000000000000000 R14: 00000000004f9018 R15: 0000000000000000
</TASK>
INFO: task syz-executor703:821 blocked for more than 143 seconds.
Not tainted 6.8.0-rc4-4e6c5b0f4ced+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz-executor703 state:D stack:0 pid:821 tgid:821 ppid:767 flags:0x00000004
Call Trace:
<TASK>
context_switch kernel/sched/core.c:5400 [inline]
__schedule+0xa7d/0x2810 kernel/sched/core.c:6727
__schedule_loop kernel/sched/core.c:6802 [inline]
schedule+0xd0/0x290 kernel/sched/core.c:6817
schedule_preempt_disabled+0x1c/0x30 kernel/sched/core.c:6874
__mutex_lock_common kernel/locking/mutex.c:684 [inline]
__mutex_lock+0xf63/0x1b50 kernel/locking/mutex.c:752
mutex_lock_nested+0x1f/0x30 kernel/locking/mutex.c:804
perf_trace_init+0x5c/0x310 kernel/trace/trace_event_perf.c:221
perf_tp_event_init+0xaf/0x130 kernel/events/core.c:10106
perf_try_init_event+0x13f/0x5a0 kernel/events/core.c:11672
perf_init_event kernel/events/core.c:11742 [inline]
perf_event_alloc kernel/events/core.c:12022 [inline]
perf_event_alloc+0x1069/0x3e40 kernel/events/core.c:11888
__do_sys_perf_event_open+0x48e/0x2c40 kernel/events/core.c:12529
__se_sys_perf_event_open kernel/events/core.c:12420 [inline]
__x64_sys_perf_event_open+0xc7/0x160 kernel/events/core.c:12420
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x74/0x150 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x6e/0x76
RIP: 0033:0x45c36d
RSP: 002b:00007fffec7b5478 EFLAGS: 00000246 ORIG_RAX: 000000000000012a
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000000000045c36d
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000020000080
RBP: 00000000000b8bea R08: 0000000000000000 R09: 000000000000031e
R10: 00000000ffffffff R11: 0000000000000246 R12: 00000000ffffffff
R13: 0000000000000000 R14: 00000000004f9018 R15: 0000000000000000
"

Thanks!

---

If you don't need the following environment to reproduce the problem or if you
already have one reproduced environment, please ignore the following information.

How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0
// start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
// You could change the bzImage_xxx as you want
// Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost

After login vm(virtual machine) successfully, you could transfer reproduced
binary to the vm by below way, and reproduce the problem in vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/

Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage //x should equal or less than cpu num your pc has

Fill the bzImage file into above start3.sh to load the target kernel in vm.


Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
./configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install

Best Regards,
Thanks!

>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index d5628a7b5eaa..5c8a748f51ac 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -295,6 +295,8 @@ struct perf_event_pmu_context;
>
> struct perf_output_handle;
>
> +#define PMU_NULL_DEV ((void *)(~0))
> +
> /**
> * struct pmu - generic performance monitoring unit
> */
> @@ -827,6 +829,14 @@ struct perf_event {
> void *security;
> #endif
> struct list_head sb_list;
> +
> + /*
> + * Certain events gets forwarded to another pmu internally by over-
> + * writing kernel copy of event->attr.type without user being aware
> + * of it. event->orig_type contains original 'type' requested by
> + * user.
> + */
> + __u32 orig_type;
> #endif /* CONFIG_PERF_EVENTS */
> };
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 435815d3be3f..0695bb9fbbb6 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6647,7 +6647,7 @@ static void perf_sigtrap(struct perf_event *event)
> return;
>
> send_sig_perf((void __user *)event->pending_addr,
> - event->attr.type, event->attr.sig_data);
> + event->orig_type, event->attr.sig_data);
> }
>
> /*
> @@ -9951,6 +9951,9 @@ static void sw_perf_event_destroy(struct perf_event *event)
> swevent_hlist_put();
> }
>
> +static struct pmu perf_cpu_clock; /* fwd declaration */
> +static struct pmu perf_task_clock;
> +
> static int perf_swevent_init(struct perf_event *event)
> {
> u64 event_id = event->attr.config;
> @@ -9966,7 +9969,10 @@ static int perf_swevent_init(struct perf_event *event)
>
> switch (event_id) {
> case PERF_COUNT_SW_CPU_CLOCK:
> + event->attr.type = perf_cpu_clock.type;
> + return -ENOENT;
> case PERF_COUNT_SW_TASK_CLOCK:
> + event->attr.type = perf_task_clock.type;
> return -ENOENT;
>
> default:
> @@ -11086,7 +11092,7 @@ static void cpu_clock_event_read(struct perf_event *event)
>
> static int cpu_clock_event_init(struct perf_event *event)
> {
> - if (event->attr.type != PERF_TYPE_SOFTWARE)
> + if (event->attr.type != perf_cpu_clock.type)
> return -ENOENT;
>
> if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
> @@ -11107,6 +11113,7 @@ static struct pmu perf_cpu_clock = {
> .task_ctx_nr = perf_sw_context,
>
> .capabilities = PERF_PMU_CAP_NO_NMI,
> + .dev = PMU_NULL_DEV,
>
> .event_init = cpu_clock_event_init,
> .add = cpu_clock_event_add,
> @@ -11167,7 +11174,7 @@ static void task_clock_event_read(struct perf_event *event)
>
> static int task_clock_event_init(struct perf_event *event)
> {
> - if (event->attr.type != PERF_TYPE_SOFTWARE)
> + if (event->attr.type != perf_task_clock.type)
> return -ENOENT;
>
> if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
> @@ -11188,6 +11195,7 @@ static struct pmu perf_task_clock = {
> .task_ctx_nr = perf_sw_context,
>
> .capabilities = PERF_PMU_CAP_NO_NMI,
> + .dev = PMU_NULL_DEV,
>
> .event_init = task_clock_event_init,
> .add = task_clock_event_add,
> @@ -11415,31 +11423,31 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> goto unlock;
>
> pmu->type = -1;
> - if (!name)
> - goto skip_type;
> + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
> + ret = -EINVAL;
> + goto free_pdc;
> + }
> +
> pmu->name = name;
>
> - if (type != PERF_TYPE_SOFTWARE) {
> - if (type >= 0)
> - max = type;
> + if (type >= 0)
> + max = type;
>
> - ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
> - if (ret < 0)
> - goto free_pdc;
> + ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
> + if (ret < 0)
> + goto free_pdc;
>
> - WARN_ON(type >= 0 && ret != type);
> + WARN_ON(type >= 0 && ret != type);
>
> - type = ret;
> - }
> + type = ret;
> pmu->type = type;
>
> - if (pmu_bus_running) {
> + if (pmu_bus_running && !pmu->dev) {
> ret = pmu_dev_alloc(pmu);
> if (ret)
> goto free_idr;
> }
>
> -skip_type:
> ret = -ENOMEM;
> pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
> if (!pmu->cpu_pmu_context)
> @@ -11481,16 +11489,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> if (!pmu->event_idx)
> pmu->event_idx = perf_event_idx_default;
>
> - /*
> - * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
> - * since these cannot be in the IDR. This way the linear search
> - * is fast, provided a valid software event is provided.
> - */
> - if (type == PERF_TYPE_SOFTWARE || !name)
> - list_add_rcu(&pmu->entry, &pmus);
> - else
> - list_add_tail_rcu(&pmu->entry, &pmus);
> -
> + list_add_rcu(&pmu->entry, &pmus);
> atomic_set(&pmu->exclusive_cnt, 0);
> ret = 0;
> unlock:
> @@ -11499,12 +11498,13 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> return ret;
>
> free_dev:
> - device_del(pmu->dev);
> - put_device(pmu->dev);
> + if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
> + device_del(pmu->dev);
> + put_device(pmu->dev);
> + }
>
> free_idr:
> - if (pmu->type != PERF_TYPE_SOFTWARE)
> - idr_remove(&pmu_idr, pmu->type);
> + idr_remove(&pmu_idr, pmu->type);
>
> free_pdc:
> free_percpu(pmu->pmu_disable_count);
> @@ -11525,9 +11525,8 @@ void perf_pmu_unregister(struct pmu *pmu)
> synchronize_rcu();
>
> free_percpu(pmu->pmu_disable_count);
> - if (pmu->type != PERF_TYPE_SOFTWARE)
> - idr_remove(&pmu_idr, pmu->type);
> - if (pmu_bus_running) {
> + idr_remove(&pmu_idr, pmu->type);
> + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
> if (pmu->nr_addr_filters)
> device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
> device_del(pmu->dev);
> @@ -11601,6 +11600,12 @@ static struct pmu *perf_init_event(struct perf_event *event)
>
> idx = srcu_read_lock(&pmus_srcu);
>
> + /*
> + * Save original type before calling pmu->event_init() since certain
> + * pmus overwrites event->attr.type to forward event to another pmu.
> + */
> + event->orig_type = event->attr.type;
> +
> /* Try parent's PMU first: */
> if (event->parent && event->parent->pmu) {
> pmu = event->parent->pmu;
> @@ -13640,8 +13645,8 @@ void __init perf_event_init(void)
> perf_event_init_all_cpus();
> init_srcu_struct(&pmus_srcu);
> perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
> - perf_pmu_register(&perf_cpu_clock, NULL, -1);
> - perf_pmu_register(&perf_task_clock, NULL, -1);
> + perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
> + perf_pmu_register(&perf_task_clock, "task_clock", -1);
> perf_tp_register();
> perf_event_init_cpu(smp_processor_id());
> register_reboot_notifier(&perf_reboot_notifier);
> @@ -13684,7 +13689,7 @@ static int __init perf_event_sysfs_init(void)
> goto unlock;
>
> list_for_each_entry(pmu, &pmus, entry) {
> - if (!pmu->name || pmu->type < 0)
> + if (pmu->dev)
> continue;
>
> ret = pmu_dev_alloc(pmu);
> --
> 2.40.0
>

2024-02-23 05:28:58

by Ravi Bangoria

[permalink] [raw]
Subject: Re: [PATCH v4 1/4] perf/core: Rework forwarding of {task|cpu}-clock events

Hi Pengfei,

On 20-Feb-24 2:11 PM, Pengfei Xu wrote:
> Hi Ravi Bangoria,
>
> On 2023-05-04 at 16:30:00 +0530, Ravi Bangoria wrote:
>> Currently, PERF_TYPE_SOFTWARE is treated specially since task-clock and
>> cpu-clock events are interfaced through it but internally gets forwarded
>> to their own pmus.
>>
>> Rework this by overwriting event->attr.type in perf_swevent_init() which
>> will cause perf_init_event() to retry with updated type and event will
>> automatically get forwarded to right pmu. With the change, SW pmu no
>> longer needs to be treated specially and can be included in 'pmu_idr'
>> list.
>>
>> Suggested-by: Peter Zijlstra <[email protected]>
>> Signed-off-by: Ravi Bangoria <[email protected]>
>> ---
>> include/linux/perf_event.h | 10 +++++
>> kernel/events/core.c | 77 ++++++++++++++++++++------------------
>> 2 files changed, 51 insertions(+), 36 deletions(-)
>
> Greeting!
> There is task hung in perf_tp_event_init in v6.8-rc4 in guest.

Thanks for the bug report. I'm able to reproduce it. Will try to spend
more time to rootcause it.

Thanks,
Ravi

2024-02-28 12:51:02

by Ravi Bangoria

[permalink] [raw]
Subject: Re: [PATCH v4 1/4] perf/core: Rework forwarding of {task|cpu}-clock events

>>> Currently, PERF_TYPE_SOFTWARE is treated specially since task-clock and
>>> cpu-clock events are interfaced through it but internally gets forwarded
>>> to their own pmus.
>>>
>>> Rework this by overwriting event->attr.type in perf_swevent_init() which
>>> will cause perf_init_event() to retry with updated type and event will
>>> automatically get forwarded to right pmu. With the change, SW pmu no
>>> longer needs to be treated specially and can be included in 'pmu_idr'
>>> list.
>>>
>>> Suggested-by: Peter Zijlstra <[email protected]>
>>> Signed-off-by: Ravi Bangoria <[email protected]>
>>> ---
>>> include/linux/perf_event.h | 10 +++++
>>> kernel/events/core.c | 77 ++++++++++++++++++++------------------
>>> 2 files changed, 51 insertions(+), 36 deletions(-)
>>
>> Greeting!
>> There is task hung in perf_tp_event_init in v6.8-rc4 in guest.
>
> Thanks for the bug report. I'm able to reproduce it. Will try to spend
> more time to rootcause it.

Although the bisect has lead to 0d6d062ca27e as culprit commit, a minor
change (shown below) in the test program can create the same task hang
issue even with 0d6d062ca27e reverted.

- *(uint32_t*)0x200000c0 = 6; /* Use cpu-clock pmu type when 0d6d062ca27e is present */
+ *(uint32_t*)0x200000c0 = 1; /* Use software pmu type when 0d6d062ca27e is absent */

So, 0d6d062ca27e is not the culprit commit.

Additionally,

o I've seen task hang or soft-lockups on a single cpu KVM guest while
running your test as root and also as normal user with
perf_event_paranoid=-1. But the same experiment on host, no lockups,
only task hang. So I feel the bug report is false positive and there
is no real issue (since the experiment requires special privilege).

o 0d6d062ca27e has inadvertently started allowing cpu-clock and task-
clock events creation via their own pmu->type in perf_event_open(),
instead of earlier design where the only interface was through sw
pmu. Is it harmful? Probably not. But worth to be documented:

----><----

From c7ae1c57e2a23a05eb982524d37bc8542c9c9a34 Mon Sep 17 00:00:00 2001
From: Ravi Bangoria <[email protected]>
Date: Wed, 28 Feb 2024 17:29:04 +0530
Subject: [PATCH] perf/core: Document {cpu|task}-clock event open behavior

The standard interface to invoke task-clock and cpu-clock pmu is through
software pmu (see perf_swevent_init()), since these pmus are not exposed
to the user via sysfs and thus user doesn't know their pmu->type. However,
current code allows user to open an event if user has passed correct type
in the perf event attribute. This is not easily apparent from the code and
thus worth to be documented.

Signed-off-by: Ravi Bangoria <[email protected]>
---
kernel/events/core.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f0f0f71213a1..4072bccab3ba 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11178,6 +11178,13 @@ static void cpu_clock_event_read(struct perf_event *event)

static int cpu_clock_event_init(struct perf_event *event)
{
+ /*
+ * The standard interface to invoke task-clock pmu is through software
+ * pmu(see perf_swevent_init()), since task-clock pmu is not exposed to
+ * the user via sysfs and thus user doesn't know perf_task_clock.type.
+ * However, allow user to open an event if user has passed correct type
+ * in the attribute.
+ */
if (event->attr.type != perf_cpu_clock.type)
return -ENOENT;

@@ -11260,6 +11267,13 @@ static void task_clock_event_read(struct perf_event *event)

static int task_clock_event_init(struct perf_event *event)
{
+ /*
+ * The standard interface to invoke task-clock pmu is through software
+ * pmu(see perf_swevent_init()), since task-clock pmu is not exposed to
+ * the user via sysfs and thus user doesn't know perf_task_clock.type.
+ * However, allow user to open an event if user has passed correct type
+ * in the attribute.
+ */
if (event->attr.type != perf_task_clock.type)
return -ENOENT;

--
2.34.1

2024-02-29 03:46:52

by Pengfei Xu

[permalink] [raw]
Subject: Re: [PATCH v4 1/4] perf/core: Rework forwarding of {task|cpu}-clock events

On 2024-02-28 at 20:49:59 +0800, Ravi Bangoria wrote:
> >>> Currently, PERF_TYPE_SOFTWARE is treated specially since task-clock and
> >>> cpu-clock events are interfaced through it but internally gets forwarded
> >>> to their own pmus.
> >>>
> >>> Rework this by overwriting event->attr.type in perf_swevent_init() which
> >>> will cause perf_init_event() to retry with updated type and event will
> >>> automatically get forwarded to right pmu. With the change, SW pmu no
> >>> longer needs to be treated specially and can be included in 'pmu_idr'
> >>> list.
> >>>
> >>> Suggested-by: Peter Zijlstra <[email protected]>
> >>> Signed-off-by: Ravi Bangoria <[email protected]>
> >>> ---
> >>> include/linux/perf_event.h | 10 +++++
> >>> kernel/events/core.c | 77 ++++++++++++++++++++------------------
> >>> 2 files changed, 51 insertions(+), 36 deletions(-)
> >>
> >> Greeting!
> >> There is task hung in perf_tp_event_init in v6.8-rc4 in guest.
> >
> > Thanks for the bug report. I'm able to reproduce it. Will try to spend
> > more time to rootcause it.
>
> Although the bisect has lead to 0d6d062ca27e as culprit commit, a minor
> change (shown below) in the test program can create the same task hang
> issue even with 0d6d062ca27e reverted.
>
> - *(uint32_t*)0x200000c0 = 6; /* Use cpu-clock pmu type when 0d6d062ca27e is present */
> + *(uint32_t*)0x200000c0 = 1; /* Use software pmu type when 0d6d062ca27e is absent */
>
> So, 0d6d062ca27e is not the culprit commit.

Thank you for your analysis and additional verification!

commit 0d6d062ca27e only changes kernel/events/core.c judgement:
PERF_TYPE_SOFTWARE to perf_cpu_clock.type in function cpu_clock_event_init():
"
@@ -11086,7 +11092,7 @@ static void cpu_clock_event_read(struct perf_event *event)

static int cpu_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_cpu_clock.type)
return -ENOENT;
"

After reverted the commit, and change test code 0x200000c0 from 6 to 1,
kernel code in kernel/events/core.c: func perf_swevent_init() still keeps
PERF_TYPE_SOFTWARE(1) judgement because above commit doesn't change judgement
in perf_swevent_init():
"
// PERF_TYPE_SOFTWARE = 1;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
"

So it seems show that attr.type(0x200000c0) = 6 will return -2 and not solve
further action in perf_swevent_init() will not trigger task hang.

And if attr.type(0x200000c0) = 1 will pass the judgement and solve further
action in perf_swevent_init(), and then will trigger task hang.

Thanks for correction if there is something wrong.


>
> Additionally,
>
> o I've seen task hang or soft-lockups on a single cpu KVM guest while
> running your test as root and also as normal user with
> perf_event_paranoid=-1. But the same experiment on host, no lockups,
> only task hang. So I feel the bug report is false positive and there
> is no real issue (since the experiment requires special privilege).

Thanks for your info! Seems some problem will cause cpu soft-lockups.
If there is more clue, will update it.

>
> o 0d6d062ca27e has inadvertently started allowing cpu-clock and task-
> clock events creation via their own pmu->type in perf_event_open(),
> instead of earlier design where the only interface was through sw
> pmu. Is it harmful? Probably not. But worth to be documented:

Thanks a lot for description, and there is some other way to trigger
the perf_event type change.

Thanks!

>
> ----><----
>
> From c7ae1c57e2a23a05eb982524d37bc8542c9c9a34 Mon Sep 17 00:00:00 2001
> From: Ravi Bangoria <[email protected]>
> Date: Wed, 28 Feb 2024 17:29:04 +0530
> Subject: [PATCH] perf/core: Document {cpu|task}-clock event open behavior
>
> The standard interface to invoke task-clock and cpu-clock pmu is through
> software pmu (see perf_swevent_init()), since these pmus are not exposed
> to the user via sysfs and thus user doesn't know their pmu->type. However,
> current code allows user to open an event if user has passed correct type
> in the perf event attribute. This is not easily apparent from the code and
> thus worth to be documented.
>
> Signed-off-by: Ravi Bangoria <[email protected]>
> ---
> kernel/events/core.c | 14 ++++++++++++++
> 1 file changed, 14 insertions(+)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index f0f0f71213a1..4072bccab3ba 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -11178,6 +11178,13 @@ static void cpu_clock_event_read(struct perf_event *event)
>
> static int cpu_clock_event_init(struct perf_event *event)
> {
> + /*
> + * The standard interface to invoke task-clock pmu is through software
> + * pmu(see perf_swevent_init()), since task-clock pmu is not exposed to
> + * the user via sysfs and thus user doesn't know perf_task_clock.type.
> + * However, allow user to open an event if user has passed correct type
> + * in the attribute.
> + */
> if (event->attr.type != perf_cpu_clock.type)
> return -ENOENT;
>
> @@ -11260,6 +11267,13 @@ static void task_clock_event_read(struct perf_event *event)
>
> static int task_clock_event_init(struct perf_event *event)
> {
> + /*
> + * The standard interface to invoke task-clock pmu is through software
> + * pmu(see perf_swevent_init()), since task-clock pmu is not exposed to
> + * the user via sysfs and thus user doesn't know perf_task_clock.type.
> + * However, allow user to open an event if user has passed correct type
> + * in the attribute.
> + */
> if (event->attr.type != perf_task_clock.type)
> return -ENOENT;
>
> --
> 2.34.1