2010-11-09 21:45:31

by Peter Zijlstra

[permalink] [raw]
Subject: [RFC][PATCH] perf: sysfs type id

The below is a RFC patch adding dynamic type ids to perf.

We need to represent PMUs in sysfs because we want to allow multiple
(loadable) PMUs and need a way to identify them.

This patch creates a new device class "pmu" and adds a single attribute
"type" to it. This device attribute will expose the dynamic type id as
required by perf_event_attr::type.

The sysfs layout looks like:

[root@westmere ~]# cd /sys/class/pmu/
[root@westmere pmu]# ls -la
total 0
drwxr-xr-x 2 root root 0 2010-11-09 22:22 .
drwxr-xr-x 47 root root 0 2010-11-09 22:22 ..
lrwxrwxrwx 1 root root 0 2010-11-09 22:22 breakpoint -> ../../devices/virtual/pmu/breakpoint
lrwxrwxrwx 1 root root 0 2010-11-09 22:22 cpu -> ../../devices/virtual/pmu/cpu
lrwxrwxrwx 1 root root 0 2010-11-09 22:22 frob -> ../../devices/virtual/pmu/frob
lrwxrwxrwx 1 root root 0 2010-11-09 22:22 software -> ../../devices/virtual/pmu/software
lrwxrwxrwx 1 root root 0 2010-11-09 22:22 tracepoint -> ../../devices/virtual/pmu/tracepoint
[root@westmere pmu]# cd frob/
[root@westmere frob]# ls -la
total 0
drwxr-xr-x 3 root root 0 2010-11-09 22:22 .
drwxr-xr-x 7 root root 0 2010-11-09 22:22 ..
drwxr-xr-x 2 root root 0 2010-11-09 22:23 power
lrwxrwxrwx 1 root root 0 2010-11-09 22:23 subsystem -> ../../../../class/pmu
-r--r--r-- 1 root root 4096 2010-11-09 22:23 type
-rw-r--r-- 1 root root 4096 2010-11-09 22:22 uevent
[root@westmere frob]# cat type
6

Not at all sure what all those power bits mean, Greg?

The idea is to populate the sysfs topology with symlinks to these
devices (have /sys/devices/system/cpu/pmu link to the "cpu" pmu device,
have /sys/devices/system/node/ link to a possible "node" pmu device --
intel uncore, etc..). I'll still have to look at how to create these
symlinks, if anybody got clue please holler ;-)

Furthermore, we can later add an event directory to these devices which
list available events and contain the value required by
perf_event_attr::config.

Comments?

---
arch/x86/include/asm/perf_event.h | 2 -
arch/x86/kernel/cpu/common.c | 2 -
arch/x86/kernel/cpu/perf_event.c | 11 ++-
include/linux/perf_event.h | 7 ++-
init/main.c | 2 +-
kernel/hw_breakpoint.c | 2 +-
kernel/perf_event.c | 121 ++++++++++++++++++++++++++++++++----
7 files changed, 122 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 550e26b..d9d4dae 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -125,7 +125,6 @@ union cpuid10_edx {
#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */

#ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
extern void perf_events_lapic_init(void);

#define PERF_EVENT_INDEX_OFFSET 0
@@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
}

#else
-static inline void init_hw_perf_events(void) { }
static inline void perf_events_lapic_init(void) { }
#endif

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b68bda..9eb2248 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,7 +13,6 @@
#include <linux/io.h>

#include <asm/stackprotector.h>
-#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -894,7 +893,6 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
- init_hw_perf_events();
}

void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ed63101..04d0f3c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1348,7 +1348,7 @@ static void __init pmu_check_apic(void)
pr_info("no hardware sampling interrupt available.\n");
}

-void __init init_hw_perf_events(void)
+static int __init init_hw_perf_events(void)
{
struct event_constraint *c;
int err;
@@ -1363,11 +1363,11 @@ void __init init_hw_perf_events(void)
err = amd_pmu_init();
break;
default:
- return;
+ return 0;
}
if (err != 0) {
pr_cont("no PMU driver, software events only.\n");
- return;
+ return 0;
}

pmu_check_apic();
@@ -1418,9 +1418,12 @@ void __init init_hw_perf_events(void)
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);

- perf_pmu_register(&pmu);
+ perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
perf_cpu_notifier(x86_pmu_notifier);
+
+ return 0;
}
+early_initcall(init_hw_perf_events);

static inline void x86_pmu_read(struct perf_event *event)
{
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 057bf22..aa1117f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -578,6 +578,10 @@ struct perf_event;
struct pmu {
struct list_head entry;

+ struct device *dev;
+ char *name;
+ int type;
+
int * __percpu pmu_disable_count;
struct perf_cpu_context * __percpu pmu_cpu_context;
int task_ctx_nr;
@@ -876,6 +880,7 @@ struct perf_cpu_context {
int exclusive;
struct list_head rotation_list;
int jiffies_interval;
+ int disable_count;
};

struct perf_output_handle {
@@ -891,7 +896,7 @@ struct perf_output_handle {

#ifdef CONFIG_PERF_EVENTS

-extern int perf_pmu_register(struct pmu *pmu);
+extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);

extern int perf_num_counters(void);
diff --git a/init/main.c b/init/main.c
index e59af24..41a0c2f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -588,6 +588,7 @@ asmlinkage void __init start_kernel(void)
sort_main_extable();
trap_init();
mm_init();
+ idr_init_cache();
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
@@ -659,7 +660,6 @@ asmlinkage void __init start_kernel(void)
enable_debug_pagealloc();
kmemleak_init();
debug_objects_mem_init();
- idr_init_cache();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2c9120f..a14ca35 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -641,7 +641,7 @@ static int __init init_hw_breakpoint(void)

constraints_initialized = 1;

- perf_pmu_register(&perf_breakpoint);
+ perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);

return register_die_notifier(&hw_breakpoint_exceptions_nb);

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 517d827..7f0d3ac 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
@@ -22,6 +23,7 @@
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/vmstat.h>
+#include <linux/device.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
@@ -70,14 +72,16 @@ extern __weak const char *perf_pmu_name(void)

void perf_pmu_disable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ int *count = &cpuctx->disable_count;
if (!(*count)++)
pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ int *count = &cpuctx->disable_count;
if (!--(*count))
pmu->pmu_enable(pmu);
}
@@ -4778,7 +4782,7 @@ static struct pmu perf_tracepoint = {

static inline void perf_tp_register(void)
{
- perf_pmu_register(&perf_tracepoint);
+ perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}

static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -5087,6 +5091,9 @@ static void *find_pmu_context(int ctxn)
return NULL;
}

+static struct class *pmu_class;
+static struct idr pmu_idr;
+
static void free_pmu_context(void * __percpu cpu_context)
{
struct pmu *pmu;
@@ -5102,26 +5109,59 @@ static void free_pmu_context(void * __percpu cpu_context)

free_percpu(cpu_context);
out:
+ if (pmu->type >= 0)
+ idr_remove(&pmu_idr, pmu->type);
+
mutex_unlock(&pmus_lock);
+
+ if (pmu->dev)
+ device_unregister(pmu->dev);
}

-int perf_pmu_register(struct pmu *pmu)
+int perf_pmu_register(struct pmu *pmu, char *name, int type)
{
int cpu, ret;

mutex_lock(&pmus_lock);
ret = -ENOMEM;
- pmu->pmu_disable_count = alloc_percpu(int);
- if (!pmu->pmu_disable_count)
- goto unlock;

+ pmu->type = -1;
+ if (!name)
+ goto nodev;
+
+ pmu->name = name;
+ if (type < 0) {
+ int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
+ if (!err) {
+ printk(KERN_ERR "FOO! %d\n", err);
+ goto unlock;
+ }
+ err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
+ if (err) {
+ printk(KERN_ERR "BAR! %d\n", err);
+ ret = err;
+ goto unlock;
+ }
+ }
+ pmu->type = type;
+
+ if (pmu_class) {
+ pmu->dev = device_create(pmu_class, NULL, MKDEV(0, 0),
+ pmu, "%s", pmu->name);
+ if (IS_ERR(pmu->dev)) {
+ ret = PTR_ERR(pmu->dev);
+ goto free_idr;
+ }
+ }
+
+nodev:
pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
if (pmu->pmu_cpu_context)
goto got_cpu_context;

pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
- goto free_pdc;
+ goto free_dev;

for_each_possible_cpu(cpu) {
struct perf_cpu_context *cpuctx;
@@ -5132,6 +5172,7 @@ int perf_pmu_register(struct pmu *pmu)
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
INIT_LIST_HEAD(&cpuctx->rotation_list);
+ cpuctx->disable_count = 0;
}

got_cpu_context:
@@ -5164,8 +5205,13 @@ unlock:

return ret;

-free_pdc:
- free_percpu(pmu->pmu_disable_count);
+free_dev:
+ if (pmu->dev)
+ device_unregister(pmu->dev);
+
+free_idr:
+ if (pmu->type >= 0)
+ idr_remove(&pmu_idr, pmu->type);
goto unlock;
}

@@ -5182,7 +5228,6 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_srcu(&pmus_srcu);
synchronize_rcu();

- free_percpu(pmu->pmu_disable_count);
free_pmu_context(pmu->pmu_cpu_context);
}

@@ -5192,6 +5237,13 @@ struct pmu *perf_init_event(struct perf_event *event)
int idx;

idx = srcu_read_lock(&pmus_srcu);
+
+ rcu_read_lock();
+ pmu = idr_find(&pmu_idr, event->attr.type);
+ rcu_read_unlock();
+ if (pmu)
+ goto unlock;
+
list_for_each_entry_rcu(pmu, &pmus, entry) {
int ret = pmu->event_init(event);
if (!ret)
@@ -6293,13 +6345,54 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
return NOTIFY_OK;
}

+static ssize_t type_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+}
+
+static struct device_attribute pmu_dev_attrs[] = {
+ __ATTR_RO(type),
+ __ATTR_NULL,
+};
+
void __init perf_event_init(void)
{
+ idr_init(&pmu_idr);
+
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
- perf_pmu_register(&perf_swevent);
- perf_pmu_register(&perf_cpu_clock);
- perf_pmu_register(&perf_task_clock);
+ perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+ perf_pmu_register(&perf_cpu_clock, "frob", -1); /* test the dynamic code */
+ perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
perf_cpu_notifier(perf_cpu_notify);
}
+
+int __init perf_event_sysfs_init(void)
+{
+ struct pmu *pmu;
+
+ mutex_lock(&pmus_lock);
+
+ pmu_class = class_create(THIS_MODULE, "pmu");
+ BUG_ON(IS_ERR(pmu_class));
+ pmu_class->dev_attrs = pmu_dev_attrs;
+
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (!pmu->name || pmu->type < 0)
+ continue;
+
+ pmu->dev = device_create(pmu_class, NULL, MKDEV(0, 0),
+ pmu, "%s", pmu->name);
+ if (IS_ERR(pmu->dev))
+ pmu->dev = NULL; /* do we care about the failure? */
+ }
+
+ mutex_unlock(&pmus_lock);
+
+ return 0;
+}
+__initcall(perf_event_sysfs_init);


2010-11-09 22:11:34

by Kay Sievers

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Tue, Nov 9, 2010 at 22:45, Peter Zijlstra <[email protected]> wrote:
> The below is a RFC patch adding dynamic type ids to perf.
>
> We need to represent PMUs in sysfs because we want to allow multiple
> (loadable) PMUs and need a way to identify them.
>
> This patch creates a new device class "pmu" and adds a single attribute
> "type" to it. This device attribute will expose the dynamic type id as
> required by perf_event_attr::type.
>
> The sysfs layout looks like:
>
> [root@westmere ~]# cd /sys/class/pmu/

Please use a 'bus_type' instead of 'class'.

I'm very sure, some day, you'll need global attributes for the pmu
stuff, and class -- unlike bus -- has its own subdir where you can go
wild, without mixing things with the list-of-devices. :)

No new stuff should use 'class', it's not extensible.

Kay

2010-11-09 22:13:50

by Greg KH

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Tue, Nov 09, 2010 at 10:45:19PM +0100, Peter Zijlstra wrote:
> The below is a RFC patch adding dynamic type ids to perf.
>
> We need to represent PMUs in sysfs because we want to allow multiple
> (loadable) PMUs and need a way to identify them.
>
> This patch creates a new device class "pmu" and adds a single attribute
> "type" to it. This device attribute will expose the dynamic type id as
> required by perf_event_attr::type.
>
> The sysfs layout looks like:
>
> [root@westmere ~]# cd /sys/class/pmu/

You missed the embedded track at Plumbers where we talked about never
adding another class to the kernel. Please use bus_id instead for this.

> [root@westmere pmu]# ls -la
> total 0
> drwxr-xr-x 2 root root 0 2010-11-09 22:22 .
> drwxr-xr-x 47 root root 0 2010-11-09 22:22 ..
> lrwxrwxrwx 1 root root 0 2010-11-09 22:22 breakpoint -> ../../devices/virtual/pmu/breakpoint
> lrwxrwxrwx 1 root root 0 2010-11-09 22:22 cpu -> ../../devices/virtual/pmu/cpu
> lrwxrwxrwx 1 root root 0 2010-11-09 22:22 frob -> ../../devices/virtual/pmu/frob
> lrwxrwxrwx 1 root root 0 2010-11-09 22:22 software -> ../../devices/virtual/pmu/software
> lrwxrwxrwx 1 root root 0 2010-11-09 22:22 tracepoint -> ../../devices/virtual/pmu/tracepoint
> [root@westmere pmu]# cd frob/
> [root@westmere frob]# ls -la
> total 0
> drwxr-xr-x 3 root root 0 2010-11-09 22:22 .
> drwxr-xr-x 7 root root 0 2010-11-09 22:22 ..
> drwxr-xr-x 2 root root 0 2010-11-09 22:23 power
> lrwxrwxrwx 1 root root 0 2010-11-09 22:23 subsystem -> ../../../../class/pmu
> -r--r--r-- 1 root root 4096 2010-11-09 22:23 type
> -rw-r--r-- 1 root root 4096 2010-11-09 22:22 uevent
> [root@westmere frob]# cat type
> 6
>
> Not at all sure what all those power bits mean, Greg?

All devices get a "default" power directory by the core, not much you
can do about it.

thanks,

greg k-h

2010-11-09 22:22:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Tue, 2010-11-09 at 23:11 +0100, Kay Sievers wrote:
> On Tue, Nov 9, 2010 at 22:45, Peter Zijlstra <[email protected]> wrote:
> > The below is a RFC patch adding dynamic type ids to perf.
> >
> > We need to represent PMUs in sysfs because we want to allow multiple
> > (loadable) PMUs and need a way to identify them.
> >
> > This patch creates a new device class "pmu" and adds a single attribute
> > "type" to it. This device attribute will expose the dynamic type id as
> > required by perf_event_attr::type.
> >
> > The sysfs layout looks like:
> >
> > [root@westmere ~]# cd /sys/class/pmu/
>
> Please use a 'bus_type' instead of 'class'.
>
> I'm very sure, some day, you'll need global attributes for the pmu
> stuff, and class -- unlike bus -- has its own subdir where you can go
> wild, without mixing things with the list-of-devices. :)

Having its own subdir sounds like a pro, so I'm somewhat confused. Also
the pmu (or event_source as Ingo would like it getting called) seems
like a class of devices not a bus.

USB/PCI/I2C/ISA are all buses.. pmu/event_source not so much, they are
very different (sometimes even pure software) things that provide a
common interface -- they generate events which we can count and sample.

/me dazed & confused, please do explain.

> No new stuff should use 'class', it's not extensible.

Hrm.. when I introduced the bdi stuff class was _the_ thing to use.

2010-11-09 22:41:01

by Kay Sievers

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Tue, Nov 9, 2010 at 23:22, Peter Zijlstra <[email protected]> wrote:
> On Tue, 2010-11-09 at 23:11 +0100, Kay Sievers wrote:
>> On Tue, Nov 9, 2010 at 22:45, Peter Zijlstra <[email protected]> wrote:
>> > The below is a RFC patch adding dynamic type ids to perf.
>> >
>> > We need to represent PMUs in sysfs because we want to allow multiple
>> > (loadable) PMUs and need a way to identify them.
>> >
>> > This patch creates a new device class "pmu" and adds a single attribute
>> > "type" to it. This device attribute will expose the dynamic type id as
>> > required by perf_event_attr::type.
>> >
>> > The sysfs layout looks like:
>> >
>> > [root@westmere ~]# cd /sys/class/pmu/
>>
>> Please use a 'bus_type' instead of 'class'.
>>
>> I'm very sure, some day, you'll need global attributes for the pmu
>> stuff, and class -- unlike bus -- has its own subdir where you can go
>> wild, without mixing things with the list-of-devices. :)
>
> Having its own subdir sounds like a pro, so I'm somewhat confused. Also
> the pmu (or event_source as Ingo would like it getting called) seems
> like a class of devices not a bus.
>
> USB/PCI/I2C/ISA are all buses.. pmu/event_source not so much, they are
> very different (sometimes even pure software) things that provide a
> common interface -- they generate events which we can count and sample.
>
> /me dazed & confused, please do explain.

Don't get confused by the 'bus' name, it's used for many things which
are not hardware buses. Class is just almost the same as a bus inside
and outside the kernel. Things like libudev don't even allow you to
distinguish the both, there is only a 'subsystem'. The class export in
/sys is not extensible at all, and therefore it should be avoided for
new stuff.

We will make all classes and buses show up in /sys/subsystem/. The
current silly split between a class and a bus has absolutely no
consistent meaning, and serves no purpose. After /sys/subsystem/
exists, class and bus will only be compat links pointing to the
entries in /sys/subsystem/.

There is _one_ tree at /sys/devices/, and will be _one_ classification
at /sys/subsystem/ which lists all-devices-of-the-same-subsystem and
has an extensible subdir like bus has today.

It's partly described in some outdated version in Documentation/sysfs-rules.txt.

>> No new stuff should use 'class', it's not extensible.
>
> Hrm.. when I introduced the bdi stuff class was _the_ thing to use.

Yeah, it does really matter for very simple things, but never for
stuff that reads -- before it even exists -- as "we will add links to
... and subbdirs, and have ...' :)

Kay

2010-11-09 23:36:54

by Michael Ellerman

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Tue, 2010-11-09 at 14:13 -0800, Greg KH wrote:
> On Tue, Nov 09, 2010 at 10:45:19PM +0100, Peter Zijlstra wrote:
> > The below is a RFC patch adding dynamic type ids to perf.
> >
> > We need to represent PMUs in sysfs because we want to allow multiple
> > (loadable) PMUs and need a way to identify them.
> >
> > This patch creates a new device class "pmu" and adds a single attribute
> > "type" to it. This device attribute will expose the dynamic type id as
> > required by perf_event_attr::type.
> >
> > The sysfs layout looks like:
> >
> > [root@westmere ~]# cd /sys/class/pmu/
>
> You missed the embedded track at Plumbers where we talked about never
> adding another class to the kernel. Please use bus_id instead for this.

At least in the examples I've seen creating a bus requires a lot more
code than a class. Or is there a shortcut I don't know about when it's a
virtual bus?

(Interested because I have code that is using a class)

cheers


Attachments:
signature.asc (198.00 B)
This is a digitally signed message part

2010-11-10 01:10:25

by Michael Ellerman

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-10 at 01:57 +0100, Kay Sievers wrote:
> Stay on the list please, with any possible reply. Thanks!

You dropped the CC when you replied, or is my mailer being weird?

> On Wed, Nov 10, 2010 at 01:52, Michael Ellerman <[email protected]> wrote:
> > On Wed, 2010-11-10 at 00:45 +0100, Kay Sievers wrote:
> >> On Wed, Nov 10, 2010 at 00:36, Michael Ellerman <[email protected]> wrote:
> >> > On Tue, 2010-11-09 at 14:13 -0800, Greg KH wrote:
> >> >> On Tue, Nov 09, 2010 at 10:45:19PM +0100, Peter Zijlstra wrote:
> >> >> > The below is a RFC patch adding dynamic type ids to perf.
> >> >> >
> >> >> > We need to represent PMUs in sysfs because we want to allow multiple
> >> >> > (loadable) PMUs and need a way to identify them.
> >> >> >
> >> >> > This patch creates a new device class "pmu" and adds a single attribute
> >> >> > "type" to it. This device attribute will expose the dynamic type id as
> >> >> > required by perf_event_attr::type.
> >> >> >
> >> >> > The sysfs layout looks like:
> >> >> >
> >> >> > [root@westmere ~]# cd /sys/class/pmu/
> >> >>
> >> >> You missed the embedded track at Plumbers where we talked about never
> >> >> adding another class to the kernel. Please use bus_id instead for this.
> >> >
> >> > At least in the examples I've seen creating a bus requires a lot more
> >> > code than a class. Or is there a shortcut I don't know about when it's a
> >> > virtual bus?
> >>
> >> What do you mean? Instead of registering a class you register a bus?
> >
> > Yeah. From what I've seen doing the latter is a lot more involved.
> >
> >> Instead of assigning a class to the device, you assign the bus before
> >> you add it. Other than this you do the same with the device.
> >
> > Sure. My point is creating a bus (from what I've seen) is not as easy as
> > creating a bus. No one said kernel hacking should be easy, but if the
> > advice is "use a bus not a class" it'd be good if they were
> > approximately equivalent in effort.
> >
> > My code to use a class is ~=:
> >
> > struct class foo_class;
> > foo_class = class_create(THIS_MODULE, "foo");
> > device = device_create(foo_class, NULL, dev, "foo0");
>
> device_create must not be used for devices without a dev_t.
> device_destroy, which is the counterpart can not work.

It has a dev_t, that's the third argument?

> If you use device_del/device_unregister, you need to use
> device_add/device_register.

I don't use them.

> The pseudo-convenience device_register/device_unregister should also
> not be used.

Why are they in the tree if they shouldn't be used?

So far you are failing to dispel my notion that sysfs is a place where
mortals dare not tread ;)

> > And I'm looking at eg. drivers/usb/serial/bus.c as an example bus.
> >
> > But in my case (and I think perf too), we don't need a bus that probes
> > etc. it's just a virtual bus that groups things, so it seems like it
> > should be simple.
> >
> > Anyway I feel like I'm missing something, so hopefully you can clue me
> > in :)
>
> Buses without drivers do not probe at all, they behave like classes.

OK, good, that would seem to be a prerequisite for replacing the latter
with the former.

I'm just not clear on how that actually works in the code. For example I
have a device which is on a bus (that's how it got probed), how do I
also put it on another bus (my virtual bus replacing my class) ?

cheers



Attachments:
signature.asc (198.00 B)
This is a digitally signed message part

2010-11-10 01:19:37

by Kay Sievers

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, Nov 10, 2010 at 02:10, Michael Ellerman <[email protected]> wrote:
> On Wed, 2010-11-10 at 01:57 +0100, Kay Sievers wrote:
>> Stay on the list please, with any possible reply. Thanks!
>
> You dropped the CC when you replied, or is my mailer being weird?

You replied to me only:
From: Michael Ellerman <[email protected]>
To: Kay Sievers <[email protected]>

>> On Wed, Nov 10, 2010 at 01:52, Michael Ellerman <[email protected]> wrote:
>> > On Wed, 2010-11-10 at 00:45 +0100, Kay Sievers wrote:
>> >> On Wed, Nov 10, 2010 at 00:36, Michael Ellerman <[email protected]> wrote:
>> >> > On Tue, 2010-11-09 at 14:13 -0800, Greg KH wrote:
>> >> >> On Tue, Nov 09, 2010 at 10:45:19PM +0100, Peter Zijlstra wrote:
>> >> >> > The below is a RFC patch adding dynamic type ids to perf.
>> >> >> >
>> >> >> > We need to represent PMUs in sysfs because we want to allow multiple
>> >> >> > (loadable) PMUs and need a way to identify them.
>> >> >> >
>> >> >> > This patch creates a new device class "pmu" and adds a single attribute
>> >> >> > "type" to it. This device attribute will expose the dynamic type id as
>> >> >> > required by perf_event_attr::type.
>> >> >> >
>> >> >> > The sysfs layout looks like:
>> >> >> >
>> >> >> > [root@westmere ~]# cd /sys/class/pmu/
>> >> >>
>> >> >> You missed the embedded track at Plumbers where we talked about never
>> >> >> adding another class to the kernel.  Please use bus_id instead for this.
>> >> >
>> >> > At least in the examples I've seen creating a bus requires a lot more
>> >> > code than a class. Or is there a shortcut I don't know about when it's a
>> >> > virtual bus?
>> >>
>> >> What do you mean? Instead of registering a class you register a bus?
>> >
>> > Yeah. From what I've seen doing the latter is a lot more involved.
>> >
>> >> Instead of assigning a class to the device, you assign the bus before
>> >> you add it. Other than this you do the same with the device.
>> >
>> > Sure. My point is creating a bus (from what I've seen) is not as easy as
>> > creating a bus. No one said kernel hacking should be easy, but if the
>> > advice is "use a bus not a class" it'd be good if they were
>> > approximately equivalent in effort.
>> >
>> > My code to use a class is ~=:
>> >
>> > struct class foo_class;
>> > foo_class = class_create(THIS_MODULE, "foo");
>> > device = device_create(foo_class, NULL, dev, "foo0");
>>
>> device_create must not be used for devices without a dev_t.
>> device_destroy, which is the counterpart can not work.
>
> It has a dev_t, that's the third argument?

Yeah, and that's not the case for the perf stuff.

>> If you use device_del/device_unregister, you need to use
>> device_add/device_register.
>
> I don't use them.

As said, the class devices have a convenience API which saves a few
lines, but they only work for devices with device nodes.

>> The pseudo-convenience device_register/device_unregister should also
>> not be used.
>
> Why are they in the tree if they shouldn't be used?

Because they save one line and do improper error handling.
They should all be converted to device_init+device_add/device_del/device_put.

> So far you are failing to dispel my notion that sysfs is a place where
> mortals dare not tread ;)

Oh, you are welcome to join the endless fixing rounds. Most of the
weird stuff if from people who moved to islands far away and never
touched any Linux code anymore (no kidding). :)

>> > And I'm looking at eg. drivers/usb/serial/bus.c as an example bus.
>> >
>> > But in my case (and I think perf too), we don't need a bus that probes
>> > etc. it's just a virtual bus that groups things, so it seems like it
>> > should be simple.
>> >
>> > Anyway I feel like I'm missing something, so hopefully you can clue me
>> > in :)
>>
>> Buses without drivers do not probe at all, they behave like classes.
>
> OK, good, that would seem to be a prerequisite for replacing the latter
> with the former.
>
> I'm just not clear on how that actually works in the code. For example I
> have a device which is on a bus (that's how it got probed), how do I
> also put it on another bus (my virtual bus replacing my class) ?

Nothing gets probed ever, if no driver is registered. To get a device
on a bus, just assign the bus to the 'struct device' before calling
device_add, that's all.

Devices can never be on two subsystems at the same time. Not with
classes, not with buses, that was never, and probably will never be
possible.

Kay

2010-11-10 01:45:48

by Michael Ellerman

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-10 at 02:19 +0100, Kay Sievers wrote:
> On Wed, Nov 10, 2010 at 02:10, Michael Ellerman <[email protected]> wrote:
> > On Wed, 2010-11-10 at 01:57 +0100, Kay Sievers wrote:
> >> Stay on the list please, with any possible reply. Thanks!
> >
> > You dropped the CC when you replied, or is my mailer being weird?
>
> You replied to me only:
> From: Michael Ellerman <[email protected]>
> To: Kay Sievers <[email protected]>

Because you replied to me only, or at least that's what I see at my end.

> >> On Wed, Nov 10, 2010 at 01:52, Michael Ellerman <[email protected]> wrote:
> >> > On Wed, 2010-11-10 at 00:45 +0100, Kay Sievers wrote:
> >> The pseudo-convenience device_register/device_unregister should also
> >> not be used.
> >
> > Why are they in the tree if they shouldn't be used?
>
> Because they save one line and do improper error handling.
> They should all be converted to device_init+device_add/device_del/device_put.

I don't see device_init(), or do you mean device_initialize()? It
returns void?

> > So far you are failing to dispel my notion that sysfs is a place where
> > mortals dare not tread ;)
>
> Oh, you are welcome to join the endless fixing rounds. Most of the
> weird stuff if from people who moved to islands far away and never
> touched any Linux code anymore (no kidding). :)

Yeah fair enough :)

It is a pet peeve of mine when APIs are "deprecated" but no where is
that documented, least of all by using __deprecated. I guess that's
because it spews warnings, maybe we need __future_deprecated or
something.

> >> > And I'm looking at eg. drivers/usb/serial/bus.c as an example bus.
> >> >
> >> > But in my case (and I think perf too), we don't need a bus that probes
> >> > etc. it's just a virtual bus that groups things, so it seems like it
> >> > should be simple.
> >> >
> >> > Anyway I feel like I'm missing something, so hopefully you can clue me
> >> > in :)
> >>
> >> Buses without drivers do not probe at all, they behave like classes.
> >
> > OK, good, that would seem to be a prerequisite for replacing the latter
> > with the former.
> >
> > I'm just not clear on how that actually works in the code. For example I
> > have a device which is on a bus (that's how it got probed), how do I
> > also put it on another bus (my virtual bus replacing my class) ?
>
> Nothing gets probed ever, if no driver is registered. To get a device
> on a bus, just assign the bus to the 'struct device' before calling
> device_add, that's all.

Cool, that sounds simple enough.

> Devices can never be on two subsystems at the same time. Not with
> classes, not with buses, that was never, and probably will never be
> possible.

OK, I guess I'm getting my terminology wrong. My devices, which show up
in /sys/class/foo are symlinks into /sys/devices/virtual/foo, so they
_appear_ to be in two places.

I also see entries for example in /sys/class/scsi_disk that link
into /sys/devices/pci.

cheers


Attachments:
signature.asc (198.00 B)
This is a digitally signed message part

2010-11-10 01:59:35

by Kay Sievers

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, Nov 10, 2010 at 02:45, Michael Ellerman <[email protected]> wrote:
> On Wed, 2010-11-10 at 02:19 +0100, Kay Sievers wrote:
>> On Wed, Nov 10, 2010 at 02:10, Michael Ellerman <[email protected]> wrote:
>> > On Wed, 2010-11-10 at 01:57 +0100, Kay Sievers wrote:
>> >> Stay on the list please, with any possible reply. Thanks!
>> >
>> > You dropped the CC when you replied, or is my mailer being weird?
>>
>> You replied to me only:
>>   From: Michael Ellerman <[email protected]>
>>   To: Kay Sievers <[email protected]>
>
> Because you replied to me only, or at least that's what I see at my end.

Sure, I don't add lists back when people reply to me only. You never
know why they do that, and if they have a reason for it. That's why I
added the the same mail:
"Stay on the list please, with any possible reply. Thanks!"

>> Devices can never be on two subsystems at the same time. Not with
>> classes, not with buses, that was never, and probably will never be
>> possible.
>
> OK, I guess I'm getting my terminology wrong. My devices, which show up
> in /sys/class/foo are symlinks into /sys/devices/virtual/foo, so they
> _appear_ to be in two places.
>
> I also see entries for example in /sys/class/scsi_disk that link
> into /sys/devices/pci.

Sure, all devices (devices are real directories) end up in the tree in
/sys/devices/. That's one tree, where all the devices from different
subsystems stack on top of others.


All devices have a symlink called 'subsystem' which points back to the
individual class the device belongs to.

To find all device of a specific subsystem, you start at the subsystem
directory (class/bus) which collects them (only symlinks to the
device, not directories).

That way you will find a blockdev, even when it's deep down in a chain
of devices:
pci:bridge/pci:dev/scsi:host/scsi:target/scsi:lun/block

All devices get announced to userspace only by their devpath, never by
their class or bus directory. If you wan to watch the stuff in action,
run:
udevadm monitor --kernel
and plug in a USB stick. You see that you get all device only at one
place created, the rest (which you can't see in the event) is
symlinks, which don't matter for the device itself, only for userspace
tasks to find them.

Kay

2010-11-10 02:11:18

by Kay Sievers

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, Nov 10, 2010 at 02:45, Michael Ellerman <[email protected]> wrote:
> On Wed, 2010-11-10 at 02:19 +0100, Kay Sievers wrote:

>> >> The pseudo-convenience device_register/device_unregister should also
>> >> not be used.
>> >
>> > Why are they in the tree if they shouldn't be used?
>>
>> Because they save one line and do improper error handling.
>> They should all be converted to device_init+device_add/device_del/device_put.
>
> I don't see device_init(), or do you mean device_initialize()? It
> returns void?

http://lkml.org/lkml/2010/9/22/56

Kay

2010-11-10 03:37:38

by Michael Ellerman

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-10 at 02:59 +0100, Kay Sievers wrote:
> On Wed, Nov 10, 2010 at 02:45, Michael Ellerman <[email protected]> wrote:
> > On Wed, 2010-11-10 at 02:19 +0100, Kay Sievers wrote:
> >> On Wed, Nov 10, 2010 at 02:10, Michael Ellerman <[email protected]> wrote:
> >> > On Wed, 2010-11-10 at 01:57 +0100, Kay Sievers wrote:
> >> >> Stay on the list please, with any possible reply. Thanks!
> >> >
> >> > You dropped the CC when you replied, or is my mailer being weird?
> >>
> >> You replied to me only:
> >> From: Michael Ellerman <[email protected]>
> >> To: Kay Sievers <[email protected]>
> >
> > Because you replied to me only, or at least that's what I see at my end.
>
> Sure, I don't add lists back when people reply to me only. You never
> know why they do that, and if they have a reason for it. That's why I
> added the the same mail:
> "Stay on the list please, with any possible reply. Thanks!"
>
> >> Devices can never be on two subsystems at the same time. Not with
> >> classes, not with buses, that was never, and probably will never be
> >> possible.
> >
> > OK, I guess I'm getting my terminology wrong. My devices, which show up
> > in /sys/class/foo are symlinks into /sys/devices/virtual/foo, so they
> > _appear_ to be in two places.
> >
> > I also see entries for example in /sys/class/scsi_disk that link
> > into /sys/devices/pci.
>
> Sure, all devices (devices are real directories) end up in the tree in
> /sys/devices/. That's one tree, where all the devices from different
> subsystems stack on top of others.
>
>
> All devices have a symlink called 'subsystem' which points back to the
> individual class the device belongs to.
>
> To find all device of a specific subsystem, you start at the subsystem
> directory (class/bus) which collects them (only symlinks to the
> device, not directories).
>
> That way you will find a blockdev, even when it's deep down in a chain
> of devices:
> pci:bridge/pci:dev/scsi:host/scsi:target/scsi:lun/block

OK, I think I get it. Thanks for taking the time to explain it to me :)

cheers


Attachments:
signature.asc (198.00 B)
This is a digitally signed message part

2010-11-10 12:28:18

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Tue, 2010-11-09 at 14:13 -0800, Greg KH wrote:

> You missed the embedded track at Plumbers where we talked about never
> adding another class to the kernel. Please use bus_id instead for this.

I did, it was early and I wasn't aware this all comes under the heading
of embedded.

Anyway, anybody got a good example of bus_type I can 'borrow' ?

Also, it would be really nice if you (plural) could make this subsystem
thing happen, calling tihngs a bus that aren't a bus just makes me
upset ;-)

2010-11-10 13:01:53

by Stephane Eranian

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

Peter,

On Tue, Nov 9, 2010 at 10:45 PM, Peter Zijlstra <[email protected]> wrote:
> The below is a RFC patch adding dynamic type ids to perf.
>
> We need to represent PMUs in sysfs because we want to allow multiple
> (loadable) PMUs and need a way to identify them.
>
> This patch creates a new device class "pmu" and adds a single attribute
> "type" to it. This device attribute will expose the dynamic type id as
> required by perf_event_attr::type.
>
> The sysfs layout looks like:
>
> [root@westmere ~]# cd /sys/class/pmu/
> [root@westmere pmu]# ls -la
> total 0
> drwxr-xr-x  2 root root 0 2010-11-09 22:22 .
> drwxr-xr-x 47 root root 0 2010-11-09 22:22 ..
> lrwxrwxrwx  1 root root 0 2010-11-09 22:22 breakpoint -> ../../devices/virtual/pmu/breakpoint
> lrwxrwxrwx  1 root root 0 2010-11-09 22:22 cpu -> ../../devices/virtual/pmu/cpu
> lrwxrwxrwx  1 root root 0 2010-11-09 22:22 frob -> ../../devices/virtual/pmu/frob
> lrwxrwxrwx  1 root root 0 2010-11-09 22:22 software -> ../../devices/virtual/pmu/software
> lrwxrwxrwx  1 root root 0 2010-11-09 22:22 tracepoint -> ../../devices/virtual/pmu/tracepoint
> [root@westmere pmu]# cd frob/
> [root@westmere frob]# ls -la
> total 0
> drwxr-xr-x 3 root root    0 2010-11-09 22:22 .
> drwxr-xr-x 7 root root    0 2010-11-09 22:22 ..
> drwxr-xr-x 2 root root    0 2010-11-09 22:23 power
> lrwxrwxrwx 1 root root    0 2010-11-09 22:23 subsystem -> ../../../../class/pmu
> -r--r--r-- 1 root root 4096 2010-11-09 22:23 type
> -rw-r--r-- 1 root root 4096 2010-11-09 22:22 uevent
> [root@westmere frob]# cat type
> 6

And then, what do you do with 6?
I assume you have to pass it in the attr struct.
How do you plan on doing this while keeping what is already there?

2010-11-10 13:37:15

by Ingo Molnar

[permalink] [raw]
Subject: sysfs: Add an 'events' class. (was: Re: [RFC][PATCH] perf: sysfs type id)


* Peter Zijlstra <[email protected]> wrote:

> On Tue, 2010-11-09 at 14:13 -0800, Greg KH wrote:
>
> > You missed the embedded track at Plumbers where we talked about never
> > adding another class to the kernel. Please use bus_id instead for this.
>
> I did, it was early and I wasn't aware this all comes under the heading
> of embedded.
>
> Anyway, anybody got a good example of bus_type I can 'borrow' ?
>
> Also, it would be really nice if you (plural) could make this subsystem thing
> happen, calling tihngs a bus that aren't a bus just makes me upset ;-)

Same here - calling events a 'bus' is like totally brain-dead IMHO. It implies
something hardware, while many events are not related to any hardware component but
are pure software abstractions: such as context-switches, or syscall entries, or VM
events.

So i'd rather have 'events' or 'event_source' as a 'class' temporarily, than have it
as a 'bus' temporarily - and we get the real fix whenever the sysfs unification
happens.

I also have a question about this future plan mentioned in
Documentation/sysfs-rules.txt:

- Hierarchy in a single device tree
There is only one valid place in sysfs where hierarchy can be examined
and this is below: /sys/devices.
It is planned that all device directories will end up in the tree
below this directory.

So did i get it right, sysfs is going to convert from a VFS hiearchy/enumeration to
a flat enumeration of entities, all listed in /dev/devices/?

Why is that done? I think it's quite nice that the actual topology is represented
right now via the sysfs VFS structure - so that we have things like:

/sys/devices/system/ioapic/ioapic0/

Where there's is a proper hierarchy showing that we have a 'system', which has an
'ioapic', which has an ioapic numbered '0'. That entity could then grow 'events' and
have:

/sys/devices/system/ioapic/ioapic0/events/

And could show various IO-APIC events, such as (future, possible events):

/sys/devices/system/ioapic/ioapic0/events/irq/
/sys/devices/system/ioapic/ioapic0/events/register-read/
/sys/devices/system/ioapic/ioapic0/events/register-write/
/sys/devices/system/ioapic/ioapic0/events/affinity/
[...]

So is the plan to get rid of such rich hiearchies and just use a flat store of
everything in /sys/devices/?

My hope would be to _increase_ the depth of sysfs in the future, to express every
meaningful hiearchy that exists in the system (be that hw hierarchy or some sw
abstraction hierarchy). But maybe i got it all wrong so please advise.

Thanks,

Ingo

2010-11-10 14:10:13

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-10 at 14:01 +0100, Stephane Eranian wrote:
> Peter,

> > 6
>
> And then, what do you do with 6?
> I assume you have to pass it in the attr struct.

perf_event_attr::type, as said in the initial changelog.

> How do you plan on doing this while keeping what is already there?

+ if (type < 0) {
+ err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);

and

+ rcu_read_lock();
+ pmu = idr_find(&pmu_idr, event->attr.type);
+ rcu_read_unlock();
+ if (pmu)
+ goto unlock;


So we start dynamic IDs at the top of the static range, and only do
dynamic IDs for those that don't already have a static number.

2010-11-10 14:15:07

by Kay Sievers

[permalink] [raw]
Subject: Re: sysfs: Add an 'events' class. (was: Re: [RFC][PATCH] perf: sysfs type id)

On Wed, Nov 10, 2010 at 14:36, Ingo Molnar <[email protected]> wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
>> On Tue, 2010-11-09 at 14:13 -0800, Greg KH wrote:
>>
>> > You missed the embedded track at Plumbers where we talked about never
>> > adding another class to the kernel.  Please use bus_id instead for this.
>>
>> I did, it was early and I wasn't aware this all comes under the heading
>> of embedded.
>>
>> Anyway, anybody got a good example of bus_type I can 'borrow' ?
>>
>> Also, it would be really nice if you (plural) could make this subsystem thing
>> happen, calling tihngs a bus that aren't a bus just makes me upset ;-)
>
> Same here - calling events a 'bus' is like totally brain-dead IMHO. It implies
> something hardware, while many events are not related to any hardware component but
> are pure software abstractions: such as context-switches, or syscall entries, or VM
> events.

Please use a bus, regardless of the name, class is dead and you will
need stuff tha can only happen at bus. We'll fix the name later.

> So i'd rather have 'events' or 'event_source' as a 'class' temporarily, than have it
> as a 'bus' temporarily - and we get the real fix whenever the sysfs unification
> happens.

Nope, that can not work.

> I also have a question about this future plan mentioned in
> Documentation/sysfs-rules.txt:
>
>  - Hierarchy in a single device tree
>   There is only one valid place in sysfs where hierarchy can be examined
>   and this is below: /sys/devices.
>   It is planned that all device directories will end up in the tree
>   below this directory.
>
> So did i get it right, sysfs is going to convert from a VFS hiearchy/enumeration to
> a flat enumeration of entities, all listed in /dev/devices/?

I guess you mean /sys. No, /sys/devices/ is and stays as a hierarchy
and will not be flat.

> Why is that done? I think it's quite nice that the actual topology is represented
> right now via the sysfs VFS structure - so that we have things like:
>
>   /sys/devices/system/ioapic/ioapic0/

That does not change at all.

> Where there's is a proper hierarchy showing that we have a 'system', which has an
> 'ioapic', which has an ioapic numbered '0'. That entity could then grow 'events' and
> have:
>
>   /sys/devices/system/ioapic/ioapic0/events/
>
> And could show various IO-APIC events, such as (future, possible events):
>
>   /sys/devices/system/ioapic/ioapic0/events/irq/
>   /sys/devices/system/ioapic/ioapic0/events/register-read/
>   /sys/devices/system/ioapic/ioapic0/events/register-write/
>   /sys/devices/system/ioapic/ioapic0/events/affinity/
>   [...]
>
> So is the plan to get rid of such rich hiearchies and just use a flat store of
> everything in /sys/devices/?

No. Where do you get that idea from?

> My hope would be to _increase_ the depth of sysfs in the future, to express every
> meaningful hiearchy that exists in the system (be that hw hierarchy or some sw
> abstraction hierarchy). But maybe i got it all wrong so please advise.

Yeah, seems all wrong. :)

/sys/device is the hierarchy (ans will stay as it is), /sys/class,bus
(later /sys/subsystem/) is the flat list of devices per subsystem (to
find all devices of a specific subsystem). The list _points_ to
devices in /sys/devices, it does never contain any devices. You might
want re-read the last mail in this thread, where I explained the
difference between hierarchy and classification in more details.

Kay

2010-11-10 14:19:42

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-10 at 15:10 +0100, Peter Zijlstra wrote:
> On Wed, 2010-11-10 at 14:01 +0100, Stephane Eranian wrote:
> > Peter,
>
> > > 6
> >
> > And then, what do you do with 6?
> > I assume you have to pass it in the attr struct.
>
> perf_event_attr::type, as said in the initial changelog.
>
> > How do you plan on doing this while keeping what is already there?
>
> + if (type < 0) {
> + err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
>
> and
>
> + rcu_read_lock();
> + pmu = idr_find(&pmu_idr, event->attr.type);
> + rcu_read_unlock();
> + if (pmu)
> + goto unlock;
>
>
> So we start dynamic IDs at the top of the static range, and only do
> dynamic IDs for those that don't already have a static number.

Also note that I picked:

+ perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);

as the CPU type, since we can easily provide the raw values in any
listed events, eg.

# cat /sys/.../cpu/events/cycles
0x003c

2010-11-10 14:25:12

by Stephane Eranian

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, Nov 10, 2010 at 3:10 PM, Peter Zijlstra <[email protected]> wrote:
> On Wed, 2010-11-10 at 14:01 +0100, Stephane Eranian wrote:
>> Peter,
>
>> > 6
>>
>> And then, what do you do with 6?
>> I assume you have to pass it in the attr struct.
>
> perf_event_attr::type, as said in the initial changelog.
>
>> How do you plan on doing this while keeping what is already there?
>
> +       if (type < 0) {
> +               err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
>
> and
>
> +       rcu_read_lock();
> +       pmu = idr_find(&pmu_idr, event->attr.type);
> +       rcu_read_unlock();
> +       if (pmu)
> +               goto unlock;
>
>
> So we start dynamic IDs at the top of the static range, and only do
> dynamic IDs for those that don't already have a static number.
>
>
Ok, should work fine.

2010-11-10 15:01:27

by Ingo Molnar

[permalink] [raw]
Subject: Re: sysfs: Add an 'events' class. (was: Re: [RFC][PATCH] perf: sysfs type id)


* Kay Sievers <[email protected]> wrote:

> > Same here - calling events a 'bus' is like totally brain-dead IMHO. It implies
> > something hardware, while many events are not related to any hardware component
> > but are pure software abstractions: such as context-switches, or syscall
> > entries, or VM events.
>
> Please use a bus, regardless of the name, class is dead and you will need stuff
> tha can only happen at bus. We'll fix the name later.

When is that 'later' supposed to happen? git annotate says that the
renaming/unification comments/plans were added:

46336009 (Kay Sievers 2007-06-08 13:36:37 -0700 105) symlinks pointing to the unified /sys/devices tree.

... about 3 and a half years ago.

> [...] No, /sys/devices/ is and stays as a hierarchy and will not be flat.

Great :-)

Btw., where will thinks like /sys/kernel/slab/ end up? It would be useful (in the
long run) to have:

/sys/kernel/slab/uid_cache/events/

To describe/control events for the uid_cache SLAB cache alone.

Ingo

2010-11-10 17:31:15

by Greg KH

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, Nov 10, 2010 at 10:36:51AM +1100, Michael Ellerman wrote:
> On Tue, 2010-11-09 at 14:13 -0800, Greg KH wrote:
> > On Tue, Nov 09, 2010 at 10:45:19PM +0100, Peter Zijlstra wrote:
> > > The below is a RFC patch adding dynamic type ids to perf.
> > >
> > > We need to represent PMUs in sysfs because we want to allow multiple
> > > (loadable) PMUs and need a way to identify them.
> > >
> > > This patch creates a new device class "pmu" and adds a single attribute
> > > "type" to it. This device attribute will expose the dynamic type id as
> > > required by perf_event_attr::type.
> > >
> > > The sysfs layout looks like:
> > >
> > > [root@westmere ~]# cd /sys/class/pmu/
> >
> > You missed the embedded track at Plumbers where we talked about never
> > adding another class to the kernel. Please use bus_id instead for this.
>
> At least in the examples I've seen creating a bus requires a lot more
> code than a class. Or is there a shortcut I don't know about when it's a
> virtual bus?
>
> (Interested because I have code that is using a class)

It shouldn't be that much more code to do so. Have a pointer to some
code that you want converted over so I can show you exactly what is
needed?

thanks,

greg k-h

2010-11-10 20:08:12

by Stephane Eranian

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

Peter,

How do you envision users specifying event_source + event name
on the command line of the perf tool (or others)?

Would that be by passing the full filename to the tool?



On Wed, Nov 10, 2010 at 3:19 PM, Peter Zijlstra <[email protected]> wrote:
> On Wed, 2010-11-10 at 15:10 +0100, Peter Zijlstra wrote:
>> On Wed, 2010-11-10 at 14:01 +0100, Stephane Eranian wrote:
>> > Peter,
>>
>> > > 6
>> >
>> > And then, what do you do with 6?
>> > I assume you have to pass it in the attr struct.
>>
>> perf_event_attr::type, as said in the initial changelog.
>>
>> > How do you plan on doing this while keeping what is already there?
>>
>> +       if (type < 0) {
>> +               err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
>>
>> and
>>
>> +       rcu_read_lock();
>> +       pmu = idr_find(&pmu_idr, event->attr.type);
>> +       rcu_read_unlock();
>> +       if (pmu)
>> +               goto unlock;
>>
>>
>> So we start dynamic IDs at the top of the static range, and only do
>> dynamic IDs for those that don't already have a static number.
>
> Also note that I picked:
>
> +       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
>
> as the CPU type, since we can easily provide the raw values in any
> listed events, eg.
>
> # cat /sys/.../cpu/events/cycles
> 0x003c
>
>
>

2010-11-10 20:32:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-10 at 21:08 +0100, Stephane Eranian wrote:
> Would that be by passing the full filename to the tool?

possible, or something like <pmu-name>:<event-name>, cpu:cycles would
map to /sys/class/pmu/cpu/events/cycles (given the previous patch).

2010-11-10 20:53:16

by Stephane Eranian

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, Nov 10, 2010 at 9:32 PM, Peter Zijlstra <[email protected]> wrote:
> On Wed, 2010-11-10 at 21:08 +0100, Stephane Eranian wrote:
>> Would that be by passing the full filename to the tool?
>
> possible, or something like <pmu-name>:<event-name>, cpu:cycles would
> map to /sys/class/pmu/cpu/events/cycles (given the previous patch).
>
>
Ok, but I think you're proposal is missing one bit. You are addressing
the class (or type) of PMU, but you are not addressing the naming of
an instance.

Let's take an example, suppose you have counters on a graphic card.
Your system has two such graphic cards. In your scheme you would
end up with a sys/class/pmu/gfx/.....

But now, suppose I want to count cycles on the first graphic card.
Seems to me you need to expose the instances as well. The instance
number needs to be passed in the attr struct somehow.

You can either create multiple subdir under gfx, or have this info somewhere
else in the sysfs tree, if people really care about class vs. instance.

I can see users doing:
$ perf stat -e gfx@1::cycles ... -> sys/class/gfx/1/event/cycles

The reason I am using :: here is because libpfm4 is already using
this as a separator for PMU type vs. event.

2010-11-10 21:05:34

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-10 at 21:53 +0100, Stephane Eranian wrote:
> On Wed, Nov 10, 2010 at 9:32 PM, Peter Zijlstra <[email protected]> wrote:
> > On Wed, 2010-11-10 at 21:08 +0100, Stephane Eranian wrote:
> >> Would that be by passing the full filename to the tool?
> >
> > possible, or something like <pmu-name>:<event-name>, cpu:cycles would
> > map to /sys/class/pmu/cpu/events/cycles (given the previous patch).
> >
> >
> Ok, but I think you're proposal is missing one bit. You are addressing
> the class (or type) of PMU, but you are not addressing the naming of
> an instance.
>
> Let's take an example, suppose you have counters on a graphic card.
> Your system has two such graphic cards. In your scheme you would
> end up with a sys/class/pmu/gfx/.....
>
> But now, suppose I want to count cycles on the first graphic card.
> Seems to me you need to expose the instances as well. The instance
> number needs to be passed in the attr struct somehow.
>
> You can either create multiple subdir under gfx, or have this info somewhere
> else in the sysfs tree, if people really care about class vs. instance.
>
> I can see users doing:
> $ perf stat -e gfx@1::cycles ... -> sys/class/gfx/1/event/cycles
>
> The reason I am using :: here is because libpfm4 is already using
> this as a separator for PMU type vs. event.


right, so the idea is to have these pmu devices hooked into the existing
sysfs topology, my proposed patch misses that bit because I wanted to
get something out before having to dig through the topology code trying
to figure out how all that works.

So the 'cpu' pmu device would be linked from:

/sys/devices/system/cpu/pmu -> /sys/class/pmu/cpu

and gfx things would be linked like:

/sys/devices/pci0000:00/0000:00:1e.0/0000:0b:01.0/drm/card0/pmu -> /sys/class/pmu/radeon0

2010-11-11 06:40:04

by Kay Sievers

[permalink] [raw]
Subject: Re: sysfs: Add an 'events' class. (was: Re: [RFC][PATCH] perf: sysfs type id)

On Wed, Nov 10, 2010 at 16:00, Ingo Molnar <[email protected]> wrote:
> * Kay Sievers <[email protected]> wrote:
>
>> > Same here - calling events a 'bus' is like totally brain-dead IMHO. It implies
>> > something hardware, while many events are not related to any hardware component
>> > but are pure software abstractions: such as context-switches, or syscall
>> > entries, or VM events.
>>
>> Please use a bus, regardless of the name, class is dead and you will need stuff
>> tha can only happen at bus. We'll fix the name later.
>
> When is that 'later' supposed to happen? git annotate says that the
> renaming/unification comments/plans were added:
>
> 46336009        (Kay Sievers    2007-06-08 13:36:37 -0700       105)  symlinks pointing to the unified /sys/devices tree.
>
> ... about 3 and a half years ago.

If all works out as planned, we will do it during the next weeks.
Depends on a bit what Greg and I are dragged into, which is always
kind of unpredictable. It's 'just' a cleanup and stuff that's broken
always comes first, and comes a lot. :)

> Btw., where will thinks like /sys/kernel/slab/ end up? It would be useful (in the
> long run) to have:
>
>  /sys/kernel/slab/uid_cache/events/
>
> To describe/control events for the uid_cache SLAB cache alone.

It will probably stay as it is, they are not 'struct device' devices,
but raw kobjects. It's a completely unconnected custom tree, which is
not related to the driver core or anything similar.

It's more like its own filesystem. Stuff you put there can not show up
in /sys/class,bus,subsystem unless someone wants convert all of it to
real devices, let them show up in /sys/devices, and assign it a
subsystem (bus).

Kay

2010-11-17 02:35:56

by Corey Ashford

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On 11/10/2010 01:05 PM, Peter Zijlstra wrote:
> On Wed, 2010-11-10 at 21:53 +0100, Stephane Eranian wrote:
>> On Wed, Nov 10, 2010 at 9:32 PM, Peter Zijlstra<[email protected]> wrote:
>>> On Wed, 2010-11-10 at 21:08 +0100, Stephane Eranian wrote:
>>>> Would that be by passing the full filename to the tool?
>>>
>>> possible, or something like<pmu-name>:<event-name>, cpu:cycles would
>>> map to /sys/class/pmu/cpu/events/cycles (given the previous patch).
>>>
>>>
>> Ok, but I think you're proposal is missing one bit. You are addressing
>> the class (or type) of PMU, but you are not addressing the naming of
>> an instance.
>>
>> Let's take an example, suppose you have counters on a graphic card.
>> Your system has two such graphic cards. In your scheme you would
>> end up with a sys/class/pmu/gfx/.....
>>
>> But now, suppose I want to count cycles on the first graphic card.
>> Seems to me you need to expose the instances as well. The instance
>> number needs to be passed in the attr struct somehow.
>>
>> You can either create multiple subdir under gfx, or have this info somewhere
>> else in the sysfs tree, if people really care about class vs. instance.
>>
>> I can see users doing:
>> $ perf stat -e gfx@1::cycles ... -> sys/class/gfx/1/event/cycles
>>
>> The reason I am using :: here is because libpfm4 is already using
>> this as a separator for PMU type vs. event.
>
>
> right, so the idea is to have these pmu devices hooked into the existing
> sysfs topology, my proposed patch misses that bit because I wanted to
> get something out before having to dig through the topology code trying
> to figure out how all that works.
>
> So the 'cpu' pmu device would be linked from:
>
> /sys/devices/system/cpu/pmu -> /sys/class/pmu/cpu
>
> and gfx things would be linked like:
>
> /sys/devices/pci0000:00/0000:00:1e.0/0000:0b:01.0/drm/card0/pmu -> /sys/class/pmu/radeon0

I don't understand the /sys/devices tree much (I will read up on it),
but this idea looks good to me.

To clarify my understanding a bit and taking the gfx example, in the
path /sys/class/pmu/radeon0, is the '0' here denoting the 0'th radeon
chip in the system, or the radeon model number? I would assume the 0'th
chip.

So if I assume that now points to a unique radeon chip in the system,
underneath /sys/class/pmu/radeon0 will be a structure something like:

radeon0/
event/
evt0
..
evtn

And if there is a second radeon chip, there would be a nearly identical
tree:

radeon1/
event/
evt0
..
evtn

Is that correct?

Some of these events may need modifiers / attributes / umasks...
whatever you want to call them. And they may need more than one each,
and they may vary from event to event. So to add to the hierarchy,
we'd have:

radeon0/
type (for attr.type)
event/
evt0/
id (a base number for attr.config)
description (text file - but could be CONFIG_*'d out)
modifiers/
mod0/
formula (some ascii syntax for describing how
to set .config and/or .config_extra
with this modifer's value)
description (text - can configure out)
constraints (some ascii syntax for describing
the values mod0 can take on)
..
modn/
..
evtn/

And this would be replicated for radeon1..n

Maybe all of the "event" directories could be soft links to a common
radeon<model_number> event directory.

When you fully specify an event, you have something like:

/sys/devices/pci0000:00/0000:00:1e.0/0000:0b:01.0/drm/card0/pmu/<event>[:<modifier>=nnn:...]

So it wouldn't end up being strictly a sysfs path anymore, and perf
would have a bit of parsing work to do, to evaluate the modifiers, using
the info from constraints, and construct the .type, .config, and
.config_extra fields using formula.

Or maybe you have some other structure in mind?

- Corey

2010-11-17 07:02:46

by Kyle Moffett

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Tue, Nov 16, 2010 at 21:35, Corey Ashford
<[email protected]> wrote:
> On 11/10/2010 01:05 PM, Peter Zijlstra wrote:
>> On Wed, 2010-11-10 at 21:53 +0100, Stephane Eranian wrote:
>>> On Wed, Nov 10, 2010 at 9:32 PM, Peter Zijlstra<[email protected]> wrote:
>>>> possible, or something like<pmu-name>:<event-name>, cpu:cycles would
>>>> map to /sys/class/pmu/cpu/events/cycles (given the previous patch).
>>>
>>> Ok, but I think you're proposal is missing one bit. You are addressing
>>> the class (or type) of PMU, but you are not addressing the naming of
>>> an instance.
>>>
>>> Let's take an example, suppose you have counters on a graphic card.
>>> Your system has two such graphic cards. In your scheme you would
>>> end up with a sys/class/pmu/gfx/.....

Not quite. I'm still a relative newbie to bits and pieces of the
device model, but I'll explain what I believe the best representation
would be:

Assuming you have counters on graphics cards, you would already have
the PCI device directories for the GPUs themselves. For example:
/sys/devices/[...]/0000:01:00.0/
/sys/devices/[...]/0000:02:00.0/

Those already obviously have various DRM-related device directories
under them, but I'll assume the PMU is tied directly to the PCI device
(although it could be put elsewhere if appropriate).

So then I believe you would create your "PMU" devices with names
"pmu0", "pmu1", etc, and set their "parent" to point to the PCI
device, and set their "bus" to point to the "pmu" bus.

What would happen is you would get subdirectories for your "pmu" devices
/sys/devices/[...]/0000:01:00.0/pmu0/
/sys/devices/[...]/0000:02:00.0/pmu1/

Each of those devices would have a "driver" symlink inside pointing to
something like:
/sys/subsystem/pmu/drivers/radeonpmu

There would also be symlinks:
/sys/subsystem/pmu/devices/pmu0 => ../../../../devices/[...]/0000:01:00.0/pmu0
/sys/subsystem/pmu/devices/pmu1 => ../../../../devices/[...]/0000:02:00.0/pmu1

So that lets you find your various PMU devices.

Then you'd have another "bus", perhaps, for "pmuevents", where the
"pmuevent" device nodes get useful names. Please note that including
the PMU name in the event name is necessary as you cannot have two
devices on the same "bus" with the exact same name.
/sys/devices/[...]/0000:01:00.0/pmu0/pmu0:gpu_idle/
/sys/devices/[...]/0000:01:00.0/pmu0/pmu0:gpu_throttle/
/sys/devices/[...]/0000:02:00.0/pmu1/pmu1:gpu_idle/
/sys/devices/[...]/0000:02:00.0/pmu1/pmu1:gpu_throttle/

Each event directory would contain other directories full of various
registered attributes of the event.

And again the directory full of symlinks (this is what requires the
"different names" thing as mentioned above):
/sys/subsystem/pmuevent/devices/pmu0:gpu_idle => [......]
/sys/subsystem/pmuevent/devices/pmu0:gpu_throttle => [......]
/sys/subsystem/pmuevent/devices/pmu1:gpu_idle => [......]
/sys/subsystem/pmuevent/devices/pmu1:gpu_throttle => [......]

So if you wanted to enumerate all of the "gpu_idle" events on the
system, you could just do:
ls /sys/subsystem/pmuevent/devices/*:gpu_idle

And then by following the symlinks into /sys/devices and traversing
the path upwards you can examine all of the other properties the same
way that udev does.

>>> But now, suppose I want to count cycles on the first graphic card.
>>> Seems to me you need to expose the instances as well. The instance
>>> number needs to be passed in the attr struct somehow.
>>>
>>> You can either create multiple subdir under gfx, or have this info
>>> somewhere
>>> else in the sysfs tree, if people really care about class vs. instance.
>>>
>>> I can see users doing:
>>> $ perf stat -e gfx@1::cycles ...  ->  sys/class/gfx/1/event/cycles
>>>
>>> The reason I am using :: here is because libpfm4 is already using
>>> this as a separator for PMU type vs. event.

So in this case, because all of the events get linked into a global
list, you could just use the "pmu0:gpu_idle" or some similar
derivation as a unique name with an easy lookup in
/sys/subsystem/pmuevent/devices/. As above it's trivial to follow the
symlinks down into the real /sys/devices/ tree and see where it's
physically located.


> Some of these events may need modifiers / attributes / umasks... whatever
> you want to call them.  And they may need more than one each, and they may
> vary from event to event.  So to add to the hierarchy,
> we'd have:
>
> radeon0/
>    type (for attr.type)
>    event/
>        evt0/
>            id (a base number for attr.config)
>            description (text file - but could be CONFIG_*'d out)
>            modifiers/
>                mod0/
>                    formula (some ascii syntax for describing how
>                             to set .config and/or .config_extra
>                             with this modifer's value)
>                    description (text - can configure out)
>                    constraints (some ascii syntax for describing
>                                 the values mod0 can take on)
>                ..
>                modn/
>        ..
>        evtn/
>
> And this would be replicated for radeon1..n

As for the specific attribute hierarchy, I'm even less sure. One
*potential* approach would be to create a "struct device_driver" for
each type of pmuevent. You would then assign those drivers to each
pmuevent device as it is registered.

To determine how to *configure* the event you would then look up the
event name within the PMU driver, so for example:

Considering the previously-described:
/sys/devices/[...]/0000:01:00.0/pmu0/pmu0:gpu_idle/

You would look at the symlink to the PMU event driver:
/sys/devices/[...]/0000:01:00.0/pmu0/pmu0:gpu_idle/driver => [...]

That symlink would point to:
/sys/subsystem/pmuevent/drivers/radeonpmu:gpu_idle

You could then trivially add driver attributes to that directory for
describing the nitty-gritty details of that event.

I believe with the described model you could still use trivial and
short names like "pmu0:gpu_idle", and yet still be perfectly able to
reference them with all of the hardware in the system. If you have
specific software events or PMUs that are not tied to any particular
hardware, you could easily fiddle with the parent pointer to stuff
them into /sys/devices/virtual/ along with other things like ptys.

Cheers,
Kyle Moffett

2010-11-17 11:25:24

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Tue, 2010-11-16 at 18:35 -0800, Corey Ashford wrote:

> I don't understand the /sys/devices tree much (I will read up on it),
> but this idea looks good to me.

Yeah, me too.. I talked to Kay a bit earlier on and /sys/devices/system
is 'special'..

> To clarify my understanding a bit and taking the gfx example, in the
> path /sys/class/pmu/radeon0, is the '0' here denoting the 0'th radeon
> chip in the system, or the radeon model number? I would assume the 0'th
> chip.

Chip indeed.

> So if I assume that now points to a unique radeon chip in the system,
> underneath /sys/class/pmu/radeon0 will be a structure something like:
>
> radeon0/
> event/
> evt0
> ..
> evtn
>
> And if there is a second radeon chip, there would be a nearly identical
> tree:
>
> radeon1/
> event/
> evt0
> ..
> evtn
>
> Is that correct?

Yes.

> Some of these events may need modifiers / attributes / umasks...
> whatever you want to call them. And they may need more than one each,
> and they may vary from event to event. So to add to the hierarchy,
> we'd have:
>
> radeon0/
> type (for attr.type)
> event/
> evt0/
> id (a base number for attr.config)
> description (text file - but could be CONFIG_*'d out)
> modifiers/
> mod0/
> formula (some ascii syntax for describing how
> to set .config and/or .config_extra
> with this modifer's value)
> description (text - can configure out)
> constraints (some ascii syntax for describing
> the values mod0 can take on)
> ..
> modn/
> ..
> evtn/
>
> And this would be replicated for radeon1..n

The idea of the events dir is to provide a few frequently used/common
events, not to be an exhaustive list.

What we can do is provide a break-down of the config in the top-level
directory and refer people to the hardware documentation (they need to
read that anyway if they want to make use special events anyway).

> Maybe all of the "event" directories could be soft links to a common
> radeon<model_number> event directory.

Possibly, but I don't expect this to be a common thing, and we can
always do it later.

> When you fully specify an event, you have something like:
>
> /sys/devices/pci0000:00/0000:00:1e.0/0000:0b:01.0/drm/card0/pmu/<event>[:<modifier>=nnn:...]
>
> So it wouldn't end up being strictly a sysfs path anymore, and perf
> would have a bit of parsing work to do, to evaluate the modifiers, using
> the info from constraints, and construct the .type, .config, and
> .config_extra fields using formula.
>
> Or maybe you have some other structure in mind?

I wouldn't bother with modifiers and all that:
perf record -e radeon0:r0123456789ABCDEF

is there for people who know what they're doing, possibly we can parse
the config format and use some of that to enable things like:

[ using the x86-intel format because I actually know that, as opposed to
the radeon case which I know absolutely nothing about. ]

# cat cpu/config_format
event_selector:8
unit_mask:8
NULL:7
invert:1
counter_mask:8

perf record -e radeon0:event_selector=0xf;unit_mask=0x5;invert;counter_mask=1

To make it slightly easier, we could maybe event do something like:

perf record -e radeon0:instructions;invert;counter_mask=1

To take the base of the 'instructions' event and modify that with the
invert and counter_mask details.

A further attribute we can give the top level node is something like
'name', which would describe the actual PMU, ARM people want something
like that because its near impossible to figure out what PMU lives on
your chip from userspace.

# cat cpu/name
x86,Core2,PEBS

or somesuch.

2010-11-17 11:31:04

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-17 at 02:02 -0500, Kyle Moffett wrote:
>
> Not quite. I'm still a relative newbie to bits and pieces of the
> device model, but I'll explain what I believe the best representation
> would be:
>
> Assuming you have counters on graphics cards, you would already have
> the PCI device directories for the GPUs themselves. For example:
> /sys/devices/[...]/0000:01:00.0/
> /sys/devices/[...]/0000:02:00.0/
>
> Those already obviously have various DRM-related device directories
> under them, but I'll assume the PMU is tied directly to the PCI device
> (although it could be put elsewhere if appropriate).
>
> So then I believe you would create your "PMU" devices with names
> "pmu0", "pmu1", etc, and set their "parent" to point to the PCI
> device, and set their "bus" to point to the "pmu" bus.
>
> What would happen is you would get subdirectories for your "pmu" devices
> /sys/devices/[...]/0000:01:00.0/pmu0/
> /sys/devices/[...]/0000:02:00.0/pmu1/
>
> Each of those devices would have a "driver" symlink inside pointing to
> something like:
> /sys/subsystem/pmu/drivers/radeonpmu
>
> There would also be symlinks:
> /sys/subsystem/pmu/devices/pmu0 => ../../../../devices/[...]/0000:01:00.0/pmu0
> /sys/subsystem/pmu/devices/pmu1 => ../../../../devices/[...]/0000:02:00.0/pmu1
>
> So that lets you find your various PMU devices.
>
> Then you'd have another "bus", perhaps, for "pmuevents", where the
> "pmuevent" device nodes get useful names. Please note that including
> the PMU name in the event name is necessary as you cannot have two
> devices on the same "bus" with the exact same name.
> /sys/devices/[...]/0000:01:00.0/pmu0/pmu0:gpu_idle/
> /sys/devices/[...]/0000:01:00.0/pmu0/pmu0:gpu_throttle/
> /sys/devices/[...]/0000:02:00.0/pmu1/pmu1:gpu_idle/
> /sys/devices/[...]/0000:02:00.0/pmu1/pmu1:gpu_throttle/
>
> Each event directory would contain other directories full of various
> registered attributes of the event.
>
> And again the directory full of symlinks (this is what requires the
> "different names" thing as mentioned above):
> /sys/subsystem/pmuevent/devices/pmu0:gpu_idle => [......]
> /sys/subsystem/pmuevent/devices/pmu0:gpu_throttle => [......]
> /sys/subsystem/pmuevent/devices/pmu1:gpu_idle => [......]
> /sys/subsystem/pmuevent/devices/pmu1:gpu_throttle => [......]
>
> So if you wanted to enumerate all of the "gpu_idle" events on the
> system, you could just do:
> ls /sys/subsystem/pmuevent/devices/*:gpu_idle
>
> And then by following the symlinks into /sys/devices and traversing
> the path upwards you can examine all of the other properties the same
> way that udev does.

I've been talking to Kay for a bit and I've settled on one bus
"event_source", I register devices to that, and each device gets an
attribute_group "events" with events in.

Implementing that is painful enough, when I get it working I'll post it
as RFC and if someone with more sysfs skill than me is willing to help
out that's nice, but implementing something like Kyle says is waaaay
beyond me, sysfs is teh crazeh.

2010-11-17 19:47:37

by Corey Ashford

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On 11/17/2010 03:25 AM, Peter Zijlstra wrote:
> On Tue, 2010-11-16 at 18:35 -0800, Corey Ashford wrote:
>
>> I don't understand the /sys/devices tree much (I will read up on it),
>> but this idea looks good to me.
>
> Yeah, me too.. I talked to Kay a bit earlier on and /sys/devices/system
> is 'special'..
>
>> To clarify my understanding a bit and taking the gfx example, in the
>> path /sys/class/pmu/radeon0, is the '0' here denoting the 0'th radeon
>> chip in the system, or the radeon model number? I would assume the 0'th
>> chip.
>
> Chip indeed.
>
>> So if I assume that now points to a unique radeon chip in the system,
>> underneath /sys/class/pmu/radeon0 will be a structure something like:
>>
>> radeon0/
>> event/
>> evt0
>> ..
>> evtn
>>
>> And if there is a second radeon chip, there would be a nearly identical
>> tree:
>>
>> radeon1/
>> event/
>> evt0
>> ..
>> evtn
>>
>> Is that correct?
>
> Yes.
>
>> Some of these events may need modifiers / attributes / umasks...
>> whatever you want to call them. And they may need more than one each,
>> and they may vary from event to event. So to add to the hierarchy,
>> we'd have:
>>
>> radeon0/
>> type (for attr.type)
>> event/
>> evt0/
>> id (a base number for attr.config)
>> description (text file - but could be CONFIG_*'d out)
>> modifiers/
>> mod0/
>> formula (some ascii syntax for describing how
>> to set .config and/or .config_extra
>> with this modifer's value)
>> description (text - can configure out)
>> constraints (some ascii syntax for describing
>> the values mod0 can take on)
>> ..
>> modn/
>> ..
>> evtn/
>>
>> And this would be replicated for radeon1..n
>
> The idea of the events dir is to provide a few frequently used/common
> events, not to be an exhaustive list.
>
> What we can do is provide a break-down of the config in the top-level
> directory and refer people to the hardware documentation (they need to
> read that anyway if they want to make use special events anyway).

If the config breakdown is at the top level, it will be nearly
unreadable for WSP, because of the many different encoding formats we
use, even for one PMU. See below.

>
>> Maybe all of the "event" directories could be soft links to a common
>> radeon<model_number> event directory.
>
> Possibly, but I don't expect this to be a common thing, and we can
> always do it later.
>
>> When you fully specify an event, you have something like:
>>
>> /sys/devices/pci0000:00/0000:00:1e.0/0000:0b:01.0/drm/card0/pmu/<event>[:<modifier>=nnn:...]
>>
>> So it wouldn't end up being strictly a sysfs path anymore, and perf
>> would have a bit of parsing work to do, to evaluate the modifiers, using
>> the info from constraints, and construct the .type, .config, and
>> .config_extra fields using formula.
>>
>> Or maybe you have some other structure in mind?
>
> I wouldn't bother with modifiers and all that:
> perf record -e radeon0:r0123456789ABCDEF
>
> is there for people who know what they're doing, possibly we can parse
> the config format and use some of that to enable things like:
>
> [ using the x86-intel format because I actually know that, as opposed to
> the radeon case which I know absolutely nothing about. ]
>
> # cat cpu/config_format
> event_selector:8
> unit_mask:8
> NULL:7
> invert:1
> counter_mask:8
>

This is an interesting approach, though for the IBM WSP (aka PowerEN)
chip, the config_format would have to be at a deeper level than the PMU,
because the modifiers that affect the event, vary from event to event.
Either that or you'd have to provide a complex union structure.

However, above you say that you want to have "a few frequently
used/common events". I thought that was the job of the perf "generic
events". My understanding was that the sysfs tree was the solution for
all events, including arch-specific, and seldom-used events. Ingo
pushed back on a user-space library solution (like libpfm4) because he
wanted event info in sysfs (or some other mechanism by which the kernel
could expose event info to user space).

If there is going to be no place in sysfs for arch-specific events, I'll
want to start pushing for perf to be able to use a user space library again.

How about a compromise position: all of the arch-specific events are
exposed to user space via sysfs iff some CONFIG_* variable to set to
true. Something like CONFIG_EXPOSE_ALL_HW_PERF_EVENTS_IN_SYSFS.
This way you would only use all that memory when it's explicitly
configured in.

> perf record -e radeon0:event_selector=0xf;unit_mask=0x5;invert;counter_mask=1
>
> To make it slightly easier, we could maybe event do something like:
>
> perf record -e radeon0:instructions;invert;counter_mask=1
>
> To take the base of the 'instructions' event and modify that with the
> invert and counter_mask details.

I like this.

- Corey

2010-11-17 19:57:12

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-17 at 11:47 -0800, Corey Ashford wrote:
>
> This is an interesting approach, though for the IBM WSP (aka PowerEN)
> chip, the config_format would have to be at a deeper level than the PMU,
> because the modifiers that affect the event, vary from event to event.
> Either that or you'd have to provide a complex union structure.

Eew.. how uhm creative ;-), yeah, not quite sure how to deal with that,
I wasn't planning to make a directory for each event, just a single file
with the u64 in it.

> However, above you say that you want to have "a few frequently
> used/common events". I thought that was the job of the perf "generic
> events". My understanding was that the sysfs tree was the solution for
> all events, including arch-specific, and seldom-used events.

My intent for the sysfs bits is to replace all the static stuff encoded
in PERF_TYPE_ and PERF_COUNT_ because it simply doesn't extend to a much
more dynamic world.

And GPUs/DSPs might have radically different common events than CPUs do.

Doing exhaustive lists in sysfs is simply going to waste tons of memory
for no real gain.

> Ingo
> pushed back on a user-space library solution (like libpfm4) because he
> wanted event info in sysfs (or some other mechanism by which the kernel
> could expose event info to user space).

I can't fully remember the details of that discussion.

> If there is going to be no place in sysfs for arch-specific events, I'll
> want to start pushing for perf to be able to use a user space library again.
>
> How about a compromise position: all of the arch-specific events are
> exposed to user space via sysfs iff some CONFIG_* variable to set to
> true. Something like CONFIG_EXPOSE_ALL_HW_PERF_EVENTS_IN_SYSFS.
> This way you would only use all that memory when it's explicitly
> configured in.

Ingo, any opinions?

2010-11-17 20:01:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On Wed, 2010-11-17 at 20:57 +0100, Peter Zijlstra wrote:
> > How about a compromise position: all of the arch-specific events are
> > exposed to user space via sysfs iff some CONFIG_* variable to set to
> > true. Something like CONFIG_EXPOSE_ALL_HW_PERF_EVENTS_IN_SYSFS.
> > This way you would only use all that memory when it's explicitly
> > configured in.

Another thing you could do is make all PMU drivers loadable modules
(most of the infrastructure for that is present) and make the exhaustive
list either a module parameter or another module extending the PMU
driver.

2010-11-17 21:39:25

by Corey Ashford

[permalink] [raw]
Subject: Re: [RFC][PATCH] perf: sysfs type id

On 11/17/2010 12:01 PM, Peter Zijlstra wrote:
> On Wed, 2010-11-17 at 20:57 +0100, Peter Zijlstra wrote:
>>> How about a compromise position: all of the arch-specific events are
>>> exposed to user space via sysfs iff some CONFIG_* variable to set to
>>> true. Something like CONFIG_EXPOSE_ALL_HW_PERF_EVENTS_IN_SYSFS.
>>> This way you would only use all that memory when it's explicitly
>>> configured in.
>
> Another thing you could do is make all PMU drivers loadable modules
> (most of the infrastructure for that is present) and make the exhaustive
> list either a module parameter or another module extending the PMU
> driver.
>

That's an interesting approach. Maybe there could be a perf command
that would load the appropriate modules for all of the PMUs in the
system - "perf load-event-modules".

- Corey