LinuxLists.cc - [PATCH 5/6] perf_counter: kerneltop: mmap

2009-03-25 11:40:54

Subject: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

provide a knob to set the number of mmap data pages.

Signed-off-by: Mike Galbraith <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
---
Documentation/perf_counter/kerneltop.c | 31 +++++++++++++++++++++++++------
1 file changed, 25 insertions(+), 6 deletions(-)

Index: linux-2.6/Documentation/perf_counter/kerneltop.c
===================================================================
--- linux-2.6.orig/Documentation/perf_counter/kerneltop.c
+++ linux-2.6/Documentation/perf_counter/kerneltop.c
@@ -178,6 +178,7 @@ static int nr_cpus = 0;
static int nmi = 1;
static int group = 0;
static unsigned int page_size;
+static unsigned int mmap_pages = 4;

static char *vmlinux;

@@ -326,6 +327,7 @@ static void display_help(void)
" -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
" -z --zero # zero counts after display\n"
" -D --dump_symtab # dump symbol table to stderr on startup\n"
+ " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
);

exit(0);
@@ -732,7 +734,9 @@ static int read_symbol(FILE *in, struct
/* Tag events to be skipped. */
if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
s->skip = 1;
- if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+ else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+ s->skip = 1;
+ else if (!strcmp("mwait_idle", s->sym))
s->skip = 1;

if (filter_match == 1) {
@@ -1042,9 +1046,10 @@ static void process_options(int argc, ch
{"symbol", required_argument, NULL, 's'},
{"stat", no_argument, NULL, 'S'},
{"zero", no_argument, NULL, 'z'},
+ {"mmap_pages", required_argument, NULL, 'm'},
{NULL, 0, NULL, 0 }
};
- int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
+ int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z",
long_options, &option_index);
if (c == -1)
break;
@@ -1081,6 +1086,7 @@ static void process_options(int argc, ch
case 'S': run_perfstat = 1; break;
case 'x': vmlinux = strdup(optarg); break;
case 'z': zero = 1; break;
+ case 'm': mmap_pages = atoi(optarg); break;
default: error = 1; break;
}
}
@@ -1134,17 +1140,30 @@ repeat:
return head;
}

+struct timeval last_read, this_read;
+
static void mmap_read(struct mmap_data *md)
{
unsigned int head = mmap_read_head(md);
unsigned int old = md->prev;
unsigned char *data = md->base + page_size;

+ gettimeofday(&this_read, NULL);
+
if (head - old > md->mask) {
- printf("ERROR: failed to keep up with mmap data\n");
- exit(-1);
+ struct timeval iv;
+ unsigned long msecs;
+
+ timersub(&this_read, &last_read, &iv);
+ msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+ fprintf(stderr, "WARNING: failed to keep up with mmap data. Last read %lu msecs ago.\n", msecs);
+
+ old = head;
}

+ last_read = this_read;
+
for (; old != head;) {
__u64 *ptr = (__u64 *)&data[old & md->mask];
old += sizeof(__u64);
@@ -1220,8 +1239,8 @@ int main(int argc, char *argv[])

mmap_array[i][counter].counter = counter;
mmap_array[i][counter].prev = 0;
- mmap_array[i][counter].mask = 2*page_size - 1;
- mmap_array[i][counter].base = mmap(NULL, 3*page_size,
+ mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+ mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
PROT_READ, MAP_SHARED, fd[i][counter], 0);
if (mmap_array[i][counter].base == MAP_FAILED) {
printf("kerneltop error: failed to mmap with %d (%s)\n",

--

2009-03-25 12:08:13

by Peter Zijlstra

[permalink] [raw]

Subject: [tip:perfcounters/core] perf_counter: kerneltop: mmap_pages argument

Commit-ID: a7aa0cab482525a039b7ce7b1c8cf62f57f441c9
Gitweb: http://git.kernel.org/tip/a7aa0cab482525a039b7ce7b1c8cf62f57f441c9
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 25 Mar 2009 12:30:26 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 25 Mar 2009 13:02:53 +0100

perf_counter: kerneltop: mmap_pages argument

provide a knob to set the number of mmap data pages.

Signed-off-by: Mike Galbraith <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Arjan van de Ven <[email protected]>
Cc: Wu Fengguang <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>

---
Documentation/perf_counter/kerneltop.c | 31 +++++++++++++++++++++++++------
1 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 7ebde7a..3e45bf6 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -178,6 +178,7 @@ static int nr_cpus = 0;
static int nmi = 1;
static int group = 0;
static unsigned int page_size;
+static unsigned int mmap_pages = 4;

static char *vmlinux;

@@ -326,6 +327,7 @@ static void display_help(void)
" -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
" -z --zero # zero counts after display\n"
" -D --dump_symtab # dump symbol table to stderr on startup\n"
+ " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
);

exit(0);
@@ -732,7 +734,9 @@ static int read_symbol(FILE *in, struct sym_entry *s)
/* Tag events to be skipped. */
if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
s->skip = 1;
- if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+ else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+ s->skip = 1;
+ else if (!strcmp("mwait_idle", s->sym))
s->skip = 1;

if (filter_match == 1) {
@@ -1042,9 +1046,10 @@ static void process_options(int argc, char *argv[])
{"symbol", required_argument, NULL, 's'},
{"stat", no_argument, NULL, 'S'},
{"zero", no_argument, NULL, 'z'},
+ {"mmap_pages", required_argument, NULL, 'm'},
{NULL, 0, NULL, 0 }
};
- int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
+ int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z",
long_options, &option_index);
if (c == -1)
break;
@@ -1081,6 +1086,7 @@ static void process_options(int argc, char *argv[])
case 'S': run_perfstat = 1; break;
case 'x': vmlinux = strdup(optarg); break;
case 'z': zero = 1; break;
+ case 'm': mmap_pages = atoi(optarg); break;
default: error = 1; break;
}
}
@@ -1134,17 +1140,30 @@ repeat:
return head;
}

+struct timeval last_read, this_read;
+
static void mmap_read(struct mmap_data *md)
{
unsigned int head = mmap_read_head(md);
unsigned int old = md->prev;
unsigned char *data = md->base + page_size;

+ gettimeofday(&this_read, NULL);
+
if (head - old > md->mask) {
- printf("ERROR: failed to keep up with mmap data\n");
- exit(-1);
+ struct timeval iv;
+ unsigned long msecs;
+
+ timersub(&this_read, &last_read, &iv);
+ msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+ fprintf(stderr, "WARNING: failed to keep up with mmap data. Last read %lu msecs ago.\n", msecs);
+
+ old = head;
}

+ last_read = this_read;
+
for (; old != head;) {
__u64 *ptr = (__u64 *)&data[old & md->mask];
old += sizeof(__u64);
@@ -1220,8 +1239,8 @@ int main(int argc, char *argv[])

mmap_array[i][counter].counter = counter;
mmap_array[i][counter].prev = 0;
- mmap_array[i][counter].mask = 2*page_size - 1;
- mmap_array[i][counter].base = mmap(NULL, 3*page_size,
+ mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+ mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
PROT_READ, MAP_SHARED, fd[i][counter], 0);
if (mmap_array[i][counter].base == MAP_FAILED) {
printf("kerneltop error: failed to mmap with %d (%s)\n",

2009-03-25 12:18:43

by Ingo Molnar

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

* Peter Zijlstra <[email protected]> wrote:

> provide a knob to set the number of mmap data pages.

> + " -m pages --mmap_pages=<pages> # number of mmap data pages\n"

Btw., we really want this to be auto-tuning to a large degree. If
the kernel observes missed events, it should create a
PERF_EVENT_OVERFLOW==0x3 record, with the number of missed events -
or something like that.

Ingo

2009-03-25 12:27:35

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

On Wed, 2009-03-25 at 13:18 +0100, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > provide a knob to set the number of mmap data pages.
>
> > + " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
>
> Btw., we really want this to be auto-tuning to a large degree. If
> the kernel observes missed events, it should create a
> PERF_EVENT_OVERFLOW==0x3 record, with the number of missed events -
> or something like that.

Well, who's to say we ever see that overflow record if we're having
trouble tracking the output as is?

How important is it for people to have accurate overflow information
other than the current -- we can't keep up -- kind?

One possible solution is making the control page writable and writing
the userspace read position to it, then the kernel can, on
perf_output_begin() detect the overflow and count the number of
overwritten events.

This overflow count could then be published back into the control page.

TBH I'm not much of a fan, making all these pages writable just opens a
whole can of worms, and that accurate overflow tracking will put more
code in the output path.

Also, when mixing streams (events,mmap) is a single: you missed 'n'
events still good?

2009-03-25 12:35:43

by Ingo Molnar

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

* Peter Zijlstra <[email protected]> wrote:

> On Wed, 2009-03-25 at 13:18 +0100, Ingo Molnar wrote:
> > * Peter Zijlstra <[email protected]> wrote:
> >
> > > provide a knob to set the number of mmap data pages.
> >
> > > + " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
> >
> > Btw., we really want this to be auto-tuning to a large degree. If
> > the kernel observes missed events, it should create a
> > PERF_EVENT_OVERFLOW==0x3 record, with the number of missed events -
> > or something like that.
>
> Well, who's to say we ever see that overflow record if we're
> having trouble tracking the output as is?

it would overwrite the last (new) record - so it's deterministic and
the tail does not consume the head - just bites itself a bit.

But it would still be somewhat racy, if user-space _just_ managed to
process those records ...

> How important is it for people to have accurate overflow
> information other than the current -- we can't keep up -- kind?

it's somewhat important and could pave the way for the kernel to
react to overflow more intelligently (via iterim buffering or
whatever future mechanism).

It's also a general quality of implementation principle for kernel
code: if we want to hide information we want to hide it from
_user-space_, not the kernel. Hiding information from the kernel
almost always causes trouble down the line.

> One possible solution is making the control page writable and
> writing the userspace read position to it, then the kernel can, on
> perf_output_begin() detect the overflow and count the number of
> overwritten events.
>
> This overflow count could then be published back into the control
> page.

Ok, that's a nice idea - it keeps the amount of dirty cachelines
minimal.

> TBH I'm not much of a fan, making all these pages writable just
> opens a whole can of worms, and that accurate overflow tracking
> will put more code in the output path.

What can of worms can you see there? (It would not be COW-ed - if
you share those pages without knowing that they are shared then
confused user-space will have to keep broken pieces of iteself.)

> Also, when mixing streams (events,mmap) is a single: you missed
> 'n' events still good?

How would such mixing work? Multiple counters streaming into the
same mmap area?

Ingo

2009-03-25 12:41:27

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

On Wed, 2009-03-25 at 13:35 +0100, Ingo Molnar wrote:

> > Also, when mixing streams (events,mmap) is a single: you missed
> > 'n' events still good?
>
> How would such mixing work? Multiple counters streaming into the
> same mmap area?

No basically having overflow events and mmap-vma changed events in a
single output stream.

2009-03-25 12:54:29

by Ingo Molnar

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

* Peter Zijlstra <[email protected]> wrote:

> On Wed, 2009-03-25 at 13:35 +0100, Ingo Molnar wrote:
>
> > > Also, when mixing streams (events,mmap) is a single: you missed
> > > 'n' events still good?
> >
> > How would such mixing work? Multiple counters streaming into the
> > same mmap area?
>
> No basically having overflow events and mmap-vma changed events in
> a single output stream.

ah, and i missed the impact of variable size records - that too
makes it somewhat impractical to emit overflow records in situ. (the
kernel does not really know the precise start of the previous
record, typically.)

Ingo

2009-03-25 12:57:52

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

On Wed, 2009-03-25 at 13:54 +0100, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > On Wed, 2009-03-25 at 13:35 +0100, Ingo Molnar wrote:
> >
> > > > Also, when mixing streams (events,mmap) is a single: you missed
> > > > 'n' events still good?
> > >
> > > How would such mixing work? Multiple counters streaming into the
> > > same mmap area?
> >
> > No basically having overflow events and mmap-vma changed events in
> > a single output stream.
>
> ah, and i missed the impact of variable size records - that too
> makes it somewhat impractical to emit overflow records in situ. (the
> kernel does not really know the precise start of the previous
> record, typically.)

Alternatively, we could simply not emit new events until the read
position increases,. that's much simpler.

Still don't like mapping the stuff writable though..

2009-03-25 14:53:27

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

On Wed, 2009-03-25 at 13:57 +0100, Peter Zijlstra wrote:
> On Wed, 2009-03-25 at 13:54 +0100, Ingo Molnar wrote:
> > * Peter Zijlstra <[email protected]> wrote:
> >
> > > On Wed, 2009-03-25 at 13:35 +0100, Ingo Molnar wrote:
> > >
> > > > > Also, when mixing streams (events,mmap) is a single: you missed
> > > > > 'n' events still good?
> > > >
> > > > How would such mixing work? Multiple counters streaming into the
> > > > same mmap area?
> > >
> > > No basically having overflow events and mmap-vma changed events in
> > > a single output stream.
> >
> > ah, and i missed the impact of variable size records - that too
> > makes it somewhat impractical to emit overflow records in situ. (the
> > kernel does not really know the precise start of the previous
> > record, typically.)
>
> Alternatively, we could simply not emit new events until the read
> position increases,. that's much simpler.
>
> Still don't like mapping the stuff writable though..

This is what it would look like I suppose...

Any thoughts?

Not-signed-off-by: me
---
include/linux/perf_counter.h | 4 ++
kernel/perf_counter.c | 67 +++++++++++++++++++++++++++++++++++++----
2 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6bf67ce..d5a599c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -165,6 +165,8 @@ struct perf_counter_mmap_page {
__s64 offset; /* add to hardware counter value */

__u32 data_head; /* head in the data section */
+ __u32 data_tail; /* user-space written tail */
+ __u32 overflow; /* number of lost events */
};

struct perf_event_header {
@@ -269,8 +271,10 @@ struct file;
struct perf_mmap_data {
struct rcu_head rcu_head;
int nr_pages;
+ int writable;
atomic_t wakeup;
atomic_t head;
+ atomic_t overflow;
struct perf_counter_mmap_page *user_page;
void *data_pages[0];
};
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3b862a7..1f5c515 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1330,6 +1330,7 @@ static void __perf_counter_update_userpage(struct perf_counter *counter,
userpg->offset -= atomic64_read(&counter->hw.prev_count);

userpg->data_head = atomic_read(&data->head);
+ userpg->overflow = atomic_read(&data->overflow);
smp_wmb();
++userpg->lock;
preempt_enable();
@@ -1375,6 +1376,28 @@ unlock:
return ret;
}

+static int perf_mmap_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ int ret = -EINVAL;
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto unlock;
+
+ /*
+ * Only allow writes to the control page.
+ */
+ if (page != virt_to_page(data->user_page))
+ goto unlock;
+
+ ret = 0;
+unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
{
struct perf_mmap_data *data;
@@ -1463,6 +1486,7 @@ static struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
.close = perf_mmap_close,
.fault = perf_mmap_fault,
+ .page_mkwrite = perf_mmap_mkwrite,
};

static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1473,7 +1497,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long locked, lock_limit;
int ret = 0;

- if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+ if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;

vma_size = vma->vm_end - vma->vm_start;
@@ -1503,16 +1527,19 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)

mutex_lock(&counter->mmap_mutex);
if (atomic_inc_not_zero(&counter->mmap_count))
- goto out;
+ goto unlock;

WARN_ON(counter->data);
ret = perf_mmap_data_alloc(counter, nr_pages);
- if (!ret)
- atomic_set(&counter->mmap_count, 1);
-out:
+ if (ret)
+ goto unlock;
+
+ atomic_set(&counter->mmap_count, 1);
+ if (vma->vm_flags & VM_WRITE)
+ counter->data->writable = 1;
+unlock:
mutex_unlock(&counter->mmap_mutex);

- vma->vm_flags &= ~VM_MAYWRITE;
vma->vm_flags |= VM_RESERVED;
vma->vm_ops = &perf_mmap_vmops;

@@ -1540,6 +1567,28 @@ struct perf_output_handle {
int wakeup;
};

+static int perf_output_overflow(struct perf_mmap_data *data,
+ unsigned int offset, unsigned int head)
+{
+ unsigned int tail;
+ unsigned int mask;
+
+ if (!data->writable)
+ return 0;
+
+ mask = (data->nr_pages << PAGE_SHIFT) - 1;
+ smp_rmb();
+ tail = ACCESS_ONCE(data->user_page->data_tail);
+
+ offset = (offset - tail) & mask;
+ head = (head - tail) & mask;
+
+ if ((int)(head - offset) < 0)
+ return 1;
+
+ return 0;
+}
+
static int perf_output_begin(struct perf_output_handle *handle,
struct perf_counter *counter, unsigned int size)
{
@@ -1552,11 +1601,13 @@ static int perf_output_begin(struct perf_output_handle *handle,
goto out;

if (!data->nr_pages)
- goto out;
+ goto fail;

do {
offset = head = atomic_read(&data->head);
head += size;
+ if (unlikely(perf_output_overflow(data, offset, head)))
+ goto fail;
} while (atomic_cmpxchg(&data->head, offset, head) != offset);

handle->counter = counter;
@@ -1567,6 +1618,8 @@ static int perf_output_begin(struct perf_output_handle *handle,

return 0;

+fail:
+ atomic_inc(&data->overflow);
out:
rcu_read_unlock();

2009-03-25 17:16:40

by Ingo Molnar

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

* Peter Zijlstra <[email protected]> wrote:

> On Wed, 2009-03-25 at 13:57 +0100, Peter Zijlstra wrote:
> > On Wed, 2009-03-25 at 13:54 +0100, Ingo Molnar wrote:
> > > * Peter Zijlstra <[email protected]> wrote:
> > >
> > > > On Wed, 2009-03-25 at 13:35 +0100, Ingo Molnar wrote:
> > > >
> > > > > > Also, when mixing streams (events,mmap) is a single: you missed
> > > > > > 'n' events still good?
> > > > >
> > > > > How would such mixing work? Multiple counters streaming into the
> > > > > same mmap area?
> > > >
> > > > No basically having overflow events and mmap-vma changed events in
> > > > a single output stream.
> > >
> > > ah, and i missed the impact of variable size records - that too
> > > makes it somewhat impractical to emit overflow records in situ. (the
> > > kernel does not really know the precise start of the previous
> > > record, typically.)
> >
> > Alternatively, we could simply not emit new events until the read
> > position increases,. that's much simpler.
> >
> > Still don't like mapping the stuff writable though..
>
> This is what it would look like I suppose...
>
> Any thoughts?
>
> Not-signed-off-by: me

(you dont like it?)

> ---
> include/linux/perf_counter.h | 4 ++
> kernel/perf_counter.c | 67 +++++++++++++++++++++++++++++++++++++----
> 2 files changed, 64 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
> index 6bf67ce..d5a599c 100644
> --- a/include/linux/perf_counter.h
> +++ b/include/linux/perf_counter.h
> @@ -165,6 +165,8 @@ struct perf_counter_mmap_page {
> __s64 offset; /* add to hardware counter value */
>
> __u32 data_head; /* head in the data section */
> + __u32 data_tail; /* user-space written tail */
> + __u32 overflow; /* number of lost events */

small detail: i'd suggest to always pad things up to 64 bits. In
case someone adds a new field with u64.

> };
>
> struct perf_event_header {
> @@ -269,8 +271,10 @@ struct file;
> struct perf_mmap_data {
> struct rcu_head rcu_head;
> int nr_pages;
> + int writable;
> atomic_t wakeup;
> atomic_t head;
> + atomic_t overflow;
> struct perf_counter_mmap_page *user_page;
> void *data_pages[0];
> };
> diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
> index 3b862a7..1f5c515 100644
> --- a/kernel/perf_counter.c
> +++ b/kernel/perf_counter.c
> @@ -1330,6 +1330,7 @@ static void __perf_counter_update_userpage(struct perf_counter *counter,
> userpg->offset -= atomic64_read(&counter->hw.prev_count);
>
> userpg->data_head = atomic_read(&data->head);
> + userpg->overflow = atomic_read(&data->overflow);
> smp_wmb();
> ++userpg->lock;
> preempt_enable();
> @@ -1375,6 +1376,28 @@ unlock:
> return ret;
> }
>
> +static int perf_mmap_mkwrite(struct vm_area_struct *vma, struct page *page)
> +{
> + int ret = -EINVAL;
> +
> + rcu_read_lock();
> + data = rcu_dereference(counter->data);
> + if (!data)
> + goto unlock;
> +
> + /*
> + * Only allow writes to the control page.
> + */
> + if (page != virt_to_page(data->user_page))
> + goto unlock;
> +
> + ret = 0;
> +unlock:
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +

I guess this:

rcu_read_lock();
data = rcu_dereference(counter->data);

/*
* Only allow writes to the control page.
*/
if (data && (page == virt_to_page(data->user_page))
ret = 0;

rcu_read_unlock();

is more compact?

> static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
> {
> struct perf_mmap_data *data;
> @@ -1463,6 +1486,7 @@ static struct vm_operations_struct perf_mmap_vmops = {
> .open = perf_mmap_open,
> .close = perf_mmap_close,
> .fault = perf_mmap_fault,
> + .page_mkwrite = perf_mmap_mkwrite,
> };

(nit: this structure should align vertically)

>
> static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> @@ -1473,7 +1497,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> unsigned long locked, lock_limit;
> int ret = 0;
>
> - if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
> + if (!(vma->vm_flags & VM_SHARED))
> return -EINVAL;
>
> vma_size = vma->vm_end - vma->vm_start;
> @@ -1503,16 +1527,19 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
>
> mutex_lock(&counter->mmap_mutex);
> if (atomic_inc_not_zero(&counter->mmap_count))
> - goto out;
> + goto unlock;
>
> WARN_ON(counter->data);
> ret = perf_mmap_data_alloc(counter, nr_pages);
> - if (!ret)
> - atomic_set(&counter->mmap_count, 1);
> -out:
> + if (ret)
> + goto unlock;
> +
> + atomic_set(&counter->mmap_count, 1);
> + if (vma->vm_flags & VM_WRITE)
> + counter->data->writable = 1;
> +unlock:
> mutex_unlock(&counter->mmap_mutex);
>
> - vma->vm_flags &= ~VM_MAYWRITE;

does ->vm_fflags have VM_MAYWRITE by default?

> vma->vm_flags |= VM_RESERVED;
> vma->vm_ops = &perf_mmap_vmops;
>
> @@ -1540,6 +1567,28 @@ struct perf_output_handle {
> int wakeup;
> };
>
> +static int perf_output_overflow(struct perf_mmap_data *data,
> + unsigned int offset, unsigned int head)
> +{
> + unsigned int tail;
> + unsigned int mask;
> +
> + if (!data->writable)
> + return 0;

so mmap()-ing it readonly turns off overflow detection
automatically? That's a nice touch i think - user-space can avoid
this overhead, if it does not care about overflows.

> + mask = (data->nr_pages << PAGE_SHIFT) - 1;

btw., we could have a data->mask.

> + smp_rmb();
> + tail = ACCESS_ONCE(data->user_page->data_tail);
> +
> + offset = (offset - tail) & mask;
> + head = (head - tail) & mask;
> +
> + if ((int)(head - offset) < 0)
> + return 1;
> +
> + return 0;

I guess it should use bool and return true/false.

> +}
> +
> static int perf_output_begin(struct perf_output_handle *handle,
> struct perf_counter *counter, unsigned int size)
> {
> @@ -1552,11 +1601,13 @@ static int perf_output_begin(struct perf_output_handle *handle,
> goto out;
>
> if (!data->nr_pages)
> - goto out;
> + goto fail;
>
> do {
> offset = head = atomic_read(&data->head);
> head += size;
> + if (unlikely(perf_output_overflow(data, offset, head)))
> + goto fail;
> } while (atomic_cmpxchg(&data->head, offset, head) != offset);
>
> handle->counter = counter;
> @@ -1567,6 +1618,8 @@ static int perf_output_begin(struct perf_output_handle *handle,
>
> return 0;
>
> +fail:
> + atomic_inc(&data->overflow);

data->user_page->overflow should be increased too - so that
user-space can see it.

And do we really need data->overflow?

Ingo

2009-03-25 21:18:21

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

On Wed, 2009-03-25 at 18:16 +0100, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > On Wed, 2009-03-25 at 13:57 +0100, Peter Zijlstra wrote:
> > > On Wed, 2009-03-25 at 13:54 +0100, Ingo Molnar wrote:
> > > > * Peter Zijlstra <[email protected]> wrote:
> > > >
> > > > > On Wed, 2009-03-25 at 13:35 +0100, Ingo Molnar wrote:
> > > > >
> > > > > > > Also, when mixing streams (events,mmap) is a single: you missed
> > > > > > > 'n' events still good?
> > > > > >
> > > > > > How would such mixing work? Multiple counters streaming into the
> > > > > > same mmap area?
> > > > >
> > > > > No basically having overflow events and mmap-vma changed events in
> > > > > a single output stream.
> > > >
> > > > ah, and i missed the impact of variable size records - that too
> > > > makes it somewhat impractical to emit overflow records in situ. (the
> > > > kernel does not really know the precise start of the previous
> > > > record, typically.)
> > >
> > > Alternatively, we could simply not emit new events until the read
> > > position increases,. that's much simpler.
> > >
> > > Still don't like mapping the stuff writable though..
> >
> > This is what it would look like I suppose...
> >
> > Any thoughts?
> >
> > Not-signed-off-by: me
>
> (you dont like it?)

Yeah, I'm still unconvinced we need more than the 'we're loosing data'
bit we already have and don't really like the extra code this
introduces, although it isn't nearly as bad as I initially thought it
would be.

> > ---
> > include/linux/perf_counter.h | 4 ++
> > kernel/perf_counter.c | 67 +++++++++++++++++++++++++++++++++++++----
> > 2 files changed, 64 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
> > index 6bf67ce..d5a599c 100644
> > --- a/include/linux/perf_counter.h
> > +++ b/include/linux/perf_counter.h
> > @@ -165,6 +165,8 @@ struct perf_counter_mmap_page {
> > __s64 offset; /* add to hardware counter value */
> >
> > __u32 data_head; /* head in the data section */
> > + __u32 data_tail; /* user-space written tail */
> > + __u32 overflow; /* number of lost events */
>
> small detail: i'd suggest to always pad things up to 64 bits. In
> case someone adds a new field with u64.

its not a packed struct, so at worst it will result in a hole which can
later be filled. but sure.

> > };

> > +static int perf_mmap_mkwrite(struct vm_area_struct *vma, struct page *page)
> > +{
> > + int ret = -EINVAL;
> > +
> > + rcu_read_lock();
> > + data = rcu_dereference(counter->data);
> > + if (!data)
> > + goto unlock;
> > +
> > + /*
> > + * Only allow writes to the control page.
> > + */
> > + if (page != virt_to_page(data->user_page))
> > + goto unlock;
> > +
> > + ret = 0;
> > +unlock:
> > + rcu_read_unlock();
> > +
> > + return ret;
> > +}
> > +
>
> I guess this:
>
> rcu_read_lock();
> data = rcu_dereference(counter->data);
>
> /*
> * Only allow writes to the control page.
> */
> if (data && (page == virt_to_page(data->user_page))
> ret = 0;
>
> rcu_read_unlock();
>
> is more compact?

Ah, quite.

> >
> > - vma->vm_flags &= ~VM_MAYWRITE;
>
> does ->vm_fflags have VM_MAYWRITE by default?

do_mmap_pgoff() has:

vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

> > +static int perf_output_overflow(struct perf_mmap_data *data,
> > + unsigned int offset, unsigned int head)
> > +{
> > + unsigned int tail;
> > + unsigned int mask;
> > +
> > + if (!data->writable)
> > + return 0;
>
> so mmap()-ing it readonly turns off overflow detection
> automatically? That's a nice touch i think - user-space can avoid
> this overhead, if it does not care about overflows.

Yep.

> > + mask = (data->nr_pages << PAGE_SHIFT) - 1;
>
> btw., we could have a data->mask.

I thought about it, couldn't make up my mind about it, its only 2
trivial integer ops.

> > + smp_rmb();
> > + tail = ACCESS_ONCE(data->user_page->data_tail);
> > +
> > + offset = (offset - tail) & mask;
> > + head = (head - tail) & mask;
> > +
> > + if ((int)(head - offset) < 0)
> > + return 1;
> > +
> > + return 0;
>
> I guess it should use bool and return true/false.
>
> > +}

Ah, right, we use this new-fangled C99 bool stuff these days ;-)

> > +fail:
> > + atomic_inc(&data->overflow);
>
> data->user_page->overflow should be increased too - so that
> user-space can see it.

Hmm, right, it would need to do that wake-up bit..

> And do we really need data->overflow?

atomic_t isn't really a user exposed typed, the assignment in
update_userpage() seemed like the best solution

2009-03-26 02:43:14

by Paul Mackerras

[permalink] [raw]

Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument

Peter Zijlstra writes:

> One possible solution is making the control page writable and writing
> the userspace read position to it, then the kernel can, on
> perf_output_begin() detect the overflow and count the number of
> overwritten events.
>
> This overflow count could then be published back into the control page.

We could in principle have many different processes mmapping the same
counter and reading the ring buffer, couldn't we? So which process
gets to put its read position in the control page?

> TBH I'm not much of a fan, making all these pages writable just opens a
> whole can of worms, and that accurate overflow tracking will put more
> code in the output path.

I agree.

Paul.