2008-10-01 15:15:09

by Steven Rostedt

[permalink] [raw]
Subject: [PATCH] ring_buffer: allocate buffer page pointer


The current method of overlaying the page frame as the buffer page pointer
can be very dangerous and limits our ability to do other things with
a page from the buffer, like send it off to disk.

This patch allocates the buffer_page instead of overlaying the page's
page frame. The use of the buffer_page has hardly changed due to this.

Signed-off-by: Steven Rostedt <[email protected]>
---
kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++-------------------
1 file changed, 32 insertions(+), 22 deletions(-)

Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-01 09:37:23.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-01 11:03:16.000000000 -0400
@@ -115,16 +115,10 @@ void *ring_buffer_event_data(struct ring
* Thanks to Peter Zijlstra for suggesting this idea.
*/
struct buffer_page {
- union {
- struct {
- unsigned long flags; /* mandatory */
- atomic_t _count; /* mandatory */
- u64 time_stamp; /* page time stamp */
- unsigned size; /* size of page data */
- struct list_head list; /* list of free pages */
- };
- struct page page;
- };
+ u64 time_stamp; /* page time stamp */
+ unsigned size; /* size of page data */
+ struct list_head list; /* list of free pages */
+ void *page; /* Actual data page */
};

/*
@@ -133,9 +127,9 @@ struct buffer_page {
*/
static inline void free_buffer_page(struct buffer_page *bpage)
{
- reset_page_mapcount(&bpage->page);
- bpage->page.mapping = NULL;
- __free_page(&bpage->page);
+ if (bpage->page)
+ __free_page(bpage->page);
+ kfree(bpage);
}

/*
@@ -237,11 +231,16 @@ static int rb_allocate_pages(struct ring
unsigned i;

for (i = 0; i < nr_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
+
addr = __get_free_page(GFP_KERNEL);
if (!addr)
goto free_pages;
- page = (struct buffer_page *)virt_to_page(addr);
- list_add(&page->list, &pages);
+ page->page = (void *)addr;
}

list_splice(&pages, head);
@@ -262,6 +261,7 @@ static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *page;
unsigned long addr;
int ret;

@@ -275,10 +275,17 @@ rb_allocate_cpu_buffer(struct ring_buffe
spin_lock_init(&cpu_buffer->lock);
INIT_LIST_HEAD(&cpu_buffer->pages);

+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto fail_free_buffer;
+
+ cpu_buffer->reader_page = page;
addr = __get_free_page(GFP_KERNEL);
if (!addr)
- goto fail_free_buffer;
- cpu_buffer->reader_page = (struct buffer_page *)virt_to_page(addr);
+ goto fail_free_reader;
+ page->page = (void *)addr;
+
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
cpu_buffer->reader_page->size = 0;

@@ -523,11 +530,16 @@ int ring_buffer_resize(struct ring_buffe

for_each_buffer_cpu(buffer, cpu) {
for (i = 0; i < new_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page),
+ cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
addr = __get_free_page(GFP_KERNEL);
if (!addr)
goto free_pages;
- page = (struct buffer_page *)virt_to_page(addr);
- list_add(&page->list, &pages);
+ page->page = (void *)addr;
}
}

@@ -567,9 +579,7 @@ static inline int rb_null_event(struct r

static inline void *rb_page_index(struct buffer_page *page, unsigned index)
{
- void *addr = page_address(&page->page);
-
- return addr + index;
+ return page->page + index;
}

static inline struct ring_buffer_event *


2008-10-01 17:37:17

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: allocate buffer page pointer

* Steven Rostedt ([email protected]) wrote:
>
> The current method of overlaying the page frame as the buffer page pointer
> can be very dangerous and limits our ability to do other things with
> a page from the buffer, like send it off to disk.
>
> This patch allocates the buffer_page instead of overlaying the page's
> page frame. The use of the buffer_page has hardly changed due to this.
>
> Signed-off-by: Steven Rostedt <[email protected]>
> ---
> kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++-------------------
> 1 file changed, 32 insertions(+), 22 deletions(-)
>
> Index: linux-tip.git/kernel/trace/ring_buffer.c
> ===================================================================
> --- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-01 09:37:23.000000000 -0400
> +++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-01 11:03:16.000000000 -0400
> @@ -115,16 +115,10 @@ void *ring_buffer_event_data(struct ring
> * Thanks to Peter Zijlstra for suggesting this idea.
> */
> struct buffer_page {
> - union {
> - struct {
> - unsigned long flags; /* mandatory */
> - atomic_t _count; /* mandatory */
> - u64 time_stamp; /* page time stamp */
> - unsigned size; /* size of page data */
> - struct list_head list; /* list of free pages */
> - };
> - struct page page;
> - };
> + u64 time_stamp; /* page time stamp */
> + unsigned size; /* size of page data */
> + struct list_head list; /* list of free pages */
> + void *page; /* Actual data page */
> };
>
> /*
> @@ -133,9 +127,9 @@ struct buffer_page {
> */
> static inline void free_buffer_page(struct buffer_page *bpage)
> {
> - reset_page_mapcount(&bpage->page);
> - bpage->page.mapping = NULL;
> - __free_page(&bpage->page);
> + if (bpage->page)
> + __free_page(bpage->page);
> + kfree(bpage);
> }
>
> /*
> @@ -237,11 +231,16 @@ static int rb_allocate_pages(struct ring
> unsigned i;
>
> for (i = 0; i < nr_pages; i++) {
> + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
> + GFP_KERNEL, cpu_to_node(cpu));
> + if (!page)
> + goto free_pages;
> + list_add(&page->list, &pages);
> +
> addr = __get_free_page(GFP_KERNEL);
> if (!addr)
> goto free_pages;
> - page = (struct buffer_page *)virt_to_page(addr);
> - list_add(&page->list, &pages);
> + page->page = (void *)addr;
> }
>
> list_splice(&pages, head);
> @@ -262,6 +261,7 @@ static struct ring_buffer_per_cpu *
> rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
> {
> struct ring_buffer_per_cpu *cpu_buffer;
> + struct buffer_page *page;
> unsigned long addr;
> int ret;
>
> @@ -275,10 +275,17 @@ rb_allocate_cpu_buffer(struct ring_buffe
> spin_lock_init(&cpu_buffer->lock);
> INIT_LIST_HEAD(&cpu_buffer->pages);
>
> + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
> + GFP_KERNEL, cpu_to_node(cpu));

Hi Steven,

I understand that you want to allocate these struct buffer_page in
memory local to a given cpu node, which is great, but why do you feel
you need to align them on cache_line_size() ?

Hrm.. you put the timestamp in there, so I guess you're concerned about
having a writer on one CPU, a reader on another, and the fact that you
will have cache line bouncing because of that.

Note that if you put the timestamp and the unused bytes in a tiny header
at the beginning of the page, you

1 - make this information directly accessible for disk, network I/O
without any other abstraction layer.
2 - won't have to do such alignment on the struct buffer_page, because
it will only be read once it's been allocated.

My 2 cents ;)

Mathieu

> + if (!page)
> + goto fail_free_buffer;
> +
> + cpu_buffer->reader_page = page;
> addr = __get_free_page(GFP_KERNEL);
> if (!addr)
> - goto fail_free_buffer;
> - cpu_buffer->reader_page = (struct buffer_page *)virt_to_page(addr);
> + goto fail_free_reader;
> + page->page = (void *)addr;
> +
> INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
> cpu_buffer->reader_page->size = 0;
>
> @@ -523,11 +530,16 @@ int ring_buffer_resize(struct ring_buffe
>
> for_each_buffer_cpu(buffer, cpu) {
> for (i = 0; i < new_pages; i++) {
> + page = kzalloc_node(ALIGN(sizeof(*page),
> + cache_line_size()),
> + GFP_KERNEL, cpu_to_node(cpu));
> + if (!page)
> + goto free_pages;
> + list_add(&page->list, &pages);
> addr = __get_free_page(GFP_KERNEL);
> if (!addr)
> goto free_pages;
> - page = (struct buffer_page *)virt_to_page(addr);
> - list_add(&page->list, &pages);
> + page->page = (void *)addr;
> }
> }
>
> @@ -567,9 +579,7 @@ static inline int rb_null_event(struct r
>
> static inline void *rb_page_index(struct buffer_page *page, unsigned index)
> {
> - void *addr = page_address(&page->page);
> -
> - return addr + index;
> + return page->page + index;
> }
>
> static inline struct ring_buffer_event *
>
>

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-10-01 17:50:29

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: allocate buffer page pointer


On Wed, 1 Oct 2008, Mathieu Desnoyers wrote:
>
> I understand that you want to allocate these struct buffer_page in
> memory local to a given cpu node, which is great, but why do you feel
> you need to align them on cache_line_size() ?
>
> Hrm.. you put the timestamp in there, so I guess you're concerned about
> having a writer on one CPU, a reader on another, and the fact that you
> will have cache line bouncing because of that.
>
> Note that if you put the timestamp and the unused bytes in a tiny header
> at the beginning of the page, you
>
> 1 - make this information directly accessible for disk, network I/O
> without any other abstraction layer.
> 2 - won't have to do such alignment on the struct buffer_page, because
> it will only be read once it's been allocated.
>

That was the approach I actually started with. But someone (I think
Peter) asked me to remove it.

Who knows, perhaps I can put it back. It's not that hard to do. This is
why I used BUF_PAGE_SIZE to determine the size of the buffer page.
Right now it BUF_PAGE_SIZE == PAGE_SIZE, but if we do add a header than
it will be BUF_PAGE_SIZE == PAGE_SIZE - sizeof(header)

-- Steve

2008-10-01 18:21:31

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: allocate buffer page pointer

* Steven Rostedt ([email protected]) wrote:
>
> The current method of overlaying the page frame as the buffer page pointer
> can be very dangerous and limits our ability to do other things with
> a page from the buffer, like send it off to disk.
>
> This patch allocates the buffer_page instead of overlaying the page's
> page frame. The use of the buffer_page has hardly changed due to this.
>
> Signed-off-by: Steven Rostedt <[email protected]>
> ---
> kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++-------------------
> 1 file changed, 32 insertions(+), 22 deletions(-)
>
> Index: linux-tip.git/kernel/trace/ring_buffer.c
> ===================================================================
> --- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-01 09:37:23.000000000 -0400
> +++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-01 11:03:16.000000000 -0400
> @@ -115,16 +115,10 @@ void *ring_buffer_event_data(struct ring
> * Thanks to Peter Zijlstra for suggesting this idea.
> */
> struct buffer_page {
> - union {
> - struct {
> - unsigned long flags; /* mandatory */
> - atomic_t _count; /* mandatory */
> - u64 time_stamp; /* page time stamp */
> - unsigned size; /* size of page data */
> - struct list_head list; /* list of free pages */
> - };
> - struct page page;
> - };
> + u64 time_stamp; /* page time stamp */
> + unsigned size; /* size of page data */
> + struct list_head list; /* list of free pages */
> + void *page; /* Actual data page */
> };
>
> /*
> @@ -133,9 +127,9 @@ struct buffer_page {
> */
> static inline void free_buffer_page(struct buffer_page *bpage)
> {
> - reset_page_mapcount(&bpage->page);
> - bpage->page.mapping = NULL;
> - __free_page(&bpage->page);
> + if (bpage->page)
> + __free_page(bpage->page);
> + kfree(bpage);
> }
>
> /*
> @@ -237,11 +231,16 @@ static int rb_allocate_pages(struct ring
> unsigned i;
>
> for (i = 0; i < nr_pages; i++) {
> + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
> + GFP_KERNEL, cpu_to_node(cpu));
> + if (!page)
> + goto free_pages;
> + list_add(&page->list, &pages);
> +
> addr = __get_free_page(GFP_KERNEL);

You could probably use alloc_pages_node instead here...

Mathieu

> if (!addr)
> goto free_pages;
> - page = (struct buffer_page *)virt_to_page(addr);
> - list_add(&page->list, &pages);
> + page->page = (void *)addr;
> }
>
> list_splice(&pages, head);
> @@ -262,6 +261,7 @@ static struct ring_buffer_per_cpu *
> rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
> {
> struct ring_buffer_per_cpu *cpu_buffer;
> + struct buffer_page *page;
> unsigned long addr;
> int ret;
>
> @@ -275,10 +275,17 @@ rb_allocate_cpu_buffer(struct ring_buffe
> spin_lock_init(&cpu_buffer->lock);
> INIT_LIST_HEAD(&cpu_buffer->pages);
>
> + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
> + GFP_KERNEL, cpu_to_node(cpu));
> + if (!page)
> + goto fail_free_buffer;
> +
> + cpu_buffer->reader_page = page;
> addr = __get_free_page(GFP_KERNEL);
> if (!addr)
> - goto fail_free_buffer;
> - cpu_buffer->reader_page = (struct buffer_page *)virt_to_page(addr);
> + goto fail_free_reader;
> + page->page = (void *)addr;
> +
> INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
> cpu_buffer->reader_page->size = 0;
>
> @@ -523,11 +530,16 @@ int ring_buffer_resize(struct ring_buffe
>
> for_each_buffer_cpu(buffer, cpu) {
> for (i = 0; i < new_pages; i++) {
> + page = kzalloc_node(ALIGN(sizeof(*page),
> + cache_line_size()),
> + GFP_KERNEL, cpu_to_node(cpu));
> + if (!page)
> + goto free_pages;
> + list_add(&page->list, &pages);
> addr = __get_free_page(GFP_KERNEL);
> if (!addr)
> goto free_pages;
> - page = (struct buffer_page *)virt_to_page(addr);
> - list_add(&page->list, &pages);
> + page->page = (void *)addr;
> }
> }
>
> @@ -567,9 +579,7 @@ static inline int rb_null_event(struct r
>
> static inline void *rb_page_index(struct buffer_page *page, unsigned index)
> {
> - void *addr = page_address(&page->page);
> -
> - return addr + index;
> + return page->page + index;
> }
>
> static inline struct ring_buffer_event *
>
>

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-10-02 08:54:33

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: allocate buffer page pointer


* Steven Rostedt <[email protected]> wrote:

>
> The current method of overlaying the page frame as the buffer page pointer
> can be very dangerous and limits our ability to do other things with
> a page from the buffer, like send it off to disk.
>
> This patch allocates the buffer_page instead of overlaying the page's
> page frame. The use of the buffer_page has hardly changed due to this.
>
> Signed-off-by: Steven Rostedt <[email protected]>
> ---
> kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++-------------------
> 1 file changed, 32 insertions(+), 22 deletions(-)

applied to tip/tracing/ftrace, with the extended changlog below - i
think this commit warrants that extra mention.

Ingo

--------------->
>From da78331b4ced2763322d732ac5ba275965853bde Mon Sep 17 00:00:00 2001
From: Steven Rostedt <[email protected]>
Date: Wed, 1 Oct 2008 10:52:51 -0400
Subject: [PATCH] ftrace: type cast filter+verifier

The mmiotrace map had a bug that would typecast the entry from
the trace to the wrong type. That is a known danger of C typecasts,
there's absolutely zero checking done on them.

Help that problem a bit by using a GCC extension to implement a
type filter that restricts the types that a trace record can be
cast into, and by adding a dynamic check (in debug mode) to verify
the type of the entry.

This patch adds a macro to assign all entries of ftrace using the type
of the variable and checking the entry id. The typecasts are now done
in the macro for only those types that it knows about, which should
be all the types that are allowed to be read from the tracer.

Signed-off-by: Steven Rostedt <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/trace/trace.c | 85 ++++++++++++++++++++++++++++------------
kernel/trace/trace.h | 42 ++++++++++++++++++++
kernel/trace/trace_mmiotrace.c | 14 +++++--
3 files changed, 112 insertions(+), 29 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c163406..948f7d8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1350,7 +1350,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
}
switch (entry->type) {
case TRACE_FN: {
- struct ftrace_entry *field = (struct ftrace_entry *)entry;
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);

seq_print_ip_sym(s, field->ip, sym_flags);
trace_seq_puts(s, " (");
@@ -1363,8 +1365,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
}
case TRACE_CTX:
case TRACE_WAKE: {
- struct ctx_switch_entry *field =
- (struct ctx_switch_entry *)entry;
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);

T = field->next_state < sizeof(state_to_char) ?
state_to_char[field->next_state] : 'X';
@@ -1384,7 +1387,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
break;
}
case TRACE_SPECIAL: {
- struct special_entry *field = (struct special_entry *)entry;
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);

trace_seq_printf(s, "# %ld %ld %ld\n",
field->arg1,
@@ -1393,7 +1398,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
break;
}
case TRACE_STACK: {
- struct stack_entry *field = (struct stack_entry *)entry;
+ struct stack_entry *field;
+
+ trace_assign_type(field, entry);

for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
if (i)
@@ -1404,7 +1411,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
break;
}
case TRACE_PRINT: {
- struct print_entry *field = (struct print_entry *)entry;
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);

seq_print_ip_sym(s, field->ip, sym_flags);
trace_seq_printf(s, ": %s", field->buf);
@@ -1454,7 +1463,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)

switch (entry->type) {
case TRACE_FN: {
- struct ftrace_entry *field = (struct ftrace_entry *)entry;
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);

ret = seq_print_ip_sym(s, field->ip, sym_flags);
if (!ret)
@@ -1480,8 +1491,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
}
case TRACE_CTX:
case TRACE_WAKE: {
- struct ctx_switch_entry *field =
- (struct ctx_switch_entry *)entry;
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);

S = field->prev_state < sizeof(state_to_char) ?
state_to_char[field->prev_state] : 'X';
@@ -1501,7 +1513,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
break;
}
case TRACE_SPECIAL: {
- struct special_entry *field = (struct special_entry *)entry;
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);

ret = trace_seq_printf(s, "# %ld %ld %ld\n",
field->arg1,
@@ -1512,7 +1526,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
break;
}
case TRACE_STACK: {
- struct stack_entry *field = (struct stack_entry *)entry;
+ struct stack_entry *field;
+
+ trace_assign_type(field, entry);

for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
if (i) {
@@ -1531,7 +1547,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
break;
}
case TRACE_PRINT: {
- struct print_entry *field = (struct print_entry *)entry;
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);

seq_print_ip_sym(s, field->ip, sym_flags);
trace_seq_printf(s, ": %s", field->buf);
@@ -1562,7 +1580,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)

switch (entry->type) {
case TRACE_FN: {
- struct ftrace_entry *field = (struct ftrace_entry *)entry;
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);

ret = trace_seq_printf(s, "%x %x\n",
field->ip,
@@ -1573,8 +1593,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
}
case TRACE_CTX:
case TRACE_WAKE: {
- struct ctx_switch_entry *field =
- (struct ctx_switch_entry *)entry;
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);

S = field->prev_state < sizeof(state_to_char) ?
state_to_char[field->prev_state] : 'X';
@@ -1596,7 +1617,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
}
case TRACE_SPECIAL:
case TRACE_STACK: {
- struct special_entry *field = (struct special_entry *)entry;
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);

ret = trace_seq_printf(s, "# %ld %ld %ld\n",
field->arg1,
@@ -1607,7 +1630,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
break;
}
case TRACE_PRINT: {
- struct print_entry *field = (struct print_entry *)entry;
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);

trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
if (entry->flags & TRACE_FLAG_CONT)
@@ -1648,7 +1673,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)

switch (entry->type) {
case TRACE_FN: {
- struct ftrace_entry *field = (struct ftrace_entry *)entry;
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);

SEQ_PUT_HEX_FIELD_RET(s, field->ip);
SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
@@ -1656,8 +1683,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
}
case TRACE_CTX:
case TRACE_WAKE: {
- struct ctx_switch_entry *field =
- (struct ctx_switch_entry *)entry;
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);

S = field->prev_state < sizeof(state_to_char) ?
state_to_char[field->prev_state] : 'X';
@@ -1676,7 +1704,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
}
case TRACE_SPECIAL:
case TRACE_STACK: {
- struct special_entry *field = (struct special_entry *)entry;
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);

SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
@@ -1705,15 +1735,18 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)

switch (entry->type) {
case TRACE_FN: {
- struct ftrace_entry *field = (struct ftrace_entry *)entry;
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);

SEQ_PUT_FIELD_RET(s, field->ip);
SEQ_PUT_FIELD_RET(s, field->parent_ip);
break;
}
case TRACE_CTX: {
- struct ctx_switch_entry *field =
- (struct ctx_switch_entry *)entry;
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);

SEQ_PUT_FIELD_RET(s, field->prev_pid);
SEQ_PUT_FIELD_RET(s, field->prev_prio);
@@ -1725,7 +1758,9 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
}
case TRACE_SPECIAL:
case TRACE_STACK: {
- struct special_entry *field = (struct special_entry *)entry;
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);

SEQ_PUT_FIELD_RET(s, field->arg1);
SEQ_PUT_FIELD_RET(s, field->arg2);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a921ba5..f02042d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -177,6 +177,48 @@ struct trace_array {
struct trace_array_cpu *data[NR_CPUS];
};

+#define FTRACE_CMP_TYPE(var, type) \
+ __builtin_types_compatible_p(typeof(var), type *)
+
+#undef IF_ASSIGN
+#define IF_ASSIGN(var, entry, etype, id) \
+ if (FTRACE_CMP_TYPE(var, etype)) { \
+ var = (typeof(var))(entry); \
+ WARN_ON(id && (entry)->type != id); \
+ break; \
+ }
+
+/* Will cause compile errors if type is not found. */
+extern void __ftrace_bad_type(void);
+
+/*
+ * The trace_assign_type is a verifier that the entry type is
+ * the same as the type being assigned. To add new types simply
+ * add a line with the following format:
+ *
+ * IF_ASSIGN(var, ent, type, id);
+ *
+ * Where "type" is the trace type that includes the trace_entry
+ * as the "ent" item. And "id" is the trace identifier that is
+ * used in the trace_type enum.
+ *
+ * If the type can have more than one id, then use zero.
+ */
+#define trace_assign_type(var, ent) \
+ do { \
+ IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
+ IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
+ IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
+ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
+ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
+ IF_ASSIGN(var, ent, struct special_entry, 0); \
+ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
+ TRACE_MMIO_RW); \
+ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
+ TRACE_MMIO_MAP); \
+ IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
+ __ftrace_bad_type(); \
+ } while (0)

/* Return values for print_line callback */
enum print_line_t {
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 1a266aa..0e819f4 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -178,15 +178,17 @@ print_out:
static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
{
struct trace_entry *entry = iter->ent;
- struct trace_mmiotrace_rw *field =
- (struct trace_mmiotrace_rw *)entry;
- struct mmiotrace_rw *rw = &field->rw;
+ struct trace_mmiotrace_rw *field;
+ struct mmiotrace_rw *rw;
struct trace_seq *s = &iter->seq;
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, 1000000ULL);
unsigned secs = (unsigned long)t;
int ret = 1;

+ trace_assign_type(field, entry);
+ rw = &field->rw;
+
switch (rw->opcode) {
case MMIO_READ:
ret = trace_seq_printf(s,
@@ -222,13 +224,17 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
static enum print_line_t mmio_print_map(struct trace_iterator *iter)
{
struct trace_entry *entry = iter->ent;
- struct mmiotrace_map *m = (struct mmiotrace_map *)entry;
+ struct trace_mmiotrace_map *field;
+ struct mmiotrace_map *m;
struct trace_seq *s = &iter->seq;
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, 1000000ULL);
unsigned secs = (unsigned long)t;
int ret;

+ trace_assign_type(field, entry);
+ m = &field->map;
+
switch (m->opcode) {
case MMIO_PROBE:
ret = trace_seq_printf(s,

2008-10-02 08:59:55

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: allocate buffer page pointer


* Ingo Molnar <[email protected]> wrote:

> applied to tip/tracing/ftrace, with the extended changlog below - i
> think this commit warrants that extra mention.

that was for the type filter commit. The 3 patches i've picked up into
tip/tracing/ring-buffer are:

b6eeea4: ftrace: preempt disable over interrupt disable
52abc82: ring_buffer: allocate buffer page pointer
da78331: ftrace: type cast filter+verifier

Thanks,

Ingo

2008-10-02 09:09:23

by Ingo Molnar

[permalink] [raw]
Subject: [PATCH] ring-buffer: fix build error


* Ingo Molnar <[email protected]> wrote:

>
> * Ingo Molnar <[email protected]> wrote:
>
> > applied to tip/tracing/ftrace, with the extended changlog below - i
> > think this commit warrants that extra mention.
>
> that was for the type filter commit. The 3 patches i've picked up into
> tip/tracing/ring-buffer are:
>
> b6eeea4: ftrace: preempt disable over interrupt disable
> 52abc82: ring_buffer: allocate buffer page pointer
> da78331: ftrace: type cast filter+verifier

trivial build fix below.

Ingo

>From 339ce9af3e6cbc02442b0b356c1ecb80a8ae92fb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <[email protected]>
Date: Thu, 2 Oct 2008 11:04:14 +0200
Subject: [PATCH] ring-buffer: fix build error
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit

fix:

kernel/trace/ring_buffer.c: In function ‘rb_allocate_pages’:
kernel/trace/ring_buffer.c:235: error: ‘cpu’ undeclared (first use in this function)
kernel/trace/ring_buffer.c:235: error: (Each undeclared identifier is reported only once
kernel/trace/ring_buffer.c:235: error: for each function it appears in.)

Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/trace/ring_buffer.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9814571..54a3098 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -232,7 +232,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,

for (i = 0; i < nr_pages; i++) {
page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
- GFP_KERNEL, cpu_to_node(cpu));
+ GFP_KERNEL, cpu_to_node(i));
if (!page)
goto free_pages;
list_add(&page->list, &pages);

2008-10-02 09:13:31

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: allocate buffer page pointer

On Thu, 2 Oct 2008 10:50:30 +0200 Ingo Molnar <[email protected]> wrote:

>
> * Steven Rostedt <[email protected]> wrote:
>
> >
> > The current method of overlaying the page frame as the buffer page pointer
> > can be very dangerous and limits our ability to do other things with
> > a page from the buffer, like send it off to disk.
> >
> > This patch allocates the buffer_page instead of overlaying the page's
> > page frame. The use of the buffer_page has hardly changed due to this.
> >
> > Signed-off-by: Steven Rostedt <[email protected]>
> > ---
> > kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++-------------------
> > 1 file changed, 32 insertions(+), 22 deletions(-)
>
> applied to tip/tracing/ftrace, with the extended changlog below - i
> think this commit warrants that extra mention.
>
> Ingo
>
> --------------->
> >From da78331b4ced2763322d732ac5ba275965853bde Mon Sep 17 00:00:00 2001
> From: Steven Rostedt <[email protected]>
> Date: Wed, 1 Oct 2008 10:52:51 -0400
> Subject: [PATCH] ftrace: type cast filter+verifier
>
> The mmiotrace map had a bug that would typecast the entry from
> the trace to the wrong type. That is a known danger of C typecasts,
> there's absolutely zero checking done on them.
>
> Help that problem a bit by using a GCC extension to implement a
> type filter that restricts the types that a trace record can be
> cast into, and by adding a dynamic check (in debug mode) to verify
> the type of the entry.
>
> This patch adds a macro to assign all entries of ftrace using the type
> of the variable and checking the entry id. The typecasts are now done
> in the macro for only those types that it knows about, which should
> be all the types that are allowed to be read from the tracer.
>

I'm somewhat at a loss here because I'm unable to find any version of
kernel/trace/trace.c which looks anything like the one which is being
patched, but...

> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -1350,7 +1350,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
> }
> switch (entry->type) {
> case TRACE_FN: {
> - struct ftrace_entry *field = (struct ftrace_entry *)entry;

Why was this code using a cast in the first place? It should be using
entry->some_field_i_dont_have_here? That was the whole point in using
the anonymous union in struct trace_entry?

2008-10-02 09:44:53

by Ingo Molnar

[permalink] [raw]
Subject: [boot crash] Re: [PATCH] ring-buffer: fix build error


* Ingo Molnar <[email protected]> wrote:

> > that was for the type filter commit. The 3 patches i've picked up into
> > tip/tracing/ring-buffer are:
> >
> > b6eeea4: ftrace: preempt disable over interrupt disable
> > 52abc82: ring_buffer: allocate buffer page pointer
> > da78331: ftrace: type cast filter+verifier
>
> trivial build fix below.

ok, these latest ring-buffer updates cause more serious trouble, i just
got this boot crash on a testbox:

[ 0.324003] calling tracer_alloc_buffers+0x0/0x14a @ 1
[ 0.328008] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
[ 0.332001] IP: [<ffffffff8027d28b>] ring_buffer_alloc+0x207/0x3fc
[ 0.332001] PGD 0
[ 0.332001] Oops: 0000 [1] SMP
[ 0.332001] CPU 0
[ 0.332001] Modules linked in:
[ 0.332001] Pid: 1, comm: swapper Not tainted 2.6.27-rc8-tip-01064-gd163d6b-dirty #1
[ 0.332001] RIP: 0010:[<ffffffff8027d28b>] [<ffffffff8027d28b>] ring_buffer_alloc+0x207/0x3fc
[ 0.332001] RSP: 0018:ffff88003f9d7de0 EFLAGS: 00010287
[ 0.332001] RAX: 0000000000000000 RBX: ffffffff80b08404 RCX: 0000000000000067
[ 0.332001] RDX: 0000000000000004 RSI: 00000000000080d0 RDI: ffffffffffffffc0
[ 0.332001] RBP: ffff88003f9d7e80 R08: ffff88003f8010b4 R09: 000000000003db02
[ 0.332001] R10: 0000000000000000 R11: 0000000000000001 R12: ffff88003f801600
[ 0.332001] R13: 0000000000000004 R14: ffff88003f801580 R15: ffff88003f801618
[ 0.332001] FS: 0000000000000000(0000) GS:ffffffff80a68280(0000) knlGS:0000000000000000
[ 0.332001] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[ 0.332001] CR2: 0000000000000008 CR3: 0000000000201000 CR4: 00000000000006e0
[ 0.332001] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 0.332001] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 0.332001] Process swapper (pid: 1, threadinfo ffff88003f9d6000, task ffff88003f9d8000)
[ 0.332001] Stack: ffff88003f9d7df0 ffff88003f9d7e40 0000000000000283 ffffffff80b08404
[ 0.332001] ffffffff80b08404 ffff88003f801598 0000000000000000 ffff88003f801598
[ 0.332001] ffff88003f801580 0000016000000000 ffff88003f801600 ffff88003f9a2a40
[ 0.332001] Call Trace:
[ 0.332001] [<ffffffff80a95f41>] ? tracer_alloc_buffers+0x0/0x14a
[ 0.332001] [<ffffffff80a95f67>] tracer_alloc_buffers+0x26/0x14a
[ 0.332001] [<ffffffff80a95f41>] ? tracer_alloc_buffers+0x0/0x14a
[ 0.332001] [<ffffffff80209056>] do_one_initcall+0x56/0x144
[ 0.332001] [<ffffffff80a87d4a>] ? native_smp_prepare_cpus+0x2aa/0x2ef
[ 0.332001] [<ffffffff80a7c8ce>] kernel_init+0x69/0x20e
[ 0.332001] [<ffffffff8020d4e9>] child_rip+0xa/0x11
[ 0.332001] [<ffffffff80257896>] ? __atomic_notifier_call_chain+0xd/0xf
[ 0.332001] [<ffffffff80a7c865>] ? kernel_init+0x0/0x20e
[ 0.332001] [<ffffffff8020d4df>] ? child_rip+0x0/0x11
[ 0.332001] Code: 48 8b 05 d9 b2 7e 00 49 63 d5 48 63 0d 1b b2 7e 00 48 8b 9d 78 ff ff ff be d0 80 00 00 48 8b 04 d0 48 89 cf 48 83 c1 27 48 f7 df <48> 8b 40 08 48 21 cf 8b 14 03 e8 4e b5 02 00 48 85 c0 48 89 c3
[ 0.332001] RIP [<ffffffff8027d28b>] ring_buffer_alloc+0x207/0x3fc
[ 0.332001] RSP <ffff88003f9d7de0>
[ 0.332001] CR2: 0000000000000008
[ 0.332002] Kernel panic - not syncing: Fatal exception

full serial log and config attached. I'm excluding these latest commits
from tip/master for now:

339ce9a: ring-buffer: fix build error
b6eeea4: ftrace: preempt disable over interrupt disable
52abc82: ring_buffer: allocate buffer page pointer
da78331: ftrace: type cast filter+verifier

i'm quite sure 52abc82 causes this problem.

Another 64-bit testbox crashed too meanwhile.

Ingo

2008-10-02 09:45:47

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: allocate buffer page pointer


* Andrew Morton <[email protected]> wrote:

> On Thu, 2 Oct 2008 10:50:30 +0200 Ingo Molnar <[email protected]> wrote:
>
> >
> > * Steven Rostedt <[email protected]> wrote:
> >
> > >
> > > The current method of overlaying the page frame as the buffer page pointer
> > > can be very dangerous and limits our ability to do other things with
> > > a page from the buffer, like send it off to disk.
> > >
> > > This patch allocates the buffer_page instead of overlaying the page's
> > > page frame. The use of the buffer_page has hardly changed due to this.
> > >
> > > Signed-off-by: Steven Rostedt <[email protected]>
> > > ---
> > > kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++-------------------
> > > 1 file changed, 32 insertions(+), 22 deletions(-)
> >
> > applied to tip/tracing/ftrace, with the extended changlog below - i
> > think this commit warrants that extra mention.
> >
> > Ingo
> >
> > --------------->
> > >From da78331b4ced2763322d732ac5ba275965853bde Mon Sep 17 00:00:00 2001
> > From: Steven Rostedt <[email protected]>
> > Date: Wed, 1 Oct 2008 10:52:51 -0400
> > Subject: [PATCH] ftrace: type cast filter+verifier
> >
> > The mmiotrace map had a bug that would typecast the entry from
> > the trace to the wrong type. That is a known danger of C typecasts,
> > there's absolutely zero checking done on them.
> >
> > Help that problem a bit by using a GCC extension to implement a
> > type filter that restricts the types that a trace record can be
> > cast into, and by adding a dynamic check (in debug mode) to verify
> > the type of the entry.
> >
> > This patch adds a macro to assign all entries of ftrace using the type
> > of the variable and checking the entry id. The typecasts are now done
> > in the macro for only those types that it knows about, which should
> > be all the types that are allowed to be read from the tracer.
> >
>
> I'm somewhat at a loss here because I'm unable to find any version of
> kernel/trace/trace.c which looks anything like the one which is being
> patched, but...

it's in tip/tracing/ring-buffer (also tip/master), but we are still
working on it (i just triggered a crash with it) so i havent pushed it
out into the auto-ftrace-next branch yet.

> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -1350,7 +1350,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
> > }
> > switch (entry->type) {
> > case TRACE_FN: {
> > - struct ftrace_entry *field = (struct ftrace_entry *)entry;
>
> Why was this code using a cast in the first place? It should be using
> entry->some_field_i_dont_have_here? That was the whole point in using
> the anonymous union in struct trace_entry?

this whole mega-thread was about removing that union and turning the
tracer into a type-opaque entity. I warned about the inevitable
fragility - but with this type filter approach the risks should be
substantially lower.

Ingo

2008-10-02 13:06:24

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: allocate buffer page pointer


On Thu, 2 Oct 2008, Andrew Morton wrote:
> >
> > This patch adds a macro to assign all entries of ftrace using the type
> > of the variable and checking the entry id. The typecasts are now done
> > in the macro for only those types that it knows about, which should
> > be all the types that are allowed to be read from the tracer.
> >
>
> I'm somewhat at a loss here because I'm unable to find any version of
> kernel/trace/trace.c which looks anything like the one which is being
> patched, but...

As Ingo mentioned, you don't have this yet. And be happy that you don't
;-)

This patch is to fix the patch that did this.

>
> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -1350,7 +1350,9 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
> > }
> > switch (entry->type) {
> > case TRACE_FN: {
> > - struct ftrace_entry *field = (struct ftrace_entry *)entry;
>
> Why was this code using a cast in the first place? It should be using
> entry->some_field_i_dont_have_here? That was the whole point in using
> the anonymous union in struct trace_entry?

Because the ring_buffer now allows for variable length entries, having a
one size fits all entry is not optimal.

But because C is not the best for typecasting, we have this macro to help
solve the issue. Instead of registering everything into a single union and
causing small fields to be large, we have a macro you can register your
type with instead.

-- Steve

2008-10-02 13:16:31

by Steven Rostedt

[permalink] [raw]
Subject: Re: [boot crash] Re: [PATCH] ring-buffer: fix build error


On Thu, 2 Oct 2008, Ingo Molnar wrote:
> [ 0.332002] Kernel panic - not syncing: Fatal exception
>
> full serial log and config attached. I'm excluding these latest commits
> from tip/master for now:

Thanks I'll take a look at this.

>
> 339ce9a: ring-buffer: fix build error
> b6eeea4: ftrace: preempt disable over interrupt disable

The above "preempt disable" is the most likely culprit. I'm trying to get
towards an interrupt disabled free and lockless code path. But in doing
so, one must be extra careful. This is why I'm taking baby steps towards
this approach. Any little error in one of these steps, and you have race
conditions biting you.

The above replaces interrupt disables with preempt disables, uses the
atomic data disable to protect against reentrancy. But this could also
have opened up a bug that was not present with the interrupts disabled
version.


> 52abc82: ring_buffer: allocate buffer page pointer
> da78331: ftrace: type cast filter+verifier

>
> i'm quite sure 52abc82 causes this problem.

Hmm, that was a trivial patch. Perhaps the trivial ones are the ones that
are most likely to be error prone. A developer will take much more care in
developing a patch that is complex than something he sees as trivial ;-)

-- Steve

>
> Another 64-bit testbox crashed too meanwhile.
>
> Ingo
>

2008-10-02 13:17:42

by Steven Rostedt

[permalink] [raw]
Subject: Re: [boot crash] Re: [PATCH] ring-buffer: fix build error


On Thu, 2 Oct 2008, Ingo Molnar wrote:
> * Ingo Molnar <[email protected]> wrote:

> full serial log and config attached. I'm excluding these latest commits

-ENOATTACHMENT

-- Steve

2008-10-02 15:53:58

by Ingo Molnar

[permalink] [raw]
Subject: Re: [boot crash] Re: [PATCH] ring-buffer: fix build error


* Steven Rostedt <[email protected]> wrote:

>
> On Thu, 2 Oct 2008, Ingo Molnar wrote:
> > * Ingo Molnar <[email protected]> wrote:
>
> > full serial log and config attached. I'm excluding these latest commits
>
> -ENOATTACHMENT

attached.

You can get the broken tree by doing this in tip/master:

git-merge tip/tracing/ring-buffer

Ingo


Attachments:
(No filename) (347.00 B)
crash.log (16.60 kB)
config-Thu_Oct__2_11_00_18_CEST_2008.bad (50.13 kB)
Download all attachments

2008-10-02 18:27:49

by Steven Rostedt

[permalink] [raw]
Subject: Re: [boot crash] Re: [PATCH] ring-buffer: fix build error


On Thu, 2 Oct 2008, Ingo Molnar wrote:

>
> * Steven Rostedt <[email protected]> wrote:
>
> >
> > On Thu, 2 Oct 2008, Ingo Molnar wrote:
> > > * Ingo Molnar <[email protected]> wrote:
> >
> > > full serial log and config attached. I'm excluding these latest commits
> >
> > -ENOATTACHMENT
>
> attached.
>
> You can get the broken tree by doing this in tip/master:
>
> git-merge tip/tracing/ring-buffer

I've just checked-out tip/tracing/ring-buffer. That tree is still broken
too, right? Or do I need to merge it to get the broken version?

-- Steve

2008-10-02 18:59:12

by Ingo Molnar

[permalink] [raw]
Subject: Re: [boot crash] Re: [PATCH] ring-buffer: fix build error


* Steven Rostedt <[email protected]> wrote:

>
> On Thu, 2 Oct 2008, Ingo Molnar wrote:
>
> >
> > * Steven Rostedt <[email protected]> wrote:
> >
> > >
> > > On Thu, 2 Oct 2008, Ingo Molnar wrote:
> > > > * Ingo Molnar <[email protected]> wrote:
> > >
> > > > full serial log and config attached. I'm excluding these latest commits
> > >
> > > -ENOATTACHMENT
> >
> > attached.
> >
> > You can get the broken tree by doing this in tip/master:
> >
> > git-merge tip/tracing/ring-buffer
>
> I've just checked-out tip/tracing/ring-buffer. That tree is still broken
> too, right? Or do I need to merge it to get the broken version?

yes, that's very likely broken too in a standalone way as well - but to
get the exact tree i tested i'd suggest:

git checkout tip/master
git merge tip/tracing/ring-buffer

Ingo

2008-10-02 23:18:23

by Steven Rostedt

[permalink] [raw]
Subject: [PATCH] ring_buffer: map to cpu not page


My original patch had a compile bug when NUMA was configured. I
referenced cpu when it should have been cpu_buffer->cpu.

Ingo quickly fixed this bug by replacing cpu with 'i' because that
was the loop counter. Unfortunately, the 'i' was the counter of
pages, not CPUs. This caused a crash when the number of pages allocated
for the buffers exceeded the number of pages, which would usually
be the case.

Signed-off-by: Steven Rostedt <[email protected]>
---
kernel/trace/ring_buffer.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-02 09:09:01.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-02 18:58:44.000000000 -0400
@@ -232,7 +232,7 @@ static int rb_allocate_pages(struct ring

for (i = 0; i < nr_pages; i++) {
page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
- GFP_KERNEL, cpu_to_node(i));
+ GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
if (!page)
goto free_pages;
list_add(&page->list, &pages);

2008-10-02 23:37:24

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: map to cpu not page


On Thu, 2 Oct 2008, Steven Rostedt wrote:

>
> My original patch had a compile bug when NUMA was configured. I
> referenced cpu when it should have been cpu_buffer->cpu.
>
> Ingo quickly fixed this bug by replacing cpu with 'i' because that
> was the loop counter. Unfortunately, the 'i' was the counter of
> pages, not CPUs. This caused a crash when the number of pages allocated
> for the buffers exceeded the number of pages, which would usually

That should have been:

"when the number of pages allocated for the buffers exceeded the number
of cpus".

> be the case.
>

-- Steve

2008-10-03 04:56:20

by Mathieu Desnoyers

[permalink] [raw]
Subject: [PATCH] x86 Topology cpu_to_node parameter check

Declare NUMA-less cpu_to_node with a check that the cpu parameter exists so
people without NUMA test configs (namely Steven Rostedt and myself who ran into
this error both in the same day with different implementations) stop doing this
trivial mistake.

Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Andrew Morton <[email protected]>
CC: Ingo Molnar <[email protected]>
---
include/asm-x86/topology.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

Index: linux-2.6-lttng/include/asm-x86/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-x86/topology.h 2008-10-03 00:37:05.000000000 -0400
+++ linux-2.6-lttng/include/asm-x86/topology.h 2008-10-03 00:45:52.000000000 -0400
@@ -182,9 +182,9 @@ extern int __node_distance(int, int);

#else /* !CONFIG_NUMA */

-#define numa_node_id() 0
-#define cpu_to_node(cpu) 0
-#define early_cpu_to_node(cpu) 0
+#define numa_node_id() 0
+#define cpu_to_node(cpu) ((void)(cpu),0)
+#define early_cpu_to_node(cpu) cpu_to_node(cpu)

static inline const cpumask_t *_node_to_cpumask_ptr(int node)
{
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-10-03 05:21:05

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] x86 Topology cpu_to_node parameter check


On Fri, 3 Oct 2008, Mathieu Desnoyers wrote:

> ---
> include/asm-x86/topology.h | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> Index: linux-2.6-lttng/include/asm-x86/topology.h
> ===================================================================
> --- linux-2.6-lttng.orig/include/asm-x86/topology.h 2008-10-03 00:37:05.000000000 -0400
> +++ linux-2.6-lttng/include/asm-x86/topology.h 2008-10-03 00:45:52.000000000 -0400
> @@ -182,9 +182,9 @@ extern int __node_distance(int, int);
>
> #else /* !CONFIG_NUMA */
>
> -#define numa_node_id() 0
> -#define cpu_to_node(cpu) 0
> -#define early_cpu_to_node(cpu) 0
> +#define numa_node_id() 0
> +#define cpu_to_node(cpu) ((void)(cpu),0)
> +#define early_cpu_to_node(cpu) cpu_to_node(cpu)

Actually the proper way would be to have:

static inline int cpu_to_node(int cpu)
{
return 0;
}

static inline int early_cpu_to_node(int cpu)
{
return 0;
}

This way you also get typechecks.

-- Steve

>
> static inline const cpumask_t *_node_to_cpumask_ptr(int node)
> {
> --
> Mathieu Desnoyers
> OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
>

2008-10-03 07:29:59

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] ring_buffer: map to cpu not page


* Steven Rostedt <[email protected]> wrote:

> My original patch had a compile bug when NUMA was configured. I
> referenced cpu when it should have been cpu_buffer->cpu.
>
> Ingo quickly fixed this bug by replacing cpu with 'i' because that was
> the loop counter. Unfortunately, the 'i' was the counter of pages, not
> CPUs. This caused a crash when the number of pages allocated for the
> buffers exceeded the number of pages, which would usually be the case.
>
> Signed-off-by: Steven Rostedt <[email protected]>

> for (i = 0; i < nr_pages; i++) {
> page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
> - GFP_KERNEL, cpu_to_node(i));
> + GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));

oh, stupid typo of the year :-)

applied to tip/tracing/ring-buffer, thanks for tracking it down! I've
reactivated the topic branch for tip/master and i'm running a few tests
before pushing it out for wider testing.

Ingo

2008-10-03 15:56:34

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH] x86 Topology cpu_to_node parameter check

* Steven Rostedt ([email protected]) wrote:
>
> On Fri, 3 Oct 2008, Mathieu Desnoyers wrote:
>
> > ---
> > include/asm-x86/topology.h | 6 +++---
> > 1 file changed, 3 insertions(+), 3 deletions(-)
> >
> > Index: linux-2.6-lttng/include/asm-x86/topology.h
> > ===================================================================
> > --- linux-2.6-lttng.orig/include/asm-x86/topology.h 2008-10-03 00:37:05.000000000 -0400
> > +++ linux-2.6-lttng/include/asm-x86/topology.h 2008-10-03 00:45:52.000000000 -0400
> > @@ -182,9 +182,9 @@ extern int __node_distance(int, int);
> >
> > #else /* !CONFIG_NUMA */
> >
> > -#define numa_node_id() 0
> > -#define cpu_to_node(cpu) 0
> > -#define early_cpu_to_node(cpu) 0
> > +#define numa_node_id() 0
> > +#define cpu_to_node(cpu) ((void)(cpu),0)
> > +#define early_cpu_to_node(cpu) cpu_to_node(cpu)
>
> Actually the proper way would be to have:
>
> static inline int cpu_to_node(int cpu)
> {
> return 0;
> }
>
> static inline int early_cpu_to_node(int cpu)
> {
> return 0;
> }
>
> This way you also get typechecks.
>

That's how I did it first, but then I looked at asm-generic/topology.h
and have seen it uses #defines. Should we change them too ?

Mathieu

> -- Steve
>
> >
> > static inline const cpumask_t *_node_to_cpumask_ptr(int node)
> > {
> > --
> > Mathieu Desnoyers
> > OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
> >
>

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-10-03 16:26:49

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] x86 Topology cpu_to_node parameter check


On Fri, 3 Oct 2008, Mathieu Desnoyers wrote:
>
> That's how I did it first, but then I looked at asm-generic/topology.h
> and have seen it uses #defines. Should we change them too ?
>

The old way of doing this is with defines. But all new code should be
static inline functions when feasible. This way we can get typechecking
on the parameters even when the configuration is disabled.

Even if the rest of the file uses defines, the new code should be
static inlines. Eventually, even the old defines will be converted.

-- Steve

2008-10-03 17:27:30

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH] x86 Topology cpu_to_node parameter check

* Steven Rostedt ([email protected]) wrote:
>
> On Fri, 3 Oct 2008, Mathieu Desnoyers wrote:
> >
> > That's how I did it first, but then I looked at asm-generic/topology.h
> > and have seen it uses #defines. Should we change them too ?
> >
>
> The old way of doing this is with defines. But all new code should be
> static inline functions when feasible. This way we can get typechecking
> on the parameters even when the configuration is disabled.
>
> Even if the rest of the file uses defines, the new code should be
> static inlines. Eventually, even the old defines will be converted.
>
> -- Steve
>

Argh, I think topology.h is utterly broken :-(

Have you noticed the subtile interaction between the

include/asm-x86/topology.h :

#define numa_node_id() 0
#define cpu_to_node(cpu) 0
#define early_cpu_to_node(cpu) 0
...
#include <asm-generic/topology.h>


and
include/asm-generic/topology.h :
#ifndef cpu_to_node
#define cpu_to_node(cpu) ((void)(cpu),0)
#endif

If any architecture decide for some reason to use a static inline rather
than a define, as currently done with node_to_first_cpu :

include/asm-x86/topology.h :
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}
...
#include <asm-generic/topology.h>

include/asm-generic/topology.h :
#ifndef node_to_first_cpu
#define node_to_first_cpu(node) ((void)(node),0)
#endif

(which will override the static inline !)

It results in an override of the arch-specific version. Nice eh ?

Mathieu

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-10-03 17:55:03

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] x86 Topology cpu_to_node parameter check


On Fri, 3 Oct 2008, Mathieu Desnoyers wrote:

> * Steven Rostedt ([email protected]) wrote:
> >
> > On Fri, 3 Oct 2008, Mathieu Desnoyers wrote:
> > >
> > > That's how I did it first, but then I looked at asm-generic/topology.h
> > > and have seen it uses #defines. Should we change them too ?
> > >
> >
> > The old way of doing this is with defines. But all new code should be
> > static inline functions when feasible. This way we can get typechecking
> > on the parameters even when the configuration is disabled.
> >
> > Even if the rest of the file uses defines, the new code should be
> > static inlines. Eventually, even the old defines will be converted.
> >
> > -- Steve
> >
>
> Argh, I think topology.h is utterly broken :-(
>
> Have you noticed the subtile interaction between the
>
> include/asm-x86/topology.h :
>
> #define numa_node_id() 0
> #define cpu_to_node(cpu) 0
> #define early_cpu_to_node(cpu) 0
> ...
> #include <asm-generic/topology.h>
>
>
> and
> include/asm-generic/topology.h :
> #ifndef cpu_to_node
> #define cpu_to_node(cpu) ((void)(cpu),0)
> #endif
>
> If any architecture decide for some reason to use a static inline rather
> than a define, as currently done with node_to_first_cpu :
>
> include/asm-x86/topology.h :
> static inline int node_to_first_cpu(int node)
> {
> return first_cpu(cpu_online_map);
> }
> ...
> #include <asm-generic/topology.h>
>
> include/asm-generic/topology.h :
> #ifndef node_to_first_cpu
> #define node_to_first_cpu(node) ((void)(node),0)
> #endif
>
> (which will override the static inline !)
>
> It results in an override of the arch-specific version. Nice eh ?

Seems that they expect cpu_to_node to be a macro if NUMA is not
configured.

Actually, since the asm-generic/topology.h does have the cpu shown
(although not in inline format), the solution here is to simply remove
the

#define cpu_to_node() 0

And we can still make the early_cpu_to_node a static inline since it is
not referenced in the generic code.

-- Steve

2008-10-03 18:53:47

by Mathieu Desnoyers

[permalink] [raw]
Subject: [PATCH] topology.h define mess fix

* Steven Rostedt ([email protected]) wrote:

> Seems that they expect cpu_to_node to be a macro if NUMA is not
> configured.
>
> Actually, since the asm-generic/topology.h does have the cpu shown
> (although not in inline format), the solution here is to simply remove
> the
>
> #define cpu_to_node() 0
>
> And we can still make the early_cpu_to_node a static inline since it is
> not referenced in the generic code.
>
> -- Steve
>

Or we take a deep breath and clean this up ?

Ingo, I build tested this on x86_64 (with and without NUMA), x86_32,
powerpc, arm and mips. I applies to both -tip and 2.6.27-rc8. Could it
be pulled into -tip for further testing ?

Note that checkpatch.pl spills a warning telling me to modify include/asm-*/
files (unexisting in my tree) rather than arch/*/include/asm/. Any idea
why ?

Thanks,

Mathieu


topology.h define mess fix

Original goal : Declare NUMA-less cpu_to_node with a check that the cpu
parameter exists so people without NUMA test configs (namely Steven Rostedt and
myself who ran into this error both in the same day with different
implementations) stop doing this trivial mistake.

End result :

Argh, I think topology.h is utterly broken :-(

Have you noticed the subtile interaction between the

include/asm-x86/topology.h :

#define numa_node_id() 0
#define cpu_to_node(cpu) 0
#define early_cpu_to_node(cpu) 0
...
#include <asm-generic/topology.h>


and
include/asm-generic/topology.h :
#ifndef cpu_to_node
#define cpu_to_node(cpu) ((void)(cpu),0)
#endif

If any architecture decide for some reason to use a static inline rather
than a define, as currently done with node_to_first_cpu :

include/asm-x86/topology.h :
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}
...
#include <asm-generic/topology.h>

include/asm-generic/topology.h :
#ifndef node_to_first_cpu
#define node_to_first_cpu(node) ((void)(node),0)
#endif

(which will override the static inline !)

It results in an override of the arch-specific version. Nice eh ?

This patch fixes this issue by declaring static inlines in
asm-generic/topology.h and by requiring a _complete_ override of the
topology functions when an architecture needs to override them. An
architecture overriding the topology functions should not include
asm-generic/topology.h anymore.

- alpha needs careful checking, as it did not implement parent_node nor
node_to_first_cpu previously.
- Major cross-architecture built test is required.

Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Andrew Morton <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
---
arch/alpha/include/asm/topology.h | 38 +++++++++++++++++++
arch/ia64/include/asm/topology.h | 16 ++++----
arch/powerpc/include/asm/topology.h | 12 +++++-
arch/sh/include/asm/topology.h | 11 -----
include/asm-generic/topology.h | 70 ++++++++++++++++++++----------------
include/asm-x86/topology.h | 66 +++++++++++++++++++++++++--------
6 files changed, 144 insertions(+), 69 deletions(-)

Index: linux-2.6-lttng/include/asm-x86/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-x86/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/include/asm-x86/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -38,6 +38,8 @@
/* Node not present */
#define NUMA_NO_NODE (-1)

+struct pci_bus;
+
#ifdef CONFIG_NUMA
#include <linux/cpumask.h>
#include <asm/mpspec.h>
@@ -116,7 +118,6 @@ static inline cpumask_t node_to_cpumask(

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

-/* Replace default node_to_cpumask_ptr with optimized version */
#define node_to_cpumask_ptr(v, node) \
const cpumask_t *v = _node_to_cpumask_ptr(node)

@@ -129,8 +130,14 @@ static inline cpumask_t node_to_cpumask(
* Returns the number of the node containing Node 'node'. This
* architecture is flat, so it is a pretty simple function!
*/
-#define parent_node(node) (node)
+static inline int parent_node(int node)
+{
+ return node;
+}

+/*
+ * Leave those as defines so we don't have to include linux/pci.h.
+ */
#define pcibus_to_node(bus) __pcibus_to_node(bus)
#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)

@@ -180,42 +187,67 @@ extern int __node_distance(int, int);
#define node_distance(a, b) __node_distance(a, b)
#endif

+/* Returns the number of the first CPU on Node 'node'. */
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+
#else /* !CONFIG_NUMA */

-#define numa_node_id() 0
-#define cpu_to_node(cpu) 0
-#define early_cpu_to_node(cpu) 0
+static inline int numa_node_id(void)
+{
+ return 0;
+}

-static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+/*
+ * We override asm-generic/topology.h.
+ */
+static inline int cpu_to_node(int cpu)
{
- return &cpu_online_map;
+ return 0;
}
+
+static inline int parent_node(int node)
+{
+ return 0;
+}
+
static inline cpumask_t node_to_cpumask(int node)
{
return cpu_online_map;
}
+
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}

+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}
+
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+{
+ return &cpu_online_map;
+}
+
/* Replace default node_to_cpumask_ptr with optimized version */
#define node_to_cpumask_ptr(v, node) \
const cpumask_t *v = _node_to_cpumask_ptr(node)

#define node_to_cpumask_ptr_next(v, node) \
v = _node_to_cpumask_ptr(node)
-#endif
-
-#include <asm-generic/topology.h>

-#ifdef CONFIG_NUMA
-/* Returns the number of the first CPU on Node 'node'. */
-static inline int node_to_first_cpu(int node)
-{
- node_to_cpumask_ptr(mask, node);
- return first_cpu(*mask);
-}
#endif

extern cpumask_t cpu_coregroup_map(int cpu);
Index: linux-2.6-lttng/arch/alpha/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/alpha/include/asm/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/arch/alpha/include/asm/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -41,7 +41,43 @@ static inline cpumask_t node_to_cpumask(

#define pcibus_to_cpumask(bus) (cpu_online_map)

+struct pci_bus;
+
+static inline int parent_node(int node)
+{
+ return node;
+}
+
+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}
+
+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+
+#else
+
+#include <asm-generic/topology.h>
+
#endif /* !CONFIG_NUMA */
-# include <asm-generic/topology.h>

#endif /* _ASM_ALPHA_TOPOLOGY_H */
Index: linux-2.6-lttng/arch/ia64/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/ia64/include/asm/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/arch/ia64/include/asm/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -104,6 +104,15 @@ void build_cpu_to_node_map(void);
.nr_balance_failed = 0, \
}

+#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
+ CPU_MASK_ALL : \
+ node_to_cpumask(pcibus_to_node(bus)) \
+ )
+
+#else
+
+#include <asm-generic/topology.h>
+
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
@@ -116,11 +125,4 @@ void build_cpu_to_node_map(void);

extern void arch_fix_phys_package_id(int num, u32 slot);

-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
-
-#include <asm-generic/topology.h>
-
#endif /* _ASM_IA64_TOPOLOGY_H */
Index: linux-2.6-lttng/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/powerpc/include/asm/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/arch/powerpc/include/asm/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -77,6 +77,14 @@ extern void __init dump_numa_cpu_topolog
extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid);

+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
#else

static inline int of_node_to_nid(struct device_node *device)
@@ -96,10 +104,10 @@ static inline void sysfs_remove_device_f
{
}

-#endif /* CONFIG_NUMA */
-
#include <asm-generic/topology.h>

+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_SMP
#include <asm/cputable.h>
#define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
Index: linux-2.6-lttng/arch/sh/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/sh/include/asm/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/arch/sh/include/asm/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -29,17 +29,6 @@
.nr_balance_failed = 0, \
}

-#define cpu_to_node(cpu) ((void)(cpu),0)
-#define parent_node(node) ((void)(node),0)
-
-#define node_to_cpumask(node) ((void)node, cpu_online_map)
-#define node_to_first_cpu(node) ((void)(node),0)
-
-#define pcibus_to_node(bus) ((void)(bus), -1)
-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
#endif

#include <asm-generic/topology.h>
Index: linux-2.6-lttng/include/asm-generic/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-generic/topology.h 2008-10-03 14:41:13.000000000 -0400
+++ linux-2.6-lttng/include/asm-generic/topology.h 2008-10-03 14:41:16.000000000 -0400
@@ -27,44 +27,52 @@
#ifndef _ASM_GENERIC_TOPOLOGY_H
#define _ASM_GENERIC_TOPOLOGY_H

-#ifndef CONFIG_NUMA
-
-/* Other architectures wishing to use this simple topology API should fill
- in the below functions as appropriate in their own <asm/topology.h> file. */
-#ifndef cpu_to_node
-#define cpu_to_node(cpu) ((void)(cpu),0)
-#endif
-#ifndef parent_node
-#define parent_node(node) ((void)(node),0)
-#endif
-#ifndef node_to_cpumask
-#define node_to_cpumask(node) ((void)node, cpu_online_map)
-#endif
-#ifndef node_to_first_cpu
-#define node_to_first_cpu(node) ((void)(node),0)
-#endif
-#ifndef pcibus_to_node
-#define pcibus_to_node(bus) ((void)(bus), -1)
-#endif
-
-#ifndef pcibus_to_cpumask
-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
-#endif
-
-#endif /* CONFIG_NUMA */
+/*
+ * Other architectures wishing to use this simple topology API should fill
+ * in the below functions as appropriate in their own <asm/topology.h> file,
+ * and _don't_ include asm-generic/topology.h.
+ */
+
+struct pci_bus;
+
+static inline int cpu_to_node(int cpu)
+{
+ return 0;
+}
+
+static inline int parent_node(int node)
+{
+ return 0;
+}
+
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return cpu_online_map;
+}
+
+static inline int node_to_first_cpu(int node)
+{
+ return 0;
+}
+
+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}

/* returns pointer to cpumask for specified node */
-#ifndef node_to_cpumask_ptr
-
#define node_to_cpumask_ptr(v, node) \
cpumask_t _##v = node_to_cpumask(node); \
const cpumask_t *v = &_##v

#define node_to_cpumask_ptr_next(v, node) \
_##v = node_to_cpumask(node)
-#endif

#endif /* _ASM_GENERIC_TOPOLOGY_H */

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2008-10-03 20:14:19

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH] topology.h define mess fix

> - Major cross-architecture built test is required.

Some problems on ia64. With defconfig build (which has
CONFIG_NUMA=y) I see this:

kernel/sched.c: In function 'find_next_best_node':
kernel/sched.c:6920: error: implicit declaration of function 'node_to_cpumask_ptr'
kernel/sched.c:6920: error: '__tmp__' undeclared (first use in this function)
kernel/sched.c:6920: error: (Each undeclared identifier is reported only once
kernel/sched.c:6920: error: for each function it appears in.)
kernel/sched.c: In function 'sched_domain_node_span':
kernel/sched.c:6952: error: 'nodemask' undeclared (first use in this function)
kernel/sched.c:6953: warning: ISO C90 forbids mixed declarations and code
kernel/sched.c:6964: error: implicit declaration of function 'node_to_cpumask_ptr_next'
kernel/sched.c: In function '__build_sched_domains':
kernel/sched.c:7510: error: 'pnodemask' undeclared (first use in this function)

On an "allnoconfig" build (which curiously also has CONFIG_NUMA=y :-) I see

mm/page_alloc.c: In function 'find_next_best_node':
mm/page_alloc.c:2086: error: implicit declaration of function 'node_to_cpumask_ptr'
mm/page_alloc.c:2086: error: 'tmp' undeclared (first use in this function)
mm/page_alloc.c:2086: error: (Each undeclared identifier is reported only once
mm/page_alloc.c:2086: error: for each function it appears in.)
mm/page_alloc.c:2107: error: implicit declaration of function 'node_to_cpumask_ptr_next'

There are most probably more errors ... but this is where the build stopped.

-Tony

2008-10-03 22:52:39

by Mathieu Desnoyers

[permalink] [raw]
Subject: [PATCH] topology.h define mess fix v2

* Luck, Tony ([email protected]) wrote:
> > - Major cross-architecture built test is required.
>
> Some problems on ia64. With defconfig build (which has
> CONFIG_NUMA=y) I see this:
>
[...]

Ah, I did not select config "generic" for ia64, and thus did not get
CONFIG_NUMA. Here is a v2 which fixes this.

Thanks for testing this.

Mathieu


topology.h define mess fix v2

Update : build fix for ia64 CONFIG_NUMA.

Original goal : Declare NUMA-less cpu_to_node with a check that the cpu
parameter exists so people without NUMA test configs (namely Steven Rostedt and
myself who ran into this error both in the same day with different
implementations) stop doing this trivial mistake.

End result :

Argh, I think topology.h is utterly broken :-(

Have you noticed the subtile interaction between the

include/asm-x86/topology.h :

#define numa_node_id() 0
#define cpu_to_node(cpu) 0
#define early_cpu_to_node(cpu) 0
...
#include <asm-generic/topology.h>


and
include/asm-generic/topology.h :
#ifndef cpu_to_node
#define cpu_to_node(cpu) ((void)(cpu),0)
#endif

If any architecture decide for some reason to use a static inline rather
than a define, as currently done with node_to_first_cpu :

include/asm-x86/topology.h :
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}
...
#include <asm-generic/topology.h>

include/asm-generic/topology.h :
#ifndef node_to_first_cpu
#define node_to_first_cpu(node) ((void)(node),0)
#endif

(which will override the static inline !)

It results in an override of the arch-specific version. Nice eh ?

This patch fixes this issue by declaring static inlines in
asm-generic/topology.h and by requiring a _complete_ override of the
topology functions when an architecture needs to override them. An
architecture overriding the topology functions should not include
asm-generic/topology.h anymore.

- alpha needs careful checking, as it did not implement parent_node nor
node_to_first_cpu previously.
- Major cross-architecture built test is required.

Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Andrew Morton <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
---
arch/alpha/include/asm/topology.h | 38 +++++++++++++++++++
arch/ia64/include/asm/topology.h | 24 ++++++++----
arch/powerpc/include/asm/topology.h | 12 +++++-
arch/sh/include/asm/topology.h | 11 -----
include/asm-generic/topology.h | 70 ++++++++++++++++++++----------------
include/asm-x86/topology.h | 66 +++++++++++++++++++++++++--------
6 files changed, 152 insertions(+), 69 deletions(-)

Index: linux-2.6-lttng/include/asm-x86/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-x86/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/include/asm-x86/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -38,6 +38,8 @@
/* Node not present */
#define NUMA_NO_NODE (-1)

+struct pci_bus;
+
#ifdef CONFIG_NUMA
#include <linux/cpumask.h>
#include <asm/mpspec.h>
@@ -116,7 +118,6 @@ static inline cpumask_t node_to_cpumask(

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

-/* Replace default node_to_cpumask_ptr with optimized version */
#define node_to_cpumask_ptr(v, node) \
const cpumask_t *v = _node_to_cpumask_ptr(node)

@@ -129,8 +130,14 @@ static inline cpumask_t node_to_cpumask(
* Returns the number of the node containing Node 'node'. This
* architecture is flat, so it is a pretty simple function!
*/
-#define parent_node(node) (node)
+static inline int parent_node(int node)
+{
+ return node;
+}

+/*
+ * Leave those as defines so we don't have to include linux/pci.h.
+ */
#define pcibus_to_node(bus) __pcibus_to_node(bus)
#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)

@@ -180,42 +187,67 @@ extern int __node_distance(int, int);
#define node_distance(a, b) __node_distance(a, b)
#endif

+/* Returns the number of the first CPU on Node 'node'. */
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+
#else /* !CONFIG_NUMA */

-#define numa_node_id() 0
-#define cpu_to_node(cpu) 0
-#define early_cpu_to_node(cpu) 0
+static inline int numa_node_id(void)
+{
+ return 0;
+}

-static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+/*
+ * We override asm-generic/topology.h.
+ */
+static inline int cpu_to_node(int cpu)
{
- return &cpu_online_map;
+ return 0;
}
+
+static inline int parent_node(int node)
+{
+ return 0;
+}
+
static inline cpumask_t node_to_cpumask(int node)
{
return cpu_online_map;
}
+
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}

+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}
+
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+{
+ return &cpu_online_map;
+}
+
/* Replace default node_to_cpumask_ptr with optimized version */
#define node_to_cpumask_ptr(v, node) \
const cpumask_t *v = _node_to_cpumask_ptr(node)

#define node_to_cpumask_ptr_next(v, node) \
v = _node_to_cpumask_ptr(node)
-#endif
-
-#include <asm-generic/topology.h>

-#ifdef CONFIG_NUMA
-/* Returns the number of the first CPU on Node 'node'. */
-static inline int node_to_first_cpu(int node)
-{
- node_to_cpumask_ptr(mask, node);
- return first_cpu(*mask);
-}
#endif

extern cpumask_t cpu_coregroup_map(int cpu);
Index: linux-2.6-lttng/arch/alpha/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/alpha/include/asm/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/arch/alpha/include/asm/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -41,7 +41,43 @@ static inline cpumask_t node_to_cpumask(

#define pcibus_to_cpumask(bus) (cpu_online_map)

+struct pci_bus;
+
+static inline int parent_node(int node)
+{
+ return node;
+}
+
+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}
+
+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+
+#else
+
+#include <asm-generic/topology.h>
+
#endif /* !CONFIG_NUMA */
-# include <asm-generic/topology.h>

#endif /* _ASM_ALPHA_TOPOLOGY_H */
Index: linux-2.6-lttng/arch/ia64/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/ia64/include/asm/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/arch/ia64/include/asm/topology.h 2008-10-03 18:36:47.000000000 -0400
@@ -104,6 +104,23 @@ void build_cpu_to_node_map(void);
.nr_balance_failed = 0, \
}

+#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
+ CPU_MASK_ALL : \
+ node_to_cpumask(pcibus_to_node(bus)) \
+ )
+
+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
+#else
+
+#include <asm-generic/topology.h>
+
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
@@ -116,11 +133,4 @@ void build_cpu_to_node_map(void);

extern void arch_fix_phys_package_id(int num, u32 slot);

-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
-
-#include <asm-generic/topology.h>
-
#endif /* _ASM_IA64_TOPOLOGY_H */
Index: linux-2.6-lttng/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/powerpc/include/asm/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/arch/powerpc/include/asm/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -77,6 +77,14 @@ extern void __init dump_numa_cpu_topolog
extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid);

+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
#else

static inline int of_node_to_nid(struct device_node *device)
@@ -96,10 +104,10 @@ static inline void sysfs_remove_device_f
{
}

-#endif /* CONFIG_NUMA */
-
#include <asm-generic/topology.h>

+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_SMP
#include <asm/cputable.h>
#define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
Index: linux-2.6-lttng/arch/sh/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/sh/include/asm/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/arch/sh/include/asm/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -29,17 +29,6 @@
.nr_balance_failed = 0, \
}

-#define cpu_to_node(cpu) ((void)(cpu),0)
-#define parent_node(node) ((void)(node),0)
-
-#define node_to_cpumask(node) ((void)node, cpu_online_map)
-#define node_to_first_cpu(node) ((void)(node),0)
-
-#define pcibus_to_node(bus) ((void)(bus), -1)
-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
#endif

#include <asm-generic/topology.h>
Index: linux-2.6-lttng/include/asm-generic/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-generic/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/include/asm-generic/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -27,44 +27,52 @@
#ifndef _ASM_GENERIC_TOPOLOGY_H
#define _ASM_GENERIC_TOPOLOGY_H

-#ifndef CONFIG_NUMA
-
-/* Other architectures wishing to use this simple topology API should fill
- in the below functions as appropriate in their own <asm/topology.h> file. */
-#ifndef cpu_to_node
-#define cpu_to_node(cpu) ((void)(cpu),0)
-#endif
-#ifndef parent_node
-#define parent_node(node) ((void)(node),0)
-#endif
-#ifndef node_to_cpumask
-#define node_to_cpumask(node) ((void)node, cpu_online_map)
-#endif
-#ifndef node_to_first_cpu
-#define node_to_first_cpu(node) ((void)(node),0)
-#endif
-#ifndef pcibus_to_node
-#define pcibus_to_node(bus) ((void)(bus), -1)
-#endif
-
-#ifndef pcibus_to_cpumask
-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
-#endif
-
-#endif /* CONFIG_NUMA */
+/*
+ * Other architectures wishing to use this simple topology API should fill
+ * in the below functions as appropriate in their own <asm/topology.h> file,
+ * and _don't_ include asm-generic/topology.h.
+ */
+
+struct pci_bus;
+
+static inline int cpu_to_node(int cpu)
+{
+ return 0;
+}
+
+static inline int parent_node(int node)
+{
+ return 0;
+}
+
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return cpu_online_map;
+}
+
+static inline int node_to_first_cpu(int node)
+{
+ return 0;
+}
+
+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}

/* returns pointer to cpumask for specified node */
-#ifndef node_to_cpumask_ptr
-
#define node_to_cpumask_ptr(v, node) \
cpumask_t _##v = node_to_cpumask(node); \
const cpumask_t *v = &_##v

#define node_to_cpumask_ptr_next(v, node) \
_##v = node_to_cpumask(node)
-#endif

#endif /* _ASM_GENERIC_TOPOLOGY_H */

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68