Date: 2012-02-02 20:01:40
From: Vaibhav Nagarnaik
Subject: [PATCH v5 1/4] trace: Add per_cpu ring buffer control files

Add a debugfs entry called buffer_size_kb under the per_cpu/ folder for
each CPU, to control the ring buffer size of each CPU independently.

If the global buffer_size_kb file is used to set the size, all the
individual ring buffers are adjusted to the given size. The global file
then reports the common size, to maintain backward compatibility.

If the buffer_size_kb file under the per_cpu/ directory is used to
change the buffer size of a specific CPU, only that CPU's ring buffer is
resized. When tracing/buffer_size_kb is then read, it reports 'X' to
indicate that the per-CPU ring buffer sizes are no longer equal.
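
For illustration, a usage sketch (assuming debugfs is mounted at the
usual /sys/kernel/debug; sizes and CPU numbers are arbitrary):

  # cd /sys/kernel/debug/tracing
  # echo 2048 > buffer_size_kb               # resize every per-cpu buffer
  # echo 512 > per_cpu/cpu1/buffer_size_kb   # resize only CPU 1's buffer
  # cat buffer_size_kb
  X
  # cat per_cpu/cpu1/buffer_size_kb
  512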

Signed-off-by: Vaibhav Nagarnaik <[email protected]>
---
Changelog v5-v4:
* Rebased to latest upstream

include/linux/ring_buffer.h | 6 +-
kernel/trace/ring_buffer.c | 248 ++++++++++++++++++++++++-------------------
kernel/trace/trace.c | 191 ++++++++++++++++++++++++++-------
kernel/trace/trace.h | 2 +-
4 files changed, 297 insertions(+), 150 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 67be037..ad36702 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -96,9 +96,11 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
__ring_buffer_alloc((size), (flags), &__key); \
})

+#define RING_BUFFER_ALL_CPUS -1
+
void ring_buffer_free(struct ring_buffer *buffer);

-int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, int cpu);

void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val);

@@ -129,7 +131,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
int ring_buffer_iter_empty(struct ring_buffer_iter *iter);

-unsigned long ring_buffer_size(struct ring_buffer *buffer);
+unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu);

void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
void ring_buffer_reset(struct ring_buffer *buffer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f5b7b5c..c778ab9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -481,6 +481,7 @@ struct ring_buffer_per_cpu {
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
+ unsigned int nr_pages;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
@@ -498,10 +499,12 @@ struct ring_buffer_per_cpu {
unsigned long read_bytes;
u64 write_stamp;
u64 read_stamp;
+ /* ring buffer pages to update, > 0 to add, < 0 to remove */
+ int nr_pages_to_update;
+ struct list_head new_pages; /* new pages to add */
};

struct ring_buffer {
- unsigned pages;
unsigned flags;
int cpus;
atomic_t record_disabled;
@@ -995,14 +998,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}

-static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned nr_pages)
+static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
{
+ int i;
struct buffer_page *bpage, *tmp;
- LIST_HEAD(pages);
- unsigned i;
-
- WARN_ON(!nr_pages);

for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1013,15 +1012,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
*/
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL | __GFP_NORETRY,
- cpu_to_node(cpu_buffer->cpu));
+ cpu_to_node(cpu));
if (!bpage)
goto free_pages;

- rb_check_bpage(cpu_buffer, bpage);
+ list_add(&bpage->list, pages);

- list_add(&bpage->list, &pages);
-
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
goto free_pages;
@@ -1029,6 +1026,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_init_page(bpage->page);
}

+ return 0;
+
+free_pages:
+ list_for_each_entry_safe(bpage, tmp, pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+
+ return -ENOMEM;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ LIST_HEAD(pages);
+
+ WARN_ON(!nr_pages);
+
+ if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
+ return -ENOMEM;
+
/*
* The ring buffer page list is a circular list that does not
* start and end with a list head. All page list items point to
@@ -1037,20 +1055,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
cpu_buffer->pages = pages.next;
list_del(&pages);

+ cpu_buffer->nr_pages = nr_pages;
+
rb_check_pages(cpu_buffer);

return 0;
-
- free_pages:
- list_for_each_entry_safe(bpage, tmp, &pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- return -ENOMEM;
}

static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
@@ -1084,7 +1097,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)

INIT_LIST_HEAD(&cpu_buffer->reader_page->list);

- ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ ret = rb_allocate_pages(cpu_buffer, nr_pages);
if (ret < 0)
goto fail_free_reader;

@@ -1145,7 +1158,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
{
struct ring_buffer *buffer;
int bsize;
- int cpu;
+ int cpu, nr_pages;

/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1156,14 +1169,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;

- buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;

/* need at least two pages */
- if (buffer->pages < 2)
- buffer->pages = 2;
+ if (nr_pages < 2)
+ nr_pages = 2;

/*
* In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1186,7 +1199,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,

for_each_buffer_cpu(buffer, cpu) {
buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, cpu);
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
goto fail_free_buffers;
}
@@ -1308,6 +1321,18 @@ out:
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
}

+static void update_pages_handler(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ if (cpu_buffer->nr_pages_to_update > 0)
+ rb_insert_pages(cpu_buffer, &cpu_buffer->new_pages,
+ cpu_buffer->nr_pages_to_update);
+ else
+ rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update);
+ cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
+ /* reset this value */
+ cpu_buffer->nr_pages_to_update = 0;
+}
+
/**
* ring_buffer_resize - resize the ring buffer
* @buffer: the buffer to resize.
@@ -1317,14 +1342,12 @@ out:
*
* Returns -1 on failure.
*/
-int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
+ int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned nr_pages, rm_pages, new_pages;
- struct buffer_page *bpage, *tmp;
- unsigned long buffer_size;
- LIST_HEAD(pages);
- int i, cpu;
+ unsigned nr_pages;
+ int cpu;

/*
* Always succeed at resizing a non-existent buffer:
@@ -1334,15 +1357,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)

size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
size *= BUF_PAGE_SIZE;
- buffer_size = buffer->pages * BUF_PAGE_SIZE;

/* we need a minimum of two pages */
if (size < BUF_PAGE_SIZE * 2)
size = BUF_PAGE_SIZE * 2;

- if (size == buffer_size)
- return size;
-
atomic_inc(&buffer->record_disabled);

/* Make sure all writers are done with this buffer. */
@@ -1353,68 +1372,56 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)

nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);

- if (size < buffer_size) {
-
- /* easy case, just free pages */
- if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
- goto out_fail;
-
- rm_pages = buffer->pages - nr_pages;
-
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ /* calculate the pages to update */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- rb_remove_pages(cpu_buffer, rm_pages);
- }
- goto out;
- }

- /*
- * This is a bit more difficult. We only want to add pages
- * when we can allocate enough for all CPUs. We do this
- * by allocating all the pages and storing them on a local
- * link list. If we succeed in our allocation, then we
- * add these pages to the cpu_buffers. Otherwise we just free
- * them all and return -ENOMEM;
- */
- if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
- goto out_fail;
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;

- new_pages = nr_pages - buffer->pages;
+ /*
+ * nothing more to do for removing pages or no update
+ */
+ if (cpu_buffer->nr_pages_to_update <= 0)
+ continue;

- for_each_buffer_cpu(buffer, cpu) {
- for (i = 0; i < new_pages; i++) {
- struct page *page;
/*
- * __GFP_NORETRY flag makes sure that the allocation
- * fails gracefully without invoking oom-killer and
- * the system is not destabilized.
+ * to add pages, make sure all new pages can be
+ * allocated without receiving ENOMEM
*/
- bpage = kzalloc_node(ALIGN(sizeof(*bpage),
- cache_line_size()),
- GFP_KERNEL | __GFP_NORETRY,
- cpu_to_node(cpu));
- if (!bpage)
- goto free_pages;
- list_add(&bpage->list, &pages);
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
- goto free_pages;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu))
+ /* not enough memory for new pages */
+ goto no_mem;
}
- }

- for_each_buffer_cpu(buffer, cpu) {
- cpu_buffer = buffer->buffers[cpu];
- rb_insert_pages(cpu_buffer, &pages, new_pages);
- }
+ /* wait for all the updates to complete */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (cpu_buffer->nr_pages_to_update) {
+ update_pages_handler(cpu_buffer);
+ }
+ }
+ } else {
+ cpu_buffer = buffer->buffers[cpu_id];
+ if (nr_pages == cpu_buffer->nr_pages)
+ goto out;

- if (RB_WARN_ON(buffer, !list_empty(&pages)))
- goto out_fail;
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;
+
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (cpu_buffer->nr_pages_to_update > 0 &&
+ __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu_id))
+ goto no_mem;
+
+ update_pages_handler(cpu_buffer);
+ }

out:
- buffer->pages = nr_pages;
put_online_cpus();
mutex_unlock(&buffer->mutex);

@@ -1422,25 +1429,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)

return size;

- free_pages:
- list_for_each_entry_safe(bpage, tmp, &pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
+ no_mem:
+ for_each_buffer_cpu(buffer, cpu) {
+ struct buffer_page *bpage, *tmp;
+ cpu_buffer = buffer->buffers[cpu];
+ /* reset this number regardless */
+ cpu_buffer->nr_pages_to_update = 0;
+ if (list_empty(&cpu_buffer->new_pages))
+ continue;
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
+ list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
}
put_online_cpus();
mutex_unlock(&buffer->mutex);
atomic_dec(&buffer->record_disabled);
return -ENOMEM;
-
- /*
- * Something went totally wrong, and we are too paranoid
- * to even clean up the mess.
- */
- out_fail:
- put_online_cpus();
- mutex_unlock(&buffer->mutex);
- atomic_dec(&buffer->record_disabled);
- return -1;
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);

@@ -1542,7 +1548,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* assign the commit to the tail.
*/
again:
- max_count = cpu_buffer->buffer->pages * 100;
+ max_count = cpu_buffer->nr_pages * 100;

while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
@@ -3563,9 +3569,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
* ring_buffer_size - return the size of the ring buffer (in bytes)
* @buffer: The ring buffer.
*/
-unsigned long ring_buffer_size(struct ring_buffer *buffer)
+unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
{
- return BUF_PAGE_SIZE * buffer->pages;
+ /*
+ * Earlier, this method returned
+ * BUF_PAGE_SIZE * buffer->nr_pages
+ * Since the nr_pages field is now removed, we have converted this to
+ * return the per cpu buffer value.
+ */
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);

@@ -3740,8 +3755,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
!cpumask_test_cpu(cpu, buffer_b->cpumask))
goto out;

+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
/* At least make sure the two buffers are somewhat the same */
- if (buffer_a->pages != buffer_b->pages)
+ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;

ret = -EAGAIN;
@@ -3755,9 +3773,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
if (atomic_read(&buffer_b->record_disabled))
goto out;

- cpu_buffer_a = buffer_a->buffers[cpu];
- cpu_buffer_b = buffer_b->buffers[cpu];
-
if (atomic_read(&cpu_buffer_a->record_disabled))
goto out;

@@ -4108,6 +4123,8 @@ static int rb_cpu_notify(struct notifier_block *self,
struct ring_buffer *buffer =
container_of(self, struct ring_buffer, cpu_notify);
long cpu = (long)hcpu;
+ int cpu_i, nr_pages_same;
+ unsigned int nr_pages;

switch (action) {
case CPU_UP_PREPARE:
@@ -4115,8 +4132,23 @@ static int rb_cpu_notify(struct notifier_block *self,
if (cpumask_test_cpu(cpu, buffer->cpumask))
return NOTIFY_OK;

+ nr_pages = 0;
+ nr_pages_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_buffer_cpu(buffer, cpu_i) {
+ /* fill in the size from first enabled cpu */
+ if (nr_pages == 0)
+ nr_pages = buffer->buffers[cpu_i]->nr_pages;
+ if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
+ nr_pages_same = 0;
+ break;
+ }
+ }
+ /* allocate minimum pages, user can later expand it */
+ if (!nr_pages_same)
+ nr_pages = 2;
buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, cpu);
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu]) {
WARN(1, "failed to allocate ring buffer on CPU %ld\n",
cpu);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3f1bc5..367659d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -787,7 +787,8 @@ __acquires(kernel_lock)

/* If we expanded the buffers, make sure the max is expanded too */
if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, trace_buf_size);
+ ring_buffer_resize(max_tr.buffer, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);

/* the test is responsible for initializing and enabling */
pr_info("Testing tracer %s: ", type->name);
@@ -803,7 +804,8 @@ __acquires(kernel_lock)

/* Shrink the max buffer again */
if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, 1);
+ ring_buffer_resize(max_tr.buffer, 1,
+ RING_BUFFER_ALL_CPUS);

printk(KERN_CONT "PASSED\n");
}
@@ -2916,7 +2918,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
return t->init(tr);
}

-static int __tracing_resize_ring_buffer(unsigned long size)
+static void set_buffer_entries(struct trace_array *tr, unsigned long val)
+{
+ int cpu;
+ for_each_tracing_cpu(cpu)
+ tr->data[cpu]->entries = val;
+}
+
+static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
{
int ret;

@@ -2927,19 +2936,32 @@ static int __tracing_resize_ring_buffer(unsigned long size)
*/
ring_buffer_expanded = 1;

- ret = ring_buffer_resize(global_trace.buffer, size);
+ ret = ring_buffer_resize(global_trace.buffer, size, cpu);
if (ret < 0)
return ret;

if (!current_trace->use_max_tr)
goto out;

- ret = ring_buffer_resize(max_tr.buffer, size);
+ ret = ring_buffer_resize(max_tr.buffer, size, cpu);
if (ret < 0) {
- int r;
+ int r = 0;
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ int i;
+ for_each_tracing_cpu(i) {
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.data[i]->entries,
+ i);
+ if (r < 0)
+ break;
+ }
+ } else {
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.data[cpu]->entries,
+ cpu);
+ }

- r = ring_buffer_resize(global_trace.buffer,
- global_trace.entries);
if (r < 0) {
/*
* AARGH! We are left with different
@@ -2961,14 +2983,21 @@ static int __tracing_resize_ring_buffer(unsigned long size)
return ret;
}

- max_tr.entries = size;
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&max_tr, size);
+ else
+ max_tr.data[cpu]->entries = size;
+
out:
- global_trace.entries = size;
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&global_trace, size);
+ else
+ global_trace.data[cpu]->entries = size;

return ret;
}

-static ssize_t tracing_resize_ring_buffer(unsigned long size)
+static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
{
int cpu, ret = size;

@@ -2984,12 +3013,19 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size)
atomic_inc(&max_tr.data[cpu]->disabled);
}

- if (size != global_trace.entries)
- ret = __tracing_resize_ring_buffer(size);
+ if (cpu_id != RING_BUFFER_ALL_CPUS) {
+ /* make sure, this cpu is enabled in the mask */
+ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }

+ ret = __tracing_resize_ring_buffer(size, cpu_id);
if (ret < 0)
ret = -ENOMEM;

+out:
for_each_tracing_cpu(cpu) {
if (global_trace.data[cpu])
atomic_dec(&global_trace.data[cpu]->disabled);
@@ -3020,7 +3056,8 @@ int tracing_update_buffers(void)

mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded)
- ret = __tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
mutex_unlock(&trace_types_lock);

return ret;
@@ -3044,7 +3081,8 @@ static int tracing_set_tracer(const char *buf)
mutex_lock(&trace_types_lock);

if (!ring_buffer_expanded) {
- ret = __tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
if (ret < 0)
goto out;
ret = 0;
@@ -3070,8 +3108,8 @@ static int tracing_set_tracer(const char *buf)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want preserve it.
*/
- ring_buffer_resize(max_tr.buffer, 1);
- max_tr.entries = 1;
+ ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&max_tr, 1);
}
destroy_trace_option_files(topts);

@@ -3079,10 +3117,17 @@ static int tracing_set_tracer(const char *buf)

topts = create_trace_option_files(current_trace);
if (current_trace->use_max_tr) {
- ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
- if (ret < 0)
- goto out;
- max_tr.entries = global_trace.entries;
+ int cpu;
+ /* we need to make per cpu buffer sizes equivalent */
+ for_each_tracing_cpu(cpu) {
+ ret = ring_buffer_resize(max_tr.buffer,
+ global_trace.data[cpu]->entries,
+ cpu);
+ if (ret < 0)
+ goto out;
+ max_tr.data[cpu]->entries =
+ global_trace.data[cpu]->entries;
+ }
}

if (t->init) {
@@ -3584,30 +3629,82 @@ out_err:
goto out;
}

+struct ftrace_entries_info {
+ struct trace_array *tr;
+ int cpu;
+};
+
+static int tracing_entries_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_entries_info *info;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->tr = &global_trace;
+ info->cpu = (unsigned long)inode->i_private;
+
+ filp->private_data = info;
+
+ return 0;
+}
+
static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_array *tr = filp->private_data;
- char buf[96];
- int r;
+ struct ftrace_entries_info *info = filp->private_data;
+ struct trace_array *tr = info->tr;
+ char buf[64];
+ int r = 0;
+ ssize_t ret;

mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded)
- r = sprintf(buf, "%lu (expanded: %lu)\n",
- tr->entries >> 10,
- trace_buf_size >> 10);
- else
- r = sprintf(buf, "%lu\n", tr->entries >> 10);
+
+ if (info->cpu == RING_BUFFER_ALL_CPUS) {
+ int cpu, buf_size_same;
+ unsigned long size;
+
+ size = 0;
+ buf_size_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_tracing_cpu(cpu) {
+ /* fill in the size from first enabled cpu */
+ if (size == 0)
+ size = tr->data[cpu]->entries;
+ if (size != tr->data[cpu]->entries) {
+ buf_size_same = 0;
+ break;
+ }
+ }
+
+ if (buf_size_same) {
+ if (!ring_buffer_expanded)
+ r = sprintf(buf, "%lu (expanded: %lu)\n",
+ size >> 10,
+ trace_buf_size >> 10);
+ else
+ r = sprintf(buf, "%lu\n", size >> 10);
+ } else
+ r = sprintf(buf, "X\n");
+ } else
+ r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
+
mutex_unlock(&trace_types_lock);

- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ return ret;
}

static ssize_t
tracing_entries_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ struct ftrace_entries_info *info = filp->private_data;
unsigned long val;
int ret;

@@ -3622,7 +3719,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
/* value is in KB */
val <<= 10;

- ret = tracing_resize_ring_buffer(val);
+ ret = tracing_resize_ring_buffer(val, info->cpu);
if (ret < 0)
return ret;

@@ -3631,6 +3728,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
return cnt;
}

+static int
+tracing_entries_release(struct inode *inode, struct file *filp)
+{
+ struct ftrace_entries_info *info = filp->private_data;
+
+ kfree(info);
+
+ return 0;
+}
+
static ssize_t
tracing_total_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -3642,7 +3749,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,

mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
- size += tr->entries >> 10;
+ size += tr->data[cpu]->entries >> 10;
if (!ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
@@ -3676,7 +3783,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
if (trace_flags & TRACE_ITER_STOP_ON_FREE)
tracing_off();
/* resize the ring buffer to 0 */
- tracing_resize_ring_buffer(0);
+ tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS);

return 0;
}
@@ -3875,9 +3982,10 @@ static const struct file_operations tracing_pipe_fops = {
};

static const struct file_operations tracing_entries_fops = {
- .open = tracing_open_generic,
+ .open = tracing_entries_open,
.read = tracing_entries_read,
.write = tracing_entries_write,
+ .release = tracing_entries_release,
.llseek = generic_file_llseek,
};

@@ -4329,6 +4437,9 @@ static void tracing_init_debugfs_percpu(long cpu)

trace_create_file("stats", 0444, d_cpu,
(void *) cpu, &tracing_stats_fops);
+
+ trace_create_file("buffer_size_kb", 0444, d_cpu,
+ (void *) cpu, &tracing_entries_fops);
}

#ifdef CONFIG_FTRACE_SELFTEST
@@ -4609,7 +4720,7 @@ static __init int tracer_init_debugfs(void)
(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);

trace_create_file("buffer_size_kb", 0644, d_tracer,
- &global_trace, &tracing_entries_fops);
+ (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);

trace_create_file("buffer_total_size_kb", 0444, d_tracer,
&global_trace, &tracing_total_entries_fops);
@@ -4862,8 +4973,6 @@ __init static int tracer_alloc_buffers(void)
WARN_ON(1);
goto out_free_cpumask;
}
- global_trace.entries = ring_buffer_size(global_trace.buffer);
-

#ifdef CONFIG_TRACER_MAX_TRACE
max_tr.buffer = ring_buffer_alloc(1, rb_flags);
@@ -4873,7 +4982,6 @@ __init static int tracer_alloc_buffers(void)
ring_buffer_free(global_trace.buffer);
goto out_free_cpumask;
}
- max_tr.entries = 1;
#endif

/* Allocate the first page for all buffers */
@@ -4882,6 +4990,11 @@ __init static int tracer_alloc_buffers(void)
max_tr.data[i] = &per_cpu(max_tr_data, i);
}

+ set_buffer_entries(&global_trace, ring_buf_size);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ set_buffer_entries(&max_tr, 1);
+#endif
+
trace_init_cmdlines();

register_tracer(&nop_trace);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b93ecba..decbca3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -125,6 +125,7 @@ struct trace_array_cpu {
atomic_t disabled;
void *buffer_page; /* ring buffer spare */

+ unsigned long entries;
unsigned long saved_latency;
unsigned long critical_start;
unsigned long critical_end;
@@ -146,7 +147,6 @@ struct trace_array_cpu {
*/
struct trace_array {
struct ring_buffer *buffer;
- unsigned long entries;
int cpu;
cycle_t time_start;
struct task_struct *waiter;
--
1.7.7.3


Date: 2012-02-02 20:01:29
From: Vaibhav Nagarnaik
Subject: [PATCH v5 4/4] trace: change CPU ring buffer state from tracing_cpumask

According to Documentation/trace/ftrace.txt:

tracing_cpumask:

This is a mask that lets the user only trace
on specified CPUS. The format is a hex string
representing the CPUS.

The tracing_cpumask currently doesn't affect the tracing state of
per-CPU ring buffers.

This patch enables or disables recording on a CPU's ring buffer as the
corresponding bit in tracing_cpumask is set or unset.
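
For example, on a 4-CPU system (an illustrative sketch; debugfs assumed
at /sys/kernel/debug):

  # cd /sys/kernel/debug/tracing
  # cat tracing_cpumask
  f
  # echo 3 > tracing_cpumask    # trace only CPUs 0 and 1; with this patch
                                # the ring buffers of CPUs 2 and 3 are also
                                # disabled, not just the per-cpu tracer state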

Signed-off-by: Vaibhav Nagarnaik <[email protected]>
---
kernel/trace/trace.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index abf1108..e25672e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2592,10 +2592,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (cpumask_test_cpu(cpu, tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_inc(&global_trace.data[cpu]->disabled);
+ ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
}
if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_dec(&global_trace.data[cpu]->disabled);
+ ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
}
}
arch_spin_unlock(&ftrace_max_lock);
--
1.7.7.3

Date: 2012-02-02 20:01:43
From: Vaibhav Nagarnaik
Subject: [PATCH v5 2/4] trace: Make removal of ring buffer pages atomic

This patch adds the capability to remove pages from a ring buffer
without destroying any existing data in it.

This is done by removing pages after the tail page, which ensures that
the empty pages in the ring buffer are removed first. If the head page
is among the pages to be removed, the page following the removed ones
becomes the new head page. This drops the oldest data from the ring
buffer while keeping the latest data available for reading.

To do this in a race-free manner, tracing is stopped for a very short
time while the pages to be removed are identified and unlinked from the
ring buffer. The pages are freed only after tracing is restarted, to
minimize the time tracing has to be stopped.

The removal of pages from a per-cpu ring buffer runs on that CPU
itself. This limits the events that go untraced during the operation to
those generated in NMI context.
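
An illustrative sketch of the resulting behaviour (sizes arbitrary,
debugfs assumed at /sys/kernel/debug):

  # cd /sys/kernel/debug/tracing
  # echo 1 > tracing_on
  # echo 256 > per_cpu/cpu0/buffer_size_kb   # shrink CPU 0's buffer while
                                             # tracing stays enabled
  # cat per_cpu/cpu0/trace                   # the newest events are kept;
                                             # only the oldest pages are gone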

Signed-off-by: Vaibhav Nagarnaik <[email protected]>
---
Changelog v5-v4:
* Rebased to latest upstream

kernel/trace/ring_buffer.c | 222 ++++++++++++++++++++++++++++++++-----------
kernel/trace/trace.c | 20 +----
2 files changed, 166 insertions(+), 76 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c778ab9..a7c66e4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -23,6 +23,8 @@
#include <asm/local.h>
#include "trace.h"

+static void update_pages_handler(struct work_struct *work);
+
/*
* The ring buffer header is special. We must manually up keep it.
*/
@@ -502,6 +504,8 @@ struct ring_buffer_per_cpu {
/* ring buffer pages to update, > 0 to add, < 0 to remove */
int nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
+ struct work_struct update_pages_work;
+ struct completion update_completion;
};

struct ring_buffer {
@@ -1080,6 +1084,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
raw_spin_lock_init(&cpu_buffer->reader_lock);
lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+ INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
+ init_completion(&cpu_buffer->update_completion);

bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -1267,32 +1273,107 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,

static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);

+static inline unsigned long rb_page_entries(struct buffer_page *bpage)
+{
+ return local_read(&bpage->entries) & RB_WRITE_MASK;
+}
+
+static inline unsigned long rb_page_write(struct buffer_page *bpage)
+{
+ return local_read(&bpage->write) & RB_WRITE_MASK;
+}
+
static void
-rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
{
- struct buffer_page *bpage;
- struct list_head *p;
- unsigned i;
+ unsigned int nr_removed;
+ int page_entries;
+ struct list_head *tail_page, *to_remove, *next_page;
+ unsigned long head_bit;
+ struct buffer_page *last_page, *first_page;
+ struct buffer_page *to_remove_page, *tmp_iter_page;

+ head_bit = 0;
raw_spin_lock_irq(&cpu_buffer->reader_lock);
- rb_head_page_deactivate(cpu_buffer);
-
- for (i = 0; i < nr_pages; i++) {
- if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
- goto out;
- p = cpu_buffer->pages->next;
- bpage = list_entry(p, struct buffer_page, list);
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
+ atomic_inc(&cpu_buffer->record_disabled);
+ /*
+ * We don't race with the readers since we have acquired the reader
+ * lock. We also don't race with writers after disabling recording.
+ * This makes it easy to figure out the first and the last page to be
+ * removed from the list. We remove all the pages in between including
+ * the first and last pages. This is done in a busy loop so that we
+ * lose the least number of traces.
+ * The pages are freed after we restart recording and unlock readers.
+ */
+ tail_page = &cpu_buffer->tail_page->list;
+ /*
+ * tail page might be on reader page, we remove the next page
+ * from the ring buffer
+ */
+ if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+ tail_page = rb_list_head(tail_page->next);
+ to_remove = tail_page;
+
+ /* start of pages to remove */
+ first_page = list_entry(rb_list_head(to_remove->next),
+ struct buffer_page, list);
+ for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
+ to_remove = rb_list_head(to_remove)->next;
+ head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
}
- if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
- goto out;

- rb_reset_cpu(cpu_buffer);
- rb_check_pages(cpu_buffer);
-
-out:
+ next_page = rb_list_head(to_remove)->next;
+ /* now we remove all pages between tail_page and next_page */
+ tail_page->next = (struct list_head *)((unsigned long)next_page |
+ head_bit);
+ next_page = rb_list_head(next_page);
+ next_page->prev = tail_page;
+ /* make sure pages points to a valid page in the ring buffer */
+ cpu_buffer->pages = next_page;
+ /* update head page */
+ if (head_bit)
+ cpu_buffer->head_page = list_entry(next_page,
+ struct buffer_page, list);
+ /*
+ * change read pointer to make sure any read iterators reset
+ * themselves
+ */
+ cpu_buffer->read = 0;
+ /* pages are removed, resume tracing and then free the pages */
+ atomic_dec(&cpu_buffer->record_disabled);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+
+ RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
+
+ /* last buffer page to remove */
+ last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
+ list);
+ tmp_iter_page = first_page;
+ do {
+ to_remove_page = tmp_iter_page;
+ rb_inc_page(cpu_buffer, &tmp_iter_page);
+ /* update the counters */
+ page_entries = rb_page_entries(to_remove_page);
+ if (page_entries) {
+ /*
+ * If something was added to this page, it was full
+ * since it is not the tail page. So we deduct the
+ * bytes consumed in ring buffer from here.
+ * No need to update overruns, since this page is
+ * deleted from ring buffer and its entries are
+ * already accounted for.
+ */
+ local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ }
+ /*
+ * We have already removed references to this list item, just
+ * free up the buffer_page and its page
+ */
+ nr_removed--;
+ free_buffer_page(to_remove_page);
+ } while (to_remove_page != last_page);
+
+ RB_WARN_ON(cpu_buffer, nr_removed);
}

static void
@@ -1303,6 +1384,12 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
struct list_head *p;
unsigned i;

+ /* stop the writers while inserting pages */
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ /* Make sure all writers are done with this buffer. */
+ synchronize_sched();
+
raw_spin_lock_irq(&cpu_buffer->reader_lock);
rb_head_page_deactivate(cpu_buffer);

@@ -1319,18 +1406,22 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,

out:
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+ atomic_dec(&cpu_buffer->record_disabled);
}

-static void update_pages_handler(struct ring_buffer_per_cpu *cpu_buffer)
+static void update_pages_handler(struct work_struct *work)
{
+ struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
+ struct ring_buffer_per_cpu, update_pages_work);
+
if (cpu_buffer->nr_pages_to_update > 0)
rb_insert_pages(cpu_buffer, &cpu_buffer->new_pages,
cpu_buffer->nr_pages_to_update);
else
rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update);
+
cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
- /* reset this value */
- cpu_buffer->nr_pages_to_update = 0;
+ complete(&cpu_buffer->update_completion);
}

/**
@@ -1340,14 +1431,14 @@ static void update_pages_handler(struct ring_buffer_per_cpu *cpu_buffer)
*
* Minimum size is 2 * BUF_PAGE_SIZE.
*
- * Returns -1 on failure.
+ * Returns 0 on success and < 0 on failure.
*/
int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned nr_pages;
- int cpu;
+ int cpu, err = 0;

/*
* Always succeed at resizing a non-existent buffer:
@@ -1362,21 +1453,28 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
if (size < BUF_PAGE_SIZE * 2)
size = BUF_PAGE_SIZE * 2;

- atomic_inc(&buffer->record_disabled);
-
- /* Make sure all writers are done with this buffer. */
- synchronize_sched();
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);

+ /*
+ * Don't succeed if recording is disabled globally, as a reader might
+ * be manipulating the ring buffer and is expecting a sane state while
+ * this is true.
+ */
+ if (atomic_read(&buffer->record_disabled))
+ return -EBUSY;
+ /* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
- get_online_cpus();
-
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);

if (cpu_id == RING_BUFFER_ALL_CPUS) {
/* calculate the pages to update */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];

+ if (atomic_read(&cpu_buffer->record_disabled)) {
+ err = -EBUSY;
+ goto out_err;
+ }
+
cpu_buffer->nr_pages_to_update = nr_pages -
cpu_buffer->nr_pages;

@@ -1392,20 +1490,37 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
*/
INIT_LIST_HEAD(&cpu_buffer->new_pages);
if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu))
+ &cpu_buffer->new_pages, cpu)) {
/* not enough memory for new pages */
- goto no_mem;
+ err = -ENOMEM;
+ goto out_err;
+ }
+ }
+
+ /* fire off all the required work handlers */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+ schedule_work_on(cpu, &cpu_buffer->update_pages_work);
}

/* wait for all the updates to complete */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- if (cpu_buffer->nr_pages_to_update) {
- update_pages_handler(cpu_buffer);
- }
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+ wait_for_completion(&cpu_buffer->update_completion);
+ /* reset this value */
+ cpu_buffer->nr_pages_to_update = 0;
}
} else {
cpu_buffer = buffer->buffers[cpu_id];
+ if (atomic_read(&cpu_buffer->record_disabled)) {
+ err = -EBUSY;
+ goto out_err;
+ }
+
if (nr_pages == cpu_buffer->nr_pages)
goto out;

@@ -1415,38 +1530,41 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
INIT_LIST_HEAD(&cpu_buffer->new_pages);
if (cpu_buffer->nr_pages_to_update > 0 &&
__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu_id))
- goto no_mem;
+ &cpu_buffer->new_pages, cpu_id)) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ schedule_work_on(cpu_id, &cpu_buffer->update_pages_work);
+ wait_for_completion(&cpu_buffer->update_completion);

- update_pages_handler(cpu_buffer);
+ /* reset this value */
+ cpu_buffer->nr_pages_to_update = 0;
}

out:
- put_online_cpus();
mutex_unlock(&buffer->mutex);
-
- atomic_dec(&buffer->record_disabled);
-
return size;

- no_mem:
+ out_err:
for_each_buffer_cpu(buffer, cpu) {
struct buffer_page *bpage, *tmp;
+
cpu_buffer = buffer->buffers[cpu];
/* reset this number regardless */
cpu_buffer->nr_pages_to_update = 0;
+
if (list_empty(&cpu_buffer->new_pages))
continue;
+
list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
}
- put_online_cpus();
mutex_unlock(&buffer->mutex);
- atomic_dec(&buffer->record_disabled);
- return -ENOMEM;
+ return err;
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);

@@ -1485,21 +1603,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
return __rb_page_index(iter->head_page, iter->head);
}

-static inline unsigned long rb_page_write(struct buffer_page *bpage)
-{
- return local_read(&bpage->write) & RB_WRITE_MASK;
-}
-
static inline unsigned rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}

-static inline unsigned long rb_page_entries(struct buffer_page *bpage)
-{
- return local_read(&bpage->entries) & RB_WRITE_MASK;
-}
-
/* Size is determined by what has been committed */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 367659d..abf1108 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2999,20 +2999,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)

static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
{
- int cpu, ret = size;
+ int ret = size;

mutex_lock(&trace_types_lock);

- tracing_stop();
-
- /* disable all cpu buffers */
- for_each_tracing_cpu(cpu) {
- if (global_trace.data[cpu])
- atomic_inc(&global_trace.data[cpu]->disabled);
- if (max_tr.data[cpu])
- atomic_inc(&max_tr.data[cpu]->disabled);
- }
-
if (cpu_id != RING_BUFFER_ALL_CPUS) {
/* make sure, this cpu is enabled in the mask */
if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
@@ -3026,14 +3016,6 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
ret = -ENOMEM;

out:
- for_each_tracing_cpu(cpu) {
- if (global_trace.data[cpu])
- atomic_dec(&global_trace.data[cpu]->disabled);
- if (max_tr.data[cpu])
- atomic_dec(&max_tr.data[cpu]->disabled);
- }
-
- tracing_start();
mutex_unlock(&trace_types_lock);

return ret;
--
1.7.7.3

Date: 2012-02-02 20:01:38
From: Vaibhav Nagarnaik
Subject: [PATCH v5 3/4] trace: Make addition of pages in ring buffer atomic

This patch adds the capability to add new pages to a ring buffer
atomically while write operations are going on. This makes it possible
to expand the ring buffer size without reinitializing the ring buffer.

The new pages are attached between the head page and its previous page.
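
For example (an illustrative sketch; the size is arbitrary):

  # cd /sys/kernel/debug/tracing
  # echo 1 > tracing_on
  # echo 8192 > buffer_size_kb    # grow all per-cpu buffers while writers
                                  # are active; already-recorded entries are
                                  # preserved and no reinitialization occurs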

Signed-off-by: Vaibhav Nagarnaik <[email protected]>
---
Changelog v5-v4:
* Rebased to latest upstream

kernel/trace/ring_buffer.c | 128 +++++++++++++++++++++++++++++--------------
1 files changed, 86 insertions(+), 42 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a7c66e4..5aef474 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -512,6 +512,7 @@ struct ring_buffer {
unsigned flags;
int cpus;
atomic_t record_disabled;
+ atomic_t resize_disabled;
cpumask_var_t cpumask;

struct lock_class_key *reader_lock_key;
@@ -1283,7 +1284,7 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage)
return local_read(&bpage->write) & RB_WRITE_MASK;
}

-static void
+static int
rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
{
unsigned int nr_removed;
@@ -1374,53 +1375,99 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
} while (to_remove_page != last_page);

RB_WARN_ON(cpu_buffer, nr_removed);
+
+ return nr_removed == 0;
}

-static void
-rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *pages, unsigned nr_pages)
+static int
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct buffer_page *bpage;
- struct list_head *p;
- unsigned i;
+ struct list_head *pages = &cpu_buffer->new_pages;
+ int retries, success;

- /* stop the writers while inserting pages */
- atomic_inc(&cpu_buffer->record_disabled);
+ raw_spin_lock_irq(&cpu_buffer->reader_lock);
+ /*
+ * We are holding the reader lock, so the reader page won't be swapped
+ * in the ring buffer. Now we are racing with the writer trying to
+ * move head page and the tail page.
+ * We are going to adapt the reader page update process where:
+ * 1. We first splice the start and end of list of new pages between
+ * the head page and its previous page.
+ * 2. We cmpxchg the prev_page->next to point from head page to the
+ * start of new pages list.
+ * 3. Finally, we update the head->prev to the end of new list.
+ *
+ * We will try this process 10 times, to make sure that we don't keep
+ * spinning.
+ */
+ retries = 10;
+ success = 0;
+ while (retries--) {
+ struct list_head *last_page, *first_page;
+ struct list_head *head_page, *prev_page, *r;
+ struct list_head *head_page_with_bit;

- /* Make sure all writers are done with this buffer. */
- synchronize_sched();
+ head_page = &rb_set_head_page(cpu_buffer)->list;
+ prev_page = head_page->prev;

- raw_spin_lock_irq(&cpu_buffer->reader_lock);
- rb_head_page_deactivate(cpu_buffer);
+ first_page = pages->next;
+ last_page = pages->prev;

- for (i = 0; i < nr_pages; i++) {
- if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
- goto out;
- p = pages->next;
- bpage = list_entry(p, struct buffer_page, list);
- list_del_init(&bpage->list);
- list_add_tail(&bpage->list, cpu_buffer->pages);
+ head_page_with_bit = (struct list_head *)
+ ((unsigned long)head_page | RB_PAGE_HEAD);
+
+ last_page->next = head_page_with_bit;
+ first_page->prev = prev_page;
+
+ r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
+
+ if (r == head_page_with_bit) {
+ /*
+ * yay, we replaced the page pointer to our new list,
+ * now, we just have to update to head page's prev
+ * pointer to point to end of list
+ */
+ head_page->prev = last_page;
+ success = 1;
+ break;
+ }
}
- rb_reset_cpu(cpu_buffer);
- rb_check_pages(cpu_buffer);

-out:
+ if (success)
+ INIT_LIST_HEAD(pages);
+ /*
+ * If we weren't successful in adding in new pages, warn and stop
+ * tracing
+ */
+ RB_WARN_ON(cpu_buffer, !success);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
- atomic_dec(&cpu_buffer->record_disabled);
+
+ /* free pages if they weren't inserted */
+ if (!success) {
+ struct buffer_page *bpage, *tmp;
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
+ list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+ return success;
}

static void update_pages_handler(struct work_struct *work)
{
struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
struct ring_buffer_per_cpu, update_pages_work);
+ int success;

if (cpu_buffer->nr_pages_to_update > 0)
- rb_insert_pages(cpu_buffer, &cpu_buffer->new_pages,
- cpu_buffer->nr_pages_to_update);
+ success = rb_insert_pages(cpu_buffer);
else
- rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update);
+ success = rb_remove_pages(cpu_buffer,
+ -cpu_buffer->nr_pages_to_update);

- cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
+ if (success)
+ cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
complete(&cpu_buffer->update_completion);
}

@@ -1456,11 +1503,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);

/*
- * Don't succeed if recording is disabled globally, as a reader might
- * be manipulating the ring buffer and is expecting a sane state while
+ * Don't succeed if resizing is disabled, as a reader might be
+ * manipulating the ring buffer and is expecting a sane state while
* this is true.
*/
- if (atomic_read(&buffer->record_disabled))
+ if (atomic_read(&buffer->resize_disabled))
return -EBUSY;
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
@@ -1470,11 +1517,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];

- if (atomic_read(&cpu_buffer->record_disabled)) {
- err = -EBUSY;
- goto out_err;
- }
-
cpu_buffer->nr_pages_to_update = nr_pages -
cpu_buffer->nr_pages;

@@ -1516,11 +1558,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
}
} else {
cpu_buffer = buffer->buffers[cpu_id];
- if (atomic_read(&cpu_buffer->record_disabled)) {
- err = -EBUSY;
- goto out_err;
- }
-
if (nr_pages == cpu_buffer->nr_pages)
goto out;

@@ -1537,7 +1574,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,

schedule_work_on(cpu_id, &cpu_buffer->update_pages_work);
wait_for_completion(&cpu_buffer->update_completion);
-
/* reset this value */
cpu_buffer->nr_pages_to_update = 0;
}
@@ -3575,6 +3611,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)

iter->cpu_buffer = cpu_buffer;

+ atomic_inc(&buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);

return iter;
@@ -3638,6 +3675,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;

atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->buffer->resize_disabled);
kfree(iter);
}
EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
@@ -3709,6 +3747,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->commit_page = cpu_buffer->head_page;

INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
local_set(&cpu_buffer->reader_page->page->commit, 0);
@@ -3745,8 +3784,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;

+ atomic_inc(&buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);

+ /* Make sure all commits have finished */
+ synchronize_sched();
+
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);

if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
@@ -3762,6 +3805,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);

atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&buffer->resize_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);

--
1.7.7.3

Date: 2012-05-02 21:03:31
From: Vaibhav Nagarnaik
Subject: [tip:perf/core] ring-buffer: Add per_cpu ring buffer control files

Commit-ID: 438ced1720b584000a9e8a4349d1f6bb7ee3ad6d
Gitweb: http://git.kernel.org/tip/438ced1720b584000a9e8a4349d1f6bb7ee3ad6d
Author: Vaibhav Nagarnaik <[email protected]>
AuthorDate: Thu, 2 Feb 2012 12:00:41 -0800
Committer: Steven Rostedt <[email protected]>
CommitDate: Mon, 23 Apr 2012 21:17:51 -0400

ring-buffer: Add per_cpu ring buffer control files

Add a debugfs entry called buffer_size_kb under the per_cpu/ folder for
each CPU, to control the ring buffer size of each CPU independently.

If the global buffer_size_kb file is used to set the size, all the
individual ring buffers are adjusted to the given size. The global file
then reports the common size, to maintain backward compatibility.

If the buffer_size_kb file under the per_cpu/ directory is used to
change the buffer size of a specific CPU, only that CPU's ring buffer is
resized. When tracing/buffer_size_kb is then read, it reports 'X' to
indicate that the per-CPU ring buffer sizes are no longer equal.

Link: http://lkml.kernel.org/r/[email protected]

Cc: Frederic Weisbecker <[email protected]>
Cc: Michael Rubin <[email protected]>
Cc: David Sharp <[email protected]>
Cc: Justin Teravest <[email protected]>
Signed-off-by: Vaibhav Nagarnaik <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/linux/ring_buffer.h | 6 +-
kernel/trace/ring_buffer.c | 248 ++++++++++++++++++++++++-------------------
kernel/trace/trace.c | 190 ++++++++++++++++++++++++++-------
kernel/trace/trace.h | 2 +-
4 files changed, 297 insertions(+), 149 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 7be2e88..6c8835f 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -96,9 +96,11 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
__ring_buffer_alloc((size), (flags), &__key); \
})

+#define RING_BUFFER_ALL_CPUS -1
+
void ring_buffer_free(struct ring_buffer *buffer);

-int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, int cpu);

void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val);

@@ -129,7 +131,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
int ring_buffer_iter_empty(struct ring_buffer_iter *iter);

-unsigned long ring_buffer_size(struct ring_buffer *buffer);
+unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu);

void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
void ring_buffer_reset(struct ring_buffer *buffer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cf8d11e..2d5eb33 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -449,6 +449,7 @@ struct ring_buffer_per_cpu {
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
+ unsigned int nr_pages;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
@@ -466,10 +467,12 @@ struct ring_buffer_per_cpu {
unsigned long read_bytes;
u64 write_stamp;
u64 read_stamp;
+ /* ring buffer pages to update, > 0 to add, < 0 to remove */
+ int nr_pages_to_update;
+ struct list_head new_pages; /* new pages to add */
};

struct ring_buffer {
- unsigned pages;
unsigned flags;
int cpus;
atomic_t record_disabled;
@@ -963,14 +966,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}

-static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned nr_pages)
+static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
{
+ int i;
struct buffer_page *bpage, *tmp;
- LIST_HEAD(pages);
- unsigned i;
-
- WARN_ON(!nr_pages);

for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -981,15 +980,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
*/
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL | __GFP_NORETRY,
- cpu_to_node(cpu_buffer->cpu));
+ cpu_to_node(cpu));
if (!bpage)
goto free_pages;

- rb_check_bpage(cpu_buffer, bpage);
+ list_add(&bpage->list, pages);

- list_add(&bpage->list, &pages);
-
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
goto free_pages;
@@ -997,6 +994,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_init_page(bpage->page);
}

+ return 0;
+
+free_pages:
+ list_for_each_entry_safe(bpage, tmp, pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+
+ return -ENOMEM;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ LIST_HEAD(pages);
+
+ WARN_ON(!nr_pages);
+
+ if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
+ return -ENOMEM;
+
/*
* The ring buffer page list is a circular list that does not
* start and end with a list head. All page list items point to
@@ -1005,20 +1023,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
cpu_buffer->pages = pages.next;
list_del(&pages);

+ cpu_buffer->nr_pages = nr_pages;
+
rb_check_pages(cpu_buffer);

return 0;
-
- free_pages:
- list_for_each_entry_safe(bpage, tmp, &pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- return -ENOMEM;
}

static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
@@ -1052,7 +1065,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)

INIT_LIST_HEAD(&cpu_buffer->reader_page->list);

- ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ ret = rb_allocate_pages(cpu_buffer, nr_pages);
if (ret < 0)
goto fail_free_reader;

@@ -1113,7 +1126,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
{
struct ring_buffer *buffer;
int bsize;
- int cpu;
+ int cpu, nr_pages;

/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1124,14 +1137,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;

- buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;

/* need at least two pages */
- if (buffer->pages < 2)
- buffer->pages = 2;
+ if (nr_pages < 2)
+ nr_pages = 2;

/*
* In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1154,7 +1167,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,

for_each_buffer_cpu(buffer, cpu) {
buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, cpu);
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
goto fail_free_buffers;
}
@@ -1276,6 +1289,18 @@ out:
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
}

+static void update_pages_handler(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ if (cpu_buffer->nr_pages_to_update > 0)
+ rb_insert_pages(cpu_buffer, &cpu_buffer->new_pages,
+ cpu_buffer->nr_pages_to_update);
+ else
+ rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update);
+ cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
+ /* reset this value */
+ cpu_buffer->nr_pages_to_update = 0;
+}
+
/**
* ring_buffer_resize - resize the ring buffer
* @buffer: the buffer to resize.
@@ -1285,14 +1310,12 @@ out:
*
* Returns -1 on failure.
*/
-int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
+ int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned nr_pages, rm_pages, new_pages;
- struct buffer_page *bpage, *tmp;
- unsigned long buffer_size;
- LIST_HEAD(pages);
- int i, cpu;
+ unsigned nr_pages;
+ int cpu;

/*
* Always succeed at resizing a non-existent buffer:
@@ -1302,15 +1325,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)

size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
size *= BUF_PAGE_SIZE;
- buffer_size = buffer->pages * BUF_PAGE_SIZE;

/* we need a minimum of two pages */
if (size < BUF_PAGE_SIZE * 2)
size = BUF_PAGE_SIZE * 2;

- if (size == buffer_size)
- return size;
-
atomic_inc(&buffer->record_disabled);

/* Make sure all writers are done with this buffer. */
@@ -1321,68 +1340,56 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)

nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);

- if (size < buffer_size) {
-
- /* easy case, just free pages */
- if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
- goto out_fail;
-
- rm_pages = buffer->pages - nr_pages;
-
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ /* calculate the pages to update */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- rb_remove_pages(cpu_buffer, rm_pages);
- }
- goto out;
- }

- /*
- * This is a bit more difficult. We only want to add pages
- * when we can allocate enough for all CPUs. We do this
- * by allocating all the pages and storing them on a local
- * link list. If we succeed in our allocation, then we
- * add these pages to the cpu_buffers. Otherwise we just free
- * them all and return -ENOMEM;
- */
- if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
- goto out_fail;
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;

- new_pages = nr_pages - buffer->pages;
+ /*
+ * nothing more to do when removing pages or when there is no update
+ */
+ if (cpu_buffer->nr_pages_to_update <= 0)
+ continue;

- for_each_buffer_cpu(buffer, cpu) {
- for (i = 0; i < new_pages; i++) {
- struct page *page;
/*
- * __GFP_NORETRY flag makes sure that the allocation
- * fails gracefully without invoking oom-killer and
- * the system is not destabilized.
+ * to add pages, make sure all new pages can be
+ * allocated without receiving ENOMEM
*/
- bpage = kzalloc_node(ALIGN(sizeof(*bpage),
- cache_line_size()),
- GFP_KERNEL | __GFP_NORETRY,
- cpu_to_node(cpu));
- if (!bpage)
- goto free_pages;
- list_add(&bpage->list, &pages);
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
- goto free_pages;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu))
+ /* not enough memory for new pages */
+ goto no_mem;
}
- }

- for_each_buffer_cpu(buffer, cpu) {
- cpu_buffer = buffer->buffers[cpu];
- rb_insert_pages(cpu_buffer, &pages, new_pages);
- }
+ /* wait for all the updates to complete */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (cpu_buffer->nr_pages_to_update) {
+ update_pages_handler(cpu_buffer);
+ }
+ }
+ } else {
+ cpu_buffer = buffer->buffers[cpu_id];
+ if (nr_pages == cpu_buffer->nr_pages)
+ goto out;

- if (RB_WARN_ON(buffer, !list_empty(&pages)))
- goto out_fail;
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;
+
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (cpu_buffer->nr_pages_to_update > 0 &&
+ __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu_id))
+ goto no_mem;
+
+ update_pages_handler(cpu_buffer);
+ }

out:
- buffer->pages = nr_pages;
put_online_cpus();
mutex_unlock(&buffer->mutex);

@@ -1390,25 +1397,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)

return size;

- free_pages:
- list_for_each_entry_safe(bpage, tmp, &pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
+ no_mem:
+ for_each_buffer_cpu(buffer, cpu) {
+ struct buffer_page *bpage, *tmp;
+ cpu_buffer = buffer->buffers[cpu];
+ /* reset this number regardless */
+ cpu_buffer->nr_pages_to_update = 0;
+ if (list_empty(&cpu_buffer->new_pages))
+ continue;
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
+ list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
}
put_online_cpus();
mutex_unlock(&buffer->mutex);
atomic_dec(&buffer->record_disabled);
return -ENOMEM;
-
- /*
- * Something went totally wrong, and we are too paranoid
- * to even clean up the mess.
- */
- out_fail:
- put_online_cpus();
- mutex_unlock(&buffer->mutex);
- atomic_dec(&buffer->record_disabled);
- return -1;
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);

@@ -1510,7 +1516,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* assign the commit to the tail.
*/
again:
- max_count = cpu_buffer->buffer->pages * 100;
+ max_count = cpu_buffer->nr_pages * 100;

while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
@@ -3588,9 +3594,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
* ring_buffer_size - return the size of the ring buffer (in bytes)
* @buffer: The ring buffer.
*/
-unsigned long ring_buffer_size(struct ring_buffer *buffer)
+unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
{
- return BUF_PAGE_SIZE * buffer->pages;
+ /*
+ * Earlier, this method returned
+ * BUF_PAGE_SIZE * buffer->pages
+ * Since the buffer-wide pages field is now removed, we have converted this to
+ * return the per cpu buffer value.
+ */
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);

@@ -3765,8 +3780,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
!cpumask_test_cpu(cpu, buffer_b->cpumask))
goto out;

+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
/* At least make sure the two buffers are somewhat the same */
- if (buffer_a->pages != buffer_b->pages)
+ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;

ret = -EAGAIN;
@@ -3780,9 +3798,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
if (atomic_read(&buffer_b->record_disabled))
goto out;

- cpu_buffer_a = buffer_a->buffers[cpu];
- cpu_buffer_b = buffer_b->buffers[cpu];
-
if (atomic_read(&cpu_buffer_a->record_disabled))
goto out;

@@ -4071,6 +4086,8 @@ static int rb_cpu_notify(struct notifier_block *self,
struct ring_buffer *buffer =
container_of(self, struct ring_buffer, cpu_notify);
long cpu = (long)hcpu;
+ int cpu_i, nr_pages_same;
+ unsigned int nr_pages;

switch (action) {
case CPU_UP_PREPARE:
@@ -4078,8 +4095,23 @@ static int rb_cpu_notify(struct notifier_block *self,
if (cpumask_test_cpu(cpu, buffer->cpumask))
return NOTIFY_OK;

+ nr_pages = 0;
+ nr_pages_same = 1;
+ /* check if all cpu sizes are the same */
+ for_each_buffer_cpu(buffer, cpu_i) {
+ /* fill in the size from first enabled cpu */
+ if (nr_pages == 0)
+ nr_pages = buffer->buffers[cpu_i]->nr_pages;
+ if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
+ nr_pages_same = 0;
+ break;
+ }
+ }
+ /* allocate the minimum number of pages, the user can later expand them */
+ if (!nr_pages_same)
+ nr_pages = 2;
buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, cpu);
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu]) {
WARN(1, "failed to allocate ring buffer on CPU %ld\n",
cpu);
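
The hunks above change ring_buffer_resize() and ring_buffer_size() to take a cpu argument, with RING_BUFFER_ALL_CPUS selecting every per-cpu buffer. As a quick reference, the following minimal sketch (illustrative only, not part of the patch) shows how a hypothetical in-kernel caller could drive the new interface; the function name example_resize, the sizes and the choice of CPU 1 are assumptions made for the example.

#include <linux/kernel.h>
#include <linux/ring_buffer.h>

/*
 * Illustrative sketch, not part of this patch: grow all per-cpu buffers
 * to 2MB in one call, then shrink only CPU 1 back to 64KB. CPU 1 is
 * assumed to be present in the buffer's cpumask.
 */
static int example_resize(struct ring_buffer *buffer)
{
	int ret;

	/* RING_BUFFER_ALL_CPUS resizes every per-cpu buffer to the same size */
	ret = ring_buffer_resize(buffer, 2 * 1024 * 1024, RING_BUFFER_ALL_CPUS);
	if (ret < 0)
		return ret;

	/* a non-negative cpu id touches only that CPU's buffer */
	ret = ring_buffer_resize(buffer, 64 * 1024, 1);
	if (ret < 0)
		return ret;

	/* per-cpu sizes may now differ, so they are queried per cpu */
	pr_info("cpu1 ring buffer is %lu bytes\n", ring_buffer_size(buffer, 1));
	return 0;
}
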
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bbcde54..f11a285 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -838,7 +838,8 @@ __acquires(kernel_lock)

/* If we expanded the buffers, make sure the max is expanded too */
if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, trace_buf_size);
+ ring_buffer_resize(max_tr.buffer, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);

/* the test is responsible for initializing and enabling */
pr_info("Testing tracer %s: ", type->name);
@@ -854,7 +855,8 @@ __acquires(kernel_lock)

/* Shrink the max buffer again */
if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, 1);
+ ring_buffer_resize(max_tr.buffer, 1,
+ RING_BUFFER_ALL_CPUS);

printk(KERN_CONT "PASSED\n");
}
@@ -3053,7 +3055,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
return t->init(tr);
}

-static int __tracing_resize_ring_buffer(unsigned long size)
+static void set_buffer_entries(struct trace_array *tr, unsigned long val)
+{
+ int cpu;
+ for_each_tracing_cpu(cpu)
+ tr->data[cpu]->entries = val;
+}
+
+static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
{
int ret;

@@ -3064,19 +3073,32 @@ static int __tracing_resize_ring_buffer(unsigned long size)
*/
ring_buffer_expanded = 1;

- ret = ring_buffer_resize(global_trace.buffer, size);
+ ret = ring_buffer_resize(global_trace.buffer, size, cpu);
if (ret < 0)
return ret;

if (!current_trace->use_max_tr)
goto out;

- ret = ring_buffer_resize(max_tr.buffer, size);
+ ret = ring_buffer_resize(max_tr.buffer, size, cpu);
if (ret < 0) {
- int r;
+ int r = 0;
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ int i;
+ for_each_tracing_cpu(i) {
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.data[i]->entries,
+ i);
+ if (r < 0)
+ break;
+ }
+ } else {
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.data[cpu]->entries,
+ cpu);
+ }

- r = ring_buffer_resize(global_trace.buffer,
- global_trace.entries);
if (r < 0) {
/*
* AARGH! We are left with different
@@ -3098,14 +3120,21 @@ static int __tracing_resize_ring_buffer(unsigned long size)
return ret;
}

- max_tr.entries = size;
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&max_tr, size);
+ else
+ max_tr.data[cpu]->entries = size;
+
out:
- global_trace.entries = size;
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&global_trace, size);
+ else
+ global_trace.data[cpu]->entries = size;

return ret;
}

-static ssize_t tracing_resize_ring_buffer(unsigned long size)
+static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
{
int cpu, ret = size;

@@ -3121,12 +3150,19 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size)
atomic_inc(&max_tr.data[cpu]->disabled);
}

- if (size != global_trace.entries)
- ret = __tracing_resize_ring_buffer(size);
+ if (cpu_id != RING_BUFFER_ALL_CPUS) {
+ /* make sure this cpu is enabled in the mask */
+ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }

+ ret = __tracing_resize_ring_buffer(size, cpu_id);
if (ret < 0)
ret = -ENOMEM;

+out:
for_each_tracing_cpu(cpu) {
if (global_trace.data[cpu])
atomic_dec(&global_trace.data[cpu]->disabled);
@@ -3157,7 +3193,8 @@ int tracing_update_buffers(void)

mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded)
- ret = __tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
mutex_unlock(&trace_types_lock);

return ret;
@@ -3181,7 +3218,8 @@ static int tracing_set_tracer(const char *buf)
mutex_lock(&trace_types_lock);

if (!ring_buffer_expanded) {
- ret = __tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
if (ret < 0)
goto out;
ret = 0;
@@ -3207,8 +3245,8 @@ static int tracing_set_tracer(const char *buf)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want preserve it.
*/
- ring_buffer_resize(max_tr.buffer, 1);
- max_tr.entries = 1;
+ ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&max_tr, 1);
}
destroy_trace_option_files(topts);

@@ -3216,10 +3254,17 @@ static int tracing_set_tracer(const char *buf)

topts = create_trace_option_files(current_trace);
if (current_trace->use_max_tr) {
- ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
- if (ret < 0)
- goto out;
- max_tr.entries = global_trace.entries;
+ int cpu;
+ /* we need to make per cpu buffer sizes equivalent */
+ for_each_tracing_cpu(cpu) {
+ ret = ring_buffer_resize(max_tr.buffer,
+ global_trace.data[cpu]->entries,
+ cpu);
+ if (ret < 0)
+ goto out;
+ max_tr.data[cpu]->entries =
+ global_trace.data[cpu]->entries;
+ }
}

if (t->init) {
@@ -3721,30 +3766,82 @@ out_err:
goto out;
}

+struct ftrace_entries_info {
+ struct trace_array *tr;
+ int cpu;
+};
+
+static int tracing_entries_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_entries_info *info;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->tr = &global_trace;
+ info->cpu = (unsigned long)inode->i_private;
+
+ filp->private_data = info;
+
+ return 0;
+}
+
static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_array *tr = filp->private_data;
- char buf[96];
- int r;
+ struct ftrace_entries_info *info = filp->private_data;
+ struct trace_array *tr = info->tr;
+ char buf[64];
+ int r = 0;
+ ssize_t ret;

mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded)
- r = sprintf(buf, "%lu (expanded: %lu)\n",
- tr->entries >> 10,
- trace_buf_size >> 10);
- else
- r = sprintf(buf, "%lu\n", tr->entries >> 10);
+
+ if (info->cpu == RING_BUFFER_ALL_CPUS) {
+ int cpu, buf_size_same;
+ unsigned long size;
+
+ size = 0;
+ buf_size_same = 1;
+ /* check if all cpu sizes are the same */
+ for_each_tracing_cpu(cpu) {
+ /* fill in the size from first enabled cpu */
+ if (size == 0)
+ size = tr->data[cpu]->entries;
+ if (size != tr->data[cpu]->entries) {
+ buf_size_same = 0;
+ break;
+ }
+ }
+
+ if (buf_size_same) {
+ if (!ring_buffer_expanded)
+ r = sprintf(buf, "%lu (expanded: %lu)\n",
+ size >> 10,
+ trace_buf_size >> 10);
+ else
+ r = sprintf(buf, "%lu\n", size >> 10);
+ } else
+ r = sprintf(buf, "X\n");
+ } else
+ r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
+
mutex_unlock(&trace_types_lock);

- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ return ret;
}

static ssize_t
tracing_entries_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ struct ftrace_entries_info *info = filp->private_data;
unsigned long val;
int ret;

@@ -3759,7 +3856,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
/* value is in KB */
val <<= 10;

- ret = tracing_resize_ring_buffer(val);
+ ret = tracing_resize_ring_buffer(val, info->cpu);
if (ret < 0)
return ret;

@@ -3768,6 +3865,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
return cnt;
}

+static int
+tracing_entries_release(struct inode *inode, struct file *filp)
+{
+ struct ftrace_entries_info *info = filp->private_data;
+
+ kfree(info);
+
+ return 0;
+}
+
static ssize_t
tracing_total_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -3779,7 +3886,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,

mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
- size += tr->entries >> 10;
+ size += tr->data[cpu]->entries >> 10;
if (!ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
@@ -3813,7 +3920,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
if (trace_flags & TRACE_ITER_STOP_ON_FREE)
tracing_off();
/* resize the ring buffer to 0 */
- tracing_resize_ring_buffer(0);
+ tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS);

return 0;
}
@@ -4012,9 +4119,10 @@ static const struct file_operations tracing_pipe_fops = {
};

static const struct file_operations tracing_entries_fops = {
- .open = tracing_open_generic,
+ .open = tracing_entries_open,
.read = tracing_entries_read,
.write = tracing_entries_write,
+ .release = tracing_entries_release,
.llseek = generic_file_llseek,
};

@@ -4466,6 +4574,9 @@ static void tracing_init_debugfs_percpu(long cpu)

trace_create_file("stats", 0444, d_cpu,
(void *) cpu, &tracing_stats_fops);
+
+ trace_create_file("buffer_size_kb", 0444, d_cpu,
+ (void *) cpu, &tracing_entries_fops);
}

#ifdef CONFIG_FTRACE_SELFTEST
@@ -4795,7 +4906,7 @@ static __init int tracer_init_debugfs(void)
(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);

trace_create_file("buffer_size_kb", 0644, d_tracer,
- &global_trace, &tracing_entries_fops);
+ (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);

trace_create_file("buffer_total_size_kb", 0444, d_tracer,
&global_trace, &tracing_total_entries_fops);
@@ -5056,7 +5167,6 @@ __init static int tracer_alloc_buffers(void)
WARN_ON(1);
goto out_free_cpumask;
}
- global_trace.entries = ring_buffer_size(global_trace.buffer);
if (global_trace.buffer_disabled)
tracing_off();

@@ -5069,7 +5179,6 @@ __init static int tracer_alloc_buffers(void)
ring_buffer_free(global_trace.buffer);
goto out_free_cpumask;
}
- max_tr.entries = 1;
#endif

/* Allocate the first page for all buffers */
@@ -5078,6 +5187,11 @@ __init static int tracer_alloc_buffers(void)
max_tr.data[i] = &per_cpu(max_tr_data, i);
}

+ set_buffer_entries(&global_trace, ring_buf_size);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ set_buffer_entries(&max_tr, 1);
+#endif
+
trace_init_cmdlines();

register_tracer(&nop_trace);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f9d8550..1c8b7c6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -131,6 +131,7 @@ struct trace_array_cpu {
atomic_t disabled;
void *buffer_page; /* ring buffer spare */

+ unsigned long entries;
unsigned long saved_latency;
unsigned long critical_start;
unsigned long critical_end;
@@ -152,7 +153,6 @@ struct trace_array_cpu {
*/
struct trace_array {
struct ring_buffer *buffer;
- unsigned long entries;
int cpu;
int buffer_disabled;
cycle_t time_start;
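
Putting the pieces together from user space: tracing_init_debugfs_percpu() now creates per_cpu/cpuN/buffer_size_kb for each CPU, and tracing_entries_read() prints "X" through the top-level buffer_size_kb once the per-cpu sizes diverge. The snippet below is an illustrative sketch only (not part of the patch); it assumes debugfs is mounted at /sys/kernel/debug, that cpu0 exists, and that it is run as root.

#include <stdio.h>

#define TRACE_DIR "/sys/kernel/debug/tracing"

int main(void)
{
	char line[64];
	FILE *f;

	/* resize only CPU 0's ring buffer; the value is in KB */
	f = fopen(TRACE_DIR "/per_cpu/cpu0/buffer_size_kb", "w");
	if (!f)
		return 1;
	fprintf(f, "4096\n");
	fclose(f);

	/* the global file reports "X" when per-cpu sizes are not equal */
	f = fopen(TRACE_DIR "/buffer_size_kb", "r");
	if (!f)
		return 1;
	if (!fgets(line, sizeof(line), f)) {
		fclose(f);
		return 1;
	}
	if (line[0] == 'X')
		printf("per-cpu ring buffer sizes differ\n");
	else
		printf("common ring buffer size: %s", line);
	fclose(f);
	return 0;
}
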