This is a reworked patch, version 3, to fix the SPU profile
sample data collection.
Currently, the SPU escape sequences and program counter data are being
added directly into the kernel buffer without holding the buffer_mutex
lock. This patch changes how the data is stored. A new function,
oprofile_add_value(), is added to the oprofile driver to allow adding
generic data to the per cpu buffers. This enables adding a series of
data to a specified cpu_buffer. The function is used to add SPU data
into a cpu buffer. The patch also adds the code needed to move the
special sequence to the kernel buffer. There are restrictions on
the use of the oprofile_add_value() function to ensure data is
properly inserted into the specified CPU buffer.
Finally, this patch backs out the changes previously added to the
oprofile generic code for handling the architecture specific
ops.sync_start and ops.sync_stop that allowed the architecture
to skip the per CPU buffer creation.
Signed-off-by: Carl Love <[email protected]>
Index: Cell_kernel_5_15_2008-new/arch/powerpc/oprofile/cell/pr_util.h
===================================================================
--- Cell_kernel_5_15_2008-new.orig/arch/powerpc/oprofile/cell/pr_util.h
+++ Cell_kernel_5_15_2008-new/arch/powerpc/oprofile/cell/pr_util.h
@@ -20,10 +20,7 @@
#include <asm/cell-regs.h>
#include <asm/spu.h>
-/* Defines used for sync_start */
-#define SKIP_GENERIC_SYNC 0
-#define SYNC_START_ERROR -1
-#define DO_GENERIC_SYNC 1
+#define SPUS_PER_NODE 8
struct spu_overlay_info { /* map of sections within an SPU overlay */
unsigned int vma; /* SPU virtual memory address from elf */
@@ -80,12 +77,18 @@ int start_spu_profiling(unsigned int cyc
void stop_spu_profiling(void);
+/*
+ * Entry point for SPU event profiling.
+ */
+int start_spu_event_profiling(unsigned int cycles_reset);
+
+void stop_spu_event_profiling(void);
/* add the necessary profiling hooks */
int spu_sync_start(void);
/* remove the hooks */
-int spu_sync_stop(void);
+void spu_sync_stop(void);
/* Record SPU program counter samples to the oprofile event buffer. */
void spu_sync_buffer(int spu_num, unsigned int *samples,
Index: Cell_kernel_5_15_2008-new/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- Cell_kernel_5_15_2008-new.orig/arch/powerpc/oprofile/cell/spu_profiler.c
+++ Cell_kernel_5_15_2008-new/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -28,7 +28,6 @@ static unsigned int profiling_interval;
#define NUM_SPU_BITS_TRBUF 16
#define SPUS_PER_TB_ENTRY 4
-#define SPUS_PER_NODE 8
#define SPU_PC_MASK 0xFFFF
@@ -123,7 +122,6 @@ static int cell_spu_pc_collection(int cp
return entry;
}
-
static enum hrtimer_restart profile_spus(struct hrtimer *timer)
{
ktime_t kt;
@@ -150,7 +148,13 @@ static enum hrtimer_restart profile_spus
sample_array_lock_flags);
num_samples = cell_spu_pc_collection(cpu);
- if (num_samples == 0) {
+ if (unlikely(!spu_prof_running)) {
+ spin_unlock_irqrestore(&sample_array_lock,
+ sample_array_lock_flags);
+ goto stop;
+ }
+
+ if (unlikely(num_samples == 0)) {
spin_unlock_irqrestore(&sample_array_lock,
sample_array_lock_flags);
continue;
@@ -214,8 +218,26 @@ int start_spu_profiling(unsigned int cyc
void stop_spu_profiling(void)
{
+ int cpu;
+
spu_prof_running = 0;
hrtimer_cancel(&timer);
+
+ /* ensure everyone sees spu_prof_running
+ * changed to 0.
+ */
+ smp_wmb();
+
+ /* Ensure writing data to the trace buffer and processing
+ * data in the trace buffer has stopped.
+ * Setting the trace buffer to empty will cause
+ * cell_spu_pc_collection() to exit if it is running.
+ */
+ for_each_online_cpu(cpu) {
+ cbe_write_pm(cpu, pm_interval, 0);
+ cbe_write_pm(cpu, trace_address, 0);
+ }
+
kfree(samples);
pr_debug("SPU_PROF: stop_spu_profiling issued\n");
}
Index: Cell_kernel_5_15_2008-new/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- Cell_kernel_5_15_2008-new.orig/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ Cell_kernel_5_15_2008-new/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -30,12 +30,20 @@
#include "pr_util.h"
#define RELEASE_ALL 9999
+#define NUM_SPU_CNTXT_SW 8
+#define NUM_SPU_SYNC_START 3
-static DEFINE_SPINLOCK(buffer_lock);
+static DEFINE_SPINLOCK(add_value_lock);
static DEFINE_SPINLOCK(cache_lock);
static int num_spu_nodes;
int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+int last_guard_val[MAX_NUMNODES * SPUS_PER_NODE];
+static int spu_ctx_sw_seen[MAX_NUMNODES * SPUS_PER_NODE];
+
+/* an array for mapping spu numbers to an index in an array */
+static int spu_num_to_index[MAX_NUMNODES * SPUS_PER_NODE];
+static int max_spu_num_to_index=0;
+static DEFINE_SPINLOCK(spu_index_map_lock);
/* Container for caching information about an active SPU task. */
struct cached_info {
@@ -44,7 +52,129 @@ struct cached_info {
struct kref cache_ref;
};
-static struct cached_info *spu_info[MAX_NUMNODES * 8];
+static struct cached_info *spu_info[MAX_NUMNODES * SPUS_PER_NODE];
+
+struct list_cpu_nums {
+ struct list_cpu_nums *next;
+ int cpu_num;
+};
+
+struct spu_cpu_map_struct {
+ int cpu_num;
+ int spu_num;
+};
+
+struct spu_cpu_map_struct spu_cpu_map[MAX_NUMNODES * SPUS_PER_NODE];
+
+struct list_cpu_nums *active_cpu_nums_list;
+struct list_cpu_nums *next_cpu;
+static int max_entries_spu_cpu_map=0;
+
+/* In general, don't know what the SPU number range will be.
+ * Create an array to define what SPU number is mapped to each
+ * index in an array. Want to be able to have multiple calls
+ * lookup an index simultaneously. Only hold a lock when adding
+ * a new entry.
+ */
+static int add_spu_index(int spu_num) {
+ int i, tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&spu_index_map_lock, flags);
+
+ /* Need to double check that entry didn't get added
+ * since the call to get_spu_index() didn't find it.
+ */
+ for (i=0; i<max_spu_num_to_index; i++)
+ if (spu_num_to_index[i] == spu_num) {
+ tmp = i;
+ goto out;
+ }
+
+ /* create map for spu num */
+
+ tmp = max_spu_num_to_index;
+ spu_num_to_index[max_spu_num_to_index] = spu_num;
+ max_spu_num_to_index++;
+
+out: spin_unlock_irqrestore(&spu_index_map_lock, flags);
+
+ return tmp;
+}
+
+static int get_spu_index(int spu_num) {
+ int i, tmp;
+
+ /* check if spu map has been created */
+ for (i=0; i<max_spu_num_to_index; i++)
+ if (spu_num_to_index[i] == spu_num) {
+ tmp = i;
+ goto out;
+ }
+
+ tmp = add_spu_index(spu_num);
+
+out: return tmp;
+}
+
+static int valid_spu_num(int spu_num) {
+ int i;
+
+ /* check if spu map has been created */
+ for (i=0; i<max_spu_num_to_index; i++)
+ if (spu_num_to_index[i] == spu_num)
+ return 1;
+
+ /* The spu number has not been seen */
+ return 0;
+}
+
+static int initialize_active_cpu_nums(void) {
+ int cpu;
+ struct list_cpu_nums *tmp;
+
+ /* initialize the circular list */
+
+ active_cpu_nums_list = NULL;
+
+ for_each_online_cpu(cpu) {
+ if (!(tmp = kzalloc(sizeof(struct list_cpu_nums),
+ GFP_KERNEL)))
+ return -ENOMEM;
+
+ tmp->cpu_num = cpu;
+
+ if (!active_cpu_nums_list) {
+ active_cpu_nums_list = tmp;
+ tmp->next = tmp;
+
+ } else {
+ tmp->next = active_cpu_nums_list->next;
+ active_cpu_nums_list->next = tmp;
+ }
+ }
+ next_cpu = active_cpu_nums_list;
+ return 0;
+}
+
+static int get_cpu_buf(int spu_num) {
+ int i;
+
+
+ for (i=0; i< max_entries_spu_cpu_map; i++)
+ if (spu_cpu_map[i].spu_num == spu_num)
+ return spu_cpu_map[i].cpu_num;
+
+ /* no mapping found, create mapping using the next
+ * cpu in the circular list of cpu numbers.
+ */
+ spu_cpu_map[max_entries_spu_cpu_map].spu_num = spu_num;
+ spu_cpu_map[max_entries_spu_cpu_map].cpu_num = next_cpu->cpu_num;
+
+ next_cpu = next_cpu->next;
+
+ return spu_cpu_map[max_entries_spu_cpu_map++].cpu_num;
+}
static void destroy_cached_info(struct kref *kref)
{
@@ -72,15 +202,17 @@ static struct cached_info *get_cached_in
ret_info = NULL;
goto out;
}
- if (!spu_info[spu_num] && the_spu) {
+ if (!spu_info[get_spu_index(spu_num)] && the_spu) {
ref = spu_get_profile_private_kref(the_spu->ctx);
if (ref) {
- spu_info[spu_num] = container_of(ref, struct cached_info, cache_ref);
- kref_get(&spu_info[spu_num]->cache_ref);
+ spu_info[get_spu_index(spu_num)] =
+ container_of(ref, struct cached_info,
+ cache_ref);
+ kref_get(&spu_info[get_spu_index(spu_num)]->cache_ref);
}
}
- ret_info = spu_info[spu_num];
+ ret_info = spu_info[get_spu_index(spu_num)];
out:
return ret_info;
}
@@ -133,7 +265,7 @@ prepare_cached_spu_info(struct spu *spu,
info->the_spu = spu;
kref_init(&info->cache_ref);
spin_lock_irqsave(&cache_lock, flags);
- spu_info[spu->number] = info;
+ spu_info[get_spu_index(spu->number)] = info;
/* Increment count before passing off ref to SPUFS. */
kref_get(&info->cache_ref);
@@ -161,27 +293,28 @@ out:
*/
static int release_cached_info(int spu_index)
{
- int index, end;
+ int index, end, spu_num;
if (spu_index == RELEASE_ALL) {
- end = num_spu_nodes;
+ end = max_spu_num_to_index;
index = 0;
} else {
- if (spu_index >= num_spu_nodes) {
+ if (!valid_spu_num(spu_index)) {
printk(KERN_ERR "SPU_PROF: "
"%s, line %d: "
"Invalid index %d into spu info cache\n",
__FUNCTION__, __LINE__, spu_index);
goto out;
}
- end = spu_index + 1;
- index = spu_index;
+ index = get_spu_index(spu_index);
+ end = index + 1;
}
for (; index < end; index++) {
- if (spu_info[index]) {
- kref_put(&spu_info[index]->cache_ref,
+ spu_num = spu_num_to_index[index];
+ if (spu_info[spu_num]) {
+ kref_put(&spu_info[spu_num]->cache_ref,
destroy_cached_info);
- spu_info[index] = NULL;
+ spu_info[spu_num] = NULL;
}
}
@@ -289,6 +422,8 @@ static int process_context_switch(struct
int retval;
unsigned int offset = 0;
unsigned long spu_cookie = 0, app_dcookie;
+ unsigned long values[NUM_SPU_CNTXT_SW];
+ int cpu_buf;
retval = prepare_cached_spu_info(spu, objectId);
if (retval)
@@ -303,17 +438,31 @@ static int process_context_switch(struct
goto out;
}
- /* Record context info in event buffer */
- spin_lock_irqsave(&buffer_lock, flags);
- add_event_entry(ESCAPE_CODE);
- add_event_entry(SPU_CTX_SWITCH_CODE);
- add_event_entry(spu->number);
- add_event_entry(spu->pid);
- add_event_entry(spu->tgid);
- add_event_entry(app_dcookie);
- add_event_entry(spu_cookie);
- add_event_entry(offset);
- spin_unlock_irqrestore(&buffer_lock, flags);
+ /* Record context info in event buffer. Note, there are more
+ * SPUs than CPUs. Map the SPU events/data for a given SPU to
+ * the same CPU buffer. Need to ensure the cntxt switch data and
+ * samples stay in order.
+ */
+
+ spin_lock_irqsave(&add_value_lock, flags);
+ cpu_buf = get_cpu_buf(spu->number);
+
+ values[0] = ESCAPE_CODE;
+ values[1] = SPU_CTX_SWITCH_CODE;
+ values[2] = spu->number;
+ values[3] = spu->pid;
+ values[4] = spu->tgid;
+ values[5] = app_dcookie;
+ values[6] = spu_cookie;
+ values[7] = offset;
+ oprofile_add_value(values, cpu_buf, NUM_SPU_CNTXT_SW);
+
+ /* Set flag to indicate SPU PC data can now be written out. If
+ * the SPU program counter data is seen before an SPU context
+ * record is seen, the postprocessing will fail.
+ */
+ spu_ctx_sw_seen[get_spu_index(spu->number)] = 1;
+ spin_unlock_irqrestore(&add_value_lock, flags);
smp_wmb(); /* ensure spu event buffer updates are written */
/* don't want entries intermingled... */
out:
@@ -363,38 +512,51 @@ static int number_of_online_nodes(void)
/* The main purpose of this function is to synchronize
* OProfile with SPUFS by registering to be notified of
* SPU task switches.
- *
- * NOTE: When profiling SPUs, we must ensure that only
- * spu_sync_start is invoked and not the generic sync_start
- * in drivers/oprofile/oprof.c. A return value of
- * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
- * accomplish this.
*/
int spu_sync_start(void)
{
int k;
- int ret = SKIP_GENERIC_SYNC;
+ int ret = 0;
int register_ret;
- unsigned long flags = 0;
+ int cpu;
+ unsigned long flags;
+ unsigned long values[NUM_SPU_SYNC_START];
spu_prof_num_nodes = number_of_online_nodes();
- num_spu_nodes = spu_prof_num_nodes * 8;
+ num_spu_nodes = spu_prof_num_nodes * SPUS_PER_NODE;
- spin_lock_irqsave(&buffer_lock, flags);
- add_event_entry(ESCAPE_CODE);
- add_event_entry(SPU_PROFILING_CODE);
- add_event_entry(num_spu_nodes);
- spin_unlock_irqrestore(&buffer_lock, flags);
+ ret = initialize_active_cpu_nums();
+ if (ret)
+ goto out;
+
+ /* The SPU_PROFILING_CODE escape sequence must precede
+ * the SPU context switch info.
+ *
+ * SPU profiling and PPU profiling are not supported
+ * at the same time. SPU profiling does not support
+ * call graphs, hence we just need a lock to prevent multiple
+ * calls to oprofile_add_value().
+ */
+ values[0] = ESCAPE_CODE;
+ values[1] = SPU_PROFILING_CODE;
+ values[2] = (unsigned long) num_spu_nodes;
+
+ spin_lock_irqsave(&add_value_lock, flags);
+ for_each_online_cpu(cpu)
+ oprofile_add_value(values, cpu, NUM_SPU_SYNC_START);
+ spin_unlock_irqrestore(&add_value_lock, flags);
/* Register for SPU events */
register_ret = spu_switch_event_register(&spu_active);
if (register_ret) {
- ret = SYNC_START_ERROR;
+ ret = -1;
goto out;
}
- for (k = 0; k < (MAX_NUMNODES * 8); k++)
+ for (k = 0; k < (MAX_NUMNODES * SPUS_PER_NODE); k++) {
last_guard_val[k] = 0;
+ spu_ctx_sw_seen[k] = 0;
+ }
pr_debug("spu_sync_start -- running.\n");
out:
return ret;
@@ -405,13 +567,15 @@ void spu_sync_buffer(int spu_num, unsign
int num_samples)
{
unsigned long long file_offset;
- unsigned long flags;
+ unsigned long flags, flags_add_value;
int i;
struct vma_to_fileoffset_map *map;
struct spu *the_spu;
unsigned long long spu_num_ll = spu_num;
unsigned long long spu_num_shifted = spu_num_ll << 32;
struct cached_info *c_info;
+ unsigned long value;
+ int cpu_buf;
/* We need to obtain the cache_lock here because it's
* possible that after getting the cached_info, the SPU job
@@ -432,7 +596,9 @@ void spu_sync_buffer(int spu_num, unsign
map = c_info->map;
the_spu = c_info->the_spu;
- spin_lock(&buffer_lock);
+ spin_lock_irqsave(&add_value_lock, flags_add_value);
+ cpu_buf = get_cpu_buf(the_spu->number);
+
for (i = 0; i < num_samples; i++) {
unsigned int sample = *(samples+i);
int grd_val = 0;
@@ -446,37 +612,43 @@ void spu_sync_buffer(int spu_num, unsign
* use. We need to discard samples taken during the time
* period which an overlay occurs (i.e., guard value changes).
*/
- if (grd_val && grd_val != last_guard_val[spu_num]) {
- last_guard_val[spu_num] = grd_val;
+ if (grd_val && grd_val != last_guard_val[get_spu_index(spu_num)]) {
+ last_guard_val[get_spu_index(spu_num)] = grd_val;
/* Drop the rest of the samples. */
break;
}
- add_event_entry(file_offset | spu_num_shifted);
+ /* We must ensure that the SPU context switch has been written
+ * out before samples for the SPU. Otherwise, the SPU context
+ * information is not available and the postprocessing of the
+ * SPU PC will fail with no available anonymous map information.
+ */
+ if (likely(spu_ctx_sw_seen[get_spu_index(spu_num)])) {
+ value = file_offset | spu_num_shifted;
+ oprofile_add_value(&value, cpu_buf, 1);
+ }
}
- spin_unlock(&buffer_lock);
+ spin_unlock_irqrestore(&add_value_lock, flags_add_value);
out:
spin_unlock_irqrestore(&cache_lock, flags);
}
-int spu_sync_stop(void)
+void spu_sync_stop(void)
{
unsigned long flags = 0;
- int ret = spu_switch_event_unregister(&spu_active);
- if (ret) {
- printk(KERN_ERR "SPU_PROF: "
- "%s, line %d: spu_switch_event_unregister returned %d\n",
- __FUNCTION__, __LINE__, ret);
- goto out;
- }
+
+ /* Ignoring the return value from the unregister
+ * call. A failed return value simply says there
+ * was no registered event. Hence there will not
+ * be any calls to process a switch event that
+ * could cause a problem.
+ */
+ spu_switch_event_unregister(&spu_active);
spin_lock_irqsave(&cache_lock, flags);
- ret = release_cached_info(RELEASE_ALL);
+ release_cached_info(RELEASE_ALL);
spin_unlock_irqrestore(&cache_lock, flags);
-out:
pr_debug("spu_sync_stop -- done.\n");
- return ret;
+ return;
}
-
-
Index: Cell_kernel_5_15_2008-new/arch/powerpc/oprofile/op_model_cell.c
===================================================================
--- Cell_kernel_5_15_2008-new.orig/arch/powerpc/oprofile/op_model_cell.c
+++ Cell_kernel_5_15_2008-new/arch/powerpc/oprofile/op_model_cell.c
@@ -1191,15 +1191,15 @@ static int cell_sync_start(void)
if (spu_cycle_reset)
return spu_sync_start();
else
- return DO_GENERIC_SYNC;
+ return 0;
}
-static int cell_sync_stop(void)
+static void cell_sync_stop(void)
{
if (spu_cycle_reset)
- return spu_sync_stop();
- else
- return 1;
+ spu_sync_stop();
+
+ return;
}
struct op_powerpc_model op_model_cell = {
Index: Cell_kernel_5_15_2008-new/drivers/oprofile/buffer_sync.c
===================================================================
--- Cell_kernel_5_15_2008-new.orig/drivers/oprofile/buffer_sync.c
+++ Cell_kernel_5_15_2008-new/drivers/oprofile/buffer_sync.c
@@ -40,6 +40,7 @@ static cpumask_t marked_cpus = CPU_MASK_
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);
+extern int work_enabled; // carll added for debug
/* Take ownership of the task struct and place it on the
* list for processing. Only after two full buffer syncs
@@ -521,6 +522,46 @@ void sync_buffer(int cpu)
} else if (s->event == CPU_TRACE_BEGIN) {
state = sb_bt_start;
add_trace_begin();
+ } else if (s->event == VALUE_HEADER_ID) {
+ /* The next event entry contains the number
+ * values in the sequence to add.
+ */
+ int index, j, num;
+
+ if ((available - i) < 2)
+ /* The next entry which contains the
+ * number of entries in the sequence
+ * has not been written to the
+ * buffer yet.
+ */
+ break;
+
+ /* Get the number in the sequence without
+ * changing the state of the buffer.
+ */
+ index = cpu_buf->tail_pos + 1;
+ if (!(index < cpu_buf->buffer_size))
+ index = 0;
+
+ num = cpu_buf->buffer[index].eip;
+
+ if ((available - i) < (num+1))
+ /* The entire sequence has not been
+ * written to the buffer yet.
+ */
+ break;
+
+ if (work_enabled == 0) {
+ printk("work_enabled is zero\n");
+ }
+ for (j = 0; j < num; j++) {
+ increment_tail(cpu_buf);
+ i++;
+
+ s = &cpu_buf->buffer[cpu_buf->tail_pos];
+ add_event_entry(s->event);
+ }
+
} else {
struct mm_struct * oldmm = mm;
Index: Cell_kernel_5_15_2008-new/drivers/oprofile/cpu_buffer.c
===================================================================
--- Cell_kernel_5_15_2008-new.orig/drivers/oprofile/cpu_buffer.c
+++ Cell_kernel_5_15_2008-new/drivers/oprofile/cpu_buffer.c
@@ -32,7 +32,9 @@ struct oprofile_cpu_buffer cpu_buffer[NR
static void wq_sync_buffer(struct work_struct *work);
#define DEFAULT_TIMER_EXPIRE (HZ / 10)
-static int work_enabled;
+//carll changed static int work_enabled;
+extern int work_enabled;
+int work_enabled;
void free_cpu_buffers(void)
{
@@ -224,6 +226,27 @@ static void oprofile_end_trace(struct op
cpu_buf->tracing = 0;
}
+/*
+ * The first entry in the per cpu buffer consists of the escape code and
+ * the VALUE_HEADER_ID value. The next entry consists of the number of
+ * values in the sequence and the first value, followed by the entries
+ * for the next N-1 values.
+ */
+void oprofile_add_value(unsigned long *values, int cpu, int num) {
+ struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[cpu];
+ int i;
+
+ if (nr_available_slots(cpu_buf) < num+1) {
+ cpu_buf->sample_lost_overflow++;
+ return;
+ }
+
+ add_sample(cpu_buf, ESCAPE_CODE, VALUE_HEADER_ID);
+ add_sample(cpu_buf, num, values[0]);
+ for (i=1; i<num; i++)
+ add_sample(cpu_buf, 0, values[i]);
+}
+
void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
unsigned long event, int is_kernel)
{
Index: Cell_kernel_5_15_2008-new/drivers/oprofile/cpu_buffer.h
===================================================================
--- Cell_kernel_5_15_2008-new.orig/drivers/oprofile/cpu_buffer.h
+++ Cell_kernel_5_15_2008-new/drivers/oprofile/cpu_buffer.h
@@ -54,5 +54,6 @@ void cpu_buffer_reset(struct oprofile_cp
/* transient events for the CPU buffer -> event buffer */
#define CPU_IS_KERNEL 1
#define CPU_TRACE_BEGIN 2
+#define VALUE_HEADER_ID 3
#endif /* OPROFILE_CPU_BUFFER_H */
Index: Cell_kernel_5_15_2008-new/drivers/oprofile/event_buffer.h
===================================================================
--- Cell_kernel_5_15_2008-new.orig/drivers/oprofile/event_buffer.h
+++ Cell_kernel_5_15_2008-new/drivers/oprofile/event_buffer.h
@@ -17,6 +17,14 @@ int alloc_event_buffer(void);
void free_event_buffer(void);
+
+/**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+
/* wake up the process sleeping on the event file */
void wake_up_buffer_waiter(void);
Index: Cell_kernel_5_15_2008-new/drivers/oprofile/oprof.c
===================================================================
--- Cell_kernel_5_15_2008-new.orig/drivers/oprofile/oprof.c
+++ Cell_kernel_5_15_2008-new/drivers/oprofile/oprof.c
@@ -53,24 +53,13 @@ int oprofile_setup(void)
* us missing task deaths and eventually oopsing
* when trying to process the event buffer.
*/
- if (oprofile_ops.sync_start) {
- int sync_ret = oprofile_ops.sync_start();
- switch (sync_ret) {
- case 0:
- goto post_sync;
- case 1:
- goto do_generic;
- case -1:
- goto out3;
- default:
- goto out3;
- }
- }
-do_generic:
+ if (oprofile_ops.sync_start
+ && ((err = oprofile_ops.sync_start())))
+ goto out2;
+
if ((err = sync_start()))
goto out3;
-post_sync:
is_setup = 1;
mutex_unlock(&start_mutex);
return 0;
@@ -133,20 +122,9 @@ out:
void oprofile_shutdown(void)
{
mutex_lock(&start_mutex);
- if (oprofile_ops.sync_stop) {
- int sync_ret = oprofile_ops.sync_stop();
- switch (sync_ret) {
- case 0:
- goto post_sync;
- case 1:
- goto do_generic;
- default:
- goto post_sync;
- }
- }
-do_generic:
+ if (oprofile_ops.sync_stop)
+ oprofile_ops.sync_stop();
sync_stop();
-post_sync:
if (oprofile_ops.shutdown)
oprofile_ops.shutdown();
is_setup = 0;
Index: Cell_kernel_5_15_2008-new/include/asm-powerpc/oprofile_impl.h
===================================================================
--- Cell_kernel_5_15_2008-new.orig/include/asm-powerpc/oprofile_impl.h
+++ Cell_kernel_5_15_2008-new/include/asm-powerpc/oprofile_impl.h
@@ -48,7 +48,7 @@ struct op_powerpc_model {
void (*stop) (void);
void (*global_stop) (void);
int (*sync_start)(void);
- int (*sync_stop)(void);
+ void (*sync_stop)(void);
void (*handle_interrupt) (struct pt_regs *,
struct op_counter_config *);
int num_counters;
Index: Cell_kernel_5_15_2008-new/include/linux/oprofile.h
===================================================================
--- Cell_kernel_5_15_2008-new.orig/include/linux/oprofile.h
+++ Cell_kernel_5_15_2008-new/include/linux/oprofile.h
@@ -51,17 +51,15 @@ struct oprofile_operations {
int (*setup)(void);
/* Do any necessary interrupt shutdown. Optional. */
void (*shutdown)(void);
- /* Start delivering interrupts. */
+ /* Start delivering interrupts. */
int (*start)(void);
/* Stop delivering interrupts. */
void (*stop)(void);
/* Arch-specific buffer sync functions.
- * Return value = 0: Success
- * Return value = -1: Failure
- * Return value = 1: Run generic sync function
+ * Sync start: Return 0 for Success, -1 for Failure
*/
int (*sync_start)(void);
- int (*sync_stop)(void);
+ void (*sync_stop)(void);
/* Initiate a stack backtrace. Optional. */
void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
@@ -84,13 +82,6 @@ int oprofile_arch_init(struct oprofile_o
void oprofile_arch_exit(void);
/**
- * Add data to the event buffer.
- * The data passed is free-form, but typically consists of
- * file offsets, dcookies, context information, and ESCAPE codes.
- */
-void add_event_entry(unsigned long data);
-
-/**
* Add a sample. This may be called from any context. Pass
* smp_processor_id() as cpu.
*/
@@ -106,6 +97,22 @@ void oprofile_add_sample(struct pt_regs
void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
unsigned long event, int is_kernel);
+/*
+ * Add a sequence of values to the per CPU buffer. An array of values is
+ * added to the specified cpu buffer with no additional processing. The assumption
+ * is any processing of the value will be done in the postprocessor. This
+ * function should only be used for special architecture specific data.
+ * Currently only used by the CELL processor.
+ *
+ * REQUIREMENT: the user of the function must ensure that only one call at
+ * a time is made to this function. Additionally, it must ensure that
+ * no calls are made to the following routines: oprofile_begin_trace(),
+ * oprofile_add_ext_sample(), oprofile_add_pc(), oprofile_add_trace().
+ *
+ * This function does not perform a backtrace.
+ */
+void oprofile_add_value(unsigned long *values, int cpu, int num);
+
/* Use this instead when the PC value is not from the regs. Doesn't
* backtrace. */
void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event);
On Monday 09 June 2008, Carl Love wrote:
>
> This is a reworked patch, version 3, to fix the SPU profile
> sample data collection.
>
> Currently, the SPU escape sequences and program counter data are being
> added directly into the kernel buffer without holding the buffer_mutex
> lock. This patch changes how the data is stored. A new function,
> oprofile_add_value(), is added to the oprofile driver to allow adding
> generic data to the per cpu buffers. This enables adding a series of
> data to a specified cpu_buffer. The function is used to add SPU data
> into a cpu buffer. The patch also adds the code needed to move the
> special sequence to the kernel buffer. There are restrictions on
> the use of the oprofile_add_value() function to ensure data is
> properly inserted into the specified CPU buffer.
>
> Finally, this patch backs out the changes previously added to the
> oprofile generic code for handling the architecture specific
> ops.sync_start and ops.sync_stop that allowed the architecture
> to skip the per CPU buffer creation.
>
> Signed-off-by: Carl Love <[email protected]>
Thanks for your new post, addressing my previous complaints.
Unfortunately, I'm still uncomfortable with a number of details
about how the data is passed around, and there are a few style
issues that should be easier to address.
> - if (num_samples == 0) {
> + if (unlikely(!spu_prof_running)) {
> + spin_unlock_irqrestore(&sample_array_lock,
> + sample_array_lock_flags);
> + goto stop;
> + }
> +
> + if (unlikely(num_samples == 0)) {
> spin_unlock_irqrestore(&sample_array_lock,
> sample_array_lock_flags);
> continue;
I think you should not use 'unlikely' here, because that optimizes only
very specific cases, and often at the expense of code size and readability.
If you really want to optimize the spu_prof_running check, how about
doing it outside of the loop?
Similarly, it probably makes sense to hold the lock around the loop.
Note that you must never have a global variable for 'flags' used
in a spinlock, as that will get clobbered if you have lock contention,
leading to a CPU running with interrupts disabled accidentally.
It's better to just use spin_lock_irq or spin_lock, respectively,
since you usually know whether interrupts are enabled or not
anyway!
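Just to illustrate the structure I mean, roughly (untested sketch,
keeping your names; whether plain spin_lock is enough here depends on
the interrupt state of the hrtimer callback):

static enum hrtimer_restart profile_spus(struct hrtimer *timer)
{
        unsigned long flags;    /* on the stack, not a global */
        int cpu, num_samples;

        /* check once, before the loop, instead of per iteration */
        if (!spu_prof_running)
                return HRTIMER_NORESTART;

        spin_lock_irqsave(&sample_array_lock, flags);
        for_each_online_cpu(cpu) {
                num_samples = cell_spu_pc_collection(cpu);
                if (num_samples == 0)
                        continue;
                /* ... record the samples as before ... */
        }
        spin_unlock_irqrestore(&sample_array_lock, flags);

        /* ... re-arm the timer and return HRTIMER_RESTART ... */
}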
> @@ -214,8 +218,26 @@ int start_spu_profiling(unsigned int cyc
>
> void stop_spu_profiling(void)
> {
> + int cpu;
> +
> spu_prof_running = 0;
> hrtimer_cancel(&timer);
> +
> + /* ensure everyone sees spu_prof_running
> + * changed to 0.
> + */
> + smp_wmb();
I think you'd also need to have an smp_rmb() on the reader side to
make this visible, but it's easier to just use an atomic_t
variable instead, or to hold sample_array_lock when changing
it, since you already hold that on the reader side.
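That is, something like this (sketch):

static atomic_t spu_prof_running = ATOMIC_INIT(0);

/* writer side, e.g. in stop_spu_profiling(): */
atomic_set(&spu_prof_running, 0);

/* reader side, e.g. in profile_spus(): */
if (!atomic_read(&spu_prof_running))
        return HRTIMER_NORESTART;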
> -static DEFINE_SPINLOCK(buffer_lock);
> +static DEFINE_SPINLOCK(add_value_lock);
> static DEFINE_SPINLOCK(cache_lock);
> static int num_spu_nodes;
> int spu_prof_num_nodes;
> -int last_guard_val[MAX_NUMNODES * 8];
> +int last_guard_val[MAX_NUMNODES * SPUS_PER_NODE];
> +static int spu_ctx_sw_seen[MAX_NUMNODES * SPUS_PER_NODE];
> +
> +/* an array for mapping spu numbers to an index in an array */
> +static int spu_num_to_index[MAX_NUMNODES * SPUS_PER_NODE];
> +static int max_spu_num_to_index=0;
> +static DEFINE_SPINLOCK(spu_index_map_lock);
Your three locks are rather confusing. Please try to consolidate them
into fewer spinlocks and/or document for each lock exactly what
data structures it protects, and how it nests with the other locks
(e.g. "lock order: spu_index_map_lock can not be held around other locks")
and whether you need to disable interrupts and bottom half processing
while holding them.
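Even a comment block next to the lock definitions would help, e.g.
(the rules below are only placeholders, fill in the real ones):

/*
 * Lock ordering and scope:
 *
 * spu_index_map_lock -- protects spu_num_to_index[] and
 *      max_spu_num_to_index; never held around another lock.
 * add_value_lock -- serializes oprofile_add_value() calls and
 *      protects spu_cpu_map[] and spu_ctx_sw_seen[]; taken with
 *      interrupts disabled.
 * cache_lock -- protects spu_info[]; nests outside add_value_lock.
 */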
>
> -static struct cached_info *spu_info[MAX_NUMNODES * 8];
> +static struct cached_info *spu_info[MAX_NUMNODES * SPUS_PER_NODE];
> +
> +struct list_cpu_nums {
> + struct list_cpu_nums *next;
> + int cpu_num;
> +};
> +
> +struct spu_cpu_map_struct {
> + int cpu_num;
> + int spu_num;
> +};
> +
> +struct spu_cpu_map_struct spu_cpu_map[MAX_NUMNODES * SPUS_PER_NODE];
> +
> +struct list_cpu_nums *active_cpu_nums_list;
> +struct list_cpu_nums *next_cpu;
Here, you are putting more symbols into the global namespace, which is
not acceptable. E.g. 'next_cpu' should either be static, or be called
something like spu_task_sync_next_cpu.
Ideally, put them into a function context, or get rid of the globals
altogether.
> +static int max_entries_spu_cpu_map=0;
By convention, we do not preinitialize global variables to zero.
> +
> +/* In general, don't know what the SPU number range will be.
> + * Create an array to define what SPU number is mapped to each
> + * index in an array. Want to be able to have multiple calls
> + * lookup an index simultaneously. Only hold a lock when adding
> + * a new entry.
> + */
> +static int add_spu_index(int spu_num) {
> + int i, tmp;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&spu_index_map_lock, flags);
> +
> + /* Need to double check that entry didn't get added
> + * since the call to get_spu_index() didn't find it.
> + */
> + for (i=0; i<max_spu_num_to_index; i++)
> + if (spu_num_to_index[i] == spu_num) {
> + tmp = i;
> + goto out;
> + }
> +
> + /* create map for spu num */
> +
> + tmp = max_spu_num_to_index;
> + spu_num_to_index[max_spu_num_to_index] = spu_num;
> + max_spu_num_to_index++;
> +
> +out: spin_unlock_irqrestore(&spu_index_map_lock, flags);
> +
> + return tmp;
> +}
> +
> +static int get_spu_index(int spu_num) {
> + int i, tmp;
> +
> + /* check if spu map has been created */
> + for (i=0; i<max_spu_num_to_index; i++)
> + if (spu_num_to_index[i] == spu_num) {
> + tmp = i;
> + goto out;
> + }
> +
> + tmp = add_spu_index(spu_num);
> +
> +out: return tmp;
> +}
> +
> +static int valid_spu_num(int spu_num) {
> + int i;
> +
> + /* check if spu map has been created */
> + for (i=0; i<max_spu_num_to_index; i++)
> + if (spu_num_to_index[i] == spu_num)
> + return 1;
> +
> + /* The spu number has not been seen */
> + return 0;
> +}
> +
> +static int initialize_active_cpu_nums(void) {
> + int cpu;
> + struct list_cpu_nums *tmp;
> +
> + /* initialize the circular list */
> +
> + active_cpu_nums_list = NULL;
> +
> + for_each_online_cpu(cpu) {
> + if (!(tmp = kzalloc(sizeof(struct list_cpu_nums),
> + GFP_KERNEL)))
> + return -ENOMEM;
> +
> + tmp->cpu_num = cpu;
> +
> + if (!active_cpu_nums_list) {
> + active_cpu_nums_list = tmp;
> + tmp->next = tmp;
> +
> + } else {
> + tmp->next = active_cpu_nums_list->next;
> + active_cpu_nums_list->next = tmp;
> + }
> + }
> + next_cpu = active_cpu_nums_list;
> + return 0;
> +}
This function does not clean up after itself when getting an ENOMEM.
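One way to plug that leak (untested sketch):

static void free_active_cpu_nums(void)
{
        struct list_cpu_nums *entry, *next;

        if (!active_cpu_nums_list)
                return;

        /* break the circle, then walk and free every node */
        next = active_cpu_nums_list->next;
        active_cpu_nums_list->next = NULL;
        while (next) {
                entry = next;
                next = entry->next;
                kfree(entry);
        }
        active_cpu_nums_list = NULL;
}

initialize_active_cpu_nums() would call this before returning
-ENOMEM, and you get a matching teardown for the shutdown path
for free.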
> +
> +static int get_cpu_buf(int spu_num) {
> + int i;
> +
> +
> + for (i=0; i< max_entries_spu_cpu_map; i++)
> + if (spu_cpu_map[i].spu_num == spu_num)
> + return spu_cpu_map[i].cpu_num;
> +
> + /* no mapping found, create mapping using the next
> + * cpu in the circular list of cpu numbers.
> + */
> + spu_cpu_map[max_entries_spu_cpu_map].spu_num = spu_num;
> + spu_cpu_map[max_entries_spu_cpu_map].cpu_num = next_cpu->cpu_num;
> +
> + next_cpu = next_cpu->next;
> +
> + return spu_cpu_map[max_entries_spu_cpu_map++].cpu_num;
> +}
>
The whole logic you are adding here (and the functions above)
looks overly complicated. I realize now that this may be my
fault because I told you not to rely on the relation between
SPU numbers and CPU numbers, but I'm still not sure if this
is even a correct approach.
Your code guarantees that all samples from a physical SPU always
end up in the same CPU's buffer. However, when an SPU context
gets moved by the scheduler from one SPU to another, how do you
maintain the consistency between the old and the new samples?
Also, this approach fundamentally does not handle CPU hotplug,
though that is probably something that was broken in this code
before.
Having to do so much work just to be able to use the per-cpu
buffers indicates that it may not be such a good idea after all.
> @@ -161,27 +293,28 @@ out:
> */
> static int release_cached_info(int spu_index)
> {
> - int index, end;
> + int index, end, spu_num;
>
> if (spu_index == RELEASE_ALL) {
> - end = num_spu_nodes;
> + end = max_spu_num_to_index;
> index = 0;
> } else {
> - if (spu_index >= num_spu_nodes) {
> + if (!valid_spu_num(spu_index)) {
> printk(KERN_ERR "SPU_PROF: "
> "%s, line %d: "
> "Invalid index %d into spu info cache\n",
> __FUNCTION__, __LINE__, spu_index);
> goto out;
> }
> - end = spu_index + 1;
> - index = spu_index;
> + index = get_spu_index(spu_index);
> + end = index + 1;
> }
> for (; index < end; index++) {
> - if (spu_info[index]) {
> - kref_put(&spu_info[index]->cache_ref,
> + spu_num = spu_num_to_index[index];
> + if (spu_info[spu_num]) {
> + kref_put(&spu_info[spu_num]->cache_ref,
> destroy_cached_info);
> - spu_info[index] = NULL;
> + spu_info[spu_num] = NULL;
> }
> }
>
> @@ -289,6 +422,8 @@ static int process_context_switch(struct
> int retval;
> unsigned int offset = 0;
> unsigned long spu_cookie = 0, app_dcookie;
> + unsigned long values[NUM_SPU_CNTXT_SW];
> + int cpu_buf;
>
> retval = prepare_cached_spu_info(spu, objectId);
> if (retval)
> @@ -303,17 +438,31 @@ static int process_context_switch(struct
> goto out;
> }
>
> - /* Record context info in event buffer */
> - spin_lock_irqsave(&buffer_lock, flags);
> - add_event_entry(ESCAPE_CODE);
> - add_event_entry(SPU_CTX_SWITCH_CODE);
> - add_event_entry(spu->number);
> - add_event_entry(spu->pid);
> - add_event_entry(spu->tgid);
> - add_event_entry(app_dcookie);
> - add_event_entry(spu_cookie);
> - add_event_entry(offset);
> - spin_unlock_irqrestore(&buffer_lock, flags);
> + /* Record context info in event buffer. Note, there are more
> + * SPUs than CPUs. Map the SPU events/data for a given SPU to
> + * the same CPU buffer. Need to ensure the cntxt switch data and
> + * samples stay in order.
> + */
> +
> + spin_lock_irqsave(&add_value_lock, flags);
> + cpu_buf = get_cpu_buf(spu->number);
> +
> + values[0] = ESCAPE_CODE;
> + values[1] = SPU_CTX_SWITCH_CODE;
> + values[2] = spu->number;
> + values[3] = spu->pid;
> + values[4] = spu->tgid;
> + values[5] = app_dcookie;
> + values[6] = spu_cookie;
> + values[7] = offset;
> + oprofile_add_value(values, cpu_buf, NUM_SPU_CNTXT_SW);
> +
> + /* Set flag to indicate SPU PC data can now be written out. If
> + * the SPU program counter data is seen before an SPU context
> + * record is seen, the postprocessing will fail.
> + */
> + spu_ctx_sw_seen[get_spu_index(spu->number)] = 1;
> + spin_unlock_irqrestore(&add_value_lock, flags);
> smp_wmb(); /* ensure spu event buffer updates are written */
> /* don't want entries intermingled... */
The spin_unlock obviously contains an implicit barrier, otherwise
it would not be much good as a lock, so you can remove the smp_wmb()
here.
> out:
> @@ -363,38 +512,51 @@ static int number_of_online_nodes(void)
> /* The main purpose of this function is to synchronize
> * OProfile with SPUFS by registering to be notified of
> * SPU task switches.
> - *
> - * NOTE: When profiling SPUs, we must ensure that only
> - * spu_sync_start is invoked and not the generic sync_start
> - * in drivers/oprofile/oprof.c. A return value of
> - * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
> - * accomplish this.
> */
> int spu_sync_start(void)
> {
> int k;
> - int ret = SKIP_GENERIC_SYNC;
> + int ret = 0;
> int register_ret;
> - unsigned long flags = 0;
> + int cpu;
> + unsigned long flags;
> + unsigned long values[NUM_SPU_SYNC_START];
>
> spu_prof_num_nodes = number_of_online_nodes();
> - num_spu_nodes = spu_prof_num_nodes * 8;
> + num_spu_nodes = spu_prof_num_nodes * SPUS_PER_NODE;
>
> - spin_lock_irqsave(&buffer_lock, flags);
> - add_event_entry(ESCAPE_CODE);
> - add_event_entry(SPU_PROFILING_CODE);
> - add_event_entry(num_spu_nodes);
> - spin_unlock_irqrestore(&buffer_lock, flags);
> + ret = initialize_active_cpu_nums();
> + if (ret)
> + goto out;
> +
> + /* The SPU_PROFILING_CODE escape sequence must precede
> + * the SPU context switch info.
> + *
> + * SPU profiling and PPU profiling are not supported
> + * at the same time. SPU profiling does not support
> + * call graphs, hence we just need a lock to prevent multiple
> + * calls to oprofile_add_value().
> + */
> + values[0] = ESCAPE_CODE;
> + values[1] = SPU_PROFILING_CODE;
> + values[2] = (unsigned long) num_spu_nodes;
> +
> + spin_lock_irqsave(&add_value_lock, flags);
> + for_each_online_cpu(cpu)
> + oprofile_add_value(values, cpu, NUM_SPU_SYNC_START);
> + spin_unlock_irqrestore(&add_value_lock, flags);
>
> /* Register for SPU events */
> register_ret = spu_switch_event_register(&spu_active);
> if (register_ret) {
> - ret = SYNC_START_ERROR;
> + ret = -1;
> goto out;
> }
>
> - for (k = 0; k < (MAX_NUMNODES * 8); k++)
> + for (k = 0; k < (MAX_NUMNODES * SPUS_PER_NODE); k++) {
> last_guard_val[k] = 0;
> + spu_ctx_sw_seen[k] = 0;
> + }
Why do you need the spu_ctx_sw_seen here? Shouldn't
spu_switch_event_register() cause a switch event in every context
anyway? You call that just a few lines earlier.
> -int spu_sync_stop(void)
> +void spu_sync_stop(void)
> {
> unsigned long flags = 0;
> - int ret = spu_switch_event_unregister(&spu_active);
> - if (ret) {
> - printk(KERN_ERR "SPU_PROF: "
> - "%s, line %d: spu_switch_event_unregister returned %d\n",
> - __FUNCTION__, __LINE__, ret);
> - goto out;
> - }
> +
> + /* Ignoring the return value from the unregister
> + * call. A failed return value simply says there
> + * was no registered event. Hence there will not
> + * be any calls to process a switch event that
> + * could cause a problem.
> + */
> + spu_switch_event_unregister(&spu_active);
Please change the prototype of the unregister function instead,
then; you are the only caller anyway.
> @@ -40,6 +40,7 @@ static cpumask_t marked_cpus = CPU_MASK_
> static DEFINE_SPINLOCK(task_mortuary);
> static void process_task_mortuary(void);
>
> +extern int work_enabled; // carll added for debug
This obviously needs to be removed again.
>
> /* Take ownership of the task struct and place it on the
> * list for processing. Only after two full buffer syncs
> @@ -521,6 +522,46 @@ void sync_buffer(int cpu)
> } else if (s->event == CPU_TRACE_BEGIN) {
> state = sb_bt_start;
> add_trace_begin();
> + } else if (s->event == VALUE_HEADER_ID) {
> + /* The next event entry contains the number
> + * values in the sequence to add.
> + */
> + int index, j, num;
> +
> + if ((available - i) < 2)
> + /* The next entry which contains the
> + * number of entries in the sequence
> + * has not been written to the
> + * buffer yet.
> + */
> + break;
> +
> + /* Get the number in the sequence without
> + * changing the state of the buffer.
> + */
> + index = cpu_buf->tail_pos + 1;
> + if (!(index < cpu_buf->buffer_size))
> + index = 0;
> +
> + num = cpu_buf->buffer[index].eip;
> +
> + if ((available - i) < (num+1))
> + /* The entire sequence has not been
> + * written to the buffer yet.
> + */
> + break;
> +
> + if (work_enabled == 0) {
> + printk("work_enabled is zero\n");
> + }
> + for (j = 0; j < num; j++) {
> + increment_tail(cpu_buf);
> + i++;
> +
> + s = &cpu_buf->buffer[cpu_buf->tail_pos];
> + add_event_entry(s->event);
> + }
> +
> } else {
> struct mm_struct * oldmm = mm;
This is a substantial amount of code that you are adding to the existing
function, please move this out into a separate function, for better
readability.
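E.g. roughly like this (sketch only; the 'i'/'s' bookkeeping in the
caller still needs care):

/* Move one complete value sequence from the cpu buffer to the event
 * buffer. Returns the number of entries consumed, or -1 if the
 * sequence is not completely in the buffer yet and the caller
 * should stop processing.
 */
static int add_value_sequence(struct oprofile_cpu_buffer *cpu_buf,
                              unsigned long remaining)
{
        struct op_sample *s;
        unsigned long index, num, j;

        if (remaining < 2)
                return -1;

        /* peek at the sequence length without consuming the entry */
        index = cpu_buf->tail_pos + 1;
        if (!(index < cpu_buf->buffer_size))
                index = 0;
        num = cpu_buf->buffer[index].eip;

        if (remaining < num + 1)
                return -1;

        for (j = 0; j < num; j++) {
                increment_tail(cpu_buf);
                s = &cpu_buf->buffer[cpu_buf->tail_pos];
                add_event_entry(s->event);
        }
        return num;
}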
> @@ -32,7 +32,9 @@ struct oprofile_cpu_buffer cpu_buffer[NR
> static void wq_sync_buffer(struct work_struct *work);
>
> #define DEFAULT_TIMER_EXPIRE (HZ / 10)
> -static int work_enabled;
> +//carll changed static int work_enabled;
> +extern int work_enabled;
> +int work_enabled;
please revert.
> Index: Cell_kernel_5_15_2008-new/drivers/oprofile/cpu_buffer.h
> ===================================================================
> --- Cell_kernel_5_15_2008-new.orig/drivers/oprofile/cpu_buffer.h
> +++ Cell_kernel_5_15_2008-new/drivers/oprofile/cpu_buffer.h
> @@ -54,5 +54,6 @@ void cpu_buffer_reset(struct oprofile_cp
> /* transient events for the CPU buffer -> event buffer */
> #define CPU_IS_KERNEL 1
> #define CPU_TRACE_BEGIN 2
> +#define VALUE_HEADER_ID 3
'VALUE_HEADER_ID' does not describe well enough to me what this
is doing. Maybe some other identifier if you can think of one?
> @@ -106,6 +97,22 @@ void oprofile_add_sample(struct pt_regs
> void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
> unsigned long event, int is_kernel);
>
> +/*
> + * Add a sequence of values to the per CPU buffer. An array of values is
> + * added to the specified cpu buffer with no additional processing. The assumption
> + * is any processing of the value will be done in the postprocessor. This
> + * function should only be used for special architecture specific data.
> + * Currently only used by the CELL processor.
> + *
> + * REQUIREMENT: the user of the function must ensure that only one call at
> + * a time is made to this function. Additionally, it must ensure that
> + * no calls are made to the following routines: oprofile_begin_trace(),
> + * oprofile_add_ext_sample(), oprofile_add_pc(), oprofile_add_trace().
> + *
> + * This function does not perform a backtrace.
> + */
> +void oprofile_add_value(unsigned long *values, int cpu, int num);
> +
> /* Use this instead when the PC value is not from the regs. Doesn't
> * backtrace. */
> void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event);
>
These are pretty strict requirements. How do you ensure that these are
all true? What if the user is running timer based profiling on the CPU
at the same time as event profiling on the SPU?
Maybe someone else with an oprofile background can comment on whether the
interface is acceptable as is.
Arnd <><
Arnd and company:
There are issues with using the cpu buffers for storing the SPU data. I
agree with you on some of your comments about the high complexity of the
per cpu patch. It seemed like a good idea at the start but in the end I
am not all that happy with it.
I have another approach to solving this bug. I will layout the approach
and then try to give the pros/cons of the proposed approach. Hopefully
everyone will help out with the design so we can come up with something
that will address the issues that we have seen with the previous
approaches. I think we are getting closer to an acceptable solution.
Maybe the fourth try will be the one. :-)
New approach, call it Single system wide buffer.
In arch/powerpc/oprofile/cell/spu_profiler.c,
- Create a function to return the per cpu buffer size.
- Create a single system wide buffer (call it SPU_BUFFER) to store the
SPU context switch and SPU PC data. The buffer would be an array of
unsigned longs. Currently the cpu buffer is a struct containing two
unsigned longs. The function would be called in function
start_spu_profiling() as part of starting an SPU profiling session. The
size of the buffer would be based on the size of the per cpu buffers * a
scale factor for handling all of the spu data.
- Create a function in arch/powerpc/oprofile/cell/spu_profiler.c that
would be periodically called via a delayed workqueue call to flush
SPU_BUFFER to the kernel buffer. It would have to call the routine to
get the kernel buffer mutex. Grab the lock protecting writes to the
SPU_BUFFER, flush the data to the kernel buffer and then release the
locks.
- Create a function to store data into the SPU_BUFFER. It will be
managed by a lock to ensure atomic access. If the buffer fills, data
will be dropped on the floor as is currently done when the per cpu
buffer fills. The cpu_buf->sample_lost_overflow stats would be
incremented, as if we were using them, so the user would know that the
buffer was not big enough. We would keep the fact that we really are
not using per cpu buffers transparent to the user. The idea is the
buffer size management would all look the same to the user.
In the drivers/oprofile/buffer_sync.c file:
- Create a function to obtain the kernel buffer mutex lock.
- Create a second function to release the kernel buffer mutex lock.
- Make these two functions visible to the architecture specific code by
exporting them in include/linux/oprofile.h.
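To make the above concrete, here is roughly what I am picturing (all
names made up, nothing tested):

/* drivers/oprofile/buffer_sync.c: expose the existing buffer_mutex */
void oprofile_get_buffer_mutex(void)
{
        mutex_lock(&buffer_mutex);
}

void oprofile_put_buffer_mutex(void)
{
        mutex_unlock(&buffer_mutex);
}

/* arch/powerpc/oprofile/cell/spu_profiler.c */
static unsigned long *spu_buff;
static unsigned long spu_buff_size, spu_buff_head;
static DEFINE_SPINLOCK(spu_buff_lock);

static void spu_buff_add(unsigned long value)
{
        unsigned long flags;

        spin_lock_irqsave(&spu_buff_lock, flags);
        if (spu_buff_head < spu_buff_size)
                spu_buff[spu_buff_head++] = value;
        /* else: drop the value and bump the overflow statistic */
        spin_unlock_irqrestore(&spu_buff_lock, flags);
}

static void spu_buff_flush(struct work_struct *work)
{
        unsigned long flags, i;

        oprofile_get_buffer_mutex();
        spin_lock_irqsave(&spu_buff_lock, flags);
        for (i = 0; i < spu_buff_head; i++)
                add_event_entry(spu_buff[i]);
        spu_buff_head = 0;
        spin_unlock_irqrestore(&spu_buff_lock, flags);
        oprofile_put_buffer_mutex();
}
static DECLARE_DELAYED_WORK(spu_buff_work, spu_buff_flush);

The work function would be scheduled from start_spu_profiling() with
schedule_delayed_work() and would re-arm itself until the session
stops.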
Pros single system wide buffer
1) It would not require changing existing architecture independent code
to add the special data to a CPU buffer, move the special data to the
kernel buffer, remove the code that was added so that the SPU profiling
code does not enable the creation of the per cpu buffers. Summary,
minimal changing of architecture independent code
2) The SPU_BUFFER size would be managed using the same user controls and
feedback as the per cpu buffers. The goal would be to make management
look like we actually used cpu buffers.
3) I think the overall kernel data storage space would be less.
Currently, I have to increase the size of the cpu buffers, done in user
land, by 8 times to handle the SPU data. Since each entry in the
SPU_BUFFER would be one unsigned long instead of the two unsigned longs
of a cpu buffer entry, we would save 2x here. We would not have to have
an escape sequence entry before each data sample, which would save an
additional 2x in space.
4) Do not have to manage mapping of SPUs to CPU buffers. We don't care
what the cpu or spu numbers are.
5) I think the memory storage could be adjusted better for SPU event
profiling that I am working on.
Cons
1) Currently OProfile does not support CPU hotplug. Not sure how easy
it would be or what issues there may be in supporting CPU hot plug with
this approach. Can't really say if it would be easier/harder than with
per cpu buffers.
2) I think the cpu_buf stats are allocated when the module loads. I
think it is just the per cpu buffer allocation that is done dynamically
when the daemon starts/shutdown. I have not investigated this enough to
say for sure. If this is not the case, then we may need to tweak the
generic oprofile code a bit more.
3) There is probably some other gotcha out there I have not thought of
yet.
Any additional ideas, comments, concerns??? It would be nice if we
could flush out as many issues as possible now.
Thanks for your time and thought on this.
Carl Love
On Wednesday 11 June 2008, Carl Love wrote:
> Arnd and company:
>
> There are issues with using the cpu buffers for storing the SPU data. I
> agree with you on some of your comments about the high complexity of the
> per cpu patch. It seemed like a good idea at the start but in the end I
> am not all that happy with it.
Right, I agree. I assumed it would be enough to always write to the
local CPUs buffer, but as you explained, that caused other problems
that you had to work around.
Thanks for your detailed outline of your ideas, I think we're getting
much closer to a solution now!
> New approach, call it Single system wide buffer.
I like the approach in general, but would prefer to have it more generic,
so that other subsystems can use the same infrastructure if necessary:
> In arch/powerpc/oprofile/cell/spu_profiler.c,
> - Create a function to return the per cpu buffer size.
>
> - Create a single system wide buffer (call it SPU_BUFFER) to store the
> SPU context switch and SPU PC data. The buffer would be an array of
> unsigned longs. Currently the cpu buffer is a struct containing two
> unsigned longs. The function would be called in function
> start_spu_profiling() as part of starting an SPU profiling session. The
> size of the buffer would be based on the size of the per cpu buffers * a
> scale factor for handling all of the spu data.
I think the dependency on the size of the per cpu buffers is a bit
artificial. How about making this independently tunable with
another oprofilefs file and a reasonable default?
You could either create the extra files in oprofile_create_files
or in oprofile_ops.create_files.
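E.g. something like this (sketch; the file and variable names are
made up):

static unsigned long spu_buffer_size = 65536;   /* default */

static int cell_create_files(struct super_block *sb,
                             struct dentry *root)
{
        return oprofilefs_create_ulong(sb, root, "spu_buffer_size",
                                       &spu_buffer_size);
}

and then hook cell_create_files up through the create_files callback
in oprofile_arch_init().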
> - Create a function in arch/powerpc/oprofile/cell/spu_profiler.c that
> would be periodically called via a delayed workqueue call to flush
> SPU_BUFFER to the kernel buffer. It would have to call the routine to
> get the kernel buffer mutex. Grab the lock protecting writes to the
> SPU_BUFFER, flush the data to the kernel buffer and then release the
> locks.
>
> - Create a function to store data into the SPU_BUFFER. It will be
> managed by a lock to ensure atomic access. If the buffer fills, data
> will be dropped on the floor as is currently done when the per cpu
> buffer fills. The cpu_buf->sample_lost_overflow stats would be
> incremented, as if we were using them, so the user would know that the
> buffer was not big enough. We would keep the fact that we really are
> not using per cpu buffers transparent to the user. The idea is the
> buffer size management would all look the same to the user.
>
> In the drivers/oprofile/buffer_sync.c file:
> - Create a function to obtain the kernel buffer mutex lock.
> - Create a second function to release the kernel buffer mutex lock.
> - Make these two functions visible to the architecture specific code by
> exporting them in include/linux/oprofile.h.
As a general design principle, I'd always try to have locks close
to the data. What this means is that I think the SPU_BUFFER should
be in drivers/oprofile, similar to the cpu_buffer. In that case,
it may be better to change the name to something more generic, e.g.
aux_buffer.
Similarly, moving data into the aux_buffer can be done under a spinlock
that is defined in the same file, with an interface like the
oprofile_add_value() function from your V3 patch, but moving the
add_value_lock in there.
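So the interface could shrink to something like (sketch):

/* drivers/oprofile/cpu_buffer.c, or a new aux_buffer.c */
static DEFINE_SPINLOCK(aux_buffer_lock);
static unsigned long *aux_buffer;
static unsigned long aux_head, aux_size;

void oprofile_add_value(unsigned long *values, int num)
{
        unsigned long flags;
        int i;

        spin_lock_irqsave(&aux_buffer_lock, flags);
        if (aux_head + num <= aux_size)
                for (i = 0; i < num; i++)
                        aux_buffer[aux_head++] = values[i];
        /* else: account a lost sample */
        spin_unlock_irqrestore(&aux_buffer_lock, flags);
}

Note that the 'cpu' argument from your V3 version goes away, since
there is only one buffer.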
> Pros single system wide buffer
> 1) It would not require changing existing architecture independent code
> to add the special data to a CPU buffer, move the special data to the
> kernel buffer, remove the code that was added so that the SPU profiling
> code does not enable the creation of the per cpu buffers. Summary,
> minimal changing of architecture independent code
I don't think that's a requirement. I don't mind doing changes to
spufs or oprofile common code at all, as long as the change is
reasonable. This is especially true when there are potentially
other users that can benefit from the change.
> 2) The SPU_BUFFER size would be managed using the same user controls and
> feedback as the per cpu buffers. The goal would be to make management
> look like we actually used cpu buffers.
As mentioned above, I'd see this as a disadvantage.
> 3) I think the overall kernel data storage space would be less.
> Currently, I have to increase the size of the cpu buffers, done in user
> land, by 8 times to handle the SPU data. Since each entry in the
> SPU_BUFFER would be one unsigned long instead of the two unsigned longs
> of a cpu buffer entry, we would save 2x here. We would not have to have
> an escape sequence entry before each data sample, which would save an
> additional 2x in space.
right
> 4) Do not have to manage mapping of SPUs to CPU buffers. We don't care
> what the cpu or spu numbers are.
right
> 5) I think the memory storage could be adjusted better for SPU event
> profiling that I am working on.
right.
> Cons
> 1) Currently OProfile does not support CPU hotplug. Not sure how easy
> it would be or what issues there may be in supporting CPU hot plug with
> this approach. Can't really say if it would be easier/harder than with
> per cpu buffers.
I think it would be harder, but if oprofile doesn't support it anyway,
let's not worry about it.
> 2) I think the cpu_buf stats are allocated when the module loads. I
> think it is just the per cpu buffer allocation that is done dynamically
> when the daemon starts/shutdown. I have not investigated this enough to
> say for sure. If this is not the case, then we may need to tweak the
> generic oprofile code a bit more.
Both the event_buffer and the cpu_buffers are allocated in oprofile_setup
when the event buffer file is opened. When you have another buffer besides
the cpu_buffers, you can either change the oprofile_setup function to
allocate it, or do it in your setup() callback.
> 3) There is probably some other gotcha out there I have not thought of
> yet.
None that I can think of.
Arnd <><
Arnd Bergmann wrote:
> On Wednesday 11 June 2008, Carl Love wrote:
>
>> Arnd and company:
>>
>> There are issues with using the cpu buffers for storing the SPU data. I
>> agree with you on some of your comments about the high complexity of the
>> per cpu patch. It seemed like a good idea at the start but in the end I
>> am not all that happy with it.
>>
>
> Right, I agree. I assumed it would be enough to always write to the
> local CPUs buffer, but as you explained, that caused other problems
> that you had to work around.
>
> Thanks for your detailed outline of your ideas, I think we're getting
> much closer to a solution now!
>
>
>> New approach, call it Single system wide buffer.
>>
>
> I like the approach in general, but would prefer to have it more generic,
> so that other subsystems can use the same infrastructure if necessary:
>
I believe Carl's thoughts on implementation would result in
model-specific buffer create/destroy routines, to be invoked by the
generic routine. This would give the flexibility for architectures to
implement their own unique buffering mechanism. I don't think it would
be worthwhile to go beyond this level in trying to build some generic
mechanism since the generic mechanism already exists and has been
sufficient for all other architectures.
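In other words, something like the following additions (hypothetical
names) to struct oprofile_operations in include/linux/oprofile.h:

/* allocate/free a model-specific sample buffer; both optional */
int (*create_aux_buffer)(void);
void (*destroy_aux_buffer)(void);

which the generic oprofile_setup()/oprofile_shutdown() would invoke
next to the existing cpu buffer allocation, e.g.:

if (oprofile_ops.create_aux_buffer
    && (err = oprofile_ops.create_aux_buffer()))
        goto out;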
>
>> In arch/powerpc/oprofile/cell/spu_profiler.c,
>> - Create a function to return the per cpu buffer size.
>>
>> - Create a single system wide buffer (call it SPU_BUFFER) to store the
>> SPU context switch and SPU PC data. The buffer would be an array of
>> unsigned longs. Currently the cpu buffer is a struct containing two
>> unsigned longs. The function would be called in function
>> start_spu_profiling() as part of starting an SPU profiling session. The
>> size of the buffer would be based on the size of the per cpu buffers * a
>> scale factor for handling all of the spu data.
>>
>
> I think the dependency on the size of the per cpu buffers is a bit
> artificial. How about making this independently tunable with
> another oprofilefs file and a reasonable default?
> You could either create the extra files in oprofile_create_files
> or in oprofile_ops.create_files.
>
With the right scaling factor, we should be able to avoid the buffer
overflows for the most part. Adding a new file in oprofilefs for an
arch-specific buffer size would have to be documented on the user side,
as well as providing a new option on the opcontrol command to adjust the
value. Granted, the system-wide buffer that Carl is proposing for Cell
clashes with the "cpu buffer" construct, but some other architecture in
the future may want to make use of these new model-specific buffer
management routines, and perhaps their buffers *would* be per-cpu. I'm
not sure there would be much value add in exposing this implementation
detail to the user as long as we can effectively use the existing user
interface mechanism for managing buffer size.
>
>> - Create a function in arch/powerpc/oprofile/cell/spu_profiler.c that
>> would be periodically called via a delayed workqueue call to flush
>> SPU_BUFFER to the kernel buffer. It would have to call the routine to
>> get the kernel buffer mutex. Grab the lock protecting writes to the
>> SPU_BUFFER, flush the data to the kernel buffer and then release the
>> locks.
>>
>> - Create a function to store data into the SPU_BUFFER. It will be
>> managed by a lock to ensure atomic access. If the buffer fills, data
>> will be dropped on the floor as is currently done when the per cpu
>> buffer fills. The cpu_buf->sample_lost_overflow stats would be
>> incremented, as if we were using them, so the user would know that the
>> buffer was not big enough. We would keep the fact that we really are
>> not using per cpu buffers transparent to the user. The idea is the
>> buffer size management would all look the same to the user.
>>
>> In the drivers/oprofile/buffer_sync.c file:
>> - Create a function to obtain the kernel buffer mutex lock.
>> - Create a second function to release the kernel buffer mutex lock.
>> - Make these two functions visible to the architecture specific code by
>> exporting them in include/linux/oprofile.h.
>>
>
> As a general design principle, I'd always try to have locks close
> to the data. What this means is that I think the SPU_BUFFER should
> be in drivers/oprofile, similar to the cpu_buffer. In that case,
> it may be better to change the name to something more generic, e.g.
> aux_buffer.
>
I agree -- in principle -- however, I really don't think a system-wide
buffer is something we want to put into generic code. It is not
generally useful. Also, putting this new code in drivers/oprofile would
not fit well with the general technique of adding model-specific
function pointers to 'struct oprofile_operations'.
> Similar, moving data into the aux_buffer can be done under a spinlock
> that is defined in the same file, with an interface like the
> oprofile_add_value() function from your V3 patch, but moving the
> add_value_lock in there.
>
>
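As a reference point, a rough sketch of how the pieces proposed in the
quoted list could fit together. buffer_mutex already exists in the
oprofile core and add_event_entry() is currently private to
drivers/oprofile; the two mutex helpers are the proposed additions, and
every other name is illustrative rather than an existing interface.

/* In drivers/oprofile/buffer_sync.c -- the two proposed helpers. */
void oprofile_get_buffer_mutex(void)
{
	mutex_lock(&buffer_mutex);
}

void oprofile_release_buffer_mutex(void)
{
	mutex_unlock(&buffer_mutex);
}

/* In arch/powerpc/oprofile/cell/spu_profiler.c -- names illustrative. */
static unsigned long *spu_buffer;		/* single system-wide buffer */
static unsigned long spu_buffer_head;		/* next free slot */
static unsigned long spu_buffer_entries;	/* capacity */
static unsigned long spu_samples_lost;
static DEFINE_SPINLOCK(spu_buffer_lock);

/* Store one value; on overflow, drop it and account for the loss as the
 * per cpu buffers do, so the user can see the buffer was too small.
 */
static void spu_buffer_add_value(unsigned long value)
{
	unsigned long flags;

	spin_lock_irqsave(&spu_buffer_lock, flags);
	if (spu_buffer_head < spu_buffer_entries)
		spu_buffer[spu_buffer_head++] = value;
	else
		spu_samples_lost++;	/* would feed cpu_buf->sample_lost_overflow */
	spin_unlock_irqrestore(&spu_buffer_lock, flags);
}

static void spu_buffer_flush(struct work_struct *work);
static DECLARE_DELAYED_WORK(spu_flush_work, spu_buffer_flush);

static void spu_buffer_flush(struct work_struct *work)
{
	unsigned long flags;
	unsigned long i;

	/* Kernel buffer mutex first, then the SPU_BUFFER spinlock. */
	oprofile_get_buffer_mutex();
	spin_lock_irqsave(&spu_buffer_lock, flags);
	for (i = 0; i < spu_buffer_head; i++)
		add_event_entry(spu_buffer[i]);	/* would also need exporting */
	spu_buffer_head = 0;
	spin_unlock_irqrestore(&spu_buffer_lock, flags);
	oprofile_release_buffer_mutex();

	if (spu_prof_running)
		schedule_delayed_work(&spu_flush_work, HZ / 10);
}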
>> Pros of a single system-wide buffer:
>> 1) It would not require changing existing architecture-independent code
>> to add the special data to a CPU buffer, to move the special data to the
>> kernel buffer, or to remove the code that was added so that the SPU
>> profiling code does not enable the creation of the per cpu buffers.
>> In summary: minimal changes to architecture-independent code.
>>
>
> I don't think that's a requirement. I don't mind doing changes to
> spufs or oprofile common code at all, as long as the change is
> reasonable. This is especially true when there are potentially
> other users that can benefit from the change.
>
This assumes we can get the oprofile maintainer to review and comment on
the new patch, which has been difficult to do lately. I agree that we
should not avoid making changes to common code when there's a good
justification for it, but I would say that this is not such a case.
-Maynard
>
>> 2) The SPU_BUFFER size would be managed using the same user controls and
>> feedback as the per cpu buffers. The goal would be to make buffer
>> management look to the user as if we were actually using cpu buffers.
>>
>
> As mentioned above, I'd see this as a disadvantage.
>
>
>> 3) I think the overall kernel data storage space would be less.
>> Currently, I have to increase the size of the cpu buffers, done in user
>> land, by 8 times to handle the SPU data. Since each entry in the
>> SPU_BUFFER would be one unsigned long instead of two unsigned longs for
>> the cpu buffer, we would save 2x here. We would not need an escape
>> sequence entry before each data sample, which would save an additional
>> 2x in space.
>>
>
> right
>
>
>> 4) Do not have to manage mapping of SPUs to CPU buffers. We don't care
>> what the cpu or spu numbers are.
>>
>
> right
>
>
>> 5) I think the memory storage could be adjusted better for the SPU event
>> profiling that I am working on.
>>
>
> right.
>
>
>> Cons
>> 1) Currently OProfile does not support CPU hotplug. Not sure how easy
>> it would be or what issues there may be in supporting CPU hotplug with
>> this approach. Can't really say if it would be easier/harder than with
>> per cpu buffers.
>>
>
> I think it would be harder, but if oprofile doesn't support it anyway,
> let's not worry about it.
>
>
>> 2) I think the cpu_buf stats are allocated when the module loads. I
>> think it is just the per cpu buffer allocation that is done dynamically
>> when the daemon starts and shuts down. I have not investigated this
>> enough to say for sure. If this is not the case, then we may need to
>> tweak the generic oprofile code a bit more.
>>
>
> Both the event_buffer and the cpu_buffers are allocated in oprofile_setup
> when the event buffer file is opened. When you have another buffer besides
> the cpu_buffers, you can either change the oprofile_setup function to
> allocate it, or do it in your setup() callback.
>
>
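A minimal sketch of the setup() route, reusing the illustrative names
from the earlier sketches: setup and shutdown are real members of struct
oprofile_operations, everything else is assumed.

#include <linux/vmalloc.h>

/* Allocate the system-wide buffer when the daemon starts the session,
 * free it on shutdown; sketch only.
 */
static int cell_setup(void)
{
	spu_buffer = vmalloc(spu_buffer_size * sizeof(unsigned long));
	if (!spu_buffer)
		return -ENOMEM;
	spu_buffer_head = 0;
	return 0;
}

static void cell_shutdown(void)
{
	vfree(spu_buffer);
	spu_buffer = NULL;
}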
>> 3) There is probably some other gotcha out there I have not thought of
>> yet.
>>
>
> None that I can think of.
>
> Arnd <><
>
On Wednesday 11 June 2008, Maynard Johnson wrote:
> >
> > I like the approach in general, but would prefer to have it more generic,
> > so that other subsystems can use the same infrastructure if necessary:
> >
> I believe Carl's thoughts on implementation would result in
> model-specific buffer create/destroy routines, to be invoked by the
> generic routine. This would give the flexibility for architectures to
> implement their own unique buffering mechanism. I don't think it would
> be worthwhile to go beyond this level in trying to build some generic
> mechanism since the generic mechanism already exists and has been
> sufficient for all other architectures.
Well, the amount of code should be the same either way. We currently
have an interface between the cpu_buffer and its users, while the
interface between the event_buffer and the cpu_buffer is private
to the oprofile layer.
The logical extension of this would be to have another buffer in
common oprofile code, and have its users in the architecture code,
rather than interface the architecture with the private event_buffer.
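To make that alternative concrete, the interface visible to the
architecture under this scheme might be no more than a few declarations
in include/linux/oprofile.h, with the buffer, its lock, and the flush
into the event_buffer all private to drivers/oprofile. A sketch of a
possible interface, not an existing one:

/* Hypothetical additions to include/linux/oprofile.h: the aux_buffer,
 * its spinlock, and the flush to the event buffer would all live in
 * drivers/oprofile, out of sight of the architecture code.
 */
int oprofile_aux_buffer_alloc(unsigned long entries);
void oprofile_aux_buffer_free(void);
void oprofile_add_aux_value(unsigned long value);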
> > I think the dependency on the size of the per cpu buffers is a bit
> > artificial. How about making this independently tunable with
> > another oprofilefs file and a reasonable default?
> > You could either create the extra files in oprofile_create_files
> > or in oprofile_ops.create_files.
> >
> With the right scaling factor, we should be able to avoid the buffer
> overflows for the most part. Adding a new file in oprofilefs for an
> arch-specific buffer size would have to be documented on the user side,
> as well as providing a new option on the opcontrol command to adjust the
> value. Granted, the system-wide buffer that Carl is proposing for Cell
> clashes with the "cpu buffer" construct, but some other architecture in
> > the future may want to make use of these new model-specific buffer
> > management routines, and perhaps its buffers *would* be per-cpu. I'm
> not sure there would be much value add in exposing this implementation
> detail to the user as long as we can effectively use the existing user
> interface mechanism for managing buffer size.
But we're already exposing the implementation detail for the cpu buffers,
which are similar enough, just not the same. Since the file and opcontrol
option are called cpu-buffer-size, it's rather unobvious how this should
control the size of a global buffer. As I mentioned, with a reasonable
default, you should not need to tune this anyway.
Changing opcontrol is a real disadvantage, but if a user needs to tune
this without a new opcontrol tool, it's always possible to directly
write to the file system.
> > As a general design principle, I'd always try to have locks close
> > to the data. What this means is that I think the SPU_BUFFER should
> > be in drivers/oprofile, similar to the cpu_buffer. In that case,
> > it may be better to change the name to something more generic, e.g.
> > aux_buffer.
> >
> I agree -- in principle -- however, I really don't think a system-wide
> buffer is something we want to put into generic code. It is not
> generally useful. Also, putting this new code in drivers/oprofile would
> not fit well with the general technique of adding model-specific
> function pointers to 'struct oprofile_operations'.
> This assumes we can get the oprofile maintainer to review and comment on
> the new patch, which has been difficult to do lately. I agree that we
> should not avoid making changes to common code when there's a good
> justification for it, but I would say that this is not such a case.
You have to change the oprofile code either way, because the existing
interface is not sufficient. However, there is no point in trying to do
a minimal change to the existing code when a larger change clearly gives
us better resulting code.
Arnd <><
On Thu, 2008-06-12 at 14:31 +0200, Arnd Bergmann wrote:
> On Wednesday 11 June 2008, Maynard Johnson wrote:
>
> > >
> > > I like the approach in general, but would prefer to have it more generic,
> > > so that other subsystems can use the same infrastructure if necessary:
> > >
> > I believe Carl's thoughts on implementation would result in
> > model-specific buffer create/destroy routines, to be invoked by the
> > generic routine. This would give the flexibility for architectures to
> > implement their own unique buffering mechanism. I don't think it would
> > be worthwhile to go beyond this level in trying to build some generic
> > mechanism since the generic mechanism already exists and has been
> > sufficient for all other architectures.
>
> Well, the amount of code should be the same either way. We currently
> have an interface between the cpu_buffer and its users, while the
> interface between the event_buffer and the cpu_buffer is private
> to the oprofile layer.
>
> The logical extension of this would be to have another buffer in
> common oprofile code, and have its users in the architecture code,
> rather than interface the architecture with the private event_buffer.
>
> > > I think the dependency on the size of the per cpu buffers is a bit
> > > artificial. How about making this independently tunable with
> > > another oprofilefs file and a reasonable default?
> > > You could either create the extra files in oprofile_create_files
> > > or in oprofile_ops.create_files.
> > >
> > With the right scaling factor, we should be able to avoid the buffer
> > overflows for the most part. Adding a new file in oprofilefs for an
> > arch-specific buffer size would have to be documented on the user side,
> > as well as providing a new option on the opcontrol command to adjust the
> > value. Granted, the system-wide buffer that Carl is proposing for Cell
> > clashes with the "cpu buffer" construct, but some other architecture in
> > the future may want to make use of this new model-specific buffer
> > management routines and perhaps their buffers *would* be per-cpu. I'm
> > not sure there would be much value add in exposing this implementation
> > detail to the user as long as we can effectively use the existing user
> > interface mechanism for managing buffer size.
>
> But we're already exposing the implementation detail for the cpu buffers,
> which are similar enough, just not the same. Since the file and opcontrol
> option are called cpu-buffer-size, it's rather unobvious how this should
> control the size of a global buffer. As I mentioned, with a reasonable
> default, you should not need to tune this anyway.
> Changing opcontrol is a real disadvantage, but if a user needs to tune
> this without a new opcontrol tool, it's always possible to directly
> write to the file system.
Not exactly. First of all, I have to have a special escape sequence
header to say that the following data is of a special type. In the case
of the SPU PC data samples it means I use up twice as much memory, one
entry for the header and one for the data. This is true for each data
sample. Secondly, there is no locking of the CPU buffer when data is
removed. Hence the need for the extra code in the sync_buffer routine
to be very careful to make sure the entire sequence of header plus data
was written to the per CPU buffer before removing it. If we did have a
dedicated system-wide buffer, we could eliminate the escape sequence
overhead in the storage space requirement.

Since there are 4x as many SPUs as CPUs, the per-system buffer size
would need to be 4*NUMCPUS*per-cpu-buffer-size. We need to have a way
for the arch code to tell OProfile at buffer creation time to create
the system-wide buffer, the per-cpu buffers, or possibly both for
generality. SPU profiling will not use cpu buffers. Please note you
cannot do any CPU profiling (event counter based or timer based) while
doing SPU profiling. In theory, if an architecture wanted to use both a
per-cpu buffer and a system-wide buffer, we would need to be able to
tune the sizes independently. Now we are adding a lot more complexity
to make something very general. Currently there is only one
architecture that needs this feature, and the probability of another
arch needing it is low. I say we keep it simple until we know that we
need a more complex general solution.
The whole point is the user would not have to know that they were
manipulating a system-wide buffer. From a black-box perspective it
would act as if per cpu buffers were being manipulated.

We have to allow the size of the system-wide buffer to be tuned for the
same reason that the per cpu buffers have to be tunable: the number of
samples collected is a function of the frequency at which the event
occurs. If you make the buffer fixed and large enough that it will
always be big enough, then you will be using a lot of kernel memory to
cover a worst case when in fact most of the time that memory will not
get used.
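In code terms, the sizing rule above amounts to something like the
following sketch. fs_cpu_buffer_size is the assumed name of the oprofile
variable behind the existing cpu-buffer-size setting, and the factor of
4 reflects Cell having four times as many SPUs as logical CPUs.

#include <linux/cpumask.h>

/* Default capacity for the system-wide buffer, derived from the
 * existing per cpu buffer tunable; sketch only.
 */
static unsigned long default_spu_buffer_entries(void)
{
	return 4 * num_possible_cpus() * fs_cpu_buffer_size;
}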
>
> > > As a general design principle, I'd always try to have locks close
> > > to the data. What this means is that I think the SPU_BUFFER should
> > > be in drivers/oprofile, similar to the cpu_buffer. In that case,
> > > it may be better to change the name to something more generic, e.g.
> > > aux_buffer.
> > >
> > I agree -- in principle -- however, I really don't think a system-wide
> > buffer is something we want to put into generic code. It is not
> > generally useful. Also, putting this new code in drivers/oprofile would
> > not fit well with the general technique of adding model-specific
> > function pointers to 'struct oprofile_operations'.
>
> > This assumes we can get the oprofile maintainer to review and comment on
> > the new patch, which has been difficult to do lately. I agree that we
> > should not avoid making changes to common code when there's a good
> > justification for it, but I would say that this is not such a case.
>
> You have to change the oprofile code either way, because the existing
> interface is not sufficient. However, there is no point in trying to do
> a minimal change to the existing code when a larger change clearly gives
> us better resulting code.
I don't completely agree here. See the comment above about adding
complexity when it is not generally needed. Secondly, adding the
system-wide buffer into the generic code would encourage people to use
it when they should use per cpu buffers to get better scalability with
the number of processors.
>
> Arnd <><