LinuxLists.cc - [PATCH 1/2] Cell OProfile: SPU mutex lock fix, version 4

2008-08-01 20:02:50

Subject: [PATCH 1/2] Cell OProfile: SPU mutex lock fix, version 4

Version 4 of the SPU mutex lock fix.

The issue is the SPU code is not holding the kernel mutex lock while
adding samples to the kernel buffer.

This patch creates per SPU buffers to hold the data. Data
is added to the buffers from interrupt context. The data
is periodically pushed to the kernel buffer via a new OProfile
function oprofile_put_buff(). The oprofile_put_buff() function
is called via a work queue, enabling the function to acquire the
mutex lock.

The existing user controls for adjusting the per CPU buffer
size are used to control the size of the per SPU buffers.
Similarly, overflows of the SPU buffers are reported by
incrementing the per CPU buffer stats. This eliminates the
need for architecture-specific controls for the per SPU
buffers, which are not acceptable to the OProfile user tool
maintainer.

The export of the oprofile add_event_entry() is removed as it
is no longer needed given this patch.

Note, this patch has not addressed the issue of indexing arrays
by the spu number. This still needs to be fixed as the spu
numbering is not guaranteed to be 0 to max_num_spus-1.

Signed-off-by: Carl Love <[email protected]>
Signed-off-by: Maynard Johnson <[email protected]>

Index: Cell_kernel_6_26_2008/drivers/oprofile/cpu_buffer.c
===================================================================
--- Cell_kernel_6_26_2008.orig/drivers/oprofile/cpu_buffer.c
+++ Cell_kernel_6_26_2008/drivers/oprofile/cpu_buffer.c
@@ -37,11 +37,24 @@ static int work_enabled;
void free_cpu_buffers(void)
{
int i;
-
+
for_each_online_cpu(i)
vfree(per_cpu(cpu_buffer, i).buffer);
}

+unsigned long oprofile_get_cpu_buffer_size(void)
+{
+ return fs_cpu_buffer_size;
+}
+
+void oprofile_cpu_buffer_inc_smpl_lost(void)
+{
+ struct oprofile_cpu_buffer *cpu_buf
+ = &__get_cpu_var(cpu_buffer);
+
+ cpu_buf->sample_lost_overflow++;
+}
+
int alloc_cpu_buffers(void)
{
int i;
Index: Cell_kernel_6_26_2008/include/linux/oprofile.h
===================================================================
--- Cell_kernel_6_26_2008.orig/include/linux/oprofile.h
+++ Cell_kernel_6_26_2008/include/linux/oprofile.h
@@ -84,13 +84,6 @@ int oprofile_arch_init(struct oprofile_o
void oprofile_arch_exit(void);

/**
- * Add data to the event buffer.
- * The data passed is free-form, but typically consists of
- * file offsets, dcookies, context information, and ESCAPE codes.
- */
-void add_event_entry(unsigned long data);
-
-/**
* Add a sample. This may be called from any context. Pass
* smp_processor_id() as cpu.
*/
@@ -160,5 +153,14 @@ int oprofilefs_ulong_from_user(unsigned

/** lock for read/write safety */
extern spinlock_t oprofilefs_lock;
+
+/**
+ * Add the contents of a circular buffer to the event buffer.
+ */
+void oprofile_put_buff(unsigned long *buf,unsigned int start,
+ unsigned int stop, unsigned int max);
+
+unsigned long oprofile_get_cpu_buffer_size(void);
+void oprofile_cpu_buffer_inc_smpl_lost(void);

#endif /* OPROFILE_H */
Index: Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- Cell_kernel_6_26_2008.orig/arch/powerpc/oprofile/cell/spu_profiler.c
+++ Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -23,12 +23,11 @@

static u32 *samples;

-static int spu_prof_running;
+int spu_prof_running;
static unsigned int profiling_interval;

#define NUM_SPU_BITS_TRBUF 16
#define SPUS_PER_TB_ENTRY 4
-#define SPUS_PER_NODE 8

#define SPU_PC_MASK 0xFFFF

@@ -208,6 +207,7 @@ int start_spu_profiling(unsigned int cyc

spu_prof_running = 1;
hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+ schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);

return 0;
}
Index: Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- Cell_kernel_6_26_2008.orig/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -35,7 +35,106 @@ static DEFINE_SPINLOCK(buffer_lock);
static DEFINE_SPINLOCK(cache_lock);
static int num_spu_nodes;
int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+int last_guard_val[MAX_NUMNODES * SPUS_PER_NODE];
+static int spu_ctx_sw_seen[MAX_NUMNODES * SPUS_PER_NODE];
+static int sync_start_registered;
+static int delayed_work_init;
+
+struct spu_buffers spu_buff;
+struct delayed_work spu_work;
+static unsigned max_spu_buff;
+
+static void spu_buff_add(unsigned long int value, int spu)
+{
+ /* spu buff is a circular buffer. Add entries to the
+ * head. Head is the index to store the next value.
+ * The buffer is full when there is one available entry
+ * in the queue, i.e. head and tail can't be equal.
+ * That way we can tell the difference between the
+ * buffer being full versus empty.
+ *
+ * ASSUPTION: the buffer_lock is held when this function
+ * is called to lock the buffer, head and tail.
+ */
+ int full = 1;
+
+ if (spu_buff.index[spu].head >= spu_buff.index[spu].tail) {
+ if ((spu_buff.index[spu].head - spu_buff.index[spu].tail)
+ < (max_spu_buff - 1))
+ full = 0;
+
+ } else if (spu_buff.index[spu].tail > spu_buff.index[spu].head) {
+ if ((spu_buff.index[spu].tail - spu_buff.index[spu].head)
+ > 1)
+ full = 0;
+ }
+
+ if (!full) {
+ spu_buff.buff[spu][spu_buff.index[spu].head] = value;
+ spu_buff.index[spu].head++;
+
+ if (spu_buff.index[spu].head >= max_spu_buff)
+ spu_buff.index[spu].head = 0;
+ } else {
+ /* From the user's perspective make the SPU buffer
+ * size management/overflow look like we are using
+ * per cpu buffers. The user uses the same
+ * per cpu parameter to adjust the SPU buffer size.
+ * Increment the sample_lost_overflow to inform
+ * the user the buffer size needs to be increased.
+ */
+ oprofile_cpu_buffer_inc_smpl_lost();
+ }
+}
+
+/* This function copies the per SPU buffers to the
+ * OProfile kernel buffer.
+ */
+void sync_spu_buff(void)
+{
+ int spu;
+ int flags;
+ int curr_head;
+
+ for (spu=0; spu < num_spu_nodes; spu++) {
+ /* In case there was an issue and the buffer didn't
+ * get created skip it.
+ */
+ if (spu_buff.buff[spu] == NULL)
+ continue;
+
+ /* Hold the lock to make sure the head/tail
+ * doesn't change while spu_buff_add() is
+ * deciding if the buffer is full or not.
+ * Being a little paranoid.
+ */
+ spin_lock_irqsave(&buffer_lock, flags);
+ curr_head = spu_buff.index[spu].head;
+ spin_unlock_irqrestore(&buffer_lock, flags);
+
+ /* Transfer the current contents to the kernel buffer.
+ * data can still be added to the head of the buffer.
+ */
+ oprofile_put_buff(spu_buff.buff[spu],
+ spu_buff.index[spu].tail,
+ curr_head, max_spu_buff);
+
+ spin_lock_irqsave(&buffer_lock, flags);
+ spu_buff.index[spu].tail = curr_head;
+ spin_unlock_irqrestore(&buffer_lock, flags);
+ }
+
+}
+
+static void wq_sync_spu_buff(struct work_struct *work)
+{
+ /* move data from spu buffers to kernel buffer */
+ sync_spu_buff();
+
+ /* only reschedule if profiling is not done */
+ if (spu_prof_running)
+ schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+}

/* Container for caching information about an active SPU task. */
struct cached_info {
@@ -305,14 +404,21 @@ static int process_context_switch(struct

/* Record context info in event buffer */
spin_lock_irqsave(&buffer_lock, flags);
- add_event_entry(ESCAPE_CODE);
- add_event_entry(SPU_CTX_SWITCH_CODE);
- add_event_entry(spu->number);
- add_event_entry(spu->pid);
- add_event_entry(spu->tgid);
- add_event_entry(app_dcookie);
- add_event_entry(spu_cookie);
- add_event_entry(offset);
+ spu_buff_add(ESCAPE_CODE, spu->number);
+ spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
+ spu_buff_add(spu->number, spu->number);
+ spu_buff_add(spu->pid, spu->number);
+ spu_buff_add(spu->tgid, spu->number);
+ spu_buff_add(app_dcookie, spu->number);
+ spu_buff_add(spu_cookie, spu->number);
+ spu_buff_add(offset, spu->number);
+
+ /* Set flag to indicate SPU PC data can now be written out. If
+ * the SPU program counter data is seen before an SPU context
+ * record is seen, the postprocessing will fail.
+ */
+ spu_ctx_sw_seen[spu->number] = 1;
+
spin_unlock_irqrestore(&buffer_lock, flags);
smp_wmb(); /* insure spu event buffer updates are written */
/* don't want entries intermingled... */
@@ -360,6 +466,49 @@ static int number_of_online_nodes(void)
return nodes;
}

+static int oprofile_spu_buff_create(void)
+{
+ int spu;
+
+ max_spu_buff = oprofile_get_cpu_buffer_size();
+
+ for (spu = 0; spu < num_spu_nodes; spu++) {
+ /* create circular buffers to store the data in.
+ * use locks to manage accessing the buffers
+ */
+ spu_buff.index[spu].head = 0;
+ spu_buff.index[spu].tail = 0;
+
+ /*
+ * Create a buffer for each SPU. Can't reliably
+ * create a single buffer for all spus due to not
+ * enough contiguous kernel memory.
+ */
+
+ spu_buff.buff[spu] = kzalloc((max_spu_buff
+ * sizeof(unsigned long)),
+ GFP_KERNEL);
+
+ if (!spu_buff.buff[spu]) {
+ printk(KERN_ERR "SPU_PROF: "
+ "%s, line %d: oprofile_spu_buff_create " \
+ "failed to allocate spu buffer %d.\n",
+ __FUNCTION__, __LINE__, spu);
+
+ /* release the spu buffers that have been allocated */
+ while (spu >= 0) {
+ if (spu_buff.buff[spu]) {
+ kfree(spu_buff.buff[spu]);
+ spu_buff.buff[spu] = 0;
+ }
+ spu--;
+ }
+ return 1;
+ }
+ }
+ return 0;
+}
+
/* The main purpose of this function is to synchronize
* OProfile with SPUFS by registering to be notified of
* SPU task switches.
@@ -372,30 +521,50 @@ static int number_of_online_nodes(void)
*/
int spu_sync_start(void)
{
- int k;
+ int spu;
int ret = SKIP_GENERIC_SYNC;
int register_ret;
unsigned long flags = 0;

spu_prof_num_nodes = number_of_online_nodes();
num_spu_nodes = spu_prof_num_nodes * 8;
+ delayed_work_init = 0;
+
+ /* create buffer for storing the SPU data to put in
+ * the kernel buffer.
+ */
+ if (oprofile_spu_buff_create()) {
+ ret = -ENOMEM;
+ sync_start_registered = 0;
+ goto out;
+ }

spin_lock_irqsave(&buffer_lock, flags);
- add_event_entry(ESCAPE_CODE);
- add_event_entry(SPU_PROFILING_CODE);
- add_event_entry(num_spu_nodes);
+ for (spu = 0; spu < num_spu_nodes; spu++) {
+ spu_buff_add(ESCAPE_CODE, spu);
+ spu_buff_add(SPU_PROFILING_CODE, spu);
+ spu_buff_add(num_spu_nodes, spu);
+ }
spin_unlock_irqrestore(&buffer_lock, flags);

+ for (spu = 0; spu < num_spu_nodes; spu++) {
+ spu_ctx_sw_seen[spu] = 0;
+ last_guard_val[spu] = 0;
+ }
+
+ INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
+ delayed_work_init = 1;
+
/* Register for SPU events */
register_ret = spu_switch_event_register(&spu_active);
if (register_ret) {
+ sync_start_registered = 0;
ret = SYNC_START_ERROR;
goto out;
}

- for (k = 0; k < (MAX_NUMNODES * 8); k++)
- last_guard_val[k] = 0;
pr_debug("spu_sync_start -- running.\n");
+ sync_start_registered = 1;
out:
return ret;
}
@@ -452,7 +621,14 @@ void spu_sync_buffer(int spu_num, unsign
break;
}

- add_event_entry(file_offset | spu_num_shifted);
+ /* We must ensure that the SPU context switch has been written
+ * out before samples for the SPU. Otherwise, the SPU context
+ * information is not available and the postprocessing of the
+ * SPU PC will fail with no available anonymous map information.
+ */
+ if (spu_ctx_sw_seen[spu_num])
+ spu_buff_add((file_offset | spu_num_shifted),
+ spu_num);
}
spin_unlock(&buffer_lock);
out:
@@ -463,20 +639,47 @@ out:
int spu_sync_stop(void)
{
unsigned long flags = 0;
- int ret = spu_switch_event_unregister(&spu_active);
- if (ret) {
- printk(KERN_ERR "SPU_PROF: "
- "%s, line %d: spu_switch_event_unregister returned %d\n",
- __func__, __LINE__, ret);
- goto out;
+ int ret=0;
+ int k;
+
+ if (sync_start_registered) {
+ ret = spu_switch_event_unregister(&spu_active);
+
+ if (ret)
+ printk(KERN_ERR "SPU_PROF: "
+ "%s, line %d: spu_switch_event_unregister " \
+ "returned %d\n",
+ __func__, __LINE__, ret);
}

+ /* flush any remaining data in the per SPU buffers */
+ sync_spu_buff();
+
spin_lock_irqsave(&cache_lock, flags);
ret = release_cached_info(RELEASE_ALL);
spin_unlock_irqrestore(&cache_lock, flags);
-out:
+
+ /* remove scheduled work queue item rather then waiting
+ * for every queued entry to execute. Then flush pending
+ * system wide buffer to event buffer. Only try to
+ * remove if it was scheduled. Get kernel errors otherwise.
+ */
+ if (delayed_work_init)
+ cancel_delayed_work(&spu_work);
+
+ for (k = 0; k < num_spu_nodes; k++) {
+ spu_ctx_sw_seen[k] = 0;
+
+ /* spu_sys_buff will be null if there was a problem
+ * allocating the buffer. Only delete if it exists.
+ */
+
+ if (spu_buff.buff[k]) {
+ kfree(spu_buff.buff[k]);
+ spu_buff.buff[k] = 0;
+ }
+ }
pr_debug("spu_sync_stop -- done.\n");
return ret;
}

-
Index: Cell_kernel_6_26_2008/drivers/oprofile/event_buffer.h
===================================================================
--- Cell_kernel_6_26_2008.orig/drivers/oprofile/event_buffer.h
+++ Cell_kernel_6_26_2008/drivers/oprofile/event_buffer.h
@@ -17,6 +17,13 @@ int alloc_event_buffer(void);

void free_event_buffer(void);

+/**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+
/* wake up the process sleeping on the event file */
void wake_up_buffer_waiter(void);

Index: Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/pr_util.h
===================================================================
--- Cell_kernel_6_26_2008.orig/arch/powerpc/oprofile/cell/pr_util.h
+++ Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/pr_util.h
@@ -24,6 +24,11 @@
#define SKIP_GENERIC_SYNC 0
#define SYNC_START_ERROR -1
#define DO_GENERIC_SYNC 1
+#define SPUS_PER_NODE 8
+#define DEFAULT_TIMER_EXPIRE (HZ / 10)
+
+extern struct delayed_work spu_work;
+extern int spu_prof_running;

struct spu_overlay_info { /* map of sections within an SPU overlay */
unsigned int vma; /* SPU virtual memory address from elf */
@@ -62,6 +67,20 @@ struct vma_to_fileoffset_map { /* map of

};

+/* Put head and tail for the spu buffer into a structure to keep
+ * them in the same cache line.
+ */
+struct head_tail {
+ unsigned int head;
+ unsigned int tail;
+};
+
+struct spu_buffers {
+ unsigned long *buff[MAX_NUMNODES * SPUS_PER_NODE];
+ struct head_tail index[MAX_NUMNODES * SPUS_PER_NODE];
+};
+
+
/* The three functions below are for maintaining and accessing
* the vma-to-fileoffset map.
*/
Index: Cell_kernel_6_26_2008/drivers/oprofile/buffer_sync.c
===================================================================
--- Cell_kernel_6_26_2008.orig/drivers/oprofile/buffer_sync.c
+++ Cell_kernel_6_26_2008/drivers/oprofile/buffer_sync.c
@@ -551,3 +551,36 @@ void sync_buffer(int cpu)

mutex_unlock(&buffer_mutex);
}
+
+/* The function can be used to add a buffer worth of data directly to
+ * the kernel buffer. The buffer is assumed to be a circular buffer.
+ * Take the entries from index start up to, but not including, index
+ * stop, wrapping at max.
+ */
+void oprofile_put_buff(unsigned long *buf, unsigned int start,
+ unsigned int stop, unsigned int max)
+{
+ int i;
+
+ /* Check the args */
+ if (stop > max) {
+ printk(KERN_ERR "oprofile: oprofile_put_buff(), illegal "\
+ "arguments; stop greater then max."\
+ " stop = %u, max = %u.\n",
+ stop, max);
+ return;
+ }
+
+ i = start;
+
+ mutex_lock(&buffer_mutex);
+ while (i != stop) {
+ add_event_entry(buf[i++]);
+
+ if (i >= max)
+ i = 0;
+ }
+
+ mutex_unlock(&buffer_mutex);
+}
+

2008-08-08 16:08:40

by Arnd Bergmann

[permalink] [raw]

Subject: Re: [PATCH 1/2] Cell OProfile: SPU mutex lock fix, version 4

On Friday 01 August 2008, Carl Love wrote:
> The issue is the SPU code is not holding the kernel mutex lock while
> adding samples to the kernel buffer.

Thanks for your patch, and sorry for not replying earlier.
It looks good from a functionality perspective, I just have
some style comments left that I hope we can address quickly.

Since this is still a bug fix (though a rather large one), I
guess we should get it into 2.6.27-rc3.

Arnd <><

> Index: Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_task_sync.c
> ===================================================================
> --- Cell_kernel_6_26_2008.orig/arch/powerpc/oprofile/cell/spu_task_sync.c
> +++ Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_task_sync.c
> @@ -35,7 +35,106 @@ static DEFINE_SPINLOCK(buffer_lock);
> static DEFINE_SPINLOCK(cache_lock);
> static int num_spu_nodes;
> int spu_prof_num_nodes;
> -int last_guard_val[MAX_NUMNODES * 8];
> +int last_guard_val[MAX_NUMNODES * SPUS_PER_NODE];
> +static int spu_ctx_sw_seen[MAX_NUMNODES * SPUS_PER_NODE];
> +static int sync_start_registered;
> +static int delayed_work_init;

You don't need the delayed_work_init variable. Just initialize
the work queue in your init function to be sure it's always
right.

I think you should also try to remove sync_start_registered,
these global state variables can easily get annoying when you
try to change something.
AFAICT, spu_sync_stop does not get called unless spu_sync_start
was successful, so the sync_start_registered variable is
redundant.

> +static int oprofile_spu_buff_create(void)
> +{
> + int spu;
> +
> + max_spu_buff = oprofile_get_cpu_buffer_size();
> +
> + for (spu = 0; spu < num_spu_nodes; spu++) {
> + /* create circular buffers to store the data in.
> + * use locks to manage accessing the buffers
> + */
> + spu_buff.index[spu].head = 0;
> + spu_buff.index[spu].tail = 0;
> +
> + /*
> + * Create a buffer for each SPU. Can't reliably
> + * create a single buffer for all spus due to not
> + * enough contiguous kernel memory.
> + */
> +
> + spu_buff.buff[spu] = kzalloc((max_spu_buff
> + * sizeof(unsigned long)),
> + GFP_KERNEL);
> +
> + if (!spu_buff.buff[spu]) {
> + printk(KERN_ERR "SPU_PROF: "
> + "%s, line %d: oprofile_spu_buff_create " \
> + "failed to allocate spu buffer %d.\n",
> + __FUNCTION__, __LINE__, spu);

The formatting of the printk line is a little unconventional. You certainly
don't need the '\' at the end of the line.
Also, please don't use __FUNCTION__ in new code, but instead use the
standard C99 __func__ symbol. The __LINE__ macro is fine.

> +
> + /* release the spu buffers that have been allocated */
> + while (spu >= 0) {
> + if (spu_buff.buff[spu]) {
> + kfree(spu_buff.buff[spu]);
> + spu_buff.buff[spu] = 0;
> + }
> + spu--;
> + }
> + return 1;
> + }
> + }
> + return 0;
> +}

The convention for a function would be to return -ENOMEM here instead
of 1.

> /* The main purpose of this function is to synchronize
> * OProfile with SPUFS by registering to be notified of
> * SPU task switches.
> @@ -372,30 +521,50 @@ static int number_of_online_nodes(void)
> */
> int spu_sync_start(void)
> {
> - int k;
> + int spu;
> int ret = SKIP_GENERIC_SYNC;
> int register_ret;
> unsigned long flags = 0;
>
> spu_prof_num_nodes = number_of_online_nodes();
> num_spu_nodes = spu_prof_num_nodes * 8;
> + delayed_work_init = 0;
> +
> + /* create buffer for storing the SPU data to put in
> + * the kernel buffer.
> + */
> + if (oprofile_spu_buff_create()) {
> + ret = -ENOMEM;
> + sync_start_registered = 0;
> + goto out;
> + }

consequently, this becomes

ret = oprofile_spu_buff_create();
if (ret)
goto out;

> -out:
> +
> + /* remove scheduled work queue item rather then waiting
> + * for every queued entry to execute. Then flush pending
> + * system wide buffer to event buffer. Only try to
> + * remove if it was scheduled. Get kernel errors otherwise.
> + */
> + if (delayed_work_init)
> + cancel_delayed_work(&spu_work);
> +
> + for (k = 0; k < num_spu_nodes; k++) {
> + spu_ctx_sw_seen[k] = 0;
> +
> + /* spu_sys_buff will be null if there was a problem
> + * allocating the buffer. Only delete if it exists.
> + */
> +
> + if (spu_buff.buff[k]) {
> + kfree(spu_buff.buff[k]);
> + spu_buff.buff[k] = 0;
> + }
> + }

The if(spu_buff.buff[k]) is redundant, kfree(NULL) is valid, so you
should simplify this.

> +/* Put head and tail for the spu buffer into a structure to keep
> + * them in the same cache line.
> + */
> +struct head_tail {
> + unsigned int head;
> + unsigned int tail;
> +};
> +
> +struct spu_buffers {
> + unsigned long *buff[MAX_NUMNODES * SPUS_PER_NODE];
> + struct head_tail index[MAX_NUMNODES * SPUS_PER_NODE];
> +};
> +

Did you measure a problem in the cache layout here? A simpler
array of

struct spu_buffer {
int last_guard_val;
int spu_ctx_sw_seen;
unsigned long *buff;
unsigned int head, tail;
};

would otherwise be more reasonable, mostly for readability.

> +/* The function can be used to add a buffer worth of data directly to
> + * the kernel buffer. The buffer is assumed to be a circular buffer.
> + * Take the entries from index start and end at index end, wrapping
> + * at max_entries.
> + */
> +void oprofile_put_buff(unsigned long *buf, unsigned int start,
> + unsigned int stop, unsigned int max)
> +{
> + int i;
> +
> + /* Check the args */
> + if (stop > max) {
> + printk(KERN_ERR "oprofile: oprofile_put_buff(), illegal "\
> + "arguments; stop greater then max."\
> + " stop = %u, max = %u.\n",
> + stop, max);
> + return;
> + }

hmm, this is not a good interface. Better return -EINVAL in case of
error, or, even better, don't call this function with invalid
arguments. Note that in the kernel, the rule is that you expect other
kernel code to be written correctly, and user space code to call you
with any combination of invalid arguments.

Arnd <><

2008-08-08 22:30:46

by Carl Love

[permalink] [raw]

Subject: Re: [PATCH 1/2] Cell OProfile: SPU mutex lock fix, version 4

On Fri, 2008-08-08 at 18:08 +0200, Arnd Bergmann wrote:
> On Friday 01 August 2008, Carl Love wrote:
> > The issue is the SPU code is not holding the kernel mutex lock while
> > adding samples to the kernel buffer.
>
> Thanks for your patch, and sorry for not replying earlier.
> It looks good from a functionality perspective, I just have
> some style comments left that I hope we can address quickly.
>
> Since this is still a bug fix (though a rather large one), I
> guess we can should get it into 2.6.27-rc3.
>
> Arnd <><
>
> > Index: Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_task_sync.c
> > ===================================================================
> > --- Cell_kernel_6_26_2008.orig/arch/powerpc/oprofile/cell/spu_task_sync.c
> > +++ Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_task_sync.c
> > @@ -35,7 +35,106 @@ static DEFINE_SPINLOCK(buffer_lock);
> > static DEFINE_SPINLOCK(cache_lock);
> > static int num_spu_nodes;
> > int spu_prof_num_nodes;
> > -int last_guard_val[MAX_NUMNODES * 8];
> > +int last_guard_val[MAX_NUMNODES * SPUS_PER_NODE];
> > +static int spu_ctx_sw_seen[MAX_NUMNODES * SPUS_PER_NODE];
> > +static int sync_start_registered;
> > +static int delayed_work_init;
>
> You don't need the delayed_work_init variable. Just initialize
> the work queue in your init function to be sure it's always
> right.
>
> I think you should also try to remove sync_start_registered,
> these global state variables can easily get annoying when you
> try to change something.
> AFAICT, spu_sync_stop does not get called unless spu_sync_start
> was successful, so the sync_start_registered variable is
> redundant.
>

I was able to remove the delayed_work_init variable with a little code
restructuring.

I worked on the sync_start_registered variable. I had put it in because
I thought that calling spu_switch_event_unregister() for a
non-registered event was giving me a kernel error. I retested this and
it does look like it is OK. So it looks safe to remove the
sync_start_registered variable.

The rest of your comments were fairly easy to address.

> > +static int oprofile_spu_buff_create(void)
> > +{
> > + int spu;
> > +
> > + max_spu_buff = oprofile_get_cpu_buffer_size();
> > +
> > + for (spu = 0; spu < num_spu_nodes; spu++) {
> > + /* create circular buffers to store the data in.
> > + * use locks to manage accessing the buffers
> > + */
> > + spu_buff.index[spu].head = 0;
> > + spu_buff.index[spu].tail = 0;
> > +
> > + /*
> > + * Create a buffer for each SPU. Can't reliably
> > + * create a single buffer for all spus due to not
> > + * enough contiguous kernel memory.
> > + */
> > +
> > + spu_buff.buff[spu] = kzalloc((max_spu_buff
> > + * sizeof(unsigned long)),
> > + GFP_KERNEL);
> > +
> > + if (!spu_buff.buff[spu]) {
> > + printk(KERN_ERR "SPU_PROF: "
> > + "%s, line %d: oprofile_spu_buff_create " \
> > + "failed to allocate spu buffer %d.\n",
> > + __FUNCTION__, __LINE__, spu);
>
> The formatting of the printk line is a little unconventional. You certainly
> don't need the '\' at the end of the line.
> Also, please don't use __FUNCTION__ in new code, but instead of the
> standard c99 __func__ symbol. The __LINE__ macro is fine.
>
> > +
> > + /* release the spu buffers that have been allocated */
> > + while (spu >= 0) {
> > + if (spu_buff.buff[spu]) {
> > + kfree(spu_buff.buff[spu]);
> > + spu_buff.buff[spu] = 0;
> > + }
> > + spu--;
> > + }
> > + return 1;
> > + }
> > + }
> > + return 0;
> > +}
>
> The convention for a function would be to return -ENOMEM here instead
> of 1.
>
> > /* The main purpose of this function is to synchronize
> > * OProfile with SPUFS by registering to be notified of
> > * SPU task switches.
> > @@ -372,30 +521,50 @@ static int number_of_online_nodes(void)
> > */
> > int spu_sync_start(void)
> > {
> > - int k;
> > + int spu;
> > int ret = SKIP_GENERIC_SYNC;
> > int register_ret;
> > unsigned long flags = 0;
> >
> > spu_prof_num_nodes = number_of_online_nodes();
> > num_spu_nodes = spu_prof_num_nodes * 8;
> > + delayed_work_init = 0;
> > +
> > + /* create buffer for storing the SPU data to put in
> > + * the kernel buffer.
> > + */
> > + if (oprofile_spu_buff_create()) {
> > + ret = -ENOMEM;
> > + sync_start_registered = 0;
> > + goto out;
> > + }
>
> consequently, this becomes
>
> ret = oprofile_spu_buff_create();
> if (ret)
> goto out;
>
>
> > -out:
> > +
> > + /* remove scheduled work queue item rather then waiting
> > + * for every queued entry to execute. Then flush pending
> > + * system wide buffer to event buffer. Only try to
> > + * remove if it was scheduled. Get kernel errors otherwise.
> > + */
> > + if (delayed_work_init)
> > + cancel_delayed_work(&spu_work);
> > +
> > + for (k = 0; k < num_spu_nodes; k++) {
> > + spu_ctx_sw_seen[k] = 0;
> > +
> > + /* spu_sys_buff will be null if there was a problem
> > + * allocating the buffer. Only delete if it exists.
> > + */
> > +
> > + if (spu_buff.buff[k]) {
> > + kfree(spu_buff.buff[k]);
> > + spu_buff.buff[k] = 0;
> > + }
> > + }
>
> The if(spu_buff.buff[k]) is redundant, kfree(NULL) is valid, so you
> should simplify this.
>
> > +/* Put head and tail for the spu buffer into a structure to keep
> > + * them in the same cache line.
> > + */
> > +struct head_tail {
> > + unsigned int head;
> > + unsigned int tail;
> > +};
> > +
> > +struct spu_buffers {
> > + unsigned long *buff[MAX_NUMNODES * SPUS_PER_NODE];
> > + struct head_tail index[MAX_NUMNODES * SPUS_PER_NODE];
> > +};
> > +
>
> Did you measure a problem in the cache layout here? A simpler
> array of
>
> struct spu_buffer {
> int last_guard_val;
> int spu_ctx_sw_seen;
> unsigned long *buff;
> unsigned int head, tail;
> };
>
> would otherwise be more reasonable, mostly for readability.
>
> > +/* The function can be used to add a buffer worth of data directly to
> > + * the kernel buffer. The buffer is assumed to be a circular buffer.
> > + * Take the entries from index start and end at index end, wrapping
> > + * at max_entries.
> > + */
> > +void oprofile_put_buff(unsigned long *buf, unsigned int start,
> > + unsigned int stop, unsigned int max)
> > +{
> > + int i;
> > +
> > + /* Check the args */
> > + if (stop > max) {
> > + printk(KERN_ERR "oprofile: oprofile_put_buff(), illegal "\
> > + "arguments; stop greater then max."\
> > + " stop = %u, max = %u.\n",
> > + stop, max);
> > + return;
> > + }
>
> hmm, this is not a good interface. Better return -EINVAL in case of
> error, or, even better, don't call this function with invalid
> arguments. Note that in the kernel, the rule is that you expect other
> kernel code to be written correctly, and user space code to call you
> with any combination of invalid arguments.
>
> Arnd <><

2008-08-08 22:40:00

by Carl Love

[permalink] [raw]

Subject: Re: [PATCH 1/2] Repost Cell OProfile: SPU mutex lock fix, version 4

Version 4 of the SPU mutex lock fix.

Updated to address Arnd's comments.

The issue is the SPU code is not holding the kernel mutex lock while
adding samples to the kernel buffer.

This patch creates per SPU buffers to hold the data. Data
is added to the buffers from interrupt context. The data
is periodically pushed to the kernel buffer via a new OProfile
function oprofile_put_buff(). The oprofile_put_buff() function
is called via a work queue, enabling the function to acquire the
mutex lock.

The existing user controls for adjusting the per CPU buffer
size are used to control the size of the per SPU buffers.
Similarly, overflows of the SPU buffers are reported by
incrementing the per CPU buffer stats. This eliminates the
need for architecture-specific controls for the per SPU
buffers, which are not acceptable to the OProfile user tool
maintainer.

The export of the oprofile add_event_entry() is removed as it
is no longer needed given this patch.

Note, this patch has not addressed the issue of indexing arrays
by the spu number. This still needs to be fixed as the spu
numbering is not guaranteed to be 0 to max_num_spus-1.

Signed-off-by: Carl Love <[email protected]>
Signed-off-by: Maynard Johnson <[email protected]>

Index: Cell_kernel_6_26_2008/drivers/oprofile/cpu_buffer.c
===================================================================
--- Cell_kernel_6_26_2008.orig/drivers/oprofile/cpu_buffer.c
+++ Cell_kernel_6_26_2008/drivers/oprofile/cpu_buffer.c
@@ -37,11 +37,24 @@ static int work_enabled;
void free_cpu_buffers(void)
{
int i;
-
+
for_each_online_cpu(i)
vfree(per_cpu(cpu_buffer, i).buffer);
}

+unsigned long oprofile_get_cpu_buffer_size(void)
+{
+ return fs_cpu_buffer_size;
+}
+
+void oprofile_cpu_buffer_inc_smpl_lost(void)
+{
+ struct oprofile_cpu_buffer *cpu_buf
+ = &__get_cpu_var(cpu_buffer);
+
+ cpu_buf->sample_lost_overflow++;
+}
+
int alloc_cpu_buffers(void)
{
int i;
Index: Cell_kernel_6_26_2008/include/linux/oprofile.h
===================================================================
--- Cell_kernel_6_26_2008.orig/include/linux/oprofile.h
+++ Cell_kernel_6_26_2008/include/linux/oprofile.h
@@ -84,13 +84,6 @@ int oprofile_arch_init(struct oprofile_o
void oprofile_arch_exit(void);

/**
- * Add data to the event buffer.
- * The data passed is free-form, but typically consists of
- * file offsets, dcookies, context information, and ESCAPE codes.
- */
-void add_event_entry(unsigned long data);
-
-/**
* Add a sample. This may be called from any context. Pass
* smp_processor_id() as cpu.
*/
@@ -160,5 +153,14 @@ int oprofilefs_ulong_from_user(unsigned

/** lock for read/write safety */
extern spinlock_t oprofilefs_lock;
+
+/**
+ * Add the contents of a circular buffer to the event buffer.
+ */
+void oprofile_put_buff(unsigned long *buf,unsigned int start,
+ unsigned int stop, unsigned int max);
+
+unsigned long oprofile_get_cpu_buffer_size(void);
+void oprofile_cpu_buffer_inc_smpl_lost(void);

#endif /* OPROFILE_H */
Index: Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- Cell_kernel_6_26_2008.orig/arch/powerpc/oprofile/cell/spu_profiler.c
+++ Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -23,12 +23,11 @@

static u32 *samples;

-static int spu_prof_running;
+int spu_prof_running;
static unsigned int profiling_interval;

#define NUM_SPU_BITS_TRBUF 16
#define SPUS_PER_TB_ENTRY 4
-#define SPUS_PER_NODE 8

#define SPU_PC_MASK 0xFFFF

@@ -208,6 +207,7 @@ int start_spu_profiling(unsigned int cyc

spu_prof_running = 1;
hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+ schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);

return 0;
}
Index: Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- Cell_kernel_6_26_2008.orig/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -35,7 +35,102 @@ static DEFINE_SPINLOCK(buffer_lock);
static DEFINE_SPINLOCK(cache_lock);
static int num_spu_nodes;
int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+
+struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
+struct delayed_work spu_work;
+static unsigned max_spu_buff;
+
+/* Add value to the per SPU circular buffer for spu.  If the buffer is
+ * full the value is dropped and the loss is reported through the per
+ * CPU sample_lost_overflow counter.
+ */
+static void spu_buff_add(unsigned long int value, int spu)
+{
+	/* spu buff is a circular buffer.  head is the index the next
+	 * value is stored at and tail is the index the next flush
+	 * starts from.  One slot is always left unused: the buffer is
+	 * full when advancing head would make it equal to tail, so
+	 * head == tail only ever means the buffer is empty.
+	 *
+	 * ASSUMPTION: the buffer_lock is held when this function
+	 * is called to lock the buffer, head and tail.
+	 */
+	struct spu_buffer *sb = &spu_buff[spu];
+	unsigned int next = sb->head + 1;
+
+	if (next >= max_spu_buff)
+		next = 0;
+
+	/* next == tail covers both the wrapped and the unwrapped
+	 * full case; it also rejects a buffer of zero or one entry,
+	 * which can never hold a value.
+	 */
+	if (next == sb->tail) {
+		/* From the user's perspective make the SPU buffer
+		 * size management/overflow look like we are using
+		 * per cpu buffers.  The user uses the same
+		 * per cpu parameter to adjust the SPU buffer size.
+		 * Increment the sample_lost_overflow to inform
+		 * the user the buffer size needs to be increased.
+		 */
+		oprofile_cpu_buffer_inc_smpl_lost();
+		return;
+	}
+
+	sb->buff[sb->head] = value;
+	sb->head = next;
+}
+
+/* Copy the contents of every per SPU buffer to the OProfile kernel
+ * event buffer.  oprofile_put_buff() takes the buffer_mutex, so this
+ * function may sleep and must not be called from interrupt context.
+ */
+void sync_spu_buff(void)
+{
+	int spu;
+	unsigned long flags;
+	unsigned int curr_head;
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* In case there was an issue and the buffer didn't
+		 * get created skip it.
+		 */
+		if (spu_buff[spu].buff == NULL)
+			continue;
+
+		/* Snapshot head under the lock so it cannot change while
+		 * spu_buff_add() is deciding if the buffer is full.
+		 */
+		spin_lock_irqsave(&buffer_lock, flags);
+		curr_head = spu_buff[spu].head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+
+		/* Transfer the entries between tail and the snapshot to
+		 * the kernel buffer.  The spinlock is dropped for this
+		 * because oprofile_put_buff() takes a mutex; new data can
+		 * still be added at head in the meantime.
+		 */
+		oprofile_put_buff(spu_buff[spu].buff,
+				  spu_buff[spu].tail,
+				  curr_head, max_spu_buff);
+
+		spin_lock_irqsave(&buffer_lock, flags);
+		spu_buff[spu].tail = curr_head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+	}
+}
+
+static void wq_sync_spu_buff(struct work_struct *work)
+{
+	/* drain the per SPU buffers into the kernel buffer */
+	sync_spu_buff();
+	/* profiling is done: let the work item lapse */
+	if (!spu_prof_running)
+		return;
+	schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+}

/* Container for caching information about an active SPU task. */
struct cached_info {
@@ -305,14 +400,21 @@ static int process_context_switch(struct

/* Record context info in event buffer */
spin_lock_irqsave(&buffer_lock, flags);
- add_event_entry(ESCAPE_CODE);
- add_event_entry(SPU_CTX_SWITCH_CODE);
- add_event_entry(spu->number);
- add_event_entry(spu->pid);
- add_event_entry(spu->tgid);
- add_event_entry(app_dcookie);
- add_event_entry(spu_cookie);
- add_event_entry(offset);
+ spu_buff_add(ESCAPE_CODE, spu->number);
+ spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
+ spu_buff_add(spu->number, spu->number);
+ spu_buff_add(spu->pid, spu->number);
+ spu_buff_add(spu->tgid, spu->number);
+ spu_buff_add(app_dcookie, spu->number);
+ spu_buff_add(spu_cookie, spu->number);
+ spu_buff_add(offset, spu->number);
+
+ /* Set flag to indicate SPU PC data can now be written out. If
+ * the SPU program counter data is seen before an SPU context
+ * record is seen, the postprocessing will fail.
+ */
+ spu_buff[spu->number].ctx_sw_seen = 1;
+
spin_unlock_irqrestore(&buffer_lock, flags);
smp_wmb(); /* insure spu event buffer updates are written */
/* don't want entries intermingled... */
@@ -360,6 +462,47 @@ static int number_of_online_nodes(void)
return nodes;
}

+static int oprofile_spu_buff_create(void)
+{
+	int spu;
+
+	/* The per SPU buffers are sized by the per CPU buffer setting. */
+	max_spu_buff = oprofile_get_cpu_buffer_size();
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* create circular buffers to store the data in.
+		 * use locks to manage accessing the buffers
+		 */
+		spu_buff[spu].head = 0;
+		spu_buff[spu].tail = 0;
+
+		/*
+		 * Create a buffer for each SPU.  Can't reliably
+		 * create a single buffer for all spus due to not
+		 * enough contiguous kernel memory.  kcalloc() zeroes
+		 * the buffer and fails cleanly if the size overflows.
+		 */
+		spu_buff[spu].buff = kcalloc(max_spu_buff,
+					     sizeof(unsigned long),
+					     GFP_KERNEL);
+
+		if (!spu_buff[spu].buff) {
+			printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: oprofile_spu_buff_create "
+			       "failed to allocate spu buffer %d.\n",
+			       __func__, __LINE__, spu);
+
+			/* release the spu buffers that have been allocated */
+			while (--spu >= 0) {
+				kfree(spu_buff[spu].buff);
+				spu_buff[spu].buff = NULL;
+			}
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
/* The main purpose of this function is to synchronize
* OProfile with SPUFS by registering to be notified of
* SPU task switches.
@@ -372,20 +515,35 @@ static int number_of_online_nodes(void)
*/
int spu_sync_start(void)
{
- int k;
+ int spu;
int ret = SKIP_GENERIC_SYNC;
int register_ret;
unsigned long flags = 0;

spu_prof_num_nodes = number_of_online_nodes();
num_spu_nodes = spu_prof_num_nodes * 8;
+ INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
+
+ /* create buffer for storing the SPU data to put in
+ * the kernel buffer.
+ */
+ ret = oprofile_spu_buff_create();
+ if(ret)
+ goto out;

spin_lock_irqsave(&buffer_lock, flags);
- add_event_entry(ESCAPE_CODE);
- add_event_entry(SPU_PROFILING_CODE);
- add_event_entry(num_spu_nodes);
+ for (spu = 0; spu < num_spu_nodes; spu++) {
+ spu_buff_add(ESCAPE_CODE, spu);
+ spu_buff_add(SPU_PROFILING_CODE, spu);
+ spu_buff_add(num_spu_nodes, spu);
+ }
spin_unlock_irqrestore(&buffer_lock, flags);

+ for (spu = 0; spu < num_spu_nodes; spu++) {
+ spu_buff[spu].ctx_sw_seen = 0;
+ spu_buff[spu].last_guard_val = 0;
+ }
+
/* Register for SPU events */
register_ret = spu_switch_event_register(&spu_active);
if (register_ret) {
@@ -393,8 +551,6 @@ int spu_sync_start(void)
goto out;
}

- for (k = 0; k < (MAX_NUMNODES * 8); k++)
- last_guard_val[k] = 0;
pr_debug("spu_sync_start -- running.\n");
out:
return ret;
@@ -446,13 +602,20 @@ void spu_sync_buffer(int spu_num, unsign
* use. We need to discard samples taken during the time
* period which an overlay occurs (i.e., guard value changes).
*/
- if (grd_val && grd_val != last_guard_val[spu_num]) {
- last_guard_val[spu_num] = grd_val;
+ if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
+ spu_buff[spu_num].last_guard_val = grd_val;
/* Drop the rest of the samples. */
break;
}

- add_event_entry(file_offset | spu_num_shifted);
+ /* We must ensure that the SPU context switch has been written
+ * out before samples for the SPU. Otherwise, the SPU context
+ * information is not available and the postprocessing of the
+ * SPU PC will fail with no available anonymous map information.
+ */
+ if (spu_buff[spu_num].ctx_sw_seen)
+ spu_buff_add((file_offset | spu_num_shifted),
+ spu_num);
}
spin_unlock(&buffer_lock);
out:
@@ -463,20 +626,43 @@ out:
int spu_sync_stop(void)
{
unsigned long flags = 0;
- int ret = spu_switch_event_unregister(&spu_active);
- if (ret) {
+	int ret = 0;
+	int k;
+
+	ret = spu_switch_event_unregister(&spu_active);
+
+	if (ret)
printk(KERN_ERR "SPU_PROF: "
- "%s, line %d: spu_switch_event_unregister returned %d\n",
- __func__, __LINE__, ret);
- goto out;
- }
+		       "%s, line %d: spu_switch_event_unregister "
+		       "returned %d\n",
+		       __func__, __LINE__, ret);
+
+	/* stop the periodic flush and wait for a running one to finish */
+	cancel_delayed_work_sync(&spu_work);

spin_lock_irqsave(&cache_lock, flags);
ret = release_cached_info(RELEASE_ALL);
spin_unlock_irqrestore(&cache_lock, flags);
-out:
+
+	/* The work item can no longer run, so it cannot read the per
+	 * SPU buffers while they are freed below.  Flush any remaining
+	 * data in the per SPU buffers to the event buffer first.
+	 */
+	sync_spu_buff();
+
+	for (k = 0; k < num_spu_nodes; k++) {
+		spu_buff[k].ctx_sw_seen = 0;
+
+		/* buff will be NULL if there was a problem
+		 * allocating the buffer.  Only delete if it exists.
+		 */
+
+		if (spu_buff[k].buff) {
+			kfree(spu_buff[k].buff);
+			spu_buff[k].buff = NULL;
+		}
+	}
pr_debug("spu_sync_stop -- done.\n");
return ret;
}

-
Index: Cell_kernel_6_26_2008/drivers/oprofile/event_buffer.h
===================================================================
--- Cell_kernel_6_26_2008.orig/drivers/oprofile/event_buffer.h
+++ Cell_kernel_6_26_2008/drivers/oprofile/event_buffer.h
@@ -17,6 +17,13 @@ int alloc_event_buffer(void);

void free_event_buffer(void);

+/**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+
/* wake up the process sleeping on the event file */
void wake_up_buffer_waiter(void);

Index: Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/pr_util.h
===================================================================
--- Cell_kernel_6_26_2008.orig/arch/powerpc/oprofile/cell/pr_util.h
+++ Cell_kernel_6_26_2008/arch/powerpc/oprofile/cell/pr_util.h
@@ -24,6 +24,11 @@
#define SKIP_GENERIC_SYNC 0
#define SYNC_START_ERROR -1
#define DO_GENERIC_SYNC 1
+#define SPUS_PER_NODE 8
+#define DEFAULT_TIMER_EXPIRE (HZ / 10)
+
+extern struct delayed_work spu_work;
+extern int spu_prof_running;

struct spu_overlay_info { /* map of sections within an SPU overlay */
unsigned int vma; /* SPU virtual memory address from elf */
@@ -62,6 +67,14 @@ struct vma_to_fileoffset_map { /* map of

};

+struct spu_buffer { /* per SPU sample buffer and its bookkeeping */
+ int last_guard_val; /* last non-zero guard value seen in a sample */
+ int ctx_sw_seen; /* set once a context switch record is queued */
+ unsigned long *buff; /* circular buffer of event buffer entries */
+ unsigned int head, tail; /* next index to store at / to flush from */
+};
+
+
/* The three functions below are for maintaining and accessing
* the vma-to-fileoffset map.
*/
Index: Cell_kernel_6_26_2008/drivers/oprofile/buffer_sync.c
===================================================================
--- Cell_kernel_6_26_2008.orig/drivers/oprofile/buffer_sync.c
+++ Cell_kernel_6_26_2008/drivers/oprofile/buffer_sync.c
@@ -551,3 +551,27 @@ void sync_buffer(int cpu)

mutex_unlock(&buffer_mutex);
}
+
+/* Add a span of a circular buffer directly to the kernel event buffer.
+ * Entries are taken from index start up to, but not including, index
+ * stop, wrapping to 0 at max.  Takes buffer_mutex, so it may sleep.
+ * Nothing is copied if start or stop lies outside the buffer.
+ */
+void oprofile_put_buff(unsigned long *buf, unsigned int start,
+		       unsigned int stop, unsigned int max)
+{
+	unsigned int i = start;
+
+	/* a bad index would read past buf or never reach stop */
+	if (start >= max || stop >= max)
+		return;
+
+	mutex_lock(&buffer_mutex);
+	while (i != stop) {
+		add_event_entry(buf[i]);
+		if (++i >= max)
+			i = 0;
+	}
+	mutex_unlock(&buffer_mutex);
+}
+