This patch converts the lock statistics to a user of lib/statistic.c,
resulting in somewhat simpler code (a 300 lines-of-code diet).
Anyone interested in the details of code duplication might want to study:
- lock_time_inc() vs. statistic_add_util()
- lock_stats() vs. statistic_merge()
- clear_lock_stats() vs. statistic_reset()
- seq file code and per cpu data used by both approaches
The output has changed from a terribly wide table to an enormously
long list (just the generic way the statistics code prints data).
Still, all information has been retained:
<snip>
&rq->rq_lock_key contentions 0x34c04 545 task_rq_lock+0x64/0xb0
&rq->rq_lock_key contentions 0x34b62 148 double_rq_lock+0x4a/0x88
&rq->rq_lock_key contentions 0x357d0 8 double_lock_balance+0x7c/0x98
&rq->rq_lock_key contentions 0x35cf6 1 run_rebalance_domains+0x50a/0x5c0
&rq->rq_lock_key contentions other 404
&rq->rq_lock_key wait_read samples 0
&rq->rq_lock_key wait_read minimum 0
&rq->rq_lock_key wait_read average 0.000
&rq->rq_lock_key wait_read maximum 0
&rq->rq_lock_key wait_read variance 0.000
&rq->rq_lock_key wait_write samples 1106
&rq->rq_lock_key wait_write minimum 343
&rq->rq_lock_key wait_write average 361157.478
&rq->rq_lock_key wait_write maximum 5886625
&rq->rq_lock_key wait_write variance 753383636064.242
&rq->rq_lock_key hold_read samples 0
&rq->rq_lock_key hold_read minimum 0
&rq->rq_lock_key hold_read average 0.000
&rq->rq_lock_key hold_read maximum 0
&rq->rq_lock_key hold_read variance 0.000
&rq->rq_lock_key hold_write samples 551474
&rq->rq_lock_key hold_write minimum 94
&rq->rq_lock_key hold_write average 2517.107
&rq->rq_lock_key hold_write maximum 6269659
&rq->rq_lock_key hold_write variance 1996596045.121
xtime_lock contentions 0x17bd2 3327 account_ticks+0x96/0x184
xtime_lock contentions other 0
xtime_lock wait_read samples 0
xtime_lock wait_read minimum 0
xtime_lock wait_read average 0.000
xtime_lock wait_read maximum 0
xtime_lock wait_read variance 0.000
xtime_lock wait_write samples 3327
xtime_lock wait_write minimum 422
xtime_lock wait_write average 63371.777
xtime_lock wait_write maximum 3731202
xtime_lock wait_write variance 109560396493.412
xtime_lock hold_read samples 0
xtime_lock hold_read minimum 0
xtime_lock hold_read average 0.000
xtime_lock hold_read maximum 0
xtime_lock hold_read variance 0.000
xtime_lock hold_write samples 500405
xtime_lock hold_write minimum 203
xtime_lock hold_write average 2224.482
xtime_lock hold_write maximum 3495922
xtime_lock hold_write variance 350307599.860
clocksource_lock contentions other 0
clocksource_lock wait_read samples 0
clocksource_lock wait_read minimum 0
clocksource_lock wait_read average 0.000
clocksource_lock wait_read maximum 0
clocksource_lock wait_read variance 0.000
clocksource_lock wait_write samples 0
clocksource_lock wait_write minimum 0
clocksource_lock wait_write average 0.000
clocksource_lock wait_write maximum 0
clocksource_lock wait_write variance 0.000
clocksource_lock hold_read samples 0
clocksource_lock hold_read minimum 0
clocksource_lock hold_read average 0.000
clocksource_lock hold_read maximum 0
clocksource_lock hold_read variance 0.000
clocksource_lock hold_write samples 308908
clocksource_lock hold_write minimum 188
clocksource_lock hold_write average 435.165
clocksource_lock hold_write maximum 2485654
clocksource_lock hold_write variance 86184020.322
<snip>
Signed-off-by: Martin Peschke <[email protected]>
---
include/linux/lockdep.h | 35 ++---
kernel/lockdep.c | 122 ++-----------------
kernel/lockdep_proc.c | 294 +++++++-----------------------------------------
3 files changed, 76 insertions(+), 375 deletions(-)
Index: linux/kernel/lockdep.c
===================================================================
--- linux.orig/kernel/lockdep.c
+++ linux/kernel/lockdep.c
@@ -37,6 +37,7 @@
#include <linux/debug_locks.h>
#include <linux/irqflags.h>
#include <linux/utsname.h>
+#include <linux/statistic.h>
#include <asm/sections.h>
@@ -119,93 +120,9 @@ unsigned long nr_lock_classes;
static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
#ifdef CONFIG_LOCK_STAT
-static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
-
-static int lock_contention_point(struct lock_class *class, unsigned long ip)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
- if (class->contention_point[i] == 0) {
- class->contention_point[i] = ip;
- break;
- }
- if (class->contention_point[i] == ip)
- break;
- }
-
- return i;
-}
-
-static void lock_time_inc(struct lock_time *lt, s64 time)
-{
- if (time > lt->max)
- lt->max = time;
-
- if (time < lt->min || !lt->min)
- lt->min = time;
-
- lt->total += time;
- lt->nr++;
-}
-
-static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
-{
- dst->min += src->min;
- dst->max += src->max;
- dst->total += src->total;
- dst->nr += src->nr;
-}
-
-struct lock_class_stats lock_stats(struct lock_class *class)
-{
- struct lock_class_stats stats;
- int cpu, i;
-
- memset(&stats, 0, sizeof(struct lock_class_stats));
- for_each_possible_cpu(cpu) {
- struct lock_class_stats *pcs =
- &per_cpu(lock_stats, cpu)[class - lock_classes];
-
- for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
- stats.contention_point[i] += pcs->contention_point[i];
-
- lock_time_add(&pcs->read_waittime, &stats.read_waittime);
- lock_time_add(&pcs->write_waittime, &stats.write_waittime);
-
- lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
- lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
- }
-
- return stats;
-}
-
-void clear_lock_stats(struct lock_class *class)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct lock_class_stats *cpu_stats =
- &per_cpu(lock_stats, cpu)[class - lock_classes];
-
- memset(cpu_stats, 0, sizeof(struct lock_class_stats));
- }
- memset(class->contention_point, 0, sizeof(class->contention_point));
-}
-
-static struct lock_class_stats *get_lock_stats(struct lock_class *class)
-{
- return &get_cpu_var(lock_stats)[class - lock_classes];
-}
-
-static void put_lock_stats(struct lock_class_stats *stats)
-{
- put_cpu_var(lock_stats);
-}
-
static void lock_release_holdtime(struct held_lock *hlock)
{
- struct lock_class_stats *stats;
+ struct statistic *stat = hlock->class->stat;
s64 holdtime;
if (!lock_stat)
@@ -213,12 +130,10 @@ static void lock_release_holdtime(struct
holdtime = sched_clock() - hlock->holdtime_stamp;
- stats = get_lock_stats(hlock->class);
if (hlock->read)
- lock_time_inc(&stats->read_holdtime, holdtime);
+ statistic_inc(stat, LOCK_STAT_HOLD_READ, holdtime);
else
- lock_time_inc(&stats->write_holdtime, holdtime);
- put_lock_stats(stats);
+ statistic_inc(stat, LOCK_STAT_HOLD_WRITE, holdtime);
}
#else
static inline void lock_release_holdtime(struct held_lock *hlock)
@@ -2745,9 +2660,8 @@ __lock_contended(struct lockdep_map *loc
{
struct task_struct *curr = current;
struct held_lock *hlock, *prev_hlock;
- struct lock_class_stats *stats;
unsigned int depth;
- int i, point;
+ int i;
depth = curr->lockdep_depth;
if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -2761,22 +2675,15 @@ __lock_contended(struct lockdep_map *loc
*/
if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
break;
- if (hlock->instance == lock)
- goto found_it;
+ if (hlock->instance == lock) {
+ hlock->waittime_stamp = sched_clock();
+ _statistic_inc(hlock->class->stat, LOCK_STAT_CONT, ip);
+ return;
+ }
prev_hlock = hlock;
}
print_lock_contention_bug(curr, lock, ip);
return;
-
-found_it:
- hlock->waittime_stamp = sched_clock();
-
- point = lock_contention_point(hlock->class, ip);
-
- stats = get_lock_stats(hlock->class);
- if (point < ARRAY_SIZE(stats->contention_point))
- stats->contention_point[i]++;
- put_lock_stats(stats);
}
static void
@@ -2784,7 +2691,7 @@ __lock_acquired(struct lockdep_map *lock
{
struct task_struct *curr = current;
struct held_lock *hlock, *prev_hlock;
- struct lock_class_stats *stats;
+ struct statistic *stat;
unsigned int depth;
u64 now;
s64 waittime;
@@ -2817,12 +2724,11 @@ found_it:
waittime = now - hlock->waittime_stamp;
hlock->holdtime_stamp = now;
- stats = get_lock_stats(hlock->class);
+ stat = hlock->class->stat;
if (hlock->read)
- lock_time_inc(&stats->read_waittime, waittime);
+ _statistic_inc(stat, LOCK_STAT_WAIT_READ, waittime);
else
- lock_time_inc(&stats->write_waittime, waittime);
- put_lock_stats(stats);
+ _statistic_inc(stat, LOCK_STAT_WAIT_WRITE, waittime);
}
void lock_contended(struct lockdep_map *lock, unsigned long ip)
Index: linux/include/linux/lockdep.h
===================================================================
--- linux.orig/include/linux/lockdep.h
+++ linux/include/linux/lockdep.h
@@ -17,6 +17,7 @@ struct lockdep_map;
#include <linux/list.h>
#include <linux/debug_locks.h>
#include <linux/stacktrace.h>
+#include <linux/statistic.h>
/*
* Lock-class usage-state bits:
@@ -72,6 +73,17 @@ struct lock_class_key {
struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES];
};
+#ifdef CONFIG_LOCK_STAT
+enum lock_stat_enum {
+ LOCK_STAT_CONT,
+ LOCK_STAT_WAIT_READ,
+ LOCK_STAT_WAIT_WRITE,
+ LOCK_STAT_HOLD_READ,
+ LOCK_STAT_HOLD_WRITE,
+ _LOCK_STAT_NUMBER
+};
+#endif
+
/*
* The lock-class itself:
*/
@@ -117,30 +129,11 @@ struct lock_class {
int name_version;
#ifdef CONFIG_LOCK_STAT
- unsigned long contention_point[4];
+ struct statistic stat[_LOCK_STAT_NUMBER];
+ struct statistic_coll stat_coll;
#endif
};
-#ifdef CONFIG_LOCK_STAT
-struct lock_time {
- s64 min;
- s64 max;
- s64 total;
- unsigned long nr;
-};
-
-struct lock_class_stats {
- unsigned long contention_point[4];
- struct lock_time read_waittime;
- struct lock_time write_waittime;
- struct lock_time read_holdtime;
- struct lock_time write_holdtime;
-};
-
-struct lock_class_stats lock_stats(struct lock_class *class);
-void clear_lock_stats(struct lock_class *class);
-#endif
-
/*
* Map the lock object (the lock instance) to the lock-class object.
* This is embedded into specific lock instances:
Index: linux/kernel/lockdep_proc.c
===================================================================
--- linux.orig/kernel/lockdep_proc.c
+++ linux/kernel/lockdep_proc.c
@@ -349,260 +349,64 @@ static const struct file_operations proc
};
#ifdef CONFIG_LOCK_STAT
-
-struct lock_stat_data {
- struct lock_class *class;
- struct lock_class_stats stats;
-};
-
-struct lock_stat_seq {
- struct lock_stat_data *iter;
- struct lock_stat_data *iter_end;
- struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
-};
-
-/*
- * sort on absolute number of contentions
- */
-int lock_stat_cmp(const void *l, const void *r)
-{
- const struct lock_stat_data *dl = l, *dr = r;
- unsigned long nl, nr;
-
- nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr;
- nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr;
-
- return nr - nl;
-}
-
-static void seq_line(struct seq_file *m, char c, int offset, int length)
-{
- int i;
-
- for (i = 0; i < offset; i++)
- seq_puts(m, " ");
- for (i = 0; i < length; i++)
- seq_printf(m, "%c", c);
- seq_puts(m, "\n");
-}
-
-static void snprint_time(char *buf, size_t bufsiz, s64 nr)
-{
- unsigned long rem;
-
- rem = do_div(nr, 1000); /* XXX: do_div_signed */
- snprintf(buf, bufsiz, "%lld.%02d", nr, ((int)rem+5)/10);
-}
-
-static void seq_time(struct seq_file *m, s64 time)
-{
- char num[15];
-
- snprint_time(num, sizeof(num), time);
- seq_printf(m, " %14s", num);
-}
-
-static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
-{
- seq_printf(m, "%14lu", lt->nr);
- seq_time(m, lt->min);
- seq_time(m, lt->max);
- seq_time(m, lt->total);
-}
-
-static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
-{
- char name[39];
- struct lock_class *class;
- struct lock_class_stats *stats;
- int i, namelen;
-
- class = data->class;
- stats = &data->stats;
-
- snprintf(name, 38, "%s", class->name);
- namelen = strlen(name);
-
- if (stats->write_holdtime.nr) {
- if (stats->read_holdtime.nr)
- seq_printf(m, "%38s-W:", name);
- else
- seq_printf(m, "%40s:", name);
-
- seq_lock_time(m, &stats->write_waittime);
- seq_puts(m, " ");
- seq_lock_time(m, &stats->write_holdtime);
- seq_puts(m, "\n");
- }
-
- if (stats->read_holdtime.nr) {
- seq_printf(m, "%38s-R:", name);
- seq_lock_time(m, &stats->read_waittime);
- seq_puts(m, " ");
- seq_lock_time(m, &stats->read_holdtime);
- seq_puts(m, "\n");
+struct statistic_info lock_stat_info[_LOCK_STAT_NUMBER] = {
+ [LOCK_STAT_CONT] = {
+ .name = "contentions",
+ .x_unit = "instruction_pointer",
+ .y_unit = "occurrence",
+ .defaults = "type=sparse entries=4",
+ .flags = STATISTIC_FLAGS_LABEL,
+ },
+ [LOCK_STAT_WAIT_READ] = {
+ .name = "wait_read",
+ .x_unit = "nanoseconds",
+ .y_unit = "occurrence",
+ .defaults = "type=utilisation",
+ },
+ [LOCK_STAT_WAIT_WRITE] = {
+ .name = "wait_write",
+ .x_unit = "nanoseconds",
+ .y_unit = "occurrence",
+ .defaults = "type=utilisation",
+ },
+ [LOCK_STAT_HOLD_READ] = {
+ .name = "hold_read",
+ .x_unit = "nanoseconds",
+ .y_unit = "occurrence",
+ .defaults = "type=utilisation",
+ },
+ [LOCK_STAT_HOLD_WRITE] = {
+ .name = "hold_write",
+ .x_unit = "nanoseconds",
+ .y_unit = "occurrence",
+ .defaults = "type=utilisation",
}
-
- if (stats->read_waittime.nr + stats->write_waittime.nr == 0)
- return;
-
- if (stats->read_holdtime.nr)
- namelen += 2;
-
- for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
- char sym[KSYM_SYMBOL_LEN];
- char ip[32];
-
- if (class->contention_point[i] == 0)
- break;
-
- if (!i)
- seq_line(m, '-', 40-namelen, namelen);
-
- sprint_symbol(sym, class->contention_point[i]);
- snprintf(ip, sizeof(ip), "[<%p>]",
- (void *)class->contention_point[i]);
- seq_printf(m, "%40s %14lu %29s %s\n", name,
- stats->contention_point[i],
- ip, sym);
- }
- if (i) {
- seq_puts(m, "\n");
- seq_line(m, '.', 0, 40 + 1 + 8 * (14 + 1));
- seq_puts(m, "\n");
- }
-}
-
-static void seq_header(struct seq_file *m)
-{
- seq_printf(m, "lock_stat version 0.1\n");
- seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1));
- seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s\n",
- "class name",
- "contentions",
- "waittime-min",
- "waittime-max",
- "waittime-total",
- "acquisitions",
- "holdtime-min",
- "holdtime-max",
- "holdtime-total");
- seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1));
- seq_printf(m, "\n");
-}
-
-static void *ls_start(struct seq_file *m, loff_t *pos)
-{
- struct lock_stat_seq *data = m->private;
-
- if (data->iter == data->stats)
- seq_header(m);
-
- return data->iter;
-}
-
-static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct lock_stat_seq *data = m->private;
-
- (*pos)++;
-
- data->iter = v;
- data->iter++;
- if (data->iter == data->iter_end)
- data->iter = NULL;
-
- return data->iter;
-}
-
-static void ls_stop(struct seq_file *m, void *v)
-{
-}
-
-static int ls_show(struct seq_file *m, void *v)
-{
- struct lock_stat_seq *data = m->private;
-
- seq_stats(m, data->iter);
- return 0;
-}
-
-static struct seq_operations lockstat_ops = {
- .start = ls_start,
- .next = ls_next,
- .stop = ls_stop,
- .show = ls_show,
};
-static int lock_stat_open(struct inode *inode, struct file *file)
-{
- int res;
- struct lock_class *class;
- struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq));
-
- if (!data)
- return -ENOMEM;
-
- res = seq_open(file, &lockstat_ops);
- if (!res) {
- struct lock_stat_data *iter = data->stats;
- struct seq_file *m = file->private_data;
-
- data->iter = iter;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
- iter->class = class;
- iter->stats = lock_stats(class);
- iter++;
- }
- data->iter_end = iter;
-
- sort(data->stats, data->iter_end - data->iter,
- sizeof(struct lock_stat_data),
- lock_stat_cmp, NULL);
-
- m->private = data;
- } else
- vfree(data);
+struct statistic_ui lock_stat_ui;
- return res;
+static void lock_stat_label(struct statistic_coll *coll, int i,
+ void *key, char *buf, int size)
+{
+ sprint_symbol(buf, *(unsigned long *)key);
}
-ssize_t lock_stat_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static void lock_stat_init(void)
{
struct lock_class *class;
- char c;
-
- if (count) {
- if (get_user(c, buf))
- return -EFAULT;
- if (c != '0')
- return count;
-
- list_for_each_entry(class, &all_lock_classes, lock_entry)
- clear_lock_stats(class);
+ statistic_create(&lock_stat_ui, "lockstat");
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ class->stat_coll.stat = class->stat;
+ class->stat_coll.info = lock_stat_info;
+ class->stat_coll.number = _LOCK_STAT_NUMBER;
+ class->stat_coll.label = lock_stat_label;
+ strncpy(class->stat_coll.name, class->name,
+ STATISTIC_COLL_NAME_SIZE - 1);
+ statistic_attach(&class->stat_coll, &lock_stat_ui);
}
- return count;
-}
-
-static int lock_stat_release(struct inode *inode, struct file *file)
-{
- struct seq_file *seq = file->private_data;
-
- vfree(seq->private);
- seq->private = NULL;
- return seq_release(inode, file);
}
-
-static const struct file_operations proc_lock_stat_operations = {
- .open = lock_stat_open,
- .write = lock_stat_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = lock_stat_release,
-};
-#endif /* CONFIG_LOCK_STAT */
+#endif
static int __init lockdep_proc_init(void)
{
@@ -617,9 +421,7 @@ static int __init lockdep_proc_init(void
entry->proc_fops = &proc_lockdep_stats_operations;
#ifdef CONFIG_LOCK_STAT
- entry = create_proc_entry("lock_stat", S_IRUSR, NULL);
- if (entry)
- entry->proc_fops = &proc_lock_stat_operations;
+ lock_stat_init();
#endif
return 0;
* Martin Peschke <[email protected]> wrote:
> The output has changed from a terribly wide table to an enormously
> long list (just the generic way the statistics code prints data).
Sigh, why dont you _ask_ before doing such stuff? It is a terribly wide
table because that makes it easily greppable but still watchable in one
chunk in a sufficiently zoomed out xterm. Please preserve this output
format, quite some work went into it - NACK :-(
Ingo
* Martin Peschke <[email protected]> wrote:
> - lock_time_inc() vs. statistic_add_util()
please fix the coding style in lib/statistic.c. It's full of:
{
unsigned long long i;
if (value <= stat->u.histogram.range_min)
return 0;
put a newline after variable sections.
and:
on_each_cpu(_statistic_barrier, NULL, 0, 1);
return 0;
preferably use a newline before 'return' statements as well. (this is
not always necessary, but in the above case it looks better)
Ingo
Ingo Molnar wrote:
> * Martin Peschke <[email protected]> wrote:
>
>> The output has changed from a terribly wide table to an enormously
>> long list (just the generic way the statistics code prints data).
>
> Sigh, why dont you _ask_ before doing such stuff?
A nice diffstat is always worth a try, isn't it?
And I see other reasons for code sharing.
Ah, and doing it has actually been quite simple once I had figured out
what the original code does. :-)
> It is a terribly wide table because that makes it easily greppable
If one looks for contentions of "xtime_lock" within my enormously long list,
they could issue:
grep -e "xtime_lock contentions" data
and get
xtime_lock contentions 0x17bd2 3327 account_ticks+0x96/0x184
xtime_lock contentions other 0
for example.
So how is this worse?
> but still watchable in one chunk in a sufficiently zoomed out xterm.
I am wondering whether we really want to reject kernel patches on the basis of
this reasoning, disregarding any other reasons why a patch might be helpful.
> Please preserve this output format
I understand why everybody likes their own format most. It's always made to measure.
Choosing a different - or common - format didn't happen in bad faith.
A made-to-measure file format doesn't work well once we start abstracting out this
functionality. And I feel that would be expecting too much of a low-level kernel ABI
piece.
I would like to add that usability doesn't necessarily suffer if statistics for
some brand new gadget look somewhat familiar and similar to other statistics one
has encountered earlier.
>, quite some work went into it - NACK :-(
Considering the amount of code.. ;-) I am sorry.
But seriously, did you consider using some user space tool or script to
format this stuff the way you like it - similar to the way the powertop tool
reshuffles timer_stats data found in a proc file, for example?
The format of an enormously long list has been thought out keeping this
particular requirement in mind.
Martin
Ingo Molnar wrote:
> * Martin Peschke <[email protected]> wrote:
>
>> - lock_time_inc() vs. statistic_add_util()
>
> please fix the coding style in lib/statistic.c. It's full of:
>
> {
> unsigned long long i;
> if (value <= stat->u.histogram.range_min)
> return 0;
>
> put a newline after variable sections.
>
> and:
>
> on_each_cpu(_statistic_barrier, NULL, 0, 1);
> return 0;
>
> preferably use a newline before 'return' statements as well. (this is
> not always necessary, but in the above case it looks better)
Will do (in my local tree, for the time being).
Good points. Thanks for reviewing.
Martin
On Thu, Jun 07, 2007 at 02:17:45AM +0200, Martin Peschke wrote:
> Ingo Molnar wrote:
> >, quite some work went into it - NACK :-(
>
> Considering the amount of code.. ;-) I am sorry.
>
> But seriously, did you consider using some user space tool or script to
> format this stuff the way you like it - similar to the way the powertop tool
> reshuffles timer_stats data found in a proc file, for example?
When I was doing my stuff, I intended for it to be parsed by a script or
simple command line tools like sort/grep piped through less. I also thought
it might be interesting to output the text in either Python or Ruby
syntax so that it can go through more extensive sorting using
those languages.
There are roughly 400 locks in a normal kernel for a desktop. The
list is rather cumbersome anyway so, IMO, it really should be handled
by parsing tools, etc... There could be more properties attached to each
lock, especially if you intend to get this to work on -rt, which needs more
things reported.
bill
On Thu, 2007-06-07 at 02:17 +0200, Martin Peschke wrote:
> [...]
> > It is a terribly wide table because that makes it easily greppable
>
> If one looks for contentions of "xtime_lock" within my enormously long list,
> they could issue:
>
> grep -e "xtime_lock contentions" data
>
> and get
>
> xtime_lock contentions 0x17bd2 3327 account_ticks+0x96/0x184
> xtime_lock contentions other 0
>
> for example.
>
> So how is this worse?
How will you find the 5 most contended locks with 1 grep?
It used to be:
grep ":" /proc/lock_stat | head -n 5
lock stat is more about finding who is bad than examining a particular
lock (of course in the process of fixing that lock, that does become
interesting).
Nor was it _that_ wide, it easily fit into an xterm, even on my laptop.
Peter Zijlstra wrote:
> On Thu, 2007-06-07 at 02:17 +0200, Martin Peschke wrote:
>> [...]
>
> How will you find the 5 most contended locks with 1 grep?
>
> It used to be:
> grep ":" /proc/lock_stat | head -n 5
grep "contention" data |sort -k 4 -n |tail -n 5
&rq->rq_lock_key contentions 0x34c04 613 task_rq_lock+0x64/0xb0
&zone->lock contentions 0x8322e 770 __free_pages_ok+0x1b2/0x534
&zone->lock contentions 0x82828 1017 free_pages_bulk+0x40/0x42c
xtime_lock contentions 0x17bd2 3327 account_ticks+0x96/0x184
&inode->i_mutex contentions 0x3273ee 8807 mutex_lock+0x3e/0x4c
Admittedly this gives you the top five contention points, since my prototype
tracks contention points without accounting for the total number of contentions
of a given lock in a separate counter.
If the above command line isn't good enough, we could easily put a total
contentions counter for each lock back in (see lock_stat_info).
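For illustration - a hypothetical sketch against the enum in the patch above,
not something the posted code contains - such a counter would simply be one
more entry next to the existing ones:

enum lock_stat_enum {
        LOCK_STAT_CONT,
        LOCK_STAT_CONT_TOTAL,   /* hypothetical: total contentions per class */
        LOCK_STAT_WAIT_READ,
        LOCK_STAT_WAIT_WRITE,
        LOCK_STAT_HOLD_READ,
        LOCK_STAT_HOLD_WRITE,
        _LOCK_STAT_NUMBER
};

together with a matching lock_stat_info[] entry and a second _statistic_inc()
call next to the LOCK_STAT_CONT one in __lock_contended().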
>
> lock stat is more about finding who is bad than examining a particular
> lock (of course in the process of fixing that lock, that does become
> interesting).
Sure. I guess that applies to most other statistics.
Bill Huey (hui) wrote:
> There are roughly about 400 locks in a normal kernel for a desktop. The
> list is rather cumbersome anyways so, IMO, it really should be handled
> by parsing tools, etc... There could be more properties attached to each
> lock especially if you intend to get this to work on -rt which need more
> things reported.
Adding stuff to an already wide table in a readable fashion is more difficult
than making a list longer.
Do you mean I might submit this stuff for -rt?
* Martin Peschke <[email protected]> wrote:
> Admittedly this gives you the top five contention points, [...]
if the infrastructure your are advocating does not allow us to keep the
existing output then it's simply not flexible enough. Why on earth are
you even arguing about this? A "cleanup" should not change the output,
simple as that. Do a patch that has the _same_ output and then we can
see whether it's a good patch. You made the same mistake with your
/proc/timer_stats cleanups. I dont like NACK-ing patches but you seem to
be missing the basic precondition of cleanups: no functional effect to
the code, and certainly no change in output.
Ingo
* Martin Peschke <[email protected]> wrote:
> Bill Huey (hui) wrote:
> > There are roughly about 400 locks in a normal kernel for a desktop. The
> > list is rather cumbersome anyways so, IMO, it really should be handled
> > by parsing tools, etc... There could be more properties attached to each
> > lock especially if you intend to get this to work on -rt which need more
> > things reported.
>
> Adding stuff to an already wide table in a readable fashion is more
> difficult than making a list longer.
>
> Do you mean I might submit this stuff for -rt?
Firstly, submit cleanup patches that _do not change the output_. If you
have any output changes, do it as a separate patch, ontop of the cleanup
patch. Mixing material changes and cleanups into a single patch is a
basic patch submission mistake that will only earn you NACKs.
Ingo
On Wed, 2007-06-06 at 23:34 +0200, Martin Peschke wrote:
> [...]
> +static void lock_stat_init(void)
> {
> struct lock_class *class;
> + statistic_create(&lock_stat_ui, "lockstat");
> + list_for_each_entry(class, &all_lock_classes, lock_entry) {
> + class->stat_coll.stat = class->stat;
> + class->stat_coll.info = lock_stat_info;
> + class->stat_coll.number = _LOCK_STAT_NUMBER;
> + class->stat_coll.label = lock_stat_label;
> + strncpy(class->stat_coll.name, class->name,
> + STATISTIC_COLL_NAME_SIZE - 1);
> + statistic_attach(&class->stat_coll, &lock_stat_ui);
> }
> }
> +#endif
>
> static int __init lockdep_proc_init(void)
> {
> @@ -617,9 +421,7 @@ static int __init lockdep_proc_init(void
> entry->proc_fops = &proc_lockdep_stats_operations;
>
> #ifdef CONFIG_LOCK_STAT
> + lock_stat_init();
> #endif
>
> return 0;
I'm confused as to where the class->stat objects are initialised? Is
that done in lock_stat_init()? If so, then you have a bug.
On Wed, 2007-06-06 at 23:34 +0200, Martin Peschke wrote:
> +struct statistic_info lock_stat_info[_LOCK_STAT_NUMBER] = {
> + [LOCK_STAT_CONT] = {
> + .name = "contentions",
> + .x_unit = "instruction_pointer",
> + .y_unit = "occurrence",
> + .defaults = "type=sparse entries=4",
> + .flags = STATISTIC_FLAGS_LABEL,
> + },
> [...]
> };
You're parsing strings in-kernel to set up data structures?
On Wed, 2007-06-06 at 23:34 +0200, Martin Peschke wrote:
> +#ifdef CONFIG_LOCK_STAT
> +enum lock_stat_enum {
> + LOCK_STAT_CONT,
> + LOCK_STAT_WAIT_READ,
> + LOCK_STAT_WAIT_WRITE,
> + LOCK_STAT_HOLD_READ,
> + LOCK_STAT_HOLD_WRITE,
> + _LOCK_STAT_NUMBER
> +};
> +#endif
> +
> /*
> * The lock-class itself:
> */
> @@ -117,30 +129,11 @@ struct lock_class {
> int name_version;
>
> #ifdef CONFIG_LOCK_STAT
> - unsigned long contention_point[4];
> + struct statistic stat[_LOCK_STAT_NUMBER];
> + struct statistic_coll stat_coll;
> #endif
> };
sizeof(struct statistic_coll) = 16+64+8+8+4+8+8 = 116
sizeof(struct statistic) = 4+4+8+8+8+8+8+4+8+4+4 = 68
+ 8*NR_CPUS
+ kmalloc_size(obj)*nr_cpu_ids
4 objs with size 40, gives 4*64 = 256 * nr_cpu_ids
1 obj with size 32 + more
for 2048 total classes this gives:
2048 * (116+68) = 376832
for each active class this adds per cpu:
8+256+32+some = 296+
we have around 1400 locks in the kernel, this would give 414400 per cpu.
vs the old code:
2048*(4*8) = 65536
+
2048*(4*4*8 + 4*8) = 327680 per cpu
worst case
I'm not convinced 300 lines less code is worth that extra bloat.
On Thu, Jun 07, 2007 at 09:30:21AM +0200, Ingo Molnar wrote:
> * Martin Peschke <[email protected]> wrote:
> > Do you mean I might submit this stuff for -rt?
>
> Firstly, submit cleanup patches that _do not change the output_. If you
> have any output changes, do it as a separate patch, ontop of the cleanup
> patch. Mixing material changes and cleanups into a single patch is a
> basic patch submission mistake that will only earn you NACKs.
Martin,
First of all I agree with Ingo in that this needs to be separated from
the rest of the clean-ups. However, I don't understand why all of this
is so heavyweight when the current measurements that Peter makes are
completely sufficient for any reasonable purpose I can think of at the
moment. What's this stuff with labels about?
It's important to get the points of contention so that the greater
kernel group can fix these issues, not to log statistics for the purpose
of logging them. The original purpose should not be ignored when working
on this stuff.
By the way, what's the purpose of all of this stuff? Like, what do you
intend to do with it over the long haul?
bill
* Peter Zijlstra <[email protected]> wrote:
> we have around 1400 locks in the kernel, this would give 414400 per cpu.
>
> vs the old code:
>
> 2048*(4*8) = 65536
> +
> 2048*(4*4*8 + 4*8) = 327680 per cpu
>
> worst case
>
> I'm not convinced 300 lines less code is worth that extra bloat.
agreed, this is not worth it. It would be acceptable if the statistics
stuff was flexible enough to have the same footprint as the current
lockstat data structures.
Ingo
Ingo Molnar wrote:
> if the infrastructure you are advocating does not allow us to keep the
> existing output then it's simply not flexible enough.
Let's be precise. If "keep the existing output" means any format change is
unacceptable to you, then I broke things. If it means that my method provides
data that is equivalent in content, then I didn't break the lock contention
output.
> Why on earth are you even arguing about this?
> A "cleanup" should not change the output, simple as that.
> Do a patch that has the _same_ output and then we can
> see whether it's a good patch. You made the same mistake with your
> /proc/timer_stats cleanups.
We've got to be careful here. My other proposal was doomed because timerstat became
kernel ABI in the meantime. We won't break the kernel ABI. I was late, as simple
as that.
The lock contention stuff isn't kernel ABI yet. This is -mm code, stuff
intended for a wider audience and discussion. It should be perfectly fine
to scrutinize kernel ABI additions before we get beyond the point of no return.
> I dont like NACK-ing patches but you seem to
> be missing the basic precondition of cleanups: no functional effect to
> the code, and certainly no change in output.
I don't see the point of judging something by goals that have not been set.
I have advertised my patches as: same purpose, different or generalised method,
differences in output format, output equivalent in content,
much more code sharing.
Martin
Ingo Molnar wrote:
> Firstly, submit cleanup patches that _do not change the output_. If you
> have any output changes, do it as a separate patch, ontop of the cleanup
> patch. Mixing material changes and cleanups into a single patch is a
> basic patch submission mistake that will only earn you NACKs.
Let's put this straight: it has not been my primary goal to clean it up.
I have posted an alternative implementation. One of the good things about the
alternative implementation is the lines-of-code savings through code sharing.
Don't hear me say my code is perfect. Like anybody else here, I am relying on
reviews and feedback in order to make it good enough.
I think there has been enough duplication to consider a component
that statistics data is handed over to.
Martin
Peter Zijlstra wrote:
> I'm confused as to where the class->stat objects are initialised? Is
> that done in lock_stat_init()? If so, then you have a bug.
static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
I assume this gets us class structures containing all zeros, doesn't it?
Then class->stat is all zeros as well, which is handled by lib/statistic.c.
(In this case, data gathering hasn't been turned on yet, and statistic_inc()
and similar functions don't access other areas of struct statistic.)
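As a stand-alone user-space sketch of that convention - the member names here
are my assumptions, not the actual lib/statistic.c layout:

#include <stdio.h>

struct statistic {
        long *percpu_data;      /* NULL until data gathering is switched on */
        /* ... further members, only valid while switched on ... */
};

static struct statistic stat;  /* static storage comes up all zeros */

static void statistic_inc(struct statistic *s, long value)
{
        if (!s->percpu_data)
                return;         /* gathering off: touch nothing else */
        (void)value;            /* accounting elided in this sketch */
}

int main(void)
{
        statistic_inc(&stat, 42);       /* safe no-op before "state=on" */
        printf("gathering is %s\n", stat.percpu_data ? "on" : "off");
        return 0;
}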
Martin
Peter Zijlstra wrote:
> On Wed, 2007-06-06 at 23:34 +0200, Martin Peschke wrote:
>
>> +struct statistic_info lock_stat_info[_LOCK_STAT_NUMBER] = {
>> + [LOCK_STAT_CONT] = {
>> + .name = "contentions",
>> + .x_unit = "instruction_pointer",
>> + .y_unit = "occurrence",
>> + .defaults = "type=sparse entries=4",
>> + .flags = STATISTIC_FLAGS_LABEL,
>> + },
>> [...]
>> };
>
> You're parsing strings in-kernel to set up data structures?
Might be surprising (or revolting?)... but, yes, I do.
.defaults reflects the developer's opinion on the way of data
accumulation that might make the most sense in most cases.
The current implementation of lib/statistic.c allows users to change the
settings that we see in .defaults. If one needs to track more than
4 contention points, for example, then they can change this limit by
passing a larger "entries=" value to the user interface. So there is some parsing
code anyway. Passing default settings in a similar fashion has just been
convenient for the developer.
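For what it's worth, a small stand-alone user-space sketch of that kind of
"key=value key=value" parsing - made-up names, not the actual lib/statistic.c
parser:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct stat_defaults {
        char type[16];
        int entries;
};

static void parse_defaults(const char *defaults, struct stat_defaults *d)
{
        char buf[64], *tok, *save = NULL;

        snprintf(buf, sizeof(buf), "%s", defaults);
        for (tok = strtok_r(buf, " ", &save); tok;
             tok = strtok_r(NULL, " ", &save)) {
                if (!strncmp(tok, "type=", 5))
                        snprintf(d->type, sizeof(d->type), "%s", tok + 5);
                else if (!strncmp(tok, "entries=", 8))
                        d->entries = atoi(tok + 8);
        }
}

int main(void)
{
        struct stat_defaults d = { "", 0 };

        parse_defaults("type=sparse entries=4", &d);
        printf("type=%s entries=%d\n", d.type, d.entries);
        return 0;
}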
On Fri, 2007-06-08 at 19:18 +0200, Martin Peschke wrote:
> Peter Zijlstra wrote:
> > On Fri, 2007-06-08 at 19:00 +0200, Martin Peschke wrote:
> >> Peter Zijlstra wrote:
> >>> I'm confused as to where the class->stat objects are initialised? Is
> >>> that done in lock_stat_init()? If so, then you have a bug.
> >> static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
> >>
> >> I assume this gets us class structures containing all zeros, doesn't it?
> >> Then class->stat is zeros as well, which is handled by lib/statistics.
> >> (In this case, data gathering hasn't been turned on yet, and statistic_inc()
> >> and similar functions don't access other areas of struct statistic.)
> >
> > Who eventually calls percpu_alloc?
>
> There is a small state machine calling percpu_alloc when users do
>
> echo state=on > /debug/statistics/lockdep/definition
>
> So data gathering is off by default.
>
> It might make sense to allow "state=on" as a default. Then allocation would
> be done in the context of statistic_attach().
Right, the problem here is that you iterate over all_lock_classes once
at init.
Contrary to what the name might suggest, these are not all possible
classes, just all active ones. So you'll only attach the classes which
have been used up until the init point. All other classes used later
will never be initialized.
Peter Zijlstra wrote:
> [...]
>
> Right, the problem here is that you iterate over all_lock_classes once
> at init.
>
> Contrary to what the name might suggest, these are not all possible
> classes, just all active ones. So you'll only attach the classes which
> have been used up until the init point. All other classes used later
> will never be initialized.
Ah..
Do you know where a class is set up for first use?
I guess it is feasible to move statistic initialisation to that place.
Is there a place where a class becomes unused again?
On Fri, 2007-06-08 at 19:37 +0200, Martin Peschke wrote:
> Peter Zijlstra wrote:
> > [...]
>
> Ah..
>
> Do you know where a class is set up for first use?
Somewhere down the line of lock_acquire()->register_lock_class().
It is then stuck onto the all_lock_classes list in mark_lock(). However,
this would be a very bad place to run init code, since percpu_alloc()
can schedule and we're in the middle of a locking primitive :-)
> I guess it is feasible to move statistic initialisation to that place.
>
> Is there a place where a class becomes unused again?
Yes, it is possible to clear the task list, however that part of lockdep
is a tad fragile - look for the call-chain leading to zap_class().
Peter Zijlstra wrote:
> On Fri, 2007-06-08 at 19:37 +0200, Martin Peschke wrote:
>> Peter Zijlstra wrote:
>>> [...]
>>> Right, the problem here is that you iterate over all_lock_classes once
>>> at init.
>>>
>>> Contrary to what the name might suggest, these are not all possible
>>> classes, just all active ones. So you'll only attach the classes which
>>> have been used up until the init point. All other classes used later
>>> will never be initialized.
One could work around this by iterating over the entire array...
Anyway, this is not good enough. The right solution would be to teach my
code to deal with static (per-cpu) statistic counter arrays in addition
to buffers allocated with percpu_alloc(). Both use cases are valid.
Thanks for your feedback.
Martin
Bill Huey (hui) wrote:
> However, I don't understand why all of this
> is so heavy weight when the current measurements that Peter makes is
> completely sufficient for any reasonable purpose I can think of at the
> moment.
It wasn't meant to be or to stay heavyweight. To make the best of my
recent attempt: it's good that such issues show up.
> What's this stuff with labels about ?
Sorry, I didn't come around to posting a documentation update.
It's an optional callback that allows a user to label an entry of
a statistic when it is printed. I have used this feature for adding
symbol strings to instruction pointers.
> It's important to get the points of contention so that the greater
> kernel group can fix this issues and not log statistics for the purpose
> of logging it. The original purpose should not be ignore when working
> on this stuff.
Sure.
> By the way, what's the purpose of all of this stuff ? like what do you
> intend to do with it over the long haul ?
Well, it started as the statistics code of a device driver. When I realized
that I was increasing code duplication in the kernel, I tried to abstract
out the device-driver-independent code, allowing others to reuse it.
I have done a couple of iterations, adapting the code to requirements
which showed up when using the code for another driver, etc.
This time, my lock contention patch showed two issues that require work:
struct statistic needs to be slimmed down or be eliminated altogether;
allocations might be fine for some users but not for all. Solving
these issues is feasible. Anyway, I don't think I can afford pursuing
it further.
Martin
Peter Zijlstra wrote:
> On Wed, 2007-06-06 at 23:34 +0200, Martin Peschke wrote:
>
>> [...]
>> @@ -117,30 +129,11 @@ struct lock_class {
>> int name_version;
>>
>> #ifdef CONFIG_LOCK_STAT
>> - unsigned long contention_point[4];
>> + struct statistic stat[_LOCK_STAT_NUMBER];
>> + struct statistic_coll stat_coll;
>> #endif
>> };
>
>
> sizeof(struct statistic_coll) = 16+64+8+8+4+8+8 = 116
> sizeof(struct statistic) = 4+4+8+8+8+8+8+4+8+4+4 = 68
> + 8*NR_CPUS
> + kmalloc_size(obj)*nr_cpu_ids
>
>
> 4 objs with size 40, gives 4*64 = 256 * nr_cpu_ids
This looks like 4 * struct statistic_entry_util with members
for min, max etc. Used for contention point tracking.
I have noticed that many lock classes show fewer than 4, or even no,
contention points. Unlike the original code, my code doesn't eat
up memory for contention points that never show up.
I doubt that my patch amounts to 414400 bytes per cpu then.
> 1 obj with size 32 + more
>
>
> for 2048 total classes this gives:
>
> 2048 * (116+68) = 376832
>
> for each active class this adds per cpu:
>
> 8+256+32+some = 296+
>
> we have around 1400 locks in the kernel, this would give 414400 per cpu.
>
> vs the old code:
>
> 2048*(4*8) = 65536
> +
> 2048*(4*4*8 + 4*8) = 327680 per cpu
>
> worst case
>
> I'm not convinced 300 lines less code is worth that extra bloat.
In general, you are right.
First, struct statistic is too expensive. Allowing sets of statistics
instead of individual statistics to be switched on and off would surely
suffice. This alone would allow moving several members from struct
statistic to struct statistic_coll (one per lock class) or struct
statistic_ui (for all lock contention statistics). In the end it might
be feasible to reduce struct statistic to a per-cpu data pointer.
Second, my code should not depend on struct statistic and percpu_alloc,
allowing users to bring their own static data areas, at least for simple
things like counters, min, max etc.
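In shape, that direction would look something like the following sketch
(member names are assumptions on my side, not a patch):

struct statistic_info;  /* as in the patch above */

/* slimmed down: ideally just the per-cpu data, NULL while switched off */
struct statistic {
        void *percpu_data;
};

/* shared bookkeeping, hoisted up to one instance per lock class */
struct statistic_coll {
        char name[64];
        struct statistic *stat;         /* _LOCK_STAT_NUMBER items */
        const struct statistic_info *info;
        int number;
        int state;                      /* on/off for the whole set */
};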
Martin