From: Ming Lei <[email protected]>
This series makes the following BFS optimizations:
.reintroduce DaveM's generation-id optimization in BFS to make it faster
.make the BFS preallocated "stack" a per-cpu variable
.add likely/unlikely annotations to the BFS core routine
The patches are based on Peter's patches against tip:core/locking:
http://marc.info/?l=linux-kernel&m=124793787804558&w=2
http://marc.info/?l=linux-kernel&m=124793787804556&w=2
Also, the patches have passed lockdep's selftest.
include/linux/lockdep.h | 1 +
kernel/lockdep.c | 26 ++++++++++++++------------
2 files changed, 15 insertions(+), 12 deletions(-)
Thanks,
Lei Ming
From: Ming Lei <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
kernel/lockdep.c | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1b1796a..1583439 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -847,7 +847,7 @@ struct circular_queue {
unsigned int front, rear;
};
-static struct circular_queue lock_cq;
+static DEFINE_PER_CPU(struct circular_queue, lock_cq);
unsigned int max_bfs_queue_depth;
@@ -937,7 +937,7 @@ static int __bfs(struct lock_list *source_entry,
{
struct lock_list *entry;
struct list_head *head;
- struct circular_queue *cq = &lock_cq;
+ struct circular_queue *cq = &get_cpu_var(lock_cq);
int ret = 1;
if (match(source_entry, data)) {
@@ -993,6 +993,7 @@ static int __bfs(struct lock_list *source_entry,
}
}
exit:
+ put_cpu_var(lock_cq);
return ret;
}
--
1.6.0.GIT
From: Ming Lei <[email protected]>
We still can apply DaveM's generation count optimization to
BFS, so reintroduce it.
Signed-off-by: Ming Lei <[email protected]>
---
include/linux/lockdep.h | 1 +
kernel/lockdep.c | 11 ++++++-----
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 12aabfc..ddde26f 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -58,6 +58,7 @@ struct lock_class {
struct lockdep_subclass_key *key;
unsigned int subclass;
+ unsigned int dep_gen_id;
/*
* IRQ/softirq usage tracking bits:
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1cedb00..1b1796a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -848,14 +848,15 @@ struct circular_queue {
};
static struct circular_queue lock_cq;
-static unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)];
unsigned int max_bfs_queue_depth;
+static unsigned int lockdep_dependency_gen_id;
+
static inline void __cq_init(struct circular_queue *cq)
{
cq->front = cq->rear = 0;
- bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES);
+ lockdep_dependency_gen_id++;
}
static inline int __cq_empty(struct circular_queue *cq)
@@ -900,7 +901,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
nr = lock - list_entries;
WARN_ON(nr >= nr_list_entries);
lock->parent = parent;
- set_bit(nr, bfs_accessed);
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
static inline unsigned long lock_accessed(struct lock_list *lock)
@@ -908,7 +909,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
unsigned long nr;
nr = lock - list_entries;
WARN_ON(nr >= nr_list_entries);
- return test_bit(nr, bfs_accessed);
+ return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
static inline struct lock_list *get_lock_parent(struct lock_list *child)
@@ -3491,7 +3492,7 @@ void __init lockdep_info(void)
sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
sizeof(struct list_head) * CHAINHASH_SIZE) / 1024
#ifdef CONFIG_PROVE_LOCKING
- + sizeof(struct circular_queue) + sizeof(bfs_accessed)
+ + sizeof(struct circular_queue)
#endif
);
--
1.6.0.GIT
From: Ming Lei <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
kernel/lockdep.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1583439..2bf49ae 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -951,18 +951,18 @@ static int __bfs(struct lock_list *source_entry,
else
head = &source_entry->class->locks_before;
- if (list_empty(head))
+ if (unlikely(list_empty(head)))
goto exit;
__cq_init(cq);
__cq_enqueue(cq, (unsigned long)source_entry);
- while (!__cq_empty(cq)) {
+ while (likely(!__cq_empty(cq))) {
struct lock_list *lock;
__cq_dequeue(cq, (unsigned long *)&lock);
- if (!lock->class) {
+ if (unlikely(!lock->class)) {
ret = -2;
goto exit;
}
@@ -982,12 +982,12 @@ static int __bfs(struct lock_list *source_entry,
goto exit;
}
- if (__cq_enqueue(cq, (unsigned long)entry)) {
+ if (unlikely(__cq_enqueue(cq, (unsigned long)entry))) {
ret = -1;
goto exit;
}
cq_depth = __cq_get_elem_count(cq);
- if (max_bfs_queue_depth < cq_depth)
+ if (unlikely(max_bfs_queue_depth < cq_depth))
max_bfs_queue_depth = cq_depth;
}
}
--
1.6.0.GIT
On Wed, 2009-07-22 at 20:58 +0800, [email protected] wrote:
> From: Ming Lei <[email protected]>
This patch can use a changelog.
Why is this needed, isn't all that serialized by the graph_lock anyway?
Or are there a few paths where this isn't the case and we're now racy?
> Signed-off-by: Ming Lei <[email protected]>
> ---
> kernel/lockdep.c | 5 +++--
> 1 files changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/lockdep.c b/kernel/lockdep.c
> index 1b1796a..1583439 100644
> --- a/kernel/lockdep.c
> +++ b/kernel/lockdep.c
> @@ -847,7 +847,7 @@ struct circular_queue {
> unsigned int front, rear;
> };
>
> -static struct circular_queue lock_cq;
> +static DEFINE_PER_CPU(struct circular_queue, lock_cq);
>
> unsigned int max_bfs_queue_depth;
>
> @@ -937,7 +937,7 @@ static int __bfs(struct lock_list *source_entry,
> {
> struct lock_list *entry;
> struct list_head *head;
> - struct circular_queue *cq = &lock_cq;
> + struct circular_queue *cq = &get_cpu_var(lock_cq);
> int ret = 1;
>
> if (match(source_entry, data)) {
> @@ -993,6 +993,7 @@ static int __bfs(struct lock_list *source_entry,
> }
> }
> exit:
> + put_cpu_var(lock_cq);
> return ret;
> }
>
On Wed, 2009-07-22 at 20:58 +0800, [email protected] wrote:
> From: Ming Lei <[email protected]>
>
> We still can apply DaveM's generation count optimization to
> BFS, so reintroduce it.
Please elaborate, changelogs are important.
You fail to explain why it still applies and what the gains are.
(I can guess, but your explanation makes sure we're on the same page)
On Wed, 2009-07-22 at 20:58 +0800, [email protected] wrote:
> From: Ming Lei <[email protected]>
Another empty changelog... not good.
Please explain why, and such patches really should carry performance
numbers to justify themselves.
2009/7/22 Peter Zijlstra <[email protected]>:
> On Wed, 2009-07-22 at 20:58 +0800, [email protected] wrote:
>> From: Ming Lei <[email protected]>
>
> This patch can use a changelog.
>
> Why is this needed, isn't all that serialized by the graph_lock anyway?
> Or are there a few paths where this isn't the case and we're now racy?
It is indeed serialized by the graph_lock, but when different CPUs run the
BFS against a single global queue they keep pulling its cachelines back and
forth; a per-cpu variable avoids that. Right?
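Roughly, the pattern the patch switches to is the standard per-cpu idiom.
A minimal illustrative sketch follows (not the full lockdep code;
example_user() is just a hypothetical stand-in for __bfs()):

static DEFINE_PER_CPU(struct circular_queue, lock_cq);

static void example_user(void)
{
	/*
	 * get_cpu_var() disables preemption and returns this CPU's
	 * private instance, so only the owning CPU ever touches the
	 * queue's cachelines.
	 */
	struct circular_queue *cq = &get_cpu_var(lock_cq);

	__cq_init(cq);
	/* ... do the BFS work against cq here ... */

	/* put_cpu_var() re-enables preemption when we are done. */
	put_cpu_var(lock_cq);
}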
Thanks.
--
Lei Ming
2009/7/22 Peter Zijlstra <[email protected]>:
> On Wed, 2009-07-22 at 20:58 +0800, [email protected] wrote:
>> From: Ming Lei <[email protected]>
>>
>> We still can apply DaveM's generation count optimization to
>> BFS, so reintroduce it.
>
> Please elaborate, changelogs are important.
Sorry, I'll resend the patches with more elaborate changelogs.
>
> You fail to explain why it still applies and what the gains are.
>
> (I can guess, but you explaining makes sure we're on the same page)
>
The gains are:

1) we avoid allocating the bfs_accessed array;

2) we remove the

	bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES);

call from each BFS run, which is a little time consuming since
MAX_LOCKDEP_ENTRIES is very large (16384UL).
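For clarity, here is a minimal side-by-side sketch of the two schemes
(simplified from the patch above; the WARN_ON() checks are dropped):

/* Before: every __cq_init() has to clear a 16384-bit bitmap. */
static unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)];

static inline void __cq_init(struct circular_queue *cq)
{
	cq->front = cq->rear = 0;
	bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES);
}

/*
 * After: bumping one counter invalidates every old "accessed" mark,
 * because the marks are stored per lock class and compared against
 * the current generation id.
 */
static unsigned int lockdep_dependency_gen_id;

static inline void __cq_init(struct circular_queue *cq)
{
	cq->front = cq->rear = 0;
	lockdep_dependency_gen_id++;
}

static inline void mark_lock_accessed(struct lock_list *lock,
				      struct lock_list *parent)
{
	lock->parent = parent;
	lock->class->dep_gen_id = lockdep_dependency_gen_id;
}

static inline unsigned long lock_accessed(struct lock_list *lock)
{
	return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}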
--
Lei Ming
On Wed, 2009-07-22 at 21:10 +0800, Ming Lei wrote:
> 2009/7/22 Peter Zijlstra <[email protected]>:
> > On Wed, 2009-07-22 at 20:58 +0800, [email protected] wrote:
> >> From: Ming Lei <[email protected]>
> >
> > This patch can use a changelog.
> >
> > Why is this needed, isn't all that serialized by the graph_lock anyway?
> > Or are there a few paths where this isn't the case and we're now racy?
>
> It is indeed serialized by the graph_lock, but when different CPUs run the
> BFS against a single global queue they keep pulling its cachelines back and
> forth; a per-cpu variable avoids that. Right?
I doubt it'll make a difference, got any numbers to back that up?
2009/7/22 Peter Zijlstra <[email protected]>:
> On Wed, 2009-07-22 at 21:10 +0800, Ming Lei wrote:
>> 2009/7/22 Peter Zijlstra <[email protected]>:
>> > On Wed, 2009-07-22 at 20:58 +0800, [email protected] wrote:
>> >> From: Ming Lei <[email protected]>
>> >
>> > This patch can use a changelog.
>> >
>> > Why is this needed, isn't all that serialized by the graph_lock anyway?
>> > Or are there a few paths where this isn't the case and we're now racy?
>>
>> It is indeed serialized by the graph_lock, but when different CPUs run the
>> BFS against a single global queue they keep pulling its cachelines back and
>> forth; a per-cpu variable avoids that. Right?
>
> I doubt it'll make a difference, got any numbers to back that up?
OK, I'll design a test case to get the numbers.
Thanks.
--
Lei Ming