2018-12-12 22:29:42

by Waiman Long

[permalink] [raw]
Subject: [PATCH] debugobjects: Move printk out of db lock critical sections

The db->lock is a raw spinlock and so the lock hold time is supposed
to be short. This will not be the case when printk() is being involved
in some of the critical sections. In order to avoid the long hold time,
in case some messages need to be printed, the debug_object_is_on_stack()
and debug_print_object() calls are now moved out of those critical
sections.

Holding the db->lock while calling printk() may lead to deadlock if
printk() somehow requires the allocation/freeing of debug object that
happens to be in the same hash bucket or a circular lock dependency
warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.

[ 87.209665] WARNING: possible circular locking dependency detected
[ 87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G W
[ 87.211449] ------------------------------------------------------
[ 87.212405] getty/519 is trying to acquire lock:
[ 87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
[ 87.214343]
[ 87.214343] but task is already holding lock:
[ 87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
[ 87.216260]
[ 87.216260] which lock already depends on the new lock.

This patch was also found to be able to fix a boot hanging problem
when the initramfs image was switched on after a debugobjects splat
from the EFI code.

Signed-off-by: Waiman Long <[email protected]>
---
lib/debugobjects.c | 61 +++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 42 insertions(+), 19 deletions(-)

diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 14afeeb..c30e786 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -375,6 +375,8 @@ static void debug_object_is_on_stack(void *addr, int onstack)
struct debug_bucket *db;
struct debug_obj *obj;
unsigned long flags;
+ bool debug_printobj = false;
+ bool debug_chkstack = false;

fill_pool();

@@ -391,7 +393,7 @@ static void debug_object_is_on_stack(void *addr, int onstack)
debug_objects_oom();
return;
}
- debug_object_is_on_stack(addr, onstack);
+ debug_chkstack = true;
}

switch (obj->state) {
@@ -402,20 +404,25 @@ static void debug_object_is_on_stack(void *addr, int onstack)
break;

case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "init");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "init");
debug_object_fixup(descr->fixup_init, addr, state);
return;

case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "init");
+ debug_printobj = true;
break;
default:
break;
}

raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (debug_chkstack)
+ debug_object_is_on_stack(addr, onstack);
+ if (debug_printobj)
+ debug_print_object(obj, "init");
+
}

/**
@@ -473,6 +480,8 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)

obj = lookup_object(addr, db);
if (obj) {
+ bool debug_printobj = false;
+
switch (obj->state) {
case ODEBUG_STATE_INIT:
case ODEBUG_STATE_INACTIVE:
@@ -481,14 +490,14 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
break;

case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "activate");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "activate");
ret = debug_object_fixup(descr->fixup_activate, addr, state);
return ret ? 0 : -EINVAL;

case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "activate");
+ debug_printobj = true;
ret = -EINVAL;
break;
default:
@@ -496,10 +505,13 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
break;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (debug_printobj)
+ debug_print_object(obj, "activate");
return ret;
}

raw_spin_unlock_irqrestore(&db->lock, flags);
+
/*
* We are here when a static object is activated. We
* let the type specific code confirm whether this is
@@ -531,6 +543,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
struct debug_bucket *db;
struct debug_obj *obj;
unsigned long flags;
+ bool debug_printobj = false;

if (!debug_objects_enabled)
return;
@@ -548,24 +561,27 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
if (!obj->astate)
obj->state = ODEBUG_STATE_INACTIVE;
else
- debug_print_object(obj, "deactivate");
+ debug_printobj = true;
break;

case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "deactivate");
+ debug_printobj = true;
break;
default:
break;
}
- } else {
+ }
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (!obj) {
struct debug_obj o = { .object = addr,
.state = ODEBUG_STATE_NOTAVAILABLE,
.descr = descr };

debug_print_object(&o, "deactivate");
+ } else if (debug_printobj) {
+ debug_print_object(obj, "deactivate");
}
-
- raw_spin_unlock_irqrestore(&db->lock, flags);
}
EXPORT_SYMBOL_GPL(debug_object_deactivate);

@@ -580,6 +596,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
struct debug_bucket *db;
struct debug_obj *obj;
unsigned long flags;
+ bool debug_printobj = false;

if (!debug_objects_enabled)
return;
@@ -599,20 +616,22 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
obj->state = ODEBUG_STATE_DESTROYED;
break;
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "destroy");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "destroy");
debug_object_fixup(descr->fixup_destroy, addr, state);
return;

case ODEBUG_STATE_DESTROYED:
- debug_print_object(obj, "destroy");
+ debug_printobj = true;
break;
default:
break;
}
out_unlock:
raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (debug_printobj)
+ debug_print_object(obj, "destroy");
}
EXPORT_SYMBOL_GPL(debug_object_destroy);

@@ -641,9 +660,9 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr)

switch (obj->state) {
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "free");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "free");
debug_object_fixup(descr->fixup_free, addr, state);
return;
default:
@@ -716,6 +735,7 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
struct debug_bucket *db;
struct debug_obj *obj;
unsigned long flags;
+ bool debug_printobj = false;

if (!debug_objects_enabled)
return;
@@ -731,22 +751,25 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
if (obj->astate == expect)
obj->astate = next;
else
- debug_print_object(obj, "active_state");
+ debug_printobj = true;
break;

default:
- debug_print_object(obj, "active_state");
+ debug_printobj = true;
break;
}
- } else {
+ }
+
+ raw_spin_unlock_irqrestore(&db->lock, flags);
+ if (!obj) {
struct debug_obj o = { .object = addr,
.state = ODEBUG_STATE_NOTAVAILABLE,
.descr = descr };

debug_print_object(&o, "active_state");
+ } else if (debug_printobj) {
+ debug_print_object(obj, "active_state");
}
-
- raw_spin_unlock_irqrestore(&db->lock, flags);
}
EXPORT_SYMBOL_GPL(debug_object_active_state);

@@ -782,10 +805,10 @@ static void __debug_check_no_obj_freed(const void *address, unsigned long size)

switch (obj->state) {
case ODEBUG_STATE_ACTIVE:
- debug_print_object(obj, "free");
descr = obj->descr;
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
+ debug_print_object(obj, "free");
debug_object_fixup(descr->fixup_free,
(void *) oaddr, state);
goto repeat;
--
1.8.3.1



2018-12-12 23:40:16

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On Wed, 12 Dec 2018 17:28:14 -0500 Waiman Long <[email protected]> wrote:

> The db->lock is a raw spinlock and so the lock hold time is supposed
> to be short. This will not be the case when printk() is being involved
> in some of the critical sections. In order to avoid the long hold time,
> in case some messages need to be printed, the debug_object_is_on_stack()
> and debug_print_object() calls are now moved out of those critical
> sections.
>
> Holding the db->lock while calling printk() may lead to deadlock if
> printk() somehow requires the allocation/freeing of debug object that
> happens to be in the same hash bucket or a circular lock dependency
> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>
> [ 87.209665] WARNING: possible circular locking dependency detected
> [ 87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G W
> [ 87.211449] ------------------------------------------------------
> [ 87.212405] getty/519 is trying to acquire lock:
> [ 87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
> [ 87.214343]
> [ 87.214343] but task is already holding lock:
> [ 87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
> [ 87.216260]
> [ 87.216260] which lock already depends on the new lock.
>
> This patch was also found to be able to fix a boot hanging problem
> when the initramfs image was switched on after a debugobjects splat
> from the EFI code.

Patch looks sensible, but I have a nit about the variable names.

> --- a/lib/debugobjects.c
> +++ b/lib/debugobjects.c
> @@ -375,6 +375,8 @@ static void debug_object_is_on_stack(void *addr, int onstack)
> struct debug_bucket *db;
> struct debug_obj *obj;
> unsigned long flags;
> + bool debug_printobj = false;

"debug_printobject" would be better, but this code already intermingles
"obj" and "object".

> + bool debug_chkstack = false;

Not so good. Is it debug_chkstack or debug_checkstk or ...

This file uses "check" consistently so let's not depart from that?
Linux style is to avoid these tricky little abbreviations and to use
full words.

ie, debug_checkstack, please. Better would be debug_check_stack. Or
simply check_stack: the "debug" doesn't add anything useful.



2018-12-13 02:05:15

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On (12/12/18 17:28), Waiman Long wrote:
> The db->lock is a raw spinlock and so the lock hold time is supposed
> to be short. This will not be the case when printk() is being involved
> in some of the critical sections. In order to avoid the long hold time,
> in case some messages need to be printed, the debug_object_is_on_stack()
> and debug_print_object() calls are now moved out of those critical
> sections.
>
> Holding the db->lock while calling printk() may lead to deadlock if
> printk() somehow requires the allocation/freeing of debug object that
> happens to be in the same hash bucket or a circular lock dependency
> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>
> [ 87.209665] WARNING: possible circular locking dependency detected
> [ 87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G W
> [ 87.211449] ------------------------------------------------------
> [ 87.212405] getty/519 is trying to acquire lock:
> [ 87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
> [ 87.214343]
> [ 87.214343] but task is already holding lock:
> [ 87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
> [ 87.216260]
> [ 87.216260] which lock already depends on the new lock.
>
> This patch was also found to be able to fix a boot hanging problem
> when the initramfs image was switched on after a debugobjects splat
> from the EFI code.
>
> Signed-off-by: Waiman Long <[email protected]>

The patch looks good to me. A bit curious if we need to also patch
the self-test part debugobjects - check_results(). That guy still
printk()-s under bucket ->lock.

-ss

2018-12-13 03:00:52

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On (12/12/18 17:28), Waiman Long wrote:
>
> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>

That link shows an empty page, lkml.org is quite unstable. Let's
use this one instead:

https://lore.kernel.org/lkml/20181211091154.GL23332@shao2-debian/T/#u

-ss

2018-12-13 04:36:56

by Dmitry Safonov

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

Hi Waiman,

On 12/12/18 10:28 PM, Waiman Long wrote:
> The db->lock is a raw spinlock and so the lock hold time is supposed
> to be short. This will not be the case when printk() is being involved
> in some of the critical sections. In order to avoid the long hold time,
> in case some messages need to be printed, the debug_object_is_on_stack()
> and debug_print_object() calls are now moved out of those critical
> sections.
>
> Holding the db->lock while calling printk() may lead to deadlock if
> printk() somehow requires the allocation/freeing of debug object that
> happens to be in the same hash bucket or a circular lock dependency
> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>
> [ 87.209665] WARNING: possible circular locking dependency detected
> [ 87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G W
> [ 87.211449] ------------------------------------------------------
> [ 87.212405] getty/519 is trying to acquire lock:
> [ 87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
> [ 87.214343]
> [ 87.214343] but task is already holding lock:
> [ 87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
> [ 87.216260]
> [ 87.216260] which lock already depends on the new lock.
>
> This patch was also found to be able to fix a boot hanging problem
> when the initramfs image was switched on after a debugobjects splat
> from the EFI code.
>
> Signed-off-by: Waiman Long <[email protected]>
> ---

I've tried to review it and found minor issues like missed
debug_object_is_on_stack() for initializing already active object.

But than I come to opinion that it's just generally unsafe:
debug_obj life-time is protected by bucket's spin_lock.
Check the conditions when free_object() is being called.

So, I resulted by finishing my stack-copy version which I've started
yesterday on test robot report when I didn't know about your patch.

I've sent it now concurrently.

Thanks,
Dima

2018-12-13 05:20:09

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On (12/13/18 04:35), Dmitry Safonov wrote:
> I've tried to review it and found minor issues like missed
> debug_object_is_on_stack() for initializing already active object.
>
> But than I come to opinion that it's just generally unsafe:
> debug_obj life-time is protected by bucket's spin_lock.
> Check the conditions when free_object() is being called.

I thought about it for a second, but couldn't figure out if this race
was real.

E.g. uart case - if there are two paths which concurrently free and
access debug object, then the same race condition should exist for
the xmit.buf page. debug object's lifespan should be the same as the
lifespan of, ummm, whatever the thing that debug object is attached
to? I'm surely missing something here.

-ss

2018-12-13 05:56:17

by Dmitry Safonov

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On Thu, 13 Dec 2018 at 05:19, Sergey Senozhatsky
<[email protected]> wrote:
>
> On (12/13/18 04:35), Dmitry Safonov wrote:
> > I've tried to review it and found minor issues like missed
> > debug_object_is_on_stack() for initializing already active object.
> >
> > But than I come to opinion that it's just generally unsafe:
> > debug_obj life-time is protected by bucket's spin_lock.
> > Check the conditions when free_object() is being called.
>
> I thought about it for a second, but couldn't figure out if this race
> was real.
>
> E.g. uart case - if there are two paths which concurrently free and
> access debug object, then the same race condition should exist for
> the xmit.buf page. debug object's lifespan should be the same as the
> lifespan of, ummm, whatever the thing that debug object is attached
> to? I'm surely missing something here.

I think the point of debugobjects is to reduce the damage caused
by an improper life-time of objects.

IIUC:
1. alloc_page()
2. dynamically allocate a workqueue on a new page
3. schedule some work
4. until the work has run free_page()

So, wroqueue fixup upon freeing should cancel the work TBD.

But maybe I've misunderstood something..
And it's 6 am in my timezone, I'll catch some sleep and
will glance this with some morning coffee.

Thanks,
Dmitry

2018-12-13 09:23:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On Thu, Dec 13, 2018 at 11:59:40AM +0900, Sergey Senozhatsky wrote:
> On (12/12/18 17:28), Waiman Long wrote:
> >
> > warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
> >
>
> That link shows an empty page, lkml.org is quite unstable. Let's
> use this one instead:
>
> https://lore.kernel.org/lkml/20181211091154.GL23332@shao2-debian/T/#u

Better yet:

https://lkml.kernel.org/r/20181211091154.GL23332@shao2-debian

2018-12-13 10:22:09

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On (12/13/18 05:54), Dmitry Safonov wrote:
> > I thought about it for a second, but couldn't figure out if this race
> > was real.
> >
> > E.g. uart case - if there are two paths which concurrently free and
> > access debug object, then the same race condition should exist for
> > the xmit.buf page. debug object's lifespan should be the same as the
> > lifespan of, ummm, whatever the thing that debug object is attached
> > to? I'm surely missing something here.
>
> I think the point of debugobjects is to reduce the damage caused
> by an improper life-time of objects.
>
> IIUC:
> 1. alloc_page()
> 2. dynamically allocate a workqueue on a new page
> 3. schedule some work
> 4. until the work has run free_page()

Hmm, if we free() that page then __debug_check_no_obj_freed() should ->fixup
all ODEBUG_STATE_ACTIVE objects first. Maybe we are talking about different
scenarios.

-ss

2018-12-13 20:02:32

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On 12/12/2018 09:03 PM, Sergey Senozhatsky wrote:
> On (12/12/18 17:28), Waiman Long wrote:
>> The db->lock is a raw spinlock and so the lock hold time is supposed
>> to be short. This will not be the case when printk() is being involved
>> in some of the critical sections. In order to avoid the long hold time,
>> in case some messages need to be printed, the debug_object_is_on_stack()
>> and debug_print_object() calls are now moved out of those critical
>> sections.
>>
>> Holding the db->lock while calling printk() may lead to deadlock if
>> printk() somehow requires the allocation/freeing of debug object that
>> happens to be in the same hash bucket or a circular lock dependency
>> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>>
>> [ 87.209665] WARNING: possible circular locking dependency detected
>> [ 87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G W
>> [ 87.211449] ------------------------------------------------------
>> [ 87.212405] getty/519 is trying to acquire lock:
>> [ 87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
>> [ 87.214343]
>> [ 87.214343] but task is already holding lock:
>> [ 87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
>> [ 87.216260]
>> [ 87.216260] which lock already depends on the new lock.
>>
>> This patch was also found to be able to fix a boot hanging problem
>> when the initramfs image was switched on after a debugobjects splat
>> from the EFI code.
>>
>> Signed-off-by: Waiman Long <[email protected]>
> The patch looks good to me. A bit curious if we need to also patch
> the self-test part debugobjects - check_results(). That guy still
> printk()-s under bucket ->lock.
>
> -ss

Yes, I should have changed those in the check_results() as well.

-Longman


2018-12-13 22:02:31

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On 12/12/2018 06:39 PM, Andrew Morton wrote:
> On Wed, 12 Dec 2018 17:28:14 -0500 Waiman Long <[email protected]> wrote:
>
>> The db->lock is a raw spinlock and so the lock hold time is supposed
>> to be short. This will not be the case when printk() is being involved
>> in some of the critical sections. In order to avoid the long hold time,
>> in case some messages need to be printed, the debug_object_is_on_stack()
>> and debug_print_object() calls are now moved out of those critical
>> sections.
>>
>> Holding the db->lock while calling printk() may lead to deadlock if
>> printk() somehow requires the allocation/freeing of debug object that
>> happens to be in the same hash bucket or a circular lock dependency
>> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>>
>> [ 87.209665] WARNING: possible circular locking dependency detected
>> [ 87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G W
>> [ 87.211449] ------------------------------------------------------
>> [ 87.212405] getty/519 is trying to acquire lock:
>> [ 87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
>> [ 87.214343]
>> [ 87.214343] but task is already holding lock:
>> [ 87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
>> [ 87.216260]
>> [ 87.216260] which lock already depends on the new lock.
>>
>> This patch was also found to be able to fix a boot hanging problem
>> when the initramfs image was switched on after a debugobjects splat
>> from the EFI code.
> Patch looks sensible, but I have a nit about the variable names.
>
>> --- a/lib/debugobjects.c
>> +++ b/lib/debugobjects.c
>> @@ -375,6 +375,8 @@ static void debug_object_is_on_stack(void *addr, int onstack)
>> struct debug_bucket *db;
>> struct debug_obj *obj;
>> unsigned long flags;
>> + bool debug_printobj = false;
> "debug_printobject" would be better, but this code already intermingles
> "obj" and "object".
>
>> + bool debug_chkstack = false;
> Not so good. Is it debug_chkstack or debug_checkstk or ...
>
> This file uses "check" consistently so let's not depart from that?
> Linux style is to avoid these tricky little abbreviations and to use
> full words.
>
> ie, debug_checkstack, please. Better would be debug_check_stack. Or
> simply check_stack: the "debug" doesn't add anything useful.
>
>
Thanks for the review. I have eliminated debug_printobj in the new v2 patch.

Cheers,
Longman


2018-12-13 22:11:51

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On 12/12/2018 11:35 PM, Dmitry Safonov wrote:
> Hi Waiman,
>
> On 12/12/18 10:28 PM, Waiman Long wrote:
>> The db->lock is a raw spinlock and so the lock hold time is supposed
>> to be short. This will not be the case when printk() is being involved
>> in some of the critical sections. In order to avoid the long hold time,
>> in case some messages need to be printed, the debug_object_is_on_stack()
>> and debug_print_object() calls are now moved out of those critical
>> sections.
>>
>> Holding the db->lock while calling printk() may lead to deadlock if
>> printk() somehow requires the allocation/freeing of debug object that
>> happens to be in the same hash bucket or a circular lock dependency
>> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>>
>> [ 87.209665] WARNING: possible circular locking dependency detected
>> [ 87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G W
>> [ 87.211449] ------------------------------------------------------
>> [ 87.212405] getty/519 is trying to acquire lock:
>> [ 87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
>> [ 87.214343]
>> [ 87.214343] but task is already holding lock:
>> [ 87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
>> [ 87.216260]
>> [ 87.216260] which lock already depends on the new lock.
>>
>> This patch was also found to be able to fix a boot hanging problem
>> when the initramfs image was switched on after a debugobjects splat
>> from the EFI code.
>>
>> Signed-off-by: Waiman Long <[email protected]>
>> ---
> I've tried to review it and found minor issues like missed
> debug_object_is_on_stack() for initializing already active object.
>
> But than I come to opinion that it's just generally unsafe:
> debug_obj life-time is protected by bucket's spin_lock.
> Check the conditions when free_object() is being called.

The bucket lock is for protecting the insertion and deletion of
debug_obj to/from the bucket list as well as searching within the bucket
list. It has nothing to do with the life time of the debug_obj itself.

Cheers,
Longman

2018-12-14 18:09:15

by Dmitry Safonov

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On 12/13/18 10:10 PM, Waiman Long wrote:
> The bucket lock is for protecting the insertion and deletion of
> debug_obj to/from the bucket list as well as searching within the bucket
> list. It has nothing to do with the life time of the debug_obj itself.

The bucket lock also protects lookups on a bucket.

1. Imagine, you have object in ODEBUG_STATE_DESTROYED.
If you will try debug_object_activate() with this patch, it will debug
print object outside of bucket lock, which means that
debug_check_no_obj_freed() may concurrently fixup/free object and
meta-data on another CPU.

I don't see this state is being used in many places around the kernel,
except selftest and i915 driver.

2. If you try to deactivate already non-active object - you will have
debug print outside of bucket lock. The other CPU can fixup/free page
with this object concurrently.

(before your patch the lookup would fail and no printing)

So, I might be mistaken, I'm mostly worried about dereferencing the
descriptor inside of the object during printing.
And in my opinion, i.e., there was a reason to save `descr` before
releasing the bucket lock here:
https://elixir.bootlin.com/linux/latest/source/lib/debugobjects.c#L789

Anyway, I see that your patch is already in -mm tree and Thomas seems to
be fine with the change so whatever %)

Thanks,
Dima

2018-12-14 18:22:46

by Waiman Long

[permalink] [raw]
Subject: Re: [PATCH] debugobjects: Move printk out of db lock critical sections

On 12/14/2018 01:06 PM, Dmitry Safonov wrote:
> On 12/13/18 10:10 PM, Waiman Long wrote:
>> The bucket lock is for protecting the insertion and deletion of
>> debug_obj to/from the bucket list as well as searching within the bucket
>> list. It has nothing to do with the life time of the debug_obj itself.
> The bucket lock also protects lookups on a bucket.

Yes, that is what I meant by "searching".

>
> 1. Imagine, you have object in ODEBUG_STATE_DESTROYED.
> If you will try debug_object_activate() with this patch, it will debug
> print object outside of bucket lock, which means that
> debug_check_no_obj_freed() may concurrently fixup/free object and
> meta-data on another CPU.
>
> I don't see this state is being used in many places around the kernel,
> except selftest and i915 driver.
>
> 2. If you try to deactivate already non-active object - you will have
> debug print outside of bucket lock. The other CPU can fixup/free page
> with this object concurrently.
>
> (before your patch the lookup would fail and no printing)
>
> So, I might be mistaken, I'm mostly worried about dereferencing the
> descriptor inside of the object during printing.
> And in my opinion, i.e., there was a reason to save `descr` before
> releasing the bucket lock here:
> https://elixir.bootlin.com/linux/latest/source/lib/debugobjects.c#L789
>
> Anyway, I see that your patch is already in -mm tree and Thomas seems to
> be fine with the change so whatever %)

I also thought about whether we should save descr before unlock. Anyway,
we can always send a follow-up patch to make further change if it is the
right thing to do.

The debugobjects code is enabled in a debug kernel and never in a kernel
for production use. The printk should only be triggered if there is a
bug in the code which the debug object is tracking.

Thanks,
Longman