2009-07-01 07:53:51

by Ingo Molnar

[permalink] [raw]
Subject: [PATCH] kmemleak: Fix scheduling-while-atomic bug


* Linux Kernel Mailing List <[email protected]> wrote:

> Gitweb: http://git.kernel.org/linus/acf4968ec9dea49387ca8b3d36dfaa0850bdb2d5
> Commit: acf4968ec9dea49387ca8b3d36dfaa0850bdb2d5
> Parent: 4698c1f2bbe44ce852ef1a6716973c1f5401a4c4
> Author: Catalin Marinas <[email protected]>
> AuthorDate: Fri Jun 26 17:38:29 2009 +0100
> Committer: Catalin Marinas <[email protected]>
> CommitDate: Fri Jun 26 17:38:29 2009 +0100
>
> kmemleak: Slightly change the policy on newly allocated objects

I think one of the kmemleak fixes that went upstream yesterday
caused the following scheduling-while-holding-the-tasklist-lock
regression/crash on x86:

BUG: sleeping function called from invalid context at mm/kmemleak.c:795
in_atomic(): 1, irqs_disabled(): 0, pid: 1737, name: kmemleak
2 locks held by kmemleak/1737:
#0: (scan_mutex){......}, at: [<c10c4376>] kmemleak_scan_thread+0x45/0x86
#1: (tasklist_lock){......}, at: [<c10c3bb4>] kmemleak_scan+0x1a9/0x39c
Pid: 1737, comm: kmemleak Not tainted 2.6.31-rc1-tip #59266
Call Trace:
[<c105ac0f>] ? __debug_show_held_locks+0x1e/0x20
[<c102e490>] __might_sleep+0x10a/0x111
[<c10c38d5>] scan_yield+0x17/0x3b
[<c10c3970>] scan_block+0x39/0xd4
[<c10c3bc6>] kmemleak_scan+0x1bb/0x39c
[<c10c4331>] ? kmemleak_scan_thread+0x0/0x86
[<c10c437b>] kmemleak_scan_thread+0x4a/0x86
[<c104d73e>] kthread+0x6e/0x73
[<c104d6d0>] ? kthread+0x0/0x73
[<c100959f>] kernel_thread_helper+0x7/0x10
kmemleak: 834 new suspected memory leaks (see /sys/kernel/debug/kmemleak)

The bit causing it is highly dubious:

static void scan_yield(void)
{
might_sleep();

if (time_is_before_eq_jiffies(next_scan_yield)) {
schedule();
next_scan_yield = jiffies + jiffies_scan_yield;
}
}

It is called deep inside the codepath and in a conditional way, and
that is what crapped up when one of the new scan_block() uses grew a
tasklist_lock dependency. Also, we dont need another 'yield'
primitive in the MM code, we have priorities and other scheduling
mechanisms to throttle background scanning just fine.

The minimal fix below removes scan_yield() and adds a cond_resched()
to the outmost (safe) place of the scanning thread. This solves the
regression.

Ingo

----------------->

>From 5ba1a8143c502f40b976a0ea1df5e5a10056fcc6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <[email protected]>
Date: Wed, 1 Jul 2009 09:43:53 +0200
Subject: [PATCH] kmemleak: Fix scheduling-while-atomic bug

One of the kmemleak changes caused the following
scheduling-while-holding-the-tasklist-lock regression on x86:

BUG: sleeping function called from invalid context at mm/kmemleak.c:795
in_atomic(): 1, irqs_disabled(): 0, pid: 1737, name: kmemleak
2 locks held by kmemleak/1737:
#0: (scan_mutex){......}, at: [<c10c4376>] kmemleak_scan_thread+0x45/0x86
#1: (tasklist_lock){......}, at: [<c10c3bb4>] kmemleak_scan+0x1a9/0x39c
Pid: 1737, comm: kmemleak Not tainted 2.6.31-rc1-tip #59266
Call Trace:
[<c105ac0f>] ? __debug_show_held_locks+0x1e/0x20
[<c102e490>] __might_sleep+0x10a/0x111
[<c10c38d5>] scan_yield+0x17/0x3b
[<c10c3970>] scan_block+0x39/0xd4
[<c10c3bc6>] kmemleak_scan+0x1bb/0x39c
[<c10c4331>] ? kmemleak_scan_thread+0x0/0x86
[<c10c437b>] kmemleak_scan_thread+0x4a/0x86
[<c104d73e>] kthread+0x6e/0x73
[<c104d6d0>] ? kthread+0x0/0x73
[<c100959f>] kernel_thread_helper+0x7/0x10
kmemleak: 834 new suspected memory leaks (see /sys/kernel/debug/kmemleak)

The bit causing it is highly dubious:

static void scan_yield(void)
{
might_sleep();

if (time_is_before_eq_jiffies(next_scan_yield)) {
schedule();
next_scan_yield = jiffies + jiffies_scan_yield;
}
}

It called deep inside the codepath and in a conditional way,
and that is what crapped up when one of the new scan_block()
uses grew a tasklist_lock dependency.

This minimal patch removes that yielding stuff and adds the
proper cond_resched().

The background scanning thread could probably also be reniced
to +10.

Signed-off-by: Ingo Molnar <[email protected]>
---
mm/kmemleak.c | 31 +------------------------------
1 files changed, 1 insertions(+), 30 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index eeece2d..e766e1d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -105,7 +105,6 @@
#define MAX_TRACE 16 /* stack trace length */
#define REPORTS_NR 50 /* maximum number of reported leaks */
#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
-#define MSECS_SCAN_YIELD 10 /* CPU yielding period */
#define SECS_FIRST_SCAN 60 /* delay before the first scan */
#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */

@@ -186,10 +185,7 @@ static atomic_t kmemleak_error = ATOMIC_INIT(0);
static unsigned long min_addr = ULONG_MAX;
static unsigned long max_addr;

-/* used for yielding the CPU to other tasks during scanning */
-static unsigned long next_scan_yield;
static struct task_struct *scan_thread;
-static unsigned long jiffies_scan_yield;
/* used to avoid reporting of recently allocated objects */
static unsigned long jiffies_min_age;
static unsigned long jiffies_last_scan;
@@ -786,21 +782,6 @@ void kmemleak_no_scan(const void *ptr)
EXPORT_SYMBOL(kmemleak_no_scan);

/*
- * Yield the CPU so that other tasks get a chance to run. The yielding is
- * rate-limited to avoid excessive number of calls to the schedule() function
- * during memory scanning.
- */
-static void scan_yield(void)
-{
- might_sleep();
-
- if (time_is_before_eq_jiffies(next_scan_yield)) {
- schedule();
- next_scan_yield = jiffies + jiffies_scan_yield;
- }
-}
-
-/*
* Memory scanning is a long process and it needs to be interruptable. This
* function checks whether such interrupt condition occured.
*/
@@ -840,15 +821,6 @@ static void scan_block(void *_start, void *_end,
if (scan_should_stop())
break;

- /*
- * When scanning a memory block with a corresponding
- * kmemleak_object, the CPU yielding is handled in the calling
- * code since it holds the object->lock to avoid the block
- * freeing.
- */
- if (!scanned)
- scan_yield();
-
object = find_and_get_object(pointer, 1);
if (!object)
continue;
@@ -1014,7 +986,7 @@ static void kmemleak_scan(void)
*/
object = list_entry(gray_list.next, typeof(*object), gray_list);
while (&object->gray_list != &gray_list) {
- scan_yield();
+ cond_resched();

/* may add new objects to the list */
if (!scan_should_stop())
@@ -1385,7 +1357,6 @@ void __init kmemleak_init(void)
int i;
unsigned long flags;

- jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD);
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);


2009-07-01 08:10:32

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug

Hi Ingo,

On Wed, Jul 1, 2009 at 10:53 AM, Ingo Molnar<[email protected]> wrote:
> From 5ba1a8143c502f40b976a0ea1df5e5a10056fcc6 Mon Sep 17 00:00:00 2001
> From: Ingo Molnar <[email protected]>
> Date: Wed, 1 Jul 2009 09:43:53 +0200
> Subject: [PATCH] kmemleak: Fix scheduling-while-atomic bug
>
> One of the kmemleak changes caused the following
> scheduling-while-holding-the-tasklist-lock regression on x86:
>
> BUG: sleeping function called from invalid context at mm/kmemleak.c:795
> in_atomic(): 1, irqs_disabled(): 0, pid: 1737, name: kmemleak
> 2 locks held by kmemleak/1737:
> ?#0: ?(scan_mutex){......}, at: [<c10c4376>] kmemleak_scan_thread+0x45/0x86
> ?#1: ?(tasklist_lock){......}, at: [<c10c3bb4>] kmemleak_scan+0x1a9/0x39c
> Pid: 1737, comm: kmemleak Not tainted 2.6.31-rc1-tip #59266
> Call Trace:
> ?[<c105ac0f>] ? __debug_show_held_locks+0x1e/0x20
> ?[<c102e490>] __might_sleep+0x10a/0x111
> ?[<c10c38d5>] scan_yield+0x17/0x3b
> ?[<c10c3970>] scan_block+0x39/0xd4
> ?[<c10c3bc6>] kmemleak_scan+0x1bb/0x39c
> ?[<c10c4331>] ? kmemleak_scan_thread+0x0/0x86
> ?[<c10c437b>] kmemleak_scan_thread+0x4a/0x86
> ?[<c104d73e>] kthread+0x6e/0x73
> ?[<c104d6d0>] ? kthread+0x0/0x73
> ?[<c100959f>] kernel_thread_helper+0x7/0x10
> kmemleak: 834 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>
> The bit causing it is highly dubious:
>
> static void scan_yield(void)
> {
> ? ? ? ?might_sleep();
>
> ? ? ? ?if (time_is_before_eq_jiffies(next_scan_yield)) {
> ? ? ? ? ? ? ? ?schedule();
> ? ? ? ? ? ? ? ?next_scan_yield = jiffies + jiffies_scan_yield;
> ? ? ? ?}
> }
>
> It called deep inside the codepath and in a conditional way,
> and that is what crapped up when one of the new scan_block()
> uses grew a tasklist_lock dependency.
>
> This minimal patch removes that yielding stuff and adds the
> proper cond_resched().
>
> The background scanning thread could probably also be reniced
> to +10.
>
> Signed-off-by: Ingo Molnar <[email protected]>

Looks good to me, thanks!

Acked-by: Pekka Enberg <[email protected]>

2009-07-01 09:19:26

by Catalin Marinas

[permalink] [raw]
Subject: Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug

On Wed, 2009-07-01 at 09:53 +0200, Ingo Molnar wrote:
> * Linux Kernel Mailing List <[email protected]> wrote:
>
> > Gitweb: http://git.kernel.org/linus/acf4968ec9dea49387ca8b3d36dfaa0850bdb2d5
> > Commit: acf4968ec9dea49387ca8b3d36dfaa0850bdb2d5
> > Parent: 4698c1f2bbe44ce852ef1a6716973c1f5401a4c4
> > Author: Catalin Marinas <[email protected]>
> > AuthorDate: Fri Jun 26 17:38:29 2009 +0100
> > Committer: Catalin Marinas <[email protected]>
> > CommitDate: Fri Jun 26 17:38:29 2009 +0100
> >
> > kmemleak: Slightly change the policy on newly allocated objects
>
> I think one of the kmemleak fixes that went upstream yesterday
> caused the following scheduling-while-holding-the-tasklist-lock
> regression/crash on x86:

Thanks for the patch. The bug was always there, only that the task stack
scanning is now enabled by default (and I probably have a small number
of tasks that the rescheduling didn't happen during stack scanning).

> The minimal fix below removes scan_yield() and adds a cond_resched()
> to the outmost (safe) place of the scanning thread. This solves the
> regression.

With CONFIG_PREEMPT disabled it won't reschedule during the bss scanning
but I don't see this as a real issue (task stacks scanning probably
takes longer anyway).

> The background scanning thread could probably also be reniced
> to +10.

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e766e1d..6006553 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1039,6 +1039,7 @@ static int kmemleak_scan_thread(void *arg)
static int first_run = 1;

pr_info("Automatic memory scanning thread started\n");
+ set_user_nice(current, 10);

/*
* Wait before the first scan to allow the system to fully initialize.

--
Catalin

2009-07-01 09:30:28

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug


* Catalin Marinas <[email protected]> wrote:

> > The minimal fix below removes scan_yield() and adds a
> > cond_resched() to the outmost (safe) place of the scanning
> > thread. This solves the regression.
>
> With CONFIG_PREEMPT disabled it won't reschedule during the bss
> scanning but I don't see this as a real issue (task stacks
> scanning probably takes longer anyway).

Yeah. I suspect one more cond_resched() could be added - i just
didnt see an obvious place for it, given that scan_block() is being
called with asymetric held-locks contexts.

Ingo

2009-07-01 09:47:23

by Catalin Marinas

[permalink] [raw]
Subject: Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug

On Wed, 2009-07-01 at 11:30 +0200, Ingo Molnar wrote:
> * Catalin Marinas <[email protected]> wrote:
>
> > > The minimal fix below removes scan_yield() and adds a
> > > cond_resched() to the outmost (safe) place of the scanning
> > > thread. This solves the regression.
> >
> > With CONFIG_PREEMPT disabled it won't reschedule during the bss
> > scanning but I don't see this as a real issue (task stacks
> > scanning probably takes longer anyway).
>
> Yeah. I suspect one more cond_resched() could be added - i just
> didnt see an obvious place for it, given that scan_block() is being
> called with asymetric held-locks contexts.

Yes, scan_block shouldn't call cond_resched(). The code is cleaner if
functions don't have too many side-effects. I can see about 1 sec of bss
scanning on an ARM board but with processor at < 500MHz and slow memory
system. On a standard x86 systems BSS scanning may not be noticeable
(and I think PREEMPT enabling is quite common these days).

Since we are at locking, I just noticed this on my x86 laptop when
running cat /sys/kernel/debug/kmemleak (I haven't got it on an ARM
board):

================================================
[ BUG: lock held when returning to user space! ]
------------------------------------------------
cat/3687 is leaving the kernel with locks still held!
1 lock held by cat/3687:
#0: (scan_mutex){+.+.+.}, at: [<c01e0c5c>] kmemleak_open+0x3c/0x70

kmemleak_open() acquires scan_mutex and unconditionally releases it in
kmemleak_release(). The mutex seems to be released as a subsequent
acquiring works fine.

Is this caused just because cat may have exited without closing the file
descriptor (which should be done automatically anyway)?

Thanks.

--
Catalin

2009-07-01 11:04:53

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug


* Catalin Marinas <[email protected]> wrote:

> On Wed, 2009-07-01 at 11:30 +0200, Ingo Molnar wrote:
> > * Catalin Marinas <[email protected]> wrote:
> >
> > > > The minimal fix below removes scan_yield() and adds a
> > > > cond_resched() to the outmost (safe) place of the scanning
> > > > thread. This solves the regression.
> > >
> > > With CONFIG_PREEMPT disabled it won't reschedule during the bss
> > > scanning but I don't see this as a real issue (task stacks
> > > scanning probably takes longer anyway).
> >
> > Yeah. I suspect one more cond_resched() could be added - i just
> > didnt see an obvious place for it, given that scan_block() is being
> > called with asymetric held-locks contexts.
>
> Yes, scan_block shouldn't call cond_resched(). The code is cleaner if
> functions don't have too many side-effects. I can see about 1 sec of bss
> scanning on an ARM board but with processor at < 500MHz and slow memory
> system. On a standard x86 systems BSS scanning may not be noticeable
> (and I think PREEMPT enabling is quite common these days).
>
> Since we are at locking, I just noticed this on my x86 laptop when
> running cat /sys/kernel/debug/kmemleak (I haven't got it on an ARM
> board):
>
> ================================================
> [ BUG: lock held when returning to user space! ]
> ------------------------------------------------
> cat/3687 is leaving the kernel with locks still held!
> 1 lock held by cat/3687:
> #0: (scan_mutex){+.+.+.}, at: [<c01e0c5c>] kmemleak_open+0x3c/0x70
>
> kmemleak_open() acquires scan_mutex and unconditionally releases
> it in kmemleak_release(). The mutex seems to be released as a
> subsequent acquiring works fine.
>
> Is this caused just because cat may have exited without closing
> the file descriptor (which should be done automatically anyway)?

This lockdep warning has a 0% false positives track record so far:
all previous cases it triggered showed some real (and fatal) bug in
the underlying code.

The above one probably means scan_mutex is leaked out of a /proc
syscall - that would be a bug in kmemleak.

Ingo

2009-07-02 09:48:57

by Catalin Marinas

[permalink] [raw]
Subject: Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug

On Wed, 2009-07-01 at 11:30 +0200, Ingo Molnar wrote:
> * Catalin Marinas <[email protected]> wrote:
>
> > > The minimal fix below removes scan_yield() and adds a
> > > cond_resched() to the outmost (safe) place of the scanning
> > > thread. This solves the regression.
> >
> > With CONFIG_PREEMPT disabled it won't reschedule during the bss
> > scanning but I don't see this as a real issue (task stacks
> > scanning probably takes longer anyway).
>
> Yeah. I suspect one more cond_resched() could be added - i just
> didnt see an obvious place for it, given that scan_block() is being
> called with asymetric held-locks contexts.

Now that your patch was merged, I propose adding a few more
cond_resched() calls, useful for the !PREEMPT case:


kmemleak: Add more cond_resched() calls in the scanning thread

From: Catalin Marinas <[email protected]>

Following recent patch to no longer reschedule in the scan_block()
function, we need a few more cond_resched() calls in the kmemleak_scan()
function (useful if PREEMPT is disabled).

Signed-off-by: Catalin Marinas <[email protected]>
---
mm/kmemleak.c | 7 ++++++-
1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 013f188..b4f675e 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -932,13 +932,16 @@ static void kmemleak_scan(void)

/* data/bss scanning */
scan_block(_sdata, _edata, NULL);
+ cond_resched();
scan_block(__bss_start, __bss_stop, NULL);

#ifdef CONFIG_SMP
/* per-cpu sections scanning */
- for_each_possible_cpu(i)
+ for_each_possible_cpu(i) {
+ cond_resched();
scan_block(__per_cpu_start + per_cpu_offset(i),
__per_cpu_end + per_cpu_offset(i), NULL);
+ }
#endif

/*
@@ -960,6 +963,7 @@ static void kmemleak_scan(void)
/* only scan if page is in use */
if (page_count(page) == 0)
continue;
+ cond_resched();
scan_block(page, page + 1, NULL);
}
}
@@ -969,6 +973,7 @@ static void kmemleak_scan(void)
* not enabled by default.
*/
if (kmemleak_stack_scan) {
+ cond_resched();
read_lock(&tasklist_lock);
for_each_process(task)
scan_block(task_stack_page(task),


--
Catalin

2009-07-02 12:49:02

by Catalin Marinas

[permalink] [raw]
Subject: Exiting with locks still held (was Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug)

Hi Ingo,

On Wed, 2009-07-01 at 13:04 +0200, Ingo Molnar wrote:
> * Catalin Marinas <[email protected]> wrote:
> > Since we are at locking, I just noticed this on my x86 laptop when
> > running cat /sys/kernel/debug/kmemleak (I haven't got it on an ARM
> > board):
> >
> > ================================================
> > [ BUG: lock held when returning to user space! ]
> > ------------------------------------------------
> > cat/3687 is leaving the kernel with locks still held!
> > 1 lock held by cat/3687:
> > #0: (scan_mutex){+.+.+.}, at: [<c01e0c5c>] kmemleak_open+0x3c/0x70
> >
> > kmemleak_open() acquires scan_mutex and unconditionally releases
> > it in kmemleak_release(). The mutex seems to be released as a
> > subsequent acquiring works fine.
> >
> > Is this caused just because cat may have exited without closing
> > the file descriptor (which should be done automatically anyway)?
>
> This lockdep warning has a 0% false positives track record so far:
> all previous cases it triggered showed some real (and fatal) bug in
> the underlying code.

In this particular case, there is no fatal problem as the mutex is
released shortly after this message.

> The above one probably means scan_mutex is leaked out of a /proc
> syscall - that would be a bug in kmemleak.

It could be but I can't figure out a solution. If there is only one task
opening and closing the kmemleak file, everything is fine. In
combination with shell piping I think I get the kmemleak file descriptor
released from a different task than the one that opened it.

For example, the badly written code below opens kmemleak and acquires
the scan_mutex in the parent task but releases it in the child (it needs
a few tries to trigger it). With waitpid() in parent everything is fine.

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
int fd = open("/sys/kernel/debug/kmemleak", O_RDONLY);

printf("fd = %d\n", fd);
if (fd < 0)
return 2;

if (!fork()) {
/* child */
sleep(2);
close(fd);
printf("fd closed in child\n");
}

return 0;
}

Running this gives (the ### lines are printed in the
kmemleak_open/release functions):

# ./cat-kmemleak
### kmemleak_open current->pid = 1409
fd = 3
=====================================
[ BUG: lock held at task exit time! ]
-------------------------------------
cat-kmemleak/1409 is exiting with locks still held!
1 lock held by cat-kmemleak/1409:
#0: (scan_mutex){+.+.+.}, at: [<c00662b1>] kmemleak_open+0x31/0x68

stack backtrace:
[<c0024025>] (unwind_backtrace+0x1/0x80) from [<c01cddd7>] (dump_stack+0xb/0xc)
[<c01cddd7>] (dump_stack+0xb/0xc) from [<c0043d2d>] (debug_check_no_locks_held+0x49/0x64)
[<c0043d2d>] (debug_check_no_locks_held+0x49/0x64) from [<c0031423>] (do_exit+0x3fb/0x43c)
[<c0031423>] (do_exit+0x3fb/0x43c) from [<c00314c5>] (do_group_exit+0x61/0x80)
[<c00314c5>] (do_group_exit+0x61/0x80) from [<c00314f3>] (sys_exit_group+0xf/0x14)
[<c00314f3>] (sys_exit_group+0xf/0x14) from [<c001fc41>] (ret_fast_syscall+0x1/0x40)

### kmemleak_release current->pid = 1410
fd closed in child


Any suggestions? Thanks.

--
Catalin

2009-07-02 12:54:14

by Pekka Enberg

[permalink] [raw]
Subject: Re: Exiting with locks still held (was Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug)

Hi Catalin,

On Thu, Jul 2, 2009 at 3:48 PM, Catalin Marinas<[email protected]> wrote:
> It could be but I can't figure out a solution. If there is only one task
> opening and closing the kmemleak file, everything is fine. In
> combination with shell piping I think I get the kmemleak file descriptor
> released from a different task than the one that opened it.
>
> For example, the badly written code below opens kmemleak and acquires
> the scan_mutex in the parent task but releases it in the child (it needs
> a few tries to trigger it). With waitpid() in parent everything is fine.
>
> #include <stdio.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <sys/wait.h>
>
> int main(void)
> {
> ? ? ? ?int fd = open("/sys/kernel/debug/kmemleak", O_RDONLY);
>
> ? ? ? ?printf("fd = %d\n", fd);
> ? ? ? ?if (fd < 0)
> ? ? ? ? ? ? ? ?return 2;
>
> ? ? ? ?if (!fork()) {
> ? ? ? ? ? ? ? ?/* child */
> ? ? ? ? ? ? ? ?sleep(2);
> ? ? ? ? ? ? ? ?close(fd);
> ? ? ? ? ? ? ? ?printf("fd closed in child\n");
> ? ? ? ?}
>
> ? ? ? ?return 0;
> }
>
> Running this gives (the ### lines are printed in the
> kmemleak_open/release functions):
>
> # ./cat-kmemleak
> ### kmemleak_open current->pid = 1409
> fd = 3
> =====================================
> [ BUG: lock held at task exit time! ]
> -------------------------------------
> cat-kmemleak/1409 is exiting with locks still held!
> 1 lock held by cat-kmemleak/1409:
> ?#0: ?(scan_mutex){+.+.+.}, at: [<c00662b1>] kmemleak_open+0x31/0x68
>
> stack backtrace:
> [<c0024025>] (unwind_backtrace+0x1/0x80) from [<c01cddd7>] (dump_stack+0xb/0xc)
> [<c01cddd7>] (dump_stack+0xb/0xc) from [<c0043d2d>] (debug_check_no_locks_held+0x49/0x64)
> [<c0043d2d>] (debug_check_no_locks_held+0x49/0x64) from [<c0031423>] (do_exit+0x3fb/0x43c)
> [<c0031423>] (do_exit+0x3fb/0x43c) from [<c00314c5>] (do_group_exit+0x61/0x80)
> [<c00314c5>] (do_group_exit+0x61/0x80) from [<c00314f3>] (sys_exit_group+0xf/0x14)
> [<c00314f3>] (sys_exit_group+0xf/0x14) from [<c001fc41>] (ret_fast_syscall+0x1/0x40)
>
> ### kmemleak_release current->pid = 1410
> fd closed in child
>
> Any suggestions? Thanks.

Well, you are not supposed to hold on to locks when returning from a
system call ("sys_open") anyway. You can probably do the exclusion
with a kmemcheck specific flag?

Pekka

2009-07-02 13:06:35

by Catalin Marinas

[permalink] [raw]
Subject: Re: Exiting with locks still held (was Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug)

On Thu, 2009-07-02 at 15:54 +0300, Pekka Enberg wrote:
> On Thu, Jul 2, 2009 at 3:48 PM, Catalin Marinas<[email protected]> wrote:
> > It could be but I can't figure out a solution. If there is only one task
> > opening and closing the kmemleak file, everything is fine. In
> > combination with shell piping I think I get the kmemleak file descriptor
> > released from a different task than the one that opened it.
> >
> > For example, the badly written code below opens kmemleak and acquires
> > the scan_mutex in the parent task but releases it in the child (it needs
> > a few tries to trigger it). With waitpid() in parent everything is fine.
[...]
> Well, you are not supposed to hold on to locks when returning from a
> system call ("sys_open") anyway. You can probably do the exclusion
> with a kmemcheck specific flag?

Acquiring the mutex in "open" and releasing it in "release" was easier.
I'll see if I can move the mutex to the seq_read functions which
actually need it.

--
Catalin

2009-07-02 14:14:30

by Catalin Marinas

[permalink] [raw]
Subject: Re: Exiting with locks still held (was Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug)

On Thu, 2009-07-02 at 15:54 +0300, Pekka Enberg wrote:
> On Thu, Jul 2, 2009 at 3:48 PM, Catalin Marinas<[email protected]> wrote:
> > For example, the badly written code below opens kmemleak and acquires
> > the scan_mutex in the parent task but releases it in the child (it needs
> > a few tries to trigger it). With waitpid() in parent everything is fine.
[...]
> Well, you are not supposed to hold on to locks when returning from a
> system call ("sys_open") anyway.

There's a fix to this. I'll test it a bit more before pushing upstream:


kmemleak: Do not acquire scan_mutex in kmemleak_open()

From: Catalin Marinas <[email protected]>

Initially, the scan_mutex was acquired in kmemleak_open() and released
in kmemleak_release() (corresponding to /sys/kernel/debug/kmemleak
operations). This was causing some lockdep reports when the file was
closed from a different task than the one opening it. This patch moves
the scan_mutex acquiring in kmemleak_write() or kmemleak_seq_show().

Signed-off-by: Catalin Marinas <[email protected]>
---
mm/kmemleak.c | 58 +++++++++++++++++++++++++++------------------------------
1 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 958b462..678baad 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1157,11 +1157,22 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
{
struct kmemleak_object *object = v;
unsigned long flags;
+ int ret;
+
+ /*
+ * scan_mutex needs to be acquired here since unreferenced_object() is
+ * not reliable during memory scanning.
+ */
+ ret = mutex_lock_interruptible(&scan_mutex);
+ if (ret < 0)
+ return ret;

spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
print_unreferenced(seq, object);
spin_unlock_irqrestore(&object->lock, flags);
+
+ mutex_unlock(&scan_mutex);
return 0;
}

@@ -1174,36 +1185,15 @@ static const struct seq_operations kmemleak_seq_ops = {

static int kmemleak_open(struct inode *inode, struct file *file)
{
- int ret = 0;
-
if (!atomic_read(&kmemleak_enabled))
return -EBUSY;

- ret = mutex_lock_interruptible(&scan_mutex);
- if (ret < 0)
- goto out;
- if (file->f_mode & FMODE_READ) {
- ret = seq_open(file, &kmemleak_seq_ops);
- if (ret < 0)
- goto scan_unlock;
- }
- return ret;
-
-scan_unlock:
- mutex_unlock(&scan_mutex);
-out:
- return ret;
+ return seq_open(file, &kmemleak_seq_ops);
}

static int kmemleak_release(struct inode *inode, struct file *file)
{
- int ret = 0;
-
- if (file->f_mode & FMODE_READ)
- seq_release(inode, file);
- mutex_unlock(&scan_mutex);
-
- return ret;
+ return seq_release(inode, file);
}

/*
@@ -1223,15 +1213,17 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
{
char buf[64];
int buf_size;
-
- if (!atomic_read(&kmemleak_enabled))
- return -EBUSY;
+ int ret;

buf_size = min(size, (sizeof(buf) - 1));
if (strncpy_from_user(buf, user_buf, buf_size) < 0)
return -EFAULT;
buf[buf_size] = 0;

+ ret = mutex_lock_interruptible(&scan_mutex);
+ if (ret < 0)
+ return ret;
+
if (strncmp(buf, "off", 3) == 0)
kmemleak_disable();
else if (strncmp(buf, "stack=on", 8) == 0)
@@ -1244,11 +1236,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
stop_scan_thread();
else if (strncmp(buf, "scan=", 5) == 0) {
unsigned long secs;
- int err;

- err = strict_strtoul(buf + 5, 0, &secs);
- if (err < 0)
- return err;
+ ret = strict_strtoul(buf + 5, 0, &secs);
+ if (ret < 0)
+ goto out;
stop_scan_thread();
if (secs) {
jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
@@ -1257,7 +1248,12 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
} else if (strncmp(buf, "scan", 4) == 0)
kmemleak_scan();
else
- return -EINVAL;
+ ret = -EINVAL;
+
+out:
+ mutex_unlock(&scan_mutex);
+ if (ret < 0)
+ return ret;

/* ignore the rest of the buffer, only one command at a time */
*ppos += size;


--
Catalin

2009-07-02 17:40:14

by Linus Torvalds

[permalink] [raw]
Subject: Re: Exiting with locks still held (was Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug)



On Thu, 2 Jul 2009, Catalin Marinas wrote:
>
> Initially, the scan_mutex was acquired in kmemleak_open() and released
> in kmemleak_release() (corresponding to /sys/kernel/debug/kmemleak
> operations). This was causing some lockdep reports when the file was
> closed from a different task than the one opening it. This patch moves
> the scan_mutex acquiring in kmemleak_write() or kmemleak_seq_show().

This is better, but not really how you are supposed to do it.

The whole seq-file thing is very much _designed_ for taking a lock at the
beginning of the operation, and releasing it at the end. It's a very
common pattern.

But you should _not_ do it in the "show" routine. If you do, you're always
going to be racy wrt lseek() and friends.

What you _should_ do is to take the lock in the "seq_start" routine, and
release it in "seq_stop". The "seq_show" routine may be called multiple
times in between.

For a trivial example, see the drivers/char/misc.c file. Note how it needs
to hold the lock over the whole list traversal, and how seqfiles allows it
to do that quite naturally.

Linus

2009-07-03 07:00:56

by Ingo Molnar

[permalink] [raw]
Subject: [PATCH] kmemleak: Mark nice +10


* Catalin Marinas <[email protected]> wrote:

> On Wed, 2009-07-01 at 11:30 +0200, Ingo Molnar wrote:
> > * Catalin Marinas <[email protected]> wrote:
> >
> > > > The minimal fix below removes scan_yield() and adds a
> > > > cond_resched() to the outmost (safe) place of the scanning
> > > > thread. This solves the regression.
> > >
> > > With CONFIG_PREEMPT disabled it won't reschedule during the bss
> > > scanning but I don't see this as a real issue (task stacks
> > > scanning probably takes longer anyway).
> >
> > Yeah. I suspect one more cond_resched() could be added - i just
> > didnt see an obvious place for it, given that scan_block() is being
> > called with asymetric held-locks contexts.
>
> Now that your patch was merged, I propose adding a few more
> cond_resched() calls, useful for the !PREEMPT case:

note, please also merge the renicing fix you sent. I have it tested
in tip:out-of-tree, attached below.

Ingo

>From f6a529517732a9d0e1ad0cd43ad7d2d96de4a4f5 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <[email protected]>
Date: Wed, 1 Jul 2009 10:18:57 +0100
Subject: [PATCH] kmemleak: Mark nice +10

> The background scanning thread could probably also be reniced
> to +10.

Signed-off-by: Ingo Molnar <[email protected]>
---
mm/kmemleak.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e766e1d..6006553 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1039,6 +1039,7 @@ static int kmemleak_scan_thread(void *arg)
static int first_run = 1;

pr_info("Automatic memory scanning thread started\n");
+ set_user_nice(current, 10);

/*
* Wait before the first scan to allow the system to fully initialize.

2009-07-03 07:05:32

by Ingo Molnar

[permalink] [raw]
Subject: Re: Exiting with locks still held (was Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug)


* Catalin Marinas <[email protected]> wrote:

> Hi Ingo,
>
> On Wed, 2009-07-01 at 13:04 +0200, Ingo Molnar wrote:
> > * Catalin Marinas <[email protected]> wrote:
> > > Since we are at locking, I just noticed this on my x86 laptop when
> > > running cat /sys/kernel/debug/kmemleak (I haven't got it on an ARM
> > > board):
> > >
> > > ================================================
> > > [ BUG: lock held when returning to user space! ]
> > > ------------------------------------------------
> > > cat/3687 is leaving the kernel with locks still held!
> > > 1 lock held by cat/3687:
> > > #0: (scan_mutex){+.+.+.}, at: [<c01e0c5c>] kmemleak_open+0x3c/0x70
> > >
> > > kmemleak_open() acquires scan_mutex and unconditionally releases
> > > it in kmemleak_release(). The mutex seems to be released as a
> > > subsequent acquiring works fine.
> > >
> > > Is this caused just because cat may have exited without closing
> > > the file descriptor (which should be done automatically anyway)?
> >
> > This lockdep warning has a 0% false positives track record so
> > far: all previous cases it triggered showed some real (and
> > fatal) bug in the underlying code.
>
> In this particular case, there is no fatal problem as the mutex is
> released shortly after this message.

Maybe - but holding locks in user-space is almost always bad.

What happens if user-space opens a second file descriptor before
closing the first one? We either lock up (which is bad and fatal) or
you already have some _other_ exclusion mechanism that prevents this
case, which calls into question the need to hold this particular
mutex in user-space to begin with.

I've yet to see a valid 'need to hold this kernel lock in
user-space' case, and this does not seem to be such a case either.

Ingo

2009-07-03 08:09:52

by Catalin Marinas

[permalink] [raw]
Subject: Re: [PATCH] kmemleak: Mark nice +10

On Fri, 2009-07-03 at 09:00 +0200, Ingo Molnar wrote:
> * Catalin Marinas <[email protected]> wrote:
>
> > On Wed, 2009-07-01 at 11:30 +0200, Ingo Molnar wrote:
> > > * Catalin Marinas <[email protected]> wrote:
> > >
> > > > > The minimal fix below removes scan_yield() and adds a
> > > > > cond_resched() to the outmost (safe) place of the scanning
> > > > > thread. This solves the regression.
> > > >
> > > > With CONFIG_PREEMPT disabled it won't reschedule during the bss
> > > > scanning but I don't see this as a real issue (task stacks
> > > > scanning probably takes longer anyway).
> > >
> > > Yeah. I suspect one more cond_resched() could be added - i just
> > > didnt see an obvious place for it, given that scan_block() is being
> > > called with asymetric held-locks contexts.
> >
> > Now that your patch was merged, I propose adding a few more
> > cond_resched() calls, useful for the !PREEMPT case:
>
> note, please also merge the renicing fix you sent. I have it tested
> in tip:out-of-tree, attached below.

I have this patch in my kmemleak branch
(http://www.linux-arm.org/git?p=linux-2.6.git;a=shortlog;h=kmemleak)
which I plan to push to Linus, only that I was waiting to accumulate a
few more patches (to avoid sending too many pull requests).

I'll fix the scan_mutex lock as well, following comments and send a pull
request tonight.

Thanks.

--
Catalin

2009-07-03 10:18:50

by Catalin Marinas

[permalink] [raw]
Subject: Re: Exiting with locks still held (was Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug)

On Thu, 2009-07-02 at 10:39 -0700, Linus Torvalds wrote:
> On Thu, 2 Jul 2009, Catalin Marinas wrote:
> >
> > Initially, the scan_mutex was acquired in kmemleak_open() and released
> > in kmemleak_release() (corresponding to /sys/kernel/debug/kmemleak
> > operations). This was causing some lockdep reports when the file was
> > closed from a different task than the one opening it. This patch moves
> > the scan_mutex acquiring in kmemleak_write() or kmemleak_seq_show().
>
> This is better, but not really how you are supposed to do it.
>
> The whole seq-file thing is very much _designed_ for taking a lock at the
> beginning of the operation, and releasing it at the end. It's a very
> common pattern.

Thanks for this. Here's the updated patch (the difference from misc.c is
that it uses mutex_lock_interruptible):


kmemleak: Do not acquire scan_mutex in kmemleak_open()

From: Catalin Marinas <[email protected]>

Initially, the scan_mutex was acquired in kmemleak_open() and released
in kmemleak_release() (corresponding to /sys/kernel/debug/kmemleak
operations). This was causing some lockdep reports when the file was
closed from a different task than the one opening it. This patch moves
the scan_mutex acquiring in kmemleak_write() or kmemleak_seq_start()
with releasing in kmemleak_seq_stop().

Signed-off-by: Catalin Marinas <[email protected]>
---
mm/kmemleak.c | 63 +++++++++++++++++++++++++++------------------------------
1 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 23bf5f0..c6e0aae 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1103,6 +1103,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
{
struct kmemleak_object *object;
loff_t n = *pos;
+ int err;
+
+ err = mutex_lock_interruptible(&scan_mutex);
+ if (err < 0)
+ return ERR_PTR(err);

rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
@@ -1146,8 +1151,15 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*/
static void kmemleak_seq_stop(struct seq_file *seq, void *v)
{
- if (v)
- put_object(v);
+ if (!IS_ERR(v)) {
+ /*
+ * kmemleak_seq_start may return ERR_PTR if the scan_mutex
+ * waiting was interrupted, so only release it if !IS_ERR.
+ */
+ mutex_unlock(&scan_mutex);
+ if (v)
+ put_object(v);
+ }
}

/*
@@ -1174,36 +1186,15 @@ static const struct seq_operations kmemleak_seq_ops = {

static int kmemleak_open(struct inode *inode, struct file *file)
{
- int ret = 0;
-
if (!atomic_read(&kmemleak_enabled))
return -EBUSY;

- ret = mutex_lock_interruptible(&scan_mutex);
- if (ret < 0)
- goto out;
- if (file->f_mode & FMODE_READ) {
- ret = seq_open(file, &kmemleak_seq_ops);
- if (ret < 0)
- goto scan_unlock;
- }
- return ret;
-
-scan_unlock:
- mutex_unlock(&scan_mutex);
-out:
- return ret;
+ return seq_open(file, &kmemleak_seq_ops);
}

static int kmemleak_release(struct inode *inode, struct file *file)
{
- int ret = 0;
-
- if (file->f_mode & FMODE_READ)
- seq_release(inode, file);
- mutex_unlock(&scan_mutex);
-
- return ret;
+ return seq_release(inode, file);
}

/*
@@ -1223,15 +1214,17 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
{
char buf[64];
int buf_size;
-
- if (!atomic_read(&kmemleak_enabled))
- return -EBUSY;
+ int ret;

buf_size = min(size, (sizeof(buf) - 1));
if (strncpy_from_user(buf, user_buf, buf_size) < 0)
return -EFAULT;
buf[buf_size] = 0;

+ ret = mutex_lock_interruptible(&scan_mutex);
+ if (ret < 0)
+ return ret;
+
if (strncmp(buf, "off", 3) == 0)
kmemleak_disable();
else if (strncmp(buf, "stack=on", 8) == 0)
@@ -1244,11 +1237,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
stop_scan_thread();
else if (strncmp(buf, "scan=", 5) == 0) {
unsigned long secs;
- int err;

- err = strict_strtoul(buf + 5, 0, &secs);
- if (err < 0)
- return err;
+ ret = strict_strtoul(buf + 5, 0, &secs);
+ if (ret < 0)
+ goto out;
stop_scan_thread();
if (secs) {
jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
@@ -1257,7 +1249,12 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
} else if (strncmp(buf, "scan", 4) == 0)
kmemleak_scan();
else
- return -EINVAL;
+ ret = -EINVAL;
+
+out:
+ mutex_unlock(&scan_mutex);
+ if (ret < 0)
+ return ret;

/* ignore the rest of the buffer, only one command at a time */
*ppos += size;


--
Catalin

2009-07-08 13:34:20

by Catalin Marinas

[permalink] [raw]
Subject: Re: [PATCH] kmemleak: Fix scheduling-while-atomic bug

Hi Ingo,

On Wed, 2009-07-01 at 11:30 +0200, Ingo Molnar wrote:
> * Catalin Marinas <[email protected]> wrote:
>
> > > The minimal fix below removes scan_yield() and adds a
> > > cond_resched() to the outmost (safe) place of the scanning
> > > thread. This solves the regression.
> >
> > With CONFIG_PREEMPT disabled it won't reschedule during the bss
> > scanning but I don't see this as a real issue (task stacks
> > scanning probably takes longer anyway).
>
> Yeah. I suspect one more cond_resched() could be added - i just
> didnt see an obvious place for it, given that scan_block() is being
> called with asymetric held-locks contexts.

I tried with a few cond_resched() calls in the kmemleak_scan() function
but it looks like a significant (well, 1-2 seconds, depending on the
hardware) amount of time is spent in scanning the data and bss sections.
The patch below makes the system with !PREEMPT more responsive during
the scanning episodes.

Are you ok with this approach? Thanks.


kmemleak: Add more cond_resched() calls in the scanning thread

From: Catalin Marinas <[email protected]>

Following recent fix to no longer reschedule in the scan_block()
function, the system may become unresponsive with !PREEMPT. This patch
re-adds the cond_resched() call to scan_block() but conditioned by the
allow_resched parameter.

Signed-off-by: Catalin Marinas <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
mm/kmemleak.c | 19 +++++++++++--------
1 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 6006553..93f1481 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -807,7 +807,7 @@ static int scan_should_stop(void)
* found to the gray list.
*/
static void scan_block(void *_start, void *_end,
- struct kmemleak_object *scanned)
+ struct kmemleak_object *scanned, int allow_resched)
{
unsigned long *ptr;
unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
@@ -818,6 +818,8 @@ static void scan_block(void *_start, void *_end,
unsigned long pointer = *ptr;
struct kmemleak_object *object;

+ if (allow_resched)
+ cond_resched();
if (scan_should_stop())
break;

@@ -881,12 +883,12 @@ static void scan_object(struct kmemleak_object *object)
goto out;
if (hlist_empty(&object->area_list))
scan_block((void *)object->pointer,
- (void *)(object->pointer + object->size), object);
+ (void *)(object->pointer + object->size), object, 0);
else
hlist_for_each_entry(area, elem, &object->area_list, node)
scan_block((void *)(object->pointer + area->offset),
(void *)(object->pointer + area->offset
- + area->length), object);
+ + area->length), object, 0);
out:
spin_unlock_irqrestore(&object->lock, flags);
}
@@ -931,14 +933,14 @@ static void kmemleak_scan(void)
rcu_read_unlock();

/* data/bss scanning */
- scan_block(_sdata, _edata, NULL);
- scan_block(__bss_start, __bss_stop, NULL);
+ scan_block(_sdata, _edata, NULL, 1);
+ scan_block(__bss_start, __bss_stop, NULL, 1);

#ifdef CONFIG_SMP
/* per-cpu sections scanning */
for_each_possible_cpu(i)
scan_block(__per_cpu_start + per_cpu_offset(i),
- __per_cpu_end + per_cpu_offset(i), NULL);
+ __per_cpu_end + per_cpu_offset(i), NULL, 1);
#endif

/*
@@ -960,7 +962,7 @@ static void kmemleak_scan(void)
/* only scan if page is in use */
if (page_count(page) == 0)
continue;
- scan_block(page, page + 1, NULL);
+ scan_block(page, page + 1, NULL, 1);
}
}

@@ -972,7 +974,8 @@ static void kmemleak_scan(void)
read_lock(&tasklist_lock);
for_each_process(task)
scan_block(task_stack_page(task),
- task_stack_page(task) + THREAD_SIZE, NULL);
+ task_stack_page(task) + THREAD_SIZE,
+ NULL, 0);
read_unlock(&tasklist_lock);
}



--
Catalin