LinuxLists.cc - hugetlbfs lockdep spew revisited.

2012-02-17 00:09:01

Subject: hugetlbfs lockdep spew revisited.

Remember this ? https://lkml.org/lkml/2011/4/15/272
Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
but it seems to still be there.

Dave

======================================================
[ INFO: possible circular locking dependency detected ]
3.3.0-rc3+ #2 Not tainted
-------------------------------------------------------
trinity/30663 is trying to acquire lock:
(&sb->s_type->i_mutex_key#18){+.+...}, at: [<ffffffff81298169>] hugetlbfs_file_mmap+0x89/0x140

but task is already holding lock:
(&mm->mmap_sem){++++++}, at: [<ffffffff81182d97>] sys_mmap_pgoff+0x1d7/0x230

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&mm->mmap_sem){++++++}:
[<ffffffff810d073d>] lock_acquire+0x9d/0x220
[<ffffffff811789c0>] might_fault+0x80/0xb0
[<ffffffff811d2997>] filldir+0x77/0xe0
[<ffffffff811e61ae>] dcache_readdir+0x5e/0x220
[<ffffffff811d2c68>] vfs_readdir+0xb8/0xf0
[<ffffffff811d2d99>] sys_getdents+0x89/0x100
[<ffffffff816a5b69>] system_call_fastpath+0x16/0x1b

-> #0 (&sb->s_type->i_mutex_key#18){+.+...}:
[<ffffffff810d0008>] __lock_acquire+0x1bf8/0x1c20
[<ffffffff810d073d>] lock_acquire+0x9d/0x220
[<ffffffff8169a5b9>] __mutex_lock_common+0x59/0x500
[<ffffffff8169ab94>] mutex_lock_nested+0x44/0x50
[<ffffffff81298169>] hugetlbfs_file_mmap+0x89/0x140
[<ffffffff811826a9>] mmap_region+0x369/0x4f0
[<ffffffff81182b9f>] do_mmap_pgoff+0x36f/0x390
[<ffffffff81182db7>] sys_mmap_pgoff+0x1f7/0x230
[<ffffffff8101eda2>] sys_mmap+0x22/0x30
[<ffffffff816a5b69>] system_call_fastpath+0x16/0x1b

other info that might help us debug this:

Possible unsafe locking scenario:

CPU0 CPU1
---- ----
lock(&mm->mmap_sem);
lock(&sb->s_type->i_mutex_key#18);
lock(&mm->mmap_sem);
lock(&sb->s_type->i_mutex_key#18);

*** DEADLOCK ***

1 lock held by trinity/30663:
#0: (&mm->mmap_sem){++++++}, at: [<ffffffff81182d97>] sys_mmap_pgoff+0x1d7/0x230

stack backtrace:
Pid: 30663, comm: trinity Not tainted 3.3.0-rc3+ #2
Call Trace:
[<ffffffff816924d7>] print_circular_bug+0x1fb/0x20c
[<ffffffff810d0008>] __lock_acquire+0x1bf8/0x1c20
[<ffffffff816a1c2d>] ? sub_preempt_count+0x9d/0xd0
[<ffffffff811a40cc>] ? deactivate_slab+0x54c/0x5f0
[<ffffffff810d073d>] lock_acquire+0x9d/0x220
[<ffffffff81298169>] ? hugetlbfs_file_mmap+0x89/0x140
[<ffffffff810d12fd>] ? trace_hardirqs_on_caller+0x10d/0x1a0
[<ffffffff8169a5b9>] __mutex_lock_common+0x59/0x500
[<ffffffff81298169>] ? hugetlbfs_file_mmap+0x89/0x140
[<ffffffff811825e5>] ? mmap_region+0x2a5/0x4f0
[<ffffffff81298169>] ? hugetlbfs_file_mmap+0x89/0x140
[<ffffffff8169ab94>] mutex_lock_nested+0x44/0x50
[<ffffffff81298169>] hugetlbfs_file_mmap+0x89/0x140
[<ffffffff811826a9>] mmap_region+0x369/0x4f0
[<ffffffff812c1e9a>] ? file_map_prot_check+0xaa/0xe0
[<ffffffff81182b9f>] do_mmap_pgoff+0x36f/0x390
[<ffffffff81182d97>] ? sys_mmap_pgoff+0x1d7/0x230
[<ffffffff81182db7>] sys_mmap_pgoff+0x1f7/0x230
[<ffffffff810d12fd>] ? trace_hardirqs_on_caller+0x10d/0x1a0
[<ffffffff8101eda2>] sys_mmap+0x22/0x30
[<ffffffff816a5b69>] system_call_fastpath+0x16/0x1b

2012-02-17 00:16:40

by Josh Boyer

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

On Thu, Feb 16, 2012 at 07:08:57PM -0500, Dave Jones wrote:
> Remember this ? https://lkml.org/lkml/2011/4/15/272
> Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
> but it seems to still be there.

I think Tyler Hicks actually noticed this a while ago, but his patch has
been waiting on comment from Al and Christoph:

http://thread.gmane.org/gmane.linux.file-systems/58795/focus=59565

I've been hesitant to comment because I obviously screwed up once
already. We could try this patch in Fedora for a while if Al and
company don't speak up soon.

josh

>
> Dave
>
>
> ======================================================
> [ INFO: possible circular locking dependency detected ]
> 3.3.0-rc3+ #2 Not tainted
> -------------------------------------------------------
> trinity/30663 is trying to acquire lock:
> (&sb->s_type->i_mutex_key#18){+.+...}, at: [<ffffffff81298169>] hugetlbfs_file_mmap+0x89/0x140
>
> but task is already holding lock:
> (&mm->mmap_sem){++++++}, at: [<ffffffff81182d97>] sys_mmap_pgoff+0x1d7/0x230
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #1 (&mm->mmap_sem){++++++}:
> [<ffffffff810d073d>] lock_acquire+0x9d/0x220
> [<ffffffff811789c0>] might_fault+0x80/0xb0
> [<ffffffff811d2997>] filldir+0x77/0xe0
> [<ffffffff811e61ae>] dcache_readdir+0x5e/0x220
> [<ffffffff811d2c68>] vfs_readdir+0xb8/0xf0
> [<ffffffff811d2d99>] sys_getdents+0x89/0x100
> [<ffffffff816a5b69>] system_call_fastpath+0x16/0x1b
>
> -> #0 (&sb->s_type->i_mutex_key#18){+.+...}:
> [<ffffffff810d0008>] __lock_acquire+0x1bf8/0x1c20
> [<ffffffff810d073d>] lock_acquire+0x9d/0x220
> [<ffffffff8169a5b9>] __mutex_lock_common+0x59/0x500
> [<ffffffff8169ab94>] mutex_lock_nested+0x44/0x50
> [<ffffffff81298169>] hugetlbfs_file_mmap+0x89/0x140
> [<ffffffff811826a9>] mmap_region+0x369/0x4f0
> [<ffffffff81182b9f>] do_mmap_pgoff+0x36f/0x390
> [<ffffffff81182db7>] sys_mmap_pgoff+0x1f7/0x230
> [<ffffffff8101eda2>] sys_mmap+0x22/0x30
> [<ffffffff816a5b69>] system_call_fastpath+0x16/0x1b
>
> other info that might help us debug this:
>
> Possible unsafe locking scenario:
>
> CPU0 CPU1
> ---- ----
> lock(&mm->mmap_sem);
> lock(&sb->s_type->i_mutex_key#18);
> lock(&mm->mmap_sem);
> lock(&sb->s_type->i_mutex_key#18);
>
> *** DEADLOCK ***
>
> 1 lock held by trinity/30663:
> #0: (&mm->mmap_sem){++++++}, at: [<ffffffff81182d97>] sys_mmap_pgoff+0x1d7/0x230
>
> stack backtrace:
> Pid: 30663, comm: trinity Not tainted 3.3.0-rc3+ #2
> Call Trace:
> [<ffffffff816924d7>] print_circular_bug+0x1fb/0x20c
> [<ffffffff810d0008>] __lock_acquire+0x1bf8/0x1c20
> [<ffffffff816a1c2d>] ? sub_preempt_count+0x9d/0xd0
> [<ffffffff811a40cc>] ? deactivate_slab+0x54c/0x5f0
> [<ffffffff810d073d>] lock_acquire+0x9d/0x220
> [<ffffffff81298169>] ? hugetlbfs_file_mmap+0x89/0x140
> [<ffffffff810d12fd>] ? trace_hardirqs_on_caller+0x10d/0x1a0
> [<ffffffff8169a5b9>] __mutex_lock_common+0x59/0x500
> [<ffffffff81298169>] ? hugetlbfs_file_mmap+0x89/0x140
> [<ffffffff811825e5>] ? mmap_region+0x2a5/0x4f0
> [<ffffffff81298169>] ? hugetlbfs_file_mmap+0x89/0x140
> [<ffffffff8169ab94>] mutex_lock_nested+0x44/0x50
> [<ffffffff81298169>] hugetlbfs_file_mmap+0x89/0x140
> [<ffffffff811826a9>] mmap_region+0x369/0x4f0
> [<ffffffff812c1e9a>] ? file_map_prot_check+0xaa/0xe0
> [<ffffffff81182b9f>] do_mmap_pgoff+0x36f/0x390
> [<ffffffff81182d97>] ? sys_mmap_pgoff+0x1d7/0x230
> [<ffffffff81182db7>] sys_mmap_pgoff+0x1f7/0x230
> [<ffffffff810d12fd>] ? trace_hardirqs_on_caller+0x10d/0x1a0
> [<ffffffff8101eda2>] sys_mmap+0x22/0x30
> [<ffffffff816a5b69>] system_call_fastpath+0x16/0x1b
>

2012-02-17 00:27:28

by Al Viro

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

On Thu, Feb 16, 2012 at 07:08:57PM -0500, Dave Jones wrote:
> Remember this ? https://lkml.org/lkml/2011/4/15/272
> Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
> but it seems to still be there.

> the existing dependency chain (in reverse order) is:

[snip]

... and as bloody usual, that mentioning of readdir in the output is a
red herring; the real problem (and yes, it *is* deadlock-prone) is not
with getdents(2) that cannot happen on anything that could be mmaped;
it's with hugetlbfs_read() (i.e. read(2)) that very definitely *can*.

This is *not* a misannotation and not a false positive; this is a real,
honest deadlock.
Thread A:
read() on hugetlbfs
hugetlbfs_read() called
i_mutex grabbed
hugetlbfs_read_actor() called
__copy_to_user() called
page fault is triggered
Thread B, sharing address space with A:
mmap() the same file
->mmap_sem is grabbed on task_B->mm->mmap_sem
hugetlbfs_file_mmap() is called
attempt to grab ->i_mutex and block waiting for A to give it up
Thread A:
pagefault handled blocked on attempt to grab task_A->mm->mmap_sem,
which happens to be the same thing as task_B->mm->mmap_sem. Block waiting
for B to give it up.

Deadlock.

2012-02-17 00:34:17

by Al Viro

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

On Thu, Feb 16, 2012 at 07:16:34PM -0500, Josh Boyer wrote:
> On Thu, Feb 16, 2012 at 07:08:57PM -0500, Dave Jones wrote:
> > Remember this ? https://lkml.org/lkml/2011/4/15/272
> > Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
> > but it seems to still be there.
>
> I think Tyler Hicks actually noticed this a while ago, but his patch has
> been waiting on comment from Al and Christoph:
>
> http://thread.gmane.org/gmane.linux.file-systems/58795/focus=59565
>
> I've been hesitant to comment because I obviously screwed up once
> already. We could try this patch in Fedora for a while if Al and
> company don't speak up soon.

That has nothing to do with the deadlock in question; it's *NOT* about
directories at all and no, it's not a false positive.

This is very simple: ->mmap() should never take ->i_mutex. Directories
have nothing to do with that. Simple grep for i_mutex in fs/hugetlbfs/*.c
will instantly show its use for non-directories, with pagefaults taken
while holding it. Pagefault handlers take ->mmap_sem; so does ->mmap()
caller. QED.

2012-02-17 00:38:53

by Tyler Hicks

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

On 2012-02-16 19:16:34, Josh Boyer wrote:
> On Thu, Feb 16, 2012 at 07:08:57PM -0500, Dave Jones wrote:
> > Remember this ? https://lkml.org/lkml/2011/4/15/272
> > Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
> > but it seems to still be there.
>
> I think Tyler Hicks actually noticed this a while ago, but his patch has
> been waiting on comment from Al and Christoph:
>
> http://thread.gmane.org/gmane.linux.file-systems/58795/focus=59565
>
> I've been hesitant to comment because I obviously screwed up once
> already. We could try this patch in Fedora for a while if Al and
> company don't speak up soon.

I'm pretty confident that my patch that Josh linked to would "fix" the
lockdep warning below. According to the backtrace, it is barking about a
directory inode and a regular inode having a circular locking
dependency, so deadlock is not possible in this case.

akpm picked up my patch in the mm tree, but it still hasn't made it into
mainline.

Tyler

>
> josh
>
> >
> > Dave
> >
> >
> > ======================================================
> > [ INFO: possible circular locking dependency detected ]
> > 3.3.0-rc3+ #2 Not tainted
> > -------------------------------------------------------
> > trinity/30663 is trying to acquire lock:
> > (&sb->s_type->i_mutex_key#18){+.+...}, at: [<ffffffff81298169>] hugetlbfs_file_mmap+0x89/0x140
> >
> > but task is already holding lock:
> > (&mm->mmap_sem){++++++}, at: [<ffffffff81182d97>] sys_mmap_pgoff+0x1d7/0x230
> >
> > which lock already depends on the new lock.
> >
> >
> > the existing dependency chain (in reverse order) is:
> >
> > -> #1 (&mm->mmap_sem){++++++}:
> > [<ffffffff810d073d>] lock_acquire+0x9d/0x220
> > [<ffffffff811789c0>] might_fault+0x80/0xb0
> > [<ffffffff811d2997>] filldir+0x77/0xe0
> > [<ffffffff811e61ae>] dcache_readdir+0x5e/0x220
> > [<ffffffff811d2c68>] vfs_readdir+0xb8/0xf0
> > [<ffffffff811d2d99>] sys_getdents+0x89/0x100
> > [<ffffffff816a5b69>] system_call_fastpath+0x16/0x1b
> >
> > -> #0 (&sb->s_type->i_mutex_key#18){+.+...}:
> > [<ffffffff810d0008>] __lock_acquire+0x1bf8/0x1c20
> > [<ffffffff810d073d>] lock_acquire+0x9d/0x220
> > [<ffffffff8169a5b9>] __mutex_lock_common+0x59/0x500
> > [<ffffffff8169ab94>] mutex_lock_nested+0x44/0x50
> > [<ffffffff81298169>] hugetlbfs_file_mmap+0x89/0x140
> > [<ffffffff811826a9>] mmap_region+0x369/0x4f0
> > [<ffffffff81182b9f>] do_mmap_pgoff+0x36f/0x390
> > [<ffffffff81182db7>] sys_mmap_pgoff+0x1f7/0x230
> > [<ffffffff8101eda2>] sys_mmap+0x22/0x30
> > [<ffffffff816a5b69>] system_call_fastpath+0x16/0x1b
> >
> > other info that might help us debug this:
> >
> > Possible unsafe locking scenario:
> >
> > CPU0 CPU1
> > ---- ----
> > lock(&mm->mmap_sem);
> > lock(&sb->s_type->i_mutex_key#18);
> > lock(&mm->mmap_sem);
> > lock(&sb->s_type->i_mutex_key#18);
> >
> > *** DEADLOCK ***
> >
> > 1 lock held by trinity/30663:
> > #0: (&mm->mmap_sem){++++++}, at: [<ffffffff81182d97>] sys_mmap_pgoff+0x1d7/0x230
> >
> > stack backtrace:
> > Pid: 30663, comm: trinity Not tainted 3.3.0-rc3+ #2
> > Call Trace:
> > [<ffffffff816924d7>] print_circular_bug+0x1fb/0x20c
> > [<ffffffff810d0008>] __lock_acquire+0x1bf8/0x1c20
> > [<ffffffff816a1c2d>] ? sub_preempt_count+0x9d/0xd0
> > [<ffffffff811a40cc>] ? deactivate_slab+0x54c/0x5f0
> > [<ffffffff810d073d>] lock_acquire+0x9d/0x220
> > [<ffffffff81298169>] ? hugetlbfs_file_mmap+0x89/0x140
> > [<ffffffff810d12fd>] ? trace_hardirqs_on_caller+0x10d/0x1a0
> > [<ffffffff8169a5b9>] __mutex_lock_common+0x59/0x500
> > [<ffffffff81298169>] ? hugetlbfs_file_mmap+0x89/0x140
> > [<ffffffff811825e5>] ? mmap_region+0x2a5/0x4f0
> > [<ffffffff81298169>] ? hugetlbfs_file_mmap+0x89/0x140
> > [<ffffffff8169ab94>] mutex_lock_nested+0x44/0x50
> > [<ffffffff81298169>] hugetlbfs_file_mmap+0x89/0x140
> > [<ffffffff811826a9>] mmap_region+0x369/0x4f0
> > [<ffffffff812c1e9a>] ? file_map_prot_check+0xaa/0xe0
> > [<ffffffff81182b9f>] do_mmap_pgoff+0x36f/0x390
> > [<ffffffff81182d97>] ? sys_mmap_pgoff+0x1d7/0x230
> > [<ffffffff81182db7>] sys_mmap_pgoff+0x1f7/0x230
> > [<ffffffff810d12fd>] ? trace_hardirqs_on_caller+0x10d/0x1a0
> > [<ffffffff8101eda2>] sys_mmap+0x22/0x30
> > [<ffffffff816a5b69>] system_call_fastpath+0x16/0x1b
> >

Attachments:

(No filename) (4.40 kB)
signature.asc (836.00 B)
Digital signature Download all attachments

2012-02-17 00:49:25

by Al Viro

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

On Thu, Feb 16, 2012 at 06:38:49PM -0600, Tyler Hicks wrote:
> On 2012-02-16 19:16:34, Josh Boyer wrote:
> > On Thu, Feb 16, 2012 at 07:08:57PM -0500, Dave Jones wrote:
> > > Remember this ? https://lkml.org/lkml/2011/4/15/272
> > > Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
> > > but it seems to still be there.
> >
> > I think Tyler Hicks actually noticed this a while ago, but his patch has
> > been waiting on comment from Al and Christoph:
> >
> > http://thread.gmane.org/gmane.linux.file-systems/58795/focus=59565
> >
> > I've been hesitant to comment because I obviously screwed up once
> > already. We could try this patch in Fedora for a while if Al and
> > company don't speak up soon.
>
> I'm pretty confident that my patch that Josh linked to would "fix" the
> lockdep warning below. According to the backtrace, it is barking about a
> directory inode and a regular inode having a circular locking
> dependency, so deadlock is not possible in this case.

Sigh... That patch is correct, but it has nothing to do with the locking
order violation that really *is* there. The only benefit would be to
get rid of the "deadlock is not possible" nonsense, since you would see
read/write vs. mmap instead of readdir vs. mmap in the traces. Locking
order is the *same* for directories and nondirectories; both can have
pagefaults under ->i_mutex on their respective inodes. And while mmap
cannot happen for directories, it certainly can happen for regular files,
so taking ->i_mutex in ->mmap() is a plain and simple bug. Should never
be done; in particular, hugetlbfs has ->i_mutex held in read() around
pagefaults, which gives you an obvious deadlock with its ->mmap().

Folks, this is not a false positive and it has nothing to do with misannotation
for directories. Deadlock is real; I have no idea WTF do we what ->i_mutex
held over that area in hugetlbfs ->mmap(), but doing that is really, really
wrong, whatever the reason.

2012-02-17 03:42:20

by Tyler Hicks

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

On 2012-02-17 00:49:22, Al Viro wrote:
> On Thu, Feb 16, 2012 at 06:38:49PM -0600, Tyler Hicks wrote:
> > On 2012-02-16 19:16:34, Josh Boyer wrote:
> > > On Thu, Feb 16, 2012 at 07:08:57PM -0500, Dave Jones wrote:
> > > > Remember this ? https://lkml.org/lkml/2011/4/15/272
> > > > Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
> > > > but it seems to still be there.
> > >
> > > I think Tyler Hicks actually noticed this a while ago, but his patch has
> > > been waiting on comment from Al and Christoph:
> > >
> > > http://thread.gmane.org/gmane.linux.file-systems/58795/focus=59565
> > >
> > > I've been hesitant to comment because I obviously screwed up once
> > > already. We could try this patch in Fedora for a while if Al and
> > > company don't speak up soon.
> >
> > I'm pretty confident that my patch that Josh linked to would "fix" the
> > lockdep warning below. According to the backtrace, it is barking about a
> > directory inode and a regular inode having a circular locking
> > dependency, so deadlock is not possible in this case.
>
> Sigh... That patch is correct, but it has nothing to do with the locking
> order violation that really *is* there. The only benefit would be to
> get rid of the "deadlock is not possible" nonsense, since you would see
> read/write vs. mmap instead of readdir vs. mmap in the traces. Locking
> order is the *same* for directories and nondirectories; both can have
> pagefaults under ->i_mutex on their respective inodes. And while mmap
> cannot happen for directories, it certainly can happen for regular files,
> so taking ->i_mutex in ->mmap() is a plain and simple bug. Should never
> be done; in particular, hugetlbfs has ->i_mutex held in read() around
> pagefaults, which gives you an obvious deadlock with its ->mmap().
>
> Folks, this is not a false positive and it has nothing to do with misannotation
> for directories. Deadlock is real; I have no idea WTF do we what ->i_mutex
> held over that area in hugetlbfs ->mmap(), but doing that is really, really
> wrong, whatever the reason.

Thanks for clearing that up, Al. I only knew that the inodes were being
incorrectly annotated, but I wasn't sure about the correct locking order.

Tyler

Attachments:

(No filename) (2.19 kB)
signature.asc (836.00 B)
Digital signature Download all attachments

2012-02-17 06:47:45

by J. R. Okajima

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

Al Viro:
> Sigh... That patch is correct, but it has nothing to do with the locking
> order violation that really *is* there. The only benefit would be to
> get rid of the "deadlock is not possible" nonsense, since you would see
> read/write vs. mmap instead of readdir vs. mmap in the traces. Locking
:::

How do you think about this patch?

Re: [RFC 0/2] locking order of mm->mmap_sem and various FS
http://marc.info/?l=linux-kernel&m=132124846728745&w=2

Ah, I found mutex_destroy() call in hugetlbfs_destroy_inode() should be
removed.
If you think this approach is good, then I'd post a revised patch.

J. R. Okajima

2012-02-17 17:48:21

by Al Viro

[permalink] [raw]

Subject: udf deadlock (was Re: hugetlbfs lockdep spew revisited.)

On Fri, Feb 17, 2012 at 12:49:22AM +0000, Al Viro wrote:
> Folks, this is not a false positive and it has nothing to do with misannotation
> for directories. Deadlock is real; I have no idea WTF do we what ->i_mutex
> held over that area in hugetlbfs ->mmap(), but doing that is really, really
> wrong, whatever the reason.

Arrrrgh... Some grepping around has uncovered another deadlock on
i_mutex/mmap_sem and this one is not hard to hit at all.

Thread A:
opens file on UDF (O_RDWR open)
does big, fat write() to it
Thread B:
opens the same file (also O_RDWR)
mmaps it
closes
does munmap()

and there we are - munmap() will end up closing the second struct file,
call udf_release_file() and we are hitting ->i_mutex while under
->mmap_sem. Blocking on it, actually, since generic_file_aio_write()
in the first thread is holding ->i_mutex. And as soon as thread A gets
around to faulting the next piece of data in, well... To widen the
window a lot, mmap something large sitting on NFS and do write() from
that mmapped area. Race window as wide as one could ask for...

What happens there is prealloc discard on close; do we even want ->i_mutex
there these days? Note that there's also
down_write(&UDF_I(inode)->i_data_sem);
in udf_release_file()...

2012-02-18 10:56:09

by Aneesh Kumar K.V

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

On Fri, 17 Feb 2012 00:49:22 +0000, Al Viro <[email protected]> wrote:
> On Thu, Feb 16, 2012 at 06:38:49PM -0600, Tyler Hicks wrote:
> > On 2012-02-16 19:16:34, Josh Boyer wrote:
> > > On Thu, Feb 16, 2012 at 07:08:57PM -0500, Dave Jones wrote:
> > > > Remember this ? https://lkml.org/lkml/2011/4/15/272
> > > > Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
> > > > but it seems to still be there.
> > >
> > > I think Tyler Hicks actually noticed this a while ago, but his patch has
> > > been waiting on comment from Al and Christoph:
> > >
> > > http://thread.gmane.org/gmane.linux.file-systems/58795/focus=59565
> > >
> > > I've been hesitant to comment because I obviously screwed up once
> > > already. We could try this patch in Fedora for a while if Al and
> > > company don't speak up soon.
> >
> > I'm pretty confident that my patch that Josh linked to would "fix" the
> > lockdep warning below. According to the backtrace, it is barking about a
> > directory inode and a regular inode having a circular locking
> > dependency, so deadlock is not possible in this case.
>
> Sigh... That patch is correct, but it has nothing to do with the locking
> order violation that really *is* there. The only benefit would be to
> get rid of the "deadlock is not possible" nonsense, since you would see
> read/write vs. mmap instead of readdir vs. mmap in the traces. Locking
> order is the *same* for directories and nondirectories; both can have
> pagefaults under ->i_mutex on their respective inodes. And while mmap
> cannot happen for directories, it certainly can happen for regular files,
> so taking ->i_mutex in ->mmap() is a plain and simple bug. Should never
> be done; in particular, hugetlbfs has ->i_mutex held in read() around
> pagefaults, which gives you an obvious deadlock with its ->mmap().
>
> Folks, this is not a false positive and it has nothing to do with misannotation
> for directories. Deadlock is real; I have no idea WTF do we what ->i_mutex
> held over that area in hugetlbfs ->mmap(), but doing that is really, really
> wrong, whatever the reason.

I looked at hugetlbfs recently and noticed this. Another strange thing
with hugetlbfs is, it doesn't support write, instead allows to bump
the file size via mmap. I don't have a patch for inode->i_mutex issue
yet.

-aneesh

2012-02-20 16:01:39

by Jan Kara

[permalink] [raw]

Subject: Re: udf deadlock (was Re: hugetlbfs lockdep spew revisited.)

On Fri 17-02-12 17:48:18, Al Viro wrote:
> On Fri, Feb 17, 2012 at 12:49:22AM +0000, Al Viro wrote:
> > Folks, this is not a false positive and it has nothing to do with misannotation
> > for directories. Deadlock is real; I have no idea WTF do we what ->i_mutex
> > held over that area in hugetlbfs ->mmap(), but doing that is really, really
> > wrong, whatever the reason.
>
> Arrrrgh... Some grepping around has uncovered another deadlock on
> i_mutex/mmap_sem and this one is not hard to hit at all.
>
> Thread A:
> opens file on UDF (O_RDWR open)
> does big, fat write() to it
> Thread B:
> opens the same file (also O_RDWR)
> mmaps it
> closes
> does munmap()
>
> and there we are - munmap() will end up closing the second struct file,
> call udf_release_file() and we are hitting ->i_mutex while under
> ->mmap_sem. Blocking on it, actually, since generic_file_aio_write()
> in the first thread is holding ->i_mutex. And as soon as thread A gets
> around to faulting the next piece of data in, well... To widen the
> window a lot, mmap something large sitting on NFS and do write() from
> that mmapped area. Race window as wide as one could ask for...
Right, I didn't realize ->release() may be called with mmap_sem held.
Thanks for spotting this. BTW: Documentation/filesystems/Locking might
need an update since it states:
locking rules:
All may block except for ->setlease.
No VFS locks held on entry except for ->setlease.

> What happens there is prealloc discard on close; do we even want ->i_mutex
> there these days? Note that there's also
> down_write(&UDF_I(inode)->i_data_sem);
> in udf_release_file()...
I've looked around and it seems we don't need i_mutex for anything.
i_data_sem should be enough. So I'll just remove i_mutex.

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2012-02-21 18:24:55

by Mimi Zohar

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

On Thu, 2012-02-16 at 21:42 -0600, Tyler Hicks wrote:
> On 2012-02-17 00:49:22, Al Viro wrote:
> > On Thu, Feb 16, 2012 at 06:38:49PM -0600, Tyler Hicks wrote:
> > > On 2012-02-16 19:16:34, Josh Boyer wrote:
> > > > On Thu, Feb 16, 2012 at 07:08:57PM -0500, Dave Jones wrote:
> > > > > Remember this ? https://lkml.org/lkml/2011/4/15/272
> > > > > Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
> > > > > but it seems to still be there.
> > > >
> > > > I think Tyler Hicks actually noticed this a while ago, but his patch has
> > > > been waiting on comment from Al and Christoph:
> > > >
> > > > http://thread.gmane.org/gmane.linux.file-systems/58795/focus=59565
> > > >
> > > > I've been hesitant to comment because I obviously screwed up once
> > > > already. We could try this patch in Fedora for a while if Al and
> > > > company don't speak up soon.
> > >
> > > I'm pretty confident that my patch that Josh linked to would "fix" the
> > > lockdep warning below. According to the backtrace, it is barking about a
> > > directory inode and a regular inode having a circular locking
> > > dependency, so deadlock is not possible in this case.
> >
> > Sigh... That patch is correct, but it has nothing to do with the locking
> > order violation that really *is* there. The only benefit would be to
> > get rid of the "deadlock is not possible" nonsense, since you would see
> > read/write vs. mmap instead of readdir vs. mmap in the traces. Locking
> > order is the *same* for directories and nondirectories; both can have
> > pagefaults under ->i_mutex on their respective inodes. And while mmap
> > cannot happen for directories, it certainly can happen for regular files,
> > so taking ->i_mutex in ->mmap() is a plain and simple bug. Should never
> > be done; in particular, hugetlbfs has ->i_mutex held in read() around
> > pagefaults, which gives you an obvious deadlock with its ->mmap().
> >
> > Folks, this is not a false positive and it has nothing to do with misannotation
> > for directories. Deadlock is real; I have no idea WTF do we what ->i_mutex
> > held over that area in hugetlbfs ->mmap(), but doing that is really, really
> > wrong, whatever the reason.
>
> Thanks for clearing that up, Al. I only knew that the inodes were being
> incorrectly annotated, but I wasn't sure about the correct locking order.
>
> Tyler

Al, thanks for the clarification. An i_mutex/mmap_sem lockdep exists
for IMA as well. https://lkml.org/lkml/2012/1/24/246 resolves the
lockdep by moving ima_file_mmap() before the mmap_sem is taken. Do you
see any problems with this patch?

thanks,

Mimi

2012-02-23 09:28:24

by Aneesh Kumar K.V

[permalink] [raw]

Subject: Re: hugetlbfs lockdep spew revisited.

On Fri, 17 Feb 2012 00:27:26 +0000, Al Viro <[email protected]> wrote:
> On Thu, Feb 16, 2012 at 07:08:57PM -0500, Dave Jones wrote:
> > Remember this ? https://lkml.org/lkml/2011/4/15/272
> > Josh took a stab at fixing it in e096d0c7e2e4e5893792db865dd065ac73cf1f00,
> > but it seems to still be there.
>
> > the existing dependency chain (in reverse order) is:
>
> [snip]
>
> ... and as bloody usual, that mentioning of readdir in the output is a
> red herring; the real problem (and yes, it *is* deadlock-prone) is not
> with getdents(2) that cannot happen on anything that could be mmaped;
> it's with hugetlbfs_read() (i.e. read(2)) that very definitely *can*.
>
> This is *not* a misannotation and not a false positive; this is a real,
> honest deadlock.
> Thread A:
> read() on hugetlbfs
> hugetlbfs_read() called
> i_mutex grabbed
> hugetlbfs_read_actor() called
> __copy_to_user() called
> page fault is triggered
> Thread B, sharing address space with A:
> mmap() the same file
> ->mmap_sem is grabbed on task_B->mm->mmap_sem
> hugetlbfs_file_mmap() is called
> attempt to grab ->i_mutex and block waiting for A to give it up
> Thread A:
> pagefault handled blocked on attempt to grab task_A->mm->mmap_sem,
> which happens to be the same thing as task_B->mm->mmap_sem. Block waiting
> for B to give it up.
>
> Deadlock.

How about the below patch ? If this is ok, I can send this as a separate
mail. I am still not sure about dropping inode->i_mutex in mmap as that
will mean we cannot update i_size in mmap and that will break userspace ?

commit 8fb2df40cabd99dc4f39f7fd26ba1d4db885cb5e
Author: Aneesh Kumar K.V <[email protected]>
Date: Wed Feb 22 10:18:51 2012 +0530

hugetlbfs: Add new rw_semaphore for truncate/read race

Drop using inode->i_mutex from read, since that can result in deadlock with
mmap. Ideally we can extend the patch to make sure we don't increase i_size
in mmap. But that will break userspace, because application will have to now
use truncate(2) to increase i_size in hugetlbfs.

Signed-off-by: Aneesh Kumar K.V <[email protected]>

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2680578..cd33685 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -238,8 +238,9 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
unsigned long end_index;
loff_t isize;
ssize_t retval = 0;
+ struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);

- mutex_lock(&inode->i_mutex);
+ down_read(&hinfo->truncate_sem);

/* validate length */
if (len == 0)
@@ -309,7 +310,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
}
out:
*ppos = ((loff_t)index << huge_page_shift(h)) + offset;
- mutex_unlock(&inode->i_mutex);
+ up_read(&hinfo->truncate_sem);
return retval;
}

@@ -408,16 +409,19 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
pgoff_t pgoff;
struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
+ struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);

BUG_ON(offset & ~huge_page_mask(h));
pgoff = offset >> PAGE_SHIFT;

+ down_write(&hinfo->truncate_sem);
i_size_write(inode, offset);
mutex_lock(&mapping->i_mmap_mutex);
if (!prio_tree_empty(&mapping->i_mmap))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
mutex_unlock(&mapping->i_mmap_mutex);
truncate_hugepages(inode, offset);
+ up_write(&hinfo->truncate_sem);
return 0;
}

@@ -695,9 +699,10 @@ static const struct address_space_operations hugetlbfs_aops = {

static void init_once(void *foo)
{
- struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+ struct hugetlbfs_inode_info *hinfo = (struct hugetlbfs_inode_info *)foo;

- inode_init_once(&ei->vfs_inode);
+ init_rwsem(&hinfo->truncate_sem);
+ inode_init_once(&hinfo->vfs_inode);
}

const struct file_operations hugetlbfs_file_operations = {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 226f488..57fb788 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -155,6 +155,7 @@ struct hugetlbfs_sb_info {

struct hugetlbfs_inode_info {
struct shared_policy policy;
+ struct rw_semaphore truncate_sem;
struct inode vfs_inode;
};