This patch fixes a kernel NULL pointer dereference as mentioned in this log:
---8<---
[ 43.044000] mmc0: card c556 removed
[ 43.059000] mmcblk0: error -123 sending status comand
[ 43.064000] mmcblk0: error -123 sending read/write command, response 0x0, card status 0x0
[ 43.089000] mmcblk0: error -123 requesting status
[ 43.096000] end_request: I/O error, dev mmcblk0, sector 1667989
<snip repeated error>
[ 43.830000] end_request: I/O error, dev mmcblk0, sector 1667988
[ 44.679000] Unable to handle kernel NULL pointer dereference at virtual address 00000010
[ 44.688000] ptbr = 93ec0000 pgd = 93ebf000
[ 44.692000] Oops: Kernel access of bad area, sig: 11 [#1]
[ 44.692000] FRAME_POINTER chip: 0x01f:0x1e82 rev 2
[ 44.692000] Modules linked in:
[ 44.692000] PC is at __mark_inode_dirty+0x8a/0x11c
[ 44.692000] LR is at __mark_inode_dirty+0x7c/0x11c
<snip stack trace>
[ 44.692000] Call trace:
[ 44.692000] [<900780a4>] file_update_time+0x96/0xaa
[ 44.692000] [<9005439a>] __generic_file_aio_write+0x212/0x330
[ 44.692000] [<900544f4>] generic_file_aio_write+0x3c/0x74
[ 44.692000] [<9006b82c>] do_sync_readv_writev+0x68/0x90
[ 44.692000] [<9006b8c0>] do_readv_writev+0x6c/0x108
[ 44.692000] [<9006b98a>] vfs_writev+0x2e/0x34
[ 44.692000] [<9006be60>] sys_writev+0x2c/0x4c
[ 44.692000] [<90023132>] syscall_return+0x0/0x12
[ 44.692000]
--->8---
The reference to sb->s_bdi may be deleted from mmc_blk_remove() ->
del_gendisk() -> unlink_gendisk() -> bdi_unregister() -> bdi_prune_sb() while
another instance try to write some data to the device.
Signed-off-by: Andreas Bießmann <[email protected]>
---
fs/fs-writeback.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cdbf7ac..96b4b25 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1047,6 +1047,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (!was_dirty) {
bdi = inode_to_bdi(inode);
+ if (!bdi)
+ goto out;
+
if (bdi_cap_writeback_dirty(bdi)) {
WARN(!test_bit(BDI_registered, &bdi->state),
"bdi-%s not registered\n", bdi->name);
--
1.7.2.3
On (02/28/11 16:25), Andreas Bie?mann wrote:
> This patch fixes a kernel NULL pointer dereference as mentioned in this log:
>
> ---8<---
> [ 43.044000] mmc0: card c556 removed
> [ 43.059000] mmcblk0: error -123 sending status comand
> [ 43.064000] mmcblk0: error -123 sending read/write command, response 0x0, card status 0x0
> [ 43.089000] mmcblk0: error -123 requesting status
> [ 43.096000] end_request: I/O error, dev mmcblk0, sector 1667989
> <snip repeated error>
> [ 43.830000] end_request: I/O error, dev mmcblk0, sector 1667988
> [ 44.679000] Unable to handle kernel NULL pointer dereference at virtual address 00000010
> [ 44.688000] ptbr = 93ec0000 pgd = 93ebf000
> [ 44.692000] Oops: Kernel access of bad area, sig: 11 [#1]
> [ 44.692000] FRAME_POINTER chip: 0x01f:0x1e82 rev 2
> [ 44.692000] Modules linked in:
> [ 44.692000] PC is at __mark_inode_dirty+0x8a/0x11c
> [ 44.692000] LR is at __mark_inode_dirty+0x7c/0x11c
> <snip stack trace>
> [ 44.692000] Call trace:
> [ 44.692000] [<900780a4>] file_update_time+0x96/0xaa
> [ 44.692000] [<9005439a>] __generic_file_aio_write+0x212/0x330
> [ 44.692000] [<900544f4>] generic_file_aio_write+0x3c/0x74
> [ 44.692000] [<9006b82c>] do_sync_readv_writev+0x68/0x90
> [ 44.692000] [<9006b8c0>] do_readv_writev+0x6c/0x108
> [ 44.692000] [<9006b98a>] vfs_writev+0x2e/0x34
> [ 44.692000] [<9006be60>] sys_writev+0x2c/0x4c
> [ 44.692000] [<90023132>] syscall_return+0x0/0x12
> [ 44.692000]
> --->8---
>
> The reference to sb->s_bdi may be deleted from mmc_blk_remove() ->
> del_gendisk() -> unlink_gendisk() -> bdi_unregister() -> bdi_prune_sb() while
> another instance try to write some data to the device.
>
> Signed-off-by: Andreas Bie?mann <[email protected]>
> ---
> fs/fs-writeback.c | 3 +++
> 1 files changed, 3 insertions(+), 0 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index cdbf7ac..96b4b25 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -1047,6 +1047,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
> if (!was_dirty) {
> bdi = inode_to_bdi(inode);
>
> + if (!bdi)
> + goto out;
> +
> if (bdi_cap_writeback_dirty(bdi)) {
> WARN(!test_bit(BDI_registered, &bdi->state),
> "bdi-%s not registered\n", bdi->name);
Hello,
I had something very similar to this some time ago
https://lkml.org/lkml/2010/12/9/436
However, I'm not sure that this check is sufficient.
Sergey
Dear Sergey Senozhatsky,
Am 28.02.2011 16:43, schrieb Sergey Senozhatsky:
> On (02/28/11 16:25), Andreas Bie?mann wrote:
>> The reference to sb->s_bdi may be deleted from mmc_blk_remove() ->
>> del_gendisk() -> unlink_gendisk() -> bdi_unregister() -> bdi_prune_sb() while
>> another instance try to write some data to the device.
>>
>> Signed-off-by: Andreas Bie?mann <[email protected]>
>> ---
>> fs/fs-writeback.c | 3 +++
>> 1 files changed, 3 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
>> index cdbf7ac..96b4b25 100644
>> --- a/fs/fs-writeback.c
>> +++ b/fs/fs-writeback.c
>> @@ -1047,6 +1047,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>> if (!was_dirty) {
>> bdi = inode_to_bdi(inode);
>>
>> + if (!bdi)
>> + goto out;
>> +
>> if (bdi_cap_writeback_dirty(bdi)) {
>> WARN(!test_bit(BDI_registered, &bdi->state),
>> "bdi-%s not registered\n", bdi->name);
>
> Hello,
> I had something very similar to this some time ago
> https://lkml.org/lkml/2010/12/9/436
Sorry, I did not see that patch.
> However, I'm not sure that this check is sufficient.
Why are you think this is not sufficient?
If an instance try to write that specific inode to an physical device
which is not longer available how should we react then?
Another solution could be to clean up all instances referring to that
superblock in del_/unlink_gendisk(). But I think to check the return of
inode_to_bdi() is needed in any case.
regards
Andreas Bie?mann
On (02/28/11 16:59), Andreas Bie?mann wrote:
> Dear Sergey Senozhatsky,
>
> Am 28.02.2011 16:43, schrieb Sergey Senozhatsky:
> > On (02/28/11 16:25), Andreas Bie?mann wrote:
>
> >> The reference to sb->s_bdi may be deleted from mmc_blk_remove() ->
> >> del_gendisk() -> unlink_gendisk() -> bdi_unregister() -> bdi_prune_sb() while
> >> another instance try to write some data to the device.
> >>
> >> Signed-off-by: Andreas Bie?mann <[email protected]>
> >> ---
> >> fs/fs-writeback.c | 3 +++
> >> 1 files changed, 3 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> >> index cdbf7ac..96b4b25 100644
> >> --- a/fs/fs-writeback.c
> >> +++ b/fs/fs-writeback.c
> >> @@ -1047,6 +1047,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
> >> if (!was_dirty) {
> >> bdi = inode_to_bdi(inode);
> >>
> >> + if (!bdi)
> >> + goto out;
> >> +
> >> if (bdi_cap_writeback_dirty(bdi)) {
> >> WARN(!test_bit(BDI_registered, &bdi->state),
> >> "bdi-%s not registered\n", bdi->name);
> >
> > Hello,
> > I had something very similar to this some time ago
> > https://lkml.org/lkml/2010/12/9/436
>
> Sorry, I did not see that patch.
>
No problem :-)
> > However, I'm not sure that this check is sufficient.
>
> Why are you think this is not sufficient?
> If an instance try to write that specific inode to an physical device
> which is not longer available how should we react then?
>
I think the path which `kills' the device should take care of that.
Otherwise, IMHO, we have:
- ok, we're falling on line 42 -- let's fix line 42
ignoring the fact that there are reasons which led to faulty line 42,
which are, for example:
#0
604 spin_lock(&sb_lock);
605 list_for_each_entry(sb, &super_blocks, s_list) {
606 if (sb->s_bdi == bdi)
607 sb->s_bdi = NULL;
608 }
609 spin_unlock(&sb_lock);
#1 bdi_prune_sb
#2 bdi_unregister
#3 del_gendisk
Of course, I may be wrong.
> Another solution could be to clean up all instances referring to that
> superblock in del_/unlink_gendisk(). But I think to check the return of
> inode_to_bdi() is needed in any case.
>
Sergey
Dear Jason A. Donenfeld,
Am 01.03.2011 10:00, schrieb Jason A. Donenfeld:
> Can you make an isolated test case to trigger this bug?
in my case it is easily reproduceable. I have an SD-card in our embedded
device (AVR32 AP7000). Some random data is continuously written to an
FAT filesystem on that device. When you pull the card out of the slot
you trigger that NULL pointer dereference.
I will try to reproduce that error on my workstation but this will need
some time. Maybe I can not hit that race on my quad core workstation but
I will give it a try.
regards
Andreas Bie?mann
Dear Jason A. Donenfeld,
Am 02.03.2011 09:35, schrieb Andreas Bie?mann:
> Dear Jason A. Donenfeld,
>
> Am 01.03.2011 10:00, schrieb Jason A. Donenfeld:
>> Can you make an isolated test case to trigger this bug?
>
> in my case it is easily reproduceable. I have an SD-card in our embedded
> device (AVR32 AP7000). Some random data is continuously written to an
> FAT filesystem on that device. When you pull the card out of the slot
> you trigger that NULL pointer dereference.
>
> I will try to reproduce that error on my workstation but this will need
> some time. Maybe I can not hit that race on my quad core workstation but
> I will give it a try.
unfortunately I can not reproduce this error on my workstation. I tested
an 35 in 1 USB card reader and a single USB stick.
I will try to test on another architecture uniprocessor system (e.g. one
of our ARM eval boards).
regards
Andreas Bie?mann
On Wed, 02 Mar 2011 09:35:55 +0100
"Andreas Bie__mann" <[email protected]> wrote:
> Dear Jason A. Donenfeld,
>
> Am 01.03.2011 10:00, schrieb Jason A. Donenfeld:
> > Can you make an isolated test case to trigger this bug?
>
> in my case it is easily reproduceable. I have an SD-card in our embedded
> device (AVR32 AP7000). Some random data is continuously written to an
> FAT filesystem on that device. When you pull the card out of the slot
> you trigger that NULL pointer dereference.
>
> I will try to reproduce that error on my workstation but this will need
> some time. Maybe I can not hit that race on my quad core workstation but
> I will give it a try.
>
afaik this regression didn't get fixed. Jens put out a patch for
George to test but there hasn't been any feedback on that yet. Could
you guys please give it a spin?
From: Jens Axboe <[email protected]>
When we move the potential dirty list entries to the
default_backing_dev_info, reassign the sb->s_bdi as well.
default_backing_dev_info will always be around. I hope this can fix it up
for 2.6.38 and we can add the proper ref counting for .39.
Cc: Anton Altaparmakov <[email protected]>
Cc: George Spelvin <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Andreas Biemann <[email protected]>
Cc: Sergey Senozhatsky <[email protected]>
Tested-by: Torsten Hilbrich <[email protected]>
Cc: <[email protected]> [2.6.38.x]
Signed-off-by: Andrew Morton <[email protected]>
---
fs/super.c | 2 ++
fs/sync.c | 4 ++--
mm/backing-dev.c | 2 +-
3 files changed, 5 insertions(+), 3 deletions(-)
diff -puN fs/super.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb fs/super.c
--- a/fs/super.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb
+++ a/fs/super.c
@@ -72,6 +72,7 @@ static struct super_block *alloc_super(s
#else
INIT_LIST_HEAD(&s->s_files);
#endif
+ s->s_bdi = &default_backing_dev_info;
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
@@ -1006,6 +1007,7 @@ vfs_kern_mount(struct file_system_type *
}
BUG_ON(!mnt->mnt_sb);
WARN_ON(!mnt->mnt_sb->s_bdi);
+ WARN_ON(mnt->mnt_sb->s_bdi == &default_backing_dev_info);
mnt->mnt_sb->s_flags |= MS_BORN;
error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
diff -puN fs/sync.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb fs/sync.c
--- a/fs/sync.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb
+++ a/fs/sync.c
@@ -33,7 +33,7 @@ static int __sync_filesystem(struct supe
* This should be safe, as we require bdi backing to actually
* write out data in the first place
*/
- if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
+ if (sb->s_bdi == &noop_backing_dev_info)
return 0;
if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
static void sync_one_sb(struct super_block *sb, void *arg)
{
- if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
+ if (!(sb->s_flags & MS_RDONLY))
__sync_filesystem(sb, *(int *)arg);
}
/*
diff -puN mm/backing-dev.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb mm/backing-dev.c
--- a/mm/backing-dev.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb
+++ a/mm/backing-dev.c
@@ -598,7 +598,7 @@ static void bdi_prune_sb(struct backing_
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (sb->s_bdi == bdi)
- sb->s_bdi = NULL;
+ sb->s_bdi = &default_backing_dev_info;
}
spin_unlock(&sb_lock);
}
_
btw, Christoph: would this not have been be a less hacky hack?
--- a/fs/fs-writeback.c~a
+++ a/fs/fs-writeback.c
@@ -73,7 +73,7 @@ static inline struct backing_dev_info *i
{
struct super_block *sb = inode->i_sb;
- if (strcmp(sb->s_type->name, "bdev") == 0)
+ if (sb == blockdev_superblock)
return inode->i_mapping->backing_dev_info;
return sb->s_bdi;
_
> afaik this regression didn't get fixed. Jens put out a patch for
> George to test but there hasn't been any feedback on that yet. Could
> you guys please give it a spin?
I'm running with it without problems, but the bug was hard (for me)
to trigger before. So it's a very weak "Works For Me."
Hello,
On (03/17/11 14:04), Andrew Morton wrote:
>[..]
> afaik this regression didn't get fixed. Jens put out a patch for
> George to test but there hasn't been any feedback on that yet. Could
> you guys please give it a spin?
>
Sorry for rather long reply. Seem to work fine for me.
Tested-by: Sergey Senozhatsky <[email protected]>
Thanks,
Sergey
> From: Jens Axboe <[email protected]>
>
> When we move the potential dirty list entries to the
> default_backing_dev_info, reassign the sb->s_bdi as well.
> default_backing_dev_info will always be around. I hope this can fix it up
> for 2.6.38 and we can add the proper ref counting for .39.
>
> Cc: Anton Altaparmakov <[email protected]>
> Cc: George Spelvin <[email protected]>
> Cc: Christoph Hellwig <[email protected]>
> Cc: Andreas Biemann <[email protected]>
> Cc: Sergey Senozhatsky <[email protected]>
> Tested-by: Torsten Hilbrich <[email protected]>
> Cc: <[email protected]> [2.6.38.x]
> Signed-off-by: Andrew Morton <[email protected]>
> ---
>
> fs/super.c | 2 ++
> fs/sync.c | 4 ++--
> mm/backing-dev.c | 2 +-
> 3 files changed, 5 insertions(+), 3 deletions(-)
>
> diff -puN fs/super.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb fs/super.c
> --- a/fs/super.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb
> +++ a/fs/super.c
> @@ -72,6 +72,7 @@ static struct super_block *alloc_super(s
> #else
> INIT_LIST_HEAD(&s->s_files);
> #endif
> + s->s_bdi = &default_backing_dev_info;
> INIT_LIST_HEAD(&s->s_instances);
> INIT_HLIST_BL_HEAD(&s->s_anon);
> INIT_LIST_HEAD(&s->s_inodes);
> @@ -1006,6 +1007,7 @@ vfs_kern_mount(struct file_system_type *
> }
> BUG_ON(!mnt->mnt_sb);
> WARN_ON(!mnt->mnt_sb->s_bdi);
> + WARN_ON(mnt->mnt_sb->s_bdi == &default_backing_dev_info);
> mnt->mnt_sb->s_flags |= MS_BORN;
>
> error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
> diff -puN fs/sync.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb fs/sync.c
> --- a/fs/sync.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb
> +++ a/fs/sync.c
> @@ -33,7 +33,7 @@ static int __sync_filesystem(struct supe
> * This should be safe, as we require bdi backing to actually
> * write out data in the first place
> */
> - if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
> + if (sb->s_bdi == &noop_backing_dev_info)
> return 0;
>
> if (sb->s_qcop && sb->s_qcop->quota_sync)
> @@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
>
> static void sync_one_sb(struct super_block *sb, void *arg)
> {
> - if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
> + if (!(sb->s_flags & MS_RDONLY))
> __sync_filesystem(sb, *(int *)arg);
> }
> /*
> diff -puN mm/backing-dev.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb mm/backing-dev.c
> --- a/mm/backing-dev.c~vfs-fix-null-pointer-oops-in-sync_inodes_sb
> +++ a/mm/backing-dev.c
> @@ -598,7 +598,7 @@ static void bdi_prune_sb(struct backing_
> spin_lock(&sb_lock);
> list_for_each_entry(sb, &super_blocks, s_list) {
> if (sb->s_bdi == bdi)
> - sb->s_bdi = NULL;
> + sb->s_bdi = &default_backing_dev_info;
> }
> spin_unlock(&sb_lock);
> }
> _
>
>
> btw, Christoph: would this not have been be a less hacky hack?
>
> --- a/fs/fs-writeback.c~a
> +++ a/fs/fs-writeback.c
> @@ -73,7 +73,7 @@ static inline struct backing_dev_info *i
> {
> struct super_block *sb = inode->i_sb;
>
> - if (strcmp(sb->s_type->name, "bdev") == 0)
> + if (sb == blockdev_superblock)
> return inode->i_mapping->backing_dev_info;
>
> return sb->s_bdi;
> _
>