From: Andreas Herrmann <[email protected]>
This is a resubmit of Andreas' patch that reduces stackframe usage in
do_mount. Problem is that without this patch we get a kernel stack
overflow if we run with 4k stacks (s390 31 bit mode).
See original stack back trace below and Andreas' patch and analysis
here:
http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
<4>Call Trace:
<4>([<000000000014ade6>] buffered_rmqueue+0x36a/0x3b4)
<4> [<000000000014aed6>] __alloc_pages+0xa6/0x350
<4> [<000000000014b258>] __get_free_pages+0x38/0x74
<4> [<000000000014f9fe>] kmem_getpages+0x32/0x140
<4> [<0000000000150396>] cache_alloc_refill+0x3ae/0x58c
<4> [<000000000014feb0>] __kmalloc+0xc0/0xe8
<4> [<00000000002acc64>] zfcp_mempool_alloc+0x24/0x34
<4> [<00000000001494ea>] mempool_alloc+0x56/0x1e8
<4> [<00000000002bfb46>] zfcp_fsf_req_create+0x56/0x5d8
<4> [<00000000002c0cf0>] zfcp_fsf_send_fcp_command_task+0x44/0x5c4
<4> [<00000000002afbda>] zfcp_scsi_command_async+0x7a/0x1f8
<4> [<00000000002afe68>] zfcp_scsi_queuecommand+0x44/0x54
<4> [<000000000024f458>] scsi_dispatch_cmd+0x234/0x3c0
<4> [<0000000000255fb2>] scsi_request_fn+0x2c6/0x67c
<4> [<000000000023f3ea>] __generic_unplug_device+0x52/0x60
<4> [<000000000023d392>] __elv_add_request+0x8a/0x98
<4> [<0000000000240c5e>] __make_request+0x306/0x62c
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
<4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
<4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
<4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
<4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
<4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
<4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<000000000024121a>] submit_bio+0x7a/0x154
<4> [<00000000001736e6>] submit_bh+0x106/0x184
<4> [<000000000017514e>] __block_write_full_page+0x23a/0x58c
<4> [<000000000017558e>] block_write_full_page+0xee/0x108
<4> [<000000000017a908>] blkdev_writepage+0x24/0x38
<4> [<00000000001a0f76>] mpage_writepages+0x1da/0x9d0
<4> [<000000000017a4fa>] generic_writepages+0x22/0x30
<4> [<000000000014cc1e>] do_writepages+0x36/0x54
<4> [<0000000000144fe2>] __filemap_fdatawrite+0x5a/0x6c
<4> [<0000000000145016>] filemap_fdatawrite+0x22/0x30
<4> [<0000000000171244>] sync_blockdev+0x30/0x5c
<4> [<00000000001df074>] journal_recover+0x88/0xbc
<4> [<00000000001e2876>] journal_load+0x52/0xcc
<4> [<00000000001d37b4>] ext3_fill_super+0x1a48/0x1d38
<4> [<0000000000179faa>] get_sb_bdev+0x13a/0x208
<4> [<00000000001d3bd6>] ext3_get_sb+0x22/0x34
<4> [<000000000017a32e>] do_kern_mount+0x86/0x1f4
<4> [<000000000019aaf4>] do_mount+0x1c8/0x8e4
<4> [<000000000019b7ba>] sys_mount+0xce/0x1dc
<4> [<000000000010f9a6>] sysc_do_restart+0xe/0x12
Signed-off-by: Andreas Herrmann <[email protected]>
Signed-off-by: Heiko Carstens <[email protected]>
---
fs/namespace.c | 80 ++++++++++++++++++++++++++++++++++++---------------------
1 files changed, 51 insertions(+), 29 deletions(-)
diff -urpN linux-2.6/fs/namespace.c linux-2.6-patched/fs/namespace.c
--- linux-2.6/fs/namespace.c 2005-10-28 02:02:08.000000000 +0200
+++ linux-2.6-patched/fs/namespace.c 2005-11-03 13:39:30.000000000 +0100
@@ -619,25 +619,31 @@ out_unlock:
*/
static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
{
- struct nameidata old_nd;
+ struct nameidata *old_nd;
struct vfsmount *mnt = NULL;
int err = mount_is_safe(nd);
if (err)
return err;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
- if (err)
+
+ old_nd = kmalloc(sizeof(*old_nd), GFP_KERNEL);
+ if (!old_nd)
+ return -ENOMEM;
+ err = path_lookup(old_name, LOOKUP_FOLLOW, old_nd);
+ if (err) {
+ kfree(old_nd);
return err;
+ }
down_write(¤t->namespace->sem);
err = -EINVAL;
- if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
+ if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd->mnt))) {
err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+ mnt = copy_tree(old_nd->mnt, old_nd->dentry);
else
- mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+ mnt = clone_mnt(old_nd->mnt, old_nd->dentry);
}
if (mnt) {
@@ -656,7 +662,8 @@ static int do_loopback(struct nameidata
}
up_write(¤t->namespace->sem);
- path_release(&old_nd);
+ path_release(old_nd);
+ kfree(old_nd);
return err;
}
@@ -693,22 +700,29 @@ static int do_remount(struct nameidata *
static int do_move_mount(struct nameidata *nd, char *old_name)
{
- struct nameidata old_nd, parent_nd;
+ struct {
+ struct nameidata old_nd, parent_nd;
+ } *loc;
struct vfsmount *p;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
- if (err)
+ loc = kmalloc(sizeof(*loc), GFP_KERNEL);
+ if (!loc)
+ return -ENOMEM;
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &loc->old_nd);
+ if (err) {
+ kfree (loc);
return err;
+ }
down_write(¤t->namespace->sem);
while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
- if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
+ if (!check_mnt(nd->mnt) || !check_mnt(loc->old_nd.mnt))
goto out;
err = -ENOENT;
@@ -721,28 +735,28 @@ static int do_move_mount(struct nameidat
goto out2;
err = -EINVAL;
- if (old_nd.dentry != old_nd.mnt->mnt_root)
+ if (loc->old_nd.dentry != loc->old_nd.mnt->mnt_root)
goto out2;
- if (old_nd.mnt == old_nd.mnt->mnt_parent)
+ if (loc->old_nd.mnt == loc->old_nd.mnt->mnt_parent)
goto out2;
if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
- S_ISDIR(old_nd.dentry->d_inode->i_mode))
+ S_ISDIR(loc->old_nd.dentry->d_inode->i_mode))
goto out2;
err = -ELOOP;
for (p = nd->mnt; p->mnt_parent!=p; p = p->mnt_parent)
- if (p == old_nd.mnt)
+ if (p == loc->old_nd.mnt)
goto out2;
err = 0;
- detach_mnt(old_nd.mnt, &parent_nd);
- attach_mnt(old_nd.mnt, nd);
+ detach_mnt(loc->old_nd.mnt, &loc->parent_nd);
+ attach_mnt(loc->old_nd.mnt, nd);
/* if the mount is moved, it should no longer be expire
* automatically */
- list_del_init(&old_nd.mnt->mnt_expire);
+ list_del_init(&loc->old_nd.mnt->mnt_expire);
out2:
spin_unlock(&vfsmount_lock);
out1:
@@ -750,8 +764,9 @@ out1:
out:
up_write(¤t->namespace->sem);
if (!err)
- path_release(&parent_nd);
- path_release(&old_nd);
+ path_release(&loc->parent_nd);
+ path_release(&loc->old_nd);
+ kfree(loc);
return err;
}
@@ -1014,7 +1029,7 @@ int copy_mount_options(const void __user
long do_mount(char * dev_name, char * dir_name, char *type_page,
unsigned long flags, void *data_page)
{
- struct nameidata nd;
+ struct nameidata *nd;
int retval = 0;
int mnt_flags = 0;
@@ -1041,27 +1056,34 @@ long do_mount(char * dev_name, char * di
mnt_flags |= MNT_NOEXEC;
flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);
+ nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+ if (!nd)
+ return -ENOMEM;
+
/* ... and get the mountpoint */
- retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
- if (retval)
+ retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd);
+ if (retval) {
+ kfree(nd);
return retval;
+ }
- retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
+ retval = security_sb_mount(dev_name, nd, type_page, flags, data_page);
if (retval)
goto dput_out;
if (flags & MS_REMOUNT)
- retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+ retval = do_remount(nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ retval = do_loopback(nd, dev_name, flags & MS_REC);
else if (flags & MS_MOVE)
- retval = do_move_mount(&nd, dev_name);
+ retval = do_move_mount(nd, dev_name);
else
- retval = do_new_mount(&nd, type_page, flags, mnt_flags,
+ retval = do_new_mount(nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
- path_release(&nd);
+ path_release(nd);
+ kfree(nd);
return retval;
}
On Fri, Nov 04, 2005 at 11:50:26AM +0100, Heiko Carstens wrote:
> From: Andreas Herrmann <[email protected]>
>
> This is a resubmit of Andreas' patch that reduces stackframe usage in
> do_mount. Problem is that without this patch we get a kernel stack
> overflow if we run with 4k stacks (s390 31 bit mode).
> See original stack back trace below and Andreas' patch and analysis
> here:
> http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
NAK. Rationale: too ugly.
> > This is a resubmit of Andreas' patch that reduces stackframe usage in
> > do_mount. Problem is that without this patch we get a kernel stack
> > overflow if we run with 4k stacks (s390 31 bit mode).
> > See original stack back trace below and Andreas' patch and analysis
> > here:
> > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
>
> NAK. Rationale: too ugly.
Ok, since I can only guess what you don't like: here is an updated patch
that probably addresses a few things.
If you don't like this one too, could you please explain what should be
changed?
Thanks,
Heiko
---
fs/namespace.c | 79 ++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 50 insertions(+), 29 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 2fa9fdf..c14fbfc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -619,25 +619,30 @@ out_unlock:
*/
static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
{
- struct nameidata old_nd;
+ struct nameidata *old_nd;
struct vfsmount *mnt = NULL;
int err = mount_is_safe(nd);
if (err)
return err;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+
+ old_nd = kmalloc(sizeof(*old_nd), GFP_KERNEL);
+ if (!old_nd)
+ return -ENOMEM;
+
+ err = path_lookup(old_name, LOOKUP_FOLLOW, old_nd);
if (err)
- return err;
+ goto out;
down_write(¤t->namespace->sem);
err = -EINVAL;
- if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
+ if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd->mnt))) {
err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+ mnt = copy_tree(old_nd->mnt, old_nd->dentry);
else
- mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+ mnt = clone_mnt(old_nd->mnt, old_nd->dentry);
}
if (mnt) {
@@ -656,7 +661,9 @@ static int do_loopback(struct nameidata
}
up_write(¤t->namespace->sem);
- path_release(&old_nd);
+ path_release(old_nd);
+out:
+ kfree(old_nd);
return err;
}
@@ -693,22 +700,28 @@ static int do_remount(struct nameidata *
static int do_move_mount(struct nameidata *nd, char *old_name)
{
- struct nameidata old_nd, parent_nd;
+ struct {
+ struct nameidata old_nd, parent_nd;
+ } *loc;
struct vfsmount *p;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+ loc = kmalloc(sizeof(*loc), GFP_KERNEL);
+ if (!loc)
+ return -ENOMEM;
+
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &loc->old_nd);
if (err)
- return err;
+ goto path_out;
down_write(¤t->namespace->sem);
while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
- if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
+ if (!check_mnt(nd->mnt) || !check_mnt(loc->old_nd.mnt))
goto out;
err = -ENOENT;
@@ -721,28 +734,28 @@ static int do_move_mount(struct nameidat
goto out2;
err = -EINVAL;
- if (old_nd.dentry != old_nd.mnt->mnt_root)
+ if (loc->old_nd.dentry != loc->old_nd.mnt->mnt_root)
goto out2;
- if (old_nd.mnt == old_nd.mnt->mnt_parent)
+ if (loc->old_nd.mnt == loc->old_nd.mnt->mnt_parent)
goto out2;
if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
- S_ISDIR(old_nd.dentry->d_inode->i_mode))
+ S_ISDIR(loc->old_nd.dentry->d_inode->i_mode))
goto out2;
err = -ELOOP;
for (p = nd->mnt; p->mnt_parent!=p; p = p->mnt_parent)
- if (p == old_nd.mnt)
+ if (p == loc->old_nd.mnt)
goto out2;
err = 0;
- detach_mnt(old_nd.mnt, &parent_nd);
- attach_mnt(old_nd.mnt, nd);
+ detach_mnt(loc->old_nd.mnt, &loc->parent_nd);
+ attach_mnt(loc->old_nd.mnt, nd);
/* if the mount is moved, it should no longer be expire
* automatically */
- list_del_init(&old_nd.mnt->mnt_expire);
+ list_del_init(&loc->old_nd.mnt->mnt_expire);
out2:
spin_unlock(&vfsmount_lock);
out1:
@@ -750,8 +763,10 @@ out1:
out:
up_write(¤t->namespace->sem);
if (!err)
- path_release(&parent_nd);
- path_release(&old_nd);
+ path_release(&loc->parent_nd);
+ path_release(&loc->old_nd);
+path_out:
+ kfree(loc);
return err;
}
@@ -1014,7 +1029,7 @@ int copy_mount_options(const void __user
long do_mount(char * dev_name, char * dir_name, char *type_page,
unsigned long flags, void *data_page)
{
- struct nameidata nd;
+ struct nameidata *nd;
int retval = 0;
int mnt_flags = 0;
@@ -1041,27 +1056,33 @@ long do_mount(char * dev_name, char * di
mnt_flags |= MNT_NOEXEC;
flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);
+ nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+ if (!nd)
+ return -ENOMEM;
+
/* ... and get the mountpoint */
- retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
+ retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd);
if (retval)
- return retval;
+ goto path_out;
- retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
+ retval = security_sb_mount(dev_name, nd, type_page, flags, data_page);
if (retval)
goto dput_out;
if (flags & MS_REMOUNT)
- retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+ retval = do_remount(nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ retval = do_loopback(nd, dev_name, flags & MS_REC);
else if (flags & MS_MOVE)
- retval = do_move_mount(&nd, dev_name);
+ retval = do_move_mount(nd, dev_name);
else
- retval = do_new_mount(&nd, type_page, flags, mnt_flags,
+ retval = do_new_mount(nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
- path_release(&nd);
+ path_release(nd);
+path_out:
+ kfree(nd);
return retval;
}
On Fri, Nov 04, 2005 at 01:57:05PM +0100, Heiko Carstens wrote:
> > > This is a resubmit of Andreas' patch that reduces stackframe usage in
> > > do_mount. Problem is that without this patch we get a kernel stack
> > > overflow if we run with 4k stacks (s390 31 bit mode).
> > > See original stack back trace below and Andreas' patch and analysis
> > > here:
> > > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
> >
> > NAK. Rationale: too ugly.
>
> Ok, since I can only guess what you don't like: here is an updated patch
> that probably addresses a few things.
> If you don't like this one too, could you please explain what should be
> changed?
Depth analysis. E.g. do_move_mount() change is simply nonsense - _this_
is not going to overflow, no matter what. And do_add_mount() change
is also very suspicious - looks like you are attacking the wrong place
in call chain.
On 04.11.2005 15:06 Al Viro wrote:
> On Fri, Nov 04, 2005 at 01:57:05PM +0100, Heiko Carstens wrote:
> > Ok, since I can only guess what you don't like: here is an updated
patch
> > that probably addresses a few things.
> > If you don't like this one too, could you please explain what should
be
> > changed?
> Depth analysis. E.g. do_move_mount() change is simply nonsense - _this_
> is not going to overflow, no matter what. And do_add_mount() change
> is also very suspicious - looks like you are attacking the wrong place
> in call chain.
Obviously you missed the point that (depending on the compiler version,
options etc.) do_move_mount() and do_add_mount() can be inlined.
Hence stack frame size of do_mount is directly influenced by those two
functions. At least when I performed my "skin-deep" analysis of this
kernel stack overflow both functions were inlined in do_mount().
Stack consumptions of all other functions occurring in the backtrace
were moderate or due to the amount of inlined functions.
Just do_mount and its inlined friends use to put larger structures on
the stack that better should be allocated. There would not be any
performance impact for the user to allocate the structs.
Regards,
Andreas
Heiko Carstens <[email protected]> wrote:
>
> From: Andreas Herrmann <[email protected]>
>
> This is a resubmit of Andreas' patch that reduces stackframe usage in
> do_mount. Problem is that without this patch we get a kernel stack
> overflow if we run with 4k stacks (s390 31 bit mode).
> See original stack back trace below and Andreas' patch and analysis
> here:
> http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
>
> <4>Call Trace:
> <4>([<000000000014ade6>] buffered_rmqueue+0x36a/0x3b4)
> <4> [<000000000014aed6>] __alloc_pages+0xa6/0x350
> <4> [<000000000014b258>] __get_free_pages+0x38/0x74
> <4> [<000000000014f9fe>] kmem_getpages+0x32/0x140
> <4> [<0000000000150396>] cache_alloc_refill+0x3ae/0x58c
> <4> [<000000000014feb0>] __kmalloc+0xc0/0xe8
> <4> [<00000000002acc64>] zfcp_mempool_alloc+0x24/0x34
> <4> [<00000000001494ea>] mempool_alloc+0x56/0x1e8
> <4> [<00000000002bfb46>] zfcp_fsf_req_create+0x56/0x5d8
> <4> [<00000000002c0cf0>] zfcp_fsf_send_fcp_command_task+0x44/0x5c4
> <4> [<00000000002afbda>] zfcp_scsi_command_async+0x7a/0x1f8
> <4> [<00000000002afe68>] zfcp_scsi_queuecommand+0x44/0x54
> <4> [<000000000024f458>] scsi_dispatch_cmd+0x234/0x3c0
> <4> [<0000000000255fb2>] scsi_request_fn+0x2c6/0x67c
> <4> [<000000000023f3ea>] __generic_unplug_device+0x52/0x60
> <4> [<000000000023d392>] __elv_add_request+0x8a/0x98
> <4> [<0000000000240c5e>] __make_request+0x306/0x62c
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<000000000024121a>] submit_bio+0x7a/0x154
> <4> [<00000000001736e6>] submit_bh+0x106/0x184
> <4> [<000000000017514e>] __block_write_full_page+0x23a/0x58c
> <4> [<000000000017558e>] block_write_full_page+0xee/0x108
> <4> [<000000000017a908>] blkdev_writepage+0x24/0x38
> <4> [<00000000001a0f76>] mpage_writepages+0x1da/0x9d0
> <4> [<000000000017a4fa>] generic_writepages+0x22/0x30
> <4> [<000000000014cc1e>] do_writepages+0x36/0x54
> <4> [<0000000000144fe2>] __filemap_fdatawrite+0x5a/0x6c
> <4> [<0000000000145016>] filemap_fdatawrite+0x22/0x30
> <4> [<0000000000171244>] sync_blockdev+0x30/0x5c
> <4> [<00000000001df074>] journal_recover+0x88/0xbc
> <4> [<00000000001e2876>] journal_load+0x52/0xcc
> <4> [<00000000001d37b4>] ext3_fill_super+0x1a48/0x1d38
> <4> [<0000000000179faa>] get_sb_bdev+0x13a/0x208
> <4> [<00000000001d3bd6>] ext3_get_sb+0x22/0x34
> <4> [<000000000017a32e>] do_kern_mount+0x86/0x1f4
> <4> [<000000000019aaf4>] do_mount+0x1c8/0x8e4
> <4> [<000000000019b7ba>] sys_mount+0xce/0x1dc
> <4> [<000000000010f9a6>] sysc_do_restart+0xe/0x12
I'd call that a device mapper bug. If you were to increase the stacking
from 4-deep to 5-deep, it will crash the kernel, patched or not.
I'm (very) surprised that DM doesn't perform its stacking iteratively.
Eliminating the 36-byte `struct clone_info' in __split_bio() would be most
effective here.
Andreas Herrmann <[email protected]> wrote:
>
> Obviously you missed the point that (depending on the compiler version,
> options etc.) do_move_mount() and do_add_mount() can be inlined.
I think we found a way of preventing the 3.x compiler from doing that. Arjan,
do you recall where we ended up with that problem?
On Fri, Nov 04, 2005 at 09:42:12AM -0800, Andrew Morton wrote:
> Andreas Herrmann <[email protected]> wrote:
> >
> > Obviously you missed the point that (depending on the compiler version,
> > options etc.) do_move_mount() and do_add_mount() can be inlined.
>
> I think we found a way of preventing the 3.x compiler from doing that. Arjan,
> do you recall where we ended up with that problem?
it was mostly caused by -funit-at-a-time that caused 3.x to go haywire. You
need to turn that off in general.
the real fix is in gcc 4.x (4.1 at least) where gcc got a lot smarter about
stack slot lifetimes
Arjan van de Ven <[email protected]> wrote:
>
> On Fri, Nov 04, 2005 at 09:42:12AM -0800, Andrew Morton wrote:
> > Andreas Herrmann <[email protected]> wrote:
> > >
> > > Obviously you missed the point that (depending on the compiler version,
> > > options etc.) do_move_mount() and do_add_mount() can be inlined.
> >
> > I think we found a way of preventing the 3.x compiler from doing that. Arjan,
> > do you recall where we ended up with that problem?
>
> it was mostly caused by -funit-at-a-time that caused 3.x to go haywire. You
> need to turn that off in general.
In that case, does s390 still have a problem that we need to be solving here?
(s390 doesn't implement CONFIG_DEBUG_STACK_USAGE. Suggest you do so, then
stick a dump_stack() into do_exit()).
(dm is still doing very risky things though)
> > See original stack back trace below and Andreas' patch and analysis
> > here:
> > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
I probably should add that with "original" stack back trace a trace of
a 2.6.10 kernel was meant, if that wasn't clear, but the DM code is
still the same in 2.6.14.
> > <4>Call Trace:
...
> > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
...
This part of the call trace is actually good for >1500 bytes of stack
usage and is what kills us and should be fixed.
I'm surprised that there are no other bug reports regarding DM and
stack overflow with 4k stacks.
> I'd call that a device mapper bug. If you were to increase the stacking
> from 4-deep to 5-deep, it will crash the kernel, patched or not.
Yes, you're right. Just forget the do_mount() patch. It's the wrong approach.
Heiko
> > If you don't like this one too, could you please explain what should be
> > changed?
>
> Depth analysis. E.g. do_move_mount() change is simply nonsense - _this_
> is not going to overflow, no matter what. And do_add_mount() change
> is also very suspicious - looks like you are attacking the wrong place
> in call chain.
You're right (see also my other reply to Andrew).
Thanks,
Heiko
On Fri, Nov 04, 2005 at 10:27:42PM +0100, Heiko Carstens wrote:
> > > See original stack back trace below and Andreas' patch and analysis
> > > here:
> > > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
>
> I probably should add that with "original" stack back trace a trace of
> a 2.6.10 kernel was meant, if that wasn't clear, but the DM code is
> still the same in 2.6.14.
>
> > > <4>Call Trace:
> ...
> > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> ...
>
> This part of the call trace is actually good for >1500 bytes of stack
> usage and is what kills us and should be fixed.
> I'm surprised that there are no other bug reports regarding DM and
> stack overflow with 4k stacks.
>...
There were some reports of dm+xfs overflows with 4k stacks on i386.
The xfs side was sorted out, but I son't know the state of the dm part.
> Heiko
cu
Adrian
--
"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed
Adrian Bunk <[email protected]> wrote:
>
> On Fri, Nov 04, 2005 at 10:27:42PM +0100, Heiko Carstens wrote:
> > > > See original stack back trace below and Andreas' patch and analysis
> > > > here:
> > > > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
> >
> > I probably should add that with "original" stack back trace a trace of
> > a 2.6.10 kernel was meant, if that wasn't clear, but the DM code is
> > still the same in 2.6.14.
> >
> > > > <4>Call Trace:
> > ...
> > > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > ...
> >
> > This part of the call trace is actually good for >1500 bytes of stack
> > usage and is what kills us and should be fixed.
> > I'm surprised that there are no other bug reports regarding DM and
> > stack overflow with 4k stacks.
> >...
>
> There were some reports of dm+xfs overflows with 4k stacks on i386.
>
> The xfs side was sorted out, but I son't know the state of the dm part.
>
The state is Very Bad, IMO. At the very least, DM should struggle to the
utmost to reduce the stack utilisation around that recursion point.
> > > This part of the call trace is actually good for >1500 bytes of stack
> > > usage and is what kills us and should be fixed.
> > > I'm surprised that there are no other bug reports regarding DM and
> > > stack overflow with 4k stacks.
> > >...
> >
> > There were some reports of dm+xfs overflows with 4k stacks on i386.
> >
> > The xfs side was sorted out, but I son't know the state of the dm part.
> >
>
> The state is Very Bad, IMO. At the very least, DM should struggle to the
> utmost to reduce the stack utilisation around that recursion point.
Neil Brown suggested a change to generic_make_request to convert recursion
through it into iteration (see http://lkml.org/lkml/2005/9/2/34 ), but
there was no discussion at the time. Would this help with this case?
Chris
[email protected] wrote:
>
> > > > This part of the call trace is actually good for >1500 bytes of stack
> > > > usage and is what kills us and should be fixed.
> > > > I'm surprised that there are no other bug reports regarding DM and
> > > > stack overflow with 4k stacks.
> > > >...
> > >
> > > There were some reports of dm+xfs overflows with 4k stacks on i386.
> > >
> > > The xfs side was sorted out, but I son't know the state of the dm part.
> > >
> >
> > The state is Very Bad, IMO. At the very least, DM should struggle to the
> > utmost to reduce the stack utilisation around that recursion point.
>
> Neil Brown suggested a change to generic_make_request to convert recursion
> through it into iteration (see http://lkml.org/lkml/2005/9/2/34 ), but
> there was no discussion at the time. Would this help with this case?
It certainly would. That looks like a good thing to do some more work on.
On Friday November 4, [email protected] wrote:
> [email protected] wrote:
> >
> > > > > This part of the call trace is actually good for >1500 bytes of stack
> > > > > usage and is what kills us and should be fixed.
> > > > > I'm surprised that there are no other bug reports regarding DM and
> > > > > stack overflow with 4k stacks.
> > > > >...
> > > >
> > > > There were some reports of dm+xfs overflows with 4k stacks on i386.
> > > >
> > > > The xfs side was sorted out, but I son't know the state of the dm part.
> > > >
> > >
> > > The state is Very Bad, IMO. At the very least, DM should struggle to the
> > > utmost to reduce the stack utilisation around that recursion point.
> >
> > Neil Brown suggested a change to generic_make_request to convert recursion
> > through it into iteration (see http://lkml.org/lkml/2005/9/2/34 ), but
> > there was no discussion at the time. Would this help with this case?
>
> It certainly would. That looks like a good thing to do some more work on.
Ok, I'll dust it off, make sure it seems to work (at the time I first
wrote it, I think it caused 'md' to deadlock) and submit it.
NeilBrown