2005-11-04 10:50:36

by Heiko Carstens

[permalink] [raw]
Subject: [PATCH resubmit] do_mount: reduce stack consumption

From: Andreas Herrmann <[email protected]>

This is a resubmit of Andreas' patch that reduces stackframe usage in
do_mount. Problem is that without this patch we get a kernel stack
overflow if we run with 4k stacks (s390 31 bit mode).
See original stack back trace below and Andreas' patch and analysis
here:
http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html

<4>Call Trace:
<4>([<000000000014ade6>] buffered_rmqueue+0x36a/0x3b4)
<4> [<000000000014aed6>] __alloc_pages+0xa6/0x350
<4> [<000000000014b258>] __get_free_pages+0x38/0x74
<4> [<000000000014f9fe>] kmem_getpages+0x32/0x140
<4> [<0000000000150396>] cache_alloc_refill+0x3ae/0x58c
<4> [<000000000014feb0>] __kmalloc+0xc0/0xe8
<4> [<00000000002acc64>] zfcp_mempool_alloc+0x24/0x34
<4> [<00000000001494ea>] mempool_alloc+0x56/0x1e8
<4> [<00000000002bfb46>] zfcp_fsf_req_create+0x56/0x5d8
<4> [<00000000002c0cf0>] zfcp_fsf_send_fcp_command_task+0x44/0x5c4
<4> [<00000000002afbda>] zfcp_scsi_command_async+0x7a/0x1f8
<4> [<00000000002afe68>] zfcp_scsi_queuecommand+0x44/0x54
<4> [<000000000024f458>] scsi_dispatch_cmd+0x234/0x3c0
<4> [<0000000000255fb2>] scsi_request_fn+0x2c6/0x67c
<4> [<000000000023f3ea>] __generic_unplug_device+0x52/0x60
<4> [<000000000023d392>] __elv_add_request+0x8a/0x98
<4> [<0000000000240c5e>] __make_request+0x306/0x62c
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
<4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
<4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
<4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
<4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
<4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
<4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<000000000024121a>] submit_bio+0x7a/0x154
<4> [<00000000001736e6>] submit_bh+0x106/0x184
<4> [<000000000017514e>] __block_write_full_page+0x23a/0x58c
<4> [<000000000017558e>] block_write_full_page+0xee/0x108
<4> [<000000000017a908>] blkdev_writepage+0x24/0x38
<4> [<00000000001a0f76>] mpage_writepages+0x1da/0x9d0
<4> [<000000000017a4fa>] generic_writepages+0x22/0x30
<4> [<000000000014cc1e>] do_writepages+0x36/0x54
<4> [<0000000000144fe2>] __filemap_fdatawrite+0x5a/0x6c
<4> [<0000000000145016>] filemap_fdatawrite+0x22/0x30
<4> [<0000000000171244>] sync_blockdev+0x30/0x5c
<4> [<00000000001df074>] journal_recover+0x88/0xbc
<4> [<00000000001e2876>] journal_load+0x52/0xcc
<4> [<00000000001d37b4>] ext3_fill_super+0x1a48/0x1d38
<4> [<0000000000179faa>] get_sb_bdev+0x13a/0x208
<4> [<00000000001d3bd6>] ext3_get_sb+0x22/0x34
<4> [<000000000017a32e>] do_kern_mount+0x86/0x1f4
<4> [<000000000019aaf4>] do_mount+0x1c8/0x8e4
<4> [<000000000019b7ba>] sys_mount+0xce/0x1dc
<4> [<000000000010f9a6>] sysc_do_restart+0xe/0x12

Signed-off-by: Andreas Herrmann <[email protected]>
Signed-off-by: Heiko Carstens <[email protected]>
---

fs/namespace.c | 80 ++++++++++++++++++++++++++++++++++++---------------------
1 files changed, 51 insertions(+), 29 deletions(-)

diff -urpN linux-2.6/fs/namespace.c linux-2.6-patched/fs/namespace.c
--- linux-2.6/fs/namespace.c 2005-10-28 02:02:08.000000000 +0200
+++ linux-2.6-patched/fs/namespace.c 2005-11-03 13:39:30.000000000 +0100
@@ -619,25 +619,31 @@ out_unlock:
*/
static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
{
- struct nameidata old_nd;
+ struct nameidata *old_nd;
struct vfsmount *mnt = NULL;
int err = mount_is_safe(nd);
if (err)
return err;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
- if (err)
+
+ old_nd = kmalloc(sizeof(*old_nd), GFP_KERNEL);
+ if (!old_nd)
+ return -ENOMEM;
+ err = path_lookup(old_name, LOOKUP_FOLLOW, old_nd);
+ if (err) {
+ kfree(old_nd);
return err;
+ }

down_write(&current->namespace->sem);
err = -EINVAL;
- if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
+ if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd->mnt))) {
err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+ mnt = copy_tree(old_nd->mnt, old_nd->dentry);
else
- mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+ mnt = clone_mnt(old_nd->mnt, old_nd->dentry);
}

if (mnt) {
@@ -656,7 +662,8 @@ static int do_loopback(struct nameidata
}

up_write(&current->namespace->sem);
- path_release(&old_nd);
+ path_release(old_nd);
+ kfree(old_nd);
return err;
}

@@ -693,22 +700,29 @@ static int do_remount(struct nameidata *

static int do_move_mount(struct nameidata *nd, char *old_name)
{
- struct nameidata old_nd, parent_nd;
+ struct {
+ struct nameidata old_nd, parent_nd;
+ } *loc;
struct vfsmount *p;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
- if (err)
+ loc = kmalloc(sizeof(*loc), GFP_KERNEL);
+ if (!loc)
+ return -ENOMEM;
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &loc->old_nd);
+ if (err) {
+ kfree (loc);
return err;
+ }

down_write(&current->namespace->sem);
while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
- if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
+ if (!check_mnt(nd->mnt) || !check_mnt(loc->old_nd.mnt))
goto out;

err = -ENOENT;
@@ -721,28 +735,28 @@ static int do_move_mount(struct nameidat
goto out2;

err = -EINVAL;
- if (old_nd.dentry != old_nd.mnt->mnt_root)
+ if (loc->old_nd.dentry != loc->old_nd.mnt->mnt_root)
goto out2;

- if (old_nd.mnt == old_nd.mnt->mnt_parent)
+ if (loc->old_nd.mnt == loc->old_nd.mnt->mnt_parent)
goto out2;

if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
- S_ISDIR(old_nd.dentry->d_inode->i_mode))
+ S_ISDIR(loc->old_nd.dentry->d_inode->i_mode))
goto out2;

err = -ELOOP;
for (p = nd->mnt; p->mnt_parent!=p; p = p->mnt_parent)
- if (p == old_nd.mnt)
+ if (p == loc->old_nd.mnt)
goto out2;
err = 0;

- detach_mnt(old_nd.mnt, &parent_nd);
- attach_mnt(old_nd.mnt, nd);
+ detach_mnt(loc->old_nd.mnt, &loc->parent_nd);
+ attach_mnt(loc->old_nd.mnt, nd);

/* if the mount is moved, it should no longer be expire
* automatically */
- list_del_init(&old_nd.mnt->mnt_expire);
+ list_del_init(&loc->old_nd.mnt->mnt_expire);
out2:
spin_unlock(&vfsmount_lock);
out1:
@@ -750,8 +764,9 @@ out1:
out:
up_write(&current->namespace->sem);
if (!err)
- path_release(&parent_nd);
- path_release(&old_nd);
+ path_release(&loc->parent_nd);
+ path_release(&loc->old_nd);
+ kfree(loc);
return err;
}

@@ -1014,7 +1029,7 @@ int copy_mount_options(const void __user
long do_mount(char * dev_name, char * dir_name, char *type_page,
unsigned long flags, void *data_page)
{
- struct nameidata nd;
+ struct nameidata *nd;
int retval = 0;
int mnt_flags = 0;

@@ -1041,27 +1056,34 @@ long do_mount(char * dev_name, char * di
mnt_flags |= MNT_NOEXEC;
flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);

+ nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+ if (!nd)
+ return -ENOMEM;
+
/* ... and get the mountpoint */
- retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
- if (retval)
+ retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd);
+ if (retval) {
+ kfree(nd);
return retval;
+ }

- retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
+ retval = security_sb_mount(dev_name, nd, type_page, flags, data_page);
if (retval)
goto dput_out;

if (flags & MS_REMOUNT)
- retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+ retval = do_remount(nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ retval = do_loopback(nd, dev_name, flags & MS_REC);
else if (flags & MS_MOVE)
- retval = do_move_mount(&nd, dev_name);
+ retval = do_move_mount(nd, dev_name);
else
- retval = do_new_mount(&nd, type_page, flags, mnt_flags,
+ retval = do_new_mount(nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
- path_release(&nd);
+ path_release(nd);
+ kfree(nd);
return retval;
}


2005-11-04 11:48:20

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

On Fri, Nov 04, 2005 at 11:50:26AM +0100, Heiko Carstens wrote:
> From: Andreas Herrmann <[email protected]>
>
> This is a resubmit of Andreas' patch that reduces stackframe usage in
> do_mount. Problem is that without this patch we get a kernel stack
> overflow if we run with 4k stacks (s390 31 bit mode).
> See original stack back trace below and Andreas' patch and analysis
> here:
> http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html

NAK. Rationale: too ugly.

2005-11-04 12:57:12

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

> > This is a resubmit of Andreas' patch that reduces stackframe usage in
> > do_mount. Problem is that without this patch we get a kernel stack
> > overflow if we run with 4k stacks (s390 31 bit mode).
> > See original stack back trace below and Andreas' patch and analysis
> > here:
> > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
>
> NAK. Rationale: too ugly.

Ok, since I can only guess what you don't like: here is an updated patch
that probably addresses a few things.
If you don't like this one too, could you please explain what should be
changed?

Thanks,
Heiko
---

fs/namespace.c | 79 ++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 50 insertions(+), 29 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 2fa9fdf..c14fbfc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -619,25 +619,30 @@ out_unlock:
*/
static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
{
- struct nameidata old_nd;
+ struct nameidata *old_nd;
struct vfsmount *mnt = NULL;
int err = mount_is_safe(nd);
if (err)
return err;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+
+ old_nd = kmalloc(sizeof(*old_nd), GFP_KERNEL);
+ if (!old_nd)
+ return -ENOMEM;
+
+ err = path_lookup(old_name, LOOKUP_FOLLOW, old_nd);
if (err)
- return err;
+ goto out;

down_write(&current->namespace->sem);
err = -EINVAL;
- if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
+ if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd->mnt))) {
err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+ mnt = copy_tree(old_nd->mnt, old_nd->dentry);
else
- mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+ mnt = clone_mnt(old_nd->mnt, old_nd->dentry);
}

if (mnt) {
@@ -656,7 +661,9 @@ static int do_loopback(struct nameidata
}

up_write(&current->namespace->sem);
- path_release(&old_nd);
+ path_release(old_nd);
+out:
+ kfree(old_nd);
return err;
}

@@ -693,22 +700,28 @@ static int do_remount(struct nameidata *

static int do_move_mount(struct nameidata *nd, char *old_name)
{
- struct nameidata old_nd, parent_nd;
+ struct {
+ struct nameidata old_nd, parent_nd;
+ } *loc;
struct vfsmount *p;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+ loc = kmalloc(sizeof(*loc), GFP_KERNEL);
+ if (!loc)
+ return -ENOMEM;
+
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &loc->old_nd);
if (err)
- return err;
+ goto path_out;

down_write(&current->namespace->sem);
while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
- if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
+ if (!check_mnt(nd->mnt) || !check_mnt(loc->old_nd.mnt))
goto out;

err = -ENOENT;
@@ -721,28 +734,28 @@ static int do_move_mount(struct nameidat
goto out2;

err = -EINVAL;
- if (old_nd.dentry != old_nd.mnt->mnt_root)
+ if (loc->old_nd.dentry != loc->old_nd.mnt->mnt_root)
goto out2;

- if (old_nd.mnt == old_nd.mnt->mnt_parent)
+ if (loc->old_nd.mnt == loc->old_nd.mnt->mnt_parent)
goto out2;

if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
- S_ISDIR(old_nd.dentry->d_inode->i_mode))
+ S_ISDIR(loc->old_nd.dentry->d_inode->i_mode))
goto out2;

err = -ELOOP;
for (p = nd->mnt; p->mnt_parent!=p; p = p->mnt_parent)
- if (p == old_nd.mnt)
+ if (p == loc->old_nd.mnt)
goto out2;
err = 0;

- detach_mnt(old_nd.mnt, &parent_nd);
- attach_mnt(old_nd.mnt, nd);
+ detach_mnt(loc->old_nd.mnt, &loc->parent_nd);
+ attach_mnt(loc->old_nd.mnt, nd);

/* if the mount is moved, it should no longer be expire
* automatically */
- list_del_init(&old_nd.mnt->mnt_expire);
+ list_del_init(&loc->old_nd.mnt->mnt_expire);
out2:
spin_unlock(&vfsmount_lock);
out1:
@@ -750,8 +763,10 @@ out1:
out:
up_write(&current->namespace->sem);
if (!err)
- path_release(&parent_nd);
- path_release(&old_nd);
+ path_release(&loc->parent_nd);
+ path_release(&loc->old_nd);
+path_out:
+ kfree(loc);
return err;
}

@@ -1014,7 +1029,7 @@ int copy_mount_options(const void __user
long do_mount(char * dev_name, char * dir_name, char *type_page,
unsigned long flags, void *data_page)
{
- struct nameidata nd;
+ struct nameidata *nd;
int retval = 0;
int mnt_flags = 0;

@@ -1041,27 +1056,33 @@ long do_mount(char * dev_name, char * di
mnt_flags |= MNT_NOEXEC;
flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);

+ nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+ if (!nd)
+ return -ENOMEM;
+
/* ... and get the mountpoint */
- retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
+ retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd);
if (retval)
- return retval;
+ goto path_out;

- retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
+ retval = security_sb_mount(dev_name, nd, type_page, flags, data_page);
if (retval)
goto dput_out;

if (flags & MS_REMOUNT)
- retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+ retval = do_remount(nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ retval = do_loopback(nd, dev_name, flags & MS_REC);
else if (flags & MS_MOVE)
- retval = do_move_mount(&nd, dev_name);
+ retval = do_move_mount(nd, dev_name);
else
- retval = do_new_mount(&nd, type_page, flags, mnt_flags,
+ retval = do_new_mount(nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
- path_release(&nd);
+ path_release(nd);
+path_out:
+ kfree(nd);
return retval;
}

2005-11-04 14:06:57

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

On Fri, Nov 04, 2005 at 01:57:05PM +0100, Heiko Carstens wrote:
> > > This is a resubmit of Andreas' patch that reduces stackframe usage in
> > > do_mount. Problem is that without this patch we get a kernel stack
> > > overflow if we run with 4k stacks (s390 31 bit mode).
> > > See original stack back trace below and Andreas' patch and analysis
> > > here:
> > > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
> >
> > NAK. Rationale: too ugly.
>
> Ok, since I can only guess what you don't like: here is an updated patch
> that probably addresses a few things.
> If you don't like this one too, could you please explain what should be
> changed?

Depth analysis. E.g. do_move_mount() change is simply nonsense - _this_
is not going to overflow, no matter what. And do_add_mount() change
is also very suspicious - looks like you are attacking the wrong place
in call chain.

2005-11-04 15:53:09

by Andreas Herrmann

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

On 04.11.2005 15:06 Al Viro wrote:
> On Fri, Nov 04, 2005 at 01:57:05PM +0100, Heiko Carstens wrote:
> > Ok, since I can only guess what you don't like: here is an updated
patch
> > that probably addresses a few things.
> > If you don't like this one too, could you please explain what should
be
> > changed?

> Depth analysis. E.g. do_move_mount() change is simply nonsense - _this_
> is not going to overflow, no matter what. And do_add_mount() change
> is also very suspicious - looks like you are attacking the wrong place
> in call chain.

Obviously you missed the point that (depending on the compiler version,
options etc.) do_move_mount() and do_add_mount() can be inlined.
Hence stack frame size of do_mount is directly influenced by those two
functions. At least when I performed my "skin-deep" analysis of this
kernel stack overflow both functions were inlined in do_mount().

Stack consumptions of all other functions occurring in the backtrace
were moderate or due to the amount of inlined functions.

Just do_mount and its inlined friends use to put larger structures on
the stack that better should be allocated. There would not be any
performance impact for the user to allocate the structs.


Regards,

Andreas

2005-11-04 16:48:41

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

Heiko Carstens <[email protected]> wrote:
>
> From: Andreas Herrmann <[email protected]>
>
> This is a resubmit of Andreas' patch that reduces stackframe usage in
> do_mount. Problem is that without this patch we get a kernel stack
> overflow if we run with 4k stacks (s390 31 bit mode).
> See original stack back trace below and Andreas' patch and analysis
> here:
> http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
>
> <4>Call Trace:
> <4>([<000000000014ade6>] buffered_rmqueue+0x36a/0x3b4)
> <4> [<000000000014aed6>] __alloc_pages+0xa6/0x350
> <4> [<000000000014b258>] __get_free_pages+0x38/0x74
> <4> [<000000000014f9fe>] kmem_getpages+0x32/0x140
> <4> [<0000000000150396>] cache_alloc_refill+0x3ae/0x58c
> <4> [<000000000014feb0>] __kmalloc+0xc0/0xe8
> <4> [<00000000002acc64>] zfcp_mempool_alloc+0x24/0x34
> <4> [<00000000001494ea>] mempool_alloc+0x56/0x1e8
> <4> [<00000000002bfb46>] zfcp_fsf_req_create+0x56/0x5d8
> <4> [<00000000002c0cf0>] zfcp_fsf_send_fcp_command_task+0x44/0x5c4
> <4> [<00000000002afbda>] zfcp_scsi_command_async+0x7a/0x1f8
> <4> [<00000000002afe68>] zfcp_scsi_queuecommand+0x44/0x54
> <4> [<000000000024f458>] scsi_dispatch_cmd+0x234/0x3c0
> <4> [<0000000000255fb2>] scsi_request_fn+0x2c6/0x67c
> <4> [<000000000023f3ea>] __generic_unplug_device+0x52/0x60
> <4> [<000000000023d392>] __elv_add_request+0x8a/0x98
> <4> [<0000000000240c5e>] __make_request+0x306/0x62c
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<000000000024121a>] submit_bio+0x7a/0x154
> <4> [<00000000001736e6>] submit_bh+0x106/0x184
> <4> [<000000000017514e>] __block_write_full_page+0x23a/0x58c
> <4> [<000000000017558e>] block_write_full_page+0xee/0x108
> <4> [<000000000017a908>] blkdev_writepage+0x24/0x38
> <4> [<00000000001a0f76>] mpage_writepages+0x1da/0x9d0
> <4> [<000000000017a4fa>] generic_writepages+0x22/0x30
> <4> [<000000000014cc1e>] do_writepages+0x36/0x54
> <4> [<0000000000144fe2>] __filemap_fdatawrite+0x5a/0x6c
> <4> [<0000000000145016>] filemap_fdatawrite+0x22/0x30
> <4> [<0000000000171244>] sync_blockdev+0x30/0x5c
> <4> [<00000000001df074>] journal_recover+0x88/0xbc
> <4> [<00000000001e2876>] journal_load+0x52/0xcc
> <4> [<00000000001d37b4>] ext3_fill_super+0x1a48/0x1d38
> <4> [<0000000000179faa>] get_sb_bdev+0x13a/0x208
> <4> [<00000000001d3bd6>] ext3_get_sb+0x22/0x34
> <4> [<000000000017a32e>] do_kern_mount+0x86/0x1f4
> <4> [<000000000019aaf4>] do_mount+0x1c8/0x8e4
> <4> [<000000000019b7ba>] sys_mount+0xce/0x1dc
> <4> [<000000000010f9a6>] sysc_do_restart+0xe/0x12

I'd call that a device mapper bug. If you were to increase the stacking
from 4-deep to 5-deep, it will crash the kernel, patched or not.

I'm (very) surprised that DM doesn't perform its stacking iteratively.

Eliminating the 36-byte `struct clone_info' in __split_bio() would be most
effective here.


2005-11-04 17:42:48

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

Andreas Herrmann <[email protected]> wrote:
>
> Obviously you missed the point that (depending on the compiler version,
> options etc.) do_move_mount() and do_add_mount() can be inlined.

I think we found a way of preventing the 3.x compiler from doing that. Arjan,
do you recall where we ended up with that problem?

2005-11-04 17:45:20

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

On Fri, Nov 04, 2005 at 09:42:12AM -0800, Andrew Morton wrote:
> Andreas Herrmann <[email protected]> wrote:
> >
> > Obviously you missed the point that (depending on the compiler version,
> > options etc.) do_move_mount() and do_add_mount() can be inlined.
>
> I think we found a way of preventing the 3.x compiler from doing that. Arjan,
> do you recall where we ended up with that problem?

it was mostly caused by -funit-at-a-time that caused 3.x to go haywire. You
need to turn that off in general.

the real fix is in gcc 4.x (4.1 at least) where gcc got a lot smarter about
stack slot lifetimes

2005-11-04 18:38:04

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

Arjan van de Ven <[email protected]> wrote:
>
> On Fri, Nov 04, 2005 at 09:42:12AM -0800, Andrew Morton wrote:
> > Andreas Herrmann <[email protected]> wrote:
> > >
> > > Obviously you missed the point that (depending on the compiler version,
> > > options etc.) do_move_mount() and do_add_mount() can be inlined.
> >
> > I think we found a way of preventing the 3.x compiler from doing that. Arjan,
> > do you recall where we ended up with that problem?
>
> it was mostly caused by -funit-at-a-time that caused 3.x to go haywire. You
> need to turn that off in general.

In that case, does s390 still have a problem that we need to be solving here?

(s390 doesn't implement CONFIG_DEBUG_STACK_USAGE. Suggest you do so, then
stick a dump_stack() into do_exit()).

(dm is still doing very risky things though)

2005-11-04 21:27:54

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

> > See original stack back trace below and Andreas' patch and analysis
> > here:
> > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html

I probably should add that with "original" stack back trace a trace of
a 2.6.10 kernel was meant, if that wasn't clear, but the DM code is
still the same in 2.6.14.

> > <4>Call Trace:
...
> > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
...

This part of the call trace is actually good for >1500 bytes of stack
usage and is what kills us and should be fixed.
I'm surprised that there are no other bug reports regarding DM and
stack overflow with 4k stacks.

> I'd call that a device mapper bug. If you were to increase the stacking
> from 4-deep to 5-deep, it will crash the kernel, patched or not.

Yes, you're right. Just forget the do_mount() patch. It's the wrong approach.

Heiko

2005-11-04 22:03:59

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

> > If you don't like this one too, could you please explain what should be
> > changed?
>
> Depth analysis. E.g. do_move_mount() change is simply nonsense - _this_
> is not going to overflow, no matter what. And do_add_mount() change
> is also very suspicious - looks like you are attacking the wrong place
> in call chain.

You're right (see also my other reply to Andrew).

Thanks,
Heiko

2005-11-04 23:55:04

by Adrian Bunk

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

On Fri, Nov 04, 2005 at 10:27:42PM +0100, Heiko Carstens wrote:
> > > See original stack back trace below and Andreas' patch and analysis
> > > here:
> > > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
>
> I probably should add that with "original" stack back trace a trace of
> a 2.6.10 kernel was meant, if that wasn't clear, but the DM code is
> still the same in 2.6.14.
>
> > > <4>Call Trace:
> ...
> > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> ...
>
> This part of the call trace is actually good for >1500 bytes of stack
> usage and is what kills us and should be fixed.
> I'm surprised that there are no other bug reports regarding DM and
> stack overflow with 4k stacks.
>...

There were some reports of dm+xfs overflows with 4k stacks on i386.

The xfs side was sorted out, but I son't know the state of the dm part.

> Heiko

cu
Adrian

--

"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed

2005-11-05 00:12:09

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

Adrian Bunk <[email protected]> wrote:
>
> On Fri, Nov 04, 2005 at 10:27:42PM +0100, Heiko Carstens wrote:
> > > > See original stack back trace below and Andreas' patch and analysis
> > > > here:
> > > > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
> >
> > I probably should add that with "original" stack back trace a trace of
> > a 2.6.10 kernel was meant, if that wasn't clear, but the DM code is
> > still the same in 2.6.14.
> >
> > > > <4>Call Trace:
> > ...
> > > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > ...
> >
> > This part of the call trace is actually good for >1500 bytes of stack
> > usage and is what kills us and should be fixed.
> > I'm surprised that there are no other bug reports regarding DM and
> > stack overflow with 4k stacks.
> >...
>
> There were some reports of dm+xfs overflows with 4k stacks on i386.
>
> The xfs side was sorted out, but I son't know the state of the dm part.
>

The state is Very Bad, IMO. At the very least, DM should struggle to the
utmost to reduce the stack utilisation around that recursion point.

2005-11-05 01:20:43

by cplk

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

> > > This part of the call trace is actually good for >1500 bytes of stack
> > > usage and is what kills us and should be fixed.
> > > I'm surprised that there are no other bug reports regarding DM and
> > > stack overflow with 4k stacks.
> > >...
> >
> > There were some reports of dm+xfs overflows with 4k stacks on i386.
> >
> > The xfs side was sorted out, but I son't know the state of the dm part.
> >
>
> The state is Very Bad, IMO. At the very least, DM should struggle to the
> utmost to reduce the stack utilisation around that recursion point.

Neil Brown suggested a change to generic_make_request to convert recursion
through it into iteration (see http://lkml.org/lkml/2005/9/2/34 ), but
there was no discussion at the time. Would this help with this case?

Chris

2005-11-05 01:40:59

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

[email protected] wrote:
>
> > > > This part of the call trace is actually good for >1500 bytes of stack
> > > > usage and is what kills us and should be fixed.
> > > > I'm surprised that there are no other bug reports regarding DM and
> > > > stack overflow with 4k stacks.
> > > >...
> > >
> > > There were some reports of dm+xfs overflows with 4k stacks on i386.
> > >
> > > The xfs side was sorted out, but I son't know the state of the dm part.
> > >
> >
> > The state is Very Bad, IMO. At the very least, DM should struggle to the
> > utmost to reduce the stack utilisation around that recursion point.
>
> Neil Brown suggested a change to generic_make_request to convert recursion
> through it into iteration (see http://lkml.org/lkml/2005/9/2/34 ), but
> there was no discussion at the time. Would this help with this case?

It certainly would. That looks like a good thing to do some more work on.

2005-11-05 05:57:08

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH resubmit] do_mount: reduce stack consumption

On Friday November 4, [email protected] wrote:
> [email protected] wrote:
> >
> > > > > This part of the call trace is actually good for >1500 bytes of stack
> > > > > usage and is what kills us and should be fixed.
> > > > > I'm surprised that there are no other bug reports regarding DM and
> > > > > stack overflow with 4k stacks.
> > > > >...
> > > >
> > > > There were some reports of dm+xfs overflows with 4k stacks on i386.
> > > >
> > > > The xfs side was sorted out, but I son't know the state of the dm part.
> > > >
> > >
> > > The state is Very Bad, IMO. At the very least, DM should struggle to the
> > > utmost to reduce the stack utilisation around that recursion point.
> >
> > Neil Brown suggested a change to generic_make_request to convert recursion
> > through it into iteration (see http://lkml.org/lkml/2005/9/2/34 ), but
> > there was no discussion at the time. Would this help with this case?
>
> It certainly would. That looks like a good thing to do some more work on.

Ok, I'll dust it off, make sure it seems to work (at the time I first
wrote it, I think it caused 'md' to deadlock) and submit it.

NeilBrown