--- linux-2.6.13.1/arch/ia64/ia32/binfmt_elf32.c.accterr 2005-09-12 13:59:50.000000000 +0400
+++ linux-2.6.13.1/arch/ia64/ia32/binfmt_elf32.c 2005-09-12 15:10:16.000000000 +0400
@@ -199,7 +199,7 @@
int
ia32_setup_arg_pages (struct linux_binprm *bprm, int executable_stack)
{
- unsigned long stack_base;
+ unsigned long stack_base, vm_end, vm_start;
struct vm_area_struct *mpnt;
struct mm_struct *mm = current->mm;
int i, ret;
@@ -212,23 +212,24 @@
bprm->loader += stack_base;
bprm->exec += stack_base;
+ ret = -ENOMEM;
mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!mpnt)
- return -ENOMEM;
+ goto out;
- if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))
- >> PAGE_SHIFT)) {
- kmem_cache_free(vm_area_cachep, mpnt);
- return -ENOMEM;
- }
+ vm_end = IA32_STACK_TOP;
+ vm_start = PAGE_MASK & (unsigned long)bprm->p;
+
+ if (security_vm_enough_memory((vm_end - vm_start) >> PAGE_SHIFT))
+ goto out_free;
memset(mpnt, 0, sizeof(*mpnt));
down_write(&current->mm->mmap_sem);
{
mpnt->vm_mm = current->mm;
- mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
- mpnt->vm_end = IA32_STACK_TOP;
+ mpnt->vm_start = vm_start;
+ mpnt->vm_end = vm_end;
if (executable_stack == EXSTACK_ENABLE_X)
mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC;
else if (executable_stack == EXSTACK_DISABLE_X)
@@ -237,11 +238,8 @@
mpnt->vm_flags = VM_STACK_FLAGS;
mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)?
PAGE_COPY_EXEC: PAGE_COPY;
- if ((ret = insert_vm_struct(current->mm, mpnt))) {
- up_write(&current->mm->mmap_sem);
- kmem_cache_free(vm_area_cachep, mpnt);
- return ret;
- }
+ if ((ret = insert_vm_struct(current->mm, mpnt)))
+ goto out_up;
current->mm->stack_vm = current->mm->total_vm = vma_pages(mpnt);
}
@@ -260,6 +258,14 @@
current->thread.ppl = ia32_init_pp_list();
return 0;
+
+out_up:
+ up_write(&current->mm->mmap_sem);
+ vm_unacct_memory((vm_end - vm_start) >> PAGE_SHIFT);
+out_free:
+ kmem_cache_free(vm_area_cachep, mpnt);
+out:
+ return ret;
}
static void
--- linux-2.6.13.1/arch/x86_64/ia32/ia32_binfmt.c.accterr 2005-09-12 14:00:30.000000000 +0400
+++ linux-2.6.13.1/arch/x86_64/ia32/ia32_binfmt.c 2005-09-12 15:18:45.000000000 +0400
@@ -337,7 +337,7 @@
int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack)
{
- unsigned long stack_base;
+ unsigned long stack_base, vm_end, vm_start;
struct vm_area_struct *mpnt;
struct mm_struct *mm = current->mm;
int i, ret;
@@ -350,22 +350,24 @@
bprm->loader += stack_base;
bprm->exec += stack_base;
+ ret = -ENOMEM;
mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!mpnt)
- return -ENOMEM;
+ if (!mpnt)
+ goto out;
+
+ vm_end = IA32_STACK_TOP;
+ vm_start = PAGE_MASK & (unsigned long)bprm->p;
- if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) {
- kmem_cache_free(vm_area_cachep, mpnt);
- return -ENOMEM;
- }
+ if (security_vm_enough_memory((vm_end - vm_start)>>PAGE_SHIFT))
+ goto out_free;
memset(mpnt, 0, sizeof(*mpnt));
down_write(&mm->mmap_sem);
{
mpnt->vm_mm = mm;
- mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
- mpnt->vm_end = IA32_STACK_TOP;
+ mpnt->vm_start = vm_start;
+ mpnt->vm_end = vm_end;
if (executable_stack == EXSTACK_ENABLE_X)
mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC;
else if (executable_stack == EXSTACK_DISABLE_X)
@@ -374,11 +376,8 @@
mpnt->vm_flags = VM_STACK_FLAGS;
mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ?
PAGE_COPY_EXEC : PAGE_COPY;
- if ((ret = insert_vm_struct(mm, mpnt))) {
- up_write(&mm->mmap_sem);
- kmem_cache_free(vm_area_cachep, mpnt);
- return ret;
- }
+ if ((ret = insert_vm_struct(mm, mpnt)))
+ goto out_up;
mm->stack_vm = mm->total_vm = vma_pages(mpnt);
}
@@ -393,6 +392,14 @@
up_write(&mm->mmap_sem);
return 0;
+
+out_up:
+ up_write(&mm->mmap_sem);
+ vm_unacct_memory((vm_end - vm_start) >> PAGE_SHIFT);
+out_free:
+ kmem_cache_free(vm_area_cachep, mpnt);
+out:
+ return ret;
}
static unsigned long
--- linux-2.6.13.1/arch/x86_64/ia32/syscall32.c.accterr 2005-09-12 14:00:29.000000000 +0400
+++ linux-2.6.13.1/arch/x86_64/ia32/syscall32.c 2005-09-12 15:23:07.000000000 +0400
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <linux/stringify.h>
#include <linux/security.h>
+#include <linux/mman.h>
#include <asm/proto.h>
#include <asm/tlbflush.h>
#include <asm/ia32_unistd.h>
@@ -49,13 +50,12 @@
struct mm_struct *mm = current->mm;
int ret;
+ ret = -ENOMEM;
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!vma)
- return -ENOMEM;
- if (security_vm_enough_memory(npages)) {
- kmem_cache_free(vm_area_cachep, vma);
- return -ENOMEM;
- }
+ goto out;
+ if (security_vm_enough_memory(npages))
+ goto out_free;
memset(vma, 0, sizeof(struct vm_area_struct));
/* Could randomize here */
@@ -69,14 +69,19 @@
vma->vm_mm = mm;
down_write(&mm->mmap_sem);
- if ((ret = insert_vm_struct(mm, vma))) {
- up_write(&mm->mmap_sem);
- kmem_cache_free(vm_area_cachep, vma);
- return ret;
- }
+ if ((ret = insert_vm_struct(mm, vma)))
+ goto out_up;
mm->total_vm += npages;
up_write(&mm->mmap_sem);
return 0;
+
+out_up:
+ up_write(&mm->mmap_sem);
+ vm_unacct_memory(npages);
+out_free:
+ kmem_cache_free(vm_area_cachep, vma);
+out:
+ return ret;
}
static int __init init_syscall32(void)
--- linux-2.6.13.1/fs/exec.c.accterr 2005-09-12 14:01:43.000000000 +0400
+++ linux-2.6.13.1/fs/exec.c 2005-09-12 15:20:38.000000000 +0400
@@ -417,14 +417,13 @@
bprm->loader += stack_base;
bprm->exec += stack_base;
+ ret = -ENOMEM;
mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!mpnt)
- return -ENOMEM;
+ goto out;
- if (security_vm_enough_memory(arg_size >> PAGE_SHIFT)) {
- kmem_cache_free(vm_area_cachep, mpnt);
- return -ENOMEM;
- }
+ if (security_vm_enough_memory(arg_size >> PAGE_SHIFT))
+ goto out_free;
memset(mpnt, 0, sizeof(*mpnt));
@@ -449,11 +448,8 @@
mpnt->vm_flags = VM_STACK_FLAGS;
mpnt->vm_flags |= mm->def_flags;
mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7];
- if ((ret = insert_vm_struct(mm, mpnt))) {
- up_write(&mm->mmap_sem);
- kmem_cache_free(vm_area_cachep, mpnt);
- return ret;
- }
+ if ((ret = insert_vm_struct(mm, mpnt)))
+ goto out_up;
mm->stack_vm = mm->total_vm = vma_pages(mpnt);
}
@@ -468,6 +464,14 @@
up_write(&mm->mmap_sem);
return 0;
+
+out_up:
+ up_write(&mm->mmap_sem);
+ vm_unacct_memory(arg_size >> PAGE_SHIFT);
+out_free:
+ kmem_cache_free(vm_area_cachep, mpnt);
+out:
+ return ret;
}
EXPORT_SYMBOL(setup_arg_pages);
Kirill Korotaev <[email protected]> wrote:
>
> This patch fixes the error path in the setup_arg_pages() functions, which
> misses a vm_unacct_memory() call after a successful
> security_vm_enough_memory(). It also cleans up the error path.
Ugh. The identifier `security_vm_enough_memory()' sounds like some
predicate which has no side-effects. Except it performs accounting. Hence
bugs like this.
It's a shame that you mixed a largeish cleanup along with a bugfix - please
don't do that in future.
Patch looks OK to me. Hugh, could you please double-check sometime?
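The bug pattern under discussion, modeled as a minimal standalone C program (the helper names charge()/uncharge() and the counter are invented stand-ins for security_vm_enough_memory()/vm_unacct_memory() and Committed_AS, not real kernel interfaces):

#include <stdio.h>

static long committed;              /* stands in for Committed_AS */

static int charge(long pages)       /* models the test-and-reserve primitive */
{
	committed += pages;
	return 0;                   /* assume the overcommit check passes */
}

static void uncharge(long pages)    /* models vm_unacct_memory() */
{
	committed -= pages;
}

/* Buggy shape: the reservation is made, but a later failure returns
 * without undoing it -- the setup_arg_pages() error-path bug. */
static int setup_buggy(long pages, int later_failure)
{
	if (charge(pages))
		return -1;
	if (later_failure)
		return -1;          /* leak: missing uncharge(pages) here */
	return 0;
}

/* Fixed shape: the error path unwinds the reservation. */
static int setup_fixed(long pages, int later_failure)
{
	if (charge(pages))
		return -1;
	if (later_failure) {
		uncharge(pages);
		return -1;
	}
	return 0;
}

int main(void)
{
	setup_buggy(4, 1);
	printf("buggy error path: committed = %ld (leaked)\n", committed);
	committed = 0;
	setup_fixed(4, 1);
	printf("fixed error path: committed = %ld\n", committed);
	return 0;
}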
>> This patch fixes the error path in the setup_arg_pages() functions, which
>> misses a vm_unacct_memory() call after a successful
>> security_vm_enough_memory(). It also cleans up the error path.
>
> Ugh. The identifier `security_vm_enough_memory()' sounds like some
> predicate which has no side-effects. Except it performs accounting. Hence
> bugs like this.
yup, this is really done in a leading-to-bugs way... :(
maybe it is worth moving vm_acct_memory() out of
security_vm_enough_memory()? what do you think?
> It's a shame that you mixed a largeish cleanup along with a bugfix - please
> don't do that in future.
not a problem :) thanks for your time and for looking at the patches I sent.
Kirill
Kirill Korotaev <[email protected]> wrote:
>
> maybe it is worth moving vm_acct_memory() out of
> security_vm_enough_memory()?
I think that would be saner, yes. That means that the callers would call
vm_acct_memory() after security_enough_memory(), if that succeeded.
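Roughly the caller shape this suggestion would lead to, sketched as a standalone C model; would_fit(), acct() and unacct() are invented names for illustration, not proposed kernel interfaces:

#include <stdio.h>

static long commit_limit = 100;     /* pretend overcommit limit, in pages */
static long committed;

static int would_fit(long pages)    /* pure predicate, no side effects */
{
	return committed + pages <= commit_limit;
}

static void acct(long pages)        /* explicit, separate reservation step */
{
	committed += pages;
}

static void unacct(long pages)
{
	committed -= pages;
}

static int setup_split_style(long pages, int later_failure)
{
	if (!would_fit(pages))
		return -1;
	acct(pages);                /* the caller now owns the reservation... */
	if (later_failure) {
		unacct(pages);      /* ...and still has to remember to drop it */
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("split-style setup: %d, committed = %ld\n",
	       setup_split_style(40, 0), committed);
	return 0;
}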
On Mon, 12 Sep 2005, Andrew Morton wrote:
> Kirill Korotaev <[email protected]> wrote:
> >
> > This patch fixes the error path in the setup_arg_pages() functions, which
> > misses a vm_unacct_memory() call after a successful
> > security_vm_enough_memory(). It also cleans up the error path.
>
> Ugh. The identifier `security_vm_enough_memory()' sounds like some
> predicate which has no side-effects. Except it performs accounting. Hence
> bugs like this.
>
> It's a shame that you mixed a largeish cleanup along with a bugfix - please
> don't do that in future.
>
> Patch looks OK to me. Hugh, could you please double-check sometime?
It's a good find, and the patch looks correct to me, so far as it goes.
But I think it's the wrong patch, and incomplete: it can be done more
appropriately, more simply and more completely in insert_vm_struct itself.
I'll post a replacement patch (or admit I'm wrong) in a little while.
Hugh
On Tue, 13 Sep 2005, Andrew Morton wrote:
> Kirill Korotaev <[email protected]> wrote:
> >
> > maybe it is worth moving vm_acct_memory() out of
> > security_vm_enough_memory()?
>
> I think that would be saner, yes. That means that the callers would call
> vm_acct_memory() after security_enough_memory(), if that succeeded.
I don't like that at all. The implementation of its tests is necessarily
imprecise, but nonetheless, we do prefer primitives which atomically test
and reserve. We're not moving from request_region to check_region, are we?
But change the naming by all means, it was never good,
and grew worse when "security_" got stuck on the front.
Hugh
On Maw, 2005-09-13 at 01:40 -0700, Andrew Morton wrote:
> Kirill Korotaev <[email protected]> wrote:
> >
> > maybe it is worth moving vm_acct_memory() out of
> > security_vm_enough_memory()?
>
> I think that would be saner, yes. That means that the callers would call
> vm_acct_memory() after security_enough_memory(), if that succeeded.
It would make much more sense to simply sed security_vm_enough_memory()
into security_vm_claim_memory() or a better name. You need to perform
the process as one operation, otherwise two people checking for enough memory
may both succeed and then both reserve memory, causing overcommits that
should not be permitted.
If you just fix the name you still get the right semantics, but without
the confusion.
Alan
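A deterministic standalone C model of the overcommit Alan describes; every name here is invented for illustration, and try_claim() only approximates the test-and-reserve semantics of the existing primitive:

#include <stdio.h>

static long commit_limit = 100;     /* pretend limit, in pages */
static long committed;

/* Split check / reserve: the shape being warned against. */
static int check_only(long pages)   { return committed + pages <= commit_limit; }
static void reserve(long pages)     { committed += pages; }

/* Atomic test-and-claim: roughly what the current primitive does. */
static int try_claim(long pages)
{
	if (committed + pages > commit_limit)
		return -1;
	committed += pages;
	return 0;
}

int main(void)
{
	/* Interleaving: A checks, B checks, then both reserve. */
	int a_ok = check_only(60);
	int b_ok = check_only(60);
	if (a_ok)
		reserve(60);
	if (b_ok)
		reserve(60);
	printf("split check/reserve: committed = %ld, limit = %ld (overcommitted)\n",
	       committed, commit_limit);

	committed = 0;
	int a = try_claim(60);      /* succeeds */
	int b = try_claim(60);      /* fails: the limit is respected */
	printf("atomic claim: a = %d, b = %d, committed = %ld\n", a, b, committed);
	return 0;
}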
--- linux-2.6.13.1/arch/ppc64/kernel/vdso.c.ppccln 2005-09-13 15:47:26.000000000 +0400
+++ linux-2.6.13.1/arch/ppc64/kernel/vdso.c 2005-09-13 15:55:08.000000000 +0400
@@ -221,13 +221,13 @@ int arch_setup_additional_pages(struct l
if (vdso_pages == 0)
return 0;
+ vdso_base = -ENOMEM;
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (vma == NULL)
- return -ENOMEM;
- if (security_vm_enough_memory(vdso_pages)) {
- kmem_cache_free(vm_area_cachep, vma);
- return -ENOMEM;
- }
+ goto out;
+ if (security_vm_enough_memory(vdso_pages))
+ goto out_free;
+
memset(vma, 0, sizeof(*vma));
/*
@@ -237,11 +237,8 @@ int arch_setup_additional_pages(struct l
*/
vdso_base = get_unmapped_area(NULL, vdso_base,
vdso_pages << PAGE_SHIFT, 0, 0);
- if (vdso_base & ~PAGE_MASK) {
- vm_unacct_memory(vdso_pages);
- kmem_cache_free(vm_area_cachep, vma);
- return (int)vdso_base;
- }
+ if (vdso_base & ~PAGE_MASK)
+ goto out_unacct;
current->thread.vdso_base = vdso_base;
@@ -274,6 +271,13 @@ int arch_setup_additional_pages(struct l
up_write(&mm->mmap_sem);
return 0;
+
+out_unacct:
+ vm_unacct_memory(vdso_pages);
+out_free:
+ kmem_cache_free(vm_area_cachep, vma);
+out:
+ return (int)vdso_base;
}
static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname,
Hugh Dickins <[email protected]> wrote:
>
> On Tue, 13 Sep 2005, Andrew Morton wrote:
> > Kirill Korotaev <[email protected]> wrote:
> > >
> > > maybe it is worth moving vm_acct_memory() out of
> > > security_vm_enough_memory()?
> >
> > I think that would be saner, yes. That means that the callers would call
> > vm_acct_memory() after security_enough_memory(), if that succeeded.
>
> I don't like that at all. The implementation of its tests is necessarily
> imprecise, but nonetheless, we do prefer primitives which atomically test
> and reserve. We're not moving from request_region to check_region, are we?
I don't think that it's any racier to move the allocation to after the
check than to have it before the check. If we're worried, take mmap_sem -
most places already do that, but not all.
> But change the naming by all means, it was never good,
> and grew worse when "security_" got stuck on the front.
Yes, renaming it to something like alloc_vm_space() would suit.
On Tue, 13 Sep 2005, Andrew Morton wrote:
>
> I don't think that it's any racier to move the allocation to after the
> check than to have it before the check. If we're worried, take mmap_sem -
> most places already do that, but not all.
mmap_sem? That locks a single mm, but here we're talking about
making reservations from what /proc/meminfo calls CommitLimit,
for the whole machine. I really don't see any need to change
the ordering of what's done at present.
> > But change the naming by all means, it was never good,
> > and grew worse when "security_" got stuck on the front.
>
> Yes, renaming it to something like alloc_vm_space() would suit.
Nor am I in any hurry to change the name, though I agree with
you and Alan that a name change would be good, in due course.
I'm more interested in fixing the bugs Kirill and co discovered,
and those I'm additionally finding on the way to fixing them in
insert_vm_struct. Notice how running a 32-bit binary on x86_64
leaks 4kB into Committed_AS each time?
But I'm puzzled as to why the same leak into Committed_AS doesn't
occur on ppc64, each time an ELF binary is run, if vDSO is enabled.
Or is it indeed leaking, but nobody has noticed? I don't have any
ppc64, could someone please check and see? Thanks.
insert_vm_struct is certainly the way to go (it's not obvious to
callers whether VM_ACCOUNT is set or not), and there won't be any
security_vm_enough_memory calls outside mm/ (and kernel/fork.c)
once I'm done: just a matter of where to stop (should it also
vm_stat_account? can we trust callers to maintain total_vm?
what about locked_vm? rlimits?).
Hugh
On Tue, 13 Sep 2005, Hugh Dickins wrote:
> On Mon, 12 Sep 2005, Andrew Morton wrote:
> >
> > Patch looks OK to me. Hugh, could you please double-check sometime?
>
> It's a good find, and the patch looks correct to me, so far as it goes.
> But I think it's the wrong patch, and incomplete: it can be done more
> appropriately, more simply and more completely in insert_vm_struct itself.
> I'll post a replacement patch (or admit I'm wrong) in a little while.
Many thanks, Andrew, for confirming that ppc64 does indeed leak into
Committed_AS, as it looked to me. Here's my version of Pavel/Kirill's
patches: sorry if it seems "weird" to Kirill, perhaps we need to change
the name of insert_vm_struct too; but it seems safer and easier to get
right this way. And I prefer deleting code to adding it...
Pavel Emelianov and Kirill Korotaev observe that fs and arch users of
security_vm_enough_memory tend to forget to vm_unacct_memory when a
failure occurs further down (typically in setup_arg_pages variants).
These are all users of insert_vm_struct, and that reservation will only
be unaccounted on exit if the vma is marked VM_ACCOUNT: which in some
cases it is (hidden inside VM_STACK_FLAGS) and in some cases it isn't.
So x86_64 32-bit and ppc64 vDSO ELFs have been leaking memory into
Committed_AS each time they're run. But don't add VM_ACCOUNT to them,
it's inappropriate to reserve against the very unlikely case that gdb
be used to COW a vDSO page - we ought to do something about that in
do_wp_page, but there are yet other inconsistencies to be resolved.
The safe and economical way to fix this is to let insert_vm_struct do
the security_vm_enough_memory check when it finds VM_ACCOUNT is set.
And the MIPS irix_brk has been calling security_vm_enough_memory before
calling do_brk which repeats it, doubly accounting and so also leaking.
Remove that, and all the fs and arch calls to security_vm_enough_memory:
give it a less misleading name later on.
Signed-off-by: Hugh Dickins <[email protected]>
---
arch/ia64/ia32/binfmt_elf32.c | 6 ------
arch/mips/kernel/sysirix.c | 9 ++-------
arch/ppc64/kernel/vdso.c | 15 +++++++++------
arch/x86_64/ia32/ia32_binfmt.c | 5 -----
arch/x86_64/ia32/syscall32.c | 6 +-----
fs/exec.c | 5 -----
mm/mmap.c | 3 +++
7 files changed, 15 insertions(+), 34 deletions(-)
--- 2.6.14-rc1/arch/ia64/ia32/binfmt_elf32.c 2005-03-02 07:39:16.000000000 +0000
+++ linux/arch/ia64/ia32/binfmt_elf32.c 2005-09-13 17:58:28.000000000 +0100
@@ -216,12 +216,6 @@ ia32_setup_arg_pages (struct linux_binpr
if (!mpnt)
return -ENOMEM;
- if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))
- >> PAGE_SHIFT)) {
- kmem_cache_free(vm_area_cachep, mpnt);
- return -ENOMEM;
- }
-
memset(mpnt, 0, sizeof(*mpnt));
down_write(&current->mm->mmap_sem);
--- 2.6.14-rc1/arch/mips/kernel/sysirix.c 2005-09-13 15:22:15.000000000 +0100
+++ linux/arch/mips/kernel/sysirix.c 2005-09-13 18:51:43.000000000 +0100
@@ -581,18 +581,13 @@ asmlinkage int irix_brk(unsigned long br
}
/*
- * Check if we have enough memory..
+ * Ok, looks good - let it rip.
*/
- if (security_vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) {
+ if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) {
ret = -ENOMEM;
goto out;
}
-
- /*
- * Ok, looks good - let it rip.
- */
mm->brk = brk;
- do_brk(oldbrk, newbrk-oldbrk);
ret = 0;
out:
--- 2.6.14-rc1/arch/ppc64/kernel/vdso.c 2005-06-17 20:48:29.000000000 +0100
+++ linux/arch/ppc64/kernel/vdso.c 2005-09-13 20:50:02.000000000 +0100
@@ -224,10 +224,7 @@ int arch_setup_additional_pages(struct l
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (vma == NULL)
return -ENOMEM;
- if (security_vm_enough_memory(vdso_pages)) {
- kmem_cache_free(vm_area_cachep, vma);
- return -ENOMEM;
- }
+
memset(vma, 0, sizeof(*vma));
/*
@@ -237,8 +234,10 @@ int arch_setup_additional_pages(struct l
*/
vdso_base = get_unmapped_area(NULL, vdso_base,
vdso_pages << PAGE_SHIFT, 0, 0);
- if (vdso_base & ~PAGE_MASK)
+ if (vdso_base & ~PAGE_MASK) {
+ kmem_cache_free(vm_area_cachep, vma);
return (int)vdso_base;
+ }
current->thread.vdso_base = vdso_base;
@@ -266,7 +265,11 @@ int arch_setup_additional_pages(struct l
vma->vm_ops = &vdso_vmops;
down_write(&mm->mmap_sem);
- insert_vm_struct(mm, vma);
+ if (insert_vm_struct(mm, vma)) {
+ up_write(&mm->mmap_sem);
+ kmem_cache_free(vm_area_cachep, vma);
+ return -ENOMEM;
+ }
mm->total_vm += (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
up_write(&mm->mmap_sem);
--- 2.6.14-rc1/arch/x86_64/ia32/ia32_binfmt.c 2005-08-29 00:41:01.000000000 +0100
+++ linux/arch/x86_64/ia32/ia32_binfmt.c 2005-09-13 18:05:20.000000000 +0100
@@ -353,11 +353,6 @@ int setup_arg_pages(struct linux_binprm
mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!mpnt)
return -ENOMEM;
-
- if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) {
- kmem_cache_free(vm_area_cachep, mpnt);
- return -ENOMEM;
- }
memset(mpnt, 0, sizeof(*mpnt));
--- 2.6.14-rc1/arch/x86_64/ia32/syscall32.c 2005-08-29 00:41:01.000000000 +0100
+++ linux/arch/x86_64/ia32/syscall32.c 2005-09-13 18:53:32.000000000 +0100
@@ -52,17 +52,13 @@ int syscall32_setup_pages(struct linux_b
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!vma)
return -ENOMEM;
- if (security_vm_enough_memory(npages)) {
- kmem_cache_free(vm_area_cachep, vma);
- return -ENOMEM;
- }
memset(vma, 0, sizeof(struct vm_area_struct));
/* Could randomize here */
vma->vm_start = VSYSCALL32_BASE;
vma->vm_end = VSYSCALL32_END;
/* MAYWRITE to allow gdb to COW and set breakpoints */
- vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYEXEC|VM_MAYWRITE;
+ vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
vma->vm_flags |= mm->def_flags;
vma->vm_page_prot = protection_map[vma->vm_flags & 7];
vma->vm_ops = &syscall32_vm_ops;
--- 2.6.14-rc1/fs/exec.c 2005-09-13 15:22:37.000000000 +0100
+++ linux/fs/exec.c 2005-09-13 17:50:46.000000000 +0100
@@ -421,11 +421,6 @@ int setup_arg_pages(struct linux_binprm
if (!mpnt)
return -ENOMEM;
- if (security_vm_enough_memory(arg_size >> PAGE_SHIFT)) {
- kmem_cache_free(vm_area_cachep, mpnt);
- return -ENOMEM;
- }
-
memset(mpnt, 0, sizeof(*mpnt));
down_write(&mm->mmap_sem);
--- 2.6.14-rc1/mm/mmap.c 2005-09-13 15:22:47.000000000 +0100
+++ linux/mm/mmap.c 2005-09-13 18:59:59.000000000 +0100
@@ -1993,6 +1993,9 @@ int insert_vm_struct(struct mm_struct *
__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
if (__vma && __vma->vm_start < vma->vm_end)
return -ENOMEM;
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory(vma_pages(vma)))
+ return -ENOMEM;
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
}
Hugh,
is the vma accounting in arch/x86_64/ia32/syscall32.c and
arch/ppc64/kernel/vdso.c removed due to the fixed size of the vma (vsyscall/vdso
mappings)?
In other respects it looks ok.
Signed-Off-By: Kirill Korotaev <[email protected]>
Kirill
> [Hugh's full reply and patch snipped]
On Wed, 14 Sep 2005, Kirill Korotaev wrote:
>
> is the vma accounting in arch/x86_64/ia32/syscall32.c and arch/ppc64/kernel/vdso.c
> removed due to the fixed size of the vma (vsyscall/vdso mappings)?
> In other respects it looks ok.
It's removed because without VM_ACCOUNT it was steadily leaking:
the main path was wrong, never mind the error path. We could add
VM_ACCOUNT to the flags to fix that, but in 99.999% of cases we
should not be accounting these - all mms are sharing the same page.
Ben designed it to allow for gdb setting breakpoints in the vDSO
page (COW then giving a private page), but that's a very unusual
case, which isn't handled quite right anyway: we need to take a
separate look at that sometime - we account for what's VM_WRITE,
not for what ptrace might write to by the backdoor.
> > So x86_64 32-bit and ppc64 vDSO ELFs have been leaking memory into
> > Committed_AS each time they're run. But don't add VM_ACCOUNT to them,
> > it's inappropriate to reserve against the very unlikely case that gdb
> > be used to COW a vDSO page - we ought to do something about that in
> > do_wp_page, but there are yet other inconsistencies to be resolved.
Hugh