2004-03-13 03:39:26

by Ray Bryant

Subject: Hugetlbpages in very large memory machines.......

We've run into a scaling problem using hugetlbpages in very large memory machines, e.g. machines
with 1TB or more of main memory. The problem is that hugetlbpage pages are not faulted in; rather,
they are zeroed and mapped in by hugetlb_prefault() (at least on ia64), which is called in
response to the user's mmap() request. The net effect is that all of the hugetlb pages end up being
allocated and zeroed by a single thread, and if most of the machine's memory is allocated to hugetlb
pages, and there is 1 TB or more of main memory, zeroing and allocating all of those pages can take
a long time (500 s or more).

We've looked at allocating and zeroing hugetlbpages at fault time, which would at least allow
multiple processors to be thrown at the problem. Question is, has anyone else been working on
this problem and might they have prototype code they could share with us?
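
(For concreteness, here is a minimal userspace sketch of the kind of mapping
involved, assuming a hugetlbfs mount at /mnt/huge; the path and size are
illustrative. All of the allocate-and-zero work described above happens
inside the single mmap() call.)

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1UL << 40;         /* 1 TB of huge pages (illustrative) */
        void *p;
        int fd;

        fd = open("/mnt/huge/seg", O_CREAT | O_RDWR, 0600);
        if (fd < 0)
                return 1;
        /* hugetlb_prefault() runs inside this call: every huge page is
         * allocated and zeroed by this one thread before mmap() returns,
         * hence the 500 s figure above. */
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                perror("mmap");
        return 0;
}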

Thanks,
--
Best Regards,
Ray
-----------------------------------------------
Ray Bryant
512-453-9679 (work) 512-507-7807 (cell)
[email protected] [email protected]
The box said: "Requires Windows 98 or better",
so I installed Linux.
-----------------------------------------------


2004-03-13 03:54:36

by Andi Kleen

Subject: Re: Hugetlbpages in very large memory machines.......

On Fri, Mar 12, 2004 at 09:44:03PM -0600, Ray Bryant wrote:
> We've run into a scaling problem using hugetlbpages in very large memory
> machines, e.g. machines with 1TB or more of main memory. The problem is
> that hugetlbpage pages are not faulted in; rather, they are zeroed and
> mapped in by hugetlb_prefault() (at least on ia64), which is called in
> response to the user's mmap() request. The net effect is that all of the hugetlb
> pages end up being allocated and zeroed by a single thread, and if most of
> the machine's memory is allocated to hugetlb pages, and there is 1 TB or
> more of main memory, zeroing and allocating all of those pages can take a
> long time (500 s or more).
>
> We've looked at allocating and zeroing hugetlbpages at fault time, which
> would at least allow multiple processors to be thrown at the problem.
> Question is, has anyone else been working on
> this problem and might they have prototype code they could share with us?

Yes. I ran into exactly this problem with NUMA API too.
mbind() runs after mmap, but it cannot work anymore when
the pages are already allocated.

I fixed it on x86-64/i386 by allocating the pages lazily.
Doing it for IA64 has been on the todo list too.

i386/x86-64 code attached as an example.

One drawback is that the out-of-memory handling is a lot less nice
than it was before: when you run out of hugepages you now get SIGBUS
instead of an ENOMEM from mmap(). Maybe some prereservation would
make sense, but that would be somewhat harder. Alternatively,
fall back to smaller pages if possible (I was told that isn't easily
possible on IA64).

-Andi


diff -burpN -X ../KDIFX linux-2.6.2/arch/i386/mm/hugetlbpage.c linux-2.6.2-numa/arch/i386/mm/hugetlbpage.c
--- linux-2.6.2/arch/i386/mm/hugetlbpage.c 2004-02-24 20:48:10.000000000 +0100
+++ linux-2.6.2-numa/arch/i386/mm/hugetlbpage.c 2004-02-20 18:52:57.000000000 +0100
@@ -329,41 +333,43 @@ zap_hugepage_range(struct vm_area_struct
spin_unlock(&mm->page_table_lock);
}

-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+/* page_table_lock held on entry. */
+static int
+hugetlb_alloc_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, int write_access)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
-
- BUG_ON(vma->vm_start & ~HPAGE_MASK);
- BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
- spin_lock(&mm->page_table_lock);
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
+ int ret;
+ pte_t *pte;
+ struct page *page = NULL;
+ struct address_space *mapping = vma->vm_file->f_mapping;

+ pte = huge_pte_alloc(mm, addr);
if (!pte) {
- ret = -ENOMEM;
+ ret = VM_FAULT_OOM;
goto out;
}
- if (!pte_none(*pte))
- continue;

idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
page = find_get_page(mapping, idx);
if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
+ /* Should do this at prefault time, but that gets us into
+ trouble with freeing right now. */
+ ret = hugetlb_get_quota(mapping);
+ if (ret) {
+ ret = VM_FAULT_OOM;
goto out;
}
- page = alloc_hugetlb_page();
+
+ page = alloc_hugetlb_page(vma);
if (!page) {
hugetlb_put_quota(mapping);
- ret = -ENOMEM;
+
+ /* Instead of OOMing here could just transparently use
+ small pages. */
+
+ ret = VM_FAULT_OOM;
goto out;
}
ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
@@ -371,23 +377,62 @@ int hugetlb_prefault(struct address_spac
if (ret) {
hugetlb_put_quota(mapping);
free_huge_page(page);
+ ret = VM_FAULT_SIGBUS;
goto out;
}
- }
+ ret = VM_FAULT_MAJOR;
+ } else
+ ret = VM_FAULT_MINOR;
+
set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
- }
-out:
+ /* Don't need to flush other CPUs. They will just do a page
+ fault and flush it lazily. */
+ __flush_tlb_one(addr);
+
+ out:
spin_unlock(&mm->page_table_lock);
return ret;
}

+int arch_hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pmd_t *pmd;
+ pgd_t *pgd;
+
+ if (write_access && !(vma->vm_flags & VM_WRITE))
+ return VM_FAULT_SIGBUS;
+
+ spin_lock(&mm->page_table_lock);
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd))
+ return hugetlb_alloc_fault(mm, vma, address, write_access);
+
+ pmd = pmd_offset(pgd, address);
+ if (pmd_none(*pmd))
+ return hugetlb_alloc_fault(mm, vma, address, write_access);
+
+ BUG_ON(!pmd_large(*pmd));
+
+ /* must have been a race. Flush the TLB. NX not supported yet. */
+
+ __flush_tlb_one(address);
+ spin_unlock(&mm->page_table_lock);
+ return VM_FAULT_MINOR;
+}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ return 0;
+}
+
static void update_and_free_page(struct page *page)
{
int j;
struct page *map;

map = page;
- htlbzone_pages--;
+ htlbzone_pages--;
for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
diff -burpN -X ../KDIFX linux-2.6.2/mm/memory.c linux-2.6.2-numa/mm/memory.c
--- linux-2.6.2/mm/memory.c 2004-02-20 18:31:32.000000000 +0100
+++ linux-2.6.2-numa/mm/memory.c 2004-02-18 20:08:40.000000000 +0100
@@ -1576,6 +1593,15 @@ static inline int handle_pte_fault(struc
return VM_FAULT_MINOR;
}

+
+/* Can be overwritten by the architecture */
+int __attribute__((weak)) arch_hugetlb_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ return VM_FAULT_SIGBUS;
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*/
@@ -1591,7 +1617,7 @@ int handle_mm_fault(struct mm_struct *mm
inc_page_state(pgfault);

if (is_vm_hugetlb_page(vma))
- return VM_FAULT_SIGBUS; /* mapping truncation does this. */
+ return arch_hugetlb_fault(mm, vma, address, write_access);

/*
* We need the page table lock to synchronize with kswapd

2004-03-13 03:55:34

by William Lee Irwin III

Subject: Re: Hugetlbpages in very large memory machines.......

On Fri, Mar 12, 2004 at 09:44:03PM -0600, Ray Bryant wrote:
> We've run into a scaling problem using hugetlbpages in very large memory
> machines, e.g. machines with 1TB or more of main memory. The problem is
> that hugetlbpage pages are not faulted in; rather, they are zeroed and
> mapped in by hugetlb_prefault() (at least on ia64), which is called in
> response to the user's mmap() request. The net effect is that all of the hugetlb
> pages end up being allocated and zeroed by a single thread, and if most of
> the machine's memory is allocated to hugetlb pages, and there is 1 TB or
> more of main memory, zeroing and allocating all of those pages can take a
> long time (500 s or more).
> We've looked at allocating and zeroing hugetlbpages at fault time, which
> would at least allow multiple processors to be thrown at the problem.
> Question is, has anyone else been working on
> this problem and might they have prototype code they could share with us?

This actually is largely a question of architecture-dependent code, so
the answer will depend on whether your architecture matches those of the
others who have had a need to arrange this.

Basically, all you really need to do is to check the vma and call either
a hugetlb-specific fault handler or handle_mm_fault() depending on whether
hugetlb is configured. Once you've gotten that far, it's only a question
of implementing the methods to work together properly when driven by
upper layers.
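
(In outline, the dispatch described above is only a few lines in
handle_mm_fault(); a sketch, with hugetlb_fault() standing in for the
not-yet-written hugetlb-specific handler, which is essentially what the
patches in this thread end up doing:)

int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                    unsigned long address, int write_access)
{
        /* Route hugetlb VMAs to their own handler instead of the
         * current "return VM_FAULT_SIGBUS". */
        if (is_vm_hugetlb_page(vma))
                return hugetlb_fault(mm, vma, write_access, address);

        /* ... existing small-page fault handling ... */
}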

The reason why this wasn't done up-front was that there wasn't a
demonstrable need to do so. The issue you're citing is exactly the kind
of demonstration needed to motivate its inclusion.


-- wli

2004-03-13 04:53:43

by Hirokazu Takahashi

Subject: Re: Hugetlbpages in very large memory machines.......

Hello,

My following patch might help you. It includes a page-fault routine
for hugetlbpages. If you want to use it for your purpose, you need to
remove some code from hugetlb_prefault() so that hugetlb_fault() gets called instead.
http://people.valinux.co.jp/~taka/patches/va01-hugepagefault.patch

But it's just for IA32.

I heard that [email protected] was porting this patch
to IA64.

> We've run into a scaling problem using hugetlbpages in very large memory machines, e.g. machines
> with 1TB or more of main memory. The problem is that hugetlbpage pages are not faulted in; rather,
> they are zeroed and mapped in by hugetlb_prefault() (at least on ia64), which is called in
> response to the user's mmap() request. The net effect is that all of the hugetlb pages end up being
> allocated and zeroed by a single thread, and if most of the machine's memory is allocated to hugetlb
> pages, and there is 1 TB or more of main memory, zeroing and allocating all of those pages can take
> a long time (500 s or more).
>
> We've looked at allocating and zeroing hugetlbpages at fault time, which would at least allow
> multiple processors to be thrown at the problem. Question is, has anyone else been working on
> this problem and might they have prototype code they could share with us?
>
> Thanks,
> --
> Best Regards,
> Ray


Thank you,
Hirokazu Takahashi.

2004-03-13 05:49:29

by William Lee Irwin III

Subject: Re: Hugetlbpages in very large memory machines.......

On Sat, Mar 13, 2004 at 04:48:40AM +0100, Andi Kleen wrote:
> One drawback is that the out-of-memory handling is a lot less nice
> than it was before: when you run out of hugepages you now get SIGBUS
> instead of an ENOMEM from mmap(). Maybe some prereservation would
> make sense, but that would be somewhat harder. Alternatively,
> fall back to smaller pages if possible (I was told that isn't easily
> possible on IA64).

That's not entirely true. Whether it's feasible depends on how the
MMU is used. The HPW (Hardware Pagetable Walker) and short mode of the
VHPT insist upon pagesize being a per-region attribute, where regions
are something like 60-bit areas of virtualspace, which is likely what
they're referring to. The VHPT in long mode should be capable of
arbitrary virtual placement (modulo alignment of course).


-- wli

2004-03-13 16:10:17

by Andi Kleen

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

> > fall back to smaller pages if possible (I was told it isn't easily
> > possible on IA64)
>
> That's not entirely true. Whether it's feasible depends on how the
> MMU is used. The HPW (Hardware Pagetable Walker) and short mode of the
> VHPT insist upon pagesize being a per-region attribute, where regions
> are something like 60-bit areas of virtualspace, which is likely what
> they're referring to. The VHPT in long mode should be capable of
> arbitrary virtual placement (modulo alignment of course).

Redesigning the low level TLB fault handling for this would not count as
"easily" in my book.

-Andi

2004-03-14 00:05:36

by William Lee Irwin III

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

At some point in the past, I wrote:
>> That's not entirely true. Whether it's feasible depends on how the
>> MMU is used. The HPW (Hardware Pagetable Walker) and short mode of the
>> VHPT insist upon pagesize being a per-region attribute, where regions
>> are something like 60-bit areas of virtualspace, which is likely what
>> they're referring to. The VHPT in long mode should be capable of
>> arbitrary virtual placement (modulo alignment of course).

On Sat, Mar 13, 2004 at 05:10:10PM +0100, Andi Kleen wrote:
> Redesigning the low level TLB fault handling for this would not count as
> "easily" in my book.

I make no estimate of ease of implementation of long mode VHPT support.
The point of the above is that the virtual placement constraint is an
artifact of the implementation and not inherent in hardware.


-- wli

2004-03-14 02:46:08

by Andrew Morton

Subject: Re: Hugetlbpages in very large memory machines.......

Andi Kleen <[email protected]> wrote:
>
> > We've looked at allocating and zeroing hugetlbpages at fault time, which
> > would at least allow multiple processors to be thrown at the problem.
> > Question is, has anyone else been working on
> > this problem and might they have prototype code they could share with us?
>
> Yes. I ran into exactly this problem with NUMA API too.
> mbind() runs after mmap, but it cannot work anymore when
> the pages are already allocated.
>
> I fixed it on x86-64/i386 by allocating the pages lazily.
> Doing it for IA64 has been on the todo list too.
>
> i386/x86-64 code attached as an example.
>
> One drawback is that the out-of-memory handling is a lot less nice
> than it was before: when you run out of hugepages you now get SIGBUS
> instead of an ENOMEM from mmap(). Maybe some prereservation would
> make sense, but that would be somewhat harder. Alternatively,
> fall back to smaller pages if possible (I was told that isn't easily
> possible on IA64).

Demand-paging the hugepages is a decent feature to have, and ISTR resisting
it before for this reason.

Even though it's early in the 2.6 series I'd be a bit worried about
breaking existing hugetlb users in this way. Yes, the pages are
preallocated so it is unlikely that a working setup is suddenly going to
break. Unless someone is using the return value from mmap to find out how
many pages they can get.

So ho-hum. I think it needs to be back-compatible. Could we add
MAP_NO_PREFAULT?
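
(A sketch of how such a flag might be honored; MAP_NO_PREFAULT and the
VM_NO_PREFAULT vma flag it would translate to are both hypothetical values
invented here, and the plumbing from mmap() flags to vm_flags is glossed
over:)

#define MAP_NO_PREFAULT 0x8000          /* hypothetical mmap() flag */
#define VM_NO_PREFAULT  0x01000000      /* hypothetical vma flag */

static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* ... existing alignment and size checks ... */
        if (vma->vm_flags & VM_NO_PREFAULT)
                return 0;               /* fault the pages in lazily */
        return hugetlb_prefault(file->f_mapping, vma);
}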

2004-03-14 04:07:37

by Anton Blanchard

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......


> Demand-paging the hugepages is a decent feature to have, and ISTR resisting
> it before for this reason.
>
> Even though it's early in the 2.6 series I'd be a bit worried about
> breaking existing hugetlb users in this way. Yes, the pages are
> preallocated so it is unlikely that a working setup is suddenly going to
> break. Unless someone is using the return value from mmap to find out how
> many pages they can get.

Hmm, what a coincidence: I was chasing a problem where large page
allocations would fail even though I clearly had enough large page memory
free.

It turns out we were tripping the overcommit logic in do_mmap. I had
30GB of large pages and 2GB of small pages, and of course cap_vm_enough_memory
was looking at the small page pool. Setting overcommit to 1 fixed it.

It seems we can solve both problems by having a separate hugetlb overcommit
policy. Make it strict and you won't have OOM problems on large pages
and I won't hit my 30GB / 2GB problem.

Anton

2004-03-14 05:23:26

by Peter Chubb

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

>>>>> "William" == William Lee Irwin, <William> writes:

William> At some point in the past, I wrote:

William> On Sat, Mar 13, 2004 at 05:10:10PM +0100, Andi Kleen wrote:
>> Redesigning the low level TLB fault handling for this would not
>> count as "easily" in my book.

William> I make no estimate of ease of implementation of long mode
William> VHPT support. The point of the above is that the virtual
William> placement constraint is an artifact of the implementation and
William> not inherent in hardware.

There's a patch available to enable long-format VHPT at
http://www.gelato.unsw.edu.au

We're waiting for 2.7 to open before pushing it in. The long-format
VHPT is a prerequisite for other work we're doing on superpages and
TLB sharing.

--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
The technical we do immediately, the political takes *forever*

2004-03-14 08:34:08

by Ray Bryant

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......



Andrew Morton wrote:

>>
>> One drawback is that the out-of-memory handling is a lot less nice
>> than it was before: when you run out of hugepages you now get SIGBUS
>> instead of an ENOMEM from mmap(). Maybe some prereservation would
>> make sense, but that would be somewhat harder. Alternatively,
>> fall back to smaller pages if possible (I was told that isn't easily
>> possible on IA64).
>
>
> Demand-paging the hugepages is a decent feature to have, and ISTR resisting
> it before for this reason.
>
> Even though it's early in the 2.6 series I'd be a bit worried about
> breaking existing hugetlb users in this way. Yes, the pages are
> preallocated so it is unlikely that a working setup is suddenly going to
> break. Unless someone is using the return value from mmap to find out how
> many pages they can get.
>
> So ho-hum. I think it needs to be back-compatible. Could we add
> MAP_NO_PREFAULT?
>
>
>

I agree with the compatibility concern, but the other part of the problem
is that while hugetlb_prefault() is running, it holds both the mm->mmap_sem in
write mode and the mm->page_table_lock. So not only does it take 500 s for
the mmap() to return on our test system, but ps, top, etc all freeze for the
duration. Very irritating, especially on a 64 or 128 P system.

My preference would be to do away with hugetlb_prefault() altogether.
(If there was a MAP_NO_PREFAULT, we would have to make this the default on
Altix to avoid the freeze problem mentioned above. Can't have an arbitrary
user locking up the system.) As Andi pointed out, perhaps we can do some
prereservation of huge pages so that we can return an ENOMEM to the mmap()
if there are not enough huge pages to (lazily) be allocated to satisfy the
request, but then still allocate the pages at fault time. A simple count
would suffice.
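
(A sketch of such a count, all names hypothetical: charge a reservation
against the preallocated pool at mmap() time, so mmap() can still fail with
ENOMEM up front even though the pages themselves are only allocated lazily
at fault time. htlbzone_pages is the existing pool size variable:)

static spinlock_t htlb_resv_lock = SPIN_LOCK_UNLOCKED;
static unsigned long htlb_reserved;     /* pages promised to mappings */

int hugetlb_reserve_pages(unsigned long npages)
{
        int ret = -ENOMEM;

        spin_lock(&htlb_resv_lock);
        if (htlb_reserved + npages <= htlbzone_pages) {
                htlb_reserved += npages;        /* promise, don't allocate */
                ret = 0;
        }
        spin_unlock(&htlb_resv_lock);
        return ret;
}

void hugetlb_unreserve_pages(unsigned long npages)      /* on munmap/exit */
{
        spin_lock(&htlb_resv_lock);
        htlb_reserved -= npages;
        spin_unlock(&htlb_resv_lock);
}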

--
Best Regards,
Ray
-----------------------------------------------
Ray Bryant
512-453-9679 (work) 512-507-7807 (cell)
[email protected] [email protected]
The box said: "Requires Windows 98 or better",
so I installed Linux.
-----------------------------------------------

2004-03-14 08:48:49

by William Lee Irwin III

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

On Sun, Mar 14, 2004 at 02:38:33AM -0600, Ray Bryant wrote:
> write mode and the mm->page_table_lock. So not only does it take 500 s for
> the mmap() to return on our test system, but ps, top, etc all freeze for the
> duration. Very irritating, especially on a 64 or 128 P system.
> My preference would be to do away with hugetlb_prefault() altogether.
> (If there was a MAP_NO_PREFAULT, we would have to make this the default on
> Altix to avoid the freeze problem mentioned above. Can't have an arbitrary
> user locking up the system.) As Andi pointed out, perhaps we can do some
> prereservation of huge pages so that we can return an ENOMEM to the mmap()
> if there are not enough huge pages to (lazily) be allocated to satisfy the
> request, but then still allocate the pages at fault time. A simple count
> would suffice.

There is a patch, originally by Ben LaHaise, that I forward-ported to
2.6.0-test*. It keeps statistics ready in the mm so that the mmap_sem need
not be taken for /proc/, and it reduces proc_pid_statm() to nothing more
than copying integers out of the mm. It may be of interest to those
concerned about tripping over other processes' mmap_sems in /proc/.


-- wli

2004-03-14 08:57:53

by Andrew Morton

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

Ray Bryant <[email protected]> wrote:
>
>
> I agree with the compatibility concern, but the other part of the problem
> is that while hugetlb_prefault() is running, it holds both the mm->mmap_sem in
> write mode and the mm->page_table_lock. So not only does it take 500 s for
> the mmap() to return on our test system, but ps, top, etc all freeze for the
> duration. Very irritating, especially on a 64 or 128 P system.

Well that's just a dumb implementation. hugetlb_prefault() doesn't need
page_table_lock while it is zeroing the page: just drop it, test for
-EEXIST returned from add_to_page_cache().

In fact we need to do that anyway: the current code is buggy if some other
process with a different mm gets in there and instantiates the page in the
pagecache before this process does: hugetlb_prefault() will return -EEXIST
instead of simply accepting the race and using the page which someone else
put there.

After we have the page in pagecache we need to retake page_table_lock and
check that the target pte is still pte_none(). If it is not, you know that
some other thread has already instantiated a pte there so the new ref to
the pagecache page can simply be dropped. See how do_no_page() handles it.
Of course, this only applies if mmap_sem is no longer held in there.

As for holding mmap_sem for too long, well, that can presumably be worked
around by not mmapping the whole lot in one hit?
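
(Roughly, the pattern described above, as a sketch; reference-count details
are glossed over:)

/* Allocate and zero outside page_table_lock, treat -EEXIST from
 * add_to_page_cache() as "somebody else won the race", then retake
 * the lock and re-check the pte before installing it. */
spin_unlock(&mm->page_table_lock);
again:
page = find_get_page(mapping, idx);
if (!page) {
        page = alloc_hugetlb_page();    /* the slow zeroing happens here */
        if (!page)
                return -ENOMEM;
        ret = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
        if (ret == -EEXIST) {           /* lost the race: use theirs */
                free_huge_page(page);
                goto again;
        }
}
spin_lock(&mm->page_table_lock);
if (pte_none(*pte))                     /* nobody instantiated it yet */
        set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
/* else: drop the new reference; the other thread's pte stands */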

2004-03-14 09:02:55

by Andrew Morton

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

Andrew Morton <[email protected]> wrote:
>
> Well that's just a dumb implementation. hugetlb_prefault() doesn't need
> page_table_lock while it is zeroing the page: just drop it, test for
> -EEXIST returned from add_to_page_cache().
>
> In fact we need to do that anyway: the current code is buggy if some other
> process with a different mm gets in there and instantiates the page in the
> pagecache before this process does: hugetlb_prefault() will return -EEXIST
> instead of simply accepting the race and using the page which someone else
> put there.
>
> After we have the page in pagecache we need to retake page_table_lock and
> check that the target pte is still pte_none(). If it is not, you know that
> some other thread has already instantiated a pte there so the new ref to
> the pagecache page can simply be dropped. See how do_no_page() handles it.
> Of course, this only applies if mmap_sem is no longer held in there.

But before implementing any of this we should move hugetlb_prefault() and
any other generic-looking functions into mm/hugetlbpage.c. We're getting
too much duplication in there.

2004-03-14 09:07:49

by William Lee Irwin III

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

On Sun, Mar 14, 2004 at 12:57:37AM -0800, Andrew Morton wrote:
> Well that's just a dumb implementation. hugetlb_prefault() doesn't need
> page_table_lock while it is zeroing the page: just drop it, test for
> -EEXIST returned from add_to_page_cache().
> In fact we need to do that anyway: the current code is buggy if some other
> process with a different mm gets in there and instantiates the page in the
> pagecache before this process does: hugetlb_prefault() will return -EEXIST
> instead of simply accepting the race and using the page which someone else
> put there.

Don't blame me. I didn't write the expand-on-mmap() code.


-- wli

2004-03-15 06:41:18

by Ray Bryant

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......



Andrew Morton wrote:
<unrelated text snipped>
>
> As for holding mmap_sem for too long, well, that can presumably be worked
> around by not mmapping the whole lot in one hit?
>

There are a number of places where one could do this (explicitly in user code,
hidden at library level, or in do_mmap2() where the mm->mmap_sem is taken).
I'm not happy with requiring the user to make a modification to solve this
kernel problem. Hiding the split has the problem of making sure that if any
of the sub mmap() operations fail then the rest of the mmap() operations have
to be undone, and this all has to happen in a way that makes the mmap() look
like a single system call.

An alternative would be put some info in the mm_struct indicating that a
hugetlb_prefault() is in progress, then drop the mm->mmap_sem while
hugetlb_prefault() is running. Once it is done, regrab the mm->mmap_sem,
clear the "in progress flag" and finish up processing. Any other mmap()
that got the mmap_sem and found the "in progress flag" set would have to
fail, perhaps with -EAGAIN (again, an mmap() extension). One can also
implement more elaborate schemes where there is a list of pending hugetlb
mmaps() with the associated address space ranges being listed; one could
check this list in get_unmapped_area() and return -EAGAIN if there is
a conflict.
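
(A rough sketch of that scheme; the mm_struct field and the error handling
are entirely hypothetical:)

down_write(&mm->mmap_sem);
if (mm->hugetlb_prefault_busy) {        /* hypothetical field */
        up_write(&mm->mmap_sem);
        return -EAGAIN;
}
mm->hugetlb_prefault_busy = 1;
up_write(&mm->mmap_sem);                /* ps, top, etc. can run again */

error = hugetlb_prefault(mapping, vma); /* the 500 s part, unlocked */

down_write(&mm->mmap_sem);
mm->hugetlb_prefault_busy = 0;
/* ... finish the mmap() bookkeeping, or undo it all if error ... */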

I'd still rather see us do the "allocate on fault" approach with prereservation
to maintain the current ENOMEM return code from mmap() for hugepages. Let me
work on that and get back to y'all with a patch and see where we can go from
there. I'll start by taking a look at all of the arch dependent hugetlbpage.c's
and see how common they all are and move the common code up to mm/hugetlbpage.c.
(or did WLI's note imply that this is impossible?)

However, is this set of changes something that would still be accepted in 2.6,
or is this now a 2.7 discussion?

--
Best Regards,
Ray
-----------------------------------------------
Ray Bryant
512-453-9679 (work) 512-507-7807 (cell)
[email protected] [email protected]
The box said: "Requires Windows 98 or better",
so I installed Linux.
-----------------------------------------------

2004-03-15 15:28:03

by jlnance

Subject: Re: Hugetlbpages in very large memory machines.......

On Fri, Mar 12, 2004 at 09:44:03PM -0600, Ray Bryant wrote:
> We've run into a scaling problem using hugetlbpages in very large memory
> machines, e.g. machines with 1TB or more of main memory.

You know, when I started using Linux it wouldn't support more than 16M
of ram. No one complained because no one using Linux had a machine with
more than 16M of ram. It looks like things have progressed a bit since
then :-)

Jim

2004-03-15 23:54:59

by William Lee Irwin III

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

On Mon, Mar 15, 2004 at 12:45:10AM -0600, Ray Bryant wrote:
> I'd still rather see us do the "allocate on fault" approach with
> prereservation to maintain the current ENOMEM return code from mmap()
> for hugepages. Let me work on that and get back to y'all with a patch
> and see where we can go from there. I'll start by taking a look at
> all of the arch dependent hugetlbpage.c's and see how common they all
> are and move the common code up to mm/hugetlbpage.c.
> (or did WLI's note imply that this is impossible?)

It would be a mistake to put any pagetable handling functions in the
core. Things above that level, e.g. callers that don't examine the
pagetables directly in favor of calling lower-level APIs, are fine.


-- wli

2004-03-16 00:38:17

by Nobuhiko Yoshida

Subject: Re: Hugetlbpages in very large memory machines.......

Hello,

Hirokazu Takahashi <[email protected]> :
> Hello,
>
> My following patch might help you. It includes a page-fault routine
> for hugetlbpages. If you want to use it for your purpose, you need to
> remove some code from hugetlb_prefault() so that hugetlb_fault() gets called instead.
> http://people.valinux.co.jp/~taka/patches/va01-hugepagefault.patch
>
> But it's just for IA32.
>
> I heard that [email protected] was porting this patch
> to IA64.

Below is my port of Takahashi-san's patch to IA64.
However, my patch is against kernel 2.6.0 and cannot be
applied to 2.6.1 or later.

Thank you,
Nobuhiko Yoshida

diff -dupr linux-2.6.0.org/arch/ia64/mm/hugetlbpage.c linux-2.6.0.HugeTLB/arch/ia64/mm/hugetlbpage.c
--- linux-2.6.0.org/arch/ia64/mm/hugetlbpage.c 2003-12-18 11:58:56.000000000 +0900
+++ linux-2.6.0.HugeTLB/arch/ia64/mm/hugetlbpage.c 2004-01-06 14:26:53.000000000 +0900
@@ -170,8 +170,10 @@ int copy_hugetlb_page_range(struct mm_st
goto nomem;
src_pte = huge_pte_offset(src, addr);
entry = *src_pte;
- ptepage = pte_page(entry);
- get_page(ptepage);
+ if (!pte_none(entry)) {
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ }
set_pte(dst_pte, entry);
dst->rss += (HPAGE_SIZE / PAGE_SIZE);
addr += HPAGE_SIZE;
@@ -195,6 +197,12 @@ follow_hugetlb_page(struct mm_struct *mm
do {
pstart = start & HPAGE_MASK;
ptep = huge_pte_offset(mm, start);
+
+ if (!ptep || pte_none(*ptep)) {
+ hugetlb_fault(mm, vma, 0, start);
+ ptep = huge_pte_offset(mm, start);
+ }
+
pte = *ptep;

back1:
@@ -236,6 +244,12 @@ struct page *follow_huge_addr(struct mm_
pte_t *ptep;

ptep = huge_pte_offset(mm, addr);
+
+ if (!ptep || pte_none(*ptep)) {
+ hugetlb_fault(mm, vma, 0, addr);
+ ptep = huge_pte_offset(mm, addr);
+ }
+
page = pte_page(*ptep);
page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
get_page(page);
@@ -246,7 +260,8 @@ int pmd_huge(pmd_t pmd)
return 0;
}
struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
+follow_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, int write)
{
return NULL;
}
@@ -518,6 +533,48 @@ int is_hugepage_mem_enough(size_t size)
return 1;
}

+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int write_access, unsigned long address)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ struct page *page;
+ unsigned long idx;
+ pte_t *pte;
+ int ret = VM_FAULT_MINOR;
+
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+ spin_lock(&mm->page_table_lock);
+
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+
+ if (!page) {
+ page = alloc_hugetlb_page();
+ if (!page) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ unlock_page(page);
+ if (ret) {
+ free_huge_page(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+ pte = huge_pte_alloc(mm, address);
+ set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+/* update_mmu_cache(vma, address, *pte); */
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+
static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int unused)
{
BUG();

2004-03-16 01:58:08

by Andi Kleen

Subject: Re: Hugetlbpages in very large memory machines.......

> + pte = huge_pte_alloc(mm, address);
> + set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);

This looks broken. Another CPU could have raced to the same fault
and already added a PTE here. You have to handle that.

(my i386 version originally had the same problem)


> +/* update_mmu_cache(vma, address, *pte); */

I have not studied low level IA64 VM in detail, but don't you need
some kind of TLB flush here?

-Andi

2004-03-16 02:33:05

by Hirokazu Takahashi

Subject: Re: Hugetlbpages in very large memory machines.......

Hello,

> > + pte = huge_pte_alloc(mm, address);
> > + set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
>
> This looks broken. Another CPU could have raced to the same fault
> and already added a PTE here. You have to handle that.
>
> (my i386 version originally had the same problem)

Yes, you are right.
In the fault handler, we should use find_lock_page() instead of
find_get_page() to find the hugepage associated with the fault address.
After that, pte_none(*pte) should be checked again to see whether
a race has happened.

> > +/* update_mmu_cache(vma, address, *pte); */
>
> I have not studied low level IA64 VM in detail, but don't you need
> some kind of TLB flush here?
>
> -Andi


Thank you,
Hirokazu Takahashi.

2004-03-16 03:15:55

by Nobuhiko Yoshida

Subject: Re: Hugetlbpages in very large memory machines.......

Hello,

> > +/* update_mmu_cache(vma, address, *pte); */
>
> I have not studied low level IA64 VM in detail, but don't you need
> some kind of TLB flush here?

Oh! Yes.
Perhaps a TLB flush is needed here.

Thank you,
Nobuhiko Yoshida

2004-03-16 03:18:25

by Hirokazu Takahashi

Subject: Re: Hugetlbpages in very large memory machines.......

Hello,

> Yes, you are right.
> In the fault handler, we should use find_lock_page() instead of
> find_get_page() to find the hugepage associated with the fault address.

Sorry, locking the page is not needed.

> After that, pte_none(*pte) should be checked again to see whether
> a race has happened.

While checking, mm->page_table_lock has to be held.
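
(Folding both corrections into the hugetlb_fault() from the patch, the top of
the function would look something like this sketch:)

spin_lock(&mm->page_table_lock);

pte = huge_pte_alloc(mm, address);
if (!pte) {
        ret = VM_FAULT_OOM;
        goto out;
}
if (!pte_none(*pte)) {
        /* Another CPU raced us and already handled this fault. */
        ret = VM_FAULT_MINOR;
        goto out;
}

/* ... find_get_page()/alloc_hugetlb_page() as in the patch ... */

set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
flush_tlb_range(vma, address, address + HPAGE_SIZE);
out:
spin_unlock(&mm->page_table_lock);
return ret;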

2004-03-17 19:06:39

by Andy Whitcroft

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

--On 14 March 2004 15:06 +1100 Anton Blanchard <[email protected]> wrote:

> Hmm, what a coincidence: I was chasing a problem where large page
> allocations would fail even though I clearly had enough large page memory
> free.
>
> It turns out we were tripping the overcommit logic in do_mmap. I had
> 30GB of large pages and 2GB of small pages, and of course
> cap_vm_enough_memory was looking at the small page pool. Setting
> overcommit to 1 fixed it.
>
> It seems we can solve both problems by having a separate hugetlb
> overcommit policy. Make it strict and you won't have OOM problems on large
> pages and I won't hit my 30GB / 2GB problem.

Been following this thread and it seems that fixing this overcommit
mishandling problem would logically be the first step. From my reading
it seems that once we have initialised hugetlb we have two independent and
non-overlapping 'page' pools from which we can allocate pages and against
which we wish to handle commitments. Looking at the current code base we
effectively have only a single 'accounting domain', and so when we attempt to
allocate hugetlb pages we incorrectly account them against the small page
pool.

I believe we need to add support for more than one page 'accounting domain',
each with its own policy and its own commitments. The attached patch
is my attempt at this first step. I have created the concept of an
accounting domain, against which pages are to be accounted. In this
implementation there are two domains: VM_AD_DEFAULT, which is used to account
normal small pages in the normal way, and VM_AD_HUGETLB, which is used to
select and identify VM_HUGETLB pages. I have not attempted to add any
actual accounting for VM_HUGETLB pages, as currently they are prefaulted
and thus there is always 0 outstanding commitment to track. Obviously, if
hugetlb were also changed to support demand paging, that would need to be
implemented.

The patch below implements the basic domain split and provides a default
overcommit policy only for VM_AD_HUGETLB. Anton, with it installed I
believe that you should not need to change the global overcommit policy to
1 to allow 30GB of hugetlb pages to work. It was made against 2.6.4. It
contains a couple of comment changes which I intend to split off and submit
separately (so ignore them).

I have compiled and booted with security on and off, but have not had a
chance to test the hugetlb side as yet. What do people think? The right
direction?

Cheers.

-apw

diff -X /home/apw/lib/vdiff.excl -rupN reference/include/linux/mm.h current/include/linux/mm.h
--- reference/include/linux/mm.h 2004-03-11 20:47:28.000000000 +0000
+++ current/include/linux/mm.h 2004-03-17 19:10:23.000000000 +0000
@@ -112,6 +112,11 @@ struct vm_area_struct {
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */

+/* Memory accounting domains. These may not be consecutive bits. */
+#define VM_ACCTDOM(vma) ((vma)->vm_flags & VM_HUGETLB)
+#define VM_AD_DEFAULT 0x00000000
+#define VM_AD_HUGETLB VM_HUGETLB
+
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif
diff -X /home/apw/lib/vdiff.excl -rupN reference/include/linux/security.h current/include/linux/security.h
--- reference/include/linux/security.h 2004-03-11 20:47:28.000000000 +0000
+++ current/include/linux/security.h 2004-03-17 19:10:23.000000000 +0000
@@ -51,7 +51,7 @@ extern int cap_inode_removexattr(struct
extern int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t
old_suid, int flags);
extern void cap_task_reparent_to_init (struct task_struct *p);
extern int cap_syslog (int type);
-extern int cap_vm_enough_memory (long pages);
+extern int cap_vm_enough_acctdom (int domain, long pages);

static inline int cap_netlink_send (struct sk_buff *skb)
{
@@ -987,8 +987,9 @@ struct swap_info_struct;
* See the syslog(2) manual page for an explanation of the @type values.
* @type contains the type of action.
* Return 0 if permission is granted.
- * @vm_enough_memory:
- * Check permissions for allocating a new virtual mapping.
+ * @vm_enough_acctdom:
+ * Check permissions for allocating a new virtual mapping.
+ * @domain contains the accounting domain.
* @pages contains the number of pages.
* Return 0 if permission is granted.
*
@@ -1022,7 +1023,7 @@ struct security_operations {
int (*quotactl) (int cmds, int type, int id, struct super_block * sb);
int (*quota_on) (struct file * f);
int (*syslog) (int type);
- int (*vm_enough_memory) (long pages);
+ int (*vm_enough_acctdom) (int domain, long pages);

int (*bprm_alloc_security) (struct linux_binprm * bprm);
void (*bprm_free_security) (struct linux_binprm * bprm);
@@ -1276,9 +1277,9 @@ static inline int security_syslog(int ty
return security_ops->syslog(type);
}

-static inline int security_vm_enough_memory(long pages)
+static inline int security_vm_enough_acctdom(int domain, long pages)
{
- return security_ops->vm_enough_memory(pages);
+ return security_ops->vm_enough_acctdom(domain, pages);
}

static inline int security_bprm_alloc (struct linux_binprm *bprm)
@@ -1947,9 +1948,9 @@ static inline int security_syslog(int ty
return cap_syslog(type);
}

-static inline int security_vm_enough_memory(long pages)
+static inline int security_vm_enough_acctdom(int domain, long pages)
{
- return cap_vm_enough_memory(pages);
+ return cap_vm_enough_acctdom(domain, pages);
}

static inline int security_bprm_alloc (struct linux_binprm *bprm)
@@ -2738,5 +2739,10 @@ static inline void security_sk_free(stru
}
#endif /* CONFIG_SECURITY_NETWORK */

+static inline int security_vm_enough_memory(long pages)
+{
+ return security_vm_enough_acctdom(VM_AD_DEFAULT, pages);
+}
+
#endif /* ! __LINUX_SECURITY_H */

diff -X /home/apw/lib/vdiff.excl -rupN reference/mm/mmap.c current/mm/mmap.c
--- reference/mm/mmap.c 2004-03-11 20:47:29.000000000 +0000
+++ current/mm/mmap.c 2004-03-17 19:10:23.000000000 +0000
@@ -473,6 +473,7 @@ unsigned long do_mmap_pgoff(struct file
int error;
struct rb_node ** rb_link, * rb_parent;
unsigned long charged = 0;
+ int acctdom = VM_AD_DEFAULT;

if (file) {
if (!file->f_op || !file->f_op->mmap)
@@ -591,7 +592,10 @@ munmap_back:
> current->rlim[RLIMIT_AS].rlim_cur)
return -ENOMEM;

- if (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory > 1) {
+ if (is_file_hugepages(file))
+ acctdom = VM_AD_HUGETLB;
+ if (!(flags & MAP_NORESERVE) ||
+ (acctdom == VM_AD_DEFAULT && sysctl_overcommit_memory > 1)) {
if (vm_flags & VM_SHARED) {
/* Check memory availability in shmem_file_setup? */
vm_flags |= VM_ACCOUNT;
@@ -600,7 +604,7 @@ munmap_back:
* Private writable mapping: check memory availability
*/
charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
+ if (security_vm_enough_acctdom(acctdom, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
diff -X /home/apw/lib/vdiff.excl -rupN reference/security/capability.c current/security/capability.c
--- reference/security/capability.c 2004-02-04 15:09:21.000000000 +0000
+++ current/security/capability.c 2004-03-17 19:10:23.000000000 +0000
@@ -47,7 +47,7 @@ static struct security_operations capabi

.syslog = cap_syslog,

- .vm_enough_memory = cap_vm_enough_memory,
+ .vm_enough_acctdom = cap_vm_enough_acctdom,
};

#if defined(CONFIG_SECURITY_CAPABILITIES_MODULE)
diff -X /home/apw/lib/vdiff.excl -rupN reference/security/commoncap.c current/security/commoncap.c
--- reference/security/commoncap.c 2004-02-23 18:15:19.000000000 +0000
+++ current/security/commoncap.c 2004-03-17 19:10:23.000000000 +0000
@@ -303,15 +303,21 @@ int cap_syslog (int type)
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-acounting
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
*/
-int cap_vm_enough_memory(long pages)
+int cap_vm_enough_acctdom(int domain, long pages)
{
unsigned long free, allowed;

+ /* We only account for the default memory domain, assume overcommit
+ * for all others.
+ */
+ if (domain != VM_AD_DEFAULT)
+ return 0;
+
vm_acct_memory(pages);

/*
@@ -382,7 +388,7 @@ EXPORT_SYMBOL(cap_inode_removexattr);
EXPORT_SYMBOL(cap_task_post_setuid);
EXPORT_SYMBOL(cap_task_reparent_to_init);
EXPORT_SYMBOL(cap_syslog);
-EXPORT_SYMBOL(cap_vm_enough_memory);
+EXPORT_SYMBOL(cap_vm_enough_acctdom);

MODULE_DESCRIPTION("Standard Linux Common Capabilities Security Module");
MODULE_LICENSE("GPL");
diff -X /home/apw/lib/vdiff.excl -rupN reference/security/dummy.c current/security/dummy.c
--- reference/security/dummy.c 2004-03-11 20:47:31.000000000 +0000
+++ current/security/dummy.c 2004-03-17 19:10:23.000000000 +0000
@@ -101,10 +101,24 @@ static int dummy_syslog (int type)
return 0;
}

-static int dummy_vm_enough_memory(long pages)
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
+ */
+static int dummy_vm_enough_acctdom(int domain, long pages)
{
unsigned long free, allowed;

+ /* We only account for the default memory domain, assume overcommit
+ * for all others.
+ */
+ if (domain != VM_AD_DEFAULT)
+ return 0;
+
vm_acct_memory(pages);

/*
@@ -873,7 +887,7 @@ void security_fixup_ops (struct security
set_to_dummy_if_null(ops, quota_on);
set_to_dummy_if_null(ops, sysctl);
set_to_dummy_if_null(ops, syslog);
- set_to_dummy_if_null(ops, vm_enough_memory);
+ set_to_dummy_if_null(ops, vm_enough_acctdom);
set_to_dummy_if_null(ops, bprm_alloc_security);
set_to_dummy_if_null(ops, bprm_free_security);
set_to_dummy_if_null(ops, bprm_compute_creds);
diff -X /home/apw/lib/vdiff.excl -rupN reference/security/selinux/hooks.c current/security/selinux/hooks.c
--- reference/security/selinux/hooks.c 2004-03-11 20:47:31.000000000 +0000
+++ current/security/selinux/hooks.c 2004-03-17 19:10:23.000000000 +0000
@@ -1492,17 +1492,23 @@ static int selinux_syslog(int type)
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-acounting
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
*/
-static int selinux_vm_enough_memory(long pages)
+static int selinux_vm_enough_acctdom(int domain, long pages)
{
unsigned long free, allowed;
int rc;
struct task_security_struct *tsec = current->security;

+ /* We only account for the default memory domain, assume overcommit
+ * for all others.
+ */
+ if (domain != VM_AD_DEFAULT)
+ return 0;
+
vm_acct_memory(pages);

/*
@@ -3817,7 +3823,7 @@ struct security_operations selinux_ops =
.quotactl = selinux_quotactl,
.quota_on = selinux_quota_on,
.syslog = selinux_syslog,
- .vm_enough_memory = selinux_vm_enough_memory,
+ .vm_enough_acctdom = selinux_vm_enough_acctdom,

.netlink_send = selinux_netlink_send,
.netlink_recv = selinux_netlink_recv,


2004-03-18 20:25:24

by Andrew Morton

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

Andy Whitcroft <[email protected]> wrote:
>
> --On 14 March 2004 15:06 +1100 Anton Blanchard <[email protected]> wrote:
>
> > Hmm, what a coincidence: I was chasing a problem where large page
> > allocations would fail even though I clearly had enough large page memory
> > free.
> >
> > It turns out we were tripping the overcommit logic in do_mmap. I had
> > 30GB of large pages and 2GB of small pages, and of course
> > cap_vm_enough_memory was looking at the small page pool. Setting
> > overcommit to 1 fixed it.
> >
> > It seems we can solve both problems by having a separate hugetlb
> > overcommit policy. Make it strict and you wont have OOM problems on large
> > pages and I wont hit my 30GB / 2GB problem.
>
> Been following this thread and it seems that fixing this overcommit
> mishandling problem would logically be the first step. From my reading
> it seems that once we have initialised hugetlb we have two independent and
> non-overlapping 'page' pools from which we can allocate pages and against
> which we wish to handle commitments. Looking at the current code base we
> effectively have only a single 'accounting domain', and so when we attempt to
> allocate hugetlb pages we incorrectly account them against the small page
> pool.
>
> I believe we need to add support for more than one page 'accounting domain',
> each with its own policy and its own commitments. The attached patch
> is my attempt at this first step. I have created the concept of an
> accounting domain, against which pages are to be accounted. In this
> implementation there are two domains: VM_AD_DEFAULT, which is used to account
> normal small pages in the normal way, and VM_AD_HUGETLB, which is used to
> select and identify VM_HUGETLB pages. I have not attempted to add any
> actual accounting for VM_HUGETLB pages, as currently they are prefaulted
> and thus there is always 0 outstanding commitment to track. Obviously, if
> hugetlb were also changed to support demand paging, that would need to be
> implemented.

Seems reasonable, although "vm_enough_acctdom" makes my eyes pop. Why not
keep the "vm_enough_memory" identifier?

I've asked Stephen for comment - assuming he's OK with it I'd ask you to
finish this off please.

2004-03-18 21:23:56

by Stephen Smalley

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

On Thu, 2004-03-18 at 15:25, Andrew Morton wrote:
> Seems reasonable, although "vm_enough_acctdom" makes my eyes pop. Why not
> keep the "vm_enough_memory" identifier?
>
> I've asked Stephen for comment - assuming he's OK with it I'd ask you to
> finish this off please.

To keep the name, he needs to update all callers, right? Current patch
appears to add a static inline for security_vm_enough_memory that
retains the old interface to avoid having to update most callers.

I don't have any fundamental problem with the nature of the change. As
a side note, the patch was malformed (at least as I received it); not sure
if that was just a problem on my end.

--
Stephen Smalley <[email protected]>
National Security Agency

2004-03-18 22:23:36

by Andy Whitcroft

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

--On 18 March 2004 16:22 -0500 Stephen Smalley <[email protected]> wrote:

> On Thu, 2004-03-18 at 15:25, Andrew Morton wrote:
>> Seems reasonable, although "vm_enough_acctdom" makes my eyes pop. Why
>> not keep the "vm_enough_memory" identifier?
>>
>> I've asked Stephen for comment - assuming he's OK with it I'd ask you to
>> finish this off please.

I have no emotional attachment to any of the names. If we can come up with
a more sensible name then all for the best. I was trying to find something
which implied the 'measurement' thing but didn't overlap with any of the
other memory grouping concepts, as the domains overlap nodes and zones.

> To keep the name, he needs to update all callers, right? Current patch
> appears to add a static inline for security_vm_enough_memory that
> retains the old interface to avoid having to update most callers.

Yes, this is the main reason for the name change. This is at the dirty hack
stage in that sense: minimal changes to prove the concept. I think that we
should be changing all the callers if this is going mainline in the longer
term, although they do cross 4 architectures, and with it being in the
security interface it also interfaces with SELinux as well (sigh).

I'll put together a more complete change over of the interface, keep the
name the same and see how intrusive that seems. Then we'll get some
testing on it.

> I don't have any fundamental problem with the nature of the change. As
> a side note, patch was malformed (at least as I received it), not sure
> if that was just a problem on my end.

Stephen, I'll send you a copy of the patch under separate cover.

-apw

2004-03-23 17:29:48

by Andy Whitcroft

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

Been working on the hugetlb page commitment/overcommit issues. I have
attached a bunch of patches for review purposes; there are a number of them,
so I've not inlined them, but I can send them, just ask.

The first two patches are cosmetic fixes, either in documentation or to
remove a warning later in the game.

010-overcommit_docs: documentation changes.
015-do_mremap_warning: change mremap to be more correct and prevents a
warning when later patches are applied.

The next two patches set the scene. These are the most tested and it is
these that I hope Anton can test for us with his "real world" failure mode.
These two patches introduce the concept of a split between the default and
hugetlb memory pools and stop the hugetlb pool being accounted at all. This
is not as clean as I would like, particularly the need to check against
VM_AD_DEFAULT in a few places.

050-mem_acctdom_core: core changes to create two accounting domains
055-mem_acctdom_arch: architecture specific changes for above.

The next two patches are work in progress and I present them more for
review of the direction. This was prompted by the need to check
VM_AD_DEFAULT explicitly to handle vm_committed. The first splits the
current vm_committed into a per-domain count. The final patch is the
beginnings of making hugetlbfs account for its pages correctly; currently
it actually only exposes the HUGETLB accounting domain.

060-mem_acctdom_commitments: splits vm_committed into a per domain count
070-mem_acctdom_hugetlb: starts the process of using above for hugetlb.

Testing for the first four patches and comments on the direction of the
remaining patches appreciated.

-apw



2004-03-24 17:38:50

by Andy Whitcroft

Subject: Re: [Lse-tech] Re: Hugetlbpages in very large memory machines.......

Here is the next installment of HUGETLB memory accounting. With the stack
applied (to 2.6.4), HUGETLB allocations are handled separately from those
for normal pages. The set has been tested lightly on i386. Other
architectures have not yet been compiled (testers please). Currently there
are no tunables for overcommit. Again patches are attached; ask if you need
them inline.

This patch has an interesting and, I believe, correct side effect. Memory is
now committed when a hugetlb segment is initially requested, even before it
is attached. Thus it is no longer possible to shmget many large segments
only to have them fail at attach time.
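
(For illustration, the shmget() path in question; the segment size is made
up, and SHM_HUGETLB may need defining by hand with older libc headers. With
the patches applied, the commitment check happens in this call rather than
at shmat() time:)

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000       /* from include/linux/shm.h */
#endif

int main(void)
{
        /* With strict huge page accounting this now fails with ENOMEM
         * here if the pool cannot cover 16 GB, instead of succeeding
         * and then failing at shmat() time. */
        int id = shmget(IPC_PRIVATE, 16UL << 30,
                        SHM_HUGETLB | IPC_CREAT | 0600);

        if (id < 0)
                perror("shmget");
        return 0;
}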

The patch list below ... Comments??

-apw

010-overcommit_docs: documentation changes
015-do_mremap_warning: cleanup exit handling to prevent warning
050-mem_acctdom_core: core changes to create two accounting domains
055-mem_acctdom_arch: architecture specific changes for above
060-mem_acctdom_commitments: splits vm_committed into a per domain count
070-mem_acctdom_hugetlb: use vm_committed to track HUGETLB usage
075-em_acctdom_hugetlb_arch: architecture specific changes for above

The first two patches are cosmetic fixes, either in documentation or to
remove a warning later in the game.

The third and fourth patches set the scene. These are the most
tested and it is these that I hope Anton can test for us with his "real
world" failure mode. These two patches introduce the concept of a split
between the default and hugetlb memory pools and stop the hugetlb pool being
accounted at all. This is not as clean as I would like, particularly the
need to check against VM_AD_DEFAULT in a few places.

The fifth patch splits the vm_committed count into a per domain count and
exposes the domain in the interface.

The sixth and seventh patch converts hugetlb to use the vm_commitment
interfaces exposed above.



2004-04-01 09:12:57

by Nobuhiko Yoshida

Subject: Re: Hugetlbpages in very large memory machines.......

Nobuhiko Yoshida <[email protected]> wrote:
> Hello,
>
> > > +/* update_mmu_cache(vma, address, *pte); */
> >
> > I have not studied low level IA64 VM in detail, but don't you need
> > some kind of TLB flush here?
>
> Oh! Yes.
> Perhaps a TLB flush is needed here.

- Below is a revised version of the patch I contributed before.
- I added flushing of the TLB and icache.

How To Use:
1. Download the linux-2.6.0 source tree
2. Apply the patch below to linux-2.6.0

Thank you,
Nobuhiko Yoshida

diff -dupr linux-2.6.0/arch/i386/mm/hugetlbpage.c linux-2.6.0.HugeTLB/arch/i386/mm/hugetlbpage.c
--- linux-2.6.0/arch/i386/mm/hugetlbpage.c 2003-12-18 11:59:38.000000000 +0900
+++ linux-2.6.0.HugeTLB/arch/i386/mm/hugetlbpage.c 2004-04-01 11:48:56.000000000 +0900
@@ -142,8 +142,10 @@ int copy_hugetlb_page_range(struct mm_st
goto nomem;
src_pte = huge_pte_offset(src, addr);
entry = *src_pte;
- ptepage = pte_page(entry);
- get_page(ptepage);
+ if (!pte_none(entry)) {
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ }
set_pte(dst_pte, entry);
dst->rss += (HPAGE_SIZE / PAGE_SIZE);
addr += HPAGE_SIZE;
@@ -173,6 +175,11 @@ follow_hugetlb_page(struct mm_struct *mm

pte = huge_pte_offset(mm, vaddr);

+ if (!pte || pte_none(*pte)) {
+ hugetlb_fault(mm, vma, 0, vaddr);
+ pte = huge_pte_offset(mm, vaddr);
+ }
+
/* hugetlb should be locked, and hence, prefaulted */
WARN_ON(!pte || pte_none(*pte));

@@ -261,12 +268,17 @@ int pmd_huge(pmd_t pmd)
}

struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
+follow_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, int write)
{
struct page *page;

page = pte_page(*(pte_t *)pmd);
+
+ if (!page) {
+ hugetlb_fault(mm, vma, write, address);
+ page = pte_page(*(pte_t *)pmd);
+ }
if (page) {
page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
get_page(page);
@@ -527,6 +539,48 @@ int is_hugepage_mem_enough(size_t size)
return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
}

+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int write_access, unsigned long address)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ struct page *page;
+ unsigned long idx;
+ pte_t *pte;
+ int ret = VM_FAULT_MINOR;
+
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+ spin_lock(&mm->page_table_lock);
+
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+
+ if (!page) {
+ page = alloc_hugetlb_page();
+ if (!page) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ unlock_page(page);
+ if (ret) {
+ free_huge_page(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+ pte = huge_pte_alloc(mm, address);
+ set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+/* update_mmu_cache(vma, address, *pte); */
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
diff -dupr linux-2.6.0/arch/ia64/mm/hugetlbpage.c linux-2.6.0.HugeTLB/arch/ia64/mm/hugetlbpage.c
--- linux-2.6.0/arch/ia64/mm/hugetlbpage.c 2003-12-18 11:58:56.000000000 +0900
+++ linux-2.6.0.HugeTLB/arch/ia64/mm/hugetlbpage.c 2004-03-22 11:29:01.000000000 +0900
@@ -170,8 +170,10 @@ int copy_hugetlb_page_range(struct mm_st
goto nomem;
src_pte = huge_pte_offset(src, addr);
entry = *src_pte;
- ptepage = pte_page(entry);
- get_page(ptepage);
+ if (!pte_none(entry)) {
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ }
set_pte(dst_pte, entry);
dst->rss += (HPAGE_SIZE / PAGE_SIZE);
addr += HPAGE_SIZE;
@@ -195,6 +197,12 @@ follow_hugetlb_page(struct mm_struct *mm
do {
pstart = start & HPAGE_MASK;
ptep = huge_pte_offset(mm, start);
+
+ if (!ptep || pte_none(*ptep)) {
+ hugetlb_fault(mm, vma, 0, start);
+ ptep = huge_pte_offset(mm, start);
+ }
+
pte = *ptep;

back1:
@@ -236,6 +244,12 @@ struct page *follow_huge_addr(struct mm_
pte_t *ptep;

ptep = huge_pte_offset(mm, addr);
+
+ if (!ptep || pte_none(*ptep)) {
+ hugetlb_fault(mm, vma, 0, addr);
+ ptep = huge_pte_offset(mm, addr);
+ }
+
page = pte_page(*ptep);
page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
get_page(page);
@@ -246,7 +260,8 @@ int pmd_huge(pmd_t pmd)
return 0;
}
struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
+follow_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, int write)
{
return NULL;
}
@@ -518,6 +533,49 @@ int is_hugepage_mem_enough(size_t size)
return 1;
}

+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int write_access, unsigned long address)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ struct page *page;
+ unsigned long idx;
+ pte_t *pte;
+ int ret = VM_FAULT_MINOR;
+
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+ spin_lock(&mm->page_table_lock);
+
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+
+ if (!page) {
+ page = alloc_hugetlb_page();
+ if (!page) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ unlock_page(page);
+ if (ret) {
+ free_huge_page(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+ pte = huge_pte_alloc(mm, address);
+ set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+ flush_tlb_range(vma, address, address + HPAGE_SIZE);
+ update_mmu_cache(vma, address, *pte);
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+
static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int unused)
{
BUG();
diff -dupr linux-2.6.0/include/linux/hugetlb.h linux-2.6.0.HugeTLB/include/linux/hugetlb.h
--- linux-2.6.0/include/linux/hugetlb.h 2003-12-18 11:58:49.000000000 +0900
+++ linux-2.6.0.HugeTLB/include/linux/hugetlb.h 2003-12-19 09:47:25.000000000 +0900
@@ -23,10 +23,12 @@ struct page *follow_huge_addr(struct mm_
unsigned long address, int write);
struct vm_area_struct *hugepage_vma(struct mm_struct *mm,
unsigned long address);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write);
+struct page *follow_huge_pmd(struct mm_struct *mm, struct vm_area_struct *,
+ unsigned long address, pmd_t *pmd, int write);
int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
int pmd_huge(pmd_t pmd);
+extern int hugetlb_fault(struct mm_struct *, struct vm_area_struct *,
+ int, unsigned long);

extern int htlbpage_max;

@@ -63,6 +65,7 @@ static inline int is_vm_hugetlb_page(str
#define is_aligned_hugepage_range(addr, len) 0
#define pmd_huge(x) 0
#define is_hugepage_only_range(addr, len) 0
+#define hugetlb_fault(mm, vma, write, addr) 0

#ifndef HPAGE_MASK
#define HPAGE_MASK 0 /* Keep the compiler happy */
diff -dupr linux-2.6.0/mm/memory.c linux-2.6.0.HugeTLB/mm/memory.c
--- linux-2.6.0/mm/memory.c 2003-12-18 11:58:48.000000000 +0900
+++ linux-2.6.0.HugeTLB/mm/memory.c 2003-12-19 09:47:46.000000000 +0900
@@ -640,7 +640,7 @@ follow_page(struct mm_struct *mm, unsign
if (pmd_none(*pmd))
goto out;
if (pmd_huge(*pmd))
- return follow_huge_pmd(mm, address, pmd, write);
+ return follow_huge_pmd(mm, vma, address, pmd, write);
if (pmd_bad(*pmd))
goto out;

@@ -1603,7 +1603,7 @@ int handle_mm_fault(struct mm_struct *mm
inc_page_state(pgfault);

if (is_vm_hugetlb_page(vma))
- return VM_FAULT_SIGBUS; /* mapping truncation does this. */
+ return hugetlb_fault(mm, vma, write_access, address);

/*
* We need the page table lock to synchronize with kswapd