Provide functions for moving page tables upwards.
Signed-off-by: Peter Zijlstra <[email protected]>
Signed-off-by: Ollie Wild <[email protected]>
---
include/linux/mm.h | 7 +++
mm/mremap.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 110 insertions(+), 2 deletions(-)
Index: linux-2.6-2/include/linux/mm.h
===================================================================
--- linux-2.6-2.orig/include/linux/mm.h 2007-06-01 10:50:58.000000000 +0200
+++ linux-2.6-2/include/linux/mm.h 2007-06-01 10:57:26.000000000 +0200
@@ -788,6 +787,12 @@ int FASTCALL(set_page_dirty(struct page
int set_page_dirty_lock(struct page *page);
int clear_page_dirty_for_io(struct page *page);
+extern unsigned long move_page_tables(struct vm_area_struct *vma,
+ unsigned long old_addr, struct vm_area_struct *new_vma,
+ unsigned long new_addr, unsigned long len);
+extern unsigned long move_page_tables_up(struct vm_area_struct *vma,
+ unsigned long old_addr, struct vm_area_struct *new_vma,
+ unsigned long new_addr, unsigned long len);
extern unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr);
Index: linux-2.6-2/mm/mremap.c
===================================================================
--- linux-2.6-2.orig/mm/mremap.c 2007-06-01 10:50:58.000000000 +0200
+++ linux-2.6-2/mm/mremap.c 2007-06-01 10:57:45.000000000 +0200
@@ -118,9 +118,63 @@ static void move_ptes(struct vm_area_str
spin_unlock(&mapping->i_mmap_lock);
}
+/*
+ * move_ptes_up - relocate the ptes for [old_addr, old_end) to new_addr,
+ * visiting the pages from the highest address down to the lowest.
+ *
+ * @vma:      vma the ptes are taken from; its vm_mm supplies the mm
+ * @old_pmd:  pmd used to map the source pte page
+ * @old_addr: lowest source address (the loop stops once old_end reaches it)
+ * @old_end:  end of the source range; the first pte moved is the one
+ *            covering old_end - 1
+ * @new_vma:  destination vma; its vm_page_prot is handed to move_pte()
+ * @new_pmd:  pmd used to map the destination pte page
+ * @new_addr: lowest destination address
+ *
+ * Only one pte page is mapped per side and the pte pointers are simply
+ * decremented, so neither range may cross a pmd boundary;
+ * move_page_tables_up() clips its extents accordingly.
+ */
+static void move_ptes_up(struct vm_area_struct *vma, pmd_t *old_pmd,
+		unsigned long old_addr, unsigned long old_end,
+		struct vm_area_struct *new_vma, pmd_t *new_pmd,
+		unsigned long new_addr)
+{
+	struct address_space *mapping = NULL;
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t *old_pte, *new_pte, pte;
+	spinlock_t *old_ptl, *new_ptl;
+	unsigned long new_end = new_addr + (old_end - old_addr);
+
+	if (vma->vm_file) {
+		/*
+		 * Subtle point from Rajesh Venkatasubramanian: before
+		 * moving file-based ptes, we must lock vmtruncate out,
+		 * since it might clean the dst vma before the src vma,
+		 * and we propagate stale pages into the dst afterward.
+		 */
+		mapping = vma->vm_file->f_mapping;
+		spin_lock(&mapping->i_mmap_lock);
+		if (new_vma->vm_truncate_count &&
+		    new_vma->vm_truncate_count != vma->vm_truncate_count)
+			new_vma->vm_truncate_count = 0;
+	}
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * pte locks because exclusive mmap_sem prevents deadlock.
+	 *
+	 * Both pte pointers start at the entry for the last page of
+	 * their range (the one containing end - 1).
+	 */
+	old_pte = pte_offset_map_lock(mm, old_pmd, old_end-1, &old_ptl);
+	new_pte = pte_offset_map_nested(new_pmd, new_end-1);
+	new_ptl = pte_lockptr(mm, new_pmd);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	arch_enter_lazy_mmu_mode();
+
+	/* Walk downwards one page at a time, skipping empty source ptes. */
+	for (; old_end > old_addr; old_pte--, old_end -= PAGE_SIZE,
+				   new_pte--, new_end -= PAGE_SIZE) {
+		if (pte_none(*old_pte))
+			continue;
+		pte = ptep_clear_flush(vma, old_end-1, old_pte);
+		pte = move_pte(pte, new_vma->vm_page_prot, old_end-1, new_end-1);
+		set_pte_at(mm, new_end-1, new_pte, pte);
+	}
+
+	arch_leave_lazy_mmu_mode();
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	/*
+	 * The descending loop leaves both pointers one entry below the
+	 * lowest pte it visited, so step back up by one (not down, as the
+	 * ascending move_ptes() does) to hand the unmap helpers a pointer
+	 * that is still inside the pte page that was mapped above.
+	 */
+	pte_unmap_nested(new_pte + 1);
+	pte_unmap_unlock(old_pte + 1, old_ptl);
+	if (mapping)
+		spin_unlock(&mapping->i_mmap_lock);
+}
+
#define LATENCY_LIMIT (64 * PAGE_SIZE)
-static unsigned long move_page_tables(struct vm_area_struct *vma,
+unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len)
{
@@ -132,21 +186,25 @@ static unsigned long move_page_tables(st
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
+
next = (old_addr + PMD_SIZE) & PMD_MASK;
if (next - 1 > old_end)
next = old_end;
extent = next - old_addr;
+
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
if (!new_pmd)
break;
+
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
extent = next - new_addr;
if (extent > LATENCY_LIMIT)
extent = LATENCY_LIMIT;
+
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
new_vma, new_pmd, new_addr);
}
@@ -154,6 +212,51 @@ static unsigned long move_page_tables(st
return len + old_addr - old_end; /* how much done */
}
+/*
+ * move_page_tables_up - move the page-table entries for
+ * [old_addr, old_addr + len) in vma to [new_addr, new_addr + len) in
+ * new_vma, working from the top of both ranges downwards.
+ *
+ * Each pass handles one "extent" ending at the current old_end/new_end.
+ * The extent is clipped so that it stays within a single pmd on the old
+ * side, within a single pmd on the new side, and is no larger than
+ * LATENCY_LIMIT.
+ *
+ * Returns the number of bytes handled, counted down from the top of the
+ * range. Extents with no old pmd are skipped but still counted; the
+ * result is smaller than len only when alloc_new_pmd() fails and the
+ * walk stops early.
+ */
+unsigned long move_page_tables_up(struct vm_area_struct *vma,
+ unsigned long old_addr, struct vm_area_struct *new_vma,
+ unsigned long new_addr, unsigned long len)
+{
+ unsigned long extent, prev, old_end, new_end;
+ pmd_t *old_pmd, *new_pmd;
+
+ old_end = old_addr + len;
+ new_end = new_addr + len;
+ flush_cache_range(vma, old_addr, old_end);
+
+ /* old_end/new_end shrink by the extent handled (or skipped) each pass */
+ for (; old_end > old_addr; old_end -= extent, new_end -= extent) {
+ cond_resched();
+
+ /*
+ * calculate how far till prev PMD boundary for old,
+ * without going below the start of the range
+ */
+ prev = (old_end - 1) & PMD_MASK;
+ if (prev < old_addr)
+ prev = old_addr;
+ extent = old_end - prev;
+
+ /* nothing mapped under this old pmd: skip the whole extent */
+ old_pmd = get_old_pmd(vma->vm_mm, old_end-1);
+ if (!old_pmd)
+ continue;
+ /* allocation failure ends the walk; the return value reports progress */
+ new_pmd = alloc_new_pmd(vma->vm_mm, new_end-1);
+ if (!new_pmd)
+ break;
+
+ /*
+ * calculate and clip to prev PMD boundary for new
+ */
+ prev = (new_end - 1) & PMD_MASK;
+ if (extent > new_end - prev)
+ extent = new_end - prev;
+ /* NOTE(review): presumably bounds the work done between cond_resched() calls - confirm */
+ if (extent > LATENCY_LIMIT)
+ extent = LATENCY_LIMIT;
+
+ /* move the top 'extent' bytes of what is left */
+ move_ptes_up(vma, old_pmd, old_end - extent, old_end,
+ new_vma, new_pmd, new_end - extent);
+ }
+
+ /* old_addr + len is the original end; old_end is how far down we got */
+ return old_addr + len - old_end;
+}
+
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
unsigned long new_len, unsigned long new_addr)
--
On Tue, 5 Jun 2007, Peter Zijlstra wrote:
> Provide functions for moving page tables upwards.
Could you make this more general so that it allows moving arbitrary page
table pages? That would be useful for Mel's memory defragmentation, since
it would increase the types of pages that can be moved.
On Tue, 05 Jun 2007 17:05:26 +0200
Peter Zijlstra <[email protected]> wrote:
> Provide functions for moving page tables upwards.
>
> ...
>
> +extern unsigned long move_page_tables(struct vm_area_struct *vma,
> + unsigned long old_addr, struct vm_area_struct *new_vma,
> + unsigned long new_addr, unsigned long len);
> +extern unsigned long move_page_tables_up(struct vm_area_struct *vma,
> + unsigned long old_addr, struct vm_area_struct *new_vma,
> + unsigned long new_addr, unsigned long len);
> extern unsigned long do_mremap(unsigned long addr,
> unsigned long old_len, unsigned long new_len,
> unsigned long flags, unsigned long new_addr);
They become kernel-wide
> +static void move_ptes_up(struct vm_area_struct *vma, pmd_t *old_pmd,
> + unsigned long old_addr, unsigned long old_end,
> + struct vm_area_struct *new_vma, pmd_t *new_pmd,
> + unsigned long new_addr)
So some documentation might be in order...
On 6/5/07, Peter Zijlstra <[email protected]> wrote:
> Provide functions for moving page tables upwards.
Now that we're initializing the temporary stack location to
STACK_TOP_MAX, do we still need move_page_tables_up() for variable
length argument support? I originally added it into shift_arg_pages()
to support 32-bit apps exec'ing 64-bit apps when we were using
TASK_SIZE as our temporary location.
Maybe we should decouple this patch from the others and submit it as
an enhancement to support memory defragmentation.
Ollie
On Wed, 2007-06-06 at 12:06 -0700, Ollie Wild wrote:
> On 6/5/07, Peter Zijlstra <[email protected]> wrote:
> > Provide functions for moving page tables upwards.
>
> Now that we're initializing the temporary stack location to
> STACK_TOP_MAX, do we still need move_page_tables_up() for variable
> length argument support? I originally added it into shift_arg_pages()
> to support 32-bit apps exec'ing 64-bit apps when we were using
> TASK_SIZE as our temporary location.
>
> Maybe we should decouple this patch from the others and submit it as
> an enhancement to support memory defragmentation.
PA-RISC will still need it, right?
On the defrag thingy, I talked with Mel today, and neither of us can see
a useful application of these functions to his defrag work.
On 6/6/07, Peter Zijlstra <[email protected]> wrote:
> PA-RISC will still need it, right?
Originally, I thought since the PA-RISC stack grows up, we'd want to
place the stack at the bottom of memory and have copy_strings() and
friends work in the opposite direction. It turns out, though, that
this ends up being way more headache than it's worth, so I just
manually grow the stack down with expand_downwards().
Ollie
On Wed, 2007-06-06 at 12:50 -0700, Ollie Wild wrote:
> On 6/6/07, Peter Zijlstra <[email protected]> wrote:
> > PA-RISC will still need it, right?
>
> Originally, I thought since the PA-RISC stack grows up, we'd want to
> place the stack at the bottom of memory and have copy_strings() and
> friends work in the opposite direction. It turns out, though, that
> this ends up being way more headache than it's worth, so I just
> manually grow the stack down with expand_downwards().
Ah, ok. I'll drop this whole patch then.