2005-10-22 16:19:08

by Hugh Dickins

Subject: [PATCH 0/9] mm: page fault scalability

Here comes the third batch of my page fault scalability patches, against
2.6.14-rc4-mm1. This batch actually gets there. There will be a little
more to come later (e.g. comments in ppc64, page_table_lock in sparc64,
perhaps out-of-lining pte_offset_map_lock - I haven't quite decided, and
an attempt to update Documentation/vm/locking); but this is most of it.

Hugh

arch/arm/kernel/signal.c | 96 +++++++---------------------------------
arch/arm/kernel/traps.c | 14 +++--
arch/arm/mm/fault-armv.c | 7 ++
arch/arm/mm/mm-armv.c | 1
arch/cris/arch-v32/mm/tlb.c | 6 +-
arch/i386/kernel/vm86.c | 17 ++-----
arch/parisc/kernel/cache.c | 24 +++-------
arch/sh/mm/fault.c | 40 +++++++++-------
arch/sh64/mm/cache.c | 66 ++++++++++++---------------
arch/um/include/tlb.h | 1
arch/um/kernel/process_kern.c | 8 ++-
arch/um/kernel/skas/mmu.c | 1
arch/um/kernel/tlb.c | 9 ---
arch/um/kernel/tt/tlb.c | 36 ---------------
include/asm-generic/pgtable.h | 2
include/asm-parisc/cacheflush.h | 35 +++++++-------
include/linux/mempolicy.h | 3 -
include/linux/mm.h | 26 ++++++++++
include/linux/sched.h | 42 +++++++++++++++--
mm/Kconfig | 13 +++++
mm/filemap.c | 6 +-
mm/memory.c | 28 +++++++----
mm/mremap.c | 11 ++++
mm/rmap.c | 15 +++---
mm/swap_state.c | 3 -
25 files changed, 248 insertions(+), 262 deletions(-)


2005-10-22 16:20:32

by Hugh Dickins

Subject: [PATCH 1/9] mm: i386 sh sh64 ready for split ptlock

Use pte_offset_map_lock, instead of pte_offset_map (or inappropriate
pte_offset_kernel) and mm-wide page_table_lock, in sundry arch places.
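
For reference, the conversion pattern looks roughly like this - a minimal
sketch, not taken from any hunk below (the helper name is made up), just
mirroring the mark_screen_rdonly hunk:

/* Sketch only: write-protect one pte under the new locking */
static void sketch_wrprotect_one(struct mm_struct *mm, pmd_t *pmd,
				 unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *pte;

	/* was: spin_lock(&mm->page_table_lock); pte = pte_offset_map(pmd, addr); */
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (pte_present(*pte))
		set_pte(pte, pte_wrprotect(*pte));
	/* was: pte_unmap(pte); spin_unlock(&mm->page_table_lock); */
	pte_unmap_unlock(pte, ptl);
}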

The i386 vm86 mark_screen_rdonly: yes, there was and is an assumption
that the screen fits inside the one page table, as indeed it does.

The sh __do_page_fault: which handles both kernel faults (without lock)
and user mm faults (locked - though it set_pte without locking before).

The sh64 flush_cache_range and helpers: which wrongly thought callers
held page_table_lock before (only its tlb_start_vma did, and no longer
does so); moved the flush loop down, and adjusted the large versus small
range decision to consider a range which spans page tables as large.

Signed-off-by: Hugh Dickins <[email protected]>
---

arch/i386/kernel/vm86.c | 17 +++++-------
arch/sh/mm/fault.c | 40 ++++++++++++++++-------------
arch/sh64/mm/cache.c | 66 +++++++++++++++++++++---------------------------
3 files changed, 59 insertions(+), 64 deletions(-)

--- mm0/arch/i386/kernel/vm86.c 2005-10-17 12:05:09.000000000 +0100
+++ mm1/arch/i386/kernel/vm86.c 2005-10-22 14:06:01.000000000 +0100
@@ -134,17 +134,16 @@ struct pt_regs * fastcall save_v86_state
return ret;
}

-static void mark_screen_rdonly(struct task_struct * tsk)
+static void mark_screen_rdonly(struct mm_struct *mm)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte, *mapped;
+ pte_t *pte;
+ spinlock_t *ptl;
int i;

- preempt_disable();
- spin_lock(&tsk->mm->page_table_lock);
- pgd = pgd_offset(tsk->mm, 0xA0000);
+ pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
pud = pud_offset(pgd, 0xA0000);
@@ -153,16 +152,14 @@ static void mark_screen_rdonly(struct ta
pmd = pmd_offset(pud, 0xA0000);
if (pmd_none_or_clear_bad(pmd))
goto out;
- pte = mapped = pte_offset_map(pmd, 0xA0000);
+ pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
for (i = 0; i < 32; i++) {
if (pte_present(*pte))
set_pte(pte, pte_wrprotect(*pte));
pte++;
}
- pte_unmap(mapped);
+ pte_unmap_unlock(pte, ptl);
out:
- spin_unlock(&tsk->mm->page_table_lock);
- preempt_enable();
flush_tlb();
}

@@ -306,7 +303,7 @@ static void do_sys_vm86(struct kernel_vm

tsk->thread.screen_bitmap = info->screen_bitmap;
if (info->flags & VM86_SCREEN_BITMAP)
- mark_screen_rdonly(tsk);
+ mark_screen_rdonly(tsk->mm);
__asm__ __volatile__(
"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
"movl %0,%%esp\n\t"
--- mm0/arch/sh/mm/fault.c 2004-10-18 22:56:27.000000000 +0100
+++ mm1/arch/sh/mm/fault.c 2005-10-22 14:06:01.000000000 +0100
@@ -194,10 +194,13 @@ asmlinkage int __do_page_fault(struct pt
unsigned long address)
{
unsigned long addrmax = P4SEG;
- pgd_t *dir;
+ pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
pte_t entry;
+ struct mm_struct *mm;
+ spinlock_t *ptl;
+ int ret = 1;

#ifdef CONFIG_SH_KGDB
if (kgdb_nofault && kgdb_bus_err_hook)
@@ -208,28 +211,28 @@ asmlinkage int __do_page_fault(struct pt
addrmax = P4SEG_STORE_QUE + 0x04000000;
#endif

- if (address >= P3SEG && address < addrmax)
- dir = pgd_offset_k(address);
- else if (address >= TASK_SIZE)
+ if (address >= P3SEG && address < addrmax) {
+ pgd = pgd_offset_k(address);
+ mm = NULL;
+ } else if (address >= TASK_SIZE)
return 1;
- else if (!current->mm)
+ else if (!(mm = current->mm))
return 1;
else
- dir = pgd_offset(current->mm, address);
+ pgd = pgd_offset(mm, address);

- pmd = pmd_offset(dir, address);
- if (pmd_none(*pmd))
+ pmd = pmd_offset(pgd, address);
+ if (pmd_none_or_clear_bad(pmd))
return 1;
- if (pmd_bad(*pmd)) {
- pmd_ERROR(*pmd);
- pmd_clear(pmd);
- return 1;
- }
- pte = pte_offset_kernel(pmd, address);
+ if (mm)
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ else
+ pte = pte_offset_kernel(pmd, address);
+
entry = *pte;
if (pte_none(entry) || pte_not_present(entry)
|| (writeaccess && !pte_write(entry)))
- return 1;
+ goto unlock;

if (writeaccess)
entry = pte_mkdirty(entry);
@@ -251,8 +254,11 @@ asmlinkage int __do_page_fault(struct pt

set_pte(pte, entry);
update_mmu_cache(NULL, address, entry);
-
- return 0;
+ ret = 0;
+unlock:
+ if (mm)
+ pte_unmap_unlock(pte, ptl);
+ return ret;
}

void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
--- mm0/arch/sh64/mm/cache.c 2005-06-17 20:48:29.000000000 +0100
+++ mm1/arch/sh64/mm/cache.c 2005-10-22 14:06:01.000000000 +0100
@@ -584,32 +584,36 @@ static void sh64_dcache_purge_phy_page(u
}
}

-static void sh64_dcache_purge_user_page(struct mm_struct *mm, unsigned long eaddr)
+static void sh64_dcache_purge_user_pages(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
{
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
pte_t entry;
+ spinlock_t *ptl;
unsigned long paddr;

- /* NOTE : all the callers of this have mm->page_table_lock held, so the
- following page table traversal is safe even on SMP/pre-emptible. */
-
- if (!mm) return; /* No way to find physical address of page */
- pgd = pgd_offset(mm, eaddr);
- if (pgd_bad(*pgd)) return;
-
- pmd = pmd_offset(pgd, eaddr);
- if (pmd_none(*pmd) || pmd_bad(*pmd)) return;
-
- pte = pte_offset_kernel(pmd, eaddr);
- entry = *pte;
- if (pte_none(entry) || !pte_present(entry)) return;
-
- paddr = pte_val(entry) & PAGE_MASK;
-
- sh64_dcache_purge_coloured_phy_page(paddr, eaddr);
+ if (!mm)
+ return; /* No way to find physical address of page */

+ pgd = pgd_offset(mm, addr);
+ if (pgd_bad(*pgd))
+ return;
+
+ pmd = pmd_offset(pgd, addr);
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ return;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ do {
+ entry = *pte;
+ if (pte_none(entry) || !pte_present(entry))
+ continue;
+ paddr = pte_val(entry) & PAGE_MASK;
+ sh64_dcache_purge_coloured_phy_page(paddr, addr);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap_unlock(pte - 1, ptl);
}
/****************************************************************************/

@@ -668,7 +672,7 @@ static void sh64_dcache_purge_user_range
int n_pages;

n_pages = ((end - start) >> PAGE_SHIFT);
- if (n_pages >= 64) {
+ if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) {
#if 1
sh64_dcache_purge_all();
#else
@@ -707,20 +711,10 @@ static void sh64_dcache_purge_user_range
}
#endif
} else {
- /* 'Small' range */
- unsigned long aligned_start;
- unsigned long eaddr;
- unsigned long last_page_start;
-
- aligned_start = start & PAGE_MASK;
- /* 'end' is 1 byte beyond the end of the range */
- last_page_start = (end - 1) & PAGE_MASK;
-
- eaddr = aligned_start;
- while (eaddr <= last_page_start) {
- sh64_dcache_purge_user_page(mm, eaddr);
- eaddr += PAGE_SIZE;
- }
+ /* Small range, covered by a single page table page */
+ start &= PAGE_MASK; /* should already be so */
+ end = PAGE_ALIGN(end); /* should already be so */
+ sh64_dcache_purge_user_pages(mm, start, end);
}
return;
}
@@ -880,9 +874,7 @@ void flush_cache_range(struct vm_area_st
addresses from the user address space specified by mm, after writing
back any dirty data.

- Note(1), 'end' is 1 byte beyond the end of the range to flush.
-
- Note(2), this is called with mm->page_table_lock held.*/
+ Note, 'end' is 1 byte beyond the end of the range to flush. */

sh64_dcache_purge_user_range(mm, start, end);
sh64_icache_inv_user_page_range(mm, start, end);
@@ -898,7 +890,7 @@ void flush_cache_page(struct vm_area_str
the I-cache must be searched too in case the page in question is
both writable and being executed from (e.g. stack trampolines.)

- Note(1), this is called with mm->page_table_lock held.
+ Note, this is called with pte lock held.
*/

sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT);

2005-10-22 16:23:21

by Hugh Dickins

Subject: [PATCH 2/9] mm: arm ready for split ptlock

Prepare arm for the split page_table_lock: three issues.

Signal handling's preserve and restore of iwmmxt context currently
involves reading and writing that context to and from user space, while
holding page_table_lock to secure the user page(s) against kswapd. If
we split the lock, then the structure might span two pages, secured by
different locks. That would be manageable; but it seems simpler just
to read into and write from a kernel stack buffer, copying that out and
in without locking (the structure is 160 bytes in size, and here we're
near the top of the kernel stack). Or would the overhead be noticeable?

arm_syscall's cmpxchg emulation now uses pte_offset_map_lock, instead of
pte_offset_map and mm-wide page_table_lock; and strictly, it should now
also take mmap_sem before descending to the pmd, to guard against another
thread munmapping, and the page table being pulled out beneath this thread.

Updated two comments in fault-armv.c. adjust_pte is interesting, since
its modification of a pte in one part of the mm depends on the lock held
when calling update_mmu_cache for a pte in some other part of that mm.
This can't be done with a split page_table_lock (and we've already taken
the lowest lock in the hierarchy here): so we'll have to disable split
on arm, unless CONFIG_CPU_CACHE_VIPT ensures adjust_pte is never used.

Signed-off-by: Hugh Dickins <[email protected]>
---
Note: this is RMK's first sighting of this patch: he may not approve.

arch/arm/kernel/signal.c | 96 ++++++++---------------------------------------
arch/arm/kernel/traps.c | 14 ++++--
arch/arm/mm/fault-armv.c | 7 ++-
3 files changed, 33 insertions(+), 84 deletions(-)

--- mm1/arch/arm/kernel/signal.c 2005-10-11 12:07:07.000000000 +0100
+++ mm2/arch/arm/kernel/signal.c 2005-10-22 14:06:15.000000000 +0100
@@ -139,93 +139,33 @@ struct iwmmxt_sigframe {
unsigned long storage[0x98/4];
};

-static int page_present(struct mm_struct *mm, void __user *uptr, int wr)
-{
- unsigned long addr = (unsigned long)uptr;
- pgd_t *pgd = pgd_offset(mm, addr);
- if (pgd_present(*pgd)) {
- pmd_t *pmd = pmd_offset(pgd, addr);
- if (pmd_present(*pmd)) {
- pte_t *pte = pte_offset_map(pmd, addr);
- return (pte_present(*pte) && (!wr || pte_write(*pte)));
- }
- }
- return 0;
-}
-
-static int copy_locked(void __user *uptr, void *kptr, size_t size, int write,
- void (*copyfn)(void *, void __user *))
-{
- unsigned char v, __user *userptr = uptr;
- int err = 0;
-
- do {
- struct mm_struct *mm;
-
- if (write) {
- __put_user_error(0, userptr, err);
- __put_user_error(0, userptr + size - 1, err);
- } else {
- __get_user_error(v, userptr, err);
- __get_user_error(v, userptr + size - 1, err);
- }
-
- if (err)
- break;
-
- mm = current->mm;
- spin_lock(&mm->page_table_lock);
- if (page_present(mm, userptr, write) &&
- page_present(mm, userptr + size - 1, write)) {
- copyfn(kptr, uptr);
- } else
- err = 1;
- spin_unlock(&mm->page_table_lock);
- } while (err);
-
- return err;
-}
-
static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame)
{
- int err = 0;
+ char kbuf[sizeof(*frame) + 8];
+ struct iwmmxt_sigframe *kframe;

/* the iWMMXt context must be 64 bit aligned */
- WARN_ON((unsigned long)frame & 7);
-
- __put_user_error(IWMMXT_MAGIC0, &frame->magic0, err);
- __put_user_error(IWMMXT_MAGIC1, &frame->magic1, err);
-
- /*
- * iwmmxt_task_copy() doesn't check user permissions.
- * Let's do a dummy write on the upper boundary to ensure
- * access to user mem is OK all way up.
- */
- err |= copy_locked(&frame->storage, current_thread_info(),
- sizeof(frame->storage), 1, iwmmxt_task_copy);
- return err;
+ kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+ kframe->magic0 = IWMMXT_MAGIC0;
+ kframe->magic1 = IWMMXT_MAGIC1;
+ iwmmxt_task_copy(current_thread_info(), &kframe->storage);
+ return __copy_to_user(frame, kframe, sizeof(*frame));
}

static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame)
{
- unsigned long magic0, magic1;
- int err = 0;
+ char kbuf[sizeof(*frame) + 8];
+ struct iwmmxt_sigframe *kframe;

- /* the iWMMXt context is 64 bit aligned */
- WARN_ON((unsigned long)frame & 7);
-
- /*
- * Validate iWMMXt context signature.
- * Also, iwmmxt_task_restore() doesn't check user permissions.
- * Let's do a dummy write on the upper boundary to ensure
- * access to user mem is OK all way up.
- */
- __get_user_error(magic0, &frame->magic0, err);
- __get_user_error(magic1, &frame->magic1, err);
- if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1)
- err = copy_locked(&frame->storage, current_thread_info(),
- sizeof(frame->storage), 0, iwmmxt_task_restore);
- return err;
+ /* the iWMMXt context must be 64 bit aligned */
+ kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+ if (__copy_from_user(kframe, frame, sizeof(*frame)))
+ return -1;
+ if (kframe->magic0 != IWMMXT_MAGIC0 ||
+ kframe->magic1 != IWMMXT_MAGIC1)
+ return -1;
+ iwmmxt_task_restore(current_thread_info(), &kframe->storage);
+ return 0;
}

#endif
--- mm1/arch/arm/kernel/traps.c 2005-10-17 12:05:08.000000000 +0100
+++ mm2/arch/arm/kernel/traps.c 2005-10-22 14:06:15.000000000 +0100
@@ -481,29 +481,33 @@ asmlinkage int arm_syscall(int no, struc
unsigned long addr = regs->ARM_r2;
struct mm_struct *mm = current->mm;
pgd_t *pgd; pmd_t *pmd; pte_t *pte;
+ spinlock_t *ptl;

regs->ARM_cpsr &= ~PSR_C_BIT;
- spin_lock(&mm->page_table_lock);
+ down_read(&mm->mmap_sem);
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
goto bad_access;
pmd = pmd_offset(pgd, addr);
if (!pmd_present(*pmd))
goto bad_access;
- pte = pte_offset_map(pmd, addr);
- if (!pte_present(*pte) || !pte_write(*pte))
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte_present(*pte) || !pte_write(*pte)) {
+ pte_unmap_unlock(pte, ptl);
goto bad_access;
+ }
val = *(unsigned long *)addr;
val -= regs->ARM_r0;
if (val == 0) {
*(unsigned long *)addr = regs->ARM_r1;
regs->ARM_cpsr |= PSR_C_BIT;
}
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
+ up_read(&mm->mmap_sem);
return val;

bad_access:
- spin_unlock(&mm->page_table_lock);
+ up_read(&mm->mmap_sem);
/* simulate a write access fault */
do_DataAbort(addr, 15 + (1 << 11), regs);
return -1;
--- mm1/arch/arm/mm/fault-armv.c 2005-08-29 00:41:01.000000000 +0100
+++ mm2/arch/arm/mm/fault-armv.c 2005-10-22 14:06:16.000000000 +0100
@@ -26,6 +26,11 @@ static unsigned long shared_pte_mask = L
/*
* We take the easy way out of this problem - we make the
* PTE uncacheable. However, we leave the write buffer on.
+ *
+ * Note that the pte lock held when calling update_mmu_cache must also
+ * guard the pte (somewhere else in the same mm) that we modify here.
+ * Therefore those configurations which might call adjust_pte (those
+ * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
*/
static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
{
@@ -127,7 +132,7 @@ void __flush_dcache_page(struct address_
* 2. If we have multiple shared mappings of the same space in
* an object, we need to deal with the cache aliasing issues.
*
- * Note that the page_table_lock will be held.
+ * Note that the pte lock will be held.
*/
void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{

2005-10-22 16:24:29

by Hugh Dickins

Subject: [PATCH 3/9] mm: parisc pte atomicity

There's a worrying function translation_exists in parisc cacheflush.h,
unaffected by split ptlock since flush_dcache_page is using it on some
other mm, without any relevant lock. Oh well, make it a slightly more
robust by factoring the pfn check within it. And it looked liable to
confuse a camouflaged swap or file entry with a good pte: fix that too.

Signed-off-by: Hugh Dickins <[email protected]>
---

arch/parisc/kernel/cache.c | 24 +++++++++---------------
include/asm-parisc/cacheflush.h | 35 +++++++++++++++++++----------------
2 files changed, 28 insertions(+), 31 deletions(-)

--- mm2/arch/parisc/kernel/cache.c 2005-03-02 07:38:56.000000000 +0000
+++ mm3/arch/parisc/kernel/cache.c 2005-10-22 14:06:30.000000000 +0100
@@ -266,7 +266,6 @@ void flush_dcache_page(struct page *page
unsigned long offset;
unsigned long addr;
pgoff_t pgoff;
- pte_t *pte;
unsigned long pfn = page_to_pfn(page);


@@ -297,21 +296,16 @@ void flush_dcache_page(struct page *page
* taking a page fault if the pte doesn't exist.
* This is just for speed. If the page translation
* isn't there, there's no point exciting the
- * nadtlb handler into a nullification frenzy */
-
-
- if(!(pte = translation_exists(mpnt, addr)))
- continue;
-
- /* make sure we really have this page: the private
+ * nadtlb handler into a nullification frenzy.
+ *
+ * Make sure we really have this page: the private
* mappings may cover this area but have COW'd this
- * particular page */
- if(pte_pfn(*pte) != pfn)
- continue;
-
- __flush_cache_page(mpnt, addr);
-
- break;
+ * particular page.
+ */
+ if (translation_exists(mpnt, addr, pfn)) {
+ __flush_cache_page(mpnt, addr);
+ break;
+ }
}
flush_dcache_mmap_unlock(mapping);
}
--- mm2/include/asm-parisc/cacheflush.h 2005-10-11 12:07:49.000000000 +0100
+++ mm3/include/asm-parisc/cacheflush.h 2005-10-22 14:06:30.000000000 +0100
@@ -100,30 +100,34 @@ static inline void flush_cache_range(str

/* Simple function to work out if we have an existing address translation
* for a user space vma. */
-static inline pte_t *__translation_exists(struct mm_struct *mm,
- unsigned long addr)
+static inline int translation_exists(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn)
{
- pgd_t *pgd = pgd_offset(mm, addr);
+ pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
pmd_t *pmd;
- pte_t *pte;
+ pte_t pte;

if(pgd_none(*pgd))
- return NULL;
+ return 0;

pmd = pmd_offset(pgd, addr);
if(pmd_none(*pmd) || pmd_bad(*pmd))
- return NULL;
+ return 0;

- pte = pte_offset_map(pmd, addr);
+ /* We cannot take the pte lock here: flush_cache_page is usually
+ * called with pte lock already held. Whereas flush_dcache_page
+ * takes flush_dcache_mmap_lock, which is lower in the hierarchy:
+ * the vma itself is secure, but the pte might come or go racily.
+ */
+ pte = *pte_offset_map(pmd, addr);
+ /* But pte_unmap() does nothing on this architecture */
+
+ /* Filter out coincidental file entries and swap entries */
+ if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT)))
+ return 0;

- /* The PA flush mappings show up as pte_none, but they're
- * valid none the less */
- if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0))
- return NULL;
- return pte;
+ return pte_pfn(pte) == pfn;
}
-#define translation_exists(vma, addr) __translation_exists((vma)->vm_mm, addr)
-

/* Private function to flush a page from the cache of a non-current
* process. cr25 contains the Page Directory of the current user
@@ -175,9 +179,8 @@ flush_cache_page(struct vm_area_struct *
{
BUG_ON(!vma->vm_mm->context);

- if(likely(translation_exists(vma, vmaddr)))
+ if (likely(translation_exists(vma, vmaddr, pfn)))
__flush_cache_page(vma, vmaddr);

}
#endif
-

2005-10-22 16:25:51

by Hugh Dickins

Subject: [PATCH 4/9] mm: cris v32 mmu_context_lock

The cris v32 switch_mm guards get_mmu_context with next->page_table_lock:
good that it's not really SMP yet, since get_mmu_context messes with global
variables affecting other mms. Replace it by a global mmu_context_lock.

Signed-off-by: Hugh Dickins <[email protected]>
---

arch/cris/arch-v32/mm/tlb.c | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)

--- mm3/arch/cris/arch-v32/mm/tlb.c 2005-10-17 12:05:08.000000000 +0100
+++ mm4/arch/cris/arch-v32/mm/tlb.c 2005-10-22 14:06:42.000000000 +0100
@@ -175,6 +175,8 @@ init_new_context(struct task_struct *tsk
return 0;
}

+static DEFINE_SPINLOCK(mmu_context_lock);
+
/* Called in schedule() just before actually doing the switch_to. */
void
switch_mm(struct mm_struct *prev, struct mm_struct *next,
@@ -183,10 +185,10 @@ switch_mm(struct mm_struct *prev, struct
int cpu = smp_processor_id();

/* Make sure there is a MMU context. */
- spin_lock(&next->page_table_lock);
+ spin_lock(&mmu_context_lock);
get_mmu_context(next);
cpu_set(cpu, next->cpu_vm_mask);
- spin_unlock(&next->page_table_lock);
+ spin_unlock(&mmu_context_lock);

/*
* Remember the pgd for the fault handlers. Keep a seperate copy of it

2005-10-22 16:26:53

by Hugh Dickins

Subject: [PATCH 5/9] mm: uml pte atomicity

There's usually a good reason when a pte is examined without the lock;
but it makes me nervous when the pointer is dereferenced more than once.

Signed-off-by: Hugh Dickins <[email protected]>
---

arch/um/kernel/process_kern.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)

--- mm4/arch/um/kernel/process_kern.c 2005-10-17 12:05:14.000000000 +0100
+++ mm5/arch/um/kernel/process_kern.c 2005-10-22 14:06:58.000000000 +0100
@@ -222,6 +222,7 @@ void *um_virt_to_phys(struct task_struct
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pte_t ptent;

if(task->mm == NULL)
return(ERR_PTR(-EINVAL));
@@ -238,12 +239,13 @@ void *um_virt_to_phys(struct task_struct
return(ERR_PTR(-EINVAL));

pte = pte_offset_kernel(pmd, addr);
- if(!pte_present(*pte))
+ ptent = *pte;
+ if(!pte_present(ptent))
return(ERR_PTR(-EINVAL));

if(pte_out != NULL)
- *pte_out = *pte;
- return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK));
+ *pte_out = ptent;
+ return((void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK));
}

char *current_cmd(void)

2005-10-22 16:28:04

by Hugh Dickins

Subject: [PATCH 6/9] mm: uml kill unused

In worrying over the various pte operations in different architectures,
I came across some unused functions in UML: remove mprotect_kernel_vm,
protect_vm_page and addr_pte.

Signed-off-by: Hugh Dickins <[email protected]>
---

arch/um/include/tlb.h | 1 -
arch/um/kernel/tlb.c | 9 ---------
arch/um/kernel/tt/tlb.c | 36 ------------------------------------
3 files changed, 46 deletions(-)

--- mm5/arch/um/include/tlb.h 2005-10-11 12:07:16.000000000 +0100
+++ mm6/arch/um/include/tlb.h 2005-10-22 14:07:12.000000000 +0100
@@ -34,7 +34,6 @@ struct host_vm_op {
} u;
};

-extern void mprotect_kernel_vm(int w);
extern void force_flush_all(void);
extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
unsigned long end_addr, int force,
--- mm5/arch/um/kernel/tlb.c 2005-10-11 12:07:16.000000000 +0100
+++ mm6/arch/um/kernel/tlb.c 2005-10-22 14:07:12.000000000 +0100
@@ -334,15 +334,6 @@ pte_t *pte_offset_proc(pmd_t *pmd, unsig
return(pte_offset_kernel(pmd, address));
}

-pte_t *addr_pte(struct task_struct *task, unsigned long addr)
-{
- pgd_t *pgd = pgd_offset(task->mm, addr);
- pud_t *pud = pud_offset(pgd, addr);
- pmd_t *pmd = pmd_offset(pud, addr);
-
- return(pte_offset_map(pmd, addr));
-}
-
void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
{
address &= PAGE_MASK;
--- mm5/arch/um/kernel/tt/tlb.c 2005-10-11 12:07:16.000000000 +0100
+++ mm6/arch/um/kernel/tt/tlb.c 2005-10-22 14:07:12.000000000 +0100
@@ -74,42 +74,6 @@ void flush_tlb_kernel_range_tt(unsigned
atomic_inc(&vmchange_seq);
}

-static void protect_vm_page(unsigned long addr, int w, int must_succeed)
-{
- int err;
-
- err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed);
- if(err == 0) return;
- else if((err == -EFAULT) || (err == -ENOMEM)){
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
- protect_vm_page(addr, w, 1);
- }
- else panic("protect_vm_page : protect failed, errno = %d\n", err);
-}
-
-void mprotect_kernel_vm(int w)
-{
- struct mm_struct *mm;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- unsigned long addr;
-
- mm = &init_mm;
- for(addr = start_vm; addr < end_vm;){
- pgd = pgd_offset(mm, addr);
- pud = pud_offset(pgd, addr);
- pmd = pmd_offset(pud, addr);
- if(pmd_present(*pmd)){
- pte = pte_offset_kernel(pmd, addr);
- if(pte_present(*pte)) protect_vm_page(addr, w, 0);
- addr += PAGE_SIZE;
- }
- else addr += PMD_SIZE;
- }
-}
-
void flush_tlb_kernel_vm_tt(void)
{
flush_tlb_kernel_range(start_vm, end_vm);

2005-10-22 16:30:18

by Hugh Dickins

Subject: [PATCH 7/9] mm: split page table lock

Christoph Lameter demonstrated very poor scalability on the SGI 512-way,
with a many-threaded application which concurrently initializes different
parts of a large anonymous area.

This patch corrects that, by using a separate spinlock per page table
page, to guard the page table entries in that page, instead of using
the mm's single page_table_lock. (But even then, page_table_lock is
still used to guard page table allocation, and anon_vma allocation.)

In this implementation, the spinlock is tucked inside the struct page of
the page table page: with a BUILD_BUG_ON in case it overflows - which it
would in the case of 32-bit PA-RISC with spinlock debugging enabled.

Splitting the lock is not quite for free: another cacheline access.
Ideally, I suppose we would use split ptlock only for multi-threaded
processes on multi-cpu machines; but deciding that dynamically would
have its own costs. So for now enable it by config, at some number
of cpus - since the Kconfig language doesn't support inequalities, let the
preprocessor compare that with NR_CPUS. But I don't think it's worth
being user-configurable: for good testing of both split and unsplit
configs, split now at 4 cpus, and perhaps change that to 8 later.

There is a benefit even for singly threaded processes: kswapd can be
attacking one part of the mm while another part is busy faulting.
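
A small sketch of the effect, not part of the patch (pmd_a and pmd_b are
hypothetical pmd entries covering different parts of the same mm):

/*
 * Sketch only: with split ptlock configured in, pte_lockptr picks the
 * spinlock out of the page table page's struct page, so two pmds pointing
 * at different page table pages get different locks and their faults can
 * proceed concurrently.  In the unsplit config both expressions evaluate
 * to &mm->page_table_lock, and this returns 0.
 */
static int sketch_faults_can_overlap(struct mm_struct *mm,
				     pmd_t *pmd_a, pmd_t *pmd_b)
{
	return pte_lockptr(mm, pmd_a) != pte_lockptr(mm, pmd_b);
}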

Signed-off-by: Hugh Dickins <[email protected]>
---

arch/arm/mm/mm-armv.c | 1 +
arch/um/kernel/skas/mmu.c | 1 +
include/linux/mm.h | 26 +++++++++++++++++++++++++-
mm/Kconfig | 13 +++++++++++++
mm/memory.c | 24 ++++++++++++++----------
mm/mremap.c | 11 ++++++++++-
mm/rmap.c | 2 +-
7 files changed, 65 insertions(+), 13 deletions(-)

--- mm6/arch/arm/mm/mm-armv.c 2005-10-17 12:05:08.000000000 +0100
+++ mm7/arch/arm/mm/mm-armv.c 2005-10-22 14:07:25.000000000 +0100
@@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
pte = pmd_page(*pmd);
pmd_clear(pmd);
dec_page_state(nr_page_table_pages);
+ pte_lock_deinit(pte);
pte_free(pte);
pmd_free(pmd);
free:
--- mm6/arch/um/kernel/skas/mmu.c 2005-10-17 12:05:14.000000000 +0100
+++ mm7/arch/um/kernel/skas/mmu.c 2005-10-22 14:07:25.000000000 +0100
@@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_stru

if(!proc_mm || !ptrace_faultinfo){
free_page(mmu->id.stack);
+ pte_lock_deinit(virt_to_page(mmu->last_page_table));
pte_free_kernel((pte_t *) mmu->last_page_table);
dec_page_state(nr_page_table_pages);
#ifdef CONFIG_3_LEVEL_PGTABLES
--- mm6/include/linux/mm.h 2005-10-17 12:05:38.000000000 +0100
+++ mm7/include/linux/mm.h 2005-10-22 14:07:25.000000000 +0100
@@ -778,9 +778,33 @@ static inline pmd_t *pmd_alloc(struct mm
}
#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */

+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page) ((spinlock_t *)&((page)->private))
+#define pte_lock_init(_page) do { \
+ BUILD_BUG_ON((size_t)(__pte_lockptr((struct page *)0) + 1) > \
+ sizeof(struct page)); \
+ spin_lock_init(__pte_lockptr(_page)); \
+} while (0)
+#define pte_lock_deinit(page) ((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page) do {} while (0)
+#define pte_lock_deinit(page) do {} while (0)
+#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
#define pte_offset_map_lock(mm, pmd, address, ptlp) \
({ \
- spinlock_t *__ptl = &(mm)->page_table_lock; \
+ spinlock_t *__ptl = pte_lockptr(mm, pmd); \
pte_t *__pte = pte_offset_map(pmd, address); \
*(ptlp) = __ptl; \
spin_lock(__ptl); \
--- mm6/mm/Kconfig 2005-10-17 12:05:40.000000000 +0100
+++ mm7/mm/Kconfig 2005-10-22 14:07:25.000000000 +0100
@@ -119,3 +119,16 @@ config MEMORY_HOTPLUG

comment "Memory hotplug is currently incompatible with Software Suspend"
depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+ int
+ default "4096" if ARM && !CPU_CACHE_VIPT
+ default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+ default "4"
--- mm6/mm/memory.c 2005-10-17 12:05:40.000000000 +0100
+++ mm7/mm/memory.c 2005-10-22 14:07:25.000000000 +0100
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_ga
{
struct page *page = pmd_page(*pmd);
pmd_clear(pmd);
+ pte_lock_deinit(page);
pte_free_tlb(tlb, page);
dec_page_state(nr_page_table_pages);
tlb->mm->nr_ptes--;
@@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pm
if (!new)
return -ENOMEM;

+ pte_lock_init(new);
spin_lock(&mm->page_table_lock);
- if (pmd_present(*pmd)) /* Another has populated it */
+ if (pmd_present(*pmd)) { /* Another has populated it */
+ pte_lock_deinit(new);
pte_free(new);
- else {
+ } else {
mm->nr_ptes++;
inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
@@ -432,7 +435,7 @@ again:
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
- src_ptl = &src_mm->page_table_lock;
+ src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock(src_ptl);

do {
@@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range);
* (but do_wp_page is only called after already making such a check;
* and do_anonymous_page and do_no_page can safely check later on).
*/
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spin_lock(&mm->page_table_lock);
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
same = pte_same(*page_table, orig_pte);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
#endif
pte_unmap(page_table);
@@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct
pte_t pte;
int ret = VM_FAULT_MINOR;

- if (!pte_unmap_same(mm, page_table, orig_pte))
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
goto out;

entry = pte_to_swp_entry(orig_pte);
@@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_s
page_cache_get(page);
entry = mk_pte(page, vma->vm_page_prot);

- ptl = &mm->page_table_lock;
+ ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (!pte_none(*page_table))
goto release;
@@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct
pgoff_t pgoff;
int err;

- if (!pte_unmap_same(mm, page_table, orig_pte))
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
return VM_FAULT_MINOR;

if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
@@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struc
pte, pmd, write_access, entry);
}

- ptl = &mm->page_table_lock;
+ ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
--- mm6/mm/mremap.c 2005-10-17 12:05:40.000000000 +0100
+++ mm7/mm/mremap.c 2005-10-22 14:07:25.000000000 +0100
@@ -72,7 +72,7 @@ static void move_ptes(struct vm_area_str
struct address_space *mapping = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
- spinlock_t *old_ptl;
+ spinlock_t *old_ptl, *new_ptl;

if (vma->vm_file) {
/*
@@ -88,8 +88,15 @@ static void move_ptes(struct vm_area_str
new_vma->vm_truncate_count = 0;
}

+ /*
+ * We don't have to worry about the ordering of src and dst
+ * pte locks because exclusive mmap_sem prevents deadlock.
+ */
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_ptl = pte_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock(new_ptl);

for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
@@ -101,6 +108,8 @@ static void move_ptes(struct vm_area_str
set_pte_at(mm, new_addr, new_pte, pte);
}

+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
pte_unmap_nested(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
--- mm6/mm/rmap.c 2005-10-17 12:05:40.000000000 +0100
+++ mm7/mm/rmap.c 2005-10-22 14:07:25.000000000 +0100
@@ -274,7 +274,7 @@ pte_t *page_check_address(struct page *p
return NULL;
}

- ptl = &mm->page_table_lock;
+ ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;

2005-10-22 16:31:39

by Hugh Dickins

Subject: [PATCH 8/9] mm: fix rss and mmlist locking

A couple of oddities were guarded by page_table_lock, and are no longer
properly guarded when that is split.

The mm_counters of file_rss and anon_rss: in that case, make those an
atomic_t, or an atomic64_t if the architecture supports it. Definitions
by courtesy of Christoph Lameter: who spent considerable effort on more
scalable ways of counting, but found insufficient benefit in practice.
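
A sketch of how call sites see this, not part of the patch (the function name
is made up): the macros keep the callers identical whichever config is built.

/* Sketch only: accounting an anonymous page in a fault path */
static unsigned long sketch_account_anon(struct mm_struct *mm)
{
	inc_mm_counter(mm, anon_rss);	/* an atomic op when ptlock is split */
	return get_mm_rss(mm);		/* file_rss + anon_rss, however stored */
}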

And adding an mm with swap to the mmlist for swapoff: the list is well-
guarded by its own lock, but the list_empty check now has to be repeated
inside it.
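
The reason the check must be repeated, in sketch form (the same pattern as
both hunks below): two tasks can pass the cheap unlocked test at the same
time, and only the recheck under mmlist_lock stops a double list_add.

/* Sketch only: put mm on swapoff's mmlist without double-adding it */
static void sketch_add_to_mmlist(struct mm_struct *mm)
{
	if (unlikely(list_empty(&mm->mmlist))) {	/* unlocked peek, may race */
		spin_lock(&mmlist_lock);
		if (list_empty(&mm->mmlist))		/* recheck under the lock */
			list_add(&mm->mmlist, &init_mm.mmlist);
		spin_unlock(&mmlist_lock);
	}
}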

Signed-off-by: Hugh Dickins <[email protected]>
---

include/linux/sched.h | 42 ++++++++++++++++++++++++++++++++++++++----
mm/memory.c | 4 +++-
mm/rmap.c | 3 ++-
3 files changed, 43 insertions(+), 6 deletions(-)

--- mm7/include/linux/sched.h 2005-10-17 12:05:38.000000000 +0100
+++ mm8/include/linux/sched.h 2005-10-22 14:07:39.000000000 +0100
@@ -250,13 +250,47 @@ arch_get_unmapped_area_topdown(struct fi
extern void arch_unmap_area(struct mm_struct *, unsigned long);
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);

+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * The mm counters are not protected by its page_table_lock,
+ * so must be incremented atomically.
+ */
+#ifdef ATOMIC64_INIT
+#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
+typedef atomic64_t mm_counter_t;
+#else /* !ATOMIC64_INIT */
+/*
+ * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
+ * that is, at 16TB if using 4kB page size.
+ */
+#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
+typedef atomic_t mm_counter_t;
+#endif /* !ATOMIC64_INIT */
+
+#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+/*
+ * The mm counters are protected by its page_table_lock,
+ * so can be incremented directly.
+ */
#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
#define get_mm_counter(mm, member) ((mm)->_##member)
#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
#define inc_mm_counter(mm, member) (mm)->_##member++
#define dec_mm_counter(mm, member) (mm)->_##member--
-#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
+typedef unsigned long mm_counter_t;
+
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */

+#define get_mm_rss(mm) \
+ (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
#define update_hiwater_rss(mm) do { \
unsigned long _rss = get_mm_rss(mm); \
if ((mm)->hiwater_rss < _rss) \
@@ -267,8 +301,6 @@ extern void arch_unmap_area_topdown(stru
(mm)->hiwater_vm = (mm)->total_vm; \
} while (0)

-typedef unsigned long mm_counter_t;
-
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
@@ -292,7 +324,9 @@ struct mm_struct {
* by mmlist_lock
*/

- /* Special counters protected by the page_table_lock */
+ /* Special counters, in some configurations protected by the
+ * page_table_lock, in other configurations by being atomic.
+ */
mm_counter_t _file_rss;
mm_counter_t _anon_rss;

--- mm7/mm/memory.c 2005-10-22 14:07:25.000000000 +0100
+++ mm8/mm/memory.c 2005-10-22 14:07:40.000000000 +0100
@@ -372,7 +372,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
- list_add(&dst_mm->mmlist, &src_mm->mmlist);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
}
--- mm7/mm/rmap.c 2005-10-22 14:07:25.000000000 +0100
+++ mm8/mm/rmap.c 2005-10-22 14:07:40.000000000 +0100
@@ -559,7 +559,8 @@ static int try_to_unmap_one(struct page
swap_duplicate(entry);
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
- list_add(&mm->mmlist, &init_mm.mmlist);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));

2005-10-22 16:32:27

by Hugh Dickins

Subject: [PATCH 9/9] mm: update comments to pte lock

Updated several references to page_table_lock in common code comments.

Signed-off-by: Hugh Dickins <[email protected]>
---

include/asm-generic/pgtable.h | 2 +-
include/linux/mempolicy.h | 3 +--
mm/filemap.c | 6 +++---
mm/rmap.c | 10 +++++-----
mm/swap_state.c | 3 +--
5 files changed, 11 insertions(+), 13 deletions(-)

--- mm8/include/asm-generic/pgtable.h 2005-10-11 12:07:47.000000000 +0100
+++ mm9/include/asm-generic/pgtable.h 2005-10-22 14:07:54.000000000 +0100
@@ -8,7 +8,7 @@
* - update the page tables
* - inform the TLB about the new one
*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock.
+ * We hold the mm semaphore for reading, and the pte lock.
*
* Note: the old pte is known to not be writable, so we don't need to
* worry about dirty bits etc getting lost.
--- mm8/include/linux/mempolicy.h 2005-10-17 12:05:38.000000000 +0100
+++ mm9/include/linux/mempolicy.h 2005-10-22 14:07:54.000000000 +0100
@@ -47,8 +47,7 @@ struct vm_area_struct;
* Locking policy for interlave:
* In process context there is no locking because only the process accesses
* its own state. All vma manipulation is somewhat protected by a down_read on
- * mmap_sem. For allocating in the interleave policy the page_table_lock
- * must be also aquired to protect il_next.
+ * mmap_sem.
*
* Freeing policy:
* When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
--- mm8/mm/filemap.c 2005-10-17 12:05:40.000000000 +0100
+++ mm9/mm/filemap.c 2005-10-22 14:07:54.000000000 +0100
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct ki
*
* ->mmap_sem
* ->i_mmap_lock
- * ->page_table_lock (various places, mainly in mmap.c)
+ * ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_sem
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct ki
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
- * ->page_table_lock (anon_vma_prepare and various)
+ * ->page_table_lock or pte_lock (anon_vma_prepare and various)
*
- * ->page_table_lock
+ * ->page_table_lock or pte_lock
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
--- mm8/mm/rmap.c 2005-10-22 14:07:40.000000000 +0100
+++ mm9/mm/rmap.c 2005-10-22 14:07:54.000000000 +0100
@@ -32,7 +32,7 @@
* page->flags PG_locked (lock_page)
* mapping->i_mmap_lock
* anon_vma->lock
- * mm->page_table_lock
+ * mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
@@ -244,7 +244,7 @@ unsigned long page_address_in_vma(struct
/*
* Check that @page is mapped at @address into @mm.
*
- * On success returns with mapped pte and locked mm->page_table_lock.
+ * On success returns with pte mapped and locked.
*/
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
unsigned long address, spinlock_t **ptlp)
@@ -445,7 +445,7 @@ int page_referenced(struct page *page, i
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
@@ -468,7 +468,7 @@ void page_add_anon_rmap(struct page *pag
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
*/
void page_add_file_rmap(struct page *page)
{
@@ -483,7 +483,7 @@ void page_add_file_rmap(struct page *pag
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
*
- * Caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
*/
void page_remove_rmap(struct page *page)
{
--- mm8/mm/swap_state.c 2005-10-17 12:05:40.000000000 +0100
+++ mm9/mm/swap_state.c 2005-10-22 14:07:54.000000000 +0100
@@ -263,8 +263,7 @@ static inline void free_swap_cache(struc

/*
* Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page. Can not do a lock_page,
- * as we are holding the page_table_lock spinlock.
+ * this page if it is the last user of the page.
*/
void free_page_and_swap_cache(struct page *page)
{

2005-10-22 16:33:37

by Matthew Wilcox

Subject: Re: [PATCH 3/9] mm: parisc pte atomicity

On Sat, Oct 22, 2005 at 05:23:27PM +0100, Hugh Dickins wrote:
> There's a worrying function translation_exists in parisc cacheflush.h,
> unaffected by split ptlock since flush_dcache_page is using it on some
> other mm, without any relevant lock. Oh well, make it a slightly more
> robust by factoring the pfn check within it. And it looked liable to
> confuse a camouflaged swap or file entry with a good pte: fix that too.

I have to say I really don't understand VM at all. cc'ing the
parisc-linux list in case anyone there has a better understanding than I
do.

> Signed-off-by: Hugh Dickins <[email protected]>
> ---
>
> arch/parisc/kernel/cache.c | 24 +++++++++---------------
> include/asm-parisc/cacheflush.h | 35 +++++++++++++++++++----------------
> 2 files changed, 28 insertions(+), 31 deletions(-)
>
> --- mm2/arch/parisc/kernel/cache.c 2005-03-02 07:38:56.000000000 +0000
> +++ mm3/arch/parisc/kernel/cache.c 2005-10-22 14:06:30.000000000 +0100
> @@ -266,7 +266,6 @@ void flush_dcache_page(struct page *page
> unsigned long offset;
> unsigned long addr;
> pgoff_t pgoff;
> - pte_t *pte;
> unsigned long pfn = page_to_pfn(page);
>
>
> @@ -297,21 +296,16 @@ void flush_dcache_page(struct page *page
> * taking a page fault if the pte doesn't exist.
> * This is just for speed. If the page translation
> * isn't there, there's no point exciting the
> - * nadtlb handler into a nullification frenzy */
> -
> -
> - if(!(pte = translation_exists(mpnt, addr)))
> - continue;
> -
> - /* make sure we really have this page: the private
> + * nadtlb handler into a nullification frenzy.
> + *
> + * Make sure we really have this page: the private
> * mappings may cover this area but have COW'd this
> - * particular page */
> - if(pte_pfn(*pte) != pfn)
> - continue;
> -
> - __flush_cache_page(mpnt, addr);
> -
> - break;
> + * particular page.
> + */
> + if (translation_exists(mpnt, addr, pfn)) {
> + __flush_cache_page(mpnt, addr);
> + break;
> + }
> }
> flush_dcache_mmap_unlock(mapping);
> }
> --- mm2/include/asm-parisc/cacheflush.h 2005-10-11 12:07:49.000000000 +0100
> +++ mm3/include/asm-parisc/cacheflush.h 2005-10-22 14:06:30.000000000 +0100
> @@ -100,30 +100,34 @@ static inline void flush_cache_range(str
>
> /* Simple function to work out if we have an existing address translation
> * for a user space vma. */
> -static inline pte_t *__translation_exists(struct mm_struct *mm,
> - unsigned long addr)
> +static inline int translation_exists(struct vm_area_struct *vma,
> + unsigned long addr, unsigned long pfn)
> {
> - pgd_t *pgd = pgd_offset(mm, addr);
> + pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
> pmd_t *pmd;
> - pte_t *pte;
> + pte_t pte;
>
> if(pgd_none(*pgd))
> - return NULL;
> + return 0;
>
> pmd = pmd_offset(pgd, addr);
> if(pmd_none(*pmd) || pmd_bad(*pmd))
> - return NULL;
> + return 0;
>
> - pte = pte_offset_map(pmd, addr);
> + /* We cannot take the pte lock here: flush_cache_page is usually
> + * called with pte lock already held. Whereas flush_dcache_page
> + * takes flush_dcache_mmap_lock, which is lower in the hierarchy:
> + * the vma itself is secure, but the pte might come or go racily.
> + */
> + pte = *pte_offset_map(pmd, addr);
> + /* But pte_unmap() does nothing on this architecture */
> +
> + /* Filter out coincidental file entries and swap entries */
> + if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT)))
> + return 0;
>
> - /* The PA flush mappings show up as pte_none, but they're
> - * valid none the less */
> - if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0))
> - return NULL;
> - return pte;
> + return pte_pfn(pte) == pfn;
> }
> -#define translation_exists(vma, addr) __translation_exists((vma)->vm_mm, addr)
> -
>
> /* Private function to flush a page from the cache of a non-current
> * process. cr25 contains the Page Directory of the current user
> @@ -175,9 +179,8 @@ flush_cache_page(struct vm_area_struct *
> {
> BUG_ON(!vma->vm_mm->context);
>
> - if(likely(translation_exists(vma, vmaddr)))
> + if (likely(translation_exists(vma, vmaddr, pfn)))
> __flush_cache_page(vma, vmaddr);
>
> }
> #endif
> -

2005-10-22 17:02:48

by Russell King

Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Sat, Oct 22, 2005 at 05:22:20PM +0100, Hugh Dickins wrote:
> Signal handling's preserve and restore of iwmmxt context currently
> involves reading and writing that context to and from user space, while
> holding page_table_lock to secure the user page(s) against kswapd. If
> we split the lock, then the structure might span two pages, secured by
> different locks. That would be manageable; but it seems simpler just
> to read into and write from a kernel stack buffer, copying that out and
> in without locking (the structure is 160 bytes in size, and here we're
> near the top of the kernel stack). Or would the overhead be noticeable?

Please contact Nicolas Pitre about that - that was my suggestion,
but ISTR apparently the overhead is too high.

> arm_syscall's cmpxchg emulation use pte_offset_map_lock, instead of
> pte_offset_map and mm-wide page_table_lock; and strictly, it should now
> also take mmap_sem before descending to pmd, to guard against another
> thread munmapping, and the page table pulled out beneath this thread.

Now that I look at it, it's probably buggy - if the page isn't already
dirty, it will modify without the COW action. Again, please contact
Nicolas about this.

> Updated two comments in fault-armv.c. adjust_pte is interesting, since
> its modification of a pte in one part of the mm depends on the lock held
> when calling update_mmu_cache for a pte in some other part of that mm.
> This can't be done with a split page_table_lock (and we've already taken
> the lowest lock in the hierarchy here): so we'll have to disable split
> on arm, unless CONFIG_CPU_CACHE_VIPT to ensures adjust_pte never used.

Well, adjust_pte is extremely critical to ensure correct cache behaviour
(and therefore data integrity) so if split ptlock is incompatible with
this, split ptlock loses.

As far as adjust_pte being called, it's only called for VIVT caches,
which means the configuration has to be: if VIVT, disable split ptlock.

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core

2005-10-22 17:09:09

by James Bottomley

Subject: Re: [parisc-linux] Re: [PATCH 3/9] mm: parisc pte atomicity

On Sat, 2005-10-22 at 10:33 -0600, Matthew Wilcox wrote:
> On Sat, Oct 22, 2005 at 05:23:27PM +0100, Hugh Dickins wrote:
> > There's a worrying function translation_exists in parisc cacheflush.h,
> > unaffected by split ptlock since flush_dcache_page is using it on some
> > other mm, without any relevant lock. Oh well, make it a slightly more
> > robust by factoring the pfn check within it. And it looked liable to
> > confuse a camouflaged swap or file entry with a good pte: fix that too.
>
> I have to say I really don't understand VM at all. cc'ing the
> parisc-linux list in case anyone there has a better understanding than I
> do.

Well, I wrote the code for translation_exists() so I suppose that's
me...

Do you have a reference to the thread that triggered this? I need more
context to decide what the actual problem is.

Let me explain what translation_exists() is though for the benefit of
the mm people.

Parisc is a VIPT architecture, so that would ordinarily entail a lot of
cache flushing through process spaces for shared pages. However, we use
an optimisation of making all user space shared pages congruent, so
flushing a single one makes the cache coherent for all the others (this
is also a cache usage optimisation).

So, what our flush_dcache_page() does is it selects the first user page
it comes across to flush knowing that flushing this is sufficient to
flush all the others.

Unfortunately, there's a catch: the page we're flushing must have been
mapped into the user process (not guaranteed even if the area is in the
vma list) otherwise the flush has no effect (a VIPT cache flush must
know the translation of the page it's flushing), so we have to check the
validity of the translation before doing the flush.

On parisc, if we try to flush a page without a translation, the flush is
picked up by our software tlb miss handlers, which actually nullify the
instruction. But since we have to flush a page as a set of non-interlocking
cache line flushes (about 128 of them per page with a cache width of 32),
and the tlb handler is invoked for every flush instruction (because the
translation continues not to exist), that makes the flush operation
extremely slow: 128 interruptions of the execution stream per flush.

So, the uses of translation_exists() are threefold:

1) Make sure we execute flush_dcache_page() correctly (rather than
executing a flush that has no effect)
2) optimise single page flushing: don't excite the tlb miss handlers if
there's no translation
3) optimise pte lookup (that's why translation_exists returns the pte
pointer); since we already have to walk the page tables to answer the
question, the return value may as well be the pte entry or NULL rather
than true or false.

James


2005-10-22 18:24:29

by Paul Mundt

Subject: Re: [PATCH 1/9] mm: i386 sh sh64 ready for split ptlock

On Sat, Oct 22, 2005 at 05:19:30PM +0100, Hugh Dickins wrote:
> The sh __do_page_fault: which handles both kernel faults (without lock)
> and user mm faults (locked - though it set_pte without locking before).
>
> The sh64 flush_cache_range and helpers: which wrongly thought callers
> held page_table_lock before (only its tlb_start_vma did, and no longer
> does so); moved the flush loop down, and adjusted the large versus small
> range decision to consider a range which spans page tables as large.
>
> Signed-off-by: Hugh Dickins <[email protected]>

Yes, that was very clearly broken, thanks for noting this.

I won't be able to test the sh64 bits until later this week, but both
of these changes look good to me. I'll take care of cleaning up any
breakage this introduces, thanks Hugh.

Acked-by: Paul Mundt <[email protected]>



2005-10-22 19:30:32

by Jeff Dike

Subject: Re: [PATCH 6/9] mm: uml kill unused

On Sat, Oct 22, 2005 at 05:27:01PM +0100, Hugh Dickins wrote:
> In worrying over the various pte operations in different architectures,
> I came across some unused functions in UML: remove mprotect_kernel_vm,
> protect_vm_page and addr_pte.

NACK on the addr_pte part of this - I use this from gdb all the time when I'm
concerned with page tables. The other two are fine.

Jeff

2005-10-23 07:50:30

by Hugh Dickins

Subject: [PATCH 6/9 take 2] mm: uml kill unused

In worrying over the various pte operations in different architectures,
I came across some unused functions in UML: remove mprotect_kernel_vm
and protect_vm_page; but leave addr_pte, which Jeff notes is useful in gdb.

Signed-off-by: Hugh Dickins <[email protected]>
---
This replaces yesterday's 6/9, following Jeff's feedback - thanks.

arch/um/include/tlb.h | 1 -
arch/um/kernel/tt/tlb.c | 36 ------------------------------------
2 files changed, 37 deletions(-)

--- mm5/arch/um/include/tlb.h 2005-10-11 12:07:16.000000000 +0100
+++ mm6/arch/um/include/tlb.h 2005-10-22 14:07:12.000000000 +0100
@@ -34,7 +34,6 @@ struct host_vm_op {
} u;
};

-extern void mprotect_kernel_vm(int w);
extern void force_flush_all(void);
extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
unsigned long end_addr, int force,
--- mm5/arch/um/kernel/tt/tlb.c 2005-10-11 12:07:16.000000000 +0100
+++ mm6/arch/um/kernel/tt/tlb.c 2005-10-22 14:07:12.000000000 +0100
@@ -74,42 +74,6 @@ void flush_tlb_kernel_range_tt(unsigned
atomic_inc(&vmchange_seq);
}

-static void protect_vm_page(unsigned long addr, int w, int must_succeed)
-{
- int err;
-
- err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed);
- if(err == 0) return;
- else if((err == -EFAULT) || (err == -ENOMEM)){
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
- protect_vm_page(addr, w, 1);
- }
- else panic("protect_vm_page : protect failed, errno = %d\n", err);
-}
-
-void mprotect_kernel_vm(int w)
-{
- struct mm_struct *mm;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- unsigned long addr;
-
- mm = &init_mm;
- for(addr = start_vm; addr < end_vm;){
- pgd = pgd_offset(mm, addr);
- pud = pud_offset(pgd, addr);
- pmd = pmd_offset(pud, addr);
- if(pmd_present(*pmd)){
- pte = pte_offset_kernel(pmd, addr);
- if(pte_present(*pte)) protect_vm_page(addr, w, 0);
- addr += PAGE_SIZE;
- }
- else addr += PMD_SIZE;
- }
-}
-
void flush_tlb_kernel_vm_tt(void)
{
flush_tlb_kernel_range(start_vm, end_vm);

2005-10-23 08:28:59

by Hugh Dickins

Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

Many thanks for your rapid response...

On Sat, 22 Oct 2005, Russell King wrote:
> On Sat, Oct 22, 2005 at 05:22:20PM +0100, Hugh Dickins wrote:
> > Signal handling's preserve and restore of iwmmxt context currently
> > involves reading and writing that context to and from user space, while
> > holding page_table_lock to secure the user page(s) against kswapd. If
> > we split the lock, then the structure might span two pages, secured by
> > different locks. That would be manageable; but it seems simpler just
> > to read into and write from a kernel stack buffer, copying that out and
> > in without locking (the structure is 160 bytes in size, and here we're
> > near the top of the kernel stack). Or would the overhead be noticeable?
>
> Please contact Nicolas Pitre about that - that was my suggestion,
> but ISTR apparently the overhead is too high.

Right, I've CC'ed him, and will forward the original patch in a moment.

It'd be perfectly possible to lock two extents in there, just seems like
over-engineering, and yet more strange mm-dependent code down in the arch
than I'd like to put there.

If Nicolas insists on avoiding the copies, then the simplest answer is
to disable split ptlock in the CONFIG_IWMMXT case too, and replace my
signal.c mods by just a comment highlighting the issue.

ARM is not the architecture (ia64) which prompted this page fault
scalability business. I've no idea of its benefit or otherwise on ARM.
Just didn't want ARM to be the only one not invited to the party.

> > arm_syscall's cmpxchg emulation use pte_offset_map_lock, instead of
> > pte_offset_map and mm-wide page_table_lock; and strictly, it should now
> > also take mmap_sem before descending to pmd, to guard against another
> > thread munmapping, and the page table pulled out beneath this thread.
>
> Now that I look at it, it's probably buggy - if the page isn't already
> dirty, it will modify without the COW action. Again, please contact
> Nicolas about this.

That hadn't crossed my mind, good catch. (Though bad both before and
after my change.) I did wonder about the alignment, the assumption that
*(unsigned long *)addr all fits within the same page - but assumed ARM's
instructions are suitably aligned?

It does look like that cmpxchg emulation needs to be done another way,
quite unrelated to my changes.

> > Updated two comments in fault-armv.c. adjust_pte is interesting, since
> > its modification of a pte in one part of the mm depends on the lock held
> > when calling update_mmu_cache for a pte in some other part of that mm.
> > This can't be done with a split page_table_lock (and we've already taken
> > the lowest lock in the hierarchy here): so we'll have to disable split
> on arm, unless CONFIG_CPU_CACHE_VIPT ensures adjust_pte is never used.
>
> Well, adjust_pte is extremely critical to ensure correct cache behaviour
> (and therefore data integrity) so if split ptlock is incompatible with
> this, split ptlock loses.

Unquestionably.

> As far as adjust_pte being called, it's only called for VIVT caches,
> which means the configuration has to do if VIVT, disable split ptlock.

Yes, patch 7/9 disables it on ARM unless CPU_CACHE_VIPT
(since in other cases cache_is_vivt may be determined at runtime).

config SPLIT_PTLOCK_CPUS
int
default "4096" if ARM && !CPU_CACHE_VIPT
default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
default "4"

Please let me know when ARM supports 4096 cpus, I'll change that then!

Hugh

2005-10-23 09:03:52

by Hugh Dickins

[permalink] [raw]
Subject: Re: [parisc-linux] Re: [PATCH 3/9] mm: parisc pte atomicity

On Sat, 22 Oct 2005, James Bottomley wrote:
> On Sat, 2005-10-22 at 10:33 -0600, Matthew Wilcox wrote:
> > On Sat, Oct 22, 2005 at 05:23:27PM +0100, Hugh Dickins wrote:
> > > There's a worrying function translation_exists in parisc cacheflush.h,
> > > unaffected by split ptlock since flush_dcache_page is using it on some
> > > other mm, without any relevant lock. Oh well, make it a slightly more
> > > robust by factoring the pfn check within it. And it looked liable to
> > > confuse a camouflaged swap or file entry with a good pte: fix that too.
> >
> > I have to say I really don't understand VM at all. cc'ing the
> > parisc-linux list in case anyone there has a better understanding than I
> > do.
>
> Well, I wrote the code for translation_exists() so I suppose that's
> me...
>
> Do you have a reference to the thread that triggered this? I need more
> context to decide what the actual problem is.

No thread, and I don't know of any problem reported. It's just that
in connection with my page fault scalability patches, splitting up the
page_table_lock, I've had to search through architectures for pte code
which may need changing.

Since translation_exists is doing no locking at present, it doesn't need
any change to match my ptlock changes. But it's a little worrying that
it has no locking. When called from flush_cache_page that's almost
always right, and locking would just deadlock on locks already taken
(I think core dumping uses flush_cache_page outside of locks, not a big
deal). But when called from flush_dcache_page, it's dealing with some
other mm entirely: anything could happen at any moment.

We're actually better off with my changes here than before. Since you're
coming from the vma_prio_tree, with the necessary locking on that, you
can now be sure that the page tables won't get pulled out from under you
(whereas before there was an unlikely window in which they might be).

> Let me explain what translation_exists() is though for the benefit of
> the mm people.

Thanks for taking the trouble, James (more than I needed to know!
but I'm reassured by what you say of the TLB miss handlers; I had worried
about how serious the consequence of doing it on an invalid entry might
be - feared a kernel panic or something).

> Parisc is a VIPT architecture, so that would ordinarily entail a lot of
> cache flushing through process spaces for shared pages. However, we use
> an optimisation of making all user space shared pages congruent, so
> flushing a single one makes the cache coherent for all the others (this
> is also a cache usage optimisation).
>
> So, what our flush_dcache_page() does is it selects the first user page
> it comes across to flush knowing that flushing this is sufficient to
> flush all the others.
>
> Unfortunately, there's a catch: the page we're flushing must have been
> mapped into the user process (not guaranteed even if the area is in the
> vma list) otherwise the flush has no effect (a VIPT cache flush must
> know the translation of the page it's flushing), so we have to check the
> validity of the translation before doing the flush.
>
> On parisc, if we try to flush a page without a translation, it's picked
> up by our software tlb miss handlers, which actually nullify the
> instruction. But since we have to flush a page as a set of non-interlocking
> cache line flushes (about 128 of them per page with a cache width of 32),
> and the tlb handler is invoked for every flush instruction (because the
> translation continues not to exist), that makes the flush operation
> extremely slow: 128 interruptions of the execution stream per flush.
>
> So, the uses of translation_exists() are threefold
>
> 1) Make sure we execute flush_dcache_page() correctly (rather than
> executing a flush that has no effect)
> 2) optimise single page flushing: don't excite the tlb miss handlers if
> there's no translation
> 3) optimise pte lookup (that's why translation_exists returns the pte
> pointer); since we already have to walk the page tables to answer the
> question, the return value may as well be the pte entry or NULL rather
> than true or false.

Regarding 3, I changed it to boolean because the only use made of the pte
pointer was to check pfn: which check I folded inside translation_exists,
so the pte it's checking for pfn cannot have changed since it was checked
for being present (repeated dereferences of pte pointer without locking
make me nervous, and I changed some code in other architectures for that).

I'm right, aren't I? that the previous pte_none test was actually letting
through swap entries and pte_file entries which might look as if they had
the right pfn, just by coincidence of their offsets? So flush_dcache_page
would stop, thinking it had done a good flush, when actually it hadn't.

But races remain: the pte good at the moment translation_exists checks it,
may have been taken out by kswapd a moment later (flush_dcache_mmap_lock
only secures the vma_prio_tree structure, not ptes within the vmas);
or it may have been COWed in the interval; or zapped from the mm.

Can you get a success code out of __flush_cache_page? Or perhaps you
should run translation_exists a second time after the __flush_cache_page,
to check nothing changed (the pte pointer would then be helpful, to save
descending a second time)? Or perhaps it all works out, that any such
change which might occur, itself does the __flush_cache_page you need?

Hugh

2005-10-23 15:05:46

by James Bottomley

[permalink] [raw]
Subject: Re: [parisc-linux] Re: [PATCH 3/9] mm: parisc pte atomicity

On Sun, 2005-10-23 at 10:02 +0100, Hugh Dickins wrote:
> No thread, and I don't know of any problem reported. It's just that
> in connection with my page fault scalability patches, splitting up the
> page_table_lock, I've had to search through architectures for pte code
> which may need changing.
>
> Since translation_exists is doing no locking at present, it doesn't need
> any change to match my ptlock changes. But it's a little worrying that
> it has no locking. When called from flush_cache_page that's almost
> always right, and locking would just deadlock on locks already taken
> (I think core dumping uses flush_cache_page outside of locks, not a big
> deal). But when called from flush_dcache_page, it's dealing with some
> other mm entirely: anything could happen at any moment.

Yes ... and flush_dcache_page() is the only one we really care about;
flush_cache_page() is just an optimisation.

> We're actually better off with my changes here than before. Since you're
> coming from the vma_prio_tree, with the necessary locking on that, you
> can now be sure that the page tables won't get pulled out from under you
> (whereas before there was an unlikely window in which they might be).

OK, thanks.

> > 1) Make sure we execute flush_dcache_page() correctly (rather than
> > executing a flush that has no effect)
> > 2) optimise single page flushing: don't excite the tlb miss handlers if
> > there's no translation
> > 3) optimise pte lookup (that's why translation_exists returns the pte
> > pointer); since we already have to walk the page tables to answer the
> > question, the return value may as well be the pte entry or NULL rather
> > than true or false.
>
> Regarding 3, I changed it to boolean because the only use made of the pte
> pointer was to check pfn: which check I folded inside translation_exists,
> so the pte it's checking for pfn cannot have changed since it was checked
> for being present (repeated dereferences of pte pointer without locking
> make me nervous, and I changed some code in other architectures for that).

Actually, yes, I think the code it was optimising already got removed,
so it's now a pointless optimisation.

Originally translation_exists returned any pte entry at all and it was
up to the caller to check the pte flags.

The change does slightly worry me in that it alters the behaviour of
flush_cache_page() because now it checks the pfn whereas previously it
didn't. This means that previously we would flush the COW'd page of a
shared mapping, now we won't.

> I'm right, aren't I? that the previous pte_none test was actually letting
> through swap entries and pte_file entries which might look as if they had
> the right pfn, just by coincidence of their offsets? So flush_dcache_page
> would stop, thinking it had done a good flush, when actually it hadn't.

Actually, no, pte_none() on parisc is either pte is zero or _PAGE_FLUSH
(which is our private flag saying we're preserving the pte entry for the
purposes of flushing even though it's gone) is set. However, now that I
look at it, are you thinking our ptep_get_and_clear() should be doing a
check for _PAGE_PRESENT before it sets _PAGE_FLUSH?

> But races remain: the pte good at the moment translation_exists checks it,
> may have been taken out by kswapd a moment later (flush_dcache_mmap_lock
> only secures the vma_prio_tree structure, not ptes within the vmas);
> or it may have been COWed in the interval; or zapped from the mm.
>
> Can you get a success code out of __flush_cache_page? Or perhaps you
> should run translation_exists a second time after the __flush_cache_page,
> to check nothing changed (the pte pointer would then be helpful, to save
> descending a second time)? Or perhaps it all works out, that any such
> change which might occur, itself does the __flush_cache_page you need?

Yes, I know ... I never liked the volatility of this, but it's better
than what was there before, believe me (previously we just flushed the
first entry we found regardless).

Getting a return code out of __flush_dcache_page() is hard because it
doesn't know if the tlb miss handler nullified the instructions it's
trying to execute; and they're interruption handlers (meaning we don't
push anything on the stack for them, they run in shadow registers), so
getting a return code out of them is next to impossible.

For the flush to be effective in the VIPT cache, we have to have a
virtual address with a valid translation that points to the correct
physical address. I suppose we could flush through the tmpalias space
instead. That's where we set up an artificial translation that's not the
actual vaddr but instead is congruent to the vaddr so the mapping is
effective in cache aliasing terms. Our congruence boundary is 4MB, so
we set up a private (per cpu) 4MB space (tmpalias) where we can set up
our pte's (or actually manufacture them in the tlb miss handler)
securely.
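
A minimal sketch of the congruence arithmetic described here (names and
constants are assumed for illustration; this is not parisc code): with a
4MB congruence boundary, any virtual address with the same offset modulo
4MB hits the same cache sets, so a per-cpu tmpalias window can stand in
for the real user mapping when flushing.

#define TMPALIAS_CONGRUENCE_SIZE	(4UL << 20)	/* 4MB congruence boundary */

/* Map a user vaddr to its alias inside a per-cpu tmpalias window: the
 * alias is congruent to the real vaddr, so flushing through it is as
 * effective, in cache aliasing terms, as flushing the real mapping. */
static inline unsigned long tmpalias_addr(unsigned long tmpalias_base,
					  unsigned long user_vaddr)
{
	return tmpalias_base + (user_vaddr & (TMPALIAS_CONGRUENCE_SIZE - 1));
}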

James


2005-10-23 16:54:28

by Nikita Danilov

[permalink] [raw]
Subject: Re: [PATCH 7/9] mm: split page table lock

Hugh Dickins writes:

[...]

> + * When freeing, reset page->mapping so free_pages_check won't complain.
> + */
> +#define __pte_lockptr(page) ((spinlock_t *)&((page)->private))
> +#define pte_lock_init(_page) do { \
> + BUILD_BUG_ON((size_t)(__pte_lockptr((struct page *)0) + 1) > \
> + sizeof(struct page)); \
> + spin_lock_init(__pte_lockptr(_page)); \
> +} while (0)

Looking at this, I think BUILD_BUG_ON() should be defined in a way that
allows it to be used outside of function scope too (see patch below,
compile-tested).

Nikita.
--
Fix comment describing BUILD_BUG_ON: BUG_ON is not an assertion
(unfortunately).

Also implement BUILD_BUG_ON in a way that can be used outside of a function
scope, so that compile time checks can be placed in convenient places (like in
a header, close to the definition of related constants and data-types).

Signed-off-by: Nikita Danilov <[email protected]>

include/linux/kernel.h | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)

diff -puN include/linux/kernel.h~BUILD_BUG_ON-fix-comment include/linux/kernel.h
--- git-linux/include/linux/kernel.h~BUILD_BUG_ON-fix-comment 2005-10-23 20:09:33.000000000 +0400
+++ git-linux-nikita/include/linux/kernel.h 2005-10-23 20:44:03.000000000 +0400
@@ -307,8 +307,9 @@ struct sysinfo {
char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */
};

-/* Force a compilation error if condition is false */
-#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+/* Force a compilation error if condition is true */
+#define BUILD_BUG_ON(x) \
+ void __dummy_build_bug_on(int __compile_time_assertion_failed_[-!!(x)])

#ifdef CONFIG_SYSCTL
extern int randomize_va_space;

_
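
A hypothetical illustration (invented names, not part of the patch) of
what the reworked macro permits: with the prototype-based form above, the
check can sit at file scope in a header, next to the constants it
constrains, instead of inside some unrelated function.

#include <linux/kernel.h>

#define MY_RING_ORDER	4
#define MY_RING_SIZE	(1 << MY_RING_ORDER)

/* Compile-time check at file scope: expands to a harmless function
 * declaration when the condition is false, and to a declaration with a
 * negatively-sized array parameter (a compile error) when it is true. */
BUILD_BUG_ON(MY_RING_SIZE & (MY_RING_SIZE - 1));	/* must stay a power of two */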

2005-10-23 21:28:04

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 7/9] mm: split page table lock

Hugh Dickins <[email protected]> wrote:
>
> In this implementation, the spinlock is tucked inside the struct page of
> the page table page: with a BUILD_BUG_ON in case it overflows - which it
> would in the case of 32-bit PA-RISC with spinlock debugging enabled.

eh? It's going to overflow an unsigned long on x86 too:

typedef struct {
raw_spinlock_t raw_lock;
#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
unsigned int break_lock;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
unsigned int magic, owner_cpu;
void *owner;
#endif
} spinlock_t;

I think we need a union here.

> +#define __pte_lockptr(page) ((spinlock_t *)&((page)->private))
> +#define pte_lock_init(_page) do { \
> + BUILD_BUG_ON((size_t)(__pte_lockptr((struct page *)0) + 1) > \
> + sizeof(struct page)); \

The above assumes that page.private is the final field in struct page.
That's fragile.

> Splitting the lock is not quite for free: another cacheline access.
> Ideally, I suppose we would use split ptlock only for multi-threaded
> processes on multi-cpu machines; but deciding that dynamically would
> have its own costs. So for now enable it by config, at some number
> of cpus - since the Kconfig language doesn't support inequalities, let
> preprocessor compare that with NR_CPUS. But I don't think it's worth
> being user-configurable: for good testing of both split and unsplit
> configs, split now at 4 cpus, and perhaps change that to 8 later.

I'll make it >= 2 for -mm.


2005-10-23 21:49:51

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 7/9] mm: split page table lock


I did the make-it-a-union thing. Seems OK.

Hugh Dickins <[email protected]> wrote:
>
> +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
> +/*
> + * We tuck a spinlock to guard each pagetable page into its struct page,
> + * at page->private, with BUILD_BUG_ON to make sure that this will not
> + * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
> + * When freeing, reset page->mapping so free_pages_check won't complain.
> + */
> +#define __pte_lockptr(page) ((spinlock_t *)&((page)->private))
> +#define pte_lock_init(_page) do { \
> + BUILD_BUG_ON((size_t)(__pte_lockptr((struct page *)0) + 1) > \
> + sizeof(struct page)); \
> + spin_lock_init(__pte_lockptr(_page)); \
> +} while (0)
> +#define pte_lock_deinit(page) ((page)->mapping = NULL)
> +#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
> +#else

Why does pte_lock_deinit() zap ->mapping? That doesn't seem to have
anything to do with anything?

2005-10-23 22:23:36

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 7/9] mm: split page table lock

Andrew Morton <[email protected]> wrote:
>
> Hugh Dickins <[email protected]> wrote:
> >
> > In this implementation, the spinlock is tucked inside the struct page of
> > the page table page: with a BUILD_BUG_ON in case it overflows - which it
> > would in the case of 32-bit PA-RISC with spinlock debugging enabled.
>
> eh? It's going to overflow an unsigned long on x86 too:

Ah, I think I see what you've done: assume that .index, .lru and .virtual
are unused on pagetable pages, so we can just overwrite them.

ick. I think I prefer the union, although it'll make struct page bigger
for CONFIG_PREEMPT+CONFIG_SMP+NR_CPUS>=4. hmm.


2005-10-24 03:10:33

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 7/9] mm: split page table lock

On Sun, 23 Oct 2005, Andrew Morton wrote:
> Hugh Dickins <[email protected]> wrote:
> > preprocessor compare that with NR_CPUS. But I don't think it's worth
> > being user-configurable: for good testing of both split and unsplit
> > configs, split now at 4 cpus, and perhaps change that to 8 later.
>
> I'll make it >= 2 for -mm.

The trouble with >= 2 is that it then leaves the unsplit page_table_lock
path untested, since UP isn't using page_table_lock at all. While it's
true that the unsplit page_table_lock path has had a long history of
testing, it's not inconceivable that I could have screwed it up.

With the default at 4, I think we've got quite good coverage between
those who configure NR_CPUS down to the 2 they actually have,
and those who leave it at its default or actually have 4.

Hugh

2005-10-24 03:13:13

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 7/9] mm: split page table lock

On Sun, 23 Oct 2005, Andrew Morton wrote:
> Hugh Dickins <[email protected]> wrote:
> > + * When freeing, reset page->mapping so free_pages_check won't complain.
> > + */
> > +#define __pte_lockptr(page) ((spinlock_t *)&((page)->private))
> > +#define pte_lock_init(_page) do { \
> > + BUILD_BUG_ON((size_t)(__pte_lockptr((struct page *)0) + 1) > \
> > + sizeof(struct page)); \
> > + spin_lock_init(__pte_lockptr(_page)); \
> > +} while (0)
> > +#define pte_lock_deinit(page) ((page)->mapping = NULL)
> > +#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
> > +#else
>
> Why does pte_lock_deinit() zap ->mapping? That doesn't seem to have
> anything to do with anything?

Nick had wondered the same originally, so I did add the comment above.
Bring it down to immediately above the deinit line if you prefer.

Hugh

2005-10-24 03:39:14

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 7/9] mm: split page table lock

On Sun, 23 Oct 2005, Andrew Morton wrote:
> Andrew Morton <[email protected]> wrote:
> > Hugh Dickins <[email protected]> wrote:
> > >
> > > In this implementation, the spinlock is tucked inside the struct page of
> > > the page table page: with a BUILD_BUG_ON in case it overflows - which it
> > > would in the case of 32-bit PA-RISC with spinlock debugging enabled.
> >
> > eh? It's going to overflow an unsigned long on x86 too:
>
> Ah, I think I see what you've done: assume that .index, .lru and .virtual
> are unused on pagetable pages, so we can just overwrite them.

That's right (so I'm ignoring some of your earlier stabs).
Sorry, it's looking like my comment block was inadequate.

I'm also assuming that it'd be very quickly noticed, by bad_page
or otherwise, if anyone changes these fields, so that what it's
overwriting becomes significant.

Would it be better if pte_lock_init(page) explicitly initialized each
field from _page->index onwards, so that any search for uses of that
page field shows up pte_lock_init? With the BUILD_BUG_ON adjusted
somehow so _page->virtual is excluded (I tend to erase that one from
my mind, but we most certainly don't want a spinlock overwriting it).

> ick. I think I prefer the union, although it'll make struct page bigger
> for CONFIG_PREEMPT+CONFIG_SMP+NR_CPUS>=4. hmm.

Hmm indeed. Definitely not the tradeoff I chose or would choose.

Hugh

2005-10-24 04:17:23

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 7/9] mm: split page table lock

Hugh Dickins <[email protected]> wrote:
>
> On Sun, 23 Oct 2005, Andrew Morton wrote:
> > Andrew Morton <[email protected]> wrote:
> > > Hugh Dickins <[email protected]> wrote:
> > > >
> > > > In this implementation, the spinlock is tucked inside the struct page of
> > > > the page table page: with a BUILD_BUG_ON in case it overflows - which it
> > > > would in the case of 32-bit PA-RISC with spinlock debugging enabled.
> > >
> > > eh? It's going to overflow an unsigned long on x86 too:
> >
> > Ah, I think I see what you've done: assume that .index, .lru and .virtual
> > are unused on pagetable pages, so we can just overwrite them.
>
> That's right (so I'm ignoring some of your earlier stabs).
> Sorry, it's looking like my comment block was inadequate.
>
> I'm also assuming that it'd be very quickly noticed, by bad_page
> or otherwise, if anyone changes these fields, so that what it's
> overwriting becomes significant.

Well it won't necessarily be noticed quickly - detecting an overflow depends
upon the right settings of CONFIG_PREEMPT, CONFIG_SMP, CONFIG_NR_CPUS,
WANT_PAGE_VIRTUAL, CONFIG_PAGE_OWNER and appropriate selection of
architecture and the absence of additional spinlock debugging patches and
the absence of reworked struct page layout!

I'm rather surprised that no architectures are already using page.mapping,
.index, .lru or .virtual in pte pages.

> Would it be better if pte_lock_init(page) explicitly initialize each
> field from _page->index onwards, so that any search for uses of that
> page field shows up pte_lock_init? With the BUILD_BUG_ON adjusted
> somehow so _page->virtual is excluded (I tend to erase that one from
> my mind, but we most certainly don't want a spinlock overwriting it).
>
> > ick. I think I prefer the union, although it'll make struct page bigger
> > for CONFIG_PREEMPT+CONFIG_SMP+NR_CPUS>=4. hmm.
>
> Hmm indeed. Definitely not the tradeoff I chose or would choose.

It's not that bad, really. I do think that this approach is just too
dirty, sorry. We can avoid it by moving something else into the union.
lru, perhaps?
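
For concreteness, a rough sketch of the kind of union being discussed -
this is not the actual -mm change, and the field set only approximates
the 2.6.14 struct page; the private/ptl overlay is the point:

struct page {
	unsigned long flags;
	atomic_t _count;
	atomic_t _mapcount;
	union {
		unsigned long private;	/* normal pages: bufferheads, swp_entry_t, ... */
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
		spinlock_t ptl;		/* page table pages: split pte lock */
#endif
	} u;
	struct address_space *mapping;
	pgoff_t index;
	struct list_head lru;
};

With this shape, struct page only grows when spinlock_t is larger than a
long (DEBUG_SPINLOCK, or break_lock with PREEMPT+SMP), which is the size
tradeoff being weighed above.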

2005-10-24 04:37:29

by Hugh Dickins

[permalink] [raw]
Subject: Re: [parisc-linux] Re: [PATCH 3/9] mm: parisc pte atomicity

On Sun, 23 Oct 2005, James Bottomley wrote:
> On Sun, 2005-10-23 at 10:02 +0100, Hugh Dickins wrote:
>
> The change does slightly worry me in that it alters the behaviour of
> flush_cache_page() because now it checks the pfn whereas previously it
> didn't. This means that previously we would flush the COW'd page of a
> shared mapping, now we won't.

Perhaps you're thinking of some use of flush_cache_page that no longer
exists in the tree? All the uses I see are passing in the right pfn,
and the question is why translation_exists even gets called by it.

Even those uses of flush_cache_page in asm-parisc/cacheflush.h itself
(which I'd missed before): copy_to_user_page and copy_from_user_page
are used by nothing but access_process_vm, after get_user_pages:
so any COW has already been done, and the right pfn is passed in.

> > I'm right, aren't I? that the previous pte_none test was actually letting
> > through swap entries and pte_file entries which might look as if they had
> > the right pfn, just by coincidence of their offsets? So flush_dcache_page
> > would stop, thinking it had done a good flush, when actually it hadn't.
>
> Actually, no, pte_none() on parisc is either pte is zero or _PAGE_FLUSH
> (which is our private flag saying we're preserving the pte entry for the
> purposes of flushing even though it's gone) is set.

Sorry, "pte_none test" was my lazy shorthand for the actual test:
if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0))
return NULL;
from which point on it assumes the pte is valid. I'm contending that
a swap entry or pte_file entry does not return NULL there, so is then
treated as a valid pte: and pte_pfn on it might match the target pfn.

I believe my
/* Filter out coincidental file entries and swap entries */
if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT)))
return 0;
is correct, and avoids that problem (plus avoiding the complication
of parisc's unintuitive pte_none).

> However, now that I
> look at it, are you thinking our ptep_get_and_clear() should be doing a
> check for _PAGE_PRESENT before it sets _PAGE_FLUSH?

That's certainly not what I was thinking. No, I'm pretty sure that
ptep_get_and_clear only gets called when we've got the right lock and
know the pte is present: I don't think you need to make any change there.
(The "pretty sure" reflecting that there's a bit of a macro maze here now,
so actually following up each reference would take a bit longer.)

But you may be seeing something I don't see: I've only just met your
_PAGE_FLUSH, and that unusual pte_none might be letting something
through that I'm overlooking, that you're aware of.

> > But races remain: the pte good at the moment translation_exists checks it,
> > may have been taken out by kswapd a moment later (flush_dcache_mmap_lock
> > only secures the vma_prio_tree structure, not ptes within the vmas);
> > or it may have been COWed in the interval; or zapped from the mm.
> >
> > Can you get a success code out of __flush_cache_page? Or perhaps you
> > should run translation_exists a second time after the __flush_cache_page,
> > to check nothing changed (the pte pointer would then be helpful, to save
> > descending a second time)? Or perhaps it all works out, that any such
> > change which might occur, itself does the __flush_cache_page you need?
>
> Yes, I know ... I never liked the volatility of this, but it's better
> than what was there before, believe me (previously we just flushed the
> first entry we found regardless).

I do believe you!

> Getting a return code out of __flush_dcache_page() is hard because it
> doesn't know if the tlb miss handler nullified the instructions it's
> trying to execute; and they're interruption handlers (meaning we don't
> push anything on the stack for them, they run in shadow registers), so
> getting a return code out of them is next to impossible.

Right, it was just a thought.

> For the flush to be effective in the VIPT cache, we have to have a
> virtual address with a valid translation that points to the correct
> physical address. I suppose we could flush through the tmpalias space
> instead. That's where we set up an artificial translation that's not the
> actual vaddr but instead is congruent to the vaddr so the mapping is
> effective in cache aliasing terms. Our congruence boundary is 4MB, so
> we set up a private (per cpu) 4MB space (tmpalias) where we can set up
> our pte's (or actually manufacture them in the tlb miss handler)
> securely.

I have no appreciation at all of how that would compare. On the one hand
it sounds like a lot of overhead you'd understandably prefer to avoid;
on the other hand, this hit-and-miss search of the vma_prio_tree might
be a worse overhead better avoided.

But I made two suggestions above that might be less work. One,
requiring no work but research, that _perhaps_ in all the racy cases
you can rely on the __flush_cache_page having been done by the racer?
I can't tell, and it may be obvious to you that that's a non-starter.

Other, that you just check the pte again after the __flush_cache_page,
if it's no longer right then assume the flush didn't work and continue.
There is a small chance that the pte was right before, wrong during the
flush, then right after (something else faulted it back in, while we
were away... I was about to say handling an interrupt, but actually
even interrupts are disabled here by flush_dcache_mmap_lock, since
that's a reuse of mapping->tree_lock). But it would improve the
safety by several orders of magnitude.

Hugh

2005-10-24 04:59:39

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 7/9] mm: split page table lock

On Sun, 23 Oct 2005, Andrew Morton wrote:
> Hugh Dickins <[email protected]> wrote:
>
> I'm rather surprised that no architectures are already using page.mapping,
> .index, .lru or .virtual in pte pages.

It is important that we don't corrupt .virtual. But beyond that, why
should they use those fields? Some were used in the pte_chains days,
and ppc held on to that usage for a while longer (to get from page
table to mm), but that is all gone now. Unless I've missed something.

> > > ick. I think I prefer the union, although it'll make struct page bigger
> > > for CONFIG_PREEMPT+CONFIG_SMP+NR_CPUS>=4. hmm.
> >
> > Hmm indeed. Definitely not the tradeoff I chose or would choose.
>
> It's not that bad, really. I do think that this approach is just too
> dirty, sorry. We can avoid it by moving something else into the union.
> lru, perhaps?

Perhaps. But revisiting this is not something I'm prepared to rush
into overnight - right answers come slower. I'd rather we start with
what I had, tested on few architectures as it is, and consider how to
robustify it more sedately.

Or, if you prefer, disable the split (raise Kconfig default to "4096"), or
back out the 7/9 for the moment; though I had been hoping for exposure.

Hugh

2005-10-24 14:56:53

by James Bottomley

[permalink] [raw]
Subject: Re: [parisc-linux] Re: [PATCH 3/9] mm: parisc pte atomicity

On Mon, 2005-10-24 at 05:36 +0100, Hugh Dickins wrote:
> On Sun, 23 Oct 2005, James Bottomley wrote:
> > The change does slightly worry me in that it alters the behaviour of
> > flush_cache_page() because now it checks the pfn whereas previously it
> > didn't. This means that previously we would flush the COW'd page of a
> > shared mapping, now we won't.
>
> Perhaps you're thinking of some use of flush_cache_page that no longer
> exists in the tree? All the uses I see are passing in the right pfn,
> and the question is why translation_exists even gets called by it.

No, just verifying that we won't get tripped up by the pfn test in the
one case where we didn't do it previously.

translation_exists() is just an optimisation in flush_cache_page(). If
there's no pte, we can't do an effective flush, so instead of relying on
flush instruction nullification to achieve the nop, we just call
translation_exists() to do the check for us.

> Even those uses of flush_cache_page in asm-parisc/cacheflush.h itself
> (which I'd missed before): copy_to_user_page and copy_from_user_page
> are used by nothing but access_process_vm, after get_user_pages:
> so any COW has already been done, and the right pfn is passed in.

OK, so the patch should be fine then.

> > > I'm right, aren't I? that the previous pte_none test was actually letting
> > > through swap entries and pte_file entries which might look as if they had
> > > the right pfn, just by coincidence of their offsets? So flush_dcache_page
> > > would stop, thinking it had done a good flush, when actually it hadn't.
> >
> > Actually, no, pte_none() on parisc is either pte is zero or _PAGE_FLUSH
> > (which is our private flag saying we're preserving the pte entry for the
> > purposes of flushing even though it's gone) is set.
>
> Sorry, "pte_none test" was my lazy shorthand for the actual test:
> if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0))
> return NULL;
> from which point on it assumes the pte is valid. I'm contending that
> a swap entry or pte_file entry does not return NULL there, so is then
> treated as a valid pte: and pte_pfn on it might match the target pfn.

OK, I see, yes, we could have had this problem.

> I believe my
> /* Filter out coincidental file entries and swap entries */
> if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT)))
> return 0;
> is correct, and avoids that problem (plus avoiding the complication
> of parisc's unintuitive pte_none).

Yes, it should.

> > However, now that I
> > look at it, are you thinking our ptep_get_and_clear() should be doing a
> > check for _PAGE_PRESENT before it sets _PAGE_FLUSH?
>
> That's certainly not what I was thinking. No, I'm pretty sure that
> ptep_get_and_clear only gets called when we've got the right lock and
> know the pte is present: I don't think you need to make any change there.
> (The "pretty sure" reflecting that there's a bit of a macro maze here now,
> so actually following up each reference would take a bit longer.)
>
> But you may be seeing something I don't see: I've only just met your
> _PAGE_FLUSH, and that unusual pte_none might be letting something
> through that I'm overlooking, that you're aware of.

Actually, I think what we need to ascertain is whether _PAGE_FLUSH is
still necessary. A long time ago, Linux would do stupid things like
clear the page table entry and then flush (which is legal on PIPT
architectures like x86). If it's no longer doing that then we no longer
need to worry about _PAGE_FLUSH.

> > > But races remain: the pte good at the moment translation_exists checks it,
> > > may have been taken out by kswapd a moment later (flush_dcache_mmap_lock
> > > only secures the vma_prio_tree structure, not ptes within the vmas);
> > > or it may have been COWed in the interval; or zapped from the mm.
> > >
> > > Can you get a success code out of __flush_cache_page? Or perhaps you
> > > should run translation_exists a second time after the __flush_cache_page,
> > > to check nothing changed (the pte pointer would then be helpful, to save
> > > descending a second time)? Or perhaps it all works out, that any such
> > > change which might occur, itself does the __flush_cache_page you need?
> >
> > Yes, I know ... I never liked the volatility of this, but it's better
> > than what was there before, believe me (previously we just flushed the
> > first entry we found regardless).
>
> I do believe you!
>
> > Getting a return code out of __flush_dcache_page() is hard because it
> > doesn't know if the tlb miss handler nullified the instructions it's
> > trying to execute; and they're interruption handlers (meaning we don't
> > push anything on the stack for them, they run in shadow registers), so
> > getting a return code out of them is next to impossible.
>
> Right, it was just a thought.
>
> > For the flush to be effective in the VIPT cache, we have to have a
> > virtual address with a valid translation that points to the correct
> > physical address. I suppose we could flush through the tmpalias space
> > > instead. That's where we set up an artificial translation that's not the
> > actual vaddr but instead is congruent to the vaddr so the mapping is
> > effective in cache aliasing terms. Our congruence boundary is 4MB, so
> > we set up a private (per cpu) 4MB space (tmpalias) where we can set up
> > our pte's (or actually manufacture them in the tlb miss handler)
> > securely.
>
> I have no appreciation at all how that would compare. On the one hand
> it sounds like a lot of overhead you'd understandably prefer to avoid;
> on the other hand, this hit-and-miss search of the vma_prio_tree might
> be a worse overhead better avoided. I've no appreciation at all.

Well ... it appeals because we could now implement flush_dcache_page()
without walking the vmas. All we need is the offset (because vm_start
is always congruent) which we can work out from the page->index, so we
never need any locking.
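
A sketch of the arithmetic implied here (assumed names, not a patch):
since every user mapping of a shared page is congruent, the page's cache
colour follows from its file offset alone, with no vma walk, no pte
lookup and no locking.

#define CONGRUENCE_SIZE	(4UL << 20)	/* parisc congruence boundary: 4MB */

static inline unsigned long shared_page_colour(struct page *page)
{
	/* offset of the page within its 4MB congruence group */
	return ((unsigned long)page->index << PAGE_CACHE_SHIFT) &
		(CONGRUENCE_SIZE - 1);
}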

> But I made two suggestions above that might be less work. One,
> requiring no work but research, that _perhaps_ in all the racy cases
> you can rely on the __flush_cache_page having been done by the racer?
> I can't tell, and it may be obvious to you that that's a non-starter.

Yes, that, theoretically, is correct. Any flush at that address to user
space affects all the mappings, whether it's done by clearing the pte or
by us in flush_dcache_page().

> Other, that you just check the pte again after the __flush_cache_page,
> if it's no longer right then assume the flush didn't work and continue.
> There is a small chance that the pte was right before, wrong during the
> flush, then right after (something else faulted it back in, while we
> were away... I was about to say handling an interrupt, but actually
> even interrupts are disabled here by flush_dcache_mmap_lock, since
> that's a reuse of mapping->tree_lock). But it would improve the
> safety by several orders of magnitude.

No ... we'll go with the racer already did it theory, I think ...

James


2005-10-24 16:50:21

by Hugh Dickins

[permalink] [raw]
Subject: Re: [parisc-linux] Re: [PATCH 3/9] mm: parisc pte atomicity

Good, I think we're converging.

On Mon, 24 Oct 2005, James Bottomley wrote:
> On Mon, 2005-10-24 at 05:36 +0100, Hugh Dickins wrote:

> Actually, I think what we need to ascertain is whether _PAGE_FLUSH is
> still necessary. A long time ago, Linux would do stupid things like
> clear the page table entry and then flush (which is legal on PIPT
> architectures like x86). If it's no longer doing that then we no longer
> need to worry about _PAGE_FLUSH.

I can't quite work out the sequence here. That issue was fixed four
years ago in 2.4.10: I sent the patch which moved the flush_cache_page
in vmscan.c, incorporating one posted by NIIBE Yutaka and David Miller,
to get SH right. Whereas PA-RISC's _PAGE_FLUSH got added a year later
in 2.4.20 and 2.5.45.

Ah, yes, it got switched around the wrong side again in 2.5, because
the pte_chains rmap (based on 2.4.9) propagated that error for a while.
Well, it's simply an error if flush_cache_page is called after clearing
the pte, and it now looks like _PAGE_FLUSH was the wrong fix. It would
be a nice simplification if you can now get rid of _PAGE_FLUSH (but I
won't be sending patches to do that myself).

> > > For the flush to be effective in the VIPT cache, we have to have a
> > > virtual address with a valid translation that points to the correct
> > > physical address. I suppose we could flush through the tmpalias space
> > > instead. That's where we set up an artificial translation that's not the
> > > actual vaddr but instead is congruent to the vaddr so the mapping is
> > > effective in cache aliasing terms. Our congruence boundary is 4MB, so
> > > we set up a private (per cpu) 4MB space (tmpalias) where we can set up
> > > our pte's (or actually manufacture them in the tlb miss handler)
> > > securely.
>
> Well ... it appeals because we could now implement flush_dcache_page()
> without walking the vmas. All we need is the offset (because vm_start
> is always congruent) which we can work out from the page->index, so we
> never need any locking.

If the overhead of setting them up and flushing them out after is low,
then yes, using your own tmpalias area sounds much better than getting
involved in the vma_prio_tree stuff. ARM would then be the only
architecture having to dabble in that.

> No ... we'll go with the racer already did it theory, I think ...

Okay, just don't blame me if it's horribly wrong ;)
I've simply not wrapped my head around the races, and would
need a much better understanding of SMP on PA-RISC to do so.

Right, it looks like we agree that my patch is necessary and valid as is;
but that the further races which worried me are not an issue, because the
racer will have done the necessary flush_cache_page. And two cleanups may
be done later: remove _PAGE_FLUSH, and use tmpalias instead of vma_prio_tree.

Hugh

2005-10-25 02:45:34

by Nicolas Pitre

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Sat, 22 Oct 2005, Russell King wrote:

> On Sat, Oct 22, 2005 at 05:22:20PM +0100, Hugh Dickins wrote:
> > Signal handling's preserve and restore of iwmmxt context currently
> > involves reading and writing that context to and from user space, while
> > holding page_table_lock to secure the user page(s) against kswapd. If
> > we split the lock, then the structure might span two pages, secured by
> > different locks. That would be manageable; but it seems simpler just
> > to read into and write from a kernel stack buffer, copying that out and
> > in without locking (the structure is 160 bytes in size, and here we're
> > near the top of the kernel stack). Or would the overhead be noticeable?
>
> Please contact Nicolas Pitre about that - that was my suggestion,
> but ISTR apparently the overhead is too high.

Going through a kernel buffer will simply double the overhead. Let's
suppose it should not be a big enough issue to stop the patch from being
merged though (and it looks cleaner that way). However I'd like for the
WARN_ON((unsigned long)frame & 7) to remain as both the kernel and user
buffers should be 64-bit aligned.

> > arm_syscall's cmpxchg emulation use pte_offset_map_lock, instead of
> > pte_offset_map and mm-wide page_table_lock; and strictly, it should now
> > also take mmap_sem before descending to pmd, to guard against another
> > thread munmapping, and the page table pulled out beneath this thread.
>
> Now that I look at it, it's probably buggy - if the page isn't already
> dirty, it will modify without the COW action. Again, please contact
> Nicolas about this.

I don't see how standard COW could not happen. The only difference with
a true write fault as if we used put_user() is that we bypassed the data
abort vector and the code to get the FAR value. Or am I missing
something?


Nicolas

2005-10-25 06:32:37

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Mon, 24 Oct 2005, Nicolas Pitre wrote:
> On Sat, 22 Oct 2005, Russell King wrote:
> > On Sat, Oct 22, 2005 at 05:22:20PM +0100, Hugh Dickins wrote:
> > > Signal handling's preserve and restore of iwmmxt context currently
> > > involves reading and writing that context to and from user space, while
> > > holding page_table_lock to secure the user page(s) against kswapd. If
> > > we split the lock, then the structure might span two pages, secured by
> > > different locks. That would be manageable; but it seems simpler just
> > > to read into and write from a kernel stack buffer, copying that out and
> > > in without locking (the structure is 160 bytes in size, and here we're
> > > near the top of the kernel stack). Or would the overhead be noticeable?
> >
> > Please contact Nicolas Pitre about that - that was my suggestion,
> > but ISTR apparently the overhead is too high.
>
> Going through a kernel buffer will simply double the overhead. Let's
> suppose it should not be a big enough issue to stop the patch from being
> merged though (and it looks cleaner that way). However I'd like for the
> WARN_ON((unsigned long)frame & 7) to remain as both the kernel and user
> buffers should be 64-bit aligned.

Okay, thanks. I can submit a patch to restore the WARN_ON later
(not today). Though that seems very odd to me, can you explain? I can
understand that the original kernel context needs to be 64-bit aligned,
and perhaps the iwmmxt_task_copy copy of it (I explicitly align that
buffer). But I can't see why the saved copy in userspace would need
to be 64-bit aligned, if it's just __copy_to_user'ed and
__copy_from_user'ed. Or is it also accessed in some other, direct way?

As to the overhead, let's see if it's serious or not in practice:
let me know if you find it to be a significant slowdown - thanks.

> > > arm_syscall's cmpxchg emulation use pte_offset_map_lock, instead of
> > > pte_offset_map and mm-wide page_table_lock; and strictly, it should now
> > > also take mmap_sem before descending to pmd, to guard against another
> > > thread munmapping, and the page table pulled out beneath this thread.
> >
> > Now that I look at it, it's probably buggy - if the page isn't already
> > dirty, it will modify without the COW action. Again, please contact
> > Nicolas about this.
>
> I don't see how standard COW could not happen. The only difference with
> a true write fault as if we used put_user() is that we bypassed the data
> abort vector and the code to get the FAR value. Or am I missing
> something?

It's certainly not buggy in the way that I thought (and I believe rmk
was thinking): you are checking pte_write, correctly within the lock,
so COW shouldn't come into it at all - it'll only work if the page is
already writable by the user.

But then I'm puzzled by your reply, saying you don't see how standard
COW could not happen.

Plus it seems a serious limitation: mightn't this be an area of executable
text that it has to write into, but is most likely readonly? Or an area
of data made readonly by fork? And is the alignment assured, that the
long will fit in one page only?

The better way to do it, I think, would be to use ptrace's
access_process_vm (it is our own mm, but that's okay).
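
The shape of the call being referred to, as a sketch only: the 2.6.14-era
helper is roughly int access_process_vm(struct task_struct *tsk, unsigned
long addr, void *buf, int len, int write), and it goes through
get_user_pages, so fault-in and COW are handled internally, at the cost
of far more work per access (and the read + compare + write would still
need its own atomicity story).

static int cmpxchg_read_word(unsigned long addr, unsigned long *val)
{
	/* read the user word via get_user_pages/kmap rather than a direct
	 * dereference; access_process_vm returns the bytes actually copied */
	if (access_process_vm(current, addr, val, sizeof(*val), 0) != sizeof(*val))
		return -EFAULT;
	return 0;
}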

Hugh

2005-10-25 07:56:04

by Russell King

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Mon, Oct 24, 2005 at 10:45:04PM -0400, Nicolas Pitre wrote:
> On Sat, 22 Oct 2005, Russell King wrote:
> > Please contact Nicolas Pitre about that - that was my suggestion,
> > but ISTR apparently the overhead is too high.
>
> Going through a kernel buffer will simply double the overhead. Let's
> suppose it should not be a big enough issue to stop the patch from being
> merged though (and it looks cleaner that way). However I'd like for the
> WARN_ON((unsigned long)frame & 7) to remain as both the kernel and user
> buffers should be 64-bit aligned.

The WARN_ON is pointless because we guarantee that the stack is always
64-bit aligned on signal handler setup and return.

> I don't see how standard COW could not happen. The only difference with
> a true write fault as if we used put_user() is that we bypassed the data
> abort vector and the code to get the FAR value. Or am I missing
> something?

pte_write() just says that the page _may_ be writable. It doesn't say
that the MMU is programmed to allow writes. If pte_dirty() doesn't
return true, that means that the page is _not_ writable from userspace.
If you write to it from kernel mode (without using put_user) you'll
bypass the MMU read-only protection and may end up writing to a page
owned by two separate processes.

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core

2005-10-25 14:55:48

by Nicolas Pitre

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Tue, 25 Oct 2005, Hugh Dickins wrote:

> On Mon, 24 Oct 2005, Nicolas Pitre wrote:
> > However I'd like for the
> > WARN_ON((unsigned long)frame & 7) to remain as both the kernel and user
> > buffers should be 64-bit aligned.
>
> Okay, thanks. I can submit a patch to restore the WARN_ON later
> (not today). Though that seems very odd to me, can you explain? I can
> understand that the original kernel context needs to be 64-bit aligned,
> and perhaps the iwmmxt_task_copy copy of it (I explicitly align that
> buffer). But I can't see why the saved copy in userspace would need
> to be 64-bit aligned, if it's just __copy_to_user'ed and __copy_from_
> user'ed. Or is it also accessed in some other, direct way?

If the user signal handler decides to do something with it then it
probably expects it to be 64-bit aligned as well per current ABI since
those are 64 bit registers.

> > > Now that I look at it, it's probably buggy - if the page isn't already
> > > dirty, it will modify without the COW action. Again, please contact
> > > Nicolas about this.
> >
> > I don't see how standard COW could not happen. The only difference with
> > a true write fault as if we used put_user() is that we bypassed the data
> > abort vector and the code to get the FAR value. Or am I missing
> > something?
>
> It's certainly not buggy in the way that I thought (and I believe rmk
> was thinking): you are checking pte_write, correctly within the lock,
> so COW shouldn't come into it at all - it'll only work if the page is
> already writable by the user.
>
> But then I'm puzzled by your reply, saying you don't see how standard
> COW could not happen.

NO. What I'm saying is that if ever the page is _not_ writable (be it
absent or read-only) then do_DataAbort() is called, just as if we had
taken a real access fault from actually writing to the page. What happens
at that point is therefore exactly the same as for a real fault (either
COW, paging from swap, process killed, or whatever).

> Plus it seems a serious limitation: mightn't this be an area of executable
> text that it has to write into, but is most likely readonly? Or an area
> of data made readonly by fork? And is the alignment assured, that the
> long will fit in one page only?

Atomic operations are expected to be performed on an aligned word. This
code is only one way (and actually there might be only one person on the
planet actually needing it at the moment) but all the other much more common
and faster ways would either succeed due to the alignment trap or fail
due to the alignment trap (the alignment trap doesn't fix up misaligned
accesses from ldrex/strex on ARMv6). So in practice only RMK might be
exercising that code. If it becomes necessary for more configs then we
could simply SIGBUS on a misaligned pointer.

> The better way to do it, I think, would be to use ptrace's
> access_process_vm (it is our own mm, but that's okay).

This is IMHO way too much overkill for an operation that should be only
a few assembly instructions in user space normally. This really should
remain as lightweight as possible (and used only when there is
absolutely no other way).


Nicolas

2005-10-25 15:00:11

by Nicolas Pitre

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Tue, 25 Oct 2005, Russell King wrote:

> On Mon, Oct 24, 2005 at 10:45:04PM -0400, Nicolas Pitre wrote:
> > On Sat, 22 Oct 2005, Russell King wrote:
> > > Please contact Nicolas Pitre about that - that was my suggestion,
> > > but ISTR apparently the overhead is too high.
> >
> > Going through a kernel buffer will simply double the overhead. Let's
> > suppose it should not be a big enough issue to stop the patch from being
> > merged though (and it looks cleaner that way). However I'd like for the
> > WARN_ON((unsigned long)frame & 7) to remain as both the kernel and user
> > buffers should be 64-bit aligned.
>
> The WARN_ON is pointless because we guarantee that the stack is always
> 64-bit aligned on signal handler setup and return.

Sure, but the iWMMXt context is stored after the standard sigcontext,
whose size must also be a multiple of 64 bits (which might not always be
the case if things change in the structure or in its padding).
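
Illustrative only, not a proposed patch: the invariant being relied on
here could be pinned down with a compile-time check in the signal frame
setup path, so a future padding change breaks the build rather than the
64-bit alignment of the iWMMXt area.

	/* in e.g. the signal frame setup function (in-function form of
	 * BUILD_BUG_ON from linux/kernel.h): fails to compile if struct
	 * sigcontext stops being a multiple of 8 bytes */
	BUILD_BUG_ON(sizeof(struct sigcontext) & 7);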

> > I don't see how standard COW could not happen. The only difference with
> > a true write fault as if we used put_user() is that we bypassed the data
> > abort vector and the code to get the FAR value. Or am I missing
> > something?
>
> pte_write() just says that the page _may_ be writable. It doesn't say
> that the MMU is programmed to allow writes. If pte_dirty() doesn't
> return true, that means that the page is _not_ writable from userspace.

Argh... So it would suffice to s/pte_write/pte_dirty/, I'd guess?


Nicolas

2005-10-26 00:20:26

by Russell King

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Tue, Oct 25, 2005 at 11:00:09AM -0400, Nicolas Pitre wrote:
> On Tue, 25 Oct 2005, Russell King wrote:
>
> > On Mon, Oct 24, 2005 at 10:45:04PM -0400, Nicolas Pitre wrote:
> > > On Sat, 22 Oct 2005, Russell King wrote:
> > > > Please contact Nicolas Pitre about that - that was my suggestion,
> > > > but ISTR apparently the overhead is too high.
> > >
> > > Going through a kernel buffer will simply double the overhead. Let's
> > > suppose it should not be a big enough issue to stop the patch from being
> > > merged though (and it looks cleaner that way). However I'd like for the
> > > WARN_ON((unsigned long)frame & 7) to remain as both the kernel and user
> > > buffers should be 64-bit aligned.
> >
> > The WARN_ON is pointless because we guarantee that the stack is always
> > 64-bit aligned on signal handler setup and return.
>
> Sure, but the iWMMXt context is stored after the standard sigcontext
> which also must be 64 bits in size (which might not be always the case
> if things change in the structure or in its padding).
>
> > > I don't see how standard COW could not happen. The only difference with
> > > a true write fault as if we used put_user() is that we bypassed the data
> > > abort vector and the code to get the FAR value. Or am I missing
> > > something?
> >
> > pte_write() just says that the page _may_ be writable. It doesn't say
> > that the MMU is programmed to allow writes. If pte_dirty() doesn't
> > return true, that means that the page is _not_ writable from userspace.
>
> Argh... So only suffice to s/pte_write/pte_dirty/ I'd guess?

No. If we're emulating a cmpxchg() on a clean BSS page, this code
as it stands today will write to the zero page making it a page which
is mostly zero. Bad news when it's mapped into other processes BSS
pages.

Changing this for pte_dirty() means that we'll refuse to do a cmpxchg()
on a clean BSS page. The data may compare correctly, but because it
isn't already dirty, you'll fail.

If we still had it, I'd say you need to use verify_area() to tell the
kernel to pre-COW the pages. However, that got removed a while back.

For the benefit of others, this is a bit of a bugger since you can't
use put_user() - the read + compare + write needs to be 100% atomic
and put_user() isn't - it may fault and hence we may switch contexts.

I don't have any ideas at present, but maybe that's because it's
getting on for 1:30am. 8/

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core

2005-10-26 01:26:35

by Nicolas Pitre

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Wed, 26 Oct 2005, Russell King wrote:

> On Tue, Oct 25, 2005 at 11:00:09AM -0400, Nicolas Pitre wrote:
>
> > Argh... So only suffice to s/pte_write/pte_dirty/ I'd guess?
>
> No. If we're emulating a cmpxchg() on a clean BSS page, this code
> as it stands today will write to the zero page making it a page which
> is mostly zero. Bad news when it's mapped into other processes BSS
> pages.
>
> Changing this for pte_dirty() means that we'll refuse to do a cmpxchg()
> on a clean BSS page. The data may compare correctly, but because it
> isn't already dirty, you'll fail.

Does it matter? I'd say no. OK we _know_ that it should have worked,
but user space must be ready to deal with cmpxchg failing and reattempt
the operation. It never cares about the reason for which it might have
failed in all the cases I've seen. The second time, the page will have
been marked dirty and things will proceed normally.

This is of course not perfect but should be pretty damn good enough
given that the number of machine instances on this planet that _might_
need to rely on that code are truly non-standard (i.e. pre-ARMv6 SMP)
and can be counted on my fingers. The only other "solution" is to
always return failure and simply deny NPTL support for those classes of
machines.
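
The retry contract relied on here is already how the helper gets used from
user space. A minimal user-side sketch follows; the helper's fixed address
and zero-on-success return follow the documented ARM kuser helper ABI, while
the wrapper function itself is purely illustrative:

	/*
	 * Minimal userland sketch of the retry loop around the ARM kuser
	 * cmpxchg helper in the vector page.  __kernel_cmpxchg returns zero
	 * when *ptr was updated and non-zero otherwise, and callers loop on
	 * failure - so a spurious failure on a still-clean page simply costs
	 * one extra pass, after which the page is dirty and the operation
	 * succeeds.
	 */
	typedef int (__kernel_cmpxchg_t)(int oldval, int newval, volatile int *ptr);
	#define __kernel_cmpxchg (*(__kernel_cmpxchg_t *)0xffff0fc0)

	static int atomic_add_return_user(volatile int *counter, int inc)
	{
		int old, new;

		do {
			old = *counter;
			new = old + inc;
			/* retried on any failure, whatever the kernel's reason was */
		} while (__kernel_cmpxchg(old, new, counter) != 0);

		return new;
	}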


Nicolas

2005-10-31 22:19:23

by Jesper Juhl

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On 10/26/05, Russell King <[email protected]> wrote:
> On Tue, Oct 25, 2005 at 11:00:09AM -0400, Nicolas Pitre wrote:
> > On Tue, 25 Oct 2005, Russell King wrote:
> >
> > > On Mon, Oct 24, 2005 at 10:45:04PM -0400, Nicolas Pitre wrote:
> > > > On Sat, 22 Oct 2005, Russell King wrote:
> > > > > Please contact Nicolas Pitre about that - that was my suggestion,
> > > > > but ISTR apparently the overhead is too high.
> > > >
> > > > Going through a kernel buffer will simply double the overhead. Let's
> > > > suppose it should not be a big enough issue to stop the patch from being
> > > > merged though (and it looks cleaner that way). However I'd like for the
> > > > WARN_ON((unsigned long)frame & 7) to remain as both the kernel and user
> > > > buffers should be 64-bit aligned.
> > >
> > > The WARN_ON is pointless because we guarantee that the stack is always
> > > 64-bit aligned on signal handler setup and return.
> >
> > Sure, but the iWMMXt context is stored after the standard sigcontext,
> > whose size must also be a multiple of 64 bits (which might not always be
> > the case if things change in the structure or in its padding).
> >
> > > > I don't see how standard COW could not happen. The only difference from
> > > > a true write fault, as if we had used put_user(), is that we bypassed the
> > > > data abort vector and the code to get the FAR value. Or am I missing
> > > > something?
> > >
> > > pte_write() just says that the page _may_ be writable. It doesn't say
> > > that the MMU is programmed to allow writes. If pte_dirty() doesn't
> > > return true, that means that the page is _not_ writable from userspace.
> >
> > Argh... So it would suffice to just s/pte_write/pte_dirty/ then, I'd guess?
>
> No. If we're emulating a cmpxchg() on a clean BSS page, this code
> as it stands today will write to the zero page making it a page which
> > is mostly zero. Bad news when it's mapped into other processes' BSS
> pages.
>
> Changing this for pte_dirty() means that we'll refuse to do a cmpxchg()
> on a clean BSS page. The data may compare correctly, but because it
> isn't already dirty, you'll fail.
>
> If we still had it, I'd say you need to use verify_area() to tell the
> kernel to pre-COW the pages. However, that got removed a while back.
>

Yes, I removed verify_area() since it was just a wrapper for access_ok().
If verify_area() was/is needed, then access_ok() should be just fine
as a replacement as far as I can see.

--
Jesper Juhl <[email protected]>
Don't top-post http://www.catb.org/~esr/jargon/html/T/top-post.html
Plain text mails only, please http://www.expita.com/nomime.html

2005-10-31 22:27:38

by Russell King

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Mon, Oct 31, 2005 at 11:19:21PM +0100, Jesper Juhl wrote:
> On 10/26/05, Russell King <[email protected]> wrote:
> > No. If we're emulating a cmpxchg() on a clean BSS page, this code
> > as it stands today will write to the zero page making it a page which
> > is mostly zero. Bad news when it's mapped into other processes' BSS
> > pages.
> >
> > Changing this for pte_dirty() means that we'll refuse to do a cmpxchg()
> > on a clean BSS page. The data may compare correctly, but because it
> > isn't already dirty, you'll fail.
> >
> > If we still had it, I'd say you need to use verify_area() to tell the
> > kernel to pre-COW the pages. However, that got removed a while back.
> >
>
> Yes, I removed verify_area() since it was just a wrapper for access_ok().
> If verify_area() was/is needed, then access_ok() should be just fine
> as a replacement as far as I can see.

Except verify_area() would pre-fault the pages in whereas access_ok()
just verifies that the address is a user page. That's quite important
in this case because in order to fault the page in, we need to use
put_user() to get the permission checking correct.

However, we can't use put_user() because then the cmpxchg emulation
becomes completely non-atomic.
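
For reference, access_ok() on ARM in this era is nothing more than a range
check against the current segment limit; paraphrased (not a verbatim copy
of include/asm-arm/uaccess.h), it amounts to:

	/*
	 * Paraphrase of the ARM access_ok() of this era: a pure address-range
	 * check.  No page table entry is examined and no fault is taken, so a
	 * clean copy-on-write page is still clean after access_ok() succeeds -
	 * which is why it cannot stand in for the old pre-faulting behaviour.
	 */
	#define access_ok(type, addr, size)	(__range_ok(addr, size) == 0)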

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core

2005-10-31 22:34:42

by Jesper Juhl

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On 10/31/05, Russell King <[email protected]> wrote:
> On Mon, Oct 31, 2005 at 11:19:21PM +0100, Jesper Juhl wrote:
> > On 10/26/05, Russell King <[email protected]> wrote:
> > > No. If we're emulating a cmpxchg() on a clean BSS page, this code
> > > as it stands today will write to the zero page making it a page which
> > > is mostly zero. Bad news when it's mapped into other processes' BSS
> > > pages.
> > >
> > > Changing this for pte_dirty() means that we'll refuse to do a cmpxchg()
> > > on a clean BSS page. The data may compare correctly, but because it
> > > isn't already dirty, you'll fail.
> > >
> > > If we still had it, I'd say you need to use verify_area() to tell the
> > > kernel to pre-COW the pages. However, that got removed a while back.
> > >
> >
> > Yes, I removed verify_area() since it was just a wrapper for access_ok().
> > If verify_area() was/is needed, then access_ok() should be just fine
> > as a replacement as far as I can see.
>
> Except verify_area() would pre-fault the pages in whereas access_ok()
> just verifies that the address is a user page. That's quite important
> in this case because in order to fault the page in, we need to use
> put_user() to get the permission checking correct.
>
> However, we can't use put_user() because then the cmpxchg emulation
> becomes completely non-atomic.
>
Colour me stupid, but I don't see how that can be.

Looking at verify_area() from 2.6.13 - the arm version
(http://sosdg.org/~coywolf/lxr/source/include/asm-arm/uaccess.h?v=2.6.13#L81):

static inline int __deprecated
verify_area(int type, const void __user *addr, unsigned long size)
{
	return access_ok(type, addr, size) ? 0 : -EFAULT;
}

How will this cause pre-faulting if a call to access_ok() will not?
Please enlighten me.


--
Jesper Juhl <[email protected]>
Don't top-post http://www.catb.org/~esr/jargon/html/T/top-post.html
Plain text mails only, please http://www.expita.com/nomime.html

2005-10-31 22:50:19

by Russell King

[permalink] [raw]
Subject: Re: [PATCH 2/9] mm: arm ready for split ptlock

On Mon, Oct 31, 2005 at 11:34:39PM +0100, Jesper Juhl wrote:
> On 10/31/05, Russell King <[email protected]> wrote:
> > On Mon, Oct 31, 2005 at 11:19:21PM +0100, Jesper Juhl wrote:
> > > Yes, I removed verify_area() since it was just a wrapper for access_ok().
> > > If verify_area() was/is needed, then access_ok() should be just fine
> > > as a replacement as far as I can see.
> >
> > Except verify_area() would pre-fault the pages in whereas access_ok()
> > just verifies that the address is a user page. That's quite important
> > in this case because in order to fault the page in, we need to use
> > put_user() to get the permission checking correct.
> >
> > However, we can't use put_user() because then the cmpxchg emulation
> > becomes completely non-atomic.
> >
> Colour me stupid, but I don't see how that can be.
>
> Looking at verify_area() from 2.6.13 - the arm version
> (http://sosdg.org/~coywolf/lxr/source/include/asm-arm/uaccess.h?v=2.6.13#L81):
>
> static inline int __deprecated
> verify_area(int type, const void __user *addr, unsigned long size)
> {
> 	return access_ok(type, addr, size) ? 0 : -EFAULT;
> }
>
> How will this cause pre-faulting if a call to access_ok() will not?
> Please enlighten me.

That's true of the verify_area() you're looking at - we haven't had this
requirement up until the introduction of this code. I am referring to
functionality which was present a while back in the kernel: the old
version of verify_area() which, when passed a VERIFY_WRITE argument,
would pre-fault the pages in, as was required for some early i386 CPUs
(which couldn't trap supervisor-mode writes to read-only user pages).
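
For readers wondering what "pre-faulting" would mean in practice, a
hypothetical helper along the lines below (not the historical verify_area()
code, and not something proposed in this thread) forces the COW break with
full permission checking before any must-not-fault code runs. Note that the
read-back-and-write touch is itself racy against concurrent writers, which
is one more reason it cannot simply be folded into the cmpxchg emulation:

	/*
	 * Hypothetical pre-faulting helper, for illustration only.  Touch one
	 * byte per page with get_user()/put_user() so that permission checks
	 * run and copy-on-write is broken up front.  Assumes the range does
	 * not straddle a page boundary awkwardly (a single aligned word, as
	 * in the cmpxchg case, is the intended use).
	 */
	#include <linux/errno.h>
	#include <linux/mm.h>
	#include <asm/uaccess.h>

	static int prefault_writable(void __user *uaddr, size_t len)
	{
		char __user *p = uaddr;
		char __user *end = p + len;
		char c;

		while (p < end) {
			if (get_user(c, p) || put_user(c, p))
				return -EFAULT;
			p += PAGE_SIZE;		/* one touch per page is enough */
		}
		return 0;
	}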

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core