2013-09-05 21:28:30

by Naoya Horiguchi

Subject: [PATCH 0/2 v3] split page table lock for hugepage

I revised the split ptl patchset with small fixes.
See also the previous post [1] for the motivation and the numbers.

Any comments and reviews are welcome.

[1] http://thread.gmane.org/gmane.linux.kernel.mm/106292/

Thanks,
Naoya Horiguchi
---
Summary:

Naoya Horiguchi (2):
hugetlbfs: support split page table lock
thp: support split page table lock

arch/powerpc/mm/pgtable_64.c | 8 +-
arch/s390/mm/pgtable.c | 4 +-
arch/sparc/mm/tlb.c | 4 +-
fs/proc/task_mmu.c | 17 +++--
include/linux/huge_mm.h | 11 +--
include/linux/hugetlb.h | 20 +++++
include/linux/mm.h | 3 +
include/linux/mm_types.h | 2 +
mm/huge_memory.c | 171 ++++++++++++++++++++++++++-----------------
mm/hugetlb.c | 92 ++++++++++++++---------
mm/memcontrol.c | 14 ++--
mm/memory.c | 15 ++--
mm/mempolicy.c | 5 +-
mm/migrate.c | 12 +--
mm/mprotect.c | 5 +-
mm/pgtable-generic.c | 10 +--
mm/rmap.c | 13 ++--
17 files changed, 246 insertions(+), 160 deletions(-)


2013-09-05 21:28:32

by Naoya Horiguchi

Subject: [PATCH 1/2] hugetlbfs: support split page table lock

Currently all page table handling in the hugetlbfs code is done under
mm->page_table_lock, so when a process has many threads that heavily
access its memory, lock contention arises and hurts performance.

This patch makes hugepages support split page table lock, so that we use
the page->ptl of the leaf node of the page table tree, which is the pte
page for normal pages but can be a pmd and/or pud page for hugepages on
some architectures.
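
As an illustration (the walker below is hypothetical and not part of the
patch), callers switch from serializing on mm->page_table_lock to taking
the per-page lock returned by the new huge_pte_offset_lock() helper:

	/*
	 * Illustration only: iterate over hugepage entries, taking the
	 * split ptl for each entry instead of mm->page_table_lock.
	 * No lock is held when huge_pte_offset_lock() returns NULL.
	 */
	static void walk_hugepages(struct mm_struct *mm, unsigned long start,
				   unsigned long end, unsigned long sz)
	{
		unsigned long address;

		for (address = start; address < end; address += sz) {
			spinlock_t *ptl;
			pte_t *ptep = huge_pte_offset_lock(mm, address, &ptl);

			if (!ptep)
				continue;
			/* ... read or update *ptep here ... */
			spin_unlock(ptl);
		}
	}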

ChangeLog v3:
- disable split ptl for ppc with USE_SPLIT_PTLOCKS_HUGETLB.
- remove the replacement in some architecture-dependent code. This is
justified because an allocation of a pgd/pud/pmd/pte entry can only race
with other allocations, not with read/write accesses, so a different lock
can be used.
http://thread.gmane.org/gmane.linux.kernel.mm/106292/focus=106458

ChangeLog v2:
- add split ptl on other archs missed in v1
- drop changes on arch/{powerpc,tile}/mm/hugetlbpage.c

Signed-off-by: Naoya Horiguchi <[email protected]>
---
include/linux/hugetlb.h | 20 +++++++++++
include/linux/mm_types.h | 2 ++
mm/hugetlb.c | 92 +++++++++++++++++++++++++++++-------------------
mm/mempolicy.c | 5 +--
mm/migrate.c | 4 +--
mm/rmap.c | 2 +-
6 files changed, 84 insertions(+), 41 deletions(-)

diff --git v3.11-rc3.orig/include/linux/hugetlb.h v3.11-rc3/include/linux/hugetlb.h
index 0393270..5cb8a4e 100644
--- v3.11-rc3.orig/include/linux/hugetlb.h
+++ v3.11-rc3/include/linux/hugetlb.h
@@ -80,6 +80,24 @@ extern const unsigned long hugetlb_zero, hugetlb_infinity;
extern int sysctl_hugetlb_shm_group;
extern struct list_head huge_boot_pages;

+#if USE_SPLIT_PTLOCKS_HUGETLB
+#define huge_pte_lockptr(mm, ptep) ({__pte_lockptr(virt_to_page(ptep)); })
+#else /* !USE_SPLIT_PTLOCKS_HUGETLB */
+#define huge_pte_lockptr(mm, ptep) ({&(mm)->page_table_lock; })
+#endif /* USE_SPLIT_PTLOCKS_HUGETLB */
+
+#define huge_pte_offset_lock(mm, address, ptlp) \
+({ \
+ pte_t *__pte = huge_pte_offset(mm, address); \
+ spinlock_t *__ptl = NULL; \
+ if (__pte) { \
+ __ptl = huge_pte_lockptr(mm, __pte); \
+ *(ptlp) = __ptl; \
+ spin_lock(__ptl); \
+ } \
+ __pte; \
+})
+
/* arch callbacks */

pte_t *huge_pte_alloc(struct mm_struct *mm,
@@ -164,6 +182,8 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
BUG();
}

+#define huge_pte_lockptr(mm, ptep) 0
+
#endif /* !CONFIG_HUGETLB_PAGE */

#define HUGETLB_ANON_FILE "anon_hugepage"
diff --git v3.11-rc3.orig/include/linux/mm_types.h v3.11-rc3/include/linux/mm_types.h
index fb425aa..cfb8c6f 100644
--- v3.11-rc3.orig/include/linux/mm_types.h
+++ v3.11-rc3/include/linux/mm_types.h
@@ -24,6 +24,8 @@
struct address_space;

#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
+#define USE_SPLIT_PTLOCKS_HUGETLB \
+ (USE_SPLIT_PTLOCKS && !defined(CONFIG_PPC))

/*
* Each physical page in the system has a struct page associated with
diff --git v3.11-rc3.orig/mm/hugetlb.c v3.11-rc3/mm/hugetlb.c
index 0f8da6b..b2dbca4 100644
--- v3.11-rc3.orig/mm/hugetlb.c
+++ v3.11-rc3/mm/hugetlb.c
@@ -2372,6 +2372,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+ spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
@@ -2383,8 +2384,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (dst_pte == src_pte)
continue;

- spin_lock(&dst->page_table_lock);
- spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
+ dst_ptl = huge_pte_lockptr(dst, dst_pte);
+ src_ptl = huge_pte_lockptr(src, src_pte);
+ spin_lock(dst_ptl);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
if (!huge_pte_none(huge_ptep_get(src_pte))) {
if (cow)
huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -2394,8 +2397,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
- spin_unlock(&src->page_table_lock);
- spin_unlock(&dst->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
}
return 0;

@@ -2438,6 +2441,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address;
pte_t *ptep;
pte_t pte;
+ spinlock_t *ptl;
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -2451,25 +2455,24 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
again:
- spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
- ptep = huge_pte_offset(mm, address);
+ ptep = huge_pte_offset_lock(mm, address, &ptl);
if (!ptep)
continue;

if (huge_pmd_unshare(mm, &address, ptep))
- continue;
+ goto unlock;

pte = huge_ptep_get(ptep);
if (huge_pte_none(pte))
- continue;
+ goto unlock;

/*
* HWPoisoned hugepage is already unmapped and dropped reference
*/
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
huge_pte_clear(mm, address, ptep);
- continue;
+ goto unlock;
}

page = pte_page(pte);
@@ -2480,7 +2483,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
*/
if (ref_page) {
if (page != ref_page)
- continue;
+ goto unlock;

/*
* Mark the VMA as having unmapped its page so that
@@ -2497,13 +2500,18 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,

page_remove_rmap(page);
force_flush = !__tlb_remove_page(tlb, page);
- if (force_flush)
+ if (force_flush) {
+ spin_unlock(ptl);
break;
+ }
/* Bail out after unmapping reference page if supplied */
- if (ref_page)
+ if (ref_page) {
+ spin_unlock(ptl);
break;
+ }
+unlock:
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
/*
* mmu_gather ran out of room to batch pages, we break out of
* the PTE lock to avoid doing the potential expensive TLB invalidate
@@ -2617,6 +2625,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
int outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl = huge_pte_lockptr(mm, ptep);

old_page = pte_page(pte);

@@ -2648,7 +2657,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
page_cache_get(old_page);

/* Drop page_table_lock as buddy allocator may be called */
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
new_page = alloc_huge_page(vma, address, outside_reserve);

if (IS_ERR(new_page)) {
@@ -2666,7 +2675,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
BUG_ON(huge_pte_none(pte));
if (unmap_ref_private(mm, vma, old_page, address)) {
BUG_ON(huge_pte_none(pte));
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -2680,7 +2689,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
}

/* Caller expects lock to be held */
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (err == -ENOMEM)
return VM_FAULT_OOM;
else
@@ -2695,7 +2704,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
page_cache_release(new_page);
page_cache_release(old_page);
/* Caller expects lock to be held */
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
return VM_FAULT_OOM;
}

@@ -2710,7 +2719,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
* Retake the page_table_lock to check for racing updates
* before the page tables are altered
*/
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
@@ -2722,10 +2731,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
/* Make the old page be freed below */
new_page = old_page;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
/* Caller expects lock to be held */
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
page_cache_release(new_page);
page_cache_release(old_page);
return 0;
@@ -2775,6 +2784,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
struct address_space *mapping;
pte_t new_pte;
+ spinlock_t *ptl;

/*
* Currently, we are forced to kill the process in the event the
@@ -2860,7 +2870,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto backout_unlocked;
}

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pte_lockptr(mm, ptep);
+ spin_lock(ptl);
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
@@ -2882,13 +2893,13 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
}

- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
unlock_page(page);
out:
return ret;

backout:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
put_page(page);
@@ -2900,6 +2911,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
{
pte_t *ptep;
pte_t entry;
+ spinlock_t *ptl;
int ret;
struct page *page = NULL;
struct page *pagecache_page = NULL;
@@ -2968,7 +2980,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (page != pagecache_page)
lock_page(page);

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pte_lockptr(mm, ptep);
+ spin_lock(ptl);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
goto out_page_table_lock;
@@ -2988,7 +3001,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);

out_page_table_lock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

if (pagecache_page) {
unlock_page(pagecache_page);
@@ -3014,9 +3027,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);

- spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
+ spinlock_t *ptl = NULL;
int absent;
struct page *page;

@@ -3024,8 +3037,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
+ *
+ * Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+ pte = huge_pte_offset_lock(mm, vaddr & huge_page_mask(h), &ptl);
absent = !pte || huge_pte_none(huge_ptep_get(pte));

/*
@@ -3037,6 +3052,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (absent && (flags & FOLL_DUMP) &&
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
+ if (pte)
+ spin_unlock(ptl);
remainder = 0;
break;
}
@@ -3056,10 +3073,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
!huge_pte_write(huge_ptep_get(pte)))) {
int ret;

- spin_unlock(&mm->page_table_lock);
+ if (pte)
+ spin_unlock(ptl);
ret = hugetlb_fault(mm, vma, vaddr,
(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
- spin_lock(&mm->page_table_lock);
if (!(ret & VM_FAULT_ERROR))
continue;

@@ -3090,8 +3107,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
goto same_page;
}
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
*nr_pages = remainder;
*position = vaddr;

@@ -3112,13 +3129,14 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, address, end);

mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
- spin_lock(&mm->page_table_lock);
for (; address < end; address += huge_page_size(h)) {
- ptep = huge_pte_offset(mm, address);
+ spinlock_t *ptl;
+ ptep = huge_pte_offset_lock(mm, address, &ptl);
if (!ptep)
continue;
if (huge_pmd_unshare(mm, &address, ptep)) {
pages++;
+ spin_unlock(ptl);
continue;
}
if (!huge_pte_none(huge_ptep_get(ptep))) {
@@ -3128,8 +3146,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
set_huge_pte_at(mm, address, ptep, pte);
pages++;
}
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
/*
* Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
@@ -3292,6 +3310,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
unsigned long saddr;
pte_t *spte = NULL;
pte_t *pte;
+ spinlock_t *ptl;

if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3314,13 +3333,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!spte)
goto out;

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pte_lockptr(mm, spte);
+ spin_lock(ptl);
if (pud_none(*pud))
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
else
put_page(virt_to_page(spte));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
mutex_unlock(&mapping->i_mmap_mutex);
@@ -3334,7 +3354,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * called with vma->vm_mm->page_table_lock held.
+ * called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
diff --git v3.11-rc3.orig/mm/mempolicy.c v3.11-rc3/mm/mempolicy.c
index 838d022..a9de55f 100644
--- v3.11-rc3.orig/mm/mempolicy.c
+++ v3.11-rc3/mm/mempolicy.c
@@ -522,8 +522,9 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
#ifdef CONFIG_HUGETLB_PAGE
int nid;
struct page *page;
+ spinlock_t *ptl = huge_pte_lockptr(vma->vm_mm, (pte_t *)pmd);

- spin_lock(&vma->vm_mm->page_table_lock);
+ spin_lock(ptl);
page = pte_page(huge_ptep_get((pte_t *)pmd));
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -533,7 +534,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
isolate_huge_page(page, private);
unlock:
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
#else
BUG();
#endif
diff --git v3.11-rc3.orig/mm/migrate.c v3.11-rc3/mm/migrate.c
index 61f14a1..c69a9c7 100644
--- v3.11-rc3.orig/mm/migrate.c
+++ v3.11-rc3/mm/migrate.c
@@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, addr);
if (!ptep)
goto out;
- ptl = &mm->page_table_lock;
+ ptl = huge_pte_lockptr(mm, ptep);
} else {
pmd = mm_find_pmd(mm, addr);
if (!pmd)
@@ -249,7 +249,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,

void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
{
- spinlock_t *ptl = &(mm)->page_table_lock;
+ spinlock_t *ptl = huge_pte_lockptr(mm, pte);
__migration_entry_wait(mm, pte, ptl);
}

diff --git v3.11-rc3.orig/mm/rmap.c v3.11-rc3/mm/rmap.c
index c56ba98..eccec58 100644
--- v3.11-rc3.orig/mm/rmap.c
+++ v3.11-rc3/mm/rmap.c
@@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,

if (unlikely(PageHuge(page))) {
pte = huge_pte_offset(mm, address);
- ptl = &mm->page_table_lock;
+ ptl = huge_pte_lockptr(mm, pte);
goto check;
}

--
1.8.3.1

2013-09-05 21:28:56

by Naoya Horiguchi

Subject: [PATCH 2/2] thp: support split page table lock

THP-related code also uses the per-process mm->page_table_lock now,
so making the locking finer-grained can provide better performance.

This patch adds split page table lock support to THP by using the
page->ptl of the pages that store "pmd_trans_huge" pmds.

Some functions, such as pmd_trans_huge_lock() and page_check_address_pmd(),
are expected to pass the ptl pointer back to their callers, so this patch
adds a new argument to them for that purpose. Other than that, this patch
is a straightforward replacement.
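
To illustrate the new calling convention (the function below is a made-up
example, not code from this patch): a user of pmd_trans_huge_lock() now
receives the ptl through the extra argument and must unlock that, rather
than mm->page_table_lock, when the function returns 1:

	/* Illustration only: typical pagewalk-style user of the new API. */
	static int walk_one_pmd(pmd_t *pmd, struct vm_area_struct *vma)
	{
		spinlock_t *ptl;

		if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
			/* ... handle the whole huge pmd under ptl ... */
			spin_unlock(ptl);
			return 0;
		}
		/* not a stable huge pmd: fall back to pte-level handling */
		return 1;
	}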

ChangeLog v3:
- fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
- added missing declaration of ptl in do_huge_pmd_anonymous_page()

Signed-off-by: Naoya Horiguchi <[email protected]>
---
arch/powerpc/mm/pgtable_64.c | 8 +-
arch/s390/mm/pgtable.c | 4 +-
arch/sparc/mm/tlb.c | 4 +-
fs/proc/task_mmu.c | 17 +++--
include/linux/huge_mm.h | 11 +--
include/linux/mm.h | 3 +
mm/huge_memory.c | 171 ++++++++++++++++++++++++++-----------------
mm/memcontrol.c | 14 ++--
mm/memory.c | 15 ++--
mm/migrate.c | 8 +-
mm/mprotect.c | 5 +-
mm/pgtable-generic.c | 10 +--
mm/rmap.c | 11 ++-
13 files changed, 162 insertions(+), 119 deletions(-)

diff --git v3.11-rc3.orig/arch/powerpc/mm/pgtable_64.c v3.11-rc3/arch/powerpc/mm/pgtable_64.c
index 536eec72..f9177eb 100644
--- v3.11-rc3.orig/arch/powerpc/mm/pgtable_64.c
+++ v3.11-rc3/arch/powerpc/mm/pgtable_64.c
@@ -605,7 +605,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,

#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(vma->vm_mm, pmdp));
#endif

#ifdef PTE_ATOMIC_UPDATES
@@ -643,7 +643,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/*
* we store the pgtable in the second half of PMD
*/
@@ -663,7 +663,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pgtable_t pgtable;
pgtable_t *pgtable_slot;

- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
pgtable = *pgtable_slot;
/*
@@ -687,7 +687,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
{
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_none(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
WARN_ON(!pmd_trans_huge(pmd));
#endif
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
diff --git v3.11-rc3.orig/arch/s390/mm/pgtable.c v3.11-rc3/arch/s390/mm/pgtable.c
index a8154a1..d6c6b5c 100644
--- v3.11-rc3.orig/arch/s390/mm/pgtable.c
+++ v3.11-rc3/arch/s390/mm/pgtable.c
@@ -1170,7 +1170,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
{
struct list_head *lh = (struct list_head *) pgtable;

- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));

/* FIFO */
if (!mm->pmd_huge_pte)
@@ -1186,7 +1186,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pgtable_t pgtable;
pte_t *ptep;

- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));

/* FIFO */
pgtable = mm->pmd_huge_pte;
diff --git v3.11-rc3.orig/arch/sparc/mm/tlb.c v3.11-rc3/arch/sparc/mm/tlb.c
index 7a91f28..cca3bed 100644
--- v3.11-rc3.orig/arch/sparc/mm/tlb.c
+++ v3.11-rc3/arch/sparc/mm/tlb.c
@@ -193,7 +193,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
{
struct list_head *lh = (struct list_head *) pgtable;

- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));

/* FIFO */
if (!mm->pmd_huge_pte)
@@ -208,7 +208,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
struct list_head *lh;
pgtable_t pgtable;

- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));

/* FIFO */
pgtable = mm->pmd_huge_pte;
diff --git v3.11-rc3.orig/fs/proc/task_mmu.c v3.11-rc3/fs/proc/task_mmu.c
index dbf61f6..e23c882 100644
--- v3.11-rc3.orig/fs/proc/task_mmu.c
+++ v3.11-rc3/fs/proc/task_mmu.c
@@ -503,11 +503,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);

- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
@@ -980,10 +980,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
int err = 0;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ spinlock_t *uninitialized_var(ptl);

/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
- if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
int pmd_flags2;

pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
@@ -997,7 +998,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (err)
break;
}
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return err;
}

@@ -1276,13 +1277,13 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct numa_maps *md;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
pte_t *orig_pte;
pte_t *pte;

md = walk->private;

- if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;

@@ -1290,7 +1291,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}

diff --git v3.11-rc3.orig/include/linux/huge_mm.h v3.11-rc3/include/linux/huge_mm.h
index b60de92..1faf757 100644
--- v3.11-rc3.orig/include/linux/huge_mm.h
+++ v3.11-rc3/include/linux/huge_mm.h
@@ -54,7 +54,8 @@ enum page_check_address_pmd_flag {
extern pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag);
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl);

#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
@@ -133,14 +134,14 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long end,
long adjust_next);
extern int __pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma);
+ struct vm_area_struct *vma, spinlock_t **ptl);
/* mmap_sem must be held on entry */
static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, spinlock_t **ptl)
{
VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
if (pmd_trans_huge(*pmd))
- return __pmd_trans_huge_lock(pmd, vma);
+ return __pmd_trans_huge_lock(pmd, vma, ptl);
else
return 0;
}
@@ -219,7 +220,7 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
{
}
static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, spinlock_t **ptl)
{
return 0;
}
diff --git v3.11-rc3.orig/include/linux/mm.h v3.11-rc3/include/linux/mm.h
index f022460..9219f43 100644
--- v3.11-rc3.orig/include/linux/mm.h
+++ v3.11-rc3/include/linux/mm.h
@@ -1251,6 +1251,8 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
} while (0)
#define pte_lock_deinit(page) ((page)->mapping = NULL)
#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#define huge_pmd_lockptr(mm, pmdp) \
+ ({(void)(mm); __pte_lockptr(virt_to_page(pmdp)); })
#else /* !USE_SPLIT_PTLOCKS */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
@@ -1258,6 +1260,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
#define pte_lock_init(page) do {} while (0)
#define pte_lock_deinit(page) do {} while (0)
#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
+#define huge_pmd_lockptr(mm, pmdp) ({(void)(pmd); &(mm)->page_table_lock; })
#endif /* USE_SPLIT_PTLOCKS */

static inline void pgtable_page_ctor(struct page *page)
diff --git v3.11-rc3.orig/mm/huge_memory.c v3.11-rc3/mm/huge_memory.c
index 243e710..20fd1dd 100644
--- v3.11-rc3.orig/mm/huge_memory.c
+++ v3.11-rc3/mm/huge_memory.c
@@ -705,6 +705,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct page *page)
{
pgtable_t pgtable;
+ spinlock_t *ptl;

VM_BUG_ON(!PageCompound(page));
pgtable = pte_alloc_one(mm, haddr);
@@ -719,9 +720,10 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
__SetPageUptodate(page);

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_none(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(page);
put_page(page);
pte_free(mm, pgtable);
@@ -733,7 +735,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
set_pmd_at(mm, haddr, pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm->nr_ptes++;
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}

return 0;
@@ -761,6 +763,7 @@ static inline struct page *alloc_hugepage(int defrag)
}
#endif

+/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -795,6 +798,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pgtable_t pgtable;
struct page *zero_page;
bool set;
+ spinlock_t *ptl;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -804,10 +808,11 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
zero_page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
@@ -864,14 +869,17 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t pmd;
pgtable_t pgtable;
int ret;
+ spinlock_t *uninitialized_var(dst_ptl), *uninitialized_var(src_ptl);

ret = -ENOMEM;
pgtable = pte_alloc_one(dst_mm, addr);
if (unlikely(!pgtable))
goto out;

- spin_lock(&dst_mm->page_table_lock);
- spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+ dst_ptl = huge_pmd_lockptr(dst_mm, dst_pmd);
+ src_ptl = huge_pmd_lockptr(src_mm, src_pmd);
+ spin_lock(dst_ptl);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

ret = -EAGAIN;
pmd = *src_pmd;
@@ -880,7 +888,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
/*
- * mm->page_table_lock is enough to be sure that huge zero pmd is not
+ * When page table lock is held, the huge zero pmd should not be
* under splitting since we don't split the page itself, only pmd to
* a page table.
*/
@@ -901,8 +909,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
pte_free(dst_mm, pgtable);

wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
@@ -922,8 +930,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,

ret = 0;
out_unlock:
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
out:
return ret;
}
@@ -936,8 +944,9 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
{
pmd_t entry;
unsigned long haddr;
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);

- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto unlock;

@@ -947,7 +956,7 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
update_mmu_cache_pmd(vma, address, pmd);

unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}

static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
@@ -960,6 +969,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
int i, ret = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl;

page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!page) {
@@ -980,7 +990,8 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_page;

@@ -1007,7 +1018,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
inc_mm_counter(mm, MM_ANONPAGES);

@@ -1017,7 +1028,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
out:
return ret;
out_free_page:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_page(page);
put_page(page);
@@ -1037,6 +1048,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct page **pages;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl;

pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
GFP_KERNEL);
@@ -1077,7 +1089,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
@@ -1103,7 +1116,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

@@ -1114,7 +1127,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
return ret;

out_free_pages:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1134,12 +1147,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);

VM_BUG_ON(!vma->anon_vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;

@@ -1155,7 +1169,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_unlock;
}
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
@@ -1200,11 +1214,11 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (page)
put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
goto out_mn;
@@ -1225,13 +1239,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
return ret;
}

@@ -1240,11 +1254,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd_t *pmd,
unsigned int flags)
{
- struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;

- assert_spin_locked(&mm->page_table_lock);
-
if (flags & FOLL_WRITE && !pmd_write(*pmd))
goto out;

@@ -1295,8 +1306,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
int target_nid;
int current_nid = -1;
bool migrated;
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmdp);

- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;

@@ -1314,17 +1326,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
}

/* Acquire the page lock to serialise THP migrations */
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
lock_page(page);

/* Confirm the PTE did not change while locked */
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
goto out_unlock;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

/* Migrate the THP to the requested node */
migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1336,7 +1348,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;

check_same:
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
clear_pmdnuma:
@@ -1345,7 +1357,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (current_nid != -1)
task_numa_fault(current_nid, HPAGE_PMD_NR, false);
return 0;
@@ -1355,8 +1367,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);

- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
@@ -1371,7 +1384,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
@@ -1380,7 +1393,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
tlb_remove_page(tlb, page);
}
pte_free(tlb->mm, pgtable);
@@ -1394,13 +1407,14 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned char *vec)
{
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);

- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
ret = 1;
}
@@ -1415,6 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
{
int ret = 0;
pmd_t pmd;
+ spinlock_t *uninitialized_var(ptl);

struct mm_struct *mm = vma->vm_mm;

@@ -1433,12 +1448,12 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}

- ret = __pmd_trans_huge_lock(old_pmd, vma);
+ ret = __pmd_trans_huge_lock(old_pmd, vma, &ptl);
if (ret == 1) {
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
out:
return ret;
@@ -1449,8 +1464,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);

- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
if (!prot_numa) {
@@ -1466,7 +1482,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
}
set_pmd_at(mm, addr, pmd, entry);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
ret = 1;
}

@@ -1480,12 +1496,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
- spin_lock(&vma->vm_mm->page_table_lock);
+ *ptl = huge_pmd_lockptr(vma->vm_mm, pmd);
+ spin_lock(*ptl);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
} else {
@@ -1494,14 +1512,23 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
return 1;
}
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
return 0;
}

+/*
+ * This function returns whether a given @page is mapped onto the @address
+ * in the virtual space of @mm.
+ *
+ * When it's true, this function returns *pmd with holding the page table lock
+ * and passing it back to the caller via @ptl.
+ * If it's false, returns NULL without holding the page table lock.
+ */
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag)
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl)
{
pmd_t *pmd, *ret = NULL;

@@ -1511,10 +1538,12 @@ pmd_t *page_check_address_pmd(struct page *page,
pmd = mm_find_pmd(mm, address);
if (!pmd)
goto out;
+ *ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(*ptl);
if (pmd_none(*pmd))
- goto out;
+ goto unlock;
if (pmd_page(*pmd) != page)
- goto out;
+ goto unlock;
/*
* split_vma() may create temporary aliased mappings. There is
* no risk as long as all huge pmd are found and have their
@@ -1524,12 +1553,15 @@ pmd_t *page_check_address_pmd(struct page *page,
*/
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
pmd_trans_splitting(*pmd))
- goto out;
+ goto unlock;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
ret = pmd;
+ goto out;
}
+unlock:
+ spin_unlock(*ptl);
out:
return ret;
}
@@ -1541,14 +1573,15 @@ static int __split_huge_page_splitting(struct page *page,
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);
/* For mmu_notifiers */
const unsigned long mmun_start = address;
const unsigned long mmun_end = address + HPAGE_PMD_SIZE;

mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
if (pmd) {
/*
* We can't temporarily set the pmd to null in order
@@ -1559,8 +1592,8 @@ static int __split_huge_page_splitting(struct page *page,
*/
pmdp_splitting_flush(vma, address, pmd);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

return ret;
@@ -1694,10 +1727,10 @@ static int __split_huge_page_map(struct page *page,
int ret = 0, i;
pgtable_t pgtable;
unsigned long haddr;
+ spinlock_t *uninitialized_var(ptl);

- spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
if (pmd) {
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -1752,8 +1785,8 @@ static int __split_huge_page_map(struct page *page,
pmdp_invalidate(vma, address, pmd);
pmd_populate(mm, pmd, pgtable);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);

return ret;
}
@@ -2314,7 +2347,7 @@ static void collapse_huge_page(struct mm_struct *mm,
mmun_start = address;
mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock); /* probably unnecessary */
+ spin_lock(ptl); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
* any huge TLB entry from the CPU so we won't allow
@@ -2322,7 +2355,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* to avoid the risk of CPU bugs in that area.
*/
_pmd = pmdp_clear_flush(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

spin_lock(ptl);
@@ -2331,7 +2364,7 @@ static void collapse_huge_page(struct mm_struct *mm,

if (unlikely(!isolated)) {
pte_unmap(pte);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
BUG_ON(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
@@ -2339,7 +2372,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* points to regular pagetables. Use pmd_populate for that
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
anon_vma_unlock_write(vma->anon_vma);
goto out;
}
@@ -2364,13 +2397,13 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
smp_wmb();

- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

*hpage = NULL;

@@ -2698,6 +2731,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
struct page *page;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
+ spinlock_t *ptl;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */

@@ -2706,22 +2740,23 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

split_huge_page(page);
diff --git v3.11-rc3.orig/mm/memcontrol.c v3.11-rc3/mm/memcontrol.c
index 00a7a66..3949444 100644
--- v3.11-rc3.orig/mm/memcontrol.c
+++ v3.11-rc3/mm/memcontrol.c
@@ -6591,12 +6591,12 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
{
struct vm_area_struct *vma = walk->private;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);

- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}

@@ -6769,7 +6769,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
int ret = 0;
struct vm_area_struct *vma = walk->private;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
enum mc_target_type target_type;
union mc_target target;
struct page *page;
@@ -6785,9 +6785,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6804,7 +6804,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}

diff --git v3.11-rc3.orig/mm/memory.c v3.11-rc3/mm/memory.c
index aa3b994..8c97ef0 100644
--- v3.11-rc3.orig/mm/memory.c
+++ v3.11-rc3/mm/memory.c
@@ -1529,20 +1529,21 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
wait_split_huge_page(vma->anon_vma, pmd);
} else {
page = follow_trans_huge_pmd(vma, address,
pmd, flags);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*page_mask = HPAGE_PMD_NR - 1;
goto out;
}
} else
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
/* fall through */
}
split_fallthrough:
@@ -3607,17 +3608,17 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *pte, *orig_pte;
unsigned long _addr = addr & PMD_MASK;
unsigned long offset;
- spinlock_t *ptl;
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmdp);
bool numa = false;
int local_nid = numa_node_id();

- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
pmd = *pmdp;
if (pmd_numa(pmd)) {
set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
numa = true;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

if (!numa)
return 0;
diff --git v3.11-rc3.orig/mm/migrate.c v3.11-rc3/mm/migrate.c
index c69a9c7..1e1e9f2 100644
--- v3.11-rc3.orig/mm/migrate.c
+++ v3.11-rc3/mm/migrate.c
@@ -1659,6 +1659,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
struct page *new_page = NULL;
struct mem_cgroup *memcg = NULL;
int page_lru = page_is_file_cache(page);
+ spinlock_t *ptl;

/*
* Don't migrate pages that are mapped in multiple processes.
@@ -1699,9 +1700,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
WARN_ON(PageLRU(new_page));

/* Recheck the target PMD */
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, entry))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -1746,7 +1748,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
* before it's fully transferred to the new page.
*/
mem_cgroup_end_migration(memcg, page, new_page, true);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);

unlock_page(new_page);
unlock_page(page);
diff --git v3.11-rc3.orig/mm/mprotect.c v3.11-rc3/mm/mprotect.c
index 94722a4..c65c390 100644
--- v3.11-rc3.orig/mm/mprotect.c
+++ v3.11-rc3/mm/mprotect.c
@@ -116,9 +116,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
pmd_t *pmd)
{
- spin_lock(&mm->page_table_lock);
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
#else
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
diff --git v3.11-rc3.orig/mm/pgtable-generic.c v3.11-rc3/mm/pgtable-generic.c
index e1a6e4f..8e49928 100644
--- v3.11-rc3.orig/mm/pgtable-generic.c
+++ v3.11-rc3/mm/pgtable-generic.c
@@ -124,11 +124,10 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* The caller must hold page table lock */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
- assert_spin_locked(&mm->page_table_lock);
-
/* FIFO */
if (!mm->pmd_huge_pte)
INIT_LIST_HEAD(&pgtable->lru);
@@ -141,13 +140,14 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/* no "address" argument so destroys page coloring of some arch */
+/*
+ * no "address" argument so destroys page coloring of some arch
+ * The caller must hold page table lock.
+ */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
pgtable_t pgtable;

- assert_spin_locked(&mm->page_table_lock);
-
/* FIFO */
pgtable = mm->pmd_huge_pte;
if (list_empty(&pgtable->lru))
diff --git v3.11-rc3.orig/mm/rmap.c v3.11-rc3/mm/rmap.c
index eccec58..798f6ae 100644
--- v3.11-rc3.orig/mm/rmap.c
+++ v3.11-rc3/mm/rmap.c
@@ -666,24 +666,24 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
int referenced = 0;
+ spinlock_t *uninitialized_var(ptl);

if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;

- spin_lock(&mm->page_table_lock);
/*
* rmap might return false positives; we must filter
* these out using page_check_address_pmd().
*/
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
if (!pmd) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
goto out;
}

if (vma->vm_flags & VM_LOCKED) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*mapcount = 0; /* break early from loop */
*vm_flags |= VM_LOCKED;
goto out;
@@ -692,10 +692,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
} else {
pte_t *pte;
- spinlock_t *ptl;

/*
* rmap might return false positives; we must filter
--
1.8.3.1

2013-09-06 10:48:10

by Kirill A. Shutemov

Subject: RE: [PATCH 2/2] thp: support split page table lock

Naoya Horiguchi wrote:
> THP-related code also uses the per-process mm->page_table_lock now,
> so making the locking finer-grained can provide better performance.
>
> This patch adds split page table lock support to THP by using the
> page->ptl of the pages that store "pmd_trans_huge" pmds.
>
> Some functions, such as pmd_trans_huge_lock() and page_check_address_pmd(),
> are expected to pass the ptl pointer back to their callers, so this patch
> adds a new argument to them for that purpose. Other than that, this patch
> is a straightforward replacement.
>
> ChangeLog v3:
> - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> - added missing declaration of ptl in do_huge_pmd_anonymous_page()
>
> Signed-off-by: Naoya Horiguchi <[email protected]>

Generally, looks good. A few notes:

I believe you need to convert __pte_alloc() to the new locking. Not sure
about __pte_alloc_kernel().
Have you checked all the remaining mm->page_table_lock users to make sure
they shouldn't be converted to the new locking?

You use uninitialized_var() a lot. It's ugly. I've checked a few places
(task_mmu.c, copy_huge_pmd) and haven't found a reason why we need it there.
Why?

You often do

+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);

Should we have a helper to combine them? huge_pmd_lock()?
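
Something like the sketch below is what I have in mind (name and placement
are only a suggestion):

	static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
	{
		spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);

		spin_lock(ptl);
		return ptl;
	}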

--
Kirill A. Shutemov

2013-09-06 16:04:23

by Alex Thorlton

Subject: Re: [PATCH 2/2] thp: support split page table lock

On Thu, Sep 05, 2013 at 05:27:46PM -0400, Naoya Horiguchi wrote:
> THP-related code also uses the per-process mm->page_table_lock now,
> so making the locking finer-grained can provide better performance.
>
> This patch adds split page table lock support to THP by using the
> page->ptl of the pages that store "pmd_trans_huge" pmds.
>
> Some functions, such as pmd_trans_huge_lock() and page_check_address_pmd(),
> are expected to pass the ptl pointer back to their callers, so this patch
> adds a new argument to them for that purpose. Other than that, this patch
> is a straightforward replacement.
>
> ChangeLog v3:
> - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> - added missing declaration of ptl in do_huge_pmd_anonymous_page()

I've applied these and tested them using the same test program that I
used when I was working on the same issue, and I'm running into some
bugs. Here's a stack trace:

general protection fault: 0000 [#1] SMP
Modules linked in:
CPU: 268 PID: 32381 Comm: memscale Not tainted
3.11.0-medusa-03121-g757f8ca #184
Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
01/15/2013
task: ffff880fbdd82180 ti: ffff880fc0c5a000 task.ti: ffff880fc0c5a000
RIP: 0010:[<ffffffff810e3eef>] [<ffffffff810e3eef>]
pgtable_trans_huge_withdraw+0x38/0x60
RSP: 0018:ffff880fc0c5bc88 EFLAGS: 00010297
RAX: ffffea17cebe8838 RBX: 00000015309bd000 RCX: ffffea01f623b028
RDX: dead000000100100 RSI: ffff8dcf77d84c30 RDI: ffff880fbda67580
RBP: ffff880fc0c5bc88 R08: 0000000000000013 R09: 0000000000014da0
R10: ffff880fc0c5bc88 R11: ffff888f7efda000 R12: ffff8dcf77d84c30
R13: ffff880fc0c5bdf8 R14: 800005cf401ff067 R15: ffff8b4de5fabff8
FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffff768b0b8 CR3: 0000000001a0b000 CR4: 00000000000407e0
Stack:
ffff880fc0c5bcc8 ffffffff810f7643 ffff880fc0c5bcc8 ffffffff810d8297
ffffea1456237510 00007fc7b0e00000 0000000000000000 00007fc7b0c00000
ffff880fc0c5bda8 ffffffff810d85ba ffff880fc0c5bd48 ffff880fc0c5bd68
Call Trace:
[<ffffffff810f7643>] zap_huge_pmd+0x4c/0x101
[<ffffffff810d8297>] ? tlb_flush_mmu+0x58/0x75
[<ffffffff810d85ba>] unmap_single_vma+0x306/0x7d6
[<ffffffff810d8ad9>] unmap_vmas+0x4f/0x82
[<ffffffff810dab5e>] exit_mmap+0x8b/0x113
[<ffffffff810a9743>] ? __delayacct_add_tsk+0x170/0x182
[<ffffffff8103c609>] mmput+0x3e/0xc4
[<ffffffff8104088c>] do_exit+0x380/0x907
[<ffffffff810fb89c>] ? vfs_write+0x149/0x1a3
[<ffffffff81040e85>] do_group_exit+0x72/0x9b
[<ffffffff81040ec0>] SyS_exit_group+0x12/0x16
[<ffffffff814f52d2>] system_call_fastpath+0x16/0x1b
Code: 51 20 48 8d 41 20 48 39 c2 75 0d 48 c7 87 28 03 00 00 00 00 00 00
eb 36 48 8d 42 e0 48 89 87 28 03 00 00 48 8b 51 20 48 8b 41 28 <48> 89
42 08 48 89 10 48 ba 00 01 10 00 00 00 ad de 48 b8 00 02
RIP [<ffffffff810e3eef>] pgtable_trans_huge_withdraw+0x38/0x60
RSP <ffff880fc0c5bc88>
---[ end trace e5413b388b6ea448 ]---
Fixing recursive fault but reboot is needed!
general protection fault: 0000 [#2] SMP
Modules linked in:
CPU: 268 PID: 1722 Comm: kworker/268:1 Tainted: G D
3.11.0-medusa-03121-g757f8ca #184
Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
01/15/2013
Workqueue: events vmstat_update
task: ffff880fc1a74280 ti: ffff880fc1a76000 task.ti: ffff880fc1a76000
RIP: 0010:[<ffffffff810bcdcb>] [<ffffffff810bcdcb>]
free_pcppages_bulk+0x97/0x329
RSP: 0018:ffff880fc1a77c98 EFLAGS: 00010082
RAX: ffff880fffd94d68 RBX: dead0000002001e0 RCX: ffff880fffd94d50
RDX: ffff880fffd94d68 RSI: 000000000000001f RDI: ffff888f7efdac68
RBP: ffff880fc1a77cf8 R08: 0000000000000400 R09: ffffffff81a8bf00
R10: ffff884f7efdac00 R11: ffffffff81009bae R12: dead000000200200
R13: ffff888f7efdac00 R14: 000000000000001f R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffff768b0b8 CR3: 0000000001a0b000 CR4: 00000000000407e0
Stack:
ffff880fc1a77ce8 ffff880fffd94d68 0000000000000010 ffff880fffd94d50
0000001ff9276a68 ffff880fffd94d60 0000000000000000 000000000000001f
ffff880fffd94d50 0000000000000292 ffff880fc1a77d38 ffff880fffd95d05
Call Trace:
[<ffffffff810bd149>] drain_zone_pages+0x33/0x42
[<ffffffff810cd5a6>] refresh_cpu_vm_stats+0xcc/0x11e
[<ffffffff810cd609>] vmstat_update+0x11/0x43
[<ffffffff8105350f>] process_one_work+0x260/0x389
[<ffffffff8105381a>] worker_thread+0x1e2/0x332
[<ffffffff81053638>] ? process_one_work+0x389/0x389
[<ffffffff810579df>] kthread+0xb3/0xbd
[<ffffffff81053638>] ? process_one_work+0x389/0x389
[<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
[<ffffffff814f522c>] ret_from_fork+0x7c/0xb0
[<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
Code: 48 89 55 c8 48 39 14 08 74 ce 41 83 fe 03 44 0f 44 75 c4 48 83 c2
08 48 89 45 b0 48 89 55 a8 48 8b 45 a8 4c 8b 20 49 8d 5c 24 e0 <48> 8b
53 20 48 8b 43 28 48 89 42 08 48 89 10 48 ba 00 01 10 00
RIP [<ffffffff810bcdcb>] free_pcppages_bulk+0x97/0x329
RSP <ffff880fc1a77c98>
---[ end trace e5413b388b6ea449 ]---
BUG: unable to handle kernel paging request at ffffffffffffffd8
IP: [<ffffffff8105742c>] kthread_data+0xb/0x11
PGD 1a0c067 PUD 1a0e067 PMD 0
Oops: 0000 [#3] SMP
Modules linked in:
CPU: 268 PID: 1722 Comm: kworker/268:1 Tainted: G D
3.11.0-medusa-03121-g757f8ca #184
Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
01/15/2013
task: ffff880fc1a74280 ti: ffff880fc1a76000 task.ti: ffff880fc1a76000
RIP: 0010:[<ffffffff8105742c>] [<ffffffff8105742c>]
kthread_data+0xb/0x11
RSP: 0018:ffff880fc1a77948 EFLAGS: 00010092
RAX: 0000000000000000 RBX: 000000000000010c RCX: 0000000000000000
RDX: 000000000000000f RSI: 000000000000010c RDI: ffff880fc1a74280
RBP: ffff880fc1a77948 R08: 00000000000442c8 R09: 0000000000000000
R10: dead000000200200 R11: ffff880fc1a742e8 R12: ffff880fc1a74868
R13: ffff880fffd91cc0 R14: ffff880ff9b7a040 R15: 000000000000010c
FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000028 CR3: 0000000001a0b000 CR4: 00000000000407e0
Stack:
ffff880fc1a77968 ffffffff8105151f ffff880fc1a77968 ffff880fc1a74280
ffff880fc1a77ab8 ffffffff814f2e98 ffff880fc1a76010 0000000000004000
ffff880fc1a74280 0000000000011cc0 ffff880fc1a77fd8 ffff880fc1a77fd8
Call Trace:
[<ffffffff8105151f>] wq_worker_sleeping+0x10/0x82
[<ffffffff814f2e98>] __schedule+0x1b7/0x8f7
[<ffffffff8135d4bd>] ? mix_pool_bytes+0x4a/0x56
[<ffffffff810a5d05>] ? call_rcu_sched+0x16/0x18
[<ffffffff8103f708>] ? release_task+0x3a7/0x3bf
[<ffffffff814f36b5>] schedule+0x61/0x63
[<ffffffff81040e0f>] do_exit+0x903/0x907
[<ffffffff8100529a>] oops_end+0xb9/0xc1
[<ffffffff81005393>] die+0x55/0x5e
[<ffffffff8100341a>] do_general_protection+0x93/0x139
[<ffffffff814f4d82>] general_protection+0x22/0x30
[<ffffffff81009bae>] ? default_idle+0x6/0x8
[<ffffffff810bcdcb>] ? free_pcppages_bulk+0x97/0x329
[<ffffffff810bcd5d>] ? free_pcppages_bulk+0x29/0x329
[<ffffffff810bd149>] drain_zone_pages+0x33/0x42
[<ffffffff810cd5a6>] refresh_cpu_vm_stats+0xcc/0x11e
[<ffffffff810cd609>] vmstat_update+0x11/0x43
[<ffffffff8105350f>] process_one_work+0x260/0x389
[<ffffffff8105381a>] worker_thread+0x1e2/0x332
[<ffffffff81053638>] ? process_one_work+0x389/0x389
[<ffffffff810579df>] kthread+0xb3/0xbd
[<ffffffff81053638>] ? process_one_work+0x389/0x389
[<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
[<ffffffff814f522c>] ret_from_fork+0x7c/0xb0
[<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
Code: 65 48 8b 04 25 40 b7 00 00 48 8b 80 90 05 00 00 48 89 e5 48 8b 40
c8 c9 48 c1 e8 02 83 e0 01 c3 48 8b 87 90 05 00 00 55 48 89 e5 <48> 8b
40 d8 c9 c3 48 3b 3d 67 ca c2 00 55 48 89 e5 75 09 0f bf
RIP [<ffffffff8105742c>] kthread_data+0xb/0x11
RSP <ffff880fc1a77948>
CR2: ffffffffffffffd8
---[ end trace e5413b388b6ea44a ]---
Fixing recursive fault but reboot is needed!

I'm testing on a 528 core machine, with ~2TB of memory, THP on. The
test case works like this:

- Spawn 512 threads using pthread_create, pin each thread to a separate
cpu
- Each thread allocates 512mb, local to its cpu
- Threads are sent a "go" signal, all threads begin touching the first
byte of each 4k chunk of their 512mb simultaneously

I'm working on debugging the issue now, but I thought I'd get this out
to everyone in case they might have some input. I'll try and get my
test program cleaned up and posted somewhere today so that others can
try it out as well.
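
For reference, a skeleton of a test along these lines might look roughly
like the sketch below. This is only an illustration of the access pattern
described above, not the actual memscale program; the thread count, chunk
size, and helper names are assumptions.

/* Illustrative sketch only -- not the actual memscale test program. */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdlib.h>
#include <sys/mman.h>

#define NTHREADS	512
#define CHUNK		(512UL << 20)	/* 512MB per thread */
#define STEP		4096		/* touch one byte per 4k chunk */

static pthread_barrier_t go;

static void *worker(void *arg)
{
	long cpu = (long)arg;
	cpu_set_t set;
	char *buf;
	unsigned long off;

	/* Pin this thread to its own cpu. */
	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

	/* Reserve 512MB; the first touch below makes it cpu-local. */
	buf = mmap(NULL, CHUNK, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return NULL;

	/* Wait for the "go" signal, then fault in every page at once. */
	pthread_barrier_wait(&go);
	for (off = 0; off < CHUNK; off += STEP)
		buf[off] = 1;

	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	long i;

	pthread_barrier_init(&go, NULL, NTHREADS);
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}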

- Alex

2013-09-06 16:27:47

by Naoya Horiguchi

[permalink] [raw]
Subject: Re: [PATCH 2/2] thp: support split page table lock

Hi Kirill,

On Fri, Sep 06, 2013 at 01:48:03PM +0300, Kirill A. Shutemov wrote:
> Naoya Horiguchi wrote:
> > Thp related code also uses per process mm->page_table_lock now.
> > So making it fine-grained can provide better performance.
> >
> > This patch makes thp support split page table lock by using page->ptl
> > of the pages storing "pmd_trans_huge" pmds.
> >
> > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > are expected by their caller to pass back the pointer of ptl, so this
> > patch adds to those functions new arguments for that. Rather than that,
> > this patch gives only straightforward replacement.
> >
> > ChangeLog v3:
> > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> >
> > Signed-off-by: Naoya Horiguchi <[email protected]>
>
> Generally, looks good. Few notes:
>
> I believe you need to convert __pte_alloc() to new locking. Not sure about
> __pte_alloc_kernel().
> Have you check all rest mm->page_table_lock, that they shouldn't be
> converted to new locking?

I thought that keeping __pte_alloc() on mm->page_table_lock was safe
because it uses the bare mm->page_table_lock instead of pte_lockptr() even
before this patchset, but I'm not 100% sure.
__pte_alloc() (and its family) are used in the normal page path, so if it's
not safe, we've lived with unsafe code for a very long time (maybe since 2005).
Anyway, converting __pte_alloc() to the split ptl could improve performance
(though we'd need testing to know by how much), so I'll try that.

> You use uninitialized_var() a lot. It's ugly. I've checked a few places
> (task_mmu.c, copy_huge_pmd) and haven't found a reason why we need it there.
> Why?

I got compile warnings about uninitialized usage while developing and added
these to suppress them, but in the final form I no longer get any such warning.
So I'll remove these uninitialized_var()s.

> You often do
>
> + ptl = huge_pmd_lockptr(mm, pmd);
> + spin_lock(ptl);
>
> Should we have a helper to combine them? huge_pmd_lock()?

OK, I'll do it.
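
Presumably something along these lines (a sketch only; the final name and
placement are up to the next revision, and huge_pmd_lockptr() here is the
helper introduced by this patch):

static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);

	/* Take the per-pmd lock and hand it back to the caller. */
	spin_lock(ptl);
	return ptl;
}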

Thanks,
Naoya Horiguchi

2013-09-06 16:46:31

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH 2/2] thp: support split page table lock

Naoya Horiguchi wrote:
> Hi Kirill,
>
> On Fri, Sep 06, 2013 at 01:48:03PM +0300, Kirill A. Shutemov wrote:
> > Naoya Horiguchi wrote:
> > > Thp related code also uses per process mm->page_table_lock now.
> > > So making it fine-grained can provide better performance.
> > >
> > > This patch makes thp support split page table lock by using page->ptl
> > > of the pages storing "pmd_trans_huge" pmds.
> > >
> > > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > > are expected by their caller to pass back the pointer of ptl, so this
> > > patch adds to those functions new arguments for that. Rather than that,
> > > this patch gives only straightforward replacement.
> > >
> > > ChangeLog v3:
> > > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> > >
> > > Signed-off-by: Naoya Horiguchi <[email protected]>
> >
> > Generally, looks good. Few notes:
> >
> > I believe you need to convert __pte_alloc() to new locking. Not sure about
> > __pte_alloc_kernel().
> > Have you check all rest mm->page_table_lock, that they shouldn't be
> > converted to new locking?
>
> I thought that keeping __pte_alloc() on mm->page_table_lock was safe
> because it uses the bare mm->page_table_lock instead of pte_lockptr() even
> before this patchset, but I'm not 100% sure.
> __pte_alloc() (and its family) are used in the normal page path, so if it's
> not safe, we've lived with unsafe code for a very long time (maybe since 2005).
> Anyway, converting __pte_alloc() to the split ptl could improve performance
> (though we'd need testing to know by how much), so I'll try that.

No, before the patch mm->page_table_lock is what we need: it serializes
setting up the pmd, not adding a pte to the pmd, and it's subject to change
with the new locking model.
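
For reference, the pmd-populating section of __pte_alloc() currently looks
roughly like the sketch below (simplified and paraphrased from mm/memory.c;
the smp_wmb() and THP-splitting handling are elided). The point is that the
lock guards installing the pte page into the pmd, not the ptes underneath it.

	pgtable_t new = pte_alloc_one(mm, address);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* has another thread populated it? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	return 0;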

--
Kirill A. Shutemov

2013-09-06 17:16:01

by Naoya Horiguchi

[permalink] [raw]
Subject: Re: [PATCH 2/2] thp: support split page table lock

On Fri, Sep 06, 2013 at 07:46:25PM +0300, Kirill A. Shutemov wrote:
> Naoya Horiguchi wrote:
> > Hi Kirill,
> >
> > On Fri, Sep 06, 2013 at 01:48:03PM +0300, Kirill A. Shutemov wrote:
> > > Naoya Horiguchi wrote:
> > > > Thp related code also uses per process mm->page_table_lock now.
> > > > So making it fine-grained can provide better performance.
> > > >
> > > > This patch makes thp support split page table lock by using page->ptl
> > > > of the pages storing "pmd_trans_huge" pmds.
> > > >
> > > > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > > > are expected by their caller to pass back the pointer of ptl, so this
> > > > patch adds to those functions new arguments for that. Rather than that,
> > > > this patch gives only straightforward replacement.
> > > >
> > > > ChangeLog v3:
> > > > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > > > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> > > >
> > > > Signed-off-by: Naoya Horiguchi <[email protected]>
> > >
> > > Generally, looks good. Few notes:
> > >
> > > I believe you need to convert __pte_alloc() to new locking. Not sure about
> > > __pte_alloc_kernel().
> > > Have you check all rest mm->page_table_lock, that they shouldn't be
> > > converted to new locking?
> >
> > I thought that keeping __pte_alloc() on mm->page_table_lock was safe
> > because it uses the bare mm->page_table_lock instead of pte_lockptr() even
> > before this patchset, but I'm not 100% sure.
> > __pte_alloc() (and its family) are used in the normal page path, so if it's
> > not safe, we've lived with unsafe code for a very long time (maybe since 2005).
> > Anyway, converting __pte_alloc() to the split ptl could improve performance
> > (though we'd need testing to know by how much), so I'll try that.
>
> No, before the patch mm->page_table_lock is what we need: it serializes
> setting up the pmd, not adding a pte to the pmd, and it's subject to change
> with the new locking model.

OK, I see. This is a todo for the next post.
Thank you for the explanation.

Naoya

2013-09-06 18:01:37

by Naoya Horiguchi

[permalink] [raw]
Subject: Re: [PATCH 2/2] thp: support split page table lock

Hi Alex,

On Fri, Sep 06, 2013 at 11:04:23AM -0500, Alex Thorlton wrote:
> On Thu, Sep 05, 2013 at 05:27:46PM -0400, Naoya Horiguchi wrote:
> > Thp related code also uses per process mm->page_table_lock now.
> > So making it fine-grained can provide better performance.
> >
> > This patch makes thp support split page table lock by using page->ptl
> > of the pages storing "pmd_trans_huge" pmds.
> >
> > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > are expected by their caller to pass back the pointer of ptl, so this
> > patch adds to those functions new arguments for that. Rather than that,
> > this patch gives only straightforward replacement.
> >
> > ChangeLog v3:
> > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
>
> I've applied these and tested them using the same test program that I
> used when I was working on the same issue, and I'm running into some
> bugs. Here's a stack trace:

Thank you for helping with the testing. This bug is new to me.

> general protection fault: 0000 [#1] SMP
> Modules linked in:
> CPU: 268 PID: 32381 Comm: memscale Not tainted
> 3.11.0-medusa-03121-g757f8ca #184
> Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
> 01/15/2013
> task: ffff880fbdd82180 ti: ffff880fc0c5a000 task.ti: ffff880fc0c5a000
> RIP: 0010:[<ffffffff810e3eef>] [<ffffffff810e3eef>]
> pgtable_trans_huge_withdraw+0x38/0x60
> RSP: 0018:ffff880fc0c5bc88 EFLAGS: 00010297
> RAX: ffffea17cebe8838 RBX: 00000015309bd000 RCX: ffffea01f623b028
> RDX: dead000000100100 RSI: ffff8dcf77d84c30 RDI: ffff880fbda67580
> RBP: ffff880fc0c5bc88 R08: 0000000000000013 R09: 0000000000014da0
> R10: ffff880fc0c5bc88 R11: ffff888f7efda000 R12: ffff8dcf77d84c30
> R13: ffff880fc0c5bdf8 R14: 800005cf401ff067 R15: ffff8b4de5fabff8
> FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
> knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007ffff768b0b8 CR3: 0000000001a0b000 CR4: 00000000000407e0
> Stack:
> ffff880fc0c5bcc8 ffffffff810f7643 ffff880fc0c5bcc8 ffffffff810d8297
> ffffea1456237510 00007fc7b0e00000 0000000000000000 00007fc7b0c00000
> ffff880fc0c5bda8 ffffffff810d85ba ffff880fc0c5bd48 ffff880fc0c5bd68
> Call Trace:
> [<ffffffff810f7643>] zap_huge_pmd+0x4c/0x101
> [<ffffffff810d8297>] ? tlb_flush_mmu+0x58/0x75
> [<ffffffff810d85ba>] unmap_single_vma+0x306/0x7d6
> [<ffffffff810d8ad9>] unmap_vmas+0x4f/0x82
> [<ffffffff810dab5e>] exit_mmap+0x8b/0x113
> [<ffffffff810a9743>] ? __delayacct_add_tsk+0x170/0x182
> [<ffffffff8103c609>] mmput+0x3e/0xc4
> [<ffffffff8104088c>] do_exit+0x380/0x907
> [<ffffffff810fb89c>] ? vfs_write+0x149/0x1a3
> [<ffffffff81040e85>] do_group_exit+0x72/0x9b
> [<ffffffff81040ec0>] SyS_exit_group+0x12/0x16
> [<ffffffff814f52d2>] system_call_fastpath+0x16/0x1b
> Code: 51 20 48 8d 41 20 48 39 c2 75 0d 48 c7 87 28 03 00 00 00 00 00 00
> eb 36 48 8d 42 e0 48 89 87 28 03 00 00 48 8b 51 20 48 8b 41 28 <48> 89
> 42 08 48 89 10 48 ba 00 01 10 00 00 00 ad de 48 b8 00 02
> RIP [<ffffffff810e3eef>] pgtable_trans_huge_withdraw+0x38/0x60
> RSP <ffff880fc0c5bc88>
> ---[ end trace e5413b388b6ea448 ]---
> Fixing recursive fault but reboot is needed!
> general protection fault: 0000 [#2] SMP
> Modules linked in:
> CPU: 268 PID: 1722 Comm: kworker/268:1 Tainted: G D
> 3.11.0-medusa-03121-g757f8ca #184
> Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
> 01/15/2013
> Workqueue: events vmstat_update
> task: ffff880fc1a74280 ti: ffff880fc1a76000 task.ti: ffff880fc1a76000
> RIP: 0010:[<ffffffff810bcdcb>] [<ffffffff810bcdcb>]
> free_pcppages_bulk+0x97/0x329
> RSP: 0018:ffff880fc1a77c98 EFLAGS: 00010082
> RAX: ffff880fffd94d68 RBX: dead0000002001e0 RCX: ffff880fffd94d50
> RDX: ffff880fffd94d68 RSI: 000000000000001f RDI: ffff888f7efdac68
> RBP: ffff880fc1a77cf8 R08: 0000000000000400 R09: ffffffff81a8bf00
> R10: ffff884f7efdac00 R11: ffffffff81009bae R12: dead000000200200
> R13: ffff888f7efdac00 R14: 000000000000001f R15: 0000000000000000
> FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
> knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007ffff768b0b8 CR3: 0000000001a0b000 CR4: 00000000000407e0
> Stack:
> ffff880fc1a77ce8 ffff880fffd94d68 0000000000000010 ffff880fffd94d50
> 0000001ff9276a68 ffff880fffd94d60 0000000000000000 000000000000001f
> ffff880fffd94d50 0000000000000292 ffff880fc1a77d38 ffff880fffd95d05
> Call Trace:
> [<ffffffff810bd149>] drain_zone_pages+0x33/0x42
> [<ffffffff810cd5a6>] refresh_cpu_vm_stats+0xcc/0x11e
> [<ffffffff810cd609>] vmstat_update+0x11/0x43
> [<ffffffff8105350f>] process_one_work+0x260/0x389
> [<ffffffff8105381a>] worker_thread+0x1e2/0x332
> [<ffffffff81053638>] ? process_one_work+0x389/0x389
> [<ffffffff810579df>] kthread+0xb3/0xbd
> [<ffffffff81053638>] ? process_one_work+0x389/0x389
> [<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
> [<ffffffff814f522c>] ret_from_fork+0x7c/0xb0
> [<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
> Code: 48 89 55 c8 48 39 14 08 74 ce 41 83 fe 03 44 0f 44 75 c4 48 83 c2
> 08 48 89 45 b0 48 89 55 a8 48 8b 45 a8 4c 8b 20 49 8d 5c 24 e0 <48> 8b
> 53 20 48 8b 43 28 48 89 42 08 48 89 10 48 ba 00 01 10 00
> RIP [<ffffffff810bcdcb>] free_pcppages_bulk+0x97/0x329
> RSP <ffff880fc1a77c98>
> ---[ end trace e5413b388b6ea449 ]---
> BUG: unable to handle kernel paging request at ffffffffffffffd8
> IP: [<ffffffff8105742c>] kthread_data+0xb/0x11
> PGD 1a0c067 PUD 1a0e067 PMD 0
> Oops: 0000 [#3] SMP
> Modules linked in:
> CPU: 268 PID: 1722 Comm: kworker/268:1 Tainted: G D
> 3.11.0-medusa-03121-g757f8ca #184
> Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
> 01/15/2013
> task: ffff880fc1a74280 ti: ffff880fc1a76000 task.ti: ffff880fc1a76000
> RIP: 0010:[<ffffffff8105742c>] [<ffffffff8105742c>]
> kthread_data+0xb/0x11
> RSP: 0018:ffff880fc1a77948 EFLAGS: 00010092
> RAX: 0000000000000000 RBX: 000000000000010c RCX: 0000000000000000
> RDX: 000000000000000f RSI: 000000000000010c RDI: ffff880fc1a74280
> RBP: ffff880fc1a77948 R08: 00000000000442c8 R09: 0000000000000000
> R10: dead000000200200 R11: ffff880fc1a742e8 R12: ffff880fc1a74868
> R13: ffff880fffd91cc0 R14: ffff880ff9b7a040 R15: 000000000000010c
> FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
> knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000028 CR3: 0000000001a0b000 CR4: 00000000000407e0
> Stack:
> ffff880fc1a77968 ffffffff8105151f ffff880fc1a77968 ffff880fc1a74280
> ffff880fc1a77ab8 ffffffff814f2e98 ffff880fc1a76010 0000000000004000
> ffff880fc1a74280 0000000000011cc0 ffff880fc1a77fd8 ffff880fc1a77fd8
> Call Trace:
> [<ffffffff8105151f>] wq_worker_sleeping+0x10/0x82
> [<ffffffff814f2e98>] __schedule+0x1b7/0x8f7
> [<ffffffff8135d4bd>] ? mix_pool_bytes+0x4a/0x56
> [<ffffffff810a5d05>] ? call_rcu_sched+0x16/0x18
> [<ffffffff8103f708>] ? release_task+0x3a7/0x3bf
> [<ffffffff814f36b5>] schedule+0x61/0x63
> [<ffffffff81040e0f>] do_exit+0x903/0x907
> [<ffffffff8100529a>] oops_end+0xb9/0xc1
> [<ffffffff81005393>] die+0x55/0x5e
> [<ffffffff8100341a>] do_general_protection+0x93/0x139
> [<ffffffff814f4d82>] general_protection+0x22/0x30
> [<ffffffff81009bae>] ? default_idle+0x6/0x8
> [<ffffffff810bcdcb>] ? free_pcppages_bulk+0x97/0x329
> [<ffffffff810bcd5d>] ? free_pcppages_bulk+0x29/0x329
> [<ffffffff810bd149>] drain_zone_pages+0x33/0x42
> [<ffffffff810cd5a6>] refresh_cpu_vm_stats+0xcc/0x11e
> [<ffffffff810cd609>] vmstat_update+0x11/0x43
> [<ffffffff8105350f>] process_one_work+0x260/0x389
> [<ffffffff8105381a>] worker_thread+0x1e2/0x332
> [<ffffffff81053638>] ? process_one_work+0x389/0x389
> [<ffffffff810579df>] kthread+0xb3/0xbd
> [<ffffffff81053638>] ? process_one_work+0x389/0x389
> [<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
> [<ffffffff814f522c>] ret_from_fork+0x7c/0xb0
> [<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
> Code: 65 48 8b 04 25 40 b7 00 00 48 8b 80 90 05 00 00 48 89 e5 48 8b 40
> c8 c9 48 c1 e8 02 83 e0 01 c3 48 8b 87 90 05 00 00 55 48 89 e5 <48> 8b
> 40 d8 c9 c3 48 3b 3d 67 ca c2 00 55 48 89 e5 75 09 0f bf
> RIP [<ffffffff8105742c>] kthread_data+0xb/0x11
> RSP <ffff880fc1a77948>
> CR2: ffffffffffffffd8
> ---[ end trace e5413b388b6ea44a ]---
> Fixing recursive fault but reboot is needed!
>
> I'm testing on a 528 core machine, with ~2TB of memory, THP on. The
> test case works like this:
>
> - Spawn 512 threads using pthread_create, pin each thread to a separate
> cpu
> - Each thread allocates 512mb, local to its cpu
> - Threads are sent a "go" signal, all threads begin touching the first
> byte of each 4k chunk of their 512mb simultaneously
>
> I'm working on debugging the issue now, but I thought I'd get this out
> to everyone in case they might have some input. I'll try and get my
> test program cleaned up and posted somewhere today so that others can
> try it out as well.

Thanks. Please let me know when it's available. I'll look at it.

Naoya

2013-09-08 16:54:09

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH 1/2] hugetlbfs: support split page table lock

Naoya Horiguchi <[email protected]> writes:

> Currently all of page table handling by hugetlbfs code are done under
> mm->page_table_lock. So when a process have many threads and they heavily
> access to the memory, lock contention happens and impacts the performance.
>
> This patch makes hugepage support split page table lock so that we use
> page->ptl of the leaf node of page table tree which is pte for normal pages
> but can be pmd and/or pud for hugepages of some architectures.
>
> ChangeLog v3:
> - disable split ptl for ppc with USE_SPLIT_PTLOCKS_HUGETLB.
> - remove replacement in some architecture dependent code. This is justified
> because an allocation of pgd/pud/pmd/pte entry can race with other
> allocation, not with read/write access, so we can use different locks.
> http://thread.gmane.org/gmane.linux.kernel.mm/106292/focus=106458
>
> ChangeLog v2:
> - add split ptl on other archs missed in v1
> - drop changes on arch/{powerpc,tile}/mm/hugetlbpage.c
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
> ---
> include/linux/hugetlb.h | 20 +++++++++++
> include/linux/mm_types.h | 2 ++
> mm/hugetlb.c | 92 +++++++++++++++++++++++++++++-------------------
> mm/mempolicy.c | 5 +--
> mm/migrate.c | 4 +--
> mm/rmap.c | 2 +-
> 6 files changed, 84 insertions(+), 41 deletions(-)
>
> diff --git v3.11-rc3.orig/include/linux/hugetlb.h v3.11-rc3/include/linux/hugetlb.h
> index 0393270..5cb8a4e 100644
> --- v3.11-rc3.orig/include/linux/hugetlb.h
> +++ v3.11-rc3/include/linux/hugetlb.h
> @@ -80,6 +80,24 @@ extern const unsigned long hugetlb_zero, hugetlb_infinity;
> extern int sysctl_hugetlb_shm_group;
> extern struct list_head huge_boot_pages;
>
> +#if USE_SPLIT_PTLOCKS_HUGETLB
> +#define huge_pte_lockptr(mm, ptep) ({__pte_lockptr(virt_to_page(ptep)); })
> +#else /* !USE_SPLIT_PTLOCKS_HUGETLB */
> +#define huge_pte_lockptr(mm, ptep) ({&(mm)->page_table_lock; })
> +#endif /* USE_SPLIT_PTLOCKS_HUGETLB */
> +
> +#define huge_pte_offset_lock(mm, address, ptlp) \
> +({ \
> + pte_t *__pte = huge_pte_offset(mm, address); \
> + spinlock_t *__ptl = NULL; \
> + if (__pte) { \
> + __ptl = huge_pte_lockptr(mm, __pte); \
> + *(ptlp) = __ptl; \
> + spin_lock(__ptl); \
> + } \
> + __pte; \
> +})


Why not a static inline function?


> +
> /* arch callbacks */
>
> pte_t *huge_pte_alloc(struct mm_struct *mm,
> @@ -164,6 +182,8 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
> BUG();
> }
>
> +#define huge_pte_lockptr(mm, ptep) 0
> +
> #endif /* !CONFIG_HUGETLB_PAGE */
>
> #define HUGETLB_ANON_FILE "anon_hugepage"
> diff --git v3.11-rc3.orig/include/linux/mm_types.h v3.11-rc3/include/linux/mm_types.h
> index fb425aa..cfb8c6f 100644
> --- v3.11-rc3.orig/include/linux/mm_types.h
> +++ v3.11-rc3/include/linux/mm_types.h
> @@ -24,6 +24,8 @@
> struct address_space;
>
> #define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
> +#define USE_SPLIT_PTLOCKS_HUGETLB \
> + (USE_SPLIT_PTLOCKS && !defined(CONFIG_PPC))
>

Is that a common pattern? Don't we generally use
HAVE_ARCH_SPLIT_PTLOCKS in the arch config file? Also, are we sure this is
only an issue with PPC?



-aneesh

2013-09-08 16:56:58

by Aneesh Kumar K.V

[permalink] [raw]
Subject: RE: [PATCH 2/2] thp: support split page table lock

"Kirill A. Shutemov" <[email protected]> writes:

> Naoya Horiguchi wrote:
>> Thp related code also uses per process mm->page_table_lock now.
>> So making it fine-grained can provide better performance.
>>
>> This patch makes thp support split page table lock by using page->ptl
>> of the pages storing "pmd_trans_huge" pmds.
>>
>> Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
>> are expected by their caller to pass back the pointer of ptl, so this
>> patch adds to those functions new arguments for that. Rather than that,
>> this patch gives only straightforward replacement.
>>
>> ChangeLog v3:
>> - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
>> - added missing declaration of ptl in do_huge_pmd_anonymous_page()
>>
>> Signed-off-by: Naoya Horiguchi <[email protected]>
>
> Generally, looks good. Few notes:
>
> I believe you need to convert __pte_alloc() to new locking. Not sure about
> __pte_alloc_kernel().
> Have you check all rest mm->page_table_lock, that they shouldn't be
> converted to new locking?

Maybe we can have a CONFIG_DEBUG_VM version of pmd_populate() that
checks with assert_spin_locked()?
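
That is, roughly something like this (a sketch; "pmd_populate_checked" is
just an illustrative name, since pmd_populate() itself is per-arch, and
which lock the assertion should check depends on the outcome of the
__pte_alloc() discussion above):

#ifdef CONFIG_DEBUG_VM
static inline void pmd_populate_checked(struct mm_struct *mm,
					pmd_t *pmd, pgtable_t pte)
{
	/* Installing a pte page is still serialized by page_table_lock. */
	assert_spin_locked(&mm->page_table_lock);
	pmd_populate(mm, pmd, pte);
}
#endif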


-aneesh

2013-09-09 03:29:12

by Daniel J Blueman

[permalink] [raw]
Subject: Re: [PATCH 2/2] thp: support split page table lock

On Saturday, 7 September 2013 02:10:02 UTC+8, Naoya Horiguchi wrote:
> Hi Alex,
>
> On Fri, Sep 06, 2013 at 11:04:23AM -0500, Alex Thorlton wrote:
> > On Thu, Sep 05, 2013 at 05:27:46PM -0400, Naoya Horiguchi wrote:
> > > Thp related code also uses per process mm->page_table_lock now.
> > > So making it fine-grained can provide better performance.
> > >
> > > This patch makes thp support split page table lock by using page->ptl
> > > of the pages storing "pmd_trans_huge" pmds.
> > >
> > > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > > are expected by their caller to pass back the pointer of ptl, so this
> > > patch adds to those functions new arguments for that. Rather than that,
> > > this patch gives only straightforward replacement.
> > >
> > > ChangeLog v3:
> > > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> >
> > I've applied these and tested them using the same test program that I
> > used when I was working on the same issue, and I'm running into some
> > bugs. Here's a stack trace:
>
> Thank you for helping with the testing. This bug is new to me.

With 3.11, this patch series and CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS, I
consistently hit the same failure when exiting one of my stress-testers
[1] when using eg 24 cores.

Doesn't happen with 8 cores, so likely needs enough virtual memory to
use multiple split locks. Otherwise, this is very promising work!

[1] http://quora.org/2013/fft3d.c
--
Daniel J Blueman
Principal Software Engineer, Numascale

2013-09-09 15:43:52

by Alex Thorlton

[permalink] [raw]
Subject: Re: [PATCH 2/2] thp: support split page table lock

On Mon, Sep 09, 2013 at 10:34:45AM +0800, Daniel J Blueman wrote:
> On Saturday, 7 September 2013 02:10:02 UTC+8, Naoya Horiguchi wrote:
> >Hi Alex,
> >
> >On Fri, Sep 06, 2013 at 11:04:23AM -0500, Alex Thorlton wrote:
> >> On Thu, Sep 05, 2013 at 05:27:46PM -0400, Naoya Horiguchi wrote:
> >> > Thp related code also uses per process mm->page_table_lock now.
> >> > So making it fine-grained can provide better performance.
> >> >
> >> > This patch makes thp support split page table lock by using page->ptl
> >> > of the pages storing "pmd_trans_huge" pmds.
> >> >
> >> > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> >> > are expected by their caller to pass back the pointer of ptl, so this
> >> > patch adds to those functions new arguments for that. Rather than that,
> >> > this patch gives only straightforward replacement.
> >> >
> >> > ChangeLog v3:
> >> > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> >> > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> >>
> >> I've applied these and tested them using the same test program that I
> >> used when I was working on the same issue, and I'm running into some
> >> bugs. Here's a stack trace:
> >
> >Thank you for helping with the testing. This bug is new to me.
>
> With 3.11, this patch series and CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS,
> I consistently hit the same failure when exiting one of my
> stress-testers [1] when using eg 24 cores.
>
> Doesn't happen with 8 cores, so likely needs enough virtual memory
> to use multiple split locks. Otherwise, this is very promising work!

Daniel,

Hillf Danton ([email protected]) suggested putting the big
page_table_lock back into the two functions seen below. I re-tested
with this change and it seems to resolve the issue.

- Alex

--- a/mm/pgtable-generic.c Sat Sep 7 15:17:52 2013
+++ b/mm/pgtable-generic.c Sat Sep 7 15:20:28 2013
@@ -127,12 +127,14 @@ void pmdp_splitting_flush(struct vm_area
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
+ spin_lock(&mm->page_table_lock);
/* FIFO */
if (!mm->pmd_huge_pte)
INIT_LIST_HEAD(&pgtable->lru);
else
list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
mm->pmd_huge_pte = pgtable;
+ spin_unlock(&mm->page_table_lock);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
@@ -144,6 +146,7 @@ pgtable_t pgtable_trans_huge_withdraw(st
{
pgtable_t pgtable;

+ spin_lock(&mm->page_table_lock);
/* FIFO */
pgtable = mm->pmd_huge_pte;
if (list_empty(&pgtable->lru))
@@ -153,6 +156,7 @@ pgtable_t pgtable_trans_huge_withdraw(st
struct page, lru);
list_del(&pgtable->lru);
}
+ spin_unlock(&mm->page_table_lock);
return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
--

2013-09-09 16:26:41

by Naoya Horiguchi

[permalink] [raw]
Subject: Re: [PATCH 1/2] hugetlbfs: support split page table lock

On Sun, Sep 08, 2013 at 10:23:55PM +0530, Aneesh Kumar K.V wrote:
> Naoya Horiguchi <[email protected]> writes:
>
> > Currently all of page table handling by hugetlbfs code are done under
> > mm->page_table_lock. So when a process have many threads and they heavily
> > access to the memory, lock contention happens and impacts the performance.
> >
> > This patch makes hugepage support split page table lock so that we use
> > page->ptl of the leaf node of page table tree which is pte for normal pages
> > but can be pmd and/or pud for hugepages of some architectures.
> >
> > ChangeLog v3:
> > - disable split ptl for ppc with USE_SPLIT_PTLOCKS_HUGETLB.
> > - remove replacement in some architecture dependent code. This is justified
> > because an allocation of pgd/pud/pmd/pte entry can race with other
> > allocation, not with read/write access, so we can use different locks.
> > http://thread.gmane.org/gmane.linux.kernel.mm/106292/focus=106458
> >
> > ChangeLog v2:
> > - add split ptl on other archs missed in v1
> > - drop changes on arch/{powerpc,tile}/mm/hugetlbpage.c
> >
> > Signed-off-by: Naoya Horiguchi <[email protected]>
> > ---
> > include/linux/hugetlb.h | 20 +++++++++++
> > include/linux/mm_types.h | 2 ++
> > mm/hugetlb.c | 92 +++++++++++++++++++++++++++++-------------------
> > mm/mempolicy.c | 5 +--
> > mm/migrate.c | 4 +--
> > mm/rmap.c | 2 +-
> > 6 files changed, 84 insertions(+), 41 deletions(-)
> >
> > diff --git v3.11-rc3.orig/include/linux/hugetlb.h v3.11-rc3/include/linux/hugetlb.h
> > index 0393270..5cb8a4e 100644
> > --- v3.11-rc3.orig/include/linux/hugetlb.h
> > +++ v3.11-rc3/include/linux/hugetlb.h
> > @@ -80,6 +80,24 @@ extern const unsigned long hugetlb_zero, hugetlb_infinity;
> > extern int sysctl_hugetlb_shm_group;
> > extern struct list_head huge_boot_pages;
> >
> > +#if USE_SPLIT_PTLOCKS_HUGETLB
> > +#define huge_pte_lockptr(mm, ptep) ({__pte_lockptr(virt_to_page(ptep)); })
> > +#else /* !USE_SPLIT_PTLOCKS_HUGETLB */
> > +#define huge_pte_lockptr(mm, ptep) ({&(mm)->page_table_lock; })
> > +#endif /* USE_SPLIT_PTLOCKS_HUGETLB */
> > +
> > +#define huge_pte_offset_lock(mm, address, ptlp) \
> > +({ \
> > + pte_t *__pte = huge_pte_offset(mm, address); \
> > + spinlock_t *__ptl = NULL; \
> > + if (__pte) { \
> > + __ptl = huge_pte_lockptr(mm, __pte); \
> > + *(ptlp) = __ptl; \
> > + spin_lock(__ptl); \
> > + } \
> > + __pte; \
> > +})
>
>
> Why not a static inline function?

I'm not sure how much difference the two forms make at runtime (they are
probably optimized to similar code?).
I just used the same pattern as pte_offset_map_lock(). I think this
shows that the function is a variant of pte_offset_map_lock().
But if we have a good reason to make it a static inline, I'm glad to do so.
Do you have any specific ideas?
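
For comparison, a static inline form of the same helper would be roughly
(a direct transliteration of the macro above, sketch only):

static inline pte_t *huge_pte_offset_lock(struct mm_struct *mm,
					  unsigned long address,
					  spinlock_t **ptlp)
{
	pte_t *pte = huge_pte_offset(mm, address);

	if (pte) {
		/* Pass the lock pointer back and take it, as the macro does. */
		*ptlp = huge_pte_lockptr(mm, pte);
		spin_lock(*ptlp);
	}
	return pte;
}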

>
> > +
> > /* arch callbacks */
> >
> > pte_t *huge_pte_alloc(struct mm_struct *mm,
> > @@ -164,6 +182,8 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
> > BUG();
> > }
> >
> > +#define huge_pte_lockptr(mm, ptep) 0
> > +
> > #endif /* !CONFIG_HUGETLB_PAGE */
> >
> > #define HUGETLB_ANON_FILE "anon_hugepage"
> > diff --git v3.11-rc3.orig/include/linux/mm_types.h v3.11-rc3/include/linux/mm_types.h
> > index fb425aa..cfb8c6f 100644
> > --- v3.11-rc3.orig/include/linux/mm_types.h
> > +++ v3.11-rc3/include/linux/mm_types.h
> > @@ -24,6 +24,8 @@
> > struct address_space;
> >
> > #define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
> > +#define USE_SPLIT_PTLOCKS_HUGETLB \
> > + (USE_SPLIT_PTLOCKS && !defined(CONFIG_PPC))
> >
>
> Is that a common pattern? Don't we generally use
> HAVE_ARCH_SPLIT_PTLOCKS in the arch config file?

Do you mean that HAVE_ARCH_SPLIT_PTLOCKS enables/disables the whole split
ptl mechanism (not only the split ptl for hugepages) depending on the arch?
If so, we can do this by adding a line 'default "999999" if PPC'.
(Note that if we do this, we lose the performance benefit of the
existing split ptl for normal pages, so I'm not sure it's acceptable
to users of that arch.)
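
That is, something like this in mm/Kconfig (a sketch; the existing
"999999" defaults for other configurations are elided):

config SPLIT_PTLOCK_CPUS
	int
	# ... existing "999999" defaults for other configurations ...
	default "999999" if PPC
	default "4"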

> Also, are we sure this is
> only an issue with PPC?

Not confirmed yet (so let me take some time to check this).
I think that the split ptl for hugepages should work on any architecture
where every entry pointing to a hugepage is stored in a 4kB page allocated
for the page table tree. So I'll check it.

Thanks,
Naoya Horiguchi