2014-10-17 04:38:23

by Aneesh Kumar K.V

Subject: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

Update the generic gup implementation with powerpc-specific details.
On powerpc, at the pmd level we can have a hugepte, a normal pmd
pointer, or a pointer to the hugepage directory.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
Changes from V1:
* Folded arm/arm64 related changes into the patch
* Dropped pgd_huge from generic header

arch/arm/include/asm/pgtable.h | 2 +
arch/arm64/include/asm/pgtable.h | 2 +
include/linux/mm.h | 26 +++++++++
mm/gup.c | 113 +++++++++++++++++++--------------------
4 files changed, 84 insertions(+), 59 deletions(-)

diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 90aa4583b308..46f81fbaa4a5 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -181,6 +181,8 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr) pgd_offset(&init_mm, addr)

+#define pgd_huge(pgd) (0)
+
#define pmd_none(pmd) (!pmd_val(pmd))
#define pmd_present(pmd) (pmd_val(pmd))

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index cefd3e825612..ed8f42497ac4 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -464,6 +464,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
extern pgd_t idmap_pg_dir[PTRS_PER_PGD];

+#define pgd_huge(pgd) (0)
+
/*
* Encode and decode a swap entry:
* bits 0-1: present (must be zero)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 02d11ee7f19d..f97732412cb4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1219,6 +1219,32 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
struct vm_area_struct **vmas);
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages);
+
+#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifndef is_hugepd
+/*
+ * Some architectures support hugepage directory format that is
+ * required to support different hugetlbfs sizes.
+ */
+typedef struct { unsigned long pd; } hugepd_t;
+#define is_hugepd(hugepd) (0)
+#define __hugepd(x) ((hugepd_t) { (x) })
+static inline int gup_hugepd(hugepd_t hugepd, unsigned long addr,
+ unsigned pdshift, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ return 0;
+}
+#else
+extern int gup_hugepd(hugepd_t hugepd, unsigned long addr,
+ unsigned pdshift, unsigned long end,
+ int write, struct page **pages, int *nr);
+#endif
+extern int gup_huge_pte(pte_t orig, pte_t *ptep, unsigned long addr,
+ unsigned long sz, unsigned long end, int write,
+ struct page **pages, int *nr);
+#endif
+
struct kvec;
int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
struct page **pages);
diff --git a/mm/gup.c b/mm/gup.c
index cd62c8c90d4a..13c560ef9ddf 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -786,65 +786,31 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */

-static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+int gup_huge_pte(pte_t orig, pte_t *ptep, unsigned long addr,
+ unsigned long sz, unsigned long end, int write,
+ struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
int refs;
+ unsigned long pte_end;
+ struct page *head, *page, *tail;

- if (write && !pmd_write(orig))
- return 0;
-
- refs = 0;
- head = pmd_page(orig);
- page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);

- if (!page_cache_add_speculative(head, refs)) {
- *nr -= refs;
+ if (write && !pte_write(orig))
return 0;
- }

- if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ if (!pte_present(orig))
return 0;
- }

- /*
- * Any tail pages need their mapcount reference taken before we
- * return. (This allows the THP code to bump their ref count when
- * they are split into base pages).
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
- return 1;
-}
-
-static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page, *tail;
- int refs;
+ pte_end = (addr + sz) & ~(sz-1);
+ if (pte_end < end)
+ end = pte_end;

- if (write && !pud_write(orig))
- return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(orig)));

refs = 0;
- head = pud_page(orig);
- page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ head = pte_page(orig);
+ page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
@@ -859,13 +825,18 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}

- if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+ if (unlikely(pte_val(orig) != pte_val(*ptep))) {
*nr -= refs;
while (refs--)
put_page(head);
return 0;
}

+ /*
+ * Any tail pages need their mapcount reference taken before we
+ * return. (This allows the THP code to bump their ref count when
+ * they are split into base pages).
+ */
while (refs--) {
if (PageTail(tail))
get_huge_page_tail(tail);
@@ -898,10 +869,19 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (pmd_numa(pmd))
return 0;

- if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
- pages, nr))
+ if (!gup_huge_pte(__pte(pmd_val(pmd)), (pte_t *)pmdp,
+ addr, PMD_SIZE, next,
+ write, pages, nr))
return 0;

+ } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
+ /*
+ * architecture have different format for hugetlbfs
+ * pmd format and THP pmd format
+ */
+ if (!gup_hugepd(__hugepd(pmd_val(pmd)), addr, PMD_SHIFT,
+ next, write, pages, nr))
+ return 0;
} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -909,22 +889,27 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
return 1;
}

-static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end,
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;

- pudp = pud_offset(pgdp, addr);
+ pudp = pud_offset(&pgd, addr);
do {
pud_t pud = ACCESS_ONCE(*pudp);

next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
- if (pud_huge(pud)) {
- if (!gup_huge_pud(pud, pudp, addr, next, write,
- pages, nr))
+ if (unlikely(pud_huge(pud))) {
+ if (!gup_huge_pte(__pte(pud_val(pud)), (pte_t *)pudp,
+ addr, PUD_SIZE, next,
+ write, pages, nr))
+ return 0;
+ } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
+ if (!gup_hugepd(__hugepd(pud_val(pud)), addr, PUD_SHIFT,
+ next, write, pages, nr))
return 0;
} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
return 0;
@@ -970,10 +955,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
+ pgd_t pgd = ACCESS_ONCE(*pgdp);
+
next = pgd_addr_end(addr, end);
- if (pgd_none(*pgdp))
+ if (pgd_none(pgd))
break;
- else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
+ if (unlikely(pgd_huge(pgd))) {
+ if (!gup_huge_pte(__pte(pgd_val(pgd)), (pte_t *)pgdp,
+ addr, PGDIR_SIZE, next,
+ write, pages, &nr))
+ break;
+ } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
+ if (!gup_hugepd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT,
+ next, write, pages, &nr))
+ break;
+ } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
@@ -1028,5 +1024,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,

return ret;
}
-
#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
--
1.9.1


2014-10-17 04:38:33

by Aneesh Kumar K.V

Subject: [PATCH V2 2/2] arch/powerpc: Switch to generic RCU get_user_pages_fast

This patch switches the powerpc arch to the generic RCU-based
gup implementation.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
Changes from V1:
* added pgd_huge definition back

arch/powerpc/Kconfig | 1 +
arch/powerpc/include/asm/hugetlb.h | 8 +-
arch/powerpc/include/asm/page.h | 3 +-
arch/powerpc/include/asm/pgtable-ppc64.h | 1 -
arch/powerpc/include/asm/pgtable.h | 5 -
arch/powerpc/mm/Makefile | 2 +-
arch/powerpc/mm/gup.c | 235 -------------------------------
arch/powerpc/mm/hugetlbpage.c | 27 ++--
8 files changed, 21 insertions(+), 261 deletions(-)
delete mode 100644 arch/powerpc/mm/gup.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 88eace4e28c3..7af887dc6aed 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -148,6 +148,7 @@ config PPC
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN
+ select HAVE_GENERIC_RCU_GUP

config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 623f2971ce0e..7855cce9c969 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -48,7 +48,7 @@ static inline unsigned int hugepd_shift(hugepd_t hpd)
#endif /* CONFIG_PPC_BOOK3S_64 */


-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
unsigned pdshift)
{
/*
@@ -58,9 +58,9 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
*/
unsigned long idx = 0;

- pte_t *dir = hugepd_page(*hpdp);
+ pte_t *dir = hugepd_page(hpd);
#ifndef CONFIG_PPC_FSL_BOOK3E
- idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
+ idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(hpd);
#endif

return dir + idx;
@@ -193,7 +193,7 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
}

#define hugepd_shift(x) 0
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
unsigned pdshift)
{
return 0;
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 26fe1ae15212..aeca81947dc6 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -379,12 +379,13 @@ static inline int hugepd_ok(hugepd_t hpd)
}
#endif

-#define is_hugepd(pdep) (hugepd_ok(*((hugepd_t *)(pdep))))
+#define is_hugepd(hpd) (hugepd_ok(hpd))
int pgd_huge(pgd_t pgd);
#else /* CONFIG_HUGETLB_PAGE */
#define is_hugepd(pdep) 0
#define pgd_huge(pgd) 0
#endif /* CONFIG_HUGETLB_PAGE */
+#define __hugepd(x) ((hugepd_t) { (x) })

struct page;
extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index ae153c40ab7c..29c36242cc6a 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -575,6 +575,5 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
*/
return true;
}
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 316f9a5da173..4a67c1ddb91b 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -274,11 +274,6 @@ extern void paging_init(void);
*/
extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);

-extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr);
-
-extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr);
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_large(pmd) 0
#define has_transparent_hugepage() 0
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 325e861616a1..438dcd3fd0d1 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,7 +6,7 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror

ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)

-obj-y := fault.o mem.o pgtable.o gup.o mmap.o \
+obj-y := fault.o mem.o pgtable.o mmap.o \
init_$(CONFIG_WORD_SIZE).o \
pgtable_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
deleted file mode 100644
index d8746684f606..000000000000
--- a/arch/powerpc/mm/gup.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Lockless get_user_pages_fast for powerpc
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- */
-#undef DEBUG
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/vmstat.h>
-#include <linux/pagemap.h>
-#include <linux/rwsem.h>
-#include <asm/pgtable.h>
-
-#ifdef __HAVE_ARCH_PTE_SPECIAL
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- unsigned long mask, result;
- pte_t *ptep;
-
- result = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- result |= _PAGE_RW;
- mask = result | _PAGE_SPECIAL;
-
- ptep = pte_offset_kernel(&pmd, addr);
- do {
- pte_t pte = ACCESS_ONCE(*ptep);
- struct page *page;
- /*
- * Similar to the PMD case, NUMA hinting must take slow path
- */
- if (pte_numa(pte))
- return 0;
-
- if ((pte_val(pte) & mask) != result)
- return 0;
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
- if (!page_cache_get_speculative(page))
- return 0;
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_page(page);
- return 0;
- }
- pages[*nr] = page;
- (*nr)++;
-
- } while (ptep++, addr += PAGE_SIZE, addr != end);
-
- return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pmd_t *pmdp;
-
- pmdp = pmd_offset(&pud, addr);
- do {
- pmd_t pmd = ACCESS_ONCE(*pmdp);
-
- next = pmd_addr_end(addr, end);
- /*
- * If we find a splitting transparent hugepage we
- * return zero. That will result in taking the slow
- * path which will call wait_split_huge_page()
- * if the pmd is still in splitting state
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
- return 0;
- if (pmd_huge(pmd) || pmd_large(pmd)) {
- /*
- * NUMA hinting faults need to be handled in the GUP
- * slowpath for accounting purposes and so that they
- * can be serialised against THP migration.
- */
- if (pmd_numa(pmd))
- return 0;
-
- if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
- write, pages, nr))
- return 0;
- } else if (is_hugepd(pmdp)) {
- if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
- addr, next, write, pages, nr))
- return 0;
- } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
- return 0;
- } while (pmdp++, addr = next, addr != end);
-
- return 1;
-}
-
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pud_t *pudp;
-
- pudp = pud_offset(&pgd, addr);
- do {
- pud_t pud = ACCESS_ONCE(*pudp);
-
- next = pud_addr_end(addr, end);
- if (pud_none(pud))
- return 0;
- if (pud_huge(pud)) {
- if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
- write, pages, nr))
- return 0;
- } else if (is_hugepd(pudp)) {
- if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
- addr, next, write, pages, nr))
- return 0;
- } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
- return 0;
- } while (pudp++, addr = next, addr != end);
-
- return 1;
-}
-
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- unsigned long flags;
- pgd_t *pgdp;
- int nr = 0;
-
- pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
-
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- start, len)))
- return 0;
-
- pr_devel(" aligned: %lx .. %lx\n", start, end);
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables from being freed on powerpc.
- *
- * So long as we atomically load page table pointers versus teardown,
- * we can follow the address down to the the page and take a ref on it.
- */
- local_irq_save(flags);
-
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = ACCESS_ONCE(*pgdp);
-
- pr_devel(" %016lx: normal pgd %p\n", addr,
- (void *)pgd_val(pgd));
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- break;
- if (pgd_huge(pgd)) {
- if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
- write, pages, &nr))
- break;
- } else if (is_hugepd(pgdp)) {
- if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
- addr, next, write, pages, &nr))
- break;
- } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
- break;
- } while (pgdp++, addr = next, addr != end);
-
- local_irq_restore(flags);
-
- return nr;
-}
-
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- int nr, ret;
-
- start &= PAGE_MASK;
- nr = __get_user_pages_fast(start, nr_pages, write, pages);
- ret = nr;
-
- if (nr < nr_pages) {
- pr_devel(" slow path ! nr = %d\n", nr);
-
- /* Try to get the remaining pages with get_user_pages */
- start += nr << PAGE_SHIFT;
- pages += nr;
-
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- nr_pages - nr, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
-
- /* Have to be a bit careful with return values */
- if (nr > 0) {
- if (ret < 0)
- ret = nr;
- else
- ret += nr;
- }
- }
-
- return ret;
-}
-
-#endif /* __HAVE_ARCH_PTE_SPECIAL */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7e70ae968e5f..b8d228ca50fd 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -230,7 +230,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
return NULL;

- return hugepte_offset(hpdp, addr, pdshift);
+ return hugepte_offset(*hpdp, addr, pdshift);
}

#else
@@ -270,7 +270,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
return NULL;

- return hugepte_offset(hpdp, addr, pdshift);
+ return hugepte_offset(*hpdp, addr, pdshift);
}
#endif

@@ -538,7 +538,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
do {
pmd = pmd_offset(pud, addr);
next = pmd_addr_end(addr, end);
- if (!is_hugepd(pmd)) {
+ if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
/*
* if it is not hugepd pointer, we should already find
* it cleared.
@@ -587,7 +587,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
do {
pud = pud_offset(pgd, addr);
next = pud_addr_end(addr, end);
- if (!is_hugepd(pud)) {
+ if (!is_hugepd(__hugepd(pud_val(*pud)))) {
if (pud_none_or_clear_bad(pud))
continue;
hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
@@ -653,7 +653,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
do {
next = pgd_addr_end(addr, end);
pgd = pgd_offset(tlb->mm, addr);
- if (!is_hugepd(pgd)) {
+ if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
if (pgd_none_or_clear_bad(pgd))
continue;
hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
@@ -713,18 +713,17 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
return (__boundary - 1 < end - 1) ? __boundary : end;
}

-int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
- unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+int gup_hugepd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
+ unsigned long end, int write, struct page **pages, int *nr)
{
pte_t *ptep;
- unsigned long sz = 1UL << hugepd_shift(*hugepd);
+ unsigned long sz = 1UL << hugepd_shift(hugepd);
unsigned long next;

ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
- if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+ if (!gup_huge_pte(*ptep, ptep, addr, sz, end, write, pages, nr))
return 0;
} while (ptep++, addr = next, addr != end);

@@ -961,7 +960,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
else if (pgd_huge(pgd)) {
ret_pte = (pte_t *) pgdp;
goto out;
- } else if (is_hugepd(&pgd))
+ } else if (is_hugepd(__hugepd(pgd_val(pgd))))
hpdp = (hugepd_t *)&pgd;
else {
/*
@@ -978,7 +977,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
else if (pud_huge(pud)) {
ret_pte = (pte_t *) pudp;
goto out;
- } else if (is_hugepd(&pud))
+ } else if (is_hugepd(__hugepd(pud_val(pud))))
hpdp = (hugepd_t *)&pud;
else {
pdshift = PMD_SHIFT;
@@ -999,7 +998,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
if (pmd_huge(pmd) || pmd_large(pmd)) {
ret_pte = (pte_t *) pmdp;
goto out;
- } else if (is_hugepd(&pmd))
+ } else if (is_hugepd(__hugepd(pmd_val(pmd))))
hpdp = (hugepd_t *)&pmd;
else
return pte_offset_kernel(&pmd, ea);
@@ -1008,7 +1007,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
if (!hpdp)
return NULL;

- ret_pte = hugepte_offset(hpdp, ea, pdshift);
+ ret_pte = hugepte_offset(*hpdp, ea, pdshift);
pdshift = hugepd_shift(*hpdp);
out:
if (shift)
--
1.9.1

2014-10-17 14:10:40

by Steve Capper

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

On Fri, Oct 17, 2014 at 10:08:06AM +0530, Aneesh Kumar K.V wrote:
> Update the generic gup implementation with powerpc-specific details.
> On powerpc, at the pmd level we can have a hugepte, a normal pmd
> pointer, or a pointer to the hugepage directory.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> Changes from V1:
> * Folded arm/arm64 related changes into the patch
> * Dropped pgd_huge from generic header
>
> arch/arm/include/asm/pgtable.h | 2 +
> arch/arm64/include/asm/pgtable.h | 2 +
> include/linux/mm.h | 26 +++++++++
> mm/gup.c | 113 +++++++++++++++++++--------------------
> 4 files changed, 84 insertions(+), 59 deletions(-)
>

Hi Aneesh,
Thanks for coding this up. I've tested this for arm (Arndale board) and
arm64 (Juno); it builds without any issues and passes my futex on THP
tail test.

Please add my:
Tested-by: Steve Capper <[email protected]>

As this patch progresses through -mm, the arm maintainer:
Russell King <[email protected]>

and arm64 maintainers:
Catalin Marinas <[email protected]>
Will Deacon <[email protected]>

should also be on CC.

Cheers,
--
Steve

2014-10-22 23:02:25

by Andrew Morton

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

On Fri, 17 Oct 2014 10:08:06 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:

> Update the generic gup implementation with powerpc-specific details.
> On powerpc, at the pmd level we can have a hugepte, a normal pmd
> pointer, or a pointer to the hugepage directory.
>
> ...
>
> --- a/arch/arm/include/asm/pgtable.h
> +++ b/arch/arm/include/asm/pgtable.h
> @@ -181,6 +181,8 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
> /* to find an entry in a kernel page-table-directory */
> #define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
>
> +#define pgd_huge(pgd) (0)
> +
> #define pmd_none(pmd) (!pmd_val(pmd))
> #define pmd_present(pmd) (pmd_val(pmd))
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index cefd3e825612..ed8f42497ac4 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -464,6 +464,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
> extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
> extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
>
> +#define pgd_huge(pgd) (0)
> +

So only arm, arm64 and powerpc implement CONFIG_HAVE_GENERIC_RCU_GUP
and only powerpc implements pgd_huge().

Could we get a bit of documentation in place for pgd_huge() so that
people who aren't familiar with powerpc can understand what's going on?

> /*
> * Encode and decode a swap entry:
> * bits 0-1: present (must be zero)
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 02d11ee7f19d..f97732412cb4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1219,6 +1219,32 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
> struct vm_area_struct **vmas);
> int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> struct page **pages);
> +
> +#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
> +#ifndef is_hugepd

And is_hugepd is a bit of a mystery. Let's get some description in
place for this as well? Why it exists, what its role is. Also,
specifically which arch header file is responsible for defining it.

It takes a hugepd_t argument, but hugepd_t is defined later in this
header file. This is weird because any preceding implementation of
is_hugepd() can't actually be implemented because it hasn't seen the
hugepd_t definition yet! So any is_hugepd() implementation is forced
to be a simple macro which punts to a C function which *has* seen the
hugepd_t definition. What a twisty maze.

It all seems messy, confusing and poorly documented. Can we clean this
up?

> +/*
> + * Some architectures support hugepage directory format that is
> + * required to support different hugetlbfs sizes.
> + */
> +typedef struct { unsigned long pd; } hugepd_t;
> +#define is_hugepd(hugepd) (0)
> +#define __hugepd(x) ((hugepd_t) { (x) })

What's this.

> +static inline int gup_hugepd(hugepd_t hugepd, unsigned long addr,
> + unsigned pdshift, unsigned long end,
> + int write, struct page **pages, int *nr)
> +{
> + return 0;
> +}
> +#else
> +extern int gup_hugepd(hugepd_t hugepd, unsigned long addr,
> + unsigned pdshift, unsigned long end,
> + int write, struct page **pages, int *nr);
> +#endif
> +extern int gup_huge_pte(pte_t orig, pte_t *ptep, unsigned long addr,
> + unsigned long sz, unsigned long end, int write,
> + struct page **pages, int *nr);
> +#endif
> +
>
> ...
>

2014-10-23 04:29:11

by Aneesh Kumar K.V

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

Andrew Morton <[email protected]> writes:

> On Fri, 17 Oct 2014 10:08:06 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:
>
>> Update the generic gup implementation with powerpc-specific details.
>> On powerpc, at the pmd level we can have a hugepte, a normal pmd
>> pointer, or a pointer to the hugepage directory.
>>
>> ...
>>
>> --- a/arch/arm/include/asm/pgtable.h
>> +++ b/arch/arm/include/asm/pgtable.h
>> @@ -181,6 +181,8 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
>> /* to find an entry in a kernel page-table-directory */
>> #define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
>>
>> +#define pgd_huge(pgd) (0)
>> +
>> #define pmd_none(pmd) (!pmd_val(pmd))
>> #define pmd_present(pmd) (pmd_val(pmd))
>>
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index cefd3e825612..ed8f42497ac4 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -464,6 +464,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
>> extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
>> extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
>>
>> +#define pgd_huge(pgd) (0)
>> +
>
> So only arm, arm64 and powerpc implement CONFIG_HAVE_GENERIC_RCU_GUP
> and only powerpc implements pgd_huge().
>
> Could we get a bit of documentation in place for pgd_huge() so that
> people who aren't familiar with powerpc can understand what's going
> on?

Will update

>
>> /*
>> * Encode and decode a swap entry:
>> * bits 0-1: present (must be zero)
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 02d11ee7f19d..f97732412cb4 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1219,6 +1219,32 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>> struct vm_area_struct **vmas);
>> int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>> struct page **pages);
>> +
>> +#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
>> +#ifndef is_hugepd
>
> And is_hugepd is a bit of a mystery. Let's get some description in
> place for this as well? Why it exists, what its role is. Also,
> specifically which arch header file is responsible for defining it.
>
> It takes a hugepd_t argument, but hugepd_t is defined later in this
> header file. This is weird because any preceding implementation of
> is_hugepd() can't actually be implemented because it hasn't seen the
> hugepd_t definition yet! So any is_hugepd() implementation is forced
> to be a simple macro which punts to a C function which *has* seen the
> hugepd_t definition. What a twisty maze.

An arch can definitely do:

#define is_hugepd is_hugepd
typedef struct { unsigned long pd; } hugepd_t;
static inline int is_hugepd(hugepd_t hpd)
{
	/* arch-specific check on hpd.pd goes here */
}

I wanted to make sure an arch can have its own definition of hugepd_t.

>
> It all seems messy, confusing and poorly documented. Can we clean this
> up?
>
>> +/*
>> + * Some architectures support hugepage directory format that is
>> + * required to support different hugetlbfs sizes.
>> + */
>> +typedef struct { unsigned long pd; } hugepd_t;
>> +#define is_hugepd(hugepd) (0)
>> +#define __hugepd(x) ((hugepd_t) { (x) })
>
> What's this.

A macro that is used to convert a value to type hugepd_t. We already
use that style for __pte() etc.
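
For reference, this is how the gup_pmd_range() hunk in the patch uses
it, wrapping the raw pmd value so that is_hugepd() takes a typed
argument, in the same style as __pte():

	} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
		if (!gup_hugepd(__hugepd(pmd_val(pmd)), addr, PMD_SHIFT,
				next, write, pages, nr))
			return 0;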

>
>> +static inline int gup_hugepd(hugepd_t hugepd, unsigned long addr,
>> + unsigned pdshift, unsigned long end,
>> + int write, struct page **pages, int *nr)
>> +{
>> + return 0;
>> +}
>> +#else
>> +extern int gup_hugepd(hugepd_t hugepd, unsigned long addr,
>> + unsigned pdshift, unsigned long end,
>> + int write, struct page **pages, int *nr);
>> +#endif
>> +extern int gup_huge_pte(pte_t orig, pte_t *ptep, unsigned long addr,
>> + unsigned long sz, unsigned long end, int write,
>> + struct page **pages, int *nr);
>> +#endif

-aneesh

2014-10-23 08:09:55

by Aneesh Kumar K.V

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

Andrew Morton <[email protected]> writes:

> On Fri, 17 Oct 2014 10:08:06 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:
>
>> Update the generic gup implementation with powerpc-specific details.
>> On powerpc, at the pmd level we can have a hugepte, a normal pmd
>> pointer, or a pointer to the hugepage directory.
>>
>> ...
>>
>> --- a/arch/arm/include/asm/pgtable.h
>> +++ b/arch/arm/include/asm/pgtable.h
>> @@ -181,6 +181,8 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
>> /* to find an entry in a kernel page-table-directory */
>> #define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
>>
>> +#define pgd_huge(pgd) (0)
>> +
>> #define pmd_none(pmd) (!pmd_val(pmd))
>> #define pmd_present(pmd) (pmd_val(pmd))
>>
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index cefd3e825612..ed8f42497ac4 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -464,6 +464,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
>> extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
>> extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
>>
>> +#define pgd_huge(pgd) (0)
>> +
>
> So only arm, arm64 and powerpc implement CONFIG_HAVE_GENERIC_RCU_GUP
> and only powerpc implements pgd_huge().
>
> Could we get a bit of documentation in place for pgd_huge() so that
> people who aren't familiar with powerpc can understand what's going
> on?


I ended up moving that to include/linux/hugetlb.h with the below
comments added. Let me know if the below is ok

/*
 * Hugepages at the page global directory. If an arch supports
 * hugepages at the pgd level, it needs to define this.
 */
#ifndef pgd_huge
#define pgd_huge(x)	0
#endif

#ifndef is_hugepd
/*
 * Some architectures require a hugepage directory format in order
 * to support multiple hugepage sizes. For example
 * a4fe3ce7699bfe1bd88f816b55d42d8fe1dac655 introduced the same
 * on powerpc. This allows for a more flexible hugepage pagetable
 * layout.
 */
typedef struct { unsigned long pd; } hugepd_t;
#define is_hugepd(hugepd) (0)
#define __hugepd(x) ((hugepd_t) { (x) })
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
			      unsigned pdshift, unsigned long end,
			      int write, struct page **pages, int *nr)
{
	return 0;
}
#else
extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
		       unsigned pdshift, unsigned long end,
		       int write, struct page **pages, int *nr);
#endif


-aneesh

2014-10-23 22:40:58

by David Miller

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory


Hey guys, was looking over the generic GUP while working on a sparc64
issue and I noticed that you guys do speculative page gets, and after
talking with Johannes Weiner (CC:'d) about this we don't see how it
could be necessary.

If interrupts are disabled during the page table scan (which they
are), no IPI tlb flushes can arrive. Therefore any removal from the
page tables is guarded by interrupts being re-enabled. And as a
result, page counts of pages we see in the page tables must always
have a count > 0.

x86 does direct atomic_add() on &page->_count because of this
invariant and I would rather see the generic version do this too.

2014-10-23 23:41:38

by Benjamin Herrenschmidt

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

On Thu, 2014-10-23 at 18:40 -0400, David Miller wrote:
> Hey guys, was looking over the generic GUP while working on a sparc64
> issue and I noticed that you guys do speculative page gets, and after
> talking with Johannes Weiner (CC:'d) about this we don't see how it
> could be necessary.
>
> If interrupts are disabled during the page table scan (which they
> are), no IPI tlb flushes can arrive. Therefore any removal from the
> page tables is guarded by interrupts being re-enabled. And as a
> result, page counts of pages we see in the page tables must always
> have a count > 0.
>
> x86 does direct atomic_add() on &page->_count because of this
> invariant and I would rather see the generic version do this too.

This is of course only true of archs who use IPIs for TLB flushes, so if
we are going down the path of not being speculative, powerpc would have
to go back to doing its own since our broadcast TLB flush means we
aren't protected (we are only protected vs. the page tables themselves
being freed since we do that via sched RCU).

AFAIK, ARM also broadcasts TLB flushes...

Another option would be to make the generic code use something defined
by the arch to decide whether to use speculative get or
not. I like the idea of keeping the bulk of that code generic...

Cheers,
Ben.


2014-10-24 03:55:18

by David Miller

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

From: Benjamin Herrenschmidt <[email protected]>
Date: Fri, 24 Oct 2014 10:40:35 +1100

> Another option would be to make the generic code use something defined
> by the arch to decide whether to use speculative get or
> not. I like the idea of keeping the bulk of that code generic...

Me too. We could have inlines that do either speculative or
non-speculative gets on the pages in some header file and hide
the ifdefs in there.
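
Something along these lines, say (an illustrative sketch only; the
config symbol is hypothetical, not an existing Kconfig option):

	/*
	 * Take a reference on a page found during the lockless walk.
	 * With IPI-based TLB flushing, gup_fast runs with IRQs off,
	 * which holds off the flush IPI and so keeps the page alive;
	 * a plain reference is then sufficient. With hardware
	 * broadcast flushes the page may be freed under us, so the
	 * speculative variant is needed.
	 */
	static inline int gup_get_page(struct page *page)
	{
	#ifdef CONFIG_ARCH_TLB_FLUSH_IS_IPI	/* hypothetical symbol */
		atomic_add(1, &page->_count);
		return 1;
	#else
		return get_page_unless_zero(page);
	#endif
	}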

2014-10-24 08:33:22

by Steve Capper

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

On 24 October 2014 00:40, Benjamin Herrenschmidt
<[email protected]> wrote:
> On Thu, 2014-10-23 at 18:40 -0400, David Miller wrote:
>> Hey guys, was looking over the generic GUP while working on a sparc64
>> issue and I noticed that you guys do speculative page gets, and after
>> talking with Johannes Weiner (CC:'d) about this we don't see how it
>> could be necessary.
>>
>> If interrupts are disabled during the page table scan (which they
>> are), no IPI tlb flushes can arrive. Therefore any removal from the
>> page tables is guarded by interrupts being re-enabled. And as a
>> result, page counts of pages we see in the page tables must always
>> have a count > 0.
>>
>> x86 does direct atomic_add() on &page->_count because of this
>> invariant and I would rather see the generic version do this too.
>
> This is of course only true of archs who use IPIs for TLB flushes, so if
> we are going down the path of not being speculative, powerpc would have
> to go back to doing its own since our broadcast TLB flush means we
> aren't protected (we are only protected vs. the page tables themselves
> being freed since we do that via sched RCU).
>
> AFAIK, ARM also broadcasts TLB flushes...

Indeed, for most ARM cores we have hardware TLB broadcasts, thus we
need the speculative path.

>
> Another option would be to make the generic code use something defined
> by the arch to decide whether to use speculative get or
> not. I like the idea of keeping the bulk of that code generic...

It would be nice to have the code generalised further.
In addition to the speculative/atomic helpers, the implementation would
need to be renamed from GENERIC_RCU_GUP to GENERIC_GUP.
The other noteworthy assumption made by the RCU GUP is that ptes can be
read atomically. For x86 this isn't true when running with 64-bit
ptes, so a helper would be needed.
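
Something like the following, say (a sketch; gup_get_pte is a made-up
name here):

	/*
	 * Read a pte for the lockless walk. Where a pte fits in one
	 * word, a single ACCESS_ONCE load is atomic; x86 PAE (64-bit
	 * ptes on a 32-bit kernel) would have to override this with a
	 * version that reads the two halves consistently.
	 */
	static inline pte_t gup_get_pte(pte_t *ptep)
	{
		return ACCESS_ONCE(*ptep);
	}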

Cheers,
--
Steve

2014-10-24 16:22:45

by James Bottomley

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

On Fri, 2014-10-24 at 10:40 +1100, Benjamin Herrenschmidt wrote:
> On Thu, 2014-10-23 at 18:40 -0400, David Miller wrote:
> > Hey guys, was looking over the generic GUP while working on a sparc64
> > issue and I noticed that you guys do speculative page gets, and after
> > talking with Johannes Weiner (CC:'d) about this we don't see how it
> > could be necessary.
> >
> > If interrupts are disabled during the page table scan (which they
> > are), no IPI tlb flushes can arrive. Therefore any removal from the
> > page tables is guarded by interrupts being re-enabled. And as a
> > result, page counts of pages we see in the page tables must always
> > have a count > 0.
> >
> > x86 does direct atomic_add() on &page->_count because of this
> > invariant and I would rather see the generic version do this too.
>
> This is of course only true of archs who use IPIs for TLB flushes, so if
> we are going down the path of not being speculative, powerpc would have
> to go back to doing its own since our broadcast TLB flush means we
> aren't protected (we are only protected vs. the page tables themselves
> being freed since we do that via sched RCU).
>
> AFAIK, ARM also broadcasts TLB flushes...

Parisc does this. As soon as one CPU issues a TLB purge, it's broadcast
to all the CPUs on the inter-CPU bus. The next instruction isn't
executed until they respond.

But this is only for our CPU TLB. There's no other external
consequence, so removal from the page tables isn't effected by this TLB
flush, therefore the theory on which Dave bases the change to
atomic_add() should work for us (of course, atomic_add is lock add
unlock on our CPU, so it's not going to be of much benefit).

James

> Another option would be to make the generic code use something defined
> by the arch to decide whether to use speculative get or
> not. I like the idea of keeping the bulk of that code generic...
>
> Cheers,
> Ben.

2014-10-25 21:00:52

by Aneesh Kumar K.V

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

David Miller <[email protected]> writes:

> Hey guys, was looking over the generic GUP while working on a sparc64
> issue and I noticed that you guys do speculative page gets, and after
> talking with Johannes Weiner (CC:'d) about this we don't see how it
> could be necessary.
>
> If interrupts are disabled during the page table scan (which they
> are), no IPI tlb flushes can arrive. Therefore any removal from the
> page tables is guarded by interrupts being re-enabled. And as a
> result, page counts of pages we see in the page tables must always
> have a count > 0.
>
> x86 does direct atomic_add() on &page->_count because of this
> invariant and I would rather see the generic version do this too.


But that won't work with RCU GUP. For example, on powerpc the tlb flush
doesn't involve an IPI, so we can essentially find a page count of 0.

-aneesh

2014-10-26 21:48:13

by Benjamin Herrenschmidt

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

On Fri, 2014-10-24 at 09:22 -0700, James Bottomley wrote:

> Parisc does this. As soon as one CPU issues a TLB purge, it's broadcast
> to all the CPUs on the inter-CPU bus. The next instruction isn't
> executed until they respond.
>
> But this is only for our CPU TLB. There's no other external
> consequence, so removal from the page tables isn't effected by this TLB
> flush, therefore the theory on which Dave bases the change to
> atomic_add() should work for us (of course, atomic_add is lock add
> unlock on our CPU, so it's not going to be of much benefit).

I'm not sure I follow you here.

Do you or do you not perform an IPI to do TLB flushes? If you don't
(for example because you have HW broadcast), then you need the
speculative get_page(). If you do (and can read a PTE atomically), you
can get away with atomic_add().

The reason is that if you remember how zap_pte_range works, we perform
the flush before we get rid of the page.

So if you're using IPIs for the flush, the fact that gup_fast has
interrupts disabled will delay the IPI response and thus effectively
prevent the pages from being actually freed, allowing us to simply do
the atomic_add() on x86.

But if we don't use IPIs because we have HW broadcast of TLB
invalidations, then we don't have that synchronization. atomic_add won't
work, we need get_page_speculative() because the page could be
concurrently being freed.

Cheers,
Ben.

> James
>
> > Another option would be to make the generic code use something defined
> > by the arch to decide whether to use speculative get or
> > not. I like the idea of keeping the bulk of that code generic...
> >
> > Cheers,
> > Ben.

2014-10-27 00:19:17

by Andrea Arcangeli

Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

Hello,

On Mon, Oct 27, 2014 at 07:50:41AM +1100, Benjamin Herrenschmidt wrote:
> On Fri, 2014-10-24 at 09:22 -0700, James Bottomley wrote:
>
> > Parisc does this. As soon as one CPU issues a TLB purge, it's broadcast
> > to all the CPUs on the inter-CPU bus. The next instruction isn't
> > executed until they respond.
> >
> > But this is only for our CPU TLB. There's no other external
> > consequence, so removal from the page tables isn't effected by this TLB
> > flush, therefore the theory on which Dave bases the change to
> > atomic_add() should work for us (of course, atomic_add is lock add
> > unlock on our CPU, so it's not going to be of much benefit).
>
> I'm not sure I follow you here.
>
> Do you or do you not perform an IPI to do TLB flushes? If you don't
> (for example because you have HW broadcast), then you need the
> speculative get_page(). If you do (and can read a PTE atomically), you
> can get away with atomic_add().
>
> The reason is that if you remember how zap_pte_range works, we perform
> the flush before we get rid of the page.
>
> So if you're using IPIs for the flush, the fact that gup_fast has
> interrupts disabled will delay the IPI response and thus effectively
> prevent the pages from being actually freed, allowing us to simply do
> the atomic_add() on x86.
>
> But if we don't use IPIs because we have HW broadcast of TLB
> invalidations, then we don't have that synchronization. atomic_add won't
> work, we need get_page_speculative() because the page could be
> concurrently being freed.

I looked at how this works more closely and I agree
get_page_unless_zero is always necessary if the TLB flush doesn't
always wait for IPIs to all CPUs where a gup_fast may be running.

To summarize, the pagetables are freed with RCU (the arch sets
HAVE_RCU_TABLE_FREE), and that allows us to walk them locklessly with RCU.

Once we can walk the pagetables locklessly with RCU, we can get to the
page locklessly, but the pages themselves can still be freed at any time
from under us (hence the need for get_page_unless_zero).

The additional trick gup_fast-RCU does is to recheck the pte after
elevating the page count with get_page_unless_zero. Rechecking the
pte/hugepmd to be sure it didn't change from under us is critical to
ensure get_page_unless_zero didn't run after the page was freed and
reallocated, which would otherwise lead to a security problem too
(i.e. it protects against get_page_unless_zero false positives).
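
Condensed, the sequence looks like this (the gup_huge_pte() in the
patch above does the equivalent, via page_cache_add_speculative()):

	page = pte_page(pte);
	if (!get_page_unless_zero(page))
		return 0;
	/* the entry may have changed while we took the reference */
	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		put_page(page);
		return 0;
	}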

The last bit required is to still disable irqs, like on x86, to
serialize against THP splits, combined with pmdp_splitting_flush always
delivering IPIs (pmdp_splitting_flush must wait for all gup_fast to
complete before proceeding to mangle the page struct of the compound
page).

Preventing the irq disable while taking a gup_fast pin using
compound_lock isn't as "easy" as it is to do for put_page. put_page
(non-compound) fastest path remains THP agnostic because
collapse_huge_page is inhibited by any existing gup pin, but here
we're exactly taking it, so we can't depend on it to already exist to
avoid the race with collapse_huge_page. It's not just split_huge_page
we need to protect against.

So while thinking through the above summary, I noticed this patch
misses an IPI in mm/huge_memory.c that must be delivered after the
pmdp_clear_flush below to be safe against collapse_huge_page, for the
same reasons it sends one within pmdp_splitting_flush. Without this
IPI, what can happen is that the GUP pin protection in
__collapse_huge_page_isolate races against gup_fast-RCU.

If gup_fast reads the pte on one CPU before pmdp_clear_flush, and on
the other CPU __collapse_huge_page_isolate succeeds, then gup_fast
could recheck the pte that hasn't been zapped yet by
__collapse_huge_page_copy. gup_fast would succeed because the pte
wasn't zapped yet, but then __collapse_huge_page_copy would run
replacing the pte with a transhuge pmd, making gup_fast return the old
page, while the process got the copy as part of the collapsed hugepage.

	/*
	 * After this gup_fast can't run anymore. This also removes
	   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> invariant broken by gup_fast-RCU
	 * any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address
	 * to avoid the risk of CPU bugs in that area.
	 */
	_pmd = pmdp_clear_flush(vma, address, pmd);
	spin_unlock(pmd_ptl);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	spin_lock(pte_ptl);
	isolated = __collapse_huge_page_isolate(vma, address, pte);
	spin_unlock(pte_ptl);

CPU0                                    CPU1
---------                               -------------
gup_fast-RCU
local_irq_disable()
pte = pte_offset_map(pmd, address)

                                        pmdp_clear_flush (not sending IPI -> bug)

                                        __collapse_huge_page_isolate -> succeeds

                                        (page_count != 1 gup-pin check of
                                         __collapse_huge_page_isolate
                                         didn't fire)

page = vm_normal_page(pte)
get_page_unless_zero() -> succeeds
recheck pte -> succeeds
local_irq_enable()
return page

                                        collapse_huge_page thought
                                        no gup_fast could run after
                                        pmdp_clear_flush returned

                                        __collapse_huge_page_copy (zap
                                        pte too late, gup_fast already
                                        returned on the other CPU)

                                        set_pmd_at(mm, address, pmd, _pmd);

                                        virtual memory backed by THP

gup_fast went out of sync with virtual memory

It could also be solved without an IPI, for example by adding a
failure path to __collapse_huge_page_copy and by adding a second
gup-pin check (page_count != 1) after pte_clear(vma->vm_mm, address,
_pte) (with a smp_mb() in between), returning a failure if the check
triggers. However, then we need to store the 512 pte pointers in a
temporary page to roll all of them back if we raced.
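
To make the alternative concrete, a hypothetical sketch (none of this
exists in mm/huge_memory.c; the function name, saved_pte and the bool
return are invented for illustration; saved_pte would point at the
temporary page of ptes mentioned above):

	static bool collapse_copy_and_recheck(struct vm_area_struct *vma,
					      struct page *hpage, pte_t *_pte,
					      unsigned long address,
					      pte_t *saved_pte)
	{
		int i;

		for (i = 0; i < HPAGE_PMD_NR;
		     i++, _pte++, address += PAGE_SIZE) {
			saved_pte[i] = *_pte;	/* keep for rollback */
			copy_user_highpage(hpage + i,
					   pte_page(saved_pte[i]),
					   address, vma);
			pte_clear(vma->vm_mm, address, _pte);
			smp_mb();	/* order pte_clear vs page_count */
			if (page_count(pte_page(saved_pte[i])) == 1)
				continue;
			/* a gup_fast pin raced in: restore cleared ptes */
			do {
				set_pte_at(vma->vm_mm, address, _pte,
					   saved_pte[i]);
				address -= PAGE_SIZE;
				_pte--;
			} while (i--);
			return false;	/* abandon this collapse */
		}
		return true;
	}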

Comments on what is preferable between the IPI and a gup-pin check
after zapping the pte in __collapse_huge_page_copy are welcome. If a
modification to __collapse_huge_page_copy is preferable, the temporary
pte allocation (for rollback in case the gup-pin check triggers)
should still be skipped on x86.

Thanks,
Andrea

2014-10-27 17:59:04

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

Andrea Arcangeli <[email protected]> writes:

> Hello,
>
> On Mon, Oct 27, 2014 at 07:50:41AM +1100, Benjamin Herrenschmidt wrote:
>> On Fri, 2014-10-24 at 09:22 -0700, James Bottomley wrote:
>>
>> > Parisc does this. As soon as one CPU issues a TLB purge, it's broadcast
>> > to all the CPUs on the inter-CPU bus. The next instruction isn't
>> > executed until they respond.
>> >
>> > But this is only for our CPU TLB. There's no other external
>> > consequence, so removal from the page tables isn't affected by this TLB
>> > flush, therefore the theory on which Dave bases the change to
>> > atomic_add() should work for us (of course, atomic_add is lock add
>> > unlock on our CPU, so it's not going to be of much benefit).
>>
>> I'm not sure I follow you here.
>>
>> Do you or do you not perform an IPI to do TLB flushes? If you don't
>> (for example because you have HW broadcast), then you need the
>> speculative get_page(). If you do (and can read a PTE atomically), you
>> can get away with atomic_add().
>>
>> The reason is that if you remember how zap_pte_range works, we perform
>> the flush before we get rid of the page.
>>
>> So if you're using IPIs for the flush, the fact that gup_fast has
>> interrupts disabled will delay the IPI response and thus effectively
>> prevent the pages from being actually freed, allowing us to simply do
>> the atomic_add() on x86.
>>
>> But if we don't use IPIs because we have HW broadcast of TLB
>> invalidations, then we don't have that synchronization. atomic_add won't
>> work; we need get_page_speculative() because the page could be
>> freed concurrently.
>
> I looked at how this works more closely and I agree that
> get_page_unless_zero is always necessary if the TLB flush doesn't
> always wait for IPIs to reach all CPUs where a gup_fast may be running.
>
> To summarize, the pagetables are freed with RCU (the arch sets
> HAVE_RCU_TABLE_FREE), which allows walking them locklessly under RCU.
>
> Once we can walk the pagetables locklessly under RCU, we can reach the
> page locklessly too, but the pages themselves can still be freed at any
> time from under us (hence the need for get_page_unless_zero).
>
> The additional trick gup_fast RCU does is to recheck the pte after
> elevating the page count with get_page_unless_zero. Rechecking the
> pte/hugepmd to be sure it didn't change from under us is critical to
> be sure get_page_unless_zero didn't run after the page was freed and
> reallocated, which would otherwise lead to a security problem as well
> (i.e. it protects against get_page_unless_zero false positives).
>
> The last bit required is to still disable irqs like on x86, to
> serialize against THP splits, combined with pmdp_splitting_flush always
> delivering IPIs (pmdp_splitting_flush must wait for all gup_fast
> walkers to complete before proceeding to mangle the page struct of the
> compound page).
>
> Avoiding the irq disable while taking a gup_fast pin by using
> compound_lock isn't as "easy" as it is for put_page. The fastest path
> of (non-compound) put_page remains THP-agnostic because
> collapse_huge_page is inhibited by any existing gup pin, but here
> we're the ones taking that pin, so we can't depend on it already
> existing to avoid the race with collapse_huge_page. It's not just
> split_huge_page we need to protect against.
>
> So while thinking through the above summary, I noticed this patch
> misses an IPI in mm/huge_memory.c that must be delivered after the
> pmdp_clear_flush below to be safe against collapse_huge_page, for the
> same reasons it sends one within pmdp_splitting_flush. Without this
> IPI, what can happen is that the GUP pin protection in
> __collapse_huge_page_isolate races against gup_fast-RCU.
>
> If gup_fast reads the pte on one CPU before pmdp_clear_flush, and on
> the other CPU __collapse_huge_page_isolate succeeds, then gup_fast
> could recheck the pte that hasn't been zapped yet by
> __collapse_huge_page_copy. gup_fast would succeed because the pte
> wasn't zapped yet, but then __collapse_huge_page_copy would run
> replacing the pte with a transhuge pmd, making gup_fast return the old
> page, while the process got the copy as part of the collapsed hugepage.
>
> 	/*
> 	 * After this gup_fast can't run anymore. This also removes
> 	   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> invariant broken by gup_fast-RCU
> 	 * any huge TLB entry from the CPU so we won't allow
> 	 * huge and small TLB entries for the same virtual address
> 	 * to avoid the risk of CPU bugs in that area.
> 	 */
> 	_pmd = pmdp_clear_flush(vma, address, pmd);
> 	spin_unlock(pmd_ptl);
> 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
>
> 	spin_lock(pte_ptl);
> 	isolated = __collapse_huge_page_isolate(vma, address, pte);
> 	spin_unlock(pte_ptl);


That is the transition from a pmd pointing to a PTE page to a hugepage,
right? On ppc64 we do the below. Though it's not there for the same
reason mentioned above (we did it to handle the hash insertion case),
it should take care of the gup case too, right?


pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
		       pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (pmd_trans_huge(*pmdp)) {
		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
	} else {
		/*
		 * khugepaged calls this for normal pmd
		 */
		pmd = *pmdp;
		pmd_clear(pmdp);
		/*
		 * Wait for all pending hash_page to finish. This is needed
		 * in case of subpage collapse. When we collapse normal pages
		 * to hugepage, we first clear the pmd, then invalidate all
		 * the PTE entries. The assumption here is that any low level
		 * page fault will see a none pmd and take the slow path that
		 * will wait on mmap_sem. But we could very well be in a
		 * hash_page with local ptep pointer value. Such a hash page
		 * can result in adding new HPTE entries for normal subpages.
		 * That means we could be modifying the page content as we
		 * copy them to a huge page. So wait for parallel hash_page
		 * to finish before invalidating HPTE entries. We can do this
		 * by sending an IPI to all the cpus and executing a dummy
		 * function there.
		 */
		kick_all_cpus_sync();
		...
		.....
	}
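
(For reference, kick_all_cpus_sync is exactly the dummy-function IPI
broadcast the comment describes; from memory, kernel/smp.c has
essentially:

	static void do_nothing(void *unused)
	{
	}

	void kick_all_cpus_sync(void)
	{
		/* make sure the change is visible before the kick */
		smp_mb();
		/* wait=1: returns only after every cpu ran the IPI */
		smp_call_function(do_nothing, NULL, 1);
	}
)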

>
> CPU0                                    CPU1
> ---------                               -------------
> gup_fast-RCU
> local_irq_disable()
> pte = pte_offset_map(pmd, address)
>
>                                         pmdp_clear_flush (not sending IPI -> bug)
>
>                                         __collapse_huge_page_isolate -> succeeds
>
>                                         (page_count != 1 gup-pin check of
>                                          __collapse_huge_page_isolate
>                                          didn't fire)
>
> page = vm_normal_page(pte)
> get_page_unless_zero() -> succeeds
> recheck pte -> succeeds
> local_irq_enable()
> return page
>
>                                         collapse_huge_page thought
>                                         no gup_fast could run after
>                                         pmdp_clear_flush returned
>
>                                         __collapse_huge_page_copy (zap
>                                         pte too late, gup_fast already
>                                         returned on the other CPU)
>
>                                         set_pmd_at(mm, address, pmd, _pmd);
>
>                                         virtual memory backed by THP
>
> gup_fast went out of sync with virtual memory
>
> It could also be solved without an IPI, for example by adding a
> failure path to __collapse_huge_page_copy and by adding a second
> gup-pin check (page_count != 1) after pte_clear(vma->vm_mm, address,
> _pte) (with a smp_mb() in between), returning a failure if the check
> triggers. However, then we need to store the 512 pte pointers in a
> temporary page to roll all of them back if we raced.
>
> Comments on what is preferable between the IPI and a gup-pin check
> after zapping the pte in __collapse_huge_page_copy are welcome. If a
> modification to __collapse_huge_page_copy is preferable, the temporary
> pte allocation (for rollback in case the gup-pin check triggers)
> should still be skipped on x86.

We already do an IPI for ppc64.

-aneesh

2014-10-27 18:41:46

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH V2 1/2] mm: Update generic gup implementation to handle hugepage directory

Hi Aneesh,

On Mon, Oct 27, 2014 at 11:28:41PM +0530, Aneesh Kumar K.V wrote:
> 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> 	if (pmd_trans_huge(*pmdp)) {
> 		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
> 	} else {

The only problematic path that needs the IPI is the one below, yes.

> 		/*
> 		 * khugepaged calls this for normal pmd
> 		 */
> 		pmd = *pmdp;
> 		pmd_clear(pmdp);
> 		/*
> 		 * Wait for all pending hash_page to finish. This is needed
> 		 * in case of subpage collapse. When we collapse normal pages
> 		 * to hugepage, we first clear the pmd, then invalidate all
> 		 * the PTE entries. The assumption here is that any low level
> 		 * page fault will see a none pmd and take the slow path that
> 		 * will wait on mmap_sem. But we could very well be in a
> 		 * hash_page with local ptep pointer value. Such a hash page
> 		 * can result in adding new HPTE entries for normal subpages.
> 		 * That means we could be modifying the page content as we
> 		 * copy them to a huge page. So wait for parallel hash_page
> 		 * to finish before invalidating HPTE entries. We can do this
> 		 * by sending an IPI to all the cpus and executing a dummy
> 		 * function there.
> 		 */
> 		kick_all_cpus_sync();
>
> We already do an IPI for ppc64.

Agreed, ppc64 is already covered.

sparc/arm seem to be using the generic pmdp_clear_flush implementation
instead, which just calls flush_tlb_range, so perhaps they aren't
covered.
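
For reference, the generic version (mm/pgtable-generic.c, quoted from
memory) is just:

	#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
	pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmdp)
	{
		pmd_t pmd;

		VM_BUG_ON(address & ~HPAGE_PMD_MASK);
		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
		/* may be a HW-broadcast invalidate, i.e. no IPI at all */
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
		return pmd;
	}
	#endif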

As above, the IPIs are only needed if the *pmd is not transhuge.

Thanks,
Andrea