2008-06-05 10:01:33

by Nick Piggin

[permalink] [raw]
Subject: [patch 7/7] powerpc: lockless get_user_pages_fast

Implement lockless get_user_pages_fast for powerpc. Page table existence is
guaranteed with RCU, and speculative page references are used to take a
reference to the pages without having a prior existence guarantee on them.

Signed-off-by: Nick Piggin <[email protected]>
---
Index: linux-2.6/include/asm-powerpc/uaccess.h
===================================================================
--- linux-2.6.orig/include/asm-powerpc/uaccess.h
+++ linux-2.6/include/asm-powerpc/uaccess.h
@@ -493,6 +493,12 @@ static inline int strnlen_user(const cha

#define strlen_user(str) strnlen_user((str), 0x7ffffffe)

+#ifdef __powerpc64__
+#define __HAVE_ARCH_GET_USER_PAGES_FAST
+struct page;
+int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages);
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -244,7 +244,7 @@ static inline int put_page_testzero(stru
*/
static inline int get_page_unless_zero(struct page *page)
{
- VM_BUG_ON(PageTail(page));
+ VM_BUG_ON(PageCompound(page));
return atomic_inc_not_zero(&page->_count);
}

Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -142,6 +142,29 @@ static inline int page_cache_get_specula
return 1;
}

+/*
+ * Same as page_cache_get_speculative() above, but takes 'count'
+ * references in one go instead of a single one (the two could just be
+ * merged).  Returns 1 on success with the references held, 0 if the
+ * page's refcount was found to be zero (caller must fall back or retry).
+ */
+static inline int page_cache_add_speculative(struct page *page, int count)
+{
+ VM_BUG_ON(in_interrupt());
+
+#ifndef CONFIG_SMP
+# ifdef CONFIG_PREEMPT
+ /* The UP shortcut below relies on the caller having preemption off. */
+ VM_BUG_ON(!in_atomic());
+# endif
+ /*
+ * On UP nothing can free the page under us while we are atomic,
+ * so the count must already be non-zero and a plain add is safe.
+ */
+ VM_BUG_ON(page_count(page) == 0);
+ atomic_add(count, &page->_count);
+
+#else
+ /* SMP: only take the references if the count is not already zero. */
+ if (unlikely(!atomic_add_unless(&page->_count, count, 0)))
+ return 0;
+#endif
+ VM_BUG_ON(PageCompound(page) && page != compound_head(page));
+
+ return 1;
+}
+
static inline int page_freeze_refs(struct page *page, int count)
{
return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
Index: linux-2.6/arch/powerpc/mm/Makefile
===================================================================
--- linux-2.6.orig/arch/powerpc/mm/Makefile
+++ linux-2.6/arch/powerpc/mm/Makefile
@@ -6,7 +6,7 @@ ifeq ($(CONFIG_PPC64),y)
EXTRA_CFLAGS += -mno-minimal-toc
endif

-obj-y := fault.o mem.o \
+obj-y := fault.o mem.o gup.o \
init_$(CONFIG_WORD_SIZE).o \
pgtable_$(CONFIG_WORD_SIZE).o \
mmu_context_$(CONFIG_WORD_SIZE).o
Index: linux-2.6/arch/powerpc/mm/gup.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/powerpc/mm/gup.c
@@ -0,0 +1,230 @@
+/*
+ * Lockless get_user_pages_fast for powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+/*
+ * Pin the normal-sized pages covering [addr, end) under one pmd.
+ * Returns 1 if every page was pinned, 0 to make the caller fall back
+ * to the slow path (nothing pinned by this call is left behind).
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask, result;
+	pte_t *ptep;
+
+	/*
+	 * 'result' is the set of bits that must be set for the access:
+	 * present, user, and (for stores) writable.  _PAGE_SPECIAL is in
+	 * the mask but not in 'result', so special mappings (which have
+	 * no struct page to take a reference on) are rejected.
+	 */
+	result = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		result |= _PAGE_RW;
+	mask = result | _PAGE_SPECIAL;
+
+	ptep = pte_offset_kernel(&pmd, addr);
+	do {
+		pte_t pte = *ptep;
+		struct page *page;
+
+		if ((pte_val(pte) & mask) != result)
+			return 0;
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		if (!page_cache_get_speculative(page))
+			return 0;
+		/*
+		 * Re-check the pte after taking the reference: if it
+		 * changed under us, drop the ref and fall back.  Compare
+		 * with pte_val() rather than the pte_t objects directly,
+		 * so this also builds with STRICT_MM_TYPECHECKS, where
+		 * pte_t is a structure type.
+		 */
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(page);
+			return 0;
+		}
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+/*
+ * Pin the pages of one huge pte covering [*addr, end).  All subpages
+ * share the compound head page's refcount, so a single speculative add
+ * of 'refs' on the head pins every subpage recorded in pages[].
+ * Advances *addr; returns 1 on success, 0 to fall back to the slow
+ * path with nothing left pinned by this call.
+ */
+static noinline int gup_huge_pte(pte_t *ptep, unsigned long *addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	unsigned long pte_end;
+	struct page *head, *page;
+	pte_t pte;
+	int refs;
+
+	/* Do not walk past the end of this huge page. */
+	pte_end = (*addr + HPAGE_SIZE) & HPAGE_MASK;
+	if (pte_end < end)
+		end = pte_end;
+
+	pte = *ptep;
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((*addr & ~HPAGE_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (*addr += PAGE_SIZE, *addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		/*
+		 * The pte changed after we got our references: back out
+		 * only the entries recorded by this call and drop exactly
+		 * 'refs' references on the head page.  Putting the
+		 * (advanced) 'page' cursor, or looping *nr times, would
+		 * drop references belonging to earlier, still-valid
+		 * entries.  Tell the caller to take the slow path.
+		 */
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Walk the pmd level covering [addr, end) and hand each present pmd to
+ * gup_pte_range().  Returns 1 if the whole range was pinned, 0 to make
+ * the caller fall back to the slow path.
+ */
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset(&pud, addr);
+ do {
+ /*
+ * Snapshot the entry once: it can change under us, since
+ * disabled irqs only keep the tables from being freed.
+ */
+ pmd_t pmd = *pmdp;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd))
+ return 0;
+ if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ return 0;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+/*
+ * Walk the pud level covering [addr, end) and descend into each present
+ * pud via gup_pmd_range().  Returns 1 if the whole range was pinned,
+ * 0 to make the caller fall back to the slow path.
+ */
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset(&pgd, addr);
+ do {
+ /* Snapshot the entry once; it can change under us. */
+ pud_t pud = *pudp;
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ return 0;
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+/*
+ * Lockless replacement for get_user_pages() for the common case.
+ * Disabling interrupts prevents the page tables from being freed on
+ * powerpc, so we can walk them and take speculative references on the
+ * pages without mmap_sem.  Anything we cannot handle falls back to the
+ * regular, lock-taking get_user_pages().
+ *
+ * Returns the number of pages pinned (which may be fewer than
+ * requested); a negative errno is returned only if no pages were
+ * pinned at all.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+		struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	/*
+	 * Page-align 'start': the walkers terminate their loops with
+	 * 'addr != end' after PAGE_SIZE steps, which an unaligned start
+	 * would step over.
+	 */
+	start &= PAGE_MASK;
+	addr = start;
+	/*
+	 * Widen before shifting: 'nr_pages << PAGE_SHIFT' is an int
+	 * operation and could overflow for large requests.
+	 */
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, len)))
+		goto slow_irqon;
+
+	/* Cross a slice boundary?  Mixed page sizes need the slow path. */
+	if (unlikely(addr < SLICE_LOW_TOP && end >= SLICE_LOW_TOP))
+		goto slow_irqon;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on powerpc.
+	 *
+	 * So long as we atomically load page table pointers versus teardown,
+	 * we can follow the address down to the page and take a ref on it.
+	 */
+	local_irq_disable();
+
+	if (get_slice_psize(mm, addr) == mmu_huge_psize) {
+		pte_t *ptep;
+		unsigned long a = addr;
+
+		ptep = huge_pte_offset(mm, a);
+		do {
+			if (!gup_huge_pte(ptep, &a, end, write, pages, &nr))
+				goto slow;
+			ptep++;
+		} while (a != end);
+	} else {
+		pgdp = pgd_offset(mm, addr);
+		do {
+			/* Snapshot the entry once; it can change under us. */
+			pgd_t pgd = *pgdp;
+
+			next = pgd_addr_end(addr, end);
+			if (pgd_none(pgd))
+				goto slow;
+			if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+				goto slow;
+		} while (pgdp++, addr = next, addr != end);
+	}
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+
+slow:
+		local_irq_enable();
+slow_irqon:
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/*
+		 * Have to be a bit careful with return values: if we
+		 * already pinned pages, report those even when the slow
+		 * path failed outright.
+		 */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}

--


2008-06-09 08:32:25

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Thu, 05 Jun 2008 19:43:07 +1000 [email protected] wrote:

> Implement lockless get_user_pages_fast for powerpc. Page table existence is
> guaranteed with RCU, and speculative page references are used to take a
> reference to the pages without having a prior existence guarantee on them.
>

arch/powerpc/mm/gup.c: In function `get_user_pages_fast':
arch/powerpc/mm/gup.c:156: error: `SLICE_LOW_TOP' undeclared (first use in this function)
arch/powerpc/mm/gup.c:156: error: (Each undeclared identifier is reported only once
arch/powerpc/mm/gup.c:156: error: for each function it appears in.)
arch/powerpc/mm/gup.c:178: error: implicit declaration of function `get_slice_psize'
arch/powerpc/mm/gup.c:178: error: `mmu_huge_psize' undeclared (first use in this function)
arch/powerpc/mm/gup.c:182: error: implicit declaration of function `huge_pte_offset'
arch/powerpc/mm/gup.c:182: warning: assignment makes pointer from integer without a cast

with

http://userweb.kernel.org/~akpm/config-g5.txt

I don't immediately know why - adding asm/page.h to gup.c doesn't help.
I'm suspecting a recursive include problem somewhere.

I'll drop it, sorry - too much other stuff to fix over here.

2008-06-10 03:15:36

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Mon, Jun 09, 2008 at 01:32:04AM -0700, Andrew Morton wrote:
> On Thu, 05 Jun 2008 19:43:07 +1000 [email protected] wrote:
>
> > Implement lockless get_user_pages_fast for powerpc. Page table existence is
> > guaranteed with RCU, and speculative page references are used to take a
> > reference to the pages without having a prior existence guarantee on them.
> >
>
> arch/powerpc/mm/gup.c: In function `get_user_pages_fast':
> arch/powerpc/mm/gup.c:156: error: `SLICE_LOW_TOP' undeclared (first use in this function)
> arch/powerpc/mm/gup.c:156: error: (Each undeclared identifier is reported only once
> arch/powerpc/mm/gup.c:156: error: for each function it appears in.)
> arch/powerpc/mm/gup.c:178: error: implicit declaration of function `get_slice_psize'
> arch/powerpc/mm/gup.c:178: error: `mmu_huge_psize' undeclared (first use in this function)
> arch/powerpc/mm/gup.c:182: error: implicit declaration of function `huge_pte_offset'
> arch/powerpc/mm/gup.c:182: warning: assignment makes pointer from integer without a cast
>
> with
>
> http://userweb.kernel.org/~akpm/config-g5.txt
>
> I don't immediately know why - adding asm/page.h to gup.c doesn't help.
> I'm suspecting a recursive include problem somewhere.
>
> I'll drop it, sorry - too much other stuff to fix over here.

No problem. Likely a clash with the hugepage patches.

2008-06-10 19:00:59

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Thu, 5 Jun 2008, [email protected] wrote:

> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h
> +++ linux-2.6/include/linux/mm.h
> @@ -244,7 +244,7 @@ static inline int put_page_testzero(stru
> */
> static inline int get_page_unless_zero(struct page *page)
> {
> - VM_BUG_ON(PageTail(page));
> + VM_BUG_ON(PageCompound(page));
> return atomic_inc_not_zero(&page->_count);
> }

This is reversing the modification to make get_page_unless_zero() usable
with compound page heads. Will break the slab defrag patchset.

2008-06-11 03:18:32

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Tue, Jun 10, 2008 at 12:00:48PM -0700, Christoph Lameter wrote:
> On Thu, 5 Jun 2008, [email protected] wrote:
>
> > Index: linux-2.6/include/linux/mm.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/mm.h
> > +++ linux-2.6/include/linux/mm.h
> > @@ -244,7 +244,7 @@ static inline int put_page_testzero(stru
> > */
> > static inline int get_page_unless_zero(struct page *page)
> > {
> > - VM_BUG_ON(PageTail(page));
> > + VM_BUG_ON(PageCompound(page));
> > return atomic_inc_not_zero(&page->_count);
> > }
>
> This is reversing the modification to make get_page_unless_zero() usable
> with compound page heads. Will break the slab defrag patchset.

Is the slab defrag patchset in -mm? Because you ignored my comment about
this change that assertions should not be weakened until required by the
actual patchset. I wanted to have these assertions be as strong as
possible for the lockless pagecache patchset.

2008-06-11 04:41:57

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

And yes slab defrag is part of linux-next. So it would break.

2008-06-11 04:42:24

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Wed, 11 Jun 2008, Nick Piggin wrote:

> > This is reversing the modification to make get_page_unless_zero() usable
> > with compound page heads. Will break the slab defrag patchset.
>
> Is the slab defrag patchset in -mm? Because you ignored my comment about
> this change that assertions should not be weakened until required by the
> actual patchset. I wanted to have these assertions be as strong as
> possible for the lockless pagecache patchset.

So you are worried about accidentally using get_page_unless_zero on a
compound page? What would be wrong about that?


2008-06-11 04:47:26

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Tue, Jun 10, 2008 at 09:40:25PM -0700, Christoph Lameter wrote:
> On Wed, 11 Jun 2008, Nick Piggin wrote:
>
> > > This is reversing the modification to make get_page_unless_zero() usable
> > > with compound page heads. Will break the slab defrag patchset.
> >
> > Is the slab defrag patchset in -mm? Because you ignored my comment about
> > this change that assertions should not be weakened until required by the
> > actual patchset. I wanted to have these assertions be as strong as
> > possible for the lockless pagecache patchset.
>
> So you are worried about accidentally using get_page_unless_zero on a
> compound page? What would be wrong about that?

Unexpected. Compound pages should have no such races that require
get_page_unless_zero that we very carefully use in page reclaim.

If you don't actually know whether you have a reference to the
thing or not before trying to operate on it, then you're almost
definitely got refcount wrong. How does slab defrag use it?

2008-06-11 04:49:16

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Tue, Jun 10, 2008 at 09:41:33PM -0700, Christoph Lameter wrote:
> And yes slab defrag is part of linux-next. So it would break.

Can memory management patches go though mm/? I dislike the cowboy
method of merging things that some other subsystems have adopted :)

2008-06-11 06:06:42

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Wed, 11 Jun 2008 06:49:02 +0200 Nick Piggin <[email protected]> wrote:

> On Tue, Jun 10, 2008 at 09:41:33PM -0700, Christoph Lameter wrote:
> > And yes slab defrag is part of linux-next. So it would break.

No, slab defrag[*] isn't in linux-next.

y:/usr/src/25> diffstat patches/linux-next.patch| grep mm/slub.c
mm/slub.c | 4

That's two spelling fixes in comments.

I have git-pekka in -mm too. Here it is:

--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2765,6 +2765,7 @@ void kfree(const void *x)

page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
+ BUG_ON(!PageCompound(page));
put_page(page);
return;
}


> Can memory management patches go though mm/? I dislike the cowboy
> method of merging things that some other subsystems have adopted :)

I think I'd prefer that. I may be a bit slow, but we're shoving at
least 100 MM patches through each kernel release and I think I review
things more closely than others choose to. At least, I find problems
and I've seen some pretty wild acked-bys...


[*] It _isn't_ "slab defrag". Or at least, it wasn't last time I saw
it. It's "slub defrag". And IMO it is bad to be adding slub-only
features because afaik slub still isn't as fast as slab on some things
and so some people might want to run slab rather than slub. And
because if this the decision whether to retain slab or slub STILL
hasn't been made. Carrying both versions was supposed to be a
short-term transitional thing :(

2008-06-11 06:24:19

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Tue, Jun 10, 2008 at 11:06:22PM -0700, Andrew Morton wrote:
> On Wed, 11 Jun 2008 06:49:02 +0200 Nick Piggin <[email protected]> wrote:
>
> > Can memory management patches go though mm/? I dislike the cowboy
^^^
That should read -mm, of course.


> > method of merging things that some other subsystems have adopted :)
>
> I think I'd prefer that. I may be a bit slow, but we're shoving at
> least 100 MM patches through each kernel release and I think I review
> things more closely than others choose to. At least, I find problems
> and I've seen some pretty wild acked-bys...

I wouldn't say you're too slow. You're as close to mm and mm/fs
maintainer as we're likely to get and I think it would be much worse
to have things merged out-of-band. Even the more peripheral parts like
slab or hugetlb.

2008-06-11 06:51:11

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Wed, 11 Jun 2008 08:24:04 +0200 Nick Piggin <[email protected]> wrote:

> On Tue, Jun 10, 2008 at 11:06:22PM -0700, Andrew Morton wrote:
> > On Wed, 11 Jun 2008 06:49:02 +0200 Nick Piggin <[email protected]> wrote:
> >
> > > Can memory management patches go though mm/? I dislike the cowboy
> ^^^
> That should read -mm, of course.
>

-mm is looking awfully peripheral nowadays. I really need to get my
linux-next act together. Instead I'll be taking all next week off.
nyer nyer.

>
> > > method of merging things that some other subsystems have adopted :)
> >
> > I think I'd prefer that. I may be a bit slow, but we're shoving at
> > least 100 MM patches through each kernel release and I think I review
> > things more closely than others choose to. At least, I find problems
> > and I've seen some pretty wild acked-bys...
>
> I wouldn't say you're too slow. You're as close to mm and mm/fs
> maintainer as we're likely to get and I think it would be much worse
> to have things merged out-of-band. Even the more peripheral parts like
> slab or hugetlb.

Sigh. I feel guilty when spending time merging (for example)
random-usb-patches-in-case-greg-misses-them, but such is life. Some
help reviewing things would be nice.

A lot of my review is now of the "how the heck is anyone to understand
this in a year's time if I can't understand it now" variety, but I hope
that's useful...

2008-06-11 23:20:35

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 7/7] powerpc: lockless get_user_pages_fast

On Tue, 10 Jun 2008, Andrew Morton wrote:

> hasn't been made. Carrying both versions was supposed to be a
> short-term transitional thing :(

The whatever defrag patchset includes a patch to make SLAB
experimental. So one step further.