2018-01-23 10:56:25

by Konstantin Khlebnikov

Subject: [PATCH 1/4] vmalloc: add vm_flags argument to internal __vmalloc_node()

This allows setting VM_USERMAP in vmalloc_user() and vmalloc_32_user()
directly at allocation time and avoids the find_vm_area() call.
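
For context, VM_USERMAP is what later lets remap_vmalloc_range() map the area
into userspace (it rejects areas lacking the flag). A minimal, hypothetical
sketch of the usual caller pattern (illustrative only, not part of this patch;
example_buf/example_mmap/example_setup are made-up names):

	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/vmalloc.h>

	static void *example_buf;	/* hypothetical buffer shared with userspace */

	static int example_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/* remap_vmalloc_range() refuses areas that lack VM_USERMAP. */
		return remap_vmalloc_range(vma, example_buf, 0);
	}

	static int example_setup(void)
	{
		/* Zeroed, SHMLBA-aligned and, with this patch, VM_USERMAP from the start. */
		example_buf = vmalloc_user(16 * PAGE_SIZE);
		return example_buf ? 0 : -ENOMEM;
	}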

Signed-off-by: Konstantin Khlebnikov <[email protected]>
---
mm/vmalloc.c | 54 +++++++++++++++++++++---------------------------------
1 file changed, 21 insertions(+), 33 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 673942094328..cece3fb33cef 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1662,7 +1662,9 @@ EXPORT_SYMBOL(vmap);

static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
+ unsigned long vm_flags,
int node, const void *caller);
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
@@ -1681,7 +1683,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- PAGE_KERNEL, node, area->caller);
+ PAGE_KERNEL, 0, node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -1752,7 +1754,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
goto fail;

area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
- vm_flags, start, end, node, gfp_mask, caller);
+ vm_flags, start, end, node, gfp_mask, caller);
if (!area)
goto fail;

@@ -1783,6 +1785,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
* @align: desired alignment
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
@@ -1799,15 +1802,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
*/
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
+ unsigned long vm_flags,
int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, 0, node, caller);
+ gfp_mask, prot, vm_flags, node, caller);
}

void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
+ return __vmalloc_node(size, 1, gfp_mask, prot, 0, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
@@ -1815,15 +1819,15 @@ EXPORT_SYMBOL(__vmalloc);
static inline void *__vmalloc_node_flags(unsigned long size,
int node, gfp_t flags)
{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
- node, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL, 0, node,
+ __builtin_return_address(0));
}


void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
void *caller)
{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL, 0, node, caller);
}

/**
@@ -1868,18 +1872,9 @@ EXPORT_SYMBOL(vzalloc);
*/
void *vmalloc_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, SHMLBA,
- GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node(size, SHMLBA, GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user);

@@ -1896,8 +1891,8 @@ EXPORT_SYMBOL(vmalloc_user);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
- node, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, 0, node,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);

@@ -1938,7 +1933,7 @@ EXPORT_SYMBOL(vzalloc_node);

void *vmalloc_exec(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
+ return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
NUMA_NO_NODE, __builtin_return_address(0));
}

@@ -1959,7 +1954,7 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, 0,
NUMA_NO_NODE, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -1973,16 +1968,9 @@ EXPORT_SYMBOL(vmalloc_32);
*/
void *vmalloc_32_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO,
+ PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user);




2018-01-23 10:56:30

by Konstantin Khlebnikov

Subject: [PATCH 4/4] kernel/fork: add option to use virtually mapped stacks as fallback

Virtually mapped stacks have two bonuses: they consume only order-0 pages
and add a guard page at the end. But they are slightly slower if the system
has plenty of free high-order pages.

This patch adds an option to use a virtually mapped stack as a fallback when
the atomic allocation of a traditional high-order page fails.

Signed-off-by: Konstantin Khlebnikov <[email protected]>
---
arch/Kconfig | 14 ++++++++++++++
kernel/fork.c | 11 +++++++++++
2 files changed, 25 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 400b9e1b2f27..c181ab263e7f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -904,6 +904,20 @@ config VMAP_STACK
the stack to map directly to the KASAN shadow map using a formula
that is incorrect if the stack is in vmalloc space.

+config VMAP_STACK_AS_FALLBACK
+ default n
+ bool "Use a virtually-mapped stack as fallback for directly-mapped"
+ depends on VMAP_STACK
+ help
+ With this option the kernel first tries to allocate a directly-mapped
+ stack without entering direct memory reclaim, and falls back to a vmap stack.
+
+ Allocating a directly-mapped stack is faster than vmap when the system has
+ plenty of free memory, and much slower when memory is exhausted or fragmented.
+
+ This option weakens the stack overflow protection but gives the best
+ performance for the fork() and clone() syscalls.
+
config ARCH_OPTIONAL_KERNEL_RWX
def_bool n

diff --git a/kernel/fork.c b/kernel/fork.c
index 457c9151f3c8..cc61a083954d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -207,6 +207,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
struct vm_struct *stack;
int i;

+#ifdef CONFIG_VMAP_STACK_AS_FALLBACK
+ struct page *page;
+
+ page = alloc_pages_node(node, THREADINFO_GFP & ~__GFP_DIRECT_RECLAIM,
+ THREAD_SIZE_ORDER);
+ if (page) {
+ tsk->stack_vm_area = NULL;
+ return page_address(page);
+ }
+#endif
+
for (i = 0; i < NR_CACHED_STACKS; i++) {
stack = this_cpu_xchg(cached_stacks[i], NULL);
if (!stack)


2018-01-23 10:57:59

by Konstantin Khlebnikov

Subject: [PATCH 2/4] vmalloc: add __vmalloc_area()

This function is the same as __vmalloc_node_range() but returns a pointer to
the vm_struct rather than the virtual address.
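
A minimal sketch of the intended calling pattern (illustrative only; the
example_alloc() wrapper is made up, patch 3 converts the real user in
kernel/fork.c):

	static void *example_alloc(unsigned long size, int node)
	{
		struct vm_struct *area;

		area = __vmalloc_area(size, 1, VMALLOC_START, VMALLOC_END,
				      GFP_KERNEL, PAGE_KERNEL, 0, node,
				      __builtin_return_address(0));
		if (!area)
			return NULL;

		/* Descriptor and mapping are both at hand, no find_vm_area(). */
		return area->addr;
	}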

Signed-off-by: Konstantin Khlebnikov <[email protected]>
---
include/linux/vmalloc.h | 5 +++++
mm/vmalloc.c | 39 +++++++++++++++++++++++++++++++++++----
2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 1e5d8c392f15..f772346a506e 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -81,6 +81,11 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller);
+extern struct vm_struct *__vmalloc_area(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end,
+ gfp_t gfp_mask, pgprot_t prot,
+ unsigned long vm_flags, int node,
+ const void *caller);
#ifndef CONFIG_MMU
extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
static inline void *__vmalloc_node_flags_caller(unsigned long size, int node,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cece3fb33cef..ad962be74d53 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1725,7 +1725,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}

/**
- * __vmalloc_node_range - allocate virtually contiguous memory
+ * __vmalloc_area - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
* @start: vm area range start
@@ -1738,9 +1738,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * kernel virtual space, using a pagetable protection of @prot
+ *
+ * Returns the area descriptor on success or %NULL on failure.
*/
-void *__vmalloc_node_range(unsigned long size, unsigned long align,
+struct vm_struct *__vmalloc_area(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
@@ -1771,7 +1773,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,

kmemleak_vmalloc(area, size, gfp_mask);

- return addr;
+ return area;

fail:
warn_alloc(gfp_mask, NULL,
@@ -1780,6 +1782,35 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
}

/**
+ * __vmalloc_node_range - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ */
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, unsigned long vm_flags, int node,
+ const void *caller)
+{
+ struct vm_struct *area;
+
+ area = __vmalloc_area(size, align, start, end, gfp_mask,
+ prot, vm_flags, node, caller);
+
+ return area ? area->addr : NULL;
+}
+
+/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment


2018-01-23 10:58:58

by Konstantin Khlebnikov

Subject: [PATCH 3/4] kernel/fork: switch vmapped stack allocation to __vmalloc_area()

This gives us the vm_struct pointer without calling find_vm_area().

Also fix the comment about why the task holds a cache of the vm area: the
cache is used for retrieving the actual stack pages; freeing is done by
vfree_deferred().

Signed-off-by: Konstantin Khlebnikov <[email protected]>
---
kernel/fork.c | 37 +++++++++++++++----------------------
1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 2295fc69717f..457c9151f3c8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -204,39 +204,32 @@ static int free_vm_stack_cache(unsigned int cpu)
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
- void *stack;
+ struct vm_struct *stack;
int i;

for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *s;
-
- s = this_cpu_xchg(cached_stacks[i], NULL);
-
- if (!s)
+ stack = this_cpu_xchg(cached_stacks[i], NULL);
+ if (!stack)
continue;

#ifdef CONFIG_DEBUG_KMEMLEAK
/* Clear stale pointers from reused stack. */
- memset(s->addr, 0, THREAD_SIZE);
+ memset(stack->addr, 0, THREAD_SIZE);
#endif
- tsk->stack_vm_area = s;
- return s->addr;
+ tsk->stack_vm_area = stack;
+ return stack->addr;
}

- stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
- VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP,
- PAGE_KERNEL,
- 0, node, __builtin_return_address(0));
+ stack = __vmalloc_area(THREAD_SIZE, THREAD_ALIGN,
+ VMALLOC_START, VMALLOC_END,
+ THREADINFO_GFP, PAGE_KERNEL,
+ 0, node, __builtin_return_address(0));
+ if (unlikely(!stack))
+ return NULL;

- /*
- * We can't call find_vm_area() in interrupt context, and
- * free_thread_stack() can be called in interrupt context,
- * so cache the vm_struct.
- */
- if (stack)
- tsk->stack_vm_area = find_vm_area(stack);
- return stack;
+ /* Cache the vm_struct for stack to page conversions. */
+ tsk->stack_vm_area = stack;
+ return stack->addr;
#else
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);


2018-01-23 13:58:10

by Konstantin Khlebnikov

Subject: Re: [PATCH 3/4] kernel/fork: switch vmapped stack allocation to __vmalloc_area()

# stress-ng --clone 100 -t 10s --metrics-brief
on a 32-core machine shows a boost from 35000 to 36000 bogo ops

Patch 4/4 is a kind of RFC.
Actually, the per-cpu cache of preallocated stacks works faster than the buddy
allocator, so the performance boost shows up only at a completely insane rate of clones.

On 23.01.2018 13:55, Konstantin Khlebnikov wrote:
> This gives us the vm_struct pointer without calling find_vm_area().
>
> Also fix the comment about why the task holds a cache of the vm area: the
> cache is used for retrieving the actual stack pages; freeing is done by
> vfree_deferred().
>
> Signed-off-by: Konstantin Khlebnikov <[email protected]>
> ---
> kernel/fork.c | 37 +++++++++++++++----------------------
> 1 file changed, 15 insertions(+), 22 deletions(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 2295fc69717f..457c9151f3c8 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -204,39 +204,32 @@ static int free_vm_stack_cache(unsigned int cpu)
> static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> {
> #ifdef CONFIG_VMAP_STACK
> - void *stack;
> + struct vm_struct *stack;
> int i;
>
> for (i = 0; i < NR_CACHED_STACKS; i++) {
> - struct vm_struct *s;
> -
> - s = this_cpu_xchg(cached_stacks[i], NULL);
> -
> - if (!s)
> + stack = this_cpu_xchg(cached_stacks[i], NULL);
> + if (!stack)
> continue;
>
> #ifdef CONFIG_DEBUG_KMEMLEAK
> /* Clear stale pointers from reused stack. */
> - memset(s->addr, 0, THREAD_SIZE);
> + memset(stack->addr, 0, THREAD_SIZE);
> #endif
> - tsk->stack_vm_area = s;
> - return s->addr;
> + tsk->stack_vm_area = stack;
> + return stack->addr;
> }
>
> - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> - VMALLOC_START, VMALLOC_END,
> - THREADINFO_GFP,
> - PAGE_KERNEL,
> - 0, node, __builtin_return_address(0));
> + stack = __vmalloc_area(THREAD_SIZE, THREAD_ALIGN,
> + VMALLOC_START, VMALLOC_END,
> + THREADINFO_GFP, PAGE_KERNEL,
> + 0, node, __builtin_return_address(0));
> + if (unlikely(!stack))
> + return NULL;
>
> - /*
> - * We can't call find_vm_area() in interrupt context, and
> - * free_thread_stack() can be called in interrupt context,
> - * so cache the vm_struct.
> - */
> - if (stack)
> - tsk->stack_vm_area = find_vm_area(stack);
> - return stack;
> + /* Cache the vm_struct for stack to page conversions. */
> + tsk->stack_vm_area = stack;
> + return stack->addr;
> #else
> struct page *page = alloc_pages_node(node, THREADINFO_GFP,
> THREAD_SIZE_ORDER);
>

2018-02-21 00:17:30

by Andrew Morton

Subject: Re: [PATCH 3/4] kernel/fork: switch vmapped stack allocation to __vmalloc_area()

On Tue, 23 Jan 2018 16:57:21 +0300 Konstantin Khlebnikov <[email protected]> wrote:

> # stress-ng --clone 100 -t 10s --metrics-brief
> on a 32-core machine shows a boost from 35000 to 36000 bogo ops
>
> Patch 4/4 is a kind of RFC.
> Actually, the per-cpu cache of preallocated stacks works faster than the buddy
> allocator, so the performance boost shows up only at a completely insane rate of clones.
>

I'm not really sure what to make of this patchset. Is it useful in any
known real-world use cases?

> + This option weakens the stack overflow protection but gives the best
> + performance for the fork() and clone() syscalls.

That sounds problematic, but perhaps acceptable if the fallback only
happens rarely.

Can this code be folded into CONFIG_VMAP_STACK in some cleaner fashion?
We now have options for non-vmapped stacks, vmapped stacks and a mix
of both.

And what about this comment in arch/Kconfig:VMAP_STACK:

This is presently incompatible with KASAN because KASAN expects
the stack to map directly to the KASAN shadow map using a formula
that is incorrect if the stack is in vmalloc space.


So VMAP_STACK_AS_FALLBACK will intermittently break KASAN?


2018-02-21 07:56:42

by Konstantin Khlebnikov

Subject: Re: [PATCH 3/4] kernel/fork: switch vmapped stack allocation to __vmalloc_area()



On 21.02.2018 03:16, Andrew Morton wrote:
> On Tue, 23 Jan 2018 16:57:21 +0300 Konstantin Khlebnikov <[email protected]> wrote:
>
>> # stress-ng --clone 100 -t 10s --metrics-brief
>> on a 32-core machine shows a boost from 35000 to 36000 bogo ops
>>
>> Patch 4/4 is a kind of RFC.
>> Actually, the per-cpu cache of preallocated stacks works faster than the buddy
>> allocator, so the performance boost shows up only at a completely insane rate of clones.
>>
>
> I'm not really sure what to make of this patchset. Is it useful in any
> known real-world use cases?

Not yet. Feel free to ignore the last patch.

>
>> + This option weakens the stack overflow protection but gives the best
>> + performance for the fork() and clone() syscalls.
>
> That sounds problematic, but perhaps acceptable if the fallback only
> happens rarely.
>
>> Can this code be folded into CONFIG_VMAP_STACK in some cleaner fashion?
> We now have options for non-vmapped stacks, vmapped stacks and a mix
> of both.
>
> And what about this comment in arch/Kconfig:VMAP_STACK:
>
> This is presently incompatible with KASAN because KASAN expects
> the stack to map directly to the KASAN shadow map using a formula
> that is incorrect if the stack is in vmalloc space.
>
>
> So VMAP_STACK_AS_FALLBACK will intermittently break KASAN?
>

All of this (including CONFIG_VMAP_STACK) could be turned into a boot option.
I think this would be the best solution.

2018-02-21 15:43:16

by Matthew Wilcox

Subject: Use higher-order pages in vmalloc

On Tue, Jan 23, 2018 at 01:55:32PM +0300, Konstantin Khlebnikov wrote:
> Virtually mapped stacks have two bonuses: they consume only order-0 pages
> and add a guard page at the end. But they are slightly slower if the system
> has plenty of free high-order pages.
>
> This patch adds an option to use a virtually mapped stack as a fallback when
> the atomic allocation of a traditional high-order page fails.

This prompted me to write a patch I've been meaning to do for a while,
allocating large pages if they're available to satisfy vmalloc. I thought
it would save on touching multiple struct pages, but it turns out that
the checking code we currently have in the free_pages path requires you
to have initialised all of the tail pages (maybe we can make that code
conditional ...)

It does save the buddy allocator the trouble of breaking down higher-order
pages into order-0 pages, only to allocate them again immediately.

(um, i seem to have broken the patch while cleaning it up for submission.
since it probably won't be accepted anyway, I'm not going to try to debug it)

diff --git a/kernel/fork.c b/kernel/fork.c
index be8aa5b98666..2bc01071b6ae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -319,12 +319,12 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
if (vm) {
int i;

- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_zone_page_state(page_zone(vm->pages[i]),
+ for (i = 0; i < vm->nr_pages; i++) {
+ struct page *page = vm->pages[i];
+ unsigned int size = PAGE_SIZE << compound_order(page);
+ mod_zone_page_state(page_zone(page),
NR_KERNEL_STACK_KB,
- PAGE_SIZE / 1024 * account);
+ size / 1024 * account);
}

/* All stack pages belong to the same memcg. */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b728c98f49cd..4bfc29b21bc1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -134,6 +134,7 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
+ unsigned int i;
pte_t *pte;

/*
@@ -151,9 +152,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
return -EBUSY;
if (WARN_ON(!page))
return -ENOMEM;
- set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
+ for (i = 0; i < (1UL << compound_order(page)); i++) {
+ set_pte_at(&init_mm, addr, pte++,
+ mk_pte(page + i, prot));
+ addr += PAGE_SIZE;
+ }
(*nr)++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (addr != end);
return 0;
}

@@ -1530,14 +1535,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
debug_check_no_obj_freed(addr, get_vm_area_size(area));

if (deallocate_pages) {
- int i;
+ unsigned int i;

for (i = 0; i < area->nr_pages; i++) {
struct page *page = area->pages[i];

BUG_ON(!page);
__ClearPageVmalloc(page);
- __free_pages(page, 0);
+ __free_pages(page, compound_order(page));
}

kvfree(area->pages);
@@ -1696,11 +1701,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,

for (i = 0; i < area->nr_pages; i++) {
struct page *page;
-
- if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask);
- else
- page = alloc_pages_node(node, alloc_mask, 0);
+ unsigned int j = ilog2(area->nr_pages - i) + 1;
+
+ do {
+ j--;
+ if (node == NUMA_NO_NODE)
+ page = alloc_pages(alloc_mask, j);
+ else
+ page = alloc_pages_node(node, alloc_mask, j);
+ } while (!page && j);
+
+ if (j) {
+ area->nr_pages -= (1UL << j) - 1;
+ prep_compound_page(page, j);
+ }

if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -1719,8 +1733,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,

fail:
warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure, allocated %ld of %ld bytes",
- (area->nr_pages*PAGE_SIZE), area->size);
+ "vmalloc: allocation failure, allocated %ld of %ld bytes",
+ (nr_pages * PAGE_SIZE), get_vm_area_size(area));
vfree(area->addr);
return NULL;
}

2018-02-21 17:06:08

by Matthew Wilcox

Subject: Re: [PATCH 1/4] vmalloc: add vm_flags argument to internal __vmalloc_node()

On Tue, Jan 23, 2018 at 01:55:22PM +0300, Konstantin Khlebnikov wrote:
> This allows setting VM_USERMAP in vmalloc_user() and vmalloc_32_user()
> directly at allocation time and avoids the find_vm_area() call.

While reviewing this patch, I came across this infelicity ...

have I understood correctly?

diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index e13d911251e7..9060f80b4a41 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -631,11 +631,10 @@ int kasan_module_alloc(void *addr, size_t size)
ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
shadow_start + shadow_size,
GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+ PAGE_KERNEL, VM_NO_GUARD | VM_KASAN, NUMA_NO_NODE,
__builtin_return_address(0));

if (ret) {
- find_vm_area(addr)->flags |= VM_KASAN;
kmemleak_ignore(ret);
return 0;
}

2018-02-21 17:50:37

by Andrey Ryabinin

Subject: Re: [PATCH 1/4] vmalloc: add vm_flags argument to internal __vmalloc_node()



On 02/21/2018 03:24 PM, Matthew Wilcox wrote:
> On Tue, Jan 23, 2018 at 01:55:22PM +0300, Konstantin Khlebnikov wrote:
>> This allows setting VM_USERMAP in vmalloc_user() and vmalloc_32_user()
>> directly at allocation time and avoids the find_vm_area() call.
>
> While reviewing this patch, I came across this infelicity ...
>
> have I understood correctly?
>
> diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
> index e13d911251e7..9060f80b4a41 100644
> --- a/mm/kasan/kasan.c
> +++ b/mm/kasan/kasan.c
> @@ -631,11 +631,10 @@ int kasan_module_alloc(void *addr, size_t size)
> ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
> shadow_start + shadow_size,
> GFP_KERNEL | __GFP_ZERO,
> - PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
> + PAGE_KERNEL, VM_NO_GUARD | VM_KASAN, NUMA_NO_NODE,
> __builtin_return_address(0));
>
> if (ret) {
> - find_vm_area(addr)->flags |= VM_KASAN;

addr != ret.
Those are different vm areas.

> kmemleak_ignore(ret);
> return 0;
> }
>

2018-02-21 19:00:32

by Andy Lutomirski

Subject: Re: Use higher-order pages in vmalloc

On Wed, Feb 21, 2018 at 3:42 PM, Matthew Wilcox <[email protected]> wrote:
> On Tue, Jan 23, 2018 at 01:55:32PM +0300, Konstantin Khlebnikov wrote:
>> Virtually mapped stacks have two bonuses: they consume only order-0 pages
>> and add a guard page at the end. But they are slightly slower if the system
>> has plenty of free high-order pages.
>>
>> This patch adds an option to use a virtually mapped stack as a fallback when
>> the atomic allocation of a traditional high-order page fails.
>
> This prompted me to write a patch I've been meaning to do for a while,
> allocating large pages if they're available to satisfy vmalloc. I thought
> it would save on touching multiple struct pages, but it turns out that
> the checking code we currently have in the free_pages path requires you
> to have initialised all of the tail pages (maybe we can make that code
> conditional ...)
>
> It does save the buddy allocator the trouble of breaking down higher-order
> pages into order-0 pages, only to allocate them again immediately.
>
> (um, i seem to have broken the patch while cleaning it up for submission.
> since it probably won't be accepted anyway, I'm not going to try to debug it)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index be8aa5b98666..2bc01071b6ae 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -319,12 +319,12 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
> if (vm) {
> int i;
>
> - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
> -
> - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
> - mod_zone_page_state(page_zone(vm->pages[i]),
> + for (i = 0; i < vm->nr_pages; i++) {
> + struct page *page = vm->pages[i];
> + unsigned int size = PAGE_SIZE << compound_order(page);
> + mod_zone_page_state(page_zone(page),
> NR_KERNEL_STACK_KB,
> - PAGE_SIZE / 1024 * account);
> + size / 1024 * account);
> }
>
> /* All stack pages belong to the same memcg. */
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index b728c98f49cd..4bfc29b21bc1 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -134,6 +134,7 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
> static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
> unsigned long end, pgprot_t prot, struct page **pages, int *nr)
> {
> + unsigned int i;
> pte_t *pte;
>
> /*
> @@ -151,9 +152,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
> return -EBUSY;
> if (WARN_ON(!page))
> return -ENOMEM;
> - set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
> + for (i = 0; i < (1UL << compound_order(page)); i++) {
> + set_pte_at(&init_mm, addr, pte++,
> + mk_pte(page + i, prot));
> + addr += PAGE_SIZE;
> + }
> (*nr)++;
> - } while (pte++, addr += PAGE_SIZE, addr != end);
> + } while (addr != end);
> return 0;
> }
>
> @@ -1530,14 +1535,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
> debug_check_no_obj_freed(addr, get_vm_area_size(area));
>
> if (deallocate_pages) {
> - int i;
> + unsigned int i;
>
> for (i = 0; i < area->nr_pages; i++) {
> struct page *page = area->pages[i];
>
> BUG_ON(!page);
> __ClearPageVmalloc(page);
> - __free_pages(page, 0);
> + __free_pages(page, compound_order(page));
> }
>
> kvfree(area->pages);
> @@ -1696,11 +1701,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>
> for (i = 0; i < area->nr_pages; i++) {
> struct page *page;
> -
> - if (node == NUMA_NO_NODE)
> - page = alloc_page(alloc_mask);
> - else
> - page = alloc_pages_node(node, alloc_mask, 0);
> + unsigned int j = ilog2(area->nr_pages - i) + 1;
> +
> + do {
> + j--;
> + if (node == NUMA_NO_NODE)
> + page = alloc_pages(alloc_mask, j);
> + else
> + page = alloc_pages_node(node, alloc_mask, j);
> + } while (!page && j);
> +
> + if (j) {
> + area->nr_pages -= (1UL << j) - 1;

Is there any code that expects area->nr_pages to be the size of the
area in pages? I don't know of any such code.

> + prep_compound_page(page, j);
> + }
>
> if (unlikely(!page)) {
> /* Successfully allocated i pages, free them in __vunmap() */
> @@ -1719,8 +1733,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>
> fail:
> warn_alloc(gfp_mask, NULL,
> - "vmalloc: allocation failure, allocated %ld of %ld bytes",
> - (area->nr_pages*PAGE_SIZE), area->size);
> + "vmalloc: allocation failure, allocated %ld of %ld bytes",
> + (nr_pages * PAGE_SIZE), get_vm_area_size(area));
> vfree(area->addr);
> return NULL;
> }

2018-02-21 19:01:28

by Dave Hansen

Subject: Re: Use higher-order pages in vmalloc

On 02/21/2018 07:42 AM, Matthew Wilcox wrote:
> On Tue, Jan 23, 2018 at 01:55:32PM +0300, Konstantin Khlebnikov wrote:
>> Virtually mapped stacks have two bonuses: they consume only order-0 pages
>> and add a guard page at the end. But they are slightly slower if the system
>> has plenty of free high-order pages.
>>
>> This patch adds an option to use a virtually mapped stack as a fallback when
>> the atomic allocation of a traditional high-order page fails.
> This prompted me to write a patch I've been meaning to do for a while,
> allocating large pages if they're available to satisfy vmalloc. I thought
> it would save on touching multiple struct pages, but it turns out that
> the checking code we currently have in the free_pages path requires you
> to have initialised all of the tail pages (maybe we can make that code
> conditional ...)

What's the concept here? If we can use high-order pages for vmalloc() at
the moment, we *should* use them?

One of the coolest things about vmalloc() is that it can do large
allocations without consuming large (high-order) pages, so it has very
few side-effects compared to doing a bunch of order-0 allocations. This
patch seems to propose removing that cool thing. Even trying the
high-order allocation could kick off a bunch of reclaim and compaction
that was not there previously.

If you could take this and only _opportunistically_ allocate large pages,
it could be a more universal win. You could try to make sure that no
compaction or reclaim is done for the large allocation. Or, maybe you
only try it if there are *only* high-order pages in the allocator that
would have been broken down into order-0 *anyway*.

I'm not sure it's worth it, though. I don't see a lot of folks
complaining about vmalloc()'s speed or TLB impact.
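
For illustration (not from the thread), a hedged sketch of the opportunistic
approach described above: try a high-order page without triggering reclaim or
compaction, and fall back to order-0. It reuses the same ~__GFP_DIRECT_RECLAIM
trick that patch 4/4 applies to stacks; the helper name is made up:

	static struct page *alloc_order_opportunistic(int node, gfp_t gfp,
						      unsigned int order)
	{
		struct page *page;

		/* High-order attempt that gives up instead of reclaiming or compacting. */
		page = alloc_pages_node(node, (gfp | __GFP_NOWARN | __GFP_NORETRY) &
					~__GFP_DIRECT_RECLAIM, order);
		if (page)
			return page;

		/* Plain order-0 allocation as the universal fallback. */
		return alloc_pages_node(node, gfp, 0);
	}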

2018-02-21 19:05:10

by Matthew Wilcox

Subject: Re: Use higher-order pages in vmalloc

On Wed, Feb 21, 2018 at 04:11:16PM +0000, Andy Lutomirski wrote:
> On Wed, Feb 21, 2018 at 3:42 PM, Matthew Wilcox <[email protected]> wrote:
> > +++ b/kernel/fork.c
> > @@ -319,12 +319,12 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
> > if (vm) {
> > int i;
> >
> > - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
...
> > + if (j) {
> > + area->nr_pages -= (1UL << j) - 1;
>
> Is there any code that expects area->nr_pages to be the size of the
> area in pages? I don't know of any such code.

I found one and deleted it ;-)


2018-02-21 19:07:28

by Matthew Wilcox

Subject: Re: Use higher-order pages in vmalloc

On Wed, Feb 21, 2018 at 08:16:22AM -0800, Dave Hansen wrote:
> On 02/21/2018 07:42 AM, Matthew Wilcox wrote:
> > This prompted me to write a patch I've been meaning to do for a while,
> > allocating large pages if they're available to satisfy vmalloc. I thought
> > it would save on touching multiple struct pages, but it turns out that
> > the checking code we currently have in the free_pages path requires you
> > to have initialised all of the tail pages (maybe we can make that code
> > conditional ...)
>
> What's the concept here? If we can use high-order pages for vmalloc() at
> the moment, we *should* use them?

Right. It helps with fragmentation if we can keep higher-order
allocations together.

> One of the coolest things about vmalloc() is that it can do large
> allocations without consuming large (high-order) pages, so it has very
> few side-effects compared to doing a bunch of order-0 allocations. This
> patch seems to propose removing that cool thing. Even trying the
> high-order allocation could kick off a bunch of reclaim and compaction
> that was not there previously.

Yes, that's one of the debatable things. It'd be nice to have a GFP
flag that stopped after calling get_page_from_freelist() and didn't try
to do compaction or reclaim.

> If you could take this and only _opportunistically_ allocate large pages,
> it could be a more universal win. You could try to make sure that no
> compaction or reclaim is done for the large allocation. Or, maybe you
> only try it if there are *only* high-order pages in the allocator that
> would have been broken down into order-0 *anyway*.
>
> I'm not sure it's worth it, though. I don't see a lot of folks
> complaining about vmalloc()'s speed or TLB impact.

No, I'm not sure it's worth it either, although Konstantin's mail
suggesting improvements in fork speed were possible by avoiding vmalloc
reminded me that I'd been meaning to give this a try.

2018-02-21 19:50:30

by Andy Lutomirski

Subject: Re: [PATCH 3/4] kernel/fork: switch vmapped stack allocation to __vmalloc_area()

On Wed, Feb 21, 2018 at 7:23 AM, Konstantin Khlebnikov
<[email protected]> wrote:
>
>
> On 21.02.2018 03:16, Andrew Morton wrote:
>>
>> On Tue, 23 Jan 2018 16:57:21 +0300 Konstantin Khlebnikov
>> <[email protected]> wrote:
>>
>>> # stress-ng --clone 100 -t 10s --metrics-brief
>>> on a 32-core machine shows a boost from 35000 to 36000 bogo ops
>>>
>>> Patch 4/4 is a kind of RFC.
>>> Actually, the per-cpu cache of preallocated stacks works faster than the
>>> buddy allocator, so the performance boost shows up only at a completely
>>> insane rate of clones.
>>>
>>
>> I'm not really sure what to make of this patchset. Is it useful in any
>> known real-world use cases?
>
>
> Not yet. Feel free to ignore the last patch.
>
>>
>>> + This option weakens the stack overflow protection but gives the best
>>> + performance for the fork() and clone() syscalls.
>>
>>
>> That sounds problematic, but perhaps acceptable if the fallback only
>> happens rarely.
>>
>> Can this code be folded into CONFIG_VMAP_STACK in some cleaner fashion?
>> We now have options for non-vmapped stacks, vmapped stacks and a mix
>> of both.
>>
>> And what about this comment in arch/Kconfig:VMAP_STACK:
>>
>> This is presently incompatible with KASAN because KASAN expects
>> the stack to map directly to the KASAN shadow map using a
>> formula
>> that is incorrect if the stack is in vmalloc space.
>>
>>
>> So VMAP_STACK_AS_FALLBACK will intermittently break KASAN?
>>
>
> All of this (including CONFIG_VMAP_STACK) could be turned into a boot option.
> I think this would be the best solution.

Or someone could *fix* KASAN to work with stacks in the vmalloc area.

2018-02-22 07:00:41

by Michal Hocko

Subject: Re: Use higher-order pages in vmalloc

On Wed 21-02-18 09:01:29, Matthew Wilcox wrote:
> On Wed, Feb 21, 2018 at 08:16:22AM -0800, Dave Hansen wrote:
> > On 02/21/2018 07:42 AM, Matthew Wilcox wrote:
> > > This prompted me to write a patch I've been meaning to do for a while,
> > > allocating large pages if they're available to satisfy vmalloc. I thought
> > > it would save on touching multiple struct pages, but it turns out that
> > > the checking code we currently have in the free_pages path requires you
> > > to have initialised all of the tail pages (maybe we can make that code
> > > conditional ...)
> >
> > What's the concept here? If we can use high-order pages for vmalloc() at
> > the moment, we *should* use them?
>
> Right. It helps with fragmentation if we can keep higher-order
> allocations together.

Hmm, wouldn't it help if we made vmalloc pages migrateable instead? That
would help the compaction and get us to a lower fragmentation longterm
without playing tricks in the allocation path.

> > One of the coolest things about vmalloc() is that it can do large
> > allocations without consuming large (high-order) pages, so it has very
> > few side-effects compared to doing a bunch of order-0 allocations. This
> > patch seems to propose removing that cool thing. Even trying the
> > high-order allocation could kick off a bunch of reclaim and compaction
> > that was not there previously.
>
> Yes, that's one of the debatable things. It'd be nice to have a GFP
> flag that stopped after calling get_page_from_freelist() and didn't try
> to do compaction or reclaim.

GFP_NOWAIT, you mean?

> > If you could take this and only _opportunistically_ allocate large pages,
> > it could be a more universal win. You could try to make sure that no
> > compaction or reclaim is done for the large allocation. Or, maybe you
> > only try it if there are *only* high-order pages in the allocator that
> > would have been broken down into order-0 *anyway*.
> >
> > I'm not sure it's worth it, though. I don't see a lot of folks
> > complaining about vmalloc()'s speed or TLB impact.
>
> No, I'm not sure it's worth it either, although Konstantin's mail
> suggesting improvements in fork speed were possible by avoiding vmalloc
> reminded me that I'd been meaning to give this a try.

Maybe we should consider kvmalloc for the kernel stack?
--
Michal Hocko
SUSE Labs

2018-02-22 12:24:26

by Matthew Wilcox

Subject: Re: Use higher-order pages in vmalloc

On Thu, Feb 22, 2018 at 07:59:43AM +0100, Michal Hocko wrote:
> On Wed 21-02-18 09:01:29, Matthew Wilcox wrote:
> > Right. It helps with fragmentation if we can keep higher-order
> > allocations together.
>
> Hmm, wouldn't it help if we made vmalloc pages migrateable instead? That
> would help the compaction and get us to a lower fragmentation longterm
> without playing tricks in the allocation path.

I was wondering about that possibility. If we want to migrate a page
then we have to shoot down the PTE across all CPUs, copy the data to the
new page, and insert the new PTE. Copying 4kB doesn't take long; if you
have 12GB/s (current example on Wikipedia: dual-channel memory and one
DDR2-800 module per channel gives a theoretical bandwidth of 12.8GB/s)
then we should be able to copy a page in 666ns). So there's no problem
holding a spinlock for it.

But we can't handle a fault in vmalloc space today. It's handled in
arch-specific code, see vmalloc_fault() in arch/x86/mm/fault.c
If we're going to do this, it'll have to be something arches opt into
because I'm not taking on the job of fixing every architecture!

> Maybe we should consider kvmalloc for the kernel stack?

We'd lose the guard page, so it'd have to be something we let the
sysadmin decide to do.
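
For illustration only, a conceptual sketch (not from the thread, the helper
name is made up) of the three migration steps described above. The page-table
walk that finds the PTE and all locking are elided, and it deliberately
ignores the stack/double-fault hazards raised later in the thread:

	static void migrate_vmalloc_page(unsigned long addr, pte_t *pte,
					 struct page *old, struct page *new)
	{
		/* 1. Shoot down the old translation on all CPUs. */
		ptep_get_and_clear(&init_mm, addr, pte);
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);

		/* 2. Copy the contents (a few hundred ns for one 4kB page). */
		copy_page(page_address(new), page_address(old));

		/* 3. Insert the new translation. */
		set_pte_at(&init_mm, addr, pte, mk_pte(new, PAGE_KERNEL));
	}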

2018-02-22 13:38:10

by Michal Hocko

Subject: Re: Use higher-order pages in vmalloc

On Thu 22-02-18 04:22:54, Matthew Wilcox wrote:
> On Thu, Feb 22, 2018 at 07:59:43AM +0100, Michal Hocko wrote:
> > On Wed 21-02-18 09:01:29, Matthew Wilcox wrote:
> > > Right. It helps with fragmentation if we can keep higher-order
> > > allocations together.
> >
> > Hmm, wouldn't it help if we made vmalloc pages migrateable instead? That
> > would help the compaction and get us to a lower fragmentation longterm
> > without playing tricks in the allocation path.
>
> I was wondering about that possibility. If we want to migrate a page
> then we have to shoot down the PTE across all CPUs, copy the data to the
> new page, and insert the new PTE. Copying 4kB doesn't take long; if you
> have 12GB/s (current example on Wikipedia: dual-channel memory and one
> DDR2-800 module per channel gives a theoretical bandwidth of 12.8GB/s)
> then we should be able to copy a page in 666ns). So there's no problem
> holding a spinlock for it.
>
> But we can't handle a fault in vmalloc space today. It's handled in
> arch-specific code, see vmalloc_fault() in arch/x86/mm/fault.c
> If we're going to do this, it'll have to be something arches opt into
> because I'm not taking on the job of fixing every architecture!

yes.

> > Maybe we should consider kvmalloc for the kernel stack?
>
> We'd lose the guard page, so it'd have to be something we let the
> sysadmin decide to do.

ohh, right, I forgot about the guard page.
--
Michal Hocko
SUSE Labs

2018-02-22 19:04:20

by Andy Lutomirski

Subject: Re: Use higher-order pages in vmalloc

On Thu, Feb 22, 2018 at 1:36 PM, Michal Hocko <[email protected]> wrote:
> On Thu 22-02-18 04:22:54, Matthew Wilcox wrote:
>> On Thu, Feb 22, 2018 at 07:59:43AM +0100, Michal Hocko wrote:
>> > On Wed 21-02-18 09:01:29, Matthew Wilcox wrote:
>> > > Right. It helps with fragmentation if we can keep higher-order
>> > > allocations together.
>> >
>> > Hmm, wouldn't it help if we made vmalloc pages migrateable instead? That
>> > would help the compaction and get us to a lower fragmentation longterm
>> > without playing tricks in the allocation path.
>>
>> I was wondering about that possibility. If we want to migrate a page
>> then we have to shoot down the PTE across all CPUs, copy the data to the
>> new page, and insert the new PTE. Copying 4kB doesn't take long; if you
>> have 12GB/s (current example on Wikipedia: dual-channel memory and one
>> DDR2-800 module per channel gives a theoretical bandwidth of 12.8GB/s)
>> then we should be able to copy a page in 666ns). So there's no problem
>> holding a spinlock for it.
>>
>> But we can't handle a fault in vmalloc space today. It's handled in
>> arch-specific code, see vmalloc_fault() in arch/x86/mm/fault.c
>> If we're going to do this, it'll have to be something arches opt into
>> because I'm not taking on the job of fixing every architecture!
>
> yes.

On x86, if you shoot down the PTE for the current stack, you're dead.
vmalloc_fault() might not even be called. Instead we hit
do_double_fault(), and the manual warns extremely strongly against
trying to recover, and, in this case, I agree with the SDM. If you
actually want this to work, there needs to be a special IPI broadcast
to the task in question (with appropriate synchronization) that calls
magic arch code that does the switcheroo.

Didn't someone (Christoph?) have a patch to teach the page allocator
to give high-order allocations if available and otherwise fall back to
low order?

2018-02-22 19:20:18

by Dave Hansen

Subject: Re: Use higher-order pages in vmalloc

On 02/22/2018 11:01 AM, Andy Lutomirski wrote:
> On x86, if you shoot down the PTE for the current stack, you're dead.

*If* we were to go do this insanity for vmalloc()'d memory, we could
probably limit it to kswapd, and also make sure that kernel threads
don't get vmalloc()'d stacks or that we mark them in a way to say we
never muck with them.

2018-02-22 19:28:37

by Andy Lutomirski

Subject: Re: Use higher-order pages in vmalloc

On Thu, Feb 22, 2018 at 7:19 PM, Dave Hansen <[email protected]> wrote:
> On 02/22/2018 11:01 AM, Andy Lutomirski wrote:
>> On x86, if you shoot down the PTE for the current stack, you're dead.
>
> *If* we were to go do this insanity for vmalloc()'d memory, we could
> probably limit it to kswapd, and also make sure that kernel threads
> don't get vmalloc()'d stacks or that we mark them in a way to say we
> never muck with them.

How does that help? We need to make sure that the task whose stack
we're migrating is (a) not running and (b) is not being switched in or
out. And we have to make sure that there isn't some *other* mm that
has the task's stack in ASID's TLB space.

Maybe we take some lock so the task can't run, then flush the world,
then release the lock.

2018-02-22 19:38:10

by Dave Hansen

Subject: Re: Use higher-order pages in vmalloc

On 02/22/2018 11:27 AM, Andy Lutomirski wrote:
> On Thu, Feb 22, 2018 at 7:19 PM, Dave Hansen <[email protected]> wrote:
>> On 02/22/2018 11:01 AM, Andy Lutomirski wrote:
>>> On x86, if you shoot down the PTE for the current stack, you're dead.
>>
>> *If* we were to go do this insanity for vmalloc()'d memory, we could
>> probably limit it to kswapd, and also make sure that kernel threads
>> don't get vmalloc()'d stacks or that we mark them in a way to say we
>> never muck with them.
>
> How does that help? We need to make sure that the task whose stack
> we're migrating is (a) not running and (b) is not being switched in or
> out. And we have to make sure that there isn't some *other* mm that
> has the task's stack in ASID's TLB space.
>
> Maybe we take some lock so the task can't run, then flush the world,
> then release the lock.

Oh, I was thinking only of the case where you try to muck with your
*own* stack. But, I see what you are saying about doing it to another
task on another CPU that is actively using the stack.

I think what you're saying is that we do not want to handle faults that
are caused by %esp being unusable. Whatever we do, we've got to make
sure that no CPU has a stack in %esp that we are messing with.

2018-02-23 12:13:57

by Michal Hocko

Subject: Re: Use higher-order pages in vmalloc

On Thu 22-02-18 19:01:35, Andy Lutomirski wrote:
> On Thu, Feb 22, 2018 at 1:36 PM, Michal Hocko <[email protected]> wrote:
> > On Thu 22-02-18 04:22:54, Matthew Wilcox wrote:
> >> On Thu, Feb 22, 2018 at 07:59:43AM +0100, Michal Hocko wrote:
> >> > On Wed 21-02-18 09:01:29, Matthew Wilcox wrote:
> >> > > Right. It helps with fragmentation if we can keep higher-order
> >> > > allocations together.
> >> >
> >> > Hmm, wouldn't it help if we made vmalloc pages migrateable instead? That
> >> > would help the compaction and get us to a lower fragmentation longterm
> >> > without playing tricks in the allocation path.
> >>
> >> I was wondering about that possibility. If we want to migrate a page
> >> then we have to shoot down the PTE across all CPUs, copy the data to the
> >> new page, and insert the new PTE. Copying 4kB doesn't take long; if you
> >> have 12GB/s (current example on Wikipedia: dual-channel memory and one
> >> DDR2-800 module per channel gives a theoretical bandwidth of 12.8GB/s)
> >> then we should be able to copy a page in 666ns). So there's no problem
> >> holding a spinlock for it.
> >>
> >> But we can't handle a fault in vmalloc space today. It's handled in
> >> arch-specific code, see vmalloc_fault() in arch/x86/mm/fault.c
> >> If we're going to do this, it'll have to be something arches opt into
> >> because I'm not taking on the job of fixing every architecture!
> >
> > yes.
>
> On x86, if you shoot down the PTE for the current stack, you're dead.
> vmalloc_fault() might not even be called. Instead we hit
> do_double_fault(), and the manual warns extremely strongly against
> trying to recover, and, in this case, I agree with the SDM. If you
> actually want this to work, there needs to be a special IPI broadcast
> to the task in question (with appropriate synchronization) that calls
> magic arch code that does the switcheroo.

Why can't we use the pte swap entry trick for vmalloc migration as well?
I haven't explored this path at all, to be honest.

> Didn't someone (Christoph?) have a patch to teach the page allocator
> to give high-order allocations if available and otherwise fall back to
> low order?

Do you mean kvmalloc?

--
Michal Hocko
SUSE Labs

2018-03-01 18:17:56

by Eric Dumazet

Subject: Re: Use higher-order pages in vmalloc

On Fri, 2018-02-23 at 13:13 +0100, Michal Hocko wrote:
> On Thu 22-02-18 19:01:35, Andy Lutomirski wrote:
> > On Thu, Feb 22, 2018 at 1:36 PM, Michal Hocko <[email protected]> wrote:
> > > On Thu 22-02-18 04:22:54, Matthew Wilcox wrote:
> > > > On Thu, Feb 22, 2018 at 07:59:43AM +0100, Michal Hocko wrote:
> > > > > On Wed 21-02-18 09:01:29, Matthew Wilcox wrote:
> > > > > > Right. It helps with fragmentation if we can keep higher-order
> > > > > > allocations together.
> > > > >
> > > > > Hmm, wouldn't it help if we made vmalloc pages migrateable instead? That
> > > > > would help the compaction and get us to a lower fragmentation longterm
> > > > > without playing tricks in the allocation path.
> > > >
> > > > I was wondering about that possibility. If we want to migrate a page
> > > > then we have to shoot down the PTE across all CPUs, copy the data to the
> > > > new page, and insert the new PTE. Copying 4kB doesn't take long; if you
> > > > have 12GB/s (current example on Wikipedia: dual-channel memory and one
> > > > DDR2-800 module per channel gives a theoretical bandwidth of 12.8GB/s)
> > > > then we should be able to copy a page in 666ns). So there's no problem
> > > > holding a spinlock for it.
> > > >
> > > > But we can't handle a fault in vmalloc space today. It's handled in
> > > > arch-specific code, see vmalloc_fault() in arch/x86/mm/fault.c
> > > > If we're going to do this, it'll have to be something arches opt into
> > > > because I'm not taking on the job of fixing every architecture!
> > >
> > > yes.
> >
> > On x86, if you shoot down the PTE for the current stack, you're dead.
> > vmalloc_fault() might not even be called. Instead we hit
> > do_double_fault(), and the manual warns extremely strongly against
> > trying to recover, and, in this case, I agree with the SDM. If you
> > actually want this to work, there needs to be a special IPI broadcast
> > to the task in question (with appropriate synchronization) that calls
> > magic arch code that does the switcheroo.
>
> Why can't we use the pte swap entry trick for vmalloc migration as well?
> I haven't explored this path at all, to be honest.
>
> > Didn't someone (Christoph?) have a patch to teach the page allocator
> > to give high-order allocations if available and otherwise fall back to
> > low order?
>
> Do you mean kvmalloc?


I sent something last year but had not finished the patch series :/

https://marc.info/?l=linux-kernel&m=148233423610544&w=2