Hi all,
this is my spin at sorting out the long lock hold times in
__purge_vmap_area_lazy. It is based on the patch Joel sent this
week. I don't have any good numbers for it, but it survived an
xfstests run on XFS which is a significant vmalloc user. The
changelogs could still be improved as well, but I'd rather get it
out quickly for feedback and testing.
Just inline it into the only caller.
Signed-off-by: Christoph Hellwig <[email protected]>
---
mm/vmalloc.c | 13 ++-----------
1 file changed, 2 insertions(+), 11 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9830514..8cedfa0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -697,22 +697,13 @@ static void free_vmap_area_noflush(struct vmap_area *va)
}
/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
- */
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
-{
- unmap_vmap_area(va);
- free_vmap_area_noflush(va);
-}
-
-/*
* Free and unmap a vmap area
*/
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- free_unmap_vmap_area_noflush(va);
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
}
static struct vmap_area *find_vmap_area(unsigned long addr)
--
2.1.4
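Put together, the surviving helper after this patch does the cache flush, the
unmap and the deferred free in one place; reconstructed from the hunk above it
reads roughly:

static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	unmap_vmap_area(va);
	free_vmap_area_noflush(va);
}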
From: Joel Fernandes <[email protected]>
Use cond_resched_lock to avoid holding the vmap_area_lock for a
potentially long time.
Signed-off-by: Joel Fernandes <[email protected]>
[hch: split from a larger patch by Joel, wrote the crappy changelog]
Signed-off-by: Christoph Hellwig <[email protected]>
---
mm/vmalloc.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6c7eb8d..98b19ea 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -628,7 +628,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
- int nr = 0;
+ bool do_free = false;
lockdep_assert_held(&vmap_purge_lock);
@@ -638,18 +638,22 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
start = va->va_start;
if (va->va_end > end)
end = va->va_end;
- nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
+ do_free = true;
}
- if (!nr)
+ if (!do_free)
return false;
- atomic_sub(nr, &vmap_lazy_nr);
flush_tlb_kernel_range(start, end);
spin_lock(&vmap_area_lock);
- llist_for_each_entry_safe(va, n_va, valist, purge_list)
+ llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+ int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+
__free_vmap_area(va);
+ atomic_sub(nr, &vmap_lazy_nr);
+ cond_resched_lock(&vmap_area_lock);
+ }
spin_unlock(&vmap_area_lock);
return true;
}
--
2.1.4
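For reference, cond_resched_lock() only drops and retakes the lock when a
reschedule is actually due or the lock is contended, so the common case in the
loop above stays a cheap check. A sketch of __cond_resched_lock() as it looked
around this time, condensed from kernel/sched/core.c and quoted from memory,
so details may differ:

int __cond_resched_lock(spinlock_t *lock)
{
	int resched = should_resched(PREEMPT_LOCK_OFFSET);
	int ret = 0;

	lockdep_assert_held(lock);

	if (spin_needbreak(lock) || resched) {
		spin_unlock(lock);
		if (resched)
			preempt_schedule_common();
		else
			cpu_relax();
		ret = 1;
		spin_lock(lock);
	}
	return ret;
}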
Just inline it into the only caller.
Signed-off-by: Christoph Hellwig <[email protected]>
---
mm/vmalloc.c | 21 ++++++++-------------
1 file changed, 8 insertions(+), 13 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8cedfa0..2af2921 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -717,16 +717,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
return va;
}
-static void free_unmap_vmap_area_addr(unsigned long addr)
-{
- struct vmap_area *va;
-
- va = find_vmap_area(addr);
- BUG_ON(!va);
- free_unmap_vmap_area(va);
-}
-
-
/*** Per cpu kva allocator ***/
/*
@@ -1090,6 +1080,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr = (unsigned long)mem;
+ struct vmap_area *va;
might_sleep();
BUG_ON(!addr);
@@ -1100,10 +1091,14 @@ void vm_unmap_ram(const void *mem, unsigned int count)
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
- if (likely(count <= VMAP_MAX_ALLOC))
+ if (likely(count <= VMAP_MAX_ALLOC)) {
vb_free(mem, size);
- else
- free_unmap_vmap_area_addr(addr);
+ return;
+ }
+
+ va = find_vmap_area(addr);
+ BUG_ON(!va);
+ free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
--
2.1.4
Signed-off-by: Christoph Hellwig <[email protected]>
---
mm/vmalloc.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2af2921..6c7eb8d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -606,7 +606,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
* by this lock, but we want to avoid concurrent calls for performance
* reasons and to make the pcpu_get_vm_areas more deterministic.
*/
-static DEFINE_SPINLOCK(vmap_purge_lock);
+static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
@@ -660,9 +660,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
*/
static void try_purge_vmap_area_lazy(void)
{
- if (spin_trylock(&vmap_purge_lock)) {
+ if (mutex_trylock(&vmap_purge_lock)) {
__purge_vmap_area_lazy(ULONG_MAX, 0);
- spin_unlock(&vmap_purge_lock);
+ mutex_unlock(&vmap_purge_lock);
}
}
@@ -671,10 +671,10 @@ static void try_purge_vmap_area_lazy(void)
*/
static void purge_vmap_area_lazy(void)
{
- spin_lock(&vmap_purge_lock);
+ mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
__purge_vmap_area_lazy(ULONG_MAX, 0);
- spin_unlock(&vmap_purge_lock);
+ mutex_unlock(&vmap_purge_lock);
}
/*
@@ -1063,11 +1063,11 @@ void vm_unmap_aliases(void)
rcu_read_unlock();
}
- spin_lock(&vmap_purge_lock);
+ mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
if (!__purge_vmap_area_lazy(start, end) && flush)
flush_tlb_kernel_range(start, end);
- spin_unlock(&vmap_purge_lock);
+ mutex_unlock(&vmap_purge_lock);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
--
2.1.4
This is how everyone seems to already use them, but let's make that
explicit.
Signed-off-by: Christoph Hellwig <[email protected]>
---
mm/vmalloc.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d045a10..9830514 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -365,7 +365,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
- might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+ might_sleep();
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
@@ -1056,6 +1056,8 @@ void vm_unmap_aliases(void)
if (unlikely(!vmap_initialized))
return;
+ might_sleep();
+
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
@@ -1098,6 +1100,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr = (unsigned long)mem;
+ might_sleep();
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
@@ -1445,6 +1448,8 @@ struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
+ might_sleep();
+
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
--
2.1.4
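For context, might_sleep() is a no-op apart from a voluntary preemption point
unless CONFIG_DEBUG_ATOMIC_SLEEP is enabled, in which case it also checks the
calling context. Roughly, from include/linux/kernel.h of this era (quoted from
memory):

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define might_sleep() \
	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
#else
# define might_sleep() do { might_resched(); } while (0)
#endif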
Move the purge_lock synchronization to the callers, move the call to
purge_fragmented_blocks_allcpus at the beginning of the function to
the callers that need it, move the force_flush behavior to the caller
that needs it, and pass start and end by value instead of by reference.
No change in behavior.
Signed-off-by: Christoph Hellwig <[email protected]>
---
mm/vmalloc.c | 80 ++++++++++++++++++++++++++----------------------------------
1 file changed, 35 insertions(+), 45 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f2481cb..d045a10 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -601,6 +601,13 @@ static unsigned long lazy_max_pages(void)
static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/*
+ * Serialize vmap purging. There is no actual critical section protected
+ * by this lock, but we want to avoid concurrent calls for performance
+ * reasons and to make the pcpu_get_vm_areas more deterministic.
+ */
+static DEFINE_SPINLOCK(vmap_purge_lock);
+
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
@@ -615,59 +622,36 @@ void set_iounmap_nonlazy(void)
/*
* Purges all lazily-freed vmap areas.
- *
- * If sync is 0 then don't purge if there is already a purge in progress.
- * If force_flush is 1, then flush kernel TLBs between *start and *end even
- * if we found no lazy vmap areas to unmap (callers can use this to optimise
- * their own TLB flushing).
- * Returns with *start = min(*start, lowest purged address)
- * *end = max(*end, highest purged address)
*/
-static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
- int sync, int force_flush)
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
- static DEFINE_SPINLOCK(purge_lock);
struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
int nr = 0;
- /*
- * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
- * should not expect such behaviour. This just simplifies locking for
- * the case that isn't actually used at the moment anyway.
- */
- if (!sync && !force_flush) {
- if (!spin_trylock(&purge_lock))
- return;
- } else
- spin_lock(&purge_lock);
-
- if (sync)
- purge_fragmented_blocks_allcpus();
+ lockdep_assert_held(&vmap_purge_lock);
valist = llist_del_all(&vmap_purge_list);
llist_for_each_entry(va, valist, purge_list) {
- if (va->va_start < *start)
- *start = va->va_start;
- if (va->va_end > *end)
- *end = va->va_end;
+ if (va->va_start < start)
+ start = va->va_start;
+ if (va->va_end > end)
+ end = va->va_end;
nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
}
- if (nr)
- atomic_sub(nr, &vmap_lazy_nr);
+ if (!nr)
+ return false;
- if (nr || force_flush)
- flush_tlb_kernel_range(*start, *end);
+ atomic_sub(nr, &vmap_lazy_nr);
+ flush_tlb_kernel_range(start, end);
- if (nr) {
- spin_lock(&vmap_area_lock);
- llist_for_each_entry_safe(va, n_va, valist, purge_list)
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
- }
- spin_unlock(&purge_lock);
+ spin_lock(&vmap_area_lock);
+ llist_for_each_entry_safe(va, n_va, valist, purge_list)
+ __free_vmap_area(va);
+ spin_unlock(&vmap_area_lock);
+ return true;
}
/*
@@ -676,9 +660,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
*/
static void try_purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 0, 0);
+ if (spin_trylock(&vmap_purge_lock)) {
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ spin_unlock(&vmap_purge_lock);
+ }
}
/*
@@ -686,9 +671,10 @@ static void try_purge_vmap_area_lazy(void)
*/
static void purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 1, 0);
+ spin_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ spin_unlock(&vmap_purge_lock);
}
/*
@@ -1094,7 +1080,11 @@ void vm_unmap_aliases(void)
rcu_read_unlock();
}
- __purge_vmap_area_lazy(&start, &end, 1, flush);
+ spin_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ if (!__purge_vmap_area_lazy(start, end) && flush)
+ flush_tlb_kernel_range(start, end);
+ spin_unlock(&vmap_purge_lock);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
--
2.1.4
On Tue, Oct 18, 2016 at 08:56:07AM +0200, Christoph Hellwig wrote:
> This is how everyone seems to already use them, but let's make that
> explicit.
mm/page_alloc.c: alloc_large_system_hash() is perhaps the exception to
the rule.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
On Tue, Oct 18, 2016 at 11:33:59AM +0100, Chris Wilson wrote:
> On Tue, Oct 18, 2016 at 08:56:07AM +0200, Christoph Hellwig wrote:
> > This is how everyone seems to already use them, but let's make that
> > explicit.
>
> mm/page_alloc.c: alloc_large_system_hash() is perhaps the exception to
> the rule.
While alloc_large_system_hash passes GFP_ATOMIC it is still called
from a context where it can sleep - I think it just abuses GFP_ATOMIC
so that it gets an "early" failure. GFP_ATOMIC isn't exactly a good
choice for that, as it dips into additional reserves; GFP_NOWAIT would
probably have been a better choice.
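For reference, the practical difference between the two boils down to the
reserve-access bits. From include/linux/gfp.h of roughly this vintage (quoted
from memory):

/* may dip into the emergency reserves, wakes kswapd */
#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
/* just "don't sleep", no access to the extra reserves */
#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)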
On Tue, 18 Oct 2016 08:56:05 +0200
Christoph Hellwig <[email protected]> wrote:
> Hi all,
>
> this is my spin at sorting out the long lock hold times in
> __purge_vmap_area_lazy. It is based on the patch Joel sent this
> week. I don't have any good numbers for it, but it survived an
> xfstests run on XFS which is a significant vmalloc user. The
> changelogs could still be improved as well, but I'd rather get it
> out quickly for feedback and testing.
All seems pretty good to me.
Thanks,
Nick
On Tue, 18 Oct 2016 08:56:05 +0200 Christoph Hellwig wrote:
> Hi all,
>
> this is my spin at sorting out the long lock hold times in
> __purge_vmap_area_lazy. It is based on the patch Joel sent this
> week. I don't have any good numbers for it, but it survived an
> xfstests run on XFS which is a significant vmalloc user. The
> changelogs could still be improved as well, but I'd rather get it
> out quickly for feedback and testing.
I just tested this series; the preempt-off ftrace log no longer complains
about __purge_vmap_area_lazy in my test case - that latency is gone!
So feel free to add
Tested-by: Jisheng Zhang <[email protected]>
On Tue, Oct 18, 2016 at 08:56:11AM +0200, Christoph Hellwig wrote:
> From: Joel Fernandes <[email protected]>
>
> Use cond_resched_lock to avoid holding the vmap_area_lock for a
> potentially long time.
>
> Signed-off-by: Joel Fernandes <[email protected]>
> [hch: split from a larger patch by Joel, wrote the crappy changelog]
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> mm/vmalloc.c | 14 +++++++++-----
> 1 file changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 6c7eb8d..98b19ea 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -628,7 +628,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
> struct llist_node *valist;
> struct vmap_area *va;
> struct vmap_area *n_va;
> - int nr = 0;
> + bool do_free = false;
>
> lockdep_assert_held(&vmap_purge_lock);
>
> @@ -638,18 +638,22 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
> start = va->va_start;
> if (va->va_end > end)
> end = va->va_end;
> - nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
> + do_free = true;
> }
>
> - if (!nr)
> + if (!do_free)
> return false;
>
> - atomic_sub(nr, &vmap_lazy_nr);
> flush_tlb_kernel_range(start, end);
>
> spin_lock(&vmap_area_lock);
> - llist_for_each_entry_safe(va, n_va, valist, purge_list)
> + llist_for_each_entry_safe(va, n_va, valist, purge_list) {
> + int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
> +
> __free_vmap_area(va);
> + atomic_sub(nr, &vmap_lazy_nr);
> + cond_resched_lock(&vmap_area_lock);
Is releasing the lock within a llist_for_each_entry_safe() actually safe? Is
vmap_area_lock the one to protect the valist?
That is, llist_for_each_entry_safe(va, n_va, valist, purge_list) does:

	for (va = llist_entry(valist, typeof(*va), purge_list);
	     &va->purge_list != NULL &&
		(n_va = llist_entry(va->purge_list.next, typeof(*n_va),
				    purge_list), true);
	     va = n_va)
Thus n_va is pointing to the next element to process when we release the
lock. Is it possible for another task to get into this same path and process
the item that n_va is pointing to? Then when the preempted task comes back,
grabs the vmap_area_lock, and then continues the loop with what n_va has,
could that cause problems? That is, the next iteration after releasing the
lock does va = n_va. What happens if n_va no longer exists?
I don't know this code that well, and perhaps vmap_area_lock is not protecting
the list and this is all fine.
-- Steve
> + }
> spin_unlock(&vmap_area_lock);
> return true;
> }
> --
> 2.1.4
On Tue, 18 Oct 2016 16:56:48 -0400
Steven Rostedt <[email protected]> wrote:
> Is releasing the lock within a llist_for_each_entry_safe() actually safe? Is
> vmap_area_lock the one to protect the valist?
>
> That is, llist_for_each_entry_safe(va, n_va, valist, purge_list) does:
>
>	for (va = llist_entry(valist, typeof(*va), purge_list);
>	     &va->purge_list != NULL &&
>		(n_va = llist_entry(va->purge_list.next, typeof(*n_va),
>				    purge_list), true);
>	     va = n_va)
>
> Thus n_va is pointing to the next element to process when we release the
> lock. Is it possible for another task to get into this same path and process
> the item that n_va is pointing to? Then when the preempted task comes back,
> grabs the vmap_area_lock, and then continues the loop with what n_va has,
> could that cause problems? That is, the next iteration after releasing the
> lock does va = n_va. What happens if n_va no longer exists?
>
> I don't know this code that well, and perhaps vmap_area_lock is not protecting
> the list and this is all fine.
>
Bah, nevermind. I missed the:
valist = llist_del_all(&vmap_purge_list);
so yeah, all should be good.
Nothing to see here, move along please.
-- Steve
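In other words, the loop only ever walks a privately detached list, so
dropping vmap_area_lock in the middle of the walk cannot expose valist or
n_va to another purger. Condensed from the patched __purge_vmap_area_lazy()
above:

	valist = llist_del_all(&vmap_purge_list);	/* atomically detach all */

	spin_lock(&vmap_area_lock);
	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
		__free_vmap_area(va);		/* needs vmap_area_lock */
		/* valist/n_va are private, so dropping the lock here is fine */
		cond_resched_lock(&vmap_area_lock);
	}
	spin_unlock(&vmap_area_lock);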
On Wed, Oct 19, 2016 at 12:15:41PM +0100, Chris Wilson wrote:
> On Tue, Oct 18, 2016 at 08:56:07AM +0200, Christoph Hellwig wrote:
> > This is how everyone seems to already use them, but let's make that
> > explicit.
>
> Ah, found an exception, vmapped stacks:
Oh, fun. So if we can't require vfree to be called from process context
we also can't use a mutex to wait for the vmap flushing. Given that we
free stacks from the scheduler context switch I also fear there is no
good way to get a sleepable context there.
The only other idea I had was to use vmap_area_lock for the protection
that purge_lock currently provides, but that would require some serious
refactoring to avoid recursive locking first.
On Tue, Oct 18, 2016 at 08:56:07AM +0200, Christoph Hellwig wrote:
> This is how everyone seems to already use them, but let's make that
> explicit.
Ah, found an exception, vmapped stacks:
[ 696.928541] BUG: sleeping function called from invalid context at mm/vmalloc.c:615
[ 696.928576] in_atomic(): 1, irqs_disabled(): 0, pid: 30521, name: bash
[ 696.928590] 1 lock held by bash/30521:
[ 696.928600] #0: [ 696.928606] (vmap_area_lock[ 696.928619] ){+.+...}, at: [ 696.928640] [<ffffffff8115f0cf>] __purge_vmap_area_lazy+0x30f/0x370
[ 696.928656] CPU: 0 PID: 30521 Comm: bash Tainted: G W 4.9.0-rc1+ #124
[ 696.928672] Hardware name: / , BIOS PYBSWCEL.86A.0027.2015.0507.1758 05/07/2015
[ 696.928690] ffffc900070f7c70 ffffffff812be1f5 ffff8802750b6680 ffffffff819650a6
[ 696.928717] ffffc900070f7c98 ffffffff810a3216 0000000000004001 ffff8802726e16c0
[ 696.928743] ffff8802726e19a0 ffffc900070f7d08 ffffffff8115f0f3 ffff8802750b6680
[ 696.928768] Call Trace:
[ 696.928782] [<ffffffff812be1f5>] dump_stack+0x68/0x93
[ 696.928796] [<ffffffff810a3216>] ___might_sleep+0x166/0x220
[ 696.928809] [<ffffffff8115f0f3>] __purge_vmap_area_lazy+0x333/0x370
[ 696.928823] [<ffffffff8115ea68>] ? vunmap_page_range+0x1e8/0x350
[ 696.928837] [<ffffffff8115f1b3>] free_vmap_area_noflush+0x83/0x90
[ 696.928850] [<ffffffff81160931>] remove_vm_area+0x71/0xb0
[ 696.928863] [<ffffffff81160999>] __vunmap+0x29/0xf0
[ 696.928875] [<ffffffff81160ab9>] vfree+0x29/0x70
[ 696.928888] [<ffffffff81071746>] put_task_stack+0x76/0x120
[ 696.928901] [<ffffffff8109a943>] finish_task_switch+0x163/0x1e0
[ 696.928914] [<ffffffff8109a845>] ? finish_task_switch+0x65/0x1e0
[ 696.928928] [<ffffffff816125f5>] __schedule+0x1f5/0x7c0
[ 696.928940] [<ffffffff81612c28>] schedule+0x38/0x90
[ 696.928953] [<ffffffff810787b1>] do_wait+0x1d1/0x200
[ 696.928966] [<ffffffff810799b1>] SyS_wait4+0x61/0xc0
[ 696.928979] [<ffffffff81076e50>] ? task_stopped_code+0x50/0x50
[ 696.928992] [<ffffffff81618e6e>] entry_SYSCALL_64_fastpath+0x1c/0xb1
[This was triggered by earlier patch to remove the serialisation and add
cond_resched_lock(&vmap_area_lock)]
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
On Wed, Oct 19, 2016 at 6:05 AM, Christoph Hellwig <[email protected]> wrote:
> On Wed, Oct 19, 2016 at 12:15:41PM +0100, Chris Wilson wrote:
>> On Tue, Oct 18, 2016 at 08:56:07AM +0200, Christoph Hellwig wrote:
>> > This is how everyone seems to already use them, but let's make that
>> > explicit.
>>
>> Ah, found an exception, vmapped stacks:
>
> Oh, fun. So if we can't require vfree to be called from process context
> we also can't use a mutex to wait for the vmap flushing. Given that we
> free stacks from the scheduler context switch I also fear there is no
> good way to get a sleepable context there.
>
> The only other idea I had was to use vmap_area_lock for the protection
> that purge_lock currently provides, but that would require some serious
> refactoring to avoid recursive locking first.
It would be quite awkward for a task stack to get freed from a
sleepable context, because the obvious sleepable context is the task
itself, and it still needs its stack. This was true even in the old
regime when task stacks were freed from RCU context.
But vfree has a magic automatic deferral mechanism. Couldn't you make
the non-deferred case might_sleep()?
--
Andy Lutomirski
AMA Capital Management, LLC
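Concretely, the suggestion amounts to annotating the non-deferred branch of
vfree(). A sketch of the idea only, not a patch from this series - the
deferral branch is the existing vfree() body (partly quoted in the follow-up
below, the rest from memory), and the might_sleep_if() line is the only
addition:

void vfree(const void *addr)
{
	BUG_ON(in_nmi());

	kmemleak_free(addr);

	/* only the non-deferred path is allowed to sleep */
	might_sleep_if(!in_interrupt());

	if (!addr)
		return;
	if (unlikely(in_interrupt())) {
		struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);

		if (llist_add((struct llist_node *)addr, &p->list))
			schedule_work(&p->wq);
	} else
		__vunmap(addr, 1);
}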
On Wed, Oct 19, 2016 at 08:34:40AM -0700, Andy Lutomirski wrote:
>
> It would be quite awkward for a task stack to get freed from a
> sleepable context, because the obvious sleepable context is the task
> itself, and it still needs its stack. This was true even in the old
> regime when task stacks were freed from RCU context.
>
> But vfree has a magic automatic deferral mechanism. Couldn't you make
> the non-deferred case might_sleep()?
But it's only magic from interrupt context..
Chris, does this patch make virtually mapped stack work for you again?
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f2481cb..942e02d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1533,7 +1533,7 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt())) {
+ if (in_interrupt() || in_atomic()) {
struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
if (llist_add((struct llist_node *)addr, &p->list))
schedule_work(&p->wq);
On Wed, Oct 19, 2016 at 06:31:12PM +0200, Christoph Hellwig wrote:
> On Wed, Oct 19, 2016 at 08:34:40AM -0700, Andy Lutomirski wrote:
> >
> > It would be quite awkward for a task stack to get freed from a
> > sleepable context, because the obvious sleepable context is the task
> > itself, and it still needs its stack. This was true even in the old
> > regime when task stacks were freed from RCU context.
> >
> > But vfree has a magic automatic deferral mechanism. Couldn't you make
> > the non-deferred case might_sleep()?
>
> But it's only magic from interrupt context..
>
> Chris, does this patch make virtually mapped stack work for you again?
So far, so good. No warns from anyone else.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
Hi Christoph,
On Wed, Oct 19, 2016 at 9:31 AM, Christoph Hellwig <[email protected]> wrote:
> On Wed, Oct 19, 2016 at 08:34:40AM -0700, Andy Lutomirski wrote:
>>
>> It would be quite awkward for a task stack to get freed from a
>> sleepable context, because the obvious sleepable context is the task
>> itself, and it still needs its stack. This was true even in the old
>> regime when task stacks were freed from RCU context.
>>
>> But vfree has a magic automatic deferral mechanism. Couldn't you make
>> the non-deferred case might_sleep()?
>
> But it's only magic from interrupt context..
>
> Chris, does this patch make virtually mapped stack work for you again?
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index f2481cb..942e02d 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1533,7 +1533,7 @@ void vfree(const void *addr)
>
> if (!addr)
> return;
> - if (unlikely(in_interrupt())) {
> + if (in_interrupt() || in_atomic()) {
in_atomic() also covers the in_interrupt() cases, so in_atomic() alone should suffice.
Thanks,
Joel
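For reference, the relevant definitions in include/linux/preempt.h of this era
(quoted from memory) make that explicit - the interrupt bits are part of
preempt_count(), so any interrupt context is also atomic context:

#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
				 | NMI_MASK))
#define in_interrupt()	(irq_count())
#define in_atomic()	(preempt_count() != 0)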
Hi Christoph,
On Mon, Oct 17, 2016 at 11:56 PM, Christoph Hellwig <[email protected]> wrote:
> Just inline it into the only caller.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> mm/vmalloc.c | 21 ++++++++-------------
> 1 file changed, 8 insertions(+), 13 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 8cedfa0..2af2921 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -717,16 +717,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
> return va;
> }
>
> -static void free_unmap_vmap_area_addr(unsigned long addr)
> -{
> - struct vmap_area *va;
> -
> - va = find_vmap_area(addr);
> - BUG_ON(!va);
> - free_unmap_vmap_area(va);
> -}
> -
> -
> /*** Per cpu kva allocator ***/
>
> /*
> @@ -1090,6 +1080,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
> {
> unsigned long size = (unsigned long)count << PAGE_SHIFT;
> unsigned long addr = (unsigned long)mem;
> + struct vmap_area *va;
>
> might_sleep();
> BUG_ON(!addr);
> @@ -1100,10 +1091,14 @@ void vm_unmap_ram(const void *mem, unsigned int count)
> debug_check_no_locks_freed(mem, size);
> vmap_debug_free_range(addr, addr+size);
>
> - if (likely(count <= VMAP_MAX_ALLOC))
> + if (likely(count <= VMAP_MAX_ALLOC)) {
> vb_free(mem, size);
> - else
> - free_unmap_vmap_area_addr(addr);
> + return;
> + }
> +
> + va = find_vmap_area(addr);
> + BUG_ON(!va);
Considering recent objections to BUG_ON [1], let's make this a WARN_ON
while we're moving the code?
> + free_unmap_vmap_area(va);
> }
> EXPORT_SYMBOL(vm_unmap_ram);
>
> --
> 2.1.4
Thanks,
Joel
[1] https://lkml.org/lkml/2016/10/6/65
On Mon, Oct 17, 2016 at 11:56 PM, Christoph Hellwig <[email protected]> wrote:
> Hi all,
>
> this is my spin at sorting out the long lock hold times in
> __purge_vmap_area_lazy. It is based on the patch Joel sent this
> week. I don't have any good numbers for it, but it survived an
> xfstests run on XFS which is a significant vmalloc user. The
> changelogs could still be improved as well, but I'd rather get it
> out quickly for feedback and testing.
All patches look great to me, and thanks a lot.
Regards,
Joel
On Thu, 20 Oct 2016 17:46:36 -0700
Joel Fernandes <[email protected]> wrote:
> > @@ -1100,10 +1091,14 @@ void vm_unmap_ram(const void *mem, unsigned int count)
> > debug_check_no_locks_freed(mem, size);
> > vmap_debug_free_range(addr, addr+size);
> >
> > - if (likely(count <= VMAP_MAX_ALLOC))
> > + if (likely(count <= VMAP_MAX_ALLOC)) {
> > vb_free(mem, size);
> > - else
> > - free_unmap_vmap_area_addr(addr);
> > + return;
> > + }
> > +
> > + va = find_vmap_area(addr);
> > + BUG_ON(!va);
>
> Considering recent objections to BUG_ON [1], let's make this a WARN_ON
> while we're moving the code?
If you lost track of your kernel memory mappings, you are in big trouble
and fail stop is really the best course of action for containing the
problem, which could have security and data corruption implications. This
is covered by Linus' last paragraph in that commit.
Thanks,
Nick
On Wed, Oct 19, 2016 at 4:15 AM, Chris Wilson <[email protected]> wrote:
> On Tue, Oct 18, 2016 at 08:56:07AM +0200, Christoph Hellwig wrote:
>> This is how everyone seems to already use them, but let's make that
>> explicit.
>
> Ah, found an exception, vmapped stacks:
>
> [ 696.928541] BUG: sleeping function called from invalid context at mm/vmalloc.c:615
> [ 696.928576] in_atomic(): 1, irqs_disabled(): 0, pid: 30521, name: bash
> [ 696.928590] 1 lock held by bash/30521:
> [ 696.928600] #0: [ 696.928606] (vmap_area_lock[ 696.928619] ){+.+...}, at: [ 696.928640] [<ffffffff8115f0cf>] __purge_vmap_area_lazy+0x30f/0x370
> [ 696.928656] CPU: 0 PID: 30521 Comm: bash Tainted: G W 4.9.0-rc1+ #124
> [ 696.928672] Hardware name: / , BIOS PYBSWCEL.86A.0027.2015.0507.1758 05/07/2015
> [ 696.928690] ffffc900070f7c70 ffffffff812be1f5 ffff8802750b6680 ffffffff819650a6
> [ 696.928717] ffffc900070f7c98 ffffffff810a3216 0000000000004001 ffff8802726e16c0
> [ 696.928743] ffff8802726e19a0 ffffc900070f7d08 ffffffff8115f0f3 ffff8802750b6680
> [ 696.928768] Call Trace:
> [ 696.928782] [<ffffffff812be1f5>] dump_stack+0x68/0x93
> [ 696.928796] [<ffffffff810a3216>] ___might_sleep+0x166/0x220
> [ 696.928809] [<ffffffff8115f0f3>] __purge_vmap_area_lazy+0x333/0x370
> [ 696.928823] [<ffffffff8115ea68>] ? vunmap_page_range+0x1e8/0x350
> [ 696.928837] [<ffffffff8115f1b3>] free_vmap_area_noflush+0x83/0x90
> [ 696.928850] [<ffffffff81160931>] remove_vm_area+0x71/0xb0
> [ 696.928863] [<ffffffff81160999>] __vunmap+0x29/0xf0
> [ 696.928875] [<ffffffff81160ab9>] vfree+0x29/0x70
> [ 696.928888] [<ffffffff81071746>] put_task_stack+0x76/0x120
From this traceback, it looks like the lock causing the atomic context
was actually acquired in the vfree path itself, and not by the vmapped
stack user (as it says "vmap_area_lock" held). I am still wondering
why vmap_area_lock was held during the might_sleep(), perhaps you may
not have applied all patches from Chris H?
From the patches I saw, vmap_area_lock is not acquired during any of
the might_sleep() calls Chris H added, but I may be missing something. In
any case, it looks to me like the atomicity is introduced by the vfree path
itself and not by the caller.
Thanks!
Joel
> [ 696.928901] [<ffffffff8109a943>] finish_task_switch+0x163/0x1e0
> [ 696.928914] [<ffffffff8109a845>] ? finish_task_switch+0x65/0x1e0
> [ 696.928928] [<ffffffff816125f5>] __schedule+0x1f5/0x7c0
> [ 696.928940] [<ffffffff81612c28>] schedule+0x38/0x90
> [ 696.928953] [<ffffffff810787b1>] do_wait+0x1d1/0x200
> [ 696.928966] [<ffffffff810799b1>] SyS_wait4+0x61/0xc0
> [ 696.928979] [<ffffffff81076e50>] ? task_stopped_code+0x50/0x50
> [ 696.928992] [<ffffffff81618e6e>] entry_SYSCALL_64_fastpath+0x1c/0xb1
>
> [This was triggered by earlier patch to remove the serialisation and add
> cond_resched_lock(&vmap_area_lock)]
> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
On 11/08/2016 04:24 PM, Joel Fernandes wrote:
> On Wed, Oct 19, 2016 at 4:15 AM, Chris Wilson <[email protected]> wrote:
>> On Tue, Oct 18, 2016 at 08:56:07AM +0200, Christoph Hellwig wrote:
>>> This is how everyone seems to already use them, but let's make that
>>> explicit.
>>
>> Ah, found an exception, vmapped stacks:
>>
>> [ 696.928541] BUG: sleeping function called from invalid context at mm/vmalloc.c:615
>> [ 696.928576] in_atomic(): 1, irqs_disabled(): 0, pid: 30521, name: bash
>> [ 696.928590] 1 lock held by bash/30521:
>> [ 696.928600] #0: [ 696.928606] (vmap_area_lock[ 696.928619] ){+.+...}, at: [ 696.928640] [<ffffffff8115f0cf>] __purge_vmap_area_lazy+0x30f/0x370
>> [ 696.928656] CPU: 0 PID: 30521 Comm: bash Tainted: G W 4.9.0-rc1+ #124
>> [ 696.928672] Hardware name: / , BIOS PYBSWCEL.86A.0027.2015.0507.1758 05/07/2015
>> [ 696.928690] ffffc900070f7c70 ffffffff812be1f5 ffff8802750b6680 ffffffff819650a6
>> [ 696.928717] ffffc900070f7c98 ffffffff810a3216 0000000000004001 ffff8802726e16c0
>> [ 696.928743] ffff8802726e19a0 ffffc900070f7d08 ffffffff8115f0f3 ffff8802750b6680
>> [ 696.928768] Call Trace:
>> [ 696.928782] [<ffffffff812be1f5>] dump_stack+0x68/0x93
>> [ 696.928796] [<ffffffff810a3216>] ___might_sleep+0x166/0x220
>> [ 696.928809] [<ffffffff8115f0f3>] __purge_vmap_area_lazy+0x333/0x370
>> [ 696.928823] [<ffffffff8115ea68>] ? vunmap_page_range+0x1e8/0x350
>> [ 696.928837] [<ffffffff8115f1b3>] free_vmap_area_noflush+0x83/0x90
>> [ 696.928850] [<ffffffff81160931>] remove_vm_area+0x71/0xb0
>> [ 696.928863] [<ffffffff81160999>] __vunmap+0x29/0xf0
>> [ 696.928875] [<ffffffff81160ab9>] vfree+0x29/0x70
>> [ 696.928888] [<ffffffff81071746>] put_task_stack+0x76/0x120
>
> From this traceback, it looks like the lock causing the atomic context
> was actually acquired in the vfree path itself, and not by the vmapped
> stack user (as it says "vmap_area_lock" held). I am still wondering
> why vmap_area_lock was held during the might_sleep(), perhaps you may
> not have applied all patches from Chris H?
>
I don't think this splat is because we are holding vmap_area_lock.
Look at cond_resched_lock:
#define cond_resched_lock(lock) ({				\
	___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
	__cond_resched_lock(lock);				\
})
It calls ___might_sleep() with the spin lock still held.
AFAIU PREEMPT_LOCK_OFFSET is supposed to tell might_sleep() to ignore that
spin lock and complain iff something else changed preempt_count.
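For completeness, the check reduces to comparing preempt_count() against the
expected offset; one held spinlock equals PREEMPT_LOCK_OFFSET, so only extra
atomic context on top of that triggers the splat. A simplified sketch of
___might_sleep(), condensed from kernel/sched/core.c of this era with the
ratelimiting and most of the debug output trimmed (quoted from memory, details
may differ):

static int preempt_count_equals(int preempt_offset)
{
	int nested = preempt_count() + rcu_preempt_depth();

	return (nested == preempt_offset);
}

void ___might_sleep(const char *file, int line, int preempt_offset)
{
	/*
	 * When called via cond_resched_lock() the expected offset is
	 * PREEMPT_LOCK_OFFSET, i.e. exactly one held spinlock.
	 */
	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
	     !is_idle_task(current)) ||
	    system_state != SYSTEM_RUNNING || oops_in_progress)
		return;

	printk(KERN_ERR
	       "BUG: sleeping function called from invalid context at %s:%d\n",
	       file, line);
	printk(KERN_ERR
	       "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
	       in_atomic(), irqs_disabled(), current->pid, current->comm);
	dump_stack();
}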