Commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem
as potentially sleeping") added might_sleep() to remove_vm_area() from
vfree(), and commit 763b218ddfaf ("mm: add preempt points into
__purge_vmap_area_lazy()") actually made vfree() potentially sleeping.
This broke vmwgfx driver which calls vfree() under spin_lock().
BUG: sleeping function called from invalid context at mm/vmalloc.c:1480
in_atomic(): 1, irqs_disabled(): 0, pid: 341, name: plymouthd
2 locks held by plymouthd/341:
#0: (drm_global_mutex){+.+.+.}, at: [<ffffffffc01c274b>] drm_release+0x3b/0x3b0 [drm]
#1: (&(&tfile->lock)->rlock){+.+...}, at: [<ffffffffc0173038>] ttm_object_file_release+0x28/0x90 [ttm]
Call Trace:
dump_stack+0x86/0xc3
___might_sleep+0x17d/0x250
__might_sleep+0x4a/0x80
remove_vm_area+0x22/0x90
__vunmap+0x2e/0x110
vfree+0x42/0x90
kvfree+0x2c/0x40
drm_ht_remove+0x1a/0x30 [drm]
ttm_object_file_release+0x50/0x90 [ttm]
vmw_postclose+0x47/0x60 [vmwgfx]
drm_release+0x290/0x3b0 [drm]
__fput+0xf8/0x210
____fput+0xe/0x10
task_work_run+0x85/0xc0
exit_to_usermode_loop+0xb4/0xc0
do_syscall_64+0x185/0x1f0
entry_SYSCALL64_slow_path+0x25/0x25
This can be fixed in vmwgfx, but it would be better to make vfree()
non-sleeping again because we may have other bugs like this one.
__purge_vmap_area_lazy() is the only function in the vfree() path that
wants to be able to sleep. So it makes sense to schedule
__purge_vmap_area_lazy() via schedule_work() so it runs only in sleepable
context. This will have a minimal effect on the regular vfree() path,
since __purge_vmap_area_lazy() is rarely called.
Fixes: 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem as potentially sleeping")
Reported-by: Tetsuo Handa <[email protected]>
Signed-off-by: Andrey Ryabinin <[email protected]>
Cc: <[email protected]>
---
mm/vmalloc.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 68eb002..ea1b4ab 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -701,7 +701,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
* Kick off a purge of the outstanding lazy areas. Don't bother if somebody
* is already purging.
*/
-static void try_purge_vmap_area_lazy(void)
+static void try_purge_vmap_area_lazy(struct work_struct *work)
{
if (mutex_trylock(&vmap_purge_lock)) {
__purge_vmap_area_lazy(ULONG_MAX, 0);
@@ -720,6 +720,8 @@ static void purge_vmap_area_lazy(void)
mutex_unlock(&vmap_purge_lock);
}
+static DECLARE_WORK(purge_vmap_work, try_purge_vmap_area_lazy);
+
/*
* Free a vmap area, caller ensuring that the area has been unmapped
* and flush_cache_vunmap had been called for the correct range
@@ -736,7 +738,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
llist_add(&va->purge_list, &vmap_purge_list);
if (unlikely(nr_lazy > lazy_max_pages()))
- try_purge_vmap_area_lazy();
+ schedule_work(&purge_vmap_work);
}
/*
@@ -1125,7 +1127,6 @@ void vm_unmap_ram(const void *mem, unsigned int count)
unsigned long addr = (unsigned long)mem;
struct vmap_area *va;
- might_sleep();
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
@@ -1477,8 +1478,6 @@ struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
- might_sleep();
-
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
--
2.10.2
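For reference, a minimal sketch of the calling pattern that this patch makes
safe again (hypothetical driver code, modeled on the vmwgfx/ttm release path
from the trace above; the names are made up for illustration):

	/*
	 * Before this series, vfree() could reach __purge_vmap_area_lazy()
	 * and sleep, triggering the splat quoted above.  With the purge
	 * deferred to a workqueue, freeing under a spinlock is fine.
	 */
	struct obj_file {
		spinlock_t lock;
		void *hash_table;	/* allocated with vmalloc() */
	};

	static void obj_file_release(struct obj_file *file)
	{
		spin_lock(&file->lock);
		/* ... tear down objects hashed in the table ... */
		vfree(file->hash_table);	/* must not sleep here */
		file->hash_table = NULL;
		spin_unlock(&file->lock);
	}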
vfree() can be used in any atomic context now, thus there is no point
in vfree_atomic().
This reverts commit 8d5341a6260a ("x86/ldt: use vfree_atomic()
to free ldt entries")
Signed-off-by: Andrey Ryabinin <[email protected]>
---
arch/x86/kernel/ldt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d4a1583..f09df2f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
paravirt_free_ldt(ldt->entries, ldt->size);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree_atomic(ldt->entries);
+ vfree(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
--
2.10.2
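For context, this mirrors the allocation side of the LDT code, which picks
vmalloc() or the page allocator based on size (a rough paraphrase of
alloc_ldt_struct(); treat the exact code as approximate):

	/*
	 * Paraphrased from alloc_ldt_struct(): allocations larger than a
	 * page come from vmalloc(), smaller ones from the page allocator,
	 * which is why free_ldt_struct() must pick the matching free path.
	 */
	alloc_size = size * LDT_ENTRY_SIZE;
	if (alloc_size > PAGE_SIZE)
		new_ldt->entries = vzalloc(alloc_size);
	else
		new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);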
vfree() can be used in any atomic context now, thus there is no point
in using vfree_atomic().
This reverts commit 0f110a9b956c ("kernel/fork: use vfree_atomic()
to free thread stack")
Signed-off-by: Andrey Ryabinin <[email protected]>
---
kernel/fork.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index a9f642d..084e6a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -241,7 +241,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
}
local_irq_restore(flags);
- vfree_atomic(tsk->stack);
+ vfree(tsk->stack);
return;
}
#endif
--
2.10.2
vfree() can be used in any atomic context and there are no
vfree_atomic() callers left, so let's remove it.
This reverts commit bf22e37a6413 ("mm: add vfree_atomic()")
Signed-off-by: Andrey Ryabinin <[email protected]>
---
include/linux/vmalloc.h | 1 -
mm/vmalloc.c | 40 +++++-----------------------------------
2 files changed, 5 insertions(+), 36 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 46991ad..b4f044f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -83,7 +83,6 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
extern void vfree(const void *addr);
-extern void vfree_atomic(const void *addr);
extern void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ea1b4ab..b77337a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1534,38 +1534,6 @@ static void __vunmap(const void *addr, int deallocate_pages)
return;
}
-static inline void __vfree_deferred(const void *addr)
-{
- /*
- * Use raw_cpu_ptr() because this can be called from preemptible
- * context. Preemption is absolutely fine here, because the llist_add()
- * implementation is lockless, so it works even if we are adding to
- * nother cpu's list. schedule_work() should be fine with this too.
- */
- struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
-
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
-}
-
-/**
- * vfree_atomic - release memory allocated by vmalloc()
- * @addr: memory base address
- *
- * This one is just like vfree() but can be called in any atomic context
- * except NMIs.
- */
-void vfree_atomic(const void *addr)
-{
- BUG_ON(in_nmi());
-
- kmemleak_free(addr);
-
- if (!addr)
- return;
- __vfree_deferred(addr);
-}
-
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
@@ -1588,9 +1556,11 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt()))
- __vfree_deferred(addr);
- else
+ if (unlikely(in_interrupt())) {
+ struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
+ if (llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
+ } else
__vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
--
2.10.2
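After this patch, the only deferral left is the in_interrupt() branch now
open-coded in vfree(). A hedged example of a caller that still exercises it
(hypothetical timer callback; timer callbacks run in softirq context, so
in_interrupt() is true there):

	/*
	 * In softirq context vfree() pushes the address onto the per-cpu
	 * vfree_deferred llist and lets a workqueue do the actual __vunmap().
	 */
	static void cleanup_timer_fn(unsigned long data)
	{
		void *buf = (void *)data;

		vfree(buf);	/* queued, freed later in process context */
	}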
On 03/30/2017 12:27 PM, Andrey Ryabinin wrote:
> Commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem
> as potentially sleeping") added might_sleep() to remove_vm_area() from
> vfree(), and commit 763b218ddfaf ("mm: add preempt points into
> __purge_vmap_area_lazy()") actually made vfree() potentially sleeping.
>
> This broke vmwgfx driver which calls vfree() under spin_lock().
>
> BUG: sleeping function called from invalid context at mm/vmalloc.c:1480
> in_atomic(): 1, irqs_disabled(): 0, pid: 341, name: plymouthd
> 2 locks held by plymouthd/341:
> #0: (drm_global_mutex){+.+.+.}, at: [<ffffffffc01c274b>] drm_release+0x3b/0x3b0 [drm]
> #1: (&(&tfile->lock)->rlock){+.+...}, at: [<ffffffffc0173038>] ttm_object_file_release+0x28/0x90 [ttm]
>
> Call Trace:
> dump_stack+0x86/0xc3
> ___might_sleep+0x17d/0x250
> __might_sleep+0x4a/0x80
> remove_vm_area+0x22/0x90
> __vunmap+0x2e/0x110
> vfree+0x42/0x90
> kvfree+0x2c/0x40
> drm_ht_remove+0x1a/0x30 [drm]
> ttm_object_file_release+0x50/0x90 [ttm]
> vmw_postclose+0x47/0x60 [vmwgfx]
> drm_release+0x290/0x3b0 [drm]
> __fput+0xf8/0x210
> ____fput+0xe/0x10
> task_work_run+0x85/0xc0
> exit_to_usermode_loop+0xb4/0xc0
> do_syscall_64+0x185/0x1f0
> entry_SYSCALL64_slow_path+0x25/0x25
>
> This can be fixed in vmwgfx, but it would be better to make vfree()
> non-sleeping again because we may have other bugs like this one.
>
> __purge_vmap_area_lazy() is the only function in the vfree() path that
> wants to be able to sleep. So it makes sense to schedule
> __purge_vmap_area_lazy() via schedule_work() so it runs only in sleepable
> context. This will have a minimal effect on the regular vfree() path,
> since __purge_vmap_area_lazy() is rarely called.
>
> Fixes: 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem as potentially sleeping")
> Reported-by: Tetsuo Handa <[email protected]>
> Signed-off-by: Andrey Ryabinin <[email protected]>
> Cc: <[email protected]>
> ---
> mm/vmalloc.c | 9 ++++-----
> 1 file changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 68eb002..ea1b4ab 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -701,7 +701,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
> * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
> * is already purging.
> */
> -static void try_purge_vmap_area_lazy(void)
> +static void try_purge_vmap_area_lazy(struct work_struct *work)
> {
> if (mutex_trylock(&vmap_purge_lock)) {
> __purge_vmap_area_lazy(ULONG_MAX, 0);
> @@ -720,6 +720,8 @@ static void purge_vmap_area_lazy(void)
> mutex_unlock(&vmap_purge_lock);
> }
>
> +static DECLARE_WORK(purge_vmap_work, try_purge_vmap_area_lazy);
> +
> /*
> * Free a vmap area, caller ensuring that the area has been unmapped
> * and flush_cache_vunmap had been called for the correct range
> @@ -736,7 +738,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
> llist_add(&va->purge_list, &vmap_purge_list);
>
> if (unlikely(nr_lazy > lazy_max_pages()))
> - try_purge_vmap_area_lazy();
Perhaps a slight optimization would be to schedule work iff
!mutex_is_locked(&vmap_purge_lock) below?
/Thomas
> + schedule_work(&purge_vmap_work);
> }
>
> /*
> @@ -1125,7 +1127,6 @@ void vm_unmap_ram(const void *mem, unsigned int count)
> unsigned long addr = (unsigned long)mem;
> struct vmap_area *va;
>
> - might_sleep();
> BUG_ON(!addr);
> BUG_ON(addr < VMALLOC_START);
> BUG_ON(addr > VMALLOC_END);
> @@ -1477,8 +1478,6 @@ struct vm_struct *remove_vm_area(const void *addr)
> {
> struct vmap_area *va;
>
> - might_sleep();
> -
> va = find_vmap_area((unsigned long)addr);
> if (va && va->flags & VM_VM_AREA) {
> struct vm_struct *vm = va->vm;
On 03/30/2017 03:00 PM, Thomas Hellstrom wrote:
>>
>> if (unlikely(nr_lazy > lazy_max_pages()))
>> - try_purge_vmap_area_lazy();
>
> Perhaps a slight optimization would be to schedule work iff
> !mutex_is_locked(&vmap_purge_lock) below?
>
Makes sense, we don't need to spawn workers if we're already purging.
From: Andrey Ryabinin <[email protected]>
Subject: mm/vmalloc: allow to call vfree() in atomic context fix
Don't spawn a worker if we're already purging.
Signed-off-by: Andrey Ryabinin <[email protected]>
---
mm/vmalloc.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ea1b4ab..88168b8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -737,7 +737,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
/* After this point, we may free va at any time */
llist_add(&va->purge_list, &vmap_purge_list);
- if (unlikely(nr_lazy > lazy_max_pages()))
+ if (unlikely(nr_lazy > lazy_max_pages()) &&
+ !mutex_is_locked(&vmap_purge_lock))
schedule_work(&purge_vmap_work);
}
--
2.10.2
On 03/30/2017 04:48 PM, Andrey Ryabinin wrote:
> On 03/30/2017 03:00 PM, Thomas Hellstrom wrote:
>
>>>
>>> if (unlikely(nr_lazy > lazy_max_pages()))
>>> - try_purge_vmap_area_lazy();
>> Perhaps a slight optimization would be to schedule work iff
>> !mutex_is_locked(&vmap_purge_lock) below?
>>
> Makes sense, we don't need to spawn workers if we're already purging.
>
>
>
> From: Andrey Ryabinin <[email protected]>
> Subject: mm/vmalloc: allow to call vfree() in atomic context fix
>
> Don't spawn a worker if we're already purging.
>
> Signed-off-by: Andrey Ryabinin <[email protected]>
> ---
> mm/vmalloc.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ea1b4ab..88168b8 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -737,7 +737,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
> /* After this point, we may free va at any time */
> llist_add(&va->purge_list, &vmap_purge_list);
>
> - if (unlikely(nr_lazy > lazy_max_pages()))
> + if (unlikely(nr_lazy > lazy_max_pages()) &&
> + !mutex_is_locked(&vmap_purge_lock))
> schedule_work(&purge_vmap_work);
> }
>
For both patches,
Reviewed-by: Thomas Hellstrom <[email protected]>
Thanks,
Thomas
On Thu, Mar 30, 2017 at 01:27:19PM +0300, Andrey Ryabinin wrote:
> vfree() can be used in any atomic context and there are no
> vfree_atomic() callers left, so let's remove it.
We might still get warnings though.
> @@ -1588,9 +1556,11 @@ void vfree(const void *addr)
>
> if (!addr)
> return;
> - if (unlikely(in_interrupt()))
> - __vfree_deferred(addr);
> - else
> + if (unlikely(in_interrupt())) {
> + struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
> + if (llist_add((struct llist_node *)addr, &p->list))
> + schedule_work(&p->wq);
> + } else
> __vunmap(addr, 1);
> }
> EXPORT_SYMBOL(vfree);
If I disable preemption, then call vfree(), in_interrupt() will not be
true (I've only incremented preempt_count()), then __vunmap() calls
remove_vm_area() which calls might_sleep(), which will warn.
So I think this check needs to change from in_interrupt() to in_atomic().
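A sketch of the case Matthew describes (assuming a kernel with
CONFIG_PREEMPT_COUNT, where preempt_disable() is visible in preempt_count()):

	/*
	 * in_interrupt() tests only the hardirq/softirq/NMI bits of
	 * preempt_count(), while in_atomic() tests the whole count,
	 * including the preemption-disable bits.
	 */
	static void example(void *buf)
	{
		preempt_disable();	/* bumps preempt_count() */
		vfree(buf);		/* in_interrupt() is false here, so
					 * the synchronous __vunmap() path
					 * is taken */
		preempt_enable();
	}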
On 03/30/2017 08:18 PM, Matthew Wilcox wrote:
> On Thu, Mar 30, 2017 at 01:27:19PM +0300, Andrey Ryabinin wrote:
>> vfree() can be used in any atomic context and there are no
>> vfree_atomic() callers left, so let's remove it.
>
> We might still get warnings though.
>
>> @@ -1588,9 +1556,11 @@ void vfree(const void *addr)
>>
>> if (!addr)
>> return;
>> - if (unlikely(in_interrupt()))
>> - __vfree_deferred(addr);
>> - else
>> + if (unlikely(in_interrupt())) {
>> + struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
>> + if (llist_add((struct llist_node *)addr, &p->list))
>> + schedule_work(&p->wq);
>> + } else
>> __vunmap(addr, 1);
>> }
>> EXPORT_SYMBOL(vfree);
>
> If I disable preemption, then call vfree(), in_interrupt() will not be
> true (I've only incremented preempt_count()), then __vunmap() calls
> remove_vm_area() which calls might_sleep(), which will warn.
The first patch removed this might_sleep().
> So I think this check needs to change from in_interrupt() to in_atomic().
>
On Thu, 30 Mar 2017 13:27:16 +0300 Andrey Ryabinin <[email protected]> wrote:
> Commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem
> as potentially sleeping") added might_sleep() to remove_vm_area() from
> vfree(), and commit 763b218ddfaf ("mm: add preempt points into
> __purge_vmap_area_lazy()") actually made vfree() potentially sleeping.
>
> This broke vmwgfx driver which calls vfree() under spin_lock().
>
> BUG: sleeping function called from invalid context at mm/vmalloc.c:1480
> in_atomic(): 1, irqs_disabled(): 0, pid: 341, name: plymouthd
> 2 locks held by plymouthd/341:
> #0: (drm_global_mutex){+.+.+.}, at: [<ffffffffc01c274b>] drm_release+0x3b/0x3b0 [drm]
> #1: (&(&tfile->lock)->rlock){+.+...}, at: [<ffffffffc0173038>] ttm_object_file_release+0x28/0x90 [ttm]
>
> Call Trace:
> dump_stack+0x86/0xc3
> ___might_sleep+0x17d/0x250
> __might_sleep+0x4a/0x80
> remove_vm_area+0x22/0x90
> __vunmap+0x2e/0x110
> vfree+0x42/0x90
> kvfree+0x2c/0x40
> drm_ht_remove+0x1a/0x30 [drm]
> ttm_object_file_release+0x50/0x90 [ttm]
> vmw_postclose+0x47/0x60 [vmwgfx]
> drm_release+0x290/0x3b0 [drm]
> __fput+0xf8/0x210
> ____fput+0xe/0x10
> task_work_run+0x85/0xc0
> exit_to_usermode_loop+0xb4/0xc0
> do_syscall_64+0x185/0x1f0
> entry_SYSCALL64_slow_path+0x25/0x25
>
> This can be fixed in vmwgfx, but it would be better to make vfree()
> non-sleeping again because we may have other bugs like this one.
I tend to disagree: adding yet another schedule_work() introduces
additional overhead and adds some risk of ENOMEM errors which wouldn't
occur with a synchronous free.
> __purge_vmap_area_lazy() is the only function in the vfree() path that
> wants to be able to sleep. So it makes sense to schedule
> __purge_vmap_area_lazy() via schedule_work() so it runs only in sleepable
> context.
vfree() already does
if (unlikely(in_interrupt()))
__vfree_deferred(addr);
so it seems silly to introduce another defer-to-kernel-thread thing
when we already have one.
> This will have a minimal effect on the regular vfree() path,
> since __purge_vmap_area_lazy() is rarely called.
hum, OK, so perhaps the overhead isn't too bad.
Remind me: where does __purge_vmap_area_lazy() sleep?
Seems to me that a better fix would be to make vfree() atomic, if poss.
Otherwise, to fix callers so they call vfree from sleepable context.
That will reduce kernel latencies as well.
Hi Andrew,
On Thu, Mar 30, 2017 at 3:22 PM, Andrew Morton
<[email protected]> wrote:
> On Thu, 30 Mar 2017 13:27:16 +0300 Andrey Ryabinin <[email protected]> wrote:
>
>> Commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem
>> as potentially sleeping") added might_sleep() to remove_vm_area() from
>> vfree(), and commit 763b218ddfaf ("mm: add preempt points into
>> __purge_vmap_area_lazy()") actually made vfree() potentially sleeping.
>>
>> This broke vmwgfx driver which calls vfree() under spin_lock().
>>
>> BUG: sleeping function called from invalid context at mm/vmalloc.c:1480
>> in_atomic(): 1, irqs_disabled(): 0, pid: 341, name: plymouthd
>> 2 locks held by plymouthd/341:
>> #0: (drm_global_mutex){+.+.+.}, at: [<ffffffffc01c274b>] drm_release+0x3b/0x3b0 [drm]
>> #1: (&(&tfile->lock)->rlock){+.+...}, at: [<ffffffffc0173038>] ttm_object_file_release+0x28/0x90 [ttm]
>>
>> Call Trace:
>> dump_stack+0x86/0xc3
>> ___might_sleep+0x17d/0x250
>> __might_sleep+0x4a/0x80
>> remove_vm_area+0x22/0x90
>> __vunmap+0x2e/0x110
>> vfree+0x42/0x90
>> kvfree+0x2c/0x40
>> drm_ht_remove+0x1a/0x30 [drm]
>> ttm_object_file_release+0x50/0x90 [ttm]
>> vmw_postclose+0x47/0x60 [vmwgfx]
>> drm_release+0x290/0x3b0 [drm]
>> __fput+0xf8/0x210
>> ____fput+0xe/0x10
>> task_work_run+0x85/0xc0
>> exit_to_usermode_loop+0xb4/0xc0
>> do_syscall_64+0x185/0x1f0
>> entry_SYSCALL64_slow_path+0x25/0x25
>>
>> This can be fixed in vmwgfx, but it would be better to make vfree()
>> non-sleeping again because we may have other bugs like this one.
>
> I tend to disagree: adding yet another schedule_work() introduces
> additional overhead and adds some risk of ENOMEM errors which wouldn't
> occur with a synchronous free.
>
>> __purge_vmap_area_lazy() is the only function in the vfree() path that
>> wants to be able to sleep. So it makes sense to schedule
>> __purge_vmap_area_lazy() via schedule_work() so it runs only in sleepable
>> context.
>
> vfree() already does
>
> if (unlikely(in_interrupt()))
> __vfree_deferred(addr);
>
> so it seems silly to introduce another defer-to-kernel-thread thing
> when we already have one.
>
>> This will have a minimal effect on the regular vfree() path,
>> since __purge_vmap_area_lazy() is rarely called.
>
> hum, OK, so perhaps the overhead isn't too bad.
>
> Remind me: where does __purge_vmap_area_lazy() sleep?
Because it will make for a possibly time-consuming critical section.
This was hurting real-time workloads which are sensitive to latencies
(commit f9e09977671b618a "mm: turn vmap_purge_lock into a mutex" fixed
it).
Thanks,
Joel
On Thu, 30 Mar 2017, Andrey Ryabinin wrote:
> vfree() can be used in any atomic context now, thus there is no point
> in vfree_atomic().
> This reverts commit 8d5341a6260a ("x86/ldt: use vfree_atomic()
> to free ldt entries")
>
> Signed-off-by: Andrey Ryabinin <[email protected]>
Reviewed-by: Thomas Gleixner <[email protected]>
On 03/31/2017 01:22 AM, Andrew Morton wrote:
> On Thu, 30 Mar 2017 13:27:16 +0300 Andrey Ryabinin <[email protected]> wrote:
>
>> Commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem
>> as potentially sleeping") added might_sleep() to remove_vm_area() from
>> vfree(), and commit 763b218ddfaf ("mm: add preempt points into
>> __purge_vmap_area_lazy()") actually made vfree() potentially sleeping.
>>
>> This broke vmwgfx driver which calls vfree() under spin_lock().
>>
>> BUG: sleeping function called from invalid context at mm/vmalloc.c:1480
>> in_atomic(): 1, irqs_disabled(): 0, pid: 341, name: plymouthd
>> 2 locks held by plymouthd/341:
>> #0: (drm_global_mutex){+.+.+.}, at: [<ffffffffc01c274b>] drm_release+0x3b/0x3b0 [drm]
>> #1: (&(&tfile->lock)->rlock){+.+...}, at: [<ffffffffc0173038>] ttm_object_file_release+0x28/0x90 [ttm]
>>
>> Call Trace:
>> dump_stack+0x86/0xc3
>> ___might_sleep+0x17d/0x250
>> __might_sleep+0x4a/0x80
>> remove_vm_area+0x22/0x90
>> __vunmap+0x2e/0x110
>> vfree+0x42/0x90
>> kvfree+0x2c/0x40
>> drm_ht_remove+0x1a/0x30 [drm]
>> ttm_object_file_release+0x50/0x90 [ttm]
>> vmw_postclose+0x47/0x60 [vmwgfx]
>> drm_release+0x290/0x3b0 [drm]
>> __fput+0xf8/0x210
>> ____fput+0xe/0x10
>> task_work_run+0x85/0xc0
>> exit_to_usermode_loop+0xb4/0xc0
>> do_syscall_64+0x185/0x1f0
>> entry_SYSCALL64_slow_path+0x25/0x25
>>
>> This can be fixed in vmwgfx, but it would be better to make vfree()
>> non-sleeping again because we may have other bugs like this one.
>
> I tend to disagree: adding yet another schedule_work() introduces
> additional overhead and adds some risk of ENOMEM errors which wouldn't
> occur with a synchronous free.
>
>> __purge_vmap_area_lazy() is the only function in the vfree() path that
>> wants to be able to sleep. So it makes sense to schedule
>> __purge_vmap_area_lazy() via schedule_work() so it runs only in sleepable
>> context.
>
> vfree() already does
>
> if (unlikely(in_interrupt()))
> __vfree_deferred(addr);
>
> so it seems silly to introduce another defer-to-kernel-thread thing
> when we already have one.
>
>> This will have a minimal effect on the regular vfree() path,
>> since __purge_vmap_area_lazy() is rarely called.
>
> hum, OK, so perhaps the overhead isn't too bad.
>
> Remind me: where does __purge_vmap_area_lazy() sleep?
It's cond_resched_lock(&vmap_area_lock);
>
> Seems to me that a better fix would be to make vfree() atomic, if poss.
>
> Otherwise, to fix callers so they call vfree from sleepable context.
> That will reduce kernel latencies as well.
>
Currently we know only two such callers: ttm_object_file_release() and ttm_object_device_release().
Both of them call vfree() under spin_lock() for no reason; Thomas said that he
has a patch to fix this: http://lkml.kernel.org/r/[email protected]
So this patch is more like an attempt to address other similar bugs possibly introduced by
commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem as potentially sleeping")
It's quite possible that we overlooked something.
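For reference, cond_resched_lock(&vmap_area_lock), mentioned above as the
sleeping point, behaves roughly like the following (simplified sketch, not
the exact kernel implementation):

	/*
	 * If a reschedule is due (or the spinlock is contended), drop the
	 * lock, yield the CPU, and retake the lock.  This is why
	 * __purge_vmap_area_lazy() can sleep and needs process context.
	 */
	if (need_resched() || spin_needbreak(&vmap_area_lock)) {
		spin_unlock(&vmap_area_lock);
		cond_resched();		/* or preempt_schedule() on PREEMPT */
		spin_lock(&vmap_area_lock);
	}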
On Thu 30-03-17 15:22:29, Andrew Morton wrote:
> On Thu, 30 Mar 2017 13:27:16 +0300 Andrey Ryabinin <[email protected]> wrote:
[...]
> > This can be fixed in vmwgfx, but it would be better to make vfree()
> > non-sleeping again because we may have other bugs like this one.
>
> I tend to disagree: adding yet another schedule_work() introduces
> additional overhead and adds some risk of ENOMEM errors which wouldn't
> occur with a synchronous free.
I do not think ENOMEM would be a problem. We are talking about lazy
handling already. Besides that the allocation path also does this lazy
free AFAICS.
> > __purge_vmap_area_lazy() is the only function in the vfree() path that
> > wants to be able to sleep. So it makes sense to schedule
> > __purge_vmap_area_lazy() via schedule_work() so it runs only in sleepable
> > context.
>
> vfree() already does
>
> if (unlikely(in_interrupt()))
> __vfree_deferred(addr);
>
> so it seems silly to introduce another defer-to-kernel-thread thing
> when we already have one.
But this only cares about the IRQ context and this patch aims at atomic
context in general. I agree it would have been better to reduce this
deferred behavior to only _atomic_ context, but we do not have a reliable
way to detect that on non-preemptive kernels AFAIR.
--
Michal Hocko
SUSE Labs
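The detection problem Michal refers to boils down to preempt_count()
bookkeeping (a hedged sketch; the exact config dependencies vary between
kernel versions):

	static DEFINE_SPINLOCK(some_lock);

	static void example(void)
	{
		spin_lock(&some_lock);
		/*
		 * With CONFIG_PREEMPT_COUNT=n, spin_lock() does not bump
		 * preempt_count(), so in_atomic() returns false here and
		 * the atomic context is undetectable by the callee.
		 */
		WARN_ON(!in_atomic());	/* fires on !PREEMPT_COUNT kernels */
		spin_unlock(&some_lock);
	}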
On Thu 30-03-17 13:27:16, Andrey Ryabinin wrote:
> Commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem
> as potentially sleeping") added might_sleep() to remove_vm_area() from
> vfree(), and commit 763b218ddfaf ("mm: add preempt points into
> __purge_vmap_area_lazy()") actually made vfree() potentially sleeping.
>
> This broke vmwgfx driver which calls vfree() under spin_lock().
>
> BUG: sleeping function called from invalid context at mm/vmalloc.c:1480
> in_atomic(): 1, irqs_disabled(): 0, pid: 341, name: plymouthd
> 2 locks held by plymouthd/341:
> #0: (drm_global_mutex){+.+.+.}, at: [<ffffffffc01c274b>] drm_release+0x3b/0x3b0 [drm]
> #1: (&(&tfile->lock)->rlock){+.+...}, at: [<ffffffffc0173038>] ttm_object_file_release+0x28/0x90 [ttm]
>
> Call Trace:
> dump_stack+0x86/0xc3
> ___might_sleep+0x17d/0x250
> __might_sleep+0x4a/0x80
> remove_vm_area+0x22/0x90
> __vunmap+0x2e/0x110
> vfree+0x42/0x90
> kvfree+0x2c/0x40
> drm_ht_remove+0x1a/0x30 [drm]
> ttm_object_file_release+0x50/0x90 [ttm]
> vmw_postclose+0x47/0x60 [vmwgfx]
> drm_release+0x290/0x3b0 [drm]
> __fput+0xf8/0x210
> ____fput+0xe/0x10
> task_work_run+0x85/0xc0
> exit_to_usermode_loop+0xb4/0xc0
> do_syscall_64+0x185/0x1f0
> entry_SYSCALL64_slow_path+0x25/0x25
>
> This can be fixed in vmwgfx, but it would be better to make vfree()
> non-sleeping again because we may have other bugs like this one.
>
> __purge_vmap_area_lazy() is the only function in the vfree() path that
> wants to be able to sleep. So it makes sense to schedule
> __purge_vmap_area_lazy() via schedule_work() so it runs only in sleepable
> context. This will have a minimal effect on the regular vfree() path,
> since __purge_vmap_area_lazy() is rarely called.
>
> Fixes: 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem as potentially sleeping")
> Reported-by: Tetsuo Handa <[email protected]>
> Signed-off-by: Andrey Ryabinin <[email protected]>
> Cc: <[email protected]>
Yes I believe this is an enhancement.
Acked-by: Michal Hocko <[email protected]>
Crawling over all vfree users is just unfeasible.
> ---
> mm/vmalloc.c | 9 ++++-----
> 1 file changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 68eb002..ea1b4ab 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -701,7 +701,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
> * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
> * is already purging.
> */
> -static void try_purge_vmap_area_lazy(void)
> +static void try_purge_vmap_area_lazy(struct work_struct *work)
> {
> if (mutex_trylock(&vmap_purge_lock)) {
> __purge_vmap_area_lazy(ULONG_MAX, 0);
> @@ -720,6 +720,8 @@ static void purge_vmap_area_lazy(void)
> mutex_unlock(&vmap_purge_lock);
> }
>
> +static DECLARE_WORK(purge_vmap_work, try_purge_vmap_area_lazy);
> +
> /*
> * Free a vmap area, caller ensuring that the area has been unmapped
> * and flush_cache_vunmap had been called for the correct range
> @@ -736,7 +738,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
> llist_add(&va->purge_list, &vmap_purge_list);
>
> if (unlikely(nr_lazy > lazy_max_pages()))
> - try_purge_vmap_area_lazy();
> + schedule_work(&purge_vmap_work);
> }
>
> /*
> @@ -1125,7 +1127,6 @@ void vm_unmap_ram(const void *mem, unsigned int count)
> unsigned long addr = (unsigned long)mem;
> struct vmap_area *va;
>
> - might_sleep();
> BUG_ON(!addr);
> BUG_ON(addr < VMALLOC_START);
> BUG_ON(addr > VMALLOC_END);
> @@ -1477,8 +1478,6 @@ struct vm_struct *remove_vm_area(const void *addr)
> {
> struct vmap_area *va;
>
> - might_sleep();
> -
> va = find_vmap_area((unsigned long)addr);
> if (va && va->flags & VM_VM_AREA) {
> struct vm_struct *vm = va->vm;
> --
> 2.10.2
>
--
Michal Hocko
SUSE Labs
On Thu 30-03-17 13:27:19, Andrey Ryabinin wrote:
> vfree() can be used in any atomic context and there are no
> vfree_atomic() callers left, so let's remove it.
>
> This reverts commit bf22e37a6413 ("mm: add vfree_atomic()")
>
> Signed-off-by: Andrey Ryabinin <[email protected]>
the idea was nice but reality hits the fan and we learn that this just
doesn't work...
Acked-by: Michal Hocko <[email protected]>
> ---
> include/linux/vmalloc.h | 1 -
> mm/vmalloc.c | 40 +++++-----------------------------------
> 2 files changed, 5 insertions(+), 36 deletions(-)
>
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 46991ad..b4f044f 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -83,7 +83,6 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
> extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
>
> extern void vfree(const void *addr);
> -extern void vfree_atomic(const void *addr);
>
> extern void *vmap(struct page **pages, unsigned int count,
> unsigned long flags, pgprot_t prot);
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ea1b4ab..b77337a 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1534,38 +1534,6 @@ static void __vunmap(const void *addr, int deallocate_pages)
> return;
> }
>
> -static inline void __vfree_deferred(const void *addr)
> -{
> - /*
> - * Use raw_cpu_ptr() because this can be called from preemptible
> - * context. Preemption is absolutely fine here, because the llist_add()
> - * implementation is lockless, so it works even if we are adding to
> - * nother cpu's list. schedule_work() should be fine with this too.
> - */
> - struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
> -
> - if (llist_add((struct llist_node *)addr, &p->list))
> - schedule_work(&p->wq);
> -}
> -
> -/**
> - * vfree_atomic - release memory allocated by vmalloc()
> - * @addr: memory base address
> - *
> - * This one is just like vfree() but can be called in any atomic context
> - * except NMIs.
> - */
> -void vfree_atomic(const void *addr)
> -{
> - BUG_ON(in_nmi());
> -
> - kmemleak_free(addr);
> -
> - if (!addr)
> - return;
> - __vfree_deferred(addr);
> -}
> -
> /**
> * vfree - release memory allocated by vmalloc()
> * @addr: memory base address
> @@ -1588,9 +1556,11 @@ void vfree(const void *addr)
>
> if (!addr)
> return;
> - if (unlikely(in_interrupt()))
> - __vfree_deferred(addr);
> - else
> + if (unlikely(in_interrupt())) {
> + struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
> + if (llist_add((struct llist_node *)addr, &p->list))
> + schedule_work(&p->wq);
> + } else
> __vunmap(addr, 1);
> }
> EXPORT_SYMBOL(vfree);
> --
> 2.10.2
>
--
Michal Hocko
SUSE Labs
On Thu 30-03-17 17:48:39, Andrey Ryabinin wrote:
> From: Andrey Ryabinin <[email protected]>
> Subject: mm/vmalloc: allow to call vfree() in atomic context fix
>
> Don't spawn a worker if we're already purging.
>
> Signed-off-by: Andrey Ryabinin <[email protected]>
I would rather put this into a separate patch. Ideally with some numbers,
as this is an optimization...
> ---
> mm/vmalloc.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ea1b4ab..88168b8 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -737,7 +737,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
> /* After this point, we may free va at any time */
> llist_add(&va->purge_list, &vmap_purge_list);
>
> - if (unlikely(nr_lazy > lazy_max_pages()))
> + if (unlikely(nr_lazy > lazy_max_pages()) &&
> + !mutex_is_locked(&vmap_purge_lock))
> schedule_work(&purge_vmap_work);
> }
>
> --
> 2.10.2
>
--
Michal Hocko
SUSE Labs
On 04/04/2017 11:41 AM, Michal Hocko wrote:
> On Thu 30-03-17 17:48:39, Andrey Ryabinin wrote:
>> From: Andrey Ryabinin <[email protected]>
>> Subject: mm/vmalloc: allow to call vfree() in atomic context fix
>>
>> Don't spawn a worker if we're already purging.
>>
>> Signed-off-by: Andrey Ryabinin <[email protected]>
> I would rather put this into a separate patch. Ideally with some numbers,
> as this is an optimization...
Actually, this just mimics what the code was doing before, as it only
invoked __purge_vmap_area_lazy() if the trylock succeeded.
/Thomas
>
>> ---
>> mm/vmalloc.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index ea1b4ab..88168b8 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -737,7 +737,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
>> /* After this point, we may free va at any time */
>> llist_add(&va->purge_list, &vmap_purge_list);
>>
>> - if (unlikely(nr_lazy > lazy_max_pages()))
>> + if (unlikely(nr_lazy > lazy_max_pages()) &&
>> + !mutex_is_locked(&vmap_purge_lock))
>> schedule_work(&purge_vmap_work);
>> }
>>
>> --
>> 2.10.2
>>
On 04/04/2017 12:41 PM, Michal Hocko wrote:
> On Thu 30-03-17 17:48:39, Andrey Ryabinin wrote:
>> From: Andrey Ryabinin <[email protected]>
>> Subject: mm/vmalloc: allow to call vfree() in atomic context fix
>>
>> Don't spawn a worker if we're already purging.
>>
>> Signed-off-by: Andrey Ryabinin <[email protected]>
>
> I would rather put this into a separate patch. Ideally with some numbers,
> as this is an optimization...
>
It's quite a simple optimization and I don't think it deserves to be a separate patch.
But I did some measurements anyway. With VMAP_STACK=y enabled and NR_CACHED_STACK changed to 0,
running fork() 100000 times gives this:
With optimization:
~ # grep try_purge /proc/kallsyms
ffffffff811d0dd0 t try_purge_vmap_area_lazy
~ # perf stat --repeat 10 -ae workqueue:workqueue_queue_work --filter 'function == 0xffffffff811d0dd0' ./fork
Performance counter stats for 'system wide' (10 runs):
15 workqueue:workqueue_queue_work ( +- 0.88% )
1.615368474 seconds time elapsed ( +- 0.41% )
Without optimization:
~ # grep try_purge /proc/kallsyms
ffffffff811d0dd0 t try_purge_vmap_area_lazy
~ # perf stat --repeat 10 -ae workqueue:workqueue_queue_work --filter 'function == 0xffffffff811d0dd0' ./fork
Performance counter stats for 'system wide' (10 runs):
30 workqueue:workqueue_queue_work ( +- 1.31% )
1.613231060 seconds time elapsed ( +- 0.38% )
So there is no measurable difference on the test itself, but we queue twice as many jobs without this optimization.
It should decrease the load on kworkers.
>> ---
>> mm/vmalloc.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index ea1b4ab..88168b8 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -737,7 +737,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
>> /* After this point, we may free va at any time */
>> llist_add(&va->purge_list, &vmap_purge_list);
>>
>> - if (unlikely(nr_lazy > lazy_max_pages()))
>> + if (unlikely(nr_lazy > lazy_max_pages()) &&
>> + !mutex_is_locked(&vmap_purge_lock))
>> schedule_work(&purge_vmap_work);
>> }
>>
>> --
>> 2.10.2
>>
>
On Wed 05-04-17 13:31:23, Andrey Ryabinin wrote:
> On 04/04/2017 12:41 PM, Michal Hocko wrote:
> > On Thu 30-03-17 17:48:39, Andrey Ryabinin wrote:
> >> From: Andrey Ryabinin <[email protected]>
> >> Subject: mm/vmalloc: allow to call vfree() in atomic context fix
> >>
> >> Don't spawn a worker if we're already purging.
> >>
> >> Signed-off-by: Andrey Ryabinin <[email protected]>
> >
> > I would rather put this into a separate patch. Ideally with some numbers,
> > as this is an optimization...
> >
>
> It's quite a simple optimization and I don't think it deserves to
> be a separate patch.
I disagree. I am pretty sure nobody will remember after a few years. I
do not want to push too hard on this, but I can tell you from my own
experience that we used to do way too many optimizations like that in
the past and they tend to be real head-scratchers these days. Moreover,
people just tend to build on top of them without understanding, and then
chances are quite high that they are no longer relevant anymore.
> But I did some measurements anyway. With VMAP_STACK=y enabled and
> NR_CACHED_STACK changed to 0, running fork() 100000 times gives this:
>
> With optimization:
>
> ~ # grep try_purge /proc/kallsyms
> ffffffff811d0dd0 t try_purge_vmap_area_lazy
> ~ # perf stat --repeat 10 -ae workqueue:workqueue_queue_work --filter 'function == 0xffffffff811d0dd0' ./fork
>
> Performance counter stats for 'system wide' (10 runs):
>
> 15 workqueue:workqueue_queue_work ( +- 0.88% )
>
> 1.615368474 seconds time elapsed ( +- 0.41% )
>
>
> Without optimization:
> ~ # grep try_purge /proc/kallsyms
> ffffffff811d0dd0 t try_purge_vmap_area_lazy
> ~ # perf stat --repeat 10 -ae workqueue:workqueue_queue_work --filter 'function == 0xffffffff811d0dd0' ./fork
>
> Performance counter stats for 'system wide' (10 runs):
>
> 30 workqueue:workqueue_queue_work ( +- 1.31% )
>
> 1.613231060 seconds time elapsed ( +- 0.38% )
>
>
> So there is no measurable difference on the test itself, but we queue
> twice as many jobs without this optimization. It should decrease the
> load on kworkers.
And this is really valuable for the changelog!
Thanks!
--
Michal Hocko
SUSE Labs
On 03/30/2017 04:48 PM, Andrey Ryabinin wrote:
> On 03/30/2017 03:00 PM, Thomas Hellstrom wrote:
>
>>>
>>> if (unlikely(nr_lazy > lazy_max_pages()))
>>> - try_purge_vmap_area_lazy();
>>
>> Perhaps a slight optimization would be to schedule work iff
>> !mutex_is_locked(&vmap_purge_lock) below?
>>
>
> Makes sense, we don't need to spawn workers if we're already purging.
>
>
>
> From: Andrey Ryabinin <[email protected]>
> Subject: mm/vmalloc: allow to call vfree() in atomic context fix
>
> Don't spawn a worker if we're already purging.
>
> Signed-off-by: Andrey Ryabinin <[email protected]>
> ---
> mm/vmalloc.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ea1b4ab..88168b8 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -737,7 +737,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
> /* After this point, we may free va at any time */
> llist_add(&va->purge_list, &vmap_purge_list);
>
> - if (unlikely(nr_lazy > lazy_max_pages()))
> + if (unlikely(nr_lazy > lazy_max_pages()) &&
> + !mutex_is_locked(&vmap_purge_lock))
So, isn't this racy? (and do we care?)
Vlastimil
> schedule_work(&purge_vmap_work);
> }
>
>
On Wed 05-04-17 13:42:19, Vlastimil Babka wrote:
> On 03/30/2017 04:48 PM, Andrey Ryabinin wrote:
[...]
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -737,7 +737,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
> > /* After this point, we may free va at any time */
> > llist_add(&va->purge_list, &vmap_purge_list);
> >
> > - if (unlikely(nr_lazy > lazy_max_pages()))
> > + if (unlikely(nr_lazy > lazy_max_pages()) &&
> > + !mutex_is_locked(&vmap_purge_lock))
>
> So, isn't this racy? (and do we care?)
yes, it is racy and no, we do not care AFAICS. If the lock is held then
somebody is already doing the work on our behalf. If we are unlucky
and that work has already been consumed (read: another lazy_max_pages
have been freed), then we would still try to lazy-free it during the
allocation. This would be something for the changelog of course.
--
Michal Hocko
SUSE Labs
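Spelled out against the code of the fix, the benign race looks like this
(the annotations are mine):

	/*
	 * The mutex_is_locked() read is racy: the current holder may release
	 * vmap_purge_lock right after the check, in which case we skip the
	 * purge this time.  The lazy list is not lost -- it is drained on
	 * the next vfree() that crosses the threshold, or from the
	 * allocation path.
	 */
	if (unlikely(nr_lazy > lazy_max_pages()) &&
	    !mutex_is_locked(&vmap_purge_lock))
		schedule_work(&purge_vmap_work);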
Changes since v1:
- Added a small optimization as a separate patch 5/5
- Collected Acks/Review tags.
Andrey Ryabinin (5):
mm/vmalloc: allow to call vfree() in atomic context
x86/ldt: use vfree() instead of vfree_atomic()
kernel/fork: use vfree() instead of vfree_atomic() to free thread
stack
mm/vmalloc: remove vfree_atomic()
mm/vmalloc: Don't spawn workers if somebody already purging
arch/x86/kernel/ldt.c | 2 +-
include/linux/vmalloc.h | 1 -
kernel/fork.c | 2 +-
mm/vmalloc.c | 52 +++++++++++--------------------------------------
4 files changed, 13 insertions(+), 44 deletions(-)
--
2.10.2
Commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem
as potentially sleeping") added might_sleep() to remove_vm_area() from
vfree(), and commit 763b218ddfaf ("mm: add preempt points into
__purge_vmap_area_lazy()") actually made vfree() potentially sleeping.
This broke vmwgfx driver which calls vfree() under spin_lock().
BUG: sleeping function called from invalid context at mm/vmalloc.c:1480
in_atomic(): 1, irqs_disabled(): 0, pid: 341, name: plymouthd
2 locks held by plymouthd/341:
#0: (drm_global_mutex){+.+.+.}, at: [<ffffffffc01c274b>] drm_release+0x3b/0x3b0 [drm]
#1: (&(&tfile->lock)->rlock){+.+...}, at: [<ffffffffc0173038>] ttm_object_file_release+0x28/0x90 [ttm]
Call Trace:
dump_stack+0x86/0xc3
___might_sleep+0x17d/0x250
__might_sleep+0x4a/0x80
remove_vm_area+0x22/0x90
__vunmap+0x2e/0x110
vfree+0x42/0x90
kvfree+0x2c/0x40
drm_ht_remove+0x1a/0x30 [drm]
ttm_object_file_release+0x50/0x90 [ttm]
vmw_postclose+0x47/0x60 [vmwgfx]
drm_release+0x290/0x3b0 [drm]
__fput+0xf8/0x210
____fput+0xe/0x10
task_work_run+0x85/0xc0
exit_to_usermode_loop+0xb4/0xc0
do_syscall_64+0x185/0x1f0
entry_SYSCALL64_slow_path+0x25/0x25
This can be fixed in vmwgfx, but it would be better to make vfree()
non-sleeping again because we may have other bugs like this one.
__purge_vmap_area_lazy() is the only function in the vfree() path that
wants to be able to sleep. So it makes sense to schedule
__purge_vmap_area_lazy() via schedule_work() so it runs only in sleepable
context. This will have a minimal effect on the regular vfree() path,
since __purge_vmap_area_lazy() is rarely called.
Fixes: 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem as potentially sleeping")
Reported-by: Tetsuo Handa <[email protected]>
Signed-off-by: Andrey Ryabinin <[email protected]>
Reviewed-by: Thomas Hellstrom <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Cc: <[email protected]>
---
mm/vmalloc.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8ef8ea1..1c74b26 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -701,7 +701,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
* Kick off a purge of the outstanding lazy areas. Don't bother if somebody
* is already purging.
*/
-static void try_purge_vmap_area_lazy(void)
+static void try_purge_vmap_area_lazy(struct work_struct *work)
{
if (mutex_trylock(&vmap_purge_lock)) {
__purge_vmap_area_lazy(ULONG_MAX, 0);
@@ -720,6 +720,8 @@ static void purge_vmap_area_lazy(void)
mutex_unlock(&vmap_purge_lock);
}
+static DECLARE_WORK(purge_vmap_work, try_purge_vmap_area_lazy);
+
/*
* Free a vmap area, caller ensuring that the area has been unmapped
* and flush_cache_vunmap had been called for the correct range
@@ -736,7 +738,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
llist_add(&va->purge_list, &vmap_purge_list);
if (unlikely(nr_lazy > lazy_max_pages()))
- try_purge_vmap_area_lazy();
+ schedule_work(&purge_vmap_work);
}
/*
@@ -1125,7 +1127,6 @@ void vm_unmap_ram(const void *mem, unsigned int count)
unsigned long addr = (unsigned long)mem;
struct vmap_area *va;
- might_sleep();
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
@@ -1477,8 +1478,6 @@ struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
- might_sleep();
-
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
--
2.10.2
vfree() can be used in any atomic context now, thus there is no point
in using vfree_atomic().
This reverts commit 0f110a9b956c ("kernel/fork: use vfree_atomic()
to free thread stack")
Signed-off-by: Andrey Ryabinin <[email protected]>
---
kernel/fork.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 81347bd..e4baa21 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -259,7 +259,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
}
local_irq_restore(flags);
- vfree_atomic(tsk->stack);
+ vfree(tsk->stack);
return;
}
#endif
--
2.10.2
Don't schedule purge_vmap_work if mutex_is_locked(&vmap_purge_lock),
as this means that purging is already running in another thread.
There is no point in scheduling an extra purge_vmap_work if somebody
is already purging for us, because that extra work will not do anything
useful.
To evaluate the performance impact of this change, a test that calls
fork() 100 000 times was run on a kernel with CONFIG_VMAP_STACK=y enabled
and NR_CACHED_STACK changed to 0 (so that each fork()/exit() executes
a vmalloc()/vfree() call).
Commands:
~ # grep try_purge /proc/kallsyms
ffffffff811d0dd0 t try_purge_vmap_area_lazy
~ # perf stat --repeat 10 -ae workqueue:workqueue_queue_work \
--filter 'function == 0xffffffff811d0dd0' ./fork
gave me the following results:
before:
30 workqueue:workqueue_queue_work ( +- 1.31% )
1.613231060 seconds time elapsed ( +- 0.38% )
after:
15 workqueue:workqueue_queue_work ( +- 0.88% )
1.615368474 seconds time elapsed ( +- 0.41% )
So there is no measurable difference in the performance of the test itself,
but without the optimization we queue twice as many jobs. This should save
kworkers from doing some useless work.
Signed-off-by: Andrey Ryabinin <[email protected]>
Suggested-by: Thomas Hellstrom <[email protected]>
Reviewed-by: Thomas Hellstrom <[email protected]>
---
mm/vmalloc.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ee62c0a..1079555 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -737,7 +737,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
/* After this point, we may free va at any time */
llist_add(&va->purge_list, &vmap_purge_list);
- if (unlikely(nr_lazy > lazy_max_pages()))
+ if (unlikely(nr_lazy > lazy_max_pages()) &&
+ !mutex_is_locked(&vmap_purge_lock))
schedule_work(&purge_vmap_work);
}
--
2.10.2
vfree() can be used in any atomic context and there are no
vfree_atomic() callers left, so let's remove it.
This reverts commit bf22e37a6413 ("mm: add vfree_atomic()")
Signed-off-by: Andrey Ryabinin <[email protected]>
Acked-by: Michal Hocko <[email protected]>
---
include/linux/vmalloc.h | 1 -
mm/vmalloc.c | 40 +++++-----------------------------------
2 files changed, 5 insertions(+), 36 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 46991ad..b4f044f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -83,7 +83,6 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
extern void vfree(const void *addr);
-extern void vfree_atomic(const void *addr);
extern void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1c74b26..ee62c0a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1534,38 +1534,6 @@ static void __vunmap(const void *addr, int deallocate_pages)
return;
}
-static inline void __vfree_deferred(const void *addr)
-{
- /*
- * Use raw_cpu_ptr() because this can be called from preemptible
- * context. Preemption is absolutely fine here, because the llist_add()
- * implementation is lockless, so it works even if we are adding to
- * nother cpu's list. schedule_work() should be fine with this too.
- */
- struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
-
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
-}
-
-/**
- * vfree_atomic - release memory allocated by vmalloc()
- * @addr: memory base address
- *
- * This one is just like vfree() but can be called in any atomic context
- * except NMIs.
- */
-void vfree_atomic(const void *addr)
-{
- BUG_ON(in_nmi());
-
- kmemleak_free(addr);
-
- if (!addr)
- return;
- __vfree_deferred(addr);
-}
-
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
@@ -1588,9 +1556,11 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt()))
- __vfree_deferred(addr);
- else
+ if (unlikely(in_interrupt())) {
+ struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
+ if (llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
+ } else
__vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
--
2.10.2
vfree() can be used in any atomic context now, thus there is no point
in vfree_atomic().
This reverts commit 8d5341a6260a ("x86/ldt: use vfree_atomic()
to free ldt entries")
Signed-off-by: Andrey Ryabinin <[email protected]>
Reviewed-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/ldt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d4a1583..f09df2f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
paravirt_free_ldt(ldt->entries, ldt->size);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree_atomic(ldt->entries);
+ vfree(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
--
2.10.2