Hi.
While running v5.10-rc5-rt11 I bumped into the following:
```
BUG: scheduling while atomic: git/18695/0x00000002
Preemption disabled at:
[<ffffffffbb93fcb3>] z3fold_zpool_malloc+0x463/0x6e0
…
Call Trace:
dump_stack+0x6d/0x88
__schedule_bug.cold+0x88/0x96
__schedule+0x69e/0x8c0
preempt_schedule_lock+0x51/0x150
rt_spin_lock_slowlock_locked+0x117/0x2c0
rt_spin_lock_slowlock+0x58/0x80
rt_spin_lock+0x2a/0x40
z3fold_zpool_malloc+0x4c1/0x6e0
zswap_frontswap_store+0x39c/0x980
__frontswap_store+0x6e/0xf0
swap_writepage+0x39/0x70
shmem_writepage+0x31b/0x490
pageout+0xf4/0x350
shrink_page_list+0xa28/0xcc0
shrink_inactive_list+0x300/0x690
shrink_lruvec+0x59a/0x770
shrink_node+0x2d6/0x8d0
do_try_to_free_pages+0xda/0x530
try_to_free_pages+0xff/0x260
__alloc_pages_slowpath.constprop.0+0x3d5/0x1230
__alloc_pages_nodemask+0x2f6/0x350
allocate_slab+0x3da/0x660
___slab_alloc+0x4ff/0x760
__slab_alloc.constprop.0+0x7a/0x100
kmem_cache_alloc+0x27b/0x2c0
__d_alloc+0x22/0x230
d_alloc_parallel+0x67/0x5e0
__lookup_slow+0x5c/0x150
path_lookupat+0x2ea/0x4d0
filename_lookup+0xbf/0x210
vfs_statx.constprop.0+0x4d/0x110
__do_sys_newlstat+0x3d/0x80
do_syscall_64+0x33/0x40
entry_SYSCALL_64_after_hwframe+0x44/0xa9
```
Preemption seems to be disabled here:
```
$ scripts/faddr2line mm/z3fold.o z3fold_zpool_malloc+0x463
z3fold_zpool_malloc+0x463/0x6e0:
add_to_unbuddied at mm/z3fold.c:645
(inlined by) z3fold_alloc at mm/z3fold.c:1195
(inlined by) z3fold_zpool_malloc at mm/z3fold.c:1737
```
The call to rt_spin_lock() seems to be here:
```
$ scripts/faddr2line mm/z3fold.o z3fold_zpool_malloc+0x4c1
z3fold_zpool_malloc+0x4c1/0x6e0:
add_to_unbuddied at mm/z3fold.c:649
(inlined by) z3fold_alloc at mm/z3fold.c:1195
(inlined by) z3fold_zpool_malloc at mm/z3fold.c:1737
```
Or, in source code:
```
639 /* Add to the appropriate unbuddied list */
640 static inline void add_to_unbuddied(struct z3fold_pool *pool,
641 struct z3fold_header *zhdr)
642 {
643 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
644 zhdr->middle_chunks == 0) {
645 struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
646
647 int freechunks = num_free_chunks(zhdr);
648 spin_lock(&pool->lock);
649 list_add(&zhdr->buddy, &unbuddied[freechunks]);
650 spin_unlock(&pool->lock);
651 zhdr->cpu = smp_processor_id();
652 put_cpu_ptr(pool->unbuddied);
653 }
654 }
```
Shouldn't the list manipulation be protected with
local_lock+this_cpu_ptr instead of get_cpu_ptr+spin_lock?
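For context, a minimal self-contained sketch (hypothetical "demo" names, not
the actual z3fold code) of why this pattern trips PREEMPT_RT in the first
place:
```
/*
 * Illustration only: get_cpu_ptr() disables preemption, but on
 * PREEMPT_RT a spinlock_t is a sleeping rt_mutex, so taking it inside
 * the get_cpu_ptr()/put_cpu_ptr() section is exactly the
 * "scheduling while atomic" reported above.
 */
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
/* each CPU's head is assumed to be INIT_LIST_HEAD()ed during init */
static DEFINE_PER_CPU(struct list_head, demo_list);

static void demo_add(struct list_head *entry)
{
	struct list_head *head = get_cpu_ptr(&demo_list); /* preempt_disable() */

	spin_lock(&demo_lock);		/* may sleep on RT -> splat */
	list_add(entry, head);
	spin_unlock(&demo_lock);
	put_cpu_ptr(&demo_list);	/* preempt_enable() */
}
```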
Thanks.
--
Oleksandr Natalenko (post-factum)
On Sat, Nov 28, 2020 at 03:05:24PM +0100, Oleksandr Natalenko wrote:
> Hi.
>
> While running v5.10-rc5-rt11 I bumped into the following:
>
> ```
> BUG: scheduling while atomic: git/18695/0x00000002
> Preemption disabled at:
> [<ffffffffbb93fcb3>] z3fold_zpool_malloc+0x463/0x6e0
> …
> Call Trace:
> dump_stack+0x6d/0x88
> __schedule_bug.cold+0x88/0x96
> __schedule+0x69e/0x8c0
> preempt_schedule_lock+0x51/0x150
> rt_spin_lock_slowlock_locked+0x117/0x2c0
> rt_spin_lock_slowlock+0x58/0x80
> rt_spin_lock+0x2a/0x40
> z3fold_zpool_malloc+0x4c1/0x6e0
> zswap_frontswap_store+0x39c/0x980
> __frontswap_store+0x6e/0xf0
> swap_writepage+0x39/0x70
> shmem_writepage+0x31b/0x490
> pageout+0xf4/0x350
> shrink_page_list+0xa28/0xcc0
> shrink_inactive_list+0x300/0x690
> shrink_lruvec+0x59a/0x770
> shrink_node+0x2d6/0x8d0
> do_try_to_free_pages+0xda/0x530
> try_to_free_pages+0xff/0x260
> __alloc_pages_slowpath.constprop.0+0x3d5/0x1230
> __alloc_pages_nodemask+0x2f6/0x350
> allocate_slab+0x3da/0x660
> ___slab_alloc+0x4ff/0x760
> __slab_alloc.constprop.0+0x7a/0x100
> kmem_cache_alloc+0x27b/0x2c0
> __d_alloc+0x22/0x230
> d_alloc_parallel+0x67/0x5e0
> __lookup_slow+0x5c/0x150
> path_lookupat+0x2ea/0x4d0
> filename_lookup+0xbf/0x210
> vfs_statx.constprop.0+0x4d/0x110
> __do_sys_newlstat+0x3d/0x80
> do_syscall_64+0x33/0x40
> entry_SYSCALL_64_after_hwframe+0x44/0xa9
> ```
>
> Preemption seems to be disabled here:
>
> ```
> $ scripts/faddr2line mm/z3fold.o z3fold_zpool_malloc+0x463
> z3fold_zpool_malloc+0x463/0x6e0:
> add_to_unbuddied at mm/z3fold.c:645
> (inlined by) z3fold_alloc at mm/z3fold.c:1195
> (inlined by) z3fold_zpool_malloc at mm/z3fold.c:1737
> ```
>
> The call to rt_spin_lock() seems to be here:
>
> ```
> $ scripts/faddr2line mm/z3fold.o z3fold_zpool_malloc+0x4c1
> z3fold_zpool_malloc+0x4c1/0x6e0:
> add_to_unbuddied at mm/z3fold.c:649
> (inlined by) z3fold_alloc at mm/z3fold.c:1195
> (inlined by) z3fold_zpool_malloc at mm/z3fold.c:1737
> ```
>
> Or, in source code:
>
> ```
> 639 /* Add to the appropriate unbuddied list */
> 640 static inline void add_to_unbuddied(struct z3fold_pool *pool,
> 641 struct z3fold_header *zhdr)
> 642 {
> 643 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
> 644 zhdr->middle_chunks == 0) {
> 645 struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
> 646
> 647 int freechunks = num_free_chunks(zhdr);
> 648 spin_lock(&pool->lock);
> 649 list_add(&zhdr->buddy, &unbuddied[freechunks]);
> 650 spin_unlock(&pool->lock);
> 651 zhdr->cpu = smp_processor_id();
> 652 put_cpu_ptr(pool->unbuddied);
> 653 }
> 654 }
> ```
>
> Shouldn't the list manipulation be protected with
> local_lock+this_cpu_ptr instead of get_cpu_ptr+spin_lock?
>
> Thanks.
>
> --
> Oleksandr Natalenko (post-factum)
Forgot to Cc linux-rt-users@, sorry.
--
Oleksandr Natalenko (post-factum)
On Sat, Nov 28, 2020 at 03:09:24PM +0100, Oleksandr Natalenko wrote:
> > While running v5.10-rc5-rt11 I bumped into the following:
> >
> > ```
> > BUG: scheduling while atomic: git/18695/0x00000002
> > Preemption disabled at:
> > [<ffffffffbb93fcb3>] z3fold_zpool_malloc+0x463/0x6e0
> > …
> > Call Trace:
> > dump_stack+0x6d/0x88
> > __schedule_bug.cold+0x88/0x96
> > __schedule+0x69e/0x8c0
> > preempt_schedule_lock+0x51/0x150
> > rt_spin_lock_slowlock_locked+0x117/0x2c0
> > rt_spin_lock_slowlock+0x58/0x80
> > rt_spin_lock+0x2a/0x40
> > z3fold_zpool_malloc+0x4c1/0x6e0
> > zswap_frontswap_store+0x39c/0x980
> > __frontswap_store+0x6e/0xf0
> > swap_writepage+0x39/0x70
> > shmem_writepage+0x31b/0x490
> > pageout+0xf4/0x350
> > shrink_page_list+0xa28/0xcc0
> > shrink_inactive_list+0x300/0x690
> > shrink_lruvec+0x59a/0x770
> > shrink_node+0x2d6/0x8d0
> > do_try_to_free_pages+0xda/0x530
> > try_to_free_pages+0xff/0x260
> > __alloc_pages_slowpath.constprop.0+0x3d5/0x1230
> > __alloc_pages_nodemask+0x2f6/0x350
> > allocate_slab+0x3da/0x660
> > ___slab_alloc+0x4ff/0x760
> > __slab_alloc.constprop.0+0x7a/0x100
> > kmem_cache_alloc+0x27b/0x2c0
> > __d_alloc+0x22/0x230
> > d_alloc_parallel+0x67/0x5e0
> > __lookup_slow+0x5c/0x150
> > path_lookupat+0x2ea/0x4d0
> > filename_lookup+0xbf/0x210
> > vfs_statx.constprop.0+0x4d/0x110
> > __do_sys_newlstat+0x3d/0x80
> > do_syscall_64+0x33/0x40
> > entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > ```
> >
> > Preemption seems to be disabled here:
> >
> > ```
> > $ scripts/faddr2line mm/z3fold.o z3fold_zpool_malloc+0x463
> > z3fold_zpool_malloc+0x463/0x6e0:
> > add_to_unbuddied at mm/z3fold.c:645
> > (inlined by) z3fold_alloc at mm/z3fold.c:1195
> > (inlined by) z3fold_zpool_malloc at mm/z3fold.c:1737
> > ```
> >
> > The call to rt_spin_lock() seems to be here:
> >
> > ```
> > $ scripts/faddr2line mm/z3fold.o z3fold_zpool_malloc+0x4c1
> > z3fold_zpool_malloc+0x4c1/0x6e0:
> > add_to_unbuddied at mm/z3fold.c:649
> > (inlined by) z3fold_alloc at mm/z3fold.c:1195
> > (inlined by) z3fold_zpool_malloc at mm/z3fold.c:1737
> > ```
> >
> > Or, in source code:
> >
> > ```
> > 639 /* Add to the appropriate unbuddied list */
> > 640 static inline void add_to_unbuddied(struct z3fold_pool *pool,
> > 641 struct z3fold_header *zhdr)
> > 642 {
> > 643 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
> > 644 zhdr->middle_chunks == 0) {
> > 645 struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
> > 646
> > 647 int freechunks = num_free_chunks(zhdr);
> > 648 spin_lock(&pool->lock);
> > 649 list_add(&zhdr->buddy, &unbuddied[freechunks]);
> > 650 spin_unlock(&pool->lock);
> > 651 zhdr->cpu = smp_processor_id();
> > 652 put_cpu_ptr(pool->unbuddied);
> > 653 }
> > 654 }
> > ```
> >
> > Shouldn't the list manipulation be protected with
> > local_lock+this_cpu_ptr instead of get_cpu_ptr+spin_lock?
Totally untested:
```
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 18feaa0bc537..53fcb80c6167 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -41,6 +41,7 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/local_lock.h>
#include <linux/zpool.h>
#include <linux/magic.h>
#include <linux/kmemleak.h>
@@ -156,6 +157,7 @@ struct z3fold_pool {
const char *name;
spinlock_t lock;
spinlock_t stale_lock;
+ local_lock_t llock;
struct list_head *unbuddied;
struct list_head lru;
struct list_head stale;
@@ -642,14 +644,17 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool,
{
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+ struct list_head *unbuddied;
+ int freechunks;
+ local_lock(&pool->llock);
+ unbuddied = *this_cpu_ptr(&pool->unbuddied);
- int freechunks = num_free_chunks(zhdr);
+ freechunks = num_free_chunks(zhdr);
spin_lock(&pool->lock);
list_add(&zhdr->buddy, &unbuddied[freechunks]);
spin_unlock(&pool->lock);
zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
+ local_unlock(&pool->llock);
}
}
@@ -887,7 +892,8 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
lookup:
/* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
+ local_lock(&pool->llock);
+ unbuddied = *this_cpu_ptr(&pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
struct list_head *l = &unbuddied[i];
@@ -905,7 +911,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
!z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ local_unlock(&pool->llock);
if (can_sleep)
cond_resched();
goto lookup;
@@ -919,7 +925,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ local_unlock(&pool->llock);
if (can_sleep)
cond_resched();
goto lookup;
@@ -934,7 +940,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
kref_get(&zhdr->refcount);
break;
}
- put_cpu_ptr(pool->unbuddied);
+ local_unlock(&pool->llock);
if (!zhdr) {
int cpu;
@@ -1005,6 +1011,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
+ local_lock_init(&pool->llock);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
if (!pool->unbuddied)
goto out_pool;
```
--
Oleksandr Natalenko (post-factum)
On Sat, 2020-11-28 at 15:27 +0100, Oleksandr Natalenko wrote:
>
> > > Shouldn't the list manipulation be protected with
> > > local_lock+this_cpu_ptr instead of get_cpu_ptr+spin_lock?
>
> Totally untested:
Hrm, the thing doesn't seem to care deeply about preemption being
disabled, so adding another lock may be overkill. It looks like you
could get the job done via migrate_disable()+this_cpu_ptr().
In the case of the list in __z3fold_alloc(), after the unlocked peek,
it double checks under the existing lock. There's not a whole lot of
difference between another cpu a few lines down in the same function
diddling the list and a preemption doing the same, so why bother?
Ditto add_to_unbuddied(). Flow decisions are being made based on
zhdr->first/middle/last_chunks state prior to preemption being disabled as
the thing sits, so presumably their stability is not dependent thereon,
while the list is protected by the already existing lock, making the
preempt_disable() look more incidental than intentional.
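In other words, the semantics being leaned on look like this (illustrative
sketch with made-up names, not part of the patch below):
```
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>

static DEFINE_PER_CPU(int, demo_counter);
static DEFINE_SPINLOCK(demo_counter_lock);

/*
 * Unlike get_cpu_ptr(), migrate_disable() only pins the task to its
 * current CPU; preemption (and thus sleeping) stays legal, so a
 * spinlock_t -- a sleeping lock on PREEMPT_RT -- may be taken inside
 * the section, while this_cpu_ptr() keeps pointing at the same CPU's
 * data until migrate_enable().
 */
static void demo_bump(void)
{
	int *cnt;

	migrate_disable();			/* pinned, but still preemptible */
	cnt = this_cpu_ptr(&demo_counter);	/* stable: we cannot migrate away */
	spin_lock(&demo_counter_lock);		/* fine on RT even though it may sleep */
	(*cnt)++;
	spin_unlock(&demo_counter_lock);
	migrate_enable();
}
```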
Equally untested :)
---
mm/z3fold.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -642,14 +642,16 @@ static inline void add_to_unbuddied(stru
{
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
+ struct list_head *unbuddied;
int freechunks = num_free_chunks(zhdr);
+
+ migrate_disable();
+ unbuddied = this_cpu_ptr(pool->unbuddied);
spin_lock(&pool->lock);
list_add(&zhdr->buddy, &unbuddied[freechunks]);
spin_unlock(&pool->lock);
zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
}
}
@@ -886,8 +888,9 @@ static inline struct z3fold_header *__z3
int chunks = size_to_chunks(size), i;
lookup:
+ migrate_disable();
/* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
+ unbuddied = this_cpu_ptr(pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
struct list_head *l = &unbuddied[i];
@@ -905,7 +908,7 @@ static inline struct z3fold_header *__z3
!z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -919,7 +922,7 @@ static inline struct z3fold_header *__z3
test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -934,7 +937,7 @@ static inline struct z3fold_header *__z3
kref_get(&zhdr->refcount);
break;
}
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (!zhdr) {
int cpu;
On Sun, 2020-11-29 at 07:41 +0100, Mike Galbraith wrote:
> On Sat, 2020-11-28 at 15:27 +0100, Oleksandr Natalenko wrote:
> >
> > > > Shouldn't the list manipulation be protected with
> > > > local_lock+this_cpu_ptr instead of get_cpu_ptr+spin_lock?
> >
> > Totally untested:
>
> Hrm, the thing doesn't seem to care deeply about preemption being
> disabled, so adding another lock may be overkill. It looks like you
> could get the job done via migrate_disable()+this_cpu_ptr().
There is however an ever so tiny chance that I'm wrong about that :)
crash.rt> bt -s
PID: 6699 TASK: ffff913c464b5640 CPU: 0 COMMAND: "oom01"
#0 [ffffb6b94adff6f0] machine_kexec+366 at ffffffffbd05f87e
#1 [ffffb6b94adff738] __crash_kexec+210 at ffffffffbd14c052
#2 [ffffb6b94adff7f8] crash_kexec+48 at ffffffffbd14d240
#3 [ffffb6b94adff808] oops_end+202 at ffffffffbd02680a
#4 [ffffb6b94adff828] no_context+333 at ffffffffbd06d7ed
#5 [ffffb6b94adff888] exc_page_fault+696 at ffffffffbd8c0b68
#6 [ffffb6b94adff8e0] asm_exc_page_fault+30 at ffffffffbda00ace
#7 [ffffb6b94adff968] mark_wakeup_next_waiter+81 at ffffffffbd0ea1e1
#8 [ffffb6b94adff9c8] rt_mutex_futex_unlock+79 at ffffffffbd8cc3cf
#9 [ffffb6b94adffa08] z3fold_zpool_free+1319 at ffffffffbd2b6b17
#10 [ffffb6b94adffa68] zswap_free_entry+67 at ffffffffbd27c6f3
#11 [ffffb6b94adffa78] zswap_frontswap_invalidate_page+138 at ffffffffbd27c7fa
#12 [ffffb6b94adffaa0] __frontswap_invalidate_page+72 at ffffffffbd27bee8
#13 [ffffb6b94adffac8] swapcache_free_entries+494 at ffffffffbd276e1e
#14 [ffffb6b94adffb10] free_swap_slot+173 at ffffffffbd27b7dd
#15 [ffffb6b94adffb30] __swap_entry_free+112 at ffffffffbd2768d0
#16 [ffffb6b94adffb58] free_swap_and_cache+57 at ffffffffbd278939
#17 [ffffb6b94adffb80] unmap_page_range+1485 at ffffffffbd24c52d
#18 [ffffb6b94adffc40] __oom_reap_task_mm+178 at ffffffffbd218f02
#19 [ffffb6b94adffd10] exit_mmap+339 at ffffffffbd257da3
#20 [ffffb6b94adffdb0] mmput+78 at ffffffffbd07fe7e
#21 [ffffb6b94adffdc0] do_exit+822 at ffffffffbd089bc6
#22 [ffffb6b94adffe28] do_group_exit+71 at ffffffffbd08a547
#23 [ffffb6b94adffe50] get_signal+319 at ffffffffbd0979ff
#24 [ffffb6b94adffe98] arch_do_signal+30 at ffffffffbd022cbe
#25 [ffffb6b94adfff28] exit_to_user_mode_prepare+293 at ffffffffbd1223e5
#26 [ffffb6b94adfff48] irqentry_exit_to_user_mode+5 at ffffffffbd8c1675
#27 [ffffb6b94adfff50] asm_exc_page_fault+30 at ffffffffbda00ace
RIP: 0000000000414300 RSP: 00007f5ddf065ec0 RFLAGS: 00010206
RAX: 0000000000001000 RBX: 00000000c0000000 RCX: 00000000adf28000
RDX: 00007f5d0bf8d000 RSI: 00000000c0000000 RDI: 0000000000000000
RBP: 00007f5c5e065000 R8: ffffffffffffffff R9: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000001000
R13: 0000000000000001 R14: 0000000000000001 R15: 00007ffc953ebcd0
ORIG_RAX: ffffffffffffffff CS: 0033 SS: 002b
crash.rt>
On Sun, 2020-11-29 at 08:48 +0100, Mike Galbraith wrote:
> On Sun, 2020-11-29 at 07:41 +0100, Mike Galbraith wrote:
> > On Sat, 2020-11-28 at 15:27 +0100, Oleksandr Natalenko wrote:
> > >
> > > > > Shouldn't the list manipulation be protected with
> > > > > local_lock+this_cpu_ptr instead of get_cpu_ptr+spin_lock?
> > >
> > > Totally untested:
> >
> > Hrm, the thing doesn't seem to care deeply about preemption being
> > disabled, so adding another lock may be overkill. It looks like you
> > could get the job done via migrate_disable()+this_cpu_ptr().
>
> There is however an ever so tiny chance that I'm wrong about that :)
Or not, your local_lock+this_cpu_ptr version exploded too.
Perhaps there's a bit of non-rt related racy racy going on in zswap
thingy that makes swap an even less wonderful idea for RT than usual.
crash.rt> bt -s
PID: 32399 TASK: ffff8e4528cd8000 CPU: 4 COMMAND: "cc1"
#0 [ffff9f0f1228f488] machine_kexec+366 at ffffffff8c05f87e
#1 [ffff9f0f1228f4d0] __crash_kexec+210 at ffffffff8c14c052
#2 [ffff9f0f1228f590] crash_kexec+48 at ffffffff8c14d240
#3 [ffff9f0f1228f5a0] oops_end+202 at ffffffff8c02680a
#4 [ffff9f0f1228f5c0] exc_general_protection+403 at ffffffff8c8be413
#5 [ffff9f0f1228f650] asm_exc_general_protection+30 at ffffffff8ca00a0e
#6 [ffff9f0f1228f6d8] __z3fold_alloc+118 at ffffffff8c2b4ea6
#7 [ffff9f0f1228f760] z3fold_zpool_malloc+115 at ffffffff8c2b56c3
#8 [ffff9f0f1228f7c8] zswap_frontswap_store+789 at ffffffff8c27d335
#9 [ffff9f0f1228f828] __frontswap_store+110 at ffffffff8c27bafe
#10 [ffff9f0f1228f858] swap_writepage+55 at ffffffff8c273b17
#11 [ffff9f0f1228f870] shmem_writepage+612 at ffffffff8c232964
#12 [ffff9f0f1228f8a8] pageout+210 at ffffffff8c225f12
#13 [ffff9f0f1228f928] shrink_page_list+2428 at ffffffff8c22744c
#14 [ffff9f0f1228f9c0] shrink_inactive_list+534 at ffffffff8c229746
#15 [ffff9f0f1228fa68] shrink_lruvec+927 at ffffffff8c22a35f
#16 [ffff9f0f1228fb78] shrink_node+567 at ffffffff8c22a7d7
#17 [ffff9f0f1228fbf8] do_try_to_free_pages+185 at ffffffff8c22ad39
#18 [ffff9f0f1228fc40] try_to_free_pages+201 at ffffffff8c22c239
#19 [ffff9f0f1228fcd0] __alloc_pages_slowpath.constprop.111+1056 at ffffffff8c26eb70
#20 [ffff9f0f1228fda8] __alloc_pages_nodemask+786 at ffffffff8c26f7e2
#21 [ffff9f0f1228fe00] alloc_pages_vma+309 at ffffffff8c288f15
#22 [ffff9f0f1228fe40] handle_mm_fault+1687 at ffffffff8c24ee97
#23 [ffff9f0f1228fef8] exc_page_fault+821 at ffffffff8c8c1be5
#24 [ffff9f0f1228ff50] asm_exc_page_fault+30 at ffffffff8ca00ace
RIP: 00000000010fea3b RSP: 00007ffc88ad5a50 RFLAGS: 00010206
RAX: 00007f4a548d1638 RBX: 00007f4a5c0c5000 RCX: 0000000000000000
RDX: 0000000000020000 RSI: 000000000000000f RDI: 0000000000018001
RBP: 00007f4a5547a000 R8: 0000000000018000 R9: 0000000000240000
R10: 00007f4a5523a000 R11: 0000000000098628 R12: 00007f4a548d1638
R13: 00007f4a54ab1478 R14: 000000005002ac29 R15: 0000000000000000
ORIG_RAX: ffffffffffffffff CS: 0033 SS: 002b
crash.rt>
On Sun, 2020-11-29 at 10:21 +0100, Mike Galbraith wrote:
> On Sun, 2020-11-29 at 08:48 +0100, Mike Galbraith wrote:
> > On Sun, 2020-11-29 at 07:41 +0100, Mike Galbraith wrote:
> > > On Sat, 2020-11-28 at 15:27 +0100, Oleksandr Natalenko wrote:
> > > >
> > > > > > Shouldn't the list manipulation be protected with
> > > > > > local_lock+this_cpu_ptr instead of get_cpu_ptr+spin_lock?
> > > >
> > > > Totally untested:
> > >
> > > Hrm, the thing doesn't seem to care deeply about preemption being
> > > disabled, so adding another lock may be overkill. It looks like you
> > > could get the job done via migrate_disable()+this_cpu_ptr().
> >
> > There is however an ever so tiny chance that I'm wrong about that :)
>
> Or not, your local_lock+this_cpu_ptr version exploded too.
>
> Perhaps there's a bit of non-rt related racy racy going on in zswap
> thingy that makes swap an even less wonderful idea for RT than usual.
Raciness seems to be restricted to pool compressor. "zbud" seems to be
solid, virgin "zsmalloc" explodes, as does "z3fold" regardless which of
us puts his grubby fingerprints on it.
Exploding compressors survived zero runs of runltp -f mm, I declared
zbud to be at least kinda sorta stable after box survived five runs.
-Mike
On Sun, Nov 29, 2020 at 11:56:55AM +0100, Mike Galbraith wrote:
> On Sun, 2020-11-29 at 10:21 +0100, Mike Galbraith wrote:
> > On Sun, 2020-11-29 at 08:48 +0100, Mike Galbraith wrote:
> > > On Sun, 2020-11-29 at 07:41 +0100, Mike Galbraith wrote:
> > > > On Sat, 2020-11-28 at 15:27 +0100, Oleksandr Natalenko wrote:
> > > > >
> > > > > > > Shouldn't the list manipulation be protected with
> > > > > > > local_lock+this_cpu_ptr instead of get_cpu_ptr+spin_lock?
> > > > >
> > > > > Totally untested:
> > > >
> > > > Hrm, the thing doesn't seem to care deeply about preemption being
> > > > disabled, so adding another lock may be overkill. It looks like you
> > > > could get the job done via migrate_disable()+this_cpu_ptr().
> > >
> > > There is however an ever so tiny chance that I'm wrong about that :)
> >
> > Or not, your local_lock+this_cpu_ptr version exploded too.
> >
> > Perhaps there's a bit of non-rt related racy racy going on in zswap
> > thingy that makes swap an even less wonderful idea for RT than usual.
>
> Raciness seems to be restricted to pool compressor. "zbud" seems to be
> solid, virgin "zsmalloc" explodes, as does "z3fold" regardless which of
> us puts his grubby fingerprints on it.
>
> Exploding compressors survived zero runs of runltp -f mm, I declared
> zbud to be at least kinda sorta stable after box survived five runs.
Ummm, so do compressors explode under a non-rt kernel in your tests as
well, or is it just -rt that triggers this?
I've never seen that on either -rt or non-rt, thus asking.
--
Oleksandr Natalenko (post-factum)
On Sun, 2020-11-29 at 12:29 +0100, Oleksandr Natalenko wrote:
>
> Ummm, so do compressors explode under a non-rt kernel in your tests as
> well, or is it just -rt that triggers this?
I only tested a non-rt kernel with z3fold, which worked just fine.
-Mike
On 2020-11-29 12:41:14 [+0100], Mike Galbraith wrote:
> On Sun, 2020-11-29 at 12:29 +0100, Oleksandr Natalenko wrote:
> >
> > Ummm, so do compressors explode under a non-rt kernel in your tests as
> > well, or is it just -rt that triggers this?
>
> I only tested a non-rt kernel with z3fold, which worked just fine.
I tried this and it did not explode yet. Mike, can you please
confirm?
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 18feaa0bc5377..0bf70f624a4bd 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -642,14 +642,17 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool,
{
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
+ struct list_head *unbuddied;
int freechunks = num_free_chunks(zhdr);
+
+ migrate_disable();
+ unbuddied = this_cpu_ptr(pool->unbuddied);
+
spin_lock(&pool->lock);
list_add(&zhdr->buddy, &unbuddied[freechunks]);
spin_unlock(&pool->lock);
zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
}
}
@@ -887,7 +890,8 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
lookup:
/* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
+ migrate_disable();
+ unbuddied = this_cpu_ptr(pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
struct list_head *l = &unbuddied[i];
@@ -905,7 +909,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
!z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -919,7 +923,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -934,7 +938,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
kref_get(&zhdr->refcount);
break;
}
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (!zhdr) {
int cpu;
diff --git a/mm/zswap.c b/mm/zswap.c
index 78a20f7b00f2c..b24f761b9241c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -394,7 +394,9 @@ struct zswap_comp {
u8 *dstmem;
};
-static DEFINE_PER_CPU(struct zswap_comp, zswap_comp);
+static DEFINE_PER_CPU(struct zswap_comp, zswap_comp) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
static int zswap_dstmem_prepare(unsigned int cpu)
{
> -Mike
Sebastian
On Mon, Nov 30, 2020 at 02:20:14PM +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-29 12:41:14 [+0100], Mike Galbraith wrote:
> > On Sun, 2020-11-29 at 12:29 +0100, Oleksandr Natalenko wrote:
> > >
> > > Ummm, so do compressors explode under a non-rt kernel in your tests as
> > > well, or is it just -rt that triggers this?
> >
> > I only tested a non-rt kernel with z3fold, which worked just fine.
>
> I tried this and it did not explode yet. Mike, can you please
> confirm?
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 18feaa0bc5377..0bf70f624a4bd 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -642,14 +642,17 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool,
> {
> if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
> zhdr->middle_chunks == 0) {
> - struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
> -
> + struct list_head *unbuddied;
> int freechunks = num_free_chunks(zhdr);
> +
> + migrate_disable();
> + unbuddied = this_cpu_ptr(pool->unbuddied);
> +
> spin_lock(&pool->lock);
> list_add(&zhdr->buddy, &unbuddied[freechunks]);
> spin_unlock(&pool->lock);
> zhdr->cpu = smp_processor_id();
> - put_cpu_ptr(pool->unbuddied);
> + migrate_enable();
> }
> }
>
> @@ -887,7 +890,8 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
>
> lookup:
> /* First, try to find an unbuddied z3fold page. */
> - unbuddied = get_cpu_ptr(pool->unbuddied);
> + migrate_disable();
> + unbuddied = this_cpu_ptr(pool->unbuddied);
> for_each_unbuddied_list(i, chunks) {
> struct list_head *l = &unbuddied[i];
>
> @@ -905,7 +909,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
> !z3fold_page_trylock(zhdr)) {
> spin_unlock(&pool->lock);
> zhdr = NULL;
> - put_cpu_ptr(pool->unbuddied);
> + migrate_enable();
> if (can_sleep)
> cond_resched();
> goto lookup;
> @@ -919,7 +923,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
> test_bit(PAGE_CLAIMED, &page->private)) {
> z3fold_page_unlock(zhdr);
> zhdr = NULL;
> - put_cpu_ptr(pool->unbuddied);
> + migrate_enable();
> if (can_sleep)
> cond_resched();
> goto lookup;
> @@ -934,7 +938,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
> kref_get(&zhdr->refcount);
> break;
> }
> - put_cpu_ptr(pool->unbuddied);
> + migrate_enable();
>
> if (!zhdr) {
> int cpu;
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 78a20f7b00f2c..b24f761b9241c 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -394,7 +394,9 @@ struct zswap_comp {
> u8 *dstmem;
> };
>
> -static DEFINE_PER_CPU(struct zswap_comp, zswap_comp);
> +static DEFINE_PER_CPU(struct zswap_comp, zswap_comp) = {
> + .lock = INIT_LOCAL_LOCK(lock),
Is it a leftover?
> +};
>
> static int zswap_dstmem_prepare(unsigned int cpu)
> {
>
> > -Mike
>
> Sebastian
--
Oleksandr Natalenko (post-factum)
On 2020-11-30 14:53:22 [+0100], Oleksandr Natalenko wrote:
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 78a20f7b00f2c..b24f761b9241c 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -394,7 +394,9 @@ struct zswap_comp {
> > u8 *dstmem;
> > };
> >
> > -static DEFINE_PER_CPU(struct zswap_comp, zswap_comp);
> > +static DEFINE_PER_CPU(struct zswap_comp, zswap_comp) = {
> > + .lock = INIT_LOCAL_LOCK(lock),
>
> Is it a residue?
It has been added on purpose.
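For reference, the idiom the hunk follows looks roughly like this (a sketch
with hypothetical "demo" names; it assumes the RT tree gave struct zswap_comp
a local_lock_t member named lock, which is what the initializer above
suggests):
```
#include <linux/local_lock.h>
#include <linux/percpu.h>
#include <linux/types.h>

struct demo_comp {
	local_lock_t lock;
	u8 *dstmem;
};

/*
 * A statically defined per-CPU local_lock_t gets no runtime init on
 * this path, so it needs its initializer at definition time, just like
 * any other statically defined lock.
 */
static DEFINE_PER_CPU(struct demo_comp, demo_comp) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void demo_use(void)
{
	u8 *dst;

	local_lock(&demo_comp.lock);	/* per-CPU serialization; sleepable on RT */
	dst = this_cpu_ptr(&demo_comp)->dstmem;
	/* ... use dst ... */
	local_unlock(&demo_comp.lock);
}
```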
Sebastian
On Mon, 2020-11-30 at 14:20 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-29 12:41:14 [+0100], Mike Galbraith wrote:
> > On Sun, 2020-11-29 at 12:29 +0100, Oleksandr Natalenko wrote:
> > >
> > > Ummm, so do compressors explode under a non-rt kernel in your tests as
> > > well, or is it just -rt that triggers this?
> >
> > I only tested a non-rt kernel with z3fold, which worked just fine.
>
> I tried this and it did not explode yet. Mike, can you please
> confirm?
This explodes in write_unlock() as mine did. Oleksandr's local_lock()
variant explodes in the lock he added. (ew, corruption)
I think I'll try a stable-rt tree. This master tree _should_ be fine
given it seems to work just peachy for everything else, but my box is
the only one making boom... and also NOT making boom with the zbud
compressor. Hrmph.
crash.rt> bt -sx
PID: 27961 TASK: ffff8f6ad5344500 CPU: 4 COMMAND: "oom01"
#0 [ffffa439d4747708] machine_kexec+0x16e at ffffffffbb05eace
#1 [ffffa439d4747750] __crash_kexec+0x5a at ffffffffbb145e5a
#2 [ffffa439d4747810] crash_kexec+0x24 at ffffffffbb147004
#3 [ffffa439d4747818] oops_end+0x93 at ffffffffbb025c03
#4 [ffffa439d4747838] exc_page_fault+0x6b at ffffffffbb9011bb
#5 [ffffa439d4747860] asm_exc_page_fault+0x1e at ffffffffbba00ace
#6 [ffffa439d47478e8] mark_wakeup_next_waiter+0x51 at ffffffffbb0e6781
#7 [ffffa439d4747950] rt_mutex_futex_unlock+0x50 at ffffffffbb90bae0
#8 [ffffa439d4747990] z3fold_free+0x4a8 at ffffffffbb2bf8a8
#9 [ffffa439d47479f0] zswap_free_entry+0x82 at ffffffffbb285dd2
#10 [ffffa439d4747a08] zswap_frontswap_invalidate_page+0x8c at ffffffffbb285edc
#11 [ffffa439d4747a30] __frontswap_invalidate_page+0x4e at ffffffffbb28548e
#12 [ffffa439d4747a58] swap_range_free.constprop.0+0x9e at ffffffffbb27fd4e
#13 [ffffa439d4747a78] swapcache_free_entries+0x10d at ffffffffbb28101d
#14 [ffffa439d4747ac0] free_swap_slot+0xac at ffffffffbb284d8c
#15 [ffffa439d4747ae0] __swap_entry_free+0x8f at ffffffffbb2808ff
#16 [ffffa439d4747b08] free_swap_and_cache+0x3b at ffffffffbb2829db
#17 [ffffa439d4747b38] zap_pte_range+0x164 at ffffffffbb258004
#18 [ffffa439d4747bc0] unmap_page_range+0x1dd at ffffffffbb258b6d
#19 [ffffa439d4747c38] __oom_reap_task_mm+0xd5 at ffffffffbb2235c5
#20 [ffffa439d4747d08] exit_mmap+0x154 at ffffffffbb264084
#21 [ffffa439d4747da0] mmput+0x4e at ffffffffbb07e66e
#22 [ffffa439d4747db0] exit_mm+0x172 at ffffffffbb088372
#23 [ffffa439d4747df0] do_exit+0x1a8 at ffffffffbb088588
#24 [ffffa439d4747e20] do_group_exit+0x39 at ffffffffbb0888a9
#25 [ffffa439d4747e48] get_signal+0x155 at ffffffffbb096ef5
#26 [ffffa439d4747e98] arch_do_signal+0x1a at ffffffffbb0224ba
#27 [ffffa439d4747f18] exit_to_user_mode_loop+0xc7 at ffffffffbb11c037
#28 [ffffa439d4747f38] exit_to_user_mode_prepare+0x6a at ffffffffbb11c12a
#29 [ffffa439d4747f48] irqentry_exit_to_user_mode+0x5 at ffffffffbb901ae5
#30 [ffffa439d4747f50] asm_exc_page_fault+0x1e at ffffffffbba00ace
RIP: 0000000000414300 RSP: 00007f193033bec0 RFLAGS: 00010206
RAX: 0000000000001000 RBX: 00000000c0000000 RCX: 00000000a6e6a000
RDX: 00007f19159a4000 RSI: 00000000c0000000 RDI: 0000000000000000
RBP: 00007f186eb3a000 R8: ffffffffffffffff R9: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000001000
R13: 0000000000000001 R14: 0000000000000001 R15: 00007ffd0ffb96d0
ORIG_RAX: ffffffffffffffff CS: 0033 SS: 002b
>
On 2020-11-30 15:42:46 [+0100], Mike Galbraith wrote:
> This explodes in write_unlock() as mine did. Oleksandr's local_lock()
> variant explodes in the lock he added. (ew, corruption)
>
> I think I'll try a stable-rt tree. This master tree _should_ be fine
> given it seems to work just peachy for everything else, but my box is
> the only one making boom... and also NOT making boom with the zbud
> compressor. Hrmph.
How do you test this? I triggered a few oom-killer and I have here git
gc running for a few hours now… Everything is fine.
Sebastian
On Mon, 2020-11-30 at 15:52 +0100, Sebastian Andrzej Siewior wrote:
> How do you test this? I triggered a few oom-killer and I have here git
> gc running for a few hours now… Everything is fine.
In an LTP install, ./runltp -f mm. Shortly after box starts swapping
insanely, it explodes quite reliably here with either z3fold or
zsmalloc.. but not with zbud.
-Mike
On Mon, 2020-11-30 at 16:01 +0100, Mike Galbraith wrote:
> On Mon, 2020-11-30 at 15:52 +0100, Sebastian Andrzej Siewior wrote:
> > How do you test this? I triggered a few oom-killer and I have here git
> > gc running for a few hours now… Everything is fine.
>
> In an LTP install, ./runltp -f mm. Shortly after box starts swapping
> insanely, it explodes quite reliably here with either z3fold or
> zsmalloc.. but not with zbud.
My config attached in case it's relevant.
On 2020-11-30 16:01:11 [+0100], Mike Galbraith wrote:
> On Mon, 2020-11-30 at 15:52 +0100, Sebastian Andrzej Siewior wrote:
> > How do you test this? I triggered a few oom-killer and I have here git
> > gc running for a few hours now… Everything is fine.
>
> In an LTP install, ./runltp -f mm. Shortly after box starts swapping
> insanely, it explodes quite reliably here with either z3fold or
> zsmalloc.. but not with zbud.
This just passed. It however killed my git-gc task which wasn't done.
Let me try tomorrow with your config.
> -Mike
Sebastian
On Mon, 2020-11-30 at 17:03 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-30 16:01:11 [+0100], Mike Galbraith wrote:
> > On Mon, 2020-11-30 at 15:52 +0100, Sebastian Andrzej Siewior wrote:
> > > How do you test this? I triggered a few oom-killer and I have here git
> > > gc running for a few hours now… Everything is fine.
> >
> > In an LTP install, ./runltp -f mm. Shortly after box starts swapping
> > insanely, it explodes quite reliably here with either z3fold or
> > zsmalloc.. but not with zbud.
>
> This just passed. It however killed my git-gc task which wasn't done.
> Let me try tomorrow with your config.
FYI, I tried 5.9-rt (after fixing 5.9.11), it exploded in the same way,
so (as expected) it's not some devel tree oopsie.
-Mike
On 2020-11-30 17:27:17 [+0100], Mike Galbraith wrote:
> > This just passed. It however killed my git-gc task which wasn't done.
> > Let me try tomorrow with your config.
>
> FYI, I tried 5.9-rt (after fixing 5.9.11), it exploded in the same way,
> so (as expected) it's not some devel tree oopsie.
v5.9 has the same missing local-lock init due to merged local-lock
upstream. I don't remember when this get_cpu_ptr() came in.
> -Mike
Sebastian
On Mon, 2020-11-30 at 17:32 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-30 17:27:17 [+0100], Mike Galbraith wrote:
> > > This just passed. It however killed my git-gc task which wasn't done.
> > > Let me try tomorrow with your config.
> >
> > FYI, I tried 5.9-rt (after fixing 5.9.11), it exploded in the same way,
> > so (as expected) it's not some devel tree oopsie.
>
> v5.9 has the same missing local-lock init due to merged local-lock
> upstream. I don't remember when this get_cpu_ptr() came in.
This was with your test patch applied.
-Mike
On Mon, 2020-11-30 at 17:27 +0100, Mike Galbraith wrote:
> On Mon, 2020-11-30 at 17:03 +0100, Sebastian Andrzej Siewior wrote:
> > On 2020-11-30 16:01:11 [+0100], Mike Galbraith wrote:
> > > On Mon, 2020-11-30 at 15:52 +0100, Sebastian Andrzej Siewior wrote:
> > > > How do you test this? I triggered a few oom-killer and I have here git
> > > > gc running for a few hours now… Everything is fine.
> > >
> > > In an LTP install, ./runltp -f mm. Shortly after box starts swapping
> > > insanely, it explodes quite reliably here with either z3fold or
> > > zsmalloc.. but not with zbud.
> >
> > This just passed. It however killed my git-gc task which wasn't done.
> > Let me try tomorrow with your config.
>
> FYI, I tried 5.9-rt (after fixing 5.9.11), it exploded in the same way,
> so (as expected) it's not some devel tree oopsie.
It reproduces in full distro KVM too.
-Mike
On Mon, 2020-11-30 at 17:32 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-30 17:27:17 [+0100], Mike Galbraith wrote:
> > > This just passed. It however killed my git-gc task which wasn't done.
> > > Let me try tomorrow with your config.
> >
> > FYI, I tried 5.9-rt (after fixing 5.9.11), it exploded in the same way,
> > so (as expected) it's not some devel tree oopsie.
>
> v5.9 has the same missing local-lock init due to merged local-lock
> upstream. I don't remember when this get_cpu_ptr() came in.
Looks like get_cpu_ptr() arrived in 4.14.
I patched up zswap/z3fold in my old 5.1-rt tree (old forward port of
4.19-rt), turned them on with a script, and it all worked fine, so the
RT zswap allergy lies somewhere upstream of there it seems.
If I were to start merging zswap and friends forward, I'd likely
eventually meet whatever RT is allergic to. I might try that, but
won't be the least surprised if it turns into a god awful mess of
dependencies.
Hopping forward in rt branches first and patching them up will bracket
the little bugger a lot quicker.
-Mike
On Mon, 2020-11-30 at 17:03 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-30 16:01:11 [+0100], Mike Galbraith wrote:
> > On Mon, 2020-11-30 at 15:52 +0100, Sebastian Andrzej Siewior wrote:
> > > How do you test this? I triggered a few oom-killer and I have here git
> > > gc running for a few hours now… Everything is fine.
> >
> > In an LTP install, ./runltp -f mm. Shortly after box starts swapping
> > insanely, it explodes quite reliably here with either z3fold or
> > zsmalloc.. but not with zbud.
>
> This just passed. It however killed my git-gc task which wasn't done.
> Let me try tomorrow with your config.
What I'm seeing is the below. rt_mutex_has_waiters() says yup we have
a waiter, rt_mutex_top_waiter() emits the missing cached leftmost, and
rt_mutex_dequeue_pi() chokes on it. Lock is buggered.
[ 894.376962] BUG: kernel NULL pointer dereference, address: 0000000000000018
[ 894.377639] #PF: supervisor read access in kernel mode
[ 894.378130] #PF: error_code(0x0000) - not-present page
[ 894.378735] PGD 0 P4D 0
[ 894.378974] Oops: 0000 [#1] PREEMPT_RT SMP PTI
[ 894.379384] CPU: 2 PID: 78 Comm: oom_reaper Kdump: loaded Tainted: G E 5.9.11-rt20-rt #9
[ 894.380253] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
[ 894.381352] RIP: 0010:mark_wakeup_next_waiter+0x51/0x150
[ 894.381869] Code: 00 00 49 89 f5 e8 9f 1c 7c 00 48 8b 5d 10 48 85 db 74 0a 48 3b 6b 38 0f 85 00 01 00 00 65 4c 8b 3c 25 c0 8d 01 00 4c 8d 73 18 <4c> 39 73 18 0f 85 94 00 00 00 65 48 8b 3c 25 c0 8d 01 00 48 8b 87
[ 894.383640] RSP: 0018:ffffb792802cfb18 EFLAGS: 00010046
[ 894.384135] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
[ 894.384804] RDX: 0000000000000001 RSI: ffffb792802cfb68 RDI: 0000000000000001
[ 894.385473] RBP: ffff997b4e508628 R08: ffff997b39075000 R09: ffff997a47800db0
[ 894.386134] R10: 0000000000000000 R11: ffffffff8a58f4d8 R12: ffffb792802cfb58
[ 894.387030] R13: ffffb792802cfb68 R14: 0000000000000018 R15: ffff997a7f1d3300
[ 894.387715] FS: 0000000000000000(0000) GS:ffff997b77c80000(0000) knlGS:0000000000000000
[ 894.388476] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 894.389209] CR2: 0000000000000018 CR3: 00000001cc156006 CR4: 00000000001706e0
[ 894.389881] Call Trace:
[ 894.390127] rt_mutex_futex_unlock+0x4f/0x90
[ 894.390547] z3fold_zpool_free+0x539/0x5c0
[ 894.390930] zswap_free_entry+0x43/0x50
[ 894.391193] zswap_frontswap_invalidate_page+0x8a/0x90
[ 894.391544] __frontswap_invalidate_page+0x48/0x80
[ 894.391875] swapcache_free_entries+0x1ee/0x330
[ 894.392189] ? rt_mutex_futex_unlock+0x65/0x90
[ 894.392502] free_swap_slot+0xad/0xc0
[ 894.392757] __swap_entry_free+0x70/0x90
[ 894.393046] free_swap_and_cache+0x39/0xe0
[ 894.393351] unmap_page_range+0x5e1/0xb30
[ 894.393646] ? flush_tlb_mm_range+0xfb/0x170
[ 894.393965] __oom_reap_task_mm+0xb2/0x170
[ 894.394254] ? __switch_to+0x12a/0x520
[ 894.394514] oom_reaper+0x119/0x540
[ 894.394756] ? wait_woken+0xa0/0xa0
[ 894.394997] ? __oom_reap_task_mm+0x170/0x170
[ 894.395297] kthread+0x169/0x180
[ 894.395535] ? kthread_park+0x90/0x90
[ 894.395867] ret_from_fork+0x22/0x30
[ 894.396252] Modules linked in: ebtable_filter(E) ebtables(E) uinput(E) fuse(E) rpcsec_gss_krb5(E) nfsv4(E) xt_comment(E) dns_resolver(E) nfs(E) nf_log_ipv6(E) nf_log_ipv4(E) nf_log_common(E) xt_LOG(E) xt_limit(E) nfs_ssc(E) fscache(E>
[ 894.396280] cryptd(E) glue_helper(E) pcspkr(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) sunrpc(E) sch_fq_codel(E) hid_generic(E) usbhid(E) ext4(E) crc16(E) mbcache(E) jbd2(E) ata_generic(E) virtio_console(E) virtio_blk(E)>
[ 894.406791] Dumping ftrace buffer:
[ 894.407037] (ftrace buffer empty)
[ 894.407293] CR2: 0000000000000018
crash> gdb list *mark_wakeup_next_waiter+0x51
0xffffffff810e87e1 is in mark_wakeup_next_waiter (kernel/locking/rtmutex.c:362).
357 }
358
359 static void
360 rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
361 {
362 if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
363 return;
364
365 rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
366 RB_CLEAR_NODE(&waiter->pi_tree_entry);
crash> rwlock_t -x 0xffff997b4e508628
struct rwlock_t {
rtmutex = {
wait_lock = {
raw_lock = {
{
val = {
counter = 0x1
},
{
locked = 0x1,
pending = 0x0
},
{
locked_pending = 0x1,
tail = 0x0
}
}
}
},
waiters = {
rb_root = {
rb_node = 0xffff997b4e508580
},
rb_leftmost = 0x0
},
owner = 0xffff997a7f1d3300,
save_state = 0x1
},
readers = {
counter = 0x80000000
}
}
crash> rb_root 0xffff997b4e508580
struct rb_root {
rb_node = 0x0
}
On 2020-12-02 03:30:27 [+0100], Mike Galbraith wrote:
> > > In an LTP install, ./runltp -f mm. Shortly after box starts swapping
> > > insanely, it explodes quite reliably here with either z3fold or
> > > zsmalloc.. but not with zbud.
>
> What I'm seeing is the below. rt_mutex_has_waiters() says yup we have
> a waiter, rt_mutex_top_waiter() emits the missing cached leftmost, and
> rt_mutex_dequeue_pi() chokes on it. Lock is buggered.
correct. So this:
diff --git a/mm/z3fold.c b/mm/z3fold.c
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -342,7 +342,7 @@ static inline void free_handle(unsigned long handle)
if (is_free) {
struct z3fold_pool *pool = slots_to_pool(slots);
-
+ memset(slots, 0xee, sizeof(struct z3fold_buddy_slots));
kmem_cache_free(pool->c_handle, slots);
}
}
@@ -548,8 +549,10 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
read_unlock(&zhdr->slots->lock);
- if (is_free)
+ if (is_free) {
+ memset(zhdr->slots, 0xdd, sizeof(struct z3fold_buddy_slots));
kmem_cache_free(pool->c_handle, zhdr->slots);
+ }
if (locked)
z3fold_page_unlock(zhdr);
resulted in:
|[ 377.200696] Out of memory: Killed process 284358 (oom01) total-vm:15780488kB, anon-rss:150624kB, file-rss:0kB, shmem-rss:0kB, UID:0 pgtables:16760kB oom_score_adj:0
|[ 377.205438] ------------[ cut here ]------------
|[ 377.205441] pvqspinlock: lock 0xffff8880105c6828 has corrupted value 0xdddddddd!
|[ 377.205448] WARNING: CPU: 6 PID: 72 at kernel/locking/qspinlock_paravirt.h:498 __pv_queued_spin_unlock_slowpath+0xb3/0xc0
|[ 377.205455] Modules linked in:
|[ 377.205456] CPU: 6 PID: 72 Comm: oom_reaper Not tainted 5.10.0-rc6-rt13-rt+ #103
|[ 377.205458] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
|[ 377.205460] RIP: 0010:__pv_queued_spin_unlock_slowpath+0xb3/0xc0
…
|[ 377.205475] Call Trace:
|[ 377.205477] __raw_callee_save___pv_queued_spin_unlock_slowpath+0x11/0x20
|[ 377.205481] .slowpath+0x9/0xe
|[ 377.205483] _raw_spin_unlock_irqrestore+0x5/0x50
|[ 377.205486] rt_mutex_futex_unlock+0x9e/0xb0
|[ 377.205488] z3fold_free+0x2b0/0x470
|[ 377.205491] zswap_free_entry+0x7d/0xc0
|[ 377.205493] zswap_frontswap_invalidate_page+0x87/0x90
|[ 377.205495] __frontswap_invalidate_page+0x58/0x90
|[ 377.205496] swap_range_free.constprop.0+0x99/0xb0
|[ 377.205499] swapcache_free_entries+0x131/0x390
|[ 377.205501] free_swap_slot+0x99/0xc0
|[ 377.205502] __swap_entry_free+0x8a/0xa0
|[ 377.205504] free_swap_and_cache+0x36/0xd0
|[ 377.205506] zap_pte_range+0x16a/0x940
|[ 377.205509] unmap_page_range+0x1d8/0x310
|[ 377.205514] __oom_reap_task_mm+0xe7/0x190
|[ 377.205520] oom_reap_task_mm+0x5a/0x280
|[ 377.205521] oom_reaper+0x98/0x1c0
|[ 377.205525] kthread+0x18c/0x1b0
|[ 377.205528] ret_from_fork+0x22/0x30
|[ 377.205531] ---[ end trace 0000000000000002 ]---
Then I reverted commit
4a3ac9311dac3 ("mm/z3fold.c: add inter-page compaction")
and it seems to work now. Any suggestions? It looks like use-after-free.
Sebastian
On Wed, 2020-12-02 at 23:08 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-12-02 03:30:27 [+0100], Mike Galbraith wrote:
>
> > What I'm seeing is the below. rt_mutex_has_waiters() says yup we have
> > a waiter, rt_mutex_top_waiter() emits the missing cached leftmost, and
> > rt_mutex_dequeue_pi() chokes on it. Lock is buggered.
>
> correct. So this:
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -342,7 +342,7 @@ static inline void free_handle(unsigned long handle)
>
> if (is_free) {
> struct z3fold_pool *pool = slots_to_pool(slots);
> -
> + memset(slots, 0xee, sizeof(struct z3fold_buddy_slots));
> kmem_cache_free(pool->c_handle, slots);
> }
> }
> @@ -548,8 +549,10 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
> set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
> read_unlock(&zhdr->slots->lock);
>
> - if (is_free)
> + if (is_free) {
> + memset(zhdr->slots, 0xdd, sizeof(struct z3fold_buddy_slots));
> kmem_cache_free(pool->c_handle, zhdr->slots);
> + }
>
> if (locked)
> z3fold_page_unlock(zhdr);
>
> resulted in:
>
> |[ 377.200696] Out of memory: Killed process 284358 (oom01) total-vm:15780488kB, anon-rss:150624kB, file-rss:0kB, shmem-rss:0kB, UID:0 pgtables:16760kB oom_score_adj:0
> |[ 377.205438] ------------[ cut here ]------------
> |[ 377.205441] pvqspinlock: lock 0xffff8880105c6828 has corrupted value 0xdddddddd!
> |[ 377.205448] WARNING: CPU: 6 PID: 72 at kernel/locking/qspinlock_paravirt.h:498 __pv_queued_spin_unlock_slowpath+0xb3/0xc0
> |[ 377.205455] Modules linked in:
> |[ 377.205456] CPU: 6 PID: 72 Comm: oom_reaper Not tainted 5.10.0-rc6-rt13-rt+ #103
> |[ 377.205458] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
> |[ 377.205460] RIP: 0010:__pv_queued_spin_unlock_slowpath+0xb3/0xc0
> …
> |[ 377.205475] Call Trace:
> |[ 377.205477] __raw_callee_save___pv_queued_spin_unlock_slowpath+0x11/0x20
> |[ 377.205481] .slowpath+0x9/0xe
> |[ 377.205483] _raw_spin_unlock_irqrestore+0x5/0x50
> |[ 377.205486] rt_mutex_futex_unlock+0x9e/0xb0
> |[ 377.205488] z3fold_free+0x2b0/0x470
> |[ 377.205491] zswap_free_entry+0x7d/0xc0
> |[ 377.205493] zswap_frontswap_invalidate_page+0x87/0x90
> |[ 377.205495] __frontswap_invalidate_page+0x58/0x90
> |[ 377.205496] swap_range_free.constprop.0+0x99/0xb0
> |[ 377.205499] swapcache_free_entries+0x131/0x390
> |[ 377.205501] free_swap_slot+0x99/0xc0
> |[ 377.205502] __swap_entry_free+0x8a/0xa0
> |[ 377.205504] free_swap_and_cache+0x36/0xd0
> |[ 377.205506] zap_pte_range+0x16a/0x940
> |[ 377.205509] unmap_page_range+0x1d8/0x310
> |[ 377.205514] __oom_reap_task_mm+0xe7/0x190
> |[ 377.205520] oom_reap_task_mm+0x5a/0x280
> |[ 377.205521] oom_reaper+0x98/0x1c0
> |[ 377.205525] kthread+0x18c/0x1b0
> |[ 377.205528] ret_from_fork+0x22/0x30
> |[ 377.205531] ---[ end trace 0000000000000002 ]---
>
> Then I reverted commit
> 4a3ac9311dac3 ("mm/z3fold.c: add inter-page compaction")
>
> and it seems to work now. Any suggestions? It looks like use-after-free.
Looks like...
d8f117abb380 z3fold: fix use-after-free when freeing handles
...wasn't completely effective. write_unlock() in handle_free() is
where I see explosions. Only trouble is that 4a3ac9311dac3 arrived in
5.5, and my 5.[5-7]-rt refuse to reproduce (d8f117abb380 applied),
whereas 5.9 and 5.10 do so quite reliably. There is a heisenbug aspect
though, one trace_printk() in handle_free() made bug go hide, so the
heisen-fairies in earlier trees were probably just messing with me..
because they can, being the twisted little freaks they are ;-)
-Mike
On Thu, 2020-12-03 at 03:16 +0100, Mike Galbraith wrote:
> On Wed, 2020-12-02 at 23:08 +0100, Sebastian Andrzej Siewior wrote:
> Looks like...
>
> d8f117abb380 z3fold: fix use-after-free when freeing handles
>
> ...wasn't completely effective...
The top two hunks seem to have rendered the thing RT tolerant.
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 18feaa0bc537..851d9f4f1644 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -537,7 +537,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
spin_unlock(&pool->lock);
/* If there are no foreign handles, free the handles array */
- read_lock(&zhdr->slots->lock);
+ write_lock(&zhdr->slots->lock);
for (i = 0; i <= BUDDY_MASK; i++) {
if (zhdr->slots->slot[i]) {
is_free = false;
@@ -546,7 +546,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
}
if (!is_free)
set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
- read_unlock(&zhdr->slots->lock);
+ write_unlock(&zhdr->slots->lock);
if (is_free)
kmem_cache_free(pool->c_handle, zhdr->slots);
@@ -642,14 +642,16 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool,
{
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
+ struct list_head *unbuddied;
int freechunks = num_free_chunks(zhdr);
+
+ migrate_disable();
+ unbuddied = this_cpu_ptr(pool->unbuddied);
spin_lock(&pool->lock);
list_add(&zhdr->buddy, &unbuddied[freechunks]);
spin_unlock(&pool->lock);
zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
}
}
@@ -886,8 +888,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
int chunks = size_to_chunks(size), i;
lookup:
+ migrate_disable();
/* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
+ unbuddied = this_cpu_ptr(pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
struct list_head *l = &unbuddied[i];
@@ -905,7 +908,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
!z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -919,7 +922,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -934,7 +937,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
kref_get(&zhdr->refcount);
break;
}
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (!zhdr) {
int cpu;
On 2020-12-03 09:18:21 [+0100], Mike Galbraith wrote:
> On Thu, 2020-12-03 at 03:16 +0100, Mike Galbraith wrote:
> > On Wed, 2020-12-02 at 23:08 +0100, Sebastian Andrzej Siewior wrote:
> > Looks like...
> >
> > d8f117abb380 z3fold: fix use-after-free when freeing handles
> >
> > ...wasn't completely effective...
>
> The top two hunks seem to have rendered the thing RT tolerant.
Yes, it appears to. I have no idea if this is a proper fix or not.
Without your write lock, after a few attempts, KASAN says:
| BUG: KASAN: use-after-free in __pv_queued_spin_lock_slowpath+0x293/0x770
| Write of size 2 at addr ffff88800e0e10aa by task kworker/u16:3/237
|
| CPU: 5 PID: 237 Comm: kworker/u16:3 Tainted: G W 5.10.0-rc6-rt13-rt+
| Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
| Workqueue: zswap1 compact_page_work
| Call Trace:
| dump_stack+0x7d/0xa3
| print_address_description.constprop.0+0x19/0x120
| __kasan_report.cold+0x1d/0x35
| kasan_report+0x3a/0x50
| check_memory_region+0x145/0x1a0
| __pv_queued_spin_lock_slowpath+0x293/0x770
| _raw_spin_lock_irq+0xca/0xe0
| rt_read_unlock+0x2c/0x70
| __release_z3fold_page.constprop.0+0x45e/0x620
| do_compact_page+0x674/0xa50
| process_one_work+0x63a/0x1130
| worker_thread+0xd3/0xc80
| kthread+0x401/0x4e0
| ret_from_fork+0x22/0x30
|
| Allocated by task 225 (systemd-journal):
| kasan_save_stack+0x1b/0x40
| __kasan_kmalloc.constprop.0+0xc2/0xd0
| kmem_cache_alloc+0x103/0x2b0
| z3fold_alloc+0x597/0x1970
| zswap_frontswap_store+0x928/0x1bf0
| __frontswap_store+0x117/0x320
| swap_writepage+0x34/0x70
| pageout+0x268/0x7c0
| shrink_page_list+0x13e1/0x1e80
| shrink_inactive_list+0x303/0xde0
| shrink_lruvec+0x3dd/0x660
| shrink_node_memcgs+0x3a1/0x600
| shrink_node+0x3a7/0x1350
| shrink_zones+0x1f1/0x7f0
| do_try_to_free_pages+0x219/0xcc0
| try_to_free_pages+0x1c5/0x4b0
| __perform_reclaim+0x18f/0x2c0
| __alloc_pages_slowpath.constprop.0+0x7ea/0x1790
| __alloc_pages_nodemask+0x5f5/0x700
| page_cache_ra_unbounded+0x30f/0x690
| do_sync_mmap_readahead+0x3e3/0x640
| filemap_fault+0x981/0x1110
| __xfs_filemap_fault+0x12d/0x840
| __do_fault+0xf3/0x4b0
| do_fault+0x202/0x8c0
| __handle_mm_fault+0x338/0x500
| handle_mm_fault+0x1a8/0x670
| do_user_addr_fault+0x409/0x8b0
| exc_page_fault+0x60/0xc0
| asm_exc_page_fault+0x1e/0x30
|
| Freed by task 71 (oom_reaper):
| kasan_save_stack+0x1b/0x40
| kasan_set_track+0x1c/0x30
| kasan_set_free_info+0x1b/0x30
| __kasan_slab_free+0x110/0x150
| kmem_cache_free+0x7f/0x450
| z3fold_free+0x1f8/0xc90
| zswap_free_entry+0x168/0x230
| zswap_frontswap_invalidate_page+0x145/0x190
| __frontswap_invalidate_page+0xe8/0x1a0
| swap_range_free.constprop.0+0x266/0x300
| swapcache_free_entries+0x1dc/0x970
| free_swap_slot+0x19c/0x290
| __swap_entry_free+0x139/0x160
| free_swap_and_cache+0xda/0x230
| zap_pte_range+0x275/0x1590
| unmap_page_range+0x320/0x690
| __oom_reap_task_mm+0x207/0x330
| oom_reap_task_mm+0x78/0x7e0
| oom_reap_task+0x6d/0x1a0
| oom_reaper+0x103/0x290
| kthread+0x401/0x4e0
| ret_from_fork+0x22/0x30
|
| The buggy address belongs to the object at ffff88800e0e1080
| which belongs to the cache z3fold_handle of size 88
| The buggy address is located 42 bytes inside of
| 88-byte region [ffff88800e0e1080, ffff88800e0e10d8)
| The buggy address belongs to the page:
| page:000000002ba661bc refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0xe0e1
| flags: 0xffff800000200(slab)
| raw: 000ffff800000200 dead000000000100 dead000000000122 ffff88800aa4eb40
| raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000
| page dumped because: kasan: bad access detected
|
| Memory state around the buggy address:
| ffff88800e0e0f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
| ffff88800e0e1000: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc
| >ffff88800e0e1080: fa fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc
| ^
| ffff88800e0e1100: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc
| ffff88800e0e1180: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc
| ==================================================================
| Disabling lock debugging due to kernel taint
with the lock I haven't seen anything.
Sebastian
On Thu, Dec 3, 2020 at 2:39 PM Sebastian Andrzej Siewior
<[email protected]> wrote:
>
> On 2020-12-03 09:18:21 [+0100], Mike Galbraith wrote:
> > On Thu, 2020-12-03 at 03:16 +0100, Mike Galbraith wrote:
> > > On Wed, 2020-12-02 at 23:08 +0100, Sebastian Andrzej Siewior wrote:
> > > Looks like...
> > >
> > > d8f117abb380 z3fold: fix use-after-free when freeing handles
> > >
> > > ...wasn't completely effective...
> >
> > The top two hunks seem to have rendered the thing RT tolerant.
Thanks for all your efforts, I promise to take a closer look at this
today; I've had my hands full with RISC-V up until now.
Best regards,
Vitaly
> Yes, it appears to. I have no idea if this is a proper fix or not.
> Without your write lock, after a few attempts, KASAN says:
>
> | BUG: KASAN: use-after-free in __pv_queued_spin_lock_slowpath+0x293/0x770
> | Write of size 2 at addr ffff88800e0e10aa by task kworker/u16:3/237
> |
> | CPU: 5 PID: 237 Comm: kworker/u16:3 Tainted: G W 5.10.0-rc6-rt13-rt+
> | Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
> | Workqueue: zswap1 compact_page_work
> | Call Trace:
> | dump_stack+0x7d/0xa3
> | print_address_description.constprop.0+0x19/0x120
> | __kasan_report.cold+0x1d/0x35
> | kasan_report+0x3a/0x50
> | check_memory_region+0x145/0x1a0
> | __pv_queued_spin_lock_slowpath+0x293/0x770
> | _raw_spin_lock_irq+0xca/0xe0
> | rt_read_unlock+0x2c/0x70
> | __release_z3fold_page.constprop.0+0x45e/0x620
> | do_compact_page+0x674/0xa50
> | process_one_work+0x63a/0x1130
> | worker_thread+0xd3/0xc80
> | kthread+0x401/0x4e0
> | ret_from_fork+0x22/0x30
> |
> | Allocated by task 225 (systemd-journal):
> | kasan_save_stack+0x1b/0x40
> | __kasan_kmalloc.constprop.0+0xc2/0xd0
> | kmem_cache_alloc+0x103/0x2b0
> | z3fold_alloc+0x597/0x1970
> | zswap_frontswap_store+0x928/0x1bf0
> | __frontswap_store+0x117/0x320
> | swap_writepage+0x34/0x70
> | pageout+0x268/0x7c0
> | shrink_page_list+0x13e1/0x1e80
> | shrink_inactive_list+0x303/0xde0
> | shrink_lruvec+0x3dd/0x660
> | shrink_node_memcgs+0x3a1/0x600
> | shrink_node+0x3a7/0x1350
> | shrink_zones+0x1f1/0x7f0
> | do_try_to_free_pages+0x219/0xcc0
> | try_to_free_pages+0x1c5/0x4b0
> | __perform_reclaim+0x18f/0x2c0
> | __alloc_pages_slowpath.constprop.0+0x7ea/0x1790
> | __alloc_pages_nodemask+0x5f5/0x700
> | page_cache_ra_unbounded+0x30f/0x690
> | do_sync_mmap_readahead+0x3e3/0x640
> | filemap_fault+0x981/0x1110
> | __xfs_filemap_fault+0x12d/0x840
> | __do_fault+0xf3/0x4b0
> | do_fault+0x202/0x8c0
> | __handle_mm_fault+0x338/0x500
> | handle_mm_fault+0x1a8/0x670
> | do_user_addr_fault+0x409/0x8b0
> | exc_page_fault+0x60/0xc0
> | asm_exc_page_fault+0x1e/0x30
> |
> | Freed by task 71 (oom_reaper):
> | kasan_save_stack+0x1b/0x40
> | kasan_set_track+0x1c/0x30
> | kasan_set_free_info+0x1b/0x30
> | __kasan_slab_free+0x110/0x150
> | kmem_cache_free+0x7f/0x450
> | z3fold_free+0x1f8/0xc90
> | zswap_free_entry+0x168/0x230
> | zswap_frontswap_invalidate_page+0x145/0x190
> | __frontswap_invalidate_page+0xe8/0x1a0
> | swap_range_free.constprop.0+0x266/0x300
> | swapcache_free_entries+0x1dc/0x970
> | free_swap_slot+0x19c/0x290
> | __swap_entry_free+0x139/0x160
> | free_swap_and_cache+0xda/0x230
> | zap_pte_range+0x275/0x1590
> | unmap_page_range+0x320/0x690
> | __oom_reap_task_mm+0x207/0x330
> | oom_reap_task_mm+0x78/0x7e0
> | oom_reap_task+0x6d/0x1a0
> | oom_reaper+0x103/0x290
> | kthread+0x401/0x4e0
> | ret_from_fork+0x22/0x30
> |
> | The buggy address belongs to the object at ffff88800e0e1080
> | which belongs to the cache z3fold_handle of size 88
> | The buggy address is located 42 bytes inside of
> | 88-byte region [ffff88800e0e1080, ffff88800e0e10d8)
> | The buggy address belongs to the page:
> | page:000000002ba661bc refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0xe0e1
> | flags: 0xffff800000200(slab)
> | raw: 000ffff800000200 dead000000000100 dead000000000122 ffff88800aa4eb40
> | raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000
> | page dumped because: kasan: bad access detected
> |
> | Memory state around the buggy address:
> | ffff88800e0e0f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> | ffff88800e0e1000: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc
> | >ffff88800e0e1080: fa fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc
> | ^
> | ffff88800e0e1100: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc
> | ffff88800e0e1180: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc
> | ==================================================================
> | Disabling lock debugging due to kernel taint
>
> with the lock I haven't seen anything.
>
> Sebastian
On Thu, 2020-12-03 at 14:39 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-12-03 09:18:21 [+0100], Mike Galbraith wrote:
> > On Thu, 2020-12-03 at 03:16 +0100, Mike Galbraith wrote:
> > > On Wed, 2020-12-02 at 23:08 +0100, Sebastian Andrzej Siewior wrote:
> > > Looks like...
> > >
> > > d8f117abb380 z3fold: fix use-after-free when freeing handles
> > >
> > > ...wasn't completely effective...
> >
> > The top two hunks seem to have rendered the thing RT tolerant.
>
> Yes, it appears to. I have no idea if this is a proper fix or not.
> Without your write lock, after a few attempts, KASAN says:
>
> | BUG: KASAN: use-after-free in __pv_queued_spin_lock_slowpath+0x293/0x770
> | Write of size 2 at addr ffff88800e0e10aa by task kworker/u16:3/237
Things that make ya go hmmm...
I started poking at it thinking OK, given that write_lock() fixes it, bad
juju must be happening under another read_lock(), so I poisoned the
structure about to be freed by __release_z3fold_page() under lock, and
opened a delay window for bad juju to materialize, but it didn't, so I
just poisoned instead, and sprinkled landmines all over the place.
My landmines are not triggering, but the detonator is materializing
inside the structure containing the lock we explode on. Is somebody
blasting a recycled z3fold_buddy_slots into the RAM we were working on?
crash> bt -sx
PID: 11850 TASK: ffff933ee67f2280 CPU: 1 COMMAND: "swapoff"
#0 [ffffa7bf4e7f3900] machine_kexec+0x16e at ffffffff8405f86e
#1 [ffffa7bf4e7f3948] __crash_kexec+0xd2 at ffffffff8414c182
#2 [ffffa7bf4e7f3a08] crash_kexec+0x30 at ffffffff8414d370
#3 [ffffa7bf4e7f3a18] oops_end+0xca at ffffffff8402680a
#4 [ffffa7bf4e7f3a38] no_context+0x14d at ffffffff8406d7ed
#5 [ffffa7bf4e7f3a98] exc_page_fault+0x2b8 at ffffffff848bdb88
#6 [ffffa7bf4e7f3af0] asm_exc_page_fault+0x1e at ffffffff84a00ace
#7 [ffffa7bf4e7f3b78] mark_wakeup_next_waiter+0x51 at ffffffff840ea141
#8 [ffffa7bf4e7f3bd8] rt_mutex_futex_unlock+0x4f at ffffffff848c93ef
#9 [ffffa7bf4e7f3c18] z3fold_zpool_free+0x580 at ffffffffc0f37680 [z3fold]
#10 [ffffa7bf4e7f3c78] zswap_free_entry+0x43 at ffffffff8427c863
#11 [ffffa7bf4e7f3c88] zswap_frontswap_invalidate_page+0x8a at ffffffff8427c96a
#12 [ffffa7bf4e7f3cb0] __frontswap_invalidate_page+0x48 at ffffffff8427c058
#13 [ffffa7bf4e7f3cd8] swapcache_free_entries+0x1ee at ffffffff84276f8e
#14 [ffffa7bf4e7f3d20] free_swap_slot+0x9f at ffffffff8427b93f
#15 [ffffa7bf4e7f3d40] delete_from_swap_cache+0x61 at ffffffff84274651
#16 [ffffa7bf4e7f3d60] try_to_free_swap+0x70 at ffffffff84277550
#17 [ffffa7bf4e7f3d70] unuse_vma+0x55c at ffffffff842786cc
#18 [ffffa7bf4e7f3e90] try_to_unuse+0x139 at ffffffff84278eb9
#19 [ffffa7bf4e7f3ee8] __x64_sys_swapoff+0x1eb at ffffffff842798fb
#20 [ffffa7bf4e7f3f40] do_syscall_64+0x33 at ffffffff848b9ab3
#21 [ffffa7bf4e7f3f50] entry_SYSCALL_64_after_hwframe+0x44 at ffffffff84a0007c
RIP: 00007fdd9ce26d17 RSP: 00007ffe99f98238 RFLAGS: 00000202
RAX: ffffffffffffffda RBX: 00005625219e8b60 RCX: 00007fdd9ce26d17
RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00005625219e8b60
RBP: 0000000000000001 R8: 00007ffe99f982a0 R9: 0000000000000003
R10: 00005625219e8721 R11: 0000000000000202 R12: 0000000000000001
R13: 0000000000000000 R14: 00007ffe99f982a0 R15: 0000000000000000
ORIG_RAX: 00000000000000a8 CS: 0033 SS: 002b
crash> bt -e
PID: 11850 TASK: ffff933ee67f2280 CPU: 1 COMMAND: "swapoff"
KERNEL-MODE EXCEPTION FRAME AT: ffffa7bf4e7f3950
[exception RIP: mark_wakeup_next_waiter+81]
RIP: ffffffff840ea141 RSP: ffffa7bf4e7f3ba0 RFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
RDX: 0000000000000001 RSI: ffffa7bf4e7f3bf0 RDI: 0000000000000001
RBP: ffff933e5a7baba8 R8: ffff933f502c2000 R9: ffff933e5a7babc0
R10: 0000000000000001 R11: ffffffff855938b8 R12: ffffa7bf4e7f3be0
R13: ffffa7bf4e7f3bf0 R14: 0000000000000018 R15: ffff933ee67f2280
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
...
crash> gdb list *z3fold_zpool_free+0x580
0xffffffffc0f37680 is in z3fold_zpool_free (mm/z3fold.c:338).
warning: Source file is more recent than executable.
333
334 /* we are freeing a foreign handle if we are here */
335 zhdr->foreign_handles--;
336 is_free = true;
337 if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
338 write_unlock(&slots->lock);
339 return;
340 }
341 for (i = 0; i <= BUDDY_MASK; i++) {
342 if (slots->slot[i]) {
crash> z3fold_buddy_slots -x ffff933e5a7bab80
struct z3fold_buddy_slots {
slot = {0x0, 0x0, 0x0, 0x0},
pool = 0xffff933e4b6d8e00,
lock = {
rtmutex = {
wait_lock = {
raw_lock = {
{
val = {
counter = 0x1
},
{
locked = 0x1,
pending = 0x0
},
{
locked_pending = 0x1,
tail = 0x0
}
}
}
},
waiters = {
rb_root = {
rb_node = 0xffff933e5a7ba700
},
rb_leftmost = 0x0 <== yup, that's why we've been exploding
},
owner = 0xffff933ee67f2280, <== yup, that's our exploding task
save_state = 0x1
},
readers = {
counter = 0x80000000
}
},
poison = 0xfeedbabedeadbeef <== oh dear, that SHOULD be impossible
}
crash>
326 write_lock(&slots->lock);
327 BUG_ON(READ_ONCE(slots->poison)); <== we got past this landmine..
328 *(unsigned long *)handle = 0;
329 if (zhdr->slots == slots) {
330 write_unlock(&slots->lock);
331 return; /* simple case, nothing else to do */
332 }
333
334 /* we are freeing a foreign handle if we are here */
335 zhdr->foreign_handles--;
336 is_free = true;
337 if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
338 write_unlock(&slots->lock); <== ..and exploded here due to a corrupt lock
AND poison written under read_lock()
has suddenly materialized as well!?!
---
mm/z3fold.c | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -44,6 +44,7 @@
#include <linux/zpool.h>
#include <linux/magic.h>
#include <linux/kmemleak.h>
+#include <linux/delay.h>
/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
@@ -92,6 +93,7 @@ struct z3fold_buddy_slots {
unsigned long slot[BUDDY_MASK + 1];
unsigned long pool; /* back link + flags */
rwlock_t lock;
+ unsigned long poison;
};
#define HANDLE_FLAG_MASK (0x03)
@@ -220,6 +222,7 @@ static inline struct z3fold_buddy_slots
kmemleak_not_leak(slots);
slots->pool = (unsigned long)pool;
rwlock_init(&slots->lock);
+ slots->poison = 0;
}
return slots;
@@ -267,11 +270,13 @@ static inline struct z3fold_header *__ge
unsigned long addr;
read_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
addr = *(unsigned long *)handle;
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
if (lock)
locked = z3fold_page_trylock(zhdr);
read_unlock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
if (locked)
break;
cpu_relax();
@@ -319,6 +324,7 @@ static inline void free_handle(unsigned
zhdr = handle_to_z3fold_header(handle);
slots = handle_to_slots(handle);
write_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison)); <== poison was zero here
*(unsigned long *)handle = 0;
if (zhdr->slots == slots) {
write_unlock(&slots->lock);
@@ -338,6 +344,10 @@ static inline void free_handle(unsigned
break;
}
}
+ if (is_free)
+ slots->poison = 0xdeaddeaddeaddead;
+ else
+ BUG_ON(READ_ONCE(slots->poison));
write_unlock(&slots->lock);
if (is_free) {
@@ -475,8 +485,10 @@ static unsigned long __encode_handle(str
h |= (zhdr->last_chunks << BUDDY_SHIFT);
write_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
slots->slot[idx] = h;
write_unlock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
return (unsigned long)&slots->slot[idx];
}
@@ -492,8 +504,10 @@ static unsigned short handle_to_chunks(u
unsigned long addr;
read_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
addr = *(unsigned long *)handle;
read_unlock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
}
@@ -509,10 +523,12 @@ static enum buddy handle_to_buddy(unsign
unsigned long addr;
read_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
WARN_ON(handle & (1 << PAGE_HEADLESS));
addr = *(unsigned long *)handle;
read_unlock(&slots->lock);
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ BUG_ON(READ_ONCE(slots->poison));
return (addr - zhdr->first_num) & BUDDY_MASK;
}
@@ -538,6 +554,7 @@ static void __release_z3fold_page(struct
/* If there are no foreign handles, free the handles array */
read_lock(&zhdr->slots->lock);
+ BUG_ON(READ_ONCE(zhdr->slots->poison));
for (i = 0; i <= BUDDY_MASK; i++) {
if (zhdr->slots->slot[i]) {
is_free = false;
@@ -546,6 +563,11 @@ static void __release_z3fold_page(struct
}
if (!is_free)
set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
+ else {
+ zhdr->slots->poison = 0xfeedbabedeadbeef;
+// usleep_range(100, 200);
+// zhdr->slots->poison = 0;
+ }
read_unlock(&zhdr->slots->lock);
if (is_free)
@@ -750,11 +772,14 @@ static struct z3fold_header *compact_sin
new_zhdr->foreign_handles++;
memcpy(q, p, sz);
write_lock(&zhdr->slots->lock);
+ BUG_ON(READ_ONCE(zhdr->slots->poison));
*(unsigned long *)old_handle = (unsigned long)new_zhdr +
__idx(new_zhdr, new_bud);
if (new_bud == LAST)
*(unsigned long *)old_handle |=
(new_zhdr->last_chunks << BUDDY_SHIFT);
+ BUG_ON(READ_ONCE(zhdr->slots->poison));
+ BUG_ON(READ_ONCE(new_zhdr->slots->poison));
write_unlock(&zhdr->slots->lock);
add_to_unbuddied(pool, new_zhdr);
z3fold_page_unlock(new_zhdr);
On Mon, 2020-12-07 at 02:05 +0100, Vitaly Wool wrote:
>
> Could you please try the following patch in your setup:
crash> gdb list *z3fold_zpool_free+0x527
0xffffffffc0e14487 is in z3fold_zpool_free (mm/z3fold.c:341).
336 if (slots->slot[i]) {
337 is_free = false;
338 break;
339 }
340 }
341 write_unlock(&slots->lock); <== boom
342
343 if (is_free) {
344 struct z3fold_pool *pool = slots_to_pool(slots);
345
crash> z3fold_buddy_slots -x ffff99a3287b8780
struct z3fold_buddy_slots {
slot = {0xdeadbeef, 0xdeadbeef, 0xdeadbeef, 0xdeadbeef},
pool = 0xffff99a3146b8400,
lock = {
rtmutex = {
wait_lock = {
raw_lock = {
{
val = {
counter = 0x1
},
{
locked = 0x1,
pending = 0x0
},
{
locked_pending = 0x1,
tail = 0x0
}
}
}
},
waiters = {
rb_root = {
rb_node = 0xffff99a3287b8e00
},
rb_leftmost = 0x0
},
owner = 0xffff99a355c24500,
save_state = 0x1
},
readers = {
counter = 0x80000000
}
}
}
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 18feaa0bc537..efe9a012643d 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -544,12 +544,17 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
> break;
> }
> }
> - if (!is_free)
> + if (!is_free) {
> set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
> - read_unlock(&zhdr->slots->lock);
> -
> - if (is_free)
> + read_unlock(&zhdr->slots->lock);
> + } else {
> + zhdr->slots->slot[0] =
> + zhdr->slots->slot[1] =
> + zhdr->slots->slot[2] =
> + zhdr->slots->slot[3] = 0xdeadbeef;
> + read_unlock(&zhdr->slots->lock);
> kmem_cache_free(pool->c_handle, zhdr->slots);
> + }
>
> if (locked)
> z3fold_page_unlock(zhdr);
On Mon, Dec 7, 2020 at 3:18 AM Mike Galbraith <[email protected]> wrote:
>
> On Mon, 2020-12-07 at 02:05 +0100, Vitaly Wool wrote:
> >
> > Could you please try the following patch in your setup:
>
> crash> gdb list *z3fold_zpool_free+0x527
> 0xffffffffc0e14487 is in z3fold_zpool_free (mm/z3fold.c:341).
> 336 if (slots->slot[i]) {
> 337 is_free = false;
> 338 break;
> 339 }
> 340 }
> 341 write_unlock(&slots->lock); <== boom
> 342
> 343 if (is_free) {
> 344 struct z3fold_pool *pool = slots_to_pool(slots);
> 345
> crash> z3fold_buddy_slots -x ffff99a3287b8780
> struct z3fold_buddy_slots {
> slot = {0xdeadbeef, 0xdeadbeef, 0xdeadbeef, 0xdeadbeef},
> pool = 0xffff99a3146b8400,
> lock = {
> rtmutex = {
> wait_lock = {
> raw_lock = {
> {
> val = {
> counter = 0x1
> },
> {
> locked = 0x1,
> pending = 0x0
> },
> {
> locked_pending = 0x1,
> tail = 0x0
> }
> }
> }
> },
> waiters = {
> rb_root = {
> rb_node = 0xffff99a3287b8e00
> },
> rb_leftmost = 0x0
> },
> owner = 0xffff99a355c24500,
> save_state = 0x1
> },
> readers = {
> counter = 0x80000000
> }
> }
> }
Thanks. This trace beats me because I don't quite get how this could
have happened.
Hitting write_unlock() at line 341 would mean that the HANDLES_ORPHANED
bit is set, but obviously it isn't.
Could you please comment out the ".shrink = z3fold_zpool_shrink" line
and retry? Reclaim is the trickiest thing over there since I have to
drop the page lock while reclaiming.
Thanks,
Vitaly
> > diff --git a/mm/z3fold.c b/mm/z3fold.c
> > index 18feaa0bc537..efe9a012643d 100644
> > --- a/mm/z3fold.c
> > +++ b/mm/z3fold.c
> > @@ -544,12 +544,17 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
> > break;
> > }
> > }
> > - if (!is_free)
> > + if (!is_free) {
> > set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
> > - read_unlock(&zhdr->slots->lock);
> > -
> > - if (is_free)
> > + read_unlock(&zhdr->slots->lock);
> > + } else {
> > + zhdr->slots->slot[0] =
> > + zhdr->slots->slot[1] =
> > + zhdr->slots->slot[2] =
> > + zhdr->slots->slot[3] = 0xdeadbeef;
> > + read_unlock(&zhdr->slots->lock);
> > kmem_cache_free(pool->c_handle, zhdr->slots);
> > + }
> >
> > if (locked)
> > z3fold_page_unlock(zhdr);
>
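Vitaly's request above to comment out the .shrink line amounts to disabling zpool reclaim in z3fold's zpool driver registration. A minimal sketch of that experiment follows; the surrounding fields are abbreviated and the exact struct layout is assumed from mm/z3fold.c of that era, so treat it as illustration only:
```c
/*
 * Sketch only: disable zpool reclaim for the test asked for above.
 * The one line that matters is the .shrink entry quoted in the mail;
 * the other callbacks are assumed/abbreviated.
 */
static struct zpool_driver z3fold_zpool_driver = {
	.type =		"z3fold",
	.owner =	THIS_MODULE,
	.create =	z3fold_zpool_create,
	.destroy =	z3fold_zpool_destroy,
	.malloc =	z3fold_zpool_malloc,
	.free =		z3fold_zpool_free,
	/* .shrink =	z3fold_zpool_shrink, */	/* commented out for the test */
	.map =		z3fold_zpool_map,
	.unmap =	z3fold_zpool_unmap,
	.total_size =	z3fold_zpool_total_size,
};
```
With the callback gone, zswap can no longer trigger z3fold's reclaim path (z3fold_reclaim_page()), which is the code Vitaly suspects here.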
On Mon, 2020-12-07 at 12:52 +0100, Vitaly Wool wrote:
>
> Thanks. This trace beats me because I don't quite get how this could
> have happened.
I swear there's a mythical creature loose in there somewhere ;-)
Everything looks just peachy up to the instant it goes boom, then you
find in the wreckage that which was most definitely NOT there just a
few ns ago.
> Hitting write_unlock at line 341 would mean that HANDLES_ORPHANED bit
> is set but obviously it isn't.
> Could you please comment out the ".shrink = z3fold_zpool_shrink" line
> and retry?
Unfortunately, that made zero difference.
-Mike
On Mon, Dec 7, 2020 at 1:34 PM Mike Galbraith <[email protected]> wrote:
>
> On Mon, 2020-12-07 at 12:52 +0100, Vitaly Wool wrote:
> >
> > Thanks. This trace beats me because I don't quite get how this could
> > have happened.
>
> I swear there's a mythical creature loose in there somewhere ;-)
> Everything looks just peachy up to the instant it goes boom, then you
> find in the wreckage that which was most definitely NOT there just a
> few ns ago.
>
> > Hitting write_unlock at line 341 would mean that HANDLES_ORPHANED bit
> > is set but obviously it isn't.
> > Could you please comment out the ".shrink = z3fold_zpool_shrink" line
> > and retry?
>
> Unfortunately, that made zero difference.
Okay, I suggest that you submit the patch that changes read_lock() to
write_lock() in __release_z3fold_page() and I'll ack it then.
I would like to rewrite the code so that write_lock is not necessary
there but I don't want to hold you back and it isn't likely that I'll
complete this today.
Best regards,
Vitaly
On 2020-12-07 16:21:20 [+0100], Vitaly Wool wrote:
> On Mon, Dec 7, 2020 at 1:34 PM Mike Galbraith <[email protected]> wrote:
> >
> > Unfortunately, that made zero difference.
>
> Okay, I suggest that you submit the patch that changes read_lock() to
> write_lock() in __release_z3fold_page() and I'll ack it then.
> I would like to rewrite the code so that write_lock is not necessary
> there but I don't want to hold you back and it isn't likely that I'll
> complete this today.
Please with a
Fixes: 4a3ac9311dac3 ("mm/z3fold.c: add inter-page compaction")
tag.
> Best regards,
> Vitaly
Sebastian
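For reference, the patch being asked for here (Mike's read_lock() to write_lock() change in __release_z3fold_page(), carrying the Fixes: tag Sebastian requests) would look roughly like the sketch below. It is reconstructed from the hunks quoted elsewhere in the thread, not the patch as eventually posted:
```
Fixes: 4a3ac9311dac3 ("mm/z3fold.c: add inter-page compaction")

--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ ... @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
 	/* If there are no foreign handles, free the handles array */
-	read_lock(&zhdr->slots->lock);
+	write_lock(&zhdr->slots->lock);
 	for (i = 0; i <= BUDDY_MASK; i++) {
 		if (zhdr->slots->slot[i]) {
 			is_free = false;
 			break;
 		}
 	}
 	if (!is_free)
 		set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
-	read_unlock(&zhdr->slots->lock);
+	write_unlock(&zhdr->slots->lock);
```
Taking the writer side means the slot scan and the decision to free the handles array can no longer run concurrently with a reader that is still dereferencing the same slots.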
On Mon, 2020-12-07 at 16:21 +0100, Vitaly Wool wrote:
> On Mon, Dec 7, 2020 at 1:34 PM Mike Galbraith <[email protected]> wrote:
> >
>
> > Unfortunately, that made zero difference.
>
> Okay, I suggest that you submit the patch that changes read_lock() to
> write_lock() in __release_z3fold_page() and I'll ack it then.
> I would like to rewrite the code so that write_lock is not necessary
> there but I don't want to hold you back and it isn't likely that I'll
> complete this today.
Nah, I'm in no rush... especially not to sign off on "Because the
little voices in my head said this bit should look like that bit over
yonder, and testing _seems_ to indicate they're right about that" :)
-Mike
Hi Mike,
On 2020-12-07 16:41, Mike Galbraith wrote:
> On Mon, 2020-12-07 at 16:21 +0100, Vitaly Wool wrote:
>> On Mon, Dec 7, 2020 at 1:34 PM Mike Galbraith <[email protected]> wrote:
>>>
>>
>>> Unfortunately, that made zero difference.
>>
>> Okay, I suggest that you submit the patch that changes read_lock() to
>> write_lock() in __release_z3fold_page() and I'll ack it then.
>> I would like to rewrite the code so that write_lock is not necessary
>> there but I don't want to hold you back and it isn't likely that I'll
>> complete this today.
>
> Nah, I'm in no rush... especially not to sign off on "Because the
> little voices in my head said this bit should look like that bit over
> yonder, and testing _seems_ to indicate they're right about that" :)
>
> -Mike
>
okay, thanks. Would this make things better:
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 18feaa0bc537..340c38a5ffac 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -303,10 +303,9 @@ static inline void put_z3fold_header(struct z3fold_header *zhdr)
z3fold_page_unlock(zhdr);
}
-static inline void free_handle(unsigned long handle)
+static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
{
struct z3fold_buddy_slots *slots;
- struct z3fold_header *zhdr;
int i;
bool is_free;
@@ -316,22 +315,13 @@ static inline void free_handle(unsigned long handle)
if (WARN_ON(*(unsigned long *)handle == 0))
return;
- zhdr = handle_to_z3fold_header(handle);
slots = handle_to_slots(handle);
write_lock(&slots->lock);
*(unsigned long *)handle = 0;
- if (zhdr->slots == slots) {
- write_unlock(&slots->lock);
- return; /* simple case, nothing else to do */
- }
+ if (zhdr->slots != slots)
+ zhdr->foreign_handles--;
- /* we are freeing a foreign handle if we are here */
- zhdr->foreign_handles--;
is_free = true;
- if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
- write_unlock(&slots->lock);
- return;
- }
for (i = 0; i <= BUDDY_MASK; i++) {
if (slots->slot[i]) {
is_free = false;
@@ -343,6 +333,8 @@ static inline void free_handle(unsigned long handle)
if (is_free) {
struct z3fold_pool *pool = slots_to_pool(slots);
+ if (zhdr->slots == slots)
+ zhdr->slots = NULL;
kmem_cache_free(pool->c_handle, slots);
}
}
@@ -525,8 +517,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- bool is_free = true;
- int i;
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
@@ -536,21 +526,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
list_del_init(&page->lru);
spin_unlock(&pool->lock);
- /* If there are no foreign handles, free the handles array */
- read_lock(&zhdr->slots->lock);
- for (i = 0; i <= BUDDY_MASK; i++) {
- if (zhdr->slots->slot[i]) {
- is_free = false;
- break;
- }
- }
- if (!is_free)
- set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
- read_unlock(&zhdr->slots->lock);
-
- if (is_free)
- kmem_cache_free(pool->c_handle, zhdr->slots);
-
if (locked)
z3fold_page_unlock(zhdr);
@@ -973,6 +948,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
}
}
+ if (zhdr && !zhdr->slots)
+ zhdr->slots = alloc_slots(pool,
+ can_sleep ? GFP_NOIO : GFP_ATOMIC);
return zhdr;
}
@@ -1270,7 +1248,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
}
if (!page_claimed)
- free_handle(handle);
+ free_handle(handle, zhdr);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
atomic64_dec(&pool->pages_nr);
return;
@@ -1429,19 +1407,19 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
ret = pool->ops->evict(pool, middle_handle);
if (ret)
goto next;
- free_handle(middle_handle);
+ free_handle(middle_handle, zhdr);
}
if (first_handle) {
ret = pool->ops->evict(pool, first_handle);
if (ret)
goto next;
- free_handle(first_handle);
+ free_handle(first_handle, zhdr);
}
if (last_handle) {
ret = pool->ops->evict(pool, last_handle);
if (ret)
goto next;
- free_handle(last_handle);
+ free_handle(last_handle, zhdr);
}
next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
--
Best regards,
Vitaly
On Wed, 2020-12-09 at 00:26 +0100, Vitaly Wool wrote:
> Hi Mike,
>
> On 2020-12-07 16:41, Mike Galbraith wrote:
> > On Mon, 2020-12-07 at 16:21 +0100, Vitaly Wool wrote:
> >> On Mon, Dec 7, 2020 at 1:34 PM Mike Galbraith <[email protected]> wrote:
> >>>
> >>
> >>> Unfortunately, that made zero difference.
> >>
> >> Okay, I suggest that you submit the patch that changes read_lock() to
> >> write_lock() in __release_z3fold_page() and I'll ack it then.
> >> I would like to rewrite the code so that write_lock is not necessary
> >> there but I don't want to hold you back and it isn't likely that I'll
> >> complete this today.
> >
> > Nah, I'm in no rush... especially not to sign off on "Because the
> > little voices in my head said this bit should look like that bit over
> > yonder, and testing _seems_ to indicate they're right about that" :)
> >
> > -Mike
> >
>
> okay, thanks. Would this make things better:
Yup, z3fold became RT tolerant with this (un-munged and) applied.
BTW, I don't think my write_lock() hacklet actually plugged the hole
that leads to the below. I think it just reduced the odds of the two
racing paths meeting enough to make it look ~solid in fairly limited test time.
[ 3369.373023] kworker/-7413 4.....12 3368809247us : do_compact_page: zhdr: ffff95d93abd8000 zhdr->slots: ffff95d951f5df80 zhdr->slots->slot[0]: 0
[ 3369.373025] kworker/-7413 4.....12 3368809248us : do_compact_page: old_handle ffff95d951f5df98 *old_handle was ffff95d93abd804f now is ffff95da3ab8104c
[ 3369.373027] kworker/-7413 4.....11 3368809249us : __release_z3fold_page.constprop.25: freed ffff95d951f5df80
[ 3369.373028] ---------------------------------
[ 3369.373029] CR2: 0000000000000018
crash> p debug_handle | grep '\[2'
[2]: ffff95dc1ecac0d0
crash> rd ffff95dc1ecac0d0
ffff95dc1ecac0d0: ffff95d951f5df98 ...Q....
crash> p debug_zhdr | grep '\[2'
[2]: ffff95dc1ecac0c8
crash> rd ffff95dc1ecac0c8
ffff95dc1ecac0c8: ffff95da3ab81000 ...:.... <== kworker is not working on the same zhdr as free_handle()..
crash> p debug_slots | grep '\[2'
[2]: ffff95dc1ecac0c0
crash> rd ffff95dc1ecac0c0
ffff95dc1ecac0c0: ffff95d951f5df80 ...Q.... <== ..but is the same slots, and frees it under free_handle().
crash> bt -sx <== the bt leading to use-after-free + corruption + explosion 1us later
PID: 9334 TASK: ffff95d95a1eb3c0 CPU: 2 COMMAND: "swapoff"
#0 [ffffb4248847f8f0] machine_kexec+0x16e at ffffffffa605f8ce
#1 [ffffb4248847f938] __crash_kexec+0xd2 at ffffffffa614c162
#2 [ffffb4248847f9f8] crash_kexec+0x30 at ffffffffa614d350
#3 [ffffb4248847fa08] oops_end+0xca at ffffffffa602680a
#4 [ffffb4248847fa28] no_context+0x14d at ffffffffa606d7cd
#5 [ffffb4248847fa88] exc_page_fault+0x2b8 at ffffffffa68bdb88
#6 [ffffb4248847fae0] asm_exc_page_fault+0x1e at ffffffffa6a00ace
#7 [ffffb4248847fb68] mark_wakeup_next_waiter+0x51 at ffffffffa60ea121
#8 [ffffb4248847fbd0] rt_mutex_futex_unlock+0x4f at ffffffffa68c93ef
#9 [ffffb4248847fc10] z3fold_zpool_free+0x593 at ffffffffc0ecb663 [z3fold]
#10 [ffffb4248847fc78] zswap_free_entry+0x43 at ffffffffa627c823
#11 [ffffb4248847fc88] zswap_frontswap_invalidate_page+0x8a at ffffffffa627c92a
#12 [ffffb4248847fcb0] __frontswap_invalidate_page+0x48 at ffffffffa627c018
#13 [ffffb4248847fcd8] swapcache_free_entries+0x1ee at ffffffffa6276f5e
#14 [ffffb4248847fd20] free_swap_slot+0x9f at ffffffffa627b8ff
#15 [ffffb4248847fd40] delete_from_swap_cache+0x61 at ffffffffa6274621
#16 [ffffb4248847fd60] try_to_free_swap+0x70 at ffffffffa6277520
#17 [ffffb4248847fd70] unuse_vma+0x55c at ffffffffa627869c
#18 [ffffb4248847fe90] try_to_unuse+0x139 at ffffffffa6278e89
#19 [ffffb4248847fee8] __x64_sys_swapoff+0x1eb at ffffffffa62798cb
#20 [ffffb4248847ff40] do_syscall_64+0x33 at ffffffffa68b9ab3
#21 [ffffb4248847ff50] entry_SYSCALL_64_after_hwframe+0x44 at ffffffffa6a0007c
RIP: 00007fbd835a5d17 RSP: 00007ffd60634458 RFLAGS: 00000202
RAX: ffffffffffffffda RBX: 0000559540e34b60 RCX: 00007fbd835a5d17
RDX: 0000000000000001 RSI: 0000000000000001 RDI: 0000559540e34b60
RBP: 0000000000000001 R8: 00007ffd606344c0 R9: 0000000000000003
R10: 0000559540e34721 R11: 0000000000000202 R12: 0000000000000001
R13: 0000000000000000 R14: 00007ffd606344c0 R15: 0000000000000000
ORIG_RAX: 00000000000000a8 CS: 0033 SS: 002b
crash>
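A note on the debug_handle/debug_zhdr/debug_slots symbols used in the crash session above: the instrumentation behind them was not posted in the thread. A hypothetical sketch of what such per-CPU breadcrumbs might look like, recording what free_handle() is working on so it can be compared post-mortem against the do_compact_page() trace from another CPU (all names and placement assumed):
```c
#include <linux/percpu.h>

/* Hypothetical debug aid, not from the actual posting: one breadcrumb
 * per CPU, written by free_handle() once handle/zhdr/slots are known. */
static DEFINE_PER_CPU(unsigned long, debug_handle);
static DEFINE_PER_CPU(struct z3fold_header *, debug_zhdr);
static DEFINE_PER_CPU(struct z3fold_buddy_slots *, debug_slots);

static inline void debug_record(unsigned long handle,
				struct z3fold_header *zhdr,
				struct z3fold_buddy_slots *slots)
{
	this_cpu_write(debug_handle, handle);	/* crash> p debug_handle */
	this_cpu_write(debug_zhdr, zhdr);	/* crash> p debug_zhdr   */
	this_cpu_write(debug_slots, slots);	/* crash> p debug_slots  */
}
```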
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 18feaa0bc537..340c38a5ffac 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -303,10 +303,9 @@ static inline void put_z3fold_header(struct z3fold_header *zhdr)
> z3fold_page_unlock(zhdr);
> }
>
> -static inline void free_handle(unsigned long handle)
> +static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
> {
> struct z3fold_buddy_slots *slots;
> - struct z3fold_header *zhdr;
> int i;
> bool is_free;
>
> @@ -316,22 +315,13 @@ static inline void free_handle(unsigned long handle)
> if (WARN_ON(*(unsigned long *)handle == 0))
> return;
>
> - zhdr = handle_to_z3fold_header(handle);
> slots = handle_to_slots(handle);
> write_lock(&slots->lock);
> *(unsigned long *)handle = 0;
> - if (zhdr->slots == slots) {
> - write_unlock(&slots->lock);
> - return; /* simple case, nothing else to do */
> - }
> + if (zhdr->slots != slots)
> + zhdr->foreign_handles--;
>
> - /* we are freeing a foreign handle if we are here */
> - zhdr->foreign_handles--;
> is_free = true;
> - if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
> - write_unlock(&slots->lock);
> - return;
> - }
> for (i = 0; i <= BUDDY_MASK; i++) {
> if (slots->slot[i]) {
> is_free = false;
> @@ -343,6 +333,8 @@ static inline void free_handle(unsigned long handle)
> if (is_free) {
> struct z3fold_pool *pool = slots_to_pool(slots);
>
> + if (zhdr->slots == slots)
> + zhdr->slots = NULL;
> kmem_cache_free(pool->c_handle, slots);
> }
> }
> @@ -525,8 +517,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
> {
> struct page *page = virt_to_page(zhdr);
> struct z3fold_pool *pool = zhdr_to_pool(zhdr);
> - bool is_free = true;
> - int i;
>
> WARN_ON(!list_empty(&zhdr->buddy));
> set_bit(PAGE_STALE, &page->private);
> @@ -536,21 +526,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
> list_del_init(&page->lru);
> spin_unlock(&pool->lock);
>
> - /* If there are no foreign handles, free the handles array */
> - read_lock(&zhdr->slots->lock);
> - for (i = 0; i <= BUDDY_MASK; i++) {
> - if (zhdr->slots->slot[i]) {
> - is_free = false;
> - break;
> - }
> - }
> - if (!is_free)
> - set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
> - read_unlock(&zhdr->slots->lock);
> -
> - if (is_free)
> - kmem_cache_free(pool->c_handle, zhdr->slots);
> -
> if (locked)
> z3fold_page_unlock(zhdr);
>
> @@ -973,6 +948,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
> }
> }
>
> + if (zhdr && !zhdr->slots)
> + zhdr->slots = alloc_slots(pool,
> + can_sleep ? GFP_NOIO : GFP_ATOMIC);
> return zhdr;
> }
>
> @@ -1270,7 +1248,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
> }
>
> if (!page_claimed)
> - free_handle(handle);
> + free_handle(handle, zhdr);
> if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
> atomic64_dec(&pool->pages_nr);
> return;
> @@ -1429,19 +1407,19 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
> ret = pool->ops->evict(pool, middle_handle);
> if (ret)
> goto next;
> - free_handle(middle_handle);
> + free_handle(middle_handle, zhdr);
> }
> if (first_handle) {
> ret = pool->ops->evict(pool, first_handle);
> if (ret)
> goto next;
> - free_handle(first_handle);
> + free_handle(first_handle, zhdr);
> }
> if (last_handle) {
> ret = pool->ops->evict(pool, last_handle);
> if (ret)
> goto next;
> - free_handle(last_handle);
> + free_handle(last_handle, zhdr);
> }
> next:
> if (test_bit(PAGE_HEADLESS, &page->private)) {
>
> --
>
> Best regards,
> Vitaly
On Wed, 2020-12-09 at 07:13 +0100, Mike Galbraith wrote:
> On Wed, 2020-12-09 at 00:26 +0100, Vitaly Wool wrote:
> > Hi Mike,
> >
> > On 2020-12-07 16:41, Mike Galbraith wrote:
> > > On Mon, 2020-12-07 at 16:21 +0100, Vitaly Wool wrote:
> > >> On Mon, Dec 7, 2020 at 1:34 PM Mike Galbraith <[email protected]> wrote:
> > >>>
> > >>
> > >>> Unfortunately, that made zero difference.
> > >>
> > >> Okay, I suggest that you submit the patch that changes read_lock() to
> > >> write_lock() in __release_z3fold_page() and I'll ack it then.
> > >> I would like to rewrite the code so that write_lock is not necessary
> > >> there but I don't want to hold you back and it isn't likely that I'll
> > >> complete this today.
> > >
> > > Nah, I'm in no rush... especially not to sign off on "Because the
> > > little voices in my head said this bit should look like that bit over
> > > yonder, and testing _seems_ to indicate they're right about that" :)
> > >
> > > -Mike
> > >
> >
> > okay, thanks. Would this make things better:
>
> Yup, z3fold became RT tolerant with this (un-munged and) applied.
Below is the other change that any RT users of z3fold will need.
mm, z3fold: Remove preempt disabled sections for RT
Replace get_cpu_ptr() with migrate_disable()+this_cpu_ptr() so RT can take
spinlocks that become sleeping locks.
Signed-off-by: Mike Galbraith <[email protected]>
---
mm/z3fold.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -617,14 +617,16 @@ static inline void add_to_unbuddied(stru
{
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
+ struct list_head *unbuddied;
int freechunks = num_free_chunks(zhdr);
+
+ migrate_disable();
+ unbuddied = this_cpu_ptr(pool->unbuddied);
spin_lock(&pool->lock);
list_add(&zhdr->buddy, &unbuddied[freechunks]);
spin_unlock(&pool->lock);
zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
}
}
@@ -861,8 +863,9 @@ static inline struct z3fold_header *__z3
int chunks = size_to_chunks(size), i;
lookup:
+ migrate_disable();
/* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
+ unbuddied = this_cpu_ptr(pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
struct list_head *l = &unbuddied[i];
@@ -880,7 +883,7 @@ static inline struct z3fold_header *__z3
!z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -894,7 +897,7 @@ static inline struct z3fold_header *__z3
test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -909,7 +912,7 @@ static inline struct z3fold_header *__z3
kref_get(&zhdr->refcount);
break;
}
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (!zhdr) {
int cpu;