I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix
infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
and put it on top of 4.19.13. In addition to this, I uninlined
list_add_leaf_cfs_rq for debugging.
This revealed a new bug that we hadn't reached before, because we kept
hitting crashes from the previous issue first. When we run with rapidly
changing cgroups that use CFS bandwidth control and, in addition, the
cpuset cgroup, we see the crash below. Specifically, it seems to occur
when a cgroup is throttled and we change its allowed cpuset.
[ 590.128698] list_add corruption. next->prev should be prev
(ffff8b129c904388), but was 0000000000000000. (next=ffff8b129c904200).
[ 590.131971] ------------[ cut here ]------------
[ 590.133248] kernel BUG at lib/list_debug.c:25!
[ 590.134478] invalid opcode: 0000 [#1] SMP PTI
[ 590.135673] CPU: 88 PID: 65056 Comm: runc:[2:INIT] Kdump: loaded
Tainted: G OE 4.19.13netflixdebug-46061-gf9fd7e5-dirty
#8
[ 590.138828] Hardware name: Amazon EC2 r5.24xlarge/, BIOS 1.0 10/16/2017
[ 590.140574] RIP: 0010:__list_add_valid+0x38/0x70
[ 590.141839] Code: 75 19 48 8b 32 48 39 f0 75 22 48 39 fa 74 34 48
39 f8 74 2f b8 01 00 00 00 5d c3 48 89 c1 48 c7 c7 d8 98 8c a9 e8 1b
88 bd ff <0f> 0b 48 89 d1 48 c7 c7 30 99 8c a9 48 89 f2 48 89 c6 e8 04
88 bd
[ 590.146570] RSP: 0018:ffffaedba3f23b30 EFLAGS: 00010086
[ 590.148007] RAX: 0000000000000075 RBX: ffff8b1292b93200 RCX: 0000000000000000
[ 590.149878] RDX: 0000000000000000 RSI: ffff8b1577c16858 RDI: ffff8b1577c16858
[ 590.151769] RBP: ffffaedba3f23b30 R08: 0000000000000000 R09: 0000000000000000
[ 590.153668] R10: ffffaedba3f23b20 R11: ffffffffa870aa8f R12: ffff8b1577ce3f40
[ 590.155543] R13: ffff8b1292b93388 R14: ffff8b129c904388 R15: ffff8b129c904200
[ 590.157452] FS: 00007f8143856b80(0000) GS:ffff8b1577c00000(0000)
knlGS:0000000000000000
[ 590.160372] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 590.162742] CR2: 000000c4202d0000 CR3: 000000bc6431c003 CR4: 00000000007606e0
[ 590.165463] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 590.168202] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 590.170890] PKRU: 55555554
[ 590.172514] Call Trace:
[ 590.174088] list_add_leaf_cfs_rq+0x91/0x1a0
[ 590.176120] enqueue_entity+0x177/0x5f0
[ 590.178032] ? __lock_is_held+0x5e/0xa0
[ 590.179980] enqueue_task_fair+0x94/0x1d0
[ 590.181935] activate_task+0x54/0xa0
[ 590.183775] ttwu_do_activate+0x54/0xb0
[ 590.185649] try_to_wake_up+0x225/0x510
[ 590.187609] wake_up_state+0x10/0x20
[ 590.189443] signal_wake_up_state+0x19/0x30
[ 590.191418] zap_other_threads+0x74/0x90
[ 590.193327] flush_old_exec+0x9c/0x740
[ 590.195199] load_elf_binary+0x3a1/0x1760
[ 590.197220] ? find_held_lock+0x35/0xa0
[ 590.199110] ? find_held_lock+0x35/0xa0
[ 590.200998] ? search_binary_handler+0x88/0x1d0
[ 590.203082] search_binary_handler+0x9d/0x1d0
[ 590.205107] __do_execve_file.isra.27+0x709/0xa90
[ 590.207313] __x64_sys_execve+0x39/0x50
[ 590.209233] do_syscall_64+0x65/0x1a0
[ 590.211126] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 590.213341] RIP: 0033:0x5606439cea2b
[ 590.215195] Code: 8b 7c 24 10 48 8b 74 24 18 48 8b 54 24 20 49 c7
c2 00 00 00 00 49 c7 c0 00 00 00 00 49 c7 c1 00 00 00 00 48 8b 44 24
08 0f 05 <48> 3d 01 f0 ff ff 76 1b 48 c7 44 24 28 ff ff ff ff 48 c7 44
24 30
[ 590.222457] RSP: 002b:000000c42027d0b8 EFLAGS: 00000206 ORIG_RAX:
000000000000003b
[ 590.226164] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00005606439cea2b
[ 590.228883] RDX: 000000c420226c60 RSI: 000000c420214400 RDI: 000000c4202d0210
[ 590.231567] RBP: 000000c42027d150 R08: 0000000000000000 R09: 0000000000000000
[ 590.234237] R10: 0000000000000000 R11: 0000000000000206 R12: ffffffffffffffff
[ 590.237028] R13: 0000000000000039 R14: 0000000000000038 R15: 0000000000000055
The branch it occurs on is the last one in list_add_leaf_cfs_rq, where
the parent has not already been added, so the cfs_rq is inserted at
rq->tmp_alone_branch.
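For context, here is the function in question, paraphrased from v4.19's
kernel/sched/fair.c (the comments and the local cpu variable are mine, so
treat this as a sketch rather than the verbatim source); the crash fires
in the final else branch:

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		struct rq *rq = rq_of(cfs_rq);
		int cpu = cpu_of(rq);

		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
			/*
			 * Parent is already on the list: insert right before
			 * it. The branch being built is now fully connected,
			 * so tmp_alone_branch goes back to the list head.
			 */
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		} else if (!cfs_rq->tg->parent) {
			/* Root cfs_rq: goes at the tail of the whole list. */
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq->leaf_cfs_rq_list);
			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		} else {
			/*
			 * Parent has not been added yet: queue this cfs_rq at
			 * tmp_alone_branch and leave tmp_alone_branch pointing
			 * at it, so the parent lands right after it when it is
			 * added in turn.
			 */
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				rq->tmp_alone_branch);
			rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
		}

		cfs_rq->on_list = 1;
	}
}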
# So, ffff8b1292b93388 is the new item, and it looks reasonable:
crash> struct cfs_rq -l cfs_rq.leaf_cfs_rq_list ffff8b1292b93388
struct cfs_rq {
load = {
weight = 1048576,
inv_weight = 0
},
runnable_weight = 1048576,
nr_running = 1,
h_nr_running = 0,
exec_clock = 0,
min_vruntime = 18446744073708503040,
tasks_timeline = {
rb_root = {
rb_node = 0xffff8b126ec79f98
},
rb_leftmost = 0xffff8b126ec79f98
},
curr = 0x0,
next = 0x0,
last = 0x0,
skip = 0x0,
nr_spread_over = 0,
avg = {
last_update_time = 590128483328,
load_sum = 188416,
runnable_load_sum = 188416,
util_sum = 94210,
period_contrib = 387,
load_avg = 4,
runnable_load_avg = 4,
util_avg = 2,
util_est = {
enqueued = 0,
ewma = 0
}
},
removed = {
lock = {
raw_lock = {
{
val = {
counter = 0
},
{
locked = 0 '\000',
pending = 0 '\000'
},
{
locked_pending = 0,
tail = 0
}
}
},
magic = 3735899821,
owner_cpu = 4294967295,
owner = 0xffffffffffffffff,
dep_map = {
key = 0xffffffffa9da21d0,
class_cache = {0x0, 0x0},
name = 0xffffffffa987c66f "&cfs_rq->removed.lock"
}
},
nr = 0,
load_avg = 0,
util_avg = 0,
runnable_sum = 0
},
tg_load_avg_contrib = 4,
propagate = 1,
prop_runnable_sum = 184,
h_load = 0,
last_h_load_update = 0,
h_load_next = 0x0,
rq = 0xffff8b1577ce3f40,
freed = 0,
retired = 0,
manipulated = 0,
on_list = 0,
leaf_cfs_rq_list = {
next = 0x0,
prev = 0x0
},
tg = 0xffff8b12c011d100,
runtime_enabled = 1,
expires_seq = 0,
runtime_expires = 0,
runtime_remaining = 0,
throttled_clock = 0,
throttled_clock_task = 589819246302,
throttled_clock_task_time = 0,
throttled = 0,
throttle_count = 0,
throttled_list = {
next = 0xffff8b1292b933d8,
prev = 0xffff8b1292b933d8
}
}
Let's grab its rq:
crash> struct cfs_rq.rq -l cfs_rq.leaf_cfs_rq_list ffff8b1292b93388
rq = 0xffff8b1577ce3f40
The rq's leaf_cfs_rq_list looks self-consistent.
crash> struct rq.leaf_cfs_rq_list 0xffff8b1577ce3f40
leaf_cfs_rq_list = {
next = 0xffff8b12ea889588,
prev = 0xffff8b1577ce4188
}
crash> list 0xffff8b12ea889588
ffff8b12ea889588
ffff8b12e0673788
ffff8b12dd860588
ffff8b12e98a8188
ffff8b12dd63b188
ffff8b12eab16588
ffff8b12c9954388
ffff8b12eaa63188
ffff8b12ee361d88
ffff8b12e9b98188
ffff8b12c98ab388
ffff8b12c2c5b788
ffff8b12c08f6788
ffff8b12e3c4ab88
ffff8b12eaa54788
ffff8b12e9ad1b88
ffff8b12e982eb88
ffff8b12ea069b88
ffff8b12ea220388
ffff8b1577ce4188
ffff8b1577ce4940
What's our current tmp_alone_branch?
crash> struct rq.tmp_alone_branch 0xffff8b1577ce3f40
tmp_alone_branch = 0xffff8b129c904388
This seems malformed: 0xffff8b129c904388 doesn't appear anywhere in the
list we just walked, and the entries it chains through have NULL or
garbage prev pointers:
crash> list -s cfs_rq.leaf_cfs_rq_list -l cfs_rq.leaf_cfs_rq_list
0xffff8b129c904388
ffff8b129c904388
leaf_cfs_rq_list = {
next = 0xffff8b129c904200,
prev = 0xffff8b129c9043b1
}
ffff8b129c904200
leaf_cfs_rq_list = {
next = 0xffff8b129c904e00,
prev = 0x0
}
ffff8b129c904e00
leaf_cfs_rq_list = {
next = 0xffff8b129c906400,
prev = 0x0
}
ffff8b129c906400
leaf_cfs_rq_list = {
next = 0xffff8b129c907400,
prev = 0x0
}
ffff8b129c907400
leaf_cfs_rq_list = {
next = 0xffff8b129c904c00,
prev = 0x0
}
ffff8b129c904c00
leaf_cfs_rq_list = {
next = 0xffff8b129c906e00,
prev = 0x0
}
ffff8b129c906e00
leaf_cfs_rq_list = {
next = 0xffff8b129c906a00,
prev = 0x0
}
ffff8b129c906a00
leaf_cfs_rq_list = {
next = 0xffff8b129c904800,
prev = 0x0
}
ffff8b129c904800
leaf_cfs_rq_list = {
next = 0xffff8b129c906600,
prev = 0x0
}
ffff8b129c906600
leaf_cfs_rq_list = {
next = 0xffff8b129c904400,
prev = 0x0
}
ffff8b129c904400
leaf_cfs_rq_list = {
next = 0xffff8b129c905200,
prev = 0x0
}
ffff8b129c905200
leaf_cfs_rq_list = {
next = 0xffff8b129c906200,
prev = 0x0
}
ffff8b129c906200
leaf_cfs_rq_list = {
next = 0xffff8b129c905400,
prev = 0x0
}
ffff8b129c905400
leaf_cfs_rq_list = {
next = 0xffff8b129c907600,
prev = 0x0
}
ffff8b129c907600
leaf_cfs_rq_list = {
next = 0xffff8b129c907000,
prev = 0x0
}
ffff8b129c907000
leaf_cfs_rq_list = {
next = 0xffff8b129c906000,
prev = 0x0
}
ffff8b129c906000
leaf_cfs_rq_list = {
next = 0xffff8b129c907c00,
prev = 0x0
}
ffff8b129c907c00
leaf_cfs_rq_list = {
next = 0xffff8b129c905c00,
prev = 0x0
}
ffff8b129c905c00
leaf_cfs_rq_list = {
next = 0xffff8b129c905e00,
prev = 0x0
}
ffff8b129c905e00
leaf_cfs_rq_list = {
next = 0xffff8b129c907e00,
prev = 0x0
}
ffff8b129c907e00
leaf_cfs_rq_list = {
next = 0xffff8b129c905000,
prev = 0x0
}
ffff8b129c905000
leaf_cfs_rq_list = {
next = 0xffff8b129c906c00,
prev = 0x0
}
ffff8b129c906c00
leaf_cfs_rq_list = {
next = 0xffff8b129c904000,
prev = 0x0
}
ffff8b129c904000
leaf_cfs_rq_list = {
next = 0x0,
prev = 0x0
}
--
I'm not sure what's causing this. Perhaps there's some issue if the
allowed CPU set changes under throttle. It appears that
tmp_alone_branch points to a cfs_rq that's already been freed. Since I
couldn't replicate this with KASAN enabled, we verified it by
overwriting each cfs_rq with 0xff in free_fair_sched_group; when we
then examined the cfs_rq that rq->tmp_alone_branch pointed to, it was
filled with that pattern.
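For reference, a minimal sketch of that debugging aid, assuming the v4.19
shape of free_fair_sched_group() (the poisoning memset is the only
addition; the surrounding body is from memory, so check it against your
tree before use):

void free_fair_sched_group(struct task_group *tg)
{
	int i;

	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		if (tg->cfs_rq && tg->cfs_rq[i]) {
			/*
			 * Poison the cfs_rq before freeing it so that a stale
			 * rq->tmp_alone_branch pointing into it shows up as
			 * 0xff bytes in a crash dump.
			 */
			memset(tg->cfs_rq[i], 0xff, sizeof(struct cfs_rq));
			kfree(tg->cfs_rq[i]);
		}
		if (tg->se)
			kfree(tg->se[i]);
	}

	kfree(tg->cfs_rq);
	kfree(tg->se);
}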
On Wed, Jan 9, 2019 at 2:14 PM Sargun Dhillon <[email protected]> wrote:
>
> I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix
> infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
> and put it on top of 4.19.13. In addition to this, I uninlined
> list_add_leaf_cfs_rq for debugging.
>
> This revealed a new bug that we didn't get to because we kept getting
> crashes from the previous issue. When we are running with cgroups that
> are rapidly changing, with CFS bandwidth control, and in addition
> using the cpusets cgroup, we see this crash. Specifically, it seems to
> occur with cgroups that are throttled and we change the allowed
> cpuset.
>
This patch from Gabriel should fix the problem:
[PATCH] sched/fair: Reset tmp_alone_branch on cfs_rq delete
When a child cfs_rq is added to the leaf cfs_rq list before its parent,
tmp_alone_branch is set to point to the child in preparation for the
parent being added.
If the child is deleted before the parent is added, then
tmp_alone_branch points to a freed cfs_rq. Any future reference to
tmp_alone_branch will result in a use after free.
Signed-off-by: Gabriel Hartmann <[email protected]>
Reported-by: Sargun Dhillon <[email protected]>
---
kernel/sched/fair.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7137bc343b4a..0987629cbb76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -347,6 +347,11 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->on_list) {
+		struct rq *rq = rq_of(cfs_rq);
+
+		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 		cfs_rq->on_list = 0;
 	}
On Wed, 9 Jan 2019 at 23:43, Sargun Dhillon <[email protected]> wrote:
>
> On Wed, Jan 9, 2019 at 2:14 PM Sargun Dhillon <[email protected]> wrote:
> >
> > I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix
> > infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
> > and put it on top of 4.19.13. In addition to this, I uninlined
> > list_add_leaf_cfs_rq for debugging.
> >
> > This revealed a new bug that we didn't get to because we kept getting
> > crashes from the previous issue. When we are running with cgroups that
> > are rapidly changing, with CFS bandwidth control, and in addition
> > using the cpusets cgroup, we see this crash. Specifically, it seems to
> > occur with cgroups that are throttled and we change the allowed
> > cpuset.
Thanks for the context. I will try to reproduce the problem and
understand how we can stop in the middle of walking the sched_entity
branch with a parent not already added.
How many cgroup levels have you got in your setup?
> >
>
> This patch from Gabriel should fix the problem:
>
>
> [PATCH] sched/fair: Reset tmp_alone_branch on cfs_rq delete
>
> When a child cfs_rq is added to the leaf cfs_rq list before its parent
> tmp_alone_branch is set to point to the child in preparation for the
> parent being added.
>
> If the child is deleted before the parent is added then tmp_alone_branch
> points to a freed cfs_rq. Any future reference to tmp_alone_branch will
> result in a use after free.
So, the patch below is a temporary fix that helps to recover from the
situation where tmp_alone_branch doesn't point back to
rq->leaf_cfs_rq_list.
But this situation should not happen in the first place.
>
> Signed-off-by: Gabriel Hartmann <[email protected]>
> Reported-by: Sargun Dhillon <[email protected]>
> ---
> kernel/sched/fair.c | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7137bc343b4a..0987629cbb76 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -347,6 +347,11 @@ static inline void list_add_leaf_cfs_rq(struct
> cfs_rq *cfs_rq)
> static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> {
> if (cfs_rq->on_list) {
> + struct rq *rq = rq_of(cfs_rq);
> +
> + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
> + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
> +
> list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
> cfs_rq->on_list = 0;
> }
On Fri, 18 Jan 2019 at 11:16, Vincent Guittot
<[email protected]> wrote:
>
> On Wed, 9 Jan 2019 at 23:43, Sargun Dhillon <[email protected]> wrote:
> >
> > On Wed, Jan 9, 2019 at 2:14 PM Sargun Dhillon <[email protected]> wrote:
> > >
> > > I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix
> > > infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
> > > and put it on top of 4.19.13. In addition to this, I uninlined
> > > list_add_leaf_cfs_rq for debugging.
With the fix above applied, the code that manages the leaf_cfs_rq_list
has been the same since v4.9.
Have you noticed a similar problem on older kernel versions between
v4.9 and v4.19? The problem might have been introduced while modifying
other parts of the scheduler, like the sequence for adding/removing
cgroups.
Knowing the most recent kernel version without the problem could help
to narrow it down.
Thanks,
Vincent
> > >
> > > This revealed a new bug that we didn't get to because we kept getting
> > > crashes from the previous issue. When we are running with cgroups that
> > > are rapidly changing, with CFS bandwidth control, and in addition
> > > using the cpusets cgroup, we see this crash. Specifically, it seems to
> > > occur with cgroups that are throttled and we change the allowed
> > > cpuset.
>
> Thanks for the context, I will try to reproduce the problem and
> understand how we can stop in the middle of walking to the
> sched_entity branch with a parent not already added
>
> How many cgroup level have you got in you setup ?
>
> > >
> >
> > This patch from Gabriel should fix the problem:
> >
> >
> > [PATCH] sched/fair: Reset tmp_alone_branch on cfs_rq delete
> >
> > When a child cfs_rq is added to the leaf cfs_rq list before its parent
> > tmp_alone_branch is set to point to the child in preparation for the
> > parent being added.
> >
> > If the child is deleted before the parent is added then tmp_alone_branch
> > points to a freed cfs_rq. Any future reference to tmp_alone_branch will
> > result in a use after free.
>
> So, the patch below is a temporary fix that helps to recover from the
> situation where tmp_alone_branch doesn't finished back to
> rq->leaf_cfs_rq_list
> But this situation should not happened at the beginning
>
> >
> > Signed-off-by: Gabriel Hartmann <[email protected]>
> > Reported-by: Sargun Dhillon <[email protected]>
> > ---
> > kernel/sched/fair.c | 5 +++++
> > 1 file changed, 5 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 7137bc343b4a..0987629cbb76 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -347,6 +347,11 @@ static inline void list_add_leaf_cfs_rq(struct
> > cfs_rq *cfs_rq)
> > static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> > {
> > if (cfs_rq->on_list) {
> > + struct rq *rq = rq_of(cfs_rq);
> > +
> > + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
> > + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
> > +
> > list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
> > cfs_rq->on_list = 0;
> > }
Hi Sargun,
Le Friday 18 Jan 2019 à 15:06:28 (+0100), Vincent Guittot a écrit :
> On Fri, 18 Jan 2019 at 11:16, Vincent Guittot
> <[email protected]> wrote:
> >
> > On Wed, 9 Jan 2019 at 23:43, Sargun Dhillon <[email protected]> wrote:
> > >
> > > On Wed, Jan 9, 2019 at 2:14 PM Sargun Dhillon <[email protected]> wrote:
> > > >
> > > > I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix
> > > > infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
> > > > and put it on top of 4.19.13. In addition to this, I uninlined
> > > > list_add_leaf_cfs_rq for debugging.
>
> With the fix above applied, the code that manages the leaf_cfs_rq_list
> is the same since v4.9.
> Have you noticed similar problem on other older kernel version between
> v4.9 and v4.19 ? The problem might have been introduce while modifying
> other part of the scheduler like the sequence for adding/removing
> cgroup.
>
> Knowing the most recent kernel version without the problem could help
> to narrow the problem
>
> Thanks,
> Vincent
>
> > > >
> > > > This revealed a new bug that we didn't get to because we kept getting
> > > > crashes from the previous issue. When we are running with cgroups that
> > > > are rapidly changing, with CFS bandwidth control, and in addition
> > > > using the cpusets cgroup, we see this crash. Specifically, it seems to
> > > > occur with cgroups that are throttled and we change the allowed
> > > > cpuset.
> >
> > Thanks for the context, I will try to reproduce the problem and
> > understand how we can stop in the middle of walking to the
> > sched_entity branch with a parent not already added
> >
> > How many cgroup level have you got in you setup ?
> >
> > > >
> > >
> > > This patch from Gabriel should fix the problem:
> > >
> > >
> > > [PATCH] sched/fair: Reset tmp_alone_branch on cfs_rq delete
> > >
> > > When a child cfs_rq is added to the leaf cfs_rq list before its parent
> > > tmp_alone_branch is set to point to the child in preparation for the
> > > parent being added.
> > >
> > > If the child is deleted before the parent is added then tmp_alone_branch
> > > points to a freed cfs_rq. Any future reference to tmp_alone_branch will
> > > result in a use after free.
> >
> > So, the patch below is a temporary fix that helps to recover from the
> > situation where tmp_alone_branch doesn't finished back to
> > rq->leaf_cfs_rq_list
> > But this situation should not happened at the beginning
I have been able to reproduce the situation where tmp_alone_branch
doesn't point to rq->leaf_cfs_rq_list after enqueuing a task.
Can you try the patch below, which ensures all cfs_rq of a cgroup
branch will be added to the list even if throttled?
The algorithm used to order cfs_rq in rq->leaf_cfs_rq_list assumes that
the first time a cfs_rq is used we will walk down to the root and
finish by adding either a cfs_rq without a parent or a cfs_rq whose
parent is already on the list. But this is not always true in the
presence of throttling: a cfs_rq can be throttled even if it has never
been used, because other CPUs of the cgroup have already used all the
bandwidth, so we are not sure to walk down to the root and add every
cfs_rq to the list.
Ensure that all cfs_rq will be added to the list even if they are
throttled.
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6483834..ae468ab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -352,6 +352,20 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	}
 }
 
+static inline void list_add_branch_cfs_rq(struct sched_entity *se, struct rq *rq)
+{
+	struct cfs_rq *cfs_rq;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		list_add_leaf_cfs_rq(cfs_rq);
+
+		/* If parent is already in the list, we can stop */
+		if (rq->tmp_alone_branch == &rq->leaf_cfs_rq_list)
+			break;
+	}
+}
+
 /* Iterate through all leaf cfs_rq's on a runqueue: */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -5177,6 +5191,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}
 
+	/* Ensure that all cfs_rq have been added to the list */
+	list_add_branch_cfs_rq(se, rq);
+
 	hrtick_update(rq);
 }
> >
> >
> > >
> > > Signed-off-by: Gabriel Hartmann <[email protected]>
> > > Reported-by: Sargun Dhillon <[email protected]>
> > > ---
> > > kernel/sched/fair.c | 5 +++++
> > > 1 file changed, 5 insertions(+)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 7137bc343b4a..0987629cbb76 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -347,6 +347,11 @@ static inline void list_add_leaf_cfs_rq(struct
> > > cfs_rq *cfs_rq)
> > > static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> > > {
> > > if (cfs_rq->on_list) {
> > > + struct rq *rq = rq_of(cfs_rq);
> > > +
> > > + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
> > > + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
> > > +
> > > list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
> > > cfs_rq->on_list = 0;
> > > }
Hi Sargun,
On Mon, 21 Jan 2019 at 15:46, Vincent Guittot
<[email protected]> wrote:
>
> Hi Sargun,
>
> Le Friday 18 Jan 2019 à 15:06:28 (+0100), Vincent Guittot a écrit :
> > On Fri, 18 Jan 2019 at 11:16, Vincent Guittot
> > <[email protected]> wrote:
> > >
> > > On Wed, 9 Jan 2019 at 23:43, Sargun Dhillon <[email protected]> wrote:
> > > >
> > > > On Wed, Jan 9, 2019 at 2:14 PM Sargun Dhillon <[email protected]> wrote:
> > > > >
> > > > > I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix
> > > > > infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
> > > > > and put it on top of 4.19.13. In addition to this, I uninlined
> > > > > list_add_leaf_cfs_rq for debugging.
> >
> > With the fix above applied, the code that manages the leaf_cfs_rq_list
> > is the same since v4.9.
> > Have you noticed similar problem on other older kernel version between
> > v4.9 and v4.19 ? The problem might have been introduce while modifying
> > other part of the scheduler like the sequence for adding/removing
> > cgroup.
> >
> > Knowing the most recent kernel version without the problem could help
> > to narrow the problem
> >
> > Thanks,
> > Vincent
> >
> > > > >
> > > > > This revealed a new bug that we didn't get to because we kept getting
> > > > > crashes from the previous issue. When we are running with cgroups that
> > > > > are rapidly changing, with CFS bandwidth control, and in addition
> > > > > using the cpusets cgroup, we see this crash. Specifically, it seems to
> > > > > occur with cgroups that are throttled and we change the allowed
> > > > > cpuset.
> > >
> > > Thanks for the context, I will try to reproduce the problem and
> > > understand how we can stop in the middle of walking to the
> > > sched_entity branch with a parent not already added
> > >
> > > How many cgroup level have you got in you setup ?
> > >
> > > > >
> > > >
> > > > This patch from Gabriel should fix the problem:
> > > >
> > > >
> > > > [PATCH] sched/fair: Reset tmp_alone_branch on cfs_rq delete
> > > >
> > > > When a child cfs_rq is added to the leaf cfs_rq list before its parent
> > > > tmp_alone_branch is set to point to the child in preparation for the
> > > > parent being added.
> > > >
> > > > If the child is deleted before the parent is added then tmp_alone_branch
> > > > points to a freed cfs_rq. Any future reference to tmp_alone_branch will
> > > > result in a use after free.
> > >
> > > So, the patch below is a temporary fix that helps to recover from the
> > > situation where tmp_alone_branch doesn't finished back to
> > > rq->leaf_cfs_rq_list
> > > But this situation should not happened at the beginning
>
> I have been able to reproduce the situation where tmp_alone_branch doesn't
> point to rq->leaf_cfs_rq_list after enqueuing a task.
>
> Can you try the patch below which ensures all cfs_rq of a cgroup branch will
> be added in the list even if throttled ?
Did you get a chance to test this patch?
Regards,
Vincent
>
> The algorithm used to order cfs_rq in rq->leaf_cfs_rq_list assumes that
> it will walk down to root the 1st time a cfs_rq is used and we will finished
> to add either a cfs_rq without parent or a cfs_rq with a parent that is already
> on the list. But this is not always true in presence of throttling.
> Because a cfs_rq can be throttled even if it has never been used but other CPUS
> of the cgroup have already used all the bandwdith, we are not sure to go down to
> the root and add all cfs_rq in the list.
>
> Ensure that all cfs_rq will be added in the list even if they are throttled.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> kernel/sched/fair.c | 17 +++++++++++++++++
> 1 file changed, 17 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6483834..ae468ab 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -352,6 +352,20 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> }
> }
>
> +static inline void list_add_branch_cfs_rq(struct sched_entity *se, struct rq *rq)
> +{
> +struct cfs_rq *cfs_rq;
> +
> + for_each_sched_entity(se) {
> + cfs_rq = cfs_rq_of(se);
> + list_add_leaf_cfs_rq(cfs_rq);
> +
> + /* If parent is already in the list, we can stop */
> + if (rq->tmp_alone_branch == &rq->leaf_cfs_rq_list)
> + break;
> + }
> +}
> +
> /* Iterate through all leaf cfs_rq's on a runqueue: */
> #define for_each_leaf_cfs_rq(rq, cfs_rq) \
> list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
> @@ -5177,6 +5191,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> }
>
> + /* Ensure that all cfs_rq have been added to the list */
> + list_add_branch_cfs_rq(se, rq);
> +
> hrtick_update(rq);
> }
>
>
>
> > >
> > >
> > > >
> > > > Signed-off-by: Gabriel Hartmann <[email protected]>
> > > > Reported-by: Sargun Dhillon <[email protected]>
> > > > ---
> > > > kernel/sched/fair.c | 5 +++++
> > > > 1 file changed, 5 insertions(+)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 7137bc343b4a..0987629cbb76 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -347,6 +347,11 @@ static inline void list_add_leaf_cfs_rq(struct
> > > > cfs_rq *cfs_rq)
> > > > static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> > > > {
> > > > if (cfs_rq->on_list) {
> > > > + struct rq *rq = rq_of(cfs_rq);
> > > > +
> > > > + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
> > > > + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
> > > > +
> > > > list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
> > > > cfs_rq->on_list = 0;
> > > > }
Hi Vincent,
On Fri, Jan 25, 2019 at 6:31 AM Vincent Guittot
<[email protected]> wrote:
>
> Hi Sargun,
>
> On Mon, 21 Jan 2019 at 15:46, Vincent Guittot
> <[email protected]> wrote:
> >
> > Hi Sargun,
> >
> > Le Friday 18 Jan 2019 à 15:06:28 (+0100), Vincent Guittot a écrit :
> > > On Fri, 18 Jan 2019 at 11:16, Vincent Guittot
> > > <[email protected]> wrote:
> > > >
> > > > On Wed, 9 Jan 2019 at 23:43, Sargun Dhillon <[email protected]> wrote:
> > > > >
> > > > > On Wed, Jan 9, 2019 at 2:14 PM Sargun Dhillon <[email protected]> wrote:
> > > > > >
> > > > > > I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix
> > > > > > infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
> > > > > > and put it on top of 4.19.13. In addition to this, I uninlined
> > > > > > list_add_leaf_cfs_rq for debugging.
> > >
> > > With the fix above applied, the code that manages the leaf_cfs_rq_list
> > > is the same since v4.9.
> > > Have you noticed similar problem on other older kernel version between
> > > v4.9 and v4.19 ? The problem might have been introduce while modifying
> > > other part of the scheduler like the sequence for adding/removing
> > > cgroup.
> > >
> > > Knowing the most recent kernel version without the problem could help
> > > to narrow the problem
> > >
> > > Thanks,
> > > Vincent
> > >
> > > > > >
> > > > > > This revealed a new bug that we didn't get to because we kept getting
> > > > > > crashes from the previous issue. When we are running with cgroups that
> > > > > > are rapidly changing, with CFS bandwidth control, and in addition
> > > > > > using the cpusets cgroup, we see this crash. Specifically, it seems to
> > > > > > occur with cgroups that are throttled and we change the allowed
> > > > > > cpuset.
> > > >
> > > > Thanks for the context, I will try to reproduce the problem and
> > > > understand how we can stop in the middle of walking to the
> > > > sched_entity branch with a parent not already added
> > > >
> > > > How many cgroup level have you got in you setup ?
> > > >
> > > > > >
> > > > >
> > > > > This patch from Gabriel should fix the problem:
> > > > >
> > > > >
> > > > > [PATCH] sched/fair: Reset tmp_alone_branch on cfs_rq delete
> > > > >
> > > > > When a child cfs_rq is added to the leaf cfs_rq list before its parent
> > > > > tmp_alone_branch is set to point to the child in preparation for the
> > > > > parent being added.
> > > > >
> > > > > If the child is deleted before the parent is added then tmp_alone_branch
> > > > > points to a freed cfs_rq. Any future reference to tmp_alone_branch will
> > > > > result in a use after free.
> > > >
> > > > So, the patch below is a temporary fix that helps to recover from the
> > > > situation where tmp_alone_branch doesn't finished back to
> > > > rq->leaf_cfs_rq_list
> > > > But this situation should not happened at the beginning
> >
> > I have been able to reproduce the situation where tmp_alone_branch doesn't
> > point to rq->leaf_cfs_rq_list after enqueuing a task.
> >
> > Can you try the patch below which ensures all cfs_rq of a cgroup branch will
> > be added in the list even if throttled ?
>
> Did you get a chance to test this patch ?
>
> Regards,
> Vincent
>
> >
> > The algorithm used to order cfs_rq in rq->leaf_cfs_rq_list assumes that
> > it will walk down to root the 1st time a cfs_rq is used and we will finished
> > to add either a cfs_rq without parent or a cfs_rq with a parent that is already
> > on the list. But this is not always true in presence of throttling.
> > Because a cfs_rq can be throttled even if it has never been used but other CPUS
> > of the cgroup have already used all the bandwdith, we are not sure to go down to
> > the root and add all cfs_rq in the list.
> >
> > Ensure that all cfs_rq will be added in the list even if they are throttled.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> > kernel/sched/fair.c | 17 +++++++++++++++++
> > 1 file changed, 17 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 6483834..ae468ab 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -352,6 +352,20 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> > }
> > }
> >
> > +static inline void list_add_branch_cfs_rq(struct sched_entity *se, struct rq *rq)
> > +{
> > +struct cfs_rq *cfs_rq;
> > +
> > + for_each_sched_entity(se) {
> > + cfs_rq = cfs_rq_of(se);
> > + list_add_leaf_cfs_rq(cfs_rq);
> > +
> > + /* If parent is already in the list, we can stop */
> > + if (rq->tmp_alone_branch == &rq->leaf_cfs_rq_list)
> > + break;
> > + }
> > +}
> > +
> > /* Iterate through all leaf cfs_rq's on a runqueue: */
> > #define for_each_leaf_cfs_rq(rq, cfs_rq) \
> > list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
> > @@ -5177,6 +5191,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >
> > }
> >
> > + /* Ensure that all cfs_rq have been added to the list */
> > + list_add_branch_cfs_rq(se, rq);
> > +
> > hrtick_update(rq);
> > }
> >
> >
> >
> > > >
> > > >
> > > > >
> > > > > Signed-off-by: Gabriel Hartmann <[email protected]>
> > > > > Reported-by: Sargun Dhillon <[email protected]>
> > > > > ---
> > > > > kernel/sched/fair.c | 5 +++++
> > > > > 1 file changed, 5 insertions(+)
> > > > >
> > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > > index 7137bc343b4a..0987629cbb76 100644
> > > > > --- a/kernel/sched/fair.c
> > > > > +++ b/kernel/sched/fair.c
> > > > > @@ -347,6 +347,11 @@ static inline void list_add_leaf_cfs_rq(struct
> > > > > cfs_rq *cfs_rq)
> > > > > static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> > > > > {
> > > > > if (cfs_rq->on_list) {
> > > > > + struct rq *rq = rq_of(cfs_rq);
> > > > > +
> > > > > + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
> > > > > + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
> > > > > +
> > > > > list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
> > > > > cfs_rq->on_list = 0;
> > > > > }
Apologies for the slow turnaround on this. We have tried both
approaches to fixing the bug now. In both cases, for a particularly
long-duration, CPU-intensive workload, we are seeing a ~33% slowdown.
-- Gabriel
Hi Gabriel,
On Sat, 16 Feb 2019 at 00:06, Gabriel Hartmann
<[email protected]> wrote:
>
> Hi Vincent,
>
> Apologies for the slow turn around on this. We have tried both approaches to fixing the bug now. In both cases for a particularly long duration CPU intensive workload we are seeing ~33% slowdown.
This was somewhat expected: the unused cfs_rq are no longer removed,
but at least the list is correctly ordered with my patch.
The official version of this patch is here: https://lkml.org/lkml/2019/2/4/121
Then, more patches have been queued that remove unused cfs_rq and
keep the list correctly ordered: https://lkml.org/lkml/2019/2/6/499
With these 3 patches, the slowdown should disappear and the list
ordering will stay correct.
Regards,
Vincent
>
> -- Gabriel
>
> On Fri, Jan 25, 2019 at 6:31 AM Vincent Guittot <[email protected]> wrote:
>>
>> Hi Sargun,
>>
>> On Mon, 21 Jan 2019 at 15:46, Vincent Guittot
>> <[email protected]> wrote:
>> >
>> > Hi Sargun,
>> >
>> > Le Friday 18 Jan 2019 à 15:06:28 (+0100), Vincent Guittot a écrit :
>> > > On Fri, 18 Jan 2019 at 11:16, Vincent Guittot
>> > > <[email protected]> wrote:
>> > > >
>> > > > On Wed, 9 Jan 2019 at 23:43, Sargun Dhillon <[email protected]> wrote:
>> > > > >
>> > > > > On Wed, Jan 9, 2019 at 2:14 PM Sargun Dhillon <[email protected]> wrote:
>> > > > > >
>> > > > > > I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix
>> > > > > > infinite loop in update_blocked_averages() by reverting a9e7f6544b9c
>> > > > > > and put it on top of 4.19.13. In addition to this, I uninlined
>> > > > > > list_add_leaf_cfs_rq for debugging.
>> > >
>> > > With the fix above applied, the code that manages the leaf_cfs_rq_list
>> > > is the same since v4.9.
>> > > Have you noticed similar problem on other older kernel version between
>> > > v4.9 and v4.19 ? The problem might have been introduce while modifying
>> > > other part of the scheduler like the sequence for adding/removing
>> > > cgroup.
>> > >
>> > > Knowing the most recent kernel version without the problem could help
>> > > to narrow the problem
>> > >
>> > > Thanks,
>> > > Vincent
>> > >
>> > > > > >
>> > > > > > This revealed a new bug that we didn't get to because we kept getting
>> > > > > > crashes from the previous issue. When we are running with cgroups that
>> > > > > > are rapidly changing, with CFS bandwidth control, and in addition
>> > > > > > using the cpusets cgroup, we see this crash. Specifically, it seems to
>> > > > > > occur with cgroups that are throttled and we change the allowed
>> > > > > > cpuset.
>> > > >
>> > > > Thanks for the context, I will try to reproduce the problem and
>> > > > understand how we can stop in the middle of walking to the
>> > > > sched_entity branch with a parent not already added
>> > > >
>> > > > How many cgroup level have you got in you setup ?
>> > > >
>> > > > > >
>> > > > >
>> > > > > This patch from Gabriel should fix the problem:
>> > > > >
>> > > > >
>> > > > > [PATCH] sched/fair: Reset tmp_alone_branch on cfs_rq delete
>> > > > >
>> > > > > When a child cfs_rq is added to the leaf cfs_rq list before its parent
>> > > > > tmp_alone_branch is set to point to the child in preparation for the
>> > > > > parent being added.
>> > > > >
>> > > > > If the child is deleted before the parent is added then tmp_alone_branch
>> > > > > points to a freed cfs_rq. Any future reference to tmp_alone_branch will
>> > > > > result in a use after free.
>> > > >
>> > > > So, the patch below is a temporary fix that helps to recover from the
>> > > > situation where tmp_alone_branch doesn't finished back to
>> > > > rq->leaf_cfs_rq_list
>> > > > But this situation should not happened at the beginning
>> >
>> > I have been able to reproduce the situation where tmp_alone_branch doesn't
>> > point to rq->leaf_cfs_rq_list after enqueuing a task.
>> >
>> > Can you try the patch below which ensures all cfs_rq of a cgroup branch will
>> > be added in the list even if throttled ?
>>
>> Did you get a chance to test this patch ?
>>
>> Regards,
>> Vincent
>>
>> >
>> > The algorithm used to order cfs_rq in rq->leaf_cfs_rq_list assumes that
>> > it will walk down to root the 1st time a cfs_rq is used and we will finished
>> > to add either a cfs_rq without parent or a cfs_rq with a parent that is already
>> > on the list. But this is not always true in presence of throttling.
>> > Because a cfs_rq can be throttled even if it has never been used but other CPUS
>> > of the cgroup have already used all the bandwdith, we are not sure to go down to
>> > the root and add all cfs_rq in the list.
>> >
>> > Ensure that all cfs_rq will be added in the list even if they are throttled.
>> >
>> > Signed-off-by: Vincent Guittot <[email protected]>
>> > ---
>> > kernel/sched/fair.c | 17 +++++++++++++++++
>> > 1 file changed, 17 insertions(+)
>> >
>> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> > index 6483834..ae468ab 100644
>> > --- a/kernel/sched/fair.c
>> > +++ b/kernel/sched/fair.c
>> > @@ -352,6 +352,20 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
>> > }
>> > }
>> >
>> > +static inline void list_add_branch_cfs_rq(struct sched_entity *se, struct rq *rq)
>> > +{
>> > +struct cfs_rq *cfs_rq;
>> > +
>> > + for_each_sched_entity(se) {
>> > + cfs_rq = cfs_rq_of(se);
>> > + list_add_leaf_cfs_rq(cfs_rq);
>> > +
>> > + /* If parent is already in the list, we can stop */
>> > + if (rq->tmp_alone_branch == &rq->leaf_cfs_rq_list)
>> > + break;
>> > + }
>> > +}
>> > +
>> > /* Iterate through all leaf cfs_rq's on a runqueue: */
>> > #define for_each_leaf_cfs_rq(rq, cfs_rq) \
>> > list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
>> > @@ -5177,6 +5191,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>> >
>> > }
>> >
>> > + /* Ensure that all cfs_rq have been added to the list */
>> > + list_add_branch_cfs_rq(se, rq);
>> > +
>> > hrtick_update(rq);
>> > }
>> >
>> >
>> >
>> > > >
>> > > >
>> > > > >
>> > > > > Signed-off-by: Gabriel Hartmann <[email protected]>
>> > > > > Reported-by: Sargun Dhillon <[email protected]>
>> > > > > ---
>> > > > > kernel/sched/fair.c | 5 +++++
>> > > > > 1 file changed, 5 insertions(+)
>> > > > >
>> > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> > > > > index 7137bc343b4a..0987629cbb76 100644
>> > > > > --- a/kernel/sched/fair.c
>> > > > > +++ b/kernel/sched/fair.c
>> > > > > @@ -347,6 +347,11 @@ static inline void list_add_leaf_cfs_rq(struct
>> > > > > cfs_rq *cfs_rq)
>> > > > > static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
>> > > > > {
>> > > > > if (cfs_rq->on_list) {
>> > > > > + struct rq *rq = rq_of(cfs_rq);
>> > > > > +
>> > > > > + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
>> > > > > + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
>> > > > > +
>> > > > > list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
>> > > > > cfs_rq->on_list = 0;
>> > > > > }