The commit 36b238d57172 ("psi: Optimize switching tasks inside shared
cgroups") only update cgroups whose state actually changes during a
task switch only in task preempt case, not in task sleep case.
We actually don't need to clear and set TSK_ONCPU state for common cgroups
of next and prev task in sleep case, that can save many psi_group_change
especially when most activity comes from one leaf cgroup.
Signed-off-by: Muchun Song <[email protected]>
Signed-off-by: Chengming Zhou <[email protected]>
---
kernel/sched/psi.c | 27 +++++++++++++++++----------
kernel/sched/stats.h | 17 +++--------------
2 files changed, 20 insertions(+), 24 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 6e46d9eb279b..6061e87089ac 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -836,20 +836,27 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
}
- /*
- * If this is a voluntary sleep, dequeue will have taken care
- * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
- * only need to deal with it during preemption.
- */
- if (sleep)
- return;
-
if (prev->pid) {
- psi_flags_change(prev, TSK_ONCPU, 0);
+ int clear = 0, set = 0;
+
+ if (sleep) {
+ clear |= TSK_RUNNING;
+ if (prev->in_iowait)
+ set |= TSK_IOWAIT;
+ }
+
+ psi_flags_change(prev, clear | TSK_ONCPU, set);
iter = NULL;
while ((group = iterate_groups(prev, &iter)) && group != common)
- psi_group_change(group, cpu, TSK_ONCPU, 0, true);
+ psi_group_change(group, cpu, clear | TSK_ONCPU, set, true);
+
+ if (sleep) {
+ while (group) {
+ psi_group_change(group, cpu, clear, set, true);
+ group = iterate_groups(prev, &iter);
+ }
+ }
}
}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 9e4e67a94731..2d92c8467678 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -84,28 +84,17 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
static inline void psi_dequeue(struct task_struct *p, bool sleep)
{
- int clear = TSK_RUNNING, set = 0;
-
if (static_branch_likely(&psi_disabled))
return;
if (!sleep) {
+ int clear = TSK_RUNNING;
+
if (p->in_memstall)
clear |= TSK_MEMSTALL;
- } else {
- /*
- * When a task sleeps, schedule() dequeues it before
- * switching to the next one. Merge the clearing of
- * TSK_RUNNING and TSK_ONCPU to save an unnecessary
- * psi_task_change() call in psi_sched_switch().
- */
- clear |= TSK_ONCPU;
- if (p->in_iowait)
- set |= TSK_IOWAIT;
+ psi_task_change(p, clear, 0);
}
-
- psi_task_change(p, clear, set);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
--
2.11.0
Hello Chengming,
This patch looks useful to me. A couple of comments below:
On Tue, Feb 09, 2021 at 03:14:13PM +0800, Chengming Zhou wrote:
> The commit 36b238d57172 ("psi: Optimize switching tasks inside shared
> cgroups") only update cgroups whose state actually changes during a
> task switch only in task preempt case, not in task sleep case.
>
> We actually don't need to clear and set TSK_ONCPU state for common cgroups
> of next and prev task in sleep case, that can save many psi_group_change
> especially when most activity comes from one leaf cgroup.
Can you please make this a bit more concrete? Maybe include this:
sleep before:
psi_dequeue()
while ((group = iterate_groups(prev))) # all ancestors
psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU)
psi_task_switch()
while ((group = iterate_groups(next))) # all ancestors
psi_group_change(next, .set=TSK_ONCPU)
sleep after:
psi_dequeue()
nop
psi_task_switch()
while ((group = iterate_groups(next))) # until (prev & next)
psi_group_change(next, .set=TSK_ONCPU)
while ((group = iterate_groups(prev))) # all ancestors
psi_group_change(prev, .clear = common ? TSK_RUNNING : TSK_RUNNING|TSK_ONCPU)
When a voluntary sleep switches to another task, we remove one call of
psi_group_change() for every common cgroup ancestor of the two tasks.
> Signed-off-by: Muchun Song <[email protected]>
> Signed-off-by: Chengming Zhou <[email protected]>
> ---
> kernel/sched/psi.c | 27 +++++++++++++++++----------
> kernel/sched/stats.h | 17 +++--------------
> 2 files changed, 20 insertions(+), 24 deletions(-)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 6e46d9eb279b..6061e87089ac 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -836,20 +836,27 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> }
> }
>
> - /*
> - * If this is a voluntary sleep, dequeue will have taken care
> - * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
> - * only need to deal with it during preemption.
> - */
> - if (sleep)
> - return;
> -
> if (prev->pid) {
> - psi_flags_change(prev, TSK_ONCPU, 0);
> + int clear = 0, set = 0;
> +
> + if (sleep) {
> + clear |= TSK_RUNNING;
> + if (prev->in_iowait)
> + set |= TSK_IOWAIT;
> + }
This needs a comment why it's doing psi_dequeue()'s job. How about this?
/*
* When we're going to sleep, psi_dequeue() lets us handle
* TSK_RUNNING and TSK_IOWAIT here, where we can combine it
* with TSK_ONCPU and save walking common ancestors twice.
*/
if (sleep) {
...
> + psi_flags_change(prev, clear | TSK_ONCPU, set);
>
> iter = NULL;
> while ((group = iterate_groups(prev, &iter)) && group != common)
> - psi_group_change(group, cpu, TSK_ONCPU, 0, true);
> + psi_group_change(group, cpu, clear | TSK_ONCPU, set, true);
> +
> + if (sleep) {
> + while (group) {
> + psi_group_change(group, cpu, clear, set, true);
> + group = iterate_groups(prev, &iter);
> + }
> + }
This function is *primarily* about handling TSK_ONCPU and secondarily
optimizes the dequeue. It would be a bit clearer to do:
int clear = TSK_ONCPU, set = 0;
...
/*
* TSK_ONCPU is handled up to the common ancestor. If we're tasked
* with dequeuing too, finish that for the rest of the hierarchy.
*/
if (sleep) {
clear &= TSK_ONCPU;
for (; group; group = iterate_groups(prev, &iter))
psi_group_change(group, cpu, clear, set, true);
}
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 9e4e67a94731..2d92c8467678 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -84,28 +84,17 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
>
> static inline void psi_dequeue(struct task_struct *p, bool sleep)
> {
> - int clear = TSK_RUNNING, set = 0;
> -
> if (static_branch_likely(&psi_disabled))
> return;
>
> if (!sleep) {
> + int clear = TSK_RUNNING;
> +
> if (p->in_memstall)
> clear |= TSK_MEMSTALL;
> - } else {
> - /*
> - * When a task sleeps, schedule() dequeues it before
> - * switching to the next one. Merge the clearing of
> - * TSK_RUNNING and TSK_ONCPU to save an unnecessary
> - * psi_task_change() call in psi_sched_switch().
> - */
> - clear |= TSK_ONCPU;
>
> - if (p->in_iowait)
> - set |= TSK_IOWAIT;
> + psi_task_change(p, clear, 0);
> }
Likewise, this really should have a comment for why it's not handling
TSK_RUNNING to match psi_enqueue()!
int clear = TSK_RUNNING;
/*
* A voluntary sleep is a dequeue followed by a task switch. To
* avoid walking all ancestors twice, psi_task_switch() handles
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
*/
if (sleep)
return;
if (p->in_memstall)
clear |= TSK_MEMSTALL;
psi_task_change(p, clear, 0);
Thanks
Hello Johannes,
在 2021/2/17 上午4:00, Johannes Weiner 写道:
> Hello Chengming,
>
> This patch looks useful to me. A couple of comments below:
>
> On Tue, Feb 09, 2021 at 03:14:13PM +0800, Chengming Zhou wrote:
>> The commit 36b238d57172 ("psi: Optimize switching tasks inside shared
>> cgroups") only update cgroups whose state actually changes during a
>> task switch only in task preempt case, not in task sleep case.
>>
>> We actually don't need to clear and set TSK_ONCPU state for common cgroups
>> of next and prev task in sleep case, that can save many psi_group_change
>> especially when most activity comes from one leaf cgroup.
> Can you please make this a bit more concrete? Maybe include this:
>
> sleep before:
> psi_dequeue()
> while ((group = iterate_groups(prev))) # all ancestors
> psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU)
> psi_task_switch()
> while ((group = iterate_groups(next))) # all ancestors
> psi_group_change(next, .set=TSK_ONCPU)
>
> sleep after:
> psi_dequeue()
> nop
> psi_task_switch()
> while ((group = iterate_groups(next))) # until (prev & next)
> psi_group_change(next, .set=TSK_ONCPU)
> while ((group = iterate_groups(prev))) # all ancestors
> psi_group_change(prev, .clear = common ? TSK_RUNNING : TSK_RUNNING|TSK_ONCPU)
>
> When a voluntary sleep switches to another task, we remove one call of
> psi_group_change() for every common cgroup ancestor of the two tasks.
Thank you for the very beautiful and detailed comments, must be included : )
>> Signed-off-by: Muchun Song <[email protected]>
>> Signed-off-by: Chengming Zhou <[email protected]>
>> ---
>> kernel/sched/psi.c | 27 +++++++++++++++++----------
>> kernel/sched/stats.h | 17 +++--------------
>> 2 files changed, 20 insertions(+), 24 deletions(-)
>>
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index 6e46d9eb279b..6061e87089ac 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -836,20 +836,27 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>> }
>> }
>>
>> - /*
>> - * If this is a voluntary sleep, dequeue will have taken care
>> - * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
>> - * only need to deal with it during preemption.
>> - */
>> - if (sleep)
>> - return;
>> -
>> if (prev->pid) {
>> - psi_flags_change(prev, TSK_ONCPU, 0);
>> + int clear = 0, set = 0;
>> +
>> + if (sleep) {
>> + clear |= TSK_RUNNING;
>> + if (prev->in_iowait)
>> + set |= TSK_IOWAIT;
>> + }
> This needs a comment why it's doing psi_dequeue()'s job. How about this?
>
> /*
> * When we're going to sleep, psi_dequeue() lets us handle
> * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
> * with TSK_ONCPU and save walking common ancestors twice.
> */
> if (sleep) {
> ...
Make sense, will be added.
>> + psi_flags_change(prev, clear | TSK_ONCPU, set);
>>
>> iter = NULL;
>> while ((group = iterate_groups(prev, &iter)) && group != common)
>> - psi_group_change(group, cpu, TSK_ONCPU, 0, true);
>> + psi_group_change(group, cpu, clear | TSK_ONCPU, set, true);
>> +
>> + if (sleep) {
>> + while (group) {
>> + psi_group_change(group, cpu, clear, set, true);
>> + group = iterate_groups(prev, &iter);
>> + }
>> + }
> This function is *primarily* about handling TSK_ONCPU and secondarily
> optimizes the dequeue. It would be a bit clearer to do:
>
> int clear = TSK_ONCPU, set = 0;
>
> ...
>
> /*
> * TSK_ONCPU is handled up to the common ancestor. If we're tasked
> * with dequeuing too, finish that for the rest of the hierarchy.
> */
> if (sleep) {
> clear &= TSK_ONCPU;
> for (; group; group = iterate_groups(prev, &iter))
> psi_group_change(group, cpu, clear, set, true);
> }
>
Make sense, I will modify the patch and send a patch-v2.
BTW there is a typo above: clear &= ~TSK_ONCPU;
Thanks.
>> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
>> index 9e4e67a94731..2d92c8467678 100644
>> --- a/kernel/sched/stats.h
>> +++ b/kernel/sched/stats.h
>> @@ -84,28 +84,17 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
>>
>> static inline void psi_dequeue(struct task_struct *p, bool sleep)
>> {
>> - int clear = TSK_RUNNING, set = 0;
>> -
>> if (static_branch_likely(&psi_disabled))
>> return;
>>
>> if (!sleep) {
>> + int clear = TSK_RUNNING;
>> +
>> if (p->in_memstall)
>> clear |= TSK_MEMSTALL;
>> - } else {
>> - /*
>> - * When a task sleeps, schedule() dequeues it before
>> - * switching to the next one. Merge the clearing of
>> - * TSK_RUNNING and TSK_ONCPU to save an unnecessary
>> - * psi_task_change() call in psi_sched_switch().
>> - */
>> - clear |= TSK_ONCPU;
>>
>> - if (p->in_iowait)
>> - set |= TSK_IOWAIT;
>> + psi_task_change(p, clear, 0);
>> }
> Likewise, this really should have a comment for why it's not handling
> TSK_RUNNING to match psi_enqueue()!
>
> int clear = TSK_RUNNING;
>
> /*
> * A voluntary sleep is a dequeue followed by a task switch. To
> * avoid walking all ancestors twice, psi_task_switch() handles
> * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
> * Do nothing here.
> */
> if (sleep)
> return;
>
> if (p->in_memstall)
> clear |= TSK_MEMSTALL;
>
> psi_task_change(p, clear, 0);
>
> Thanks