2021-02-07 11:58:58

by Chengming Zhou

Subject: [PATCH] psi: Remove the redundant psi_task_tick

From: zhouchengming <[email protected]>

When the current task in a cgroup is in_memstall, the corresponding psi_group
is in PSI_MEM_FULL state, so we can remove the redundant psi_task_tick
from scheduler_tick to save this periodic cost.

Signed-off-by: Chengming Zhou <[email protected]>
---
 include/linux/psi.h  |  1 -
 kernel/sched/core.c  |  1 -
 kernel/sched/psi.c   | 40 ++++------------------------------------
 kernel/sched/stats.h |  9 ---------
 4 files changed, 4 insertions(+), 47 deletions(-)

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7361023f3fdd..65eb1476ac70 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep);

-void psi_memstall_tick(struct task_struct *task, int cpu);
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 15d2562118d1..31788a9b335b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4533,7 +4533,6 @@ void scheduler_tick(void)
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
 	calc_global_load_tick(rq);
-	psi_task_tick(rq);

 	rq_unlock(rq, &rf);

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8f90be7b10d4..1fdd6f73c62b 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -225,7 +225,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	case PSI_MEM_SOME:
 		return tasks[NR_MEMSTALL];
 	case PSI_MEM_FULL:
-		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
+		return (tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]) ||
+			(tasks[NR_ONCPU] && current->in_memstall);
 	case PSI_CPU_SOME:
 		return tasks[NR_RUNNING] > tasks[NR_ONCPU];
 	case PSI_CPU_FULL:
@@ -644,8 +645,7 @@ static void poll_timer_fn(struct timer_list *t)
 	wake_up_interruptible(&group->poll_wait);
 }

-static void record_times(struct psi_group_cpu *groupc, int cpu,
-			 bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, int cpu)
 {
 	u32 delta;
 	u64 now;
@@ -664,23 +664,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 			groupc->times[PSI_MEM_SOME] += delta;
 		if (groupc->state_mask & (1 << PSI_MEM_FULL))
 			groupc->times[PSI_MEM_FULL] += delta;
-		else if (memstall_tick) {
-			u32 sample;
-			/*
-			 * Since we care about lost potential, a
-			 * memstall is FULL when there are no other
-			 * working tasks, but also when the CPU is
-			 * actively reclaiming and nothing productive
-			 * could run even if it were runnable.
-			 *
-			 * When the timer tick sees a reclaiming CPU,
-			 * regardless of runnable tasks, sample a FULL
-			 * tick (or less if it hasn't been a full tick
-			 * since the last state change).
-			 */
-			sample = min(delta, (u32)jiffies_to_nsecs(1));
-			groupc->times[PSI_MEM_FULL] += sample;
-		}
 	}

 	if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
@@ -714,7 +697,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 */
 	write_seqcount_begin(&groupc->seq);

-	record_times(groupc, cpu, false);
+	record_times(groupc, cpu);

 	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
 		if (!(m & (1 << t)))
@@ -859,21 +842,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	}
 }

-void psi_memstall_tick(struct task_struct *task, int cpu)
-{
-	struct psi_group *group;
-	void *iter = NULL;
-
-	while ((group = iterate_groups(task, &iter))) {
-		struct psi_group_cpu *groupc;
-
-		groupc = per_cpu_ptr(group->pcpu, cpu);
-		write_seqcount_begin(&groupc->seq);
-		record_times(groupc, cpu, true);
-		write_seqcount_end(&groupc->seq);
-	}
-}
-
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
  * @flags: flags to handle nested sections
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 33d0daf83842..9e4e67a94731 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -144,14 +144,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
 	psi_task_switch(prev, next, sleep);
 }

-static inline void psi_task_tick(struct rq *rq)
-{
-	if (static_branch_likely(&psi_disabled))
-		return;
-
-	if (unlikely(rq->curr->in_memstall))
-		psi_memstall_tick(rq->curr, cpu_of(rq));
-}
 #else /* CONFIG_PSI */
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
@@ -159,7 +151,6 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
 				    bool sleep) {}
-static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */

 #ifdef CONFIG_SCHED_INFO
--
2.11.0


2021-02-08 18:46:12

by Johannes Weiner

Subject: Re: [PATCH] psi: Remove the redundant psi_task_tick

On Sun, Feb 07, 2021 at 07:56:42PM +0800, Chengming Zhou wrote:
> From: zhouchengming <[email protected]>
>
> When the current task in a cgroup is in_memstall, the corresponding psi_group
> is in PSI_MEM_FULL state

This is correct.

> so we can remove the redundant psi_task_tick from scheduler_tick to
> save this periodic cost.

But this isn't. It IS the task tick that makes it so. The state change
tracking counts MEMSTALL tasks and ONCPU tasks, but it doesn't check
whether the ONCPU task is the MEMSTALL one.
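
To make that concrete, here is a minimal userspace model of the patched
check (my sketch, not kernel code: current_task and mem_full() stand in
for current and the PSI_MEM_FULL case of test_state()). A state change
driven from another context consults the wrong task's in_memstall flag:

#include <stdbool.h>
#include <stdio.h>

enum { NR_RUNNING, NR_ONCPU, NR_MEMSTALL, NR_PSI_TASK_COUNTS };

struct task {
	const char *comm;
	bool in_memstall;
};

/* Stand-in for "current": the task driving this state change. */
static struct task *current_task;

/* The patched PSI_MEM_FULL test from the diff. */
static bool mem_full(unsigned int *tasks)
{
	return (tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]) ||
	       (tasks[NR_ONCPU] && current_task->in_memstall);
}

int main(void)
{
	unsigned int tasks[NR_PSI_TASK_COUNTS] = { 1, 1, 1 };
	struct task reclaimer = { "reclaimer", true };
	struct task waker = { "waker", false };

	/* State change driven by the reclaimer itself: looks right. */
	current_task = &reclaimer;
	printf("local change:  FULL=%d (expect 1)\n", mem_full(tasks));

	/*
	 * A remote wakeup adds a runnable task to this CPU's bucket,
	 * but the re-evaluation runs in the waker's context, so the
	 * check consults the wrong task while the reclaimer is ONCPU.
	 */
	tasks[NR_RUNNING] = 2;
	current_task = &waker;
	printf("remote wakeup: FULL=%d (reclaim is still running)\n",
	       mem_full(tasks));
	return 0;
}

Run, this prints FULL=1 for the local change but FULL=0 after the
remote wakeup, even though the reclaimer never left the CPU.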

It would be possible to incorporate that into psi_task_switch(), but
you have to be careful with the branches because that path is often
hotter than the timer tick.
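
For illustration, one shape that could take (purely a sketch with
hypothetical names like oncpu_memstall and switch_to_task(), not a
tested patch): record at switch time whether the task going ONCPU is
in memstall, so the state test never has to peek at current:

#include <stdbool.h>
#include <stdio.h>

enum { NR_RUNNING, NR_ONCPU, NR_MEMSTALL, NR_PSI_TASK_COUNTS };

struct task {
	const char *comm;
	bool in_memstall;
};

struct group_cpu {
	unsigned int tasks[NR_PSI_TASK_COUNTS];
	bool oncpu_memstall;	/* set at switch time, not sampled */
};

static bool mem_full(struct group_cpu *gc)
{
	return (gc->tasks[NR_MEMSTALL] && !gc->tasks[NR_RUNNING]) ||
	       (gc->tasks[NR_ONCPU] && gc->oncpu_memstall);
}

/* Hypothetical hook in the psi_task_switch() path. */
static void switch_to_task(struct group_cpu *gc, struct task *next)
{
	gc->oncpu_memstall = next->in_memstall;
}

int main(void)
{
	struct group_cpu gc = { .tasks = { 1, 1, 1 } };
	struct task reclaimer = { "reclaimer", true };

	switch_to_task(&gc, &reclaimer);

	/* A remote wakeup no longer flips FULL off the reclaiming CPU. */
	gc.tasks[NR_RUNNING] = 2;
	printf("FULL=%d (expect 1)\n", mem_full(&gc));
	return 0;
}

The cost is an extra store on every context switch, which is where the
branch-placement concern above comes in.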

This patch by itself breaks page reclaim detection, so NAK.

2021-02-09 02:23:26

by Chengming Zhou

Subject: Re: [External] Re: [PATCH] psi: Remove the redundant psi_task_tick

On 2021/2/9 12:45 AM, Johannes Weiner wrote:

> On Sun, Feb 07, 2021 at 07:56:42PM +0800, Chengming Zhou wrote:
>> From: zhouchengming <[email protected]>
>>
>> When the current task in a cgroup is in_memstall, the corresponding psi_group
>> is in PSI_MEM_FULL state
> This is correct.
>
>> so we can remove the redundant psi_task_tick from scheduler_tick to
>> save this periodic cost.
> But this isn't. It IS the task tick that makes it so. The state change
> tracking counts MEMSTALL tasks and ONCPU tasks, but it doesn't check
> whether the ONCPU task is the MEMSTALL one.

Yes, you are right, I messed them up... will take a closer look at the code later.

Thanks.

> It would be possible to incorporate that into psi_task_switch(), but
> you have to be careful with the branches because that path is often
> hotter than the timer tick.
>
> This patch by itself breaks page reclaim detection, so NAK.