From: Zhaoyang Huang <[email protected]>
cfs task's memstall time is counted as simple as exit - entry so far, which
ignore the preempted time by rt, dl and irq time. Eliminating them by calc the
time growth via the proportion of cfs_rq's utilization on the whole rq.
eg.
Here is the scenario which this commit want to fix, that is the rt and irq consume
some utilization of the whole rq. This scenario could be typical in a core
which is assigned to deal with all irqs. Furthermore, the rt task used to run on
little core under EAS.
Binder:305_3-314 [002] d..1 257.880195: psi_memtime_fixup: original:30616,adjusted:25951,se:89,cfs:353,rt:139,dl:0,irq:18
droid.phone-1525 [001] d..1 265.145492: psi_memtime_fixup: original:61616,adjusted:53492,se:55,cfs:225,rt:121,dl:0,irq:15
Signed-off-by: Zhaoyang Huang <[email protected]>
---
kernel/sched/psi.c | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index cc25a3c..754a836 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -182,6 +182,8 @@ struct psi_group psi_system = {
static void psi_avgs_work(struct work_struct *work);
+static unsigned long psi_memtime_fixup(u32 growth);
+
static void group_init(struct psi_group *group)
{
int cpu;
@@ -492,6 +494,21 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
return growth;
}
+static unsigned long psi_memtime_fixup(u32 growth)
+{
+ struct rq *rq = task_rq(current);
+ unsigned long growth_fixed = (unsigned long)growth;
+
+ if (!(current->policy == SCHED_NORMAL || current->policy == SCHED_BATCH))
+ return growth_fixed;
+
+ if (current->in_memstall)
+ growth_fixed = div64_ul((1024 - rq->avg_rt.util_avg - rq->avg_dl.util_avg
+ - rq->avg_irq.util_avg + 1) * growth, 1024);
+
+ return growth_fixed;
+}
+
static void init_triggers(struct psi_group *group, u64 now)
{
struct psi_trigger *t;
@@ -658,6 +675,7 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
}
if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
+ delta = psi_memtime_fixup(delta);
groupc->times[PSI_MEM_SOME] += delta;
if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
@@ -928,8 +946,8 @@ void psi_memstall_leave(unsigned long *flags)
*/
rq = this_rq_lock_irq(&rf);
- current->in_memstall = 0;
psi_task_change(current, TSK_MEMSTALL, 0);
+ current->in_memstall = 0;
rq_unlock_irq(rq, &rf);
}
--
1.9.1
Please don't send new revisions without replying to previous feedback.
On Fri, Sep 10, 2021 at 03:54:46PM +0800, Huangzhaoyang wrote:
> From: Zhaoyang Huang <[email protected]>
>
> cfs task's memstall time is counted as simple as exit - entry so far, which
> ignore the preempted time by rt, dl and irq time. Eliminating them by calc the
> time growth via the proportion of cfs_rq's utilization on the whole rq.
>
> eg.
> Here is the scenario which this commit want to fix, that is the rt and irq consume
> some utilization of the whole rq. This scenario could be typical in a core
> which is assigned to deal with all irqs. Furthermore, the rt task used to run on
> little core under EAS.
>
> Binder:305_3-314 [002] d..1 257.880195: psi_memtime_fixup: original:30616,adjusted:25951,se:89,cfs:353,rt:139,dl:0,irq:18
> droid.phone-1525 [001] d..1 265.145492: psi_memtime_fixup: original:61616,adjusted:53492,se:55,cfs:225,rt:121,dl:0,irq:15
>
> Signed-off-by: Zhaoyang Huang <[email protected]>
> ---
> kernel/sched/psi.c | 20 +++++++++++++++++++-
> 1 file changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index cc25a3c..754a836 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -182,6 +182,8 @@ struct psi_group psi_system = {
>
> static void psi_avgs_work(struct work_struct *work);
>
> +static unsigned long psi_memtime_fixup(u32 growth);
> +
> static void group_init(struct psi_group *group)
> {
> int cpu;
> @@ -492,6 +494,21 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
> return growth;
> }
>
> +static unsigned long psi_memtime_fixup(u32 growth)
> +{
> + struct rq *rq = task_rq(current);
> + unsigned long growth_fixed = (unsigned long)growth;
> +
> + if (!(current->policy == SCHED_NORMAL || current->policy == SCHED_BATCH))
> + return growth_fixed;
> +
> + if (current->in_memstall)
> + growth_fixed = div64_ul((1024 - rq->avg_rt.util_avg - rq->avg_dl.util_avg
> + - rq->avg_irq.util_avg + 1) * growth, 1024);
> +
> + return growth_fixed;
> +}
> +
> static void init_triggers(struct psi_group *group, u64 now)
> {
> struct psi_trigger *t;
> @@ -658,6 +675,7 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
> }
>
> if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
> + delta = psi_memtime_fixup(delta);
> groupc->times[PSI_MEM_SOME] += delta;
> if (groupc->state_mask & (1 << PSI_MEM_FULL))
> groupc->times[PSI_MEM_FULL] += delta;
> @@ -928,8 +946,8 @@ void psi_memstall_leave(unsigned long *flags)
> */
> rq = this_rq_lock_irq(&rf);
>
> - current->in_memstall = 0;
> psi_task_change(current, TSK_MEMSTALL, 0);
> + current->in_memstall = 0;
>
> rq_unlock_irq(rq, &rf);
> }
> --
> 1.9.1
>
Sorry for the confusion. Actually, it is a new design which has taken
previous advice into consideration. I agree that part of the off cpu
time among the memstall time should be counted in, whereas, the
preempted time by high priority tasks should NOT be, including
RT,DEADLINE,IRQS time on cfs task.
On Fri, Sep 10, 2021 at 9:16 PM Johannes Weiner <[email protected]> wrote:
>
> Please don't send new revisions without replying to previous feedback.
>
> On Fri, Sep 10, 2021 at 03:54:46PM +0800, Huangzhaoyang wrote:
> > From: Zhaoyang Huang <[email protected]>
> >
> > cfs task's memstall time is counted as simple as exit - entry so far, which
> > ignore the preempted time by rt, dl and irq time. Eliminating them by calc the
> > time growth via the proportion of cfs_rq's utilization on the whole rq.
> >
> > eg.
> > Here is the scenario which this commit want to fix, that is the rt and irq consume
> > some utilization of the whole rq. This scenario could be typical in a core
> > which is assigned to deal with all irqs. Furthermore, the rt task used to run on
> > little core under EAS.
> >
> > Binder:305_3-314 [002] d..1 257.880195: psi_memtime_fixup: original:30616,adjusted:25951,se:89,cfs:353,rt:139,dl:0,irq:18
> > droid.phone-1525 [001] d..1 265.145492: psi_memtime_fixup: original:61616,adjusted:53492,se:55,cfs:225,rt:121,dl:0,irq:15
> >
> > Signed-off-by: Zhaoyang Huang <[email protected]>
> > ---
> > kernel/sched/psi.c | 20 +++++++++++++++++++-
> > 1 file changed, 19 insertions(+), 1 deletion(-)
> >
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index cc25a3c..754a836 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -182,6 +182,8 @@ struct psi_group psi_system = {
> >
> > static void psi_avgs_work(struct work_struct *work);
> >
> > +static unsigned long psi_memtime_fixup(u32 growth);
> > +
> > static void group_init(struct psi_group *group)
> > {
> > int cpu;
> > @@ -492,6 +494,21 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
> > return growth;
> > }
> >
> > +static unsigned long psi_memtime_fixup(u32 growth)
> > +{
> > + struct rq *rq = task_rq(current);
> > + unsigned long growth_fixed = (unsigned long)growth;
> > +
> > + if (!(current->policy == SCHED_NORMAL || current->policy == SCHED_BATCH))
> > + return growth_fixed;
> > +
> > + if (current->in_memstall)
> > + growth_fixed = div64_ul((1024 - rq->avg_rt.util_avg - rq->avg_dl.util_avg
> > + - rq->avg_irq.util_avg + 1) * growth, 1024);
> > +
> > + return growth_fixed;
> > +}
> > +
> > static void init_triggers(struct psi_group *group, u64 now)
> > {
> > struct psi_trigger *t;
> > @@ -658,6 +675,7 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
> > }
> >
> > if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
> > + delta = psi_memtime_fixup(delta);
> > groupc->times[PSI_MEM_SOME] += delta;
> > if (groupc->state_mask & (1 << PSI_MEM_FULL))
> > groupc->times[PSI_MEM_FULL] += delta;
> > @@ -928,8 +946,8 @@ void psi_memstall_leave(unsigned long *flags)
> > */
> > rq = this_rq_lock_irq(&rf);
> >
> > - current->in_memstall = 0;
> > psi_task_change(current, TSK_MEMSTALL, 0);
> > + current->in_memstall = 0;
> >
> > rq_unlock_irq(rq, &rf);
> > }
> > --
> > 1.9.1
> >
Any comments on this patch?
On Mon, Sep 13, 2021 at 9:11 AM Zhaoyang Huang <[email protected]> wrote:
>
> Sorry for the confusion. Actually, it is a new design which has taken
> previous advice into consideration. I agree that part of the off cpu
> time among the memstall time should be counted in, whereas, the
> preempted time by high priority tasks should NOT be, including
> RT,DEADLINE,IRQS time on cfs task.
>
> On Fri, Sep 10, 2021 at 9:16 PM Johannes Weiner <[email protected]> wrote:
> >
> > Please don't send new revisions without replying to previous feedback.
> >
> > On Fri, Sep 10, 2021 at 03:54:46PM +0800, Huangzhaoyang wrote:
> > > From: Zhaoyang Huang <[email protected]>
> > >
> > > cfs task's memstall time is counted as simple as exit - entry so far, which
> > > ignore the preempted time by rt, dl and irq time. Eliminating them by calc the
> > > time growth via the proportion of cfs_rq's utilization on the whole rq.
> > >
> > > eg.
> > > Here is the scenario which this commit want to fix, that is the rt and irq consume
> > > some utilization of the whole rq. This scenario could be typical in a core
> > > which is assigned to deal with all irqs. Furthermore, the rt task used to run on
> > > little core under EAS.
> > >
> > > Binder:305_3-314 [002] d..1 257.880195: psi_memtime_fixup: original:30616,adjusted:25951,se:89,cfs:353,rt:139,dl:0,irq:18
> > > droid.phone-1525 [001] d..1 265.145492: psi_memtime_fixup: original:61616,adjusted:53492,se:55,cfs:225,rt:121,dl:0,irq:15
> > >
> > > Signed-off-by: Zhaoyang Huang <[email protected]>
> > > ---
> > > kernel/sched/psi.c | 20 +++++++++++++++++++-
> > > 1 file changed, 19 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > index cc25a3c..754a836 100644
> > > --- a/kernel/sched/psi.c
> > > +++ b/kernel/sched/psi.c
> > > @@ -182,6 +182,8 @@ struct psi_group psi_system = {
> > >
> > > static void psi_avgs_work(struct work_struct *work);
> > >
> > > +static unsigned long psi_memtime_fixup(u32 growth);
> > > +
> > > static void group_init(struct psi_group *group)
> > > {
> > > int cpu;
> > > @@ -492,6 +494,21 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
> > > return growth;
> > > }
> > >
> > > +static unsigned long psi_memtime_fixup(u32 growth)
> > > +{
> > > + struct rq *rq = task_rq(current);
> > > + unsigned long growth_fixed = (unsigned long)growth;
> > > +
> > > + if (!(current->policy == SCHED_NORMAL || current->policy == SCHED_BATCH))
> > > + return growth_fixed;
> > > +
> > > + if (current->in_memstall)
> > > + growth_fixed = div64_ul((1024 - rq->avg_rt.util_avg - rq->avg_dl.util_avg
> > > + - rq->avg_irq.util_avg + 1) * growth, 1024);
> > > +
> > > + return growth_fixed;
> > > +}
> > > +
> > > static void init_triggers(struct psi_group *group, u64 now)
> > > {
> > > struct psi_trigger *t;
> > > @@ -658,6 +675,7 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
> > > }
> > >
> > > if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
> > > + delta = psi_memtime_fixup(delta);
> > > groupc->times[PSI_MEM_SOME] += delta;
> > > if (groupc->state_mask & (1 << PSI_MEM_FULL))
> > > groupc->times[PSI_MEM_FULL] += delta;
> > > @@ -928,8 +946,8 @@ void psi_memstall_leave(unsigned long *flags)
> > > */
> > > rq = this_rq_lock_irq(&rf);
> > >
> > > - current->in_memstall = 0;
> > > psi_task_change(current, TSK_MEMSTALL, 0);
> > > + current->in_memstall = 0;
> > >
> > > rq_unlock_irq(rq, &rf);
> > > }
> > > --
> > > 1.9.1
> > >