I recently got a bug report that
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
5.10 and 6.1. It's not a huge regression in absolute time
(~30-40ns), but it is a >10% change.
I narrowed the cause down to the addition of
psi_account_irqtime() in update_rq_clock_task(), in commit
52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
pressure")
So that explains the behavior change, but it also seems odd that
we're doing psi irq accounting from a syscall that is just
trying to read the thread's cputime.
Thinking about it more, it seems the re-use of update_rq_clock()
to handle accounting for any in-progress time for the current
task has the potential for side effects and unnecessary work.
So instead rework the logic so we calculate the current cpu
runtime in a read-only fashion.
This has the side benefit of improving
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) performance by ~12%
over the behavior in 5.10, and ~21% over the 6.1 behavior.
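(The numbers here are from a microbenchmark of
clock_gettime(CLOCK_THREAD_CPUTIME_ID). As a rough illustration only --
this is not the original reporter's workload, and the iteration count
is arbitrary -- a minimal per-call latency test would look something
like:

  #include <stdio.h>
  #include <time.h>

  int main(void)
  {
          struct timespec ts, start, end;
          const long iters = 10 * 1000 * 1000;

          /* time many back-to-back thread-cputime reads */
          clock_gettime(CLOCK_MONOTONIC, &start);
          for (long i = 0; i < iters; i++)
                  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
          clock_gettime(CLOCK_MONOTONIC, &end);

          printf("avg ns/call: %lld\n",
                 ((end.tv_sec - start.tv_sec) * 1000000000LL +
                  (end.tv_nsec - start.tv_nsec)) / iters);
          return 0;
  }
)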
NOTE: I'm not 100% sure this is correct yet. There may be some
edge cases I've overlooked, so I'd greatly appreciate any
review or feedback.
Cc: Thomas Gleixner <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Ben Segall <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Cc: Valentin Schneider <[email protected]>
Cc: Qais Yousef <[email protected]>
Cc: Joel Fernandes <[email protected]>
Cc: [email protected]
Signed-off-by: John Stultz <[email protected]>
---
kernel/sched/core.c | 82 ++++++++++++++++++++++++++-------------------
1 file changed, 47 insertions(+), 35 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4cc0522..b29cde5ded84 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,16 +692,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* RQ-clock updating methods:
*/
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
- s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
@@ -720,7 +715,45 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
*/
if (irq_delta > delta)
irq_delta = delta;
+ return irq_delta;
+}
+#else
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+ s64 steal;
+ if (!static_key_false(&paravirt_steal_rq_enabled))
+ return 0;
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+ if (unlikely(steal > delta))
+ steal = delta;
+ return steal;
+}
+#else
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+ return 0;
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+ s64 __maybe_unused steal = 0, irq_delta = 0;
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ irq_delta = get_irq_delta(rq, delta);
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
psi_account_irqtime(rq->curr, irq_delta);
@@ -728,12 +761,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
- steal -= rq->prev_steal_time_rq;
-
- if (unlikely(steal > delta))
- steal = delta;
-
+ steal = get_steal_time(rq, delta);
rq->prev_steal_time_rq += steal;
delta -= steal;
}
@@ -5547,23 +5575,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
-/*
- * The function fair_sched_class.update_curr accesses the struct curr
- * and its field curr->exec_start; when called from task_sched_runtime(),
- * we observe a high rate of cache misses in practice.
- * Prefetching this data results in improved performance.
- */
-static inline void prefetch_curr_exec_start(struct task_struct *p)
-{
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct sched_entity *curr = (&p->se)->cfs_rq->curr;
-#else
- struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
-#endif
- prefetch(curr);
- prefetch(&curr->exec_start);
-}
-
/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
@@ -5573,6 +5584,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
+ s64 delta_exec = 0;
u64 ns;
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
@@ -5598,11 +5610,11 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
- prefetch_curr_exec_start(p);
- update_rq_clock(rq);
- p->sched_class->update_curr(rq);
+ delta_exec = sched_clock_cpu(cpu_of(rq)) - p->se.exec_start;
+ delta_exec -= get_irq_delta(rq, delta_exec);
+ delta_exec -= get_steal_time(rq, delta_exec);
}
- ns = p->se.sum_exec_runtime;
+ ns = p->se.sum_exec_runtime + delta_exec;
task_rq_unlock(rq, p, &rf);
return ns;
--
2.45.2.505.gda0bf45e8d-goog
On Wed, Jun 12, 2024 at 6:58 PM John Stultz <[email protected]> wrote:
>
> I recently got a bug report that
> clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
> 5.10 and 6.1. It's not a huge regression in absolute time
> (~30-40ns), but it is a >10% change.
>
> I narrowed the cause down to the addition of
> psi_account_irqtime() in update_rq_clock_task(), in commit
> 52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
> pressure")
>
> So that explains the behavior change, but it also seems odd that
> we're doing psi irq accounting from a syscall that is just
> trying to read the thread's cputime.
>
> Thinking about it more, it seems the re-use of update_rq_clock()
> to handle accounting for any in-progress time for the current
> task has the potential for side effects and unnecessary work.
>
> So instead rework the logic so we calculate the current cpu
> runtime in a read-only fashion.
>
> This has the side benefit of improving
> clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) performance by ~12%
> over the behavior in 5.10, and ~21% over the 6.1 behavior.
>
> NOTE: I'm not 100% sure this is correct yet. There may be some
> edge cases I've overlooked, so I'd greatly appreciate any
> review or feedback.
Actually, I've definitely overlooked something as I'm seeing
inconsistencies in CLOCK_THREAD_CPUTIME_ID with testing.
I'll have to stare at this some more and resend once I've sorted it out.
Even so, I'd appreciate thoughts on the general approach of avoiding
update_rq_clock() in the clock_gettime() path.
thanks
-john
On Wed, Jun 12, 2024 at 06:58:26PM -0700, John Stultz wrote:
> I recently got a bug report that
> clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
> 5.10 and 6.1. It's not a huge regression in absolute time
> (~30-40ns), but it is a >10% change.
>
> I narrowed the cause down to the addition of
> psi_account_irqtime() in update_rq_clock_task(), in commit
> 52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
> pressure")
>
> So that explains the behavior change,
It doesn't really... that patch just feeds the irq_time we *already*
subtracted prior to it, to PSI, such that PSI can also enjoy the 'view'.
The only explanation I have is that interrupts that end up in the
scheduler (wakeups, tick, etc..) now get to do that PSI cgroup iteration
and that cost adds up to the IRQ time itself, and as such the task time
slows down accordingly.
Are you using silly deep cgroup hierarchies?
> but it also seems odd that
> we're doing psi irq accounting from a syscall that is just
> trying to read the thread's cputime.
In order to avoid doing all the accounting in the IRQ entry/exit paths,
those paths only do the bare minimum of tracking how much IRQ time there
is.
update_rq_clock_task() then looks at the increase of IRQ time and
subtracts this from the task time -- after all, all time spent in the
IRQ wasn't spent on the task itself.
It did that prior to the PSI patch, and it does so after. The only
change is it now feeds this delta into the PSI thing.
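Stripped of the config #ifdefs, that bookkeeping amounts to roughly
the following (a simplified sketch of the code quoted above, not the
literal source):

  irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
  if (irq_delta > delta)
          irq_delta = delta;      /* clamp to the wall-clock delta */
  rq->prev_irq_time += irq_delta;
  delta -= irq_delta;             /* don't charge IRQ time to the task */
  psi_account_irqtime(rq->curr, irq_delta);  /* the new consumer */
  ...
  rq->clock_task += delta;

i.e. the PSI call only consumes a delta that was already being
computed and subtracted.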
> NOTE: I'm not 100% sure this is correct yet. There may be some
> edge cases I've overlooked, so I'd greatly appreciate any
> review or feedback.
Urgh, you're sprinkling the details of what is clock_task over multiple
places.
Does something like so work?
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0935f9d4bb7b..d4b87539d72a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
- psi_account_irqtime(rq->curr, irq_delta);
delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5459,6 +5458,8 @@ void sched_tick(void)
sched_clock_tick();
+ psi_account_irqtime(curr, &rq->psi_irq_time);
+
rq_lock(rq, &rf);
update_rq_clock(rq);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 146baa91d104..57fdb0b9efbd 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -991,12 +991,13 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct task_struct *task, u64 *prev)
{
int cpu = task_cpu(task);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now;
+ u64 now, irq;
+ s64 delta;
if (static_branch_likely(&psi_disabled))
return;
@@ -1005,6 +1006,11 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
return;
now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - *prev);
+ if (delta < 0)
+ return;
+ *prev = irq;
group = task_psi_group(task);
do {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62fd8bc6fd08..a63eb546bc4a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1133,6 +1133,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index d1445410840a..1111f060264f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,7 +110,7 @@ __schedstats_from_se(struct sched_entity *se)
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
+void psi_account_irqtime(struct task_struct *task, u64 *prev);
/*
* PSI tracks state that persists across sleeps, such as iowaits and
@@ -192,7 +192,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct task_struct *task, u64 *prev) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
On 06/13/24 12:04, Peter Zijlstra wrote:
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0935f9d4bb7b..d4b87539d72a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
>
> rq->prev_irq_time += irq_delta;
> delta -= irq_delta;
> - psi_account_irqtime(rq->curr, irq_delta);
> delayacct_irq(rq->curr, irq_delta);
> #endif
> #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> @@ -5459,6 +5458,8 @@ void sched_tick(void)
>
> sched_clock_tick();
>
> + psi_account_irqtime(curr, &rq->psi_irq_time);
> +
If wakeup preemption causes a context switch, wouldn't we lose this
information then? I *think* active migration might cause this information to be
lost too.
pick_next_task() might be a better place to do the accounting?
> rq_lock(rq, &rf);
>
> update_rq_clock(rq);
On Thu, Jun 13, 2024 at 3:04 AM Peter Zijlstra <[email protected]> wrote:
> On Wed, Jun 12, 2024 at 06:58:26PM -0700, John Stultz wrote:
> > I recently got a bug report that
> > clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
> > 5.10 and 6.1. It's not a huge regression in absolute time
> > (~30-40ns), but it is a >10% change.
> >
> > I narrowed the cause down to the addition of
> > psi_account_irqtime() in update_rq_clock_task(), in commit
> > 52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
> > pressure")
> >
> > So that explains the behavior change,
>
> It doesn't really... that patch just feeds the irq_time we *already*
> subtracted prior to it, to PSI, such that PSI can also enjoy the 'view'.
>
> The only explanation I have is that interrupts that end up in the
> scheduler (wakeups, tick, etc..) now get to do that PSI cgroup iteration
> and that cost adds up to the IRQ time itself, and as such the task time
> slows down accordingly.
>
> Are you using silly deep cgroup hierarchies?
Not deeply nested. We have a handful of cgroups, but due to
priority-inversion issues we're not putting any tasks in them right
now.
Again, this is only a ~30ns delta, so it's not a huge amount of time
we're talking about. The cpu_clock() call in psi_account_irqtime is
probably a reasonable chunk of that (as just the vdso clock_gettime()
takes a similar order of time).
> > but it also seems odd that
> > we're doing psi irq accounting from a syscall that is just
> > trying to read the thread's cputime.
>
> In order to avoid doing all the accounting in the IRQ entry/exit paths,
> those paths only do the bare minimum of tracking how much IRQ time there
> is.
>
> update_rq_clock_task() then looks at the increase of IRQ time and
> subtracts this from the task time -- after all, all time spent in the
> IRQ wasn't spent on the task itself.
>
> It did that prior to the PSI patch, and it does so after. The only
> change is it now feeds this delta into the PSI thing.
Yeah. I don't see the PSI logic as particularly bad, it's just that I
was surprised we were getting into the accounting paths during a read.
Coming from the timekeeping subsystem, we usually do the time
accumulation/accounting (write side) periodically, and on the read
fast-path we avoid anything extra. So it was just a bit unintuitive to
me to find we were doing the accounting/accumulation to the task clock
on each read.
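(Concretely, that's the difference between the current read path,
which writes state:

  update_rq_clock(rq);
  p->sched_class->update_curr(rq);  /* accounts into sum_exec_runtime */

and the read-only calculation the patch is aiming for, which only
computes a local delta -- simplified from the hunk above, using the
new get_irq_delta()/get_steal_time() helpers it introduces:

  delta_exec = sched_clock_cpu(cpu_of(rq)) - p->se.exec_start;
  delta_exec -= get_irq_delta(rq, delta_exec);
  delta_exec -= get_steal_time(rq, delta_exec);
  ns = p->se.sum_exec_runtime + delta_exec;

without touching rq->clock* or any accounting state.)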
> > NOTE: I'm not 100% sure this is correct yet. There may be some
> > edge cases I've overlooked, so I'd greatly appreciate any
> > review or feedback.
>
> Urgh, you're sprinkling the details of what is clock_task over multiple
> places.
Yeah. Agreed, the base-irq-steal logic I added should have been
wrapped up better.
> Does something like so work?
Yeah, it takes the psi_account_irqtime out of the hotpath, so with it
we're back to ~5.10 numbers.
Qais' correctness concerns seem reasonable, and I fret the second
psi_irq_time accounting base creates multiple irq_time timelines to
manage (and eventually fumble up). So I still think the idea of
separate read/write paths might be an overall improvement. But
obviously your patch seemingly works properly and mine doesn't, so I
can't object too strongly here. :) Though I may still take another
pass on my patch to see if we can make the clock_gettime read path
avoid the accounting logic, but it can be a throwaway if you really
don't like it.
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0935f9d4bb7b..d4b87539d72a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
>
> rq->prev_irq_time += irq_delta;
> delta -= irq_delta;
> - psi_account_irqtime(rq->curr, irq_delta);
> delayacct_irq(rq->curr, irq_delta);
> #endif
> #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> @@ -5459,6 +5458,8 @@ void sched_tick(void)
>
> sched_clock_tick();
>
> + psi_account_irqtime(curr, &rq->psi_irq_time);
> +
> rq_lock(rq, &rf);
Should the psi_account_irqtime (and its use of the rq->psi_irq_time)
go under the rq_lock here? This is the only user, so there isn't any
practical parallel access, but the locking rules are subtle enough
already.
thanks
-john
On Thu, Jun 13, 2024 at 12:51:42PM +0100, Qais Yousef wrote:
> On 06/13/24 12:04, Peter Zijlstra wrote:
>
> > ---
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 0935f9d4bb7b..d4b87539d72a 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
> >
> > rq->prev_irq_time += irq_delta;
> > delta -= irq_delta;
> > - psi_account_irqtime(rq->curr, irq_delta);
> > delayacct_irq(rq->curr, irq_delta);
> > #endif
> > #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> >
> > sched_clock_tick();
> >
> > + psi_account_irqtime(curr, &rq->psi_irq_time);
> > +
>
> If wakeup preemption causes a context switch, wouldn't we lose this
> information then? I *think* active migration might cause this information to be
> lost too.
I'm not sure what would be lost ?! the accounting is per cpu, not per
task afaict. That said,...
> pick_next_task() might be a better place to do the accounting?
Additionally, when there has been an effective cgroup switch. Only on
switch doesn't work for long running tasks, then the PSI information
will be arbitrarily long out of date.
Which then gets me something like the (completely untested) below..
Hmm?
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0935f9d4bb7b..36aed99d6a6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -724,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
- psi_account_irqtime(rq->curr, irq_delta);
delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5459,6 +5458,8 @@ void sched_tick(void)
sched_clock_tick();
+ psi_account_irqtime(curr, NULL, &rq->psi_irq_time);
+
rq_lock(rq, &rf);
update_rq_clock(rq);
@@ -6521,6 +6524,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
++*switch_count;
migrate_disable_switch(rq, prev);
+ psi_account_irqtime(prev, next, &rq->psi_irq_time);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 146baa91d104..65bba162408f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -991,22 +991,31 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time)
{
- int cpu = task_cpu(task);
+ int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now;
+ u64 now, irq;
+ s64 delta;
if (static_branch_likely(&psi_disabled))
return;
- if (!task->pid)
+ if (!curr->pid)
+ return;
+
+ group = task_psi_group(curr);
+ if (prev && task_psi_group(prev) == group)
return;
now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - *time);
+ if (delta < 0)
+ return;
+ *time = irq;
- group = task_psi_group(task);
do {
if (!group->enabled)
continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62fd8bc6fd08..a63eb546bc4a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1133,6 +1133,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index d1445410840a..1e290054c5db 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,7 +110,7 @@ __schedstats_from_se(struct sched_entity *se)
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
+void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time);
/*
* PSI tracks state that persists across sleeps, such as iowaits and
@@ -192,7 +192,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct task_struct *curr, struct task_struct *prev, u64 *time) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
On Fri, Jun 14, 2024 at 2:48 AM Peter Zijlstra <[email protected]> wrote:
> On Thu, Jun 13, 2024 at 12:51:42PM +0100, Qais Yousef wrote:
> > On 06/13/24 12:04, Peter Zijlstra wrote:
> > > @@ -5459,6 +5458,8 @@ void sched_tick(void)
> > >
> > > sched_clock_tick();
> > >
> > > + psi_account_irqtime(curr, &rq->psi_irq_time);
> > > +
> >
> > If wakeup preemption causes a context switch, wouldn't we lose this
> > information then? I *think* active migration might cause this information to be
> > lost too.
>
> I'm not sure what would be lost ?! the accounting is per cpu, not per
> task afaict. That said,...
>
> > pick_next_task() might be a better place to do the accounting?
>
> Additionally, when there has been an effective cgroup switch. Only on
> switch doesn't work for long running tasks, then the PSI information
> will be arbitrarily long out of date.
>
> Which then gets me something like the (completely untested) below..
>
> Hmm?
I applied and booted with this. It still takes the accounting out of
the hotpath, so for CLOCK_THREAD_CPUTIME_ID the microbenchmark
performance is back to 5.10 numbers.
I don't have any correctness tests for irqtime measurements, so I'll
have to try to work something up for that next week.
thanks
-john