2020-12-02 12:01:30

by Frederic Weisbecker

Subject: [PATCH 4/5] irqtime: Move irqtime entry accounting after irq offset incrementation

IRQ time entry is currently accounted before HARDIRQ_OFFSET or
SOFTIRQ_OFFSET is incremented. This is convenient for deciding to which
index the pending cputime should be dispatched.

Unfortunately, it prevents tick_irq_enter() from being called under
HARDIRQ_OFFSET because tick_irq_enter() has to be called before the IRQ
entry accounting due to the necessary clock catch-up. As a result, we
don't benefit from appropriate lockdep coverage on tick_irq_enter().

To prepare for fixing this, move the IRQ entry cputime accounting after
the preempt offset is incremented. This requires the cputime dispatch
code to handle the extra offset.
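
To summarize the mechanics (a condensed sketch of the diff below), the
hardirq entry path after this change effectively reads:

	preempt_count_add(HARDIRQ_OFFSET);
	lockdep_hardirq_enter();
	account_hardirq_enter(current);	/* accounting now runs in-irq */

and the dispatch code recovers the interrupted context by subtracting
the offset it knows was just added:

	unsigned int pc = preempt_count() - offset;	/* pre-entry count */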

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: Fenghua Yu <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Benjamin Herrenschmidt <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Vasily Gorbik <[email protected]>
Cc: Christian Borntraeger <[email protected]>
---
include/linux/hardirq.h | 4 ++--
include/linux/vtime.h | 34 ++++++++++++++++++++++++----------
kernel/sched/cputime.c | 18 +++++++++++-------
kernel/softirq.c | 6 +++---
4 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 754f67ac4326..7c9d6a2d7e90 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -32,9 +32,9 @@ static __always_inline void rcu_irq_enter_check_tick(void)
*/
#define __irq_enter() \
do { \
- account_irq_enter_time(current); \
preempt_count_add(HARDIRQ_OFFSET); \
lockdep_hardirq_enter(); \
+ account_hardirq_enter(current); \
} while (0)

/*
@@ -62,8 +62,8 @@ void irq_enter_rcu(void);
*/
#define __irq_exit() \
do { \
+ account_hardirq_exit(current); \
lockdep_hardirq_exit(); \
- account_irq_exit_time(current); \
preempt_count_sub(HARDIRQ_OFFSET); \
} while (0)

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 6c9867419615..041d6524d144 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -83,32 +83,46 @@ static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
#endif

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-extern void vtime_account_irq(struct task_struct *tsk);
+extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
extern void vtime_account_softirq(struct task_struct *tsk);
extern void vtime_account_hardirq(struct task_struct *tsk);
extern void vtime_flush(struct task_struct *tsk);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-static inline void vtime_account_irq(struct task_struct *tsk) { }
+static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
+static inline void vtime_account_softirq(struct task_struct *tsk) { }
+static inline void vtime_account_hardirq(struct task_struct *tsk) { }
static inline void vtime_flush(struct task_struct *tsk) { }
#endif


#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-extern void irqtime_account_irq(struct task_struct *tsk);
+extern void irqtime_account_irq(struct task_struct *tsk, unsigned int offset);
#else
-static inline void irqtime_account_irq(struct task_struct *tsk) { }
+static inline void irqtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
#endif

-static inline void account_irq_enter_time(struct task_struct *tsk)
+static inline void account_softirq_enter(struct task_struct *tsk)
{
- vtime_account_irq(tsk);
- irqtime_account_irq(tsk);
+ vtime_account_irq(tsk, SOFTIRQ_OFFSET);
+ irqtime_account_irq(tsk, SOFTIRQ_OFFSET);
}

-static inline void account_irq_exit_time(struct task_struct *tsk)
+static inline void account_softirq_exit(struct task_struct *tsk)
{
- vtime_account_irq(tsk);
- irqtime_account_irq(tsk);
+ vtime_account_softirq(tsk);
+ irqtime_account_irq(tsk, 0);
+}
+
+static inline void account_hardirq_enter(struct task_struct *tsk)
+{
+ vtime_account_irq(tsk, HARDIRQ_OFFSET);
+ irqtime_account_irq(tsk, HARDIRQ_OFFSET);
+}
+
+static inline void account_hardirq_exit(struct task_struct *tsk)
+{
+ vtime_account_hardirq(tsk);
+ irqtime_account_irq(tsk, 0);
}

#endif /* _LINUX_KERNEL_VTIME_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 02163d4260d7..5f611658eeab 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -44,12 +44,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
}

/*
- * Called before incrementing preempt_count on {soft,}irq_enter
+ * Called after incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
-void irqtime_account_irq(struct task_struct *curr)
+void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ unsigned int pc;
s64 delta;
int cpu;

@@ -59,6 +60,7 @@ void irqtime_account_irq(struct task_struct *curr)
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
+ pc = preempt_count() - offset;

/*
* We do not account for softirq time from ksoftirqd here.
@@ -66,9 +68,9 @@ void irqtime_account_irq(struct task_struct *curr)
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count())
+ if (pc & HARDIRQ_MASK)
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}

@@ -417,11 +419,13 @@ void vtime_task_switch(struct task_struct *prev)
}
# endif

-void vtime_account_irq(struct task_struct *tsk)
+void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
- if (hardirq_count()) {
+ unsigned int pc = preempt_count() - offset;
+
+ if (pc & HARDIRQ_OFFSET) {
vtime_account_hardirq(tsk);
- } else if (in_serving_softirq()) {
+ } else if (pc & SOFTIRQ_OFFSET) {
vtime_account_softirq(tsk);
} else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
is_idle_task(tsk)) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 617009ccd82c..b8f42b3ba8ca 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -315,10 +315,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
current->flags &= ~PF_MEMALLOC;

pending = local_softirq_pending();
- account_irq_enter_time(current);

__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
+ account_softirq_enter(current);

restart:
/* Reset the pending bitmask before enabling irqs */
@@ -365,8 +365,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
wakeup_softirqd();
}

+ account_softirq_exit(current);
lockdep_softirq_end(in_hardirq);
- account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
WARN_ON_ONCE(in_interrupt());
current_restore_flags(old_flags, PF_MEMALLOC);
@@ -418,7 +418,7 @@ static inline void __irq_exit_rcu(void)
#else
lockdep_assert_irqs_disabled();
#endif
- account_irq_exit_time(current);
+ account_hardirq_exit(current);
preempt_count_sub(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
--
2.25.1


2020-12-02 12:41:47

by Peter Zijlstra

Subject: Re: [PATCH 4/5] irqtime: Move irqtime entry accounting after irq offset incrementation

On Wed, Dec 02, 2020 at 12:57:31PM +0100, Frederic Weisbecker wrote:
> IRQ time entry is currently accounted before HARDIRQ_OFFSET or
> SOFTIRQ_OFFSET is incremented. This is convenient for deciding to which
> index the pending cputime should be dispatched.
>
> Unfortunately, it prevents tick_irq_enter() from being called under
> HARDIRQ_OFFSET because tick_irq_enter() has to be called before the IRQ
> entry accounting due to the necessary clock catch-up. As a result, we
> don't benefit from appropriate lockdep coverage on tick_irq_enter().
>
> To prepare for fixing this, move the IRQ entry cputime accounting after
> the preempt offset is incremented. This requires the cputime dispatch
> code to handle the extra offset.
>
> Signed-off-by: Frederic Weisbecker <[email protected]>

Acked-by: Peter Zijlstra (Intel) <[email protected]>

Subject: [tip: irq/core] irqtime: Move irqtime entry accounting after irq offset incrementation

The following commit has been merged into the irq/core branch of tip:

Commit-ID: d3759e7184f8f6187e62f8c4e7dcb1f6c47c075a
Gitweb: https://git.kernel.org/tip/d3759e7184f8f6187e62f8c4e7dcb1f6c47c075a
Author: Frederic Weisbecker <[email protected]>
AuthorDate: Wed, 02 Dec 2020 12:57:31 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 02 Dec 2020 20:20:05 +01:00

irqtime: Move irqtime entry accounting after irq offset incrementation

IRQ time entry is currently accounted before HARDIRQ_OFFSET or
SOFTIRQ_OFFSET is incremented. This is convenient for deciding to which
index the pending cputime should be dispatched.

Unfortunately, it prevents tick_irq_enter() from being called under
HARDIRQ_OFFSET because tick_irq_enter() has to be called before the IRQ
entry accounting due to the necessary clock catch-up. As a result, we
don't benefit from appropriate lockdep coverage on tick_irq_enter().

To prepare for fixing this, move the IRQ entry cputime accounting after
the preempt offset is incremented. This requires the cputime dispatch
code to handle the extra offset.

Signed-off-by: Frederic Weisbecker <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

2020-12-28 02:19:06

by Qais Yousef

Subject: Re: [PATCH 4/5] irqtime: Move irqtime entry accounting after irq offset incrementation

Hi Frederic

On 12/02/20 12:57, Frederic Weisbecker wrote:
> #endif /* _LINUX_KERNEL_VTIME_H */
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index 02163d4260d7..5f611658eeab 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -44,12 +44,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
> }
>
> /*
> - * Called before incrementing preempt_count on {soft,}irq_enter
> + * Called after incrementing preempt_count on {soft,}irq_enter
> * and before decrementing preempt_count on {soft,}irq_exit.
> */
> -void irqtime_account_irq(struct task_struct *curr)
> +void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
> {
> struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
> + unsigned int pc;
> s64 delta;
> int cpu;
>
> @@ -59,6 +60,7 @@ void irqtime_account_irq(struct task_struct *curr)
> cpu = smp_processor_id();
> delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
> irqtime->irq_start_time += delta;
> + pc = preempt_count() - offset;
>
> /*
> * We do not account for softirq time from ksoftirqd here.
> @@ -66,9 +68,9 @@ void irqtime_account_irq(struct task_struct *curr)
> * in that case, so as not to confuse scheduler with a special task
> * that do not consume any time, but still wants to run.
> */
> - if (hardirq_count())
> + if (pc & HARDIRQ_MASK)
> irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
> - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
> + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())

Noob question. Why, for SOFTIRQs, do we check softirq_count() & *SOFTIRQ_OFFSET*?
It seems we're in-softirq only if the count is odd-numbered.

/me tries to dig more

Hmm, could it be because the softirq count is actually 1 bit and the rest is
for SOFTIRQ_DISABLE_OFFSET (BH disabled)?

IOW, 1 bit is for whether we're in softirq context, and the remaining 7 bits
count BH disable nesting, right?

I guess this would make sense; we don't nest softirq processing AFAIK. But
I could be misreading the code too :-)

> irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
> }
>
> @@ -417,11 +419,13 @@ void vtime_task_switch(struct task_struct *prev)
> }
> # endif
>
> -void vtime_account_irq(struct task_struct *tsk)
> +void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
> {
> - if (hardirq_count()) {
> + unsigned int pc = preempt_count() - offset;
> +
> + if (pc & HARDIRQ_OFFSET) {

Shouldn't this be HARDIRQ_MASK like above?

> vtime_account_hardirq(tsk);
> - } else if (in_serving_softirq()) {
> + } else if (pc & SOFTIRQ_OFFSET) {
> vtime_account_softirq(tsk);
> } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
> is_idle_task(tsk)) {

Thanks

--
Qais Yousef

2020-12-29 13:44:42

by Frederic Weisbecker

Subject: Re: [PATCH 4/5] irqtime: Move irqtime entry accounting after irq offset incrementation

On Mon, Dec 28, 2020 at 02:15:29AM +0000, Qais Yousef wrote:
> Hi Frederic
>
> On 12/02/20 12:57, Frederic Weisbecker wrote:
> > @@ -66,9 +68,9 @@ void irqtime_account_irq(struct task_struct *curr)
> > * in that case, so as not to confuse scheduler with a special task
> > * that do not consume any time, but still wants to run.
> > */
> > - if (hardirq_count())
> > + if (pc & HARDIRQ_MASK)
> > irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
> > - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
> > + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
>
> Noob question. Why, for SOFTIRQs, do we check softirq_count() & *SOFTIRQ_OFFSET*?
> It seems we're in-softirq only if the count is odd-numbered.
>
> /me tries to dig more
>
> Hmm, could it be because the softirq count is actually 1 bit and the rest is
> for SOFTIRQ_DISABLE_OFFSET (BH disabled)?

Exactly!

>
> IOW, 1 bit is for whether we're in softirq context, and the remaining 7 bits
> count BH disable nesting, right?
>
> I guess this would make sense; we don't nest softirq processing AFAIK. But
> I could be misreading the code too :-)

You got it right!

This is commented in softirq.c somewhere:

/*
* preempt_count and SOFTIRQ_OFFSET usage:
* - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
* softirq processing.
* - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
* on local_bh_disable or local_bh_enable.
* This lets us distinguish between whether we are currently processing
* softirq and whether we just have bh disabled.
*/

But we should elaborate on the fact that, indeed, softirq processing can't nest,
while softirq disablement can. I should try to send a patch and comment more
thoroughly on the subtleties of the preempt mask in preempt.h.
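
As a sketch of the layout being discussed (field sizes as in
include/linux/preempt.h around v5.10; the two helpers below are
illustrative only, not existing kernel API):

	/*
	 * PREEMPT_MASK:	0x000000ff	preemption disable count
	 * SOFTIRQ_MASK:	0x0000ff00	softirq field
	 * HARDIRQ_MASK:	0x000f0000	hardirq nesting count
	 * NMI_MASK:		0x00f00000	NMI nesting count
	 *
	 * Within the softirq field, SOFTIRQ_OFFSET (0x100) is the single
	 * "serving softirq" bit, while local_bh_disable()/local_bh_enable()
	 * move the count by SOFTIRQ_DISABLE_OFFSET (0x200), so BH-disable
	 * nesting occupies the upper seven bits of the field.
	 */
	static inline bool pc_serving_softirq(unsigned int pc)
	{
		return pc & SOFTIRQ_OFFSET;	/* processing a softirq now */
	}

	static inline unsigned int pc_bh_disable_depth(unsigned int pc)
	{
		/* the serving bit (0x100) is floored away by the division */
		return (pc & SOFTIRQ_MASK) / SOFTIRQ_DISABLE_OFFSET;
	}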

>
> > irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
> > }
> >
> > @@ -417,11 +419,13 @@ void vtime_task_switch(struct task_struct *prev)
> > }
> > # endif
> >
> > -void vtime_account_irq(struct task_struct *tsk)
> > +void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
> > {
> > - if (hardirq_count()) {
> > + unsigned int pc = preempt_count() - offset;
> > +
> > + if (pc & HARDIRQ_OFFSET) {
>
> Shouldn't this be HARDIRQ_MASK like above?

In the rare case of nested hardirqs happening with broken drivers, only the
outer hardirq matters. All the time spent in the inner hardirqs is included
in the outer one.

Thanks.

>
> > vtime_account_hardirq(tsk);
> > - } else if (in_serving_softirq()) {
> > + } else if (pc & SOFTIRQ_OFFSET) {
> > vtime_account_softirq(tsk);
> > } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
> > is_idle_task(tsk)) {
>
> Thanks
>
> --
> Qais Yousef

2020-12-29 14:15:16

by Qais Yousef

Subject: Re: [PATCH 4/5] irqtime: Move irqtime entry accounting after irq offset incrementation

On 12/29/20 14:41, Frederic Weisbecker wrote:
> On Mon, Dec 28, 2020 at 02:15:29AM +0000, Qais Yousef wrote:
> > Hi Frederic
> >
> > On 12/02/20 12:57, Frederic Weisbecker wrote:
> > > @@ -66,9 +68,9 @@ void irqtime_account_irq(struct task_struct *curr)
> > > * in that case, so as not to confuse scheduler with a special task
> > > * that do not consume any time, but still wants to run.
> > > */
> > > - if (hardirq_count())
> > > + if (pc & HARDIRQ_MASK)
> > > irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
> > > - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
> > > + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
> >
> > Noob question. Why, for SOFTIRQs, do we check softirq_count() & *SOFTIRQ_OFFSET*?
> > It seems we're in-softirq only if the count is odd-numbered.
> >
> > /me tries to dig more
> >
> > Hmm, could it be because the softirq count is actually 1 bit and the rest is
> > for SOFTIRQ_DISABLE_OFFSET (BH disabled)?
>
> Exactly!
>
> >
> > IOW, 1 bit is for whether we're in softirq context, and the remaining 7 bits
> > count BH disable nesting, right?
> >
> > I guess this would make sense; we don't nest softirq processing AFAIK. But
> > I could be misreading the code too :-)
>
> You got it right!
>
> This is commented in softirq.c somewhere:
>
> /*
> * preempt_count and SOFTIRQ_OFFSET usage:
> * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
> * softirq processing.
> * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
> * on local_bh_disable or local_bh_enable.
> * This lets us distinguish between whether we are currently processing
> * softirq and whether we just have bh disabled.
> */
>
> But we should elaborate on the fact that, indeed, softirq processing can't nest,
> while softirq disablement can. I should try to send a patch and comment more
> thoroughly on the subtleties of the preempt mask in preempt.h.

Thanks for the info!

>
> >
> > > irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
> > > }
> > >
> > > @@ -417,11 +419,13 @@ void vtime_task_switch(struct task_struct *prev)
> > > }
> > > # endif
> > >
> > > -void vtime_account_irq(struct task_struct *tsk)
> > > +void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
> > > {
> > > - if (hardirq_count()) {
> > > + unsigned int pc = preempt_count() - offset;
> > > +
> > > + if (pc & HARDIRQ_OFFSET) {
> >
> > Shouldn't this be HARDIRQ_MASK like above?
>
> In the rare case of nested hardirqs happening with broken drivers, only the
> outer hardirq matters. All the time spent in the inner hardirqs is included
> in the outer one.

Ah I see. The original code was doing hardirq_count(), which apparently wasn't
right either.

Shouldn't it be pc == HARDIRQ_OFFSET then? All odd nest counts will trigger
this otherwise, and IIUC we want this to trigger once on first entry only.

Thanks

--
Qais Yousef

2020-12-29 14:32:55

by Frederic Weisbecker

Subject: Re: [PATCH 4/5] irqtime: Move irqtime entry accounting after irq offset incrementation

On Tue, Dec 29, 2020 at 02:12:31PM +0000, Qais Yousef wrote:
> On 12/29/20 14:41, Frederic Weisbecker wrote:
> > > > -void vtime_account_irq(struct task_struct *tsk)
> > > > +void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
> > > > {
> > > > - if (hardirq_count()) {
> > > > + unsigned int pc = preempt_count() - offset;
> > > > +
> > > > + if (pc & HARDIRQ_OFFSET) {
> > >
> > > Shouldn't this be HARDIRQ_MASK like above?
> >
> > In the rare case of nested hardirqs happening with broken drivers, only the
> > outer hardirq matters. All the time spent in the inner hardirqs is included
> > in the outer one.
>
> Ah I see. The original code was doing hardirq_count(), which apparently wasn't
> right either.
>
> Shouldn't it be pc == HARDIRQ_OFFSET then? All odd nest counts will trigger
> this otherwise, and IIUC we want this to trigger once on first entry only.

Right, but we must also handle hardirqs interrupting either preempt-disabled
sections or softirq servicing/disabled sections.

Three stacked hardirqs should be rare enough that we don't really care. In the
worst case we are going to account the third IRQ separately. Not a correctness
issue, just a rare unoptimized case.
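
For illustration (a sketch of those mixed contexts, not verbatim kernel
code): a strict equality test would miss an outer hardirq that itself
interrupted a preempt- or BH-disabled region, while the bit test still
catches it:

	/* outer hardirq arrived inside a preempt_disable() section: */
	pc = HARDIRQ_OFFSET + 1;		/* pc == HARDIRQ_OFFSET: false */
	/* outer hardirq arrived during softirq processing: */
	pc = HARDIRQ_OFFSET + SOFTIRQ_OFFSET;	/* equality fails here too */

	/* the low-bit test handles both: */
	if (pc & HARDIRQ_OFFSET)
		vtime_account_hardirq(tsk);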

>
> Thanks
>
> --
> Qais Yousef

2020-12-29 16:00:32

by Qais Yousef

Subject: Re: [PATCH 4/5] irqtime: Move irqtime entry accounting after irq offset incrementation

On 12/29/20 15:30, Frederic Weisbecker wrote:
> On Tue, Dec 29, 2020 at 02:12:31PM +0000, Qais Yousef wrote:
> > On 12/29/20 14:41, Frederic Weisbecker wrote:
> > > > > -void vtime_account_irq(struct task_struct *tsk)
> > > > > +void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
> > > > > {
> > > > > - if (hardirq_count()) {
> > > > > + unsigned int pc = preempt_count() - offset;
> > > > > +
> > > > > + if (pc & HARDIRQ_OFFSET) {
> > > >
> > > > Shouldn't this be HARDIRQ_MASK like above?
> > >
> > > In the rare case of nested hardirqs happening with broken drivers, only the
> > > outer hardirq matters. All the time spent in the inner hardirqs is included
> > > in the outer one.
> >
> > Ah I see. The original code was doing hardirq_count(), which apparently wasn't
> > right either.
> >
> > Shouldn't it be pc == HARDIRQ_OFFSET then? All odd nest counts will trigger
> > this otherwise, and IIUC we want this to trigger once on first entry only.
>
> > Right, but we must also handle hardirqs interrupting either preempt-disabled
> > sections or softirq servicing/disabled sections.
>
> Three stacked hardirqs should be rare enough that we don't really care. In the
> worst case we are going to account the third IRQ separately. Not a correctness
> issue, just a rare unoptimized case.

I admit I need to wrap my head around some more details to fully comprehend
that, but that's my own confusion to clear out :-)

Thanks for your answer.

Cheers

--
Qais Yousef