2024-04-02 11:21:43

by Tio Zhang

Subject: [PATCHSET] sched/delayacct: get task SOFTIRQ delay

We can currently get only the combined IRQ/SOFTIRQ delay from Delay
accounting, but reporting SOFTIRQ delay and IRQ delay separately would
help users reduce each kind of delay with the remedy that fits it.
For IRQ delay, we can tune IRQ CPU affinity or use threaded IRQs.
For SOFTIRQ delay, we can tune RPS/XPS or use threaded NAPI (kernel
threads for NAPI polling).
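As an example, here is a minimal userspace sketch of those two knobs
(the IRQ number "42", the device "eth0" and the CPU masks are made up
for illustration):

#include <stdio.h>

/* Write a value to a procfs/sysfs file, the same way "echo" would. */
static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* against IRQ delay: pin IRQ 42 to CPU3 (hex cpumask) */
	write_str("/proc/irq/42/smp_affinity", "8");
	/* against SOFTIRQ delay: steer RPS of eth0 rx-0 to CPUs 4-7 */
	write_str("/sys/class/net/eth0/queues/rx-0/rps_cpus", "f0");
	return 0;
}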
Here is an example stack of a task delayed mainly by SOFTIRQ (delayed
by packet-receive processing while sending packets):
...
ip_rcv
__netif_receive_skb_core
__netif_receive_skb
process_backlog
net_rx_action
do_softirq
__local_bh_enable_ip
ip_finish_output2
ip_finish_output
ip_output
ip_local_out
ip_send_skb
udp_send_skb
udp_sendmsg
inet_sendmsg
sock_sendmsg
__sys_sendto
do_syscall_64
__libc_sendto
...

So this patchset makes SOFTIRQ delay observable in Delay accounting
and available through taskstats
(and also updates tools/accounting/getdelays.c).

Also, for backward compatibility, we don't change the meaning of the
original combined IRQ/SOFTIRQ delay; instead, the pure IRQ (interrupt)
delay can be obtained by subtracting the SOFTIRQ delay added by this
patchset from the original IRQ/SOFTIRQ delay.
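For instance, assuming the new taskstats fields end up being named
"softirq_delay_total"/"softirq_count" (illustrative names, not
confirmed here), the subtraction in userspace is just:

#include <linux/taskstats.h>

/* Sketch only: pure hardirq delay in ns. irq_delay_total keeps its
 * original combined IRQ+SOFTIRQ meaning. */
static unsigned long long hardirq_delay_ns(const struct taskstats *t)
{
	return t->irq_delay_total - t->softirq_delay_total;
}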

With this patchset applied, getdelays.c reports the following for the
example above:
# ./getdelays -t 4600 -d
print delayacct stats ON
TGID 4600

CPU             count     real total  virtual total    delay total  delay average
                 3973    10700014780    10698803222   312345815813       78.617ms
IO              count    delay total  delay average
                    0              0        0.000ms
SWAP            count    delay total  delay average
                    0              0        0.000ms
RECLAIM         count    delay total  delay average
                    0              0        0.000ms
THRASHING       count    delay total  delay average
                    0              0        0.000ms
COMPACT         count    delay total  delay average
                    0              0        0.000ms
WPCOPY          count    delay total  delay average
                   40         266859        0.007ms
IRQ             count    delay total  delay average
                13450    17756373906        1.320ms
SOFTIRQ         count    delay total  delay average
                13450    17639154300        1.311ms

We find that SOFTIRQ accounts for almost all of the delay (the pure
IRQ part is 17756373906 - 17639154300 = 117219606 ns, about 0.009ms
per event), so we tune RPS to reduce it.


2024-04-02 11:26:35

by Tio Zhang

Subject: [PATCH 1/3] sched: account softirq cputime separately in irqtime

Currently, when CONFIG_IRQ_TIME_ACCOUNTING=y, we account both hardirq
and softirq time in "irqtime.total". Since they are accounted on the
same path (account_{hard,soft}irq_{enter,exit}()), we can count softirq
time separately by filtering on the preempt-count offset.
In order not to break backward compatibility, we do not change the
meaning of "total"; softirq time is additionally accounted in a new
field, "total_soft".
Interrupt (hardirq) time can then be calculated as "total" minus
"total_soft".

This patch only makes the softirq cputime stats available in struct
irqtime; it does not yet put them to use.

Signed-off-by: Tio Zhang <[email protected]>
---
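(Not part of this patch, but for illustration: with the two helpers in
kernel/sched/sched.h, hardirq-only time could be derived as below. Note
the two reads are separate seqcount sections, not one atomic snapshot
of both counters.)

static inline u64 irq_time_read_hard(int cpu)
{
	return irq_time_read(cpu) - irq_time_read_soft(cpu);
}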
kernel/sched/cputime.c | 18 ++++++++++++++----
kernel/sched/sched.h | 16 ++++++++++++++++
2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e6c..23e4bca1e3e8 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -35,13 +35,14 @@ void disable_sched_clock_irqtime(void)
}

static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
- enum cpu_usage_stat idx)
+ u64 delta_soft, enum cpu_usage_stat idx)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;

u64_stats_update_begin(&irqtime->sync);
cpustat[idx] += delta;
irqtime->total += delta;
+ irqtime->total_soft += delta_soft;
irqtime->tick_delta += delta;
u64_stats_update_end(&irqtime->sync);
}
@@ -54,7 +55,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
unsigned int pc;
- s64 delta;
+ s64 delta, delta_soft = 0;
int cpu;

if (!sched_clock_irqtime)
@@ -65,6 +66,15 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
irqtime->irq_start_time += delta;
pc = irq_count() - offset;

+ /*
+ * We only account softirq time when we are called by
+ * account_softirq_enter{,exit}
+ */
+ if ((offset & SOFTIRQ_OFFSET) || (pc & SOFTIRQ_OFFSET)) {
+ delta_soft = sched_clock_cpu(cpu) - irqtime->soft_start_time;
+ irqtime->soft_start_time += delta_soft;
+ }
+
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -72,9 +82,9 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
* that do not consume any time, but still wants to run.
*/
if (pc & HARDIRQ_MASK)
- irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ irqtime_account_delta(irqtime, delta, delta_soft, CPUTIME_IRQ);
else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
- irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+ irqtime_account_delta(irqtime, delta, delta_soft, CPUTIME_SOFTIRQ);
}

static u64 irqtime_tick_accounted(u64 maxtime)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001fe047bd5d..f479c61b84b5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2931,8 +2931,10 @@ static inline void nohz_run_idle_balance(int cpu) { }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
u64 total;
+ u64 total_soft;
u64 tick_delta;
u64 irq_start_time;
+ u64 soft_start_time;
struct u64_stats_sync sync;
};

@@ -2956,6 +2958,20 @@ static inline u64 irq_time_read(int cpu)

return total;
}
+
+static inline u64 irq_time_read_soft(int cpu)
+{
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ unsigned int seq;
+ u64 total_soft;
+
+ do {
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ total_soft = irqtime->total_soft;
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
+
+ return total_soft;
+}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

#ifdef CONFIG_CPU_FREQ
--
2.17.1


2024-04-02 11:44:16

by Tio Zhang

Subject: [PATCH 2/3] delayacct: account SOFTIRQ delay

This patch puts the SOFTIRQ time accounted in "irqtime.total_soft" to
use by adding soft_delay/soft_count accounting to Delay accounting.

Signed-off-by: Tio Zhang <[email protected]>
---
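(For orientation, a simplified sketch of the flow this patch wires into
update_rq_clock_task(): no locking, clock-warp clamping or config
guards here; the real path goes through delayacct_irq() and
__delayacct_irq() under delays->lock.)

static void sketch_charge_softirq_delay(struct rq *rq)
{
	s64 soft_delta;

	/* softirq time accrued on this CPU since the last rq clock update */
	soft_delta = irq_time_read_soft(cpu_of(rq)) - rq->prev_soft_time;
	rq->prev_soft_time += soft_delta;

	/* charge it as delay to the task currently on this CPU */
	rq->curr->delays->soft_delay += soft_delta;
	rq->curr->delays->soft_count++;
}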
include/linux/delayacct.h | 11 +++++++----
kernel/delayacct.c | 5 +++--
kernel/sched/core.c | 6 ++++--
kernel/sched/sched.h | 1 +
4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 6639f48dac36..bf1d45fcb505 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -49,12 +49,14 @@ struct task_delay_info {
u64 wpcopy_delay; /* wait for write-protect copy */

u64 irq_delay; /* wait for IRQ/SOFTIRQ */
+ u64 soft_delay; /* wait for SOFTIRQ */

u32 freepages_count; /* total count of memory reclaim */
u32 thrashing_count; /* total count of thrash waits */
u32 compact_count; /* total count of memory compact */
u32 wpcopy_count; /* total count of write-protect copy */
u32 irq_count; /* total count of IRQ/SOFTIRQ */
+ u32 soft_count; /* total count of SOFTIRQ */
};
#endif

@@ -84,7 +86,7 @@ extern void __delayacct_compact_start(void);
extern void __delayacct_compact_end(void);
extern void __delayacct_wpcopy_start(void);
extern void __delayacct_wpcopy_end(void);
-extern void __delayacct_irq(struct task_struct *task, u32 delta);
+extern void __delayacct_irq(struct task_struct *task, u32 delta, u32 delta_soft);

static inline void delayacct_tsk_init(struct task_struct *tsk)
{
@@ -219,13 +221,14 @@ static inline void delayacct_wpcopy_end(void)
__delayacct_wpcopy_end();
}

-static inline void delayacct_irq(struct task_struct *task, u32 delta)
+static inline void delayacct_irq(struct task_struct *task, u32 delta,
+ u32 delta_soft)
{
if (!static_branch_unlikely(&delayacct_key))
return;

if (task->delays)
- __delayacct_irq(task, delta);
+ __delayacct_irq(task, delta, delta_soft);
}

#else
@@ -266,7 +269,7 @@ static inline void delayacct_wpcopy_start(void)
{}
static inline void delayacct_wpcopy_end(void)
{}
-static inline void delayacct_irq(struct task_struct *task, u32 delta)
+static inline void delayacct_irq(struct task_struct *task, u32 delta, u32 delta_soft)
{}

#endif /* CONFIG_TASK_DELAY_ACCT */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 6f0c358e73d8..8517f1c1df88 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -278,13 +278,14 @@ void __delayacct_wpcopy_end(void)
&current->delays->wpcopy_count);
}

-void __delayacct_irq(struct task_struct *task, u32 delta)
+void __delayacct_irq(struct task_struct *task, u32 delta, u32 delta_soft)
{
unsigned long flags;

raw_spin_lock_irqsave(&task->delays->lock, flags);
task->delays->irq_delay += delta;
task->delays->irq_count++;
+ task->delays->soft_delay += delta_soft;
+ task->delays->soft_count++;
raw_spin_unlock_irqrestore(&task->delays->lock, flags);
}
-
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..2f5fd775b47b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -698,10 +698,11 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
* In theory, the compile should just see 0 here, and optimize out the call
* to sched_rt_avg_update. But I don't trust it...
*/
- s64 __maybe_unused steal = 0, irq_delta = 0;
+ s64 __maybe_unused steal = 0, irq_delta = 0, soft_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ soft_delta = irq_time_read_soft(cpu_of(rq)) - rq->prev_soft_time;

/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
@@ -722,9 +723,10 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
irq_delta = delta;

rq->prev_irq_time += irq_delta;
+ rq->prev_soft_time += soft_delta;
delta -= irq_delta;
psi_account_irqtime(rq->curr, irq_delta);
- delayacct_irq(rq->curr, irq_delta);
+ delayacct_irq(rq->curr, irq_delta, soft_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f479c61b84b5..abf96ad9c301 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1105,6 +1105,7 @@ struct rq {

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 prev_soft_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
--
2.17.1