The nohz tick code recalculates the timer wheel's next expiry on each
idle loop iteration.
On the other hand, the base next expiry is now always cached and updated
upon timer enqueue and execution. Only timer dequeue may leave
base->next_expiry out of date (but then its stale value won't ever go
past the actual next expiry to be recalculated).
Since recalculating the next_expiry isn't a free operation, especially
when the last wheel level is reached to find out that no timer has
been enqueued at all, reuse the next expiry cache when it is known to be
reliable, which it is most of the time.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Anna-Maria Behnsen <[email protected]>
---
Changes since v1:
_ Fix changelog's ramblings
_ Fix structure layout
kernel/time/timer.c | 21 ++++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 77e21e98ec32..96d802e9769e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -204,6 +204,7 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
+ bool next_expiry_recalc;
bool is_idle;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
@@ -593,6 +594,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* can reevaluate the wheel:
*/
base->next_expiry = bucket_expiry;
+ base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
}
}
@@ -836,8 +838,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
if (!timer_pending(timer))
return 0;
- if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+ if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
__clear_bit(idx, base->pending_map);
+ base->next_expiry_recalc = true;
+ }
detach_timer(timer, clear_pending);
return 1;
@@ -1571,6 +1575,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk >>= LVL_CLK_SHIFT;
clk += adj;
}
+
+ base->next_expiry_recalc = false;
+
return next;
}
@@ -1631,9 +1638,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
return expires;
raw_spin_lock(&base->lock);
- nextevt = __next_timer_interrupt(base);
+ if (base->next_expiry_recalc)
+ base->next_expiry = __next_timer_interrupt(base);
+ nextevt = base->next_expiry;
is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
- base->next_expiry = nextevt;
+
/*
* We have a fresh next event. Check whether we can forward the
* base. We can only do that when @basej is past base->clk
@@ -1725,6 +1734,12 @@ static inline void __run_timers(struct timer_base *base)
while (time_after_eq(jiffies, base->clk) &&
time_after_eq(jiffies, base->next_expiry)) {
levels = collect_expired_timers(base, heads);
+ /*
+ * The only possible reason for not finding any expired
+ * timer at this clk is that all matching timers have been
+ * dequeued.
+ */
+ WARN_ON_ONCE(!levels && !base->next_expiry_recalc);
base->clk++;
base->next_expiry = __next_timer_interrupt(base);
--
2.26.2
The following commit has been merged into the timers/core branch of tip:
Commit-ID: 31cd0e119d50cf27ebe214d1a8f7ca36692f13a5
Gitweb: https://git.kernel.org/tip/31cd0e119d50cf27ebe214d1a8f7ca36692f13a5
Author: Frederic Weisbecker <[email protected]>
AuthorDate: Thu, 23 Jul 2020 17:16:41 +02:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Fri, 24 Jul 2020 12:49:40 +02:00
timers: Recalculate next timer interrupt only when necessary
The nohz tick code recalculates the timer wheel's next expiry on each idle
loop iteration.
On the other hand, the base next expiry is now always cached and updated
upon timer enqueue and execution. Only timer dequeue may leave
base->next_expiry out of date (but then its stale value won't ever go past
the actual next expiry to be recalculated).
Since recalculating the next_expiry isn't a free operation, especially when
the last wheel level is reached to find out that no timer has been enqueued
at all, reuse the next expiry cache when it is known to be reliable, which
it is most of the time.
Signed-off-by: Frederic Weisbecker <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/time/timer.c | 21 ++++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 77e21e9..96d802e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -204,6 +204,7 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
+ bool next_expiry_recalc;
bool is_idle;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
@@ -593,6 +594,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* can reevaluate the wheel:
*/
base->next_expiry = bucket_expiry;
+ base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
}
}
@@ -836,8 +838,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
if (!timer_pending(timer))
return 0;
- if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+ if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
__clear_bit(idx, base->pending_map);
+ base->next_expiry_recalc = true;
+ }
detach_timer(timer, clear_pending);
return 1;
@@ -1571,6 +1575,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk >>= LVL_CLK_SHIFT;
clk += adj;
}
+
+ base->next_expiry_recalc = false;
+
return next;
}
@@ -1631,9 +1638,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
return expires;
raw_spin_lock(&base->lock);
- nextevt = __next_timer_interrupt(base);
+ if (base->next_expiry_recalc)
+ base->next_expiry = __next_timer_interrupt(base);
+ nextevt = base->next_expiry;
is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
- base->next_expiry = nextevt;
+
/*
* We have a fresh next event. Check whether we can forward the
* base. We can only do that when @basej is past base->clk
@@ -1725,6 +1734,12 @@ static inline void __run_timers(struct timer_base *base)
while (time_after_eq(jiffies, base->clk) &&
time_after_eq(jiffies, base->next_expiry)) {
levels = collect_expired_timers(base, heads);
+ /*
+ * The only possible reason for not finding any expired
+ * timer at this clk is that all matching timers have been
+ * dequeued.
+ */
+ WARN_ON_ONCE(!levels && !base->next_expiry_recalc);
base->clk++;
base->next_expiry = __next_timer_interrupt(base);
Hi,
Ever since this commit merged in, when nohz_full enabled, the counts of arch_timer interrupt on arm64 arches keep increasing on cores that have been isolated. This can be reproduced on several arm64 boards. After reverting the commit, the counts would stop increasing after boot. my .config is attached.
root@qemuarm64:~# uname -a
Linux qemuarm64 5.13.0 #1 SMP PREEMPT Mon Jul 5 07:11:27 UTC 2021 aarch64 aarch64 aarch64 GNU/Linux
root@qemuarm64:~# cat /proc/cmdline
root=/dev/vda rw mem=2048M ip=dhcp console=ttyAMA0 console=hvc0 earlyprintk isolcpus=1-5 nohz_full=1-5 rcu_nocbs=1-5
root@qemuarm64:~# cat /proc/interrupts
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5
11: 12396 326 325 323 320 321 GIC-0 27 Level arch_timer
Zhe
On Thu, Jul 08, 2021 at 02:43:01PM +0800, He Zhe wrote:
> Hi,
>
> Ever since this commit merged in, when nohz_full enabled, the counts of arch_timer interrupt on arm64 arches keep increasing on cores that have been isolated. This can be reproduced on several arm64 boards. After reverting the commit, the counts would stop increasing after boot. my .config is attached.
>
> root@qemuarm64:~# uname -a
> Linux qemuarm64 5.13.0 #1 SMP PREEMPT Mon Jul 5 07:11:27 UTC 2021 aarch64 aarch64 aarch64 GNU/Linux
> root@qemuarm64:~# cat /proc/cmdline
> root=/dev/vda rw mem=2048M ip=dhcp console=ttyAMA0 console=hvc0 earlyprintk isolcpus=1-5 nohz_full=1-5 rcu_nocbs=1-5
> root@qemuarm64:~# cat /proc/interrupts
> CPU0 CPU1 CPU2 CPU3 CPU4 CPU5
> 12396 326 325 323 320 321 GIC-0� 27 Level���� arch_timer
Strange, I'm not observing that on a raspberry 3b+ (arm64 defconfig):
# cat /proc/cmdline
console=tty0 console=ttyS1,115200 root=/dev/sda2 rw fsck.repair=yes net.ifnames=0 cma=64M rootwait isolcpus=1-3 nohz_full=1-3
# uname -a
Linux rpi3 5.13.0 #3 SMP PREEMPT Thu Jul 8 13:08:39 CEST 2021 aarch64 GNU/Linux
# cat /proc/interrupts
CPU0 CPU1 CPU2 CPU3
108: 165376 25 25 25 bcm2836-timer 1 Edge arch_timer
But let's see if I can successfully boot your own config...
On Thu, Jul 08, 2021 at 02:43:01PM +0800, He Zhe wrote:
> Hi,
>
> Ever since this commit merged in, when nohz_full enabled, the counts of arch_timer interrupt on arm64 arches keep increasing on cores that have been isolated. This can be reproduced on several arm64 boards. After reverting the commit, the counts would stop increasing after boot. my .config is attached.
>
> root@qemuarm64:~# uname -a
> Linux qemuarm64 5.13.0 #1 SMP PREEMPT Mon Jul 5 07:11:27 UTC 2021 aarch64 aarch64 aarch64 GNU/Linux
> root@qemuarm64:~# cat /proc/cmdline
> root=/dev/vda rw? mem=2048M ip=dhcp console=ttyAMA0 console=hvc0? earlyprintk isolcpus=1-5 nohz_full=1-5 rcu_nocbs=1-5
> root@qemuarm64:~# cat /proc/interrupts
And I'm not observing that on default aarch64 on qemu either.
Are you emulating a specific machine?
Can you enable the following trace events and send me the output from
one of the isolated CPU trace, say CPU 3 for example:
DIR=/sys/kernel/debug/tracing
echo 1 > $DIR/events/irq/enable
echo 1 > $DIR/events/sched/sched_switch/enable
echo 1 > $DIR/events/irq_vectors/enable
echo 1 > $DIR/events/workqueue/workqueue_execute_start/enable
echo 1 > $DIR/events/timer/hrtimer_expire_entry/enable
echo 1 > $DIR/events/timer/timer_expire_entry/enable
echo 1 > $DIR/events/timer/tick_stop/enable
sleep 10
cat $DIR/per_cpu/cpu0/trace > ~/output_to_send
On 7/8/21 11:36 PM, Frederic Weisbecker wrote:
> On Thu, Jul 08, 2021 at 02:43:01PM +0800, He Zhe wrote:
>> Hi,
>>
>> Ever since this commit merged in, when nohz_full enabled, the counts of arch_timer interrupt on arm64 arches keep increasing on cores that have been isolated. This can be reproduced on several arm64 boards. After reverting the commit, the counts would stop increasing after boot. my .config is attached.
>>
>> root@qemuarm64:~# uname -a
>> Linux qemuarm64 5.13.0 #1 SMP PREEMPT Mon Jul 5 07:11:27 UTC 2021 aarch64 aarch64 aarch64 GNU/Linux
>> root@qemuarm64:~# cat /proc/cmdline
>> root=/dev/vda rw mem=2048M ip=dhcp console=ttyAMA0 console=hvc0 earlyprintk isolcpus=1-5 nohz_full=1-5 rcu_nocbs=1-5
>> root@qemuarm64:~# cat /proc/interrupts
> And I'm not observing that on default aarch64 on qemu either.
> Are you emulating a specific machine?
Here is my qemu configuration.
qemu-system-aarch64 --version
QEMU emulator version 6.0.0
Copyright (c) 2003-2021 Fabrice Bellard and the QEMU Project developers
qemu-system-aarch64 -device virtio-net-device,netdev=net0,mac=52:54:00:12:35:02 -netdev user,id=net0,hostfwd=tcp::2222-:22,hostfwd=tcp::2323-:23,tftp=/qemuarm64 -object rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0 -drive id=disk0,file=/qemuarm64/qemuarm64.rootfs.ext4,if=none,format=raw -device virtio-blk-device,drive=disk0 -device qemu-xhci -device usb-tablet -device usb-kbd -machine virt -cpu cortex-a57 -smp 4 -m 2048 -smp 6 -m 2048 -serial mon:stdio -serial null -nographic -device VGA,edid=on -kernel /qemuarm64/Image.bin -append 'root=/dev/vda rw mem=2048M ip=dhcp console=ttyAMA0 console=hvc0 earlyprintk isolcpus=1-5 nohz_full=1-5 rcu_nocbs=1-5'
>
> Can you enable the following trace events and send me the output from
> one of the isolated CPU trace, say CPU 3 for example:
output_to_send is attached.
I can confirm that during the sleep the count of arch_timer increases one on each isolated core.
Thanks,
Zhe
>
>
> DIR=/sys/kernel/debug/tracing
>
> echo 1 > $DIR/events/irq/enable
> echo 1 > $DIR/events/sched/sched_switch/enable
> echo 1 > $DIR/events/irq_vectors/enable
> echo 1 > $DIR/events/workqueue/workqueue_execute_start/enable
> echo 1 > $DIR/events/timer/hrtimer_expire_entry/enable
> echo 1 > $DIR/events/timer/timer_expire_entry/enable
> echo 1 > $DIR/events/timer/tick_stop/enable
> sleep 10
> cat $DIR/per_cpu/cpu0/trace > ~/output_to_send
On Fri, Jul 09, 2021 at 01:37:11PM +0800, He Zhe wrote:
>
>
> On 7/8/21 11:36 PM, Frederic Weisbecker wrote:
> > On Thu, Jul 08, 2021 at 02:43:01PM +0800, He Zhe wrote:
> >> Hi,
> >>
> >> Ever since this commit merged in, when nohz_full enabled, the counts of arch_timer interrupt on arm64 arches keep increasing on cores that have been isolated. This can be reproduced on several arm64 boards. After reverting the commit, the counts would stop increasing after boot. my .config is attached.
> >>
> >> root@qemuarm64:~# uname -a
> >> Linux qemuarm64 5.13.0 #1 SMP PREEMPT Mon Jul 5 07:11:27 UTC 2021 aarch64 aarch64 aarch64 GNU/Linux
> >> root@qemuarm64:~# cat /proc/cmdline
> >> root=/dev/vda rw? mem=2048M ip=dhcp console=ttyAMA0 console=hvc0? earlyprintk isolcpus=1-5 nohz_full=1-5 rcu_nocbs=1-5
> >> root@qemuarm64:~# cat /proc/interrupts
> > And I'm not observing that on default aarch64 on qemu either.
> > Are you emulating a specific machine?
>
> Here is my qemu configuration.
>
> qemu-system-aarch64 --version
> QEMU emulator version 6.0.0
> Copyright (c) 2003-2021 Fabrice Bellard and the QEMU Project developers
>
> qemu-system-aarch64 -device virtio-net-device,netdev=net0,mac=52:54:00:12:35:02 -netdev user,id=net0,hostfwd=tcp::2222-:22,hostfwd=tcp::2323-:23,tftp=/qemuarm64 -object rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0 -drive id=disk0,file=/qemuarm64/qemuarm64.rootfs.ext4,if=none,format=raw -device virtio-blk-device,drive=disk0 -device qemu-xhci -device usb-tablet -device usb-kbd? -machine virt -cpu cortex-a57 -smp 4 -m 2048? -smp 6 -m 2048 -serial mon:stdio -serial null -nographic -device VGA,edid=on -kernel /qemuarm64/Image.bin -append 'root=/dev/vda rw? mem=2048M ip=dhcp console=ttyAMA0 console=hvc0 earlyprintk isolcpus=1-5 nohz_full=1-5 rcu_nocbs=1-5'
>
> >
> > Can you enable the following trace events and send me the output from
> > one of the isolated CPU trace, say CPU 3 for example:
>
> output_to_send is attached.
> I can confirm that during the sleep the count of arch_timer increases one on
> each isolated core.
Oh that's the trace from CPU 0, precisely the only one I don't need :o)
It's my fault, the last line of the script should have been:
cat $DIR/per_cpu/cpu3/trace > ~/output_to_send
Sorry...
On 7/9/21 4:43 PM, Frederic Weisbecker wrote:
> On Fri, Jul 09, 2021 at 01:37:11PM +0800, He Zhe wrote:
>>
>> On 7/8/21 11:36 PM, Frederic Weisbecker wrote:
>>> On Thu, Jul 08, 2021 at 02:43:01PM +0800, He Zhe wrote:
>>>> Hi,
>>>>
>>>> Ever since this commit merged in, when nohz_full enabled, the counts of arch_timer interrupt on arm64 arches keep increasing on cores that have been isolated. This can be reproduced on several arm64 boards. After reverting the commit, the counts would stop increasing after boot. my .config is attached.
>>>>
>>>> root@qemuarm64:~# uname -a
>>>> Linux qemuarm64 5.13.0 #1 SMP PREEMPT Mon Jul 5 07:11:27 UTC 2021 aarch64 aarch64 aarch64 GNU/Linux
>>>> root@qemuarm64:~# cat /proc/cmdline
>>>> root=/dev/vda rw mem=2048M ip=dhcp console=ttyAMA0 console=hvc0 earlyprintk isolcpus=1-5 nohz_full=1-5 rcu_nocbs=1-5
>>>> root@qemuarm64:~# cat /proc/interrupts
>>> And I'm not observing that on default aarch64 on qemu either.
>>> Are you emulating a specific machine?
>> Here is my qemu configuration.
>>
>> qemu-system-aarch64 --version
>> QEMU emulator version 6.0.0
>> Copyright (c) 2003-2021 Fabrice Bellard and the QEMU Project developers
>>
>> qemu-system-aarch64 -device virtio-net-device,netdev=net0,mac=52:54:00:12:35:02 -netdev user,id=net0,hostfwd=tcp::2222-:22,hostfwd=tcp::2323-:23,tftp=/qemuarm64 -object rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0 -drive id=disk0,file=/qemuarm64/qemuarm64.rootfs.ext4,if=none,format=raw -device virtio-blk-device,drive=disk0 -device qemu-xhci -device usb-tablet -device usb-kbd -machine virt -cpu cortex-a57 -smp 4 -m 2048 -smp 6 -m 2048 -serial mon:stdio -serial null -nographic -device VGA,edid=on -kernel /qemuarm64/Image.bin -append 'root=/dev/vda rw mem=2048M ip=dhcp console=ttyAMA0 console=hvc0 earlyprintk isolcpus=1-5 nohz_full=1-5 rcu_nocbs=1-5'
>>
>>> Can you enable the following trace events and send me the output from
>>> one of the isolated CPU trace, say CPU 3 for example:
>> output_to_send is attached.
>> I can confirm that during the sleep the count of arch_timer increases one on
>> each isolated core.
> Oh that's the trace from CPU 0, precisely the only one I don't need :o)
>
> It's my fault, the last line of the script should have been:
>
> cat $DIR/per_cpu/cpu3/trace > ~/output_to_send
>
> Sorry...
No problem. I didn't notice that either...
Then here is the result of cpu3
root@qemuarm64:~# cat output_to_send
# tracer: nop
#
# entries-in-buffer/entries-written: 19704/19704 #P:6
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
<idle>-0 [003] d.h1 346.216802: irq_handler_entry: irq=11 name=arch_timer
<idle>-0 [003] d.h1 346.216862: irq_handler_exit: irq=11 ret=handled
Zhe
Hi Frederic, Zhe,
I've stumbled upon an issue on my x86_64 nohz_full setups which looks a lot
like Zhe's.
I've got a potential fix, I'll post it as a reply to this mail.
Regards,
Nicolas
31cd0e119d50 ("timers: Recalculate next timer interrupt only when
necessary") subtly altered get_next_timer_interrupt()'s behaviour. The
function no longer consistently returns KTIME_MAX with no timers
pending.
In order to decide if there are any timers pending we check whether the
next expiry will happen NEXT_TIMER_MAX_DELTA jiffies from now.
Unfortunately, the next expiry time and the timer base clock are no
longer updated in unison. The former changes upon certain timer
operations (enqueue, expire, detach), whereas the latter keeps track of
jiffies as they move forward. Ultimately breaking the logic above.
A simplified example:
- Upon entering get_next_timer_interrupt() with:
jiffies = 1
base->clk = 0;
base->next_expiry = NEXT_TIMER_MAX_DELTA;
'base->next_expiry == base->clk + NEXT_TIMER_MAX_DELTA', the function
returns KTIME_MAX.
- 'base->clk' is updated to the jiffies value.
- The next time we enter get_next_timer_interrupt(), taking into account
no timer operations happened:
base->clk = 1;
base->next_expiry = NEXT_TIMER_MAX_DELTA;
'base->next_expiry != base->clk + NEXT_TIMER_MAX_DELTA', the function
returns a valid expire time, which is incorrect.
This ultimately might unnecessarily rearm sched's timer on nohz_full
setups, and add latency to the system[1].
So, introduce 'base->timers_pending'[2], update it every time
'base->next_expiry' changes, and use it in get_next_timer_interrupt().
[1] See tick_nohz_stop_tick().
[2] A quick pahole check on x86_64 and arm64 shows it doesn't make
'struct timer_base' any bigger.
Fixes: 31cd0e119d50 ("timers: Recalculate next timer interrupt only when necessary")
Signed-off-by: Nicolas Saenz Julienne <[email protected]>
---
kernel/time/timer.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 47e5c39b005d..830a9016e0ec 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -207,6 +207,7 @@ struct timer_base {
unsigned int cpu;
bool next_expiry_recalc;
bool is_idle;
+ bool timers_pending;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -595,6 +596,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* can reevaluate the wheel:
*/
base->next_expiry = bucket_expiry;
+ base->timers_pending = true;
base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
}
@@ -1598,6 +1600,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
}
base->next_expiry_recalc = false;
+ base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
return next;
}
@@ -1649,7 +1652,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
- bool is_max_delta;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1662,7 +1664,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
if (base->next_expiry_recalc)
base->next_expiry = __next_timer_interrupt(base);
nextevt = base->next_expiry;
- is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
/*
* We have a fresh next event. Check whether we can forward the
@@ -1680,7 +1681,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
expires = basem;
base->is_idle = false;
} else {
- if (!is_max_delta)
+ if (base->timers_pending)
expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
* If we expect to sleep more than a tick, mark the base idle.
@@ -1970,6 +1971,7 @@ int timers_prepare_cpu(unsigned int cpu)
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->timers_pending = false;
base->is_idle = false;
}
return 0;
--
2.31.1
On Fri, Jul 09, 2021 at 04:13:25PM +0200, Nicolas Saenz Julienne wrote:
> 31cd0e119d50 ("timers: Recalculate next timer interrupt only when
> necessary") subtly altered get_next_timer_interrupt()'s behaviour. The
> function no longer consistently returns KTIME_MAX with no timers
> pending.
>
> In order to decide if there are any timers pending we check whether the
> next expiry will happen NEXT_TIMER_MAX_DELTA jiffies from now.
> Unfortunately, the next expiry time and the timer base clock are no
> longer updated in unison. The former changes upon certain timer
> operations (enqueue, expire, detach), whereas the latter keeps track of
> jiffies as they move forward. Ultimately breaking the logic above.
>
> A simplified example:
>
> - Upon entering get_next_timer_interrupt() with:
>
> jiffies = 1
> base->clk = 0;
> base->next_expiry = NEXT_TIMER_MAX_DELTA;
>
> 'base->next_expiry == base->clk + NEXT_TIMER_MAX_DELTA', the function
> returns KTIME_MAX.
>
> - 'base->clk' is updated to the jiffies value.
>
> - The next time we enter get_next_timer_interrupt(), taking into account
> no timer operations happened:
>
> base->clk = 1;
> base->next_expiry = NEXT_TIMER_MAX_DELTA;
>
> 'base->next_expiry != base->clk + NEXT_TIMER_MAX_DELTA', the function
> returns a valid expire time, which is incorrect.
>
> This ultimately might unnecessarily rearm sched's timer on nohz_full
> setups, and add latency to the system[1].
>
> So, introduce 'base->timers_pending'[2], update it every time
> 'base->next_expiry' changes, and use it in get_next_timer_interrupt().
>
> [1] See tick_nohz_stop_tick().
> [2] A quick pahole check on x86_64 and arm64 shows it doesn't make
> 'struct timer_base' any bigger.
>
> Fixes: 31cd0e119d50 ("timers: Recalculate next timer interrupt only when necessary")
> Signed-off-by: Nicolas Saenz Julienne <[email protected]>
Very good catch.
And the fix looks good:
Acked-by: Frederic Weisbecker <[email protected]>
I guess later we can turn this .timers_pending into
.timers_count and that would spare us the costly call to
__next_timer_interrupt() up to the last level after the last
timer is dequeued.
Anyway, thanks a lot!
On Fri, Jul 09, 2021 at 04:13:25PM +0200, Nicolas Saenz Julienne wrote:
> 31cd0e119d50 ("timers: Recalculate next timer interrupt only when
> necessary") subtly altered get_next_timer_interrupt()'s behaviour. The
> function no longer consistently returns KTIME_MAX with no timers
> pending.
>
> In order to decide if there are any timers pending we check whether the
> next expiry will happen NEXT_TIMER_MAX_DELTA jiffies from now.
> Unfortunately, the next expiry time and the timer base clock are no
> longer updated in unison. The former changes upon certain timer
> operations (enqueue, expire, detach), whereas the latter keeps track of
> jiffies as they move forward. Ultimately breaking the logic above.
>
> A simplified example:
>
> - Upon entering get_next_timer_interrupt() with:
>
> jiffies = 1
> base->clk = 0;
> base->next_expiry = NEXT_TIMER_MAX_DELTA;
>
> 'base->next_expiry == base->clk + NEXT_TIMER_MAX_DELTA', the function
> returns KTIME_MAX.
>
> - 'base->clk' is updated to the jiffies value.
>
> - The next time we enter get_next_timer_interrupt(), taking into account
> no timer operations happened:
>
> base->clk = 1;
> base->next_expiry = NEXT_TIMER_MAX_DELTA;
>
> 'base->next_expiry != base->clk + NEXT_TIMER_MAX_DELTA', the function
> returns a valid expire time, which is incorrect.
>
> This ultimately might unnecessarily rearm sched's timer on nohz_full
> setups, and add latency to the system[1].
>
> So, introduce 'base->timers_pending'[2], update it every time
> 'base->next_expiry' changes, and use it in get_next_timer_interrupt().
>
> [1] See tick_nohz_stop_tick().
> [2] A quick pahole check on x86_64 and arm64 shows it doesn't make
> 'struct timer_base' any bigger.
>
> Fixes: 31cd0e119d50 ("timers: Recalculate next timer interrupt only when necessary")
> Signed-off-by: Nicolas Saenz Julienne <[email protected]>
He Zhe, does it fix your issue?
Thanks.
On 7/10/21 5:05 PM, Frederic Weisbecker wrote:
> On Fri, Jul 09, 2021 at 04:13:25PM +0200, Nicolas Saenz Julienne wrote:
>> 31cd0e119d50 ("timers: Recalculate next timer interrupt only when
>> necessary") subtly altered get_next_timer_interrupt()'s behaviour. The
>> function no longer consistently returns KTIME_MAX with no timers
>> pending.
>>
>> In order to decide if there are any timers pending we check whether the
>> next expiry will happen NEXT_TIMER_MAX_DELTA jiffies from now.
>> Unfortunately, the next expiry time and the timer base clock are no
>> longer updated in unison. The former changes upon certain timer
>> operations (enqueue, expire, detach), whereas the latter keeps track of
>> jiffies as they move forward. Ultimately breaking the logic above.
>>
>> A simplified example:
>>
>> - Upon entering get_next_timer_interrupt() with:
>>
>> jiffies = 1
>> base->clk = 0;
>> base->next_expiry = NEXT_TIMER_MAX_DELTA;
>>
>> 'base->next_expiry == base->clk + NEXT_TIMER_MAX_DELTA', the function
>> returns KTIME_MAX.
>>
>> - 'base->clk' is updated to the jiffies value.
>>
>> - The next time we enter get_next_timer_interrupt(), taking into account
>> no timer operations happened:
>>
>> base->clk = 1;
>> base->next_expiry = NEXT_TIMER_MAX_DELTA;
>>
>> 'base->next_expiry != base->clk + NEXT_TIMER_MAX_DELTA', the function
>> returns a valid expire time, which is incorrect.
>>
>> This ultimately might unnecessarily rearm sched's timer on nohz_full
>> setups, and add latency to the system[1].
>>
>> So, introduce 'base->timers_pending'[2], update it every time
>> 'base->next_expiry' changes, and use it in get_next_timer_interrupt().
>>
>> [1] See tick_nohz_stop_tick().
>> [2] A quick pahole check on x86_64 and arm64 shows it doesn't make
>> 'struct timer_base' any bigger.
>>
>> Fixes: 31cd0e119d50 ("timers: Recalculate next timer interrupt only when necessary")
>> Signed-off-by: Nicolas Saenz Julienne <[email protected]>
> He Zhe, does it fix your issue?
Yes, this fixes my issue. Thank you all.
Zhe
>
> Thanks.
On Sat, 2021-07-10 at 02:52 +0200, Frederic Weisbecker wrote:
> Very good catch.
>
> And the fix looks good:
>
> Acked-by: Frederic Weisbecker <[email protected]>
Thanks!
>
> I guess later we can turn this .timers_pending into
> .timers_count and that would spare us the costly call to
> __next_timer_interrupt() up to the last level after the last
> timer is dequeued.
Sound like something manageable. I'll probably have a go at it in the future.
--
Nicolás Sáenz
On Sat, 2021-07-10 at 02:52 +0200, Frederic Weisbecker wrote:
> I guess later we can turn this .timers_pending into
> .timers_count and that would spare us the costly call to
> __next_timer_interrupt() up to the last level after the last
> timer is dequeued.
I've been looking into this. AFAIU there is no limit to the number of timers
one might enqueue, so there is no fool proof way of selecting .timers_count's
size. That said, 'struct timer_list' size is 40 bytes (as per pahole), so in
order to overflow an u32 .timers_count you'd need to allocate ~160GB in 'struct
timer_list' which I think is safe to assume will never happen.
Also, I measured the costy call to __next_timer_interrupt() it's slightly less
than 1us on my test machine. Not a that big in the grand scheme of things, but
it's in the irq exit code path, so I think it's worth the extra complexity in
the timer code.
Any thoughs?
--
Nicolás Sáenz
On Fri, Jul 16, 2021 at 06:38:37PM +0200, Nicolas Saenz Julienne wrote:
> On Sat, 2021-07-10 at 02:52 +0200, Frederic Weisbecker wrote:
> > I guess later we can turn this .timers_pending into
> > .timers_count and that would spare us the costly call to
> > __next_timer_interrupt() up to the last level after the last
> > timer is dequeued.
>
> I've been looking into this. AFAIU there is no limit to the number of timers
> one might enqueue, so there is no fool proof way of selecting .timers_count's
> size. That said, 'struct timer_list' size is 40 bytes (as per pahole), so in
> order to overflow an u32 .timers_count you'd need to allocate ~160GB in 'struct
> timer_list' which I think is safe to assume will never happen.
>
> Also, I measured the costy call to __next_timer_interrupt() it's slightly less
> than 1us on my test machine. Not a that big in the grand scheme of things, but
> it's in the irq exit code path, so I think it's worth the extra complexity in
> the timer code.
And also each time we iterate the idle loop. In fact __next_timer_interrupt()
won't always have the same cost: the worst case is when the wheel is entirely
empty after the last removal and we need to walk through all 9 levels. It's
a pretty common case because it happens when the last timer expires.
And that's the only one case to measure because it's the only one covered
by the counter.
>
> Any thoughs?
>
> --
> Nicol?s S?enz
>