Juri Lelli has reported that threaded IRQs don't go along well with
full dynticks.
The tick never manages to stop due to the following endless loop:
* Timer tick raises timer softirq
* Ksoftirqd wakes up, restarts the tick (2 tasks in the runqueue)
* Timer tick raises timer softirq
* Ksoftirqd wakes up, etc...
The main issue here is that the tick fires the timer softirq unconditionally,
whether timers have expired or not. This set is a proposal to reduce that.
The 1st patch is actually a bugfix for a theoretical issue that I haven't
observed in practice. But who knows?
Patch 2 is a consolidation.
Patch 3 and 4 are optimizations.
The rest is about timer softirqs.
git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks.git
timers/softirq
HEAD: d0567dd5546d1f32eca3772a431488f8b0ac26a1
Thanks,
Frederic
---
Frederic Weisbecker (10):
timer: Prevent base->clk from moving backward
timer: Move trigger_dyntick_cpu() to enqueue_timer()
timer: Simplify LVL_START() and calc_index()
timer: Optimize _next_timer_interrupt() level iteration
timers: Always keep track of next expiry
timer: Reuse next expiry cache after nohz exit
timer: Expand clk forward logic beyond nohz
timer: Spare timer softirq until next expiry
timer: Remove must_forward_clk
timer: Lower base clock forwarding threshold
kernel/time/timer.c | 179 +++++++++++++++++++---------------------------------
1 file changed, 64 insertions(+), 115 deletions(-)
Consolidate the code by calling trigger_dyntick_cpu() from
enqueue_timer() instead of calling it from all its callers.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 46 +++++++++++++++++++--------------------------
1 file changed, 19 insertions(+), 27 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 9a838d38dbe6..4c977df3610b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -529,29 +529,6 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk)
return idx;
}
-/*
- * Enqueue the timer into the hash bucket, mark it pending in
- * the bitmap and store the index in the timer flags.
- */
-static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
- unsigned int idx)
-{
- hlist_add_head(&timer->entry, base->vectors + idx);
- __set_bit(idx, base->pending_map);
- timer_set_idx(timer, idx);
-
- trace_timer_start(timer, timer->expires, timer->flags);
-}
-
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
-{
- unsigned int idx;
-
- idx = calc_wheel_index(timer->expires, base->clk);
- enqueue_timer(base, timer, idx);
-}
-
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
@@ -596,13 +573,29 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
wake_up_nohz_cpu(base->cpu);
}
-static void
-internal_add_timer(struct timer_base *base, struct timer_list *timer)
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap and store the index in the timer flags.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+ unsigned int idx)
{
- __internal_add_timer(base, timer);
+ hlist_add_head(&timer->entry, base->vectors + idx);
+ __set_bit(idx, base->pending_map);
+ timer_set_idx(timer, idx);
+
+ trace_timer_start(timer, timer->expires, timer->flags);
trigger_dyntick_cpu(base, timer);
}
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
+{
+ unsigned int idx;
+
+ idx = calc_wheel_index(timer->expires, base->clk);
+ enqueue_timer(base, timer, idx);
+}
+
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
static struct debug_obj_descr timer_debug_descr;
@@ -1059,7 +1052,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
*/
if (idx != UINT_MAX && clk == base->clk) {
enqueue_timer(base, timer, idx);
- trigger_dyntick_cpu(base, timer);
} else {
internal_add_timer(base, timer);
}
--
2.26.2
So far the next expiry was only tracked while the CPU was in nohz_idle mode,
in order to cope with missing ticks that can no longer increment base->clk
periodically.
We are going to expand that logic beyond nohz in order to spare timer
softirqs, so do it unconditionally.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 35 ++++++++++++++++++-----------------
1 file changed, 18 insertions(+), 17 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4aa74aafdd33..30f491ec875a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -550,8 +550,22 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
* timer is not deferrable. If the other CPU is on the way to idle
* then it can't set base->is_idle as we hold the base lock:
*/
- if (!base->is_idle)
- return;
+ if (base->is_idle)
+ wake_up_nohz_cpu(base->cpu);
+}
+
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap and store the index in the timer flags.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+ unsigned int idx)
+{
+ hlist_add_head(&timer->entry, base->vectors + idx);
+ __set_bit(idx, base->pending_map);
+ timer_set_idx(timer, idx);
+
+ trace_timer_start(timer, timer->expires, timer->flags);
/* Check whether this is the new first expiring timer: */
if (time_after_eq(timer->expires, base->next_expiry))
@@ -570,21 +584,7 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
} else {
base->next_expiry = timer->expires;
}
- wake_up_nohz_cpu(base->cpu);
-}
-/*
- * Enqueue the timer into the hash bucket, mark it pending in
- * the bitmap and store the index in the timer flags.
- */
-static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
- unsigned int idx)
-{
- hlist_add_head(&timer->entry, base->vectors + idx);
- __set_bit(idx, base->pending_map);
- timer_set_idx(timer, idx);
-
- trace_timer_start(timer, timer->expires, timer->flags);
trigger_dyntick_cpu(base, timer);
}
@@ -1482,7 +1482,6 @@ static int __collect_expired_timers(struct timer_base *base,
return levels;
}
-#ifdef CONFIG_NO_HZ_COMMON
/*
* Find the next pending bucket of a level. Search from level start (@offset)
* + @clk upwards and if nothing there, search from start of the level
@@ -1574,6 +1573,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
return next;
}
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Check, if the next hrtimer event is before the next timer wheel
* event:
@@ -1779,6 +1779,7 @@ static inline void __run_timers(struct timer_base *base)
levels = collect_expired_timers(base, heads);
base->clk++;
+ base->next_expiry = __next_timer_interrupt(base);
while (levels--)
expire_timers(base, heads + levels);
--
2.26.2
As for next_expiry, we want to expand base->clk catch up logic beyond
nohz in order to avoid triggering useless softirqs.
If we want to fire softirqs only to expire pending timers, periodic
base->clk increments must be skippable for random amounts of time.
Therefore we must prepare to catch-up with missing updates whenever we
need an up-to-date base clock.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 26 ++++----------------------
1 file changed, 4 insertions(+), 22 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4612d26fd09a..6c021be0e76f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -877,19 +877,12 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
-#ifdef CONFIG_NO_HZ_COMMON
unsigned long jnow;
- /*
- * We only forward the base when we are idle or have just come out of
- * idle (must_forward_clk logic), and have a delta between base clock
- * and jiffies. In the common case, run_timers will take care of it.
- */
- if (likely(!base->must_forward_clk))
+ if (!base->must_forward_clk)
return;
jnow = READ_ONCE(jiffies);
- base->must_forward_clk = base->is_idle;
if ((long)(jnow - base->clk) < 2)
return;
@@ -904,7 +897,6 @@ static inline void forward_timer_base(struct timer_base *base)
return;
base->clk = base->next_expiry;
}
-#endif
}
@@ -1658,10 +1650,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
* logic is only maintained for the BASE_STD base, deferrable
* timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC) {
- base->must_forward_clk = true;
+ if ((expires - basem) > TICK_NSEC)
base->is_idle = true;
- }
}
raw_spin_unlock(&base->lock);
@@ -1760,16 +1750,7 @@ static inline void __run_timers(struct timer_base *base)
/*
* timer_base::must_forward_clk must be cleared before running
* timers so that any timer functions that call mod_timer() will
- * not try to forward the base. Idle tracking / clock forwarding
- * logic is only used with BASE_STD timers.
- *
- * The must_forward_clk flag is cleared unconditionally also for
- * the deferrable base. The deferrable base is not affected by idle
- * tracking and never forwarded, so clearing the flag is a NOOP.
- *
- * The fact that the deferrable base is never forwarded can cause
- * large variations in granularity for deferrable timers, but they
- * can be deferred for long periods due to idle anyway.
+ * not try to forward the base.
*/
base->must_forward_clk = false;
@@ -1782,6 +1763,7 @@ static inline void __run_timers(struct timer_base *base)
while (levels--)
expire_timers(base, heads + levels);
}
+ base->must_forward_clk = true;
raw_spin_unlock_irq(&base->lock);
timer_base_unlock_expiry(base);
}
--
2.26.2
There is no apparent reason for not forwarding base->clk when it's 2
jiffies late, except perhaps for past optimizations. But since forwarding
has to be done at some point now anyway, this doesn't stand anymore.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 439fee098e76..25a55c043297 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -883,7 +883,7 @@ static inline void forward_timer_base(struct timer_base *base)
* Also while executing timers, base->clk is 1 offset ahead
* of jiffies to avoid endless requeuing to current jiffies.
*/
- if ((long)(jnow - base->clk) < 2)
+ if ((long)(jnow - base->clk) < 1)
return;
/*
--
2.26.2
There is no reason to keep this guard around. The code makes sure that
base->clk remains sane and won't be forwarded beyond jiffies nor set
backward.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 22 ++++++----------------
1 file changed, 6 insertions(+), 16 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 95f51b646447..439fee098e76 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -204,7 +204,6 @@ struct timer_base {
unsigned long next_expiry;
unsigned int cpu;
bool is_idle;
- bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -877,12 +876,13 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
- unsigned long jnow;
+ unsigned long jnow = READ_ONCE(jiffies);
- if (!base->must_forward_clk)
- return;
-
- jnow = READ_ONCE(jiffies);
+ /*
+ * No need to forward if we are close enough below jiffies.
+ * Also while executing timers, base->clk is 1 offset ahead
+ * of jiffies to avoid endless requeuing to current jiffies.
+ */
if ((long)(jnow - base->clk) < 2)
return;
@@ -1713,16 +1713,8 @@ static inline void __run_timers(struct timer_base *base)
timer_base_lock_expiry(base);
raw_spin_lock_irq(&base->lock);
- /*
- * timer_base::must_forward_clk must be cleared before running
- * timers so that any timer functions that call mod_timer() will
- * not try to forward the base.
- */
- base->must_forward_clk = false;
-
while (time_after_eq(jiffies, base->clk) &&
time_after_eq(jiffies, base->next_expiry)) {
-
levels = collect_expired_timers(base, heads);
base->clk++;
base->next_expiry = __next_timer_interrupt(base);
@@ -1730,7 +1722,6 @@ static inline void __run_timers(struct timer_base *base)
while (levels--)
expire_timers(base, heads + levels);
}
- base->must_forward_clk = true;
raw_spin_unlock_irq(&base->lock);
timer_base_unlock_expiry(base);
}
@@ -1926,7 +1917,6 @@ int timers_prepare_cpu(unsigned int cpu)
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
base->is_idle = false;
- base->must_forward_clk = true;
}
return 0;
}
--
2.26.2
Now that the core timer infrastructure doesn't depend anymore on
periodic base->clk increments, even when the CPU is not in NO_HZ mode,
we can delay the timer softirqs until we have actual timers to expire.
Some spurious softirqs can still remain since base->next_expiry doesn't
keep track of canceled timers but we are still way ahead of the
unconditional periodic softirqs (~15 times less of them with 1000 Hz
and ~5 times less with 100 Hz).
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 49 ++++++++-------------------------------------
1 file changed, 8 insertions(+), 41 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 6c021be0e76f..95f51b646447 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1449,10 +1449,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
}
}
-static int __collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
+static int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
{
- unsigned long clk = base->clk;
+ unsigned long clk = base->clk = base->next_expiry;
struct hlist_head *vec;
int i, levels = 0;
unsigned int idx;
@@ -1675,40 +1675,6 @@ void timer_clear_idle(void)
*/
base->is_idle = false;
}
-
-static int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- unsigned long now = READ_ONCE(jiffies);
-
- /*
- * NOHZ optimization. After a long idle sleep we need to forward the
- * base to current jiffies. Avoid a loop by searching the bitfield for
- * the next expiring timer.
- */
- if ((long)(now - base->clk) > 2) {
- /*
- * If the next timer is ahead of time forward to current
- * jiffies, otherwise forward to the next expiry time:
- */
- if (time_after(base->next_expiry, now)) {
- /*
- * The call site will increment base->clk and then
- * terminate the expiry loop immediately.
- */
- base->clk = now;
- return 0;
- }
- base->clk = base->next_expiry;
- }
- return __collect_expired_timers(base, heads);
-}
-#else
-static inline int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- return __collect_expired_timers(base, heads);
-}
#endif
/*
@@ -1741,7 +1707,7 @@ static inline void __run_timers(struct timer_base *base)
struct hlist_head heads[LVL_DEPTH];
int levels;
- if (!time_after_eq(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
timer_base_lock_expiry(base);
@@ -1754,7 +1720,8 @@ static inline void __run_timers(struct timer_base *base)
*/
base->must_forward_clk = false;
- while (time_after_eq(jiffies, base->clk)) {
+ while (time_after_eq(jiffies, base->clk) &&
+ time_after_eq(jiffies, base->next_expiry)) {
levels = collect_expired_timers(base, heads);
base->clk++;
@@ -1789,12 +1756,12 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
- if (time_before(jiffies, base->clk)) {
+ if (time_before(jiffies, base->next_expiry)) {
if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
- if (time_before(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
}
raise_softirq(TIMER_SOFTIRQ);
--
2.26.2
Now that we track the next expiry unconditionally when a timer is added,
we can reuse that information on a tick firing after exiting nohz.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 30f491ec875a..4612d26fd09a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1697,13 +1697,11 @@ static int collect_expired_timers(struct timer_base *base,
* the next expiring timer.
*/
if ((long)(now - base->clk) > 2) {
- unsigned long next = __next_timer_interrupt(base);
-
/*
* If the next timer is ahead of time forward to current
* jiffies, otherwise forward to the next expiry time:
*/
- if (time_after(next, now)) {
+ if (time_after(base->next_expiry, now)) {
/*
* The call site will increment base->clk and then
* terminate the expiry loop immediately.
@@ -1711,7 +1709,7 @@ static int collect_expired_timers(struct timer_base *base,
base->clk = now;
return 0;
}
- base->clk = next;
+ base->clk = base->next_expiry;
}
return __collect_expired_timers(base, heads);
}
--
2.26.2
If a level has a timer that expires before we reach the next level,
there is no need to iterate further.
The next level is reached when the 3 lower bits of the current level are
cleared. If the next event happens before/during that, the next levels
won't give us an earlier expiration.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index b4838d63a016..4aa74aafdd33 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1515,6 +1515,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+ unsigned long lvl_clk = clk & LVL_CLK_MASK;
if (pos >= 0) {
unsigned long tmp = clk + (unsigned long) pos;
@@ -1522,6 +1523,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
tmp <<= LVL_SHIFT(lvl);
if (time_before(tmp, next))
next = tmp;
+
+ /*
+ * If the next expiration happens before we reach
+ * the next level, no need to check further.
+ */
+ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
+ break;
}
/*
* Clock for the next level. If the current level clock lower
@@ -1559,7 +1567,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
* So the simple check whether the lower bits of the current
* level are 0 or not is sufficient for all cases.
*/
- adj = clk & LVL_CLK_MASK ? 1 : 0;
+ adj = lvl_clk ? 1 : 0;
clk >>= LVL_CLK_SHIFT;
clk += adj;
}
--
2.26.2
LVL_START() makes the first index of a level start at what would be the
value with all bits set in the previous level.
For example, level 1 starts at 63 instead of 64.
To cope with that, calc_index() always adds one offset for the level
granularity to the expiry passed in parameter.
Yet there is no apparent reason for such fixups so simplify the whole
thing.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4c977df3610b..b4838d63a016 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -158,7 +158,7 @@ EXPORT_SYMBOL(jiffies_64);
* The time start value for each level to select the bucket at enqueue
* time.
*/
-#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
+#define LVL_START(n) (LVL_SIZE << (((n) - 1) * LVL_CLK_SHIFT))
/* Size of each clock level */
#define LVL_BITS 6
@@ -489,7 +489,7 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
*/
static inline unsigned calc_index(unsigned expires, unsigned lvl)
{
- expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+ expires >>= LVL_SHIFT(lvl);
return LVL_OFFS(lvl) + (expires & LVL_MASK);
}
--
2.26.2
When a timer is enqueued with a negative delta (ie: expiry is below
base->clk), it gets added to the wheel as expiring now (base->clk).
Yet the value that gets stored in base->next_expiry, while calling
trigger_dyntick_cpu(), is the initial timer->expires value. The
resulting state becomes:
base->next_expiry < base->clk
On the next timer enqueue, forward_timer_base() may accidentally
rewind base->clk. As a possible outcome, timers may expire way too
early, the worst case being that the highest wheel levels get spuriously
processed again.
To prevent from that, make sure that base->next_expiry doesn't get below
base->clk.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Anna-Maria Gleixner <[email protected]>
Cc: Juri Lelli <[email protected]>
---
kernel/time/timer.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 398e6eadb861..9a838d38dbe6 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -584,7 +584,15 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
* Set the next expiry time and kick the CPU so it can reevaluate the
* wheel:
*/
- base->next_expiry = timer->expires;
+ if (time_before(timer->expires, base->clk)) {
+ /*
+ * Prevent from forward_timer_base() moving the base->clk
+ * backward
+ */
+ base->next_expiry = base->clk;
+ } else {
+ base->next_expiry = timer->expires;
+ }
wake_up_nohz_cpu(base->cpu);
}
@@ -896,10 +904,13 @@ static inline void forward_timer_base(struct timer_base *base)
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jnow))
+ if (time_after(base->next_expiry, jnow)) {
base->clk = jnow;
- else
+ } else {
+ if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
+ return;
base->clk = base->next_expiry;
+ }
#endif
}
--
2.26.2
Hi,
Thanks for the set, first of all. Reviewing and testing it.
On 01/07/20 03:10, Frederic Weisbecker wrote:
> When a timer is enqueued with a negative delta (ie: expiry is below
> base->clk), it gets added to the wheel as expiring now (base->clk).
>
> Yet the value that gets stored in base->next_expiry, while calling
> trigger_dyntick_cpu(), is the initial timer->expires value. The
> resulting state becomes:
>
> base->next_expiry < base->clk
>
> On the next timer enqueue, forward_timer_base() may accidentally
> rewind base->clk. As a possible outcome, timers may expire way too
> early, the worst case being that the highest wheel levels get spuriously
> processed again.
>
> To prevent from that, make sure that base->next_expiry doesn't get below
> base->clk.
>
> Signed-off-by: Frederic Weisbecker <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Anna-Maria Gleixner <[email protected]>
> Cc: Juri Lelli <[email protected]>
> ---
> kernel/time/timer.c | 17 ++++++++++++++---
> 1 file changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/time/timer.c b/kernel/time/timer.c
> index 398e6eadb861..9a838d38dbe6 100644
> --- a/kernel/time/timer.c
> +++ b/kernel/time/timer.c
> @@ -584,7 +584,15 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
> * Set the next expiry time and kick the CPU so it can reevaluate the
> * wheel:
> */
> - base->next_expiry = timer->expires;
> + if (time_before(timer->expires, base->clk)) {
> + /*
> + * Prevent from forward_timer_base() moving the base->clk
> + * backward
> + */
> + base->next_expiry = base->clk;
> + } else {
> + base->next_expiry = timer->expires;
> + }
> wake_up_nohz_cpu(base->cpu);
> }
>
> @@ -896,10 +904,13 @@ static inline void forward_timer_base(struct timer_base *base)
> * If the next expiry value is > jiffies, then we fast forward to
> * jiffies otherwise we forward to the next expiry value.
> */
> - if (time_after(base->next_expiry, jnow))
> + if (time_after(base->next_expiry, jnow)) {
> base->clk = jnow;
> - else
> + } else {
> + if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
> + return;
I actually ported it to latest RT tree (v5.6.17-rt10) w/o conflicts,
but hit this one above:
...
000: Kernel command line: BOOT_IMAGE=(hd0,msdos1)/vmlinuz-5.6.17-rt10 root=/dev/mapper/rhel_rt--qe--04-root ro crashkernel=auto resume=/dev/mapper/rhel_rt--qe--04-swap rd.lvm.lv=rhel_rt-qe-04/root rd.lvm.lv=rhel_rt-qe-04/swap console=ttyS0,115200
000: mem auto-init: stack:off, heap alloc:off, heap free:off
000: Memory: 2102240K/134089036K available (12292K kernel code, 2449K rwdata, 4332K rodata, 2248K init, 15392K bss, 2278548K reserved, 0K cma-reserved)
000: random: get_random_u64 called from cache_random_seq_create+0x7c/0x140 with crng_init=0
000: SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=24, Nodes=2
000: Kernel/User page tables isolation: enabled
000: ftrace: allocating 37680 entries in 148 pages
000: ftrace: allocated 148 pages with 3 groups
000: rcu: Preemptible hierarchical RCU implementation.
000: rcu: RCU restricting CPUs from NR_CPUS=8192 to nr_cpu_ids=24.
000: rcu: RCU priority boosting: priority 1 delay 500 ms.
000: rcu: RCU_SOFTIRQ processing moved to rcuc kthreads.
000: No expedited grace period (rcu_normal_after_boot).
000: Tasks RCU enabled.
000: rcu: RCU calculated value of scheduler-enlistment delay is 100 jiffies.
000: rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=24
000: NR_IRQS: 524544, nr_irqs: 1432, preallocated irqs: 16
000: random: crng done (trusting CPU's manufacturer)
000: Console: colour VGA+ 80x25
000: printk: console [ttyS0] enabled
000: ACPI: Core revision 20200110
000: clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
000: APIC: Switch to symmetric I/O mode setup
000: DMAR: Host address width 46
000: DMAR: DRHD base: 0x000000fbffc000 flags: 0x0
000: DMAR: dmar0: reg_base_addr fbffc000 ver 1:0 cap d2078c106f0466 ecap f020df
000: DMAR: DRHD base: 0x000000c7ffc000 flags: 0x1
000: DMAR: dmar1: reg_base_addr c7ffc000 ver 1:0 cap d2078c106f0466 ecap f020df
000: DMAR: RMRR base: 0x00000079190000 end: 0x00000079192fff
000: DMAR: RMRR base: 0x000000791f4000 end: 0x000000791f7fff
000: DMAR: RMRR base: 0x000000791de000 end: 0x000000791f3fff
000: DMAR: RMRR base: 0x000000791cb000 end: 0x000000791dbfff
000: DMAR: RMRR base: 0x000000791dc000 end: 0x000000791ddfff
000: DMAR: RMRR base: 0x0000005a661000 end: 0x0000005a6a0fff
000: DMAR-IR: IOAPIC id 10 under DRHD base 0xfbffc000 IOMMU 0
000: DMAR-IR: IOAPIC id 8 under DRHD base 0xc7ffc000 IOMMU 1
000: DMAR-IR: IOAPIC id 9 under DRHD base 0xc7ffc000 IOMMU 1
000: DMAR-IR: HPET id 0 under DRHD base 0xc7ffc000
000: DMAR-IR: Queued invalidation will be enabled to support x2apic and Intr-remapping.
000: DMAR-IR: Enabled IRQ remapping in x2apic mode
000: x2apic enabled
000: Switched APIC routing to cluster x2apic.
000: ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
000: clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x228d095820e, max_idle_ns: 440795295198 ns
000: ------------[ cut here ]------------
000: WARNING: CPU: 0 PID: 0 at kernel/time/timer.c:897 add_timer_on+0x129/0x140
000: Modules linked in:
000: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.6.17-rt10 #1
000: Hardware name: HP ProLiant DL380 Gen9/ProLiant DL380 Gen9, BIOS P89 10/25/2017
000: RIP: 0010:add_timer_on+0x129/0x140
000: Code: 24 48 89 ef e8 e8 ca 7d 00 48 8b 44 24 08 65 48 33 04 25 28 00 00 00 75 1f 48 83 c4 10 5b 5d 41 5c c3 48 89 45 48 eb ca 0f 0b <0f> 0b eb c4 e8 53 94 e9 ff e9 7b ff ff ff e8 64 c4 f6 ff 0f 1f 40
000: RSP: 0000:ffffffffb2c03e80 EFLAGS: 00010083
000: RAX: 00000000fffb6c25 RBX: ffffffffb3886540 RCX: 0000000000000000
000: RDX: 00000000fffb6c20 RSI: ffffffffb2c03e80 RDI: 0000000000000001
000: RBP: ffff92687fa19300 R08: 0000000000000000 R09: 0000000000000001
000: R10: ffffffffb2c5b4e0 R11: 3235393730343420 R12: 0000000000000000
000: R13: ffffffffb2cd42c0 R14: 0000000000000000 R15: ffff92687fa27f40
000: FS: 0000000000000000(0000) GS:ffff92687fa00000(0000) knlGS:0000000000000000
000: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
000: CR2: ffff92610ac01000 CR3: 00000008c920e001 CR4: 00000000000606b0
000: Call Trace:
000: clocksource_select_watchdog+0x144/0x1a0
000: __clocksource_register_scale+0x88/0xf0
000: tsc_init+0x1a1/0x268
000: start_kernel+0x4ae/0x56e
000: secondary_startup_64+0xb6/0xc0
000: ---[ end trace 0000000000000001 ]---
Guess you might be faster to understand what I'm missing. :-)
Best,
Juri
On Wed, Jul 01, 2020 at 06:35:04PM +0200, Juri Lelli wrote:
> Guess you might be faster to understand what I'm missing. :-)
So, did you port only this patch or the whole set in order to
trigger this?
If it was the whole set, can you try this patch alone?
Thanks!
Frederic Weisbecker <[email protected]> writes:
> When a timer is enqueued with a negative delta (ie: expiry is below
> base->clk), it gets added to the wheel as expiring now (base->clk).
>
> Yet the value that gets stored in base->next_expiry, while calling
> trigger_dyntick_cpu(), is the initial timer->expires value. The
> resulting state becomes:
>
> base->next_expiry < base->clk
>
> On the next timer enqueue, forward_timer_base() may accidentally
> rewind base->clk. As a possible outcome, timers may expire way too
> early, the worst case being that the highest wheel levels get spuriously
> processed again.
Bah. Nice catch.
> To prevent from that, make sure that base->next_expiry doesn't get below
> base->clk.
That wants a Fixes: tag and a CC stable.
Thanks,
tglx
On 02/07/20 01:20, Frederic Weisbecker wrote:
> On Wed, Jul 01, 2020 at 06:35:04PM +0200, Juri Lelli wrote:
> > Guess you might be faster to understand what I'm missing. :-)
>
> So, did you port only this patch or the whole set in order to
> trigger this?
>
> If it was the whole set, can you try this patch alone?
Whole set. And I can't reproduce if I try with this patch alone.
Frederic Weisbecker <[email protected]> writes:
> LVL_START() makes the first index of a level to start with what would be
> the value of all bits set of the previous level.
>
> For example level 1 starts at 63 instead of 64.
>
> To cope with that, calc_index() always adds one offset for the level
> granularity to the expiry passed in parameter.
>
> Yet there is no apparent reason for such fixups so simplify the whole
> thing.
You sure?
> @@ -158,7 +158,7 @@ EXPORT_SYMBOL(jiffies_64);
> * The time start value for each level to select the bucket at enqueue
> * time.
> */
> -#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
> +#define LVL_START(n) (LVL_SIZE << (((n) - 1) * LVL_CLK_SHIFT))
> /* Size of each clock level */
> #define LVL_BITS 6
> @@ -489,7 +489,7 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
> */
> static inline unsigned calc_index(unsigned expires, unsigned lvl)
> {
> - expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
> + expires >>= LVL_SHIFT(lvl);
> return LVL_OFFS(lvl) + (expires & LVL_MASK);
> }
So with that you move the expiry of each timer one jiffie ahead vs. the
original code, which violates the guarantee that a timer sleeps at least
for one jiffie for real and not measured in jiffies.
base->clk = 1
jiffies = 0
local_irq_disable()
-> timer interrupt is raised in HW
timer->expires = 1
add_timer(timer)
---> index == 1
local_irq_enable()
timer interrupt
jiffies++;
softirq()
expire(timer);
So the off by one has a reason.
Thanks,
tglx
On Thu, Jul 02, 2020 at 01:59:17PM +0200, Thomas Gleixner wrote:
> Frederic Weisbecker <[email protected]> writes:
> > LVL_START() makes the first index of a level to start with what would be
> > the value of all bits set of the previous level.
> >
> > For example level 1 starts at 63 instead of 64.
> >
> > To cope with that, calc_index() always adds one offset for the level
> > granularity to the expiry passed in parameter.
> >
> > Yet there is no apparent reason for such fixups so simplify the whole
> > thing.
>
> You sure?
No :o)
>
> > @@ -158,7 +158,7 @@ EXPORT_SYMBOL(jiffies_64);
> > * The time start value for each level to select the bucket at enqueue
> > * time.
> > */
> > -#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
> > +#define LVL_START(n) (LVL_SIZE << (((n) - 1) * LVL_CLK_SHIFT))
>
> > /* Size of each clock level */
> > #define LVL_BITS 6
> > @@ -489,7 +489,7 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
> > */
> > static inline unsigned calc_index(unsigned expires, unsigned lvl)
> > {
> > - expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
> > + expires >>= LVL_SHIFT(lvl);
> > return LVL_OFFS(lvl) + (expires & LVL_MASK);
> > }
>
> So with that you move the expiry of each timer one jiffie ahead vs. the
> original code, which violates the guarantee that a timer sleeps at least
> for one jiffie for real and not measured in jiffies.
>
> base->clk = 1
> jiffies = 0
> local_irq_disable()
> -> timer interrupt is raised in HW
> timer->expires = 1
> add_timer(timer)
> ---> index == 1
> local_irq_enable()
> timer interrupt
> jiffies++;
> softirq()
> expire(timer);
>
> So the off by one has a reason.
Fair enough. I didn't know about the one jiffy sleep guarantee.
I'll convert this patch to a comment explaining that off by one,
using your above example.
Thanks!
Frederic Weisbecker <[email protected]> writes:
> There is no apparent reason for not forwarding base->clk when it's 2
> jiffies late, except perhaps for past optimizations. But since forwarding
> has to be done at some point now anyway, this doesn't stand anymore.
>
> Signed-off-by: Frederic Weisbecker <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Anna-Maria Gleixner <[email protected]>
> Cc: Juri Lelli <[email protected]>
> ---
> kernel/time/timer.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/time/timer.c b/kernel/time/timer.c
> index 439fee098e76..25a55c043297 100644
> --- a/kernel/time/timer.c
> +++ b/kernel/time/timer.c
> @@ -883,7 +883,7 @@ static inline void forward_timer_base(struct timer_base *base)
> * Also while executing timers, base->clk is 1 offset ahead
> * of jiffies to avoid endless requeuing to current jffies.
> */
> - if ((long)(jnow - base->clk) < 2)
> + if ((long)(jnow - base->clk) < 1)
> return;
The apparent reason is in the comment right above the condition ...
Thanks,
tglx
On Thu, Jul 02, 2020 at 03:21:35PM +0200, Thomas Gleixner wrote:
> Frederic Weisbecker <[email protected]> writes:
> > There is no apparent reason for not forwarding base->clk when it's 2
> > jiffies late, except perhaps for past optimizations. But since forwarding
> > has to be done at some point now anyway, this doesn't stand anymore.
> >
> > Signed-off-by: Frederic Weisbecker <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Anna-Maria Gleixner <[email protected]>
> > Cc: Juri Lelli <[email protected]>
> > ---
> > kernel/time/timer.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/kernel/time/timer.c b/kernel/time/timer.c
> > index 439fee098e76..25a55c043297 100644
> > --- a/kernel/time/timer.c
> > +++ b/kernel/time/timer.c
> > @@ -883,7 +883,7 @@ static inline void forward_timer_base(struct timer_base *base)
> > * Also while executing timers, base->clk is 1 offset ahead
> > * of jiffies to avoid endless requeuing to current jffies.
> > */
> > - if ((long)(jnow - base->clk) < 2)
> > + if ((long)(jnow - base->clk) < 1)
> > return;
>
> The apparent reason is in the comment right above the condition ...
Hmm, that's a comment I added myself in the patch before.
The following part:
> > * Also while executing timers, base->clk is 1 offset ahead
> > * of jiffies to avoid endless requeuing to current jffies.
> > */
relates to the situation where (long)(jnow - base->clk) < 0
On Thu, Jul 02, 2020 at 11:59:59AM +0200, Juri Lelli wrote:
> On 02/07/20 01:20, Frederic Weisbecker wrote:
> > On Wed, Jul 01, 2020 at 06:35:04PM +0200, Juri Lelli wrote:
> > > Guess you might be faster to understand what I'm missing. :-)
> >
> > So, did you port only this patch or the whole set in order to
> > trigger this?
> >
> > If it was the whole set, can you try this patch alone?
>
> Whole set. And I can't reproduce if I try with this patch alone.
Ok I can reproduce with the whole series upstream with threadirqs.
Investigating...
On Thu, Jul 02, 2020 at 11:59:59AM +0200, Juri Lelli wrote:
> On 02/07/20 01:20, Frederic Weisbecker wrote:
> > On Wed, Jul 01, 2020 at 06:35:04PM +0200, Juri Lelli wrote:
> > > Guess you might be faster to understand what I'm missing. :-)
> >
> > So, did you port only this patch or the whole set in order to
> > trigger this?
> >
> > If it was the whole set, can you try this patch alone?
>
> Whole set. And I can't reproduce if I try with this patch alone.
>
Missing initialization. This should go better after this:
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 25a55c043297..f36b53219768 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1969,6 +1969,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
Frederic Weisbecker <[email protected]> writes:
> On Thu, Jul 02, 2020 at 03:21:35PM +0200, Thomas Gleixner wrote:
>> Frederic Weisbecker <[email protected]> writes:
>> > @@ -883,7 +883,7 @@ static inline void forward_timer_base(struct timer_base *base)
>> > * Also while executing timers, base->clk is 1 offset ahead
>> > * of jiffies to avoid endless requeuing to current jffies.
>> > */
>> > - if ((long)(jnow - base->clk) < 2)
>> > + if ((long)(jnow - base->clk) < 1)
>> > return;
>>
>> The apparent reason is in the comment right above the condition ...
>
> Hmm, that's a comment I added myself in the patch before.
:)
> The following part:
>
>> > * Also while executing timers, base->clk is 1 offset ahead
>> > * of jiffies to avoid endless requeuing to current jffies.
>> > */
>
> relates to the situation where (long)(jnow - base->clk) < 0
This still is inconsistent with your changelog:
> There is no apparent reason for not forwarding base->clk when it's 2
> jiffies late
Let's do the math:
jiffies = 4
base->clk = 2
4 - 2 = 2
which means it is forwarded when it's 2 jiffies late with the original
code, because the check "2 < 2" is false and the early return is not taken.
The reason for this < 2 is historical and goes back to the oddities of
the original timer wheel before the big rewrite.
Thanks,
tglx
On 02/07/20 16:32, Frederic Weisbecker wrote:
> On Thu, Jul 02, 2020 at 11:59:59AM +0200, Juri Lelli wrote:
> > On 02/07/20 01:20, Frederic Weisbecker wrote:
> > > On Wed, Jul 01, 2020 at 06:35:04PM +0200, Juri Lelli wrote:
> > > > Guess you might be faster to understand what I'm missing. :-)
> > >
> > > So, did you port only this patch or the whole set in order to
> > > trigger this?
> > >
> > > If it was the whole set, can you try this patch alone?
> >
> > Whole set. And I can't reproduce if I try with this patch alone.
> >
>
> Missing initialization. This should go better after this:
>
> diff --git a/kernel/time/timer.c b/kernel/time/timer.c
> index 25a55c043297..f36b53219768 100644
> --- a/kernel/time/timer.c
> +++ b/kernel/time/timer.c
> @@ -1969,6 +1969,7 @@ static void __init init_timer_cpu(int cpu)
> base->cpu = cpu;
> raw_spin_lock_init(&base->lock);
> base->clk = jiffies;
> + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
> timer_base_init_expiry_lock(base);
> }
> }
>
Yep. Looks like the warning is gone.
On Thu, Jul 02, 2020 at 05:14:23PM +0200, Thomas Gleixner wrote:
> Frederic Weisbecker <[email protected]> writes:
> > On Thu, Jul 02, 2020 at 03:21:35PM +0200, Thomas Gleixner wrote:
> > The following part:
> >
> >> > * Also while executing timers, base->clk is 1 offset ahead
> >> > * of jiffies to avoid endless requeuing to current jffies.
> >> > */
> >
> > relates to situation when (long)(jnow - base->clk) < 0
>
> This still is inconsistent with your changelog:
Right.
>
> > There is no apparent reason for not forwarding base->clk when it's 2
> > jiffies late
>
> Let's do the math:
>
> jiffies = 4
> base->clk = 2
>
> 4 - 2 = 2
>
> which means it is forwarded when it's 2 jiffies late with the original
> code, because 2 < 2.
>
> The reason for this < 2 is historical and goes back to the oddities of
> the original timer wheel before the big rewrite.
Ok. And is it still needed today or can we now forward even with a 1 delta?
Frederic Weisbecker <[email protected]> writes:
> On Thu, Jul 02, 2020 at 05:14:23PM +0200, Thomas Gleixner wrote:
>>
>> The reason for this < 2 is historical and goes back to the oddities of
>> the original timer wheel before the big rewrite.
>
> Ok. And is it still needed today or can we now forward even with a 1 delta?
I haven't found anything which requires it. But be aware of dragons ....