2024-01-29 23:58:04

by Boqun Feng

Subject: [PATCH 0/8] Misc RCU update for v6.9

Hi,

This series contains miscellaneous RCU updates and fixes for v6.9.
You can also find the series at:

https://github.com/fbq/linux.git rcu-misc.2024.01.29a

The detailed list of the changes:

Frederic Weisbecker (2):
rcu: Rename jiffies_till_flush to jiffies_lazy_flush
hrtimer: Report offline hrtimer enqueue

Jiri Wiesner (1):
clocksource: Skip watchdog check for large watchdog intervals

Joel Fernandes (Google) (1):
srcu: Improve comments about acceleration leak

Onkarnath (1):
rcu/sync: remove unused rcu_sync_enter_start function

Paul E. McKenney (2):
tsc: Check for sockets instead of CPUs to make code match comment
rcutorture: Suppress rtort_pipe_count warnings until after stalls

Qais Yousef (1):
rcu: Provide a boot time parameter to control lazy RCU

.../admin-guide/kernel-parameters.txt | 5 ++++
arch/x86/kernel/tsc.c | 2 +-
include/linux/hrtimer.h | 3 ++-
include/linux/rcu_sync.h | 1 -
kernel/rcu/Kconfig | 13 ++++++++++
kernel/rcu/rcu.h | 8 +++---
kernel/rcu/rcuscale.c | 6 ++---
kernel/rcu/rcutorture.c | 13 +++++++---
kernel/rcu/srcutree.c | 24 +++++++++++++++---
kernel/rcu/sync.c | 16 ------------
kernel/rcu/tree.c | 7 +++++-
kernel/rcu/tree_nocb.h | 22 ++++++++--------
kernel/time/clocksource.c | 25 ++++++++++++++++++-
kernel/time/hrtimer.c | 3 +++
14 files changed, 101 insertions(+), 47 deletions(-)

--
2.43.0



2024-01-29 23:58:18

by Boqun Feng

Subject: [PATCH 1/8] rcu: Rename jiffies_till_flush to jiffies_lazy_flush

From: Frederic Weisbecker <[email protected]>

The variable name jiffies_till_flush is too generic and therefore:

* It may shadow a global variable
* It doesn't say what it operates on

Make the name more precise, along with the related APIs.
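
In short, the renames (taken from the diff below) are:

    rcu_lazy_get_jiffies_till_flush() -> rcu_get_jiffies_lazy_flush()
    rcu_lazy_set_jiffies_till_flush() -> rcu_set_jiffies_lazy_flush()
    jiffies_till_flush                -> jiffies_lazy_flush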

Reviewed-by: Joel Fernandes (Google) <[email protected]>
Signed-off-by: Frederic Weisbecker <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/rcu.h | 8 ++++----
kernel/rcu/rcuscale.c | 6 +++---
kernel/rcu/tree_nocb.h | 22 +++++++++++-----------
3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index f94f65877f2b..dcfb666f2499 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -543,11 +543,11 @@ enum rcutorture_type {
};

#if defined(CONFIG_RCU_LAZY)
-unsigned long rcu_lazy_get_jiffies_till_flush(void);
-void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+unsigned long rcu_get_jiffies_lazy_flush(void);
+void rcu_set_jiffies_lazy_flush(unsigned long j);
#else
-static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
-static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; }
+static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { }
#endif

#if defined(CONFIG_TREE_RCU)
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index ffdb30495e3c..8db4fedaaa1e 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -764,9 +764,9 @@ kfree_scale_init(void)

if (kfree_by_call_rcu) {
/* do a test to check the timeout. */
- orig_jif = rcu_lazy_get_jiffies_till_flush();
+ orig_jif = rcu_get_jiffies_lazy_flush();

- rcu_lazy_set_jiffies_till_flush(2 * HZ);
+ rcu_set_jiffies_lazy_flush(2 * HZ);
rcu_barrier();

jif_start = jiffies;
@@ -775,7 +775,7 @@ kfree_scale_init(void)

smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);

- rcu_lazy_set_jiffies_till_flush(orig_jif);
+ rcu_set_jiffies_lazy_flush(orig_jif);

if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 4efbf7333d4e..aecef51166c7 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -256,6 +256,7 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
return __wake_nocb_gp(rdp_gp, rdp, force, flags);
}

+#ifdef CONFIG_RCU_LAZY
/*
* LAZY_FLUSH_JIFFIES decides the maximum amount of time that
* can elapse before lazy callbacks are flushed. Lazy callbacks
@@ -264,21 +265,20 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
* left unsubmitted to RCU after those many jiffies.
*/
#define LAZY_FLUSH_JIFFIES (10 * HZ)
-static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES;

-#ifdef CONFIG_RCU_LAZY
// To be called only from test code.
-void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+void rcu_set_jiffies_lazy_flush(unsigned long jif)
{
- jiffies_till_flush = jif;
+ jiffies_lazy_flush = jif;
}
-EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush);

-unsigned long rcu_lazy_get_jiffies_till_flush(void)
+unsigned long rcu_get_jiffies_lazy_flush(void)
{
- return jiffies_till_flush;
+ return jiffies_lazy_flush;
}
-EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush);
#endif

/*
@@ -299,7 +299,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
*/
if (waketype == RCU_NOCB_WAKE_LAZY &&
rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
- mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+ mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
@@ -482,7 +482,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// flush ->nocb_bypass to ->cblist.
if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
(ncbs && bypass_is_lazy &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) ||
ncbs >= qhimark) {
rcu_nocb_lock(rdp);
*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
@@ -723,7 +723,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
lazy_ncbs = READ_ONCE(rdp->lazy_len);

if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) ||
bypass_ncbs > 2 * qhimark)) {
flush_bypass = true;
} else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
--
2.43.0


2024-01-29 23:58:36

by Boqun Feng

Subject: [PATCH 2/8] rcu: Provide a boot time parameter to control lazy RCU

From: Qais Yousef <[email protected]>

To allow more flexible arrangements while still providing a single
kernel for distros, add a boot time parameter to enable/disable lazy
RCU.

Specify:

rcutree.enable_rcu_lazy=[y|1|n|0]

This also requires

rcu_nocbs=all

at boot time for lazy RCU to take effect.

To disable it by default at build time when CONFIG_RCU_LAZY=y, the new
CONFIG_RCU_LAZY_DEFAULT_OFF can be used.
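
For example, a kernel command line enabling lazy RCU on a kernel built
with CONFIG_RCU_LAZY_DEFAULT_OFF=y would contain (a sketch, with all
other parameters omitted):

    rcu_nocbs=all rcutree.enable_rcu_lazy=1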

Signed-off-by: Qais Yousef (Google) <[email protected]>
Tested-by: Andrea Righi <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
Documentation/admin-guide/kernel-parameters.txt | 5 +++++
kernel/rcu/Kconfig | 13 +++++++++++++
kernel/rcu/tree.c | 7 ++++++-
3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 31b3a25680d0..b6c848c29a53 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5034,6 +5034,11 @@
this kernel boot parameter, forcibly setting it
to zero.

+ rcutree.enable_rcu_lazy= [KNL]
+ To save power, batch RCU callbacks and flush after
+ delay, memory pressure or callback list growing too
+ big.
+
rcuscale.gp_async= [KNL]
Measure performance of asynchronous
grace-period primitives such as call_rcu().
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index bdd7eadb33d8..e7d2dd267593 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -314,6 +314,19 @@ config RCU_LAZY
To save power, batch RCU callbacks and flush after delay, memory
pressure, or callback list growing too big.

+ Requires rcu_nocbs=all to be set.
+
+ Use rcutree.enable_rcu_lazy=0 to turn it off at boot time.
+
+config RCU_LAZY_DEFAULT_OFF
+ bool "Turn RCU lazy invocation off by default"
+ depends on RCU_LAZY
+ default n
+ help
+ Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default
+ off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch
+ it back on.
+
config RCU_DOUBLE_CHECK_CB_TIME
bool "RCU callback-batch backup time check"
depends on RCU_EXPERT
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b2bccfd37c38..41c50a6c607e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2753,6 +2753,9 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
}

#ifdef CONFIG_RCU_LAZY
+static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF);
+module_param(enable_rcu_lazy, bool, 0444);
+
/**
* call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
* flush all lazy callbacks (including the new one) to the main ->cblist while
@@ -2778,6 +2781,8 @@ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
__call_rcu_common(head, func, false);
}
EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#else
+#define enable_rcu_lazy false
#endif

/**
@@ -2826,7 +2831,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+ __call_rcu_common(head, func, enable_rcu_lazy);
}
EXPORT_SYMBOL_GPL(call_rcu);

--
2.43.0


2024-01-29 23:58:49

by Boqun Feng

Subject: [PATCH 3/8] hrtimer: Report offline hrtimer enqueue

From: Frederic Weisbecker <[email protected]>

The migration of hrtimers during the CPU-down hotplug process has been
moved earlier, before the CPU actually dies. This leaves a small window
of opportunity to queue an hrtimer in a blind spot, leaving it ignored.

For example, a practical case has been reported with RCU waking up a
SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, thereby
queuing a sched/rt timer on the local offline CPU.

Make sure such situations never go unnoticed and warn when that happens.
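
The lifecycle of the new flag, summarizing the diff below:

    hrtimers_prepare_cpu():  cpu_base->online = 1;
    enqueue_hrtimer():       WARN_ON_ONCE(!base->cpu_base->online);
    hrtimers_cpu_dying():    old_base->online = 0;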

Reported-by: Paul E. McKenney <[email protected]>
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Signed-off-by: Frederic Weisbecker <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
include/linux/hrtimer.h | 3 ++-
kernel/time/hrtimer.c | 3 +++
2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 87e3bedf8eb0..4f2cf7309486 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -179,7 +179,8 @@ struct hrtimer_cpu_base {
unsigned int hres_active : 1,
in_hrtirq : 1,
hang_detected : 1,
- softirq_activated : 1;
+ softirq_activated : 1,
+ online : 1;
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int nr_events;
unsigned short nr_retries;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 760793998cdd..edb0f821dcea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
enum hrtimer_mode mode)
{
debug_activate(timer, mode);
+ WARN_ON_ONCE(!base->cpu_base->online);

base->cpu_base->active_bases |= 1 << base->index;

@@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->online = 1;
hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);

raw_spin_unlock(&new_base->lock);
+ old_base->online = 0;
raw_spin_unlock(&old_base->lock);

return 0;
--
2.43.0


2024-01-29 23:59:03

by Boqun Feng

Subject: [PATCH 4/8] srcu: Improve comments about acceleration leak

From: "Joel Fernandes (Google)" <[email protected]>

The comments added in commit 1ef990c4b36b ("srcu: No need to
advance/accelerate if no callback enqueued") are a bit confusing.
They describe a scenario for code that has since been moved, so the
described ordering (snapshot after advancing) no longer matches the
code. Improve the code comments to reflect the current ordering and
also document why acceleration can never fail.
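
The required ordering, as a simplified sketch of
srcu_gp_start_if_needed() (not the literal code; see the diff below):

    s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);  /* snapshot first */
    ...
    rcu_segcblist_advance(&sdp->srcu_cblist,
                          rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
    WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));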

Cc: Frederic Weisbecker <[email protected]>
Cc: Neeraj Upadhyay <[email protected]>
Reviewed-by: Frederic Weisbecker <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/srcutree.c | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0351a4e83529..e4d673fc30f4 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1234,11 +1234,20 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
/*
- * The snapshot for acceleration must be taken _before_ the read of the
- * current gp sequence used for advancing, otherwise advancing may fail
- * and acceleration may then fail too.
+ * It's crucial to capture the snapshot 's' for acceleration before
+ * reading the current gp_seq that is used for advancing. This is
+ * essential because if the acceleration snapshot is taken after a
+ * failed advancement attempt, there's a risk that a grace period may
+ * conclude and a new one may start in the interim. If the snapshot is
+ * captured after this sequence of events, the acceleration snapshot 's'
+ * could be excessively advanced, leading to acceleration failure.
+ * In such a scenario, an 'acceleration leak' can occur, where new
+ * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment.
+ * Also note that encountering advancing failures is a normal
+ * occurrence when the grace period for RCU_WAIT_TAIL is in progress.
*
- * This could happen if:
+ * To see this, consider the following events which occur if
+ * rcu_seq_snap() were to be called after advance:
*
* 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
* RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
@@ -1264,6 +1273,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp) {
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * Acceleration can never fail because the base current gp_seq
+ * used for acceleration is <= the value of gp_seq used for
+ * advancing. This means that RCU_NEXT_TAIL segment will
+ * always be able to be emptied by the acceleration into the
+ * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
+ */
WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
}
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
--
2.43.0


2024-01-29 23:59:38

by Boqun Feng

Subject: [PATCH 6/8] rcutorture: Suppress rtort_pipe_count warnings until after stalls

From: "Paul E. McKenney" <[email protected]>

Currently, if rcu_torture_writer() sees fewer than ten grace periods
having elapsed during a call to stutter_wait() that actually waited,
the rtort_pipe_count warning is emitted. This has worked well for
a long time, except that the rcutorture TREE07 scenario now does a
short-term 14-second RCU CPU stall, which can most definitely cause
false-positive rtort_pipe_count warnings.

This commit therefore changes rcu_torture_writer() to compute the
full expected holdoff and stall duration, and to refuse to report any
rtort_pipe_count warnings until after all stalls have completed.
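
For example, with hypothetical module parameters stall_cpu_holdoff=10,
stall_gp_kthread=0 and stall_cpu=14, the writer suppresses the warnings
until (10 + 0 + 14 + 60) * HZ jiffies after it starts, that is, for the
first 84 seconds of the test.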

Signed-off-by: Paul E. McKenney <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
---
kernel/rcu/rcutorture.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7567ca8e743c..45d6b4c3d199 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1368,9 +1368,13 @@ rcu_torture_writer(void *arg)
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
+ unsigned long stallsdone = jiffies;
bool stutter_waited;
unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];

+ // If a new stall test is added, this must be adjusted.
+ if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
+ stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite)
pr_alert("%s" TORTURE_FLAG
@@ -1576,11 +1580,11 @@ rcu_torture_writer(void *arg)
!atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
- boot_ended)
+ boot_ended &&
+ time_after(jiffies, stallsdone))
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
- rcu_access_pointer(rcu_torture_current) !=
- &rcu_tortures[i]) {
+ rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) {
tracing_off();
show_rcu_gp_kthreads();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
@@ -2441,7 +2445,8 @@ static struct notifier_block rcu_torture_stall_block = {

/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
- * induces a CPU stall for the time specified by stall_cpu.
+ * induces a CPU stall for the time specified by stall_cpu. If a new
+ * stall test is added, stallsdone in rcu_torture_writer() must be adjusted.
*/
static int rcu_torture_stall(void *args)
{
--
2.43.0


2024-01-29 23:59:56

by Boqun Feng

Subject: [PATCH 7/8] clocksource: Skip watchdog check for large watchdog intervals

From: Jiri Wiesner <[email protected]>

There have been reports of the watchdog marking clocksources unstable on
machines with 8 NUMA nodes:
> clocksource: timekeeping watchdog on CPU373: Marking clocksource 'tsc' as unstable because the skew is too large:
> clocksource: 'hpet' wd_nsec: 14523447520 wd_now: 5a749706 wd_last: 45adf1e0 mask: ffffffff
> clocksource: 'tsc' cs_nsec: 14524115132 cs_now: 515ce2c5a96caa cs_last: 515cd9a9d83918 mask: ffffffffffffffff
> clocksource: 'tsc' is current clocksource.
> tsc: Marking TSC unstable due to clocksource watchdog
> TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.
> sched_clock: Marking unstable (1950347883333462, 79649632569)<-(1950428279338308, -745776594)
> clocksource: Checking clocksource tsc synchronization from CPU 400 to CPUs 0,46,52,54,138,208,392,397.
> clocksource: Switched to clocksource hpet

The measured clocksource skew - the absolute difference between cs_nsec
and wd_nsec - was 668 microseconds:
> cs_nsec - wd_nsec = 14524115132 - 14523447520 = 667612

The kernel (based on 5.14.21) used 200 microseconds for the
uncertainty_margin of both the clocksource and watchdog, resulting in a
threshold of 400 microseconds (the md variable). Both the cs_nsec and the
wd_nsec values indicate that the readout interval was circa 14.5 seconds.
The observed behaviour is that watchdog checks failed for large readout
intervals on 8-NUMA-node machines. This indicates that the size of the
skew was directly proportional to the length of the readout interval on
those machines. The measured clocksource skew, 668 microseconds, was
evaluated against a threshold (the md variable) that is suited for
readout intervals of roughly WATCHDOG_INTERVAL, i.e. HZ >> 1, which is
0.5 second.

The intention of 2e27e793e280 ("clocksource: Reduce clocksource-skew
threshold") was to tighten the threshold for evaluating skew and set the
lower bound for the uncertainty_margin of clocksources to twice
WATCHDOG_MAX_SKEW. Later in c37e85c135ce ("clocksource: Loosen clocksource
watchdog constraints"), the WATCHDOG_MAX_SKEW constant was increased to
125 microseconds to fit the limit of NTP, which is able to use a
clocksource that suffers from up to 500 microseconds of skew per second.
Both the TSC and the HPET use the default uncertainty_margin. When the
readout interval gets stretched, the default uncertainty_margin is no
longer a suitable lower bound for evaluating skew - it imposes a limit
that is far stricter than the skew with which NTP can deal.

The root causes of the skew being directly proportional to the length of
the readout interval are
* the inaccuracy of the shift/mult pairs of clocksources and the watchdog
* the conversion to nanoseconds is imprecise for large readout intervals

Prevent this by skipping the current watchdog check if the readout
interval exceeds 2 * WATCHDOG_INTERVAL. Considering the maximum readout
interval of 2 * WATCHDOG_INTERVAL, the current default uncertainty margin
(of the TSC and HPET) corresponds to a limit on clocksource skew of 250
ppm (microseconds of skew per second). To keep the limit imposed by NTP
(500 microseconds of skew per second) for all possible readout intervals,
the margins would have to be scaled so that the threshold value is
proportional to the length of the actual readout interval.
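
Concretely, given WATCHDOG_INTERVAL of HZ >> 1, the limit added below
expands to (2 * (HZ >> 1)) * (NSEC_PER_SEC / HZ) = NSEC_PER_SEC, so any
readout interval longer than one second now causes the watchdog check
to be skipped.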

As for why the readout interval may get stretched: since the watchdog is
executed in softirq context, the expiration of the watchdog timer can get
severely delayed on account of a ksoftirqd thread not getting to run in a
timely manner. Surely, a system with such belated softirq execution is not
working well and the scheduling issue should be looked into, but the
clocksource watchdog should be able to deal with it accordingly.

Fixes: 2e27e793e280 ("clocksource: Reduce clocksource-skew threshold")
Suggested-by: Feng Tang <[email protected]>
Reviewed-by: Feng Tang <[email protected]>
Tested-by: Paul E. McKenney <[email protected]>
Signed-off-by: Jiri Wiesner <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/time/clocksource.c | 25 ++++++++++++++++++++++++-
1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c108ed8a9804..3052b1f1168e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -99,6 +99,7 @@ static u64 suspend_start;
* Interval: 0.5sec.
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))

/*
* Threshold: 0.0312s, when doubled: 0.0625s.
@@ -134,6 +135,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
+static int64_t watchdog_max_interval;

static inline void clocksource_watchdog_lock(unsigned long *flags)
{
@@ -399,8 +401,8 @@ static inline void clocksource_reset_watchdog(void)
static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
+ int64_t wd_nsec, cs_nsec, interval;
int next_cpu, reset_pending;
- int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
unsigned long extra_wait = 0;
@@ -470,6 +472,27 @@ static void clocksource_watchdog(struct timer_list *unused)
if (atomic_read(&watchdog_reset_pending))
continue;

+ /*
+ * The processing of timer softirqs can get delayed (usually
+ * on account of ksoftirqd not getting to run in a timely
+ * manner), which causes the watchdog interval to stretch.
+ * Skew detection may fail for longer watchdog intervals
+ * on account of fixed margins being used.
+ * Some clocksources, e.g. acpi_pm, cannot tolerate
+ * watchdog intervals longer than a few seconds.
+ */
+ interval = max(cs_nsec, wd_nsec);
+ if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
+ if (system_state > SYSTEM_SCHEDULING &&
+ interval > 2 * watchdog_max_interval) {
+ watchdog_max_interval = interval;
+ pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
+ cs_nsec, wd_nsec);
+ }
+ watchdog_timer.expires = jiffies;
+ continue;
+ }
+
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
--
2.43.0


2024-01-30 00:00:11

by Boqun Feng

Subject: [PATCH 8/8] rcu/sync: remove unused rcu_sync_enter_start function

From: Onkarnath <[email protected]>

Commit 6a010a49b63a ("cgroup: Make !percpu threadgroup_rwsem
operations optional") removed the last usage of rcu_sync_enter_start(),
so the function itself can also be removed.

In the words of Oleg Nesterov:

__rcu_sync_enter(wait => false) is a better alternative if
someone needs rcu_sync_enter_start() again.

Link: https://lore.kernel.org/all/[email protected]/
Signed-off-by: Onkarnath <[email protected]>
Signed-off-by: Maninder Singh <[email protected]>
Acked-by: Oleg Nesterov <[email protected]>
Acked-by: Tejun Heo <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
include/linux/rcu_sync.h | 1 -
kernel/rcu/sync.c | 16 ----------------
2 files changed, 17 deletions(-)

diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
index 0027d4c8087c..3860dbb9107a 100644
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -37,7 +37,6 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
}

extern void rcu_sync_init(struct rcu_sync *);
-extern void rcu_sync_enter_start(struct rcu_sync *);
extern void rcu_sync_enter(struct rcu_sync *);
extern void rcu_sync_exit(struct rcu_sync *);
extern void rcu_sync_dtor(struct rcu_sync *);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index e550f97779b8..86df878a2fee 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -24,22 +24,6 @@ void rcu_sync_init(struct rcu_sync *rsp)
init_waitqueue_head(&rsp->gp_wait);
}

-/**
- * rcu_sync_enter_start - Force readers onto slow path for multiple updates
- * @rsp: Pointer to rcu_sync structure to use for synchronization
- *
- * Must be called after rcu_sync_init() and before first use.
- *
- * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
- * pairs turn into NO-OPs.
- */
-void rcu_sync_enter_start(struct rcu_sync *rsp)
-{
- rsp->gp_count++;
- rsp->gp_state = GP_PASSED;
-}
-
-
static void rcu_sync_func(struct rcu_head *rhp);

static void rcu_sync_call(struct rcu_sync *rsp)
--
2.43.0


2024-01-30 00:00:23

by Boqun Feng

Subject: [PATCH 5/8] tsc: Check for sockets instead of CPUs to make code match comment

From: "Paul E. McKenney" <[email protected]>

The unsynchronized_tsc() eventually checks num_possible_cpus(), and
if the system is non-Intel and the number of possible CPUs is greater
than one, assumes that TSCs are unsynchronized. This despite the
comment saying "assume multi socket systems are not synchronized",
that is, socket rather than CPU. This behavior was preserved by
commit 8fbbc4b45ce3 ("x86: merge tsc_init and clocksource code") and
by the previous relevant commit 7e69f2b1ead2 ("clocksource: Remove the
update callback").

The clocksource drivers were added by commit 5d0cf410e94b ("Time: i386
Clocksource Drivers") back in 2006, and the comment still said "socket"
rather than "CPU".

Therefore, bravely (and perhaps foolishly) make the code match the
comment.

Note that it is possible to bypass both code and comment by booting
with tsc=reliable, but this also disables the clocksource watchdog,
which is undesirable when trust in the TSC is strictly limited.

Reported-by: Zhengxu Chen <[email protected]>
Reported-by: Danielle Costantino <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Feng Tang <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: John Stultz <[email protected]>
Cc: <[email protected]>
---
arch/x86/kernel/tsc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 15f97c0abc9d..d45084c6a15e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1287,7 +1287,7 @@ int unsynchronized_tsc(void)
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
/* assume multi socket systems are not synchronized: */
- if (num_possible_cpus() > 1)
+ if (nr_online_nodes > 1)
return 1;
}

--
2.43.0


2024-02-05 21:54:29

by Paul E. McKenney

Subject: Re: [PATCH 5/8] tsc: Check for sockets instead of CPUs to make code match comment

On Mon, Jan 29, 2024 at 03:56:38PM -0800, Boqun Feng wrote:
> From: "Paul E. McKenney" <[email protected]>
>
> The unsynchronized_tsc() eventually checks num_possible_cpus(), and
> if the system is non-Intel and the number of possible CPUs is greater
> than one, assumes that TSCs are unsynchronized. This despite the
> comment saying "assume multi socket systems are not synchronized",
> that is, socket rather than CPU. This behavior was preserved by
> commit 8fbbc4b45ce3 ("x86: merge tsc_init and clocksource code") and
> by the previous relevant commit 7e69f2b1ead2 ("clocksource: Remove the
> update callback").
>
> The clocksource drivers were added by commit 5d0cf410e94b ("Time: i386
> Clocksource Drivers") back in 2006, and the comment still said "socket"
> rather than "CPU".
>
> Therefore, bravely (and perhaps foolishly) make the code match the
> comment.
>
> Note that it is possible to bypass both code and comment by booting
> with tsc=reliable, but this also disables the clocksource watchdog,
> which is undesirable when trust in the TSC is strictly limited.
>
> Reported-by: Zhengxu Chen <[email protected]>
> Reported-by: Danielle Costantino <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: "H. Peter Anvin" <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Feng Tang <[email protected]>
> Cc: Waiman Long <[email protected]>
> Cc: John Stultz <[email protected]>
> Cc: <[email protected]>

Hello, Boqun,

Please drop this one, as I never got an ack from the maintainers, and
quite possibly for good reason. ;-)

Thanx, Paul

> ---
> arch/x86/kernel/tsc.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 15f97c0abc9d..d45084c6a15e 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -1287,7 +1287,7 @@ int unsynchronized_tsc(void)
> */
> if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
> /* assume multi socket systems are not synchronized: */
> - if (num_possible_cpus() > 1)
> + if (nr_online_nodes > 1)
> return 1;
> }
>
> --
> 2.43.0
>

2024-02-05 22:04:24

by Boqun Feng

Subject: Re: [PATCH 5/8] tsc: Check for sockets instead of CPUs to make code match comment

On Mon, Feb 05, 2024 at 12:03:33PM -0800, Paul E. McKenney wrote:
> On Mon, Jan 29, 2024 at 03:56:38PM -0800, Boqun Feng wrote:
> > From: "Paul E. McKenney" <[email protected]>
> >
> > The unsynchronized_tsc() eventually checks num_possible_cpus(), and
> > if the system is non-Intel and the number of possible CPUs is greater
> > than one, assumes that TSCs are unsynchronized. This despite the
> > comment saying "assume multi socket systems are not synchronized",
> > that is, socket rather than CPU. This behavior was preserved by
> > commit 8fbbc4b45ce3 ("x86: merge tsc_init and clocksource code") and
> > by the previous relevant commit 7e69f2b1ead2 ("clocksource: Remove the
> > update callback").
> >
> > The clocksource drivers were added by commit 5d0cf410e94b ("Time: i386
> > Clocksource Drivers") back in 2006, and the comment still said "socket"
> > rather than "CPU".
> >
> > Therefore, bravely (and perhaps foolishly) make the code match the
> > comment.
> >
> > Note that it is possible to bypass both code and comment by booting
> > with tsc=reliable, but this also disables the clocksource watchdog,
> > which is undesirable when trust in the TSC is strictly limited.
> >
> > Reported-by: Zhengxu Chen <[email protected]>
> > Reported-by: Danielle Costantino <[email protected]>
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > Cc: Thomas Gleixner <[email protected]>
> > Cc: Ingo Molnar <[email protected]>
> > Cc: Borislav Petkov <[email protected]>
> > Cc: Dave Hansen <[email protected]>
> > Cc: "H. Peter Anvin" <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Feng Tang <[email protected]>
> > Cc: Waiman Long <[email protected]>
> > Cc: John Stultz <[email protected]>
> > Cc: <[email protected]>
>
> Hello, Boqun,
>
> Please drop this one, as I never got an ack from the maintainers, and
> quite possibly for good reason. ;-)
>

Got it, I will drop it in the PR; the topic branch has been updated:

git://git.kernel.org/pub/scm/linux/kernel/git/boqun/linux.git rcu-misc.2024.02.05a

Regards,
Boqun

> Thanx, Paul
>
> > ---
> > arch/x86/kernel/tsc.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > index 15f97c0abc9d..d45084c6a15e 100644
> > --- a/arch/x86/kernel/tsc.c
> > +++ b/arch/x86/kernel/tsc.c
> > @@ -1287,7 +1287,7 @@ int unsynchronized_tsc(void)
> > */
> > if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
> > /* assume multi socket systems are not synchronized: */
> > - if (num_possible_cpus() > 1)
> > + if (nr_online_nodes > 1)
> > return 1;
> > }
> >
> > --
> > 2.43.0
> >

Subject: [tip: timers/urgent] hrtimer: Report offline hrtimer enqueue

The following commit has been merged into the timers/urgent branch of tip:

Commit-ID: dad6a09f3148257ac1773cd90934d721d68ab595
Gitweb: https://git.kernel.org/tip/dad6a09f3148257ac1773cd90934d721d68ab595
Author: Frederic Weisbecker <[email protected]>
AuthorDate: Mon, 29 Jan 2024 15:56:36 -08:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 06 Feb 2024 10:56:35 +01:00

hrtimer: Report offline hrtimer enqueue

The migration of hrtimers during the CPU-down hotplug process has been
moved earlier, before the CPU actually dies. This leaves a small window
of opportunity to queue an hrtimer in a blind spot, leaving it ignored.

For example, a practical case has been reported with RCU waking up a
SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, thereby
queuing a sched/rt timer on the local offline CPU.

Make sure such situations never go unnoticed and warn when that happens.

Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Paul E. McKenney <[email protected]>
Signed-off-by: Frederic Weisbecker <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/r/[email protected]
---
include/linux/hrtimer.h | 4 +++-
kernel/time/hrtimer.c | 3 +++
2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 87e3bed..641c456 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -157,6 +157,7 @@ enum hrtimer_base_type {
* @max_hang_time: Maximum time spent in hrtimer_interrupt
* @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are
* expired
+ * @online: CPU is online from an hrtimers point of view
* @timer_waiters: A hrtimer_cancel() invocation waits for the timer
* callback to finish.
* @expires_next: absolute time of the next event, is required for remote
@@ -179,7 +180,8 @@ struct hrtimer_cpu_base {
unsigned int hres_active : 1,
in_hrtirq : 1,
hang_detected : 1,
- softirq_activated : 1;
+ softirq_activated : 1,
+ online : 1;
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int nr_events;
unsigned short nr_retries;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 7607939..edb0f82 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
enum hrtimer_mode mode)
{
debug_activate(timer, mode);
+ WARN_ON_ONCE(!base->cpu_base->online);

base->cpu_base->active_bases |= 1 << base->index;

@@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->online = 1;
hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);

raw_spin_unlock(&new_base->lock);
+ old_base->online = 0;
raw_spin_unlock(&old_base->lock);

return 0;