2022-06-30 04:25:20

by Neeraj Upadhyay

[permalink] [raw]
Subject: [PATCH v2] srcu: Reduce blocking aggressiveness of expedited grace periods further

Commit 640a7d37c3f4 ("srcu: Block less aggressively for expedited
grace periods") highlights a problem where aggressively blocking
SRCU expedited grace periods, as was introduced in commit
282d8998e997 ("srcu: Prevent expedited GPs and blocking readers
from consuming CPU"), introduces a ~2 minute delay to the overall
~3.5 minute boot time when starting VMs with the "-bios QEMU_EFI.fd"
command-line option on qemu. This option results in a very high rate
of memslot add/remove operations, which causes more than ~6000
synchronize_srcu() calls for the kvm->srcu SRCU instance.

The table below captures the experiments done by Zhangfei Gao and Shameer
to measure the boot-time impact of various values of the non-sleeping
per-phase count, with HZ_250 and preemption enabled:

+──────────────────────────+────────────────+
| SRCU_MAX_NODELAY_PHASE | Boot time (s) |
+──────────────────────────+────────────────+
| 100 | 30.053 |
| 150 | 25.151 |
| 200 | 20.704 |
| 250 | 15.748 |
| 500 | 11.401 |
| 1000 | 11.443 |
| 10000 | 11.258 |
| 1000000 | 11.154 |
+──────────────────────────+────────────────+

Analysis of the experiment results showed improved boot time
when the non-blocking delays were close to one jiffy in duration.
This was also seen when the number of per-phase iterations was
scaled to one jiffy.

So, this change scales the per-grace-period-phase number of non-sleeping
polls such that non-sleeping polls are done for one jiffy. In addition,
the srcu_get_delay() call in srcu_gp_end(), which is used to calculate
the delay used for scheduling callbacks, is replaced with a check for an
expedited grace period. This is done to schedule callbacks for completed
expedited grace periods immediately, which results in the improved boot
time seen in experiments.

In addition to the changes to the default per-phase delays, this change
adds three new kernel parameters: srcutree.srcu_max_nodelay,
srcutree.srcu_max_nodelay_phase, and srcutree.srcu_retry_check_delay.
These allow users to configure the SRCU grace-period scanning delays,
depending on their system configuration requirements.

Signed-off-by: Neeraj Upadhyay <[email protected]>
Tested-by: Marc Zyngier <[email protected]>
---

Changes in v2:

- Change the srcu_max_nodelay default value to consider phase delay
iterations
- Apply Paul's feedback
- Add Marc's Tested-by

.../admin-guide/kernel-parameters.txt | 18 ++++
kernel/rcu/srcutree.c | 82 ++++++++++++++-----
2 files changed, 81 insertions(+), 19 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index af647714c113..7e34086c64f5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5805,6 +5805,24 @@
expediting. Set to zero to disable automatic
expediting.

+ srcutree.srcu_max_nodelay [KNL]
+ Specifies the number of no-delay instances
+ per jiffy for which the SRCU grace period
+ worker thread will be rescheduled with zero
+ delay. Beyond this limit, worker thread will
+ be rescheduled with a sleep delay of one jiffy.
+
+ srcutree.srcu_max_nodelay_phase [KNL]
+ Specifies the per-grace-period phase, number of
+ non-sleeping polls of readers. Beyond this limit,
+ grace period worker thread will be rescheduled
+ with a sleep delay of one jiffy, between each
+ rescan of the readers, for a grace period phase.
+
+ srcutree.srcu_retry_check_delay [KNL]
+ Specifies number of microseconds of non-sleeping
+ delay between each non-sleeping poll of readers.
+
srcutree.small_contention_lim [KNL]
Specifies the number of update-side contention
events per jiffy will be tolerated before
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0db7873f4e95..1c304fec89c0 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
return sum;
}

-#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
-#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
-#define SRCU_MAX_NODELAY_PHASE 3 // Maximum per-GP-phase consecutive no-delay instances.
-#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances.
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below, boot time configurable) to allow SRCU readers to exit
+ * their read-side critical sections. If there are still some readers
+ * after one jiffy, we repeatedly block for one jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
+ // no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
+ // no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// Per-GP-phase no-delay instances, adjusted to allow non-sleeping polls for up to
+// one jiffy. The multiplication by 2 factors in the srcu_get_delay() call
+// made from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
+ (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
+ SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
+
+static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
+module_param(srcu_max_nodelay, ulong, 0444);

/*
* Return grace-period delay, zero if there are expedited grace
@@ -535,7 +577,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
jbase += j - gpstart;
if (!jbase) {
WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
- if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
+ if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
jbase = 1;
}
}
@@ -612,15 +654,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);

-/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after a few microseconds,
- * we repeatedly block for 1-millisecond time periods.
- */
-#define SRCU_RETRY_CHECK_DELAY 5
-
/*
* Start an SRCU grace period.
*/
@@ -706,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
*/
static void srcu_gp_end(struct srcu_struct *ssp)
{
- unsigned long cbdelay;
+ unsigned long cbdelay = 1;
bool cbs;
bool last_lvl;
int cpu;
@@ -726,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
spin_lock_irq_rcu_node(ssp);
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- cbdelay = !!srcu_get_delay(ssp);
+ if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ cbdelay = 0;
+
WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
@@ -927,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
+ unsigned long curdelay;
+
+ curdelay = !srcu_get_delay(ssp);
+
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
return true;
- if (--trycount + !srcu_get_delay(ssp) <= 0)
+ if ((--trycount + curdelay) <= 0)
return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
+ udelay(srcu_retry_check_delay);
}
}

@@ -1588,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
j = jiffies;
if (READ_ONCE(ssp->reschedule_jiffies) == j) {
WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
- if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
+ if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
curdelay = 1;
} else {
WRITE_ONCE(ssp->reschedule_count, 1);
@@ -1680,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
pr_info("Hierarchical SRCU implementation.\n");
if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
+ pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
+ if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
+ pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
+ pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
return 0;
}
early_initcall(srcu_bootup_announce);
--
2.17.1


2022-06-30 06:59:23

by zhangfei

[permalink] [raw]
Subject: Re: [PATCH v2] srcu: Reduce blocking aggressiveness of expedited grace periods further



On 2022/6/30 下午12:12, Neeraj Upadhyay wrote:
> Commit 640a7d37c3f4 ("srcu: Block less aggressively for expedited
> grace periods") highlights a problem where aggressively blocking
> SRCU expedited grace periods, as was introduced in commit
> 282d8998e997 ("srcu: Prevent expedited GPs and blocking readers
> from consuming CPU"), introduces ~2 minutes delay to the overall
> ~3.5 minutes boot time, when starting VMs with "-bios QEMU_EFI.fd"
> cmdline on qemu, which results in very high rate of memslots
> add/remove, which causes > ~6000 synchronize_srcu() calls for
> kvm->srcu SRCU instance.
>
> Below table captures the experiments done by Zhangfei Gao and Shameer
> to measure the boottime impact with various values of non-sleeping
> per phase counts, with HZ_250 and preemption enabled:
>
> +──────────────────────────+────────────────+
> | SRCU_MAX_NODELAY_PHASE | Boot time (s) |
> +──────────────────────────+────────────────+
> | 100 | 30.053 |
> | 150 | 25.151 |
> | 200 | 20.704 |
> | 250 | 15.748 |
> | 500 | 11.401 |
> | 1000 | 11.443 |
> | 10000 | 11.258 |
> | 1000000 | 11.154 |
> +──────────────────────────+────────────────+
>
> Analysis on the experiment results showed improved boot time
> with non blocking delays close to one jiffy duration. This
> was also seen when number of per-phase iterations were scaled
> to one jiffy.
>
> So, this change scales per-grace-period phase number of non-sleeping
> polls, such that, non-sleeping polls are done for one jiffy. In addition
> to this, srcu_get_delay() call in srcu_gp_end(), which is used to calculate
> the delay used for scheduling callbacks, is replaced with the check for
> expedited grace period. This is done, to schedule cbs for completed expedited
> grace periods immediately, which results in improved boot time seen in
> experiments.
>
> In addition to the changes to default per phase delays, this change
> adds 3 new kernel parameters - srcutree.srcu_max_nodelay,
> srcutree.srcu_max_nodelay_phase, srcutree.srcu_retry_check_delay.
> This allows users to configure the srcu grace period scanning delays,
> depending on their system configuration requirements.
>
> Signed-off-by: Neeraj Upadhyay <[email protected]>
> Tested-by: Marc Zyngier <[email protected]>

Tested-by: Zhangfei Gao <[email protected]>

Tested on arm64 with defconfig (CONFIG_HZ_250=y),
booting the Image under qemu with -bios QEMU_EFI.fd.


With this patch
real 0m9.739s
user 0m3.270s
sys 0m0.969s


Without this patch
real 2m40.361s
user 0m3.034s
sys 0m1.162s

5.18-rc6
real 0m8.402s
user 0m3.015s
sys 0m1.102s

Thanks

> ---
>
> Change in v2:
>
> - Change srcu_max_nodelay default value to consider phase delay
> iterations
> - Apply Pauls' feedback
> - Add Marc's Tested-by
>
> .../admin-guide/kernel-parameters.txt | 18 ++++
> kernel/rcu/srcutree.c | 82 ++++++++++++++-----
> 2 files changed, 81 insertions(+), 19 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index af647714c113..7e34086c64f5 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -5805,6 +5805,24 @@
> expediting. Set to zero to disable automatic
> expediting.
>
> + srcutree.srcu_max_nodelay [KNL]
> + Specifies the number of no-delay instances
> + per jiffy for which the SRCU grace period
> + worker thread will be rescheduled with zero
> + delay. Beyond this limit, worker thread will
> + be rescheduled with a sleep delay of one jiffy.
> +
> + srcutree.srcu_max_nodelay_phase [KNL]
> + Specifies the per-grace-period phase, number of
> + non-sleeping polls of readers. Beyond this limit,
> + grace period worker thread will be rescheduled
> + with a sleep delay of one jiffy, between each
> + rescan of the readers, for a grace period phase.
> +
> + srcutree.srcu_retry_check_delay [KNL]
> + Specifies number of microseconds of non-sleeping
> + delay between each non-sleeping poll of readers.
> +
> srcutree.small_contention_lim [KNL]
> Specifies the number of update-side contention
> events per jiffy will be tolerated before
> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> index 0db7873f4e95..1c304fec89c0 100644
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
> return sum;
> }
>
> -#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
> -#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
> -#define SRCU_MAX_NODELAY_PHASE 3 // Maximum per-GP-phase consecutive no-delay instances.
> -#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances.
> +/*
> + * We use an adaptive strategy for synchronize_srcu() and especially for
> + * synchronize_srcu_expedited(). We spin for a fixed time period
> + * (defined below, boot time configurable) to allow SRCU readers to exit
> + * their read-side critical sections. If there are still some readers
> + * after one jiffy, we repeatedly block for one jiffy time periods.
> + * The blocking time is increased as the grace-period age increases,
> + * with max blocking time capped at 10 jiffies.
> + */
> +#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
> +
> +static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
> +module_param(srcu_retry_check_delay, ulong, 0444);
> +
> +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
> +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
> +
> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
> + // no-delay instances.
> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
> + // no-delay instances.
> +
> +#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
> +#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
> +#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
> +// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto
> +// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay()
> +// called from process_srcu().
> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
> + (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
> +
> +// Maximum per-GP-phase consecutive no-delay instances.
> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
> + SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
> + SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
> + SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
> +
> +static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
> +module_param(srcu_max_nodelay_phase, ulong, 0444);
> +
> +// Maximum consecutive no-delay instances.
> +#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
> + SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
> +
> +static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
> +module_param(srcu_max_nodelay, ulong, 0444);
>
> /*
> * Return grace-period delay, zero if there are expedited grace
> @@ -535,7 +577,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
> jbase += j - gpstart;
> if (!jbase) {
> WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
> - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
> + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
> jbase = 1;
> }
> }
> @@ -612,15 +654,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
> }
> EXPORT_SYMBOL_GPL(__srcu_read_unlock);
>
> -/*
> - * We use an adaptive strategy for synchronize_srcu() and especially for
> - * synchronize_srcu_expedited(). We spin for a fixed time period
> - * (defined below) to allow SRCU readers to exit their read-side critical
> - * sections. If there are still some readers after a few microseconds,
> - * we repeatedly block for 1-millisecond time periods.
> - */
> -#define SRCU_RETRY_CHECK_DELAY 5
> -
> /*
> * Start an SRCU grace period.
> */
> @@ -706,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
> */
> static void srcu_gp_end(struct srcu_struct *ssp)
> {
> - unsigned long cbdelay;
> + unsigned long cbdelay = 1;
> bool cbs;
> bool last_lvl;
> int cpu;
> @@ -726,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> spin_lock_irq_rcu_node(ssp);
> idx = rcu_seq_state(ssp->srcu_gp_seq);
> WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
> - cbdelay = !!srcu_get_delay(ssp);
> + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
> + cbdelay = 0;
> +
> WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
> rcu_seq_end(&ssp->srcu_gp_seq);
> gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
> @@ -927,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
> */
> static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
> {
> + unsigned long curdelay;
> +
> + curdelay = !srcu_get_delay(ssp);
> +
> for (;;) {
> if (srcu_readers_active_idx_check(ssp, idx))
> return true;
> - if (--trycount + !srcu_get_delay(ssp) <= 0)
> + if ((--trycount + curdelay) <= 0)
> return false;
> - udelay(SRCU_RETRY_CHECK_DELAY);
> + udelay(srcu_retry_check_delay);
> }
> }
>
> @@ -1588,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
> j = jiffies;
> if (READ_ONCE(ssp->reschedule_jiffies) == j) {
> WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
> - if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
> + if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
> curdelay = 1;
> } else {
> WRITE_ONCE(ssp->reschedule_count, 1);
> @@ -1680,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
> pr_info("Hierarchical SRCU implementation.\n");
> if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
> pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
> + if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
> + pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
> + if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
> + pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
> + pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
> return 0;
> }
> early_initcall(srcu_bootup_announce);

2022-06-30 07:14:05

by Neeraj Upadhyay

[permalink] [raw]
Subject: Re: [PATCH v2] srcu: Reduce blocking aggressiveness of expedited grace periods further



On 6/30/2022 12:04 PM, Zhangfei Gao wrote:
>
>
> On 2022/6/30 下午12:12, Neeraj Upadhyay wrote:
>> Commit 640a7d37c3f4 ("srcu: Block less aggressively for expedited
>> grace periods") highlights a problem where aggressively blocking
>> SRCU expedited grace periods, as was introduced in commit
>> 282d8998e997 ("srcu: Prevent expedited GPs and blocking readers
>> from consuming CPU"), introduces ~2 minutes delay to the overall
>> ~3.5 minutes boot time, when starting VMs with "-bios QEMU_EFI.fd"
>> cmdline on qemu, which results in very high rate of memslots
>> add/remove, which causes > ~6000 synchronize_srcu() calls for
>> kvm->srcu SRCU instance.
>>
>> Below table captures the experiments done by Zhangfei Gao and Shameer
>> to measure the boottime impact with various values of non-sleeping
>> per phase counts, with HZ_250 and preemption enabled:
>>
>> +──────────────────────────+────────────────+
>> | SRCU_MAX_NODELAY_PHASE   | Boot time (s)  |
>> +──────────────────────────+────────────────+
>> | 100                      | 30.053         |
>> | 150                      | 25.151         |
>> | 200                      | 20.704         |
>> | 250                      | 15.748         |
>> | 500                      | 11.401         |
>> | 1000                     | 11.443         |
>> | 10000                    | 11.258         |
>> | 1000000                  | 11.154         |
>> +──────────────────────────+────────────────+
>>
>> Analysis on the experiment results showed improved boot time
>> with non blocking delays close to one jiffy duration. This
>> was also seen when number of per-phase iterations were scaled
>> to one jiffy.
>>
>> So, this change scales per-grace-period phase number of non-sleeping
>> polls, such that, non-sleeping polls are done for one jiffy. In addition
>> to this, srcu_get_delay() call in srcu_gp_end(), which is used to
>> calculate
>> the delay used for scheduling callbacks, is replaced with the check for
>> expedited grace period. This is done, to schedule cbs for completed
>> expedited
>> grace periods immediately, which results in improved boot time seen in
>> experiments.
>>
>> In addition to the changes to default per phase delays, this change
>> adds 3 new kernel parameters - srcutree.srcu_max_nodelay,
>> srcutree.srcu_max_nodelay_phase, srcutree.srcu_retry_check_delay.
>> This allows users to configure the srcu grace period scanning delays,
>> depending on their system configuration requirements.
>>
>> Signed-off-by: Neeraj Upadhyay <[email protected]>
>> Tested-by: Marc Zyngier <[email protected]>
>
> Tested-by: Zhangfei Gao <[email protected]>

Thanks for sharing the test results! I will include your Tested-by tag in
the next version.


Thanks
Neeraj

>
> Test on arm64, defconfig(CONFIG_HZ_250=y)
> qemu boot Image with -bios QEMU_EFI.fd
>
>
> With this patch
> real    0m9.739s
> user    0m3.270s
> sys    0m0.969s
>
>
> Without this patch
> real    2m40.361s
> user    0m3.034s
> sys    0m1.162s
>
> 5.18-rc6
> real    0m8.402s
> user    0m3.015s
> sys     0m1.102s
>
> Thanks
>
>> ---
>>
>> Change in v2:
>>
>>    - Change srcu_max_nodelay default value to consider phase delay
>>      iterations
>>    - Apply Pauls' feedback
>>    - Add Marc's Tested-by
>>
>>   .../admin-guide/kernel-parameters.txt         | 18 ++++
>>   kernel/rcu/srcutree.c                         | 82 ++++++++++++++-----
>>   2 files changed, 81 insertions(+), 19 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt
>> b/Documentation/admin-guide/kernel-parameters.txt
>> index af647714c113..7e34086c64f5 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -5805,6 +5805,24 @@
>>               expediting.  Set to zero to disable automatic
>>               expediting.
>> +    srcutree.srcu_max_nodelay [KNL]
>> +            Specifies the number of no-delay instances
>> +            per jiffy for which the SRCU grace period
>> +            worker thread will be rescheduled with zero
>> +            delay. Beyond this limit, worker thread will
>> +            be rescheduled with a sleep delay of one jiffy.
>> +
>> +    srcutree.srcu_max_nodelay_phase [KNL]
>> +            Specifies the per-grace-period phase, number of
>> +            non-sleeping polls of readers. Beyond this limit,
>> +            grace period worker thread will be rescheduled
>> +            with a sleep delay of one jiffy, between each
>> +            rescan of the readers, for a grace period phase.
>> +
>> +    srcutree.srcu_retry_check_delay [KNL]
>> +            Specifies number of microseconds of non-sleeping
>> +            delay between each non-sleeping poll of readers.
>> +
>>       srcutree.small_contention_lim [KNL]
>>               Specifies the number of update-side contention
>>               events per jiffy will be tolerated before
>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
>> index 0db7873f4e95..1c304fec89c0 100644
>> --- a/kernel/rcu/srcutree.c
>> +++ b/kernel/rcu/srcutree.c
>> @@ -511,10 +511,52 @@ static bool srcu_readers_active(struct
>> srcu_struct *ssp)
>>       return sum;
>>   }
>> -#define SRCU_INTERVAL        1    // Base delay if no expedited GPs
>> pending.
>> -#define SRCU_MAX_INTERVAL    10    // Maximum incremental delay from
>> slow readers.
>> -#define SRCU_MAX_NODELAY_PHASE    3    // Maximum per-GP-phase
>> consecutive no-delay instances.
>> -#define SRCU_MAX_NODELAY    100    // Maximum consecutive no-delay
>> instances.
>> +/*
>> + * We use an adaptive strategy for synchronize_srcu() and especially for
>> + * synchronize_srcu_expedited().  We spin for a fixed time period
>> + * (defined below, boot time configurable) to allow SRCU readers to exit
>> + * their read-side critical sections.  If there are still some readers
>> + * after one jiffy, we repeatedly block for one jiffy time periods.
>> + * The blocking time is increased as the grace-period age increases,
>> + * with max blocking time capped at 10 jiffies.
>> + */
>> +#define SRCU_DEFAULT_RETRY_CHECK_DELAY        5
>> +
>> +static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
>> +module_param(srcu_retry_check_delay, ulong, 0444);
>> +
>> +#define SRCU_INTERVAL        1        // Base delay if no expedited
>> GPs pending.
>> +#define SRCU_MAX_INTERVAL    10        // Maximum incremental delay
>> from slow readers.
>> +
>> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO    3UL    // Lowmark on
>> default per-GP-phase
>> +                            // no-delay instances.
>> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI    1000UL    // Highmark on
>> default per-GP-phase
>> +                            // no-delay instances.
>> +
>> +#define SRCU_UL_CLAMP_LO(val, low)    ((val) > (low) ? (val) : (low))
>> +#define SRCU_UL_CLAMP_HI(val, high)    ((val) < (high) ? (val) : (high))
>> +#define SRCU_UL_CLAMP(val, low, high)
>> SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
>> +// per-GP-phase no-delay instances adjusted to allow non-sleeping
>> poll upto
>> +// one jiffies time duration. Mult by 2 is done to factor in the
>> srcu_get_delay()
>> +// called from process_srcu().
>> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED    \
>> +    (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
>> +
>> +// Maximum per-GP-phase consecutive no-delay instances.
>> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE    \
>> +    SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED,    \
>> +              SRCU_DEFAULT_MAX_NODELAY_PHASE_LO,    \
>> +              SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
>> +
>> +static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
>> +module_param(srcu_max_nodelay_phase, ulong, 0444);
>> +
>> +// Maximum consecutive no-delay instances.
>> +#define SRCU_DEFAULT_MAX_NODELAY    (SRCU_DEFAULT_MAX_NODELAY_PHASE >
>> 100 ?    \
>> +                     SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
>> +
>> +static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
>> +module_param(srcu_max_nodelay, ulong, 0444);
>>   /*
>>    * Return grace-period delay, zero if there are expedited grace
>> @@ -535,7 +577,7 @@ static unsigned long srcu_get_delay(struct
>> srcu_struct *ssp)
>>               jbase += j - gpstart;
>>           if (!jbase) {
>>               WRITE_ONCE(ssp->srcu_n_exp_nodelay,
>> READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
>> -            if (READ_ONCE(ssp->srcu_n_exp_nodelay) >
>> SRCU_MAX_NODELAY_PHASE)
>> +            if (READ_ONCE(ssp->srcu_n_exp_nodelay) >
>> srcu_max_nodelay_phase)
>>                   jbase = 1;
>>           }
>>       }
>> @@ -612,15 +654,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp,
>> int idx)
>>   }
>>   EXPORT_SYMBOL_GPL(__srcu_read_unlock);
>> -/*
>> - * We use an adaptive strategy for synchronize_srcu() and especially for
>> - * synchronize_srcu_expedited().  We spin for a fixed time period
>> - * (defined below) to allow SRCU readers to exit their read-side
>> critical
>> - * sections.  If there are still some readers after a few microseconds,
>> - * we repeatedly block for 1-millisecond time periods.
>> - */
>> -#define SRCU_RETRY_CHECK_DELAY        5
>> -
>>   /*
>>    * Start an SRCU grace period.
>>    */
>> @@ -706,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct
>> srcu_struct *ssp, struct srcu_node *snp
>>    */
>>   static void srcu_gp_end(struct srcu_struct *ssp)
>>   {
>> -    unsigned long cbdelay;
>> +    unsigned long cbdelay = 1;
>>       bool cbs;
>>       bool last_lvl;
>>       int cpu;
>> @@ -726,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
>>       spin_lock_irq_rcu_node(ssp);
>>       idx = rcu_seq_state(ssp->srcu_gp_seq);
>>       WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
>> -    cbdelay = !!srcu_get_delay(ssp);
>> +    if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq),
>> READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
>> +        cbdelay = 0;
>> +
>>       WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
>>       rcu_seq_end(&ssp->srcu_gp_seq);
>>       gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
>> @@ -927,12 +962,16 @@ static void srcu_funnel_gp_start(struct
>> srcu_struct *ssp, struct srcu_data *sdp,
>>    */
>>   static bool try_check_zero(struct srcu_struct *ssp, int idx, int
>> trycount)
>>   {
>> +    unsigned long curdelay;
>> +
>> +    curdelay = !srcu_get_delay(ssp);
>> +
>>       for (;;) {
>>           if (srcu_readers_active_idx_check(ssp, idx))
>>               return true;
>> -        if (--trycount + !srcu_get_delay(ssp) <= 0)
>> +        if ((--trycount + curdelay) <= 0)
>>               return false;
>> -        udelay(SRCU_RETRY_CHECK_DELAY);
>> +        udelay(srcu_retry_check_delay);
>>       }
>>   }
>> @@ -1588,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
>>           j = jiffies;
>>           if (READ_ONCE(ssp->reschedule_jiffies) == j) {
>>               WRITE_ONCE(ssp->reschedule_count,
>> READ_ONCE(ssp->reschedule_count) + 1);
>> -            if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
>> +            if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
>>                   curdelay = 1;
>>           } else {
>>               WRITE_ONCE(ssp->reschedule_count, 1);
>> @@ -1680,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
>>       pr_info("Hierarchical SRCU implementation.\n");
>>       if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
>>           pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n",
>> exp_holdoff);
>> +    if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
>> +        pr_info("\tNon-default retry check delay of %lu us.\n",
>> srcu_retry_check_delay);
>> +    if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
>> +        pr_info("\tNon-default max no-delay of %lu.\n",
>> srcu_max_nodelay);
>> +    pr_info("\tMax phase no-delay instances is %lu.\n",
>> srcu_max_nodelay_phase);
>>       return 0;
>>   }
>>   early_initcall(srcu_bootup_announce);
>

2022-06-30 10:03:27

by Neeraj Upadhyay

[permalink] [raw]
Subject: Re: [PATCH v2] srcu: Reduce blocking agressiveness of expedited grace periods further



On 6/30/2022 2:56 PM, Marc Zyngier wrote:
> On Thu, 30 Jun 2022 05:12:01 +0100,
> Neeraj Upadhyay <[email protected]> wrote:
>>
>> Commit 640a7d37c3f4 ("srcu: Block less aggressively for expedited
>> grace periods") highlights a problem where aggressively blocking
>> SRCU expedited grace periods, as was introduced in commit
>> 282d8998e997 ("srcu: Prevent expedited GPs and blocking readers
>> from consuming CPU"), introduces ~2 minutes delay to the overall
>> ~3.5 minutes boot time, when starting VMs with "-bios QEMU_EFI.fd"
>> cmdline on qemu, which results in very high rate of memslots
>> add/remove, which causes > ~6000 synchronize_srcu() calls for
>> kvm->srcu SRCU instance.
>>
>> Below table captures the experiments done by Zhangfei Gao and Shameer
>> to measure the boottime impact with various values of non-sleeping
>> per phase counts, with HZ_250 and preemption enabled:
>>
>> +──────────────────────────+────────────────+
>> | SRCU_MAX_NODELAY_PHASE | Boot time (s) |
>> +──────────────────────────+────────────────+
>> | 100 | 30.053 |
>> | 150 | 25.151 |
>> | 200 | 20.704 |
>> | 250 | 15.748 |
>> | 500 | 11.401 |
>> | 1000 | 11.443 |
>> | 10000 | 11.258 |
>> | 1000000 | 11.154 |
>> +──────────────────────────+────────────────+
>>
>> Analysis on the experiment results showed improved boot time
>> with non blocking delays close to one jiffy duration. This
>> was also seen when number of per-phase iterations were scaled
>> to one jiffy.
>>
>> So, this change scales the number of non-sleeping polls per grace-period
>> phase such that non-sleeping polls are done for one jiffy. In addition
>> to this, the srcu_get_delay() call in srcu_gp_end(), which is used to
>> calculate the delay used for scheduling callbacks, is replaced with a check
>> for an expedited grace period. This is done to schedule callbacks for
>> completed expedited grace periods immediately, which results in the improved
>> boot time seen in experiments.
>>
>> In addition to the changes to default per phase delays, this change
>> adds 3 new kernel parameters - srcutree.srcu_max_nodelay,
>> srcutree.srcu_max_nodelay_phase, srcutree.srcu_retry_check_delay.
>> This allows users to configure the srcu grace period scanning delays,
>> depending on their system configuration requirements.
>>
>> Signed-off-by: Neeraj Upadhyay <[email protected]>
>> Tested-by: Marc Zyngier <[email protected]>
>> ---
>>
>> Change in v2:
>>
>> - Change srcu_max_nodelay default value to consider phase delay
>> iterations
>> - Apply Paul's feedback
>> - Add Marc's Tested-by
>
> I gave this a go on the same platform as v1, and the result is
> actually much better as I didn't have to add any extra command-line
> option to get to a reasonable result (41s). I think we have a winner.
>

Thank you for testing it!


Thanks
Neeraj

> Thanks again,
>
> M.
>

2022-06-30 10:16:38

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v2] srcu: Reduce blocking agressiveness of expedited grace periods further

On Thu, 30 Jun 2022 05:12:01 +0100,
Neeraj Upadhyay <[email protected]> wrote:
>
> Commit 640a7d37c3f4 ("srcu: Block less aggressively for expedited
> grace periods") highlights a problem where aggressively blocking
> SRCU expedited grace periods, as was introduced in commit
> 282d8998e997 ("srcu: Prevent expedited GPs and blocking readers
> from consuming CPU"), introduces ~2 minutes delay to the overall
> ~3.5 minutes boot time, when starting VMs with "-bios QEMU_EFI.fd"
> cmdline on qemu, which results in very high rate of memslots
> add/remove, which causes > ~6000 synchronize_srcu() calls for
> kvm->srcu SRCU instance.
>
> Below table captures the experiments done by Zhangfei Gao and Shameer
> to measure the boottime impact with various values of non-sleeping
> per phase counts, with HZ_250 and preemption enabled:
>
> +──────────────────────────+────────────────+
> | SRCU_MAX_NODELAY_PHASE | Boot time (s) |
> +──────────────────────────+────────────────+
> | 100 | 30.053 |
> | 150 | 25.151 |
> | 200 | 20.704 |
> | 250 | 15.748 |
> | 500 | 11.401 |
> | 1000 | 11.443 |
> | 10000 | 11.258 |
> | 1000000 | 11.154 |
> +──────────────────────────+────────────────+
>
> Analysis on the experiment results showed improved boot time
> with non blocking delays close to one jiffy duration. This
> was also seen when number of per-phase iterations were scaled
> to one jiffy.
>
> So, this change scales the number of non-sleeping polls per grace-period
> phase such that non-sleeping polls are done for one jiffy. In addition
> to this, the srcu_get_delay() call in srcu_gp_end(), which is used to
> calculate the delay used for scheduling callbacks, is replaced with a check
> for an expedited grace period. This is done to schedule callbacks for
> completed expedited grace periods immediately, which results in the improved
> boot time seen in experiments.
>
> In addition to the changes to default per phase delays, this change
> adds 3 new kernel parameters - srcutree.srcu_max_nodelay,
> srcutree.srcu_max_nodelay_phase, srcutree.srcu_retry_check_delay.
> This allows users to configure the srcu grace period scanning delays,
> depending on their system configuration requirements.
>
> Signed-off-by: Neeraj Upadhyay <[email protected]>
> Tested-by: Marc Zyngier <[email protected]>
> ---
>
> Change in v2:
>
> - Change srcu_max_nodelay default value to consider phase delay
> iterations
> - Apply Paul's feedback
> - Add Marc's Tested-by

I gave this a go on the same platform as v1, and the result is
actually much better as I didn't have to add any extra command-line
option to get to a reasonable result (41s). I think we have a winner.

Thanks again,

M.

--
Without deviation from the norm, progress is not possible.

2022-06-30 16:53:07

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v2] srcu: Reduce blocking agressiveness of expedited grace periods further

On Thu, Jun 30, 2022 at 09:42:01AM +0530, Neeraj Upadhyay wrote:
> Commit 640a7d37c3f4 ("srcu: Block less aggressively for expedited
> grace periods") highlights a problem where aggressively blocking
> SRCU expedited grace periods, as was introduced in commit
> 282d8998e997 ("srcu: Prevent expedited GPs and blocking readers
> from consuming CPU"), introduces ~2 minutes delay to the overall
> ~3.5 minutes boot time, when starting VMs with "-bios QEMU_EFI.fd"
> cmdline on qemu, which results in very high rate of memslots
> add/remove, which causes > ~6000 synchronize_srcu() calls for
> kvm->srcu SRCU instance.
>
> Below table captures the experiments done by Zhangfei Gao and Shameer
> to measure the boottime impact with various values of non-sleeping
> per phase counts, with HZ_250 and preemption enabled:
>
> +──────────────────────────+────────────────+
> | SRCU_MAX_NODELAY_PHASE | Boot time (s) |
> +──────────────────────────+────────────────+
> | 100 | 30.053 |
> | 150 | 25.151 |
> | 200 | 20.704 |
> | 250 | 15.748 |
> | 500 | 11.401 |
> | 1000 | 11.443 |
> | 10000 | 11.258 |
> | 1000000 | 11.154 |
> +──────────────────────────+────────────────+
>
> Analysis on the experiment results showed improved boot time
> with non blocking delays close to one jiffy duration. This
> was also seen when number of per-phase iterations were scaled
> to one jiffy.
>
> So, this change scales the number of non-sleeping polls per grace-period
> phase such that non-sleeping polls are done for one jiffy. In addition
> to this, the srcu_get_delay() call in srcu_gp_end(), which is used to
> calculate the delay used for scheduling callbacks, is replaced with a check
> for an expedited grace period. This is done to schedule callbacks for
> completed expedited grace periods immediately, which results in the improved
> boot time seen in experiments.
>
> In addition to the changes to default per phase delays, this change
> adds 3 new kernel parameters - srcutree.srcu_max_nodelay,
> srcutree.srcu_max_nodelay_phase, srcutree.srcu_retry_check_delay.
> This allows users to configure the srcu grace period scanning delays,
> depending on their system configuration requirements.
>
> Signed-off-by: Neeraj Upadhyay <[email protected]>
> Tested-by: Marc Zyngier <[email protected]>

Looks like great progress, thank you!

I look forward to seeing the next version. In the meantime, I have
queued this one on the experimental branch quic_neeraju.2022.06.30a
for further testing.

Thanx, Paul

> ---
>
> Change in v2:
>
> - Change srcu_max_nodelay default value to consider phase delay
> iterations
> - Apply Paul's feedback
> - Add Marc's Tested-by
>
> .../admin-guide/kernel-parameters.txt | 18 ++++
> kernel/rcu/srcutree.c | 82 ++++++++++++++-----
> 2 files changed, 81 insertions(+), 19 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index af647714c113..7e34086c64f5 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -5805,6 +5805,24 @@
> expediting. Set to zero to disable automatic
> expediting.
>
> + srcutree.srcu_max_nodelay [KNL]
> + Specifies the number of no-delay instances
> + per jiffy for which the SRCU grace period
> + worker thread will be rescheduled with zero
> + delay. Beyond this limit, the worker thread will
> + be rescheduled with a sleep delay of one jiffy.
> +
> + srcutree.srcu_max_nodelay_phase [KNL]
> + Specifies the number of non-sleeping polls of
> + readers per grace-period phase. Beyond this limit,
> + the grace-period worker thread will be rescheduled
> + with a sleep delay of one jiffy between each
> + rescan of the readers for that grace-period phase.
> +
> + srcutree.srcu_retry_check_delay [KNL]
> + Specifies the number of microseconds of non-sleeping
> + delay between each non-sleeping poll of readers.
> +
> srcutree.small_contention_lim [KNL]
> Specifies the number of update-side contention
> events per jiffy will be tolerated before
> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> index 0db7873f4e95..1c304fec89c0 100644
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
> return sum;
> }
>
> -#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
> -#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
> -#define SRCU_MAX_NODELAY_PHASE 3 // Maximum per-GP-phase consecutive no-delay instances.
> -#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances.
> +/*
> + * We use an adaptive strategy for synchronize_srcu() and especially for
> + * synchronize_srcu_expedited(). We spin for a fixed time period
> + * (defined below, boot time configurable) to allow SRCU readers to exit
> + * their read-side critical sections. If there are still some readers
> + * after one jiffy, we repeatedly block for one jiffy time periods.
> + * The blocking time is increased as the grace-period age increases,
> + * with max blocking time capped at 10 jiffies.
> + */
> +#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
> +
> +static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
> +module_param(srcu_retry_check_delay, ulong, 0444);
> +
> +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
> +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
> +
> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
> + // no-delay instances.
> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
> + // no-delay instances.
> +
> +#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
> +#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
> +#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
> +// Per-GP-phase no-delay instances adjusted to allow non-sleeping polls for up to
> +// one jiffy. Multiplication by 2 is done to factor in the srcu_get_delay()
> +// call made from process_srcu().
> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
> + (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
> +
> +// Maximum per-GP-phase consecutive no-delay instances.
> +#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
> + SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
> + SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
> + SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
> +
> +static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
> +module_param(srcu_max_nodelay_phase, ulong, 0444);
> +
> +// Maximum consecutive no-delay instances.
> +#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
> + SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
> +
> +static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
> +module_param(srcu_max_nodelay, ulong, 0444);
>
> /*
> * Return grace-period delay, zero if there are expedited grace
> @@ -535,7 +577,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
> jbase += j - gpstart;
> if (!jbase) {
> WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
> - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
> + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
> jbase = 1;
> }
> }
> @@ -612,15 +654,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
> }
> EXPORT_SYMBOL_GPL(__srcu_read_unlock);
>
> -/*
> - * We use an adaptive strategy for synchronize_srcu() and especially for
> - * synchronize_srcu_expedited(). We spin for a fixed time period
> - * (defined below) to allow SRCU readers to exit their read-side critical
> - * sections. If there are still some readers after a few microseconds,
> - * we repeatedly block for 1-millisecond time periods.
> - */
> -#define SRCU_RETRY_CHECK_DELAY 5
> -
> /*
> * Start an SRCU grace period.
> */
> @@ -706,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
> */
> static void srcu_gp_end(struct srcu_struct *ssp)
> {
> - unsigned long cbdelay;
> + unsigned long cbdelay = 1;
> bool cbs;
> bool last_lvl;
> int cpu;
> @@ -726,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> spin_lock_irq_rcu_node(ssp);
> idx = rcu_seq_state(ssp->srcu_gp_seq);
> WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
> - cbdelay = !!srcu_get_delay(ssp);
> + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
> + cbdelay = 0;
> +
> WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
> rcu_seq_end(&ssp->srcu_gp_seq);
> gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
> @@ -927,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
> */
> static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
> {
> + unsigned long curdelay;
> +
> + curdelay = !srcu_get_delay(ssp);
> +
> for (;;) {
> if (srcu_readers_active_idx_check(ssp, idx))
> return true;
> - if (--trycount + !srcu_get_delay(ssp) <= 0)
> + if ((--trycount + curdelay) <= 0)
> return false;
> - udelay(SRCU_RETRY_CHECK_DELAY);
> + udelay(srcu_retry_check_delay);
> }
> }
>
> @@ -1588,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
> j = jiffies;
> if (READ_ONCE(ssp->reschedule_jiffies) == j) {
> WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
> - if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
> + if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
> curdelay = 1;
> } else {
> WRITE_ONCE(ssp->reschedule_count, 1);
> @@ -1680,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
> pr_info("Hierarchical SRCU implementation.\n");
> if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
> pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
> + if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
> + pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
> + if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
> + pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
> + pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
> return 0;
> }
> early_initcall(srcu_bootup_announce);
> --
> 2.17.1
>