2024-03-08 22:44:52

by Joel Fernandes

Subject: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case

In the synchronize_rcu() common case, we will have fewer than
SR_MAX_USERS_WAKE_FROM_GP users per GP. Waking up the kworker just to
free the last injected wait head is pointless, since at that point all
of the users have already been awakened.

Introduce a new counter to track this and prevent the wakeup in the
common case.
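
In short, the handshake looks like this (a condensed sketch of the
pattern in the diff below, using the same names as the patch):

	/* GP kthread: count the worker as in flight before queuing it,
	 * and undo the count if the work item was already queued. */
	atomic_inc(&rcu_state.srs_cleanups_pending);
	if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
		atomic_dec(&rcu_state.srs_cleanups_pending);

	/* Worker: release-decrement once the done-list manipulations
	 * are finished, so a later acquire read of the counter also
	 * sees those manipulations. */
	atomic_dec_return_release(&rcu_state.srs_cleanups_pending);

	/* GP kthread fast path: free the last wait head only when no
	 * worker is in flight; the acquire pairs with the release
	 * decrement above. */
	if (!atomic_read_acquire(&rcu_state.srs_cleanups_pending))
		rcu_sr_put_wait_head(rcu);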

Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
Rebased on paul/dev of today.

kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++-----
kernel/rcu/tree.h | 1 +
2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9fbb5ab57c84..bd29fe3c76bf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
.srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
rcu_sr_normal_gp_cleanup_work),
+ .srs_cleanups_pending = ATOMIC_INIT(0),
};

/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -1642,8 +1643,11 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
* the done tail list manipulations are protected here.
*/
done = smp_load_acquire(&rcu_state.srs_done_tail);
- if (!done)
+ if (!done) {
+ /* See comments below. */
+ atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
return;
+ }

WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
head = done->next;
@@ -1666,6 +1670,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)

rcu_sr_put_wait_head(rcu);
}
+
+ /* Order list manipulations with atomic access. */
+ atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
}

/*
@@ -1673,7 +1680,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
*/
static void rcu_sr_normal_gp_cleanup(void)
{
- struct llist_node *wait_tail, *next, *rcu;
+ struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
int done = 0;

wait_tail = rcu_state.srs_wait_tail;
@@ -1699,16 +1706,35 @@ static void rcu_sr_normal_gp_cleanup(void)
break;
}

- // concurrent sr_normal_gp_cleanup work might observe this update.
- smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+ /*
+ * Fast path, no more users to process. Remove the last wait head
+ * if no inflight-workers. If there are in-flight workers, let them
+ * remove the last wait head.
+ */
+ WARN_ON_ONCE(!rcu);
ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);

+ if (rcu && rcu_sr_is_wait_head(rcu) && rcu->next == NULL &&
+ /* Order atomic access with list manipulation. */
+ !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
+ wait_tail->next = NULL;
+ rcu_sr_put_wait_head(rcu);
+ smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+ return;
+ }
+
+ /* Concurrent sr_normal_gp_cleanup work might observe this update. */
+ smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+
/*
* We schedule a work in order to perform a final processing
* of outstanding users(if still left) and releasing wait-heads
* added by rcu_sr_normal_gp_init() call.
*/
- queue_work(sync_wq, &rcu_state.srs_cleanup_work);
+ atomic_inc(&rcu_state.srs_cleanups_pending);
+ if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work)) {
+ atomic_dec(&rcu_state.srs_cleanups_pending);
+ }
}

/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bae7925c497f..affcb92a358c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -420,6 +420,7 @@ struct rcu_state {
struct llist_node *srs_done_tail; /* ready for GP users. */
struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
struct work_struct srs_cleanup_work;
+ atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
};

/* Values for rcu_state structure's gp_flags field. */
--
2.34.1



2024-03-08 22:45:02

by Joel Fernandes

Subject: [PATCH v2 rcu/dev 2/2] rcu/tree: Add comments explaining now-offline-CPU QS reports

This is a confusing piece of code (rightfully so, as the issue it deals
with is complex). Recent discussions brought up a question -- what
prevents rcu_implicit_dyntick_qs() from warning about QS reports for
offline CPUs?

QS reporting for now-offline CPUs should only happen from:
- gp_init()
- rcutree_report_cpu_dead()

Add some comments to this code explaining how QS reporting is not
missed when these functions are concurrently running.

Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++++++-
1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bd29fe3c76bf..f3582f843a05 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1917,7 +1917,22 @@ static noinline_for_stack bool rcu_gp_init(void)
trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
- /* Quiescent states for tasks on any now-offline CPUs. */
+ /*
+ * === Quiescent states for tasks on any now-offline CPUs. ===
+ *
+ * QS reporting for now-offline CPUs should only be performed from
+ * either here, i.e., gp_init() or from rcutree_report_cpu_dead().
+ *
+ * Note that, when reporting quiescent states for now-offline CPUs,
+ * the sequence of code doing those reports while also accessing
+ * ->qsmask and ->qsmaskinitnext has to be an atomic sequence so
+ * that QS reporting is not missed! Otherwise it is possible that
+ * rcu_implicit_dyntick_qs() screams. This is ensured by keeping
+ * the rnp->lock acquired throughout these QS-reporting
+ * sequences, which is also acquired in
+ * rcutree_report_cpu_dead(), so acquiring ofl_lock is not
+ * necessary here to synchronize with that function.
+ */
mask = rnp->qsmask & ~rnp->qsmaskinitnext;
rnp->rcu_gp_init_mask = mask;
if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
@@ -5116,6 +5131,25 @@ void rcutree_report_cpu_dead(void)
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state);
+
+ /*
+ * === Quiescent state reporting for now-offline CPUs ===
+ *
+ * QS reporting for now-offline CPUs should only be performed from
+ * either here, i.e. rcutree_report_cpu_dead(), or gp_init().
+ *
+ * Note that, when reporting quiescent states for now-offline CPUs,
+ * the sequence of code doing those reports while also accessing
+ * ->qsmask and ->qsmaskinitnext has to be an atomic sequence so
+ * that QS reporting is not missed! Otherwise it is possible that
+ * rcu_implicit_dyntick_qs() screams. This is ensured by keeping
+ * the rnp->lock acquired throughout these QS-reporting sequences, which
+ * is also acquired in gp_init().
+ * One slight exception to this rule is below, where we release and
+ * reacquire the lock after a QS report, but before we clear the
+ * ->qsmaskinitnext bit. That is OK to do, because gp_init() will
+ * report a QS again if it acquired the rnp->lock before we
+ * reacquire it below.
+ */
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
rcu_disable_urgency_upon_qs(rdp);
--
2.34.1


2024-03-10 19:43:57

by Paul E. McKenney

Subject: Re: [PATCH v2 rcu/dev 2/2] rcu/tree: Add comments explaining now-offline-CPU QS reports

On Fri, Mar 08, 2024 at 05:44:38PM -0500, Joel Fernandes (Google) wrote:
> This is a confusing piece of code (rightfully so, as the issue it deals
> with is complex). Recent discussions brought up a question -- what
> prevents rcu_implicit_dyntick_qs() from warning about QS reports for
> offline CPUs?
>
> QS reporting for now-offline CPUs should only happen from:
> - gp_init()
> - rcutree_report_cpu_dead()
>
> Add some comments to this code explaining how QS reporting is not
> missed when these functions are concurrently running.
>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>

Thank you for putting this together!

A couple of questions below.

Thanx, Paul

> ---
> kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++++++-
> 1 file changed, 35 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index bd29fe3c76bf..f3582f843a05 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -1917,7 +1917,22 @@ static noinline_for_stack bool rcu_gp_init(void)

Would it make sense to tag the earlier arch_spin_lock(&rcu_state.ofl_lock)
as preventing a grace period from starting concurrently with
rcutree_report_cpu_dead()?

> trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
> rnp->level, rnp->grplo,
> rnp->grphi, rnp->qsmask);
> - /* Quiescent states for tasks on any now-offline CPUs. */
> + /*
> + * === Quiescent states for tasks on any now-offline CPUs. ===
> + *
> + * QS reporting for now-offline CPUs should only be performed from
> + * either here, i.e., gp_init() or from rcutree_report_cpu_dead().
> + *
> + * Note that, when reporting quiescent states for now-offline CPUs,
> + * the sequence of code doing those reports while also accessing
> + * ->qsmask and ->qsmaskinitnext, has to be an atomic sequence so
> + * that QS reporting is not missed! Otherwise it is possible that
> + * rcu_implicit_dyntick_qs() screams. This is ensured by keeping
> + * the rnp->lock acquired throughout these QS-reporting
> + * sequences, which is also acquired in
> + * rcutree_report_cpu_dead(), so, acquiring ofl_lock is not
> + * necessary here to synchronize with that function.
> + */

Would it be better to put the long-form description in the "Hotplug
CPU" section of Documentation/RCU/Design/Requirements/Requirements.rst?
I will be the first to admit that this section is not as detailed as it
needs to be. This section is already referenced by the block comment
preceding the WARN_ON_ONCE() in rcu_implicit_dyntick_qs(), which is
where people will look first if any of this gets messed up.

Then these other places can refer to that comment or to that section of
Requirements.rst, allowing them to focus on the corresponding piece of
the puzzle.

> mask = rnp->qsmask & ~rnp->qsmaskinitnext;
> rnp->rcu_gp_init_mask = mask;
> if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
> @@ -5116,6 +5131,25 @@ void rcutree_report_cpu_dead(void)
> raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
> rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
> rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state);
> +
> + /*
> + * === Quiescent state reporting for now-offline CPUs ===
> + *
> + * QS reporting for now-offline CPUs should only be performed from
> + * either here, i.e. rcutree_report_cpu_dead(), or gp_init().
> + *
> + * Note that, when reporting quiescent states for now-offline CPUs,
> + * the sequence of code doing those reports while also accessing
> + * ->qsmask and ->qsmaskinitnext, has to be an atomic sequence so
> + * that QS reporting is not missed! Otherwise it is possible that
> + * rcu_implicit_dyntick_qs() screams. This is ensured by keeping
> + * the rnp->lock acquired throughout these QS-reporting sequences, which
> + * is also acquired in gp_init().
> + * One slight change to this rule is below, where we release and
> + * reacquire the lock after a QS report, but before we clear the
> + * ->qsmaskinitnext bit. That is OK to do, because gp_init() will
> + * report a QS again if it acquired the rnp->lock before we
> + * reacquire it below.
> + */

And then this need only say what is happening right here, but possibly
moved to within the following "if" statement, at which point we know that
we are in a grace period that cannot end until we report the quiescent
state (which releases the rcu_node structure's ->lock) and a new grace
period cannot look at this rcu_node structure's ->qsmaskinitnext until
we release rcu_state.ofl_lock.
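
For reference, the sequence in question looks roughly like this (a
simplified sketch of rcutree_report_cpu_dead(), details elided):

	arch_spin_lock(&rcu_state.ofl_lock);
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	...
	if (rnp->qsmask & mask) {
		/* Releases rnp->lock; ofl_lock is still held. */
		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
	}
	/* A new GP cannot sample ->qsmaskinitnext until ofl_lock
	 * is released, so this clearing cannot be missed. */
	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	arch_spin_unlock(&rcu_state.ofl_lock);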

Thoughts?

> if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
> /* Report quiescent state -before- changing ->qsmaskinitnext! */
> rcu_disable_urgency_upon_qs(rdp);
> --
> 2.34.1
>

2024-03-11 16:04:38

by Joel Fernandes

Subject: Re: [PATCH v2 rcu/dev 2/2] rcu/tree: Add comments explaining now-offline-CPU QS reports



On 3/10/2024 3:43 PM, Paul E. McKenney wrote:
> On Fri, Mar 08, 2024 at 05:44:38PM -0500, Joel Fernandes (Google) wrote:
>> This is a confusing piece of code (rightfully so, as the issue it deals
>> with is complex). Recent discussions brought up a question -- what
>> prevents rcu_implicit_dyntick_qs() from warning about QS reports for
>> offline CPUs?
>>
>> QS reporting for now-offline CPUs should only happen from:
>> - gp_init()
>> - rcutree_report_cpu_dead()
>>
>> Add some comments to this code explaining how QS reporting is not
>> missed when these functions are concurrently running.
>>
>> Signed-off-by: Joel Fernandes (Google) <[email protected]>
>
> Thank you for putting this together!
>
> A couple of questions below.
>
> Thanx, Paul
>
>> ---
>> kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++++++-
>> 1 file changed, 35 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>> index bd29fe3c76bf..f3582f843a05 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -1917,7 +1917,22 @@ static noinline_for_stack bool rcu_gp_init(void)
>
> Would it make sense to tag the earlier arch_spin_lock(&rcu_state.ofl_lock)
> as preventing grace period from starting concurrently with
> rcutree_report_cpu_dead()?

Yes, that makes sense.

>
>> trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
>> rnp->level, rnp->grplo,
>> rnp->grphi, rnp->qsmask);
>> - /* Quiescent states for tasks on any now-offline CPUs. */
>> + /*
>> + * === Quiescent states for tasks on any now-offline CPUs. ===
>> + *
>> + * QS reporting for now-offline CPUs should only be performed from
>> + * either here, i.e., gp_init() or from rcutree_report_cpu_dead().
>> + *
>> + * Note that, when reporting quiescent states for now-offline CPUs,
>> + * the sequence of code doing those reports while also accessing
>> + * ->qsmask and ->qsmaskinitnext, has to be an atomic sequence so
>> + * that QS reporting is not missed! Otherwise it is possible that
>> + * rcu_implicit_dyntick_qs() screams. This is ensured by keeping
>> + * the rnp->lock acquired throughout these QS-reporting
>> + * sequences, which is also acquired in
>> + * rcutree_report_cpu_dead(), so, acquiring ofl_lock is not
>> + * necessary here to synchronize with that function.
>> + */
>
> Would it be better to put the long-form description in the "Hotplug
> CPU" section of Documentation/RCU/Design/Requirements/Requirements.rst?

Yes, totally. In fact, I see the following already in the Hotplug
section of Requirements.rst:

During the checking/modification of RCU's hotplug bookkeeping, the
corresponding CPU's leaf node lock is held. This avoids race conditions
between RCU's hotplug notifier hooks, the grace period initialization
code, and the FQS loop, all of which refer to or modify this bookkeeping.
--

So I/we could just expand it there and refer to the .rst from the code.

> I will be the first to admit that this section is not as detailed as it
> needs to be. This section is already referenced by the block comment
> preceding the WARN_ON_ONCE() in rcu_implicit_dyntick_qs(), which is
> where people will look first if any of this gets messed up.

Yes, great point, and it is referenced in rcu_gp_init() as well.
>
> Then these other places can refer to that comment or to that section of
> Requirements.rst, allowing them to focus on the corresponding piece of
> the puzzle.

Makes sense.

>> mask = rnp->qsmask & ~rnp->qsmaskinitnext;
>> rnp->rcu_gp_init_mask = mask;
>> if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
>> @@ -5116,6 +5131,25 @@ void rcutree_report_cpu_dead(void)
>> raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
>> rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
>> rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state);
>> +
>> + /*
>> + * === Quiescent state reporting for now-offline CPUs ===
>> + *
>> + * QS reporting for now-offline CPUs should only be performed from
>> + * either here, i.e. rcutree_report_cpu_dead(), or gp_init().
>> + *
>> + * Note that, when reporting quiescent states for now-offline CPUs,
>> + * the sequence of code doing those reports while also accessing
>> + * ->qsmask and ->qsmaskinitnext, has to be an atomic sequence so
>> + * that QS reporting is not missed! Otherwise it is possible that
>> + * rcu_implicit_dyntick_qs() screams. This is ensured by keeping
>> + * the rnp->lock acquired throughout these QS-reporting sequences, which
>> + * is also acquired in gp_init().
>> + * One slight change to this rule is below, where we release and
>> + * reacquire the lock after a QS report, but before we clear the
>> + * ->qsmaskinitnext bit. That is OK to do, because gp_init() will
>> + * report a QS again if it acquired the rnp->lock before we
>> + * reacquire it below.
>> + */
>
> And then this need only say what is happening right here, but possibly
> moved to within the following "if" statement, at which point we know that
> we are in a grace period that cannot end until we report the quiescent
> state (which releases the rcu_node structure's ->lock) and a new grace
> period cannot look at this rcu_node structure's ->qsmaskinitnext until
> we release rcu_state.ofl_lock.

Yes, it makes sense, and we should mention the ofl_lock as well, as you note.

I have a trip starting in two weeks that goes on for three weeks, so I'm
scrambling a bit for time and may get to this only afterward. If Neeraj
is interested in documenting this, he is more than welcome, especially
since he also understands this code very well ;-).

(See what I did there with the 'also' ? :P)

thanks,

- Joel




2024-03-18 18:58:26

by Uladzislau Rezki

Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case

Hello, Joel!

Sorry for the late check; a few comments below:

> In the synchronize_rcu() common case, we will have less than
> SR_MAX_USERS_WAKE_FROM_GP number of users per GP. Waking up the kworker
> is pointless just to free the last injected wait head since at that point,
> all the users have already been awakened.
>
> Introduce a new counter to track this and prevent the wakeup in the
> common case.
>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> Rebased on paul/dev of today.
>
> kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++-----
> kernel/rcu/tree.h | 1 +
> 2 files changed, 32 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 9fbb5ab57c84..bd29fe3c76bf 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
> .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
> .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
> rcu_sr_normal_gp_cleanup_work),
> + .srs_cleanups_pending = ATOMIC_INIT(0),
> };
>
> /* Dump rcu_node combining tree at boot to verify correct setup. */
> @@ -1642,8 +1643,11 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> * the done tail list manipulations are protected here.
> */
> done = smp_load_acquire(&rcu_state.srs_done_tail);
> - if (!done)
> + if (!done) {
> + /* See comments below. */
> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> return;
> + }
>
> WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
> head = done->next;
> @@ -1666,6 +1670,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
>
> rcu_sr_put_wait_head(rcu);
> }
> +
> + /* Order list manipulations with atomic access. */
> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> }
>
> /*
> @@ -1673,7 +1680,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> */
> static void rcu_sr_normal_gp_cleanup(void)
> {
> - struct llist_node *wait_tail, *next, *rcu;
> + struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
> int done = 0;
>
> wait_tail = rcu_state.srs_wait_tail;
> @@ -1699,16 +1706,35 @@ static void rcu_sr_normal_gp_cleanup(void)
> break;
> }
>
> - // concurrent sr_normal_gp_cleanup work might observe this update.
> - smp_store_release(&rcu_state.srs_done_tail, wait_tail);
> + /*
> + * Fast path, no more users to process. Remove the last wait head
> + * if no inflight-workers. If there are in-flight workers, let them
> + * remove the last wait head.
> + */
> + WARN_ON_ONCE(!rcu);
>
This assumption is not correct. "rcu" can in fact be NULL.

> ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
>
> + if (rcu && rcu_sr_is_wait_head(rcu) && rcu->next == NULL &&
> + /* Order atomic access with list manipulation. */
> + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
> + wait_tail->next = NULL;
> + rcu_sr_put_wait_head(rcu);
> + smp_store_release(&rcu_state.srs_done_tail, wait_tail);
> + return;
> + }
> +
> + /* Concurrent sr_normal_gp_cleanup work might observe this update. */
> + smp_store_release(&rcu_state.srs_done_tail, wait_tail);
> +
> /*
> * We schedule a work in order to perform a final processing
> * of outstanding users(if still left) and releasing wait-heads
> * added by rcu_sr_normal_gp_init() call.
> */
> - queue_work(sync_wq, &rcu_state.srs_cleanup_work);
> + atomic_inc(&rcu_state.srs_cleanups_pending);
> + if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work)) {
> + atomic_dec(&rcu_state.srs_cleanups_pending);
> + }
> }
No need for an extra "{}" pair.

>
> /*
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index bae7925c497f..affcb92a358c 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -420,6 +420,7 @@ struct rcu_state {
> struct llist_node *srs_done_tail; /* ready for GP users. */
> struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
> struct work_struct srs_cleanup_work;
> + atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
> };
>
> /* Values for rcu_state structure's gp_flags field. */
> --
> 2.34.1
>

2024-03-18 21:07:47

by Joel Fernandes

Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case



> On Mar 18, 2024, at 2:58 PM, Uladzislau Rezki <[email protected]> wrote:
>
> Hello, Joel!
>
> Sorry for late checking, see below few comments:
>
>> In the synchronize_rcu() common case, we will have less than
>> SR_MAX_USERS_WAKE_FROM_GP number of users per GP. Waking up the kworker
>> is pointless just to free the last injected wait head since at that point,
>> all the users have already been awakened.
>>
>> Introduce a new counter to track this and prevent the wakeup in the
>> common case.
>>
>> Signed-off-by: Joel Fernandes (Google) <[email protected]>
>> ---
>> Rebased on paul/dev of today.
>>
>> kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++-----
>> kernel/rcu/tree.h | 1 +
>> 2 files changed, 32 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>> index 9fbb5ab57c84..bd29fe3c76bf 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
>> .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
>> .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
>> rcu_sr_normal_gp_cleanup_work),
>> + .srs_cleanups_pending = ATOMIC_INIT(0),
>> };
>>
>> /* Dump rcu_node combining tree at boot to verify correct setup. */
>> @@ -1642,8 +1643,11 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
>> * the done tail list manipulations are protected here.
>> */
>> done = smp_load_acquire(&rcu_state.srs_done_tail);
>> - if (!done)
>> + if (!done) {
>> + /* See comments below. */
>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
>> return;
>> + }
>>
>> WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
>> head = done->next;
>> @@ -1666,6 +1670,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
>>
>> rcu_sr_put_wait_head(rcu);
>> }
>> +
>> + /* Order list manipulations with atomic access. */
>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
>> }
>>
>> /*
>> @@ -1673,7 +1680,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
>> */
>> static void rcu_sr_normal_gp_cleanup(void)
>> {
>> - struct llist_node *wait_tail, *next, *rcu;
>> + struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
>> int done = 0;
>>
>> wait_tail = rcu_state.srs_wait_tail;
>> @@ -1699,16 +1706,35 @@ static void rcu_sr_normal_gp_cleanup(void)
>> break;
>> }
>>
>> - // concurrent sr_normal_gp_cleanup work might observe this update.
>> - smp_store_release(&rcu_state.srs_done_tail, wait_tail);
>> + /*
>> + * Fast path, no more users to process. Remove the last wait head
>> + * if no inflight-workers. If there are in-flight workers, let them
>> + * remove the last wait head.
>> + */
>> + WARN_ON_ONCE(!rcu);
>>
> This assumption is not correct. An "rcu" can be NULL in fact.

Hmm, I could never trigger that. Are you saying that is true after Neeraj's recent patch, or something else? Note that after Neeraj's patch to handle the lack of head availability it could be true, so I requested him to rebase his patch on top of this one.

However, I will revisit my patch and check whether it could occur, but please let me know if you know of a sequence of events that makes it NULL.
>
>> ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
>>
>> + if (rcu && rcu_sr_is_wait_head(rcu) && rcu->next == NULL &&
>> + /* Order atomic access with list manipulation. */
>> + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
>> + wait_tail->next = NULL;
>> + rcu_sr_put_wait_head(rcu);
>> + smp_store_release(&rcu_state.srs_done_tail, wait_tail);
>> + return;
>> + }
>> +
>> + /* Concurrent sr_normal_gp_cleanup work might observe this update. */
>> + smp_store_release(&rcu_state.srs_done_tail, wait_tail);
>> +
>> /*
>> * We schedule a work in order to perform a final processing
>> * of outstanding users(if still left) and releasing wait-heads
>> * added by rcu_sr_normal_gp_init() call.
>> */
>> - queue_work(sync_wq, &rcu_state.srs_cleanup_work);
>> + atomic_inc(&rcu_state.srs_cleanups_pending);
>> + if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work)) {
>> + atomic_dec(&rcu_state.srs_cleanups_pending);
>> + }
>> }
> No need an extra "{}" pair.

I do prefer it for readability, but I am OK with dropping it.

Thanks!

- Joel


>
>>
>> /*
>> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
>> index bae7925c497f..affcb92a358c 100644
>> --- a/kernel/rcu/tree.h
>> +++ b/kernel/rcu/tree.h
>> @@ -420,6 +420,7 @@ struct rcu_state {
>> struct llist_node *srs_done_tail; /* ready for GP users. */
>> struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
>> struct work_struct srs_cleanup_work;
>> + atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
>> };
>>
>> /* Values for rcu_state structure's gp_flags field. */
>> --
>> 2.34.1
>>

2024-03-19 09:53:19

by Uladzislau Rezki

Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case

On Mon, Mar 18, 2024 at 05:05:31PM -0400, Joel Fernandes wrote:
>
>
> > On Mar 18, 2024, at 2:58 PM, Uladzislau Rezki <[email protected]> wrote:
> >
> > Hello, Joel!
> >
> > Sorry for late checking, see below few comments:
> >
> >> In the synchronize_rcu() common case, we will have less than
> >> SR_MAX_USERS_WAKE_FROM_GP number of users per GP. Waking up the kworker
> >> is pointless just to free the last injected wait head since at that point,
> >> all the users have already been awakened.
> >>
> >> Introduce a new counter to track this and prevent the wakeup in the
> >> common case.
> >>
> >> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> >> ---
> >> Rebased on paul/dev of today.
> >>
> >> kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++-----
> >> kernel/rcu/tree.h | 1 +
> >> 2 files changed, 32 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> >> index 9fbb5ab57c84..bd29fe3c76bf 100644
> >> --- a/kernel/rcu/tree.c
> >> +++ b/kernel/rcu/tree.c
> >> @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
> >> .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
> >> .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
> >> rcu_sr_normal_gp_cleanup_work),
> >> + .srs_cleanups_pending = ATOMIC_INIT(0),
> >> };
> >>
> >> /* Dump rcu_node combining tree at boot to verify correct setup. */
> >> @@ -1642,8 +1643,11 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> >> * the done tail list manipulations are protected here.
> >> */
> >> done = smp_load_acquire(&rcu_state.srs_done_tail);
> >> - if (!done)
> >> + if (!done) {
> >> + /* See comments below. */
> >> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> >> return;
> >> + }
> >>
> >> WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
> >> head = done->next;
> >> @@ -1666,6 +1670,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> >>
> >> rcu_sr_put_wait_head(rcu);
> >> }
> >> +
> >> + /* Order list manipulations with atomic access. */
> >> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> >> }
> >>
> >> /*
> >> @@ -1673,7 +1680,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> >> */
> >> static void rcu_sr_normal_gp_cleanup(void)
> >> {
> >> - struct llist_node *wait_tail, *next, *rcu;
> >> + struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
> >> int done = 0;
> >>
> >> wait_tail = rcu_state.srs_wait_tail;
> >> @@ -1699,16 +1706,35 @@ static void rcu_sr_normal_gp_cleanup(void)
> >> break;
> >> }
> >>
> >> - // concurrent sr_normal_gp_cleanup work might observe this update.
> >> - smp_store_release(&rcu_state.srs_done_tail, wait_tail);
> >> + /*
> >> + * Fast path, no more users to process. Remove the last wait head
> >> + * if no inflight-workers. If there are in-flight workers, let them
> >> + * remove the last wait head.
> >> + */
> >> + WARN_ON_ONCE(!rcu);
> >>
> > This assumption is not correct. An "rcu" can be NULL in fact.
>
> Hmm I could never trigger that. Are you saying that is true after Neeraj recent patch or something else?
> Note, after Neeraj patch to handle the lack of heads availability, it could be true so I requested
> him to rebase his patch on top of this one.
>
> However I will revisit my patch and look for if it could occur but please let me know if you knew of a sequence of events to make it NULL.
> >
I think we should either agree on your patch first, otherwise it becomes
a bit messy, or go with Neeraj's as a first step and then work on yours.
So, I reviewed this patch based on Paul's latest dev branch. I see that
Neeraj's patch needs further work.

So this is true even without Neeraj's patch. Consider the following case:

3 2 1 0
wh -> cb -> cb -> cb -> NULL

We start to process from 2 and handle all clients; in the end, "rcu"
points to NULL and triggers the WARN_ON_ONCE(). I see this splat
during boot:

<snip>
[ 0.927699][ T16] ------------[ cut here ]------------
[ 0.930867][ T16] WARNING: CPU: 0 PID: 16 at kernel/rcu/tree.c:1721 rcu_gp_cleanup+0x37b/0x4a0
[ 0.930490][ T1] acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
[ 0.931401][ T16] Modules linked in:
[ 0.932400][ T1] PCI: Using configuration type 1 for base access
[ 0.932771][ T16]
[ 0.932773][ T16] CPU: 0 PID: 16 Comm: rcu_sched Not tainted 6.8.0-rc2-00089-g65ae0a6b86f0-dirty #1156
[ 0.937780][ T16] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 0.939402][ T16] RIP: 0010:rcu_gp_cleanup+0x37b/0x4a0
[ 0.940636][ T16] Code: b0 4b bd 72 09 48 81 ff e8 b0 4b bd 76 1e 4c 8b 27 48 83 c7 10 e8 a5 8e fb ff 4c 89 23 83 ed 01 74 0a 4c 89 e7 48 85 ff 75 d2 <0f> 0b 48 8b 35 14 d0 fd 02 48 89 1d 8d 64 d0 01 48 83 c4 08 48 c7
[ 0.942402][ T16] RSP: 0018:ffff9b4a8008fe88 EFLAGS: 00010246
[ 0.943648][ T16] RAX: 0000000000000000 RBX: ffffffffbd4bb0a8 RCX: 6c9b26c9b26c9b27
[ 0.944751][ T16] RDX: 0000000000000000 RSI: 00000000374b92b6 RDI: 0000000000000000
[ 0.945757][ T16] RBP: 0000000000000004 R08: fffffffffff54ea1 R09: 0000000000000000
[ 0.946753][ T16] R10: ffff89070098c278 R11: 0000000000000001 R12: 0000000000000000
[ 0.947752][ T16] R13: fffffffffffffcbc R14: 0000000000000000 R15: ffffffffbd3f1300
[ 0.948764][ T16] FS: 0000000000000000(0000) GS:ffff8915efe00000(0000) knlGS:0000000000000000
[ 0.950403][ T16] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.951656][ T16] CR2: ffff89163ffff000 CR3: 00000002eae26000 CR4: 00000000000006f0
[ 0.952755][ T16] Call Trace:
[ 0.953597][ T16] <TASK>
[ 0.955404][ T16] ? __warn+0x80/0x140
[ 0.956608][ T16] ? rcu_gp_cleanup+0x37b/0x4a0
[ 0.957621][ T16] ? report_bug+0x15d/0x180
[ 0.959403][ T16] ? handle_bug+0x3c/0x70
[ 0.960616][ T16] ? exc_invalid_op+0x17/0x70
[ 0.961620][ T16] ? asm_exc_invalid_op+0x1a/0x20
[ 0.962627][ T16] ? rcu_gp_cleanup+0x37b/0x4a0
[ 0.963622][ T16] ? rcu_gp_cleanup+0x36b/0x4a0
[ 0.965403][ T16] ? __pfx_rcu_gp_kthread+0x10/0x10
[ 0.967402][ T16] rcu_gp_kthread+0xf7/0x180
[ 0.968619][ T16] kthread+0xd3/0x100
[ 0.969602][ T16] ? __pfx_kthread+0x10/0x10
[ 0.971402][ T16] ret_from_fork+0x34/0x50
[ 0.972613][ T16] ? __pfx_kthread+0x10/0x10
[ 0.973615][ T16] ret_from_fork_asm+0x1b/0x30
[ 0.974624][ T16] </TASK>
[ 0.975587][ T16] ---[ end trace 0000000000000000 ]---
<snip>
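
For reference, the loop in question, simplified (as I read
rcu_sr_normal_gp_cleanup(); llist_for_each_safe() leaves "rcu" NULL
when the list is exhausted before a second wait head is found or the
SR_MAX_USERS_WAKE_FROM_GP limit is reached):

<snip>
	llist_for_each_safe(rcu, next, wait_tail->next) {
		/* Never taken for the list above: no second wait head. */
		if (rcu_sr_is_wait_head(rcu))
			break;

		/* Wake up the synchronize_rcu() user and unlink it. */
		rcu_sr_normal_complete(rcu);
		wait_tail->next = next;

		/* Also not taken here, assuming the limit exceeds 3. */
		if (++done == SR_MAX_USERS_WAKE_FROM_GP)
			break;
	}
	/* Here "rcu" is NULL, so WARN_ON_ONCE(!rcu) fires. */
<snip>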

--
Uladzislau Rezki

2024-03-19 14:30:21

by Joel Fernandes

Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case



> On Mar 19, 2024, at 5:53 AM, Uladzislau Rezki <[email protected]> wrote:
>
> On Mon, Mar 18, 2024 at 05:05:31PM -0400, Joel Fernandes wrote:
>>
>>
>>>> On Mar 18, 2024, at 2:58 PM, Uladzislau Rezki <[email protected]> wrote:
>>>
>>> Hello, Joel!
>>>
>>> Sorry for late checking, see below few comments:
>>>
>>>> In the synchronize_rcu() common case, we will have less than
>>>> SR_MAX_USERS_WAKE_FROM_GP number of users per GP. Waking up the kworker
>>>> is pointless just to free the last injected wait head since at that point,
>>>> all the users have already been awakened.
>>>>
>>>> Introduce a new counter to track this and prevent the wakeup in the
>>>> common case.
>>>>
>>>> Signed-off-by: Joel Fernandes (Google) <[email protected]>
>>>> ---
>>>> Rebased on paul/dev of today.
>>>>
>>>> kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++-----
>>>> kernel/rcu/tree.h | 1 +
>>>> 2 files changed, 32 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>>>> index 9fbb5ab57c84..bd29fe3c76bf 100644
>>>> --- a/kernel/rcu/tree.c
>>>> +++ b/kernel/rcu/tree.c
>>>> @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
>>>> .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
>>>> .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
>>>> rcu_sr_normal_gp_cleanup_work),
>>>> + .srs_cleanups_pending = ATOMIC_INIT(0),
>>>> };
>>>>
>>>> /* Dump rcu_node combining tree at boot to verify correct setup. */
>>>> @@ -1642,8 +1643,11 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
>>>> * the done tail list manipulations are protected here.
>>>> */
>>>> done = smp_load_acquire(&rcu_state.srs_done_tail);
>>>> - if (!done)
>>>> + if (!done) {
>>>> + /* See comments below. */
>>>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
>>>> return;
>>>> + }
>>>>
>>>> WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
>>>> head = done->next;
>>>> @@ -1666,6 +1670,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
>>>>
>>>> rcu_sr_put_wait_head(rcu);
>>>> }
>>>> +
>>>> + /* Order list manipulations with atomic access. */
>>>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
>>>> }
>>>>
>>>> /*
>>>> @@ -1673,7 +1680,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
>>>> */
>>>> static void rcu_sr_normal_gp_cleanup(void)
>>>> {
>>>> - struct llist_node *wait_tail, *next, *rcu;
>>>> + struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
>>>> int done = 0;
>>>>
>>>> wait_tail = rcu_state.srs_wait_tail;
>>>> @@ -1699,16 +1706,35 @@ static void rcu_sr_normal_gp_cleanup(void)
>>>> break;
>>>> }
>>>>
>>>> - // concurrent sr_normal_gp_cleanup work might observe this update.
>>>> - smp_store_release(&rcu_state.srs_done_tail, wait_tail);
>>>> + /*
>>>> + * Fast path, no more users to process. Remove the last wait head
>>>> + * if no inflight-workers. If there are in-flight workers, let them
>>>> + * remove the last wait head.
>>>> + */
>>>> + WARN_ON_ONCE(!rcu);
>>>>
>>> This assumption is not correct. An "rcu" can be NULL in fact.
>>
>> Hmm I could never trigger that. Are you saying that is true after Neeraj recent patch or something else?
>> Note, after Neeraj patch to handle the lack of heads availability, it could be true so I requested
>> him to rebase his patch on top of this one.
>>
>> However I will revisit my patch and look for if it could occur but please let me know if you knew of a sequence of events to make it NULL.
>>>
> I think we should agree on your patch first otherwise it becomes a bit
> messy, or go with Neeraj's as a first step and then work on yours. So, I
> reviewed this patch based on latest Paul's dev branch. I see that Neeraj
> needs further work.

You are right. So the only change is to drop the warning and those braces. Agreed?

I will resend the patch, and we can discuss it during tomorrow's call as well.

Thanks!

Joel



>
> So this is true without Neeraj patch. Consider the following case:
>
> 3 2 1 0
> wh -> cb -> cb -> cb -> NULL
>
> we start to process from 2 and handle all clients, in the end,
> an "rcu" points to NULL and trigger the WARN_ON_ONCE. I see the
> splat during the boot:
>
> <snip>
> [ 0.927699][ T16] ------------[ cut here ]------------
> [ 0.930867][ T16] WARNING: CPU: 0 PID: 16 at kernel/rcu/tree.c:1721 rcu_gp_cleanup+0x37b/0x4a0
> [ 0.930490][ T1] acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
> [ 0.931401][ T16] Modules linked in:
> [ 0.932400][ T1] PCI: Using configuration type 1 for base access
> [ 0.932771][ T16]
> [ 0.932773][ T16] CPU: 0 PID: 16 Comm: rcu_sched Not tainted 6.8.0-rc2-00089-g65ae0a6b86f0-dirty #1156
> [ 0.937780][ T16] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> [ 0.939402][ T16] RIP: 0010:rcu_gp_cleanup+0x37b/0x4a0
> [ 0.940636][ T16] Code: b0 4b bd 72 09 48 81 ff e8 b0 4b bd 76 1e 4c 8b 27 48 83 c7 10 e8 a5 8e fb ff 4c 89 23 83 ed 01 74 0a 4c 89 e7 48 85 ff 75 d2 <0f> 0b 48 8b 35 14 d0 fd 02 48 89 1d 8d 64 d0 01 48 83 c4 08 48 c7
> [ 0.942402][ T16] RSP: 0018:ffff9b4a8008fe88 EFLAGS: 00010246
> [ 0.943648][ T16] RAX: 0000000000000000 RBX: ffffffffbd4bb0a8 RCX: 6c9b26c9b26c9b27
> [ 0.944751][ T16] RDX: 0000000000000000 RSI: 00000000374b92b6 RDI: 0000000000000000
> [ 0.945757][ T16] RBP: 0000000000000004 R08: fffffffffff54ea1 R09: 0000000000000000
> [ 0.946753][ T16] R10: ffff89070098c278 R11: 0000000000000001 R12: 0000000000000000
> [ 0.947752][ T16] R13: fffffffffffffcbc R14: 0000000000000000 R15: ffffffffbd3f1300
> [ 0.948764][ T16] FS: 0000000000000000(0000) GS:ffff8915efe00000(0000) knlGS:0000000000000000
> [ 0.950403][ T16] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 0.951656][ T16] CR2: ffff89163ffff000 CR3: 00000002eae26000 CR4: 00000000000006f0
> [ 0.952755][ T16] Call Trace:
> [ 0.953597][ T16] <TASK>
> [ 0.955404][ T16] ? __warn+0x80/0x140
> [ 0.956608][ T16] ? rcu_gp_cleanup+0x37b/0x4a0
> [ 0.957621][ T16] ? report_bug+0x15d/0x180
> [ 0.959403][ T16] ? handle_bug+0x3c/0x70
> [ 0.960616][ T16] ? exc_invalid_op+0x17/0x70
> [ 0.961620][ T16] ? asm_exc_invalid_op+0x1a/0x20
> [ 0.962627][ T16] ? rcu_gp_cleanup+0x37b/0x4a0
> [ 0.963622][ T16] ? rcu_gp_cleanup+0x36b/0x4a0
> [ 0.965403][ T16] ? __pfx_rcu_gp_kthread+0x10/0x10
> [ 0.967402][ T16] rcu_gp_kthread+0xf7/0x180
> [ 0.968619][ T16] kthread+0xd3/0x100
> [ 0.969602][ T16] ? __pfx_kthread+0x10/0x10
> [ 0.971402][ T16] ret_from_fork+0x34/0x50
> [ 0.972613][ T16] ? __pfx_kthread+0x10/0x10
> [ 0.973615][ T16] ret_from_fork_asm+0x1b/0x30
> [ 0.974624][ T16] </TASK>
> [ 0.975587][ T16] ---[ end trace 0000000000000000 ]---
> <snip>
>
> --
> Uladzislau Rezki

2024-03-19 14:49:02

by Uladzislau Rezki

Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case

On Tue, Mar 19, 2024 at 10:29:59AM -0400, Joel Fernandes wrote:
>
>
> > On Mar 19, 2024, at 5:53 AM, Uladzislau Rezki <[email protected]> wrote:
> >
> > On Mon, Mar 18, 2024 at 05:05:31PM -0400, Joel Fernandes wrote:
> >>
> >>
> >>>> On Mar 18, 2024, at 2:58 PM, Uladzislau Rezki <[email protected]> wrote:
> >>>
> >>> Hello, Joel!
> >>>
> >>> Sorry for late checking, see below few comments:
> >>>
> >>>> In the synchronize_rcu() common case, we will have less than
> >>>> SR_MAX_USERS_WAKE_FROM_GP number of users per GP. Waking up the kworker
> >>>> is pointless just to free the last injected wait head since at that point,
> >>>> all the users have already been awakened.
> >>>>
> >>>> Introduce a new counter to track this and prevent the wakeup in the
> >>>> common case.
> >>>>
> >>>> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> >>>> ---
> >>>> Rebased on paul/dev of today.
> >>>>
> >>>> kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++-----
> >>>> kernel/rcu/tree.h | 1 +
> >>>> 2 files changed, 32 insertions(+), 5 deletions(-)
> >>>>
> >>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> >>>> index 9fbb5ab57c84..bd29fe3c76bf 100644
> >>>> --- a/kernel/rcu/tree.c
> >>>> +++ b/kernel/rcu/tree.c
> >>>> @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
> >>>> .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
> >>>> .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
> >>>> rcu_sr_normal_gp_cleanup_work),
> >>>> + .srs_cleanups_pending = ATOMIC_INIT(0),
> >>>> };
> >>>>
> >>>> /* Dump rcu_node combining tree at boot to verify correct setup. */
> >>>> @@ -1642,8 +1643,11 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> >>>> * the done tail list manipulations are protected here.
> >>>> */
> >>>> done = smp_load_acquire(&rcu_state.srs_done_tail);
> >>>> - if (!done)
> >>>> + if (!done) {
> >>>> + /* See comments below. */
> >>>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> >>>> return;
> >>>> + }
> >>>>
> >>>> WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
> >>>> head = done->next;
> >>>> @@ -1666,6 +1670,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> >>>>
> >>>> rcu_sr_put_wait_head(rcu);
> >>>> }
> >>>> +
> >>>> + /* Order list manipulations with atomic access. */
> >>>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> >>>> }
> >>>>
> >>>> /*
> >>>> @@ -1673,7 +1680,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> >>>> */
> >>>> static void rcu_sr_normal_gp_cleanup(void)
> >>>> {
> >>>> - struct llist_node *wait_tail, *next, *rcu;
> >>>> + struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
> >>>> int done = 0;
> >>>>
> >>>> wait_tail = rcu_state.srs_wait_tail;
> >>>> @@ -1699,16 +1706,35 @@ static void rcu_sr_normal_gp_cleanup(void)
> >>>> break;
> >>>> }
> >>>>
> >>>> - // concurrent sr_normal_gp_cleanup work might observe this update.
> >>>> - smp_store_release(&rcu_state.srs_done_tail, wait_tail);
> >>>> + /*
> >>>> + * Fast path, no more users to process. Remove the last wait head
> >>>> + * if no inflight-workers. If there are in-flight workers, let them
> >>>> + * remove the last wait head.
> >>>> + */
> >>>> + WARN_ON_ONCE(!rcu);
> >>>>
> >>> This assumption is not correct. An "rcu" can be NULL in fact.
> >>
> >> Hmm I could never trigger that. Are you saying that is true after Neeraj recent patch or something else?
> >> Note, after Neeraj patch to handle the lack of heads availability, it could be true so I requested
> >> him to rebase his patch on top of this one.
> >>
> >> However I will revisit my patch and look for if it could occur but please let me know if you knew of a sequence of events to make it NULL.
> >>>
> > I think we should agree on your patch first otherwise it becomes a bit
> > messy, or go with Neeraj's as a first step and then work on yours. So, I
> > reviewed this patch based on latest Paul's dev branch. I see that Neeraj
> > needs further work.
>
> You are right. So the only change is to drop the warning and those braces. Agreed?
>
Let me check a bit. It looks correct, but just in case.

>
> I will resend the patch and we can discuss during tomorrow call as well.
>
Good :)

--
Uladzislau Rezki

2024-03-19 16:04:23

by Joel Fernandes

Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case

On Tue, Mar 19, 2024 at 03:48:46PM +0100, Uladzislau Rezki wrote:
> On Tue, Mar 19, 2024 at 10:29:59AM -0400, Joel Fernandes wrote:
> >
> >
> > > On Mar 19, 2024, at 5:53 AM, Uladzislau Rezki <[email protected]> wrote:
> > >
> > > On Mon, Mar 18, 2024 at 05:05:31PM -0400, Joel Fernandes wrote:
> > >>
> > >>
> > >>>> On Mar 18, 2024, at 2:58 PM, Uladzislau Rezki <[email protected]> wrote:
> > >>>
> > >>> Hello, Joel!
> > >>>
> > >>> Sorry for late checking, see below few comments:
> > >>>
> > >>>> In the synchronize_rcu() common case, we will have less than
> > >>>> SR_MAX_USERS_WAKE_FROM_GP number of users per GP. Waking up the kworker
> > >>>> is pointless just to free the last injected wait head since at that point,
> > >>>> all the users have already been awakened.
> > >>>>
> > >>>> Introduce a new counter to track this and prevent the wakeup in the
> > >>>> common case.
> > >>>>
> > >>>> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> > >>>> ---
> > >>>> Rebased on paul/dev of today.
> > >>>>
> > >>>> kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++-----
> > >>>> kernel/rcu/tree.h | 1 +
> > >>>> 2 files changed, 32 insertions(+), 5 deletions(-)
> > >>>>
> > >>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > >>>> index 9fbb5ab57c84..bd29fe3c76bf 100644
> > >>>> --- a/kernel/rcu/tree.c
> > >>>> +++ b/kernel/rcu/tree.c
> > >>>> @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
> > >>>> .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
> > >>>> .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
> > >>>> rcu_sr_normal_gp_cleanup_work),
> > >>>> + .srs_cleanups_pending = ATOMIC_INIT(0),
> > >>>> };
> > >>>>
> > >>>> /* Dump rcu_node combining tree at boot to verify correct setup. */
> > >>>> @@ -1642,8 +1643,11 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> > >>>> * the done tail list manipulations are protected here.
> > >>>> */
> > >>>> done = smp_load_acquire(&rcu_state.srs_done_tail);
> > >>>> - if (!done)
> > >>>> + if (!done) {
> > >>>> + /* See comments below. */
> > >>>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> > >>>> return;
> > >>>> + }
> > >>>>
> > >>>> WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
> > >>>> head = done->next;
> > >>>> @@ -1666,6 +1670,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> > >>>>
> > >>>> rcu_sr_put_wait_head(rcu);
> > >>>> }
> > >>>> +
> > >>>> + /* Order list manipulations with atomic access. */
> > >>>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> > >>>> }
> > >>>>
> > >>>> /*
> > >>>> @@ -1673,7 +1680,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> > >>>> */
> > >>>> static void rcu_sr_normal_gp_cleanup(void)
> > >>>> {
> > >>>> - struct llist_node *wait_tail, *next, *rcu;
> > >>>> + struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
> > >>>> int done = 0;
> > >>>>
> > >>>> wait_tail = rcu_state.srs_wait_tail;
> > >>>> @@ -1699,16 +1706,35 @@ static void rcu_sr_normal_gp_cleanup(void)
> > >>>> break;
> > >>>> }
> > >>>>
> > >>>> - // concurrent sr_normal_gp_cleanup work might observe this update.
> > >>>> - smp_store_release(&rcu_state.srs_done_tail, wait_tail);
> > >>>> + /*
> > >>>> + * Fast path, no more users to process. Remove the last wait head
> > >>>> + * if no inflight-workers. If there are in-flight workers, let them
> > >>>> + * remove the last wait head.
> > >>>> + */
> > >>>> + WARN_ON_ONCE(!rcu);
> > >>>>
> > >>> This assumption is not correct. An "rcu" can be NULL in fact.
> > >>
> > >> Hmm I could never trigger that. Are you saying that is true after Neeraj recent patch or something else?
> > >> Note, after Neeraj patch to handle the lack of heads availability, it could be true so I requested
> > >> him to rebase his patch on top of this one.
> > >>
> > >> However I will revisit my patch and look for if it could occur but please let me know if you knew of a sequence of events to make it NULL.
> > >>>
> > > I think we should agree on your patch first otherwise it becomes a bit
> > > messy, or go with Neeraj's as a first step and then work on yours. So, I
> > > reviewed this patch based on latest Paul's dev branch. I see that Neeraj
> > > needs further work.
> >
> > You are right. So the only change is to drop the warning and those braces. Agreed?
> >
> Let me check a bit. Looks like correct but just in case.
>

Thanks. I was also considering improving it for the rcu == NULL case, as
below. I will test it more before re-sending.

On top of my patch:

---8<-----------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0df659a878ee..a5ef844835d4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1706,15 +1706,18 @@ static void rcu_sr_normal_gp_cleanup(void)
break;
}

+
+ /* Last head stays. No more processing to do. */
+ if (!rcu)
+ return;
+
/*
* Fast path, no more users to process. Remove the last wait head
* if no inflight-workers. If there are in-flight workers, let them
* remove the last wait head.
*/
- WARN_ON_ONCE(!rcu);
- ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);

- if (rcu && rcu_sr_is_wait_head(rcu) && rcu->next == NULL &&
+ if (rcu_sr_is_wait_head(rcu) && rcu->next == NULL &&
/* Order atomic access with list manipulation. */
!atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
wait_tail->next = NULL;
@@ -1724,6 +1727,7 @@ static void rcu_sr_normal_gp_cleanup(void)
}

/* Concurrent sr_normal_gp_cleanup work might observe this update. */
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
smp_store_release(&rcu_state.srs_done_tail, wait_tail);

/*

2024-03-19 16:12:21

by Joel Fernandes

Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case

On Tue, Mar 19, 2024 at 12:02 PM Joel Fernandes <[email protected]> wrote:
>
> On Tue, Mar 19, 2024 at 03:48:46PM +0100, Uladzislau Rezki wrote:
> > On Tue, Mar 19, 2024 at 10:29:59AM -0400, Joel Fernandes wrote:
> > >
> > >
> > > > On Mar 19, 2024, at 5:53 AM, Uladzislau Rezki <[email protected]> wrote:
> > > >
> > > > On Mon, Mar 18, 2024 at 05:05:31PM -0400, Joel Fernandes wrote:
> > > >>
> > > >>
> > > >>>> On Mar 18, 2024, at 2:58 PM, Uladzislau Rezki <[email protected]> wrote:
> > > >>>
> > > >>> Hello, Joel!
> > > >>>
> > > >>> Sorry for late checking, see below few comments:
> > > >>>
> > > >>>> In the synchronize_rcu() common case, we will have less than
> > > >>>> SR_MAX_USERS_WAKE_FROM_GP number of users per GP. Waking up the kworker
> > > >>>> is pointless just to free the last injected wait head since at that point,
> > > >>>> all the users have already been awakened.
> > > >>>>
> > > >>>> Introduce a new counter to track this and prevent the wakeup in the
> > > >>>> common case.
> > > >>>>
> > > >>>> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> > > >>>> ---
> > > >>>> Rebased on paul/dev of today.
> > > >>>>
> > > >>>> kernel/rcu/tree.c | 36 +++++++++++++++++++++++++++++++-----
> > > >>>> kernel/rcu/tree.h | 1 +
> > > >>>> 2 files changed, 32 insertions(+), 5 deletions(-)
> > > >>>>
> > > >>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > >>>> index 9fbb5ab57c84..bd29fe3c76bf 100644
> > > >>>> --- a/kernel/rcu/tree.c
> > > >>>> +++ b/kernel/rcu/tree.c
> > > >>>> @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
> > > >>>> .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
> > > >>>> .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
> > > >>>> rcu_sr_normal_gp_cleanup_work),
> > > >>>> + .srs_cleanups_pending = ATOMIC_INIT(0),
> > > >>>> };
> > > >>>>
> > > >>>> /* Dump rcu_node combining tree at boot to verify correct setup. */
> > > >>>> @@ -1642,8 +1643,11 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> > > >>>> * the done tail list manipulations are protected here.
> > > >>>> */
> > > >>>> done = smp_load_acquire(&rcu_state.srs_done_tail);
> > > >>>> - if (!done)
> > > >>>> + if (!done) {
> > > >>>> + /* See comments below. */
> > > >>>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> > > >>>> return;
> > > >>>> + }
> > > >>>>
> > > >>>> WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
> > > >>>> head = done->next;
> > > >>>> @@ -1666,6 +1670,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> > > >>>>
> > > >>>> rcu_sr_put_wait_head(rcu);
> > > >>>> }
> > > >>>> +
> > > >>>> + /* Order list manipulations with atomic access. */
> > > >>>> + atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
> > > >>>> }
> > > >>>>
> > > >>>> /*
> > > >>>> @@ -1673,7 +1680,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> > > >>>> */
> > > >>>> static void rcu_sr_normal_gp_cleanup(void)
> > > >>>> {
> > > >>>> - struct llist_node *wait_tail, *next, *rcu;
> > > >>>> + struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
> > > >>>> int done = 0;
> > > >>>>
> > > >>>> wait_tail = rcu_state.srs_wait_tail;
> > > >>>> @@ -1699,16 +1706,35 @@ static void rcu_sr_normal_gp_cleanup(void)
> > > >>>> break;
> > > >>>> }
> > > >>>>
> > > >>>> - // concurrent sr_normal_gp_cleanup work might observe this update.
> > > >>>> - smp_store_release(&rcu_state.srs_done_tail, wait_tail);
> > > >>>> + /*
> > > >>>> + * Fast path, no more users to process. Remove the last wait head
> > > >>>> + * if no inflight-workers. If there are in-flight workers, let them
> > > >>>> + * remove the last wait head.
> > > >>>> + */
> > > >>>> + WARN_ON_ONCE(!rcu);
> > > >>>>
> > > >>> This assumption is not correct. An "rcu" can be NULL in fact.
> > > >>
> > > >> Hmm I could never trigger that. Are you saying that is true after Neeraj recent patch or something else?
> > > >> Note, after Neeraj patch to handle the lack of heads availability, it could be true so I requested
> > > >> him to rebase his patch on top of this one.
> > > >>
> > > >> However I will revisit my patch and look for if it could occur but please let me know if you knew of a sequence of events to make it NULL.
> > > >>>
> > > > I think we should agree on your patch first otherwise it becomes a bit
> > > > messy, or go with Neeraj's as a first step and then work on yours. So, I
> > > > reviewed this patch based on latest Paul's dev branch. I see that Neeraj
> > > > needs further work.
> > >
> > > You are right. So the only change is to drop the warning and those braces. Agreed?
> > >
> > Let me check a bit. Looks like correct but just in case.
> >
>
> Thanks. I was also considering improving it for the rcu == NULL case, as
> below. I will test it more before re-sending.
>
> On top of my patch:
>
> ---8<-----------------------
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 0df659a878ee..a5ef844835d4 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -1706,15 +1706,18 @@ static void rcu_sr_normal_gp_cleanup(void)
> break;
> }
>
> +
> + /* Last head stays. No more processing to do. */
> + if (!rcu)
> + return;
> +

Ugh, that should be "if (!wait_head->next)" instead of "if (!rcu)". But
in any case, the original patch, except for the warning, should hold.
Still, I am testing the above diff now.

- Joel

2024-03-19 17:30:00

by Uladzislau Rezki

Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case

On Tue, Mar 19, 2024 at 12:11:28PM -0400, Joel Fernandes wrote:
> On Tue, Mar 19, 2024 at 12:02 PM Joel Fernandes <[email protected]> wrote:
> > [...]
>
> Ugh, that should be "if (!wait_tail->next)" instead of "if (!rcu)". But
> in any case, the original patch, except for the warning, should hold.
> Still, I am testing the above diff now.
>
> - Joel
>
Just in case, it is based on your patch:

<snip>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bd29fe3c76bf..98546afe7c21 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1711,29 +1711,25 @@ static void rcu_sr_normal_gp_cleanup(void)
* if no inflight-workers. If there are in-flight workers, let them
* remove the last wait head.
*/
- WARN_ON_ONCE(!rcu);
- ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
-
- if (rcu && rcu_sr_is_wait_head(rcu) && rcu->next == NULL &&
- /* Order atomic access with list manipulation. */
- !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
+ if (wait_tail->next && rcu_sr_is_wait_head(wait_tail->next) && !wait_tail->next->next &&
+ !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
+ rcu_sr_put_wait_head(wait_tail->next);
wait_tail->next = NULL;
- rcu_sr_put_wait_head(rcu);
- smp_store_release(&rcu_state.srs_done_tail, wait_tail);
- return;
}

/* Concurrent sr_normal_gp_cleanup work might observe this update. */
smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);

- /*
- * We schedule a work in order to perform a final processing
- * of outstanding users(if still left) and releasing wait-heads
- * added by rcu_sr_normal_gp_init() call.
- */
- atomic_inc(&rcu_state.srs_cleanups_pending);
- if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work)) {
- atomic_dec(&rcu_state.srs_cleanups_pending);
+ if (wait_tail->next) {
+ /*
+ * We schedule a work in order to perform a final processing
+ * of outstanding users(if still left) and releasing wait-heads
+ * added by rcu_sr_normal_gp_init() call.
+ */
+ atomic_inc(&rcu_state.srs_cleanups_pending);
+ if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
+ atomic_dec(&rcu_state.srs_cleanups_pending);
}
}
<snip>
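
For reference, the ordering contract behind srs_cleanups_pending can be
modeled in userspace with C11 atomics: the increment happens before
queue_work(), the worker decrements with release semantics once it is done
touching the list, and the fast path reads with acquire semantics so those
list manipulations are visible before it frees the last wait head. A
minimal standalone sketch with illustrative names, not the kernel code:

<snip>
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static atomic_int cleanups_pending;

/* Stands in for rcu_sr_normal_gp_cleanup_work(). */
static void *cleanup_worker(void *arg)
{
	(void)arg;

	/* ... done-tail list processing would happen here ... */

	/* Order list manipulations before the decrement. */
	atomic_fetch_sub_explicit(&cleanups_pending, 1, memory_order_release);
	return NULL;
}

/* Stands in for the fast-path check in rcu_sr_normal_gp_cleanup(). */
static int fast_path_allowed(void)
{
	/* Acquire pairs with the worker's release decrement, so a zero
	 * count implies the worker's writes are visible here. */
	return atomic_load_explicit(&cleanups_pending, memory_order_acquire) == 0;
}

int main(void)
{
	pthread_t t;

	atomic_fetch_add(&cleanups_pending, 1);	/* before "queue_work()" */
	pthread_create(&t, NULL, cleanup_worker, NULL);
	pthread_join(t, NULL);

	printf("fast path allowed: %d\n", fast_path_allowed());
	return 0;
}
<snip>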



--
Uladzislau Rezki

2024-03-19 17:30:03

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case



On 3/19/2024 1:26 PM, Uladzislau Rezki wrote:
> On Tue, Mar 19, 2024 at 12:11:28PM -0400, Joel Fernandes wrote:
>> [...]
> Just in case, it is based on your patch:
>
> <snip>
> [...]
> - WARN_ON_ONCE(!rcu);
> - ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
> -
> - if (rcu && rcu_sr_is_wait_head(rcu) && rcu->next == NULL &&
> - /* Order atomic access with list manipulation. */
> - !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
> + if (wait_tail->next && rcu_sr_is_wait_head(wait_tail->next) && !wait_tail->next->next &&
> + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {


Yes, this also works. But if wait_tail->next == NULL, then you do not need
to queue the worker for that case either. I sent this as v3.
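
As an aside, the inc/dec pairing around queue_work() is what keeps the
counter honest when the work item is already pending: queue_work() returns
false in that case, and the single in-flight execution (and its one
decrement) covers both requests. A userspace sketch of just that
bookkeeping, where queue_work_model() is a stand-in rather than the kernel
API:

<snip>
#include <stdbool.h>
#include <stdio.h>

static int pending_count;
static bool work_already_queued;

/* Models queue_work(): returns false if the work is already pending. */
static bool queue_work_model(void)
{
	if (work_already_queued)
		return false;
	work_already_queued = true;
	return true;
}

static void maybe_queue(void)
{
	pending_count++;		/* assume we will owe one decrement */
	if (!queue_work_model())
		pending_count--;	/* already queued: take the charge back */
}

int main(void)
{
	maybe_queue();	/* queues the work: count goes to 1 */
	maybe_queue();	/* already pending: count stays at 1 */
	printf("pending_count = %d (one decrement owed)\n", pending_count);
	return 0;
}
<snip>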

If you want to add that and resend my patch with the above diff, that would also
be fine. Or I can do that; let me know. Thanks!

- Joel

2024-03-19 17:33:26

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case



On 3/19/2024 1:29 PM, Joel Fernandes wrote:
>
> On 3/19/2024 1:26 PM, Uladzislau Rezki wrote:
>> [...]
>> + if (wait_tail->next && rcu_sr_is_wait_head(wait_tail->next) && !wait_tail->next->next &&
>> + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
>
>
> Yes, this also works. But if wait_tail->next == NULL, then you do not need
> to queue the worker for that case either. I sent this as v3.
>
Sorry, I see you did add that later in the patch ;-). I think we have converged
on the final patch then, give or take the use of 'rcu' versus 'wait_tail->next'.
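
For readability, the tail of rcu_sr_normal_gp_cleanup() then looks roughly
like this with both changes folded in (assembled from the diffs in this
thread, not the final submitted patch):

<snip>
	/*
	 * Fast path: a lone wait head with no in-flight workers. Free it
	 * here rather than waking the kworker just to do that.
	 */
	if (wait_tail->next && rcu_sr_is_wait_head(wait_tail->next) &&
	    !wait_tail->next->next &&
	    /* Order atomic access with list manipulation. */
	    !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
		rcu_sr_put_wait_head(wait_tail->next);
		wait_tail->next = NULL;
	}

	/* Concurrent sr_normal_gp_cleanup work might observe this update. */
	smp_store_release(&rcu_state.srs_done_tail, wait_tail);
	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);

	if (wait_tail->next) {
		/*
		 * Schedule a work to perform a final processing of
		 * outstanding users (if still left) and release wait heads
		 * added by rcu_sr_normal_gp_init().
		 */
		atomic_inc(&rcu_state.srs_cleanups_pending);
		if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
			atomic_dec(&rcu_state.srs_cleanups_pending);
	}
<snip>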

- Joel

2024-03-19 18:37:55

by Uladzislau Rezki

[permalink] [raw]
Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case

On Tue, Mar 19, 2024 at 01:33:11PM -0400, Joel Fernandes wrote:
>
> On 3/19/2024 1:29 PM, Joel Fernandes wrote:
> > [...]
> >> [...]
> >> + if (wait_tail->next && rcu_sr_is_wait_head(wait_tail->next) && !wait_tail->next->next &&
> >> + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
> >
> >
> > Yes, this also works. But if wait_tail->next == NULL, then you do not need
> > to queue the worker for that case either. I sent this as v3.
> >
> Sorry, I see you did add that later in the patch ;-). I think we have converged
> on the final patch then, give or take the use of 'rcu' versus 'wait_tail->next'.
>
Just combine all parts into one place and resend :)

Thanks!

--
Uladzislau Rezki

2024-03-19 18:52:58

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case



On 3/19/2024 2:37 PM, Uladzislau Rezki wrote:
> On Tue, Mar 19, 2024 at 01:33:11PM -0400, Joel Fernandes wrote:

>>> On 3/19/2024 1:26 PM, Uladzislau Rezki wrote:

>>>> [...]
>>>> + if (wait_tail->next && rcu_sr_is_wait_head(wait_tail->next) && !wait_tail->next->next &&
>>>> + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
>>>
>>>
>>> Yes, this also works. But if wait_tail->next == NULL, then you do not need
>>> to queue the worker for that case either. I sent this as v3.
>>>
>> Sorry, I see you did add that later in the patch ;-). I think we have converged
>> on the final patch then, give or take the use of 'rcu' versus 'wait_tail->next'.
>>
> Just combine all parts into one place and resend :)

Yes sir ;)

- Joel


2024-03-19 19:07:22

by Uladzislau Rezki

[permalink] [raw]
Subject: Re: [PATCH v2 rcu/dev 1/2] rcu/tree: Reduce wake up for synchronize_rcu() common case

On Tue, Mar 19, 2024 at 02:52:43PM -0400, Joel Fernandes wrote:
>
> On 3/19/2024 2:37 PM, Uladzislau Rezki wrote:
> >>>> [...]
> >>>> + if (wait_tail->next && rcu_sr_is_wait_head(wait_tail->next) && !wait_tail->next->next &&
> >>>> + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
> >>>
> >>>
> >>> Yes, this also works. But if wait_tail->next == NULL, then you do not need
> >>> to queue the worker for that case either. I sent this as v3.
> >>>
> >> Sorry, I see you did add that later in the patch ;-). I think we have converged
> >> on the final patch then, give or take the use of 'rcu' versus 'wait_tail->next'.
> >>
> > Just combine all parts into one place and resend :)
>
> Yes sir ;)
>
Ha-ha :)))))

--
Uladzislau Rezki