Fix the race below by not releasing a wait-head from the
GP kthread: doing so can lead to the wait-head being reused
while a worker still accesses it, thus executing newly added
callbacks too early.
CPU 0                                  CPU 1
-----                                  -----

// wait_tail == HEAD1
rcu_sr_normal_gp_cleanup() {
    // has passed SR_MAX_USERS_WAKE_FROM_GP
    wait_tail->next = next;
    // done_tail = HEAD1
    smp_store_release(&rcu_state.srs_done_tail, wait_tail);
    queue_work() {
        test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))
        __queue_work()
    }
}

                                       set_work_pool_and_clear_pending()
                                       rcu_sr_normal_gp_cleanup_work() {
// new GP, wait_tail == HEAD2
rcu_sr_normal_gp_cleanup() {
    // executes all completions, but stops at HEAD1
    wait_tail->next = HEAD1;
    // done_tail = HEAD2
    smp_store_release(&rcu_state.srs_done_tail, wait_tail);
    queue_work() {
        test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))
        __queue_work()
    }
}
                                           // done = HEAD2
                                           done = smp_load_acquire(&rcu_state.srs_done_tail);
                                           // head = HEAD1
                                           head = done->next;
                                           done->next = NULL;
                                           llist_for_each_safe() {
                                               // completes all callbacks, release HEAD1
                                           }
                                       }
                                       // Process second queue
                                       set_work_pool_and_clear_pending()
                                       rcu_sr_normal_gp_cleanup_work() {
                                           // done = HEAD2
                                           done = smp_load_acquire(&rcu_state.srs_done_tail);

// new GP, wait_tail == HEAD3
rcu_sr_normal_gp_cleanup() {
    // Finds HEAD2 with ->next == NULL at the end
    rcu_sr_put_wait_head(HEAD2)
    ...

// A few more GPs later
rcu_sr_normal_gp_init() {
    HEAD2 = rcu_sr_get_wait_head();
    llist_add(HEAD2, &rcu_state.srs_next);
                                           // head == rcu_state.srs_next
                                           head = done->next;
                                           done->next = NULL;
                                           llist_for_each_safe() {
                                               // EXECUTE CALLBACKS TOO EARLY!!!
                                           }
                                       }
Reported-by: Frederic Weisbecker <[email protected]>
Fixes: 05a10b921000 ("rcu: Support direct wake-up of synchronize_rcu() users")
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
kernel/rcu/tree.c | 22 ++++++++--------------
1 file changed, 8 insertions(+), 14 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 31f3a61f9c38..475647620b12 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1656,21 +1656,11 @@ static void rcu_sr_normal_gp_cleanup(void)
WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail));
/*
- * Process (a) and (d) cases. See an illustration. Apart of
- * that it handles the scenario when all clients are done,
- * wait-head is released if last. The worker is not kicked.
+ * Process (a) and (d) cases. See an illustration.
*/
llist_for_each_safe(rcu, next, wait_tail->next) {
- if (rcu_sr_is_wait_head(rcu)) {
- if (!rcu->next) {
- rcu_sr_put_wait_head(rcu);
- wait_tail->next = NULL;
- } else {
- wait_tail->next = rcu;
- }
-
+ if (rcu_sr_is_wait_head(rcu))
break;
- }
rcu_sr_normal_complete(rcu);
// It can be last, update a next on this step.
@@ -1684,8 +1674,12 @@ static void rcu_sr_normal_gp_cleanup(void)
smp_store_release(&rcu_state.srs_done_tail, wait_tail);
ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
- if (wait_tail->next)
- queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
+ /*
+ * We schedule a work in order to perform a final processing
+ * of outstanding users(if still left) and releasing wait-heads
+ * added by rcu_sr_normal_gp_init() call.
+ */
+ queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
}
/*
--
2.39.2
synchronize_rcu() users have to be processed regardless
of memory pressure, so our private WQ needs to have at least
one execution context, which the WQ_MEM_RECLAIM flag guarantees.
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
kernel/rcu/tree.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 475647620b12..59881a68dd26 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1581,6 +1581,7 @@ static void rcu_sr_put_wait_head(struct llist_node *node)
/* Disabled by default. */
static int rcu_normal_wake_from_gp;
module_param(rcu_normal_wake_from_gp, int, 0644);
+static struct workqueue_struct *sync_wq;
static void rcu_sr_normal_complete(struct llist_node *node)
{
@@ -1679,7 +1680,7 @@ static void rcu_sr_normal_gp_cleanup(void)
* of outstanding users(if still left) and releasing wait-heads
* added by rcu_sr_normal_gp_init() call.
*/
- queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
+ queue_work(sync_wq, &rcu_state.srs_cleanup_work);
}
/*
@@ -5584,6 +5585,9 @@ void __init rcu_init(void)
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
+ sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!sync_wq);
+
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
if (qovld < 0)
--
2.39.2
>
> [...]
> @@ -5584,6 +5585,9 @@ void __init rcu_init(void)
> rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
> WARN_ON(!rcu_gp_wq);
>
> + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
Why was WQ_HIGHPRI removed?
Thanks
Zqiang
> + WARN_ON(!sync_wq);
> +
> /* Fill in default value for rcutree.qovld boot parameter. */
> /* -After- the rcu_node ->lock fields are initialized! */
> if (qovld < 0)
> --
> 2.39.2
>
>
On Wed, Mar 06, 2024 at 10:15:44AM +0800, Z qiang wrote:
> >
> > [...]
> > + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
>
> Why was WQ_HIGHPRI removed?
>
I would like to check perf. figures with it and send out it as a
separate patch if it is worth it.
--
Uladzislau Rezki
On 3/6/2024 6:56 AM, Uladzislau Rezki wrote:
> On Wed, Mar 06, 2024 at 10:15:44AM +0800, Z qiang wrote:
>>> [...]
>>> + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
>>
>> Why was WQ_HIGHPRI removed?
>>
> I would like to check perf figures with it and send it out as a
> separate patch if it is worth it.
I guess one thing to note is that there are also other RCU-related WQs which have
WQ_MEM_RECLAIM but not WQ_HIGHPRI (such as for expedited RCU, at least in some
configs). So for consistency, this makes sense to me.
Reviewed-by: Joel Fernandes (Google) <[email protected]>
thanks,
- Joel
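For context on why WQ_MEM_RECLAIM gives that guarantee: a workqueue allocated
with the flag gets a dedicated rescuer thread, so queued items still make
forward progress even when the kernel cannot spawn new kworkers under memory
pressure. A minimal allocation sketch mirroring the patch (the init wrapper
and its name are illustrative, not the actual tree.c code):

    #include <linux/init.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *sync_wq;

    static int __init sync_wq_setup(void)
    {
            /*
             * WQ_MEM_RECLAIM reserves a rescuer thread, guaranteeing
             * at least one execution context for this workqueue even
             * under memory pressure.
             */
            sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
            return sync_wq ? 0 : -ENOMEM;
    }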
On 3/5/2024 2:57 PM, Uladzislau Rezki (Sony) wrote:
> [...]
Ah, nice. So instead of allocating in the GP thread and freeing in either the
GP thread or the worker, you allocate heads only in the GP thread and free them
only in the worker, thus essentially fixing the UAF that Frederic found.
AFAICS, this fixes the issue.
Reviewed-by: Joel Fernandes (Google) <[email protected]>
There might be a way to prevent queuing new work as a fast-path optimization, in
case the CBs per GP will always be < SR_MAX_USERS_WAKE_FROM_GP, but I could not
find a workqueue API that helps there, and work_busy() has comments saying not
to use it.
thanks,
- Joel
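To spell out the ownership rule the fix establishes, here is a condensed
sketch of the wait-head life cycle after the patch (function names are from
kernel/rcu/tree.c, bodies abridged):

    /* GP kthread: sole allocator of wait-heads. */
    rcu_sr_normal_gp_init()
            -> rcu_sr_get_wait_head()

    /*
     * GP kthread: completes up to SR_MAX_USERS_WAKE_FROM_GP users,
     * publishes the done-tail and always kicks the worker; it no
     * longer releases any wait-head itself.
     */
    rcu_sr_normal_gp_cleanup()
            -> smp_store_release(&rcu_state.srs_done_tail, wait_tail)
            -> queue_work(..., &rcu_state.srs_cleanup_work)

    /* Worker: now the only place where wait-heads are released. */
    rcu_sr_normal_gp_cleanup_work()
            -> rcu_sr_put_wait_head()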
On Wed, Mar 6, 2024 at 5:31 PM Joel Fernandes <[email protected]> wrote:
>
>
>
> On 3/5/2024 2:57 PM, Uladzislau Rezki (Sony) wrote:
> > Fix a below race by not releasing a wait-head from the
> > GP-kthread as it can lead for reusing it whereas a worker
> > can still access it thus execute newly added callbacks too
> > early.
> >
[...]
> There might be a way to prevent queuing new work as a fast-path optimization,
> in case the CBs per GP will always be < SR_MAX_USERS_WAKE_FROM_GP, but I could
> not find a workqueue API that helps there, and work_busy() has comments saying
> not to use it.
One way to do this would be to maintain a count of how many CBs are in
flight via the worker route, and then fast-path-free the thing if the
count is 0. Should I send a patch around something like that? It saves
1 more wakeup per synchronize_rcu() I think.
- Joel
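A rough sketch of that idea (purely illustrative: the srs_cleanup_inflight
counter and the exact increment/decrement points are assumptions, and the
check would still need ordering against a concurrently finishing worker):

    /* Hypothetical counter of batches handed off to the worker. */
    static atomic_t srs_cleanup_inflight = ATOMIC_INIT(0);

    /* GP kthread, at the end of rcu_sr_normal_gp_cleanup(): */
    if (!wait_tail->next && atomic_read(&srs_cleanup_inflight) == 0) {
            /*
             * Nothing is pending for the worker and it owns no
             * wait-heads: recycle the last wait-head right here
             * and skip the wakeup.
             */
            rcu_sr_put_wait_head(wait_tail);
    } else {
            atomic_inc(&srs_cleanup_inflight);
            queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
    }

    /* Worker, once its batch is fully processed: */
    atomic_dec(&srs_cleanup_inflight);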
On Tue, Mar 05, 2024 at 08:57:19PM +0100, Uladzislau Rezki (Sony) wrote:
> [...]
Reviewed-by: Frederic Weisbecker <[email protected]>
On 3/6/2024 5:31 PM, Joel Fernandes wrote:
>
>
> On 3/5/2024 2:57 PM, Uladzislau Rezki (Sony) wrote:
>> [...]
>> @@ -1684,8 +1674,12 @@ static void rcu_sr_normal_gp_cleanup(void)
>> smp_store_release(&rcu_state.srs_done_tail, wait_tail);
>> ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
>>
>> - if (wait_tail->next)
>> - queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
>> + /*
>> + * We schedule a work in order to perform a final processing
>> + * of outstanding users(if still left) and releasing wait-heads
>> + * added by rcu_sr_normal_gp_init() call.
>> + */
>> + queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
>> }
One question, why do you need to queue_work() if wait_tail->next == NULL?
AFAICS, at this stage if wait_tail->next == NULL, you are in CASE f, so the last
remaining HEAD stays? (And llist_for_each_safe() in
rcu_sr_normal_gp_cleanup_work becomes a NOOP).
Could be something like this, but maybe I missed something:
@@ -1672,7 +1674,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
*/
static void rcu_sr_normal_gp_cleanup(void)
{
- struct llist_node *wait_tail, *next, *rcu;
+ struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
int done = 0;
wait_tail = rcu_state.srs_wait_tail;
@@ -1707,7 +1709,8 @@ static void rcu_sr_normal_gp_cleanup(void)
* of outstanding users(if still left) and releasing wait-heads
* added by rcu_sr_normal_gp_init() call.
*/
- queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
+ if (rcu)
+ queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
}
/*
On 3/7/2024 1:21 AM, Joel Fernandes wrote:
>
>
> On 3/6/2024 5:31 PM, Joel Fernandes wrote:
>>
>>
>> On 3/5/2024 2:57 PM, Uladzislau Rezki (Sony) wrote:
>>> [...]
>
> One question, why do you need to queue_work() if wait_tail->next == NULL?
>
> AFAICS, at this stage if wait_tail->next == NULL, you are in CASE f, so the last
> remaining HEAD stays? (And llist_for_each_safe() in
> rcu_sr_normal_gp_cleanup_work becomes a NOOP).
>
Never mind, sorry for spewing nonsense. You can never end up with CASE f here,
so you still need the queue_work(). I think it is worth looking into how to free
the last HEAD without queuing a work, though (while not causing wreckage).
thanks,
- Joel
On Wed, Mar 06, 2024 at 12:57:25PM -0500, Joel Fernandes wrote:
>
>
> On 3/6/2024 6:56 AM, Uladzislau Rezki wrote:
> > On Wed, Mar 06, 2024 at 10:15:44AM +0800, Z qiang wrote:
> >>>
> >>> [...]
> >>> + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
> >>
> >> Why was WQ_HIGHPRI removed?
> >>
> > I would like to check perf figures with it and send it out as a
> > separate patch if it is worth it.
>
> I guess one thing to note is that there are also other RCU-related WQs which have
> WQ_MEM_RECLAIM but not WQ_HIGHPRI (such as for expedited RCU, at least in some
> configs). So for consistency, this makes sense to me.
>
> Reviewed-by: Joel Fernandes (Google) <[email protected]>
>
Thanks. I will update it with review tag!
--
Uladzislau Rezki
On Wed, Mar 06, 2024 at 05:44:04PM -0500, Joel Fernandes wrote:
> On Wed, Mar 6, 2024 at 5:31 PM Joel Fernandes <[email protected]> wrote:
> >
> >
> >
> > On 3/5/2024 2:57 PM, Uladzislau Rezki (Sony) wrote:
> > > Fix a below race by not releasing a wait-head from the
> > > GP-kthread as it can lead for reusing it whereas a worker
> > > can still access it thus execute newly added callbacks too
> > > early.
> > >
> [...]
> > There might be a way to prevent queuing new work as a fast-path optimization,
> > in case the CBs per GP will always be < SR_MAX_USERS_WAKE_FROM_GP, but I could
> > not find a workqueue API that helps there, and work_busy() has comments saying
> > not to use it.
>
> One way to do this would be to maintain a count of how many CBs are in
> flight via the worker route, and then fast-path-free the thing if the
> count is 0. Should I send a patch around something like that? It saves
> 1 more wakeup per synchronize_rcu() I think.
>
We can release the last wait-head if we know that the worker is not
pending/running. Then we guarantee that Frederic's case is not possible.
On the other hand, it will again introduce more mess, because the idea
was, in the beginning, to start with something really simple :)
--
Uladzislau Rezki
On Thu, Mar 07, 2024 at 01:21:50AM -0500, Joel Fernandes wrote:
>
>
> On 3/6/2024 5:31 PM, Joel Fernandes wrote:
> >
> >
> > On 3/5/2024 2:57 PM, Uladzislau Rezki (Sony) wrote:
> >> [...]
>
> One question, why do you need to queue_work() if wait_tail->next == NULL?
>
> AFAICS, at this stage if wait_tail->next == NULL, you are in CASE f, so the last
> remaining HEAD stays? (And llist_for_each_safe() in
> rcu_sr_normal_gp_cleanup_work becomes a NOOP).
>
> Could be something like this, but maybe I missed something:
>
> @@ -1672,7 +1674,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
> */
> static void rcu_sr_normal_gp_cleanup(void)
> {
> - struct llist_node *wait_tail, *next, *rcu;
> + struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
> int done = 0;
>
> wait_tail = rcu_state.srs_wait_tail;
> @@ -1707,7 +1709,8 @@ static void rcu_sr_normal_gp_cleanup(void)
> * of outstanding users(if still left) and releasing wait-heads
> * added by rcu_sr_normal_gp_init() call.
> */
> - queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
> + if (rcu)
> + queue_work(system_highpri_wq, &rcu_state.srs_cleanup_work);
> }
>
> /*
>
Unneeded queueing happens only the first time after boot. After one cycle
we always need to queue the work to do the previous cleanups:

if (wait_head->next)
        queue_the_work();

wait_head->next is always non-NULL except in the first cycle after boot.
--
Uladzislau Rezki
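For reference, the handoff the above relies on, condensed from the
commit-message diagram (simplified; the real code lives in kernel/rcu/tree.c):

    /* GP kthread (rcu_sr_normal_gp_cleanup()), producer side: */
    smp_store_release(&rcu_state.srs_done_tail, wait_tail);
    queue_work(sync_wq, &rcu_state.srs_cleanup_work);

    /* Worker (rcu_sr_normal_gp_cleanup_work()), consumer side: */
    done = smp_load_acquire(&rcu_state.srs_done_tail); /* pairs with the release */
    head = done->next;
    done->next = NULL;
    llist_for_each_safe(rcu, next, head) {
            /* complete users; wait-heads are released only here now */
    }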
On Thu, Mar 07, 2024 at 02:09:38AM -0500, Joel Fernandes wrote:
>
>
> On 3/7/2024 1:21 AM, Joel Fernandes wrote:
> >
> >
> > On 3/6/2024 5:31 PM, Joel Fernandes wrote:
> >>
> >>
> >> On 3/5/2024 2:57 PM, Uladzislau Rezki (Sony) wrote:
> >>> [...]
> >
> > One question, why do you need to queue_work() if wait_tail->next == NULL?
> >
> > AFAICS, at this stage if wait_tail->next == NULL, you are in CASE f, so the last
> > remaining HEAD stays? (And llist_for_each_safe() in
> > rcu_sr_normal_gp_cleanup_work becomes a NOOP).
> >
>
> Never mind, sorry for spewing nonsense. You can never end up with CASE f here,
> so you still need the queue_work(). I think it is worth looking into how to free
> the last HEAD without queuing a work, though (while not causing wreckage).
>
No problem at all :)
--
Uladzislau Rezki
On Thu, Mar 07, 2024 at 01:04:25AM +0100, Frederic Weisbecker wrote:
> On Tue, Mar 05, 2024 at 08:57:19PM +0100, Uladzislau Rezki (Sony) wrote:
> > [...]
> > Reported-by: Frederic Weisbecker <[email protected]>
> > Fixes: 05a10b921000 ("rcu: Support direct wake-up of synchronize_rcu() users")
> > Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
>
> Reviewed-by: Frederic Weisbecker <[email protected]>
>
Thank you, I will update with your review-by tag!
--
Uladzislau Rezki
On Wed, Mar 06, 2024 at 05:31:31PM -0500, Joel Fernandes wrote:
>
>
> On 3/5/2024 2:57 PM, Uladzislau Rezki (Sony) wrote:
> > [...]
>
> Ah, nice. So instead of allocating in the GP thread and freeing in either the
> GP thread or the worker, you allocate heads only in the GP thread and free them
> only in the worker, thus essentially fixing the UAF that Frederic found.
>
> AFAICS, this fixes the issue.
>
> Reviewed-by: Joel Fernandes (Google) <[email protected]>
>
Thank you for the review-by!
> There might be a way to prevent queuing new work as a fast-path optimization,
> in case the CBs per GP will always be < SR_MAX_USERS_WAKE_FROM_GP, but I could
> not find a workqueue API that helps there, and work_busy() has comments saying
> not to use it.
>
This is not really critical but yes, we can think of it.
--
Uladzislau Rezki
On 3/7/2024 7:57 AM, Uladzislau Rezki wrote:
> On Wed, Mar 06, 2024 at 05:31:31PM -0500, Joel Fernandes wrote:
>>
>>
>> On 3/5/2024 2:57 PM, Uladzislau Rezki (Sony) wrote:
>>> [...]
>>
>> Ah, nice. So instead of allocating in the GP thread and freeing in either the
>> GP thread or the worker, you allocate heads only in the GP thread and free them
>> only in the worker, thus essentially fixing the UAF that Frederic found.
>>
>> AFAICS, this fixes the issue.
>>
>> Reviewed-by: Joel Fernandes (Google) <[email protected]>
>>
> Thank you for the review-by!
>
>> There might be a way to prevent queuing new work as a fast-path optimization,
>> in case the CBs per GP will always be < SR_MAX_USERS_WAKE_FROM_GP, but I could
>> not find a workqueue API that helps there, and work_busy() has comments saying
>> not to use it.
>>
> This is not really critical but yes, we can think of it.
>
Thanks, I have a patch that does that. I could not help but write it as soon as
I woke up in the morning ;-). It passes torture testing and I will push it for
further review after some more testing.
thanks,
- Joel