From: Zhaoyang Huang <[email protected]>
Race detected between psimon_new and psimon_old as shown below, which
cause panic by accessing invalid psi_system->poll_wait->wait_queue_entry
and psi_system->poll_timer->entry->next. It is not necessary to reinit
resource of psi_system when psi_trigger_create.
psi_trigger_create psimon_new psimon_old
init_waitqueue_head finish_wait
spin_lock(lock_old)
spin_lock_init(lock_new)
wake_up_process(psimon_new)
finish_wait
spin_lock(lock_new)
list_del list_del
Signed-off-by: ziwei.dai <[email protected]>
Signed-off-by: ke.wang <[email protected]>
Signed-off-by: Zhaoyang Huang <[email protected]>
+++
v2: change del_timer_sync to del_timer in psi_trigger_destroy
+++
---
kernel/sched/psi.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index cc25a3c..fe29022 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -182,6 +182,8 @@ struct psi_group psi_system = {
static void psi_avgs_work(struct work_struct *work);
+static void poll_timer_fn(struct timer_list *t);
+
static void group_init(struct psi_group *group)
{
int cpu;
@@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
memset(group->polling_total, 0, sizeof(group->polling_total));
group->polling_next_update = ULLONG_MAX;
group->polling_until = 0;
+ init_waitqueue_head(&group->poll_wait);
+ timer_setup(&group->poll_timer, poll_timer_fn, 0);
rcu_assign_pointer(group->poll_task, NULL);
}
@@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
return ERR_CAST(task);
}
atomic_set(&group->poll_wakeup, 0);
- init_waitqueue_head(&group->poll_wait);
wake_up_process(task);
timer_setup(&group->poll_timer, poll_timer_fn, 0);
rcu_assign_pointer(group->poll_task, task);
@@ -1233,7 +1236,7 @@ static void psi_trigger_destroy(struct kref *ref)
* But it might have been already scheduled before
* that - deschedule it cleanly before destroying it.
*/
- del_timer_sync(&group->poll_timer);
+ del_timer(&group->poll_timer);
kthread_stop(task_to_destroy);
}
kfree(t);
--
1.9.1
resend for adding other maintainers in cc
On Tue, May 18, 2021 at 10:11 AM Huangzhaoyang <[email protected]> wrote:
>
> From: Zhaoyang Huang <[email protected]>
>
> Race detected between psimon_new and psimon_old as shown below, which
> cause panic by accessing invalid psi_system->poll_wait->wait_queue_entry
> and psi_system->poll_timer->entry->next. It is not necessary to reinit
> resource of psi_system when psi_trigger_create.
>
> psi_trigger_create psimon_new psimon_old
> init_waitqueue_head finish_wait
> spin_lock(lock_old)
> spin_lock_init(lock_new)
> wake_up_process(psimon_new)
>
> finish_wait
> spin_lock(lock_new)
> list_del list_del
>
> Signed-off-by: ziwei.dai <[email protected]>
> Signed-off-by: ke.wang <[email protected]>
> Signed-off-by: Zhaoyang Huang <[email protected]>
> +++
> v2: change del_timer_sync to del_timer in psi_trigger_destroy
> +++
> ---
> kernel/sched/psi.c | 7 +++++--
> 1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index cc25a3c..fe29022 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -182,6 +182,8 @@ struct psi_group psi_system = {
>
> static void psi_avgs_work(struct work_struct *work);
>
> +static void poll_timer_fn(struct timer_list *t);
> +
> static void group_init(struct psi_group *group)
> {
> int cpu;
> @@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
> memset(group->polling_total, 0, sizeof(group->polling_total));
> group->polling_next_update = ULLONG_MAX;
> group->polling_until = 0;
> + init_waitqueue_head(&group->poll_wait);
> + timer_setup(&group->poll_timer, poll_timer_fn, 0);
> rcu_assign_pointer(group->poll_task, NULL);
> }
>
> @@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> return ERR_CAST(task);
> }
> atomic_set(&group->poll_wakeup, 0);
> - init_waitqueue_head(&group->poll_wait);
> wake_up_process(task);
> timer_setup(&group->poll_timer, poll_timer_fn, 0);
> rcu_assign_pointer(group->poll_task, task);
> @@ -1233,7 +1236,7 @@ static void psi_trigger_destroy(struct kref *ref)
> * But it might have been already scheduled before
> * that - deschedule it cleanly before destroying it.
> */
> - del_timer_sync(&group->poll_timer);
> + del_timer(&group->poll_timer);
> kthread_stop(task_to_destroy);
> }
> kfree(t);
> --
> 1.9.1
>
On Mon, May 17, 2021 at 7:16 PM Zhaoyang Huang <[email protected]> wrote:
>
> resend for adding other maintainers in cc
>
> On Tue, May 18, 2021 at 10:11 AM Huangzhaoyang <[email protected]> wrote:
> >
> > From: Zhaoyang Huang <[email protected]>
> >
> > Race detected between psimon_new and psimon_old as shown below, which
> > cause panic by accessing invalid psi_system->poll_wait->wait_queue_entry
> > and psi_system->poll_timer->entry->next. It is not necessary to reinit
> > resource of psi_system when psi_trigger_create.
Please clarify that this fixes a race between psi_trigger_create and
psi_trigger_destroy, when psi_trigger_destroy is destroying the last
trigger in a psi group while racing with psi_trigger_create.
> >
> > psi_trigger_create psimon_new psimon_old
> > init_waitqueue_head finish_wait
> > spin_lock(lock_old)
> > spin_lock_init(lock_new)
> > wake_up_process(psimon_new)
> >
> > finish_wait
> > spin_lock(lock_new)
> > list_del list_del
Add:
Fixes: 461daba06bdc ("psi: eliminate kthread_worker from psi trigger
scheduling mechanism")
> >
> > Signed-off-by: ziwei.dai <[email protected]>
> > Signed-off-by: ke.wang <[email protected]>
> > Signed-off-by: Zhaoyang Huang <[email protected]>
> > +++
> > v2: change del_timer_sync to del_timer in psi_trigger_destroy
Please see my reply to your original RFC:
https://lore.kernel.org/patchwork/patch/1429498/#1627196
You are still missing:
- removal of timer_setup() in psi_trigger_create()
- removal of an obsolete comment in psi_trigger_destroy()
- you are calling del_timer() from outside of trigger_lock protected
section, which leaves the following race:
psi_trigger_destroy
mutex_lock(trigger_lock)
mutex_unlock(trigger_lock)
<preempted>
psi_trigger_create
psi_group_change
psi_schedule_poll_work
mod_timer(poll_timer)
<resumes>
del_timer(poll_timer)
This would lead to deletion of the new poll_timer created by
psi_trigger_create() and would stop periodic psi_poll_work(). This
results in psi trigger not firing when it should. In my suggestion
del_timer() was called under trigger_lock protection which would
prevent psi_trigger_create() from starting a new timer before
psi_trigger_destroy() can call del_timer().
> > +++
> > ---
> > kernel/sched/psi.c | 7 +++++--
> > 1 file changed, 5 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index cc25a3c..fe29022 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -182,6 +182,8 @@ struct psi_group psi_system = {
> >
> > static void psi_avgs_work(struct work_struct *work);
> >
> > +static void poll_timer_fn(struct timer_list *t);
> > +
> > static void group_init(struct psi_group *group)
> > {
> > int cpu;
> > @@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
> > memset(group->polling_total, 0, sizeof(group->polling_total));
> > group->polling_next_update = ULLONG_MAX;
> > group->polling_until = 0;
> > + init_waitqueue_head(&group->poll_wait);
> > + timer_setup(&group->poll_timer, poll_timer_fn, 0);
> > rcu_assign_pointer(group->poll_task, NULL);
> > }
> >
> > @@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > return ERR_CAST(task);
> > }
> > atomic_set(&group->poll_wakeup, 0);
> > - init_waitqueue_head(&group->poll_wait);
> > wake_up_process(task);
> > timer_setup(&group->poll_timer, poll_timer_fn, 0);
> > rcu_assign_pointer(group->poll_task, task);
> > @@ -1233,7 +1236,7 @@ static void psi_trigger_destroy(struct kref *ref)
> > * But it might have been already scheduled before
> > * that - deschedule it cleanly before destroying it.
> > */
> > - del_timer_sync(&group->poll_timer);
> > + del_timer(&group->poll_timer);
> > kthread_stop(task_to_destroy);
> > }
> > kfree(t);
> > --
> > 1.9.1
> >