The following circular locking dependency was reported when running
cpus online/offline test on an arm64 system.
[ 84.195923] Chain exists of:
dmc620_pmu_irqs_lock --> cpu_hotplug_lock --> cpuhp_state-down
[ 84.207305] Possible unsafe locking scenario:
[ 84.213212] CPU0 CPU1
[ 84.217729] ---- ----
[ 84.222247] lock(cpuhp_state-down);
[ 84.225899] lock(cpu_hotplug_lock);
[ 84.232068] lock(cpuhp_state-down);
[ 84.238237] lock(dmc620_pmu_irqs_lock);
[ 84.242236]
*** DEADLOCK ***
The problematic locking order seems to be
lock(dmc620_pmu_irqs_lock) --> lock(cpu_hotplug_lock)
This locking order happens when dmc620_pmu_get_irq() is called from
dmc620_pmu_device_probe(). Since dmc620_pmu_irqs_lock is used for
protecting the dmc620_pmu_irqs structure only, we don't actually need
to hold the lock when adding a new instance to the CPU hotplug subsystem.
Fix this possible deadlock scenario by releasing the lock before
calling cpuhp_state_add_instance_nocalls() and reacquiring it afterward.
To avoid the possibility of 2 racing dmc620_pmu_get_irq() calls inserting
duplicated dmc620_pmu_irq structures with the same irq number, a dummy
entry is inserted before releasing the lock which will block a competing
thread from inserting another irq structure of the same irq number.
Suggested-by: Robin Murphy <[email protected]>
Signed-off-by: Waiman Long <[email protected]>
---
drivers/perf/arm_dmc620_pmu.c | 28 ++++++++++++++++++++++------
1 file changed, 22 insertions(+), 6 deletions(-)
diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
index 9d0f01c4455a..7cafd4dd4522 100644
--- a/drivers/perf/arm_dmc620_pmu.c
+++ b/drivers/perf/arm_dmc620_pmu.c
@@ -76,6 +76,7 @@ struct dmc620_pmu_irq {
refcount_t refcount;
unsigned int irq_num;
unsigned int cpu;
+ unsigned int valid;
};
struct dmc620_pmu {
@@ -423,9 +424,14 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
struct dmc620_pmu_irq *irq;
int ret;
- list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
- if (irq->irq_num == irq_num && refcount_inc_not_zero(&irq->refcount))
+ list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
+ if (irq->irq_num != irq_num)
+ continue;
+ if (!irq->valid)
+ return ERR_PTR(-EAGAIN); /* Try again later */
+ if (refcount_inc_not_zero(&irq->refcount))
return irq;
+ }
irq = kzalloc(sizeof(*irq), GFP_KERNEL);
if (!irq)
@@ -447,13 +453,23 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
if (ret)
goto out_free_irq;
- ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &irq->node);
- if (ret)
- goto out_free_irq;
-
irq->irq_num = irq_num;
list_add(&irq->irqs_node, &dmc620_pmu_irqs);
+ /*
+ * Release dmc620_pmu_irqs_lock before calling
+ * cpuhp_state_add_instance_nocalls() and reacquire it afterward.
+ */
+ mutex_unlock(&dmc620_pmu_irqs_lock);
+ ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &irq->node);
+ mutex_lock(&dmc620_pmu_irqs_lock);
+
+ if (ret) {
+ list_del(&irq->irqs_node);
+ goto out_free_irq;
+ }
+
+ irq->valid = true;
return irq;
out_free_irq:
--
2.31.1
On Fri, Jul 21, 2023 at 11:17:28PM -0400, Waiman Long wrote:
> The following circular locking dependency was reported when running
> cpus online/offline test on an arm64 system.
>
> [ 84.195923] Chain exists of:
> dmc620_pmu_irqs_lock --> cpu_hotplug_lock --> cpuhp_state-down
>
> [ 84.207305] Possible unsafe locking scenario:
>
> [ 84.213212] CPU0 CPU1
> [ 84.217729] ---- ----
> [ 84.222247] lock(cpuhp_state-down);
> [ 84.225899] lock(cpu_hotplug_lock);
> [ 84.232068] lock(cpuhp_state-down);
> [ 84.238237] lock(dmc620_pmu_irqs_lock);
> [ 84.242236]
> *** DEADLOCK ***
>
> The problematic locking order seems to be
>
> lock(dmc620_pmu_irqs_lock) --> lock(cpu_hotplug_lock)
>
> This locking order happens when dmc620_pmu_get_irq() is called from
> dmc620_pmu_device_probe(). Since dmc620_pmu_irqs_lock is used for
> protecting the dmc620_pmu_irqs structure only, we don't actually need
> to hold the lock when adding a new instance to the CPU hotplug subsystem.
>
> Fix this possible deadlock scenario by releasing the lock before
> calling cpuhp_state_add_instance_nocalls() and reacquiring it afterward.
> To avoid the possibility of 2 racing dmc620_pmu_get_irq() calls inserting
> duplicated dmc620_pmu_irq structures with the same irq number, a dummy
> entry is inserted before releasing the lock which will block a competing
> thread from inserting another irq structure of the same irq number.
>
> Suggested-by: Robin Murphy <[email protected]>
> Signed-off-by: Waiman Long <[email protected]>
> ---
> drivers/perf/arm_dmc620_pmu.c | 28 ++++++++++++++++++++++------
> 1 file changed, 22 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
> index 9d0f01c4455a..7cafd4dd4522 100644
> --- a/drivers/perf/arm_dmc620_pmu.c
> +++ b/drivers/perf/arm_dmc620_pmu.c
> @@ -76,6 +76,7 @@ struct dmc620_pmu_irq {
> refcount_t refcount;
> unsigned int irq_num;
> unsigned int cpu;
> + unsigned int valid;
> };
>
> struct dmc620_pmu {
> @@ -423,9 +424,14 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
> struct dmc620_pmu_irq *irq;
> int ret;
>
> - list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
> - if (irq->irq_num == irq_num && refcount_inc_not_zero(&irq->refcount))
> + list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
> + if (irq->irq_num != irq_num)
> + continue;
> + if (!irq->valid)
> + return ERR_PTR(-EAGAIN); /* Try again later */
It looks like this can bubble up to the probe() routine. Does the driver
core handle -EAGAIN coming back from a probe routine?
> + if (refcount_inc_not_zero(&irq->refcount))
> return irq;
> + }
>
> irq = kzalloc(sizeof(*irq), GFP_KERNEL);
> if (!irq)
> @@ -447,13 +453,23 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
> if (ret)
> goto out_free_irq;
>
> - ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &irq->node);
> - if (ret)
> - goto out_free_irq;
> -
> irq->irq_num = irq_num;
> list_add(&irq->irqs_node, &dmc620_pmu_irqs);
>
> + /*
> + * Release dmc620_pmu_irqs_lock before calling
> + * cpuhp_state_add_instance_nocalls() and reacquire it afterward.
> + */
> + mutex_unlock(&dmc620_pmu_irqs_lock);
> + ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &irq->node);
> + mutex_lock(&dmc620_pmu_irqs_lock);
> +
> + if (ret) {
> + list_del(&irq->irqs_node);
> + goto out_free_irq;
> + }
> +
> + irq->valid = true;
Do you actually need a new flag here, or could we use a refcount of zero
to indicate that the irq descriptor is still being constructed?
Will
On 7/28/23 11:06, Will Deacon wrote:
> On Fri, Jul 21, 2023 at 11:17:28PM -0400, Waiman Long wrote:
>> The following circular locking dependency was reported when running
>> cpus online/offline test on an arm64 system.
>>
>> [ 84.195923] Chain exists of:
>> dmc620_pmu_irqs_lock --> cpu_hotplug_lock --> cpuhp_state-down
>>
>> [ 84.207305] Possible unsafe locking scenario:
>>
>> [ 84.213212] CPU0 CPU1
>> [ 84.217729] ---- ----
>> [ 84.222247] lock(cpuhp_state-down);
>> [ 84.225899] lock(cpu_hotplug_lock);
>> [ 84.232068] lock(cpuhp_state-down);
>> [ 84.238237] lock(dmc620_pmu_irqs_lock);
>> [ 84.242236]
>> *** DEADLOCK ***
>>
>> The problematic locking order seems to be
>>
>> lock(dmc620_pmu_irqs_lock) --> lock(cpu_hotplug_lock)
>>
>> This locking order happens when dmc620_pmu_get_irq() is called from
>> dmc620_pmu_device_probe(). Since dmc620_pmu_irqs_lock is used for
>> protecting the dmc620_pmu_irqs structure only, we don't actually need
>> to hold the lock when adding a new instance to the CPU hotplug subsystem.
>>
>> Fix this possible deadlock scenario by releasing the lock before
>> calling cpuhp_state_add_instance_nocalls() and reacquiring it afterward.
>> To avoid the possibility of 2 racing dmc620_pmu_get_irq() calls inserting
>> duplicated dmc620_pmu_irq structures with the same irq number, a dummy
>> entry is inserted before releasing the lock which will block a competing
>> thread from inserting another irq structure of the same irq number.
>>
>> Suggested-by: Robin Murphy <[email protected]>
>> Signed-off-by: Waiman Long <[email protected]>
>> ---
>> drivers/perf/arm_dmc620_pmu.c | 28 ++++++++++++++++++++++------
>> 1 file changed, 22 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
>> index 9d0f01c4455a..7cafd4dd4522 100644
>> --- a/drivers/perf/arm_dmc620_pmu.c
>> +++ b/drivers/perf/arm_dmc620_pmu.c
>> @@ -76,6 +76,7 @@ struct dmc620_pmu_irq {
>> refcount_t refcount;
>> unsigned int irq_num;
>> unsigned int cpu;
>> + unsigned int valid;
>> };
>>
>> struct dmc620_pmu {
>> @@ -423,9 +424,14 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
>> struct dmc620_pmu_irq *irq;
>> int ret;
>>
>> - list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
>> - if (irq->irq_num == irq_num && refcount_inc_not_zero(&irq->refcount))
>> + list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
>> + if (irq->irq_num != irq_num)
>> + continue;
>> + if (!irq->valid)
>> + return ERR_PTR(-EAGAIN); /* Try again later */
> It looks like this can bubble up to the probe() routine. Does the driver
> core handle -EAGAIN coming back from a probe routine?
Right, I should add code to handle this error condition. I think it can
be handled in dmc620_pmu_get_irq(). The important thing is to release
the mutex, wait a few ms and try again. What do you think?
>
>> + if (refcount_inc_not_zero(&irq->refcount))
>> return irq;
>> + }
>>
>> irq = kzalloc(sizeof(*irq), GFP_KERNEL);
>> if (!irq)
>> @@ -447,13 +453,23 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
>> if (ret)
>> goto out_free_irq;
>>
>> - ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &irq->node);
>> - if (ret)
>> - goto out_free_irq;
>> -
>> irq->irq_num = irq_num;
>> list_add(&irq->irqs_node, &dmc620_pmu_irqs);
>>
>> + /*
>> + * Release dmc620_pmu_irqs_lock before calling
>> + * cpuhp_state_add_instance_nocalls() and reacquire it afterward.
>> + */
>> + mutex_unlock(&dmc620_pmu_irqs_lock);
>> + ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &irq->node);
>> + mutex_lock(&dmc620_pmu_irqs_lock);
>> +
>> + if (ret) {
>> + list_del(&irq->irqs_node);
>> + goto out_free_irq;
>> + }
>> +
>> + irq->valid = true;
> Do you actually need a new flag here, or could we use a refcount of zero
> to indicate that the irq descriptor is still being constructed?
A refcount of zero can also mean that an existing irq is about to be
removed. Right? So I don't think we can use that for this purpose.
Besides, there is a 4-byte hole in the structure anyway for arm64.
Cheers,
Longman
On 8/2/23 21:37, Waiman Long wrote:
>
> On 7/28/23 11:06, Will Deacon wrote:
>> On Fri, Jul 21, 2023 at 11:17:28PM -0400, Waiman Long wrote:
>>> The following circular locking dependency was reported when running
>>> cpus online/offline test on an arm64 system.
>>>
>>> [ 84.195923] Chain exists of:
>>> dmc620_pmu_irqs_lock --> cpu_hotplug_lock -->
>>> cpuhp_state-down
>>>
>>> [ 84.207305] Possible unsafe locking scenario:
>>>
>>> [ 84.213212] CPU0 CPU1
>>> [ 84.217729] ---- ----
>>> [ 84.222247] lock(cpuhp_state-down);
>>> [ 84.225899] lock(cpu_hotplug_lock);
>>> [ 84.232068] lock(cpuhp_state-down);
>>> [ 84.238237] lock(dmc620_pmu_irqs_lock);
>>> [ 84.242236]
>>> *** DEADLOCK ***
>>>
>>> The problematic locking order seems to be
>>>
>>> lock(dmc620_pmu_irqs_lock) --> lock(cpu_hotplug_lock)
>>>
>>> This locking order happens when dmc620_pmu_get_irq() is called from
>>> dmc620_pmu_device_probe(). Since dmc620_pmu_irqs_lock is used for
>>> protecting the dmc620_pmu_irqs structure only, we don't actually need
>>> to hold the lock when adding a new instance to the CPU hotplug
>>> subsystem.
>>>
>>> Fix this possible deadlock scenario by releasing the lock before
>>> calling cpuhp_state_add_instance_nocalls() and reacquiring it
>>> afterward.
>>> To avoid the possibility of 2 racing dmc620_pmu_get_irq() calls
>>> inserting
>>> duplicated dmc620_pmu_irq structures with the same irq number, a dummy
>>> entry is inserted before releasing the lock which will block a
>>> competing
>>> thread from inserting another irq structure of the same irq number.
>>>
>>> Suggested-by: Robin Murphy <[email protected]>
>>> Signed-off-by: Waiman Long <[email protected]>
>>> ---
>>> drivers/perf/arm_dmc620_pmu.c | 28 ++++++++++++++++++++++------
>>> 1 file changed, 22 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/perf/arm_dmc620_pmu.c
>>> b/drivers/perf/arm_dmc620_pmu.c
>>> index 9d0f01c4455a..7cafd4dd4522 100644
>>> --- a/drivers/perf/arm_dmc620_pmu.c
>>> +++ b/drivers/perf/arm_dmc620_pmu.c
>>> @@ -76,6 +76,7 @@ struct dmc620_pmu_irq {
>>> refcount_t refcount;
>>> unsigned int irq_num;
>>> unsigned int cpu;
>>> + unsigned int valid;
>>> };
>>> struct dmc620_pmu {
>>> @@ -423,9 +424,14 @@ static struct dmc620_pmu_irq
>>> *__dmc620_pmu_get_irq(int irq_num)
>>> struct dmc620_pmu_irq *irq;
>>> int ret;
>>> - list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
>>> - if (irq->irq_num == irq_num &&
>>> refcount_inc_not_zero(&irq->refcount))
>>> + list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
>>> + if (irq->irq_num != irq_num)
>>> + continue;
>>> + if (!irq->valid)
>>> + return ERR_PTR(-EAGAIN); /* Try again later */
>> It looks like this can bubble up to the probe() routine. Does the driver
>> core handle -EAGAIN coming back from a probe routine?
> Right, I should add code to handle this error condition. I think it
> can be handled in dmc620_pmu_get_irq(). The important thing is to
> release the mutex, wait a few ms and try again. What do you think?
>>
>>> + if (refcount_inc_not_zero(&irq->refcount))
>>> return irq;
>>> + }
>>> irq = kzalloc(sizeof(*irq), GFP_KERNEL);
>>> if (!irq)
>>> @@ -447,13 +453,23 @@ static struct dmc620_pmu_irq
>>> *__dmc620_pmu_get_irq(int irq_num)
>>> if (ret)
>>> goto out_free_irq;
>>> - ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num,
>>> &irq->node);
>>> - if (ret)
>>> - goto out_free_irq;
>>> -
>>> irq->irq_num = irq_num;
>>> list_add(&irq->irqs_node, &dmc620_pmu_irqs);
>>> + /*
>>> + * Release dmc620_pmu_irqs_lock before calling
>>> + * cpuhp_state_add_instance_nocalls() and reacquire it afterward.
>>> + */
>>> + mutex_unlock(&dmc620_pmu_irqs_lock);
>>> + ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num,
>>> &irq->node);
>>> + mutex_lock(&dmc620_pmu_irqs_lock);
>>> +
>>> + if (ret) {
>>> + list_del(&irq->irqs_node);
>>> + goto out_free_irq;
>>> + }
>>> +
>>> + irq->valid = true;
>> Do you actually need a new flag here, or could we use a refcount of zero
>> to indicate that the irq descriptor is still being constructed?
>
> A refcount of zero can also mean that an existing irq is about to be
> removed. Right? So I don't think we can use that for this purpose.
> Besides, there is a 4-byte hole in the structure anyway for arm64.
Alternatively, I can use a special reference count value, say -1, to
signal that the irq is not valid yet. What do you think?
Cheers,
Longman
On Wed, Aug 02, 2023 at 09:37:31PM -0400, Waiman Long wrote:
>
> On 7/28/23 11:06, Will Deacon wrote:
> > On Fri, Jul 21, 2023 at 11:17:28PM -0400, Waiman Long wrote:
> > > The following circular locking dependency was reported when running
> > > cpus online/offline test on an arm64 system.
> > >
> > > [ 84.195923] Chain exists of:
> > > dmc620_pmu_irqs_lock --> cpu_hotplug_lock --> cpuhp_state-down
> > >
> > > [ 84.207305] Possible unsafe locking scenario:
> > >
> > > [ 84.213212] CPU0 CPU1
> > > [ 84.217729] ---- ----
> > > [ 84.222247] lock(cpuhp_state-down);
> > > [ 84.225899] lock(cpu_hotplug_lock);
> > > [ 84.232068] lock(cpuhp_state-down);
> > > [ 84.238237] lock(dmc620_pmu_irqs_lock);
> > > [ 84.242236]
> > > *** DEADLOCK ***
> > >
> > > The problematic locking order seems to be
> > >
> > > lock(dmc620_pmu_irqs_lock) --> lock(cpu_hotplug_lock)
> > >
> > > This locking order happens when dmc620_pmu_get_irq() is called from
> > > dmc620_pmu_device_probe(). Since dmc620_pmu_irqs_lock is used for
> > > protecting the dmc620_pmu_irqs structure only, we don't actually need
> > > to hold the lock when adding a new instance to the CPU hotplug subsystem.
> > >
> > > Fix this possible deadlock scenario by releasing the lock before
> > > calling cpuhp_state_add_instance_nocalls() and reacquiring it afterward.
> > > To avoid the possibility of 2 racing dmc620_pmu_get_irq() calls inserting
> > > duplicated dmc620_pmu_irq structures with the same irq number, a dummy
> > > entry is inserted before releasing the lock which will block a competing
> > > thread from inserting another irq structure of the same irq number.
> > >
> > > Suggested-by: Robin Murphy <[email protected]>
> > > Signed-off-by: Waiman Long <[email protected]>
> > > ---
> > > drivers/perf/arm_dmc620_pmu.c | 28 ++++++++++++++++++++++------
> > > 1 file changed, 22 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
> > > index 9d0f01c4455a..7cafd4dd4522 100644
> > > --- a/drivers/perf/arm_dmc620_pmu.c
> > > +++ b/drivers/perf/arm_dmc620_pmu.c
> > > @@ -76,6 +76,7 @@ struct dmc620_pmu_irq {
> > > refcount_t refcount;
> > > unsigned int irq_num;
> > > unsigned int cpu;
> > > + unsigned int valid;
> > > };
> > > struct dmc620_pmu {
> > > @@ -423,9 +424,14 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
> > > struct dmc620_pmu_irq *irq;
> > > int ret;
> > > - list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
> > > - if (irq->irq_num == irq_num && refcount_inc_not_zero(&irq->refcount))
> > > + list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
> > > + if (irq->irq_num != irq_num)
> > > + continue;
> > > + if (!irq->valid)
> > > + return ERR_PTR(-EAGAIN); /* Try again later */
> > It looks like this can bubble up to the probe() routine. Does the driver
> > core handle -EAGAIN coming back from a probe routine?
> Right, I should add code to handle this error condition. I think it can be
> handled in dmc620_pmu_get_irq(). The important thing is to release the
> mutex, wait a few ms and try again. What do you think?
I don't really follow, but waiting a few ms and trying again sounds like
a really nasty hack for something which doesn't appear to be constrained
by broken hardware. In other words, we got ourselves into this mess, so
we should be able to resolve it properly.
Will
On Fri, Aug 04, 2023 at 12:51:47PM -0400, Waiman Long wrote:
> On 8/4/23 12:28, Will Deacon wrote:
> > > > > struct dmc620_pmu {
> > > > > @@ -423,9 +424,14 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
> > > > > struct dmc620_pmu_irq *irq;
> > > > > int ret;
> > > > > - list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
> > > > > - if (irq->irq_num == irq_num && refcount_inc_not_zero(&irq->refcount))
> > > > > + list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
> > > > > + if (irq->irq_num != irq_num)
> > > > > + continue;
> > > > > + if (!irq->valid)
> > > > > + return ERR_PTR(-EAGAIN); /* Try again later */
> > > > It looks like this can bubble up to the probe() routine. Does the driver
> > > > core handle -EAGAIN coming back from a probe routine?
> > > Right, I should add code to handle this error condition. I think it can be
> > > handled in dmc620_pmu_get_irq(). The important thing is to release the
> > > mutex, wait a few ms and try again. What do you think?
> > I don't really follow, but waiting a few ms and trying again sounds like
> > a really nasty hack for something which doesn't appear to be constrained
> > by broken hardware. In other words, we got ourselves into this mess, so
> > we should be able to resolve it properly.
>
> From my point of view, the proper way to solve the problem is to reverse the
> locking order. Since you don't to add a EXPORT statement to the core kernel
> code, we will have to find a way around it by not holding the
> dmc620_pmu_irqs_lock when cpuhp_state_add_instance_nocalls() is called.
> Another alternative that I can think of is to add one more mutex that we
> will hold just for the entirety of? __dmc620_pmu_get_irq() and take
> dmc620_pmu_irqs_lock only when the linked list is being modified. That will
> eliminate the need to introduce arbitrary wait as other caller of
> __dmc620_pmu_get_irq() will wait in the new mutex. Will this work for you?
Yes. To be honest, I think we've both spent far too much time trying to
fix this (and I admire your persistence!), so adding a mutex to make it
"obviously" correct sounds like the right thing to me. We can look at
optimisations later if anybody cares.
Cheers,
Will
On Wed, Aug 02, 2023 at 09:44:58PM -0400, Waiman Long wrote:
>
> On 8/2/23 21:37, Waiman Long wrote:
> >
> > On 7/28/23 11:06, Will Deacon wrote:
> > > On Fri, Jul 21, 2023 at 11:17:28PM -0400, Waiman Long wrote:
> > > > The following circular locking dependency was reported when running
> > > > cpus online/offline test on an arm64 system.
> > > >
> > > > [?? 84.195923] Chain exists of:
> > > > ????????????????? dmc620_pmu_irqs_lock --> cpu_hotplug_lock -->
> > > > cpuhp_state-down
> > > >
> > > > [?? 84.207305]? Possible unsafe locking scenario:
> > > >
> > > > [?? 84.213212]??????? CPU0??????????????????? CPU1
> > > > [?? 84.217729]??????? ----??????????????????? ----
> > > > [?? 84.222247]?? lock(cpuhp_state-down);
> > > > [?? 84.225899] lock(cpu_hotplug_lock);
> > > > [?? 84.232068] lock(cpuhp_state-down);
> > > > [?? 84.238237]?? lock(dmc620_pmu_irqs_lock);
> > > > [?? 84.242236]
> > > > ???????????????? *** DEADLOCK ***
> > > >
> > > > The problematic locking order seems to be
> > > >
> > > > ????lock(dmc620_pmu_irqs_lock) --> lock(cpu_hotplug_lock)
> > > >
> > > > This locking order happens when dmc620_pmu_get_irq() is called from
> > > > dmc620_pmu_device_probe(). Since dmc620_pmu_irqs_lock is used for
> > > > protecting the dmc620_pmu_irqs structure only, we don't actually need
> > > > to hold the lock when adding a new instance to the CPU hotplug
> > > > subsystem.
> > > >
> > > > Fix this possible deadlock scenario by releasing the lock before
> > > > calling cpuhp_state_add_instance_nocalls() and reacquiring it
> > > > afterward.
> > > > To avoid the possibility of 2 racing dmc620_pmu_get_irq() calls
> > > > inserting
> > > > duplicated dmc620_pmu_irq structures with the same irq number, a dummy
> > > > entry is inserted before releasing the lock which will block a
> > > > competing
> > > > thread from inserting another irq structure of the same irq number.
> > > >
> > > > Suggested-by: Robin Murphy <[email protected]>
> > > > Signed-off-by: Waiman Long <[email protected]>
> > > > ---
> > > > ? drivers/perf/arm_dmc620_pmu.c | 28 ++++++++++++++++++++++------
> > > > ? 1 file changed, 22 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/drivers/perf/arm_dmc620_pmu.c
> > > > b/drivers/perf/arm_dmc620_pmu.c
> > > > index 9d0f01c4455a..7cafd4dd4522 100644
> > > > --- a/drivers/perf/arm_dmc620_pmu.c
> > > > +++ b/drivers/perf/arm_dmc620_pmu.c
> > > > @@ -76,6 +76,7 @@ struct dmc620_pmu_irq {
> > > > ????? refcount_t refcount;
> > > > ????? unsigned int irq_num;
> > > > ????? unsigned int cpu;
> > > > +??? unsigned int valid;
> > > > ? };
> > > > ? ? struct dmc620_pmu {
> > > > @@ -423,9 +424,14 @@ static struct dmc620_pmu_irq
> > > > *__dmc620_pmu_get_irq(int irq_num)
> > > > ????? struct dmc620_pmu_irq *irq;
> > > > ????? int ret;
> > > > ? -??? list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
> > > > -??????? if (irq->irq_num == irq_num &&
> > > > refcount_inc_not_zero(&irq->refcount))
> > > > +??? list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
> > > > +??????? if (irq->irq_num != irq_num)
> > > > +??????????? continue;
> > > > +??????? if (!irq->valid)
> > > > +??????????? return ERR_PTR(-EAGAIN);??? /* Try again later */
> > > It looks like this can bubble up to the probe() routine. Does the driver
> > > core handle -EAGAIN coming back from a probe routine?
> > Right, I should add code to handle this error condition. I think it can
> > be handled in dmc620_pmu_get_irq(). The important thing is to release
> > the mutex, wait a few ms and try again. What do you think?
> > >
> > > > +??????? if (refcount_inc_not_zero(&irq->refcount))
> > > > ????????????? return irq;
> > > > +??? }
> > > > ? ????? irq = kzalloc(sizeof(*irq), GFP_KERNEL);
> > > > ????? if (!irq)
> > > > @@ -447,13 +453,23 @@ static struct dmc620_pmu_irq
> > > > *__dmc620_pmu_get_irq(int irq_num)
> > > > ????? if (ret)
> > > > ????????? goto out_free_irq;
> > > > ? -??? ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num,
> > > > &irq->node);
> > > > -??? if (ret)
> > > > -??????? goto out_free_irq;
> > > > -
> > > > ????? irq->irq_num = irq_num;
> > > > ????? list_add(&irq->irqs_node, &dmc620_pmu_irqs);
> > > > ? +??? /*
> > > > +???? * Release dmc620_pmu_irqs_lock before calling
> > > > +???? * cpuhp_state_add_instance_nocalls() and reacquire it afterward.
> > > > +???? */
> > > > +??? mutex_unlock(&dmc620_pmu_irqs_lock);
> > > > +??? ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num,
> > > > &irq->node);
> > > > +??? mutex_lock(&dmc620_pmu_irqs_lock);
> > > > +
> > > > +??? if (ret) {
> > > > +??????? list_del(&irq->irqs_node);
> > > > +??????? goto out_free_irq;
> > > > +??? }
> > > > +
> > > > +??? irq->valid = true;
> > > Do you actually need a new flag here, or could we use a refcount of zero
> > > to indicate that the irq descriptor is still being constructed?
> >
> > A refcount of zero can also mean that an existing irq is about to be
> > removed. Right? So I don't think we can use that for this purpose.
> > Besides, there is a 4-byte hole in the structure anyway for arm64.
>
> Alternatively, I can use a special reference count value, say -1, to signal
> that the irq is not valid yet. What do you think?
If the device is being removed, we should teardown the irq handler first,
so I don't see why the refcount isn't the right thing.
Will
On 8/4/23 12:28, Will Deacon wrote:
>>>> struct dmc620_pmu {
>>>> @@ -423,9 +424,14 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
>>>> struct dmc620_pmu_irq *irq;
>>>> int ret;
>>>> - list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
>>>> - if (irq->irq_num == irq_num && refcount_inc_not_zero(&irq->refcount))
>>>> + list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
>>>> + if (irq->irq_num != irq_num)
>>>> + continue;
>>>> + if (!irq->valid)
>>>> + return ERR_PTR(-EAGAIN); /* Try again later */
>>> It looks like this can bubble up to the probe() routine. Does the driver
>>> core handle -EAGAIN coming back from a probe routine?
>> Right, I should add code to handle this error condition. I think it can be
>> handled in dmc620_pmu_get_irq(). The important thing is to release the
>> mutex, wait a few ms and try again. What do you think?
> I don't really follow, but waiting a few ms and trying again sounds like
> a really nasty hack for something which doesn't appear to be constrained
> by broken hardware. In other words, we got ourselves into this mess, so
> we should be able to resolve it properly.
From my point of view, the proper way to solve the problem is to
reverse the locking order. Since you don't to add a EXPORT statement to
the core kernel code, we will have to find a way around it by not
holding the dmc620_pmu_irqs_lock when cpuhp_state_add_instance_nocalls()
is called. Another alternative that I can think of is to add one more
mutex that we will hold just for the entirety of __dmc620_pmu_get_irq()
and take dmc620_pmu_irqs_lock only when the linked list is being
modified. That will eliminate the need to introduce arbitrary wait as
other caller of __dmc620_pmu_get_irq() will wait in the new mutex. Will
this work for you?
Cheers,
Longman
On 8/4/23 12:59, Will Deacon wrote:
> On Fri, Aug 04, 2023 at 12:51:47PM -0400, Waiman Long wrote:
>> On 8/4/23 12:28, Will Deacon wrote:
>>>>>> struct dmc620_pmu {
>>>>>> @@ -423,9 +424,14 @@ static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num)
>>>>>> struct dmc620_pmu_irq *irq;
>>>>>> int ret;
>>>>>> - list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
>>>>>> - if (irq->irq_num == irq_num && refcount_inc_not_zero(&irq->refcount))
>>>>>> + list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
>>>>>> + if (irq->irq_num != irq_num)
>>>>>> + continue;
>>>>>> + if (!irq->valid)
>>>>>> + return ERR_PTR(-EAGAIN); /* Try again later */
>>>>> It looks like this can bubble up to the probe() routine. Does the driver
>>>>> core handle -EAGAIN coming back from a probe routine?
>>>> Right, I should add code to handle this error condition. I think it can be
>>>> handled in dmc620_pmu_get_irq(). The important thing is to release the
>>>> mutex, wait a few ms and try again. What do you think?
>>> I don't really follow, but waiting a few ms and trying again sounds like
>>> a really nasty hack for something which doesn't appear to be constrained
>>> by broken hardware. In other words, we got ourselves into this mess, so
>>> we should be able to resolve it properly.
>> From my point of view, the proper way to solve the problem is to reverse the
>> locking order. Since you don't to add a EXPORT statement to the core kernel
>> code, we will have to find a way around it by not holding the
>> dmc620_pmu_irqs_lock when cpuhp_state_add_instance_nocalls() is called.
>> Another alternative that I can think of is to add one more mutex that we
>> will hold just for the entirety of __dmc620_pmu_get_irq() and take
>> dmc620_pmu_irqs_lock only when the linked list is being modified. That will
>> eliminate the need to introduce arbitrary wait as other caller of
>> __dmc620_pmu_get_irq() will wait in the new mutex. Will this work for you?
> Yes. To be honest, I think we've both spent far too much time trying to
> fix this (and I admire your persistence!), so adding a mutex to make it
> "obviously" correct sounds like the right thing to me. We can look at
> optimisations later if anybody cares.
Sorry to be too persistent sometimes:-) Will send out a new version soon.
Cheers,
Longman
On 8/4/23 12:29, Will Deacon wrote:
> On Wed, Aug 02, 2023 at 09:44:58PM -0400, Waiman Long wrote:
>> On 8/2/23 21:37, Waiman Long wrote:
>>> On 7/28/23 11:06, Will Deacon wrote:
>>>> On Fri, Jul 21, 2023 at 11:17:28PM -0400, Waiman Long wrote:
>>>>> The following circular locking dependency was reported when running
>>>>> cpus online/offline test on an arm64 system.
>>>>>
>>>>> [ 84.195923] Chain exists of:
>>>>> dmc620_pmu_irqs_lock --> cpu_hotplug_lock -->
>>>>> cpuhp_state-down
>>>>>
>>>>> [ 84.207305] Possible unsafe locking scenario:
>>>>>
>>>>> [ 84.213212] CPU0 CPU1
>>>>> [ 84.217729] ---- ----
>>>>> [ 84.222247] lock(cpuhp_state-down);
>>>>> [ 84.225899] lock(cpu_hotplug_lock);
>>>>> [ 84.232068] lock(cpuhp_state-down);
>>>>> [ 84.238237] lock(dmc620_pmu_irqs_lock);
>>>>> [ 84.242236]
>>>>> *** DEADLOCK ***
>>>>>
>>>>> The problematic locking order seems to be
>>>>>
>>>>> lock(dmc620_pmu_irqs_lock) --> lock(cpu_hotplug_lock)
>>>>>
>>>>> This locking order happens when dmc620_pmu_get_irq() is called from
>>>>> dmc620_pmu_device_probe(). Since dmc620_pmu_irqs_lock is used for
>>>>> protecting the dmc620_pmu_irqs structure only, we don't actually need
>>>>> to hold the lock when adding a new instance to the CPU hotplug
>>>>> subsystem.
>>>>>
>>>>> Fix this possible deadlock scenario by releasing the lock before
>>>>> calling cpuhp_state_add_instance_nocalls() and reacquiring it
>>>>> afterward.
>>>>> To avoid the possibility of 2 racing dmc620_pmu_get_irq() calls
>>>>> inserting
>>>>> duplicated dmc620_pmu_irq structures with the same irq number, a dummy
>>>>> entry is inserted before releasing the lock which will block a
>>>>> competing
>>>>> thread from inserting another irq structure of the same irq number.
>>>>>
>>>>> Suggested-by: Robin Murphy <[email protected]>
>>>>> Signed-off-by: Waiman Long <[email protected]>
>>>>> ---
>>>>> drivers/perf/arm_dmc620_pmu.c | 28 ++++++++++++++++++++++------
>>>>> 1 file changed, 22 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/drivers/perf/arm_dmc620_pmu.c
>>>>> b/drivers/perf/arm_dmc620_pmu.c
>>>>> index 9d0f01c4455a..7cafd4dd4522 100644
>>>>> --- a/drivers/perf/arm_dmc620_pmu.c
>>>>> +++ b/drivers/perf/arm_dmc620_pmu.c
>>>>> @@ -76,6 +76,7 @@ struct dmc620_pmu_irq {
>>>>> refcount_t refcount;
>>>>> unsigned int irq_num;
>>>>> unsigned int cpu;
>>>>> + unsigned int valid;
>>>>> };
>>>>> struct dmc620_pmu {
>>>>> @@ -423,9 +424,14 @@ static struct dmc620_pmu_irq
>>>>> *__dmc620_pmu_get_irq(int irq_num)
>>>>> struct dmc620_pmu_irq *irq;
>>>>> int ret;
>>>>> - list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node)
>>>>> - if (irq->irq_num == irq_num &&
>>>>> refcount_inc_not_zero(&irq->refcount))
>>>>> + list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) {
>>>>> + if (irq->irq_num != irq_num)
>>>>> + continue;
>>>>> + if (!irq->valid)
>>>>> + return ERR_PTR(-EAGAIN); /* Try again later */
>>>> It looks like this can bubble up to the probe() routine. Does the driver
>>>> core handle -EAGAIN coming back from a probe routine?
>>> Right, I should add code to handle this error condition. I think it can
>>> be handled in dmc620_pmu_get_irq(). The important thing is to release
>>> the mutex, wait a few ms and try again. What do you think?
>>>>> + if (refcount_inc_not_zero(&irq->refcount))
>>>>> return irq;
>>>>> + }
>>>>> irq = kzalloc(sizeof(*irq), GFP_KERNEL);
>>>>> if (!irq)
>>>>> @@ -447,13 +453,23 @@ static struct dmc620_pmu_irq
>>>>> *__dmc620_pmu_get_irq(int irq_num)
>>>>> if (ret)
>>>>> goto out_free_irq;
>>>>> - ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num,
>>>>> &irq->node);
>>>>> - if (ret)
>>>>> - goto out_free_irq;
>>>>> -
>>>>> irq->irq_num = irq_num;
>>>>> list_add(&irq->irqs_node, &dmc620_pmu_irqs);
>>>>> + /*
>>>>> + * Release dmc620_pmu_irqs_lock before calling
>>>>> + * cpuhp_state_add_instance_nocalls() and reacquire it afterward.
>>>>> + */
>>>>> + mutex_unlock(&dmc620_pmu_irqs_lock);
>>>>> + ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num,
>>>>> &irq->node);
>>>>> + mutex_lock(&dmc620_pmu_irqs_lock);
>>>>> +
>>>>> + if (ret) {
>>>>> + list_del(&irq->irqs_node);
>>>>> + goto out_free_irq;
>>>>> + }
>>>>> +
>>>>> + irq->valid = true;
>>>> Do you actually need a new flag here, or could we use a refcount of zero
>>>> to indicate that the irq descriptor is still being constructed?
>>> A refcount of zero can also mean that an existing irq is about to be
>>> removed. Right? So I don't think we can use that for this purpose.
>>> Besides, there is a 4-byte hole in the structure anyway for arm64.
>> Alternatively, I can use a special reference count value, say -1, to signal
>> that the irq is not valid yet. What do you think?
> If the device is being removed, we should teardown the irq handler first,
> so I don't see why the refcount isn't the right thing.
According to the current code, a refcount of 0 will cause the caller to
skip the entry and eventually create a new irq itself. That may cause
the creation of 2 dmc620_pmu_irq structures with the same irq number.
Will that be a problem? The reason why I see a problem with a refcount
of 0 because it can now signal both the creation of the new irq or the
retirement of an old irq that is to be teared down.
Cheers,
Longman