On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
any I/O adapter results in the following warning
This problem has been in the code for some time now. I had first seen this in
-next tree.
[ 269.589441] rpadlpar_io: slot PHB 72 removed
[ 270.589997] refcount_t: underflow; use-after-free.
[ 270.590019] ------------[ cut here ]------------
[ 270.590025] WARNING: CPU: 5 PID: 3335 at lib/refcount.c:128 refcount_sub_and_test+0xf4/0x110
[ 270.590028] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc rpadlpar_io rpaphp kvm_pr kvm ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag ghash_generic xts gf128mul vmx_crypto tpm_ibmvtpm tpm sg pseries_rng nfsd auth_rpcgss nfs_acl lockd grace sunrpc binfmt_misc ip_tables xfs libcrc32c sr_mod sd_mod cdrom ibmvscsi ibmveth scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod
[ 270.590076] CPU: 5 PID: 3335 Comm: drmgr Not tainted 4.11.0-rc5 #3
[ 270.590079] task: c0000005d8df8600 task.stack: c0000000fb3a8000
[ 270.590081] NIP: c000000001aa3ca4 LR: c000000001aa3ca0 CTR: 00000000006338e4
[ 270.590084] REGS: c0000000fb3ab8a0 TRAP: 0700 Not tainted (4.11.0-rc5)
[ 270.590087] MSR: 8000000000029033 <SF,EE,ME,IR,DR,RI,LE>
[ 270.590090] CR: 22002422 XER: 00000007
[ 270.590093] CFAR: c000000001edaabc SOFTE: 1
[ 270.590093] GPR00: c000000001aa3ca0 c0000000fb3abb20 c0000000025ea900 0000000000000026
[ 270.590093] GPR04: c00000077fc4ada0 c00000077fc617b8 00000000000f0c33 0000000000000000
[ 270.590093] GPR08: 0000000000000000 c00000000227146c 000000077d9e0000 0000000000003ff0
[ 270.590093] GPR12: 0000000000002200 c00000000e802d00 0000000000000000 0000000000000000
[ 270.590093] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 270.590093] GPR20: 0000000000000000 000000001001b5a8 0000000010018338 0000000010016650
[ 270.590093] GPR24: 000000001001b278 c000000776e0fdcc 0000000010016650 0000000000000000
[ 270.590093] GPR28: c00000077ffea910 c0000000fbf79180 c000000776e0fdc0 c0000000fbf791d8
[ 270.590126] NIP [c000000001aa3ca4] refcount_sub_and_test+0xf4/0x110
[ 270.590129] LR [c000000001aa3ca0] refcount_sub_and_test+0xf0/0x110
[ 270.590132] Call Trace:
[ 270.590134] [c0000000fb3abb20] [c000000001aa3ca0] refcount_sub_and_test+0xf0/0x110 (unreliable)
[ 270.590139] [c0000000fb3abb80] [c000000001a8221c] kobject_put+0x3c/0xa0
[ 270.590143] [c0000000fb3abbf0] [c000000001d22d34] of_node_put+0x24/0x40
[ 270.590147] [c0000000fb3abc10] [c00000000165c874] ofdt_write+0x204/0x6b0
[ 270.590151] [c0000000fb3abcd0] [c00000000197a220] proc_reg_write+0x80/0xd0
[ 270.590155] [c0000000fb3abd00] [c0000000018de680] __vfs_write+0x40/0x1c0
[ 270.590158] [c0000000fb3abd90] [c0000000018dffd8] vfs_write+0xc8/0x240
[ 270.590162] [c0000000fb3abde0] [c0000000018e1c40] SyS_write+0x60/0x110
[ 270.590165] [c0000000fb3abe30] [c0000000015cb184] system_call+0x38/0xe0
[ 270.590168] Instruction dump:
[ 270.590170] 7863d182 4e800020 7c0802a6 39200001 3d42fff8 3c62ffb1 386371a8 992a0171
[ 270.590175] f8010010 f821ffa1 48436de1 60000000 <0fe00000> 38210060 38600000 e8010010
[ 270.590180] ---[ end trace 08c7a2f3c8bead33 ]—
Have attached the dmesg log from the system. Let me know if any additional
information is required to help debug this problem.
Thanks
-Sachin
On 04/06/2017 03:27 AM, Sachin Sant wrote:
> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
> any I/O adapter results in the following warning
>
> This problem has been in the code for some time now. I had first seen this in
> -next tree.
>
> [ 269.589441] rpadlpar_io: slot PHB 72 removed
> [ 270.589997] refcount_t: underflow; use-after-free.
> [ 270.590019] ------------[ cut here ]------------
> [ 270.590025] WARNING: CPU: 5 PID: 3335 at lib/refcount.c:128 refcount_sub_and_test+0xf4/0x110
> [ 270.590028] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc rpadlpar_io rpaphp kvm_pr kvm ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag ghash_generic xts gf128mul vmx_crypto tpm_ibmvtpm tpm sg pseries_rng nfsd auth_rpcgss nfs_acl lockd grace sunrpc binfmt_misc ip_tables xfs libcrc32c sr_mod sd_mod cdrom ibmvscsi ibmveth scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod
> [ 270.590076] CPU: 5 PID: 3335 Comm: drmgr Not tainted 4.11.0-rc5 #3
> [ 270.590079] task: c0000005d8df8600 task.stack: c0000000fb3a8000
> [ 270.590081] NIP: c000000001aa3ca4 LR: c000000001aa3ca0 CTR: 00000000006338e4
> [ 270.590084] REGS: c0000000fb3ab8a0 TRAP: 0700 Not tainted (4.11.0-rc5)
> [ 270.590087] MSR: 8000000000029033 <SF,EE,ME,IR,DR,RI,LE>
> [ 270.590090] CR: 22002422 XER: 00000007
> [ 270.590093] CFAR: c000000001edaabc SOFTE: 1
> [ 270.590093] GPR00: c000000001aa3ca0 c0000000fb3abb20 c0000000025ea900 0000000000000026
> [ 270.590093] GPR04: c00000077fc4ada0 c00000077fc617b8 00000000000f0c33 0000000000000000
> [ 270.590093] GPR08: 0000000000000000 c00000000227146c 000000077d9e0000 0000000000003ff0
> [ 270.590093] GPR12: 0000000000002200 c00000000e802d00 0000000000000000 0000000000000000
> [ 270.590093] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [ 270.590093] GPR20: 0000000000000000 000000001001b5a8 0000000010018338 0000000010016650
> [ 270.590093] GPR24: 000000001001b278 c000000776e0fdcc 0000000010016650 0000000000000000
> [ 270.590093] GPR28: c00000077ffea910 c0000000fbf79180 c000000776e0fdc0 c0000000fbf791d8
> [ 270.590126] NIP [c000000001aa3ca4] refcount_sub_and_test+0xf4/0x110
> [ 270.590129] LR [c000000001aa3ca0] refcount_sub_and_test+0xf0/0x110
> [ 270.590132] Call Trace:
> [ 270.590134] [c0000000fb3abb20] [c000000001aa3ca0] refcount_sub_and_test+0xf0/0x110 (unreliable)
> [ 270.590139] [c0000000fb3abb80] [c000000001a8221c] kobject_put+0x3c/0xa0
> [ 270.590143] [c0000000fb3abbf0] [c000000001d22d34] of_node_put+0x24/0x40
> [ 270.590147] [c0000000fb3abc10] [c00000000165c874] ofdt_write+0x204/0x6b0
> [ 270.590151] [c0000000fb3abcd0] [c00000000197a220] proc_reg_write+0x80/0xd0
> [ 270.590155] [c0000000fb3abd00] [c0000000018de680] __vfs_write+0x40/0x1c0
> [ 270.590158] [c0000000fb3abd90] [c0000000018dffd8] vfs_write+0xc8/0x240
> [ 270.590162] [c0000000fb3abde0] [c0000000018e1c40] SyS_write+0x60/0x110
> [ 270.590165] [c0000000fb3abe30] [c0000000015cb184] system_call+0x38/0xe0
> [ 270.590168] Instruction dump:
> [ 270.590170] 7863d182 4e800020 7c0802a6 39200001 3d42fff8 3c62ffb1 386371a8 992a0171
> [ 270.590175] f8010010 f821ffa1 48436de1 60000000 <0fe00000> 38210060 38600000 e8010010
> [ 270.590180] ---[ end trace 08c7a2f3c8bead33 ]—
>
> Have attached the dmesg log from the system. Let me know if any additional
> information is required to help debug this problem.
I remember you mentioning this when the issue was brought up for CPUs. I
assume the case is the same here where the issue is only seen with
adapters that were hot-added after boot (ie. hot-remove of adapter
present at boot doesn't trip the warning)?
-Tyrel
>
> Thanks
> -Sachin
>
>
Tyrel Datwyler <[email protected]> writes:
> On 04/06/2017 03:27 AM, Sachin Sant wrote:
>> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
>> any I/O adapter results in the following warning
>>
>> This problem has been in the code for some time now. I had first seen this in
>> -next tree.
>>
>> [ 269.589441] rpadlpar_io: slot PHB 72 removed
>> [ 270.589997] refcount_t: underflow; use-after-free.
>> [ 270.590019] ------------[ cut here ]------------
>> [ 270.590025] WARNING: CPU: 5 PID: 3335 at lib/refcount.c:128 refcount_sub_and_test+0xf4/0x110
>> [ 270.590028] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc rpadlpar_io rpaphp kvm_pr kvm ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag ghash_generic xts gf128mul vmx_crypto tpm_ibmvtpm tpm sg pseries_rng nfsd auth_rpcgss nfs_acl lockd grace sunrpc binfmt_misc ip_tables xfs libcrc32c sr_mod sd_mod cdrom ibmvscsi ibmveth scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod
>> [ 270.590076] CPU: 5 PID: 3335 Comm: drmgr Not tainted 4.11.0-rc5 #3
>> [ 270.590079] task: c0000005d8df8600 task.stack: c0000000fb3a8000
>> [ 270.590081] NIP: c000000001aa3ca4 LR: c000000001aa3ca0 CTR: 00000000006338e4
>> [ 270.590084] REGS: c0000000fb3ab8a0 TRAP: 0700 Not tainted (4.11.0-rc5)
>> [ 270.590087] MSR: 8000000000029033 <SF,EE,ME,IR,DR,RI,LE>
>> [ 270.590090] CR: 22002422 XER: 00000007
>> [ 270.590093] CFAR: c000000001edaabc SOFTE: 1
>> [ 270.590093] GPR00: c000000001aa3ca0 c0000000fb3abb20 c0000000025ea900 0000000000000026
>> [ 270.590093] GPR04: c00000077fc4ada0 c00000077fc617b8 00000000000f0c33 0000000000000000
>> [ 270.590093] GPR08: 0000000000000000 c00000000227146c 000000077d9e0000 0000000000003ff0
>> [ 270.590093] GPR12: 0000000000002200 c00000000e802d00 0000000000000000 0000000000000000
>> [ 270.590093] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>> [ 270.590093] GPR20: 0000000000000000 000000001001b5a8 0000000010018338 0000000010016650
>> [ 270.590093] GPR24: 000000001001b278 c000000776e0fdcc 0000000010016650 0000000000000000
>> [ 270.590093] GPR28: c00000077ffea910 c0000000fbf79180 c000000776e0fdc0 c0000000fbf791d8
>> [ 270.590126] NIP [c000000001aa3ca4] refcount_sub_and_test+0xf4/0x110
>> [ 270.590129] LR [c000000001aa3ca0] refcount_sub_and_test+0xf0/0x110
>> [ 270.590132] Call Trace:
>> [ 270.590134] [c0000000fb3abb20] [c000000001aa3ca0] refcount_sub_and_test+0xf0/0x110 (unreliable)
>> [ 270.590139] [c0000000fb3abb80] [c000000001a8221c] kobject_put+0x3c/0xa0
>> [ 270.590143] [c0000000fb3abbf0] [c000000001d22d34] of_node_put+0x24/0x40
>> [ 270.590147] [c0000000fb3abc10] [c00000000165c874] ofdt_write+0x204/0x6b0
>> [ 270.590151] [c0000000fb3abcd0] [c00000000197a220] proc_reg_write+0x80/0xd0
>> [ 270.590155] [c0000000fb3abd00] [c0000000018de680] __vfs_write+0x40/0x1c0
>> [ 270.590158] [c0000000fb3abd90] [c0000000018dffd8] vfs_write+0xc8/0x240
>> [ 270.590162] [c0000000fb3abde0] [c0000000018e1c40] SyS_write+0x60/0x110
>> [ 270.590165] [c0000000fb3abe30] [c0000000015cb184] system_call+0x38/0xe0
>> [ 270.590168] Instruction dump:
>> [ 270.590170] 7863d182 4e800020 7c0802a6 39200001 3d42fff8 3c62ffb1 386371a8 992a0171
>> [ 270.590175] f8010010 f821ffa1 48436de1 60000000 <0fe00000> 38210060 38600000 e8010010
>> [ 270.590180] ---[ end trace 08c7a2f3c8bead33 ]—
>>
>> Have attached the dmesg log from the system. Let me know if any additional
>> information is required to help debug this problem.
>
> I remember you mentioning this when the issue was brought up for CPUs. I
> assume the case is the same here where the issue is only seen with
> adapters that were hot-added after boot (ie. hot-remove of adapter
> present at boot doesn't trip the warning)?
So who's fixing this?
cheers
> On 07-Apr-2017, at 2:14 AM, Tyrel Datwyler <[email protected]> wrote:
>
> On 04/06/2017 03:27 AM, Sachin Sant wrote:
>> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
>> any I/O adapter results in the following warning
>
> I remember you mentioning this when the issue was brought up for CPUs. I
> assume the case is the same here where the issue is only seen with
> adapters that were hot-added after boot (ie. hot-remove of adapter
> present at boot doesn't trip the warning)?
>
Correct, can be recreated only with adapters that were hot-added after boot.
> -Tyrel
>
>>
>> Thanks
>> -Sachin
>>
>>
>
On 04/06/2017 09:04 PM, Michael Ellerman wrote:
> Tyrel Datwyler <[email protected]> writes:
>
>> On 04/06/2017 03:27 AM, Sachin Sant wrote:
>>> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
>>> any I/O adapter results in the following warning
>>>
>>> This problem has been in the code for some time now. I had first seen this in
>>> -next tree.
>>>
<snip>
>>> Have attached the dmesg log from the system. Let me know if any additional
>>> information is required to help debug this problem.
>>
>> I remember you mentioning this when the issue was brought up for CPUs. I
>> assume the case is the same here where the issue is only seen with
>> adapters that were hot-added after boot (ie. hot-remove of adapter
>> present at boot doesn't trip the warning)?
>
> So who's fixing this?
I started looking at it when Bharata submitted a patch trying to fix the
issue for CPUs, but got side tracked by other things. I suspect that
this underflow has actually been an issue for quite some time, and we
are just now becoming aware of it thanks to the recount_t patchset being
merged. I'll look into it again this week.
-Tyrel
>
> cheers
>
Tyrel Datwyler <[email protected]> writes:
> On 04/06/2017 09:04 PM, Michael Ellerman wrote:
>> Tyrel Datwyler <[email protected]> writes:
>>
>>> On 04/06/2017 03:27 AM, Sachin Sant wrote:
>>>> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
>>>> any I/O adapter results in the following warning
>>>>
>>>> This problem has been in the code for some time now. I had first seen this in
>>>> -next tree.
>>>>
>
> <snip>
>
>>>> Have attached the dmesg log from the system. Let me know if any additional
>>>> information is required to help debug this problem.
>>>
>>> I remember you mentioning this when the issue was brought up for CPUs. I
>>> assume the case is the same here where the issue is only seen with
>>> adapters that were hot-added after boot (ie. hot-remove of adapter
>>> present at boot doesn't trip the warning)?
>>
>> So who's fixing this?
>
> I started looking at it when Bharata submitted a patch trying to fix the
> issue for CPUs, but got side tracked by other things. I suspect that
> this underflow has actually been an issue for quite some time, and we
> are just now becoming aware of it thanks to the recount_t patchset being
> merged.
Yes I agree. Which means it might be broken in existing distros.
> I'll look into it again this week.
Thanks.
cheers
On 04/11/2017 02:00 AM, Michael Ellerman wrote:
> Tyrel Datwyler <[email protected]> writes:
>
>> On 04/06/2017 09:04 PM, Michael Ellerman wrote:
>>> Tyrel Datwyler <[email protected]> writes:
>>>
>>>> On 04/06/2017 03:27 AM, Sachin Sant wrote:
>>>>> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
>>>>> any I/O adapter results in the following warning
>>>>>
>>>>> This problem has been in the code for some time now. I had first seen this in
>>>>> -next tree.
>>>>>
>>
>> <snip>
>>
>>>>> Have attached the dmesg log from the system. Let me know if any additional
>>>>> information is required to help debug this problem.
>>>>
>>>> I remember you mentioning this when the issue was brought up for CPUs. I
>>>> assume the case is the same here where the issue is only seen with
>>>> adapters that were hot-added after boot (ie. hot-remove of adapter
>>>> present at boot doesn't trip the warning)?
>>>
>>> So who's fixing this?
>>
>> I started looking at it when Bharata submitted a patch trying to fix the
>> issue for CPUs, but got side tracked by other things. I suspect that
>> this underflow has actually been an issue for quite some time, and we
>> are just now becoming aware of it thanks to the recount_t patchset being
>> merged.
>
> Yes I agree. Which means it might be broken in existing distros.
Definitely. I did some profiling last night, and I understand the
hotplug case. It turns out to be as I suggested in the original thread
about CPUs. When the devicetree code was worked to move the tree out of
proc and into sysfs the sysfs detach code added a of_node_put to remove
the original of_init reference. pSeries Being the sole original
*dynamic* device tree user we had always issued a of_node_put in our
dlpar specific detach function to achieve that end. So, this should be a
pretty straight forward trivial fix.
However, for the case where devices are present at boot it appears we a
leaking a lot of references resulting in the device nodes never actually
being released/freed after a dlpar remove. In the CPU case after boot I
count 8 more references taken than the hotplug case, and corresponding
of_node_put's are not called at dlpar remove time either. That will take
some time to track them down, review and clean up.
-Tyrel
>
>> I'll look into it again this week.
>
> Thanks.
>
> cheers
>
Tyrel Datwyler <[email protected]> writes:
> On 04/11/2017 02:00 AM, Michael Ellerman wrote:
>> Tyrel Datwyler <[email protected]> writes:
>>> I started looking at it when Bharata submitted a patch trying to fix the
>>> issue for CPUs, but got side tracked by other things. I suspect that
>>> this underflow has actually been an issue for quite some time, and we
>>> are just now becoming aware of it thanks to the recount_t patchset being
>>> merged.
>>
>> Yes I agree. Which means it might be broken in existing distros.
>
> Definitely. I did some profiling last night, and I understand the
> hotplug case. It turns out to be as I suggested in the original thread
> about CPUs. When the devicetree code was worked to move the tree out of
> proc and into sysfs the sysfs detach code added a of_node_put to remove
> the original of_init reference. pSeries Being the sole original
> *dynamic* device tree user we had always issued a of_node_put in our
> dlpar specific detach function to achieve that end. So, this should be a
> pretty straight forward trivial fix.
Excellent, thanks.
> However, for the case where devices are present at boot it appears we a
> leaking a lot of references resulting in the device nodes never actually
> being released/freed after a dlpar remove. In the CPU case after boot I
> count 8 more references taken than the hotplug case, and corresponding
> of_node_put's are not called at dlpar remove time either. That will take
> some time to track them down, review and clean up.
Yes that is a perennial problem unfortunately which we've never come up
with a good solution for.
The (old) patch below might help track some of them down. I remember
having a script to process the output of the trace and find mismatches,
but I can't find it right now - but I'm sure you can hack up something
:)
cheers
diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index 32e36b16773f..ad32365082a0 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -168,6 +168,44 @@ TRACE_EVENT(hash_fault,
__entry->addr, __entry->access, __entry->trap)
);
+TRACE_EVENT(of_node_get,
+
+ TP_PROTO(struct device_node *dn, int val),
+
+ TP_ARGS(dn, val),
+
+ TP_STRUCT__entry(
+ __field(struct device_node *, dn)
+ __field(int, val)
+ ),
+
+ TP_fast_assign(
+ __entry->dn = dn;
+ __entry->val = val;
+ ),
+
+ TP_printk("get %d -> %d %s", __entry->val - 1, __entry->val, __entry->dn->full_name)
+);
+
+TRACE_EVENT(of_node_put,
+
+ TP_PROTO(struct device_node *dn, int val),
+
+ TP_ARGS(dn, val),
+
+ TP_STRUCT__entry(
+ __field(struct device_node *, dn)
+ __field(int, val)
+ ),
+
+ TP_fast_assign(
+ __entry->dn = dn;
+ __entry->val = val;
+ ),
+
+ TP_printk("put %d -> %d %s", __entry->val + 1, __entry->val, __entry->dn->full_name)
+);
+
#endif /* _TRACE_POWERPC_H */
#undef TRACE_INCLUDE_PATH
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index c647bd1b6903..f5c3d761f3cd 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -14,6 +14,8 @@
#include "of_private.h"
+#include <asm/trace.h>
+
/**
* of_node_get() - Increment refcount of a node
* @node: Node to inc refcount, NULL is supported to simplify writing of
@@ -23,8 +25,12 @@
*/
struct device_node *of_node_get(struct device_node *node)
{
- if (node)
+ if (node) {
kobject_get(&node->kobj);
+
+ trace_of_node_get(node, atomic_read(&node->kobj.kref.refcount));
+ }
+
return node;
}
EXPORT_SYMBOL(of_node_get);
@@ -36,8 +42,10 @@ EXPORT_SYMBOL(of_node_get);
*/
void of_node_put(struct device_node *node)
{
- if (node)
+ if (node) {
kobject_put(&node->kobj);
+ trace_of_node_put(node, atomic_read(&node->kobj.kref.refcount));
+ }
}
EXPORT_SYMBOL(of_node_put);
On 04/11/2017 07:10 PM, Michael Ellerman wrote:
> Tyrel Datwyler <[email protected]> writes:
>> On 04/11/2017 02:00 AM, Michael Ellerman wrote:
>>> Tyrel Datwyler <[email protected]> writes:
>>>> I started looking at it when Bharata submitted a patch trying to fix the
>>>> issue for CPUs, but got side tracked by other things. I suspect that
>>>> this underflow has actually been an issue for quite some time, and we
>>>> are just now becoming aware of it thanks to the recount_t patchset being
>>>> merged.
>>>
>>> Yes I agree. Which means it might be broken in existing distros.
>>
>> Definitely. I did some profiling last night, and I understand the
>> hotplug case. It turns out to be as I suggested in the original thread
>> about CPUs. When the devicetree code was worked to move the tree out of
>> proc and into sysfs the sysfs detach code added a of_node_put to remove
>> the original of_init reference. pSeries Being the sole original
>> *dynamic* device tree user we had always issued a of_node_put in our
>> dlpar specific detach function to achieve that end. So, this should be a
>> pretty straight forward trivial fix.
>
> Excellent, thanks.
>
>> However, for the case where devices are present at boot it appears we a
>> leaking a lot of references resulting in the device nodes never actually
>> being released/freed after a dlpar remove. In the CPU case after boot I
>> count 8 more references taken than the hotplug case, and corresponding
>> of_node_put's are not called at dlpar remove time either. That will take
>> some time to track them down, review and clean up.
I found our reference leak. In topology_init() we call register_cpu()
for each possible logical cpu id. For any logical cpu present a
reference to the device node of the cpu core is grabbed and added to
cpu->dev.of_node. Which matches what I'm seeing on a Power8 lpar, 8
extraneous references which is equal to the 8 hardware threads of a core.
>
> Yes that is a perennial problem unfortunately which we've never come up
> with a good solution for.
>
> The (old) patch below might help track some of them down. I remember
> having a script to process the output of the trace and find mismatches,
> but I can't find it right now - but I'm sure you can hack up something
> :)
Haha, this patch is almost identical to what I hacked up Monday to get
an idea of where the refcounts were at. Probably wouldn't hurt to try
and upstream it into the driver/of tree.
-Tyrel
>
> cheers
>
>
> diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
> index 32e36b16773f..ad32365082a0 100644
> --- a/arch/powerpc/include/asm/trace.h
> +++ b/arch/powerpc/include/asm/trace.h
> @@ -168,6 +168,44 @@ TRACE_EVENT(hash_fault,
> __entry->addr, __entry->access, __entry->trap)
> );
>
> +TRACE_EVENT(of_node_get,
> +
> + TP_PROTO(struct device_node *dn, int val),
> +
> + TP_ARGS(dn, val),
> +
> + TP_STRUCT__entry(
> + __field(struct device_node *, dn)
> + __field(int, val)
> + ),
> +
> + TP_fast_assign(
> + __entry->dn = dn;
> + __entry->val = val;
> + ),
> +
> + TP_printk("get %d -> %d %s", __entry->val - 1, __entry->val, __entry->dn->full_name)
> +);
> +
> +TRACE_EVENT(of_node_put,
> +
> + TP_PROTO(struct device_node *dn, int val),
> +
> + TP_ARGS(dn, val),
> +
> + TP_STRUCT__entry(
> + __field(struct device_node *, dn)
> + __field(int, val)
> + ),
> +
> + TP_fast_assign(
> + __entry->dn = dn;
> + __entry->val = val;
> + ),
> +
> + TP_printk("put %d -> %d %s", __entry->val + 1, __entry->val, __entry->dn->full_name)
> +);
> +
> #endif /* _TRACE_POWERPC_H */
>
> #undef TRACE_INCLUDE_PATH
> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> index c647bd1b6903..f5c3d761f3cd 100644
> --- a/drivers/of/dynamic.c
> +++ b/drivers/of/dynamic.c
> @@ -14,6 +14,8 @@
>
> #include "of_private.h"
>
> +#include <asm/trace.h>
> +
> /**
> * of_node_get() - Increment refcount of a node
> * @node: Node to inc refcount, NULL is supported to simplify writing of
> @@ -23,8 +25,12 @@
> */
> struct device_node *of_node_get(struct device_node *node)
> {
> - if (node)
> + if (node) {
> kobject_get(&node->kobj);
> +
> + trace_of_node_get(node, atomic_read(&node->kobj.kref.refcount));
> + }
> +
> return node;
> }
> EXPORT_SYMBOL(of_node_get);
> @@ -36,8 +42,10 @@ EXPORT_SYMBOL(of_node_get);
> */
> void of_node_put(struct device_node *node)
> {
> - if (node)
> + if (node) {
> kobject_put(&node->kobj);
> + trace_of_node_put(node, atomic_read(&node->kobj.kref.refcount));
> + }
> }
> EXPORT_SYMBOL(of_node_put);
>
>