2022-09-23 08:04:16

by Kassey Li

[permalink] [raw]
Subject: [PATCH v2] cgroup: align the comm length with TASK_COMM_LEN

__string could get a dst string with length less than
TASK_COMM_LEN.

A task->comm may change that can cause out of bounds access
for the dst string buffer, e.g in the call trace of below:

Call trace:

dump_backtrace.cfi_jt+0x0/0x4
show_stack+0x14/0x1c
dump_stack+0xa0/0xd8
die_callback+0x248/0x24c
notify_die+0x7c/0xf8
die+0xac/0x290
die_kernel_fault+0x88/0x98
die_kernel_fault+0x0/0x98
do_page_fault+0xa0/0x544
do_mem_abort+0x60/0x10c
el1_da+0x1c/0xc4
trace_event_raw_event_cgroup_migrate+0x124/0x170
cgroup_attach_task+0x2e8/0x41c
__cgroup1_procs_write+0x114/0x1ec
cgroup1_tasks_write+0x10/0x18
cgroup_file_write+0xa4/0x208
kernfs_fop_write+0x1f0/0x2f4
__vfs_write+0x5c/0x200
vfs_write+0xe0/0x1a0
ksys_write+0x74/0xdc
__arm64_sys_write+0x18/0x20
el0_svc_common+0xc0/0x1a4
el0_svc_compat_handler+0x18/0x20
el0_svc_compat+0x8/0x2c

Change it as arrary with same length TASK_COMM_LEN,
This idea is from commit d1eb650ff413 ("tracepoint: Move signal sending
tracepoint to events/signal.h").

Signed-off-by: Kassey Li <[email protected]>
---
include/trace/events/cgroup.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index dd7d7c9efecd..b4ef0ffa38a4 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -130,7 +130,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
__field( u64, dst_id )
__field( int, pid )
__string( dst_path, path )
- __string( comm, task->comm )
+ __array(char, comm, TASK_COMM_LEN)
),

TP_fast_assign(
@@ -139,12 +139,12 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
__entry->dst_level = dst_cgrp->level;
__assign_str(dst_path, path);
__entry->pid = task->pid;
- __assign_str(comm, task->comm);
+ memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
),

TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s",
__entry->dst_root, __entry->dst_id, __entry->dst_level,
- __get_str(dst_path), __entry->pid, __get_str(comm))
+ __get_str(dst_path), __entry->pid, __entry->comm)
);

DEFINE_EVENT(cgroup_migrate, cgroup_attach_task,
--
2.17.1


2022-09-23 15:44:55

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v2] cgroup: align the comm length with TASK_COMM_LEN

On Fri, 23 Sep 2022 15:51:05 +0800
Kassey Li <[email protected]> wrote:

> __string could get a dst string with length less than
> TASK_COMM_LEN.
>
> A task->comm may change that can cause out of bounds access
> for the dst string buffer, e.g in the call trace of below:
>
> Call trace:
>
> dump_backtrace.cfi_jt+0x0/0x4
> show_stack+0x14/0x1c
> dump_stack+0xa0/0xd8
> die_callback+0x248/0x24c
> notify_die+0x7c/0xf8
> die+0xac/0x290
> die_kernel_fault+0x88/0x98
> die_kernel_fault+0x0/0x98
> do_page_fault+0xa0/0x544
> do_mem_abort+0x60/0x10c
> el1_da+0x1c/0xc4
> trace_event_raw_event_cgroup_migrate+0x124/0x170
> cgroup_attach_task+0x2e8/0x41c
> __cgroup1_procs_write+0x114/0x1ec
> cgroup1_tasks_write+0x10/0x18
> cgroup_file_write+0xa4/0x208
> kernfs_fop_write+0x1f0/0x2f4
> __vfs_write+0x5c/0x200
> vfs_write+0xe0/0x1a0
> ksys_write+0x74/0xdc
> __arm64_sys_write+0x18/0x20
> el0_svc_common+0xc0/0x1a4
> el0_svc_compat_handler+0x18/0x20
> el0_svc_compat+0x8/0x2c
>
> Change it as arrary with same length TASK_COMM_LEN,
> This idea is from commit d1eb650ff413 ("tracepoint: Move signal sending
> tracepoint to events/signal.h").

This does not make sense. What exactly is the bug here?

__string() will do a strlen(task->comm) + 1 to allocate on the ring buffer.
It should not be less that task->comm. The above stack dump does not show
what happened.

This looks like another bug and I do not see how this patch addresses
the issue.

-- Steve

>
> Signed-off-by: Kassey Li <[email protected]>
> ---
> include/trace/events/cgroup.h | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
> index dd7d7c9efecd..b4ef0ffa38a4 100644
> --- a/include/trace/events/cgroup.h
> +++ b/include/trace/events/cgroup.h
> @@ -130,7 +130,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
> __field( u64, dst_id )
> __field( int, pid )
> __string( dst_path, path )
> - __string( comm, task->comm )
> + __array(char, comm, TASK_COMM_LEN)
> ),
>
> TP_fast_assign(
> @@ -139,12 +139,12 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
> __entry->dst_level = dst_cgrp->level;
> __assign_str(dst_path, path);
> __entry->pid = task->pid;
> - __assign_str(comm, task->comm);
> + memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
> ),
>
> TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s",
> __entry->dst_root, __entry->dst_id, __entry->dst_level,
> - __get_str(dst_path), __entry->pid, __get_str(comm))
> + __get_str(dst_path), __entry->pid, __entry->comm)
> );
>
> DEFINE_EVENT(cgroup_migrate, cgroup_attach_task,

2022-09-26 03:19:09

by Kassey Li

[permalink] [raw]
Subject: Re: [PATCH v2] cgroup: align the comm length with TASK_COMM_LEN



On 9/23/2022 11:00 PM, Steven Rostedt wrote:
> On Fri, 23 Sep 2022 15:51:05 +0800
> Kassey Li <[email protected]> wrote:
>
>> __string could get a dst string with length less than
>> TASK_COMM_LEN.
>>
>> A task->comm may change that can cause out of bounds access
>> for the dst string buffer, e.g in the call trace of below:
>>
>> Call trace:
>>
>> dump_backtrace.cfi_jt+0x0/0x4
>> show_stack+0x14/0x1c
>> dump_stack+0xa0/0xd8
>> die_callback+0x248/0x24c
>> notify_die+0x7c/0xf8
>> die+0xac/0x290
>> die_kernel_fault+0x88/0x98
>> die_kernel_fault+0x0/0x98
>> do_page_fault+0xa0/0x544
>> do_mem_abort+0x60/0x10c
>> el1_da+0x1c/0xc4
>> trace_event_raw_event_cgroup_migrate+0x124/0x170
>> cgroup_attach_task+0x2e8/0x41c
>> __cgroup1_procs_write+0x114/0x1ec
>> cgroup1_tasks_write+0x10/0x18
>> cgroup_file_write+0xa4/0x208
>> kernfs_fop_write+0x1f0/0x2f4
>> __vfs_write+0x5c/0x200
>> vfs_write+0xe0/0x1a0
>> ksys_write+0x74/0xdc
>> __arm64_sys_write+0x18/0x20
>> el0_svc_common+0xc0/0x1a4
>> el0_svc_compat_handler+0x18/0x20
>> el0_svc_compat+0x8/0x2c
>>
>> Change it as arrary with same length TASK_COMM_LEN,
>> This idea is from commit d1eb650ff413 ("tracepoint: Move signal sending
>> tracepoint to events/signal.h").
>
> This does not make sense. What exactly is the bug here?
hi, Steven:
hope below info can give you idea on this , let me know if you need
more info.

kernel log:
Unable to handle kernel write to read-only memory at virtual address
ffffffbcf7450000

"SharedPreferenc" is task name/comm.

memory/ddr dump:

FFFFFFBCF744FFE0| 00090020 000B0029 706F742F 7070612D 61685300
50646572 65666572 636E6572 ...).../top-app.SharedPreferenc
FFFFFFBCF7450000|>52800101 97FD3A05 140000B3 AA1303E0 9400193C
B0000F88 90000D89 9137FD08 ...R.:..........<.............7.

trace stack:

-000|strcpy(inline)
-000|trace_event_raw_event_cgroup_migrate
-001|trace_cgroup_attach_task(inline)
-001|cgroup_attach_task()
-002|__read_once_size(inline)
-002|atomic_read(inline)
-002|static_key_count(inline)
-002|static_key_false(inline)
-002|trace_android_vh_cgroup_set_task(inline)
-002|__cgroup1_procs_write()
-003|cgroup1_tasks_write
-004|cgroup_file_write
-005|kernfs_fop_write$
-006|__vfs_write()
-007|vfs_write()
-008|ksys_write()
-009|__se_sys_write(inline)
-009|__arm64_sys_write()
-010|__invoke_syscall(inline)
-010|invoke_syscall(inline)
-010|el0_svc_common()
-011|el0_svc_compat_handler()
-012|el0_svc_compat(asm)




>
> __string() will do a strlen(task->comm) + 1 to allocate on the ring buffer.
> It should not be less that task->comm. The above stack dump does not show
> what happened.
>
> This looks like another bug and I do not see how this patch addresses
> the issue.
>
> -- Steve
>
>>
>> Signed-off-by: Kassey Li <[email protected]>
>> ---
>> include/trace/events/cgroup.h | 6 +++---
>> 1 file changed, 3 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
>> index dd7d7c9efecd..b4ef0ffa38a4 100644
>> --- a/include/trace/events/cgroup.h
>> +++ b/include/trace/events/cgroup.h
>> @@ -130,7 +130,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
>> __field( u64, dst_id )
>> __field( int, pid )
>> __string( dst_path, path )
>> - __string( comm, task->comm )
>> + __array(char, comm, TASK_COMM_LEN)
>> ),
>>
>> TP_fast_assign(
>> @@ -139,12 +139,12 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
>> __entry->dst_level = dst_cgrp->level;
>> __assign_str(dst_path, path);
>> __entry->pid = task->pid;
>> - __assign_str(comm, task->comm);
>> + memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
I think the problem is here, __assign_str using strcpy
the task->comm here tail is not '\0'
that's why it out of bounds access.

do you want to this version or just modify the memcpy or strncpy to do
with a known length ? please give suggest so I can modify .

>> ),
>>
>> TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s",
>> __entry->dst_root, __entry->dst_id, __entry->dst_level,
>> - __get_str(dst_path), __entry->pid, __get_str(comm))
>> + __get_str(dst_path), __entry->pid, __entry->comm)
>> );
>>
>> DEFINE_EVENT(cgroup_migrate, cgroup_attach_task,
>

2022-09-26 04:13:49

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v2] cgroup: align the comm length with TASK_COMM_LEN

On Mon, 26 Sep 2022 10:18:55 +0800
Kassey Li <[email protected]> wrote:

> >> @@ -139,12 +139,12 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
> >> __entry->dst_level = dst_cgrp->level;
> >> __assign_str(dst_path, path);
> >> __entry->pid = task->pid;
> >> - __assign_str(comm, task->comm);
> >> + memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
> I think the problem is here, __assign_str using strcpy
> the task->comm here tail is not '\0'
> that's why it out of bounds access.
>

If this is the case, then there's a lot more than just tracing that will
break. There are other places in the kernel has used strcpy() on task->comm,
and many more that do "%s" on task->comm, which would also crash on this.

> do you want to this version or just modify the memcpy or strncpy to do
> with a known length ? please give suggest so I can modify .

I'm guessing a problem exists elsewhere that makes it look like this is the
issue. I suggest finding where the '\0' is dropped (if that is indeed the
case).

-- Steve

2022-09-26 14:55:18

by Kassey Li

[permalink] [raw]
Subject: Re: [PATCH v2] cgroup: align the comm length with TASK_COMM_LEN



On 9/26/2022 10:42 AM, Steven Rostedt wrote:
> On Mon, 26 Sep 2022 10:18:55 +0800
> Kassey Li <[email protected]> wrote:
>
>>>> @@ -139,12 +139,12 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
>>>> __entry->dst_level = dst_cgrp->level;
>>>> __assign_str(dst_path, path);
>>>> __entry->pid = task->pid;
>>>> - __assign_str(comm, task->comm);
>>>> + memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
>> I think the problem is here, __assign_str using strcpy
>> the task->comm here tail is not '\0'
>> that's why it out of bounds access.
>>
>
> If this is the case, then there's a lot more than just tracing that will
> break. There are other places in the kernel has used strcpy() on task->comm,
> and many more that do "%s" on task->comm, which would also crash on this.

You are right.

by re-check my local logs(arm64), we can see the src has '\0' as end of
string.
but looks strcpy did not catch this and crossed.
I can not figure out how this could happen. if there is debug suggest,
please help.


src: task->comm SharedPreferenc pid 28395
_____________________address|________0________4________8________C_0123456789ABCDEF
NSD:0000::FFFFFFBD1B6C59D0|>72616853 72506465 72656665 00636E65
SharedPreferenc.


dst: trace event buffer:
_____________________address|________0________4________8________C_0123456789ABCDEF
NSD:0000::FFFFFFBCF744FFE0| 00090020 000B0029 706F742F 7070612D
...).../top-app
NSD:0000::FFFFFFBCF744FFF0| 61685300 50646572 65666572 636E6572
.SharedPreferenc
NSD:0000::FFFFFFBCF7450000|>52800101 97FD3A05 140000B3 AA1303E0
...R.:..........

layout of the struct:

[ND:0x0::0xFFFFFFBCF744FFC8] (struct
trace_event_raw_cgroup_migrate)0xFFFFFFBCF744FFc8 = (
[ND:0x0::0xFFFFFFBCF744FFC8] ent = (
[ND:0x0::0xFFFFFFBCF744FFC8] type = 0x98,
[ND:0x0::0xFFFFFFBCF744FFCA] flags = 0x1,
[ND:0x0::0xFFFFFFBCF744FFCB] preempt_count = 0x1,
[ND:0x0::0xFFFFFFBCF744FFCC] pid = 0x0773),
[ND:0x0::0xFFFFFFBCF744FFD0] dst_root = 0x1,
[ND:0x0::0xFFFFFFBCF744FFD4] dst_id = 0x6,
[ND:0x0::0xFFFFFFBCF744FFD8] dst_level = 0x1,
[ND:0x0::0xFFFFFFBCF744FFDC] pid = 28395 = 0x6EEB,
[ND:0x0::0xFFFFFFBCF744FFE0] __data_loc_dst_path = 0x00090020 = '... ',
[ND:0x0::0xFFFFFFBCF744FFE4] __data_loc_comm = 0x000B0029 = '...)',
[ND:0x0::0xFFFFFFBCF744FFE8] __data_=_"/top-app")

name: cgroup_attach_task
ID: 152
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;

field:int dst_root; offset:8; size:4; signed:1;
field:int dst_id; offset:12; size:4; signed:1;
field:int dst_level; offset:16; size:4; signed:1;
field:int pid; offset:20; size:4; signed:1;
field:__data_loc char[] dst_path; offset:24; size:4; signed:0;
field:__data_loc char[] comm; offset:28; size:4; signed:0;


_____________________address|________0________4________8________C_0123456789ABCDEF
NSD:0000::FFFFFFBCF744FFC0| 00656C64 0066D18D>01010098 00000773
dle...f.....s...
NSD:0000::FFFFFFBCF744FFD0| 00000001 00000006 00000001 00006EEB
.............n..
NSD:0000::FFFFFFBCF744FFE0| 00090020 000B0029 706F742F 7070612D
...).../top-app
NSD:0000::FFFFFFBCF744FFF0| 61685300 50646572 65666572 636E6572
.SharedPreferenc
NSD:0000::FFFFFFBCF7450000| 52800101 97FD3A05 140000B3 AA1303E0
...R.:..........

>
>> do you want to this version or just modify the memcpy or strncpy to do
>> with a known length ? please give suggest so I can modify .
>
> I'm guessing a problem exists elsewhere that makes it look like this is the
> issue. I suggest finding where the '\0' is dropped (if that is indeed the
> case).
>
> -- Steve

2022-09-26 16:25:46

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v2] cgroup: align the comm length with TASK_COMM_LEN

On Fri, 23 Sep 2022 15:51:05 +0800
Kassey Li <[email protected]> wrote:

> __string could get a dst string with length less than
> TASK_COMM_LEN.
>
> A task->comm may change that can cause out of bounds access
> for the dst string buffer, e.g in the call trace of below:
>
> Call trace:
>
> dump_backtrace.cfi_jt+0x0/0x4
> show_stack+0x14/0x1c
> dump_stack+0xa0/0xd8
> die_callback+0x248/0x24c
> notify_die+0x7c/0xf8
> die+0xac/0x290
> die_kernel_fault+0x88/0x98
> die_kernel_fault+0x0/0x98
> do_page_fault+0xa0/0x544
> do_mem_abort+0x60/0x10c
> el1_da+0x1c/0xc4

> trace_event_raw_event_cgroup_migrate+0x124/0x170

You're sure the above is on the strcpy()?

Note, this code has __string() which does a strlen() which appears to be
working fine.

> cgroup_attach_task+0x2e8/0x41c
> __cgroup1_procs_write+0x114/0x1ec
> cgroup1_tasks_write+0x10/0x18
> cgroup_file_write+0xa4/0x208
> kernfs_fop_write+0x1f0/0x2f4
> __vfs_write+0x5c/0x200
> vfs_write+0xe0/0x1a0
> ksys_write+0x74/0xdc
> __arm64_sys_write+0x18/0x20
> el0_svc_common+0xc0/0x1a4
> el0_svc_compat_handler+0x18/0x20
> el0_svc_compat+0x8/0x2c

Can you give the full debug report, that includes the register content and
everything else.

-- Steve

2022-09-27 02:19:22

by Kassey Li

[permalink] [raw]
Subject: Re: [PATCH v2] cgroup: align the comm length with TASK_COMM_LEN



On 9/26/2022 10:09 PM, Steven Rostedt wrote:
> On Fri, 23 Sep 2022 15:51:05 +0800
> Kassey Li <[email protected]> wrote:
>
>> __string could get a dst string with length less than
>> TASK_COMM_LEN.
>>
>> A task->comm may change that can cause out of bounds access
>> for the dst string buffer, e.g in the call trace of below:
>>
>> Call trace:
>>
>> dump_backtrace.cfi_jt+0x0/0x4
>> show_stack+0x14/0x1c
>> dump_stack+0xa0/0xd8
>> die_callback+0x248/0x24c
>> notify_die+0x7c/0xf8
>> die+0xac/0x290
>> die_kernel_fault+0x88/0x98
>> die_kernel_fault+0x0/0x98
>> do_page_fault+0xa0/0x544
>> do_mem_abort+0x60/0x10c
>> el1_da+0x1c/0xc4
>
>> trace_event_raw_event_cgroup_migrate+0x124/0x170
>
> You're sure the above is on the strcpy()?
set PC to 0xffffffda401c47d4
and this reflect to the strcpy asm code of aarch64. (lib/string.c)

120|
NSX:0000::FFFFFFDA401C47B8|B946C268
ldr w8,[x19,#0x6C0] ; w8,[task,#1728]
NSX:0000::FFFFFFDA401C47BC|79403809
ldrh w9,[x0,#0x1C] ; w9,[entry,#28]
NSX:0000::FFFFFFDA401C47C0|F10002FF
cmp x23,#0x0 ; x23,#0
NSX:0000::FFFFFFDA401C47C4|B9001408
str w8,[x0,#0x14] ; w8,[entry,#20]
NSX:0000::FFFFFFDA401C47C8|8B090008
add x8,x0,x9 ; x8,entry,x9
NSX:0000::FFFFFFDA401C47CC|9A970349
csel x9,x26,x23,eq
|
|
93|
NSX:0000::FFFFFFDA401C47D0|3840152A
ldrb w10,[x9],#0x1 ; w10,[src],#1

NSX:0000::FFFFFFDA401C47D4|3800150A_________________________________________________________strb____w10,[x8],#0x1____;_w10,[dest],#1
__NSX:0000::FFFFFFDA401C47D8|35FFFFCA_________________________________________________________cbnz____w10,0xFFFFFFDA401C47D0
|
|
|
|
|
|
120|
NSX:0000::FFFFFFDA401C47DC|910023E0
add x0,sp,#0x8 ; entry,sp,#8
NSX:0000::FFFFFFDA401C47E0|AA1503E1
mov x1,x21
NSX:0000::FFFFFFDA401C47E4|9400E4ED
bl 0xFFFFFFDA401FDB98 ;
trace_event_buffer_commit
NSX:0000::FFFFFFDA401C47E8|F00134C9
adrp x9,0xFFFFFFDA4285F000
NSX:0000::FFFFFFDA401C47EC|F85F83A8
ldur x8,[x29,#-0x8] ; x8,[x29,#-8]
NSX:0000::FFFFFFDA401C47F0|F9420929
ldr x9,[x9,#0x410] ; x9,[x9,#1040]
NSX:0000::FFFFFFDA401C47F4|EB08013F
cmp x9,x8
NSX:0000::FFFFFFDA401C47F8|54000121
b.ne 0xFFFFFFDA401C481C
NSX:0000::FFFFFFDA401C47FC|A9494FF4
ldp x20,x19,[sp,#0x90] ;
xdst_cgrp,xtask,[sp,#144]
NSX:0000::FFFFFFDA401C4800|A94857F6
ldp x22,x21,[sp,#0x80] ; x__data,x21,[sp,#128]
NSX:0000::FFFFFFDA401C4804|A9475FF8
ldp x24,x23,[sp,#0x70] ; x24,x23,[sp,#112]
NSX:0000::FFFFFFDA401C4808|A94667FA
ldp x26,x25,[sp,#0x60] ; x26,x25,[sp,#96]
NSX:0000::FFFFFFDA401C480C|A9456FFC
ldp x28,x27,[sp,#0x50] ; x28,x27,[sp,#80]
NSX:0000::FFFFFFDA401C4810|A9447BFD
ldp x29,x30,[sp,#0x40] ; x29,x30,[sp,#64]
NSX:0000::FFFFFFDA401C4814|910283FF
add sp,sp,#0xA0 ; sp,sp,#160
NSX:0000::FFFFFFDA401C4818|D65F03C0
ret
NSX:0000::FFFFFFDA401C481C|97FC50DC
bl 0xFFFFFFDA400D8B8C ; __stack_chk_fail

>
> Note, this code has __string() which does a strlen() which appears to be
> working fine.

I did see this as well.

500 #define __string(item, src) __dynamic_array(char, item, \
501 strlen((src) ? (const char *)(src) : "(null)") + 1)



My tester reported they could change the task->comm.
for example from "123456789abcde" to "test"
I wonder if we prepare the buffer as "test" 5bytes with __string
then __assign_str with "123456789abcde" new task->comm.

but this is very hard race condition.

this is not easy to reproduced, we got 2 instances of this problem.

I read others using the task->comm as trace event with memcpy as this
patch I initialized.



>
>> cgroup_attach_task+0x2e8/0x41c
>> __cgroup1_procs_write+0x114/0x1ec
>> cgroup1_tasks_write+0x10/0x18
>> cgroup_file_write+0xa4/0x208
>> kernfs_fop_write+0x1f0/0x2f4
>> __vfs_write+0x5c/0x200
>> vfs_write+0xe0/0x1a0
>> ksys_write+0x74/0xdc
>> __arm64_sys_write+0x18/0x20
>> el0_svc_common+0xc0/0x1a4
>> el0_svc_compat_handler+0x18/0x20
>> el0_svc_compat+0x8/0x2c
>
> Can you give the full debug report, that includes the register content and
> everything else.

here is the crash stack of cpu regs.
you can ref the asm code above.


pc : [0xffffffda401c47d4] trace_event_raw_event_cgroup_migrate+0x124/0x170
lr : [0xffffffda401c4774] trace_event_raw_event_cgroup_migrate+0xc4/0x170
sp : ffffffc01f8e9aa0
x29: ffffffc01f8e9ae0 x28: 000000000000000b
x27: 0000000000000009 x26: ffffffda41f1e515
x25: 0000000000000008 x24: ffffffda42c72a7d
x23: ffffffbd1b6c59d0 x22: ffffffbbf383bec8
x21: 0000000000000034 x20: ffffffbd621cd000
x19: ffffffbd1b6c5140 x18: 0000000000000008
x17: ffffffbd66b96010 x16: ffffffbd621cd000
x15: ffffffbc11583860 x14: ffffffbd1b6c5ba8
x13: 0000000000000000 x12: 0000000000200000
x11: 0000000000000001 x10: 0000000000000000
x9 : ffffffbd1b6c59e0 x8 : ffffffbcf7450000
x7 : 6a716e76225c435a x6 : 0000800000008080
x5 : 0000000000000001 x4 : 0000000000000080
x3 : 0000000000000034 x2 : 0000000000000098
x1 : 0000000000000034 x0 : ffffffbcf744ffc8



>
> -- Steve

2022-09-28 08:18:22

by Kassey Li

[permalink] [raw]
Subject: Re: [PATCH v2] cgroup: align the comm length with TASK_COMM_LEN



On 9/26/2022 10:09 PM, Steven Rostedt wrote:
> On Fri, 23 Sep 2022 15:51:05 +0800
> Kassey Li <[email protected]> wrote:
>
>> __string could get a dst string with length less than
>> TASK_COMM_LEN.
>>
>> A task->comm may change that can cause out of bounds access
>> for the dst string buffer, e.g in the call trace of below:
>>
>> Call trace:
>>
>> dump_backtrace.cfi_jt+0x0/0x4
>> show_stack+0x14/0x1c
>> dump_stack+0xa0/0xd8
>> die_callback+0x248/0x24c
>> notify_die+0x7c/0xf8
>> die+0xac/0x290
>> die_kernel_fault+0x88/0x98
>> die_kernel_fault+0x0/0x98
>> do_page_fault+0xa0/0x544
>> do_mem_abort+0x60/0x10c
>> el1_da+0x1c/0xc4
>
>> trace_event_raw_event_cgroup_migrate+0x124/0x170
>
> You're sure the above is on the strcpy()?
>
> Note, this code has __string() which does a strlen() which appears to be
> working fine.
>
>> cgroup_attach_task+0x2e8/0x41c
>> __cgroup1_procs_write+0x114/0x1ec
>> cgroup1_tasks_write+0x10/0x18
>> cgroup_file_write+0xa4/0x208
>> kernfs_fop_write+0x1f0/0x2f4
>> __vfs_write+0x5c/0x200
>> vfs_write+0xe0/0x1a0
>> ksys_write+0x74/0xdc
>> __arm64_sys_write+0x18/0x20
>> el0_svc_common+0xc0/0x1a4
>> el0_svc_compat_handler+0x18/0x20
>> el0_svc_compat+0x8/0x2c
>
> Can you give the full debug report, that includes the register content and
> everything else.
>
> -- Steve

hiļ¼ŒSteve:
I simulate a what I supposed problem, but it did not panic.
I will use kasan to debug further. thanks for your review and suggest
on this problem. abandon this patch now currently, if there is new
proved problem, I will come again.


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 637d25c31374..d219d9f45529 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1389,6 +1389,7 @@ struct task_struct {
ANDROID_KABI_RESERVE(7);
ANDROID_KABI_RESERVE(8);

+ char comm1[32];
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index dd7d7c9efecd..da2426c7f99b 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -139,7 +139,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
__entry->dst_level = dst_cgrp->level;
__assign_str(dst_path, path);
__entry->pid = task->pid;
- __assign_str(comm, task->comm);
+ __assign_str(comm, task->comm1);
),

TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s
pid=%d comm=%s",
diff --git a/kernel/fork.c b/kernel/fork.c
index 58409b7178c2..4e0c564852c6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1443,6 +1443,7 @@ static int copy_mm(unsigned long clone_flags,
struct task_struct *tsk)
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
tsk->last_switch_time = 0;
#endif
+ strlcpy(tsk->comm1, "012345678901234567890123456789", 30);

tsk->mm = NULL;
tsk->active_mm = NULL;




# cat /sys/kernel/tracing/trace_pipe
sh-874 [007] d..2 142.097349:
cgroup_notify_populated: root=1 id=49 level=1 path=/top-app val=1
sh-874 [007] d..1 142.097405: cgroup_attach_task:
dst_root=1 dst_id=49 dst_level=1 dst_path=/top-app pid=1
comm=01234567890123456789012345678