Hi all,
This series uses arch_stack_walk() to fix false positives of KASAN and
avoid open-coding of unwind logic.
This series is based on next-20230330.
Comments and suggestions are welcome.
Thanks,
Qi
Qi Zheng (2):
x86: make profile_pc() use arch_stack_walk()
x86: make __get_wchan() use arch_stack_walk()
arch/x86/kernel/process.c | 22 ++++++++++++----------
arch/x86/kernel/time.c | 36 +++++++++++++++++-------------------
2 files changed, 29 insertions(+), 29 deletions(-)
--
2.20.1
Make __get_wchan() use arch_stack_walk() directly to
avoid open-coding of unwind logic.
Signed-off-by: Qi Zheng <[email protected]>
---
arch/x86/kernel/process.c | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3ab62ac98c2c..a6ff18fa6d5d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1000,6 +1000,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
return randomize_page(mm->brk, 0x02000000);
}
+static bool get_wchan_cb(void *arg, unsigned long pc)
+{
+ unsigned long *addr = arg;
+
+ if (in_sched_functions(pc))
+ return true;
+
+ *addr = pc;
+ return false;
+}
+
/*
* Called from fs/proc with a reference on @p to find the function
* which called into schedule(). This needs to be done carefully
@@ -1008,21 +1019,12 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
*/
unsigned long __get_wchan(struct task_struct *p)
{
- struct unwind_state state;
unsigned long addr = 0;
if (!try_get_task_stack(p))
return 0;
- for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state);
- unwind_next_frame(&state)) {
- addr = unwind_get_return_address(&state);
- if (!addr)
- break;
- if (in_sched_functions(addr))
- continue;
- break;
- }
+ arch_stack_walk(get_wchan_cb, &addr, p, NULL);
put_task_stack(p);
--
2.20.1
The profile_pc() try to get pc by doing a trick to read
the contents of the stack. This may cause false positives
for KASAN, like the following:
BUG: KASAN: stack-out-of-bounds in profile_pc+0x5b/0x90
Read of size 8 at addr ffff8881062a7a00 by task id/130040
CPU: 1 PID: 130040 Comm: id Kdump: loaded Not tainted 5.15.93+ #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
Call Trace:
<IRQ>
dump_stack_lvl+0x4c/0x64
? profile_pc+0x5b/0x90
print_address_description.constprop.8.cold.12+0x10/0x36b
? profile_pc+0x5b/0x90
? profile_pc+0x5b/0x90
? tick_sched_handle.isra.20+0xa0/0xa0
kasan_report.cold.13+0x7f/0x11b
? scheduler_tick+0x30/0x150
? profile_pc+0x5b/0x90
? _raw_spin_lock+0x82/0xd0
profile_pc+0x5b/0x90
profile_tick+0x78/0xb0
? tick_sched_handle.isra.20+0x83/0xa0
tick_sched_timer+0x94/0xb0
? enqueue_hrtimer+0x100/0x100
? _raw_write_lock_irqsave+0xd0/0xd0
? recalibrate_cpu_khz+0x10/0x10
? ktime_get_update_offsets_now+0x148/0x1a0
hrtimer_interrupt+0x1b9/0x390
? sched_ttwu_pending+0xf1/0x150
__sysvec_apic_timer_interrupt+0x7c/0x150
sysvec_apic_timer_interrupt+0x61/0x80
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x16/0x20
RIP: 0010:_raw_spin_lock+0x82/0xd0
The KASAN checking is already disabled in the ORC unwinder,
so let's make profile_pc() use arch_stack_walk() to get pc,
which fixes the above BUG and also avoids open-coding of
unwind logic.
Signed-off-by: Qi Zheng <[email protected]>
---
arch/x86/kernel/time.c | 36 +++++++++++++++++-------------------
1 file changed, 17 insertions(+), 19 deletions(-)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index e42faa792c07..eee884306d36 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -17,6 +17,7 @@
#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/export.h>
+#include <linux/stacktrace.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
@@ -25,27 +26,24 @@
#include <asm/hpet.h>
#include <asm/time.h>
+static bool profile_pc_cb(void *arg, unsigned long pc)
+{
+ unsigned long *prof_pc = arg;
+
+ if (in_lock_functions(pc))
+ return true;
+
+ *prof_pc = pc;
+ return false;
+}
+
unsigned long profile_pc(struct pt_regs *regs)
{
- unsigned long pc = instruction_pointer(regs);
-
- if (!user_mode(regs) && in_lock_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
- return *(unsigned long *)(regs->bp + sizeof(long));
-#else
- unsigned long *sp = (unsigned long *)regs->sp;
- /*
- * Return address is either directly at stack pointer
- * or above a saved flags. Eflags has bits 22-31 zero,
- * kernel addresses don't.
- */
- if (sp[0] >> 22)
- return sp[0];
- if (sp[1] >> 22)
- return sp[1];
-#endif
- }
- return pc;
+ unsigned long prof_pc = 0;
+
+ arch_stack_walk(profile_pc_cb, &prof_pc, current, regs);
+
+ return prof_pc;
}
EXPORT_SYMBOL(profile_pc);
--
2.20.1
On 2023/3/30 16:15, Qi Zheng wrote:
> Hi all,
>
> This series uses arch_stack_walk() to fix false positives of KASAN and
> avoid open-coding of unwind logic.
Gentle ping. Any comments or suggestions? :)
>
> This series is based on next-20230330.
>
> Comments and suggestions are welcome.
>
> Thanks,
> Qi
>
> Qi Zheng (2):
> x86: make profile_pc() use arch_stack_walk()
> x86: make __get_wchan() use arch_stack_walk()
>
> arch/x86/kernel/process.c | 22 ++++++++++++----------
> arch/x86/kernel/time.c | 36 +++++++++++++++++-------------------
> 2 files changed, 29 insertions(+), 29 deletions(-)
>
--
Thanks,
Qi
On Thu, Mar 30, 2023 at 04:15:51PM +0800, Qi Zheng wrote:
> The profile_pc() try to get pc by doing a trick to read
> the contents of the stack. This may cause false positives
> for KASAN, like the following:
>
> BUG: KASAN: stack-out-of-bounds in profile_pc+0x5b/0x90
> Read of size 8 at addr ffff8881062a7a00 by task id/130040
I don't think this was actually a false positive. The !FRAME_POINTER
code in profile_pc() has been badly broken for many years.
BTW, there was a similar patch here:
https://lore.kernel.org/lkml/[email protected]/
I thought CONFIG_PROFILING was obsolete but Andi said previously he
wants to keep it for at least boot-time profiling.
Andi did suggest removing the lock profiling hacks, which means all the
profile_pc() implementations can just be removed in favor of the generic
instruction_pointer().
--
Josh
On Thu, Mar 30, 2023 at 04:15:52PM +0800, Qi Zheng wrote:
> Make __get_wchan() use arch_stack_walk() directly to
> avoid open-coding of unwind logic.
>
> Signed-off-by: Qi Zheng <[email protected]>
Can we just have a shared version of __get_wchan() for all
CONFIG_ARCH_STACKWALK arches?
--
Josh
On 2023/4/8 12:56, Josh Poimboeuf wrote:
> On Thu, Mar 30, 2023 at 04:15:51PM +0800, Qi Zheng wrote:
>> The profile_pc() try to get pc by doing a trick to read
>> the contents of the stack. This may cause false positives
>> for KASAN, like the following:
>>
>> BUG: KASAN: stack-out-of-bounds in profile_pc+0x5b/0x90
>> Read of size 8 at addr ffff8881062a7a00 by task id/130040
>
> I don't think this was actually a false positive. The !FRAME_POINTER
> code in profile_pc() has been badly broken for many years.
>
> BTW, there was a similar patch here:
>
> https://lore.kernel.org/lkml/[email protected]/
Ah.
>
> I thought CONFIG_PROFILING was obsolete but Andi said previously he
> wants to keep it for at least boot-time profiling.
>
> Andi did suggest removing the lock profiling hacks, which means all the
> profile_pc() implementations can just be removed in favor of the generic
> instruction_pointer().
That's great, and I see Chen Zhongjin will send a new patch for this,
let him continue this work. :)
>
--
Thanks,
Qi
On 2023/4/8 13:08, Josh Poimboeuf wrote:
> On Thu, Mar 30, 2023 at 04:15:52PM +0800, Qi Zheng wrote:
>> Make __get_wchan() use arch_stack_walk() directly to
>> avoid open-coding of unwind logic.
>>
>> Signed-off-by: Qi Zheng <[email protected]>
>
> Can we just have a shared version of __get_wchan() for all
> CONFIG_ARCH_STACKWALK arches?
From a quick glance, I think it's ok, but we still need to define
the arch's own get_wchan_cb(). I will try to do it.
>
--
Thanks,
Qi
On Sat, Apr 08, 2023 at 01:36:06PM +0800, Qi Zheng wrote:
>
>
> On 2023/4/8 13:08, Josh Poimboeuf wrote:
> > On Thu, Mar 30, 2023 at 04:15:52PM +0800, Qi Zheng wrote:
> > > Make __get_wchan() use arch_stack_walk() directly to
> > > avoid open-coding of unwind logic.
> > >
> > > Signed-off-by: Qi Zheng <[email protected]>
> >
> > Can we just have a shared version of __get_wchan() for all
> > CONFIG_ARCH_STACKWALK arches?
>
> From a quick glance, I think it's ok, but we still need to define
> the arch's own get_wchan_cb(). I will try to do it.
Hm, why would we need to do that?
--
Josh
On 2023/4/9 06:12, Josh Poimboeuf wrote:
> On Sat, Apr 08, 2023 at 01:36:06PM +0800, Qi Zheng wrote:
>>
>>
>> On 2023/4/8 13:08, Josh Poimboeuf wrote:
>>> On Thu, Mar 30, 2023 at 04:15:52PM +0800, Qi Zheng wrote:
>>>> Make __get_wchan() use arch_stack_walk() directly to
>>>> avoid open-coding of unwind logic.
>>>>
>>>> Signed-off-by: Qi Zheng <[email protected]>
>>>
>>> Can we just have a shared version of __get_wchan() for all
>>> CONFIG_ARCH_STACKWALK arches?
>>
>> From a quick glance, I think it's ok, but we still need to define
>> the arch's own get_wchan_cb(). I will try to do it.
>
> Hm, why would we need to do that?
Because I see checks for count++ < 16 exist in __get_wchan() for some
arches and some don't. So I'm not sure if this check can be discarded
after using arch_stack_walk(). And I see that this check is retained in
arm64, see [1] for details.
[1].
https://github.com/torvalds/linux/commit/4f62bb7cb165f3e7b0a91279fe9dd5c56daf3457
>
--
Thanks,
Qi
On Sun, Apr 09, 2023 at 02:30:23PM +0800, Qi Zheng wrote:
>
>
> On 2023/4/9 06:12, Josh Poimboeuf wrote:
> > On Sat, Apr 08, 2023 at 01:36:06PM +0800, Qi Zheng wrote:
> > >
> > >
> > > On 2023/4/8 13:08, Josh Poimboeuf wrote:
> > > > On Thu, Mar 30, 2023 at 04:15:52PM +0800, Qi Zheng wrote:
> > > > > Make __get_wchan() use arch_stack_walk() directly to
> > > > > avoid open-coding of unwind logic.
> > > > >
> > > > > Signed-off-by: Qi Zheng <[email protected]>
> > > >
> > > > Can we just have a shared version of __get_wchan() for all
> > > > CONFIG_ARCH_STACKWALK arches?
> > >
> > > From a quick glance, I think it's ok, but we still need to define
> > > the arch's own get_wchan_cb(). I will try to do it.
> >
> > Hm, why would we need to do that?
>
> Because I see checks for count++ < 16 exist in __get_wchan() for some
> arches and some don't. So I'm not sure if this check can be discarded
> after using arch_stack_walk(). And I see that this check is retained in
> arm64, see [1] for details.
>
> [1]. https://github.com/torvalds/linux/commit/4f62bb7cb165f3e7b0a91279fe9dd5c56daf3457
That difference seems to have nothing to do with individual arch
differences.
The 16-check limit looks like some ancient cargo cult ritual which was
copy-pasted decades ago, presumably to avoid some kind of infinite stack
recursion loop in scheduler code, which should never happen. That
should definitely be removed.
Another good reason to unify them, to get rid of cruft like that.
--
Josh
On 2023/4/12 13:20, Josh Poimboeuf wrote:
> On Sun, Apr 09, 2023 at 02:30:23PM +0800, Qi Zheng wrote:
>>
>>
>> On 2023/4/9 06:12, Josh Poimboeuf wrote:
>>> On Sat, Apr 08, 2023 at 01:36:06PM +0800, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2023/4/8 13:08, Josh Poimboeuf wrote:
>>>>> On Thu, Mar 30, 2023 at 04:15:52PM +0800, Qi Zheng wrote:
>>>>>> Make __get_wchan() use arch_stack_walk() directly to
>>>>>> avoid open-coding of unwind logic.
>>>>>>
>>>>>> Signed-off-by: Qi Zheng <[email protected]>
>>>>>
>>>>> Can we just have a shared version of __get_wchan() for all
>>>>> CONFIG_ARCH_STACKWALK arches?
>>>>
>>>> From a quick glance, I think it's ok, but we still need to define
>>>> the arch's own get_wchan_cb(). I will try to do it.
>>>
>>> Hm, why would we need to do that?
>>
>> Because I see checks for count++ < 16 exist in __get_wchan() for some
>> arches and some don't. So I'm not sure if this check can be discarded
>> after using arch_stack_walk(). And I see that this check is retained in
>> arm64, see [1] for details.
>>
>> [1]. https://github.com/torvalds/linux/commit/4f62bb7cb165f3e7b0a91279fe9dd5c56daf3457
>
> That difference seems to have nothing to do with individual arch
> differences.
>
> The 16-check limit looks like some ancient cargo cult ritual which was
> copy-pasted decades ago, presumably to avoid some kind of infinite stack
> recursion loop in scheduler code, which should never happen. That
> should definitely be removed.
Got it.
>
> Another good reason to unify them, to get rid of cruft like that.
OK, will try to make a shared version of __get_wchan() for all
CONFIG_ARCH_STACKWALK arches.
Thanks,
Qi
>
On Fri, Apr 07, 2023 at 10:08:22PM -0700, Josh Poimboeuf wrote:
> On Thu, Mar 30, 2023 at 04:15:52PM +0800, Qi Zheng wrote:
> > Make __get_wchan() use arch_stack_walk() directly to
> > avoid open-coding of unwind logic.
> >
> > Signed-off-by: Qi Zheng <[email protected]>
>
> Can we just have a shared version of __get_wchan() for all
> CONFIG_ARCH_STACKWALK arches?
Didn't I do that a while back ? I can't seem to actually find the
patch-set though :/
On Wed, Apr 12, 2023 at 03:15:33PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 07, 2023 at 10:08:22PM -0700, Josh Poimboeuf wrote:
> > On Thu, Mar 30, 2023 at 04:15:52PM +0800, Qi Zheng wrote:
> > > Make __get_wchan() use arch_stack_walk() directly to
> > > avoid open-coding of unwind logic.
> > >
> > > Signed-off-by: Qi Zheng <[email protected]>
> >
> > Can we just have a shared version of __get_wchan() for all
> > CONFIG_ARCH_STACKWALK arches?
>
> Didn't I do that a while back ? I can't seem to actually find the
> patch-set though :/
Could be this series:
https://lkml.kernel.org/r/[email protected]
And this here:
https://lore.kernel.org/lkml/CAHk-=wjHbKfck1Ws4Y0pUZ7bxdjU9eh2WK0EFsv65utfeVkT9Q@mail.gmail.com/
might be why I dropped it.. I can't remember.
On 2023/4/12 21:23, Peter Zijlstra wrote:
> On Wed, Apr 12, 2023 at 03:15:33PM +0200, Peter Zijlstra wrote:
>> On Fri, Apr 07, 2023 at 10:08:22PM -0700, Josh Poimboeuf wrote:
>>> On Thu, Mar 30, 2023 at 04:15:52PM +0800, Qi Zheng wrote:
>>>> Make __get_wchan() use arch_stack_walk() directly to
>>>> avoid open-coding of unwind logic.
>>>>
>>>> Signed-off-by: Qi Zheng <[email protected]>
>>>
>>> Can we just have a shared version of __get_wchan() for all
>>> CONFIG_ARCH_STACKWALK arches?
>>
>> Didn't I do that a while back ? I can't seem to actually find the
>> patch-set though :/
>
> Could be this series:
>
> https://lkml.kernel.org/r/[email protected]
Oh, I vaguely remember the beginning because I was trying to fix
get_wchan() not supporting ORC unwinder on x86 [1], and then you sent a
patch set, and the patch [2] in this patch set tried to implement the
shared version of __get_wchan().
[1]. https://lore.kernel.org/all/[email protected]/
[2]. https://lore.kernel.org/all/[email protected]/
>
> And this here:
>
> https://lore.kernel.org/lkml/CAHk-=wjHbKfck1Ws4Y0pUZ7bxdjU9eh2WK0EFsv65utfeVkT9Q@mail.gmail.com/
>
> might be why I dropped it.. I can't remember.
Didn't realize I had replied to this email before.
But I also don't see why you dropped it. Looks like you have fixed the
UAF problem.
So do we still need to implement a shared version of __get_wchan()?
If we still need it, do I need to send it again? Or just pick your
previous patch directly? Both are fine to me. :)
Thanks,
Qi