All usage sites which expected the exception stacks in the CPU entry
area to be mapped linearly have been fixed up. Enable guard pages between
the IST stacks.
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/include/asm/cpu_entry_area.h | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -25,13 +25,9 @@ struct exception_stacks {
ESTACKS_MEMBERS(0)
};
-/*
- * The effective cpu entry area mapping with guard pages. Guard size is
- * zero until the code which makes assumptions about linear mapping is
- * cleaned up.
- */
+/* The effective cpu_entry_area mapping with guard pages */
struct cea_exception_stacks {
- ESTACKS_MEMBERS(0)
+ ESTACKS_MEMBERS(PAGE_SIZE)
};
#endif
The current implementation of in_exception_stack() iterates over the
exception stacks array. Most of the time this is a useless exercise, but
even for the actual use cases (perf and ftrace) it takes at least two
iterations to get to the NMI stack.
As the exception stacks and the guard pages are page aligned, the loop can
be avoided completely.
Add an initial check whether the stack pointer is inside the full exception
stack area and return early if not.
Create a lookup table which describes the stack area. The table index is
the page offset from the beginning of the exception stacks. So for any
given stack pointer the page offset is computed and a lookup in the
descriptor table is performed. If it points into a guard page, return
failure. Otherwise use the descriptor to fill in the info structure.
The table is filled at compile time with nasty macro magic and for the
!KASAN case the interesting page descriptors fit exactly into a single
cache line. Only the last guard page descriptor spills into the next cache
line, but that should not be accessed in the regular case.
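For illustration (back of the envelope, assuming 4K pages, 64 byte cache
lines and the !KASAN layout where the DB stack is two pages, every other
stack is one page and each stack is followed by a one page guard, i.e. 9
pages total):

	sizeof(struct estack_pages)	= 4 + 2 + 2 = 8 bytes
	descriptors per cache line	= 64 / 8    = 8

So the descriptors for pages 0-7 share the first cache line and only the
descriptor for the ninth page, the final guard page, lands in the second
one.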
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/dumpstack_64.c | 97 +++++++++++++++++++++++++++--------------
1 file changed, 66 insertions(+), 31 deletions(-)
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -48,50 +48,85 @@ const char *stack_type_name(enum stack_t
return NULL;
}
-struct estack_layout {
- unsigned int begin;
- unsigned int end;
+#define ESTACK_S(st) \
+ (offsetof(struct cea_exception_stacks, st## _stack))
+
+#define ESTACK_E(st) \
+ (offsetof(struct cea_exception_stacks, st## _stack_guard))
+
+#define PAGENR(offs) ((offs) / PAGE_SIZE)
+#define PAGERANGE(st) PAGENR(ESTACK_S(st)) ... PAGENR(ESTACK_E(st) - 1)
+
+#if EXCEPTION_STKSZ == PAGE_SIZE
+# define CONDRANGE(st) PAGENR(ESTACK_S(st))
+#else
+# define CONDRANGE(st) PAGERANGE(st)
+#endif
+
+/**
+ * struct estack_pages - Page descriptor for exception stacks
+ * @offs: Offset from the start of the exception stack area
+ * @size: Size of the exception stack
+ * @type: Type to store in the stack_info struct
+ */
+struct estack_pages {
+ u32 offs;
+ u16 size;
+ u16 type;
};
-#define ESTACK_ENTRY(x) { \
- .begin = offsetof(struct cea_exception_stacks, x## _stack), \
- .end = offsetof(struct cea_exception_stacks, x## _stack_guard) \
+#define ESTACK_PAGE(ist, est) { \
+ .offs = ESTACK_S(est), \
+ .size = ESTACK_E(est) - ESTACK_S(est), \
+ .type = STACK_TYPE_EXCEPTION + ist, \
}
-static const struct estack_layout layout[N_EXCEPTION_STACKS] = {
- [ DOUBLEFAULT_IST ] = ESTACK_ENTRY(DF),
- [ NMI_IST ] = ESTACK_ENTRY(NMI),
- [ DEBUG_IST ] = ESTACK_ENTRY(DB),
- [ MCE_IST ] = ESTACK_ENTRY(MCE),
+#define ESTACK_PAGES (sizeof(struct cea_exception_stacks) / PAGE_SIZE)
+
+/*
+ * Array of exception stack page descriptors. If the stack is larger than
+ * PAGE_SIZE, all pages covering a particular stack will have the same
+ * info.
+ */
+static const struct estack_pages estack_pages[ESTACK_PAGES] ____cacheline_aligned = {
+ [CONDRANGE(DF)] = ESTACK_PAGE(DOUBLEFAULT_IST, DF),
+ [CONDRANGE(NMI)] = ESTACK_PAGE(NMI_IST, NMI),
+ [PAGERANGE(DB)] = ESTACK_PAGE(DEBUG_IST, DB),
+ [CONDRANGE(MCE)] = ESTACK_PAGE(MCE_IST, MCE),
};
static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long estacks, begin, end, stk = (unsigned long)stack;
+ unsigned long begin, end, stk = (unsigned long)stack;
+ const struct estack_pages *ep;
struct pt_regs *regs;
unsigned int k;
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
- estacks = (unsigned long)__this_cpu_read(cea_exception_stacks);
-
- for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- begin = estacks + layout[k].begin;
- end = estacks + layout[k].end;
- regs = (struct pt_regs *)end - 1;
-
- if (stk <= begin || stk >= end)
- continue;
-
- info->type = STACK_TYPE_EXCEPTION + k;
- info->begin = (unsigned long *)begin;
- info->end = (unsigned long *)end;
- info->next_sp = (unsigned long *)regs->sp;
-
- return true;
- }
-
- return false;
+ begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
+ end = begin + sizeof(struct cea_exception_stacks);
+ /* Bail if @stack is outside the exception stack area. */
+ if (stk <= begin || stk >= end)
+ return false;
+
+ /* Calc page offset from start of exception stacks */
+ k = (stk - begin) >> PAGE_SHIFT;
+ /* Lookup the page descriptor */
+ ep = &estack_pages[k];
+ /* Guard page? */
+ if (unlikely(!ep->size))
+ return false;
+
+ begin += (unsigned long)ep->offs;
+ end = begin + (unsigned long)ep->size;
+ regs = (struct pt_regs *)end - 1;
+
+ info->type = ep->type;
+ info->begin = (unsigned long *)begin;
+ info->end = (unsigned long *)end;
+ info->next_sp = (unsigned long *)regs->sp;
+ return true;
}
static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
On Tue, Apr 02, 2019 at 12:19:46PM +0200, Thomas Gleixner wrote:
> The current implementation of in_exception_stack() iterates over the
> exception stacks array. Most of the time this is an useless exercise, but
> even for the actual use cases (perf and ftrace) it takes at least 2
> iterations to get to the NMI stack.
>
> As the exception stacks and the guard pages are page aligned the loop can
> be avoided completely.
>
> Add a initial check whether the stack pointer is inside the full exception
> stack area and leave early if not.
>
> Create a lookup table which describes the stack area. The table index is
> the page offset from the beginning of the exception stacks. So for any
> given stack pointer the page offset is computed and a lookup in the
> description table is performed. If it is inside a guard page, return. If
> not, use the descriptor to fill in the info structure.
>
> The table is filled at compile time with nasty macro magic and for the
> !KASAN case the interesting page descriptors exactly fit into a single
> cache line. Just the last guard page descriptor is in the next cacheline,
> but that should not be accessed in the regular case.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> arch/x86/kernel/dumpstack_64.c | 97 +++++++++++++++++++++++++++--------------
> 1 file changed, 66 insertions(+), 31 deletions(-)
>
> --- a/arch/x86/kernel/dumpstack_64.c
> +++ b/arch/x86/kernel/dumpstack_64.c
> @@ -48,50 +48,85 @@ const char *stack_type_name(enum stack_t
> return NULL;
> }
>
> -struct estack_layout {
> - unsigned int begin;
> - unsigned int end;
> +#define ESTACK_S(st) \
> + (offsetof(struct cea_exception_stacks, st## _stack))
> +
> +#define ESTACK_E(st) \
> + (offsetof(struct cea_exception_stacks, st## _stack_guard))
> +
> +#define PAGENR(offs) ((offs) / PAGE_SIZE)
> +#define PAGERANGE(st) PAGENR(ESTACK_S(st)) ... PAGENR(ESTACK_E(st) - 1)
> +
> +#if EXCEPTION_STKSZ == PAGE_SIZE
> +# define CONDRANGE(st) PAGENR(ESTACK_S(st))
> +#else
> +# define CONDRANGE(st) PAGERANGE(st)
> +#endif
> +
> +/**
> + * struct estack_pages - Page descriptor for exception stacks
> + * @offs: Offset from the start of the exception stack area
> + * @size: Size of the exception stack
> + * @type: Type to store in the stack_info struct
> + */
> +struct estack_pages {
> + u32 offs;
> + u16 size;
> + u16 type;
> };
>
> -#define ESTACK_ENTRY(x) { \
> - .begin = offsetof(struct cea_exception_stacks, x## _stack), \
> - .end = offsetof(struct cea_exception_stacks, x## _stack_guard) \
> +#define ESTACK_PAGE(ist, est) { \
> + .offs = ESTACK_S(est), \
> + .size = ESTACK_E(est) - ESTACK_S(est), \
> + .type = STACK_TYPE_EXCEPTION + ist, \
> }
>
> -static const struct estack_layout layout[N_EXCEPTION_STACKS] = {
> - [ DOUBLEFAULT_IST ] = ESTACK_ENTRY(DF),
> - [ NMI_IST ] = ESTACK_ENTRY(NMI),
> - [ DEBUG_IST ] = ESTACK_ENTRY(DB),
> - [ MCE_IST ] = ESTACK_ENTRY(MCE),
> +#define ESTACK_PAGES (sizeof(struct cea_exception_stacks) / PAGE_SIZE)
> +
> +/*
> + * Array of exception stack page descriptors. If the stack is larger than
> + * PAGE_SIZE, all pages covering a particular stack will have the same
> + * info.
> + */
> +static const struct estack_pages estack_pages[ESTACK_PAGES] ____cacheline_aligned = {
> + [CONDRANGE(DF)] = ESTACK_PAGE(DOUBLEFAULT_IST, DF),
> + [CONDRANGE(NMI)] = ESTACK_PAGE(NMI_IST, NMI),
> + [PAGERANGE(DB)] = ESTACK_PAGE(DEBUG_IST, DB),
> + [CONDRANGE(MCE)] = ESTACK_PAGE(MCE_IST, MCE),
It would be nice if the *_IST macro naming aligned with the struct
cea_exception_stacks field naming. Then you could just do, e.g.
ESTACKPAGE(DF).
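Something along these lines, just a sketch assuming the IST index
constants were renamed to DF_IST, NMI_IST, DB_IST and MCE_IST so they
match the cea_exception_stacks field names:

	#define ESTACKPAGE(st) {					\
		.offs	= ESTACK_S(st),					\
		.size	= ESTACK_E(st) - ESTACK_S(st),			\
		.type	= STACK_TYPE_EXCEPTION + st ## _IST,		\
	}

	[CONDRANGE(DF)]	= ESTACKPAGE(DF),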
Also it's a bit unfortunate that some of the stack size knowledge is
hard-coded here, i.e. #DB always being > 1 page and non-#DB sometimes
being 1 page.
> };
>
> static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
> {
> - unsigned long estacks, begin, end, stk = (unsigned long)stack;
> + unsigned long begin, end, stk = (unsigned long)stack;
> + const struct estack_pages *ep;
> struct pt_regs *regs;
> unsigned int k;
>
> BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
>
> - estacks = (unsigned long)__this_cpu_read(cea_exception_stacks);
> -
> - for (k = 0; k < N_EXCEPTION_STACKS; k++) {
> - begin = estacks + layout[k].begin;
> - end = estacks + layout[k].end;
> - regs = (struct pt_regs *)end - 1;
> -
> - if (stk <= begin || stk >= end)
> - continue;
> -
> - info->type = STACK_TYPE_EXCEPTION + k;
> - info->begin = (unsigned long *)begin;
> - info->end = (unsigned long *)end;
> - info->next_sp = (unsigned long *)regs->sp;
> -
> - return true;
> - }
> -
> - return false;
> + begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
> + end = begin + sizeof(struct cea_exception_stacks);
> + /* Bail if @stack is outside the exception stack area. */
> + if (stk <= begin || stk >= end)
> + return false;
This check is the most important piece. Exception stack dumps are quite
rare, so this ensures an early exit in most cases regardless of whether
there's a loop below.
> +
> + /* Calc page offset from start of exception stacks */
> + k = (stk - begin) >> PAGE_SHIFT;
> + /* Lookup the page descriptor */
> + ep = &estack_pages[k];
> + /* Guard page? */
> + if (unlikely(!ep->size))
> + return false;
> +
> + begin += (unsigned long)ep->offs;
> + end = begin + (unsigned long)ep->size;
> + regs = (struct pt_regs *)end - 1;
> +
> + info->type = ep->type;
> + info->begin = (unsigned long *)begin;
> + info->end = (unsigned long *)end;
> + info->next_sp = (unsigned long *)regs->sp;
> + return true;
With the above "(stk <= begin || stk >= end)" check, removing the loop
becomes not all that important since exception stack dumps are quite
rare and not performance sensitive. With all the macros this code
becomes a little more obtuse, so I'm not sure whether removal of the
loop is a net positive.
> }
>
> static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
--
Josh
On Tue, 2 Apr 2019, Josh Poimboeuf wrote:
> On Tue, Apr 02, 2019 at 12:19:46PM +0200, Thomas Gleixner wrote:
> > +/*
> > + * Array of exception stack page descriptors. If the stack is larger than
> > + * PAGE_SIZE, all pages covering a particular stack will have the same
> > + * info.
> > + */
> > +static const struct estack_pages estack_pages[ESTACK_PAGES] ____cacheline_aligned = {
> > + [CONDRANGE(DF)] = ESTACK_PAGE(DOUBLEFAULT_IST, DF),
> > + [CONDRANGE(NMI)] = ESTACK_PAGE(NMI_IST, NMI),
> > + [PAGERANGE(DB)] = ESTACK_PAGE(DEBUG_IST, DB),
> > + [CONDRANGE(MCE)] = ESTACK_PAGE(MCE_IST, MCE),
>
> It would be nice if the *_IST macro naming aligned with the struct
> cea_exception_stacks field naming. Then you could just do, e.g.
> ESTACKPAGE(DF).
Yes, lemme fix that up.
> Also it's a bit unfortunate that some of the stack size knowledge is
> hard-coded here, i.e #DB always being > 1 page and non-#DB being
> sometimes 1 page.
The problem is that there is no way to make this macro maze conditional on
sizeof(). But my macro foo is rusty.
> > + begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
> > + end = begin + sizeof(struct cea_exception_stacks);
> > + /* Bail if @stack is outside the exception stack area. */
> > + if (stk <= begin || stk >= end)
> > + return false;
>
> This check is the most important piece. Exception stack dumps are quite
> rare, so this ensures an early exit in most cases regardless of whether
> there's a loop below.
>
> > +
> > + /* Calc page offset from start of exception stacks */
> > + k = (stk - begin) >> PAGE_SHIFT;
> > + /* Lookup the page descriptor */
> > + ep = &estack_pages[k];
> > + /* Guard page? */
> > + if (unlikely(!ep->size))
> > + return false;
> > +
> > + begin += (unsigned long)ep->offs;
> > + end = begin + (unsigned long)ep->size;
> > + regs = (struct pt_regs *)end - 1;
> > +
> > + info->type = ep->type;
> > + info->begin = (unsigned long *)begin;
> > + info->end = (unsigned long *)end;
> > + info->next_sp = (unsigned long *)regs->sp;
> > + return true;
>
> With the above "(stk <= begin || stk >= end)" check, removing the loop
> becomes not all that important since exception stack dumps are quite
> rare and not performance sensitive. With all the macros this code
> becomes a little more obtuse, so I'm not sure whether removal of the
> loop is a net positive.
What about perf? It's NMI context and probably starts from there. Peter?
Thanks,
tglx
On Tue, Apr 02, 2019 at 10:51:49AM -0500, Josh Poimboeuf wrote:
> On Tue, Apr 02, 2019 at 05:48:56PM +0200, Thomas Gleixner wrote:
> > > With the above "(stk <= begin || stk >= end)" check, removing the loop
> > > becomes not all that important since exception stack dumps are quite
> > > rare and not performance sensitive. With all the macros this code
> > > becomes a little more obtuse, so I'm not sure whether removal of the
> > > loop is a net positive.
> >
> > What about perf? It's NMI context and probably starts from there. Peter?
>
> I believe perf unwinds starting from the regs from the context which was
> interrupted by the NMI.
Adding Peter to keep me honest.
--
Josh
> On Apr 2, 2019, at 9:48 AM, Thomas Gleixner <[email protected]> wrote:
>
>> On Tue, 2 Apr 2019, Josh Poimboeuf wrote:
>>> On Tue, Apr 02, 2019 at 12:19:46PM +0200, Thomas Gleixner wrote:
>>> +/*
>>> + * Array of exception stack page descriptors. If the stack is larger than
>>> + * PAGE_SIZE, all pages covering a particular stack will have the same
>>> + * info.
>>> + */
>>> +static const struct estack_pages estack_pages[ESTACK_PAGES] ____cacheline_aligned = {
>>> + [CONDRANGE(DF)] = ESTACK_PAGE(DOUBLEFAULT_IST, DF),
>>> + [CONDRANGE(NMI)] = ESTACK_PAGE(NMI_IST, NMI),
>>> + [PAGERANGE(DB)] = ESTACK_PAGE(DEBUG_IST, DB),
>>> + [CONDRANGE(MCE)] = ESTACK_PAGE(MCE_IST, MCE),
>>
>> It would be nice if the *_IST macro naming aligned with the struct
>> cea_exception_stacks field naming. Then you could just do, e.g.
>> ESTACKPAGE(DF).
>
> Yes, lemme fix that up.
>
>> Also it's a bit unfortunate that some of the stack size knowledge is
>> hard-coded here, i.e #DB always being > 1 page and non-#DB being
>> sometimes 1 page.
>
> The problem is that there is no way to make this macro maze conditional on
> sizeof(). But my macro foo is rusty.
How about a much better fix: make the DB stack be the same size as all the others and just have 4 of them (DB0, DB1, DB2, and DB3). After all, overflowing from one debug stack into another is just as much of a bug as overflowing into a different IST stack.
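Roughly, as a sketch of the idea only (member names invented here, not a
patch):

	char	DB0_stack[EXCEPTION_STKSZ];
	char	DB0_stack_guard[guardsize];
	char	DB1_stack[EXCEPTION_STKSZ];
	char	DB1_stack_guard[guardsize];
	char	DB2_stack[EXCEPTION_STKSZ];
	char	DB2_stack_guard[guardsize];
	char	DB3_stack[EXCEPTION_STKSZ];
	char	DB3_stack_guard[guardsize];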
On Tue, Apr 02, 2019 at 05:48:56PM +0200, Thomas Gleixner wrote:
> > With the above "(stk <= begin || stk >= end)" check, removing the loop
> > becomes not all that important since exception stack dumps are quite
> > rare and not performance sensitive. With all the macros this code
> > becomes a little more obtuse, so I'm not sure whether removal of the
> > loop is a net positive.
>
> What about perf? It's NMI context and probably starts from there. Peter?
I believe perf unwinds starting from the regs from the context which was
interrupted by the NMI.
--
Josh
On Tue, 2 Apr 2019, Andy Lutomirski wrote:
> > On Apr 2, 2019, at 9:48 AM, Thomas Gleixner <[email protected]> wrote:
> >
> >> On Tue, 2 Apr 2019, Josh Poimboeuf wrote:
> >>> On Tue, Apr 02, 2019 at 12:19:46PM +0200, Thomas Gleixner wrote:
> >>> +/*
> >>> + * Array of exception stack page descriptors. If the stack is larger than
> >>> + * PAGE_SIZE, all pages covering a particular stack will have the same
> >>> + * info.
> >>> + */
> >>> +static const struct estack_pages estack_pages[ESTACK_PAGES] ____cacheline_aligned = {
> >>> + [CONDRANGE(DF)] = ESTACK_PAGE(DOUBLEFAULT_IST, DF),
> >>> + [CONDRANGE(NMI)] = ESTACK_PAGE(NMI_IST, NMI),
> >>> + [PAGERANGE(DB)] = ESTACK_PAGE(DEBUG_IST, DB),
> >>> + [CONDRANGE(MCE)] = ESTACK_PAGE(MCE_IST, MCE),
> >>
> >> It would be nice if the *_IST macro naming aligned with the struct
> >> cea_exception_stacks field naming. Then you could just do, e.g.
> >> ESTACKPAGE(DF).
> >
> > Yes, lemme fix that up.
> >
> >> Also it's a bit unfortunate that some of the stack size knowledge is
> >> hard-coded here, i.e #DB always being > 1 page and non-#DB being
> >> sometimes 1 page.
> >
> > The problem is that there is no way to make this macro maze conditional on
> > sizeof(). But my macro foo is rusty.
>
> How about a much better fix: make the DB stack be the same size as all
> the others and just have 4 of them (DB0, DB1, DB2, and DB3. After all,
> overflowing from one debug stack into another is just as much of a bug as
> overflowing into a different IST stack.
That makes sense.
Thanks,
tglx
On Tue, 2 Apr 2019, Rasmus Villemoes wrote:
> On 02/04/2019 17.48, Thomas Gleixner wrote:
> > On Tue, 2 Apr 2019, Josh Poimboeuf wrote:
> >> On Tue, Apr 02, 2019 at 12:19:46PM +0200, Thomas Gleixner wrote:
> >>> +/*
> >>> + * Array of exception stack page descriptors. If the stack is larger than
> >>> + * PAGE_SIZE, all pages covering a particular stack will have the same
> >>> + * info.
> >>> + */
> >>> +static const struct estack_pages estack_pages[ESTACK_PAGES] ____cacheline_aligned = {
> >>> + [CONDRANGE(DF)] = ESTACK_PAGE(DOUBLEFAULT_IST, DF),
> >>> + [CONDRANGE(NMI)] = ESTACK_PAGE(NMI_IST, NMI),
> >>> + [PAGERANGE(DB)] = ESTACK_PAGE(DEBUG_IST, DB),
> >>> + [CONDRANGE(MCE)] = ESTACK_PAGE(MCE_IST, MCE),
> >>
> >> It would be nice if the *_IST macro naming aligned with the struct
> >> cea_exception_stacks field naming. Then you could just do, e.g.
> >> ESTACKPAGE(DF).
> >
> > Yes, lemme fix that up.
> >
> >> Also it's a bit unfortunate that some of the stack size knowledge is
> >> hard-coded here, i.e #DB always being > 1 page and non-#DB being
> >> sometimes 1 page.
> >
> > The problem is that there is no way to make this macro maze conditional on
> > sizeof(). But my macro foo is rusty.
>
> Eh, but why do you need the CONDRANGE thing at all? [5 ... 5] is a
> perfectly fine designator, equivalent to [5]. So you can just use
> PAGERANGE in all cases, no?
Indeed. I tried that before and at some point GCC barfed, but probably due
to some other slipup. The macro expansion error messages are soo helpful...
Thanks,
tglx
On Tue, 2 Apr 2019, Thomas Gleixner wrote:
> On Tue, 2 Apr 2019, Andy Lutomirski wrote:
> > > On Apr 2, 2019, at 9:48 AM, Thomas Gleixner <[email protected]> wrote:
> > >
> > >> On Tue, 2 Apr 2019, Josh Poimboeuf wrote:
> > >>> On Tue, Apr 02, 2019 at 12:19:46PM +0200, Thomas Gleixner wrote:
> > >>> +/*
> > >>> + * Array of exception stack page descriptors. If the stack is larger than
> > >>> + * PAGE_SIZE, all pages covering a particular stack will have the same
> > >>> + * info.
> > >>> + */
> > >>> +static const struct estack_pages estack_pages[ESTACK_PAGES] ____cacheline_aligned = {
> > >>> + [CONDRANGE(DF)] = ESTACK_PAGE(DOUBLEFAULT_IST, DF),
> > >>> + [CONDRANGE(NMI)] = ESTACK_PAGE(NMI_IST, NMI),
> > >>> + [PAGERANGE(DB)] = ESTACK_PAGE(DEBUG_IST, DB),
> > >>> + [CONDRANGE(MCE)] = ESTACK_PAGE(MCE_IST, MCE),
> > >>
> > >> It would be nice if the *_IST macro naming aligned with the struct
> > >> cea_exception_stacks field naming. Then you could just do, e.g.
> > >> ESTACKPAGE(DF).
> > >
> > > Yes, lemme fix that up.
> > >
> > >> Also it's a bit unfortunate that some of the stack size knowledge is
> > >> hard-coded here, i.e #DB always being > 1 page and non-#DB being
> > >> sometimes 1 page.
> > >
> > > The problem is that there is no way to make this macro maze conditional on
> > > sizeof(). But my macro foo is rusty.
> >
> > How about a much better fix: make the DB stack be the same size as all
> > the others and just have 4 of them (DB0, DB1, DB2, and DB3. After all,
> > overflowing from one debug stack into another is just as much of a bug as
> > overflowing into a different IST stack.
>
> That makes sense.
Except that we just have two, not four.
It needs some tweaking of the ist_shift stuff in entry_64.S but that's not
rocket science. Famous last words....
Thanks,
tglx
On 02/04/2019 17.48, Thomas Gleixner wrote:
> On Tue, 2 Apr 2019, Josh Poimboeuf wrote:
>> On Tue, Apr 02, 2019 at 12:19:46PM +0200, Thomas Gleixner wrote:
>>> +/*
>>> + * Array of exception stack page descriptors. If the stack is larger than
>>> + * PAGE_SIZE, all pages covering a particular stack will have the same
>>> + * info.
>>> + */
>>> +static const struct estack_pages estack_pages[ESTACK_PAGES] ____cacheline_aligned = {
>>> + [CONDRANGE(DF)] = ESTACK_PAGE(DOUBLEFAULT_IST, DF),
>>> + [CONDRANGE(NMI)] = ESTACK_PAGE(NMI_IST, NMI),
>>> + [PAGERANGE(DB)] = ESTACK_PAGE(DEBUG_IST, DB),
>>> + [CONDRANGE(MCE)] = ESTACK_PAGE(MCE_IST, MCE),
>>
>> It would be nice if the *_IST macro naming aligned with the struct
>> cea_exception_stacks field naming. Then you could just do, e.g.
>> ESTACKPAGE(DF).
>
> Yes, lemme fix that up.
>
>> Also it's a bit unfortunate that some of the stack size knowledge is
>> hard-coded here, i.e #DB always being > 1 page and non-#DB being
>> sometimes 1 page.
>
> The problem is that there is no way to make this macro maze conditional on
> sizeof(). But my macro foo is rusty.
Eh, but why do you need the CONDRANGE thing at all? [5 ... 5] is a
perfectly fine designator, equivalent to [5]. So you can just use
PAGERANGE in all cases, no?
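For example, this trivial illustration of the GNU range designator
initializes element 2 only, exactly like plain [2] would:

	static const int demo[4] = { [2 ... 2] = 7 };	/* same as [2] = 7 */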
Rasmus
> On Apr 2, 2019, at 1:29 PM, Thomas Gleixner <[email protected]> wrote:
>
>> On Tue, 2 Apr 2019, Thomas Gleixner wrote:
>> On Tue, 2 Apr 2019, Andy Lutomirski wrote:
>>>> On Apr 2, 2019, at 9:48 AM, Thomas Gleixner <[email protected]> wrote:
>>>>
>>>>>> On Tue, 2 Apr 2019, Josh Poimboeuf wrote:
>>>>>> On Tue, Apr 02, 2019 at 12:19:46PM +0200, Thomas Gleixner wrote:
>>>>>> +/*
>>>>>> + * Array of exception stack page descriptors. If the stack is larger than
>>>>>> + * PAGE_SIZE, all pages covering a particular stack will have the same
>>>>>> + * info.
>>>>>> + */
>>>>>> +static const struct estack_pages estack_pages[ESTACK_PAGES] ____cacheline_aligned = {
>>>>>> + [CONDRANGE(DF)] = ESTACK_PAGE(DOUBLEFAULT_IST, DF),
>>>>>> + [CONDRANGE(NMI)] = ESTACK_PAGE(NMI_IST, NMI),
>>>>>> + [PAGERANGE(DB)] = ESTACK_PAGE(DEBUG_IST, DB),
>>>>>> + [CONDRANGE(MCE)] = ESTACK_PAGE(MCE_IST, MCE),
>>>>>
>>>>> It would be nice if the *_IST macro naming aligned with the struct
>>>>> cea_exception_stacks field naming. Then you could just do, e.g.
>>>>> ESTACKPAGE(DF).
>>>>
>>>> Yes, lemme fix that up.
>>>>
>>>>> Also it's a bit unfortunate that some of the stack size knowledge is
>>>>> hard-coded here, i.e #DB always being > 1 page and non-#DB being
>>>>> sometimes 1 page.
>>>>
>>>> The problem is that there is no way to make this macro maze conditional on
>>>> sizeof(). But my macro foo is rusty.
>>>
>>> How about a much better fix: make the DB stack be the same size as all
>>> the others and just have 4 of them (DB0, DB1, DB2, and DB3. After all,
>>> overflowing from one debug stack into another is just as much of a bug as
>>> overflowing into a different IST stack.
>>
>> That makes sense.
>
> Except that we just have two not four.
>
> It needs some tweaking of the ist_shift stuff in entry_64.S but that's not
> rocket science. Famous last words....
>
The ist_shift mess should probably be in C, but that’s a big can of worms. That being said, why do we have it at all? Once upon a time, we’d do ICEBP from user mode (or a legit breakpoint), then send a signal and hit a data breakpoint, and we’d recurse. But we don’t run user debug handlers on the IST stack at all anymore.
Maybe we can convince ourselves it’s safe?
What we should do is check, on IST return, that we’re not about to return to our own stack. Then we can at least properly panic.
On Tue, Apr 02, 2019 at 10:43:29AM -0500, Josh Poimboeuf wrote:
> On Tue, Apr 02, 2019 at 12:19:46PM +0200, Thomas Gleixner wrote:
> > static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
> > {
> > - unsigned long estacks, begin, end, stk = (unsigned long)stack;
> > + unsigned long begin, end, stk = (unsigned long)stack;
> > + const struct estack_pages *ep;
> > struct pt_regs *regs;
> > unsigned int k;
> >
> > BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
> >
> > - estacks = (unsigned long)__this_cpu_read(cea_exception_stacks);
> > -
> > - for (k = 0; k < N_EXCEPTION_STACKS; k++) {
> > - begin = estacks + layout[k].begin;
> > - end = estacks + layout[k].end;
> > - regs = (struct pt_regs *)end - 1;
> > -
> > - if (stk <= begin || stk >= end)
> > - continue;
> > -
> > - info->type = STACK_TYPE_EXCEPTION + k;
> > - info->begin = (unsigned long *)begin;
> > - info->end = (unsigned long *)end;
> > - info->next_sp = (unsigned long *)regs->sp;
> > -
> > - return true;
> > - }
> > -
> > - return false;
> > + begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
> > + end = begin + sizeof(struct cea_exception_stacks);
> > + /* Bail if @stack is outside the exception stack area. */
> > + if (stk <= begin || stk >= end)
> > + return false;
>
> This check is the most important piece. Exception stack dumps are quite
> rare, so this ensures an early exit in most cases regardless of whether
> there's a loop below.
>
> > +
> > + /* Calc page offset from start of exception stacks */
> > + k = (stk - begin) >> PAGE_SHIFT;
> > + /* Lookup the page descriptor */
> > + ep = &estack_pages[k];
> > + /* Guard page? */
> > + if (unlikely(!ep->size))
> > + return false;
> > +
> > + begin += (unsigned long)ep->offs;
> > + end = begin + (unsigned long)ep->size;
> > + regs = (struct pt_regs *)end - 1;
> > +
> > + info->type = ep->type;
> > + info->begin = (unsigned long *)begin;
> > + info->end = (unsigned long *)end;
> > + info->next_sp = (unsigned long *)regs->sp;
> > + return true;
>
> With the above "(stk <= begin || stk >= end)" check, removing the loop
> becomes not all that important since exception stack dumps are quite
> rare and not performance sensitive. With all the macros this code
> becomes a little more obtuse, so I'm not sure whether removal of the
> loop is a net positive.
Are you sure? perf does a lot of stack dumps from NMI context.
On Tue, Apr 02, 2019 at 10:51:49AM -0500, Josh Poimboeuf wrote:
> On Tue, Apr 02, 2019 at 05:48:56PM +0200, Thomas Gleixner wrote:
> > > With the above "(stk <= begin || stk >= end)" check, removing the loop
> > > becomes not all that important since exception stack dumps are quite
> > > rare and not performance sensitive. With all the macros this code
> > > becomes a little more obtuse, so I'm not sure whether removal of the
> > > loop is a net positive.
> >
> > What about perf? It's NMI context and probably starts from there. Peter?
>
> I believe perf unwinds starting from the regs from the context which was
> interrupted by the NMI.
Aah, indeed. So then we only see exception stacks when the NMI lands in
an exception, which is, as you say, quite rare.
On Wed, Apr 03, 2019 at 10:08:28AM +0200, Peter Zijlstra wrote:
> On Tue, Apr 02, 2019 at 10:51:49AM -0500, Josh Poimboeuf wrote:
> > On Tue, Apr 02, 2019 at 05:48:56PM +0200, Thomas Gleixner wrote:
> > > > With the above "(stk <= begin || stk >= end)" check, removing the loop
> > > > becomes not all that important since exception stack dumps are quite
> > > > rare and not performance sensitive. With all the macros this code
> > > > becomes a little more obtuse, so I'm not sure whether removal of the
> > > > loop is a net positive.
> > >
> > > What about perf? It's NMI context and probably starts from there. Peter?
> >
> > I believe perf unwinds starting from the regs from the context which was
> > interrupted by the NMI.
>
> Aah, indeed. So then we only see exception stacks when the NMI lands in
> an exception, which is, as you say, quite rare.
Aah, ftrace OTOH might still trigger this a lot. When you do the function
tracer with stacktrace enabled it'll do unwinds _everywhere_.
On Wed, Apr 03, 2019 at 10:10:41AM +0200, Peter Zijlstra wrote:
> On Wed, Apr 03, 2019 at 10:08:28AM +0200, Peter Zijlstra wrote:
> > On Tue, Apr 02, 2019 at 10:51:49AM -0500, Josh Poimboeuf wrote:
> > > On Tue, Apr 02, 2019 at 05:48:56PM +0200, Thomas Gleixner wrote:
> > > > > With the above "(stk <= begin || stk >= end)" check, removing the loop
> > > > > becomes not all that important since exception stack dumps are quite
> > > > > rare and not performance sensitive. With all the macros this code
> > > > > becomes a little more obtuse, so I'm not sure whether removal of the
> > > > > loop is a net positive.
> > > >
> > > > What about perf? It's NMI context and probably starts from there. Peter?
> > >
> > > I believe perf unwinds starting from the regs from the context which was
> > > interrupted by the NMI.
> >
> > Aah, indeed. So then we only see exception stacks when the NMI lands in
> > an exception, which is, as you say, quite rare.
>
> Aah, ftrace OTOH might still trigger this lots. When you do function
> tracer with stacktrace enabled it'll do unwinds _everywhere_.
Even then, ftrace stacktrace will be really slow regardless, and this
loop removal would be a tiny performance improvement for a tiny fraction
of those stack traces. Unless the improvement is measurable I would
personally rather err on the side of code readability.
--
Josh
On Tue, 2 Apr 2019, Andy Lutomirski wrote:
> > On Apr 2, 2019, at 1:29 PM, Thomas Gleixner <[email protected]> wrote:
> >>> How about a much better fix: make the DB stack be the same size as all
> >>> the others and just have 4 of them (DB0, DB1, DB2, and DB3. After all,
> >>> overflowing from one debug stack into another is just as much of a bug as
> >>> overflowing into a different IST stack.
> >>
> >> That makes sense.
> >
> > Except that we just have two not four.
> >
> > It needs some tweaking of the ist_shift stuff in entry_64.S but that's not
> > rocket science. Famous last words....
> >
>
> The ist_shift mess should probably be in C, but that’s a big can of
> worms. That being said, why do we have it at all? Once upon a time, we’d
> do ICEBP from user mode (or a legit breakpoint), then send a signal and
> hit a data breakpoint, and we’d recurse. But we don’t run user debug
> handlers on the IST stack at all anymore.
>
> Maybe we can convince ourselves it’s safe?
Maybe. Need to think about it for a while.
Thanks,
tglx
On Wed, 3 Apr 2019, Thomas Gleixner wrote:
> On Tue, 2 Apr 2019, Andy Lutomirski wrote:
> > > On Apr 2, 2019, at 1:29 PM, Thomas Gleixner <[email protected]> wrote:
> > >>> How about a much better fix: make the DB stack be the same size as all
> > >>> the others and just have 4 of them (DB0, DB1, DB2, and DB3. After all,
> > >>> overflowing from one debug stack into another is just as much of a bug as
> > >>> overflowing into a different IST stack.
> > >>
> > >> That makes sense.
> > >
> > > Except that we just have two not four.
> > >
> > > It needs some tweaking of the ist_shift stuff in entry_64.S but that's not
> > > rocket science. Famous last words....
> > >
> >
> > The ist_shift mess should probably be in C, but that’s a big can of
> > worms. That being said, why do we have it at all? Once upon a time, we’d
> > do ICEBP from user mode (or a legit breakpoint), then send a signal and
> > hit a data breakpoint, and we’d recurse. But we don’t run user debug
> > handlers on the IST stack at all anymore.
> >
> > Maybe we can convince ourselves it’s safe?
>
> Maybe. Need to think about it for a while.
What about kprobes? It has nasty reentrancy stuff as well...
Thanks,
tglx
On Wed, Apr 3, 2019 at 12:42 PM Thomas Gleixner <[email protected]> wrote:
>
> On Wed, 3 Apr 2019, Thomas Gleixner wrote:
> > On Tue, 2 Apr 2019, Andy Lutomirski wrote:
> > > > On Apr 2, 2019, at 1:29 PM, Thomas Gleixner <[email protected]> wrote:
> > > >>> How about a much better fix: make the DB stack be the same size as all
> > > >>> the others and just have 4 of them (DB0, DB1, DB2, and DB3. After all,
> > > >>> overflowing from one debug stack into another is just as much of a bug as
> > > >>> overflowing into a different IST stack.
> > > >>
> > > >> That makes sense.
> > > >
> > > > Except that we just have two not four.
> > > >
> > > > It needs some tweaking of the ist_shift stuff in entry_64.S but that's not
> > > > rocket science. Famous last words....
> > > >
> > >
> > > The ist_shift mess should probably be in C, but that’s a big can of
> > > worms. That being said, why do we have it at all? Once upon a time, we’d
> > > do ICEBP from user mode (or a legit breakpoint), then send a signal and
> > > hit a data breakpoint, and we’d recurse. But we don’t run user debug
> > > handlers on the IST stack at all anymore.
> > >
> > > Maybe we can convince ourselves it’s safe?
> >
> > Maybe. Need to think about it for a while.
>
> What about kprobes. It has nasty reentrancy stuff as well...
>
Hmm. We used to have #BP on the same stack, and I bet there were
plenty of ways to get #DB and #BP inside each other.
I bet the best solution is to set dr7 to 0 before we do anything
complicated in do_debug() (at least if we got there from kernel mode).
But we should probably make this a whole separate project after your
series is done.
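Something like this, as a rough sketch only (untested, and assuming
nothing else needs DR7 while the handler runs):

	dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
	{
		unsigned long dr7;

		/* Suppress further #DB recursion while handling this one */
		get_debugreg(dr7, 7);
		if (!user_mode(regs))
			set_debugreg(0, 7);

		/* ... existing #DB handling ... */

		if (!user_mode(regs))
			set_debugreg(dr7, 7);
	}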