2009-03-05 04:40:32

by K.Prasad

[permalink] [raw]
Subject: [patch 06/11] Use virtual debug registers in process/thread handling code

From: Alan Stern <[email protected]>

This patch enables the use of abstract/virtual debug registers in
process-handling routines.

[K.Prasad: Split-out from the bigger patch and minor changes following
re-basing]

Signed-off-by: K.Prasad <[email protected]>
Signed-off-by: Alan Stern <[email protected]>
---
arch/x86/kernel/process_32.c | 43 +++++++++++++++++++++++++------------------
arch/x86/kernel/process_64.c | 41 ++++++++++++++++++++++++-----------------
2 files changed, 49 insertions(+), 35 deletions(-)

Index: linux-2.6-tip/arch/x86/kernel/process_32.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/process_32.c
+++ linux-2.6-tip/arch/x86/kernel/process_32.c
@@ -59,6 +59,8 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>

asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

@@ -233,6 +235,8 @@ EXPORT_SYMBOL(kernel_thread);
*/
void exit_thread(void)
{
+ struct task_struct *tsk = current;
+
/* The process may have allocated an io port bitmap... nuke it. */
if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
struct task_struct *tsk = current;
@@ -253,6 +257,8 @@ void exit_thread(void)
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
put_cpu();
}
+ if (unlikely(tsk->thread.hw_breakpoint_info))
+ flush_thread_hw_breakpoint(tsk);

ds_exit_thread(current);
}
@@ -261,14 +267,9 @@ void flush_thread(void)
{
struct task_struct *tsk = current;

- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
+ if (unlikely(tsk->thread.hw_breakpoint_info))
+ flush_thread_hw_breakpoint(tsk);
/*
* Forget coprocessor state..
*/
@@ -312,7 +313,15 @@ int copy_thread(int nr, unsigned long cl

task_user_gs(p) = get_user_gs(regs);

+ p->thread.hw_breakpoint_info = NULL;
+ p->thread.io_bitmap_ptr = NULL;
+
tsk = current;
+ err = -ENOMEM;
+ if (unlikely(tsk->thread.hw_breakpoint_info)) {
+ if (copy_thread_hw_breakpoint(tsk, p, clone_flags))
+ goto out;
+ }
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
@@ -331,11 +340,13 @@ int copy_thread(int nr, unsigned long cl
if (clone_flags & CLONE_SETTLS)
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
-
+ out:
if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
+ if (err)
+ flush_thread_hw_breakpoint(p);

ds_copy_thread(p, current);

@@ -437,16 +448,6 @@ __switch_to_xtra(struct task_struct *pre
else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);

- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- set_debugreg(next->debugreg0, 0);
- set_debugreg(next->debugreg1, 1);
- set_debugreg(next->debugreg2, 2);
- set_debugreg(next->debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(next->debugreg6, 6);
- set_debugreg(next->debugreg7, 7);
- }
-
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */
@@ -595,6 +596,12 @@ __switch_to(struct task_struct *prev_p,

percpu_write(current_task, next_p);

+ /*
+ * Handle debug registers. This must be done _after_ current
+ * is updated.
+ */
+ if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+ switch_to_thread_hw_breakpoint(next_p);
return prev_p;
}

Index: linux-2.6-tip/arch/x86/kernel/process_64.c
===================================================================
--- linux-2.6-tip.orig/arch/x86/kernel/process_64.c
+++ linux-2.6-tip/arch/x86/kernel/process_64.c
@@ -53,6 +53,8 @@
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
+#include <asm/debugreg.h>
+#include <asm/hw_breakpoint.h>
#include <asm/syscalls.h>
#include <asm/ds.h>

@@ -277,13 +279,9 @@ void flush_thread(void)
}
clear_tsk_thread_flag(tsk, TIF_DEBUG);

- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ if (unlikely(tsk->thread.hw_breakpoint_info))
+ flush_thread_hw_breakpoint(tsk);
/*
* Forget coprocessor state..
*/
@@ -303,6 +301,8 @@ void release_thread(struct task_struct *
BUG();
}
}
+ if (unlikely(me->thread.hw_breakpoint_info))
+ flush_thread_hw_breakpoint(me);
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
@@ -358,13 +358,21 @@ int copy_thread(int nr, unsigned long cl

p->thread.fs = me->thread.fs;
p->thread.gs = me->thread.gs;
+ p->thread.hw_breakpoint_info = NULL;
+ p->thread.io_bitmap_ptr = NULL;

savesegment(gs, p->thread.gsindex);
savesegment(fs, p->thread.fsindex);
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);

- if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
+ err = -ENOMEM;
+ if (unlikely(me->thread.hw_breakpoint_info)) {
+ if (copy_thread_hw_breakpoint(me, p, clone_flags))
+ goto out;
+ }
+
+if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
@@ -401,6 +409,9 @@ out:
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
+ if (err)
+ flush_thread_hw_breakpoint(p);
+
return err;
}

@@ -503,16 +514,6 @@ static inline void __switch_to_xtra(stru
else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);

- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- loaddebug(next, 0);
- loaddebug(next, 1);
- loaddebug(next, 2);
- loaddebug(next, 3);
- /* no 4 and 5 */
- loaddebug(next, 6);
- loaddebug(next, 7);
- }
-
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */
@@ -535,6 +536,12 @@ static inline void __switch_to_xtra(stru
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
+ /*
+ * Handle debug registers. This must be done _after_ current
+ * is updated.
+ */
+ if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
+ switch_to_thread_hw_breakpoint(next_p);
}

/*


2009-03-10 14:50:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 06/11] Use virtual debug registers in process/thread handling code


* [email protected] <[email protected]> wrote:

> @@ -437,16 +448,6 @@ __switch_to_xtra(struct task_struct *pre
> else if (next->debugctlmsr != prev->debugctlmsr)
> update_debugctlmsr(next->debugctlmsr);
>
> - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
> - set_debugreg(next->debugreg0, 0);
> - set_debugreg(next->debugreg1, 1);
> - set_debugreg(next->debugreg2, 2);
> - set_debugreg(next->debugreg3, 3);
> - /* no 4 and 5 */
> - set_debugreg(next->debugreg6, 6);
> - set_debugreg(next->debugreg7, 7);
> - }
> -
> if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
> test_tsk_thread_flag(next_p, TIF_NOTSC)) {
> /* prev and next are different */
> @@ -595,6 +596,12 @@ __switch_to(struct task_struct *prev_p,
>
> percpu_write(current_task, next_p);
>
> + /*
> + * Handle debug registers. This must be done _after_ current
> + * is updated.
> + */
> + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
> + switch_to_thread_hw_breakpoint(next_p);

why does this have to be called after 'current' has been
updated? AFAICS switch_to_thread_hw_breakpoint() does not take a
look at 'current'.

Speaking of switch_to_thread_hw_breakpoint(), i dont like that
function at all:

- why does it have to do a list of debug registers?

- why does it worry about IPIs arriving when context-switches on
x86 are always done with interrupts disabled?

- also, what do the ->installed() and ->uninstalled() callbacks
do - nothing uses it!

Ingo

2009-03-10 16:08:09

by Alan Stern

[permalink] [raw]
Subject: Re: [patch 06/11] Use virtual debug registers in process/thread handling code

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> > @@ -595,6 +596,12 @@ __switch_to(struct task_struct *prev_p,
> >
> > percpu_write(current_task, next_p);
> >
> > + /*
> > + * Handle debug registers. This must be done _after_ current
> > + * is updated.
> > + */
> > + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
> > + switch_to_thread_hw_breakpoint(next_p);
>
> why does this have to be called after 'current' has been
> updated? AFAICS switch_to_thread_hw_breakpoint() does not take a
> look at 'current'.

There was a discussion about this on LKML last October 17, and you were
in the CC list. Here is the reason, extracted from one of those
messages:


There's a problem with moving the switch_to_thread_hw_breakpoint() call
before current is updated. Suppose a kernel breakpoint is triggered in
between the two. The hw-breakpoint handler will see that current is
different from the task pointer stored in the chbi area, so it will
think the task pointer is leftover from an old task (lazy switching)
and will erase it. Then until the next context switch, no
user-breakpoints will be installed.

The real problem is that it's impossible to update both current and
chbi->bp_task at the same instant, so there will always be a window in
which they disagree and a breakpoint might get triggered. Since we use
lazy switching, we are forced to assume that a disagreement means that
current is correct and chbi->bp_task is old. But if you move the code
above then you'll create a window in which current is old and
chbi->bp_task is correct.


> Speaking of switch_to_thread_hw_breakpoint(), i dont like that
> function at all:
>
> - why does it have to do a list of debug registers?

I'm not sure I understand the point of this question. Are you asking
why the hw_breakpoint structures are stored on a list? Because there
can be an arbitrarily large number of them.

> - why does it worry about IPIs arriving when context-switches on
> x86 are always done with interrupts disabled?

The routine gets invoked at times other than during a context switch.
However you may be right that these times are all mutually exclusive.
If so then a good deal of complication can be removed.

> - also, what do the ->installed() and ->uninstalled() callbacks
> do - nothing uses it!

What do you mean? They do what any callback does. And of course
nothing uses them -- the code hasn't been merged yet!

The intention is to let programs (or kernel debuggers) know when the
statistics they are gathering are contaminated because the breakpoint
in question has been uninstalled, and when the statistics are again
valid because the breakpoint has been re-installed.

Alan Stern

2009-03-10 16:59:21

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 06/11] Use virtual debug registers in process/thread handling code


* Alan Stern <[email protected]> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
>
> > > @@ -595,6 +596,12 @@ __switch_to(struct task_struct *prev_p,
> > >
> > > percpu_write(current_task, next_p);
> > >
> > > + /*
> > > + * Handle debug registers. This must be done _after_ current
> > > + * is updated.
> > > + */
> > > + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
> > > + switch_to_thread_hw_breakpoint(next_p);
> >
> > why does this have to be called after 'current' has been
> > updated? AFAICS switch_to_thread_hw_breakpoint() does not take a
> > look at 'current'.
>
> There was a discussion about this on LKML last October 17, and
> you were in the CC list. [...]

I am on the Cc: list of thousands of messages per month.
Consider it a very volatile form of storage.

Instead put these:

> There's a problem with moving the
> switch_to_thread_hw_breakpoint() call before current is
> updated. Suppose a kernel breakpoint is triggered in between
> the two. The hw-breakpoint handler will see that current is
> different from the task pointer stored in the chbi area, so it
> will think the task pointer is leftover from an old task (lazy
> switching) and will erase it. Then until the next context
> switch, no user-breakpoints will be installed.
>
> The real problem is that it's impossible to update both
> current and chbi->bp_task at the same instant, so there will
> always be a window in which they disagree and a breakpoint
> might get triggered. Since we use lazy switching, we are
> forced to assume that a disagreement means that current is
> correct and chbi->bp_task is old. But if you move the code
> above then you'll create a window in which current is old and
> chbi->bp_task is correct.

inside these:

/*
* ......
*/

Thanks,

Ingo

2009-03-10 17:07:44

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 06/11] Use virtual debug registers in process/thread handling code


* Alan Stern <[email protected]> wrote:

> > Speaking of switch_to_thread_hw_breakpoint(), i dont like
> > that function at all:
> >
> > - why does it have to do a list of debug registers?
>
> I'm not sure I understand the point of this question. Are you
> asking why the hw_breakpoint structures are stored on a list?
> Because there can be an arbitrarily large number of them.

But that does not make much sense. There's just 4 hardware
registers. There's no sane way to overcommit them hence we
_should not_.

> > - why does it worry about IPIs arriving when context-switches on
> > x86 are always done with interrupts disabled?
>
> The routine gets invoked at times other than during a context
> switch. However you may be right that these times are all
> mutually exclusive. If so then a good deal of complication
> can be removed.

Yes.

> > - also, what do the ->installed() and ->uninstalled() callbacks
> > do - nothing uses it!
>
> What do you mean? They do what any callback does. And of
> course nothing uses them -- the code hasn't been merged yet!

No need to get testy - i'm the maintainer and you are trying to
get stuff into two subsystems i maintain. I ask such questions
when i see something added that has no immediate purpose.

If a later patch needs a particular facility then submit it
together with that use. It's not that hard to add callbacks -
but right now it just distracts from the immediate purpose of
these patches.

And please dont try to get stuff merged if you are not willing
to answer simple questions like that in a constructive way.

Ingo

2009-03-10 20:10:48

by Alan Stern

[permalink] [raw]
Subject: Re: [patch 06/11] Use virtual debug registers in process/thread handling code

On Tue, 10 Mar 2009, Ingo Molnar wrote:

> * Alan Stern <[email protected]> wrote:
>
> > > Speaking of switch_to_thread_hw_breakpoint(), i dont like
> > > that function at all:
> > >
> > > - why does it have to do a list of debug registers?
> >
> > I'm not sure I understand the point of this question. Are you
> > asking why the hw_breakpoint structures are stored on a list?
> > Because there can be an arbitrarily large number of them.
>
> But that does not make much sense. There's just 4 hardware
> registers. There's no sane way to overcommit them hence we
> _should not_.

The number of hardware registers will vary according to the
architecture. Our intention was to make the hardware breakpoint
interface architecture-neutral, as nearly as possible. Hence we
decided to let callers register arbitrary numbers of breakpoints, and
inform them when the breakpoints actually got installed in or
uninstalled from the debug registers.

If you think this design decision is a bad one, we can discuss it. But
Roland should be involved, because it is in large part his design.

> > > - why does it worry about IPIs arriving when context-switches on
> > > x86 are always done with interrupts disabled?
> >
> > The routine gets invoked at times other than during a context
> > switch. However you may be right that these times are all
> > mutually exclusive. If so then a good deal of complication
> > can be removed.
>
> Yes.

After looking through it more carefully, I think you're right -- if a
kernel breakpoint change does occur while
switch_to_thread_hw_breakpoint() is running then the IPI will arrive
immediately afterward, so there's no need to check for it explicitly.
(When this was written I probably wasn't aware that interrupts are
disabled during context switches.) So all the stuff involving "goto
restart" can be removed.


> > > - also, what do the ->installed() and ->uninstalled() callbacks
> > > do - nothing uses it!
> >
> > What do you mean? They do what any callback does. And of
> > course nothing uses them -- the code hasn't been merged yet!
>
> No need to get testy - i'm the maintainer and you are trying to
> get stuff into two subsystems i maintain. I ask such questions
> when i see something added that has no immediate purpose.

Email is hopeless for conveying emotional nuances. I didn't intend
that statement to sound testy; if it did I apologize.

> If a later patch needs a particular facility then submit it
> together with that use. It's not that hard to add callbacks -
> but right now it just distracts from the immediate purpose of
> these patches.

Prasad can take out the callback parts for now. And if we do change
the design so that breakpoints don't get installed and uninstalled at
random times then the callbacks won't be needed at all.

> And please dont try to get stuff merged if you are not willing
> to answer simple questions like that in a constructive way.

Wasn't the remainder of that reply (the part you omitted) constructive?

Alan Stern

2009-03-11 11:53:41

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 06/11] Use virtual debug registers in process/thread handling code


* Alan Stern <[email protected]> wrote:

> On Tue, 10 Mar 2009, Ingo Molnar wrote:
>
> > * Alan Stern <[email protected]> wrote:
> >
> > > > Speaking of switch_to_thread_hw_breakpoint(), i dont like
> > > > that function at all:
> > > >
> > > > - why does it have to do a list of debug registers?
> > >
> > > I'm not sure I understand the point of this question. Are you
> > > asking why the hw_breakpoint structures are stored on a list?
> > > Because there can be an arbitrarily large number of them.
> >
> > But that does not make much sense. There's just 4 hardware
> > registers. There's no sane way to overcommit them hence we
> > _should not_.
>
> The number of hardware registers will vary according to the
> architecture. Our intention was to make the hardware
> breakpoint interface architecture-neutral, as nearly as
> possible. Hence we decided to let callers register arbitrary
> numbers of breakpoints, and inform them when the breakpoints
> actually got installed in or uninstalled from the debug
> registers.

This may sound as handwaving, but the thing is, it's best to do
these kinds of things gradually. Keep it clean, design for sane
hardware first (and x86, as a rare exception i guess, is rather
sane when it comes to hw debug features), add quirks on an
as-needed basis.

That principle is _especially_ true when a feature with
borderline utility is merged. We had to do that with KGDB: had
to strip down a decade of cruft and it really helped.

> If you think this design decision is a bad one, we can discuss
> it. But Roland should be involved, because it is in large
> part his design.

Sure.

> > > > - why does it worry about IPIs arriving when context-switches on
> > > > x86 are always done with interrupts disabled?
> > >
> > > The routine gets invoked at times other than during a
> > > context switch. However you may be right that these times
> > > are all mutually exclusive. If so then a good deal of
> > > complication can be removed.
> >
> > Yes.
>
> After looking through it more carefully, I think you're right
> -- if a kernel breakpoint change does occur while
> switch_to_thread_hw_breakpoint() is running then the IPI will
> arrive immediately afterward, so there's no need to check for
> it explicitly. (When this was written I probably wasn't aware
> that interrupts are disabled during context switches.) So all
> the stuff involving "goto restart" can be removed.

Good - that certainly makes the code we execute during
context-switch a lot more palatable.

Ingo