Hello.
IIRC Steven agreed with these changes, but nobody picked them up.
Resend, perhaps Andrew can help...
Oleg.
syscall_regfunc() and syscall_unregfunc() should set/clear
TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race
with copy_process() and miss the new child which was not added to
init_task.tasks list yet.
Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT
under tasklist.
While at it,
- remove _irqsafe from syscall_regfunc/syscall_unregfunc,
read_lock(tasklist) doesn't need to disable irqs.
- change syscall_unregfunc() to check PF_KTHREAD to skip
the kernel threads, ->mm != NULL is the common mistake.
Note: probably this check should be simply removed, needs
another patch.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/fork.c | 7 +++++++
kernel/tracepoint.c | 12 +++++-------
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 1766d32..8184a53 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1472,7 +1472,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(&current->sighand->siglock);
+#ifdef CONFIG_TRACEPOINTS
+ if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+ set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
+ else
+ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
+#endif
write_unlock_irq(&tasklist_lock);
+
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 0c05a45..a16754b 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -732,33 +732,31 @@ static int sys_tracepoint_refcount;
void syscall_regfunc(void)
{
- unsigned long flags;
struct task_struct *g, *t;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
+ read_lock(&tasklist_lock);
do_each_thread(g, t) {
/* Skip kernel threads. */
- if (t->mm)
+ if (!(t->flags & PF_KTHREAD))
set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
} while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ read_unlock(&tasklist_lock);
}
sys_tracepoint_refcount++;
}
void syscall_unregfunc(void)
{
- unsigned long flags;
struct task_struct *g, *t;
sys_tracepoint_refcount--;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
+ read_lock(&tasklist_lock);
do_each_thread(g, t) {
clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
} while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ read_unlock(&tasklist_lock);
}
}
#endif
--
1.5.5.1
syscall_regfunc() ignores kernel threads because "it has
no effect", see cc3b13c1 "Don't trace kernel thread syscalls".
However, this means that a user-space task spawned by
call_usermodehelper() won't report the system calls if
kernel_execve() is called when sys_tracepoint_refcount != 0.
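IOW, roughly:

	/* sys_tracepoint_refcount != 0, syscall_regfunc() has already run
	   and skipped every PF_KTHREAD task */

	call_usermodehelper(...)
	  -> forks a kernel thread, TIF_SYSCALL_TRACEPOINT is not set
	    -> kernel_execve()
	      -> PF_KTHREAD is cleared, this is a normal user-space task
	         now, but nothing sets TIF_SYSCALL_TRACEPOINT and its
	         syscalls are never reported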
Remove this check. Hopefully the unnecessary report from
ret_from_fork path mentioned by cc3b13c1 is fine. In fact
"this is the only case" is not true. Say, kernel_execve()
itself does "int 80" on X86_32. Hopefully fine too.
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/tracepoint.c | 4 +---
1 files changed, 1 insertions(+), 3 deletions(-)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index a16754b..4e1e4ca 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -737,9 +737,7 @@ void syscall_regfunc(void)
if (!sys_tracepoint_refcount) {
read_lock(&tasklist_lock);
do_each_thread(g, t) {
- /* Skip kernel threads. */
- if (!(t->flags & PF_KTHREAD))
- set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
} while_each_thread(g, t);
read_unlock(&tasklist_lock);
}
--
1.5.5.1
On Sun, 2013-03-17 at 19:28 +0100, Oleg Nesterov wrote:
> syscall_regfunc() and syscall_unregfunc() should set/clear
> TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race
> with copy_process() and miss the new child which was not added to
> init_task.tasks list yet.
>
> Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT
> under tasklist.
Is this because "p = dup_task_struct(current);" is outside the lock?
Probably should state this in the change log.
>
> While at it,
>
> - remove _irqsafe from syscall_regfunc/syscall_unregfunc,
> read_lock(tasklist) doesn't need to disable irqs.
I'm fine with this.
>
> - change syscall_unregfunc() to check PF_KTHREAD to skip
> the kernel threads, ->mm != NULL is the common mistake.
I'm fine with this too.
>
> Note: probably this check should be simply removed, needs
> another patch.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> kernel/fork.c | 7 +++++++
> kernel/tracepoint.c | 12 +++++-------
> 2 files changed, 12 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1766d32..8184a53 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1472,7 +1472,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>
> total_forks++;
> spin_unlock(&current->sighand->siglock);
> +#ifdef CONFIG_TRACEPOINTS
> + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> + set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> + else
> + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> +#endif
I hate seeing #ifdef code like this in C files. Can you add a function
to set this in include/trace/syscall.h:
#ifdef CONFIG_TRACEPOINTS
static inline void syscall_tracepoint_update(struct task_struct *p)
{
if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
else
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
}
#else
static inline void syscall_tracepoint_update(struct task_struct *p) {}
#endif
Then in copy_process() just have:
syscall_tracepoint_update(p);
Thanks,
-- Steve
> write_unlock_irq(&tasklist_lock);
> +
> proc_fork_connector(p);
> cgroup_post_fork(p);
> if (clone_flags & CLONE_THREAD)
> diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
> index 0c05a45..a16754b 100644
> --- a/kernel/tracepoint.c
> +++ b/kernel/tracepoint.c
> @@ -732,33 +732,31 @@ static int sys_tracepoint_refcount;
>
> void syscall_regfunc(void)
> {
> - unsigned long flags;
> struct task_struct *g, *t;
>
> if (!sys_tracepoint_refcount) {
> - read_lock_irqsave(&tasklist_lock, flags);
> + read_lock(&tasklist_lock);
> do_each_thread(g, t) {
> /* Skip kernel threads. */
> - if (t->mm)
> + if (!(t->flags & PF_KTHREAD))
> set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
> } while_each_thread(g, t);
> - read_unlock_irqrestore(&tasklist_lock, flags);
> + read_unlock(&tasklist_lock);
> }
> sys_tracepoint_refcount++;
> }
>
> void syscall_unregfunc(void)
> {
> - unsigned long flags;
> struct task_struct *g, *t;
>
> sys_tracepoint_refcount--;
> if (!sys_tracepoint_refcount) {
> - read_lock_irqsave(&tasklist_lock, flags);
> + read_lock(&tasklist_lock);
> do_each_thread(g, t) {
> clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
> } while_each_thread(g, t);
> - read_unlock_irqrestore(&tasklist_lock, flags);
> + read_unlock(&tasklist_lock);
> }
> }
> #endif
On Sun, 2013-03-17 at 19:28 +0100, Oleg Nesterov wrote:
> syscall_regfunc() ignores kernel threads because "it has
> no effect", see cc3b13c1 "Don't trace kernel thread syscalls".
>
> However, this means that a user-space task spawned by
> call_usermodehelper() won't report the system calls if
> kernel_execve() is called when sys_tracepoint_refcount != 0.
>
> Remove this check. Hopefully the unnecessary report from
> ret_from_fork path mentioned by cc3b13c1 is fine. In fact
> "this is the only case" is not true. Say, kernel_execve()
> itself does "int 80" on X86_32. Hopefully fine too.
>
I'm really thinking the TIF_SYSCALL_TRACEPOINT flag is getting a bit
ridiculous. We really should have a "swap the syscall table when tracepoints
are enabled" mechanism that switches to a table doing exactly the same thing
as the normal table but wrapping each system call with the tracepoints.
Something that we are looking to do with interrupts.
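Something roughly like this, say for read() -- completely untested, the
traced_sys_read() name is made up and the generated wrappers would need a
bit more care than this:

	asmlinkage long traced_sys_read(unsigned int fd, char __user *buf,
					size_t count)
	{
		struct pt_regs *regs = task_pt_regs(current);
		long ret;

		/* same work as the normal table entry, plus the tracepoints */
		trace_sys_enter(regs, __NR_read);
		ret = sys_read(fd, buf, count);
		trace_sys_exit(regs, ret);

		return ret;
	}

and the "traced" table would just point at these wrappers.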
Although this may not be that trivial, as this seems to be the method used to
trace system calls not only on x86, but also on PowerPC, ARM, s390, Sparc,
and sh.
-- Steve
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> kernel/tracepoint.c | 4 +---
> 1 files changed, 1 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
> index a16754b..4e1e4ca 100644
> --- a/kernel/tracepoint.c
> +++ b/kernel/tracepoint.c
> @@ -737,9 +737,7 @@ void syscall_regfunc(void)
> if (!sys_tracepoint_refcount) {
> read_lock(&tasklist_lock);
> do_each_thread(g, t) {
> - /* Skip kernel threads. */
> - if (!(t->flags & PF_KTHREAD))
> - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
> + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
> } while_each_thread(g, t);
> read_unlock(&tasklist_lock);
> }
On 03/17, Steven Rostedt wrote:
>
> On Sun, 2013-03-17 at 19:28 +0100, Oleg Nesterov wrote:
> > syscall_regfunc() and syscall_unregfunc() should set/clear
> > TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race
> > with copy_process() and miss the new child which was not added to
> > init_task.tasks list yet.
> >
> > Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT
> > under tasklist.
>
> Is this because "p = dup_task_struct(current);" is outside the lock?
> Probably should state this in the change log.
Not only, syscall_regfunc/syscall_unregfunc can miss the new child.
Just suppose that syscall_regfunc() takes tasklist right before the
forking task tries to take it for writing and and the child to the
list.
> > +#ifdef CONFIG_TRACEPOINTS
> > + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> > + set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> > + else
> > + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> > +#endif
>
> I hate seeing #ifdef code like this in C files. Can you add a function
> to set this in include/trace/syscall.h:
It seems that everyone hates them, except me ;)
> #ifdef CONFIG_TRACEPOINTS
> static inline void syscall_tracepoint_update(struct task_struct *p)
> {
> if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> else
> clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> }
> #else
> static inline void syscall_tracepoint_update(struct task_struct *p) {}
> #endif
OK, thanks, will do. But perhaps tracepoint_fork() would be better?
Oleg.
On 03/17, Steven Rostedt wrote:
>
> On Sun, 2013-03-17 at 19:28 +0100, Oleg Nesterov wrote:
> > syscall_regfunc() ignores kernel threads because "it has
> > no effect", see cc3b13c1 "Don't trace kernel thread syscalls".
> >
> > However, this means that a user-space task spawned by
> > call_usermodehelper() won't report the system calls if
> > kernel_execve() is called when sys_tracepoint_refcount != 0.
> >
> > Remove this check. Hopefully the unnecessary report from
> > ret_from_fork path mentioned by cc3b13c1 is fine. In fact
> > "this is the only case" is not true. Say, kernel_execve()
> > itself does "int 80" on X86_32. Hopefully fine too.
>
> I'm really thinking the TIF_SYSCALL_TRACEPOINT flag is getting a bit
> ridiculous. We really should have a "swap the syscall table when tracepoints
> are enabled" mechanism that switches to a table doing exactly the same thing
> as the normal table but wrapping each system call with the tracepoints.
But we also need to force the slow path in system_call...
Anyway, do you agree with this change for now?
Oleg.
On Sun, 2013-03-17 at 20:00 +0100, Oleg Nesterov wrote:
> On 03/17, Steven Rostedt wrote:
> >
> > On Sun, 2013-03-17 at 19:28 +0100, Oleg Nesterov wrote:
> > > syscall_regfunc() and syscall_unregfunc() should set/clear
> > > TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race
> > > with copy_process() and miss the new child which was not added to
> > > init_task.tasks list yet.
> > >
> > > Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT
> > > under tasklist.
> >
> > Is this because "p = dup_task_struct(current);" is outside the lock?
> > Probably should state this in the change log.
>
> Not only, syscall_regfunc/syscall_unregfunc can miss the new child.
>
> Just suppose that syscall_regfunc() takes tasklist right before the
> forking task tries to take it for writing and and the child to the
> list.
I'm a bit confused by the above. Maybe it's the typo with the "and and"
that's confusing me.
>
> > > +#ifdef CONFIG_TRACEPOINTS
> > > + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> > > + set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> > > + else
> > > + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> > > +#endif
> >
> > I hate seeing #ifdef code like this in C files. Can you add a function
> > to set this in include/trace/syscall.h:
>
> It seems that everyone hates them, except me ;)
>
> > #ifdef CONFIG_TRACEPOINTS
> > static inline void syscall_tracepoint_update(struct task_struct *p)
> > {
> > if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> > set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> > else
> > clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> > }
> > #else
> > static inline void syscall_tracepoint_update(struct task_struct *p) {}
> > #endif
>
> OK, thanks, will do. But perhaps tracepoint_fork() would be better?
tracepoint_fork() sounds too much like trace_fork(), which would be
considered a tracepoint. Seeing tracepoint_fork() would make me think it
has something to do with the fork tracepoint.
Do we plan on doing anything other than updating the syscall tracepoint
flag here? I find the "syscall_tracepoint_update()" very descriptive of
what is actually happening. While reading the fork code, seeing
'syscall_tracepoint_update()' would tell me that this has something to
do with syscall tracepoints, which it does. But tracepoint_fork() would
have me think something completely different.
-- Steve
On Sun, 2013-03-17 at 20:04 +0100, Oleg Nesterov wrote:
> > I'm really thinking the TIF_SYSCALL_TRACEPOINT flag is getting a bit
> > ridiculous. We really should have a "swap the syscall table when tracepoints
> > are enabled" mechanism that switches to a table doing exactly the same thing
> > as the normal table but wrapping each system call with the tracepoints.
>
> But we also need to force the slow path in system_call...
Why? If we remove the tracepoint from the slowpath and use a table swap,
then we wouldn't need to use the slowpath at all.
>
> Anyway, do you agree with this change for now?
Well, if it's solving a bug today sure. But we should really be looking
at fixing what's there for the future.
-- Steve
On 03/17, Steven Rostedt wrote:
>
> On Sun, 2013-03-17 at 20:04 +0100, Oleg Nesterov wrote:
>
> > > I'm really thinking the TIF_SYSCALL_TRACEPOINT flag is getting a bit
> > > ridiculous. We really should have a "swap the syscall table when tracepoints
> > > are enabled" mechanism that switches to a table doing exactly the same thing
> > > as the normal table but wrapping each system call with the tracepoints.
> >
> > But we also need to force the slow path in system_call...
>
> Why? If we remove the tracepoint from the slowpath and use a table swap,
> then we wouldn't need to use the slowpath at all.
Ah, indeed, you are right.
> > Anyway, do you agree with this change for now?
>
> Well, if it's solving a bug today sure. But we should really be looking
> at fixing what's there for the future.
OK, thanks.
Oleg.
On 03/17, Steven Rostedt wrote:
>
> On Sun, 2013-03-17 at 20:00 +0100, Oleg Nesterov wrote:
> > On 03/17, Steven Rostedt wrote:
> > >
> > > > Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT
> > > > under tasklist.
> > >
> > > Is this because "p = dup_task_struct(current);" is outside the lock?
> > > Probably should state this in the change log.
> >
> > Not only, syscall_regfunc/syscall_unregfunc can miss the new child.
> >
> > Just suppose that syscall_regfunc() takes tasklist right before the
> > forking task tries to take it for writing and and the child to the
> > list.
>
> I'm a bit confused by the above. Maybe it's the typo with the "and and"
> that's confusing me.
Yes, "and and" was supposed to be "and add".
But probably I misunderstood you before... Well yes, this is because
"p = dup_task_struct(current)" copies TIF_SYSCALL_TRACEPOINT outside
of the tasklist-protected section, which is also what makes the new task
visible to do_each_thread().
IOW, the state of the TIF_SYSCALL_TRACEPOINT bit can be correct after
dup_task_struct(), but it can't be updated until copy_process() adds
the child to the list.
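To spell it out, the race is roughly:

	copy_process():
		p = dup_task_struct(current);
		/* p's TIF_SYSCALL_TRACEPOINT is a copy of the parent's, clear */

				syscall_regfunc():
					read_lock(&tasklist_lock);
					do_each_thread(g, t) {
						set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
					} while_each_thread(g, t);	/* doesn't see p */
					read_unlock(&tasklist_lock);

	copy_process():
		write_lock_irq(&tasklist_lock);
		list_add_tail_rcu(&p->tasks, &init_task.tasks);
		write_unlock_irq(&tasklist_lock);
		/* the new child runs with TIF_SYSCALL_TRACEPOINT clear */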
> > OK, thanks, will do. But perhaps tracepoint_fork() would be better?
>
> tracepoint_fork() sounds too much like trace_fork(), which would be
> considered a tracepoint. Seeing tracepoint_fork() would make me think it
> has something to do with the fork tracepoint.
>
> Do we plan on doing anything other than updating the syscall tracepoint
> flag here? I find the "syscall_tracepoint_update()" very descriptive of
> what is actually happening. While reading the fork code, seeing
> 'syscall_tracepoint_update()' would tell me that this has something to
> do with syscall tracepoints, which it does. But tracepoint_fork() would
> have me think something completely different.
OK, thanks, I am sending v2 in reply to v1.
Oleg.
syscall_regfunc() and syscall_unregfunc() should set/clear
TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race
with copy_process() and miss the new child which was not added to
init_task.tasks list yet.
Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT
under tasklist.
While at it,
- remove _irqsafe from syscall_regfunc/syscall_unregfunc,
read_lock(tasklist) doesn't need to disable irqs.
- change syscall_unregfunc() to check PF_KTHREAD to skip
the kernel threads, ->mm != NULL is the common mistake.
Note: probably this check should be simply removed, needs
another patch.
Signed-off-by: Oleg Nesterov <[email protected]>
---
include/trace/syscall.h | 15 +++++++++++++++
kernel/fork.c | 2 ++
kernel/tracepoint.c | 12 +++++-------
3 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 84bc419..15a954b 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -4,6 +4,7 @@
#include <linux/tracepoint.h>
#include <linux/unistd.h>
#include <linux/ftrace_event.h>
+#include <linux/thread_info.h>
#include <asm/ptrace.h>
@@ -31,4 +32,18 @@ struct syscall_metadata {
struct ftrace_event_call *exit_event;
};
+#ifdef CONFIG_TRACEPOINTS
+static inline void syscall_tracepoint_update(struct task_struct *p)
+{
+ if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+ set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
+ else
+ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
+}
+#else
+static inline void syscall_tracepoint_update(struct task_struct *p)
+{
+}
+#endif
+
#endif /* _TRACE_SYSCALL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1766d32..e463f99 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1472,7 +1472,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(&current->sighand->siglock);
+ syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 0c05a45..a16754b 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -732,33 +732,31 @@ static int sys_tracepoint_refcount;
void syscall_regfunc(void)
{
- unsigned long flags;
struct task_struct *g, *t;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
+ read_lock(&tasklist_lock);
do_each_thread(g, t) {
/* Skip kernel threads. */
- if (t->mm)
+ if (!(t->flags & PF_KTHREAD))
set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
} while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ read_unlock(&tasklist_lock);
}
sys_tracepoint_refcount++;
}
void syscall_unregfunc(void)
{
- unsigned long flags;
struct task_struct *g, *t;
sys_tracepoint_refcount--;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
+ read_lock(&tasklist_lock);
do_each_thread(g, t) {
clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
} while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ read_unlock(&tasklist_lock);
}
}
#endif
--
1.5.5.1
Steven Rostedt <[email protected]> wrote:
> Why? If we remove the tracepoint from the slowpath and use a table swap,
> then we wouldn't need to use the slowpath at all.
How are you engineering a table swap? Do you patch the system call code to
change the immediate address loaded or do you put in a level of indirection?
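For the latter I'd imagine something like this (made-up names, just to
illustrate):

	typedef asmlinkage long (*syscall_fn_t)(unsigned long, unsigned long,
						unsigned long, unsigned long,
						unsigned long, unsigned long);

	extern const syscall_fn_t sys_call_table[];		/* the normal one */
	extern const syscall_fn_t traced_sys_call_table[];	/* wrapped entries */

	/* the syscall entry code loads this instead of referencing
	   sys_call_table directly; enabling the tracepoints flips it */
	const syscall_fn_t *active_sys_call_table = sys_call_table;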
David
On Tue, 2013-03-19 at 15:10 +0000, David Howells wrote:
> Steven Rostedt <[email protected]> wrote:
>
> > Why? If we remove the tracepoint from the slowpath and use a table swap,
> > then we wouldn't need to use the slowpath at all.
>
> How are you engineering a table swap? Do you patch the system call code to
> change the immediate address loaded or do you put in a level of indirection?
Patching the call site would probably be the easiest method.
We've gotten pretty good at doing that ;-)
-- Steve
On 03/19/2013 08:36 AM, Steven Rostedt wrote:
> On Tue, 2013-03-19 at 15:10 +0000, David Howells wrote:
>> Steven Rostedt <[email protected]> wrote:
>>
>>> Why? If we remove the tracepoint from the slowpath and use a table swap,
>>> then we wouldn't need to use the slowpath at all.
>>
>> How are you engineering a table swap? Do you patch the system call code to
>> change the immediate address loaded or do you put in a level of indirection?
>
> Patching the call site would probably be the easiest method.
>
> We've gotten pretty good at doing that ;-)
>
Yes, given that the machinery is already there we might as well use it.
-hpa
On Mon, 2013-03-18 at 17:34 +0100, Oleg Nesterov wrote:
> syscall_regfunc() and syscall_unregfunc() should set/clear
> TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race
> with copy_process() and miss the new child which was not added to
> init_task.tasks list yet.
>
> Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT
> under tasklist.
>
> While at it,
>
> - remove _irqsafe from syscall_regfunc/syscall_unregfunc,
> read_lock(tasklist) doesn't need to disable irqs.
>
> - change syscall_unregfunc() to check PF_KTHREAD to skip
> the kernel threads, ->mm != NULL is the common mistake.
>
> Note: probably this check should be simply removed, needs
> another patch.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
Acked-by: Steven Rostedt <[email protected]>
-- Steve
> ---
> include/trace/syscall.h | 15 +++++++++++++++
> kernel/fork.c | 2 ++
> kernel/tracepoint.c | 12 +++++-------
> 3 files changed, 22 insertions(+), 7 deletions(-)
>
> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
> index 84bc419..15a954b 100644
> --- a/include/trace/syscall.h
> +++ b/include/trace/syscall.h
> @@ -4,6 +4,7 @@
> #include <linux/tracepoint.h>
> #include <linux/unistd.h>
> #include <linux/ftrace_event.h>
> +#include <linux/thread_info.h>
>
> #include <asm/ptrace.h>
>
> @@ -31,4 +32,18 @@ struct syscall_metadata {
> struct ftrace_event_call *exit_event;
> };
>
> +#ifdef CONFIG_TRACEPOINTS
> +static inline void syscall_tracepoint_update(struct task_struct *p)
> +{
> + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> + set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> + else
> + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT);
> +}
> +#else
> +static inline void syscall_tracepoint_update(struct task_struct *p)
> +{
> +}
> +#endif
> +
> #endif /* _TRACE_SYSCALL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1766d32..e463f99 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1472,7 +1472,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>
> total_forks++;
> spin_unlock(&current->sighand->siglock);
> + syscall_tracepoint_update(p);
> write_unlock_irq(&tasklist_lock);
> +
> proc_fork_connector(p);
> cgroup_post_fork(p);
> if (clone_flags & CLONE_THREAD)
> diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
> index 0c05a45..a16754b 100644
> --- a/kernel/tracepoint.c
> +++ b/kernel/tracepoint.c
> @@ -732,33 +732,31 @@ static int sys_tracepoint_refcount;
>
> void syscall_regfunc(void)
> {
> - unsigned long flags;
> struct task_struct *g, *t;
>
> if (!sys_tracepoint_refcount) {
> - read_lock_irqsave(&tasklist_lock, flags);
> + read_lock(&tasklist_lock);
> do_each_thread(g, t) {
> /* Skip kernel threads. */
> - if (t->mm)
> + if (!(t->flags & PF_KTHREAD))
> set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
> } while_each_thread(g, t);
> - read_unlock_irqrestore(&tasklist_lock, flags);
> + read_unlock(&tasklist_lock);
> }
> sys_tracepoint_refcount++;
> }
>
> void syscall_unregfunc(void)
> {
> - unsigned long flags;
> struct task_struct *g, *t;
>
> sys_tracepoint_refcount--;
> if (!sys_tracepoint_refcount) {
> - read_lock_irqsave(&tasklist_lock, flags);
> + read_lock(&tasklist_lock);
> do_each_thread(g, t) {
> clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
> } while_each_thread(g, t);
> - read_unlock_irqrestore(&tasklist_lock, flags);
> + read_unlock(&tasklist_lock);
> }
> }
> #endif