Trace analysis code needs a coherent picture of the set of processes
and threads running on a system. While it's possible to enumerate all
tasks via /proc, this enumeration is not atomic. If PID numbering
rolls over during snapshot collection, the resulting snapshot of the
process and thread state of the system may be incoherent, confusing
trace analysis tools. The fundamental problem is that if a PID is
reused during a userspace scan of /proc, it's impossible to tell, in
post-processing, whether a fact that the userspace /proc scanner
reports regarding a given PID refers to the old or new task named by
that PID, as the scan of that PID may or may not have occurred before
the PID reuse, and there's no way to "stamp" a fact read from the
kernel with a trace timestamp.
This change adds a per-pid-namespace 64-bit generation number,
incremented on PID rollover, and exposes it via a new proc file
/proc/pid_generation. By examining this file before and after /proc
enumeration, user code can detect the potential reuse of a PID and
restart the task enumeration process, repeating until it gets a
coherent snapshot.
PID rollover ought to be rare, so in practice, scan repetitions will
be rare.
Signed-off-by: Daniel Colascione <[email protected]>
---
Documentation/filesystems/proc.txt | 1 +
include/linux/pid.h | 1 +
include/linux/pid_namespace.h | 2 ++
init/main.c | 1 +
kernel/pid.c | 36 +++++++++++++++++++++++++++++-
5 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 12a5e6e693b6..f58a359f9a2c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -615,6 +615,7 @@ Table 1-5: Kernel info in /proc
partitions Table of partitions known to the system
pci Deprecated info of PCI bus (new way -> /proc/bus/pci/,
decoupled by lspci (2.4)
+ pid_gen PID rollover count
rtc Real time clock
scsi SCSI info (see text)
slabinfo Slab pool info
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..2e4b41a32e86 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -112,6 +112,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern u64 read_pid_generation(struct pid_namespace *ns);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 49538b172483..fa92ae66fb98 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -44,6 +44,7 @@ struct pid_namespace {
kgid_t pid_gid;
int hide_pid;
int reboot; /* group exit code if this pidns was rebooted */
+ u64 generation; /* incremented on wraparound */
struct ns_common ns;
} __randomize_layout;
@@ -99,5 +100,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);
+void pid_proc_init(void);
#endif /* _LINUX_PID_NS_H */
diff --git a/init/main.c b/init/main.c
index ee147103ba1b..20c595e852c6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -730,6 +730,7 @@ asmlinkage __visible void __init start_kernel(void)
cgroup_init();
taskstats_init_early();
delayacct_init();
+ pid_proc_init();
check_bugs();
diff --git a/kernel/pid.c b/kernel/pid.c
index b2f6c506035d..cd5f4aa8eb55 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -174,6 +174,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
for (i = ns->level; i >= 0; i--) {
int pid_min = 1;
+ unsigned int old_cursor;
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
@@ -182,7 +183,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
* init really needs pid 1, but after reaching the maximum
* wrap back to RESERVED_PIDS
*/
- if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ old_cursor = idr_get_cursor(&tmp->idr);
+ if (old_cursor > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
/*
@@ -191,6 +193,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
+ if (unlikely(idr_get_cursor(&tmp->idr) <= old_cursor))
+ tmp->generation += 1;
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@@ -246,6 +250,16 @@ struct pid *alloc_pid(struct pid_namespace *ns)
return ERR_PTR(retval);
}
+u64 read_pid_generation(struct pid_namespace *ns)
+{
+ u64 generation;
+
+ spin_lock_irq(&pidmap_lock);
+ generation = ns->generation;
+ spin_unlock_irq(&pidmap_lock);
+ return generation;
+}
+
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
@@ -449,6 +463,17 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return idr_get_next(&ns->idr, &nr);
}
+#ifdef CONFIG_PROC_FS
+static int pid_generation_show(struct seq_file *m, void *v)
+{
+ u64 generation =
+ read_pid_generation(proc_pid_ns(file_inode(m->file)));
+ seq_printf(m, "%llu\n", generation);
+ return 0;
+
+};
+#endif
+
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
@@ -465,4 +490,13 @@ void __init pid_idr_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+
+}
+
+void __init pid_proc_init(void)
+{
+ /* pid_idr_init is too early, so get a separate init function. */
+#ifdef CONFIG_PROC_FS
+ WARN_ON(!proc_create_single("pid_gen", 0, NULL, pid_generation_show));
+#endif
}
--
2.19.1.1215.g8438c0b245-goog
On Wed, Nov 21, 2018 at 12:14:44PM -0800, Daniel Colascione wrote:
> This change adds a per-pid-namespace 64-bit generation number,
> incremented on PID rollover, and exposes it via a new proc file
> /proc/pid_generation. By examining this file before and after /proc
> enumeration, user code can detect the potential reuse of a PID and
> restart the task enumeration process, repeating until it gets a
> coherent snapshot.
>
> PID rollover ought to be rare, so in practice, scan repetitions will
> be rare.
Then why does it need to be 64-bit?
Trace analysis code needs a coherent picture of the set of processes
and threads running on a system. While it's possible to enumerate all
tasks via /proc, this enumeration is not atomic. If PID numbering
rolls over during snapshot collection, the resulting snapshot of the
process and thread state of the system may be incoherent, confusing
trace analysis tools. The fundamental problem is that if a PID is
reused during a userspace scan of /proc, it's impossible to tell, in
post-processing, whether a fact that the userspace /proc scanner
reports regarding a given PID refers to the old or new task named by
that PID, as the scan of that PID may or may not have occurred before
the PID reuse, and there's no way to "stamp" a fact read from the
kernel with a trace timestamp.
This change adds a per-pid-namespace 64-bit generation number,
incremented on PID rollover, and exposes it via a new proc file
/proc/pid_gen. By examining this file before and after /proc
enumeration, user code can detect the potential reuse of a PID and
restart the task enumeration process, repeating until it gets a
coherent snapshot.
PID rollover ought to be rare, so in practice, scan repetitions will
be rare.
Signed-off-by: Daniel Colascione <[email protected]>
---
Make commit message match the code.
Documentation/filesystems/proc.txt | 1 +
include/linux/pid.h | 1 +
include/linux/pid_namespace.h | 2 ++
init/main.c | 1 +
kernel/pid.c | 36 +++++++++++++++++++++++++++++-
5 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 12a5e6e693b6..f58a359f9a2c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -615,6 +615,7 @@ Table 1-5: Kernel info in /proc
partitions Table of partitions known to the system
pci Deprecated info of PCI bus (new way -> /proc/bus/pci/,
decoupled by lspci (2.4)
+ pid_gen PID rollover count
rtc Real time clock
scsi SCSI info (see text)
slabinfo Slab pool info
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..2e4b41a32e86 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -112,6 +112,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern u64 read_pid_generation(struct pid_namespace *ns);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 49538b172483..fa92ae66fb98 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -44,6 +44,7 @@ struct pid_namespace {
kgid_t pid_gid;
int hide_pid;
int reboot; /* group exit code if this pidns was rebooted */
+ u64 generation; /* incremented on wraparound */
struct ns_common ns;
} __randomize_layout;
@@ -99,5 +100,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);
+void pid_proc_init(void);
#endif /* _LINUX_PID_NS_H */
diff --git a/init/main.c b/init/main.c
index ee147103ba1b..20c595e852c6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -730,6 +730,7 @@ asmlinkage __visible void __init start_kernel(void)
cgroup_init();
taskstats_init_early();
delayacct_init();
+ pid_proc_init();
check_bugs();
diff --git a/kernel/pid.c b/kernel/pid.c
index b2f6c506035d..cd5f4aa8eb55 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -174,6 +174,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
for (i = ns->level; i >= 0; i--) {
int pid_min = 1;
+ unsigned int old_cursor;
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
@@ -182,7 +183,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
* init really needs pid 1, but after reaching the maximum
* wrap back to RESERVED_PIDS
*/
- if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ old_cursor = idr_get_cursor(&tmp->idr);
+ if (old_cursor > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
/*
@@ -191,6 +193,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
+ if (unlikely(idr_get_cursor(&tmp->idr) <= old_cursor))
+ tmp->generation += 1;
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@@ -246,6 +250,16 @@ struct pid *alloc_pid(struct pid_namespace *ns)
return ERR_PTR(retval);
}
+u64 read_pid_generation(struct pid_namespace *ns)
+{
+ u64 generation;
+
+ spin_lock_irq(&pidmap_lock);
+ generation = ns->generation;
+ spin_unlock_irq(&pidmap_lock);
+ return generation;
+}
+
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
@@ -449,6 +463,17 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return idr_get_next(&ns->idr, &nr);
}
+#ifdef CONFIG_PROC_FS
+static int pid_generation_show(struct seq_file *m, void *v)
+{
+ u64 generation =
+ read_pid_generation(proc_pid_ns(file_inode(m->file)));
+ seq_printf(m, "%llu\n", generation);
+ return 0;
+
+};
+#endif
+
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
@@ -465,4 +490,13 @@ void __init pid_idr_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+
+}
+
+void __init pid_proc_init(void)
+{
+ /* pid_idr_init is too early, so get a separate init function. */
+#ifdef CONFIG_PROC_FS
+ WARN_ON(!proc_create_single("pid_gen", 0, NULL, pid_generation_show));
+#endif
}
--
2.19.1.1215.g8438c0b245-goog
On Wed, Nov 21, 2018 at 12:31 PM Matthew Wilcox <[email protected]> wrote:
>
> On Wed, Nov 21, 2018 at 12:14:44PM -0800, Daniel Colascione wrote:
> > This change adds a per-pid-namespace 64-bit generation number,
> > incremented on PID rollover, and exposes it via a new proc file
> > /proc/pid_generation. By examining this file before and after /proc
> > enumeration, user code can detect the potential reuse of a PID and
> > restart the task enumeration process, repeating until it gets a
> > coherent snapshot.
> >
> > PID rollover ought to be rare, so in practice, scan repetitions will
> > be rare.
>
> Then why does it need to be 64-bit?
[Resending because of accidental HTML. I really need to switch to a
better email client.]
Because 64 bits is enough for anyone. :-) A u64 is big enough that
we'll never observe an overflow on a running system, and PID
namespaces are rare enough that we won't miss the four extra bytes we
use by upgrading from a u32. And after reading about some security
problems caused by too-clever handling of 32-bit rollover, I'd rather
the code be obviously correct than save a trivial amount of space.
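For concreteness, here's a back-of-the-envelope check (an illustration
only, not part of the patch): even at an absurd one rollover per
microsecond, a u64 counter doesn't wrap within any plausible system
lifetime.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Deliberately absurd rollover rate: one per microsecond. */
	const double rollovers_per_sec = 1e6;
	const double secs_per_year = 60.0 * 60 * 24 * 365;
	double years = (double)UINT64_MAX / rollovers_per_sec / secs_per_year;

	/* Prints roughly 585,000 years. */
	printf("u64 generation counter wraps after ~%.0f years\n", years);
	return 0;
}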
On Wed, 21 Nov 2018 12:54:20 -0800 Daniel Colascione <[email protected]> wrote:
> Trace analysis code needs a coherent picture of the set of processes
> and threads running on a system. While it's possible to enumerate all
> tasks via /proc, this enumeration is not atomic. If PID numbering
> rolls over during snapshot collection, the resulting snapshot of the
> process and thread state of the system may be incoherent, confusing
> trace analysis tools. The fundamental problem is that if a PID is
> reused during a userspace scan of /proc, it's impossible to tell, in
> post-processing, whether a fact that the userspace /proc scanner
> reports regarding a given PID refers to the old or new task named by
> that PID, as the scan of that PID may or may not have occurred before
> the PID reuse, and there's no way to "stamp" a fact read from the
> kernel with a trace timestamp.
>
> This change adds a per-pid-namespace 64-bit generation number,
> incremented on PID rollover, and exposes it via a new proc file
> /proc/pid_gen. By examining this file before and after /proc
> enumeration, user code can detect the potential reuse of a PID and
> restart the task enumeration process, repeating until it gets a
> coherent snapshot.
>
> PID rollover ought to be rare, so in practice, scan repetitions will
> be rare.
In general, tracing is a rather specialized thing. Why is this very
occasional confusion a sufficiently serious problem to warrant addition
of this code?
Which userspace tools will be using pid_gen? Are the developers of
those tools signed up to use pid_gen?
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -112,6 +112,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
> int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
>
> extern struct pid *alloc_pid(struct pid_namespace *ns);
> +extern u64 read_pid_generation(struct pid_namespace *ns);
pid_generation_read() would be a better (and more idiomatic) name.
> extern void free_pid(struct pid *pid);
> extern void disable_pid_allocation(struct pid_namespace *ns);
>
> ...
>
> +u64 read_pid_generation(struct pid_namespace *ns)
> +{
> + u64 generation;
> +
> +
> + spin_lock_irq(&pidmap_lock);
> + generation = ns->generation;
> + spin_unlock_irq(&pidmap_lock);
> + return generation;
> +}
What is the spinlocking in here for? afaict the only purpose it serves
is to make the 64-bit read atomic, so it isn't needed on 32-bit?
> void disable_pid_allocation(struct pid_namespace *ns)
> {
> spin_lock_irq(&pidmap_lock);
> @@ -449,6 +463,17 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
> return idr_get_next(&ns->idr, &nr);
> }
>
> +#ifdef CONFIG_PROC_FS
> +static int pid_generation_show(struct seq_file *m, void *v)
> +{
> + u64 generation =
> + read_pid_generation(proc_pid_ns(file_inode(m->file)));
u64 generation;
generation = read_pid_generation(proc_pid_ns(file_inode(m->file)));
is a nicer way of avoiding column wrap.
> + seq_printf(m, "%llu\n", generation);
> + return 0;
> +
> +};
> +#endif
> +
> void __init pid_idr_init(void)
> {
> /* Verify no one has done anything silly: */
> @@ -465,4 +490,13 @@ void __init pid_idr_init(void)
>
> init_pid_ns.pid_cachep = KMEM_CACHE(pid,
> SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
> +
> +}
> +
> +void __init pid_proc_init(void)
> +{
> + /* pid_idr_init is too early, so get a separate init function. */
s/get a/use a/
> +#ifdef CONFIG_PROC_FS
> + WARN_ON(!proc_create_single("pid_gen", 0, NULL, pid_generation_show));
> +#endif
> }
This whole function could vanish if !CONFIG_PROC_FS. Doesn't matter
much with __init code though.
On Wed, Nov 21, 2018 at 11:40 PM Daniel Colascione <[email protected]> wrote:
> On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
> > On Wed, 21 Nov 2018 12:54:20 -0800 Daniel Colascione <[email protected]> wrote:
> > > +u64 read_pid_generation(struct pid_namespace *ns)
> > > +{
> > > + u64 generation;
> > > +
> > > +
> > > + spin_lock_irq(&pidmap_lock);
> > > + generation = ns->generation;
> > > + spin_unlock_irq(&pidmap_lock);
> > > + return generation;
> > > +}
> >
> > What is the spinlocking in here for? afaict the only purpose it serves
> > is to make the 64-bit read atomic, so it isn't needed on 32-bit?
>
> ITYM the spinlock is necessary *only* on 32-bit, since 64-bit
> architectures have atomic 64-bit reads, and 64-bit reads on 32-bit
> architectures can tear. This function isn't a particularly hot path,
> so I thought consistency across architectures would be more valuable
> than avoiding the lock on some systems.
Linux has atomic64_t/atomic64_read()/atomic64_inc() for this, which
should automatically do the right thing - processor-supported atomic
ops when possible, spinlock otherwise.
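Something along these lines, to sketch what I mean (untested, reusing
the names from your patch):

/* include/linux/pid_namespace.h: make the counter an atomic64_t */
	atomic64_t generation;	/* incremented on wraparound */

/* kernel/pid.c, in alloc_pid(), still under pidmap_lock: */
	if (unlikely(idr_get_cursor(&tmp->idr) <= old_cursor))
		atomic64_inc(&tmp->generation);

/* kernel/pid.c: the reader then needs no lock on any architecture */
u64 read_pid_generation(struct pid_namespace *ns)
{
	return (u64)atomic64_read(&ns->generation);
}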
On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
>
> On Wed, 21 Nov 2018 12:54:20 -0800 Daniel Colascione <[email protected]> wrote:
>
> > Trace analysis code needs a coherent picture of the set of processes
> > and threads running on a system. While it's possible to enumerate all
> > tasks via /proc, this enumeration is not atomic. If PID numbering
> > rolls over during snapshot collection, the resulting snapshot of the
> > process and thread state of the system may be incoherent, confusing
> > trace analysis tools. The fundamental problem is that if a PID is
> > reused during a userspace scan of /proc, it's impossible to tell, in
> > post-processing, whether a fact that the userspace /proc scanner
> > reports regarding a given PID refers to the old or new task named by
> > that PID, as the scan of that PID may or may not have occurred before
> > the PID reuse, and there's no way to "stamp" a fact read from the
> > kernel with a trace timestamp.
> >
> > This change adds a per-pid-namespace 64-bit generation number,
> > incremented on PID rollover, and exposes it via a new proc file
> > /proc/pid_gen. By examining this file before and after /proc
> > enumeration, user code can detect the potential reuse of a PID and
> > restart the task enumeration process, repeating until it gets a
> > coherent snapshot.
> >
> > PID rollover ought to be rare, so in practice, scan repetitions will
> > be rare.
>
> In general, tracing is a rather specialized thing. Why is this very
> occasional confusion a sufficiently serious problem to warrant addition
> of this code?
I wouldn't call tracing a specialized thing: it's important enough to
justify its own summit and a whole ecosystem of trace collection and
analysis tools. We use it every day in Android. It's tremendously
helpful for understanding system behavior, especially in cases where
multiple components interact in ways that we can't readily predict or
replicate. Reliability and precision in this area are essential:
retrospective analysis of difficult-to-reproduce problems involves
puzzling over trace files and testing hypothesis, and when the trace
system itself is occasionally unreliable, the set of hypothesis to
consider grows. I've tried to keep the amount of kernel infrastructure
needed to support this precision and reliability to a minimum, pushing
most of the complexity to userspace. But we do need, from the kernel,
reliable process disambiguation.
Besides: things like checkpoint and restart are also non-core
features, but the kernel has plenty of infrastructure to support them.
We're talking about a very lightweight feature in this thread.
> Which userspace tools will be using pid_gen? Are the developers of
> those tools signed up to use pid_gen?
I'll be changing Android tracing tools to capture process snapshots
using pid_gen, using the algorithm in the commit message.
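Roughly this loop, as a minimal sketch (the real tools do the actual
/proc parsing where the comment marks the scan, and this of course
assumes a kernel with the patch applied):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long read_pid_gen(void)
{
	FILE *f = fopen("/proc/pid_gen", "r");
	unsigned long long gen;

	if (!f || fscanf(f, "%llu", &gen) != 1)
		abort();
	fclose(f);
	return gen;
}

int main(void)
{
	unsigned long long before, after;

	do {
		before = read_pid_gen();
		/* ... enumerate /proc/<pid> and /proc/<pid>/task here ... */
		after = read_pid_gen();
	} while (before != after);	/* rollover: a PID may have been reused, so rescan */

	return 0;
}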
> > --- a/include/linux/pid.h
> > +++ b/include/linux/pid.h
> > @@ -112,6 +112,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
> > int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
> >
> > extern struct pid *alloc_pid(struct pid_namespace *ns);
> > +extern u64 read_pid_generation(struct pid_namespace *ns);
>
> pig_generation_read() would be a better (and more idiomatic) name.
Thanks. I'll change it.
>
> > extern void free_pid(struct pid *pid);
> > extern void disable_pid_allocation(struct pid_namespace *ns);
> >
> > ...
> >
> > +u64 read_pid_generation(struct pid_namespace *ns)
> > +{
> > + u64 generation;
> > +
> > +
> > + spin_lock_irq(&pidmap_lock);
> > + generation = ns->generation;
> > + spin_unlock_irq(&pidmap_lock);
> > + return generation;
> > +}
>
> What is the spinlocking in here for? afaict the only purpose it serves
> is to make the 64-bit read atomic, so it isn't needed on 32-bit?
ITYM the spinlock is necessary *only* on 32-bit, since 64-bit
architectures have atomic 64-bit reads, and 64-bit reads on 32-bit
architectures can tear. This function isn't a particularly hot path,
so I thought consistency across architectures would be more valuable
than avoiding the lock on some systems.
> > void disable_pid_allocation(struct pid_namespace *ns)
> > {
> > spin_lock_irq(&pidmap_lock);
> > @@ -449,6 +463,17 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
> > return idr_get_next(&ns->idr, &nr);
> > }
> >
> > +#ifdef CONFIG_PROC_FS
> > +static int pid_generation_show(struct seq_file *m, void *v)
> > +{
> > + u64 generation =
> > + read_pid_generation(proc_pid_ns(file_inode(m->file)));
>
> u64 generation;
>
> generation = read_pid_generation(proc_pid_ns(file_inode(m->file)));
>
> is a nicer way of avoiding column wrap.
Sure.
> > + seq_printf(m, "%llu\n", generation);
> > + return 0;
> > +
> > +};
> > +#endif
> > +
> > void __init pid_idr_init(void)
> > {
> > /* Verify no one has done anything silly: */
> > @@ -465,4 +490,13 @@ void __init pid_idr_init(void)
> >
> > init_pid_ns.pid_cachep = KMEM_CACHE(pid,
> > SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
> > +
> > +}
> > +
> > +void __init pid_proc_init(void)
> > +{
> > + /* pid_idr_init is too early, so get a separate init function. */
>
> s/get a/use a/
Will change.
> > +#ifdef CONFIG_PROC_FS
> > + WARN_ON(!proc_create_single("pid_gen", 0, NULL, pid_generation_show));
> > +#endif
> > }
>
> This whole function could vanish if !CONFIG_PROC_FS. Doesn't matter
> much with __init code though.
I wanted to keep the ifdefed region as small as possible. I wonder
whether LTO is good enough to make the function and its call site
disappear entirely in configurations where it has an empty body.
On Wed, 21 Nov 2018 14:40:28 -0800 Daniel Colascione <[email protected]> wrote:
> On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
> >
> > On Wed, 21 Nov 2018 12:54:20 -0800 Daniel Colascione <[email protected]> wrote:
> >
> > > Trace analysis code needs a coherent picture of the set of processes
> > > and threads running on a system. While it's possible to enumerate all
> > > tasks via /proc, this enumeration is not atomic. If PID numbering
> > > rolls over during snapshot collection, the resulting snapshot of the
> > > process and thread state of the system may be incoherent, confusing
> > > trace analysis tools. The fundamental problem is that if a PID is
> > > reused during a userspace scan of /proc, it's impossible to tell, in
> > > post-processing, whether a fact that the userspace /proc scanner
> > > reports regarding a given PID refers to the old or new task named by
> > > that PID, as the scan of that PID may or may not have occurred before
> > > the PID reuse, and there's no way to "stamp" a fact read from the
> > > kernel with a trace timestamp.
> > >
> > > This change adds a per-pid-namespace 64-bit generation number,
> > > incremented on PID rollover, and exposes it via a new proc file
> > > /proc/pid_gen. By examining this file before and after /proc
> > > enumeration, user code can detect the potential reuse of a PID and
> > > restart the task enumeration process, repeating until it gets a
> > > coherent snapshot.
> > >
> > > PID rollover ought to be rare, so in practice, scan repetitions will
> > > be rare.
> >
> > In general, tracing is a rather specialized thing. Why is this very
> > occasional confusion a sufficiently serious problem to warrant addition
> > of this code?
>
> I wouldn't call tracing a specialized thing: it's important enough to
> justify its own summit and a whole ecosystem of trace collection and
> analysis tools. We use it in every day in Android. It's tremendously
> helpful for understanding system behavior, especially in cases where
> multiple components interact in ways that we can't readily predict or
> replicate. Reliability and precision in this area are essential:
> retrospective analysis of difficult-to-reproduce problems involves
> puzzling over trace files and testing hypothesis, and when the trace
> system itself is occasionally unreliable, the set of hypothesis to
> consider grows. I've tried to keep the amount of kernel infrastructure
> needed to support this precision and reliability to a minimum, pushing
> most of the complexity to userspace. But we do need, from the kernel,
> reliable process disambiguation.
>
> Besides: things like checkpoint and restart are also non-core
> features, but the kernel has plenty of infrastructure to support them.
> We're talking about a very lightweight feature in this thread.
I'm still not understanding the seriousness of the problem. Presumably
you've hit problems in real life which were serious and frequent enough
to justify getting down and writing the code. Please share some sob stories
with us!
> > Which userspace tools will be using pid_gen? Are the developers of
> > those tools signed up to use pid_gen?
>
> I'll be changing Android tracing tools to capture process snapshots
> using pid_gen, using the algorithm in the commit message.
Which other tools could use this and what was the feedback from their
developers? Those people are the intended audience and the
best-positioned reviewers so let's hear from them?
> > > +u64 read_pid_generation(struct pid_namespace *ns)
> > > +{
> > > + u64 generation;
> > > +
> > > +
> > > + spin_lock_irq(&pidmap_lock);
> > > + generation = ns->generation;
> > > + spin_unlock_irq(&pidmap_lock);
> > > + return generation;
> > > +}
> >
> > What is the spinlocking in here for? afaict the only purpose it serves
> > is to make the 64-bit read atomic, so it isn't needed on 32-bit?
>
> ITYM the spinlock is necessary *only* on 32-bit, since 64-bit
> architectures have atomic 64-bit reads, and 64-bit reads on 32-bit
> architectures can tear. This function isn't a particularly hot path,
> so I thought consistency across architectures would be more valuable
> than avoiding the lock on some systems.
OK.
On Wed, Nov 21, 2018 at 2:50 PM Andrew Morton <[email protected]> wrote:
>
> On Wed, 21 Nov 2018 14:40:28 -0800 Daniel Colascione <[email protected]> wrote:
>
> > On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
> > >
> > > On Wed, 21 Nov 2018 12:54:20 -0800 Daniel Colascione <[email protected]> wrote:
> > >
> > > > Trace analysis code needs a coherent picture of the set of processes
> > > > and threads running on a system. While it's possible to enumerate all
> > > > tasks via /proc, this enumeration is not atomic. If PID numbering
> > > > rolls over during snapshot collection, the resulting snapshot of the
> > > > process and thread state of the system may be incoherent, confusing
> > > > trace analysis tools. The fundamental problem is that if a PID is
> > > > reused during a userspace scan of /proc, it's impossible to tell, in
> > > > post-processing, whether a fact that the userspace /proc scanner
> > > > reports regarding a given PID refers to the old or new task named by
> > > > that PID, as the scan of that PID may or may not have occurred before
> > > > the PID reuse, and there's no way to "stamp" a fact read from the
> > > > kernel with a trace timestamp.
> > > >
> > > > This change adds a per-pid-namespace 64-bit generation number,
> > > > incremented on PID rollover, and exposes it via a new proc file
> > > > /proc/pid_gen. By examining this file before and after /proc
> > > > enumeration, user code can detect the potential reuse of a PID and
> > > > restart the task enumeration process, repeating until it gets a
> > > > coherent snapshot.
> > > >
> > > > PID rollover ought to be rare, so in practice, scan repetitions will
> > > > be rare.
> > >
> > > In general, tracing is a rather specialized thing. Why is this very
> > > occasional confusion a sufficiently serious problem to warrant addition
> > > of this code?
> >
> > I wouldn't call tracing a specialized thing: it's important enough to
> > justify its own summit and a whole ecosystem of trace collection and
> > analysis tools. We use it in every day in Android. It's tremendously
> > helpful for understanding system behavior, especially in cases where
> > multiple components interact in ways that we can't readily predict or
> > replicate. Reliability and precision in this area are essential:
> > retrospective analysis of difficult-to-reproduce problems involves
> > puzzling over trace files and testing hypothesis, and when the trace
> > system itself is occasionally unreliable, the set of hypothesis to
> > consider grows. I've tried to keep the amount of kernel infrastructure
> > needed to support this precision and reliability to a minimum, pushing
> > most of the complexity to userspace. But we do need, from the kernel,
> > reliable process disambiguation.
> >
> > Besides: things like checkpoint and restart are also non-core
> > features, but the kernel has plenty of infrastructure to support them.
> > We're talking about a very lightweight feature in this thread.
>
> I'm still not understanding the seriousness of the problem. Presumably
> you've hit problems in real-life which were serious and frequent enough
> to justify getting down and writing the code. Please share some sob stories
> with us!
The problem here is the possibility of confusion, even if it's rare.
Does the naive approach of just walking /proc and ignoring the
possibility of PID reuse races work most of the time? Sure. But "most
of the time" isn't good enough. It's not that there are tons of sob
stories: it's that without completely robust reporting, we can't rule
out the possibility that weirdness we observe in a given trace is
actually just an artifact from a kinda-sorta-working best-effort trace
collection system instead of a real anomaly in behavior. Tracing,
essentially, gives us deltas for system state, and without an accurate
baseline, collected via some kind of scan on trace startup, it's
impossible to use these deltas to robustly reconstruct total system
state at a given time. And this matters, because errors in
reconstruction (e.g., assigning a thread to the wrong process because
the IDs happen to be reused) can affect processing of the whole trace.
If it's 3am and I'm analyzing the lone trace from a dogfooder
demonstrating a particularly nasty problem, I don't want to find out
that the trace I'm analyzing ended up being useless because the
kernel's trace system is merely best effort. It's very cheap to be
100% reliable here, so let's be reliable and rule out sources of
error.
> > > Which userspace tools will be using pid_gen? Are the developers of
> > > those tools signed up to use pid_gen?
> >
> > I'll be changing Android tracing tools to capture process snapshots
> > using pid_gen, using the algorithm in the commit message.
>
> Which other tools could use this and what was the feedback from their
> developers?
I'm going to have Android's systrace and Perfetto use this approach.
Exactly how many tools signed up to use this feature do you need?
> Those people are the intended audience and the
> best-positioned reviewers so let's hear from them?
I'm writing plenty of trace analysis tools myself, so I'm part of this
intended audience. Other tracing tool authors have told me about
out-of-tree hacks for process atomic snapshots via ftrace events. This
approach avoids the necessity of these more-invasive hacks.
On Wed, Nov 21, 2018 at 2:49 PM Jann Horn <[email protected]> wrote:
>
> On Wed, Nov 21, 2018 at 11:40 PM Daniel Colascione <[email protected]> wrote:
> > On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
> > > On Wed, 21 Nov 2018 12:54:20 -0800 Daniel Colascione <[email protected]> wrote:
> > > > +u64 read_pid_generation(struct pid_namespace *ns)
> > > > +{
> > > > + u64 generation;
> > > > +
> > > > +
> > > > + spin_lock_irq(&pidmap_lock);
> > > > + generation = ns->generation;
> > > > + spin_unlock_irq(&pidmap_lock);
> > > > + return generation;
> > > > +}
> > >
> > > What is the spinlocking in here for? afaict the only purpose it serves
> > > is to make the 64-bit read atomic, so it isn't needed on 32-bit?
> >
> > ITYM the spinlock is necessary *only* on 32-bit, since 64-bit
> > architectures have atomic 64-bit reads, and 64-bit reads on 32-bit
> > architectures can tear. This function isn't a particularly hot path,
> > so I thought consistency across architectures would be more valuable
> > than avoiding the lock on some systems.
>
> Linux has atomic64_t/atomic64_read()/atomic64_inc() for this, which
> should automatically do the right thing - processor-supported atomic
> ops when possible, spinlock otherwise.
I wanted to take advantage of the existing spinlock to synchronize
instead of adding more atomic operations to the rollover path.
On Wed, 21 Nov 2018 15:21:40 -0800 Daniel Colascione <[email protected]> wrote:
> On Wed, Nov 21, 2018 at 2:50 PM Andrew Morton <[email protected]> wrote:
> >
> > On Wed, 21 Nov 2018 14:40:28 -0800 Daniel Colascione <[email protected]> wrote:
> >
> > > On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
>
> ...
>
> > > I wouldn't call tracing a specialized thing: it's important enough to
> > > justify its own summit and a whole ecosystem of trace collection and
> > > analysis tools. We use it in every day in Android. It's tremendously
> > > helpful for understanding system behavior, especially in cases where
> > > multiple components interact in ways that we can't readily predict or
> > > replicate. Reliability and precision in this area are essential:
> > > retrospective analysis of difficult-to-reproduce problems involves
> > > puzzling over trace files and testing hypothesis, and when the trace
> > > system itself is occasionally unreliable, the set of hypothesis to
> > > consider grows. I've tried to keep the amount of kernel infrastructure
> > > needed to support this precision and reliability to a minimum, pushing
> > > most of the complexity to userspace. But we do need, from the kernel,
> > > reliable process disambiguation.
> > >
> > > Besides: things like checkpoint and restart are also non-core
> > > features, but the kernel has plenty of infrastructure to support them.
> > > We're talking about a very lightweight feature in this thread.
> >
> > I'm still not understanding the seriousness of the problem. Presumably
> > you've hit problems in real-life which were serious and frequent enough
> > to justify getting down and writing the code. Please share some sob stories
> > with us!
>
> The problem here is the possibility of confusion, even if it's rare.
> Does the naive approach of just walking /proc and ignoring the
> possibility of PID reuse races work most of the time? Sure. But "most
> of the time" isn't good enough. It's not that there are tons of sob
> stories: it's that without completely robust reporting, we can't rule
> out of the possibility that weirdness we observe in a given trace is
> actually just an artifact from a kinda-sort-working best-effort trace
> collection system instead of a real anomaly in behavior. Tracing,
> essentially, gives us deltas for system state, and without an accurate
> baseline, collected via some kind of scan on trace startup, it's
> impossible to use these deltas to robustly reconstruct total system
> state at a given time. And this matters, because errors in
> reconstruction (e.g., assigning a thread to the wrong process because
> the IDs happen to be reused) can affect processing of the whole trace.
> If it's 3am and I'm analyzing the lone trace from a dogfooder
> demonstrating a particularly nasty problem, I don't want to find out
> that the trace I'm analyzing ended up being useless because the
> kernel's trace system is merely best effort. It's very cheap to be
> 100% reliable here, so let's be reliable and rule out sources of
> error.
So we're solving a problem which isn't known to occur, but solving it
provides some peace of mind? Sounds thin!
btw, how should tool developers test their pid_gen-based disambiguation
code?
> > > > Which userspace tools will be using pid_gen? Are the developers of
> > > > those tools signed up to use pid_gen?
> > >
> > > I'll be changing Android tracing tools to capture process snapshots
> > > using pid_gen, using the algorithm in the commit message.
> >
> > Which other tools could use this and what was the feedback from their
> > developers?
>
> I'm going to have Android's systrace and Perfetto use this approach.
> Exactly how many tools signed up to use this feature do you need?
What other ones are there?
> > Those people are the intended audience and the
> > best-positioned reviewers so let's hear from them?
>
> I'm writing plenty of trace analysis tools myself, so I'm part of this
> intended audience. Other tracing tool authors have told me about
> out-of-tree hacks for process atomic snapshots via ftrace events. This
> approach avoids the necessity of these more-invasive hacks.
Those authors would make great reviewers! Adding a cc is cheap.
On Wed, Nov 21, 2018 at 4:28 PM Daniel Colascione <[email protected]> wrote:
>
> On Wed, Nov 21, 2018 at 4:22 PM Andrew Morton <[email protected]> wrote:
> >
> > On Wed, 21 Nov 2018 15:21:40 -0800 Daniel Colascione <[email protected]> wrote:
> >
> > > On Wed, Nov 21, 2018 at 2:50 PM Andrew Morton <[email protected]> wrote:
> > > >
> > > > On Wed, 21 Nov 2018 14:40:28 -0800 Daniel Colascione <[email protected]> wrote:
> > > >
> > > > > On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
> > >
> > > ...
> > >
> > > > > I wouldn't call tracing a specialized thing: it's important enough to
> > > > > justify its own summit and a whole ecosystem of trace collection and
> > > > > analysis tools. We use it in every day in Android. It's tremendously
> > > > > helpful for understanding system behavior, especially in cases where
> > > > > multiple components interact in ways that we can't readily predict or
> > > > > replicate. Reliability and precision in this area are essential:
> > > > > retrospective analysis of difficult-to-reproduce problems involves
> > > > > puzzling over trace files and testing hypothesis, and when the trace
> > > > > system itself is occasionally unreliable, the set of hypothesis to
> > > > > consider grows. I've tried to keep the amount of kernel infrastructure
> > > > > needed to support this precision and reliability to a minimum, pushing
> > > > > most of the complexity to userspace. But we do need, from the kernel,
> > > > > reliable process disambiguation.
> > > > >
> > > > > Besides: things like checkpoint and restart are also non-core
> > > > > features, but the kernel has plenty of infrastructure to support them.
> > > > > We're talking about a very lightweight feature in this thread.
> > > >
> > > > I'm still not understanding the seriousness of the problem. Presumably
> > > > you've hit problems in real-life which were serious and frequent enough
> > > > to justify getting down and writing the code. Please share some sob stories
> > > > with us!
> > >
> > > The problem here is the possibility of confusion, even if it's rare.
> > > Does the naive approach of just walking /proc and ignoring the
> > > possibility of PID reuse races work most of the time? Sure. But "most
> > > of the time" isn't good enough. It's not that there are tons of sob
> > > stories: it's that without completely robust reporting, we can't rule
> > > out of the possibility that weirdness we observe in a given trace is
> > > actually just an artifact from a kinda-sort-working best-effort trace
> > > collection system instead of a real anomaly in behavior. Tracing,
> > > essentially, gives us deltas for system state, and without an accurate
> > > baseline, collected via some kind of scan on trace startup, it's
> > > impossible to use these deltas to robustly reconstruct total system
> > > state at a given time. And this matters, because errors in
> > > reconstruction (e.g., assigning a thread to the wrong process because
> > > the IDs happen to be reused) can affect processing of the whole trace.
> > > If it's 3am and I'm analyzing the lone trace from a dogfooder
> > > demonstrating a particularly nasty problem, I don't want to find out
> > > that the trace I'm analyzing ended up being useless because the
> > > kernel's trace system is merely best effort. It's very cheap to be
> > > 100% reliable here, so let's be reliable and rule out sources of
> > > error.
> >
> > So we're solving a problem which isn't known to occur, but solving it
> > provides some peace-of-mind? Sounds thin!
>
> So you want to reject a cheap fix for a problem that you know occurs
> at some non-zero frequency? There's a big difference between "may or
> may not occur" and "will occur eventually, given enough time, and so
> must be taken into account in analysis". Would you fix a refcount race
> that you knew was possible, but didn't observe? What, exactly, is your
> threshold for accepting a fix that makes tracing more reliable?
> On Nov 21, 2018, at 4:21 PM, Daniel Colascione <[email protected]> wrote:
>
>> On Wed, Nov 21, 2018 at 2:50 PM Andrew Morton <[email protected]> wrote:
>>
>>> On Wed, 21 Nov 2018 14:40:28 -0800 Daniel Colascione <[email protected]> wrote:
>>>
>>>> On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
>>>>
>>>>> On Wed, 21 Nov 2018 12:54:20 -0800 Daniel Colascione <[email protected]> wrote:
>>>>>
>>>>> Trace analysis code needs a coherent picture of the set of processes
>>>>> and threads running on a system. While it's possible to enumerate all
>>>>> tasks via /proc, this enumeration is not atomic. If PID numbering
>>>>> rolls over during snapshot collection, the resulting snapshot of the
>>>>> process and thread state of the system may be incoherent, confusing
>>>>> trace analysis tools. The fundamental problem is that if a PID is
>>>>> reused during a userspace scan of /proc, it's impossible to tell, in
>>>>> post-processing, whether a fact that the userspace /proc scanner
>>>>> reports regarding a given PID refers to the old or new task named by
>>>>> that PID, as the scan of that PID may or may not have occurred before
>>>>> the PID reuse, and there's no way to "stamp" a fact read from the
>>>>> kernel with a trace timestamp.
>>>>>
>>>>> This change adds a per-pid-namespace 64-bit generation number,
>>>>> incremented on PID rollover, and exposes it via a new proc file
>>>>> /proc/pid_gen. By examining this file before and after /proc
>>>>> enumeration, user code can detect the potential reuse of a PID and
>>>>> restart the task enumeration process, repeating until it gets a
>>>>> coherent snapshot.
>>>>>
>>>>> PID rollover ought to be rare, so in practice, scan repetitions will
>>>>> be rare.
>>>>
>>>> In general, tracing is a rather specialized thing. Why is this very
>>>> occasional confusion a sufficiently serious problem to warrant addition
>>>> of this code?
>>>
>>> I wouldn't call tracing a specialized thing: it's important enough to
>>> justify its own summit and a whole ecosystem of trace collection and
>>> analysis tools. We use it in every day in Android. It's tremendously
>>> helpful for understanding system behavior, especially in cases where
>>> multiple components interact in ways that we can't readily predict or
>>> replicate. Reliability and precision in this area are essential:
>>> retrospective analysis of difficult-to-reproduce problems involves
>>> puzzling over trace files and testing hypothesis, and when the trace
>>> system itself is occasionally unreliable, the set of hypothesis to
>>> consider grows. I've tried to keep the amount of kernel infrastructure
>>> needed to support this precision and reliability to a minimum, pushing
>>> most of the complexity to userspace. But we do need, from the kernel,
>>> reliable process disambiguation.
>>>
>>> Besides: things like checkpoint and restart are also non-core
>>> features, but the kernel has plenty of infrastructure to support them.
>>> We're talking about a very lightweight feature in this thread.
>>
>> I'm still not understanding the seriousness of the problem. Presumably
>> you've hit problems in real-life which were serious and frequent enough
>> to justify getting down and writing the code. Please share some sob stories
>> with us!
>
> The problem here is the possibility of confusion, even if it's rare.
> Does the naive approach of just walking /proc and ignoring the
> possibility of PID reuse races work most of the time? Sure. But "most
> of the time" isn't good enough. It's not that there are tons of sob
> stories: it's that without completely robust reporting, we can't rule
> out of the possibility that weirdness we observe in a given trace is
> actually just an artifact from a kinda-sort-working best-effort trace
> collection system instead of a real anomaly in behavior. Tracing,
> essentially, gives us deltas for system state, and without an accurate
> baseline, collected via some kind of scan on trace startup, it's
> impossible to use these deltas to robustly reconstruct total system
> state at a given time. And this matters, because errors in
> reconstruction (e.g., assigning a thread to the wrong process because
> the IDs happen to be reused) can affect processing of the whole trace.
> If it's 3am and I'm analyzing the lone trace from a dogfooder
> demonstrating a particularly nasty problem, I don't want to find out
> that the trace I'm analyzing ended up being useless because the
> kernel's trace system is merely best effort. It's very cheap to be
> 100% reliable here, so let's be reliable and rule out sources of
> error.
>
>>>> Which userspace tools will be using pid_gen? Are the developers of
>>>> those tools signed up to use pid_gen?
>>>
>>> I'll be changing Android tracing tools to capture process snapshots
>>> using pid_gen, using the algorithm in the commit message.
>>
>> Which other tools could use this and what was the feedback from their
>> developers?
>
> I'm going to have Android's systrace and Perfetto use this approach.
> Exactly how many tools signed up to use this feature do you need?
>
>> Those people are the intended audience and the
>> best-positioned reviewers so let's hear from them?
>
> I'm writing plenty of trace analysis tools myself, so I'm part of this
> intended audience. Other tracing tool authors have told me about
> out-of-tree hacks for process atomic snapshots via ftrace events. This
> approach avoids the necessity of these more-invasive hacks.
Would a tracepoint for pid reuse solve your problem?
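Something like the below is what I have in mind (just a sketch; the
event name and fields are made up). It would live in a header like
include/trace/events/pid.h, with the call site in the PID allocator;
exactly when to fire it is the open question:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM pid

#if !defined(_TRACE_PID_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PID_H

#include <linux/tracepoint.h>

TRACE_EVENT(pid_reuse,

	TP_PROTO(int nr, u64 generation),

	TP_ARGS(nr, generation),

	TP_STRUCT__entry(
		__field(int, nr)
		__field(u64, generation)
	),

	TP_fast_assign(
		__entry->nr = nr;
		__entry->generation = generation;
	),

	/* Emitted into the trace buffer, so it's timestamped like any
	 * other event, which is the property the /proc scan lacks. */
	TP_printk("pid=%d generation=%llu",
		  __entry->nr, (unsigned long long)__entry->generation)
);

#endif /* _TRACE_PID_H */

#include <trace/define_trace.h>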
On Wed, Nov 21, 2018 at 4:22 PM Andrew Morton <[email protected]> wrote:
>
> On Wed, 21 Nov 2018 15:21:40 -0800 Daniel Colascione <[email protected]> wrote:
>
> > On Wed, Nov 21, 2018 at 2:50 PM Andrew Morton <[email protected]> wrote:
> > >
> > > On Wed, 21 Nov 2018 14:40:28 -0800 Daniel Colascione <[email protected]> wrote:
> > >
> > > > On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
> >
> > ...
> >
> > > > I wouldn't call tracing a specialized thing: it's important enough to
> > > > justify its own summit and a whole ecosystem of trace collection and
> > > > analysis tools. We use it in every day in Android. It's tremendously
> > > > helpful for understanding system behavior, especially in cases where
> > > > multiple components interact in ways that we can't readily predict or
> > > > replicate. Reliability and precision in this area are essential:
> > > > retrospective analysis of difficult-to-reproduce problems involves
> > > > puzzling over trace files and testing hypothesis, and when the trace
> > > > system itself is occasionally unreliable, the set of hypothesis to
> > > > consider grows. I've tried to keep the amount of kernel infrastructure
> > > > needed to support this precision and reliability to a minimum, pushing
> > > > most of the complexity to userspace. But we do need, from the kernel,
> > > > reliable process disambiguation.
> > > >
> > > > Besides: things like checkpoint and restart are also non-core
> > > > features, but the kernel has plenty of infrastructure to support them.
> > > > We're talking about a very lightweight feature in this thread.
> > >
> > > I'm still not understanding the seriousness of the problem. Presumably
> > > you've hit problems in real-life which were serious and frequent enough
> > > to justify getting down and writing the code. Please share some sob stories
> > > with us!
> >
> > The problem here is the possibility of confusion, even if it's rare.
> > Does the naive approach of just walking /proc and ignoring the
> > possibility of PID reuse races work most of the time? Sure. But "most
> > of the time" isn't good enough. It's not that there are tons of sob
> > stories: it's that without completely robust reporting, we can't rule
> > out of the possibility that weirdness we observe in a given trace is
> > actually just an artifact from a kinda-sort-working best-effort trace
> > collection system instead of a real anomaly in behavior. Tracing,
> > essentially, gives us deltas for system state, and without an accurate
> > baseline, collected via some kind of scan on trace startup, it's
> > impossible to use these deltas to robustly reconstruct total system
> > state at a given time. And this matters, because errors in
> > reconstruction (e.g., assigning a thread to the wrong process because
> > the IDs happen to be reused) can affect processing of the whole trace.
> > If it's 3am and I'm analyzing the lone trace from a dogfooder
> > demonstrating a particularly nasty problem, I don't want to find out
> > that the trace I'm analyzing ended up being useless because the
> > kernel's trace system is merely best effort. It's very cheap to be
> > 100% reliable here, so let's be reliable and rule out sources of
> > error.
>
> So we're solving a problem which isn't known to occur, but solving it
> provides some peace-of-mind? Sounds thin!
So you want to reject a cheap fix for a problem that you know occurs
at some non-zero frequency? There's a big difference between "may or
may not occur" and "will occur eventually, given enough time, and so
must be taken into account in analysis". Would you fix a refcount race
that you knew was possible, but didn't observe? What, exactly, is your
threshold for accepting a fix that makes tracing more reliable?
On Wed, Nov 21, 2018 at 4:57 PM Andrew Morton <[email protected]> wrote:
>
> On Wed, 21 Nov 2018 16:28:56 -0800 Daniel Colascione <[email protected]> wrote:
>
> > > > The problem here is the possibility of confusion, even if it's rare.
> > > > Does the naive approach of just walking /proc and ignoring the
> > > > possibility of PID reuse races work most of the time? Sure. But "most
> > > > of the time" isn't good enough. It's not that there are tons of sob
> > > > stories: it's that without completely robust reporting, we can't rule
> > > > out of the possibility that weirdness we observe in a given trace is
> > > > actually just an artifact from a kinda-sort-working best-effort trace
> > > > collection system instead of a real anomaly in behavior. Tracing,
> > > > essentially, gives us deltas for system state, and without an accurate
> > > > baseline, collected via some kind of scan on trace startup, it's
> > > > impossible to use these deltas to robustly reconstruct total system
> > > > state at a given time. And this matters, because errors in
> > > > reconstruction (e.g., assigning a thread to the wrong process because
> > > > the IDs happen to be reused) can affect processing of the whole trace.
> > > > If it's 3am and I'm analyzing the lone trace from a dogfooder
> > > > demonstrating a particularly nasty problem, I don't want to find out
> > > > that the trace I'm analyzing ended up being useless because the
> > > > kernel's trace system is merely best effort. It's very cheap to be
> > > > 100% reliable here, so let's be reliable and rule out sources of
> > > > error.
> > >
> > > So we're solving a problem which isn't known to occur, but solving it
> > > provides some peace-of-mind? Sounds thin!
> >
> > So you want to reject a cheap fix for a problem that you know occurs
> > at some non-zero frequency? There's a big difference between "may or
> > may not occur" and "will occur eventually, given enough time, and so
> > must be taken into account in analysis". Would you fix a refcount race
> > that you knew was possible, but didn't observe? What, exactly, is your
> > threshold for accepting a fix that makes tracing more reliable?
>
> Well for a start I'm looking for a complete patch changelog. One which
> permits readers to fully understand the user-visible impact of the
> problem.
The patch already describes the problem, the solution, and the way in
which this solution is provided. What more information do you want?
> If it is revealed that is a theoretical problem which has negligible
> end-user impact then sure, it is rational to leave things as they are.
> That's what "negligible" means!
I don't think the problem is negligible. There's a huge difference
between 99% and 100% reliability! The possibility of a theoretical
problem becomes a real problem when, in retrospective analysis, it has
to be taken into account while trying to figure out how the system got
into whatever state it was observed to be in.
Look, if I were proposing some expensive new bit of infrastructure,
that would be one thing. But this is trivial. What form of patch
*would* you take here? Would you take a tracepoint, as I discussed in
your other message? Is there *any* snapshot approach here that you
would take? Is your position that providing an atomic process tree
hierarchy snapshot is just not a capability the kernel should provide?
I'm writing trace analysis tools, and I'm saying that in order to be
confident in the results of the analysis, we need a way to be certain
about baseline system state, and without added robustness, there's
always going to be some doubt as to whether any particular observation
is real or an artifact. I'm open to various technical options for
providing this information, but I think it's reasonable to ask the
system "what is your state?" and somehow get back an answer that's
guaranteed not to be self-contradictory. Have you done much
retrospective long trace analysis?
On Wed, Nov 21, 2018 at 3:35 PM Andy Lutomirski <[email protected]> wrote:
> > On Nov 21, 2018, at 4:21 PM, Daniel Colascione <[email protected]> wrote:
> >
> >> On Wed, Nov 21, 2018 at 2:50 PM Andrew Morton <[email protected]> wrote:
> >>
> >>> On Wed, 21 Nov 2018 14:40:28 -0800 Daniel Colascione <[email protected]> wrote:
> >>>
> >>>> On Wed, Nov 21, 2018 at 2:12 PM Andrew Morton <[email protected]> wrote:
> >>>>
> >>>>> On Wed, 21 Nov 2018 12:54:20 -0800 Daniel Colascione <[email protected]> wrote:
> >>>>>
> >>>>> Trace analysis code needs a coherent picture of the set of processes
> >>>>> and threads running on a system. While it's possible to enumerate all
> >>>>> tasks via /proc, this enumeration is not atomic. If PID numbering
> >>>>> rolls over during snapshot collection, the resulting snapshot of the
> >>>>> process and thread state of the system may be incoherent, confusing
> >>>>> trace analysis tools. The fundamental problem is that if a PID is
> >>>>> reused during a userspace scan of /proc, it's impossible to tell, in
> >>>>> post-processing, whether a fact that the userspace /proc scanner
> >>>>> reports regarding a given PID refers to the old or new task named by
> >>>>> that PID, as the scan of that PID may or may not have occurred before
> >>>>> the PID reuse, and there's no way to "stamp" a fact read from the
> >>>>> kernel with a trace timestamp.
> >>>>>
> >>>>> This change adds a per-pid-namespace 64-bit generation number,
> >>>>> incremented on PID rollover, and exposes it via a new proc file
> >>>>> /proc/pid_gen. By examining this file before and after /proc
> >>>>> enumeration, user code can detect the potential reuse of a PID and
> >>>>> restart the task enumeration process, repeating until it gets a
> >>>>> coherent snapshot.
> >>>>>
> >>>>> PID rollover ought to be rare, so in practice, scan repetitions will
> >>>>> be rare.
> >>>>
> >>>> In general, tracing is a rather specialized thing. Why is this very
> >>>> occasional confusion a sufficiently serious problem to warrant addition
> >>>> of this code?
> >>>
> >>> I wouldn't call tracing a specialized thing: it's important enough to
> >>> justify its own summit and a whole ecosystem of trace collection and
> >>> analysis tools. We use it every day in Android. It's tremendously
> >>> helpful for understanding system behavior, especially in cases where
> >>> multiple components interact in ways that we can't readily predict or
> >>> replicate. Reliability and precision in this area are essential:
> >>> retrospective analysis of difficult-to-reproduce problems involves
> >>> puzzling over trace files and testing hypotheses, and when the trace
> >>> system itself is occasionally unreliable, the set of hypotheses to
> >>> consider grows. I've tried to keep the amount of kernel infrastructure
> >>> needed to support this precision and reliability to a minimum, pushing
> >>> most of the complexity to userspace. But we do need, from the kernel,
> >>> reliable process disambiguation.
> >>>
> >>> Besides: things like checkpoint and restart are also non-core
> >>> features, but the kernel has plenty of infrastructure to support them.
> >>> We're talking about a very lightweight feature in this thread.
> >>
> >> I'm still not understanding the seriousness of the problem. Presumably
> >> you've hit problems in real-life which were serious and frequent enough
> >> to justify getting down and writing the code. Please share some sob stories
> >> with us!
> >
> > The problem here is the possibility of confusion, even if it's rare.
> > Does the naive approach of just walking /proc and ignoring the
> > possibility of PID reuse races work most of the time? Sure. But "most
> > of the time" isn't good enough. It's not that there are tons of sob
> > stories: it's that without completely robust reporting, we can't rule
> > out the possibility that weirdness we observe in a given trace is
> > actually just an artifact from a kinda-sorta-working best-effort trace
> > collection system instead of a real anomaly in behavior. Tracing,
> > essentially, gives us deltas for system state, and without an accurate
> > baseline, collected via some kind of scan on trace startup, it's
> > impossible to use these deltas to robustly reconstruct total system
> > state at a given time. And this matters, because errors in
> > reconstruction (e.g., assigning a thread to the wrong process because
> > the IDs happen to be reused) can affect processing of the whole trace.
> > If it's 3am and I'm analyzing the lone trace from a dogfooder
> > demonstrating a particularly nasty problem, I don't want to find out
> > that the trace I'm analyzing ended up being useless because the
> > kernel's trace system is merely best effort. It's very cheap to be
> > 100% reliable here, so let's be reliable and rule out sources of
> > error.
> >
> >>>> Which userspace tools will be using pid_gen? Are the developers of
> >>>> those tools signed up to use pid_gen?
> >>>
> >>> I'll be changing Android tracing tools to capture process snapshots
> >>> using pid_gen, using the algorithm in the commit message.
> >>
> >> Which other tools could use this and what was the feedback from their
> >> developers?
> >
> > I'm going to have Android's systrace and Perfetto use this approach.
> > Exactly how many tools signed up to use this feature do you need?
> >
> >> Those people are the intended audience and the
> >> best-positioned reviewers so let's hear from them?
> >
> > I'm writing plenty of trace analysis tools myself, so I'm part of this
> > intended audience. Other tracing tool authors have told me about
> > out-of-tree hacks for process atomic snapshots via ftrace events. This
> > approach avoids the necessity of these more-invasive hacks.
>
> Would a tracepoint for pid reuse solve your problem?
I initially thought "no", but maybe it would after all.
The /proc scanner would need some way of consuming this tracepoint so
that it would know to re-run the scan. For some tracing systems (like
Perfetto) doing simultaneous event consumption and recording would
work (although it'd be awkward), but other systems (like things based
on the simpler atrace framework) aren't able to process events, and
inferring PID reuse after the fact isn't as helpful as being able to
re-scan in response to PID rollover.
OTOH, the ftrace histogram stuff could record a running count of
rollover events pretty easily, independent of the trace recording
machinery, and that running event count would be good enough for doing
the re-scan I want. If /proc/pid_gen isn't on the table, doing it that
way (with a new tracepoint) would work too. There'd be no way to
separate out rollovers per PID namespace though, so we'd have to
re-scan when *any* PID namespace rolled over, not just the current
one. But that's probably not a problem.
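Either way, the userspace side stays a cheap retry loop around the
scan. Here's a minimal sketch of what I have in mind (helper names and
error handling are illustrative only; the counter could come from
/proc/pid_gen or, equally, from an ftrace rollover count):

#include <ctype.h>
#include <dirent.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: read whichever rollover counter we end up with.
 * Here it's the proposed /proc/pid_gen; an ftrace-based rollover count
 * would slot in the same way. */
static uint64_t read_rollover_count(void)
{
        uint64_t gen = 0;
        FILE *f = fopen("/proc/pid_gen", "r");

        if (f) {
                if (fscanf(f, "%" SCNu64, &gen) != 1)
                        gen = 0;
                fclose(f);
        }
        return gen;
}

/* Stand-in for the real snapshot work: walk /proc and record each task. */
static void scan_proc(void)
{
        DIR *d = opendir("/proc");
        struct dirent *de;

        if (!d)
                return;
        while ((de = readdir(d)) != NULL) {
                if (isdigit((unsigned char)de->d_name[0])) {
                        /* read /proc/<pid>/status, task/ entries, ... */
                }
        }
        closedir(d);
}

void snapshot_tasks(void)
{
        uint64_t before, after;

        do {
                before = read_rollover_count();
                scan_proc();
                after = read_rollover_count();
        } while (before != after);      /* a rollover raced with the scan */
}

If rollover really is as rare as we expect, the loop almost never
repeats, and when it does, the cost is one extra /proc walk.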
I'll send a patch with a PID rollover tracepoint. Thanks.
On Wed, 21 Nov 2018 16:28:56 -0800 Daniel Colascione <[email protected]> wrote:
> > > The problem here is the possibility of confusion, even if it's rare.
> > > Does the naive approach of just walking /proc and ignoring the
> > > possibility of PID reuse races work most of the time? Sure. But "most
> > > of the time" isn't good enough. It's not that there are tons of sob
> > > stories: it's that without completely robust reporting, we can't rule
> > > out the possibility that weirdness we observe in a given trace is
> > > actually just an artifact from a kinda-sorta-working best-effort trace
> > > collection system instead of a real anomaly in behavior. Tracing,
> > > essentially, gives us deltas for system state, and without an accurate
> > > baseline, collected via some kind of scan on trace startup, it's
> > > impossible to use these deltas to robustly reconstruct total system
> > > state at a given time. And this matters, because errors in
> > > reconstruction (e.g., assigning a thread to the wrong process because
> > > the IDs happen to be reused) can affect processing of the whole trace.
> > > If it's 3am and I'm analyzing the lone trace from a dogfooder
> > > demonstrating a particularly nasty problem, I don't want to find out
> > > that the trace I'm analyzing ended up being useless because the
> > > kernel's trace system is merely best effort. It's very cheap to be
> > > 100% reliable here, so let's be reliable and rule out sources of
> > > error.
> >
> > So we're solving a problem which isn't known to occur, but solving it
> > provides some peace-of-mind? Sounds thin!
>
> So you want to reject a cheap fix for a problem that you know occurs
> at some non-zero frequency? There's a big difference between "may or
> may not occur" and "will occur eventually, given enough time, and so
> must be taken into account in analysis". Would you fix a refcount race
> that you knew was possible, but didn't observe? What, exactly, is your
> threshold for accepting a fix that makes tracing more reliable?
Well for a start I'm looking for a complete patch changelog. One which
permits readers to fully understand the user-visible impact of the
problem.
If it is revealed that this is a theoretical problem which has negligible
end-user impact then sure, it is rational to leave things as they are.
That's what "negligible" means!
On Wed, 21 Nov 2018 17:08:08 -0800 Daniel Colascione <[email protected]> wrote:
> Have you done much
> retrospective long trace analysis?
No. Have you?
Of course you have, which is why I and others are dependent upon you to
explain why this change is worth adding to Linux. If this thing solves
a problem which we expect will not occur for anyone between now and the
heat death of the universe then this impacts our decisions.
> The patch already describes the problem, the solution, and the way in
> which this solution is provided. What more information do you want?
I want to know how useful the darn thing is! Why is this so hard?
And my thus-far-unanswered question regarding testing the feature
is also relevant to that understanding. What does a testcase look
like? If we're not actually able to construct one then what does that
mean?
On Wed, Nov 21, 2018 at 12:38:20PM -0800, Daniel Colascione wrote:
> On Wed, Nov 21, 2018 at 12:31 PM Matthew Wilcox <[email protected]> wrote:
> >
> > On Wed, Nov 21, 2018 at 12:14:44PM -0800, Daniel Colascione wrote:
> > > This change adds a per-pid-namespace 64-bit generation number,
> > > incremented on PID rollover, and exposes it via a new proc file
> > > /proc/pid_generation. By examining this file before and after /proc
> > > enumeration, user code can detect the potential reuse of a PID and
> > > restart the task enumeration process, repeating until it gets a
> > > coherent snapshot.
> > >
> > > PID rollover ought to be rare, so in practice, scan repetitions will
> > > be rare.
> >
> > Then why does it need to be 64-bit?
>
> [Resending because of accidental HTML. I really need to switch to a
> better email client.]
>
> Because 64 bits is enough for anyone. :-) A u64 is big enough that
> we'll never observe an overflow on a running system, and PID
> namespaces are rare enough that we won't miss the four extra bytes we
> use by upgrading from a u32. And after reading about some security
> problems caused by too-clever handling of 32-bit rollover, I'd rather
> the code be obviously correct than save a trivial amount of space.
I don't think you understand how big 4 billion is. If it happens once a
second, it will take 136 years for a 2^32 count to roll over. How often
does a PID rollover happen?
On Wed, Nov 21, 2018 at 5:29 PM Andrew Morton <[email protected]> wrote:
>
> On Wed, 21 Nov 2018 17:08:08 -0800 Daniel Colascione <[email protected]> wrote:
>
> > Have you done much
> > retrospective long trace analysis?
>
> No. Have you?
>
> Of course you have, which is why I and others are dependent upon you to
> explain why this change is worth adding to Linux. If this thing solves
> a problem which we expect will not occur for anyone between now and the
> heat death of the universe then this impacts our decisions.
I use ftrace the most on Android, so let me take a shot.
In addition to the normal "debug a slow thing" use cases for ftrace,
Android has started exploring two other ways of using ftrace:
1. "Flight recorder" mode: trigger ftrace for some amount of time when
a particular anomaly is detected to make debugging those cases easier.
2. Long traces: let a trace stream to disk for hours or days, then
postprocess it to get some deeper insights about system behavior.
We've used this very successfully to debug and optimize power
consumption.
Knowing the initial state of the system is a pain for both of these
cases. For example, one of the things I'd like to know in some of my
current use cases for long traces is the current oom_score_adj of
every process in the system, but similar to PID reuse, that can change
very quickly due to userspace behavior. There's also a race between
reading that value in userspace and writing it to trace_marker:
1. Userspace daemon X reads oom_score_adj for a process Y.
2. Process Y gets a new oom_score_adj value, triggering the
oom/oom_score_adj_update tracepoint.
3. Daemon X writes the old oom_score_adj value to trace_marker.
As I was writing this, though, I realized that the race doesn't matter
so long as our tools follow the same basic practice (for PID reuse,
oom_score_adj, or anything else we need):
1. Daemon enables all requested tracepoints and resets the trace clock.
2. Daemon enables tracing.
3. Daemon dumps initial state for any tracepoint we care about.
4. When postprocessing, a tool must consider the initial state of a
value (eg, oom_score_adj of pid X) to be either the initial state as
reported by the daemon or the first ftrace event reporting that value.
If there is an ftrace event in the trace before the report from the
daemon, the report from the daemon should be ignored.
The key here is that initial state as reported by userspace needs to
be provable from ftrace events. For example, if we stream ps -AT to
trace_marker from userspace, we should be able to prove that pid 5000
in that ps -AT is actually the same process that shows up as pid 5000
later on in the trace and that it has not been replaced by some other
pid 5000. That requires that any event that could break that
assumption be available from the trace itself. Accordingly, I think a
PID reuse tracepoint would work better than an atomic dump of all PIDs
because I'd rather have tracepoints for anything where the initial
state of the system matters than relying on different atomic dumps to
be sure of the initial state. (in this case, we'd have to combine a
PID reuse tracepoint with sched_process_fork and task_rename or
something like that to know what's actually running, but that's a
tractable problem)
The PID reuse tracepoint requires more intelligence in postprocessing
and it still has a race where the state of these values can be
indeterminate at the beginning of a trace if those values change
quickly, but I don't think we can get to a point where we can generate
a full snapshot of every tracepoint we care about in the system at the
start of a trace. For Android's use cases, that short race at the
beginning of a trace isn't a big deal (or at least I can't think of a
case where it would be).
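To make rule 4 concrete, the resolution step in postprocessing is
roughly this (just a sketch; the struct layout and names are made up
for illustration):

#include <stdbool.h>

/* One (task, property) pair, e.g. a pid's oom_score_adj. */
struct initial_state {
        bool have_daemon_report;
        unsigned long long daemon_ts;      /* trace clock of the daemon's dump */
        long daemon_value;
        bool have_event;
        unsigned long long first_event_ts; /* first tracepoint for this value */
        long first_event_value;
};

/* Returns true and fills *value if the state at trace start is known.
 * If a tracepoint fired before the daemon's report, the report is
 * stale and must be ignored; the value is then unknown until
 * first_event_ts. */
static bool value_at_trace_start(const struct initial_state *s, long *value)
{
        if (s->have_daemon_report &&
            (!s->have_event || s->daemon_ts < s->first_event_ts)) {
                *value = s->daemon_value;
                return true;
        }
        return false;
}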
On Wed, Nov 21, 2018 at 6:36 PM Tim Murray <[email protected]> wrote:
>
> On Wed, Nov 21, 2018 at 5:29 PM Andrew Morton <[email protected]> wrote:
> >
> > On Wed, 21 Nov 2018 17:08:08 -0800 Daniel Colascione <[email protected]> wrote:
> >
> > > Have you done much
> > > retrospective long trace analysis?
> >
> > No. Have you?
> >
> > Of course you have, which is why I and others are dependent upon you to
> > explain why this change is worth adding to Linux. If this thing solves
> > a problem which we expect will not occur for anyone between now and the
> > heat death of the universe then this impacts our decisions.
>
> I use ftrace the most on Android, so let me take a shot.
>
> In addition to the normal "debug a slow thing" use cases for ftrace,
> Android has started exploring two other ways of using ftrace:
>
> 1. "Flight recorder" mode: trigger ftrace for some amount of time when
> a particular anomaly is detected to make debugging those cases easier.
>
> 2. Long traces: let a trace stream to disk for hours or days, then
> postprocess it to get some deeper insights about system behavior.
> We've used this very successfully to debug and optimize power
> consumption.
>
> Knowing the initial state of the system is a pain for both of these
> cases. For example, one of the things I'd like to know in some of my
> current use cases for long traces is the current oom_score_adj of
> every process in the system, but similar to PID reuse, that can change
> very quickly due to userspace behavior. There's also a race between
> reading that value in userspace and writing it to trace_marker:
>
> 1. Userspace daemon X reads oom_score_adj for a process Y.
> 2. Process Y gets a new oom_score_adj value, triggering the
> oom/oom_score_adj_update tracepoint.
> 3. Daemon X writes the old oom_score_adj value to trace_marker.
>
> As I was writing this, though, I realized that the race doesn't matter
> so long as our tools follow the same basic practice (for PID reuse,
> oom_score_adj, or anything else we need):
>
> 1. Daemon enables all requested tracepoints and resets the trace clock.
> 2. Daemon enables tracing
> 3. Daemon dumps initial state for any tracepoint we care about.
> 4. When postprocessing, a tool must consider the initial state of a
> value (eg, oom_score_adj of pid X) to be either the initial state as
> reported by the daemon or the first ftrace event reporting that value.
> If there is an ftrace event in the trace before the report from the
> daemon, the report from the daemon should be ignored.
I was imagining periodic scans being the moral equivalent of I-frames
in an MPEG stream: known-good synchronization points that bound the
propagation of errors. To prevent loss of events in the *middle* of
one of these samples affecting the validity of the sample, I wanted
each one to be atomic and self-contained. I was initially imagining a
single proc file containing a thread ID list and a TGID mapping for
each thread (as well as assorted other information), generated
atomically under tasklist_lock and stamped with the trace
timestamp. The /proc/pid_gen thing was an effort to do most of that in
userspace. But maybe that's not really that important.
Another thing to consider: there are some properties which we can
enumerate *only* through /proc and that we can't yet receive in a
streaming, event-driven fashion. We read those by periodic sampling,
but the sampling isn't atomic. And as Tim notes, even properties that
we ordinarily receive in streaming form via trace events must be
sampled at trace start. But trace start isn't the only case where it
matters.
Suppose at T+50 we have TID 601 (which we'll call TID 601_1) with a
steady-state resource consumption $COUNTER of 100MB. Now suppose we
start sampling at T+50. At T+55, TID 601_1 dies. At T+60, due to
unlikely-but-possible TID reuse, we see a new task with TID 601, which
we'll call TID 601_2. At T+65, the sample reads TID 601_2's $COUNTER
as 1GB. At
T+70, we finish the sample. From the perspective of trace analysis,
all the sample tells us is "at some point between T+50 and T+70, TID
601's $COUNTER was 1GB". We also know when TID 601_1 died and when TID
601_2 started. We don't know whether the sample's "TID 601" refers to
TID 601_1 or TID 601_2. So what do we do?
1) We could attribute the 1GB value of $COUNTER to TID 601_1, which is
what would happen by default if we deemed the sample to be a virtual
event with timestamp T+50, the time we started the sampling process.
But that would be incorrect here. In analysis, we'd see this
attribution as a mysterious spike in TID 601_1's resource consumption
immediately before its death. (The spike would be to a level
suspiciously similar to TID 601_2's next sample.) Imagine spending
hours trying to track down a mystery death-rattle resource spike
(something that could plausibly happen in the real world) that ended
up being a tracing artifact.
2) We could attribute the 1GB read of $COUNTER to TID 601_2. In the
timeline I gave above, this attribution would be correct. But what if
TID 601_1's $COUNTER really _had_ spiked to 1GB immediately before its
death? (What if its death had been caused by that spike?) Then, under
the "tag last" approach, we'd falsely tag TID 601_2 with TID 601_1's
excessive resource use, and this false attribution would last until
the next sample of TID 601_2, which might be quite far in the future.
And we might find ourselves wondering why TID 601_1 died, since (if we
blame 601_2 for the new $COUNTER value) TID 601_1 would have done
nothing wrong. And in the meantime, we'd see 1GB of $COUNTER
consumption that would appear to come out of nowhere.
3) We could discard the ambiguous sample. But in this case, we'd leave
TID 601_2's $COUNTER value completely unknown until the next sample,
which could be quite far in the future. And, if TID 601_2's $COUNTER
value really were 1GB, we might leave significant resource consumption
unaccounted-for until that next sample.
With my pid_gen patch, we could have used strategy #3 without having
to wait for the system to get around to sampling TID 601_2 again on
its own, because we'd have been able to detect the ambiguity and
collect a new, hopefully-unambiguous sample on the spot. Yes, we do
wait a sampling period between samples anyway, and it's possible for a
process to start, run, and die without having ever been sampled with
or without pid_gen, but at least we wouldn't have missed a perfectly
good sampling opportunity due to PID ambiguity.
You might say, "dancol@, you sound excessively fussy here. That's
never going to really matter under any realistic workload." And maybe
that'd be right. But like I said above, I think it's important to be
able to have deterministic bounds on uncertainty even in the case of
pathological behavior, and I'd rather guarantee correctness
structurally than wonder whether unlikely events are one-in-a-trace
unlikely or heat-death-of-the-universe unlikely.
To improve on the sampling model, we could have the kernel stamp
$COUNTER reads with the current trace timestamp. Then, the sample
would tell us, instead of "TID 601's $COUNTER was 1GB between T+50 and
T+70", it'd tell us "TID 601's $COUNTER was 1GB at exactly T+65". (The
sampling program itself might get around to logging this bit of
information at T+70 or T+700, but the embedded timestamp would be
accurate.) We could then look up the inferred process table at T+65
and attribute that 1GB to TID 601_2.
Ideally, we wouldn't bother with this sort of sampling at all, instead
relying on streaming $COUNTER updates as events. But we're not quite
there yet. I have a larger proposal in the works for that one. Under
that proposal, we wouldn't log every change to $COUNTER, but we would
log $COUNTER at process death (as well as at other times ---
basically, after large changes and after a maximum time after any
change), so trace analysis would still be able to correctly determine
TID 601_1's time-of-death resource use.
In another approach, the trace system *itself* would generate the
"I-frames" by automatically generating synthetic trace events at trace
startup in order to reflect pre-trace system state. (For example, if
you enabled sched_switch and started tracing, the ftrace ring buffer
would begin life with one synthetic sched_switch per CPU describing
each CPU's running task at the instant of trace startup, maybe
collected with on_each_cpu().)
But anyway, I didn't expect the pid_gen change to be controversial.
Given that we can eventually do something different altogether and
that it probably *is* excessively fussy, consider the patch withdrawn.
On Wed, Nov 21, 2018 at 12:14:44PM -0800, Daniel Colascione wrote:
> This change adds a per-pid-namespace 64-bit generation number,
> incremented on PID rollover, and exposes it via a new proc file
> /proc/pid_generation. By examining this file before and after /proc
> enumeration, user code can detect the potential reuse of a PID and
> restart the task enumeration process, repeating until it gets a
> coherent snapshot.
I see downthread this patch has been withdrawn, but nonetheless I'm
still curious - does this actually solve the problem?
It seems to me that a PID could be reused within a scan even if the
generation number remains the same at the beginning and end of a scan:
Say you have a very long-lived task with PID 500 allocated in generation
0. PID allocation has since wrapped and we are now allocating from the
start of the range again, with generation 1. We begin a scan of /proc,
read the generation (1) and at this point, our task dies and PID 500 is
then reallocated to a new task. We finish our scan, generation is still
1 but PID 500 is now ambiguous.
Am I wrong?
- Kevin
On Wed, Nov 21, 2018 at 04:35:34PM -0700, Andy Lutomirski wrote:
> >
> > I'm going to have Android's systrace and Perfetto use this approach.
> > Exactly how many tools signed up to use this feature do you need?
> >
> >> Those people are the intended audience and the
> >> best-positioned reviewers so let's hear from them?
> >
> > I'm writing plenty of trace analysis tools myself, so I'm part of this
> > intended audience. Other tracing tool authors have told me about
> > out-of-tree hacks for process atomic snapshots via ftrace events. This
> > approach avoids the necessity of these more-invasive hacks.
>
> Would a tracepoint for pid reuse solve your problem?
FWIW we've had a similar problem in criu for memory snapshotting,
because memory data is PID-driven and snapshots are rather
discrete events. So we use task_struct::real_start_time as
a second guard against pid reuse.
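In userspace terms, the guard is just: treat the pair (pid, start
time) rather than the bare pid as the task identity. A rough sketch of
reading the start time (starttime is field 22 of /proc/<pid>/stat; the
parsing details here are illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Returns true and fills *starttime (clock ticks since boot) if the
 * task currently named by @pid exists. */
static bool pid_start_time(pid_t pid, unsigned long long *starttime)
{
        char path[64], buf[1024];
        FILE *f;
        char *p;
        int i;

        snprintf(path, sizeof(path), "/proc/%d/stat", (int)pid);
        f = fopen(path, "r");
        if (!f)
                return false;
        if (!fgets(buf, sizeof(buf), f)) {
                fclose(f);
                return false;
        }
        fclose(f);

        /* comm can contain spaces, so parse from the last ')'. */
        p = strrchr(buf, ')');
        if (!p)
                return false;
        /* Field 3 (state) starts after the next space; starttime is
         * field 22, so skip forward over 20 spaces. */
        for (i = 0; i < 20; i++) {
                p = strchr(p + 1, ' ');
                if (!p)
                        return false;
        }
        return sscanf(p, " %llu", starttime) == 1;
}

A scanner records (pid, starttime) for every task it sees; if the same
pid later shows up with a different starttime, it is a different task,
whatever any generation counter says.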
----- On Nov 21, 2018, at 7:30 PM, Daniel Colascione [email protected] wrote:
[...]
>> > >
>> > > The problem here is the possibility of confusion, even if it's rare.
>> > > Does the naive approach of just walking /proc and ignoring the
>> > > possibility of PID reuse races work most of the time? Sure. But "most
>> > > of the time" isn't good enough. It's not that there are tons of sob
>> > > stories: it's that without completely robust reporting, we can't rule
>> > > out the possibility that weirdness we observe in a given trace is
>> > > actually just an artifact from a kinda-sorta-working best-effort trace
>> > > collection system instead of a real anomaly in behavior. Tracing,
>> > > essentially, gives us deltas for system state, and without an accurate
>> > > baseline, collected via some kind of scan on trace startup, it's
>> > > impossible to use these deltas to robustly reconstruct total system
>> > > state at a given time. And this matters, because errors in
>> > > reconstruction (e.g., assigning a thread to the wrong process because
>> > > the IDs happen to be reused) can affect processing of the whole trace.
>> > > If it's 3am and I'm analyzing the lone trace from a dogfooder
>> > > demonstrating a particularly nasty problem, I don't want to find out
>> > > that the trace I'm analyzing ended up being useless because the
>> > > kernel's trace system is merely best effort. It's very cheap to be
>> > > 100% reliable here, so let's be reliable and rule out sources of
>> > > error.
>> >
[...]
I've just been CC'd on this thread for some reason, so I'll add my 2 cents.
FWIW, I think using /proc to add stateful information to a time-based
trace is the wrong way to do things. Here, the fact that you need to
add a generation counter to struct pid_namespace and expose it via
/proc just highlights its limitations when it comes to dealing with
state that changes over time. Your current issue is with PID re-use,
but you will eventually face the same issue for re-use of all other
resources you are trying to model. For instance, a file descriptor may
be associated with a path at some point in time, but that is not true
anymore after a sequence of close/open which re-uses that file
descriptor. Does that mean we will eventually end up needing
per-file-descriptor generation counters as well?
LTTng solves this by dumping the system state as events within the
trace [1], which associates time-stamps with the state being dumped.
It is recorded while the rest of the system is being traced, so tools
can reconstruct full system state by combining this statedump with the
rest of the events recording state transitions.
So while I agree that it's important to have a way to reconstruct
system state that is aware of PID re-use, I think trying to extend
/proc for this is the wrong approach. It adds extra fields to struct
pid_namespace that seem to be only useful for tracing, whereas using
the time-stamp at which the thread/process was first seen in the trace
(either fork or statedump) as a secondary key should suffice to uniquely
identify a thread/process. I would recommend extending tracing
facilities to dump the data you need rather than /proc.
Thanks,
Mathieu
[1] http://git.lttng.org/?p=lttng-modules.git;a=blob;f=lttng-statedump-impl.c;h=dc037508c055b7f61b8c758d581bd0178e26552a;hb=HEAD
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
From: Kevin Easton
> Sent: 22 November 2018 11:20
>
> On Wed, Nov 21, 2018 at 12:14:44PM -0800, Daniel Colascione wrote:
> > This change adds a per-pid-namespace 64-bit generation number,
> > incremented on PID rollover, and exposes it via a new proc file
> > /proc/pid_generation. By examining this file before and after /proc
> > enumeration, user code can detect the potential reuse of a PID and
> > restart the task enumeration process, repeating until it gets a
> > coherent snapshot.
>
> I see downthread this patch has been withdrawn, but nonetheless I'm
> still curious - does this actually solve the problem?
>
> It seems to me that a PID could be reused within a scan even if the
> generation number remains the same at the beginning and end of a scan:
Why not allocate a 48bit generation number to each 16bit pid?
Then you have a 64bit 'extended-pid' that can be assumed to never be reused.
Provided enough interfaces are enhanced to support 'extended-pid' values
you'll never get reused values.
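I.e., something like (purely illustrative):

#include <stdint.h>

/* 16-bit pid in the low bits, 48-bit per-pid generation count above it. */
static inline uint64_t extended_pid(uint64_t generation, uint16_t pid)
{
        return ((generation & 0xffffffffffffULL) << 16) | pid;
}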
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Wed 2018-11-21 18:06:33, Matthew Wilcox wrote:
> On Wed, Nov 21, 2018 at 12:38:20PM -0800, Daniel Colascione wrote:
> > On Wed, Nov 21, 2018 at 12:31 PM Matthew Wilcox <[email protected]> wrote:
> > >
> > > On Wed, Nov 21, 2018 at 12:14:44PM -0800, Daniel Colascione wrote:
> > > > This change adds a per-pid-namespace 64-bit generation number,
> > > > incremented on PID rollover, and exposes it via a new proc file
> > > > /proc/pid_generation. By examining this file before and after /proc
> > > > enumeration, user code can detect the potential reuse of a PID and
> > > > restart the task enumeration process, repeating until it gets a
> > > > coherent snapshot.
> > > >
> > > > PID rollover ought to be rare, so in practice, scan repetitions will
> > > > be rare.
> > >
> > > Then why does it need to be 64-bit?
> >
> > [Resending because of accidental HTML. I really need to switch to a
> > better email client.]
> >
> > Because 64 bits is enough for anyone. :-) A u64 is big enough that
> > we'll never observe an overflow on a running system, and PID
> > namespaces are rare enough that we won't miss the four extra bytes we
> > use by upgrading from a u32. And after reading about some security
> > problems caused by too-clever handling of 32-bit rollover, I'd rather
> > the code be obviously correct than save a trivial amount of space.
>
> I don't think you understand how big 4 billion is. If it happens once a
> second, it will take 136 years for a 2^32 count to roll over. How often
> does a PID rollover happen?
Well, the cost of 64-bit vs. 32-bit is really small here... I'd go
with 64 bits. If you have 1000 CPUs, rollovers may happen faster...
Best regards,
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Fri 2018-11-23 11:14:17, David Laight wrote:
> From: Kevin Easton
> > Sent: 22 November 2018 11:20
> >
> > On Wed, Nov 21, 2018 at 12:14:44PM -0800, Daniel Colascione wrote:
> > > This change adds a per-pid-namespace 64-bit generation number,
> > > incremented on PID rollover, and exposes it via a new proc file
> > > /proc/pid_generation. By examining this file before and after /proc
> > > enumeration, user code can detect the potential reuse of a PID and
> > > restart the task enumeration process, repeating until it gets a
> > > coherent snapshot.
> >
> > I see downthread this patch has been withdrawn, but nonetheless I'm
> > still curious - does this actually solve the problem?
> >
> > It seems to me that a PID could be reused within a scan even if the
> > generation number remains the same at the beginning and end of a scan:
>
> Why not allocate a 48bit generation number to each 16bit pid?
> Then you have a 64bit 'extended-pid' that can be assumed to never be reused.
> Provided enough interfaces are enhanced to support 'extended-pid' values
> you'll never get reused values.
For the record, I really like this proposal.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html