Currently, there is no trivial mechanism to analyze events based on
containers. perf -G can be used, but it will not filter events for the
containers created after perf is invoked, making it difficult to assess/
analyze performance issues of multiple containers at once.
This patch-set overcomes this limitation by using cgroup identifier as
container unique identifier. A new PERF_RECORD_NAMESPACES event that
records namespaces related info is introduced, from which the cgroup
namespace's inode number is used as cgroup identifier. This is based
on the assumption that each container is created with its own cgroup
namespace allowing assessment/analysis of multiple containers using
cgroup identifier.
The first patch introduces PERF_RECORD_NAMESPACES in the kernel, while the
second patch makes the corresponding changes in the perf tool to read these
PERF_RECORD_NAMESPACES events. The third patch adds a cgroup identifier
column in perf report, which is nothing but the cgroup namespace's
inode number. This approach is based on the suggestion from Peter
Zijlstra here: https://patchwork.kernel.org/patch/9305655/
---
Hari Bathini (3):
perf: add PERF_RECORD_NAMESPACES to include namespaces related info
perf tool: add PERF_RECORD_NAMESPACES to include namespaces related info
perf tool: add cgroup identifier entry in perf report
fs/mount.h | 17 ----
include/linux/mnt_namespace.h | 18 ++++
include/linux/perf_event.h | 1
include/uapi/linux/perf_event.h | 21 +++++
kernel/events/core.c | 136 +++++++++++++++++++++++++++++++++
kernel/fork.c | 4 +
kernel/nsproxy.c | 5 +
tools/include/uapi/linux/perf_event.h | 21 +++++
tools/perf/builtin-annotate.c | 1
tools/perf/builtin-diff.c | 1
tools/perf/builtin-inject.c | 14 +++
tools/perf/builtin-kmem.c | 1
tools/perf/builtin-kvm.c | 2
tools/perf/builtin-lock.c | 1
tools/perf/builtin-mem.c | 1
tools/perf/builtin-record.c | 28 ++++++-
tools/perf/builtin-report.c | 1
tools/perf/builtin-sched.c | 1
tools/perf/builtin-script.c | 41 ++++++++++
tools/perf/builtin-trace.c | 3 -
tools/perf/util/Build | 1
tools/perf/util/data-convert-bt.c | 2
tools/perf/util/event.c | 123 ++++++++++++++++++++++++++++--
tools/perf/util/event.h | 25 ++++++
tools/perf/util/evsel.c | 1
tools/perf/util/hist.c | 4 +
tools/perf/util/hist.h | 1
tools/perf/util/machine.c | 24 ++++++
tools/perf/util/machine.h | 3 +
tools/perf/util/namespaces.c | 33 ++++++++
tools/perf/util/namespaces.h | 24 ++++++
tools/perf/util/session.c | 7 ++
tools/perf/util/sort.c | 22 +++++
tools/perf/util/sort.h | 2
tools/perf/util/thread.c | 44 ++++++++++-
tools/perf/util/thread.h | 7 ++
tools/perf/util/tool.h | 1
37 files changed, 610 insertions(+), 32 deletions(-)
create mode 100644 tools/perf/util/namespaces.c
create mode 100644 tools/perf/util/namespaces.h
With the advent of container technologies like Docker, which depend
on namespaces for isolation, there is a need for tracing support for
namespaces. This patch introduces a new PERF_RECORD_NAMESPACES event
for tracing based on namespaces related info.
Signed-off-by: Hari Bathini <[email protected]>
---
fs/mount.h | 17 -----
include/linux/mnt_namespace.h | 18 +++++
include/linux/perf_event.h | 1
include/uapi/linux/perf_event.h | 21 ++++++
kernel/events/core.c | 136 +++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 4 +
kernel/nsproxy.c | 5 +
7 files changed, 184 insertions(+), 18 deletions(-)
diff --git a/fs/mount.h b/fs/mount.h
index d2e25d7..5ec592b 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -1,22 +1,7 @@
#include <linux/mount.h>
#include <linux/seq_file.h>
-#include <linux/poll.h>
-#include <linux/ns_common.h>
#include <linux/fs_pin.h>
-
-struct mnt_namespace {
- atomic_t count;
- struct ns_common ns;
- struct mount * root;
- struct list_head list;
- struct user_namespace *user_ns;
- struct ucounts *ucounts;
- u64 seq; /* Sequence number to prevent loops */
- wait_queue_head_t poll;
- u64 event;
- unsigned int mounts; /* # of mounts in the namespace */
- unsigned int pending_mounts;
-};
+#include <linux/mnt_namespace.h>
struct mnt_pcp {
int mnt_count;
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 12b2ab5..b911ca6 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -2,10 +2,26 @@
#define _NAMESPACE_H_
#ifdef __KERNEL__
-struct mnt_namespace;
+#include <linux/poll.h>
+#include <linux/ns_common.h>
+
struct fs_struct;
struct user_namespace;
+/*
+ * Full definition of the mount namespace object. This patch moves it here
+ * from fs/mount.h so that code outside fs/ (kernel/events/core.c reads
+ * mnt_ns->ns.inum) can dereference it instead of only seeing a forward
+ * declaration.
+ */
+struct mnt_namespace {
+ atomic_t count;
+ struct ns_common ns;
+ struct mount *root;
+ struct list_head list;
+ struct user_namespace *user_ns;
+ struct ucounts *ucounts;
+ u64 seq; /* Sequence number to prevent loops */
+ wait_queue_head_t poll;
+ u64 event;
+ unsigned int mounts; /* # of mounts in the namespace */
+ unsigned int pending_mounts;
+};
+
extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
struct user_namespace *, struct fs_struct *);
extern void put_mnt_ns(struct mnt_namespace *ns);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecd..243b988 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1110,6 +1110,7 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks
extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
+extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
/* Callchains */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485..575aed6 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -344,7 +344,8 @@ struct perf_event_attr {
use_clockid : 1, /* use @clockid for time fields */
context_switch : 1, /* context switch data */
write_backward : 1, /* Write ring buffer from end to beginning */
- __reserved_1 : 36;
+ namespaces : 1, /* include namespaces data */
+ __reserved_1 : 35;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -862,6 +863,24 @@ enum perf_event_type {
*/
PERF_RECORD_SWITCH_CPU_WIDE = 15,
+ /*
+ * struct {
+ * struct perf_event_header header;
+ *
+ * u32 pid, tid;
+ * u64 time;
+ * u32 uts_ns_inum;
+ * u32 ipc_ns_inum;
+ * u32 mnt_ns_inum;
+ * u32 pid_ns_inum;
+ * u32 net_ns_inum;
+ * u32 cgroup_ns_inum;
+ * u32 user_ns_inum;
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_NAMESPACES = 16,
+
PERF_RECORD_MAX, /* non-ABI */
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e29213..3bdc2e7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -46,6 +46,9 @@
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
+#include <linux/ipc_namespace.h>
+#include <linux/utsname.h>
+#include <linux/mnt_namespace.h>
#include "internal.h"
@@ -375,6 +378,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
@@ -3874,6 +3878,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_dec(&nr_namespaces_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -6476,6 +6482,134 @@ void perf_event_comm(struct task_struct *task, bool exec)
}
/*
+ * namespaces tracking
+ */
+
+/*
+ * Scratch state handed to perf_event_namespaces_output() for one
+ * PERF_RECORD_NAMESPACES emission.
+ */
+struct perf_namespaces_event {
+ /* task whose namespaces are being reported */
+ struct task_struct *task;
+
+ /*
+ * Record payload; field order follows the PERF_RECORD_NAMESPACES
+ * layout documented in include/uapi/linux/perf_event.h.
+ */
+ struct {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ u64 time;
+ u32 uts_ns_inum;
+ u32 ipc_ns_inum;
+ u32 mnt_ns_inum;
+ u32 pid_ns_inum;
+ u32 net_ns_inum;
+ u32 cgroup_ns_inum;
+ u32 user_ns_inum;
+ } event_id;
+};
+
+/* Non-zero when @event was opened with attr.namespaces set. */
+static int perf_event_namespaces_match(struct perf_event *event)
+{
+	int wants_namespaces = event->attr.namespaces;
+
+	return wants_namespaces;
+}
+
+/*
+ * perf_iterate_sb() callback: write one PERF_RECORD_NAMESPACES record for
+ * the task in @data into @event's buffer.
+ *
+ * @event: candidate perf event; skipped unless it asked for namespaces data.
+ * @data:  struct perf_namespaces_event prepared by perf_event_namespaces().
+ */
+static void perf_event_namespaces_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_namespaces_event *namespaces_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ /* remember the header size so it can be restored before returning */
+ int size = namespaces_event->event_id.header.size;
+ struct nsproxy *nsproxy;
+ int ret;
+
+ if (!perf_event_namespaces_match(event))
+ return;
+
+ /* presumably adjusts header.size for this event's sample-id data */
+ perf_event_header__init_id(&namespaces_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ namespaces_event->event_id.header.size);
+
+ if (ret)
+ goto out;
+
+ namespaces_event->event_id.pid = perf_event_pid(event,
+ namespaces_event->task);
+ namespaces_event->event_id.tid = perf_event_tid(event,
+ namespaces_event->task);
+
+ /* task_lock() is only taken when reporting on a task other than current */
+ if (namespaces_event->task != current)
+ task_lock(namespaces_event->task);
+
+ /*
+ * With a NULL nsproxy the nsproxy-derived inode fields keep whatever
+ * value the caller stored; the same holds for fields compiled out below.
+ */
+ nsproxy = namespaces_event->task->nsproxy;
+ if (nsproxy != NULL) {
+ namespaces_event->event_id.uts_ns_inum =
+ nsproxy->uts_ns->ns.inum;
+#ifdef CONFIG_IPC_NS
+ namespaces_event->event_id.ipc_ns_inum =
+ nsproxy->ipc_ns->ns.inum;
+#endif
+ namespaces_event->event_id.mnt_ns_inum =
+ nsproxy->mnt_ns->ns.inum;
+ /* note: this is the pid namespace for children */
+ namespaces_event->event_id.pid_ns_inum =
+ nsproxy->pid_ns_for_children->ns.inum;
+#ifdef CONFIG_NET
+ namespaces_event->event_id.net_ns_inum =
+ nsproxy->net_ns->ns.inum;
+#endif
+#ifdef CONFIG_CGROUPS
+ namespaces_event->event_id.cgroup_ns_inum =
+ nsproxy->cgroup_ns->ns.inum;
+#endif
+ }
+
+ /* the user namespace comes from the task's credentials, not nsproxy */
+ namespaces_event->event_id.user_ns_inum =
+ __task_cred(namespaces_event->task)->user_ns->ns.inum;
+
+ if (namespaces_event->task != current)
+ task_unlock(namespaces_event->task);
+
+ namespaces_event->event_id.time = perf_event_clock(event);
+
+ perf_output_put(&handle, namespaces_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ /* @data is reused for the next event: put back the size seen on entry */
+ namespaces_event->event_id.header.size = size;
+}
+
+/*
+ * Report the namespaces of @task to every perf event that asked for
+ * namespaces data. Does nothing when no such event is accounted.
+ */
+void perf_event_namespaces(struct task_struct *task)
+{
+	struct perf_namespaces_event ns_event;
+
+	if (atomic_read(&nr_namespaces_events) == 0)
+		return;
+
+	/*
+	 * Only the header is set up here; pid, tid, time and the *_ns_inum
+	 * fields start out zeroed and are filled in per event by
+	 * perf_event_namespaces_output().
+	 */
+	ns_event = (struct perf_namespaces_event){
+		.task = task,
+		.event_id.header = {
+			.type = PERF_RECORD_NAMESPACES,
+			.misc = 0,
+			.size = sizeof(ns_event.event_id),
+		},
+	};
+
+	perf_iterate_sb(perf_event_namespaces_output, &ns_event, NULL);
+}
+
+/*
* mmap tracking
*/
@@ -9018,6 +9152,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_inc(&nr_namespaces_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
diff --git a/kernel/fork.c b/kernel/fork.c
index 997ac1d..3faca3d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1818,6 +1818,7 @@ static __latent_entropy struct task_struct *copy_process(
cgroup_post_fork(p);
threadgroup_change_end(current);
perf_event_fork(p);
+ perf_event_namespaces(p);
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
@@ -2280,6 +2281,9 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
free_fs_struct(new_fs);
bad_unshare_out:
+ if (!err)
+ perf_event_namespaces(current);
+
return err;
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e..4c25e6e 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
+#include <linux/perf_event.h>
static struct kmem_cache *nsproxy_cachep;
@@ -264,6 +265,10 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
switch_task_namespaces(tsk, new_nsproxy);
out:
fput(file);
+
+ if (!err)
+ perf_event_namespaces(tsk);
+
return err;
}
This patch updates perf tool to examine PERF_RECORD_NAMESPACES events
emitted by the kernel when fork, clone, setns or unshare are invoked.
Also, it synthesizes PERF_RECORD_NAMESPACES events for processes that
were running prior to invocation of perf record, the data for which
is taken from /proc/$PID/ns. These changes make way for analyzing
events with regard to namespaces.
Signed-off-by: Hari Bathini <[email protected]>
---
TODO
* Update manpage
* Make PERF_RECORD_NAMESPACES optional (?)
tools/include/uapi/linux/perf_event.h | 21 +++++-
tools/perf/builtin-annotate.c | 1
tools/perf/builtin-diff.c | 1
tools/perf/builtin-inject.c | 14 ++++
tools/perf/builtin-kmem.c | 1
tools/perf/builtin-kvm.c | 2 +
tools/perf/builtin-lock.c | 1
tools/perf/builtin-mem.c | 1
tools/perf/builtin-record.c | 28 ++++++--
tools/perf/builtin-report.c | 1
tools/perf/builtin-sched.c | 1
tools/perf/builtin-script.c | 41 +++++++++++
tools/perf/builtin-trace.c | 3 +
tools/perf/util/Build | 1
tools/perf/util/data-convert-bt.c | 2 +
tools/perf/util/event.c | 123 +++++++++++++++++++++++++++++++--
tools/perf/util/event.h | 25 +++++++
tools/perf/util/evsel.c | 1
tools/perf/util/machine.c | 24 ++++++
tools/perf/util/machine.h | 3 +
tools/perf/util/namespaces.c | 33 +++++++++
tools/perf/util/namespaces.h | 24 ++++++
tools/perf/util/session.c | 7 ++
tools/perf/util/thread.c | 44 +++++++++++-
tools/perf/util/thread.h | 7 ++
tools/perf/util/tool.h | 1
26 files changed, 397 insertions(+), 14 deletions(-)
create mode 100644 tools/perf/util/namespaces.c
create mode 100644 tools/perf/util/namespaces.h
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index c66a485..575aed6 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -344,7 +344,8 @@ struct perf_event_attr {
use_clockid : 1, /* use @clockid for time fields */
context_switch : 1, /* context switch data */
write_backward : 1, /* Write ring buffer from end to beginning */
- __reserved_1 : 36;
+ namespaces : 1, /* include namespaces data */
+ __reserved_1 : 35;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -862,6 +863,24 @@ enum perf_event_type {
*/
PERF_RECORD_SWITCH_CPU_WIDE = 15,
+ /*
+ * struct {
+ * struct perf_event_header header;
+ *
+ * u32 pid, tid;
+ * u64 time;
+ * u32 uts_ns_inum;
+ * u32 ipc_ns_inum;
+ * u32 mnt_ns_inum;
+ * u32 pid_ns_inum;
+ * u32 net_ns_inum;
+ * u32 cgroup_ns_inum;
+ * u32 user_ns_inum;
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_NAMESPACES = 16,
+
PERF_RECORD_MAX, /* non-ABI */
};
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index ebb6283..1b63dc4 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -393,6 +393,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
.comm = perf_event__process_comm,
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
.ordering_requires_timestamps = true,
},
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 9ff0db4..c52552f 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -354,6 +354,7 @@ static struct perf_tool tool = {
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
.lost = perf_event__process_lost,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
.ordering_requires_timestamps = true,
};
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index b9bc7e3..c5ddc73 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -333,6 +333,19 @@ static int perf_event__repipe_comm(struct perf_tool *tool,
return err;
}
+/*
+ * Process a NAMESPACES event and then repipe it unchanged to the output.
+ * Returns the result of the processing step.
+ */
+static int perf_event__repipe_namespaces(struct perf_tool *tool,
+					 union perf_event *event,
+					 struct perf_sample *sample,
+					 struct machine *machine)
+{
+	int ret = perf_event__process_namespaces(tool, event, sample, machine);
+
+	/* the event is repiped whatever the processing result was */
+	perf_event__repipe(tool, event, sample, machine);
+	return ret;
+}
+
+
static int perf_event__repipe_exit(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -660,6 +673,7 @@ static int __cmd_inject(struct perf_inject *inject)
session->itrace_synth_opts = &inject->itrace_synth_opts;
inject->itrace_synth_opts.inject = true;
inject->tool.comm = perf_event__repipe_comm;
+ inject->tool.namespaces = perf_event__repipe_namespaces;
inject->tool.exit = perf_event__repipe_exit;
inject->tool.id_index = perf_event__repipe_id_index;
inject->tool.auxtrace_info = perf_event__process_auxtrace_info;
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index d426dcb..a60fab0 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -943,6 +943,7 @@ static struct perf_tool perf_kmem = {
.comm = perf_event__process_comm,
.mmap = perf_event__process_mmap,
.mmap2 = perf_event__process_mmap2,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
};
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 08fa88f..18e6c38 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1044,6 +1044,7 @@ static int read_events(struct perf_kvm_stat *kvm)
struct perf_tool eops = {
.sample = process_sample_event,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
};
struct perf_data_file file = {
@@ -1348,6 +1349,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
kvm->tool.exit = perf_event__process_exit;
kvm->tool.fork = perf_event__process_fork;
kvm->tool.lost = process_lost_event;
+ kvm->tool.namespaces = perf_event__process_namespaces;
kvm->tool.ordered_events = true;
perf_tool__fill_defaults(&kvm->tool);
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index ce3bfb4..d750cca 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -858,6 +858,7 @@ static int __cmd_report(bool display_info)
struct perf_tool eops = {
.sample = process_sample_event,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
};
struct perf_data_file file = {
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index d1ce29b..da55056 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -342,6 +342,7 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
.lost = perf_event__process_lost,
.fork = perf_event__process_fork,
.build_id = perf_event__process_build_id,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
},
.input_name = "perf.data",
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 67d2a90..57bc17c 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -941,6 +941,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
*/
if (forks) {
union perf_event *event;
+ pid_t tgid;
event = malloc(sizeof(event->comm) + machine->id_hdr_size);
if (event == NULL) {
@@ -954,10 +955,28 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
* cannot see a correct process name for those events.
* Synthesize COMM event to prevent it.
*/
- perf_event__synthesize_comm(tool, event,
- rec->evlist->workload.pid,
- process_synthesized_event,
- machine);
+ tgid = perf_event__synthesize_comm(tool, event,
+ rec->evlist->workload.pid,
+ process_synthesized_event,
+ machine);
+ free(event);
+
+ if (tgid == -1)
+ goto out_child;
+
+ event = malloc(sizeof(event->namespaces) + machine->id_hdr_size);
+ if (event == NULL) {
+ err = -ENOMEM;
+ goto out_child;
+ }
+
+ /*
+ * Synthesize NAMESPACES event for the command specified.
+ */
+ perf_event__synthesize_namespaces(tool, event,
+ rec->evlist->workload.pid,
+ tgid, process_synthesized_event,
+ machine);
free(event);
perf_evlist__start_workload(rec->evlist);
@@ -1376,6 +1395,7 @@ static struct record record = {
.fork = perf_event__process_fork,
.exit = perf_event__process_exit,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.mmap = perf_event__process_mmap,
.mmap2 = perf_event__process_mmap2,
.ordered_events = true,
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 6e88460..420878f4 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -683,6 +683,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
.mmap = perf_event__process_mmap,
.mmap2 = perf_event__process_mmap2,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
.lost = perf_event__process_lost,
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index f5503ca..db67f55 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1939,6 +1939,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
.tool = {
.sample = perf_sched__process_tracepoint_sample,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.lost = perf_event__process_lost,
.fork = perf_sched__process_fork_event,
.ordered_events = true,
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 7228d14..77cc796 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -807,6 +807,7 @@ struct perf_script {
bool show_task_events;
bool show_mmap_events;
bool show_switch_events;
+ bool show_namespaces_events;
bool allocated;
struct cpu_map *cpus;
struct thread_map *threads;
@@ -1090,6 +1091,41 @@ static int process_comm_event(struct perf_tool *tool,
return ret;
}
+/*
+ * perf script handler for PERF_RECORD_NAMESPACES: process the event and
+ * print it, prefixed by the usual sample header.
+ *
+ * Returns 0 on success, -1 when the thread cannot be found/created or the
+ * event cannot be processed.
+ */
+static int process_namespaces_event(struct perf_tool *tool,
+				    union perf_event *event,
+				    struct perf_sample *sample,
+				    struct machine *machine)
+{
+	struct perf_script *script = container_of(tool, struct perf_script, tool);
+	struct perf_evsel *evsel = perf_evlist__id2evsel(script->session->evlist,
+							 sample->id);
+	struct thread *thread = machine__findnew_thread(machine,
+							event->namespaces.pid,
+							event->namespaces.tid);
+	int ret = -1;
+
+	if (!thread) {
+		pr_debug("problem processing NAMESPACES event, skipping it.\n");
+		return -1;
+	}
+
+	if (perf_event__process_namespaces(tool, event, sample, machine) >= 0) {
+		/* without sample_id_all, fill the sample from the event itself */
+		if (!evsel->attr.sample_id_all) {
+			sample->cpu = 0;
+			sample->time = 0;
+			sample->tid = event->namespaces.tid;
+			sample->pid = event->namespaces.pid;
+		}
+		print_sample_start(sample, thread, evsel);
+		perf_event__fprintf(event, stdout);
+		ret = 0;
+	}
+
+	thread__put(thread);
+	return ret;
+}
+
+
static int process_fork_event(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -1265,6 +1301,8 @@ static int __cmd_script(struct perf_script *script)
}
if (script->show_switch_events)
script->tool.context_switch = process_switch_event;
+ if (script->show_namespaces_events)
+ script->tool.namespaces = process_namespaces_event;
ret = perf_session__process_events(script->session);
@@ -2069,6 +2107,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
.mmap = perf_event__process_mmap,
.mmap2 = perf_event__process_mmap2,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
.attr = process_attr,
@@ -2150,6 +2189,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
"Show the mmap events"),
OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
"Show context switch events (if recorded)"),
+ OPT_BOOLEAN('\0', "show-namespaces-events", &script.show_namespaces_events,
+ "Show namespaces events (if recorded)"),
OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
OPT_BOOLEAN(0, "ns", &nanosecs,
"Use 9 decimal places when displaying time"),
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index c298bd3..8201a90 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2445,8 +2445,9 @@ static int trace__replay(struct trace *trace)
trace->tool.exit = perf_event__process_exit;
trace->tool.fork = perf_event__process_fork;
trace->tool.attr = perf_event__process_attr;
- trace->tool.tracing_data = perf_event__process_tracing_data;
+ trace->tool.tracing_data = perf_event__process_tracing_data;
trace->tool.build_id = perf_event__process_build_id;
+ trace->tool.namespaces = perf_event__process_namespaces;
trace->tool.ordered_events = true;
trace->tool.ordering_requires_timestamps = true;
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index eb60e61..73f12ae 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -42,6 +42,7 @@ libperf-y += pstack.o
libperf-y += session.o
libperf-$(CONFIG_AUDIT) += syscalltbl.o
libperf-y += ordered-events.o
+libperf-y += namespaces.o
libperf-y += comm.o
libperf-y += thread.o
libperf-y += thread_map.o
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 7123f4d..1fcacf1 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -1468,6 +1468,7 @@ int bt_convert__perf2ctf(const char *input, const char *path,
.lost = perf_event__process_lost,
.tracing_data = perf_event__process_tracing_data,
.build_id = perf_event__process_build_id,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
.ordering_requires_timestamps = true,
},
@@ -1479,6 +1480,7 @@ int bt_convert__perf2ctf(const char *input, const char *path,
c.tool.comm = process_comm_event;
c.tool.exit = process_exit_event;
c.tool.fork = process_fork_event;
+ c.tool.namespaces = process_namespaces_event;
}
perf_config(convert__config, &c);
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 8ab0d7d..35ace47 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -31,6 +31,7 @@ static const char *perf_event__names[] = {
[PERF_RECORD_LOST_SAMPLES] = "LOST_SAMPLES",
[PERF_RECORD_SWITCH] = "SWITCH",
[PERF_RECORD_SWITCH_CPU_WIDE] = "SWITCH_CPU_WIDE",
+ [PERF_RECORD_NAMESPACES] = "NAMESPACES",
[PERF_RECORD_HEADER_ATTR] = "ATTR",
[PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE",
[PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA",
@@ -203,6 +204,60 @@ pid_t perf_event__synthesize_comm(struct perf_tool *tool,
return tgid;
}
+/*
+ * Return the inode number behind /proc/<pid>/ns/<ns_name>, or 0 when the
+ * entry cannot be stat()ed (task already gone, or the running kernel does
+ * not provide that namespace type).
+ */
+static ino_t perf_event__read_ns_inum(pid_t pid, const char *ns_name)
+{
+	struct stat sb;
+	char proc_ns[128];
+
+	snprintf(proc_ns, sizeof(proc_ns), "/proc/%d/ns/%s", pid, ns_name);
+	if (stat(proc_ns, &sb) != 0)
+		return 0;
+
+	return sb.st_ino;
+}
+
+/*
+ * Synthesize a PERF_RECORD_NAMESPACES event for an already running thread
+ * from the entries under /proc/<pid>/ns and hand it to @process.
+ *
+ * @event must have room for sizeof(event->namespaces) plus
+ * machine->id_hdr_size bytes. @pid is the thread id, @tgid its thread
+ * group id. A namespace entry that cannot be read is recorded as inode 0
+ * rather than as an uninitialised or stale value.
+ *
+ * Returns 0 on success, -1 if delivering the event failed.
+ */
+int perf_event__synthesize_namespaces(struct perf_tool *tool,
+				      union perf_event *event,
+				      pid_t pid, pid_t tgid,
+				      perf_event__handler_t process,
+				      struct machine *machine)
+{
+	memset(&event->namespaces, 0,
+	       sizeof(event->namespaces) + machine->id_hdr_size);
+
+	event->namespaces.pid = tgid;
+	event->namespaces.tid = pid;
+
+	event->namespaces.uts_ns_inum = perf_event__read_ns_inum(pid, "uts");
+	event->namespaces.ipc_ns_inum = perf_event__read_ns_inum(pid, "ipc");
+	event->namespaces.mnt_ns_inum = perf_event__read_ns_inum(pid, "mnt");
+	event->namespaces.pid_ns_inum = perf_event__read_ns_inum(pid, "pid");
+	event->namespaces.net_ns_inum = perf_event__read_ns_inum(pid, "net");
+	event->namespaces.cgroup_ns_inum = perf_event__read_ns_inum(pid, "cgroup");
+	event->namespaces.user_ns_inum = perf_event__read_ns_inum(pid, "user");
+
+	event->namespaces.header.type = PERF_RECORD_NAMESPACES;
+	event->namespaces.header.size = (sizeof(event->namespaces) +
+					 machine->id_hdr_size);
+
+	if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
+		return -1;
+
+	return 0;
+}
+
+
static int perf_event__synthesize_fork(struct perf_tool *tool,
union perf_event *event,
pid_t pid, pid_t tgid, pid_t ppid,
@@ -434,8 +489,9 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
static int __event__synthesize_thread(union perf_event *comm_event,
union perf_event *mmap_event,
union perf_event *fork_event,
+ union perf_event *namespaces_event,
pid_t pid, int full,
- perf_event__handler_t process,
+ perf_event__handler_t process,
struct perf_tool *tool,
struct machine *machine,
bool mmap_data,
@@ -455,6 +511,11 @@ static int __event__synthesize_thread(union perf_event *comm_event,
if (tgid == -1)
return -1;
+ if (perf_event__synthesize_namespaces(tool, namespaces_event, pid,
+ tgid, process, machine) < 0)
+ return -1;
+
+
return perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
process, machine, mmap_data,
proc_map_timeout);
@@ -488,6 +549,11 @@ static int __event__synthesize_thread(union perf_event *comm_event,
if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
ppid, process, machine) < 0)
break;
+
+ if (perf_event__synthesize_namespaces(tool, namespaces_event, _pid,
+ tgid, process, machine) < 0)
+ break;
+
/*
* Send the prepared comm event
*/
@@ -516,6 +582,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
unsigned int proc_map_timeout)
{
union perf_event *comm_event, *mmap_event, *fork_event;
+ union perf_event *namespaces_event;
int err = -1, thread, j;
comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
@@ -530,10 +597,15 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
if (fork_event == NULL)
goto out_free_mmap;
+ namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
+ machine->id_hdr_size);
+ if (namespaces_event == NULL)
+ goto out_free_fork;
+
err = 0;
for (thread = 0; thread < threads->nr; ++thread) {
if (__event__synthesize_thread(comm_event, mmap_event,
- fork_event,
+ fork_event, namespaces_event,
thread_map__pid(threads, thread), 0,
process, tool, machine,
mmap_data, proc_map_timeout)) {
@@ -559,7 +631,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
/* if not, generate events for it */
if (need_leader &&
__event__synthesize_thread(comm_event, mmap_event,
- fork_event,
+ fork_event, namespaces_event,
comm_event->comm.pid, 0,
process, tool, machine,
mmap_data, proc_map_timeout)) {
@@ -568,6 +640,8 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
}
}
}
+ free(namespaces_event);
+out_free_fork:
free(fork_event);
out_free_mmap:
free(mmap_event);
@@ -587,6 +661,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
char proc_path[PATH_MAX];
struct dirent *dirent;
union perf_event *comm_event, *mmap_event, *fork_event;
+ union perf_event *namespaces_event;
int err = -1;
if (machine__is_default_guest(machine))
@@ -604,11 +679,16 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
if (fork_event == NULL)
goto out_free_mmap;
+ namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
+ machine->id_hdr_size);
+ if (namespaces_event == NULL)
+ goto out_free_fork;
+
snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
proc = opendir(proc_path);
if (proc == NULL)
- goto out_free_fork;
+ goto out_free_namespaces;
while ((dirent = readdir(proc)) != NULL) {
char *end;
@@ -620,13 +700,16 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
* We may race with exiting thread, so don't stop just because
* one thread couldn't be synthesized.
*/
- __event__synthesize_thread(comm_event, mmap_event, fork_event, pid,
- 1, process, tool, machine, mmap_data,
+ __event__synthesize_thread(comm_event, mmap_event, fork_event,
+ namespaces_event, pid, 1, process,
+ tool, machine, mmap_data,
proc_map_timeout);
}
err = 0;
closedir(proc);
+out_free_namespaces:
+ free(namespaces_event);
out_free_fork:
free(fork_event);
out_free_mmap:
@@ -1008,6 +1091,23 @@ size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp)
return fprintf(fp, "%s: %s:%d/%d\n", s, event->comm.comm, event->comm.pid, event->comm.tid);
}
+/*
+ * Print a PERF_RECORD_NAMESPACES event to @fp: the pid/tid it refers to, the
+ * inode number of each namespace carried in the record, and its timestamp.
+ *
+ * Returns whatever fprintf() returns for the formatted line.
+ */
+size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp)
+{
+	struct namespaces_event *ns = &event->namespaces;
+
+	/*
+	 * start_time is a u64. "%lu" only matches where long is 64 bits wide,
+	 * so cast to unsigned long long and print with "%llu" to stay correct
+	 * on 32-bit builds as well.
+	 */
+	return fprintf(fp, " %d/%d - [uts ns: 0x%08x, ipc ns: 0x%08x,"
+		       " mnt_ns: 0x%08x, pid ns for children: 0x%08x,"
+		       " net ns: 0x%08x, cgroup ns: 0x%08x,"
+		       " user ns: 0x%08x, time: %llu]\n\n",
+		       ns->pid, ns->tid,
+		       ns->uts_ns_inum,
+		       ns->ipc_ns_inum,
+		       ns->mnt_ns_inum,
+		       ns->pid_ns_inum,
+		       ns->net_ns_inum,
+		       ns->cgroup_ns_inum,
+		       ns->user_ns_inum,
+		       (unsigned long long)ns->start_time);
+}
+
int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
union perf_event *event,
struct perf_sample *sample,
@@ -1016,6 +1116,14 @@ int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
return machine__process_comm_event(machine, event, sample);
}
+/*
+ * Tool callback for PERF_RECORD_NAMESPACES: hands the event to the machine
+ * layer and passes its result back unchanged. @tool is not used.
+ */
+int perf_event__process_namespaces(struct perf_tool *tool __maybe_unused,
+				   union perf_event *event,
+				   struct perf_sample *sample,
+				   struct machine *machine)
+{
+	int ret = machine__process_namespaces_event(machine, event, sample);
+
+	return ret;
+}
+
int perf_event__process_lost(struct perf_tool *tool __maybe_unused,
union perf_event *event,
struct perf_sample *sample,
@@ -1196,6 +1304,9 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
case PERF_RECORD_MMAP:
ret += perf_event__fprintf_mmap(event, fp);
break;
+ case PERF_RECORD_NAMESPACES:
+ ret += perf_event__fprintf_namespaces(event, fp);
+ break;
case PERF_RECORD_MMAP2:
ret += perf_event__fprintf_mmap2(event, fp);
break;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 8d363d5..e1ed11d 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -39,6 +39,19 @@ struct comm_event {
char comm[16];
};
+/*
+ * Tool-side layout of a PERF_RECORD_NAMESPACES record: the task it refers to,
+ * a timestamp, and one inode number per namespace type.
+ */
+struct namespaces_event {
+ struct perf_event_header header;
+ u32 pid, tid;
+ /* timestamp of the record; closes the thread's previous namespaces entry */
+ u64 start_time;
+ u32 uts_ns_inum;
+ u32 ipc_ns_inum;
+ u32 mnt_ns_inum;
+ /* printed as "pid ns for children" by perf_event__fprintf_namespaces() */
+ u32 pid_ns_inum;
+ u32 net_ns_inum;
+ /* used as the cgroup identifier in perf report */
+ u32 cgroup_ns_inum;
+ u32 user_ns_inum;
+};
+
struct fork_event {
struct perf_event_header header;
u32 pid, ppid;
@@ -482,6 +495,7 @@ union perf_event {
struct mmap_event mmap;
struct mmap2_event mmap2;
struct comm_event comm;
+ struct namespaces_event namespaces;
struct fork_event fork;
struct lost_event lost;
struct lost_samples_event lost_samples;
@@ -584,6 +598,10 @@ int perf_event__process_switch(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
struct machine *machine);
+int perf_event__process_namespaces(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine);
int perf_event__process_mmap(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -633,6 +651,12 @@ pid_t perf_event__synthesize_comm(struct perf_tool *tool,
perf_event__handler_t process,
struct machine *machine);
+int perf_event__synthesize_namespaces(struct perf_tool *tool,
+ union perf_event *event,
+ pid_t pid, pid_t tgid,
+ perf_event__handler_t process,
+ struct machine *machine);
+
int perf_event__synthesize_mmap_events(struct perf_tool *tool,
union perf_event *event,
pid_t pid, pid_t tgid,
@@ -650,6 +674,7 @@ size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp);
size_t perf_event__fprintf(union perf_event *event, FILE *fp);
u64 kallsyms__get_function_start(const char *kallsyms_filename,
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 8bc2711..847ea10 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -922,6 +922,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
attr->mmap = track;
attr->mmap2 = track && !perf_missing_features.mmap2;
attr->comm = track;
+ attr->namespaces = track;
if (opts->record_switch_events)
attr->context_switch = track;
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index df85b9e..d721906 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -482,6 +482,28 @@ int machine__process_comm_event(struct machine *machine, union perf_event *event
return err;
}
+/*
+ * Handle a PERF_RECORD_NAMESPACES event: find (or create) the thread named by
+ * the record and attach the namespaces it carries to that thread.
+ *
+ * Returns 0 on success, -1 if the thread could not be obtained or the
+ * namespaces could not be recorded.
+ */
+int machine__process_namespaces_event(struct machine *machine __maybe_unused,
+				      union perf_event *event,
+				      struct perf_sample *sample __maybe_unused)
+{
+	struct namespaces_event *ns_event = &event->namespaces;
+	struct thread *thread;
+	int err = -1;
+
+	thread = machine__findnew_thread(machine, ns_event->pid, ns_event->tid);
+
+	if (dump_trace)
+		perf_event__fprintf_namespaces(event, stdout);
+
+	if (thread != NULL && thread__set_namespaces(thread, ns_event) == 0)
+		err = 0;
+	else
+		dump_printf("problem processing PERF_RECORD_NAMESPACES, skipping event.\n");
+
+	/* drop the reference taken by machine__findnew_thread() */
+	thread__put(thread);
+
+	return err;
+}
+
int machine__process_lost_event(struct machine *machine __maybe_unused,
union perf_event *event, struct perf_sample *sample __maybe_unused)
{
@@ -1519,6 +1541,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
ret = machine__process_comm_event(machine, event, sample); break;
case PERF_RECORD_MMAP:
ret = machine__process_mmap_event(machine, event, sample); break;
+ case PERF_RECORD_NAMESPACES:
+ ret = machine__process_namespaces_event(machine, event, sample); break;
case PERF_RECORD_MMAP2:
ret = machine__process_mmap2_event(machine, event, sample); break;
case PERF_RECORD_FORK:
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 354de6e..e494368 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -97,6 +97,9 @@ int machine__process_itrace_start_event(struct machine *machine,
union perf_event *event);
int machine__process_switch_event(struct machine *machine,
union perf_event *event);
+int machine__process_namespaces_event(struct machine *machine,
+ union perf_event *event,
+ struct perf_sample *sample);
int machine__process_mmap_event(struct machine *machine, union perf_event *event,
struct perf_sample *sample);
int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
new file mode 100644
index 0000000..51b8932
--- /dev/null
+++ b/tools/perf/util/namespaces.c
@@ -0,0 +1,33 @@
+#include "namespaces.h"
+#include "util.h"
+#include "event.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <linux/atomic.h>
+
+/*
+ * Allocate a zeroed struct namespaces with end_time set to -1.
+ *
+ * When @event is non-NULL the namespace inode numbers are copied from it;
+ * with a NULL @event they are left at zero. Returns NULL if the allocation
+ * fails.
+ */
+struct namespaces *namespaces__new(struct namespaces_event *event)
+{
+	struct namespaces *ns = zalloc(sizeof(*ns));
+
+	if (ns == NULL)
+		return NULL;
+
+	ns->end_time = -1;
+
+	if (event == NULL)
+		return ns;
+
+	ns->uts_ns_inum    = event->uts_ns_inum;
+	ns->ipc_ns_inum    = event->ipc_ns_inum;
+	ns->mnt_ns_inum    = event->mnt_ns_inum;
+	ns->pid_ns_inum    = event->pid_ns_inum;
+	ns->net_ns_inum    = event->net_ns_inum;
+	ns->cgroup_ns_inum = event->cgroup_ns_inum;
+	ns->user_ns_inum   = event->user_ns_inum;
+
+	return ns;
+}
+
+/*
+ * Free a struct namespaces. The entry is not unlinked from any list here;
+ * thread__delete() does list_del() before calling this.
+ */
+void namespaces__free(struct namespaces *namespaces)
+{
+ free(namespaces);
+}
diff --git a/tools/perf/util/namespaces.h b/tools/perf/util/namespaces.h
new file mode 100644
index 0000000..3a941ae
--- /dev/null
+++ b/tools/perf/util/namespaces.h
@@ -0,0 +1,24 @@
+#ifndef __PERF_NAMESPACES_H
+#define __PERF_NAMESPACES_H
+
+#include "../perf.h"
+#include <linux/list.h>
+
+struct namespaces_event;
+
+/*
+ * One set of namespace inode numbers recorded for a thread. Entries are
+ * linked on thread->namespaces_list, with the newest added at the head.
+ */
+struct namespaces {
+ struct list_head list;
+ /*
+  * -1 when created; set to the start_time of the next namespaces event
+  * seen for the thread (see thread__set_namespaces()).
+  */
+ u64 end_time;
+ u32 uts_ns_inum;
+ u32 ipc_ns_inum;
+ u32 mnt_ns_inum;
+ u32 pid_ns_inum;
+ u32 net_ns_inum;
+ /* reported as the "cgroup id" column in perf report */
+ u32 cgroup_ns_inum;
+ u32 user_ns_inum;
+};
+
+struct namespaces *namespaces__new(struct namespaces_event *event);
+void namespaces__free(struct namespaces *namespaces);
+
+#endif /* __PERF_NAMESPACES_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 5d61242..d645b1a 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1239,6 +1239,8 @@ static int machines__deliver_event(struct machines *machines,
return tool->mmap2(tool, event, sample, machine);
case PERF_RECORD_COMM:
return tool->comm(tool, event, sample, machine);
+ case PERF_RECORD_NAMESPACES:
+ return tool->namespaces(tool, event, sample, machine);
case PERF_RECORD_FORK:
return tool->fork(tool, event, sample, machine);
case PERF_RECORD_EXIT:
@@ -1494,6 +1496,11 @@ int perf_session__register_idle_thread(struct perf_session *session)
err = -1;
}
+ if (thread == NULL || thread__set_namespaces(thread, NULL)) {
+ pr_err("problem inserting idle task.\n");
+ err = -1;
+ }
+
/* machine__findnew_thread() got the thread, so put it */
thread__put(thread);
return err;
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index f5af87f..f145a8d 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -7,6 +7,7 @@
#include "thread-stack.h"
#include "util.h"
#include "debug.h"
+#include "namespaces.h"
#include "comm.h"
#include "unwind.h"
@@ -40,6 +41,7 @@ struct thread *thread__new(pid_t pid, pid_t tid)
thread->tid = tid;
thread->ppid = -1;
thread->cpu = -1;
+ INIT_LIST_HEAD(&thread->namespaces_list);
INIT_LIST_HEAD(&thread->comm_list);
comm_str = malloc(32);
@@ -66,7 +68,8 @@ struct thread *thread__new(pid_t pid, pid_t tid)
void thread__delete(struct thread *thread)
{
- struct comm *comm, *tmp;
+ struct namespaces *namespaces, *tmp_namespaces;
+ struct comm *comm, *tmp_comm;
BUG_ON(!RB_EMPTY_NODE(&thread->rb_node));
@@ -76,7 +79,12 @@ void thread__delete(struct thread *thread)
map_groups__put(thread->mg);
thread->mg = NULL;
}
- list_for_each_entry_safe(comm, tmp, &thread->comm_list, list) {
+ list_for_each_entry_safe(namespaces, tmp_namespaces,
+ &thread->namespaces_list, list) {
+ list_del(&namespaces->list);
+ namespaces__free(namespaces);
+ }
+ list_for_each_entry_safe(comm, tmp_comm, &thread->comm_list, list) {
list_del(&comm->list);
comm__free(comm);
}
@@ -104,6 +112,38 @@ void thread__put(struct thread *thread)
}
}
+/*
+ * Return the entry at the head of the thread's namespaces_list, or NULL when
+ * nothing has been recorded for the thread yet.
+ */
+struct namespaces *thread__namespaces(const struct thread *thread)
+{
+	if (!list_empty(&thread->namespaces_list))
+		return list_first_entry(&thread->namespaces_list,
+					struct namespaces, list);
+
+	return NULL;
+}
+
+/*
+ * Record a new set of namespaces for @thread, built from @event (which may
+ * be NULL), and make it the thread's current entry.
+ *
+ * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
+ */
+int thread__set_namespaces(struct thread *thread,
+			   struct namespaces_event *event)
+{
+	struct namespaces *prev = thread__namespaces(thread);
+	struct namespaces *fresh = namespaces__new(event);
+
+	if (fresh == NULL)
+		return -ENOMEM;
+
+	list_add(&fresh->list, &thread->namespaces_list);
+
+	if (event == NULL || prev == NULL)
+		return 0;
+
+	/*
+	 * The thread already had namespaces, so setns must have changed some
+	 * or all of them. Stamp the entry that was in use until now with the
+	 * time of this event.
+	 */
+	prev = list_next_entry(fresh, list);
+	prev->end_time = event->start_time;
+
+	return 0;
+}
+
struct comm *thread__comm(const struct thread *thread)
{
if (list_empty(&thread->comm_list))
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 99263cb..c0df723 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -28,6 +28,7 @@ struct thread {
bool comm_set;
int comm_len;
bool dead; /* if set thread has exited */
+ struct list_head namespaces_list;
struct list_head comm_list;
u64 db_id;
@@ -40,6 +41,7 @@ struct thread {
};
struct machine;
+struct namespaces;
struct comm;
struct thread *thread__new(pid_t pid, pid_t tid);
@@ -62,6 +64,11 @@ static inline void thread__exited(struct thread *thread)
thread->dead = true;
}
+struct namespaces *thread__namespaces(const struct thread *thread);
+struct namespaces *thread__latest_namespaces(const struct thread *thread);
+int thread__set_namespaces(struct thread *thread,
+ struct namespaces_event *event);
+
int __thread__set_comm(struct thread *thread, const char *comm, u64 timestamp,
bool exec);
static inline int thread__set_comm(struct thread *thread, const char *comm,
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index ac2590a..a0d1af2 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -40,6 +40,7 @@ struct perf_tool {
event_op mmap,
mmap2,
comm,
+ namespaces,
fork,
exit,
lost,
This patch introduces a cgroup identifier entry field in perf report to
identify or distinguish data of different cgroups. It uses the unique
inode number of cgroup namespace, included in perf data with the new
PERF_RECORD_NAMESPACES event, as cgroup identifier. With the assumption
that each container is created with its own cgroup namespace, this
allows assessment/analysis of multiple containers at once.
Shown below is the output of perf report, sorted based on cgroup id, on
a system that was running three containers at the time of perf record
and clearly showing one of the containers' considerable use of kernel
memory in comparison with others:
$ perf report -s cgroup_id,sample --stdio
#
# Total Lost Samples: 0
#
# Samples: 1K of event 'kmem:kmalloc'
# Event count (approx.): 1290
#
# Overhead cgroup id Samples
# ........ .......... ............
#
78.84% 4026532052 1017
12.87% 4026531835 166
5.58% 4026532053 72
2.71% 4026532054 35
Signed-off-by: Hari Bathini <[email protected]>
---
tools/perf/util/hist.c | 4 ++++
tools/perf/util/hist.h | 1 +
tools/perf/util/sort.c | 22 ++++++++++++++++++++++
tools/perf/util/sort.h | 2 ++
4 files changed, 29 insertions(+)
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index b02992e..76f2a91 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -2,6 +2,7 @@
#include "build-id.h"
#include "hist.h"
#include "session.h"
+#include "namespaces.h"
#include "sort.h"
#include "evlist.h"
#include "evsel.h"
@@ -168,6 +169,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
hists__set_unres_dso_col_len(hists, HISTC_MEM_DADDR_DSO);
}
+ hists__new_col_len(hists, HISTC_CGROUP_ID, 10);
hists__new_col_len(hists, HISTC_CPU, 3);
hists__new_col_len(hists, HISTC_SOCKET, 6);
hists__new_col_len(hists, HISTC_MEM_LOCKED, 6);
@@ -573,9 +575,11 @@ __hists__add_entry(struct hists *hists,
bool sample_self,
struct hist_entry_ops *ops)
{
+ struct namespaces *ns = thread__namespaces(al->thread);
struct hist_entry entry = {
.thread = al->thread,
.comm = thread__comm(al->thread),
+ .cgroup_id = ns ? ns->cgroup_ns_inum : 0,
.ms = {
.map = al->map,
.sym = al->sym,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 9928fed..894c95d 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -29,6 +29,7 @@ enum hist_column {
HISTC_DSO,
HISTC_THREAD,
HISTC_COMM,
+ HISTC_CGROUP_ID,
HISTC_PARENT,
HISTC_CPU,
HISTC_SOCKET,
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 452e15a..b6152df 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -536,6 +536,27 @@ struct sort_entry sort_cpu = {
.se_width_idx = HISTC_CPU,
};
+/* --sort cgroup_id */
+
+/*
+ * Compare two hist entries by cgroup id. The u32 ids are widened to int64_t
+ * before subtracting so the difference cannot wrap.
+ */
+static int64_t
+sort__cgroup_id_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	int64_t lhs = left->cgroup_id;
+	int64_t rhs = right->cgroup_id;
+
+	return rhs - lhs;
+}
+
+/* Format the entry's cgroup id as a left-justified decimal @width wide. */
+static int hist_entry__cgroup_id_snprintf(struct hist_entry *he, char *bf,
+					  size_t size, unsigned int width)
+{
+	u32 id = he->cgroup_id;
+
+	return repsep_snprintf(bf, size, "%-*u", width, id);
+}
+
+/*
+ * Sort entry behind the "cgroup_id" sort key: compares and prints
+ * hist_entry::cgroup_id under the "cgroup id" column header.
+ */
+struct sort_entry sort_cgroup_id = {
+ .se_header = "cgroup id",
+ .se_cmp = sort__cgroup_id_cmp,
+ .se_snprintf = hist_entry__cgroup_id_snprintf,
+ .se_width_idx = HISTC_CGROUP_ID,
+};
+
/* --sort socket */
static int64_t
@@ -1418,6 +1439,7 @@ static struct sort_dimension common_sort_dimensions[] = {
DIM(SORT_GLOBAL_WEIGHT, "weight", sort_global_weight),
DIM(SORT_TRANSACTION, "transaction", sort_transaction),
DIM(SORT_TRACE, "trace", sort_trace),
+ DIM(SORT_CGROUP_ID, "cgroup_id", sort_cgroup_id),
};
#undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 099c975..e8058f6 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -95,6 +95,7 @@ struct hist_entry {
u64 transaction;
s32 socket;
s32 cpu;
+ u32 cgroup_id;
u8 cpumode;
u8 depth;
@@ -211,6 +212,7 @@ enum sort_type {
SORT_GLOBAL_WEIGHT,
SORT_TRANSACTION,
SORT_TRACE,
+ SORT_CGROUP_ID,
/* branch stack specific sort keys */
__SORT_BRANCH_STACK,
Hi Hari,
[auto build test ERROR on tip/perf/core]
[also build test ERROR on v4.9-rc4]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Hari-Bathini/perf-add-support-for-analyzing-events-for-containers/20161110-195727
config: ia64-allnoconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 6.2.0
reproduce:
wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=ia64
All errors (new ones prefixed by >>):
kernel/fork.c: In function 'copy_process':
>> kernel/fork.c:1818:2: error: implicit declaration of function 'perf_event_namespaces' [-Werror=implicit-function-declaration]
perf_event_namespaces(p);
^~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
--
kernel/nsproxy.c: In function 'SYSC_setns':
>> kernel/nsproxy.c:270:3: error: implicit declaration of function 'perf_event_namespaces' [-Werror=implicit-function-declaration]
perf_event_namespaces(tsk);
^~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/perf_event_namespaces +1818 kernel/fork.c
1812 write_unlock_irq(&tasklist_lock);
1813
1814 proc_fork_connector(p);
1815 cgroup_post_fork(p);
1816 threadgroup_change_end(current);
1817 perf_event_fork(p);
> 1818 perf_event_namespaces(p);
1819
1820 trace_task_newtask(p, clone_flags);
1821 uprobe_copy_process(p, clone_flags);
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
On Thu, Nov 10, 2016 at 05:08:06PM +0530, Hari Bathini wrote:
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index c66a485..575aed6 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -344,7 +344,8 @@ struct perf_event_attr {
> use_clockid : 1, /* use @clockid for time fields */
> context_switch : 1, /* context switch data */
> write_backward : 1, /* Write ring buffer from end to beginning */
> - __reserved_1 : 36;
> + namespaces : 1, /* include namespaces data */
> + __reserved_1 : 35;
>
> union {
> __u32 wakeup_events; /* wakeup every n events */
> @@ -862,6 +863,24 @@ enum perf_event_type {
> */
> PERF_RECORD_SWITCH_CPU_WIDE = 15,
>
> + /*
> + * struct {
> + * struct perf_event_header header;
> + *
> + * u32 pid, tid;
> + * u64 time;
> + * u32 uts_ns_inum;
> + * u32 ipc_ns_inum;
> + * u32 mnt_ns_inum;
> + * u32 pid_ns_inum;
> + * u32 net_ns_inum;
> + * u32 cgroup_ns_inum;
> + * u32 user_ns_inum;
> + * struct sample_id sample_id;
> + * };
> + */
> + PERF_RECORD_NAMESPACES = 16,
So this format is not extensible, that is, if someone adds yet another
namespace, we'll need to introduce PERF_RECORD_NAMESPACES2.
Is there a 'natural' and exposed namespace index that we can use to
change it like:
u32 nr_nss;
u32 namespace[nr_nss];
?
> +static void perf_event_namespaces_output(struct perf_event *event,
> + void *data)
> +{
> + struct perf_namespaces_event *namespaces_event = data;
> + struct perf_output_handle handle;
> + struct perf_sample_data sample;
> + int size = namespaces_event->event_id.header.size;
> + struct nsproxy *nsproxy;
> + int ret;
> +
> + if (!perf_event_namespaces_match(event))
> + return;
> +
> + perf_event_header__init_id(&namespaces_event->event_id.header,
> + &sample, event);
> + ret = perf_output_begin(&handle, event,
> + namespaces_event->event_id.header.size);
> +
> + if (ret)
> + goto out;
If you were to introduce:
struct ns_event_id *ei = &namespace_event->event_id;
> +
> + namespaces_event->event_id.pid = perf_event_pid(event,
> + namespaces_event->task);
> + namespaces_event->event_id.tid = perf_event_tid(event,
> + namespaces_event->task);
> +
> + if (namespaces_event->task != current)
> + task_lock(namespaces_event->task);
> +
> + nsproxy = namespaces_event->task->nsproxy;
> + if (nsproxy != NULL) {
> + namespaces_event->event_id.uts_ns_inum =
> + nsproxy->uts_ns->ns.inum;
> +#ifdef CONFIG_IPC_NS
> + namespaces_event->event_id.ipc_ns_inum =
> + nsproxy->ipc_ns->ns.inum;
> +#endif
> + namespaces_event->event_id.mnt_ns_inum =
> + nsproxy->mnt_ns->ns.inum;
> + namespaces_event->event_id.pid_ns_inum =
> + nsproxy->pid_ns_for_children->ns.inum;
> +#ifdef CONFIG_NET
> + namespaces_event->event_id.net_ns_inum =
> + nsproxy->net_ns->ns.inum;
> +#endif
> +#ifdef CONFIG_CGROUPS
> + namespaces_event->event_id.cgroup_ns_inum =
> + nsproxy->cgroup_ns->ns.inum;
> +#endif
> + }
> +
> + namespaces_event->event_id.user_ns_inum =
> + __task_cred(namespaces_event->task)->user_ns->ns.inum;
You can do s/namespace_event->event_id./ei->/ which is tons shorter and
would result in less wrapping of lines and generally improve
readability.
> +
> + if (namespaces_event->task != current)
> + task_unlock(namespaces_event->task);
> +
> + namespaces_event->event_id.time = perf_event_clock(event);
> +
> + perf_output_put(&handle, namespaces_event->event_id);
> +
> + perf_event__output_id_sample(event, &handle, &sample);
> +
> + perf_output_end(&handle);
> +out:
> + namespaces_event->event_id.header.size = size;
> +}
> +
> +void perf_event_namespaces(struct task_struct *task)
> +{
> + struct perf_namespaces_event namespaces_event;
> +
> + if (!atomic_read(&nr_namespaces_events))
> + return;
> +
> + namespaces_event = (struct perf_namespaces_event){
> + .task = task,
> + .event_id = {
> + .header = {
> + .type = PERF_RECORD_NAMESPACES,
> + .misc = 0,
> + .size = sizeof(namespaces_event.event_id),
> + },
> + /* .pid */
> + /* .tid */
> + /* .time */
> + /* .uts_ns_inum */
> + /* .ipc_ns_inum */
> + /* .mnt_ns_inum */
> + /* .pid_ns_inum */
> + /* .net_ns_inum */
> + /* .cgroup_ns_inum */
> + /* .user_ns_inum */
> + },
> + };
> +
> + perf_iterate_sb(perf_event_namespaces_output,
> + &namespaces_event,
> + NULL);
> +}
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 997ac1d..3faca3d 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1818,6 +1818,7 @@ static __latent_entropy struct task_struct *copy_process(
> cgroup_post_fork(p);
> threadgroup_change_end(current);
> perf_event_fork(p);
> + perf_event_namespaces(p);
I would much prefer calling perf_event_namespace() from
perf_event_fork() and reduce the external interface.
Hari Bathini <[email protected]> writes:
> Currently, there is no trivial mechanism to analyze events based on
> containers. perf -G can be used, but it will not filter events for the
> containers created after perf is invoked, making it difficult to assess/
> analyze performance issues of multiple containers at once.
>
> This patch-set overcomes this limitation by using cgroup identifier as
> container unique identifier. A new PERF_RECORD_NAMESPACES event that
> records namespaces related info is introduced, from which the cgroup
> namespace's inode number is used as cgroup identifier. This is based
> on the assumption that each container is created with it's own cgroup
> namespace allowing assessment/analysis of multiple containers using
> cgroup identifier.
>
> The first patch introduces PERF_RECORD_NAMESPACES in kernel while the
> second patch makes the corresponding changes in perf tool to read this
> PERF_RECORD_NAMESPACES events. The third patch adds a cgroup identifier
> column in perf report, which is nothing but the cgroup namespace's
> inode number. This approach is based on the suggestion from Peter
> Zijlstra here: https://patchwork.kernel.org/patch/9305655/
Where is the check that ensures that only someone with
capable(CAP_SYS_ADMIN) can use this interface? This interface is not
namespace clean in multiple dimensions, so it cannot be used generally.
You are not allowed to move struct mount_namespace into
include/linux/mnt_namespace.h. Al Viro will crucify you with cause.
Those are implementation details the rest of the kernel should not be
digging into.
Where are the device numbers that go with those inode numbers you are
exporting? For now all of those inodes live on the filesystem but I am
not giving guarantees to userspace that do not work for ordinary
filesystems.
Eric
On Thursday 10 November 2016 06:49 PM, Peter Zijlstra wrote:
> On Thu, Nov 10, 2016 at 05:08:06PM +0530, Hari Bathini wrote:
>
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index c66a485..575aed6 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -344,7 +344,8 @@ struct perf_event_attr {
>> use_clockid : 1, /* use @clockid for time fields */
>> context_switch : 1, /* context switch data */
>> write_backward : 1, /* Write ring buffer from end to beginning */
>> - __reserved_1 : 36;
>> + namespaces : 1, /* include namespaces data */
>> + __reserved_1 : 35;
>>
>> union {
>> __u32 wakeup_events; /* wakeup every n events */
>> @@ -862,6 +863,24 @@ enum perf_event_type {
>> */
>> PERF_RECORD_SWITCH_CPU_WIDE = 15,
>>
>> + /*
>> + * struct {
>> + * struct perf_event_header header;
>> + *
>> + * u32 pid, tid;
>> + * u64 time;
>> + * u32 uts_ns_inum;
>> + * u32 ipc_ns_inum;
>> + * u32 mnt_ns_inum;
>> + * u32 pid_ns_inum;
>> + * u32 net_ns_inum;
>> + * u32 cgroup_ns_inum;
>> + * u32 user_ns_inum;
>> + * struct sample_id sample_id;
>> + * };
>> + */
>> + PERF_RECORD_NAMESPACES = 16,
> So this format is not extensible, that is, if someone adds yet another
> namespace, we'll need to introduce PERF_RECORD_NAMESPACES2.
>
> Is there a 'natural' and exposed namespace index that we can use to
> change it like:
>
> u32 nr_nss;
> u32 namespace[nr_nss];
>
> ?
Hi Peter,
Nothing of that sort exists, currently.
Maybe, time to introduce with this patch-set..?
>> +static void perf_event_namespaces_output(struct perf_event *event,
>> + void *data)
>> +{
>> + struct perf_namespaces_event *namespaces_event = data;
>> + struct perf_output_handle handle;
>> + struct perf_sample_data sample;
>> + int size = namespaces_event->event_id.header.size;
>> + struct nsproxy *nsproxy;
>> + int ret;
>> +
>> + if (!perf_event_namespaces_match(event))
>> + return;
>> +
>> + perf_event_header__init_id(&namespaces_event->event_id.header,
>> + &sample, event);
>> + ret = perf_output_begin(&handle, event,
>> + namespaces_event->event_id.header.size);
>> +
>> + if (ret)
>> + goto out;
> If you were to introduce:
>
> struct ns_event_id *ei = &namespace_event->event_id;
>
>> +
>> + namespaces_event->event_id.pid = perf_event_pid(event,
>> + namespaces_event->task);
>> + namespaces_event->event_id.tid = perf_event_tid(event,
>> + namespaces_event->task);
>> +
>> + if (namespaces_event->task != current)
>> + task_lock(namespaces_event->task);
>> +
>> + nsproxy = namespaces_event->task->nsproxy;
>> + if (nsproxy != NULL) {
>> + namespaces_event->event_id.uts_ns_inum =
>> + nsproxy->uts_ns->ns.inum;
>> +#ifdef CONFIG_IPC_NS
>> + namespaces_event->event_id.ipc_ns_inum =
>> + nsproxy->ipc_ns->ns.inum;
>> +#endif
>> + namespaces_event->event_id.mnt_ns_inum =
>> + nsproxy->mnt_ns->ns.inum;
>> + namespaces_event->event_id.pid_ns_inum =
>> + nsproxy->pid_ns_for_children->ns.inum;
>> +#ifdef CONFIG_NET
>> + namespaces_event->event_id.net_ns_inum =
>> + nsproxy->net_ns->ns.inum;
>> +#endif
>> +#ifdef CONFIG_CGROUPS
>> + namespaces_event->event_id.cgroup_ns_inum =
>> + nsproxy->cgroup_ns->ns.inum;
>> +#endif
>> + }
>> +
>> + namespaces_event->event_id.user_ns_inum =
>> + __task_cred(namespaces_event->task)->user_ns->ns.inum;
> You can do s/namespace_event->event_id./ei->/ which is tons shorter and
> would result in less wrapping of lines and generally improve
> readability.
True.
>> +
>> + if (namespaces_event->task != current)
>> + task_unlock(namespaces_event->task);
>> +
>> + namespaces_event->event_id.time = perf_event_clock(event);
>> +
>> + perf_output_put(&handle, namespaces_event->event_id);
>> +
>> + perf_event__output_id_sample(event, &handle, &sample);
>> +
>> + perf_output_end(&handle);
>> +out:
>> + namespaces_event->event_id.header.size = size;
>> +}
>> +
>> +void perf_event_namespaces(struct task_struct *task)
>> +{
>> + struct perf_namespaces_event namespaces_event;
>> +
>> + if (!atomic_read(&nr_namespaces_events))
>> + return;
>> +
>> + namespaces_event = (struct perf_namespaces_event){
>> + .task = task,
>> + .event_id = {
>> + .header = {
>> + .type = PERF_RECORD_NAMESPACES,
>> + .misc = 0,
>> + .size = sizeof(namespaces_event.event_id),
>> + },
>> + /* .pid */
>> + /* .tid */
>> + /* .time */
>> + /* .uts_ns_inum */
>> + /* .ipc_ns_inum */
>> + /* .mnt_ns_inum */
>> + /* .pid_ns_inum */
>> + /* .net_ns_inum */
>> + /* .cgroup_ns_inum */
>> + /* .user_ns_inum */
>> + },
>> + };
>> +
>> + perf_iterate_sb(perf_event_namespaces_output,
>> + &namespaces_event,
>> + NULL);
>> +}
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 997ac1d..3faca3d 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -1818,6 +1818,7 @@ static __latent_entropy struct task_struct *copy_process(
>> cgroup_post_fork(p);
>> threadgroup_change_end(current);
>> perf_event_fork(p);
>> + perf_event_namespaces(p);
> I would much prefer calling perf_event_namespace() from
> perf_event_fork() and reduce the external interface.
>
OK. Will update..
Thanks
Hari
On Mon, Nov 14, 2016 at 04:02:30PM +0530, Hari Bathini wrote:
> >> PERF_RECORD_SWITCH_CPU_WIDE = 15,
> >>+ /*
> >>+ * struct {
> >>+ * struct perf_event_header header;
> >>+ *
> >>+ * u32 pid, tid;
> >>+ * u64 time;
> >>+ * u32 uts_ns_inum;
> >>+ * u32 ipc_ns_inum;
> >>+ * u32 mnt_ns_inum;
> >>+ * u32 pid_ns_inum;
> >>+ * u32 net_ns_inum;
> >>+ * u32 cgroup_ns_inum;
> >>+ * u32 user_ns_inum;
> >>+ * struct sample_id sample_id;
> >>+ * };
> >>+ */
> >>+ PERF_RECORD_NAMESPACES = 16,
> >So this format is not extensible, that is, if someone adds yet another
> >namespace, we'll need to introduce PERF_RECORD_NAMESPACES2.
> >
> >Is there a 'natural' and exposed namespace index that we can use to
> >change it like:
> >
> > u32 nr_nss;
> > u32 namespace[nr_nss];
> >
> >?
>
> Nothing of that sort exists, currently.
> Maybe, time to introduce with this patch-set..?
Would be good, but you'll have to sort that with the namespace folks.
Peter Zijlstra <[email protected]> writes:
> On Mon, Nov 14, 2016 at 04:02:30PM +0530, Hari Bathini wrote:
>
>> >> PERF_RECORD_SWITCH_CPU_WIDE = 15,
>> >>+ /*
>> >>+ * struct {
>> >>+ * struct perf_event_header header;
>> >>+ *
>> >>+ * u32 pid, tid;
>> >>+ * u64 time;
>> >>+ * u32 uts_ns_inum;
>> >>+ * u32 ipc_ns_inum;
>> >>+ * u32 mnt_ns_inum;
>> >>+ * u32 pid_ns_inum;
>> >>+ * u32 net_ns_inum;
>> >>+ * u32 cgroup_ns_inum;
>> >>+ * u32 user_ns_inum;
>> >>+ * struct sample_id sample_id;
>> >>+ * };
>> >>+ */
>> >>+ PERF_RECORD_NAMESPACES = 16,
>> >So this format is not extensible, that is, if someone adds yet another
>> >namespace, we'll need to introduce PERF_RECORD_NAMESPACES2.
>> >
>> >Is there a 'natural' and exposed namespace index that we can use to
>> >change it like:
>> >
>> > u32 nr_nss;
>> > u32 namespace[nr_nss];
>> >
>> >?
>>
>> Nothing of that sort exists, currently.
>> Maybe, time to introduce with this patch-set..?
>
> Would be good, but you'll have to sort that with the namespace folks.
The somewhat easy answer is to use the unshare flags. AKA CLONE_NEWNS,
CLONE_NEWUSER, ...
In the worst case things may get extended to the point where perf would
have to use a different set of values than we use to unshare, clone, and
setns (as the clone flags are effectively all used up) but for the
existing namespaces that index should work just fine.
That won't allow for a natural array in the record, but it will allow for
an array that has a tag for which namespace each entry is in, or
alternatively it will allow for one record per namespace.
Eric
On Tuesday 15 November 2016 02:27 AM, Eric W. Biederman wrote:
> Peter Zijlstra <[email protected]> writes:
>
>> On Mon, Nov 14, 2016 at 04:02:30PM +0530, Hari Bathini wrote:
>>
>>>>> PERF_RECORD_SWITCH_CPU_WIDE = 15,
>>>>> + /*
>>>>> + * struct {
>>>>> + * struct perf_event_header header;
>>>>> + *
>>>>> + * u32 pid, tid;
>>>>> + * u64 time;
>>>>> + * u32 uts_ns_inum;
>>>>> + * u32 ipc_ns_inum;
>>>>> + * u32 mnt_ns_inum;
>>>>> + * u32 pid_ns_inum;
>>>>> + * u32 net_ns_inum;
>>>>> + * u32 cgroup_ns_inum;
>>>>> + * u32 user_ns_inum;
>>>>> + * struct sample_id sample_id;
>>>>> + * };
>>>>> + */
>>>>> + PERF_RECORD_NAMESPACES = 16,
>>>> So this format is not extensible, that is, if someone adds yet another
>>>> namespace, we'll need to introduce PERF_RECORD_NAMESPACES2.
>>>>
>>>> Is there a 'natural' and exposed namespace index that we can use to
>>>> change it like:
>>>>
>>>> u32 nr_nss;
>>>> u32 namespace[nr_nss];
>>>>
>>>> ?
>>> Nothing of that sort exists, currently.
>>> Maybe, time to introduce with this patch-set..?
>> Would be good, but you'll have to sort that with the namespace folks.
> The somewhat easy answer is to use the unshare flags. AKA CLONE_NEWNS,
> CLONE_NEWUSER, ...
>
> In the worst case things may get extended to the point where perf would
> have to use a different set of values than we use to unshare, clone, and
> setns (as the clone flags are effectively all used up) but for the
> existing namespaces that index should work just fine.
>
> That won't allow for a natural array in the record but it will allow for
> an array with that has a tag for which namespace it is in, or
> alternative it will allow for one record per namespace.
struct namespace_info {
u64 flags;
u32 inode_num;
/* any other essential namespace info */
};
struct {
struct perf_event_header header;
u32 pid, tid;
u64 time;
struct namespace_info ns_info[];
struct sample_id sample_id;
};
Something of this sort should do..?
Thanks
Hari
On Friday 11 November 2016 01:18 AM, Eric W. Biederman wrote:
> Hari Bathini <[email protected]> writes:
>
>> Currently, there is no trivial mechanism to analyze events based on
>> containers. perf -G can be used, but it will not filter events for the
>> containers created after perf is invoked, making it difficult to assess/
>> analyze performance issues of multiple containers at once.
>>
>> This patch-set overcomes this limitation by using cgroup identifier as
>> container unique identifier. A new PERF_RECORD_NAMESPACES event that
>> records namespaces related info is introduced, from which the cgroup
>> namespace's inode number is used as cgroup identifier. This is based
>> on the assumption that each container is created with its own cgroup
>> namespace allowing assessment/analysis of multiple containers using
>> cgroup identifier.
>>
>> The first patch introduces PERF_RECORD_NAMESPACES in kernel while the
>> second patch makes the corresponding changes in perf tool to read these
>> PERF_RECORD_NAMESPACES events. The third patch adds a cgroup identifier
>> column in perf report, which is nothing but the cgroup namespace's
>> inode number. This approach is based on the suggestion from Peter
>> Zijlstra here: https://patchwork.kernel.org/patch/9305655/
> Where is the check that ensures that only someone with
> capable(CAP_SYS_ADMIN) can use this interface? This interface is not
> namespace clean in multiple dimensions so it cannot be used generally.
Right. Will add the check..
> You are not allowed to move struct mount_namespace into
> include/linux/mnt_namespace.h. Al Viro will crucify you with cause.
> Those are implementation details the rest of the kernel should not be
> digging into.
Ouch! How about adding an accessor function(s) in fs/namespace.c ..?
> Where are the device numbers that go with those inode numbers you are
> exporting? For now all of those inodes live on the filesystem but I am
> not giving guarantees to userspace that do not work for ordinary
> filesystems.
Sorry! I didn't get this..
I want to use these numbers as an identity for a namespace (like a pid for a process..)
Thanks
Hari
Hari Bathini <[email protected]> writes:
> On Friday 11 November 2016 01:18 AM, Eric W. Biederman wrote:
>> Hari Bathini <[email protected]> writes:
>>
>>> Currently, there is no trivial mechanism to analyze events based on
>>> containers. perf -G can be used, but it will not filter events for the
>>> containers created after perf is invoked, making it difficult to assess/
>>> analyze performance issues of multiple containers at once.
>>>
>>> This patch-set overcomes this limitation by using cgroup identifier as
>>> container unique identifier. A new PERF_RECORD_NAMESPACES event that
>>> records namespaces related info is introduced, from which the cgroup
>>> namespace's inode number is used as cgroup identifier. This is based
>>> on the assumption that each container is created with its own cgroup
>>> namespace allowing assessment/analysis of multiple containers using
>>> cgroup identifier.
>>>
>>> The first patch introduces PERF_RECORD_NAMESPACES in kernel while the
>>> second patch makes the corresponding changes in perf tool to read this
>>> PERF_RECORD_NAMESPACES events. The third patch adds a cgroup identifier
>>> column in perf report, which is nothing but the cgroup namespace's
>>> inode number. This approach is based on the suggestion from Peter
>>> Zijlstra here: https://patchwork.kernel.org/patch/9305655/
>> Where is the check that ensures that only someone with
>> capable(CAP_SYS_ADMIN) can use this interface? This interface is not
>> namespace clean in multiple dimensions so it cannot be used generally.
>
> Right. Will add the check..
>
>> You are not allowed to move struct mount_namespace into
>> include/linux/mnt_namespace.h. Al Viro will crucify you with cause.
>> Those are implementation details the rest of the kernel should not be
>> digging into.
>
> Ouch! How about adding an accessor function(s) in fs/namespace.c ..?
For reasonable things, of course. I think the namespace operations
from ns common already have a large set of accessors, so I don't know
what you are looking for.
>> Where are the device numbers that go with those inode numbers you are
>> exporting? For now all of those inodes live on the filesystem but I am
>> not giving guarantees to userspace that do not work for ordinary
>> filesystems.
>
> Sorry! I didn't get this..
> I want to use these numbers as an identity for a namespace (like a pid for a process..)
Yes I understand you would like to have a global identifier like pids.
A global identifier would ultimately require the addition of a namespace
of namespaces so the global identifier would be relative to something.
I really don't want to go there.
Global identifiers are evil!
So you need to specify not only the inode number but also which filesystem
the inode number applies to. Aka the device number of the appropriate
filesystem as well.
Also please don't forget that modern inode numbers are 64bit not 32bit.
I don't know if that freedom will be used with namespaces or not, but we
need the freedom in a userspace API to make that change without breaking
userspace.
Eric