This tool is mostly a perf version of kmemtrace-user.
The following information is provided by this tool:
- the total amount of memory allocated and fragmentation per call-site
- the total amount of memory allocated and fragmentation per allocation
- total memory allocated and fragmentation in the collected dataset
- ...
# ./perf kmem record
^C
# ./perf kmem --stat caller --stat alloc -l 10
------------------------------------------------------------------------------
Callsite | Total_alloc/Per | Total_req/Per | Hit | Fragmentation
------------------------------------------------------------------------------
0xc052f37a | 790528/4096 | 790528/4096 | 193 | 0.000%
0xc0541d70 | 524288/4096 | 524288/4096 | 128 | 0.000%
0xc051cc68 | 481600/200 | 481600/200 | 2408 | 0.000%
0xc0572623 | 297444/676 | 297440/676 | 440 | 0.001%
0xc05399f1 | 73476/164 | 73472/164 | 448 | 0.005%
0xc05243bf | 51456/256 | 51456/256 | 201 | 0.000%
0xc0730d0e | 31844/497 | 31808/497 | 64 | 0.113%
0xc0734c4e | 17152/256 | 17152/256 | 67 | 0.000%
0xc0541a6d | 16384/128 | 16384/128 | 128 | 0.000%
0xc059c217 | 13120/40 | 13120/40 | 328 | 0.000%
0xc0501ee6 | 11264/88 | 11264/88 | 128 | 0.000%
0xc04daef0 | 7504/682 | 7128/648 | 11 | 5.011%
0xc04e14a3 | 4216/191 | 4216/191 | 22 | 0.000%
0xc05041ca | 3524/44 | 3520/44 | 80 | 0.114%
0xc0734fa3 | 2104/701 | 1620/540 | 3 | 23.004%
0xc05ec9f1 | 2024/289 | 2016/288 | 7 | 0.395%
0xc06a1999 | 1792/256 | 1792/256 | 7 | 0.000%
0xc0463b9a | 1584/144 | 1584/144 | 11 | 0.000%
0xc0541eb0 | 1024/16 | 1024/16 | 64 | 0.000%
0xc06a19ac | 896/128 | 896/128 | 7 | 0.000%
0xc05721c0 | 772/12 | 768/12 | 64 | 0.518%
0xc054d1e6 | 288/57 | 280/56 | 5 | 2.778%
0xc04b562e | 157/31 | 154/30 | 5 | 1.911%
0xc04b536f | 80/16 | 80/16 | 5 | 0.000%
0xc05855a0 | 64/64 | 36/36 | 1 | 43.750%
------------------------------------------------------------------------------
------------------------------------------------------------------------------
Alloc Ptr | Total_alloc/Per | Total_req/Per | Hit | Fragmentation
------------------------------------------------------------------------------
0xda884000 | 1052672/4096 | 1052672/4096 | 257 | 0.000%
0xda886000 | 262144/4096 | 262144/4096 | 64 | 0.000%
0xf60c7c00 | 16512/128 | 16512/128 | 129 | 0.000%
0xf59a4118 | 13120/40 | 13120/40 | 328 | 0.000%
0xdfd4b2c0 | 11264/88 | 11264/88 | 128 | 0.000%
0xf5274600 | 7680/256 | 7680/256 | 30 | 0.000%
0xe8395000 | 5948/594 | 5464/546 | 10 | 8.137%
0xe59c3c00 | 5748/479 | 5712/476 | 12 | 0.626%
0xf4cd1a80 | 3524/44 | 3520/44 | 80 | 0.114%
0xe5bd1600 | 2892/482 | 2856/476 | 6 | 1.245%
... | ... | ... | ... | ...
------------------------------------------------------------------------------
SUMMARY
=======
Total bytes requested: 2333626
Total bytes allocated: 2353712
Total bytes wasted on internal fragmentation: 20086
Internal fragmentation: 0.853375%
TODO:
- show sym+offset in 'callsite' column
- show cross node allocation stats
- collect more useful stats?
- ...
Signed-off-by: Li Zefan <[email protected]>
---
tools/perf/Makefile | 1 +
tools/perf/builtin-kmem.c | 578 +++++++++++++++++++++++++++++++++++++++++++++
tools/perf/builtin.h | 1 +
tools/perf/perf.c | 27 +-
4 files changed, 594 insertions(+), 13 deletions(-)
create mode 100644 tools/perf/builtin-kmem.c
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 53e663a..4ec86da 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -445,6 +445,7 @@ BUILTIN_OBJS += builtin-timechart.o
BUILTIN_OBJS += builtin-top.o
BUILTIN_OBJS += builtin-trace.o
BUILTIN_OBJS += builtin-probe.o
+BUILTIN_OBJS += builtin-kmem.o
PERFLIBS = $(LIB_FILE)
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
new file mode 100644
index 0000000..285fda3
--- /dev/null
+++ b/tools/perf/builtin-kmem.c
@@ -0,0 +1,578 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+
+#include "util/debug.h"
+#include "util/data_map.h"
+
+#include <linux/rbtree.h>
+
+struct alloc_stat;
+typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
+
+static char const *input_name = "perf.data";
+
+static struct perf_header *header;
+static u64 sample_type;
+
+static int alloc_flag;
+static int caller_flag;
+
+sort_fn_t alloc_sort_fn;
+sort_fn_t caller_sort_fn;
+
+static int alloc_lines = -1;
+static int caller_lines = -1;
+
+static char *cwd;
+static int cwdlen;
+
+/*
+ * One accounting record.  The same struct is keyed either by allocation
+ * pointer (alloc view) or by call-site address (caller view) -- hence
+ * the union: an instance lives in exactly one of the two rb-trees.
+ * NOTE(review): 'name' is never referenced in this file; presumably
+ * reserved for the "sym+offset in callsite column" TODO -- confirm.
+ */
+struct alloc_stat {
+	union {
+		struct {
+			char *name;
+			u64 call_site;
+		};
+		u64 ptr;
+	};
+	u64 bytes_req;		/* sum of requested sizes */
+	u64 bytes_alloc;	/* sum of actually allocated sizes */
+	u32 hit;		/* number of allocation events folded in */
+
+	struct rb_node node;
+};
+
+static struct rb_root root_alloc_stat;
+static struct rb_root root_alloc_sorted;
+static struct rb_root root_caller_stat;
+static struct rb_root root_caller_sorted;
+
+static unsigned long total_requested, total_allocated;
+
+struct raw_event_sample {
+ u32 size;
+ char data[0];
+};
+
+/*
+ * Record the command name for a pid so later samples can be attributed
+ * to a named thread.  Returns 0 on success, -1 if the thread table
+ * lookup or comm update fails (the event is then skipped).
+ */
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
+		    (void *)(offset + head),
+		    (void *)(long)(event->header.size),
+		    event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Account one allocation in the per-pointer tree (keyed by 'ptr').
+ * An existing node accumulates; otherwise a new node is inserted.
+ */
+static void insert_alloc_stat(unsigned long ptr,
+			      int bytes_req, int bytes_alloc)
+{
+	struct rb_node **node = &root_alloc_stat.rb_node;
+	struct rb_node *parent = NULL;
+	struct alloc_stat *data = NULL;
+
+	if (!alloc_flag)
+		return;
+
+	while (*node) {
+		parent = *node;
+		data = rb_entry(*node, struct alloc_stat, node);
+
+		if (ptr > data->ptr)
+			node = &(*node)->rb_right;
+		else if (ptr < data->ptr)
+			node = &(*node)->rb_left;
+		else
+			break;
+	}
+
+	if (data && data->ptr == ptr) {
+		data->hit++;
+		data->bytes_req += bytes_req;
+		/* Fix: accumulate the allocated size, not the requested one,
+		 * otherwise fragmentation is under-reported for hot sites. */
+		data->bytes_alloc += bytes_alloc;
+	} else {
+		data = malloc(sizeof(*data));
+		if (!data)
+			die("malloc");
+		data->ptr = ptr;
+		data->hit = 1;
+		data->bytes_req = bytes_req;
+		data->bytes_alloc = bytes_alloc;
+
+		rb_link_node(&data->node, parent, node);
+		rb_insert_color(&data->node, &root_alloc_stat);
+	}
+}
+
+/*
+ * Account one allocation in the per-callsite tree (keyed by
+ * 'call_site'); mirrors insert_alloc_stat().
+ */
+static void insert_caller_stat(unsigned long call_site,
+			      int bytes_req, int bytes_alloc)
+{
+	struct rb_node **node = &root_caller_stat.rb_node;
+	struct rb_node *parent = NULL;
+	struct alloc_stat *data = NULL;
+
+	if (!caller_flag)
+		return;
+
+	while (*node) {
+		parent = *node;
+		data = rb_entry(*node, struct alloc_stat, node);
+
+		if (call_site > data->call_site)
+			node = &(*node)->rb_right;
+		else if (call_site < data->call_site)
+			node = &(*node)->rb_left;
+		else
+			break;
+	}
+
+	if (data && data->call_site == call_site) {
+		data->hit++;
+		data->bytes_req += bytes_req;
+		/* Fix: was '+= bytes_req', which corrupted the alloc total. */
+		data->bytes_alloc += bytes_alloc;
+	} else {
+		data = malloc(sizeof(*data));
+		if (!data)
+			die("malloc");
+		data->call_site = call_site;
+		data->hit = 1;
+		data->bytes_req = bytes_req;
+		data->bytes_alloc = bytes_alloc;
+
+		rb_link_node(&data->node, parent, node);
+		rb_insert_color(&data->node, &root_caller_stat);
+	}
+}
+
+/*
+ * Handle one kmalloc/kmem_cache_alloc(_node) sample: fold it into the
+ * per-pointer and per-callsite trees and the global totals.
+ */
+static void process_alloc_event(struct raw_event_sample *raw,
+				struct event *event,
+				int cpu __used,
+				u64 timestamp __used,
+				struct thread *thread __used,
+				int node __used)
+{
+	unsigned long call_site;
+	unsigned long ptr;
+	int bytes_req;
+	int bytes_alloc;
+
+	/* Field names must match the kmem tracepoint format definitions. */
+	ptr = raw_field_value(event, "ptr", raw->data);
+	call_site = raw_field_value(event, "call_site", raw->data);
+	bytes_req = raw_field_value(event, "bytes_req", raw->data);
+	bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data);
+
+	insert_alloc_stat(ptr, bytes_req, bytes_alloc);
+	insert_caller_stat(call_site, bytes_req, bytes_alloc);
+
+	total_requested += bytes_req;
+	total_allocated += bytes_alloc;
+}
+
+/*
+ * Placeholder: kfree/kmem_cache_free events are dispatched here but no
+ * statistics are collected from them yet.
+ */
+static void process_free_event(struct raw_event_sample *raw __used,
+			       struct event *event __used,
+			       int cpu __used,
+			       u64 timestamp __used,
+			       struct thread *thread __used)
+{
+}
+
+/*
+ * Decode the raw tracepoint payload and dispatch to the alloc/free
+ * handlers by event name; unrecognized events are ignored.
+ */
+static void
+process_raw_event(event_t *raw_event __used, void *more_data,
+		  int cpu, u64 timestamp, struct thread *thread)
+{
+	struct raw_event_sample *raw = more_data;
+	struct event *event;
+	int type;
+
+	type = trace_parse_common_type(raw->data);
+	event = trace_find_event(type);
+	/* Guard: an unknown type id would otherwise be dereferenced below. */
+	if (!event) {
+		pr_debug("unknown trace event type %d, skipping\n", type);
+		return;
+	}
+
+	if (!strcmp(event->name, "kmalloc") ||
+	    !strcmp(event->name, "kmem_cache_alloc")) {
+		process_alloc_event(raw, event, cpu, timestamp, thread, 0);
+		return;
+	}
+
+	if (!strcmp(event->name, "kmalloc_node") ||
+	    !strcmp(event->name, "kmem_cache_alloc_node")) {
+		process_alloc_event(raw, event, cpu, timestamp, thread, 1);
+		return;
+	}
+
+	if (!strcmp(event->name, "kfree") ||
+	    !strcmp(event->name, "kmem_cache_free")) {
+		process_free_event(raw, event, cpu, timestamp, thread);
+		return;
+	}
+}
+
+/*
+ * Parse one PERF_RECORD_SAMPLE.  Optional fields follow the ip header
+ * in the fixed order TIME, CPU, PERIOD dictated by sample_type; each
+ * consumed field advances more_data, leaving it pointing at the raw
+ * tracepoint payload.  Returns 0 on success, -1 if the owning thread
+ * cannot be resolved.
+ */
+static int
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	u64 ip = event->ip.ip;
+	u64 timestamp = -1;
+	u32 cpu = -1;
+	u64 period = 1;
+	void *more_data = event->ip.__more_data;
+	struct thread *thread = threads__findnew(event->ip.pid);
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		timestamp = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		cpu = *(u32 *)more_data;
+		more_data += sizeof(u32);
+		more_data += sizeof(u32); /* reserved */
+	}
+
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		period = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid, event->ip.tid,
+		(void *)(long)ip,
+		(long long)period);
+
+	if (thread == NULL) {
+		pr_debug("problem processing %d event, skipping it.\n",
+			 event->header.type);
+		return -1;
+	}
+
+	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	/* What remains at more_data is the raw (size, data[]) record. */
+	process_raw_event(event, more_data, cpu, timestamp, thread);
+
+	return 0;
+}
+
+/*
+ * Remember the file's sample_type and insist on PERF_SAMPLE_RAW, since
+ * the kmem tracepoint payload only exists in raw samples.
+ */
+static int sample_type_check(u64 type)
+{
+	sample_type = type;
+
+	if (!(sample_type & PERF_SAMPLE_RAW)) {
+		fprintf(stderr,
+			"No trace sample to read. Did you call perf record "
+			"without -R?\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Dispatch table handed to mmap_dispatch_perf_file(). */
+static struct perf_file_handler file_handler = {
+	.process_sample_event	= process_sample_event,
+	.process_comm_event	= process_comm_event,
+	.sample_type_check	= sample_type_check,
+};
+
+/* Register our handlers and stream the input file through them. */
+static int read_events(void)
+{
+	register_idle_thread();
+	register_perf_file_handler(&file_handler);
+
+	return mmap_dispatch_perf_file(&header, input_name, 0, 0,
+				       &cwdlen, &cwd);
+}
+
+/*
+ * Internal fragmentation as a percentage of allocated bytes; an empty
+ * dataset (n_alloc == 0) counts as 0% rather than dividing by zero.
+ */
+static double fragmentation(unsigned long n_req, unsigned long n_alloc)
+{
+	return n_alloc ? 100.0 - (100.0 * n_req / n_alloc) : 0.0;
+}
+
+/*
+ * Print one result table (caller or alloc view) from an already-sorted
+ * tree, limited to n_lines rows; n_lines == -1 means "no limit".
+ * Per-hit averages are safe: every inserted node has hit >= 1.
+ */
+static void __print_result(struct rb_root *root, int n_lines, int is_caller)
+{
+	struct rb_node *next;
+
+	printf("\n ------------------------------------------------------------------------------\n");
+	if (is_caller)
+		printf(" Callsite |");
+	else
+		printf(" Alloc Ptr |");
+	printf(" Total_alloc/Per | Total_req/Per | Hit | Fragmentation\n");
+	printf(" ------------------------------------------------------------------------------\n");
+
+	next = rb_first(root);
+
+	while (next && n_lines--) {
+		struct alloc_stat *data;
+
+		data = rb_entry(next, struct alloc_stat, node);
+
+		printf(" %-16p | %8llu/%-6lu | %8llu/%-6lu | %6lu | %8.3f%%\n",
+		       is_caller ? (void *)(unsigned long)data->call_site :
+				   (void *)(unsigned long)data->ptr,
+		       (unsigned long long)data->bytes_alloc,
+		       (unsigned long)data->bytes_alloc / data->hit,
+		       (unsigned long long)data->bytes_req,
+		       (unsigned long)data->bytes_req / data->hit,
+		       (unsigned long)data->hit,
+		       fragmentation(data->bytes_req, data->bytes_alloc));
+
+		next = rb_next(next);
+	}
+
+	/*
+	 * The loop leaves n_lines at -1 exactly when the limit truncated
+	 * output.  NOTE(review): an empty tree with the default limit (-1)
+	 * also hits this and prints a lone "..." row -- confirm intended.
+	 */
+	if (n_lines == -1)
+		printf(" ... | ... | ... | ... | ... \n");
+
+	printf(" ------------------------------------------------------------------------------\n");
+}
+
+/* Print dataset-wide totals and the overall internal fragmentation. */
+static void print_summary(void)
+{
+	printf("\nSUMMARY\n=======\n");
+	printf("Total bytes requested: %lu\n", total_requested);
+	printf("Total bytes allocated: %lu\n", total_allocated);
+	printf("Total bytes wasted on internal fragmentation: %lu\n",
+	       total_allocated - total_requested);
+	printf("Internal fragmentation: %f%%\n",
+	       fragmentation(total_requested, total_allocated));
+}
+
+/* Print whichever tables were requested via --stat, then the summary. */
+static void print_result(void)
+{
+	if (caller_flag)
+		__print_result(&root_caller_sorted, caller_lines, 1);
+	if (alloc_flag)
+		__print_result(&root_alloc_sorted, alloc_lines, 0);
+	print_summary();
+}
+
+/*
+ * Insert @data into @root ordered by @sort_fn; entries comparing
+ * greater end up on the left, i.e. first in rb_first() order.
+ */
+static void sort_insert(struct rb_root *root, struct alloc_stat *data,
+			sort_fn_t sort_fn)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct alloc_stat *entry;
+
+		parent = *link;
+		entry = rb_entry(parent, struct alloc_stat, node);
+
+		if (sort_fn(data, entry) > 0)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&data->node, parent, link);
+	rb_insert_color(&data->node, root);
+}
+
+/* Drain @root, re-inserting every entry into @root_sorted via @sort_fn. */
+static void __sort_result(struct rb_root *root, struct rb_root *root_sorted,
+			  sort_fn_t sort_fn)
+{
+	struct rb_node *node;
+
+	while ((node = rb_first(root)) != NULL) {
+		struct alloc_stat *data;
+
+		rb_erase(node, root);
+		data = rb_entry(node, struct alloc_stat, node);
+		sort_insert(root_sorted, data, sort_fn);
+	}
+}
+
+/* Move both stat trees into their sorted counterparts for printing. */
+static void sort_result(void)
+{
+	__sort_result(&root_alloc_stat, &root_alloc_sorted, alloc_sort_fn);
+	__sort_result(&root_caller_stat, &root_caller_sorted, caller_sort_fn);
+}
+
+/* Analysis mode: read perf.data, sort, and print; 0 on success. */
+static int __cmd_kmem(void)
+{
+	int err;
+
+	setup_pager();
+
+	/* Fix: a failed/unreadable input file used to be silently ignored. */
+	err = read_events();
+	if (err)
+		return err;
+
+	sort_result();
+	print_result();
+
+	return 0;
+}
+
+/* Usage string shown by usage_with_options(). */
+static const char * const kmem_usage[] = {
+	"perf kmem [<options>] {record}",
+	NULL
+};
+
+
+/* Three-way compare on allocation pointer: -1/0/1 for l <,==,> r. */
+static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	return (l->ptr > r->ptr) - (l->ptr < r->ptr);
+}
+
+/* Three-way compare on call-site address: -1/0/1 for l <,==,> r. */
+static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	return (l->call_site > r->call_site) - (l->call_site < r->call_site);
+}
+
+/* Three-way compare on total bytes allocated: -1/0/1 for l <,==,> r. */
+static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	return (l->bytes_alloc > r->bytes_alloc) -
+	       (l->bytes_alloc < r->bytes_alloc);
+}
+
+/*
+ * --sort <key>: select the comparison function.  The chosen key applies
+ * to whichever of --stat caller/alloc was given last (see the flag
+ * ordering set up in parse_stat_opt()).
+ */
+static int parse_sort_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	sort_fn_t cmp;
+
+	if (!arg)
+		return -1;
+
+	if (!strcmp(arg, "ptr"))
+		cmp = ptr_cmp;
+	else if (!strcmp(arg, "call_site"))
+		cmp = callsite_cmp;
+	else if (!strcmp(arg, "bytes"))
+		cmp = bytes_cmp;
+	else
+		return -1;
+
+	if (caller_flag > alloc_flag)
+		caller_sort_fn = cmp;
+	else
+		alloc_sort_fn = cmp;
+
+	return 0;
+}
+
+/*
+ * --stat alloc / --stat caller.  Each flag is set to (other flag + 1)
+ * so that comparing the two later reveals which selector came last;
+ * subsequent --sort/--line options attach to that one.
+ */
+static int parse_stat_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	if (!arg)
+		return -1;
+
+	if (strcmp(arg, "alloc") == 0)
+		alloc_flag = (caller_flag + 1);
+	else if (strcmp(arg, "caller") == 0)
+		caller_flag = (alloc_flag + 1);
+	else
+		return -1;
+	return 0;
+}
+
+/*
+ * -l/--line <num>: cap table output at <num> rows for the stat view
+ * selected last.
+ * NOTE(review): strtoul errors are not detected here -- non-numeric
+ * input silently becomes 0 lines; consider strtol + endptr checking.
+ */
+static int parse_line_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	int lines;
+
+	if (!arg)
+		return -1;
+
+	lines = strtoul(arg, NULL, 10);
+
+	if (caller_flag > alloc_flag)
+		caller_lines = lines;
+	else
+		alloc_lines = lines;
+
+	return 0;
+}
+
+static const struct option kmem_options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		   "input file name"),
+	OPT_CALLBACK(0, "stat", NULL, "<alloc>|<caller>",
+		     "stat selector, Pass 'alloc' or 'caller'.",
+		     parse_stat_opt),
+	/* Advertise only the keys parse_sort_opt() actually accepts
+	 * ('hit' was listed but never implemented). */
+	OPT_CALLBACK('s', "sort", NULL, "key",
+		     "sort by key: ptr, call_site, bytes",
+		     parse_sort_opt),
+	OPT_CALLBACK('l', "line", NULL, "num",
+		     "show n lines",
+		     parse_line_opt),
+	OPT_END()
+};
+
+/*
+ * Fixed argument list forwarded to 'perf record': system-wide (-a),
+ * raw sample records (-R, required by sample_type_check()), a sample
+ * for every event (-c 1), and all kmem alloc/free tracepoints.
+ */
+static const char *record_args[] = {
+	"record",
+	"-a",
+	"-R",
+	"-M",
+	"-f",
+	"-c", "1",
+	"-e", "kmem:kmalloc",
+	"-e", "kmem:kmalloc_node",
+	"-e", "kmem:kfree",
+	"-e", "kmem:kmem_cache_alloc",
+	"-e", "kmem:kmem_cache_alloc_node",
+	"-e", "kmem:kmem_cache_free",
+};
+
+/*
+ * 'perf kmem record ...': prepend the fixed record_args to the user's
+ * extra arguments and hand the combined argv to cmd_record().
+ */
+static int __cmd_record(int argc, const char **argv)
+{
+	unsigned int rec_argc, i, j;
+	const char **rec_argv;
+
+	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+	/* Fix: a failed allocation was previously dereferenced below. */
+	if (rec_argv == NULL)
+		die("calloc");
+
+	for (i = 0; i < ARRAY_SIZE(record_args); i++)
+		rec_argv[i] = strdup(record_args[i]);
+
+	/* argv[0] is the "record" subcommand itself; skip it. */
+	for (j = 1; j < (unsigned int)argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
+/*
+ * Entry point for 'perf kmem': dispatch to record mode or analyse an
+ * existing perf.data file with the options parsed above.
+ */
+int cmd_kmem(int argc, const char **argv, const char *prefix __used)
+{
+	symbol__init(0);
+
+	argc = parse_options(argc, argv, kmem_options, kmem_usage, 0);
+
+	if (argc && !strncmp(argv[0], "rec", 3))
+		return __cmd_record(argc, argv);
+	else if (argc)
+		usage_with_options(kmem_usage, kmem_options);
+
+	/* Default sort order for both views: total bytes allocated. */
+	if (!alloc_sort_fn)
+		alloc_sort_fn = bytes_cmp;
+	if (!caller_sort_fn)
+		caller_sort_fn = bytes_cmp;
+
+	return __cmd_kmem();
+}
+
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 9b02d85..a3d8bf6 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -28,5 +28,6 @@ extern int cmd_top(int argc, const char **argv, const char *prefix);
extern int cmd_trace(int argc, const char **argv, const char *prefix);
extern int cmd_version(int argc, const char **argv, const char *prefix);
extern int cmd_probe(int argc, const char **argv, const char *prefix);
+extern int cmd_kmem(int argc, const char **argv, const char *prefix);
#endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 89b82ac..cf64049 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -285,20 +285,21 @@ static void handle_internal_command(int argc, const char **argv)
{
const char *cmd = argv[0];
static struct cmd_struct commands[] = {
- { "help", cmd_help, 0 },
- { "list", cmd_list, 0 },
{ "buildid-list", cmd_buildid_list, 0 },
- { "record", cmd_record, 0 },
- { "report", cmd_report, 0 },
- { "bench", cmd_bench, 0 },
- { "stat", cmd_stat, 0 },
- { "timechart", cmd_timechart, 0 },
- { "top", cmd_top, 0 },
- { "annotate", cmd_annotate, 0 },
- { "version", cmd_version, 0 },
- { "trace", cmd_trace, 0 },
- { "sched", cmd_sched, 0 },
- { "probe", cmd_probe, 0 },
+ { "help", cmd_help, 0 },
+ { "list", cmd_list, 0 },
+ { "record", cmd_record, 0 },
+ { "report", cmd_report, 0 },
+ { "bench", cmd_bench, 0 },
+ { "stat", cmd_stat, 0 },
+ { "timechart", cmd_timechart, 0 },
+ { "top", cmd_top, 0 },
+ { "annotate", cmd_annotate, 0 },
+ { "version", cmd_version, 0 },
+ { "trace", cmd_trace, 0 },
+ { "sched", cmd_sched, 0 },
+ { "probe", cmd_probe, 0 },
+ { "kmem", cmd_kmem, 0 },
};
unsigned int i;
static const char ext[] = STRIP_EXTENSION;
--
1.6.3
The kmem trace events can replace the functions of kmemtrace
tracer.
And kmemtrace-user can be modified to use trace events.
(But after cloning the git repo, I found it is still based on
the original relay version.) Besides, we now have the
'perf kmem' tool.
Signed-off-by: Li Zefan <[email protected]>
---
Documentation/ABI/testing/debugfs-kmemtrace | 71 ----
Documentation/trace/kmemtrace.txt | 126 -------
MAINTAINERS | 7 -
include/linux/kmemtrace.h | 25 --
include/linux/slab_def.h | 16 +-
include/linux/slub_def.h | 14 +-
init/main.c | 2 -
kernel/trace/Kconfig | 20 -
kernel/trace/Makefile | 1 -
kernel/trace/kmemtrace.c | 511 ---------------------------
kernel/trace/trace.h | 11 -
kernel/trace/trace_entries.h | 35 --
mm/slab.c | 11 -
mm/slub.c | 3 -
14 files changed, 7 insertions(+), 846 deletions(-)
delete mode 100644 Documentation/ABI/testing/debugfs-kmemtrace
delete mode 100644 Documentation/trace/kmemtrace.txt
delete mode 100644 include/linux/kmemtrace.h
delete mode 100644 kernel/trace/kmemtrace.c
diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace
deleted file mode 100644
index 5e6a92a..0000000
--- a/Documentation/ABI/testing/debugfs-kmemtrace
+++ /dev/null
@@ -1,71 +0,0 @@
-What: /sys/kernel/debug/kmemtrace/
-Date: July 2008
-Contact: Eduard - Gabriel Munteanu <[email protected]>
-Description:
-
-In kmemtrace-enabled kernels, the following files are created:
-
-/sys/kernel/debug/kmemtrace/
- cpu<n> (0400) Per-CPU tracing data, see below. (binary)
- total_overruns (0400) Total number of bytes which were dropped from
- cpu<n> files because of full buffer condition,
- non-binary. (text)
- abi_version (0400) Kernel's kmemtrace ABI version. (text)
-
-Each per-CPU file should be read according to the relay interface. That is,
-the reader should set affinity to that specific CPU and, as currently done by
-the userspace application (though there are other methods), use poll() with
-an infinite timeout before every read(). Otherwise, erroneous data may be
-read. The binary data has the following _core_ format:
-
- Event ID (1 byte) Unsigned integer, one of:
- 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC)
- 1 - represents a freeing of previously allocated memory
- (KMEMTRACE_EVENT_FREE)
- Type ID (1 byte) Unsigned integer, one of:
- 0 - this is a kmalloc() / kfree()
- 1 - this is a kmem_cache_alloc() / kmem_cache_free()
- 2 - this is a __get_free_pages() et al.
- Event size (2 bytes) Unsigned integer representing the
- size of this event. Used to extend
- kmemtrace. Discard the bytes you
- don't know about.
- Sequence number (4 bytes) Signed integer used to reorder data
- logged on SMP machines. Wraparound
- must be taken into account, although
- it is unlikely.
- Caller address (8 bytes) Return address to the caller.
- Pointer to mem (8 bytes) Pointer to target memory area. Can be
- NULL, but not all such calls might be
- recorded.
-
-In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow:
-
- Requested bytes (8 bytes) Total number of requested bytes,
- unsigned, must not be zero.
- Allocated bytes (8 bytes) Total number of actually allocated
- bytes, unsigned, must not be lower
- than requested bytes.
- Requested flags (4 bytes) GFP flags supplied by the caller.
- Target CPU (4 bytes) Signed integer, valid for event id 1.
- If equal to -1, target CPU is the same
- as origin CPU, but the reverse might
- not be true.
-
-The data is made available in the same endianness the machine has.
-
-Other event ids and type ids may be defined and added. Other fields may be
-added by increasing event size, but see below for details.
-Every modification to the ABI, including new id definitions, are followed
-by bumping the ABI version by one.
-
-Adding new data to the packet (features) is done at the end of the mandatory
-data:
- Feature size (2 byte)
- Feature ID (1 byte)
- Feature data (Feature size - 3 bytes)
-
-
-Users:
- kmemtrace-user - git://repo.or.cz/kmemtrace-user.git
-
diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
deleted file mode 100644
index 6308735..0000000
--- a/Documentation/trace/kmemtrace.txt
+++ /dev/null
@@ -1,126 +0,0 @@
- kmemtrace - Kernel Memory Tracer
-
- by Eduard - Gabriel Munteanu
- <[email protected]>
-
-I. Introduction
-===============
-
-kmemtrace helps kernel developers figure out two things:
-1) how different allocators (SLAB, SLUB etc.) perform
-2) how kernel code allocates memory and how much
-
-To do this, we trace every allocation and export information to the userspace
-through the relay interface. We export things such as the number of requested
-bytes, the number of bytes actually allocated (i.e. including internal
-fragmentation), whether this is a slab allocation or a plain kmalloc() and so
-on.
-
-The actual analysis is performed by a userspace tool (see section III for
-details on where to get it from). It logs the data exported by the kernel,
-processes it and (as of writing this) can provide the following information:
-- the total amount of memory allocated and fragmentation per call-site
-- the amount of memory allocated and fragmentation per allocation
-- total memory allocated and fragmentation in the collected dataset
-- number of cross-CPU allocation and frees (makes sense in NUMA environments)
-
-Moreover, it can potentially find inconsistent and erroneous behavior in
-kernel code, such as using slab free functions on kmalloc'ed memory or
-allocating less memory than requested (but not truly failed allocations).
-
-kmemtrace also makes provisions for tracing on some arch and analysing the
-data on another.
-
-II. Design and goals
-====================
-
-kmemtrace was designed to handle rather large amounts of data. Thus, it uses
-the relay interface to export whatever is logged to userspace, which then
-stores it. Analysis and reporting is done asynchronously, that is, after the
-data is collected and stored. By design, it allows one to log and analyse
-on different machines and different arches.
-
-As of writing this, the ABI is not considered stable, though it might not
-change much. However, no guarantees are made about compatibility yet. When
-deemed stable, the ABI should still allow easy extension while maintaining
-backward compatibility. This is described further in Documentation/ABI.
-
-Summary of design goals:
- - allow logging and analysis to be done across different machines
- - be fast and anticipate usage in high-load environments (*)
- - be reasonably extensible
- - make it possible for GNU/Linux distributions to have kmemtrace
- included in their repositories
-
-(*) - one of the reasons Pekka Enberg's original userspace data analysis
- tool's code was rewritten from Perl to C (although this is more than a
- simple conversion)
-
-
-III. Quick usage guide
-======================
-
-1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable
-CONFIG_KMEMTRACE).
-
-2) Get the userspace tool and build it:
-$ git clone git://repo.or.cz/kmemtrace-user.git # current repository
-$ cd kmemtrace-user/
-$ ./autogen.sh
-$ ./configure
-$ make
-
-3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the
-'single' runlevel (so that relay buffers don't fill up easily), and run
-kmemtrace:
-# '$' does not mean user, but root here.
-$ mount -t debugfs none /sys/kernel/debug
-$ mount -t proc none /proc
-$ cd path/to/kmemtrace-user/
-$ ./kmemtraced
-Wait a bit, then stop it with CTRL+C.
-$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't
- # overrun, should
- # be zero.
-$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to
- check its correctness]
-$ ./kmemtrace-report
-
-Now you should have a nice and short summary of how the allocator performs.
-
-IV. FAQ and known issues
-========================
-
-Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix
-this? Should I worry?
-A: If it's non-zero, this affects kmemtrace's accuracy, depending on how
-large the number is. You can fix it by supplying a higher
-'kmemtrace.subbufs=N' kernel parameter.
----
-
-Q: kmemtrace_check reports errors, how do I fix this? Should I worry?
-A: This is a bug and should be reported. It can occur for a variety of
-reasons:
- - possible bugs in relay code
- - possible misuse of relay by kmemtrace
- - timestamps being collected unorderly
-Or you may fix it yourself and send us a patch.
----
-
-Q: kmemtrace_report shows many errors, how do I fix this? Should I worry?
-A: This is a known issue and I'm working on it. These might be true errors
-in kernel code, which may have inconsistent behavior (e.g. allocating memory
-with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed
-out this behavior may work with SLAB, but may fail with other allocators.
-
-It may also be due to lack of tracing in some unusual allocator functions.
-
-We don't want bug reports regarding this issue yet.
----
-
-V. See also
-===========
-
-Documentation/kernel-parameters.txt
-Documentation/ABI/testing/debugfs-kmemtrace
-
diff --git a/MAINTAINERS b/MAINTAINERS
index c824b4d..dfb2e7d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3100,13 +3100,6 @@ F: include/linux/kmemleak.h
F: mm/kmemleak.c
F: mm/kmemleak-test.c
-KMEMTRACE
-M: Eduard - Gabriel Munteanu <[email protected]>
-S: Maintained
-F: Documentation/trace/kmemtrace.txt
-F: include/linux/kmemtrace.h
-F: kernel/trace/kmemtrace.c
-
KPROBES
M: Ananth N Mavinakayanahalli <[email protected]>
M: Anil S Keshavamurthy <[email protected]>
diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h
deleted file mode 100644
index b616d39..0000000
--- a/include/linux/kmemtrace.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#ifndef _LINUX_KMEMTRACE_H
-#define _LINUX_KMEMTRACE_H
-
-#ifdef __KERNEL__
-
-#include <trace/events/kmem.h>
-
-#ifdef CONFIG_KMEMTRACE
-extern void kmemtrace_init(void);
-#else
-static inline void kmemtrace_init(void)
-{
-}
-#endif
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_KMEMTRACE_H */
-
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 850d057..56171d6 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -14,7 +14,8 @@
#include <asm/page.h> /* kmalloc_sizes.h needs PAGE_SIZE */
#include <asm/cache.h> /* kmalloc_sizes.h needs L1_CACHE_BYTES */
#include <linux/compiler.h>
-#include <linux/kmemtrace.h>
+
+#include <trace/events/kmem.h>
/*
* struct kmem_cache
@@ -108,22 +109,13 @@ struct cache_sizes {
extern struct cache_sizes malloc_sizes[];
void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
+void *kmem_cache_alloc_notrace(struct kmem_cache *, gfp_t);
void *__kmalloc(size_t size, gfp_t flags);
-#ifdef CONFIG_KMEMTRACE
-extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags);
-extern size_t slab_buffer_size(struct kmem_cache *cachep);
-#else
-static __always_inline void *
-kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
-{
- return kmem_cache_alloc(cachep, flags);
-}
static inline size_t slab_buffer_size(struct kmem_cache *cachep)
{
- return 0;
+ return cachep->buffer_size;
}
-#endif
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 5ad70a6..b41dd8c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,9 +10,10 @@
#include <linux/gfp.h>
#include <linux/workqueue.h>
#include <linux/kobject.h>
-#include <linux/kmemtrace.h>
#include <linux/kmemleak.h>
+#include <trace/events/kmem.h>
+
enum stat_item {
ALLOC_FASTPATH, /* Allocation from cpu slab */
ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */
@@ -215,18 +216,9 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size)
#endif
void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
+void *kmem_cache_alloc_notrace(struct kmem_cache *, gfp_t);
void *__kmalloc(size_t size, gfp_t flags);
-#ifdef CONFIG_KMEMTRACE
-extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags);
-#else
-static __always_inline void *
-kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
-{
- return kmem_cache_alloc(s, gfpflags);
-}
-#endif
-
static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
{
unsigned int order = get_order(size);
diff --git a/init/main.c b/init/main.c
index 5988deb..daf13a4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -66,7 +66,6 @@
#include <linux/ftrace.h>
#include <linux/async.h>
#include <linux/kmemcheck.h>
-#include <linux/kmemtrace.h>
#include <linux/sfi.h>
#include <linux/shmem_fs.h>
#include <trace/boot.h>
@@ -645,7 +644,6 @@ asmlinkage void __init start_kernel(void)
#endif
page_cgroup_init();
enable_debug_pagealloc();
- kmemtrace_init();
kmemleak_init();
debug_objects_mem_init();
idr_init_cache();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 20e3695..530bbb1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -406,26 +406,6 @@ config HW_BRANCH_TRACER
This tracer records all branches on the system in a circular
buffer giving access to the last N branches for each cpu.
-config KMEMTRACE
- bool "Trace SLAB allocations"
- select GENERIC_TRACER
- help
- kmemtrace provides tracing for slab allocator functions, such as
- kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
- data is then fed to the userspace application in order to analyse
- allocation hotspots, internal fragmentation and so on, making it
- possible to see how well an allocator performs, as well as debug
- and profile kernel code.
-
- This requires an userspace application to use. See
- Documentation/trace/kmemtrace.txt for more information.
-
- Saying Y will make the kernel somewhat larger and slower. However,
- if you disable kmemtrace at run-time or boot-time, the performance
- impact is minimal (depending on the arch the kernel is built for).
-
- If unsure, say N.
-
config WORKQUEUE_TRACER
bool "Trace workqueues"
select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd8..cd5ed77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
-obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
ifeq ($(CONFIG_BLOCK),y)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index a91da69..0000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Memory allocator tracing
- *
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- * Copyright (C) 2008 Pekka Enberg <[email protected]>
- * Copyright (C) 2008 Frederic Weisbecker <[email protected]>
- */
-
-#include <linux/tracepoint.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/dcache.h>
-#include <linux/fs.h>
-
-#include <linux/kmemtrace.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-/* Select an alternative, minimalistic output than the original one */
-#define TRACE_KMEM_OPT_MINIMAL 0x1
-
-static struct tracer_opt kmem_opts[] = {
- /* Default disable the minimalistic output */
- { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
- { }
-};
-
-static struct tracer_flags kmem_tracer_flags = {
- .val = 0,
- .opts = kmem_opts
-};
-
-static struct trace_array *kmemtrace_array;
-
-/* Trace allocations */
-static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
- unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- struct ftrace_event_call *call = &event_kmem_alloc;
- struct trace_array *tr = kmemtrace_array;
- struct kmemtrace_alloc_entry *entry;
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
- if (!event)
- return;
-
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
-
- entry->ent.type = TRACE_KMEM_ALLOC;
- entry->type_id = type_id;
- entry->call_site = call_site;
- entry->ptr = ptr;
- entry->bytes_req = bytes_req;
- entry->bytes_alloc = bytes_alloc;
- entry->gfp_flags = gfp_flags;
- entry->node = node;
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
-
- trace_wake_up();
-}
-
-static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
- unsigned long call_site,
- const void *ptr)
-{
- struct ftrace_event_call *call = &event_kmem_free;
- struct trace_array *tr = kmemtrace_array;
- struct kmemtrace_free_entry *entry;
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
-
- entry->ent.type = TRACE_KMEM_FREE;
- entry->type_id = type_id;
- entry->call_site = call_site;
- entry->ptr = ptr;
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
-
- trace_wake_up();
-}
-
-static void kmemtrace_kmalloc(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmalloc_node(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
-{
- kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
-}
-
-static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
-{
- kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
-}
-
-static int kmemtrace_start_probes(void)
-{
- int err;
-
- err = register_trace_kmalloc(kmemtrace_kmalloc);
- if (err)
- return err;
- err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
- if (err)
- return err;
- err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
- if (err)
- return err;
- err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
- if (err)
- return err;
- err = register_trace_kfree(kmemtrace_kfree);
- if (err)
- return err;
- err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
-
- return err;
-}
-
-static void kmemtrace_stop_probes(void)
-{
- unregister_trace_kmalloc(kmemtrace_kmalloc);
- unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
- unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
- unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
- unregister_trace_kfree(kmemtrace_kfree);
- unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
-}
-
-static int kmem_trace_init(struct trace_array *tr)
-{
- kmemtrace_array = tr;
-
- tracing_reset_online_cpus(tr);
-
- kmemtrace_start_probes();
-
- return 0;
-}
-
-static void kmem_trace_reset(struct trace_array *tr)
-{
- kmemtrace_stop_probes();
-}
-
-static void kmemtrace_headers(struct seq_file *s)
-{
- /* Don't need headers for the original kmemtrace output */
- if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
- return;
-
- seq_printf(s, "#\n");
- seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
- " POINTER NODE CALLER\n");
- seq_printf(s, "# FREE | | | | "
- " | | | |\n");
- seq_printf(s, "# |\n\n");
-}
-
-/*
- * The following functions give the original output from kmemtrace,
- * plus the origin CPU, since reordering occurs in-kernel now.
- */
-
-#define KMEMTRACE_USER_ALLOC 0
-#define KMEMTRACE_USER_FREE 1
-
-struct kmemtrace_user_event {
- u8 event_id;
- u8 type_id;
- u16 event_size;
- u32 cpu;
- u64 timestamp;
- unsigned long call_site;
- unsigned long ptr;
-};
-
-struct kmemtrace_user_event_alloc {
- size_t bytes_req;
- size_t bytes_alloc;
- unsigned gfp_flags;
- int node;
-};
-
-static enum print_line_t
-kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
-{
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_alloc_entry *entry;
- int ret;
-
- trace_assign_type(entry, iter->ent);
-
- ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
- "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
- entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
- (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
- (unsigned long)entry->gfp_flags, entry->node);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free(struct trace_iterator *iter, int flags)
-{
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_free_entry *entry;
- int ret;
-
- trace_assign_type(entry, iter->ent);
-
- ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
- entry->type_id, (void *)entry->call_site,
- (unsigned long)entry->ptr);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
-{
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_alloc_entry *entry;
- struct kmemtrace_user_event *ev;
- struct kmemtrace_user_event_alloc *ev_alloc;
-
- trace_assign_type(entry, iter->ent);
-
- ev = trace_seq_reserve(s, sizeof(*ev));
- if (!ev)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ev->event_id = KMEMTRACE_USER_ALLOC;
- ev->type_id = entry->type_id;
- ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
- ev->cpu = iter->cpu;
- ev->timestamp = iter->ts;
- ev->call_site = entry->call_site;
- ev->ptr = (unsigned long)entry->ptr;
-
- ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
- if (!ev_alloc)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ev_alloc->bytes_req = entry->bytes_req;
- ev_alloc->bytes_alloc = entry->bytes_alloc;
- ev_alloc->gfp_flags = entry->gfp_flags;
- ev_alloc->node = entry->node;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
-{
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_free_entry *entry;
- struct kmemtrace_user_event *ev;
-
- trace_assign_type(entry, iter->ent);
-
- ev = trace_seq_reserve(s, sizeof(*ev));
- if (!ev)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ev->event_id = KMEMTRACE_USER_FREE;
- ev->type_id = entry->type_id;
- ev->event_size = sizeof(*ev);
- ev->cpu = iter->cpu;
- ev->timestamp = iter->ts;
- ev->call_site = entry->call_site;
- ev->ptr = (unsigned long)entry->ptr;
-
- return TRACE_TYPE_HANDLED;
-}
-
-/* The two other following provide a more minimalistic output */
-static enum print_line_t
-kmemtrace_print_alloc_compress(struct trace_iterator *iter)
-{
- struct kmemtrace_alloc_entry *entry;
- struct trace_seq *s = &iter->seq;
- int ret;
-
- trace_assign_type(entry, iter->ent);
-
- /* Alloc entry */
- ret = trace_seq_printf(s, " + ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Type */
- switch (entry->type_id) {
- case KMEMTRACE_TYPE_KMALLOC:
- ret = trace_seq_printf(s, "K ");
- break;
- case KMEMTRACE_TYPE_CACHE:
- ret = trace_seq_printf(s, "C ");
- break;
- case KMEMTRACE_TYPE_PAGES:
- ret = trace_seq_printf(s, "P ");
- break;
- default:
- ret = trace_seq_printf(s, "? ");
- }
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Requested */
- ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Allocated */
- ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Flags
- * TODO: would be better to see the name of the GFP flag names
- */
- ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Pointer to allocated */
- ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Node and call site*/
- ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
- (void *)entry->call_site);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_compress(struct trace_iterator *iter)
-{
- struct kmemtrace_free_entry *entry;
- struct trace_seq *s = &iter->seq;
- int ret;
-
- trace_assign_type(entry, iter->ent);
-
- /* Free entry */
- ret = trace_seq_printf(s, " - ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Type */
- switch (entry->type_id) {
- case KMEMTRACE_TYPE_KMALLOC:
- ret = trace_seq_printf(s, "K ");
- break;
- case KMEMTRACE_TYPE_CACHE:
- ret = trace_seq_printf(s, "C ");
- break;
- case KMEMTRACE_TYPE_PAGES:
- ret = trace_seq_printf(s, "P ");
- break;
- default:
- ret = trace_seq_printf(s, "? ");
- }
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Skip requested/allocated/flags */
- ret = trace_seq_printf(s, " ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Pointer to allocated */
- ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Skip node and print call site*/
- ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
-
- if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
- return TRACE_TYPE_UNHANDLED;
-
- switch (entry->type) {
- case TRACE_KMEM_ALLOC:
- return kmemtrace_print_alloc_compress(iter);
- case TRACE_KMEM_FREE:
- return kmemtrace_print_free_compress(iter);
- default:
- return TRACE_TYPE_UNHANDLED;
- }
-}
-
-static struct trace_event kmem_trace_alloc = {
- .type = TRACE_KMEM_ALLOC,
- .trace = kmemtrace_print_alloc,
- .binary = kmemtrace_print_alloc_user,
-};
-
-static struct trace_event kmem_trace_free = {
- .type = TRACE_KMEM_FREE,
- .trace = kmemtrace_print_free,
- .binary = kmemtrace_print_free_user,
-};
-
-static struct tracer kmem_tracer __read_mostly = {
- .name = "kmemtrace",
- .init = kmem_trace_init,
- .reset = kmem_trace_reset,
- .print_line = kmemtrace_print_line,
- .print_header = kmemtrace_headers,
- .flags = &kmem_tracer_flags
-};
-
-void kmemtrace_init(void)
-{
- /* earliest opportunity to start kmem tracing */
-}
-
-static int __init init_kmem_tracer(void)
-{
- if (!register_ftrace_event(&kmem_trace_alloc)) {
- pr_warning("Warning: could not register kmem events\n");
- return 1;
- }
-
- if (!register_ftrace_event(&kmem_trace_free)) {
- pr_warning("Warning: could not register kmem events\n");
- return 1;
- }
-
- if (register_tracer(&kmem_tracer) != 0) {
- pr_warning("Warning: could not register the kmem tracer\n");
- return 1;
- }
-
- return 0;
-}
-device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5d6398b..d850dc2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -10,7 +10,6 @@
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
#include <trace/boot.h>
-#include <linux/kmemtrace.h>
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
@@ -43,12 +42,6 @@ enum trace_type {
__TRACE_LAST_TYPE,
};
-enum kmemtrace_type_id {
- KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
- KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
- KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
-};
-
extern struct tracer boot_tracer;
#undef __field
@@ -230,10 +223,6 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
- IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
- TRACE_KMEM_ALLOC); \
- IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
- TRACE_KMEM_FREE); \
IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
__ftrace_bad_type(); \
} while (0)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f..7564b56 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -330,41 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry,
F_printk("from: %llx to: %llx", __entry->from, __entry->to)
);
-FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
-
- TRACE_KMEM_ALLOC,
-
- F_STRUCT(
- __field( enum kmemtrace_type_id, type_id )
- __field( unsigned long, call_site )
- __field( const void *, ptr )
- __field( size_t, bytes_req )
- __field( size_t, bytes_alloc )
- __field( gfp_t, gfp_flags )
- __field( int, node )
- ),
-
- F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
- " flags:%x node:%d",
- __entry->type_id, __entry->call_site, __entry->ptr,
- __entry->bytes_req, __entry->bytes_alloc,
- __entry->gfp_flags, __entry->node)
-);
-
-FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
-
- TRACE_KMEM_FREE,
-
- F_STRUCT(
- __field( enum kmemtrace_type_id, type_id )
- __field( unsigned long, call_site )
- __field( const void *, ptr )
- ),
-
- F_printk("type:%u call_site:%lx ptr:%p",
- __entry->type_id, __entry->call_site, __entry->ptr)
-);
-
FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
TRACE_KSYM,
diff --git a/mm/slab.c b/mm/slab.c
index 7dfa481..07e4072 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,6 @@
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
-#include <linux/kmemtrace.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -490,14 +489,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#ifdef CONFIG_KMEMTRACE
-size_t slab_buffer_size(struct kmem_cache *cachep)
-{
- return cachep->buffer_size;
-}
-EXPORT_SYMBOL(slab_buffer_size);
-#endif
-
/*
* Do not go above this order unless 0 objects fit into the slab.
*/
@@ -3558,13 +3549,11 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
-#ifdef CONFIG_KMEMTRACE
void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
{
return __cache_alloc(cachep, flags, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
/**
* kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
diff --git a/mm/slub.c b/mm/slub.c
index 4996fc7..97f2da6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -17,7 +17,6 @@
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/kmemtrace.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -1754,13 +1753,11 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
-#ifdef CONFIG_KMEMTRACE
void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
{
return slab_alloc(s, gfpflags, -1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
--
1.6.3
* Li Zefan <[email protected]> wrote:
> This tool is mostly a perf version of kmemtrace-user.
>
> The following information is provided by this tool:
>
> - the total amount of memory allocated and fragmentation per call-site
> - the total amount of memory allocated and fragmentation per allocation
> - total memory allocated and fragmentation in the collected dataset
> - ...
>
> # ./perf kmem record
> ^C
> # ./perf kmem --stat caller --stat alloc -l 10
>
> ------------------------------------------------------------------------------
> Callsite | Total_alloc/Per | Total_req/Per | Hit | Fragmentation
> ------------------------------------------------------------------------------
> 0xc052f37a | 790528/4096 | 790528/4096 | 193 | 0.000%
> 0xc0541d70 | 524288/4096 | 524288/4096 | 128 | 0.000%
> 0xc051cc68 | 481600/200 | 481600/200 | 2408 | 0.000%
> 0xc0572623 | 297444/676 | 297440/676 | 440 | 0.001%
> 0xc05399f1 | 73476/164 | 73472/164 | 448 | 0.005%
> 0xc05243bf | 51456/256 | 51456/256 | 201 | 0.000%
> 0xc0730d0e | 31844/497 | 31808/497 | 64 | 0.113%
> 0xc0734c4e | 17152/256 | 17152/256 | 67 | 0.000%
> 0xc0541a6d | 16384/128 | 16384/128 | 128 | 0.000%
> 0xc059c217 | 13120/40 | 13120/40 | 328 | 0.000%
> 0xc0501ee6 | 11264/88 | 11264/88 | 128 | 0.000%
> 0xc04daef0 | 7504/682 | 7128/648 | 11 | 5.011%
> 0xc04e14a3 | 4216/191 | 4216/191 | 22 | 0.000%
> 0xc05041ca | 3524/44 | 3520/44 | 80 | 0.114%
> 0xc0734fa3 | 2104/701 | 1620/540 | 3 | 23.004%
> 0xc05ec9f1 | 2024/289 | 2016/288 | 7 | 0.395%
> 0xc06a1999 | 1792/256 | 1792/256 | 7 | 0.000%
> 0xc0463b9a | 1584/144 | 1584/144 | 11 | 0.000%
> 0xc0541eb0 | 1024/16 | 1024/16 | 64 | 0.000%
> 0xc06a19ac | 896/128 | 896/128 | 7 | 0.000%
> 0xc05721c0 | 772/12 | 768/12 | 64 | 0.518%
> 0xc054d1e6 | 288/57 | 280/56 | 5 | 2.778%
> 0xc04b562e | 157/31 | 154/30 | 5 | 1.911%
> 0xc04b536f | 80/16 | 80/16 | 5 | 0.000%
> 0xc05855a0 | 64/64 | 36/36 | 1 | 43.750%
> ------------------------------------------------------------------------------
>
> ------------------------------------------------------------------------------
> Alloc Ptr | Total_alloc/Per | Total_req/Per | Hit | Fragmentation
> ------------------------------------------------------------------------------
> 0xda884000 | 1052672/4096 | 1052672/4096 | 257 | 0.000%
> 0xda886000 | 262144/4096 | 262144/4096 | 64 | 0.000%
> 0xf60c7c00 | 16512/128 | 16512/128 | 129 | 0.000%
> 0xf59a4118 | 13120/40 | 13120/40 | 328 | 0.000%
> 0xdfd4b2c0 | 11264/88 | 11264/88 | 128 | 0.000%
> 0xf5274600 | 7680/256 | 7680/256 | 30 | 0.000%
> 0xe8395000 | 5948/594 | 5464/546 | 10 | 8.137%
> 0xe59c3c00 | 5748/479 | 5712/476 | 12 | 0.626%
> 0xf4cd1a80 | 3524/44 | 3520/44 | 80 | 0.114%
> 0xe5bd1600 | 2892/482 | 2856/476 | 6 | 1.245%
> ... | ... | ... | ... | ...
> ------------------------------------------------------------------------------
>
> SUMMARY
> =======
> Total bytes requested: 2333626
> Total bytes allocated: 2353712
> Total bytes wasted on internal fragmentation: 20086
> Internal fragmentation: 0.853375%
Very impressive!
> TODO:
> - show sym+offset in 'callsite' column
The way to print symbolic information for the 'callsite' column is to
fill in and walk the thread->DSO->symbol trees that all perf tools
maintain:
/* simplified, without error handling */
ip = event->ip.ip;
thread = threads__findnew(event->ip.pid);
map = thread__find_map(thread, ip);
ip = map->map_ip(map, ip); /* map absolute RIP into DSO-relative one */
sym = map__find_symbol(map, ip, symbol_filter);
then sym->name is the string that can be printed out. This works in a
symmetric way for both kernel-space and user-space symbols. (Call-chain
information can be captured and displayed too.)
( 'Alloc Ptr' symbolization is harder, but it would be useful too i
think, to map it back to the slab cache name. )
> - show cross node allocation stats
I checked and we appear to have all the right events for that - the node
ID is being traced consistently AFAICS.
> - collect more useful stats?
> - ...
Pekka, Eduard and the other slab hackers might have ideas about what
other stats they generally like to see to judge the health of a workload
(or system).
If this iteration looks good to the slab folks then i can apply it as-is
and we can do the other changes relative to that. It looks good to me as
a first step, and it's functional already.
Ingo
On Fri, Nov 20, 2009 at 10:14 AM, Ingo Molnar <[email protected]> wrote:
> Pekka, Eduard and the other slab hackers might have ideas about what
> other stats they generally like to see to judge the health of a workload
> (or system).
kmalloc()/kfree() CPU ping-pong call-sites (i.e. alloc and free
happening on different CPUs) is one interesting metric we haven't
implemented yet. Valgrind massif tool type of output graph would be
helpful as well:
http://valgrind.org/docs/manual/ms-manual.html
On Fri, Nov 20, 2009 at 10:14 AM, Ingo Molnar <[email protected]> wrote:
> If this iteration looks good to the slab folks then i can apply it as-is
> and we can do the other changes relative to that. It looks good to me as
> a first step, and it's functional already.
Yeah, looks OK to me as the first step. Patch 2 looks premature,
though, looking at the output of "perf kmem" from patch 1.
Acked-by: Pekka Enberg <[email protected]>
Pekka
Li Zefan kirjoitti:
> The kmem trace events can replace the functions of kmemtrace
> tracer.
>
> And kmemtrace-user can be modified to use trace events.
> (But after cloning the git repo, I found it's still based on
> the original relay version..), not to mention now we have
> 'perf kmem' tool.
>
> Signed-off-by: Li Zefan <[email protected]>
NAK for the time being. "perf kmem" output is not yet as good as that of
kmemtrace-user.
Pekka
>> TODO:
>> - show sym+offset in 'callsite' column
>
> The way to print symbolic information for the 'callsite' column is to
> fill in and walk the thread->DSO->symbol trees that all perf tools
> maintain:
>
> /* simplified, without error handling */
>
> ip = event->ip.ip;
>
> thread = threads__findnew(event->ip.pid);
>
> map = thread__find_map(thread, ip);
>
> ip = map->map_ip(map, ip); /* map absolute RIP into DSO-relative one */
>
> sym = map__find_symbol(map, ip, symbol_filter);
>
> then sym->name is the string that can be printed out. This works in a
> symmetric way for both kernel-space and user-space symbols. (Call-chain
> information can be captured and displayed too.)
>
> ( 'Alloc Ptr' symbolization is harder, but it would be useful too i
> think, to map it back to the slab cache name. )
>
Thanks.
I was lazy to figure it out by myself. ;)
>> - show cross node allocation stats
>
> I checked and we appear to have all the right events for that - the node
> ID is being traced consistently AFAICS.
>
Actually kmemtrace-user shows these stats, but in a wrong way.
It doesn't map cpu_nr to node.
>> - collect more useful stats?
>> - ...
>
> Pekka, Eduard and the other slab hackers might have ideas about what
> other stats they generally like to see to judge the health of a workload
> (or system).
>
> If this iteration looks good to the slab folks then i can apply it as-is
> and we can do the other changes relative to that. It looks good to me as
> a first step, and it's functional already.
>
Thanks!
于 2009年11月20日 16:20, Pekka Enberg 写道:
> Li Zefan kirjoitti:
>> The kmem trace events can replace the functions of kmemtrace
>> tracer.
>>
>> And kmemtrace-user can be modified to use trace events.
>> (But after cloning the git repo, I found it's still based on
>> the original relay version..), not to mention now we have
>> 'perf kmem' tool.
>>
>> Signed-off-by: Li Zefan <[email protected]>
>
> NAK for the time being. "perf kmem" output is not yet as good as that of
> kmemtrace-user.
>
But is the current kmemtrace-user based on kmemtrace?
From the git repo:
http://repo.or.cz/w/kmemtrace-user.git
I found it's still based on relay.
On Fri, Nov 20, 2009 at 10:24 AM, Li Zefan <[email protected]> wrote:
> 于 2009年11月20日 16:20, Pekka Enberg 写道:
>> Li Zefan kirjoitti:
>>> The kmem trace events can replace the functions of kmemtrace
>>> tracer.
>>>
>>> And kmemtrace-user can be modified to use trace events.
>>> (But after cloning the git repo, I found it's still based on
>>> the original relay version..), not to mention now we have
>>> 'perf kmem' tool.
>>>
>>> Signed-off-by: Li Zefan <[email protected]>
>>
>> NAK for the time being. "perf kmem" output is not yet as good as that of
>> kmemtrace-user.
>>
>
> But is the current kmemtrace-user based on kmemtrace?
>
> From the git repo:
> http://repo.or.cz/w/kmemtrace-user.git
>
> I found it's still based on relay.
The "ftrace-temp" branch seems to have the ftrace based version in it. Eduard?
Pekka
* Pekka Enberg <[email protected]> wrote:
> On Fri, Nov 20, 2009 at 10:14 AM, Ingo Molnar <[email protected]> wrote:
> > Pekka, Eduard and the other slab hackers might have ideas about what
> > other stats they generally like to see to judge the health of a workload
> > (or system).
>
> kmalloc()/kfree() CPU ping-pong call-sites (i.e. alloc and free
> happening on different CPUs) is one interesting metric we haven't
> implemented yet. Valgrind massif tool type of output graph would be
> helpful as well:
>
> http://valgrind.org/docs/manual/ms-manual.html
>
> On Fri, Nov 20, 2009 at 10:14 AM, Ingo Molnar <[email protected]> wrote:
> > If this iteration looks good to the slab folks then i can apply it as-is
> > and we can do the other changes relative to that. It looks good to me as
> > a first step, and it's functional already.
>
> Yeah, looks OK to me as the first step. Patch 2 looks premature,
> though, looking at the output of "perf kmem" from patch 1.
>
> Acked-by: Pekka Enberg <[email protected]>
Great - thanks for the quick ack!
Regarding patch 2 - can we set some definitive benchmark threshold for
that? I.e. a list of must-have features in 'perf kmem' before we can do
it? 100% information and analysis equivalency with kmemtrace-user tool?
Eduard, what do you think?
Ingo
Pekka Enberg wrote:
> On Fri, Nov 20, 2009 at 10:24 AM, Li Zefan <[email protected]> wrote:
>> 于 2009年11月20日 16:20, Pekka Enberg 写道:
>>> Li Zefan kirjoitti:
>>>> The kmem trace events can replace the functions of kmemtrace
>>>> tracer.
>>>>
>>>> And kmemtrace-user can be modified to use trace events.
>>>> (But after cloning the git repo, I found it's still based on
>>>> the original relay version..), not to mention now we have
>>>> 'perf kmem' tool.
>>>>
>>>> Signed-off-by: Li Zefan <[email protected]>
>>> NAK for the time being. "perf kmem" output is not yet as good as that of
>>> kmemtrace-user.
>>>
>> But is the current kmemtrace-user based on kmemtrace?
>>
>> From the git repo:
>> http://repo.or.cz/w/kmemtrace-user.git
>>
>> I found it's still based on relay.
>
> The "ftrace-temp" branch seems to have the ftrace based version in it. Eduard?
>
Thanks. I just overlooked the branch..
Ingo Molnar kirjoitti:
> Regarding patch 2 - can we set some definitive benchmark threshold for
> that? I.e. a list of must-have features in 'perf kmem' before we can do
> it? 100% information and analysis equivalency with kmemtrace-user tool?
I'd be interested to hear Eduard's comment on that.
That said, I'll try to find some time to test "perf kmem" and provide
feedback on that. I can ACK the patch when I'm happy with the output. :-)
I'm mostly interested in two scenarios: (1) getting a nice report on
worst fragmented call-sites (perf kmem needs symbol lookup) and (2)
doing "perf kmem record" on machine A (think embedded here) and then
"perf kmem report" on machine B. I haven't tried kmemtrace-user for a
while but it did support both of them quite nicely at some point.
Pekka
Pekka Enberg wrote:
> Ingo Molnar kirjoitti:
>> Regarding patch 2 - can we set some definitive benchmark threshold for
>> that? I.e. a list of must-have features in 'perf kmem' before we can
>> do it? 100% information and analysis equivalency with kmemtrace-user
>> tool?
>
> I'd be interested to hear Eduard's comment on that.
>
> That said, I'll try to find some time to test "perf kmem" and provide
> feedback on that. I can ACK the patch when I'm happy with the output. :-)
>
> I'm mostly interested in two scenarios: (1) getting a nice report on
> worst fragmented call-sites (perf kmem needs symbol lookup) and
This will be done in next version.
> (2) doing "perf kmem record" on machine A (think embedded here) and then
> "perf kmem report" on machine B. I haven't tried kmemtrace-user for a
> while but it did support both of them quite nicely at some point.
>
Everything needed and machine-specific will be recorded in perf.data,
so this should already be supported. I'll try it.
Commit-ID: ba77c9e11111a172c9e8687fe16a6a173a61916f
Gitweb: http://git.kernel.org/tip/ba77c9e11111a172c9e8687fe16a6a173a61916f
Author: Li Zefan <[email protected]>
AuthorDate: Fri, 20 Nov 2009 15:53:25 +0800
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 20 Nov 2009 09:51:41 +0100
perf: Add 'perf kmem' tool
This tool is mostly a perf version of kmemtrace-user.
The following information is provided by this tool:
- the total amount of memory allocated and fragmentation per
call-site
- the total amount of memory allocated and fragmentation per
allocation
- total memory allocated and fragmentation in the collected
dataset - ...
Sample output:
# ./perf kmem record
^C
# ./perf kmem --stat caller --stat alloc -l 10
------------------------------------------------------------------------------
Callsite | Total_alloc/Per | Total_req/Per | Hit | Fragmentation
------------------------------------------------------------------------------
0xc052f37a | 790528/4096 | 790528/4096 | 193 | 0.000%
0xc0541d70 | 524288/4096 | 524288/4096 | 128 | 0.000%
0xc051cc68 | 481600/200 | 481600/200 | 2408 | 0.000%
0xc0572623 | 297444/676 | 297440/676 | 440 | 0.001%
0xc05399f1 | 73476/164 | 73472/164 | 448 | 0.005%
0xc05243bf | 51456/256 | 51456/256 | 201 | 0.000%
0xc0730d0e | 31844/497 | 31808/497 | 64 | 0.113%
0xc0734c4e | 17152/256 | 17152/256 | 67 | 0.000%
0xc0541a6d | 16384/128 | 16384/128 | 128 | 0.000%
0xc059c217 | 13120/40 | 13120/40 | 328 | 0.000%
0xc0501ee6 | 11264/88 | 11264/88 | 128 | 0.000%
0xc04daef0 | 7504/682 | 7128/648 | 11 | 5.011%
0xc04e14a3 | 4216/191 | 4216/191 | 22 | 0.000%
0xc05041ca | 3524/44 | 3520/44 | 80 | 0.114%
0xc0734fa3 | 2104/701 | 1620/540 | 3 | 23.004%
0xc05ec9f1 | 2024/289 | 2016/288 | 7 | 0.395%
0xc06a1999 | 1792/256 | 1792/256 | 7 | 0.000%
0xc0463b9a | 1584/144 | 1584/144 | 11 | 0.000%
0xc0541eb0 | 1024/16 | 1024/16 | 64 | 0.000%
0xc06a19ac | 896/128 | 896/128 | 7 | 0.000%
0xc05721c0 | 772/12 | 768/12 | 64 | 0.518%
0xc054d1e6 | 288/57 | 280/56 | 5 | 2.778%
0xc04b562e | 157/31 | 154/30 | 5 | 1.911%
0xc04b536f | 80/16 | 80/16 | 5 | 0.000%
0xc05855a0 | 64/64 | 36/36 | 1 | 43.750%
------------------------------------------------------------------------------
------------------------------------------------------------------------------
Alloc Ptr | Total_alloc/Per | Total_req/Per | Hit | Fragmentation
------------------------------------------------------------------------------
0xda884000 | 1052672/4096 | 1052672/4096 | 257 | 0.000%
0xda886000 | 262144/4096 | 262144/4096 | 64 | 0.000%
0xf60c7c00 | 16512/128 | 16512/128 | 129 | 0.000%
0xf59a4118 | 13120/40 | 13120/40 | 328 | 0.000%
0xdfd4b2c0 | 11264/88 | 11264/88 | 128 | 0.000%
0xf5274600 | 7680/256 | 7680/256 | 30 | 0.000%
0xe8395000 | 5948/594 | 5464/546 | 10 | 8.137%
0xe59c3c00 | 5748/479 | 5712/476 | 12 | 0.626%
0xf4cd1a80 | 3524/44 | 3520/44 | 80 | 0.114%
0xe5bd1600 | 2892/482 | 2856/476 | 6 | 1.245%
... | ... | ... | ... | ...
------------------------------------------------------------------------------
SUMMARY
=======
Total bytes requested: 2333626
Total bytes allocated: 2353712
Total bytes wasted on internal fragmentation: 20086
Internal fragmentation: 0.853375%
TODO:
- show sym+offset in 'callsite' column
- show cross node allocation stats
- collect more useful stats?
- ...
Signed-off-by: Li Zefan <[email protected]>
Acked-by: Pekka Enberg <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Eduard - Gabriel Munteanu <[email protected]>
Cc: [email protected] <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
tools/perf/Makefile | 1 +
tools/perf/builtin-kmem.c | 578 +++++++++++++++++++++++++++++++++++++++++++++
tools/perf/builtin.h | 1 +
tools/perf/perf.c | 27 +-
4 files changed, 594 insertions(+), 13 deletions(-)
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 3f0666a..d7198c5 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -444,6 +444,7 @@ BUILTIN_OBJS += builtin-timechart.o
BUILTIN_OBJS += builtin-top.o
BUILTIN_OBJS += builtin-trace.o
BUILTIN_OBJS += builtin-probe.o
+BUILTIN_OBJS += builtin-kmem.o
PERFLIBS = $(LIB_FILE)
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
new file mode 100644
index 0000000..f315b05
--- /dev/null
+++ b/tools/perf/builtin-kmem.c
@@ -0,0 +1,578 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+
+#include "util/debug.h"
+#include "util/data_map.h"
+
+#include <linux/rbtree.h>
+
struct alloc_stat;
typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);

/* Input perf data file; overridden by -i/--input. */
static char const *input_name = "perf.data";

static struct perf_header *header;
static u64 sample_type;		/* PERF_SAMPLE_* bits of the recorded data */

/*
 * Non-zero when the corresponding --stat table was requested.  The values
 * encode ordering: parse_stat_opt() sets each flag to <other flag> + 1, so
 * "caller_flag > alloc_flag" tells later -s/-l options which table the most
 * recent --stat named.
 */
static int alloc_flag;
static int caller_flag;

sort_fn_t alloc_sort_fn;
sort_fn_t caller_sort_fn;

/* Max lines printed per table; -1 means unlimited. */
static int alloc_lines = -1;
static int caller_lines = -1;

static char *cwd;
static int cwdlen;

/*
 * One node per unique allocation pointer (alloc table) or call site
 * (caller table).  NOTE: 'ptr' overlays 'name'/'call_site' through the
 * union — a given node is keyed by exactly one of them, never both.
 */
struct alloc_stat {
	union {
		struct {
			char *name;
			u64 call_site;
		};
		u64 ptr;
	};
	u64 bytes_req;		/* sum of requested sizes */
	u64 bytes_alloc;	/* sum of actually allocated sizes */
	u32 hit;		/* number of allocation events folded in */

	struct rb_node node;
};

/* Accumulation trees (keyed by ptr / call_site) and their sorted copies. */
static struct rb_root root_alloc_stat;
static struct rb_root root_alloc_sorted;
static struct rb_root root_caller_stat;
static struct rb_root root_caller_sorted;

static unsigned long total_requested, total_allocated;

/* Layout of a PERF_SAMPLE_RAW payload: size word followed by raw record. */
struct raw_event_sample {
	u32 size;
	char data[0];
};
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+ struct thread *thread = threads__findnew(event->comm.pid);
+
+ dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
+ (void *)(offset + head),
+ (void *)(long)(event->header.size),
+ event->comm.comm, event->comm.pid);
+
+ if (thread == NULL ||
+ thread__set_comm(thread, event->comm.comm)) {
+ dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void insert_alloc_stat(unsigned long ptr,
+ int bytes_req, int bytes_alloc)
+{
+ struct rb_node **node = &root_alloc_stat.rb_node;
+ struct rb_node *parent = NULL;
+ struct alloc_stat *data = NULL;
+
+ if (!alloc_flag)
+ return;
+
+ while (*node) {
+ parent = *node;
+ data = rb_entry(*node, struct alloc_stat, node);
+
+ if (ptr > data->ptr)
+ node = &(*node)->rb_right;
+ else if (ptr < data->ptr)
+ node = &(*node)->rb_left;
+ else
+ break;
+ }
+
+ if (data && data->ptr == ptr) {
+ data->hit++;
+ data->bytes_req += bytes_req;
+ data->bytes_alloc += bytes_req;
+ } else {
+ data = malloc(sizeof(*data));
+ data->ptr = ptr;
+ data->hit = 1;
+ data->bytes_req = bytes_req;
+ data->bytes_alloc = bytes_alloc;
+
+ rb_link_node(&data->node, parent, node);
+ rb_insert_color(&data->node, &root_alloc_stat);
+ }
+}
+
+static void insert_caller_stat(unsigned long call_site,
+ int bytes_req, int bytes_alloc)
+{
+ struct rb_node **node = &root_caller_stat.rb_node;
+ struct rb_node *parent = NULL;
+ struct alloc_stat *data = NULL;
+
+ if (!caller_flag)
+ return;
+
+ while (*node) {
+ parent = *node;
+ data = rb_entry(*node, struct alloc_stat, node);
+
+ if (call_site > data->call_site)
+ node = &(*node)->rb_right;
+ else if (call_site < data->call_site)
+ node = &(*node)->rb_left;
+ else
+ break;
+ }
+
+ if (data && data->call_site == call_site) {
+ data->hit++;
+ data->bytes_req += bytes_req;
+ data->bytes_alloc += bytes_req;
+ } else {
+ data = malloc(sizeof(*data));
+ data->call_site = call_site;
+ data->hit = 1;
+ data->bytes_req = bytes_req;
+ data->bytes_alloc = bytes_alloc;
+
+ rb_link_node(&data->node, parent, node);
+ rb_insert_color(&data->node, &root_caller_stat);
+ }
+}
+
+static void process_alloc_event(struct raw_event_sample *raw,
+ struct event *event,
+ int cpu __used,
+ u64 timestamp __used,
+ struct thread *thread __used,
+ int node __used)
+{
+ unsigned long call_site;
+ unsigned long ptr;
+ int bytes_req;
+ int bytes_alloc;
+
+ ptr = raw_field_value(event, "ptr", raw->data);
+ call_site = raw_field_value(event, "call_site", raw->data);
+ bytes_req = raw_field_value(event, "bytes_req", raw->data);
+ bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data);
+
+ insert_alloc_stat(ptr, bytes_req, bytes_alloc);
+ insert_caller_stat(call_site, bytes_req, bytes_alloc);
+
+ total_requested += bytes_req;
+ total_allocated += bytes_alloc;
+}
+
/*
 * Free events (kfree, kmem_cache_free) are recorded but not yet analyzed;
 * this stub exists so process_raw_event() has something to dispatch them
 * to.  Tracking frees here would enable e.g. lifetime/leak statistics
 * later on.
 */
static void process_free_event(struct raw_event_sample *raw __used,
			       struct event *event __used,
			       int cpu __used,
			       u64 timestamp __used,
			       struct thread *thread __used)
{
}
+
+static void
+process_raw_event(event_t *raw_event __used, void *more_data,
+ int cpu, u64 timestamp, struct thread *thread)
+{
+ struct raw_event_sample *raw = more_data;
+ struct event *event;
+ int type;
+
+ type = trace_parse_common_type(raw->data);
+ event = trace_find_event(type);
+
+ if (!strcmp(event->name, "kmalloc") ||
+ !strcmp(event->name, "kmem_cache_alloc")) {
+ process_alloc_event(raw, event, cpu, timestamp, thread, 0);
+ return;
+ }
+
+ if (!strcmp(event->name, "kmalloc_node") ||
+ !strcmp(event->name, "kmem_cache_alloc_node")) {
+ process_alloc_event(raw, event, cpu, timestamp, thread, 1);
+ return;
+ }
+
+ if (!strcmp(event->name, "kfree") ||
+ !strcmp(event->name, "kmem_cache_free")) {
+ process_free_event(raw, event, cpu, timestamp, thread);
+ return;
+ }
+}
+
+static int
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+{
+ u64 ip = event->ip.ip;
+ u64 timestamp = -1;
+ u32 cpu = -1;
+ u64 period = 1;
+ void *more_data = event->ip.__more_data;
+ struct thread *thread = threads__findnew(event->ip.pid);
+
+ if (sample_type & PERF_SAMPLE_TIME) {
+ timestamp = *(u64 *)more_data;
+ more_data += sizeof(u64);
+ }
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ cpu = *(u32 *)more_data;
+ more_data += sizeof(u32);
+ more_data += sizeof(u32); /* reserved */
+ }
+
+ if (sample_type & PERF_SAMPLE_PERIOD) {
+ period = *(u64 *)more_data;
+ more_data += sizeof(u64);
+ }
+
+ dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+ (void *)(offset + head),
+ (void *)(long)(event->header.size),
+ event->header.misc,
+ event->ip.pid, event->ip.tid,
+ (void *)(long)ip,
+ (long long)period);
+
+ if (thread == NULL) {
+ pr_debug("problem processing %d event, skipping it.\n",
+ event->header.type);
+ return -1;
+ }
+
+ dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+ process_raw_event(event, more_data, cpu, timestamp, thread);
+
+ return 0;
+}
+
+static int sample_type_check(u64 type)
+{
+ sample_type = type;
+
+ if (!(sample_type & PERF_SAMPLE_RAW)) {
+ fprintf(stderr,
+ "No trace sample to read. Did you call perf record "
+ "without -R?");
+ return -1;
+ }
+
+ return 0;
+}
+
/*
 * Callbacks for the generic perf.data dispatcher; event types with no
 * handler listed here are skipped by the dispatcher.
 */
static struct perf_file_handler file_handler = {
	.process_sample_event = process_sample_event,
	.process_comm_event = process_comm_event,
	.sample_type_check = sample_type_check,
};
+
/*
 * Register our handlers and walk every event in the input file.
 * Returns mmap_dispatch_perf_file()'s result (0 on success).
 */
static int read_events(void)
{
	register_idle_thread();
	register_perf_file_handler(&file_handler);

	return mmap_dispatch_perf_file(&header, input_name, 0, 0,
				&cwdlen, &cwd);
}
+
/*
 * Internal fragmentation as a percentage: the share of allocated bytes
 * that was not actually requested.  0.0 when nothing was allocated.
 */
static double fragmentation(unsigned long n_req, unsigned long n_alloc)
{
	return n_alloc ? 100.0 - (100.0 * n_req / n_alloc) : 0.0;
}
+
+static void __print_result(struct rb_root *root, int n_lines, int is_caller)
+{
+ struct rb_node *next;
+
+ printf("\n ------------------------------------------------------------------------------\n");
+ if (is_caller)
+ printf(" Callsite |");
+ else
+ printf(" Alloc Ptr |");
+ printf(" Total_alloc/Per | Total_req/Per | Hit | Fragmentation\n");
+ printf(" ------------------------------------------------------------------------------\n");
+
+ next = rb_first(root);
+
+ while (next && n_lines--) {
+ struct alloc_stat *data;
+
+ data = rb_entry(next, struct alloc_stat, node);
+
+ printf(" %-16p | %8llu/%-6lu | %8llu/%-6lu | %6lu | %8.3f%%\n",
+ is_caller ? (void *)(unsigned long)data->call_site :
+ (void *)(unsigned long)data->ptr,
+ (unsigned long long)data->bytes_alloc,
+ (unsigned long)data->bytes_alloc / data->hit,
+ (unsigned long long)data->bytes_req,
+ (unsigned long)data->bytes_req / data->hit,
+ (unsigned long)data->hit,
+ fragmentation(data->bytes_req, data->bytes_alloc));
+
+ next = rb_next(next);
+ }
+
+ if (n_lines == -1)
+ printf(" ... | ... | ... | ... | ... \n");
+
+ printf(" ------------------------------------------------------------------------------\n");
+}
+
+static void print_summary(void)
+{
+ printf("\nSUMMARY\n=======\n");
+ printf("Total bytes requested: %lu\n", total_requested);
+ printf("Total bytes allocated: %lu\n", total_allocated);
+ printf("Total bytes wasted on internal fragmentation: %lu\n",
+ total_allocated - total_requested);
+ printf("Internal fragmentation: %f%%\n",
+ fragmentation(total_requested, total_allocated));
+}
+
/*
 * Emit the requested tables (caller table first, matching the sample
 * output in the changelog) followed by the summary block.
 */
static void print_result(void)
{
	if (caller_flag)
		__print_result(&root_caller_sorted, caller_lines, 1);
	if (alloc_flag)
		__print_result(&root_alloc_sorted, alloc_lines, 0);
	print_summary();
}
+
+static void sort_insert(struct rb_root *root, struct alloc_stat *data,
+ sort_fn_t sort_fn)
+{
+ struct rb_node **new = &(root->rb_node);
+ struct rb_node *parent = NULL;
+
+ while (*new) {
+ struct alloc_stat *this;
+ int cmp;
+
+ this = rb_entry(*new, struct alloc_stat, node);
+ parent = *new;
+
+ cmp = sort_fn(data, this);
+
+ if (cmp > 0)
+ new = &((*new)->rb_left);
+ else
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+}
+
+static void __sort_result(struct rb_root *root, struct rb_root *root_sorted,
+ sort_fn_t sort_fn)
+{
+ struct rb_node *node;
+ struct alloc_stat *data;
+
+ for (;;) {
+ node = rb_first(root);
+ if (!node)
+ break;
+
+ rb_erase(node, root);
+ data = rb_entry(node, struct alloc_stat, node);
+ sort_insert(root_sorted, data, sort_fn);
+ }
+}
+
/* Move both accumulation trees into their display-order counterparts. */
static void sort_result(void)
{
	__sort_result(&root_alloc_stat, &root_alloc_sorted, alloc_sort_fn);
	__sort_result(&root_caller_stat, &root_caller_sorted, caller_sort_fn);
}
+
/*
 * Main analysis path: read the data file, sort, print.
 * Now propagates read_events() failures instead of silently printing an
 * empty/garbage report when the input could not be processed.
 */
static int __cmd_kmem(void)
{
	int err;

	setup_pager();

	err = read_events();
	if (err)
		return err;

	sort_result();
	print_result();

	return 0;
}
+
/* Usage string shown by parse_options() / usage_with_options(). */
static const char * const kmem_usage[] = {
	"perf kmem [<options>] {record}",
	NULL
};
+
+
+static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+ if (l->ptr < r->ptr)
+ return -1;
+ else if (l->ptr > r->ptr)
+ return 1;
+ return 0;
+}
+
+static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+ if (l->call_site < r->call_site)
+ return -1;
+ else if (l->call_site > r->call_site)
+ return 1;
+ return 0;
+}
+
+static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+ if (l->bytes_alloc < r->bytes_alloc)
+ return -1;
+ else if (l->bytes_alloc > r->bytes_alloc)
+ return 1;
+ return 0;
+}
+
+static int parse_sort_opt(const struct option *opt __used,
+ const char *arg, int unset __used)
+{
+ sort_fn_t sort_fn;
+
+ if (!arg)
+ return -1;
+
+ if (strcmp(arg, "ptr") == 0)
+ sort_fn = ptr_cmp;
+ else if (strcmp(arg, "call_site") == 0)
+ sort_fn = callsite_cmp;
+ else if (strcmp(arg, "bytes") == 0)
+ sort_fn = bytes_cmp;
+ else
+ return -1;
+
+ if (caller_flag > alloc_flag)
+ caller_sort_fn = sort_fn;
+ else
+ alloc_sort_fn = sort_fn;
+
+ return 0;
+}
+
+static int parse_stat_opt(const struct option *opt __used,
+ const char *arg, int unset __used)
+{
+ if (!arg)
+ return -1;
+
+ if (strcmp(arg, "alloc") == 0)
+ alloc_flag = (caller_flag + 1);
+ else if (strcmp(arg, "caller") == 0)
+ caller_flag = (alloc_flag + 1);
+ else
+ return -1;
+ return 0;
+}
+
+static int parse_line_opt(const struct option *opt __used,
+ const char *arg, int unset __used)
+{
+ int lines;
+
+ if (!arg)
+ return -1;
+
+ lines = strtoul(arg, NULL, 10);
+
+ if (caller_flag > alloc_flag)
+ caller_lines = lines;
+ else
+ alloc_lines = lines;
+
+ return 0;
+}
+
+static const struct option kmem_options[] = {
+ OPT_STRING('i', "input", &input_name, "file",
+ "input file name"),
+ OPT_CALLBACK(0, "stat", NULL, "<alloc>|<caller>",
+ "stat selector, Pass 'alloc' or 'caller'.",
+ parse_stat_opt),
+ OPT_CALLBACK('s', "sort", NULL, "key",
+ "sort by key: ptr, call_site, hit, bytes",
+ parse_sort_opt),
+ OPT_CALLBACK('l', "line", NULL, "num",
+ "show n lins",
+ parse_line_opt),
+ OPT_END()
+};
+
/*
 * Canned argv for 'perf kmem record': system-wide (-a), raw samples (-R),
 * multiplexed output (-M), overwrite (-f), one event per sample (-c 1),
 * plus the six kmem tracepoints this tool knows how to analyze.
 */
static const char *record_args[] = {
	"record",
	"-a",
	"-R",
	"-M",
	"-f",
	"-c", "1",
	"-e", "kmem:kmalloc",
	"-e", "kmem:kmalloc_node",
	"-e", "kmem:kfree",
	"-e", "kmem:kmem_cache_alloc",
	"-e", "kmem:kmem_cache_alloc_node",
	"-e", "kmem:kmem_cache_free",
};
+
+static int __cmd_record(int argc, const char **argv)
+{
+ unsigned int rec_argc, i, j;
+ const char **rec_argv;
+
+ rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+ rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+ for (i = 0; i < ARRAY_SIZE(record_args); i++)
+ rec_argv[i] = strdup(record_args[i]);
+
+ for (j = 1; j < (unsigned int)argc; j++, i++)
+ rec_argv[i] = argv[j];
+
+ return cmd_record(i, rec_argv, NULL);
+}
+
+int cmd_kmem(int argc, const char **argv, const char *prefix __used)
+{
+ symbol__init(0);
+
+ argc = parse_options(argc, argv, kmem_options, kmem_usage, 0);
+
+ if (argc && !strncmp(argv[0], "rec", 3))
+ return __cmd_record(argc, argv);
+ else if (argc)
+ usage_with_options(kmem_usage, kmem_options);
+
+ if (!alloc_sort_fn)
+ alloc_sort_fn = bytes_cmp;
+ if (!caller_sort_fn)
+ caller_sort_fn = bytes_cmp;
+
+ return __cmd_kmem();
+}
+
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 9b02d85..a3d8bf6 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -28,5 +28,6 @@ extern int cmd_top(int argc, const char **argv, const char *prefix);
extern int cmd_trace(int argc, const char **argv, const char *prefix);
extern int cmd_version(int argc, const char **argv, const char *prefix);
extern int cmd_probe(int argc, const char **argv, const char *prefix);
+extern int cmd_kmem(int argc, const char **argv, const char *prefix);
#endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 89b82ac..cf64049 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -285,20 +285,21 @@ static void handle_internal_command(int argc, const char **argv)
{
const char *cmd = argv[0];
static struct cmd_struct commands[] = {
- { "help", cmd_help, 0 },
- { "list", cmd_list, 0 },
{ "buildid-list", cmd_buildid_list, 0 },
- { "record", cmd_record, 0 },
- { "report", cmd_report, 0 },
- { "bench", cmd_bench, 0 },
- { "stat", cmd_stat, 0 },
- { "timechart", cmd_timechart, 0 },
- { "top", cmd_top, 0 },
- { "annotate", cmd_annotate, 0 },
- { "version", cmd_version, 0 },
- { "trace", cmd_trace, 0 },
- { "sched", cmd_sched, 0 },
- { "probe", cmd_probe, 0 },
+ { "help", cmd_help, 0 },
+ { "list", cmd_list, 0 },
+ { "record", cmd_record, 0 },
+ { "report", cmd_report, 0 },
+ { "bench", cmd_bench, 0 },
+ { "stat", cmd_stat, 0 },
+ { "timechart", cmd_timechart, 0 },
+ { "top", cmd_top, 0 },
+ { "annotate", cmd_annotate, 0 },
+ { "version", cmd_version, 0 },
+ { "trace", cmd_trace, 0 },
+ { "sched", cmd_sched, 0 },
+ { "probe", cmd_probe, 0 },
+ { "kmem", cmd_kmem, 0 },
};
unsigned int i;
static const char ext[] = STRIP_EXTENSION;
* Li Zefan <[email protected]> wrote:
> ---
> tools/perf/Makefile | 1 +
> tools/perf/builtin-kmem.c | 578 +++++++++++++++++++++++++++++++++++++++++++++
> tools/perf/builtin.h | 1 +
> tools/perf/perf.c | 27 +-
> 4 files changed, 594 insertions(+), 13 deletions(-)
> create mode 100644 tools/perf/builtin-kmem.c
btw., you might want to add it to command-list.txt as well (in a future
patch), so that 'kmem' shows up in the default 'perf' output.
Plus a Documentation/perf-kmem.txt file will make sure that 'perf help
kmem' and 'perf kmem --help' displays a help page, etc.
Thanks,
Ingo
* Pekka Enberg <[email protected]> wrote:
> Ingo Molnar kirjoitti:
> >Regarding patch 2 - can we set some definitive benchmark threshold
> >for that? I.e. a list of must-have features in 'perf kmem' before
> >we can do it? 100% information and analysis equivalency with
> >kmemtrace-user tool?
>
> I'd be interested to hear Eduard's comment on that.
>
> That said, I'll try to find some time to test "perf kmem" and
> provide feedback on that. I can ACK the patch when I'm happy with
> the output. :-)
>
> I'm mostly interested in two scenarios: (1) getting a nice report on
> worst fragmented call-sites (perf kmem needs symbol lookup) and (2)
> doing "perf kmem record" on machine A (think embedded here) and then
> "perf kmem report" on machine B. I haven't tried kmemtrace-user for
> a while but it did support both of them quite nicely at some point.
The perf.data can be copied over and to get off-side kernel symbol
resolution you can specify the kernel vmlinux via -k/--vmlinux to perf
report, then perf will look up the symbols from that vmlinux.
Cross word-size data files should work fine - cross-endian probably
needs a few fixes.
Plus off-site user-space symbols need more work; right now we don't
embed them in the perf.data. It would need a symbol lookup + embed-it
pass in perf record (perhaps available as a separate 'perf archive'
command as well), and some smarts on the reporting side to make use of
them. (Probably a copy of all relevant DSOs is what works best - that
enables off-site annotate as well.)
But ... even without that, perf is really fast and is supposed to build
fine even in minimal (embedded) environments, so you can run it on the
embedded board too. That's useful to get live inspection features like
'perf top', 'perf stat' and 'perf probe' anyway.
Ingo
* Li Zefan <[email protected]> wrote:
> > (2) doing "perf kmem record" on machine A (think embedded here) and
> > then "perf kmem report" on machine B. I haven't tried kmemtrace-user
> > for a while but it did support both of them quite nicely at some
> > point.
>
> Everything needed and machine-specific will be recorded in perf.data,
> so this should already been supported. I'll try it.
Right now the DSOs are not recorded in the perf.data - but it would be
useful to add it and to turn perf.data into a self-sufficient capture of
all relevant data, which can be analyzed on any box.
Ingo
Ingo Molnar wrote:
> * Li Zefan <[email protected]> wrote:
>
>> ---
>> tools/perf/Makefile | 1 +
>> tools/perf/builtin-kmem.c | 578 +++++++++++++++++++++++++++++++++++++++++++++
>> tools/perf/builtin.h | 1 +
>> tools/perf/perf.c | 27 +-
>> 4 files changed, 594 insertions(+), 13 deletions(-)
>> create mode 100644 tools/perf/builtin-kmem.c
>
> btw., you might want to add it to command-list.txt as well (in a future
> patch), so that 'kmem' shows up in the default 'perf' output.
>
> Plus a Documentation/perf-kmem.txt file will make sure that 'perf help
> kmem' and 'perf kmem --help' displays a help page, etc.
>
I planned to do these after collecting comments and getting positive
responses. So sure, I'll post further patches.
Ingo Molnar wrote:
> * Li Zefan <[email protected]> wrote:
>
>>> (2) doing "perf kmem record" on machine A (think embedded here) and
>>> then "perf kmem report" on machine B. I haven't tried kmemtrace-user
>>> for a while but it did support both of them quite nicely at some
>>> point.
>> Everything needed and machine-specific will be recorded in perf.data,
>> so this should already been supported. I'll try it.
>
> Right now the DSOs are not recorded in the perf.data - but it would be
> useful to add it and to turn perf.data into a self-sufficient capture of
> all relevant data, which can be analyzed on any box.
>
But still 'perf kmem' should function better than kmemtrace-user,
since the latter records no more than raw trace data.
Hi Ingo,
On Fri, Nov 20, 2009 at 11:01 AM, Ingo Molnar <[email protected]> wrote:
> But ... even without that, perf is really fast and is supposed to build
> fine even in minimal (embedded) environments, so you can run it on the
> embedded board too. That's useful to get live inspection features like
> 'perf top', 'perf stat' and 'perf probe' anyway.
Maybe I'm just too damn lazy but if I don't go through the trouble of
building my kernel on the box, I sure don't want to do that for perf
either. Anyway, I'm sure we can fix "perf kmem" to support what
kmemtrace-user does so it's not an issue.
Pekka
* Pekka Enberg <[email protected]> wrote:
> Hi Ingo,
>
> On Fri, Nov 20, 2009 at 11:01 AM, Ingo Molnar <[email protected]> wrote:
> > But ... even without that, perf is really fast and is supposed to build
> > fine even in minimal (embedded) environments, so you can run it on the
> > embedded board too. That's useful to get live inspection features like
> > 'perf top', 'perf stat' and 'perf probe' anyway.
>
> Maybe I'm just too damn lazy but if I don't go through the trouble of
> building my kernel on the box, I sure don't want to do that for perf
> either. [...]
Well you'll need 'perf' on that box anyway, to be able to do 'perf kmem
record'.
Ingo
Ingo Molnar kirjoitti:
> * Pekka Enberg <[email protected]> wrote:
>
>> Hi Ingo,
>>
>> On Fri, Nov 20, 2009 at 11:01 AM, Ingo Molnar <[email protected]> wrote:
>>> But ... even without that, perf is really fast and is supposed to build
>>> fine even in minimal (embedded) environments, so you can run it on the
>>> embedded board too. That's useful to get live inspection features like
>>> 'perf top', 'perf stat' and 'perf probe' anyway.
>> Maybe I'm just too damn lazy but if I don't go through the trouble of
>> building my kernel on the box, I sure don't want to do that for perf
>> either. [...]
>
> Well you'll need 'perf' on that box anyway, to be able to do 'perf kmem
> record'.
/me turns brains on
You're right, of course. With kmemtrace-user, I just copied the raw
trace file from /sys/kernel. I wonder if that's a good enough reason to
keep kmemtrace bits around?
Pekka
* Pekka Enberg <[email protected]> wrote:
> Ingo Molnar kirjoitti:
> >* Pekka Enberg <[email protected]> wrote:
> >
> >>Hi Ingo,
> >>
> >>On Fri, Nov 20, 2009 at 11:01 AM, Ingo Molnar <[email protected]> wrote:
> >>>But ... even without that, perf is really fast and is supposed to build
> >>>fine even in minimal (embedded) environments, so you can run it on the
> >>>embedded board too. That's useful to get live inspection features like
> >>>'perf top', 'perf stat' and 'perf probe' anyway.
> >>Maybe I'm just too damn lazy but if I don't go through the trouble of
> >>building my kernel on the box, I sure don't want to do that for perf
> >>either. [...]
> >
> >Well you'll need 'perf' on that box anyway, to be able to do 'perf
> >kmem record'.
>
> /me turns brains on
>
> You're right, of course. With kmemtrace-user, I just copied the raw
> trace file from /sys/kernel. I wonder if that's a good enough reason
> to keep kmemtrace bits around?
Not really. If then a light-weight recording app could be made but i'd
rather wait for actual usecases to pop up.
Ingo
Em Fri, Nov 20, 2009 at 10:03:53AM +0100, Ingo Molnar escreveu:
>
> * Li Zefan <[email protected]> wrote:
>
> > > (2) doing "perf kmem record" on machine A (think embedded here) and
> > > then "perf kmem report" on machine B. I haven't tried kmemtrace-user
> > > for a while but it did support both of them quite nicely at some
> > > point.
> >
> > Everything needed and machine-specific will be recorded in perf.data,
> > so this should already been supported. I'll try it.
>
> Right now the DSOs are not recorded in the perf.data - but it would be
> useful to add it and to turn perf.data into a self-sufficient capture of
> all relevant data, which can be analyzed on any box.
Well, the DSOs are recorded in perf.data, just not its symtabs, but now
we have buildids, so we can ask for them to be installed on the other
machine and it'll all work. Or should. :)
For instance:
[root@doppio linux-2.6-tip]# perf buildid-list -i perf.data | egrep 'vmlinux|nfs|libc-'
ec8dd400904ddfcac8b1c343263a790f977159dc /lib64/libc-2.10.1.so
0da49504693a200583fda6f1b949e6d2f799e692 /usr/lib64/libnfsidmap_nsswitch.so.0.0.0
c90269c87eaf08559012a9fa29f60780743360cd /usr/lib64/libnfsidmap.so.0.3.0
18e7cc53db62a7d35e9d6f6c9ddc23017d38ee9a vmlinux
3982866276471cde6ac5821fdced42a7b1bfd848 [nfs]
1489007276a50005753e730198fd93dd05b2082f [nfsd]
5a128f082fe7fdcab6fb5d1b71935accb1f34383 [nfs_acl]
[root@doppio linux-2.6-tip]#
Now if I ask that the buildid for /usr/lib64/libnfsidmap.so.0.3.0 above
to be installed, like this:
[root@doppio linux-2.6-tip]# yum install /usr/lib/debug/.build-id/c9/0269c87eaf08559012a9fa29f60780743360cd
Loaded plugins: auto-update-debuginfo, refresh-packagekit
Found 44 installed debuginfo package(s)
Enabling fedora-debuginfo: Fedora 11 - x86_64 - Debug
Reading repository metadata in from local files
Enabling updates-debuginfo: Fedora 11 - x86_64 - Updates - Debug
Reading repository metadata in from local files
Setting up Install Process
Importing additional filelist information
Resolving Dependencies
--> Running transaction check
---> Package nfs-utils-lib-debuginfo.x86_64 0:1.1.4-6.fc11 set to be updated
--> Finished Dependency Resolution
Dependencies Resolved
========================================================================
Package Arch Version Repository Size
========================================================================
Installing:
nfs-utils-lib-debuginfo x86_64 1.1.4-6.fc11 fedora-debuginfo 174 k
Transaction Summary
========================================================================
Install 1 Package(s)
Upgrade 0 Package(s)
Total download size: 174 k
Is this ok [y/N]:
So now we have:
1) 'perf record' records the build-ids into perf.data
2) 'perf buildid-list' list them, distro specific porcelain needed
to do the equivalent to the yum install above.
3) 'perf report' will only use the symtab in a file that has the matching
build-id, if a build-id is found in the perf.data header for a
particular DSO.
So we have a mechanism that is already present in several distros
(build-id), that is in the kernel build process since ~2.6.23, and that
avoids using mismatching DSOs when resolving symbols.
I'm working on some of these aspects, but most of the infrastructure is
already in tip.
- Arnaldo
* Arnaldo Carvalho de Melo <[email protected]> wrote:
> Em Fri, Nov 20, 2009 at 10:03:53AM +0100, Ingo Molnar escreveu:
> >
> > * Li Zefan <[email protected]> wrote:
> >
> > > > (2) doing "perf kmem record" on machine A (think embedded here) and
> > > > then "perf kmem report" on machine B. I haven't tried kmemtrace-user
> > > > for a while but it did support both of them quite nicely at some
> > > > point.
> > >
> > > Everything needed and machine-specific will be recorded in perf.data,
> > > so this should already been supported. I'll try it.
> >
> > Right now the DSOs are not recorded in the perf.data - but it would be
> > useful to add it and to turn perf.data into a self-sufficient capture of
> > all relevant data, which can be analyzed on any box.
>
> Well, the DSOs are recorded in perf.data, just not its symtabs, but now
> we have buildids, so we can ask for them to be installed on the other
> machine and it'll all work. Or should. :)
>
> For instance:
>
> [root@doppio linux-2.6-tip]# perf buildid-list -i perf.data | egrep 'vmlinux|nfs|libc-'
> ec8dd400904ddfcac8b1c343263a790f977159dc /lib64/libc-2.10.1.so
> 0da49504693a200583fda6f1b949e6d2f799e692 /usr/lib64/libnfsidmap_nsswitch.so.0.0.0
> c90269c87eaf08559012a9fa29f60780743360cd /usr/lib64/libnfsidmap.so.0.3.0
> 18e7cc53db62a7d35e9d6f6c9ddc23017d38ee9a vmlinux
> 3982866276471cde6ac5821fdced42a7b1bfd848 [nfs]
> 1489007276a50005753e730198fd93dd05b2082f [nfsd]
> 5a128f082fe7fdcab6fb5d1b71935accb1f34383 [nfs_acl]
> [root@doppio linux-2.6-tip]#
>
> Now if I ask that the buildid for /usr/lib64/libnfsidmap.so.0.3.0 above
> to be installed, like this:
>
> [root@doppio linux-2.6-tip]# yum install /usr/lib/debug/.build-id/c9/0269c87eaf08559012a9fa29f60780743360cd
> Loaded plugins: auto-update-debuginfo, refresh-packagekit
> Found 44 installed debuginfo package(s)
> Enabling fedora-debuginfo: Fedora 11 - x86_64 - Debug
> Reading repository metadata in from local files
> Enabling updates-debuginfo: Fedora 11 - x86_64 - Updates - Debug
> Reading repository metadata in from local files
> Setting up Install Process
> Importing additional filelist information
> Resolving Dependencies
> --> Running transaction check
> ---> Package nfs-utils-lib-debuginfo.x86_64 0:1.1.4-6.fc11 set to be updated
> --> Finished Dependency Resolution
>
> Dependencies Resolved
>
> ========================================================================
> Package Arch Version Repository Size
> ========================================================================
> Installing:
> nfs-utils-lib-debuginfo x86_64 1.1.4-6.fc11 fedora-debuginfo 174 k
>
> Transaction Summary
> ========================================================================
> Install 1 Package(s)
> Upgrade 0 Package(s)
>
> Total download size: 174 k
> Is this ok [y/N]:
>
> So now we have:
>
> 1) 'perf record' records the build-ids into perf.data
> 2) 'perf buildid-list' list them, distro specific porcelain needed
> to do the equivalent to the yum install above.
> 3) 'perf report' will only use the symtab in a file that has the matching
> build-id, if a build-id is found in the perf.data header for a
> particular DSO.
>
> So we have a mechanism that is already present in several distros
> (build-id), that is in the kernel build process since ~2.6.23, and that
> avoids using mismatching DSOs when resolving symbols.
But what do we do if we have another box that runs say on a MIPS CPU,
uses some minimal distro - and copy that perf.data over to an x86 box.
The idea is there to be some new mode of perf.data where all the
relevant DSO contents (symtabs but also sections with instructions for
perf annotate to work) are copied into perf.data, during or after data
capture - on the box that does the recording.
Once we have everything embedded in the perf.data, analysis passes only
have to work based on that particular perf.data - no external data.
Ingo
Em Fri, Nov 20, 2009 at 05:41:10PM +0100, Ingo Molnar escreveu:
> > So we have a mechanism that is already present in several distros
> > (build-id), that is in the kernel build process since ~2.6.23, and that
> > avoids using mismatching DSOs when resolving symbols.
>
> But what do we do if we have another box that runs say on a MIPS CPU,
> uses some minimal distro - and copy that perf.data over to an x86 box.
There would be no problem, it would be just a matter of installing the
right -debuginfo packages, for MIPS.
Or the original, unstripped FS image sent to the machine with the MIPS
cpu, if there aren't -debuginfo packages.
Either one, the right DSOs would be found by the buildids.
There are other scenarios, like a binary that gets updated while a long
running perf record session runs, the way to differentiate between the
two DSOs wouldn't be the name, but the buildid.
> The idea is there to be some new mode of perf.data where all the
> relevant DSO contents (symtabs but also sections with instructions for
> perf annotate to work) are copied into perf.data, during or after data
> capture - on the box that does the recording.
>
> Once we have everything embedded in the perf.data, analysis passes only
> have to work based on that particular perf.data - no external data.
Well, we can do that, additionally, but think about stripped binaries: we
would potentially lose a lot, because the symtabs on that small machine
would be poorer than the ones in an unstripped binary (or in a
-debuginfo package).
- Arnaldo
* Arnaldo Carvalho de Melo <[email protected]> wrote:
> Em Fri, Nov 20, 2009 at 05:41:10PM +0100, Ingo Molnar escreveu:
> > > So we have a mechanism that is already present in several distros
> > > (build-id), that is in the kernel build process since ~2.6.23, and that
> > > avoids using mismatching DSOs when resolving symbols.
> >
> > But what do we do if we have another box that runs say on a MIPS CPU,
> > uses some minimal distro - and copy that perf.data over to an x86 box.
>
> There would be no problem, it would be just a matter of installing the
> right -debuginfo packages, for MIPS.
I haven't tried this - is this really possible to do on an x86 box, with
a typical distro? Can I install say Fedora PowerPC debuginfo packages on
an x86 box, while also having the x86 debuginfo packages there?
> Or the original, unstripped FS image sent to the machine with the MIPS
> cpu, if there aren't -debuginfo packages.
>
> Either one, the right DSOs would be found by the buildids.
>
> There are other scenarios, like a binary that gets updated while a long
> running perf record session runs, the way to differentiate between the
> two DSOs wouldn't be the name, but the buildid.
>
> > The idea is there to be some new mode of perf.data where all the
> > relevant DSO contents (symtabs but also sections with instructions for
> > perf annotate to work) are copied into perf.data, during or after data
> > capture - on the box that does the recording.
> >
> > Once we have everything embedded in the perf.data, analysis passes only
> > have to work based on that particular perf.data - no external data.
>
> Well, we can do that, additionally, but think about stripped binaries, we
> would lose potentially a lot because the symtabs on that small machine
> would have poorer symtabs than the ones in an unstripped binary (or in
> a -debuginfo package).
We should definitely use the widest and best quality information we can
- if it's available.
So even if we 'inline' any information from the box, if there's better
info available at the time of analysis, we should use that too.
Basically what matters is the principle of 'what is possible'.
If a user records on a box and analyses on a different box, and we end
up not doing something (and printing an error or displaying an empty
profile) that could reasonably have been done, then the user will be
unhappy and we might lose that user.
The user won't be unhappy about us using a big set of data sources that
we can recover information from transparently. The user will be unhappy
if we insist on (and force) a certain form of information source - such
as debuginfo.
Ingo
On Mon, 2009-11-23 at 07:51 +0100, Ingo Molnar wrote:
>
> * Arnaldo Carvalho de Melo <[email protected]> wrote:
>
> > Em Fri, Nov 20, 2009 at 05:41:10PM +0100, Ingo Molnar escreveu:
> > > > So we have a mechanism that is already present in several distros
> > > > (build-id), that is in the kernel build process since ~2.6.23, and that
> > > > avoids using mismatching DSOs when resolving symbols.
> > >
> > > But what do we do if we have another box that runs say on a MIPS CPU,
> > > uses some minimal distro - and copy that perf.data over to an x86 box.
> >
> > There would be no problem, it would be just a matter of installing the
> > right -debuginfo packages, for MIPS.
>
> I haven't tried this - is this really possible to do on an x86 box, with
> a typical distro? Can I install say Fedora PowerPC debuginfo packages on
> an x86 box, while also having the x86 debuginfo packages there?
The best option would be to allow specifying a chroot parameter, where
we can specify the embedded root filesystem on our machine.
I'm not even sure embedded distros even have this separate debug package
craziness; you simply build the distro with or without debuginfo.
* Peter Zijlstra <[email protected]> wrote:
> > I havent tried this - is this really possible to do on an x86 box,
> > with a typical distro? Can i install say Fedora PowerPC debuginfo
> > packages on an x86 box, while also having the x86 debuginfo packages
> > there?
>
> The best option would be to allow specifying a chroot parameter, where
> we can specify the embedded root filesystem on our machine.
>
> I'm not even sure embedded distros even have this separate debug
> package craziness; you simply build the distro with or without
> debuginfo.
yes - we could use -R/--root (which opreport has as well), as a
mandatory path prefix to all DSO/debuginfo searches.
Ingo
Em Mon, Nov 23, 2009 at 07:51:10AM +0100, Ingo Molnar escreveu:
>
> * Arnaldo Carvalho de Melo <[email protected]> wrote:
>
> > Em Fri, Nov 20, 2009 at 05:41:10PM +0100, Ingo Molnar escreveu:
> > > > So we have a mechanism that is already present in several distros
> > > > (build-id), that is in the kernel build process since ~2.6.23, and that
> > > > avoids using mismatching DSOs when resolving symbols.
> > >
> > > But what do we do if we have another box that runs say on a MIPS CPU,
> > > uses some minimal distro - and copy that perf.data over to an x86 box.
> >
> > There would be no problem, it would be just a matter of installing the
> > right -debuginfo packages, for MIPS.
>
> I haven't tried this - is this really possible to do on an x86 box, with
> a typical distro? Can I install say Fedora PowerPC debuginfo packages on
> an x86 box, while also having the x86 debuginfo packages there?
I should have added "in theory", as I haven't tested this as well using
the current tools, but it should :)
> > Or the original, unstripped FS image sent to the machine with the MIPS
> > cpu, if there aren't -debuginfo packages.
> >
> > Either one, the right DSOs would be found by the buildids.
> >
> > There are other scenarios, like a binary that gets updated while a long
> > running perf record session runs, the way to differentiate between the
> > two DSOs wouldn't be the name, but the buildid.
> >
> > > The idea is there to be some new mode of perf.data where all the
> > > relevant DSO contents (symtabs but also sections with instructions for
> > > perf annotate to work) are copied into perf.data, during or after data
> > > capture - on the box that does the recording.
> > >
> > > Once we have everything embedded in the perf.data, analysis passes only
> > > have to work based on that particular perf.data - no external data.
> >
> > Well, we can do that, additionally, but think about stripped binaries, we
> > would lose potentially a lot because the symtabs on that small machine
> > would have poorer symtabs than the ones in an unstripped binary (or in
> > a -debuginfo package).
>
> We should definitely use the widest and best quality information we can
> - if it's available.
>
> So even if we 'inline' any information from the box, if there's better
> info available at the time of analysis, we should use that too.
>
> Basically what matters is the principle of 'what is possible'.
>
> If a user records on a box and analyses on a different box, and we end
> up not doing something (and printing an error or displaying an empty
> profile) that could reasonably have been done, then the user will be
> unhappy and we might lose that user.
>
> The user won't be unhappy about us using a big set of data sources that
> we can recover information from transparently. The user will be unhappy
> if we insist on (and force) a certain form of information source - such
> as debuginfo.
Sure thing, I'm thinking about how to encode the perf.data file inside
an ELF section while merging all symtabs to reduce size by sharing the
strings table, etc.
The dso__load routine already does that fallback from what is best
(debuginfo packages) to what is available (the symtab, dynsym tables in
the DSO itself), it's just a matter of efficiently encoding the symtabs
into the perf.data file and that will be another source of symbols if
the preferred one (debuginfo) is not available.
- Arnaldo
Em Mon, Nov 23, 2009 at 08:22:21AM +0100, Peter Zijlstra escreveu:
> On Mon, 2009-11-23 at 07:51 +0100, Ingo Molnar wrote:
> > I havent tried this - is this really possible to do on an x86 box, with
> > a typical distro? Can i install say Fedora PowerPC debuginfo packages on
> > an x86 box, while also having the x86 debuginfo packages there?
>
> The best option would be to allow specifying a chroot parameter, where
> we can specify the embedded root filesystem on our machine.
yeah, I'm working now on a vmlinux_path, so that the symbol machinery in
perf looks at /lib/modules/`uname -r`/build/vmlinux,
/usr/lib/debug/lib/modules/`uname -r`/vmlinux, ./vmlinux as a default or
in getenv("VMLINUX_PATH") if set. Being able to specify a
SYMTAB_PREFIX_PATH also should be possible.
> I'm not even sure embedded distros even have this separate debug package
> craziness, you simply build the distro with or without debuginfo.
Whatever craziness people usually do to find the files with matching,
richer symtabs we should support :)
- Arnaldo
On Fri, 2009-11-20 at 11:49 +0100, Ingo Molnar wrote:
> >
> > You're right, of course. With kmemtrace-user, I just copied the raw
> > trace file from /sys/kernel. I wonder if that's a good enough reason
> > to keep kmemtrace bits around?
>
> Not really. If then a light-weight recording app could be made but I'd
> rather wait for actual use cases to pop up.
Hmm, but isn't this an actual use case?
-- Steve
* Steven Rostedt <[email protected]> wrote:
> On Fri, 2009-11-20 at 11:49 +0100, Ingo Molnar wrote:
> > >
> > > You're right, of course. With kmemtrace-user, I just copied the raw
> > > trace file from /sys/kernel. I wonder if that's a good enough reason
> > > to keep kmemtrace bits around?
> >
> > Not really. If then a light-weight recording app could be made but
> > I'd rather wait for actual use cases to pop up.
>
> Hmm, but isn't this an actual use case?
Not really - perf record is pretty lightweight and you'd want perf for
hands-on stats anyway.
Ingo