This is a tool to trace the schedule events related to a
specified task, e.g. migration, sched in/out, wakeup and
sleep/block.
Each event is translated into a sentence to make it more readable.
For example, by executing the command 'task_detector -p 49870' we
continuously trace the schedule events related to 'top' like:
----------------------------
923455517688 CPU=23 PID=49870 COMM=top ENQUEUE
923455519633 CPU=23 PID=0 COMM=IDLE PREEMPTED 1945ns
923455519868 CPU=23 PID=49870 COMM=top EXECUTE AFTER WAITED 2180ns
923468279019 CPU=23 PID=49870 COMM=top WAIT AFTER EXECUTED 12ms
923468279220 CPU=23 PID=128 COMM=ksoftirqd/23 PREEMPT
923468283051 CPU=23 PID=128 COMM=ksoftirqd/23 DEQUEUE AFTER PREEMPTED 3831ns
923468283216 CPU=23 PID=49870 COMM=top EXECUTE AFTER WAITED 4197ns
923476280180 CPU=23 PID=49870 COMM=top WAIT AFTER EXECUTED 7996us
923476280350 CPU=23 PID=128 COMM=ksoftirqd/23 PREEMPT
923476322029 CPU=23 PID=128 COMM=ksoftirqd/23 DEQUEUE AFTER PREEMPTED 41us
923476322150 CPU=23 PID=49870 COMM=top EXECUTE AFTER WAITED 41us
923479726879 CPU=23 PID=49870 COMM=top DEQUEUE AFTER EXECUTED 3404us
----------------------------
This could be helpful when debugging competition for CPU
resources, to find out who has stolen the CPU and for how
long.
It can also trace syscalls when the -s option is appended.
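For instance, 'task_detector -p 49870 -s' additionally prints one
line per syscall enter and exit of the target, in the same format
as above (the timestamps and syscall numbers below are illustrative
only):

----------------------------
923480001234 CPU=23 PID=49870 COMM=top SC [0:read] ENTER
923480003456 CPU=23 PID=49870 COMM=top SC [0:read] TAKE 2222ns TO EXIT
----------------------------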
Signed-off-by: Michael Wang <[email protected]>
---
samples/bpf/Makefile | 3 +
samples/bpf/task_detector.h | 382 +++++++++++++++++++++++++++++++++++++++
samples/bpf/task_detector_kern.c | 329 +++++++++++++++++++++++++++++++++
samples/bpf/task_detector_user.c | 314 ++++++++++++++++++++++++++++++++
4 files changed, 1028 insertions(+)
create mode 100644 samples/bpf/task_detector.h
create mode 100644 samples/bpf/task_detector_kern.c
create mode 100644 samples/bpf/task_detector_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 8403e4762306..a8e468e0eac9 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -53,6 +53,7 @@ tprogs-y += task_fd_query
tprogs-y += xdp_sample_pkts
tprogs-y += ibumad
tprogs-y += hbm
+tprogs-y += task_detector
# Libbpf dependencies
LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -109,6 +110,7 @@ task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
+task_detector-objs := bpf_load.o task_detector_user.o $(TRACE_HELPERS)
# Tell kbuild to always build the programs
always-y := $(tprogs-y)
@@ -170,6 +172,7 @@ always-y += ibumad_kern.o
always-y += hbm_out_kern.o
always-y += hbm_edt_kern.o
always-y += xdpsock_kern.o
+always-y += task_detector_kern.o
ifeq ($(ARCH), arm)
# Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux
diff --git a/samples/bpf/task_detector.h b/samples/bpf/task_detector.h
new file mode 100644
index 000000000000..9b5f3f73a46a
--- /dev/null
+++ b/samples/bpf/task_detector.h
@@ -0,0 +1,382 @@
+#ifndef TASK_DETECTOR_H
+#define TASK_DETECTOR_H
+
+#define NR_ENTRY_MAX 10000
+#define NR_CPU_MAX 512
+
+typedef unsigned long long u64;
+typedef int pid_t;
+
+enum {
+ TYPE_MIGRATE,
+ TYPE_ENQUEUE,
+ TYPE_WAIT,
+ TYPE_EXECUTE,
+ TYPE_DEQUEUE,
+ TYPE_SYSCALL_ENTER,
+ TYPE_SYSCALL_EXIT,
+};
+
+struct user_info {
+ pid_t pid;
+ int exit;
+ int trace_syscall;
+};
+
+struct trace_info {
+ int type;
+ int cpu;
+ int syscall;
+ pid_t pid;
+ u64 ts;
+ char comm[16];
+};
+
+struct si_key {
+ int cpu;
+ int syscall;
+ pid_t pid;
+};
+
+char *syscall_name[] = {
+ "read",
+ "write",
+ "open",
+ "close",
+ "stat",
+ "fstat",
+ "lstat",
+ "poll",
+ "lseek",
+ "mmap",
+ "mprotect",
+ "munmap",
+ "brk",
+ "rt_sigaction",
+ "rt_sigprocmask",
+ "rt_sigreturn",
+ "ioctl",
+ "pread64",
+ "pwrite64",
+ "readv",
+ "writev",
+ "access",
+ "pipe",
+ "select",
+ "sched_yield",
+ "mremap",
+ "msync",
+ "mincore",
+ "madvise",
+ "shmget",
+ "shmat",
+ "shmctl",
+ "dup",
+ "dup2",
+ "pause",
+ "nanosleep",
+ "getitimer",
+ "alarm",
+ "setitimer",
+ "getpid",
+ "sendfile",
+ "socket",
+ "connect",
+ "accept",
+ "sendto",
+ "recvfrom",
+ "sendmsg",
+ "recvmsg",
+ "shutdown",
+ "bind",
+ "listen",
+ "getsockname",
+ "getpeername",
+ "socketpair",
+ "setsockopt",
+ "getsockopt",
+ "clone",
+ "fork",
+ "vfork",
+ "execve",
+ "exit",
+ "wait4",
+ "kill",
+ "uname",
+ "semget",
+ "semop",
+ "semctl",
+ "shmdt",
+ "msgget",
+ "msgsnd",
+ "msgrcv",
+ "msgctl",
+ "fcntl",
+ "flock",
+ "fsync",
+ "fdatasync",
+ "truncate",
+ "ftruncate",
+ "getdents",
+ "getcwd",
+ "chdir",
+ "fchdir",
+ "rename",
+ "mkdir",
+ "rmdir",
+ "creat",
+ "link",
+ "unlink",
+ "symlink",
+ "readlink",
+ "chmod",
+ "fchmod",
+ "chown",
+ "fchown",
+ "lchown",
+ "umask",
+ "gettimeofday",
+ "getrlimit",
+ "getrusage",
+ "sysinfo",
+ "times",
+ "ptrace",
+ "getuid",
+ "syslog",
+ "getgid",
+ "setuid",
+ "setgid",
+ "geteuid",
+ "getegid",
+ "setpgid",
+ "getppid",
+ "getpgrp",
+ "setsid",
+ "setreuid",
+ "setregid",
+ "getgroups",
+ "setgroups",
+ "setresuid",
+ "getresuid",
+ "setresgid",
+ "getresgid",
+ "getpgid",
+ "setfsuid",
+ "setfsgid",
+ "getsid",
+ "capget",
+ "capset",
+ "rt_sigpending",
+ "rt_sigtimedwait",
+ "rt_sigqueueinfo",
+ "rt_sigsuspend",
+ "sigaltstack",
+ "utime",
+ "mknod",
+ "uselib",
+ "personality",
+ "ustat",
+ "statfs",
+ "fstatfs",
+ "sysfs",
+ "getpriority",
+ "setpriority",
+ "sched_setparam",
+ "sched_getparam",
+ "sched_setscheduler",
+ "sched_getscheduler",
+ "sched_get_priority_max",
+ "sched_get_priority_min",
+ "sched_rr_get_interval",
+ "mlock",
+ "munlock",
+ "mlockall",
+ "munlockall",
+ "vhangup",
+ "modify_ldt",
+ "pivot_root",
+ "_sysctl",
+ "prctl",
+ "arch_prctl",
+ "adjtimex",
+ "setrlimit",
+ "chroot",
+ "sync",
+ "acct",
+ "settimeofday",
+ "mount",
+ "umount2",
+ "swapon",
+ "swapoff",
+ "reboot",
+ "sethostname",
+ "setdomainname",
+ "iopl",
+ "ioperm",
+ "create_module",
+ "init_module",
+ "delete_module",
+ "get_kernel_syms",
+ "query_module",
+ "quotactl",
+ "nfsservctl",
+ "getpmsg",
+ "putpmsg",
+ "afs_syscall",
+ "tuxcall",
+ "security",
+ "gettid",
+ "readahead",
+ "setxattr",
+ "lsetxattr",
+ "fsetxattr",
+ "getxattr",
+ "lgetxattr",
+ "fgetxattr",
+ "listxattr",
+ "llistxattr",
+ "flistxattr",
+ "removexattr",
+ "lremovexattr",
+ "fremovexattr",
+ "tkill",
+ "time",
+ "futex",
+ "sched_setaffinity",
+ "sched_getaffinity",
+ "set_thread_area",
+ "io_setup",
+ "io_destroy",
+ "io_getevents",
+ "io_submit",
+ "io_cancel",
+ "get_thread_area",
+ "lookup_dcookie",
+ "epoll_create",
+ "epoll_ctl_old",
+ "epoll_wait_old",
+ "remap_file_pages",
+ "getdents64",
+ "set_tid_address",
+ "restart_syscall",
+ "semtimedop",
+ "fadvise64",
+ "timer_create",
+ "timer_settime",
+ "timer_gettime",
+ "timer_getoverrun",
+ "timer_delete",
+ "clock_settime",
+ "clock_gettime",
+ "clock_getres",
+ "clock_nanosleep",
+ "exit_group",
+ "epoll_wait",
+ "epoll_ctl",
+ "tgkill",
+ "utimes",
+ "vserver",
+ "mbind",
+ "set_mempolicy",
+ "get_mempolicy",
+ "mq_open",
+ "mq_unlink",
+ "mq_timedsend",
+ "mq_timedreceive",
+ "mq_notify",
+ "mq_getsetattr",
+ "kexec_load",
+ "waitid",
+ "add_key",
+ "request_key",
+ "keyctl",
+ "ioprio_set",
+ "ioprio_get",
+ "inotify_init",
+ "inotify_add_watch",
+ "inotify_rm_watch",
+ "migrate_pages",
+ "openat",
+ "mkdirat",
+ "mknodat",
+ "fchownat",
+ "futimesat",
+ "newfstatat",
+ "unlinkat",
+ "renameat",
+ "linkat",
+ "symlinkat",
+ "readlinkat",
+ "fchmodat",
+ "faccessat",
+ "pselect6",
+ "ppoll",
+ "unshare",
+ "set_robust_list",
+ "get_robust_list",
+ "splice",
+ "tee",
+ "sync_file_range",
+ "vmsplice",
+ "move_pages",
+ "utimensat",
+ "epoll_pwait",
+ "signalfd",
+ "timerfd_create",
+ "eventfd",
+ "fallocate",
+ "timerfd_settime",
+ "timerfd_gettime",
+ "accept4",
+ "signalfd4",
+ "eventfd2",
+ "epoll_create1",
+ "dup3",
+ "pipe2",
+ "inotify_init1",
+ "preadv",
+ "pwritev",
+ "rt_tgsigqueueinfo",
+ "perf_event_open",
+ "recvmmsg",
+ "fanotify_init",
+ "fanotify_mark",
+ "prlimit64",
+ "name_to_handle_at",
+ "open_by_handle_at",
+ "clock_adjtime",
+ "syncfs",
+ "sendmmsg",
+ "setns",
+ "getcpu",
+ "process_vm_readv",
+ "process_vm_writev",
+ "kcmp",
+ "finit_module",
+ "sched_setattr",
+ "sched_getattr",
+ "renameat2",
+ "seccomp",
+ "getrandom",
+ "memfd_create",
+ "kexec_file_load",
+ "bpf",
+ "execveat",
+ "userfaultfd",
+ "membarrier",
+ "mlock2",
+ "copy_file_range",
+ "preadv2",
+ "pwritev2",
+ "pkey_mprotect",
+ "pkey_alloc",
+ "pkey_free",
+ "statx",
+ "io_pgetevents",
+ "rseq",
+ "io_uring_setup",
+ "io_uring_enter",
+ "io_uring_register",
+};
+
+#endif
diff --git a/samples/bpf/task_detector_kern.c b/samples/bpf/task_detector_kern.c
new file mode 100644
index 000000000000..35a7f1cba6bb
--- /dev/null
+++ b/samples/bpf/task_detector_kern.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifdef asm_volatile_goto
+#undef asm_volatile_goto
+#endif
+
+#define asm_volatile_goto(x...) asm volatile("invalid use of asm_volatile_goto")
+
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "task_detector.h"
+
+#define DEBUG_ON 0
+
+struct bpf_map_def SEC("maps") user_info_maps = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct user_info),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") trace_info_maps = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct trace_info),
+ .max_entries = NR_ENTRY_MAX,
+};
+
+struct bpf_map_def SEC("maps") syscall_info_maps = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(struct si_key),
+ .value_size = sizeof(u64),
+ .max_entries = NR_ENTRY_MAX,
+};
+
+struct bpf_map_def SEC("maps") start_maps = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") end_maps = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") trace_on_maps = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+static inline void *get_user_info(void)
+{
+ int key = 0;
+
+ return bpf_map_lookup_elem(&user_info_maps, &key);
+}
+
+static inline void set_trace_on(int cpu)
+{
+ int key = 0;
+ int *trace_on = bpf_map_lookup_elem(&trace_on_maps, &key);
+
+ if (trace_on)
+ *trace_on = cpu + 1;
+}
+
+static inline void set_trace_off(void)
+{
+ int key = 0;
+ int *trace_on = bpf_map_lookup_elem(&trace_on_maps, &key);
+
+ if (trace_on)
+ *trace_on = 0;
+}
+
+static inline int should_trace(int cpu)
+{
+ int key = 0;
+ int *trace_on = bpf_map_lookup_elem(&trace_on_maps, &key);
+
+ return (trace_on && *trace_on == cpu + 1);
+}
+
+static inline void *get_trace(int key)
+{
+ return bpf_map_lookup_elem(&trace_info_maps, &key);
+}
+
+static inline void add_trace(struct trace_info ti)
+{
+ int orig_end, key = 0;
+ struct trace_info *tip;
+ int *start, *end;
+
+ start = bpf_map_lookup_elem(&start_maps, &key);
+ end = bpf_map_lookup_elem(&end_maps, &key);
+
+ if (!start || !end)
+ return;
+
+ orig_end = *end;
+ *end = (*end + 1);
+
+ if (*end == NR_ENTRY_MAX)
+ *end = 0;
+
+ if (*end == *start)
+ return;
+
+ tip = get_trace(orig_end);
+ if (!tip) {
+ *end = orig_end;
+ return;
+ }
+
+ memcpy(tip, &ti, sizeof(ti));
+ tip->ts = bpf_ktime_get_ns();
+
+ if (ti.type == TYPE_SYSCALL_ENTER ||
+ ti.type == TYPE_SYSCALL_EXIT ||
+ ti.type == TYPE_WAIT ||
+ ti.type == TYPE_DEQUEUE)
+ bpf_get_current_comm(&tip->comm, sizeof(tip->comm));
+}
+
+struct sched_process_exit_args {
+ unsigned short common_type;
+ unsigned char common_flags;
+ unsigned char common_preempt_count;
+ int common_pid;
+ char comm[16];
+ pid_t pid;
+ int prio;
+};
+
+SEC("tracepoint/sched/sched_process_exit")
+int bpf_trace_sched_process_exit(struct sched_process_exit_args *args)
+{
+ struct user_info *ui = get_user_info();
+
+ if (ui && ui->pid && ui->pid == args->pid)
+ ui->exit = 1;
+
+ return 0;
+}
+
+struct sched_migrate_task_args {
+ unsigned short common_type;
+ unsigned char common_flags;
+ unsigned char common_preempt_count;
+ int common_pid;
+ char comm[16];
+ pid_t pid;
+ int prio;
+ int orig_cpu;
+ int dest_cpu;
+};
+
+SEC("tracepoint/sched/sched_migrate_task")
+int bpf_trace_sched_migrate_task(struct sched_migrate_task_args *args)
+{
+ struct trace_info ti = {
+ .cpu = args->dest_cpu,
+ .pid = args->pid,
+ .type = TYPE_MIGRATE,
+ };
+ struct user_info *ui = get_user_info();
+
+ if (!ui || !ui->pid || ui->pid != args->pid)
+ return 0;
+
+ set_trace_on(1 + args->dest_cpu);
+
+ add_trace(ti);
+
+ return 0;
+}
+
+struct sched_wakeup_args {
+ unsigned short common_type;
+ unsigned char common_flags;
+ unsigned char common_preempt_count;
+ int common_pid;
+ char comm[16];
+ pid_t pid;
+ int prio;
+ int success;
+ int target_cpu;
+};
+
+SEC("tracepoint/sched/sched_wakeup")
+int bpf_trace_sched_wakeup(struct sched_wakeup_args *args)
+{
+ struct trace_info ti = {
+ .cpu = args->target_cpu,
+ .pid = args->pid,
+ .type = TYPE_ENQUEUE,
+ };
+ struct user_info *ui = get_user_info();
+
+ if (!ui || !ui->pid || ui->pid != args->pid)
+ return 0;
+
+ set_trace_on(1 + args->target_cpu);
+
+ add_trace(ti);
+
+ return 0;
+}
+
+struct sched_switch_args {
+ unsigned short common_type;
+ unsigned char common_flags;
+ unsigned char common_preempt_count;
+ int common_pid;
+ char prev_comm[16];
+ pid_t prev_pid;
+ int prev_prio;
+ long prev_state;
+ char next_comm[16];
+ pid_t next_pid;
+ int next_prio;
+};
+
+SEC("tracepoint/sched/sched_switch")
+int bpf_trace_sched_switch(struct sched_switch_args *args)
+{
+ struct trace_info ti = {
+ .cpu = bpf_get_smp_processor_id(),
+ };
+ struct user_info *ui = get_user_info();
+
+ if (!ui || !ui->pid)
+ return 0;
+
+ if (!should_trace(ti.cpu + 1)) {
+ if (ui->pid != args->prev_pid &&
+ ui->pid != args->next_pid)
+ return 0;
+
+ set_trace_on(ti.cpu + 1);
+
+ ti.pid = ui->pid;
+ ti.type = TYPE_MIGRATE;
+ add_trace(ti);
+ }
+
+ if (args->prev_state != TASK_RUNNING &&
+ args->prev_state != TASK_REPORT_MAX) {
+ if (ui->pid == args->prev_pid)
+ set_trace_off();
+ ti.type = TYPE_DEQUEUE;
+ } else
+ ti.type = TYPE_WAIT;
+ ti.pid = args->prev_pid;
+ add_trace(ti);
+
+ if (!should_trace(ti.cpu + 1))
+ return 0;
+
+ ti.type = TYPE_EXECUTE;
+ ti.pid = args->next_pid;
+ add_trace(ti);
+
+ return 0;
+}
+
+struct sys_enter_args {
+ unsigned short common_type;
+ unsigned char common_flags;
+ unsigned char common_preempt_count;
+ int common_pid;
+ long id;
+ unsigned long args[6];
+};
+
+SEC("tracepoint/raw_syscalls/sys_enter")
+int bpf_trace_sys_enter(struct sys_enter_args *args)
+{
+ struct trace_info ti = {
+ .cpu = bpf_get_smp_processor_id(),
+ .pid = bpf_get_current_pid_tgid(),
+ .type = TYPE_SYSCALL_ENTER,
+ .syscall = args->id,
+ };
+ struct user_info *ui = get_user_info();
+
+ if (args->id && ui && ui->trace_syscall && should_trace(ti.cpu + 1))
+ add_trace(ti);
+
+ return 0;
+}
+
+struct sys_exit_args {
+ unsigned short common_type;
+ unsigned char common_flags;
+ unsigned char common_preempt_count;
+ int common_pid;
+ long id;
+ long ret;
+};
+
+SEC("tracepoint/raw_syscalls/sys_exit")
+int bpf_trace_sys_exit(struct sys_exit_args *args)
+{
+ struct trace_info ti = {
+ .cpu = bpf_get_smp_processor_id(),
+ .pid = bpf_get_current_pid_tgid(),
+ .type = TYPE_SYSCALL_EXIT,
+ .syscall = args->id,
+ };
+ struct user_info *ui = get_user_info();
+
+ if (args->id && ui && ui->trace_syscall && should_trace(ti.cpu + 1))
+ add_trace(ti);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_detector_user.c b/samples/bpf/task_detector_user.c
new file mode 100644
index 000000000000..d113546acddd
--- /dev/null
+++ b/samples/bpf/task_detector_user.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sched.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+#include <dirent.h>
+
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include "trace_helpers.h"
+#include "task_detector.h"
+
+#define ROOT_CG "/sys/fs/cgroup/cpu/"
+
+static int ui_map_fd;
+static int ti_map_fd;
+static int si_map_fd;
+static int start_map_fd;
+static int end_map_fd;
+
+static int nr_cpus;
+static int trace_syscall;
+static char *target;
+
+static inline int get_comm(int pid, char comm[])
+{
+ int fd, ret = 1;
+ char path_buf[512];
+
+ comm[0] = '\0';
+
+ if (!pid) {
+ strcpy(comm, "IDLE");
+ return 0;
+ }
+
+ snprintf(path_buf, sizeof(path_buf), "/proc/%d/comm", pid);
+ fd = open(path_buf, O_RDONLY);
+ if (fd >= 0) {
+ int cnt;
+
+ cnt = read(fd, comm, 16);
+ if (cnt > 0) {
+ comm[cnt - 1] = '\0';
+ ret = 0;
+ }
+ }
+ close(fd);
+ return ret;
+}
+
+static inline int time_to_str(u64 ns, char *buf, size_t len)
+{
+ u64 us = ns / 1000;
+ u64 ms = us / 1000;
+ u64 s = ms / 1000;
+
+ if (s > 10)
+ snprintf(buf, len, "%llus", s);
+ else if (ms > 10)
+ snprintf(buf, len, "%llums", ms);
+ else if (us > 10)
+ snprintf(buf, len, "%lluus", us);
+ else
+ snprintf(buf, len, "%lluns", ns);
+
+ return (s || ms > 10);
+}
+
+static inline void pr_ti(struct trace_info *ti, char *opt, char *delay)
+{
+ char comm[16];
+
+ if (get_comm(ti->pid, comm))
+ memcpy(comm, ti->comm, sizeof(comm));
+
+ printf("%-27lluCPU=%-7dPID=%-7dCOMM=%-20s%-37s%-17s\n",
+ ti->ts, ti->cpu, ti->pid, comm, opt,
+ delay ? delay : "");
+}
+
+static int print_trace_info(int start, int end)
+{
+ int key;
+ char d_str[16];
+ static u64 w_start, p_start;
+ static struct trace_info ti_last;
+
+ for (key = start; key != end; key = (key + 1) % NR_ENTRY_MAX) {
+ char func[37];
+ struct trace_info ti;
+ struct si_key sik;
+ u64 siv;
+
+ if (bpf_map_lookup_elem(ti_map_fd, &key, &ti))
+ continue;
+
+ time_to_str(ti.ts - ti_last.ts, d_str, sizeof(d_str));
+
+ switch (ti.type) {
+ case TYPE_MIGRATE:
+ w_start = p_start = ti.ts;
+ pr_ti(&ti, "MIGRATE", NULL);
+ break;
+ case TYPE_ENQUEUE:
+ w_start = p_start = ti.ts;
+ printf("----------------------------\n");
+ pr_ti(&ti, "ENQUEUE", NULL);
+ break;
+ case TYPE_WAIT:
+ if (ti.pid == atoi(target)) {
+ w_start = ti.ts;
+ pr_ti(&ti, "WAIT AFTER EXECUTED", d_str);
+ } else {
+ time_to_str(ti.ts - p_start,
+ d_str, sizeof(d_str));
+ pr_ti(&ti, "PREEMPTED", d_str);
+ }
+ break;
+ case TYPE_EXECUTE:
+ if (ti.pid == atoi(target)) {
+ time_to_str(ti.ts - w_start,
+ d_str, sizeof(d_str));
+ pr_ti(&ti, "EXECUTE AFTER WAITED", d_str);
+ } else {
+ p_start = ti.ts;
+ pr_ti(&ti, "PREEMPT", NULL);
+ }
+ break;
+ case TYPE_DEQUEUE:
+ if (ti.pid == atoi(target))
+ pr_ti(&ti, "DEQUEUE AFTER EXECUTED", d_str);
+ else {
+ time_to_str(ti.ts - p_start,
+ d_str, sizeof(d_str));
+ pr_ti(&ti, "DEQUEUE AFTER PREEMPTED", d_str);
+ }
+ break;
+ case TYPE_SYSCALL_ENTER:
+ siv = ti.ts;
+ sik.cpu = ti.cpu;
+ sik.pid = ti.pid;
+ sik.syscall = ti.syscall;
+ bpf_map_update_elem(si_map_fd, &sik, &siv, BPF_ANY);
+ snprintf(func, sizeof(func), "SC [%d:%s] ENTER",
+ ti.syscall, syscall_name[ti.syscall]);
+ pr_ti(&ti, func, NULL);
+ break;
+ case TYPE_SYSCALL_EXIT:
+ sik.cpu = ti.cpu;
+ sik.pid = ti.pid;
+ sik.syscall = ti.syscall;
+ if (bpf_map_lookup_elem(si_map_fd, &sik, &siv))
+ break;
+ time_to_str(ti.ts - siv, d_str, sizeof(d_str));
+ bpf_map_delete_elem(si_map_fd, &sik);
+ snprintf(func, sizeof(func), "SC [%d:%s] TAKE %s TO EXIT",
+ ti.syscall, syscall_name[ti.syscall], d_str);
+ pr_ti(&ti, func, NULL);
+ break;
+ default:
+ break;
+ }
+
+ memcpy(&ti_last, &ti, sizeof(ti));
+ }
+
+ return end;
+}
+
+static int setup_user_info(char *pid)
+{
+ int key = 0;
+ DIR *dir;
+ char buf[256];
+ struct user_info ui;
+
+ snprintf(buf, sizeof(buf), "/proc/%s", pid);
+
+ dir = opendir(buf);
+ if (!dir) {
+ printf("Open %s failed: %s\n", buf, strerror(errno));
+ return -1;
+ }
+
+ memset(&ui, 0, sizeof(ui));
+ ui.pid = atoi(pid);
+ ui.trace_syscall = trace_syscall;
+
+ bpf_map_update_elem(ui_map_fd, &key, &ui, BPF_ANY);
+
+ closedir(dir);
+
+ return 0;
+}
+
+static inline void print_help(char *cmd)
+{
+ fprintf(stderr,
+ "Usage: %s [options]\n"
+ "\t-p PID\n"
+ "\t-s Trace SYSCALL Info\n"
+ , cmd);
+}
+
+static void int_exit(int sig)
+{
+ exit(0);
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int opt;
+
+ while ((opt = getopt(argc, argv, "p:i:s")) != -1) {
+ switch (opt) {
+ case 'p':
+ target = optarg;
+ break;
+ case 's':
+ trace_syscall = 1;
+ break;
+ default:
+help:
+ print_help(argv[0]);
+ return 1;
+ }
+ }
+
+ if (!target)
+ goto help;
+
+ nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ if (nr_cpus > NR_CPU_MAX) {
+ printf("Support Maximum %d cpus\n", NR_CPU_MAX);
+ return 1;
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ printf("Failed To Set Unlimited Memory\n");
+ return 1;
+ }
+
+ if (load_kallsyms()) {
+ printf("Load kallsyms Failed\n");
+ return 1;
+ }
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (load_bpf_file(filename)) {
+ snprintf(filename, sizeof(filename), "/bin/%s_kern.o", argv[0]);
+ if (load_bpf_file(filename)) {
+ printf("Load %s Failed\n%s", filename, bpf_log_buf);
+ return 1;
+ }
+ }
+
+ ui_map_fd = map_fd[0];
+ ti_map_fd = map_fd[1];
+ si_map_fd = map_fd[2];
+ start_map_fd = map_fd[3];
+ end_map_fd = map_fd[4];
+
+ if (setup_user_info(target)) {
+ printf("Illegal Target %s\n", target);
+ return 1;
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ while (1) {
+ int key = 0, start = 0, end = 0;
+ struct user_info ui = {
+ .exit = 0,
+ };
+
+ bpf_map_lookup_elem(ui_map_fd, &key, &ui);
+ if (ui.exit) {
+ printf("Target \"%s\" Destroyed\n", target);
+ return 0;
+ }
+
+ bpf_map_lookup_elem(start_map_fd, &key, &start);
+ bpf_map_lookup_elem(end_map_fd, &key, &end);
+
+ if (start == end) {
+ sleep(1);
+ continue;
+ }
+
+ start = print_trace_info(start, end);
+
+ bpf_map_update_elem(start_map_fd, &key, &start, BPF_ANY);
+ }
+
+ return 0;
+}
--
2.14.4.44.g2045bb6
On Wed, May 27, 2020 at 7:53 PM 王贇 <[email protected]> wrote:
>
> This is a tool to trace the related schedule events of a
> specified task, eg the migration, sched in/out, wakeup and
> sleep/block.
>
> The event was translated into sentence to be more readable,
> by execute command 'task_detector -p 49870' we continually
> tracing the schedule events related to 'top' like:
>
> ----------------------------
> 923455517688 CPU=23 PID=49870 COMM=top ENQUEUE
> 923455519633 CPU=23 PID=0 COMM=IDLE PREEMPTED 1945ns
> 923455519868 CPU=23 PID=49870 COMM=top EXECUTE AFTER WAITED 2180ns
> 923468279019 CPU=23 PID=49870 COMM=top WAIT AFTER EXECUTED 12ms
> 923468279220 CPU=23 PID=128 COMM=ksoftirqd/23 PREEMPT
> 923468283051 CPU=23 PID=128 COMM=ksoftirqd/23 DEQUEUE AFTER PREEMPTED 3831ns
> 923468283216 CPU=23 PID=49870 COMM=top EXECUTE AFTER WAITED 4197ns
> 923476280180 CPU=23 PID=49870 COMM=top WAIT AFTER EXECUTED 7996us
> 923476280350 CPU=23 PID=128 COMM=ksoftirqd/23 PREEMPT
> 923476322029 CPU=23 PID=128 COMM=ksoftirqd/23 DEQUEUE AFTER PREEMPTED 41us
> 923476322150 CPU=23 PID=49870 COMM=top EXECUTE AFTER WAITED 41us
> 923479726879 CPU=23 PID=49870 COMM=top DEQUEUE AFTER EXECUTED 3404us
> ----------------------------
>
> This could be helpful on debugging the competition on CPU
> resource, to find out who has stolen the CPU and how much
> it stolen.
>
> It can also tracing the syscall by append option -s.
>
> Signed-off-by: Michael Wang <[email protected]>
> ---
I haven't looked through the implementation thoroughly yet. But I have a few
general remarks.
This looks like a useful and generic tool. I think it will get most
attention and be most useful if it becomes part of the BCC tools. There is
already a set of generic tools that use libbpf and CO-RE, see [0]. It
feels like this belongs there.
Some of the annoying parts (e.g., syscall name translation) are already
generalized as part of the syscount tool PR (to be hopefully merged soon),
so you'll be able to save quite a lot of code with this. There is also
a common build infra that takes care of things like vmlinux.h, which
would provide definitions for all those xxx_args structs that you had
to manually define.
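As a rough illustration (not taken from the patch; the handler name is made
up, and it assumes the generated vmlinux.h names the sched_switch context
struct trace_event_raw_sched_switch, as recent kernels do), a tracepoint
handler could then be written without any hand-rolled arg structs:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Untested sketch, assumptions as noted in the text above. */
SEC("tracepoint/sched/sched_switch")
int handle_sched_switch(struct trace_event_raw_sched_switch *ctx)
{
	/* Fields are read straight from the BTF-derived definition,
	 * no hand-written sched_switch_args struct is needed.
	 */
	bpf_printk("prev=%d next=%d", ctx->prev_pid, ctx->next_pid);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";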
With CO-RE, you will also be able to compile this tool once and run it on
many different kernels without recompilation. Please do take a look
and submit a PR there; it will be a good addition to the toolkit (and
will force you to write a bit of a README explaining the use of this tool as
well ;).
As for the code itself, I haven't gone through it much, but please
convert the map definition syntax to the BTF-defined one. The one you are
using is a legacy one. Thanks!
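For reference, a minimal untested sketch of what e.g. the user_info_maps
declaration from the patch could look like in BTF-defined form (using the
usual __uint/__type macros from bpf_helpers.h):

/* Untested sketch: BTF-defined equivalent of the legacy
 * bpf_map_def user_info_maps declaration above.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct user_info);
} user_info_maps SEC(".maps");

libbpf then creates the map from its BTF description instead of the
legacy bpf_map_def layout.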
[0] https://github.com/iovisor/bcc/tree/master/libbpf-tools
> samples/bpf/Makefile | 3 +
> samples/bpf/task_detector.h | 382 +++++++++++++++++++++++++++++++++++++++
> samples/bpf/task_detector_kern.c | 329 +++++++++++++++++++++++++++++++++
> samples/bpf/task_detector_user.c | 314 ++++++++++++++++++++++++++++++++
> 4 files changed, 1028 insertions(+)
> create mode 100644 samples/bpf/task_detector.h
> create mode 100644 samples/bpf/task_detector_kern.c
> create mode 100644 samples/bpf/task_detector_user.c
>
[...]
Hi, Andrii
Thanks for your comments :-)
On 2020/5/28 2:36 PM, Andrii Nakryiko wrote:
[snip]
>> ---
>
> I haven't looked through implementation thoroughly yet. But I have few
> general remarks.
>
> This looks like a useful and generic tool. I think it will get most
> attention and be most useful if it will be part of BCC tools. There is
> already a set of generic tools that use libbpf and CO-RE, see [0]. It
> feels like this belongs there.
>
> Some of the annoying parts (e.g., syscall name translation) is already
> generalized as part of syscount tool PR (to be hopefully merged soon),
> so you'll be able to save quite a lot of code with this. There is also
> a common build infra that takes care of things like vmlinux.h, which
> would provide definitions for all those xxx_args structs that you had
> to manually define.
>
> With CO-RE, it also will allow to compile this tool once and run it on
> many different kernels without recompilation. Please do take a look
> and submit a PR there, it will be a good addition to the toolkit (and
> will force you write a bit of README explaining use of this tool as
> well ;).
Aha, I used to think bcc only supported python and cpp :-P
I'll try to rework it and submit a PR. I'm glad to know that you consider
this a helpful tool; we have already solved some tough issues with it.
>
> As for the code itself, I haven't gone through it much, but please
> convert map definition syntax to BTF-defined one. The one you are
> using is a legacy one. Thanks!
>
> [0] https://github.com/iovisor/bcc/tree/master/libbpf-tools
Will check the example there :-)
Regards,
Michael Wang
>
>> samples/bpf/Makefile | 3 +
>> samples/bpf/task_detector.h | 382 +++++++++++++++++++++++++++++++++++++++
>> samples/bpf/task_detector_kern.c | 329 +++++++++++++++++++++++++++++++++
>> samples/bpf/task_detector_user.c | 314 ++++++++++++++++++++++++++++++++
>> 4 files changed, 1028 insertions(+)
>> create mode 100644 samples/bpf/task_detector.h
>> create mode 100644 samples/bpf/task_detector_kern.c
>> create mode 100644 samples/bpf/task_detector_user.c
>>
>
> [...]
>
On Thu, May 28, 2020 at 1:14 AM 王贇 <[email protected]> wrote:
>
> Hi, Andrii
>
> Thanks for your comments :-)
>
> On 2020/5/28 下午2:36, Andrii Nakryiko wrote:
> [snip]
> >> ---
> >
> > I haven't looked through implementation thoroughly yet. But I have few
> > general remarks.
> >
> > This looks like a useful and generic tool. I think it will get most
> > attention and be most useful if it will be part of BCC tools. There is
> > already a set of generic tools that use libbpf and CO-RE, see [0]. It
> > feels like this belongs there.
> >
> > Some of the annoying parts (e.g., syscall name translation) is already
> > generalized as part of syscount tool PR (to be hopefully merged soon),
> > so you'll be able to save quite a lot of code with this. There is also
> > a common build infra that takes care of things like vmlinux.h, which
> > would provide definitions for all those xxx_args structs that you had
> > to manually define.
> >
> > With CO-RE, it also will allow to compile this tool once and run it on
> > many different kernels without recompilation. Please do take a look
> > and submit a PR there, it will be a good addition to the toolkit (and
> > will force you write a bit of README explaining use of this tool as
> > well ;).
>
> Aha, I used to think bcc only support python and cpp :-P
>
libbpf-tools don't use BCC at all, they are just co-located with BCC
and BCC tools in the same repository and are lightweight alternatives
to BCC-based tools. But they need a kernel with BTF built in, which is
the only (temporary) downside.
> I'll try to rework it and submit PR, I'm glad to know that you think
> this tool as a helpful one, we do solved some tough issue with it
> already.
>
> >
> > As for the code itself, I haven't gone through it much, but please
> > convert map definition syntax to BTF-defined one. The one you are
> > using is a legacy one. Thanks!
> >
> > [0] https://github.com/iovisor/bcc/tree/master/libbpf-tools
>
> Will check the example there :-)
>
> Regards,
> Michael Wang
>
> >
> >> samples/bpf/Makefile | 3 +
> >> samples/bpf/task_detector.h | 382 +++++++++++++++++++++++++++++++++++++++
> >> samples/bpf/task_detector_kern.c | 329 +++++++++++++++++++++++++++++++++
> >> samples/bpf/task_detector_user.c | 314 ++++++++++++++++++++++++++++++++
> >> 4 files changed, 1028 insertions(+)
> >> create mode 100644 samples/bpf/task_detector.h
> >> create mode 100644 samples/bpf/task_detector_kern.c
> >> create mode 100644 samples/bpf/task_detector_user.c
> >>
> >
> > [...]
> >
On 2020/5/29 2:34 AM, Andrii Nakryiko wrote:
[snip]
>>>
>>> With CO-RE, it also will allow to compile this tool once and run it on
>>> many different kernels without recompilation. Please do take a look
>>> and submit a PR there, it will be a good addition to the toolkit (and
>>> will force you write a bit of README explaining use of this tool as
>>> well ;).
>>
>> Aha, I used to think bcc only support python and cpp :-P
>>
>
> libbpf-tools don't use BCC at all, they are just co-located with BCC
> and BCC tools in the same repository and are lightweight alternatives
> to BCC-based tools. But it needs kernel with BTF built-in, which is
> the only (temporary) downside.
I see, thanks for the explanation :-)
We prefer libbpf since we feel it's more friendly for kernel folks to
develop with, and there are fewer suspects when we hit any BUG, while BCC
is a good way to package and distribute; glad to know that we can have
both in one place now.
Regards,
Michael Wang
>
>> I'll try to rework it and submit PR, I'm glad to know that you think
>> this tool as a helpful one, we do solved some tough issue with it
>> already.
>>
>>>
>>> As for the code itself, I haven't gone through it much, but please
>>> convert map definition syntax to BTF-defined one. The one you are
>>> using is a legacy one. Thanks!
>>>
>>> [0] https://github.com/iovisor/bcc/tree/master/libbpf-tools
>>
>> Will check the example there :-)
>>
>> Regards,
>> Michael Wang
>>
>>>
>>>> samples/bpf/Makefile | 3 +
>>>> samples/bpf/task_detector.h | 382 +++++++++++++++++++++++++++++++++++++++
>>>> samples/bpf/task_detector_kern.c | 329 +++++++++++++++++++++++++++++++++
>>>> samples/bpf/task_detector_user.c | 314 ++++++++++++++++++++++++++++++++
>>>> 4 files changed, 1028 insertions(+)
>>>> create mode 100644 samples/bpf/task_detector.h
>>>> create mode 100644 samples/bpf/task_detector_kern.c
>>>> create mode 100644 samples/bpf/task_detector_user.c
>>>>
>>>
>>> [...]
>>>