Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1763490AbYARRkc (ORCPT ); Fri, 18 Jan 2008 12:40:32 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1760006AbYARRkY (ORCPT ); Fri, 18 Jan 2008 12:40:24 -0500 Received: from pentafluge.infradead.org ([213.146.154.40]:47984 "EHLO pentafluge.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1759516AbYARRkX (ORCPT ); Fri, 18 Jan 2008 12:40:23 -0500 Date: Fri, 18 Jan 2008 09:39:43 -0800 From: Arjan van de Ven To: Arjan van de Ven Cc: Linux Kernel Mailing List , Linus Torvalds , Ingo Molnar , Andrew Morton Subject: [patch 1/3] LatencyTOP infrastructure patch Message-ID: <20080118093943.0048d0f7@laptopd505.fenrus.org> In-Reply-To: <4790E3A6.7060807@linux.intel.com> References: <4790E3A6.7060807@linux.intel.com> Organization: Intel X-Mailer: Claws Mail 3.2.0 (GTK+ 2.12.3; i386-redhat-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit X-Bad-Reply: References and In-Reply-To but no 'Re:' in Subject. Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 17018 Lines: 586 This patch is the core LatencyTOP kernel infrastructure; it measures latencies in the scheduler and tracks it system wide and per process --- fs/proc/base.c | 61 +++++++++ include/linux/latencytop.h | 62 +++++++++ include/linux/sched.h | 6 kernel/Makefile | 1 kernel/fork.c | 1 kernel/latencytop.c | 287 +++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_fair.c | 8 + lib/Kconfig.debug | 11 + 8 files changed, 436 insertions(+), 1 deletion(-) Index: linux-2.6.24-rc7/include/linux/latencytop.h =================================================================== --- /dev/null +++ linux-2.6.24-rc7/include/linux/latencytop.h @@ -0,0 +1,62 @@ +/* + * latencytop.h: Infrastructure for displaying latency + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + */ + +#ifndef _INCLUDE_GUARD_LATENCYTOP_H_ +#define _INCLUDE_GUARD_LATENCYTOP_H_ + +#ifdef CONFIG_LATENCYTOP + +#define LT_ARGUMENTSIZE 24 +#define LT_SAVECOUNT 32 + +struct latency_entry { + char *reason; + char argument[LT_ARGUMENTSIZE]; + int maxtime; /* ignore latencies above this time as + "user requested" to deal with select() */ +}; + +struct latency_record { + char *reason; + char argument[LT_ARGUMENTSIZE]; + unsigned int count; + unsigned long time; + unsigned long max; +}; + + +struct task_struct; + +struct latency_entry *set_latency_reason(char *reason, + struct latency_entry *entry); +struct latency_entry *set_latency_reason_user(char *reason, + struct latency_entry *entry, unsigned int max); +struct latency_entry *set_latency_reason_param(char *reason, + const char *param, struct latency_entry *entry); +void restore_latency_reason(struct latency_entry *entry); +void account_scheduler_latency(struct task_struct *task, int usecs, int inter); + +void clear_all_latency_tracing(struct task_struct *p); +#else +struct latency_entry {}; +static inline struct latency_entry *set_latency_reason(char *reason, + struct latency_entry *entry) { return NULL; } +static inline struct latency_entry *set_latency_reason_user(char *reason, + struct latency_entry *entry, unsigned int max) + { return NULL; } +static inline struct latency_entry *set_latency_reason_param(char *reason, + const char *param, struct latency_entry *entry) + { return NULL; } +static inline void restore_latency_reason(struct latency_entry *entry) { } +static inline void account_scheduler_latency(struct task_struct *task, + int usecs, int inter) {}; +static inline void clear_all_latency_tracing(struct task_struct *p) {}; + +#endif + +#endif Index: linux-2.6.24-rc7/include/linux/sched.h =================================================================== --- linux-2.6.24-rc7.orig/include/linux/sched.h +++ linux-2.6.24-rc7/include/linux/sched.h @@ -88,6 +88,7 @@ struct sched_param { #include #include #include +#include #include @@ -1178,6 +1179,11 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; +#ifdef CONFIG_LATENCYTOP + struct latency_entry latency_reason; + int latency_record_count; + struct latency_record latency_record[LT_SAVECOUNT]; +#endif }; /* Index: linux-2.6.24-rc7/kernel/fork.c =================================================================== --- linux-2.6.24-rc7.orig/kernel/fork.c +++ linux-2.6.24-rc7/kernel/fork.c @@ -1196,6 +1196,7 @@ static struct task_struct *copy_process( #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif + clear_all_latency_tracing(p); /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ Index: linux-2.6.24-rc7/kernel/latencytop.c =================================================================== --- /dev/null +++ linux-2.6.24-rc7/kernel/latencytop.c @@ -0,0 +1,287 @@ +/* + * latencytop.c: Latency display infrastructure + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(latency_lock); + +#define MAXLR 128 +static struct latency_record latency_record[MAXLR]; + +struct latency_entry *set_latency_reason(char *reason, + struct latency_entry *entry) +{ + unsigned long flags; + spin_lock_irqsave(&latency_lock, flags); + + *entry = current->latency_reason; + if (current->latency_reason.reason) { + spin_unlock_irqrestore(&latency_lock, flags); + return entry; + } + + current->latency_reason.reason = reason; + current->latency_reason.argument[0] = 0; + current->latency_reason.maxtime = 0; + spin_unlock_irqrestore(&latency_lock, flags); + return entry; +} + +struct latency_entry *set_latency_reason_user(char *reason, + struct latency_entry *entry, unsigned int max) +{ + unsigned long flags; + spin_lock_irqsave(&latency_lock, flags); + + *entry = current->latency_reason; + if (current->latency_reason.reason) { + spin_unlock_irqrestore(&latency_lock, flags); + return entry; + } + + current->latency_reason.reason = reason; + current->latency_reason.argument[0] = 0; + current->latency_reason.maxtime = max; + spin_unlock_irqrestore(&latency_lock, flags); + return entry; +} + +struct latency_entry *set_latency_reason_param(char *reason, + char const *param, struct latency_entry *entry) +{ + unsigned long flags; + spin_lock_irqsave(&latency_lock, flags); + + *entry = current->latency_reason; + if (current->latency_reason.reason) { + spin_unlock_irqrestore(&latency_lock, flags); + return entry; + } + + current->latency_reason.reason = reason; + current->latency_reason.maxtime = 0; + if (param) + strncpy(current->latency_reason.argument, param, 23); + else + current->latency_reason.argument[0] = 0; + spin_unlock_irqrestore(&latency_lock, flags); + return entry; +} +EXPORT_SYMBOL(set_latency_reason); + +void restore_latency_reason(struct latency_entry *entry) +{ + unsigned long flags; + spin_lock_irqsave(&latency_lock, flags); + current->latency_reason = *entry; + spin_unlock_irqrestore(&latency_lock, flags); +} +EXPORT_SYMBOL(restore_latency_reason); + +void clear_all_latency_tracing(struct task_struct *p) +{ + unsigned long flags; + spin_lock_irqsave(&latency_lock, flags); + memset(&p->latency_reason, 0, sizeof(p->latency_reason)); + memset(&p->latency_record, 0, sizeof(p->latency_record)); + p->latency_record_count = 0; + spin_unlock_irqrestore(&latency_lock, flags); +} + +static void clear_global_latency_tracing(void) +{ + unsigned long flags; + spin_lock_irqsave(&latency_lock, flags); + memset(&latency_record, 0, sizeof(latency_record)); + spin_unlock_irqrestore(&latency_lock, flags); +} + +static void __sched +account_global_scheduler_latency(struct task_struct *tsk, int usecs) +{ + int i; + int firstnonnull = MAXLR + 1; + + if (!tsk->latency_reason.reason) + return; + + /* skip kernel threads for now */ + if (!tsk->mm) + return; + + for (i = 0; i < MAXLR; i++) { + /* nothing stored */ + if (!latency_record[i].reason) { + if (firstnonnull > i) + firstnonnull = i; + continue; + } + /* mismatch on the reason */ + if (strcmp(latency_record[i].reason, + tsk->latency_reason.reason)) + continue; + /* mismatch on the argument */ + if (strcmp(latency_record[i].argument, + tsk->latency_reason.argument)) + continue; + latency_record[i].count++; + latency_record[i].time += usecs; + if (usecs > latency_record[i].max) + latency_record[i].max = usecs; + return; + } + + i = firstnonnull; + if (i >= MAXLR - 1) + return; + + /* allocted a new one */ + latency_record[i].reason = tsk->latency_reason.reason; + strcpy(latency_record[i].argument, tsk->latency_reason.argument); + latency_record[i].count = 1; + latency_record[i].time = usecs; + latency_record[i].max = usecs; +} + +void __sched account_scheduler_latency(struct task_struct *tsk, + int usecs, int inter) +{ + int i; + unsigned long flags; + + /* long interruptible waits are generally user requested.. */ + if (inter && usecs > 5000) + return; + + /* no reason for the latency... bail for now. Later we'll + * store a backtrace item instead + */ + if (!tsk->latency_reason.reason && tsk->latency_record_count > 15) + return; + + /* if we have a maxtime set (as is the case for select and such), + and we're waiting longer than that, assume it's a user requested + delay and don't log it */ + if (tsk->latency_reason.maxtime && + usecs > tsk->latency_reason.maxtime) + return; + + spin_lock_irqsave(&latency_lock, flags); + if (!tsk->latency_reason.reason) { + static char str[KSYM_NAME_LEN]; + unsigned long EIPV = get_wchan(tsk); + sprint_symbol(str, EIPV); + tsk->latency_reason.reason = "Unknown reason"; + strncpy(tsk->latency_reason.argument, str, 23); + } + + account_global_scheduler_latency(tsk, usecs); + + /* check if the reason is already in the table */ + for (i = 0; i < LT_SAVECOUNT; i++) { + /* nothing stored */ + if (!tsk->latency_record[i].reason) + continue; + /* mismatch on the reason */ + if (strcmp(tsk->latency_record[i].reason, + tsk->latency_reason.reason)) + continue; + /* mismatch on the argument */ + if (strcmp(tsk->latency_record[i].argument, + tsk->latency_reason.argument)) + continue; + tsk->latency_record[i].count++; + tsk->latency_record[i].time += usecs; + if (usecs > tsk->latency_record[i].max) + tsk->latency_record[i].max = usecs; + spin_unlock_irqrestore(&latency_lock, flags); + return; + } + /* short term hack; if we're > 32 we stop; future we recycle */ + tsk->latency_record_count++; + if (tsk->latency_record_count >= LT_SAVECOUNT) { + spin_unlock_irqrestore(&latency_lock, flags); + return; + } + /* allocted a new one */ + i = tsk->latency_record_count; + tsk->latency_record[i].reason = tsk->latency_reason.reason; + strcpy(tsk->latency_record[i].argument, tsk->latency_reason.argument); + tsk->latency_record[i].count = 1; + tsk->latency_record[i].time = usecs; + tsk->latency_record[i].max = usecs; + spin_unlock_irqrestore(&latency_lock, flags); + +} + + + +static int lstats_show(struct seq_file *m, void *v) +{ + int i; + seq_puts(m, "Latency Top version : v0.1\n"); + + for (i = 0; i < MAXLR; i++) { + if (latency_record[i].reason) + seq_printf(m, "%i %li %li %s (%s)\n", + latency_record[i].count, + latency_record[i].time, + latency_record[i].max, + latency_record[i].reason, + latency_record[i].argument); + + } + return 0; +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + clear_global_latency_tracing(); + return count; +} + +static int lstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, lstats_show, NULL); +} + +static struct file_operations lstats_fops = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_lstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("latency_stats", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &lstats_fops; + + return 0; +} +__initcall(init_lstats_procfs); Index: linux-2.6.24-rc7/kernel/Makefile =================================================================== --- linux-2.6.24-rc7.orig/kernel/Makefile +++ linux-2.6.24-rc7/kernel/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_LATENCYTOP) += latencytop.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is Index: linux-2.6.24-rc7/kernel/sched_fair.c =================================================================== --- linux-2.6.24-rc7.orig/kernel/sched_fair.c +++ linux-2.6.24-rc7/kernel/sched_fair.c @@ -20,6 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ +#include + /* * Targeted preemption latency for CPU-bound tasks: * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) @@ -434,6 +436,7 @@ static void enqueue_sleeper(struct cfs_r #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -443,9 +446,12 @@ static void enqueue_sleeper(struct cfs_r se->sleep_start = 0; se->sum_sleep_runtime += delta; + + account_scheduler_latency(tsk, delta >> 10, 1); } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -462,11 +468,11 @@ static void enqueue_sleeper(struct cfs_r * time that the task spent sleeping: */ if (unlikely(prof_on == SLEEP_PROFILING)) { - struct task_struct *tsk = task_of(se); profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), delta >> 20); } + account_scheduler_latency(tsk, delta >> 10, 0); } #endif } Index: linux-2.6.24-rc7/fs/proc/base.c =================================================================== --- linux-2.6.24-rc7.orig/fs/proc/base.c +++ linux-2.6.24-rc7/fs/proc/base.c @@ -310,6 +310,60 @@ static int proc_pid_schedstat(struct tas } #endif +#ifdef CONFIG_LATENCYTOP +static int lstats_show_proc(struct seq_file *m, void *v) +{ + int i; + struct task_struct *task = m->private; + seq_puts(m, "Latency Top version : v0.1\n"); + + for (i = 0; i < 32; i++) { + if (task->latency_record[i].reason) + seq_printf(m, "%i %li %li %s (%s)\n", + task->latency_record[i].count, + task->latency_record[i].time, + task->latency_record[i].max, + task->latency_record[i].reason, + task->latency_record[i].argument); + + } + return 0; +} + +static int lstats_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + struct task_struct *task = get_proc_task(inode); + ret = single_open(file, lstats_show_proc, NULL); + if (!ret) { + m = file->private_data; + m->private = task; + } + return ret; +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + struct seq_file *m; + struct task_struct *task; + m = file->private_data; + task = m->private; + clear_all_latency_tracing(task); + return count; +} + +static const struct file_operations proc_lstats_operations = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + /* The badness from the OOM killer */ unsigned long badness(struct task_struct *p, unsigned long uptime); static int proc_oom_score(struct task_struct *task, char *buffer) @@ -1020,6 +1074,7 @@ static const struct file_operations proc }; #endif + #ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: @@ -2230,6 +2285,9 @@ static const struct pid_entry tgid_base_ #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, lstats), +#endif #ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif @@ -2555,6 +2613,9 @@ static const struct pid_entry tid_base_s #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, lstats), +#endif #ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif Index: linux-2.6.24-rc7/lib/Kconfig.debug =================================================================== --- linux-2.6.24-rc7.orig/lib/Kconfig.debug +++ linux-2.6.24-rc7/lib/Kconfig.debug @@ -517,4 +517,15 @@ config FAULT_INJECTION_STACKTRACE_FILTER help Provide stacktrace filter for fault-injection capabilities +config LATENCYTOP + bool "Latency measuring infrastructure" + select FRAME_POINTER if !X86 && !MIPS + select KALLSYMS + select KALLSYMS_ALL + depends on SCHEDSTATS + help + Enable this option if you want to use the LatencyTOP tool + to find out which userspace is blocking on what kernel operations. + + source "samples/Kconfig" -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/