Date: Fri, 18 Jan 2008 09:39:43 -0800
From: Arjan van de Ven <arjan@linux.intel.com>
To: Arjan van de Ven <arjan@linux.intel.com>
Cc: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
       Linus Torvalds <torvalds@linux-foundation.org>,
       Ingo Molnar <mingo@elte.hu>, Andrew Morton <akpm@linux-foundation.org>
Subject: [patch 1/3] LatencyTOP infrastructure patch
Message-ID: <20080118093943.0048d0f7@laptopd505.fenrus.org>
In-Reply-To: <4790E3A6.7060807@linux.intel.com>
References: <4790E3A6.7060807@linux.intel.com>
Organization: Intel
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 17018
Lines: 586

This patch is the core LatencyTOP kernel infrastructure;
it measures latencies in the scheduler and tracks it system
wide and per process
---
 fs/proc/base.c             |   61 +++++++++
 include/linux/latencytop.h |   62 +++++++++
 include/linux/sched.h      |    6 
 kernel/Makefile            |    1 
 kernel/fork.c              |    1 
 kernel/latencytop.c        |  287 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_fair.c        |    8 +
 lib/Kconfig.debug          |   11 +
 8 files changed, 436 insertions(+), 1 deletion(-)

Index: linux-2.6.24-rc7/include/linux/latencytop.h
===================================================================
--- /dev/null
+++ linux-2.6.24-rc7/include/linux/latencytop.h
@@ -0,0 +1,62 @@
+/*
+ * latencytop.h: Infrastructure for displaying latency
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ */
+
+#ifndef _INCLUDE_GUARD_LATENCYTOP_H_
+#define _INCLUDE_GUARD_LATENCYTOP_H_
+
+#ifdef CONFIG_LATENCYTOP
+
+#define LT_ARGUMENTSIZE 24
+#define LT_SAVECOUNT 32
+
+struct latency_entry {
+	char	*reason;
+	char	argument[LT_ARGUMENTSIZE];
+	int	maxtime;         /* ignore latencies above this time as
+				    "user requested" to deal with select() */
+};
+
+struct latency_record {
+	char 	*reason;
+	char	argument[LT_ARGUMENTSIZE];
+	unsigned int count;
+	unsigned long time;
+	unsigned long max;
+};
+
+
+struct task_struct;
+
+struct latency_entry *set_latency_reason(char *reason,
+			struct latency_entry *entry);
+struct latency_entry *set_latency_reason_user(char *reason,
+			struct latency_entry *entry, unsigned int max);
+struct latency_entry *set_latency_reason_param(char *reason,
+			const char *param, struct latency_entry *entry);
+void restore_latency_reason(struct latency_entry *entry);
+void account_scheduler_latency(struct task_struct *task, int usecs, int inter);
+
+void clear_all_latency_tracing(struct task_struct *p);
+#else
+struct latency_entry {};
+static inline struct latency_entry *set_latency_reason(char *reason,
+			struct latency_entry *entry) { return NULL; }
+static inline struct latency_entry *set_latency_reason_user(char *reason,
+			struct latency_entry *entry, unsigned int max)
+				{ return NULL; }
+static inline struct latency_entry *set_latency_reason_param(char *reason,
+			const char *param, struct latency_entry *entry)
+				{ return NULL; }
+static inline void restore_latency_reason(struct latency_entry *entry) { }
+static inline void account_scheduler_latency(struct task_struct *task,
+		int usecs, int inter) {};
+static inline void clear_all_latency_tracing(struct task_struct *p) {};
+
+#endif
+
+#endif
Index: linux-2.6.24-rc7/include/linux/sched.h
===================================================================
--- linux-2.6.24-rc7.orig/include/linux/sched.h
+++ linux-2.6.24-rc7/include/linux/sched.h
@@ -88,6 +88,7 @@ struct sched_param {
 #include <linux/hrtimer.h>
 #include <linux/task_io_accounting.h>
 #include <linux/kobject.h>
+#include <linux/latencytop.h>
 
 #include <asm/processor.h>
 
@@ -1178,6 +1179,11 @@ struct task_struct {
 	int make_it_fail;
 #endif
 	struct prop_local_single dirties;
+#ifdef CONFIG_LATENCYTOP
+	struct latency_entry latency_reason;
+	int latency_record_count;
+	struct latency_record latency_record[LT_SAVECOUNT];
+#endif
 };
 
 /*
Index: linux-2.6.24-rc7/kernel/fork.c
===================================================================
--- linux-2.6.24-rc7.orig/kernel/fork.c
+++ linux-2.6.24-rc7/kernel/fork.c
@@ -1196,6 +1196,7 @@ static struct task_struct *copy_process(
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 #endif
+	clear_all_latency_tracing(p);
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
Index: linux-2.6.24-rc7/kernel/latencytop.c
===================================================================
--- /dev/null
+++ linux-2.6.24-rc7/kernel/latencytop.c
@@ -0,0 +1,287 @@
+/*
+ * latencytop.c: Latency display infrastructure
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/latencytop.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+static DEFINE_SPINLOCK(latency_lock);
+
+#define MAXLR 128
+static struct latency_record latency_record[MAXLR];
+
+struct latency_entry *set_latency_reason(char *reason,
+				struct latency_entry *entry)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&latency_lock, flags);
+
+	*entry = current->latency_reason;
+	if (current->latency_reason.reason) {
+		spin_unlock_irqrestore(&latency_lock, flags);
+		return entry;
+	}
+
+	current->latency_reason.reason = reason;
+	current->latency_reason.argument[0] = 0;
+	current->latency_reason.maxtime = 0;
+	spin_unlock_irqrestore(&latency_lock, flags);
+	return entry;
+}
+
+struct latency_entry *set_latency_reason_user(char *reason,
+			struct latency_entry *entry, unsigned int max)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&latency_lock, flags);
+
+	*entry = current->latency_reason;
+	if (current->latency_reason.reason) {
+		spin_unlock_irqrestore(&latency_lock, flags);
+		return entry;
+	}
+
+	current->latency_reason.reason = reason;
+	current->latency_reason.argument[0] = 0;
+	current->latency_reason.maxtime = max;
+	spin_unlock_irqrestore(&latency_lock, flags);
+	return entry;
+}
+
+struct latency_entry *set_latency_reason_param(char *reason,
+			char const *param, struct latency_entry *entry)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&latency_lock, flags);
+
+	*entry = current->latency_reason;
+	if (current->latency_reason.reason) {
+		spin_unlock_irqrestore(&latency_lock, flags);
+		return entry;
+	}
+
+	current->latency_reason.reason = reason;
+	current->latency_reason.maxtime = 0;
+	if (param)
+		strncpy(current->latency_reason.argument, param, 23);
+	else
+		current->latency_reason.argument[0] = 0;
+	spin_unlock_irqrestore(&latency_lock, flags);
+	return entry;
+}
+EXPORT_SYMBOL(set_latency_reason);
+
+void restore_latency_reason(struct latency_entry *entry)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&latency_lock, flags);
+	current->latency_reason = *entry;
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+EXPORT_SYMBOL(restore_latency_reason);
+
+void clear_all_latency_tracing(struct task_struct *p)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&latency_lock, flags);
+	memset(&p->latency_reason, 0, sizeof(p->latency_reason));
+	memset(&p->latency_record, 0, sizeof(p->latency_record));
+	p->latency_record_count = 0;
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void clear_global_latency_tracing(void)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&latency_lock, flags);
+	memset(&latency_record, 0, sizeof(latency_record));
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void __sched
+account_global_scheduler_latency(struct task_struct *tsk, int usecs)
+{
+	int i;
+	int firstnonnull = MAXLR + 1;
+
+	if (!tsk->latency_reason.reason)
+		return;
+
+	/* skip kernel threads for now */
+	if (!tsk->mm)
+		return;
+
+	for (i = 0; i < MAXLR; i++) {
+		/* nothing stored */
+		if (!latency_record[i].reason) {
+			if (firstnonnull > i)
+				firstnonnull = i;
+			continue;
+		}
+		/* mismatch on the reason */
+		if (strcmp(latency_record[i].reason,
+				tsk->latency_reason.reason))
+			continue;
+		/* mismatch on the argument */
+		if (strcmp(latency_record[i].argument,
+				tsk->latency_reason.argument))
+			continue;
+		latency_record[i].count++;
+		latency_record[i].time += usecs;
+		if (usecs > latency_record[i].max)
+			latency_record[i].max = usecs;
+		return;
+	}
+
+	i = firstnonnull;
+	if (i >= MAXLR - 1)
+		return;
+
+	/* allocted a new one */
+	latency_record[i].reason = tsk->latency_reason.reason;
+	strcpy(latency_record[i].argument, tsk->latency_reason.argument);
+	latency_record[i].count = 1;
+	latency_record[i].time = usecs;
+	latency_record[i].max = usecs;
+}
+
+void __sched account_scheduler_latency(struct task_struct *tsk,
+						int usecs, int inter)
+{
+	int i;
+	unsigned long flags;
+
+	/* long interruptible waits are generally user requested.. */
+	if (inter && usecs > 5000)
+		return;
+
+	/* no reason for the latency... bail for now. Later we'll
+	 * store a backtrace item instead
+	 */
+	if (!tsk->latency_reason.reason && tsk->latency_record_count > 15)
+		return;
+
+	/* if we have a maxtime set (as is the case for select and such),
+	   and we're waiting longer than that, assume it's a user requested
+	   delay and don't log it */
+	if (tsk->latency_reason.maxtime &&
+		usecs > tsk->latency_reason.maxtime)
+		return;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	if (!tsk->latency_reason.reason) {
+		static char str[KSYM_NAME_LEN];
+		unsigned long EIPV = get_wchan(tsk);
+		sprint_symbol(str, EIPV);
+		tsk->latency_reason.reason = "Unknown reason";
+		strncpy(tsk->latency_reason.argument, str, 23);
+	}
+
+	account_global_scheduler_latency(tsk, usecs);
+
+	/* check if the reason is already in the table */
+	for (i = 0; i < LT_SAVECOUNT; i++) {
+		/* nothing stored */
+		if (!tsk->latency_record[i].reason)
+			continue;
+		/* mismatch on the reason */
+		if (strcmp(tsk->latency_record[i].reason,
+				tsk->latency_reason.reason))
+			continue;
+		/* mismatch on the argument */
+		if (strcmp(tsk->latency_record[i].argument,
+				tsk->latency_reason.argument))
+			continue;
+		tsk->latency_record[i].count++;
+		tsk->latency_record[i].time += usecs;
+		if (usecs > tsk->latency_record[i].max)
+			tsk->latency_record[i].max = usecs;
+		spin_unlock_irqrestore(&latency_lock, flags);
+		return;
+	}
+	/* short term hack; if we're > 32 we stop; future we recycle */
+	tsk->latency_record_count++;
+	if (tsk->latency_record_count >= LT_SAVECOUNT) {
+		spin_unlock_irqrestore(&latency_lock, flags);
+		return;
+	}
+	/* allocted a new one */
+	i = tsk->latency_record_count;
+	tsk->latency_record[i].reason = tsk->latency_reason.reason;
+	strcpy(tsk->latency_record[i].argument, tsk->latency_reason.argument);
+	tsk->latency_record[i].count = 1;
+	tsk->latency_record[i].time = usecs;
+	tsk->latency_record[i].max = usecs;
+	spin_unlock_irqrestore(&latency_lock, flags);
+
+}
+
+
+
+static int lstats_show(struct seq_file *m, void *v)
+{
+	int i;
+	seq_puts(m, "Latency Top version : v0.1\n");
+
+	for (i = 0; i < MAXLR; i++) {
+		if (latency_record[i].reason)
+			seq_printf(m, "%i %li %li %s (%s)\n",
+				latency_record[i].count,
+				latency_record[i].time,
+				latency_record[i].max,
+				latency_record[i].reason,
+				latency_record[i].argument);
+
+	}
+	return 0;
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offs)
+{
+	clear_global_latency_tracing();
+	return count;
+}
+
+static int lstats_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, lstats_show, NULL);
+}
+
+static struct file_operations lstats_fops = {
+	.open		= lstats_open,
+	.read		= seq_read,
+	.write		= lstats_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init init_lstats_procfs(void)
+{
+	struct proc_dir_entry *pe;
+
+	pe = create_proc_entry("latency_stats", 0644, NULL);
+	if (!pe)
+		return -ENOMEM;
+
+	pe->proc_fops = &lstats_fops;
+
+	return 0;
+}
+__initcall(init_lstats_procfs);
Index: linux-2.6.24-rc7/kernel/Makefile
===================================================================
--- linux-2.6.24-rc7.orig/kernel/Makefile
+++ linux-2.6.24-rc7/kernel/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_LATENCYTOP) += latencytop.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6.24-rc7/kernel/sched_fair.c
===================================================================
--- linux-2.6.24-rc7.orig/kernel/sched_fair.c
+++ linux-2.6.24-rc7/kernel/sched_fair.c
@@ -20,6 +20,8 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  */
 
+#include <linux/latencytop.h>
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -434,6 +436,7 @@ static void enqueue_sleeper(struct cfs_r
 #ifdef CONFIG_SCHEDSTATS
 	if (se->sleep_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+		struct task_struct *tsk = task_of(se);
 
 		if ((s64)delta < 0)
 			delta = 0;
@@ -443,9 +446,12 @@ static void enqueue_sleeper(struct cfs_r
 
 		se->sleep_start = 0;
 		se->sum_sleep_runtime += delta;
+
+		account_scheduler_latency(tsk, delta >> 10, 1);
 	}
 	if (se->block_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+		struct task_struct *tsk = task_of(se);
 
 		if ((s64)delta < 0)
 			delta = 0;
@@ -462,11 +468,11 @@ static void enqueue_sleeper(struct cfs_r
 		 * time that the task spent sleeping:
 		 */
 		if (unlikely(prof_on == SLEEP_PROFILING)) {
-			struct task_struct *tsk = task_of(se);
 
 			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
 				     delta >> 20);
 		}
+		account_scheduler_latency(tsk, delta >> 10, 0);
 	}
 #endif
 }
Index: linux-2.6.24-rc7/fs/proc/base.c
===================================================================
--- linux-2.6.24-rc7.orig/fs/proc/base.c
+++ linux-2.6.24-rc7/fs/proc/base.c
@@ -310,6 +310,60 @@ static int proc_pid_schedstat(struct tas
 }
 #endif
 
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+	int i;
+	struct task_struct *task = m->private;
+	seq_puts(m, "Latency Top version : v0.1\n");
+
+	for (i = 0; i < 32; i++) {
+		if (task->latency_record[i].reason)
+			seq_printf(m, "%i %li %li %s (%s)\n",
+				task->latency_record[i].count,
+				task->latency_record[i].time,
+				task->latency_record[i].max,
+				task->latency_record[i].reason,
+				task->latency_record[i].argument);
+
+	}
+	return 0;
+}
+
+static int lstats_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct seq_file *m;
+	struct task_struct *task = get_proc_task(inode);
+	ret = single_open(file, lstats_show_proc, NULL);
+	if (!ret) {
+		m = file->private_data;
+		m->private = task;
+	}
+	return ret;
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offs)
+{
+	struct seq_file *m;
+	struct task_struct *task;
+	m = file->private_data;
+	task = m->private;
+	clear_all_latency_tracing(task);
+	return count;
+}
+
+static const struct file_operations proc_lstats_operations = {
+	.open		= lstats_open,
+	.read		= seq_read,
+	.write		= lstats_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif
+
 /* The badness from the OOM killer */
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
@@ -1020,6 +1074,7 @@ static const struct file_operations proc
 };
 #endif
 
+
 #ifdef CONFIG_SCHED_DEBUG
 /*
  * Print out various scheduling related per-task fields:
@@ -2230,6 +2285,9 @@ static const struct pid_entry tgid_base_
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat",  S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency",  S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset",     S_IRUGO, cpuset),
 #endif
@@ -2555,6 +2613,9 @@ static const struct pid_entry tid_base_s
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat", S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency",  S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset",    S_IRUGO, cpuset),
 #endif
Index: linux-2.6.24-rc7/lib/Kconfig.debug
===================================================================
--- linux-2.6.24-rc7.orig/lib/Kconfig.debug
+++ linux-2.6.24-rc7/lib/Kconfig.debug
@@ -517,4 +517,15 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 	help
 	  Provide stacktrace filter for fault-injection capabilities
 
+config LATENCYTOP
+	bool "Latency measuring infrastructure"
+	select FRAME_POINTER if !X86 && !MIPS
+	select KALLSYMS
+	select KALLSYMS_ALL
+	depends on SCHEDSTATS
+	help
+	  Enable this option if you want to use the LatencyTOP tool
+	  to find out which userspace is blocking on what kernel operations.
+
+
 source "samples/Kconfig"
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/