The following patches add accounting for the delays seen by tasks in
a) waiting for a CPU (while being runnable)
b) completion of synchronous block I/O initiated by the task
c) swapping in pages (i.e. capacity misses).
Such delays provide feedback for a task's cpu priority, io priority and
rss limit values. Long delays, especially relative to other tasks, can
be a trigger for changing a task's cpu/io priorities and modifying its
rss usage (either directly through sys_getprlimit(), which was proposed
earlier on lkml, or by throttling cpu consumption, or by having the process
call sys_setrlimit etc.)
The major changes since the previous posting of these patches are
- use of the new generic netlink interface (NETLINK_GENERIC family)
with provision for reuse by other (non-delay accounting) kernel
components
- sysctl option for turning delay accounting collection on/off
dynamically
- similar sysctl option for schedstats. Delay accounting leverages
schedstats code for cpu delays.
- dynamic allocation of delay accounting structures
More comments in individual patches. Please give feedback.
--Shailabh
Series
nstimestamp-diff.patch
schedstats-sysctl.patch
delayacct-setup.patch
delayacct-sysctl.patch
delayacct-blkio.patch
delayacct-swapin.patch
delayacct-genetlink.patch
nstimestamp-diff.patch
Add kernel utility function for measuring the
interval (diff) between two timespec values, adjusting for overflow
Signed-off-by: Shailabh Nagar <[email protected]>
include/linux/time.h | 14 ++++++++++++++
1 files changed, 14 insertions(+)
Index: linux-2.6.16-rc4/include/linux/time.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/time.h 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/time.h 2006-02-27 01:52:49.000000000 -0500
@@ -147,6 +147,20 @@ extern struct timespec ns_to_timespec(co
*/
extern struct timeval ns_to_timeval(const nsec_t nsec);
+/*
+ * timespec_diff_ns - Return difference of two timestamps in nanoseconds
+ * In the rare case of @end being earlier than @start, return zero
+ */
+static inline nsec_t timespec_diff_ns(struct timespec *start, struct timespec *end)
+{
+ nsec_t ret;
+
+ ret = (nsec_t)(end->tv_sec - start->tv_sec)*NSEC_PER_SEC;
+ ret += (nsec_t)(end->tv_nsec - start->tv_nsec);
+ if (ret < 0)
+ return 0;
+ return ret;
+}
#endif /* __KERNEL__ */
#define NFDBITS __NFDBITS
schedstats-sysctl.patch
Add sysctl option for controlling schedstats collection
dynamically. Delay accounting leverages schedstats for
cpu delay statistics.
Signed-off-by: Chandra Seetharaman <[email protected]>
Signed-off-by: Shailabh Nagar <[email protected]>
Documentation/kernel-parameters.txt | 2
include/linux/sched.h | 4 +
include/linux/sysctl.h | 1
kernel/sched.c | 74 +++++++++++++++++++++++++++++++++---
kernel/sysctl.c | 10 ++++
lib/Kconfig.debug | 6 +-
6 files changed, 90 insertions(+), 7 deletions(-)
Index: linux-2.6.16-rc4/include/linux/sched.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/sched.h 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/sched.h 2006-02-27 01:52:52.000000000 -0500
@@ -15,6 +15,7 @@
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/nodemask.h>
+#include <linux/sysctl.h>
#include <asm/system.h>
#include <asm/semaphore.h>
@@ -525,6 +526,9 @@ struct backing_dev_info;
struct reclaim_state;
#ifdef CONFIG_SCHEDSTATS
+extern int schedstats_sysctl;
+extern int schedstats_sysctl_handler(ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
struct sched_info {
/* cumulative counters */
unsigned long cpu_time, /* time spent on the cpu */
Index: linux-2.6.16-rc4/include/linux/sysctl.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/sysctl.h 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/sysctl.h 2006-02-27 01:52:52.000000000 -0500
@@ -146,6 +146,7 @@ enum
KERN_RANDOMIZE=68, /* int: randomize virtual address space */
KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */
KERN_SPIN_RETRY=70, /* int: number of spinlock retries */
+ KERN_SCHEDSTATS=71, /* int: Schedstats on/off */
};
Index: linux-2.6.16-rc4/kernel/sched.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/sched.c 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/kernel/sched.c 2006-02-27 01:52:52.000000000 -0500
@@ -382,11 +382,56 @@ static inline void task_rq_unlock(runque
}
#ifdef CONFIG_SCHEDSTATS
+
+int schedstats_sysctl = 0; /* schedstats turned off by default */
+static DEFINE_PER_CPU(int, schedstats) = 0;
+
+static void schedstats_set(int val)
+{
+ int i;
+ static spinlock_t schedstats_lock = SPIN_LOCK_UNLOCKED;
+
+ spin_lock(&schedstats_lock);
+ schedstats_sysctl = val;
+ for (i = 0; i < NR_CPUS; i++)
+ per_cpu(schedstats, i) = val;
+ spin_unlock(&schedstats_lock);
+}
+
+static int __init schedstats_setup_enable(char *str)
+{
+ schedstats_sysctl = 1;
+ schedstats_set(schedstats_sysctl);
+ return 1;
+}
+
+__setup("schedstats", schedstats_setup_enable);
+
+int schedstats_sysctl_handler(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, prev = schedstats_sysctl;
+ struct task_struct *g, *t;
+
+ ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ if ((ret != 0) || (prev == schedstats_sysctl))
+ return ret;
+ if (schedstats_sysctl) {
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ memset(&t->sched_info, 0, sizeof(t->sched_info));
+ } while_each_thread(g, t);
+ read_unlock(&tasklist_lock);
+ }
+ schedstats_set(schedstats_sysctl);
+ return ret;
+}
+
/*
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 12
+#define SCHEDSTAT_VERSION 13
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -394,6 +439,10 @@ static int show_schedstat(struct seq_fil
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
seq_printf(seq, "timestamp %lu\n", jiffies);
+ if (!schedstats_sysctl) {
+ seq_printf(seq, "State Off\n");
+ return 0;
+ }
for_each_online_cpu(cpu) {
runqueue_t *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
@@ -472,8 +521,17 @@ struct file_operations proc_schedstat_op
.release = single_release,
};
-# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
+#define schedstats_on (per_cpu(schedstats, smp_processor_id()) != 0)
+#define schedstat_inc(rq, field) \
+do { \
+ if (unlikely(schedstats_on)) \
+ (rq)->field++; \
+} while (0)
+#define schedstat_add(rq, field, amt) \
+do { \
+ if (unlikely(schedstats_on)) \
+ (rq)->field += (amt); \
+} while (0)
#else /* !CONFIG_SCHEDSTATS */
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
@@ -556,7 +614,7 @@ static void sched_info_arrive(task_t *t)
*/
static inline void sched_info_queued(task_t *t)
{
- if (!t->sched_info.last_queued)
+ if (unlikely(schedstats_on && !t->sched_info.last_queued))
t->sched_info.last_queued = jiffies;
}
@@ -580,7 +638,7 @@ static inline void sched_info_depart(tas
* their time slice. (This may also be called when switching to or from
* the idle task.) We are only called when prev != next.
*/
-static inline void sched_info_switch(task_t *prev, task_t *next)
+static inline void __sched_info_switch(task_t *prev, task_t *next)
{
struct runqueue *rq = task_rq(prev);
@@ -595,6 +653,12 @@ static inline void sched_info_switch(tas
if (next != rq->idle)
sched_info_arrive(next);
}
+
+static inline void sched_info_switch(task_t *prev, task_t *next)
+{
+ if (unlikely(schedstats_on))
+ __sched_info_switch(prev, next);
+}
#else
#define sched_info_queued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
Index: linux-2.6.16-rc4/kernel/sysctl.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/sysctl.c 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/kernel/sysctl.c 2006-02-27 01:52:52.000000000 -0500
@@ -656,6 +656,16 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#if defined(CONFIG_SCHEDSTATS)
+ {
+ .ctl_name = KERN_SCHEDSTATS,
+ .procname = "schedstats",
+ .data = &schedstats_sysctl,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &schedstats_sysctl_handler,
+ },
+#endif
{ .ctl_name = 0 }
};
Index: linux-2.6.16-rc4/lib/Kconfig.debug
===================================================================
--- linux-2.6.16-rc4.orig/lib/Kconfig.debug 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/lib/Kconfig.debug 2006-02-27 01:52:52.000000000 -0500
@@ -67,15 +67,17 @@ config DETECT_SOFTLOCKUP
config SCHEDSTATS
bool "Collect scheduler statistics"
- depends on DEBUG_KERNEL && PROC_FS
+ depends on PROC_FS
help
If you say Y here, additional code will be inserted into the
scheduler and related routines to collect statistics about
scheduler behavior and provide them in /proc/schedstat. These
- stats may be useful for both tuning and debugging the scheduler
+ stats may be useful for both tuning and debugging the scheduler.
If you aren't debugging the scheduler or trying to tune a specific
application, you can say N to avoid the very slight overhead
this adds.
+ Schedstats collection, and most of its overhead, can also be
+ controlled dynamically through the schedstats sysctl.
config DEBUG_SLAB
bool "Debug memory allocations"
Index: linux-2.6.16-rc4/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.16-rc4.orig/Documentation/kernel-parameters.txt 2006-02-27 01:19:52.000000000 -0500
+++ linux-2.6.16-rc4/Documentation/kernel-parameters.txt 2006-02-27 01:52:52.000000000 -0500
@@ -1333,6 +1333,8 @@ running once the system is up.
sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver
Format: <io>[,<timeout>[,<isapnp>]]
+ schedstats [KNL] Collect CPU scheduler statistics
+
scsi_debug_*= [SCSI]
See drivers/scsi/scsi_debug.c.
delayacct-setup.patch
Initialization code related to collection of per-task "delay"
statistics which measure how long it had to wait for cpu,
sync block io, swapping etc. The collection of statistics and
the interface are in other patches. This patch sets up the data
structures and allows the statistics collection to be disabled
through a kernel boot parameter.
Signed-off-by: Shailabh Nagar <[email protected]>
Documentation/kernel-parameters.txt | 2 +
include/linux/delayacct.h | 55 ++++++++++++++++++++++++++++++
include/linux/sched.h | 15 ++++++++
init/Kconfig | 13 +++++++
init/main.c | 2 +
kernel/Makefile | 1
kernel/delayacct.c | 65 ++++++++++++++++++++++++++++++++++++
kernel/exit.c | 3 +
kernel/fork.c | 2 +
9 files changed, 158 insertions(+)
Index: linux-2.6.16-rc4/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.16-rc4.orig/Documentation/kernel-parameters.txt 2006-02-27 01:52:52.000000000 -0500
+++ linux-2.6.16-rc4/Documentation/kernel-parameters.txt 2006-02-27 01:52:54.000000000 -0500
@@ -410,6 +410,8 @@ running once the system is up.
Format: <area>[,<node>]
See also Documentation/networking/decnet.txt.
+ delayacct [KNL] Enable per-task delay accounting
+
devfs= [DEVFS]
See Documentation/filesystems/devfs/boot-options.
Index: linux-2.6.16-rc4/kernel/Makefile
===================================================================
--- linux-2.6.16-rc4.orig/kernel/Makefile 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/kernel/Makefile 2006-02-27 01:52:54.000000000 -0500
@@ -34,6 +34,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
Index: linux-2.6.16-rc4/include/linux/delayacct.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-rc4/include/linux/delayacct.h 2006-02-27 01:52:54.000000000 -0500
@@ -0,0 +1,55 @@
+/* delayacct.h - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef _LINUX_TASKDELAYS_H
+#define _LINUX_TASKDELAYS_H
+
+#include <linux/sched.h>
+
+#ifdef CONFIG_TASK_DELAY_ACCT
+extern int delayacct_on; /* Delay accounting turned on/off */
+extern kmem_cache_t *delayacct_cache;
+extern int delayacct_init(void);
+extern void __delayacct_tsk_init(struct task_struct *);
+extern void __delayacct_tsk_exit(struct task_struct *);
+
+static inline void delayacct_tsk_init(struct task_struct *tsk)
+{
+ /* reinitialize in case parent's non-null pointer was dup'ed*/
+ tsk->delays = NULL;
+ if (unlikely(delayacct_on))
+ __delayacct_tsk_init(tsk);
+}
+
+static inline void delayacct_tsk_exit(struct task_struct *tsk)
+{
+ if (unlikely(tsk->delays))
+ __delayacct_tsk_exit(tsk);
+}
+
+static inline void delayacct_timestamp_start(void)
+{
+ if (unlikely(current->delays && delayacct_on))
+ do_posix_clock_monotonic_gettime(¤t->delays->start);
+}
+#else
+static inline void delayacct_tsk_init(struct task_struct *tsk)
+{}
+static inline void delayacct_tsk_exit(struct task_struct *tsk)
+{}
+static inline void delayacct_timestamp_start(void)
+{}
+static inline int delayacct_init(void)
+{}
+#endif /* CONFIG_TASK_DELAY_ACCT */
+#endif /* _LINUX_TASKDELAYS_H */
Index: linux-2.6.16-rc4/include/linux/sched.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/sched.h 2006-02-27 01:52:52.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/sched.h 2006-02-27 01:52:54.000000000 -0500
@@ -543,6 +543,18 @@ struct sched_info {
extern struct file_operations proc_schedstat_operations;
#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+struct task_delay_info {
+ spinlock_t lock;
+
+ /* timestamp recording variables (to reduce stack usage) */
+ struct timespec start, end;
+
+ /* Add stats in pairs: u64 delay, u32 count, aligned properly */
+};
+#endif
+
+
enum idle_type
{
SCHED_IDLE,
@@ -874,6 +886,9 @@ struct task_struct {
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
struct rcu_head rcu;
+#ifdef CONFIG_TASK_DELAY_ACCT
+ struct task_delay_info *delays;
+#endif
};
static inline pid_t process_group(struct task_struct *tsk)
Index: linux-2.6.16-rc4/init/Kconfig
===================================================================
--- linux-2.6.16-rc4.orig/init/Kconfig 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/init/Kconfig 2006-02-27 01:52:54.000000000 -0500
@@ -150,6 +150,19 @@ config BSD_PROCESS_ACCT_V3
for processing it. A preliminary version of these tools is available
at <http://www.physik3.uni-rostock.de/tim/kernel/utils/acct/>.
+config TASK_DELAY_ACCT
+ bool "Enable per-task delay accounting (EXPERIMENTAL)"
+ help
+ Collect information on time spent by a task waiting for system
+ resources like cpu, synchronous block I/O completion and swapping
+ in pages. Such statistics can help in setting a task's priorities
+ relative to other tasks for cpu, io, rss limits etc.
+
+ Unlike BSD process accounting, this information is available
+ continuously during the lifetime of a task.
+
+ Say N if unsure.
+
config SYSCTL
bool "Sysctl support"
---help---
Index: linux-2.6.16-rc4/init/main.c
===================================================================
--- linux-2.6.16-rc4.orig/init/main.c 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/init/main.c 2006-02-27 01:52:54.000000000 -0500
@@ -47,6 +47,7 @@
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
+#include <linux/delayacct.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -537,6 +538,7 @@ asmlinkage void __init start_kernel(void
proc_root_init();
#endif
cpuset_init();
+ delayacct_init();
check_bugs();
Index: linux-2.6.16-rc4/kernel/delayacct.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-rc4/kernel/delayacct.c 2006-02-27 01:52:54.000000000 -0500
@@ -0,0 +1,65 @@
+/* delayacct.c - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sysctl.h>
+#include <linux/delayacct.h>
+
+int delayacct_on = 0; /* Delay accounting turned on/off */
+kmem_cache_t *delayacct_cache;
+
+static int __init delayacct_setup_enable(char *str)
+{
+ delayacct_on = 1;
+ return 1;
+}
+__setup("delayacct", delayacct_setup_enable);
+
+int delayacct_init(void)
+{
+ delayacct_cache = kmem_cache_create("delayacct_cache",
+ sizeof(struct task_delay_info),
+ 0,
+ SLAB_PANIC,
+ NULL, NULL);
+ if (!delayacct_cache)
+ return -ENOMEM;
+ delayacct_tsk_init(&init_task);
+ return 0;
+}
+
+void __delayacct_tsk_init(struct task_struct *tsk)
+{
+ tsk->delays = kmem_cache_alloc(delayacct_cache, SLAB_KERNEL);
+ if (tsk->delays) {
+ memset(tsk->delays, 0, sizeof(*tsk->delays));
+ spin_lock_init(&tsk->delays->lock);
+ }
+}
+
+void __delayacct_tsk_exit(struct task_struct *tsk)
+{
+ kmem_cache_free(delayacct_cache, tsk->delays);
+ tsk->delays = NULL;
+}
+
+static inline nsec_t delayacct_measure(void)
+{
+ if ((current->delays->start.tv_sec == 0) &&
+ (current->delays->start.tv_nsec == 0))
+ return -EINVAL;
+ do_posix_clock_monotonic_gettime(¤t->delays->end);
+ return timespec_diff_ns(¤t->delays->start, ¤t->delays->end);
+}
Index: linux-2.6.16-rc4/kernel/fork.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/fork.c 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/kernel/fork.c 2006-02-27 01:52:54.000000000 -0500
@@ -44,6 +44,7 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/cn_proc.h>
+#include <linux/delayacct.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -970,6 +971,7 @@ static task_t *copy_process(unsigned lon
goto bad_fork_cleanup_put_domain;
p->did_exec = 0;
+ delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
p->pid = pid;
retval = -EFAULT;
Index: linux-2.6.16-rc4/kernel/exit.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/exit.c 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/kernel/exit.c 2006-02-27 01:52:54.000000000 -0500
@@ -31,6 +31,7 @@
#include <linux/signal.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
+#include <linux/delayacct.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -839,6 +840,8 @@ fastcall NORET_TYPE void do_exit(long co
preempt_count());
acct_update_integrals(tsk);
+ delayacct_tsk_exit(tsk);
+
if (tsk->mm) {
update_hiwater_rss(tsk->mm);
update_hiwater_vm(tsk->mm);
delayacct-sysctl.patch
Adds a sysctl to turn delay accounting on/off dynamically.
(defaults to off). When turning it off, the struct task_delay_info
associated with each task needs to be cleared. When turning it
on, tasks without a struct task_delay_info need to have one
allocated.
Signed-off-by: Shailabh Nagar <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
Signed-off-by: Srivatsa Vaddagiri <[email protected]>
include/linux/delayacct.h | 12 +++-
include/linux/sysctl.h | 1
kernel/delayacct.c | 128 ++++++++++++++++++++++++++++++++++++++++++++--
kernel/fork.c | 3 -
kernel/sysctl.c | 11 +++
5 files changed, 147 insertions(+), 8 deletions(-)
Index: linux-2.6.16-rc4/include/linux/delayacct.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/delayacct.h 2006-02-27 01:52:54.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/delayacct.h 2006-02-27 01:52:56.000000000 -0500
@@ -15,18 +15,24 @@
#define _LINUX_TASKDELAYS_H
#include <linux/sched.h>
+#include <linux/sysctl.h>
#ifdef CONFIG_TASK_DELAY_ACCT
extern int delayacct_on; /* Delay accounting turned on/off */
extern kmem_cache_t *delayacct_cache;
+int delayacct_sysctl_handler(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
extern int delayacct_init(void);
extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
-static inline void delayacct_tsk_init(struct task_struct *tsk)
+static inline void delayacct_tsk_early_init(struct task_struct *tsk)
{
- /* reinitialize in case parent's non-null pointer was dup'ed*/
tsk->delays = NULL;
+}
+
+static inline void delayacct_tsk_init(struct task_struct *tsk)
+{
if (unlikely(delayacct_on))
__delayacct_tsk_init(tsk);
}
@@ -43,6 +49,8 @@ static inline void delayacct_timestamp_s
do_posix_clock_monotonic_gettime(¤t->delays->start);
}
#else
+static inline void delayacct_tsk_early_init(struct task_struct *tsk)
+{}
static inline void delayacct_tsk_init(struct task_struct *tsk)
{}
static inline void delayacct_tsk_exit(struct task_struct *tsk)
Index: linux-2.6.16-rc4/include/linux/sysctl.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/sysctl.h 2006-02-27 01:52:52.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/sysctl.h 2006-02-27 01:52:56.000000000 -0500
@@ -147,6 +147,7 @@ enum
KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */
KERN_SPIN_RETRY=70, /* int: number of spinlock retries */
KERN_SCHEDSTATS=71, /* int: Schedstats on/off */
+ KERN_DELAYACCT=74, /* int: Per-task delay accounting on/off */
};
Index: linux-2.6.16-rc4/kernel/delayacct.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/delayacct.c 2006-02-27 01:52:54.000000000 -0500
+++ linux-2.6.16-rc4/kernel/delayacct.c 2006-02-27 01:52:56.000000000 -0500
@@ -1,6 +1,7 @@
/* delayacct.c - per-task delay accounting
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
@@ -42,17 +43,30 @@ int delayacct_init(void)
void __delayacct_tsk_init(struct task_struct *tsk)
{
- tsk->delays = kmem_cache_alloc(delayacct_cache, SLAB_KERNEL);
- if (tsk->delays) {
+ struct task_delay_info *delays = NULL;
+
+ delays = kmem_cache_alloc(delayacct_cache, SLAB_KERNEL);
+ if (!delays)
+ return;
+
+ task_lock(tsk);
+ if (!tsk->delays) {
+ tsk->delays = delays;
memset(tsk->delays, 0, sizeof(*tsk->delays));
spin_lock_init(&tsk->delays->lock);
- }
+ } else
+ kmem_cache_free(delayacct_cache, delays);
+ task_unlock(tsk);
}
void __delayacct_tsk_exit(struct task_struct *tsk)
{
- kmem_cache_free(delayacct_cache, tsk->delays);
- tsk->delays = NULL;
+ task_lock(tsk);
+ if (tsk->delays) {
+ kmem_cache_free(delayacct_cache, tsk->delays);
+ tsk->delays = NULL;
+ }
+ task_unlock(tsk);
}
static inline nsec_t delayacct_measure(void)
@@ -63,3 +77,107 @@ static inline nsec_t delayacct_measure(v
do_posix_clock_monotonic_gettime(¤t->delays->end);
return timespec_diff_ns(¤t->delays->start, ¤t->delays->end);
}
+
+/* Allocate task_delay_info for all tasks without one */
+static int alloc_delays(void)
+{
+ int cnt=0, i, j;
+ struct task_struct *g, *t;
+ struct task_delay_info **delayp;
+ int err = 0;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t)
+ if (!t->delays && !(t->flags & (PF_EXITING | PF_DEAD)))
+ cnt++;
+ while_each_thread(g, t);
+ read_unlock(&tasklist_lock);
+
+ if (!cnt)
+ return 0;
+retry_allocs:
+
+ delayp = kmalloc(cnt *sizeof(struct task_delay_info *), GFP_KERNEL);
+ if (!delayp)
+ return -ENOMEM;
+ for (i = 0; i < cnt; i++) {
+ delayp[i] = kmem_cache_alloc(delayacct_cache, SLAB_KERNEL);
+ if (!delayp[i]) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memset(delayp[i], 0, sizeof(*delayp[i]));
+ spin_lock_init(&delayp[i]->lock);
+ }
+
+ i--;
+ j = 0;
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ task_lock(t);
+ if (t->delays) {
+ task_unlock(t);
+ continue;
+ }
+ /* Did some additional unaccounted tasks get created */
+ if (i < 0) {
+ j++;
+ task_unlock(t);
+ continue;
+ }
+ if (!(t->flags & (PF_EXITING | PF_DEAD))) {
+ t->delays = delayp[i--];
+ }
+ task_unlock(t);
+ } while_each_thread(g, t);
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Retry allocations for all tasks created in between the two
+ * tasklist_locks
+ */
+ if (j > 0) {
+ kfree(delayp);
+ cnt = j;
+ goto retry_allocs;
+ }
+out:
+ while (i >= 0)
+ kmem_cache_free(delayacct_cache, delayp[i--]);
+ kfree(delayp);
+ return err;
+}
+
+/* Reset task_delay_info structs for all tasks */
+static void reset_delays(void)
+{
+ struct task_struct *g, *t;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (!t->delays)
+ continue;
+ memset(t->delays, 0, sizeof(struct task_delay_info));
+ spin_lock_init(&t->delays->lock);
+ } while_each_thread(g, t);
+ read_unlock(&tasklist_lock);
+}
+
+int delayacct_sysctl_handler(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, prev;
+
+ prev = delayacct_on;
+ ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ if (ret || (prev == delayacct_on))
+ return ret;
+
+ if (delayacct_on)
+ ret = alloc_delays();
+ else
+ reset_delays();
+ if (ret)
+ delayacct_on = prev;
+ return ret;
+}
Index: linux-2.6.16-rc4/kernel/fork.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/fork.c 2006-02-27 01:52:54.000000000 -0500
+++ linux-2.6.16-rc4/kernel/fork.c 2006-02-27 01:52:56.000000000 -0500
@@ -971,7 +971,6 @@ static task_t *copy_process(unsigned lon
goto bad_fork_cleanup_put_domain;
p->did_exec = 0;
- delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
p->pid = pid;
retval = -EFAULT;
@@ -1013,6 +1012,7 @@ static task_t *copy_process(unsigned lon
p->io_wait = NULL;
p->audit_context = NULL;
cpuset_fork(p);
+ delayacct_tsk_early_init(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_copy(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
@@ -1191,6 +1191,7 @@ static task_t *copy_process(unsigned lon
total_forks++;
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
+ delayacct_tsk_init(p);
proc_fork_connector(p);
return p;
Index: linux-2.6.16-rc4/kernel/sysctl.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/sysctl.c 2006-02-27 01:52:52.000000000 -0500
+++ linux-2.6.16-rc4/kernel/sysctl.c 2006-02-27 01:52:56.000000000 -0500
@@ -44,6 +44,7 @@
#include <linux/limits.h>
#include <linux/dcache.h>
#include <linux/syscalls.h>
+#include <linux/delayacct.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -666,6 +667,16 @@ static ctl_table kern_table[] = {
.proc_handler = &schedstats_sysctl_handler,
},
#endif
+#if defined(CONFIG_TASK_DELAY_ACCT)
+ {
+ .ctl_name = KERN_DELAYACCT,
+ .procname = "delayacct",
+ .data = &delayacct_on,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &delayacct_sysctl_handler,
+ },
+#endif
{ .ctl_name = 0 }
};
delayacct-blkio.patch
Record time spent by a task waiting for completion of
userspace initiated synchronous block I/O. This can help
determine the right I/O priority for the task.
Signed-off-by: Shailabh Nagar <[email protected]>
fs/buffer.c | 3 +++
fs/read_write.c | 5 ++++-
include/linux/delayacct.h | 8 ++++++++
include/linux/sched.h | 2 ++
kernel/delayacct.c | 14 ++++++++++++++
mm/filemap.c | 5 ++++-
mm/memory.c | 8 +++++++-
7 files changed, 42 insertions(+), 3 deletions(-)
Index: linux-2.6.16-rc4/include/linux/delayacct.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/delayacct.h 2006-02-27 01:52:56.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/delayacct.h 2006-02-27 01:52:59.000000000 -0500
@@ -25,6 +25,7 @@ int delayacct_sysctl_handler(ctl_table *
extern int delayacct_init(void);
extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
+extern void __delayacct_blkio(void);
static inline void delayacct_tsk_early_init(struct task_struct *tsk)
{
@@ -48,6 +49,11 @@ static inline void delayacct_timestamp_s
if (unlikely(current->delays && delayacct_on))
do_posix_clock_monotonic_gettime(¤t->delays->start);
}
+static inline void delayacct_blkio(void)
+{
+ if (unlikely(current->delays && delayacct_on))
+ __delayacct_blkio();
+}
#else
static inline void delayacct_tsk_early_init(struct task_struct *tsk)
{}
@@ -57,6 +63,8 @@ static inline void delayacct_tsk_exit(st
{}
static inline void delayacct_timestamp_start(void)
{}
+static inline void delayacct_blkio(void)
+{}
static inline int delayacct_init(void)
{}
#endif /* CONFIG_TASK_DELAY_ACCT */
Index: linux-2.6.16-rc4/kernel/delayacct.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/delayacct.c 2006-02-27 01:52:56.000000000 -0500
+++ linux-2.6.16-rc4/kernel/delayacct.c 2006-02-27 01:52:59.000000000 -0500
@@ -78,6 +78,20 @@ static inline nsec_t delayacct_measure(v
return timespec_diff_ns(¤t->delays->start, ¤t->delays->end);
}
+void __delayacct_blkio(void)
+{
+ nsec_t delay;
+
+ delay = delayacct_measure();
+ if (delay < 0)
+ return;
+
+ spin_lock(¤t->delays->lock);
+ current->delays->blkio_delay += delay;
+ current->delays->blkio_count++;
+ spin_unlock(¤t->delays->lock);
+}
+
/* Allocate task_delay_info for all tasks without one */
static int alloc_delays(void)
{
Index: linux-2.6.16-rc4/fs/buffer.c
===================================================================
--- linux-2.6.16-rc4.orig/fs/buffer.c 2006-02-27 01:20:01.000000000 -0500
+++ linux-2.6.16-rc4/fs/buffer.c 2006-02-27 01:52:59.000000000 -0500
@@ -42,6 +42,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <linux/delayacct.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void invalidate_bh_lrus(void);
@@ -344,6 +345,7 @@ static long do_fsync(unsigned int fd, in
goto out_putf;
}
+ delayacct_timestamp_start();
mapping = file->f_mapping;
current->flags |= PF_SYNCWRITE;
@@ -366,6 +368,7 @@ static long do_fsync(unsigned int fd, in
out_putf:
fput(file);
out:
+ delayacct_blkio();
return ret;
}
Index: linux-2.6.16-rc4/fs/read_write.c
===================================================================
--- linux-2.6.16-rc4.orig/fs/read_write.c 2006-02-27 01:20:02.000000000 -0500
+++ linux-2.6.16-rc4/fs/read_write.c 2006-02-27 01:52:59.000000000 -0500
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
+#include <linux/delayacct.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -236,8 +237,10 @@ ssize_t do_sync_read(struct file *filp,
(ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos)))
wait_on_retry_sync_kiocb(&kiocb);
- if (-EIOCBQUEUED == ret)
+ if (-EIOCBQUEUED == ret) {
+ delayacct_timestamp_start();
ret = wait_on_sync_kiocb(&kiocb);
+ }
*ppos = kiocb.ki_pos;
return ret;
}
Index: linux-2.6.16-rc4/mm/filemap.c
===================================================================
--- linux-2.6.16-rc4.orig/mm/filemap.c 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/mm/filemap.c 2006-02-27 01:52:59.000000000 -0500
@@ -29,6 +29,7 @@
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/delayacct.h>
#include "filemap.h"
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
@@ -1084,8 +1085,10 @@ generic_file_read(struct file *filp, cha
init_sync_kiocb(&kiocb, filp);
ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
- if (-EIOCBQUEUED == ret)
+ if (-EIOCBQUEUED == ret) {
+ delayacct_timestamp_start();
ret = wait_on_sync_kiocb(&kiocb);
+ }
return ret;
}
Index: linux-2.6.16-rc4/mm/memory.c
===================================================================
--- linux-2.6.16-rc4.orig/mm/memory.c 2006-02-27 01:20:04.000000000 -0500
+++ linux-2.6.16-rc4/mm/memory.c 2006-02-27 01:52:59.000000000 -0500
@@ -48,6 +48,7 @@
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/delayacct.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -2208,12 +2209,17 @@ static inline int handle_pte_fault(struc
old_entry = entry = *pte;
if (!pte_present(entry)) {
+ delayacct_timestamp_start();
if (pte_none(entry)) {
+ int ret;
if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, address,
pte, pmd, write_access);
- return do_no_page(mm, vma, address,
+ ret = do_no_page(mm, vma, address,
pte, pmd, write_access);
+ if (vma->vm_file)
+ delayacct_blkio();
+ return ret;
}
if (pte_file(entry))
return do_file_page(mm, vma, address,
Index: linux-2.6.16-rc4/include/linux/sched.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/sched.h 2006-02-27 01:52:54.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/sched.h 2006-02-27 01:52:59.000000000 -0500
@@ -551,6 +551,8 @@ struct task_delay_info {
struct timespec start, end;
/* Add stats in pairs: u64 delay, u32 count, aligned properly */
+ u64 blkio_delay; /* wait for sync block io completion */
+ u32 blkio_count;
};
#endif
delayacct-swapin.patch
Record time spent by a task waiting for its pages to be swapped in.
This statistic can help in adjusting the rss limits of
tasks (process), especially relative to each other, when the system is
under memory pressure.
Signed-off-by: Shailabh Nagar <[email protected]>
include/linux/delayacct.h | 8 ++++++++
include/linux/sched.h | 2 ++
kernel/delayacct.c | 14 ++++++++++++++
mm/memory.c | 7 +++++--
4 files changed, 29 insertions(+), 2 deletions(-)
Index: linux-2.6.16-rc4/include/linux/delayacct.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/delayacct.h 2006-02-27 01:52:59.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/delayacct.h 2006-02-27 01:53:01.000000000 -0500
@@ -26,6 +26,7 @@ extern int delayacct_init(void);
extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio(void);
+extern void __delayacct_swapin(void);
static inline void delayacct_tsk_early_init(struct task_struct *tsk)
{
@@ -54,6 +55,11 @@ static inline void delayacct_blkio(void)
if (unlikely(current->delays && delayacct_on))
__delayacct_blkio();
}
+static inline void delayacct_swapin(void)
+{
+ if (unlikely(current->delays && delayacct_on))
+ __delayacct_swapin();
+}
#else
static inline void delayacct_tsk_early_init(struct task_struct *tsk)
{}
@@ -65,6 +71,8 @@ static inline void delayacct_timestamp_s
{}
static inline void delayacct_blkio(void)
{}
+static inline void delayacct_swapin(void)
+{}
static inline int delayacct_init(void)
{}
#endif /* CONFIG_TASK_DELAY_ACCT */
Index: linux-2.6.16-rc4/include/linux/sched.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/sched.h 2006-02-27 01:52:59.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/sched.h 2006-02-27 01:53:01.000000000 -0500
@@ -552,7 +552,9 @@ struct task_delay_info {
/* Add stats in pairs: u64 delay, u32 count, aligned properly */
u64 blkio_delay; /* wait for sync block io completion */
+ u64 swapin_delay; /* wait for pages to be swapped in */
u32 blkio_count;
+ u32 swapin_count;
};
#endif
Index: linux-2.6.16-rc4/kernel/delayacct.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/delayacct.c 2006-02-27 01:52:59.000000000 -0500
+++ linux-2.6.16-rc4/kernel/delayacct.c 2006-02-27 01:53:01.000000000 -0500
@@ -92,6 +92,20 @@ void __delayacct_blkio(void)
spin_unlock(&current->delays->lock);
}
+void __delayacct_swapin(void)
+{
+ nsec_t delay;
+
+ delay = delayacct_measure();
+ if (delay < 0)
+ return;
+
+ spin_lock(&current->delays->lock);
+ current->delays->swapin_delay += delay;
+ current->delays->swapin_count++;
+ spin_unlock(&current->delays->lock);
+}
+
/* Allocate task_delay_info for all tasks without one */
static int alloc_delays(void)
{
Index: linux-2.6.16-rc4/mm/memory.c
===================================================================
--- linux-2.6.16-rc4.orig/mm/memory.c 2006-02-27 01:52:59.000000000 -0500
+++ linux-2.6.16-rc4/mm/memory.c 2006-02-27 01:53:01.000000000 -0500
@@ -2209,9 +2209,10 @@ static inline int handle_pte_fault(struc
old_entry = entry = *pte;
if (!pte_present(entry)) {
+ int ret;
+
delayacct_timestamp_start();
if (pte_none(entry)) {
- int ret;
if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, address,
pte, pmd, write_access);
@@ -2224,8 +2225,10 @@ static inline int handle_pte_fault(struc
if (pte_file(entry))
return do_file_page(mm, vma, address,
pte, pmd, write_access, entry);
- return do_swap_page(mm, vma, address,
+ ret = do_swap_page(mm, vma, address,
pte, pmd, write_access, entry);
+ delayacct_swapin();
+ return ret;
}
ptl = pte_lockptr(mm, pmd);
> +/*
> + * timespec_diff_ns - Return difference of two timestamps in nanoseconds
> + * In the rare case of @end being earlier than @start, return zero
> + */
> +static inline nsec_t timespec_diff_ns(struct timespec *start, struct timespec *end)
> +{
> + nsec_t ret;
> +
> + ret = (nsec_t)(end->tv_sec - start->tv_sec)*NSEC_PER_SEC;
> + ret += (nsec_t)(end->tv_nsec - start->tv_nsec);
> + if (ret < 0)
> + return 0;
> + return ret;
> +}
> #endif /* __KERNEL__ */
>
wouldn't it be more useful to have this return a timespec as well, and
then it'd be generically useful (and it also probably should then be
uninlined ;)
> +/* Allocate task_delay_info for all tasks without one */
> +static int alloc_delays(void)
I'm sorry but this function seems to be highly horrible
> +static inline void delayacct_blkio(void)
> +{
> + if (unlikely(current->delays && delayacct_on))
> + __delayacct_blkio();
> +}
why is this unlikely?
> + u64 blkio_delay; /* wait for sync block io completion */
this misses O_SYNC, msync(), and general throttling.
I get the feeling this is being measured at the wrong level
currently.... since the number of entry points that needs measuring at
the current level is hardly finite...
On Mon, 2006-02-27 at 03:22 -0500, Shailabh Nagar wrote:
> delayacct-swapin.patch
>
> Record time spent by a task waiting for its pages to be swapped in.
> This statistic can help in adjusting the rss limits of
> tasks (process), especially relative to each other, when the system is
> under memory pressure.
ok this poses a question: how do you deal with nested timings? Say an
O_SYNC write which internally causes a pagefault?
delayacct_timestamp_start() at minimum has to get event-type specific,
or even implement a stack of some sorts.
delayacct-genetlink.patch
Create a generic netlink interface (NETLINK_GENERIC family),
called "taskstats", for getting delay and cpu statistics of
tasks and thread groups during their lifetime and when they exit.
The cpu stats are available only if CONFIG_SCHEDSTATS is enabled.
When a task is alive, userspace can get its stats by sending a
command containing its pid. Sending a tgid returns the sum of stats
of the tasks belonging to that tgid (where such a sum makes sense).
Together, the command interface allows stats for a large number of
tasks to be collected more efficiently than would be possible
through /proc or any per-pid interface.
The netlink interface also sends the stats for each task to userspace
when the task is exiting. This permits fine-grain accounting for
short-lived tasks, which is important if userspace is doing its own
aggregation of statistics based on some grouping of tasks
(e.g. CSA jobs, ELSA banks or CKRM classes).
If the exiting task belongs to a thread group (with more members than itself)
, the latter's delay stats are also sent out on the task's exit. This allows
userspace to get accurate data at a per-tgid level while the tid's of a tgid
are exiting one by one.
The interface has been deliberately kept distinct from the delay
accounting code since it is potentially usable by other kernel components
that need to export per-pid/tgid data. The format of data returned to
userspace is versioned and the command interface easily extensible to
facilitate reuse.
If reuse is not deemed useful enough, the naming, placement of functions
and config options will be modified to make this an interface for delay
accounting alone.
Signed-off-by: Shailabh Nagar <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
include/linux/delayacct.h | 2
include/linux/taskstats.h | 125 +++++++++++++++++++++
init/Kconfig | 16 ++
kernel/Makefile | 1
kernel/delayacct.c | 49 ++++++++
kernel/taskstats.c | 266 ++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 456 insertions(+), 3 deletions(-)
Index: linux-2.6.16-rc4/include/linux/delayacct.h
===================================================================
--- linux-2.6.16-rc4.orig/include/linux/delayacct.h 2006-02-27 01:53:01.000000000 -0500
+++ linux-2.6.16-rc4/include/linux/delayacct.h 2006-02-27 01:53:03.000000000 -0500
@@ -16,6 +16,7 @@
#include <linux/sched.h>
#include <linux/sysctl.h>
+#include <linux/taskstats.h>
#ifdef CONFIG_TASK_DELAY_ACCT
extern int delayacct_on; /* Delay accounting turned on/off */
@@ -27,6 +28,7 @@ extern void __delayacct_tsk_init(struct
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio(void);
extern void __delayacct_swapin(void);
+extern int delayacct_add_tsk(struct taskstats_reply *, struct task_struct *);
static inline void delayacct_tsk_early_init(struct task_struct *tsk)
{
Index: linux-2.6.16-rc4/include/linux/taskstats.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-rc4/include/linux/taskstats.h 2006-02-27 01:53:03.000000000 -0500
@@ -0,0 +1,125 @@
+/* taskstats.h - exporting per-task statistics
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef _LINUX_TASKSTATS_H
+#define _LINUX_TASKSTATS_H
+
+/* Format for per-task data returned to userland when
+ * - a task exits
+ * - listener requests stats for a task
+ *
+ * The struct is versioned. Newer versions should only add fields to
+ * the bottom of the struct to maintain backward compatibility.
+ *
+ * To create the next version, bump up the taskstats_version variable
+ * and delineate the start of newly added fields with a comment indicating
+ * the version number.
+ */
+
+#define TASKSTATS_VERSION 1
+
+struct taskstats {
+
+ /* Version 1 */
+#define TASKSTATS_NOPID -1
+ pid_t pid;
+ pid_t tgid;
+
+ /* XXX_count is number of delay values recorded.
+ * XXX_total is corresponding cumulative delay in nanoseconds
+ */
+
+#define TASKSTATS_NOCPUSTATS 1
+ __u64 cpu_count;
+ __u64 cpu_delay_total; /* wait, while runnable, for cpu */
+ __u64 blkio_count;
+ __u64 blkio_delay_total; /* sync,block io completion wait*/
+ __u64 swapin_count;
+ __u64 swapin_delay_total; /* swapin page fault wait*/
+
+ __u64 cpu_run_total; /* cpu running time
+ * no count available/provided */
+};
+
+
+#define TASKSTATS_LISTEN_GROUP 0x1
+
+/*
+ * Commands sent from userspace
+ * Not versioned. New commands should only be inserted at the enum's end
+ */
+
+enum {
+ TASKSTATS_CMD_UNSPEC, /* Reserved */
+ TASKSTATS_CMD_NONE, /* Not a valid cmd to send
+ * Marks data sent on task/tgid exit */
+ TASKSTATS_CMD_LISTEN, /* Start listening */
+ TASKSTATS_CMD_IGNORE, /* Stop listening */
+ TASKSTATS_CMD_PID, /* Send stats for a pid */
+ TASKSTATS_CMD_TGID, /* Send stats for a tgid */
+};
+
+/* Parameters for commands
+ * New parameters should only be inserted at the struct's end
+ */
+
+struct taskstats_cmd_param {
+ union {
+ pid_t pid;
+ pid_t tgid;
+ } id;
+};
+
+/*
+ * Reply sent from kernel
+ * Version number affects size/format of struct taskstats only
+ */
+
+struct taskstats_reply {
+ enum outtype {
+ TASKSTATS_REPLY_NONE = 1, /* Control cmd response */
+ TASKSTATS_REPLY_PID, /* per-pid data cmd response*/
+ TASKSTATS_REPLY_TGID, /* per-tgid data cmd response*/
+ TASKSTATS_REPLY_EXIT_PID, /* Exiting task's stats */
+ TASKSTATS_REPLY_EXIT_TGID, /* Exiting tgid's stats
+ * (sent on each tid's exit) */
+ } outtype;
+ __u32 version;
+ __u32 err;
+ struct taskstats stats; /* Invalid if err != 0 */
+};
+
+/* NETLINK_GENERIC related info */
+
+#define TASKSTATS_GENL_NAME "TASKSTATS"
+#define TASKSTATS_GENL_VERSION 0x1
+/* Following must be > NLMSG_MIN_TYPE */
+#define TASKSTATS_GENL_ID 0x123
+
+#define TASKSTATS_HDRLEN (NLMSG_SPACE(GENL_HDRLEN))
+#define TASKSTATS_BODYLEN (sizeof(struct taskstats_reply))
+
+#ifdef __KERNEL__
+
+#include <linux/sched.h>
+
+#ifdef CONFIG_TASKSTATS
+extern void taskstats_exit_pid(struct task_struct *);
+#else
+static inline void taskstats_exit_pid(struct task_struct *tsk)
+{}
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_TASKSTATS_H */
Index: linux-2.6.16-rc4/kernel/Makefile
===================================================================
--- linux-2.6.16-rc4.orig/kernel/Makefile 2006-02-27 01:52:54.000000000 -0500
+++ linux-2.6.16-rc4/kernel/Makefile 2006-02-27 01:53:03.000000000 -0500
@@ -35,6 +35,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
Index: linux-2.6.16-rc4/kernel/taskstats.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-rc4/kernel/taskstats.c 2006-02-27 01:53:03.000000000 -0500
@@ -0,0 +1,266 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats.h>
+#include <linux/delayacct.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+const int taskstats_version = TASKSTATS_VERSION;
+static atomic_t taskstats_group_listeners = ATOMIC_INIT(0);
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+
+static struct genl_family family = {
+// .id = GENL_ID_GENERATE,
+ .id = TASKSTATS_GENL_ID,
+ .name = TASKSTATS_GENL_NAME,
+ .version = TASKSTATS_GENL_VERSION,
+ .hdrsize = 0,
+ .maxattr = 0,
+};
+
+#define genlmsg_data(genlhdr) ((char *)genlhdr + GENL_HDRLEN)
+
+/* Taskstat specific functions */
+
+static int taskstats_listen(struct sk_buff *skb, struct genl_info *info)
+{
+ atomic_inc(&taskstats_group_listeners);
+ return 0;
+}
+
+static int taskstats_ignore(struct sk_buff *skb, struct genl_info *info)
+{
+ atomic_dec(&taskstats_group_listeners);
+ return 0;
+}
+
+static int prepare_reply(struct genl_info *info, u8 cmd,
+ struct sk_buff **skbp, struct taskstats_reply **replyp)
+{
+ struct sk_buff *skb;
+ struct taskstats_reply *reply;
+
+ skb = nlmsg_new(TASKSTATS_HDRLEN + TASKSTATS_BODYLEN);
+ if (!skb)
+ return -ENOMEM;
+
+ if (!info) {
+ int seq = get_cpu_var(taskstats_seqnum)++;
+ put_cpu_var(taskstats_seqnum);
+
+ reply = genlmsg_put(skb, 0, seq,
+ family.id, 0, NLM_F_REQUEST,
+ cmd, family.version);
+ } else
+ reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
+ family.id, 0, info->nlhdr->nlmsg_flags,
+ info->genlhdr->cmd, family.version);
+ if (reply == NULL) {
+ nlmsg_free(skb);
+ return -EINVAL;
+ }
+ skb_put(skb, TASKSTATS_BODYLEN);
+
+ memset(reply, 0, sizeof(*reply));
+ reply->version = taskstats_version;
+ reply->err = 0;
+
+ *skbp = skb;
+ *replyp = reply;
+ return 0;
+}
+
+static int send_reply(struct sk_buff *skb, int replytype)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+ struct taskstats_reply *reply;
+
+ reply = (struct taskstats_reply *)genlmsg_data(genlhdr);
+ reply->outtype = replytype;
+
+ genlmsg_end(skb, genlhdr);
+ return genlmsg_multicast(skb, 0, TASKSTATS_LISTEN_GROUP);
+}
+
+static inline void fill_pid(struct taskstats_reply *reply, pid_t pid, struct task_struct *pidtsk)
+{
+ int rc;
+ struct task_struct *tsk = pidtsk;
+
+ if (!pidtsk) {
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid(pid);
+ if (!tsk) {
+ read_unlock(&tasklist_lock);
+ reply->err = EINVAL;
+ return;
+ }
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ } else
+ get_task_struct(tsk);
+
+ rc = delayacct_add_tsk(reply, tsk);
+ if (!rc) {
+ reply->stats.pid = tsk->pid;
+ reply->stats.tgid = tsk->tgid;
+ } else
+ reply->err = (rc < 0) ? -rc : rc ;
+
+ put_task_struct(tsk);
+}
+
+static int taskstats_send_pid(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc;
+ struct sk_buff *rep_skb;
+ struct taskstats_reply *reply;
+ struct taskstats_cmd_param *param= info->userhdr;
+
+ if (atomic_read(&taskstats_group_listeners) < 1)
+ return -EINVAL;
+
+ rc = prepare_reply(info, info->genlhdr->cmd, &rep_skb, &reply);
+ if (rc)
+ return rc;
+ fill_pid(reply, param->id.pid, NULL);
+ return send_reply(rep_skb, TASKSTATS_REPLY_PID);
+}
+
+static inline void fill_tgid(struct taskstats_reply *reply, pid_t tgid, struct task_struct *tgidtsk)
+{
+ int rc;
+ struct task_struct *tsk, *first;
+
+ first = tgidtsk;
+ read_lock(&tasklist_lock);
+ if (!first) {
+ first = find_task_by_pid(tgid);
+ if (!first) {
+ read_unlock(&tasklist_lock);
+ reply->err = EINVAL;
+ return;
+ }
+ }
+ tsk = first;
+ do {
+ rc = delayacct_add_tsk(reply, tsk);
+ if (rc)
+ break;
+ } while_each_thread(first, tsk);
+ read_unlock(&tasklist_lock);
+
+ if (!rc) {
+ reply->stats.pid = TASKSTATS_NOPID;
+ reply->stats.tgid = tgid;
+ } else
+ reply->err = (rc < 0) ? -rc : rc ;
+}
+
+static int taskstats_send_tgid(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc;
+ struct sk_buff *rep_skb;
+ struct taskstats_reply *reply;
+ struct taskstats_cmd_param *param= info->userhdr;
+
+ if (atomic_read(&taskstats_group_listeners) < 1)
+ return -EINVAL;
+
+ rc = prepare_reply(info, info->genlhdr->cmd, &rep_skb, &reply);
+ if (rc)
+ return rc;
+ fill_tgid(reply, param->id.tgid, NULL);
+ return send_reply(rep_skb, TASKSTATS_REPLY_TGID);
+}
+
+/* Send pid data out on exit */
+void taskstats_exit_pid(struct task_struct *tsk)
+{
+ int rc;
+ struct sk_buff *rep_skb;
+ struct taskstats_reply *reply;
+
+ if (atomic_read(&taskstats_group_listeners) < 1)
+ return;
+
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NONE, &rep_skb, &reply);
+ if (rc)
+ return;
+ fill_pid(reply, tsk->pid, tsk);
+ rc = send_reply(rep_skb, TASKSTATS_REPLY_EXIT_PID);
+
+ if (rc || thread_group_empty(tsk))
+ return;
+
+ /* Send tgid data too */
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NONE, &rep_skb, &reply);
+ if (rc)
+ return;
+ fill_tgid(reply, tsk->tgid, tsk);
+ send_reply(rep_skb, TASKSTATS_REPLY_EXIT_TGID);
+}
+
+static struct genl_ops listen_ops = {
+ .cmd = TASKSTATS_CMD_LISTEN,
+ .doit = taskstats_listen,
+};
+
+static struct genl_ops ignore_ops = {
+ .cmd = TASKSTATS_CMD_IGNORE,
+ .doit = taskstats_ignore,
+};
+
+static struct genl_ops pid_ops = {
+ .cmd = TASKSTATS_CMD_PID,
+ .doit = taskstats_send_pid,
+};
+
+static struct genl_ops tgid_ops = {
+ .cmd = TASKSTATS_CMD_TGID,
+ .doit = taskstats_send_tgid,
+};
+
+static int family_registered = 0;
+
+static int __init taskstats_init(void)
+{
+ if (genl_register_family(&family))
+ return -EFAULT;
+ family_registered = 1;
+
+ if (genl_register_ops(&family, &listen_ops))
+ goto err;
+ if (genl_register_ops(&family, &ignore_ops))
+ goto err;
+ if (genl_register_ops(&family, &pid_ops))
+ goto err;
+ if (genl_register_ops(&family, &tgid_ops))
+ goto err;
+
+ return 0;
+err:
+ genl_unregister_family(&family);
+ family_registered = 0;
+ return -EFAULT;
+}
+
+late_initcall(taskstats_init);
+
Index: linux-2.6.16-rc4/init/Kconfig
===================================================================
--- linux-2.6.16-rc4.orig/init/Kconfig 2006-02-27 01:52:54.000000000 -0500
+++ linux-2.6.16-rc4/init/Kconfig 2006-02-27 01:53:03.000000000 -0500
@@ -158,11 +158,21 @@ config TASK_DELAY_ACCT
in pages. Such statistics can help in setting a task's priorities
relative to other tasks for cpu, io, rss limits etc.
- Unlike BSD process accounting, this information is available
- continuously during the lifetime of a task.
-
Say N if unsure.
+config TASKSTATS
+ bool "Export task/process statistics through netlink (EXPERIMENTAL)"
+ depends on TASK_DELAY_ACCT
+ default y
+ help
+ Export selected statistics for tasks/processes through the
+ generic netlink interface. Unlike BSD process accounting, the
+ statistics are available during the lifetime of tasks/processes as
+ responses to commands. Like BSD accounting, they are sent to user
+ space on task exit.
+
+ Say Y if unsure.
+
config SYSCTL
bool "Sysctl support"
---help---
Index: linux-2.6.16-rc4/kernel/delayacct.c
===================================================================
--- linux-2.6.16-rc4.orig/kernel/delayacct.c 2006-02-27 01:53:01.000000000 -0500
+++ linux-2.6.16-rc4/kernel/delayacct.c 2006-02-27 01:53:03.000000000 -0500
@@ -17,6 +17,7 @@
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
+#include <linux/taskstats.h>
int delayacct_on = 0; /* Delay accounting turned on/off */
kmem_cache_t *delayacct_cache;
@@ -61,6 +62,7 @@ void __delayacct_tsk_init(struct task_st
void __delayacct_tsk_exit(struct task_struct *tsk)
{
+ taskstats_exit_pid(tsk);
task_lock(tsk);
if (tsk->delays) {
kmem_cache_free(delayacct_cache, tsk->delays);
@@ -209,3 +211,50 @@ int delayacct_sysctl_handler(ctl_table *
delayacct_on = prev;
return ret;
}
+
+#ifdef CONFIG_TASKSTATS
+
+int delayacct_add_tsk(struct taskstats_reply *reply, struct task_struct *tsk)
+{
+ struct taskstats *d = &reply->stats;
+ nsec_t tmp;
+ struct timespec ts;
+ unsigned long t1,t2;
+
+ if (!tsk->delays || !delayacct_on)
+ return -EINVAL;
+
+ /* zero XXX_total,non-zero XXX_count implies XXX stat overflowed */
+#ifdef CONFIG_SCHEDSTATS
+
+ tmp = (nsec_t)d->cpu_run_total ;
+ tmp += (u64)(tsk->utime+tsk->stime)*TICK_NSEC;
+ d->cpu_run_total = (tmp < (nsec_t)d->cpu_run_total)? 0:tmp;
+
+ /* No locking available for sched_info. Take snapshot first. */
+ t1 = tsk->sched_info.pcnt;
+ t2 = tsk->sched_info.run_delay;
+
+ d->cpu_count += t1;
+
+ jiffies_to_timespec(t2, &ts);
+ tmp = (nsec_t)d->cpu_delay_total + timespec_to_ns(&ts);
+ d->cpu_delay_total = (tmp < (nsec_t)d->cpu_delay_total)? 0:tmp;
+#else
+ /* Non-zero XXX_total,zero XXX_count implies XXX stat unavailable */
+ d->cpu_count = 0;
+ d->cpu_run_total = d->cpu_delay_total = TASKSTATS_NOCPUSTATS;
+#endif
+ spin_lock(&tsk->delays->lock);
+ tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+ d->blkio_delay_total = (tmp < d->blkio_delay_total)? 0:tmp;
+ tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+ d->swapin_delay_total = (tmp < d->swapin_delay_total)? 0:tmp;
+ d->blkio_count += tsk->delays->blkio_count;
+ d->swapin_count += tsk->delays->swapin_count;
+ spin_unlock(&tsk->delays->lock);
+
+ return 0;
+}
+
+#endif /* CONFIG_TASKSTATS */
Arjan van de Ven wrote:
>>+/*
>>+ * timespec_diff_ns - Return difference of two timestamps in nanoseconds
>>+ * In the rare case of @end being earlier than @start, return zero
>>+ */
>>+static inline nsec_t timespec_diff_ns(struct timespec *start, struct timespec *end)
>>+{
>>+ nsec_t ret;
>>+
>>+ ret = (nsec_t)(end->tv_sec - start->tv_sec)*NSEC_PER_SEC;
>>+ ret += (nsec_t)(end->tv_nsec - start->tv_nsec);
>>+ if (ret < 0)
>>+ return 0;
>>+ return ret;
>>+}
>> #endif /* __KERNEL__ */
>>
>>
>>
>
>wouldn't it be more useful to have this return a timespec as well, and
>then it'd be generically useful (and it also probably should then be
>uninlined ;)
>
>
Return another timespec to store the difference of two input timespecs ?
Would that be useful ?
Didn't quite get it.
--Shailabh
On Mon, 2006-02-27 at 03:34 -0500, Shailabh Nagar wrote:
> Arjan van de Ven wrote:
>
> >>+/*
> >>+ * timespec_diff_ns - Return difference of two timestamps in nanoseconds
> >>+ * In the rare case of @end being earlier than @start, return zero
> >>+ */
> >>+static inline nsec_t timespec_diff_ns(struct timespec *start, struct timespec *end)
> >>+{
> >>+ nsec_t ret;
> >>+
> >>+ ret = (nsec_t)(end->tv_sec - start->tv_sec)*NSEC_PER_SEC;
> >>+ ret += (nsec_t)(end->tv_nsec - start->tv_nsec);
> >>+ if (ret < 0)
> >>+ return 0;
> >>+ return ret;
> >>+}
> >> #endif /* __KERNEL__ */
> >>
> >>
> >>
> >
> >wouldn't it be more useful to have this return a timespec as well, and
> >then it'd be generically useful (and it also probably should then be
> >uninlined ;)
> >
> >
> Return another timespec to store the difference of two input timespecs ?
> Would that be useful ?
> Didn't quite get it.
the API is a bit crooked right now; you have 2 timespecs as a measure of
time, and you return a long as diff, rather than another timespec.
How do you know the nsec_t doesn't overflow ??? I suspect the answer is
"you don't". timespec's are a way to deal with that nicely. And it makes
the API more symmetric as well
Arjan van de Ven wrote:
>>+/* Allocate task_delay_info for all tasks without one */
>>+static int alloc_delays(void)
>>
>>
>
>I'm sorry but this function seems to be highly horrible
>
>
Could you be more specific ? Is it the way its coded or the design
(preallocate, then assign)
itself ?
The function needs to allocate task_delay_info structs for all tasks
that might
have been forked since the last time delay accounting was turned off.
Either we have to count how many such tasks there are, or preallocate
nr_tasks (as an upper bound) and then use as many as needed.
Thanks for reviewing so quickly.
-- Shailabh
On Mon, 2006-02-27 at 03:38 -0500, Shailabh Nagar wrote:
> Arjan van de Ven wrote:
>
> >>+/* Allocate task_delay_info for all tasks without one */
> >>+static int alloc_delays(void)
> >>
> >>
> >
> >I'm sorry but this function seems to be highly horrible
> >
> >
> Could you be more specific ? Is it the way its coded or the design
> (preallocate, then assign)
> itself ?
>
> The function needs to allocate task_delay_info structs for all tasks
> that might
> have been forked since the last time delay accounting was turned off.
> Either we have to count how many such tasks there are, or preallocate
> nr_tasks (as an upper bound) and then use as many as needed.
it generally feels really fragile, especially with the task enumeration
going to RCU soon. (eg you'd lose the ability to lock out new task
creation)
On first sight it looks a lot better to allocate these things on demand,
but I'm not sure how the sleeping-allocation would interact with the
places it'd need to be called...
the principle looks OK to me, just a few minor nits:
> #ifdef CONFIG_SCHEDSTATS
> +
> +int schedstats_sysctl = 0; /* schedstats turned off by default */
no need to initialize to 0.
> +static DEFINE_PER_CPU(int, schedstats) = 0;
ditto.
> +
> +static void schedstats_set(int val)
> +{
> + int i;
> + static spinlock_t schedstats_lock = SPIN_LOCK_UNLOCKED;
move spinlock out of the function and use DEFINE_SPINLOCK. (But ... see
below for suggestion to get rid of this lock altogether.)
> + spin_lock(&schedstats_lock);
> + schedstats_sysctl = val;
> + for (i = 0; i < NR_CPUS; i++)
> + per_cpu(schedstats, i) = val;
> + spin_unlock(&schedstats_lock);
> +}
> +
> +static int __init schedstats_setup_enable(char *str)
> +{
> + schedstats_sysctl = 1;
> + schedstats_set(schedstats_sysctl);
> + return 1;
> +}
> +
> +__setup("schedstats", schedstats_setup_enable);
> +
> +int schedstats_sysctl_handler(ctl_table *table, int write, struct file *filp,
> + void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> + int ret, prev = schedstats_sysctl;
> + struct task_struct *g, *t;
> +
> + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
> + if ((ret != 0) || (prev == schedstats_sysctl))
> + return ret;
> + if (schedstats_sysctl) {
> + read_lock(&tasklist_lock);
> + do_each_thread(g, t) {
> + memset(&t->sched_info, 0, sizeof(t->sched_info));
> + } while_each_thread(g, t);
> + read_unlock(&tasklist_lock);
> + }
> + schedstats_set(schedstats_sysctl);
why not just introduce a schedstats_lock mutex, and acquire it for both
the 'if (schedstats_sysctl)' line and the schedstats_set() line. That
will make the locking meaningful: two parallel sysctl ops will be atomic
to each other. [right now they wont be and they can clear schedstat data
in parallel -> not a big problem but it makes schedstats_lock rather
meaningless]
> -#define SCHEDSTAT_VERSION 12
> +#define SCHEDSTAT_VERSION 13
>
> static int show_schedstat(struct seq_file *seq, void *v)
> {
> @@ -394,6 +439,10 @@ static int show_schedstat(struct seq_fil
>
> seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
> seq_printf(seq, "timestamp %lu\n", jiffies);
> + if (!schedstats_sysctl) {
> + seq_printf(seq, "State Off\n");
> + return 0;
> + }
and show_schedstat() should then also take the schedstats_lock mutex.
Ingo
Arjan van de Ven wrote:
>On Mon, 2006-02-27 at 03:38 -0500, Shailabh Nagar wrote:
>
>
>>Arjan van de Ven wrote:
>>
>>
>>
>>>>+/* Allocate task_delay_info for all tasks without one */
>>>>+static int alloc_delays(void)
>>>>
>>>>
>>>>
>>>>
>>>I'm sorry but this function seems to be highly horrible
>>>
>>>
>>>
>>>
>>Could you be more specific ? Is it the way its coded or the design
>>(preallocate, then assign)
>>itself ?
>>
>>The function needs to allocate task_delay_info structs for all tasks
>>that might
>>have been forked since the last time delay accounting was turned off.
>>Either we have to count how many such tasks there are, or preallocate
>>nr_tasks (as an upper bound) and then use as many as needed.
>>
>>
>
>it generally feels really fragile, especially with the task enumeration
>going to RCU soon. (eg you'd lose the ability to lock out new task
>creation)
>
>
>On first sight it looks a lot better to allocate these things on demand,
>but I'm not sure how the sleeping-allocation would interact with the
>places it'd need to be called...
>
>
Yes, thats the reason why we didn't do the on-demand allocation...the
next time a task is checked
could be in any of the places where the timestamping is done. Doing the
allocation there (and incurring
the extra cost of the check even when sysctl hasn't been used) didn't
seem worthwhile, esp. when we
have a point (sysctl handler) where we can catch most of the allocs needed.
But if task enumeration is going to get more difficult, we'll need to
keep the on-demand allocation (on
next use) as a backup for tasks that weren't caught during the sysctl
change.
>
>
>
On Mon, Feb 27, 2006 at 09:42:23AM +0100, Arjan van de Ven wrote:
> On Mon, 2006-02-27 at 03:38 -0500, Shailabh Nagar wrote:
> > Arjan van de Ven wrote:
> >
> > The function needs to allocate task_delay_info structs for all tasks
> > that might
> > have been forked since the last time delay accounting was turned off.
> > Either we have to count how many such tasks there are, or preallocate
> > nr_tasks (as an upper bound) and then use as many as needed.
>
> it generally feels really fragile, especially with the task enumeration
> going to RCU soon. (eg you'd lose the ability to lock out new task
> creation)
I haven't yet seen any RCU-based code that does this. Can you point out
what patches you are talking about ? As of 2.6.16-rc4 and -rt15,
AFAICS, you can count tasks by holding the read side of tasklist_lock.
Granted it is a bit ugly to repeat this in order to overcome
the race on dropping tasklist_lock for allocation.
> On first sight it looks a lot better to allocate these things on demand,
> but I'm not sure how the sleeping-allocation would interact with the
> places it'd need to be called...
This could be a problem for contexts where sleeping cannot be permitted,
not to mention fast paths where blocking may introduce a skew. It seems
simpler to just let this happen during sysctl time.
Thanks
Dipankar
Shailabh Nagar wrote:
> The following patches add accounting for the delays seen by tasks in
> a) waiting for a CPU (while being runnable)
> b) completion of synchronous block I/O initiated by the task
> c) swapping in pages (i.e. capacity misses).
>
> Such delays provide feedback for a task's cpu priority, io priority and
> rss limit values. Long delays, especially relative to other tasks, can
> be a trigger for changing a task's cpu/io priorities and modifying its
> rss usage (either directly through sys_getprlimit() that was proposed
> earlier on lkml or by throttling cpu consumption or process calling
> sys_setrlimit etc.)
>
Can we get an idea about the actual userspace programs and algorithms
that use this feedback? Links, performance numbers, etc.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Mon, 2006-02-27 at 14:34 +0530, Dipankar Sarma wrote:
> On Mon, Feb 27, 2006 at 09:42:23AM +0100, Arjan van de Ven wrote:
> > On Mon, 2006-02-27 at 03:38 -0500, Shailabh Nagar wrote:
> > > Arjan van de Ven wrote:
> > >
> > > The function needs to allocate task_delay_info structs for all tasks
> > > that might
> > > have been forked since the last time delay accounting was turned off.
> > > Either we have to count how many such tasks there are, or preallocate
> > > nr_tasks (as an upper bound) and then use as many as needed.
> >
> > it generally feels really fragile, especially with the task enumeration
> > going to RCU soon. (eg you'd lose the ability to lock out new task
> > creation)
>
> I haven't yet seen any RCU-based code that does this. Can you point out
> what patches you are talking about ? As of 2.6.16-rc4 and -rt15,
> AFAICS, you can count tasks by holding the read side of tasklist_lock.
> Granted it is a bit ugly to repeat this in order to overcome
> the race on dropping tasklist_lock for allocation.
there are several people (Christoph for one) who are working on making
the tasklist_lock go away entirely in favor of RCU-like locking...
Arjan van de Ven wrote:
>>+static inline void delayacct_blkio(void)
>>+{
>>+ if (unlikely(current->delays && delayacct_on))
>>+ __delayacct_blkio();
>>+}
>>
>>
>
>why is this unlikely?
>
>
delayacct_on is expected to be off most of the time, hence the compound is
unlikely too.
>
>
>>+ u64 blkio_delay; /* wait for sync block io completion */
>>
>>
>
>this misses O_SYNC, msync(), and general throttling.
>
Hmm, that it does :-(
>I get the feeling this is being measured at the wrong level
>currently.... since the number of entry points that needs measuring at
>the current level is hardly finite...
>
>
Will take another look if it can be done elsewhere. Earlier was using
io_schedule but that isn't
called from everywhere.
--Shailabh
Shailabh Nagar wrote:
> schedstats-sysctl.patch
>
> Add sysctl option for controlling schedstats collection
> dynamically. Delay accounting leverages schedstats for
> cpu delay statistics.
>
I'd sort of rather not tie this in with schedstats if possible.
Schedstats adds a reasonable amount of cache footprint and
branches in hot paths. Most of schedstats stuff is something that
hardly anyone will use.
Sure you can share common code though...
>
> Index: linux-2.6.16-rc4/kernel/sched.c
> ===================================================================
> --- linux-2.6.16-rc4.orig/kernel/sched.c 2006-02-27 01:20:04.000000000 -0500
> +++ linux-2.6.16-rc4/kernel/sched.c 2006-02-27 01:52:52.000000000 -0500
> @@ -382,11 +382,56 @@ static inline void task_rq_unlock(runque
> }
>
> #ifdef CONFIG_SCHEDSTATS
> +
> +int schedstats_sysctl = 0; /* schedstats turned off by default */
Should be read mostly.
> +static DEFINE_PER_CPU(int, schedstats) = 0;
> +
When the above is in the read mostly section, you won't need this at all.
You don't intend to switch the sysctl with great frequency, do you?
> +static void schedstats_set(int val)
> +{
> + int i;
> + static spinlock_t schedstats_lock = SPIN_LOCK_UNLOCKED;
> +
> + spin_lock(&schedstats_lock);
> + schedstats_sysctl = val;
> + for (i = 0; i < NR_CPUS; i++)
> + per_cpu(schedstats, i) = val;
> + spin_unlock(&schedstats_lock);
> +}
> +
> +static int __init schedstats_setup_enable(char *str)
> +{
> + schedstats_sysctl = 1;
> + schedstats_set(schedstats_sysctl);
> + return 1;
> +}
> +
> +__setup("schedstats", schedstats_setup_enable);
> +
> +int schedstats_sysctl_handler(ctl_table *table, int write, struct file *filp,
> + void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> + int ret, prev = schedstats_sysctl;
> + struct task_struct *g, *t;
> +
> + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
> + if ((ret != 0) || (prev == schedstats_sysctl))
> + return ret;
> + if (schedstats_sysctl) {
> + read_lock(&tasklist_lock);
> + do_each_thread(g, t) {
> + memset(&t->sched_info, 0, sizeof(t->sched_info));
> + } while_each_thread(g, t);
> + read_unlock(&tasklist_lock);
> + }
> + schedstats_set(schedstats_sysctl);
You don't clear the rq's schedstats stuff here.
And clearing this at all is not really needed for the schedstats interface.
You have a timestamp and a set of accumulated values, so it is easy to work
out deltas. So do you need this?
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Mon, 2006-02-27 at 04:13 -0500, Shailabh Nagar wrote:
> Arjan van de Ven wrote:
>
> >>+static inline void delayacct_blkio(void)
> >>+{
> >>+ if (unlikely(current->delays && delayacct_on))
> >>+ __delayacct_blkio();
> >>+}
> >>
> >>
> >
> >why is this unlikely?
> >
> >
> delayacct_on is expected to be off most of the time,
that's not really enough I think to warrent a compiler hint
> hence the compound is
> unlikely too.
you then should move that as first in the test instead ;-)
On Mon, 2006-02-27 at 04:13 -0500, Shailabh Nagar wrote:
> Arjan van de Ven wrote:
>
> >>+static inline void delayacct_blkio(void)
> >>+{
> >>+ if (unlikely(current->delays && delayacct_on))
> >>+ __delayacct_blkio();
> >>+}
> >>
> >>
> >
> >why is this unlikely?
> >
> >
> delayacct_on is expected to be off most of the time, hence the compound is
> unlikely too.
ok that opens the question: why is this a runtime tunable?
Is it really worth all this complexity?
Nick Piggin wrote:
<snip>
>> +int schedstats_sysctl_handler(ctl_table *table, int write, struct
>> file *filp,
>> + void __user *buffer, size_t *lenp, loff_t *ppos)
>> +{
>> + int ret, prev = schedstats_sysctl;
>> + struct task_struct *g, *t;
>> +
>> + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
>> + if ((ret != 0) || (prev == schedstats_sysctl))
>> + return ret;
>> + if (schedstats_sysctl) {
>> + read_lock(&tasklist_lock);
>> + do_each_thread(g, t) {
>> + memset(&t->sched_info, 0, sizeof(t->sched_info));
>> + } while_each_thread(g, t);
>> + read_unlock(&tasklist_lock);
>> + }
>> + schedstats_set(schedstats_sysctl);
>
>
> You don't clear the rq's schedstats stuff here.
Good point.
>
> And clearing this at all is not really needed for the schedstats
> interface.
> You have a timestamp and a set of accumulated values, so it is easy to
> work
> out deltas. So do you need this?
Not clearing the stats will mean userspace has to distinguish between
the tasks
that are hanging around from before the last turn off, and the ones
created afterwards.
Any delta taken across an interval where schedstats was turned
off will
give the impression a task was sleeping during the interval (and hence
show it had
a lesser average wait time than it might actually have experienced).
--Shailabh
On Mon, Feb 27, 2006 at 10:13:14AM +0100, Arjan van de Ven wrote:
> On Mon, 2006-02-27 at 14:34 +0530, Dipankar Sarma wrote:
> > On Mon, Feb 27, 2006 at 09:42:23AM +0100, Arjan van de Ven wrote:
> > > On Mon, 2006-02-27 at 03:38 -0500, Shailabh Nagar wrote:
> > > > Arjan van de Ven wrote:
> > > >
> > > > The function needs to allocate task_delay_info structs for all tasks
> > > > that might
> > > > have been forked since the last time delay accounting was turned off.
> > > > Either we have to count how many such tasks there are, or preallocate
> > > > nr_tasks (as an upper bound) and then use as many as needed.
> > >
> > > it generally feels really fragile, especially with the task enumeration
> > > going to RCU soon. (eg you'd lose the ability to lock out new task
> > > creation)
> >
> > I haven't yet seen any RCU-based code that does this. Can you point out
> > what patches you are talking about ? As of 2.6.16-rc4 and -rt15,
> > AFAICS, you can count tasks by holding the read side of tasklist_lock.
> > Granted it is a bit ugly to repeat this in order to overcome
> > the race on dropping tasklist_lock for allocation.
>
> there are several people (Christoph for one) who are working on making
> the tasklist_lock go away entirely in favor of RCU-like locking...
>
>
One of the reasons that alloc_delays() is a bit ugly is to handle the case
that tasks might get created between the two loops. Even with RCU kind of
locking (except for changing the locking primitives of-course), this code would
work fine. Once delayacct_on is set to 1, copy_process() should take care of
allocating the delays structure. alloc_delays() re-iterates through the list
of tasks to find tasks whose allocation got missed. This revisit happens
at most once due to the small window in which we check for delayacct_on
and allocate the task's delays structure. If tasks are missed within that
window, we revisit the tasks again and allocate for them.
Even with RCU kind of locking, this code should be able to gracefully
deal with tasks getting created/lost between the scanning of the tasklist.
Balbir
<snip>
> why not just introduce a schedstats_lock mutex, and acquire it for both
> the 'if (schedstats_sysctl)' line and the schedstats_set() line. That
> will make the locking meaningful: two parallel sysctl ops will be atomic
> to each other. [right now they wont be and they can clear schedstat data
> in parallel -> not a big problem but it makes schedstats_lock rather
> meaningless]
>
Ingo,
Can sysctls run in parallel? sys_sysctl() protects the call
to do_sysctl() with BKL (lock_kernel/unlock_kernel).
Am I missing something?
Balbir
> But if task enumeration is going to get more difficult, we'll need to
> keep the on-demand allocation (on
> next use) as a backup for tasks that weren't caught during the sysctl
> change.
>
One possible issue with on-demand allocation is that under heavy load
allocation causes IO to happen and when we try to timestamp that IO, we
do not have a delays structure to do so.
Balbir
>
> One of the reasons that alloc_delays() is a bit ugly is to handle the case
> that tasks might get created between the two loops. Even with RCU kind of
> locking (except for changing the locking primitives of-course), this code would
> work fine. Once delayacct_on is set to 1, copy_process() should take care of
> allocating the delays structure. alloc_delays() re-iterates through the list
> of tasks to find tasks whose allocation got missed. This revisit happens
> at most once due to the small window in which we check for delayacct_on
> and allocate the task's delays structure. If tasks are missed within that
> window, we revisit the tasks again and allocate for them.
well you assume you CAN walk all tasks... which is something afaik
that's being deprecated ;)
> well you assume you CAN walk all tasks... which is something afaik
> that's being deprecated ;)
>
Thanks for clarifying that. We will need to revisit this function as a result
of the proposed changes (when they happen).
Balbir
On Mon, 2006-02-27 at 16:16 +0530, Balbir Singh wrote:
> <snip>
> > why not just introduce a schedstats_lock mutex, and acquire it for both
> > the 'if (schedstats_sysctl)' line and the schedstats_set() line. That
> > will make the locking meaningful: two parallel sysctl ops will be atomic
> > to each other. [right now they wont be and they can clear schedstat data
> > in parallel -> not a big problem but it makes schedstats_lock rather
> > meaningless]
> >
>
> Ingo,
>
> Can sysctl's run in parallel? sys_sysctl() is protects the call
> to do_sysctl() with BKL (lock_kernel/unlock_kernel).
>
> Am I missing something?
your sysctl functions sleep. the BKL is useless in the light of sleeping
code...
On Mon, Feb 27, 2006 at 12:24:18PM +0100, Arjan van de Ven wrote:
> well you assume you CAN walk all tasks... which is something afaik
> that's being deprecated ;)
CPU Hotplug (migrate_live_tasks) also depends on walking all tasks atleast.
Wonder how that would need to change if we can't do that.
--
Regards,
vatsa
Shailabh Nagar wrote:
> Nick Piggin wrote:
>>
>> And clearing this at all is not really needed for the schedstats
>> interface.
>> You have a timestamp and a set of accumulated values, so it is easy to
>> work
>> out deltas. So do you need this?
>
>
> Not clearing the stats will mean userspace has to distinguish between
> the tasks
> that are hanging around from before the last turn off, and the ones
> created after
> wards. Any delta taken across an interval where schedstats was turned
> off will
> give the impression a task was sleeping during the interval (and hence
> show it had
> a lesser average wait time than it might actually have experienced).
Presumably a delta taken across an interval where schedstats
was turned off would be rather inaccurate, no matter what.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
> your sysctl functions sleep. the BKL is useless in the light of sleeping
> code...
>
But wouldn't all sysctls potentially sleep (on account of copying data from
the user).
Thanks for clarifying,
Balbir
On Mon, 2006-02-27 at 17:59 +0530, Balbir Singh wrote:
> > your sysctl functions sleep. the BKL is useless in the light of sleeping
> > code...
> >
>
> But wouldn't all sysctls potentially sleep (on account of copying data from
> the user).
.. I'm not the one saying the BKL was useful... you were ;)
Shailabh Nagar <[email protected]> writes:
> delayacct-blkio.patch
>
> Record time spent by a task waiting for completion of
> userspace initiated synchronous block I/O. This can help
> determine the right I/O priority for the task.
I think it's a good idea to have such a statistic by default.
Can you add a counter that is summed up in task_struct and reports
in /proc/*/stat so that it could be displayed by top?
This way it would be useful even with "normal" user space.
-Andi
> > But wouldn't all sysctls potentially sleep (on account of copying data from
> > the user).
>
> .. I'm not the one saying the BKL was useful... you were ;)
My tiny mind must have been confused by the presence of the code, which
I presumed would be useful. I guess that is not the case always :-)
Balbir
On Mon, 2006-02-27 at 03:12 -0500, Shailabh Nagar wrote:
> Add sysctl option for controlling schedstats collection
> dynamically. Delay accounting leverages schedstats for
> cpu delay statistics.
Is there some reason you're using the sysctl interface, and not say
sysfs instead?
<b
On Mon, 2006-02-27 at 20:17 +1100, Nick Piggin wrote:
> > #ifdef CONFIG_SCHEDSTATS
> > +
> > +int schedstats_sysctl = 0; /* schedstats turned off by default */
>
> Should be read mostly.
>
> > +static DEFINE_PER_CPU(int, schedstats) = 0;
> > +
>
> When the above is in the read mostly section, you won't need this at all.
>
> You don't intend to switch the sysctl with great frequency, do you?
No, it is not expected to switch often.
We originally coded it as __read_mostly, but thought the variable
bouncing between CPUs would be costly. Is it cheaper with
__read_mostly ? or it doesn't matter ?
--
----------------------------------------------------------------------
Chandra Seetharaman | Be careful what you choose....
- [email protected] | .......you may get it.
----------------------------------------------------------------------
Bryan O'Sullivan wrote:
>On Mon, 2006-02-27 at 03:12 -0500, Shailabh Nagar wrote:
>
>
>
>>Add sysctl option for controlling schedstats collection
>>dynamically. Delay accounting leverages schedstats for
>>cpu delay statistics.
>>
>>
>
>Is there some reason you're using the sysctl interface, and not say
>sysfs instead?
>
> <b
>
No. Is /proc/sys/kernel deprecated in favor of /sys/kernel now ?
--Shailabh
Andi Kleen wrote:
>Shailabh Nagar <[email protected]> writes:
>
>
>
>>delayacct-blkio.patch
>>
>>Record time spent by a task waiting for completion of
>>userspace initiated synchronous block I/O. This can help
>>determine the right I/O priority for the task.
>>
>>
>
>I think it's a good idea to have such a statistic by default.
>
>Can you add a counter that is summed up in task_struct and reports
>in /proc/*/stat so that it could be displayed by top?
>
>
Sure. That would make delayacct code simpler too since it could just
read off from task_struct.
>This way it would be useful even with "normal" user space.
>
>
Yes. Need to resolve the multiple entry point semantic...more in
separate mail.
--Shailabh
>-Andi
>
>
Andi Kleen wrote:
>Shailabh Nagar <[email protected]> writes:
>
>
>
>>delayacct-blkio.patch
>>
>>Record time spent by a task waiting for completion of
>>userspace initiated synchronous block I/O. This can help
>>determine the right I/O priority for the task.
>>
>>
>
>I think it's a good idea to have such a statistic by default.
>
>
Besides the paths we're counting and the one's Arjan listed below, are
there others
you had in mind ?
>Can you add a counter that is summed up in task_struct and reports
>in /proc/*/stat so that it could be displayed by top?
>
>This way it would be useful even with "normal" user space.
>
>-Andi
>
Arjan van de Ven wrote:
>this misses O_SYNC, msync(), and general throttling.
>I get the feeling this is being measured at the wrong level
>currently.... since the number of entry points that needs measuring at
>the current level is hardly finite...
>
>
Our intent was to get an idea of user-initiated sync block I/O because
there is some expectation from user space that a higher I/O priority will
result in lower delays for such I/O. General throttling writes wouldn't
fit in
this category though msync and O_SYNC would.
Are there a lot of other paths you see ? I'll root around more but if you
could just list a few more, it'll help.
As for the level at which the counting is being done, the reason for
choosing this one was to avoid counting time spent waiting for async I/O
completion and also to keep the accounting simple (diff of two
timestamps without
modifying block I/O structures).
To our usage model, async I/O is also not as useful to be counted since
userspace has already
taken steps to tolerate the latency and can do useful work (and not be
"delayed").
However, I would have liked to capture the time spent within
sys_io_getevents
when a timeout is specified, since there the user is again going to be
delayed,
but the mingling of block and network I/O events makes that more complex.
Going further down the I/O processing stack than the current level would
probably require more elaborate mechanisms to keep track of the submitter ?
Or is there a better merging point for sync I/O that I'm missing ?
Your comments would be welcome to improve this code...
--Shailabh
P.S. Sorry if merging the two responses violates any netiquette :-)
Arjan van de Ven wrote:
>On Mon, 2006-02-27 at 03:22 -0500, Shailabh Nagar wrote:
>
>
>>delayacct-swapin.patch
>>
>>Record time spent by a task waiting for its pages to be swapped in.
>>This statistic can help in adjusting the rss limits of
>>tasks (process), especially relative to each other, when the system is
>>under memory pressure.
>>
>>
>
>
>ok this poses a question: how do you deal with nested timings?
>
I don't :-(
An earlier version used local variables instead of one within the
task_delay_info
struct but we moved to using a var within to save on stack space in
critical paths.
>Say an
>O_SYC write which internally causes a pagefault?
>
>
And here we hit the problem of nesting being needed....so....
>delayacct_timestamp_start() at minimum has to get event-type specific,
>or even implement a stack of some sorts.
>
>
Would keeping the timespec vars on the stacks of the functions being
accounted be too
expensive vs. keeping bunches of vars within task_delay_info to deal
with the nesting ?
Unfortunately, the need for accuracy also means the variables needed are
timespecs and
not something smaller.
--Shailabh
On Mon, 2006-02-27 at 15:55 -0500, Shailabh Nagar wrote:
> No. Is /proc/sys/kernel deprecated in favor of /sys/kernel now ?
Yes.
<b
On Monday 27 February 2006 23:09, Shailabh Nagar wrote:
> Besides the paths we're counting and the one's Arjan listed below, are
> there others
> you had in mind ?
I would like to see all reads including metadata reads in file systems.
-Andi
Chandra Seetharaman wrote:
>On Mon, 2006-02-27 at 20:17 +1100, Nick Piggin wrote:
>
>>> #ifdef CONFIG_SCHEDSTATS
>>>+
>>>+int schedstats_sysctl = 0; /* schedstats turned off by default */
>>>
>>Should be read mostly.
>>
>>
>>>+static DEFINE_PER_CPU(int, schedstats) = 0;
>>>+
>>>
>>When the above is in the read mostly section, you won't need this at all.
>>
>>You don't intend to switch the sysctl with great frequency, do you?
>>
>
>No, it is not expected to switch often.
>
>We originally coded it as __read_mostly, but thought the variable
>bouncing between CPUs would be costly. Is it cheaper with
>__read_mostly ? or it doesn't matter ?
>
>
Well it will only "bounce" when the cacheline it is in is written to by
a different CPU. Considering this happens with your per-cpu implementation
_anyway_, they don't buy you anything much.
Putting it in __read_mostly means that you won't happen to share a cacheline
with a variable that is being written to frequently.
Nick
--
Send instant messages to your online friends http://au.messenger.yahoo.com
On Tue, 2006-02-28 at 11:25 +1100, Nick Piggin wrote:
> Chandra Seetharaman wrote:
>
> >On Mon, 2006-02-27 at 20:17 +1100, Nick Piggin wrote:
> >
> >>> #ifdef CONFIG_SCHEDSTATS
> >>>+
> >>>+int schedstats_sysctl = 0; /* schedstats turned off by default */
> >>>
> >>Should be read mostly.
> >>
> >>
> >>>+static DEFINE_PER_CPU(int, schedstats) = 0;
> >>>+
> >>>
> >>When the above is in the read mostly section, you won't need this at all.
> >>
> >>You don't intend to switch the sysctl with great frequency, do you?
> >>
> >
> >No, it is not expected to switch often.
> >
> >We originally coded it as __read_mostly, but thought the variable
> >bouncing between CPUs would be costly. Is it cheaper with
> >__read_mostly ? or it doesn't matter ?
> >
> >
>
> Well it will only "bounce" when the cacheline it is in is written to by
> a different CPU. Considering this happens with your per-cpu implementation
> _anyway_, they don't buy you anything much.
>
> Putting it in __read_mostly means that you won't happen to share a cacheline
> with a variable that is being written to frequently.
>
Thanks for the clarification Nick.
> Nick
>
> --
>
> Send instant messages to your online friends http://au.messenger.yahoo.com
>
>
>
> -------------------------------------------------------
> This SF.Net email is sponsored by xPML, a groundbreaking scripting language
> that extends applications into web and mobile media. Attend the live webcast
> and join the prime developer group breaking into this new coding territory!
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=110944&bid=241720&dat=121642
> _______________________________________________
> Lse-tech mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/lse-tech
> Our intent was to get an idea of user-initiated sync block I/O because
> there is some expectation from user space that a higher I/O priority will
> result in lower delays for such I/O. General throttling writes wouldn't
> fit in
> this category though msync and O_SYNC would.
>
> Are there a lot of other paths you see ? I'll root around more but if you
> could just list a few more, it'll help.
unmount
-o sync mounts
last-close kind of syncs on block devices (yes databases do this and
care)
fdatasync()/fsync()
open() (does a read seek, and may even do cluster locking stuff)
flock()
Jamal,
Pls keep lkml and lse-tech on cc since some of this affects the usage
of delay accounting.
jamal wrote:
>Hi Shailabh,
>Apologies for taking a week to respond ..
>
>On Mon, 2006-27-02 at 15:26 -0500, Shailabh Nagar wrote:
>
>
>>jamal wrote:
>>
>>
>
>
>
>>Yes, the current intent is to allow multiple listeners to receive the
>>responses sent by the kernel.
>>
>>
>
>Responses or events? There is a difference:
>Response implies the program in user space requested (ex a GET) for that
>information and is receiving such info.
>Event implies the program in user space asked to be informed of changes
>in the kernel. Example an exit would be considered an event.
>Events are received by virtue of registering to a multicast group.
>[..]
>
>
My design was to have the listener get both responses (what I call
replies in the code)
as well as events (data sent on exit of pid)
>>Since this interface (taskstats) is currently designed for that
>>possibility, having multiple listeners, one for
>>each "component" such as delay accounting, is the model we're using.
>>We expect each component to have a pair of userspace programs, one for
>>sending commands and the other
>>to "listen" to all replies + data generated on task exits.
>>
>>
>
>You need to have a sender of GETs essentially and a listener of events.
>Those are two connections. The replies of a get from user1 will not be
>sent to user2 as well - unless ... thats what you are trying to achieve;
>the question is why?
>
>
Yes, I was trying to have an asymmetric model where the userspace sender
of GETs
doesn't receive the reply as a unicast. Rather the reply is sent by
multicast (alongwith all the
event data).
Reason for this unintuitive design was to make it easier to process the
returned data.
The expected usage of delay accounting is to periodically "sample" the
delays for all
tasks (or tgids) in the system. Also, get the delays from exiting pids
(lets forget how tgid exit
is handled for now...irrelevant to this discussion).
Using the above two pieces of data, userspace can aggregate the "delays"
seen by any
grouping of tasks that it chooses to implement.
In this usage scenario, its more efficient to have one receiver get both
response and event
data and process in a loop.
However, we could switch to the model you suggest and use a
multithreaded send/receive
userspace utility.
>>The listener
>>is expected to register/deregister interest through
>>TASKSTATS_CMD_LISTEN and IGNORE.
>>
>>
>>
>
>It is not necessary if you follow the model i described.
>
>
>
>>>How does this correlate to TASKSTATS_CMD_LISTEN/IGNORE?
>>>
>>>
>>>
>>>
>>See above. Its mainly an optimization so that if no listener is present,
>>there's no need to generate the data.
>>
>>
>>
>
>Also not necessary - There is a recent netlink addition to make sure
>that events dont get sent if no listeners exist.
>genetlink needs to be extended. For now assume such a thing exists.
>
>
Ok. Will this addition work for both unicast and multicast modes ?
>
>
>>>>+
>>>>
>>>>
>
>
>
>>Good point. Should check for users sending it as a cmd and treat it as a
>>noop.
>>
>>
>
>More like return an -EINVAL
>
>
Will this be necessary ? Isn't genl_rcv_msg() going to return a -EOPNOTSUPP
automatically for us since we've not registered the command ?
>
>
>>I'm just using
>>this as a placeholder for data thats returned without being requested.
>>
>>
>>
>
>So it is unconditional?
>
>
Yes.
>
>
>>Come to think of it, there's no real reason to have a genlmsghdr for
>>returned data, is there ?
>>
>>
>
>All messages should be consistent whether they are sent from user
>or kernel.
>
>
Ok. will retain genetlink header.
>>Other than to copy the genlmsghdr that was sent so user can identify
>>which command was sent
>>(and I'm doing that through the reply type, perhaps redundantly).
>>
>>
>>
>
>yes, that is a useful trick. Just make sure they are reflected
>correctly.
>
>
>
>>Actually, the next iteration of the code will move to dynamically
>>generated ID. But yes, will need to check for that.
>>
>>
>>
>
>Also if you can provide feedback whether the doc i sent was any use
>and what wasnt clear etc.
>
>
Will do.
>>Thanks for the review.
>>Couple of questions about general netlink:
>>is it intended to remain a size that will always be aligned to the
>>NLMSG_ALIGNTO so that (NLMSG_DATA(nlhdr) + GENL_HDRLEN) can always
>>be used as a pointer to the genlmsghdr ?
>>
>>
>>
>
>I am not sure i followed.
>The whole message (nlhdr, genlhdr, optionalhdr, TLVs) has to be in
>the end 32 bit aligned.
>
>
Ok , so separate padding isn't needed to make the genlhdr, optionalhdr
and TLV parts aligned
too.
>>Adding some macros like genlmsg_data(nlh) would be handy (currently I
>>just define and use it locally).
>>
>>
>>
>
>Send a patch.
>
>
will do.
Thanks,
Shailabh
>cheers,
>jamal
>
>
>
On Mon, 2006-06-03 at 12:00 -0500, Shailabh Nagar wrote:
> My design was to have the listener get both responses (what I call
> replies in the code) as well as events (data sent on exit of pid)
>
I think i may not be doing justice explaining this, so let me be more
elaborate so we can be in sync.
Here is the classical way of doing things:
- Assume several apps in user space and a target in the kernel (this
could be reversed or combined in many ways, but the sake of
simplicity/clarity make the above assumption).
- suppose we have five user space apps A, B, C, D, E; these processes
would typically do one of the following class of activities:
a) configure (ADD/NEW/DEL etc). This is issued towards the kernel to
set/create/delete/flush some scalar attribute or vector. These sorts of
commands are synchronous. i.e you issue them, you expect a response
(which may indicate success/failure etc). The response is unicast; the
effect of what they affected may cause an event which may be multicast.
b) query(GET). This is issued towards the kernel to query state of
configured items. These class of commands are also synchronous. There
are special cases of the query which dump everything in the target -
literally called "dumps". The response is unicast.
c) events. These are _asynchronous_ messages issued by the kernel to
indicate some happening in the kernel. The event may be caused by #a
above or any other activity in the kernel. Events are multicast.
To receive them you have to register for the multicast group. You do so
via sockets. You can register to many multicast group.
For clarity again assume we have a multicast group where announcements
of pids exiting is seen and C and D are registered to such a multicast
group.
Suppose process A exits. That would fall under #c above. C and D will be
notified.
Suppose B configures something in the kernel that forces the kernel to
have process E exit and that such an operation is successful. B will get
acknowledgement it succeeded (unicast). C and D will get notified
(multicast).
Suppose C issued a GET to find details about a specific pid, then only C
will get that info back (message is unicast).
[A response message to a GET is typically designed to be the same as an
ADD message i.e one should be able to take exactly the same message,
change one or two things and shove back into the kernel to configure].
Suppose D issued a GET with dump flag, then D will get the details of
all pids (message is unicast).
Is this clear? Is there more than the above you need?
There are no hard rules on what you need to be multicasting and as an
example you could send periodic(aka time based) samples from the kernel
on a multicast channel and that would be received by all. It did seem
odd that you want to have a semi-promiscuous mode where a response to a
GET is multicast. If that is still what you want to achieve, then you
should.
> However, we could switch to the model you suggest and use a
> multithreaded send/receive userspace utility.
>
This is more of the classical way of doing things.
> >There is a recent netlink addition to make sure
> >that events dont get sent if no listeners exist.
> >genetlink needs to be extended. For now assume such a thing exists.
> >
> >
> Ok. Will this addition work for both unicast and multicast modes ?
>
If you never open a connection to the kernel, nothing will be generated
towards user space.
There are other techniques to rate limit event generation as well (one
such technique is a nagle-like algorithm used by xfrm).
> >
> Will this be necessary ? Isn't genl_rcv_msg() going to return a -EOPNOTSUPP
> automatically for us since we've not registered the command ?
>
Yes, please in your doc feedback remind me of this.
> >
> >Also if you can provide feedback whether the doc i sent was any use
> >and what wasnt clear etc.
> >
> >
> Will do.
>
also take a look at the excellent documentation Thomas Graf has put in
the kernel for all the utilities for manipulating netlink messages and
tell me if that should also be put in this doc (It is listed as a TODO).
cheers,
jamal
On Mon, Feb 27, 2006 at 08:17:47PM +1100, Nick Piggin wrote:
> Shailabh Nagar wrote:
> >schedstats-sysctl.patch
> >
> >Add sysctl option for controlling schedstats collection
> >dynamically. Delay accounting leverages schedstats for
> >cpu delay statistics.
> >
>
> I'd sort of rather not tie this in with schedstats if possible.
> Schedstats adds a reasonable amount of cache footprint and
> branches in hot paths. Most of schedstats stuff is something that
> hardly anyone will use.
>
> Sure you can share common code though...
>
This patch refines scheduler statistics collection and display to three levels:
tasks, runqueue and scheddomains. CONFIG_SCHEDSTATS now requires a boot time
option in the form of schedstats or schedstats= to display scheduler statistics.
They can all be enabled together to get complete statistics by passing "all"
as a boot time option. schedstat_inc has been split into schedstat_rq_inc and
schedstat_sd_inc, each of which checks if rq and sd statistics gathering is
enabled or not. schedstat_add has been changed to schedstat_sd_add, it checks
if sd statistics gathering is on prior to gathering the statistics.
Similar changes have been made for task schedstats gathering.
The output of /proc/schedstat and /proc/<pid>/schedstat and /proc/<pid>/task/
*/schedstat has been modified to print a gentle message suggesting that
statistics gathering is off. Also a header "statistics for cpuXXX" has been
added (for each cpu) to the /proc/schedstat output.
This patch is motivated by comments for sharing code with
CONFIG_SCHEDSTATS but not incurring the complete overhead of the entire
CONFIG_SCHEDSTATS code.
Testing
=======
a) booted with schedstats (schedstats=all)
------------------------------------------
cat /proc/schedstat
version 13
timestamp 4294922919
statistics for cpu0
cpu0 10 10 132 142 240 97775 24181 48251 45935 5664 2376 73594
domain0 00000003 24464 24242 162 224 62 4 0 24242 149 148 0 1 1 0 0 148 24290 24117 64 173 109 0 0 24117 0 0 0 0 0 0 0 0 0 4392 835 0
statistics for cpu1
cpu1 3 3 180 14504 387 50735 6520 15430 11035 4195 2376 44215
domain0 00000003 25870 25203 107 695 588 3 0 25203 198 198 0 0 0 0 0 198 6608 6428 91 184 93 0 0 6428 0 0 0 0 0 0 0 0 0 2316 307 0
cat /proc/1/schedstat
506 34 1102
b) booted with schedstats=tasks
-------------------------------
cat /proc/schedstat
version 13
timestamp 4294937241
runqueue and scheddomain stats are not enabled
cat /proc/1/schedstat
505 58 1097
c) booted with schedstats=rq
-----------------------------
cat /proc/schedstat
version 13
timestamp 4294913832
statistics for cpu0
cpu0 14 14 56 102 260 96332 18867 47278 45064 3556397949 2216 77465
scheddomain stats are not enabled
statistics for cpu1
cpu1 3 3 12134 12138 333 42878 4224 12874 8779 1714457722 2071 38654
scheddomain stats are not enabled
cat /proc/1/schedstat
tasks schedstats is not enabled
d) booted with schedstats=sd
----------------------------
cat /proc/schedstat
version 13
timestamp 4294936220
statistics for cpu0
runqueue stats are not enabled
domain0 00000003 38048 37802 140 248 108 0 0 37802 151 149 0 3 3 0 0 149 27574 27417 59 158 99 0 0 27417 0 0 0 0 0 0 0 0 0 4168 827 0
statistics for cpu1
runqueue stats are not enabled
domain0 00000003 39094 38441 119 682 563 3 0 38441 199 196 0 5 5 0 0 196 9167 8970 107 203 96 0 0 8970 0 0 0 0 0 0 0 0 0 2159 330 0
cat /proc/1/schedstat
tasks schedstats is not enabled
Alternatives considered
=======================
The other alternative that was considered was that instead of changing the
format of /proc/schedstat and /proc/<pid>*/schedstat, we could print zeros for
all levels for which statistics are not collected. But zeros could be treated
as valid values, so this solution was not implemented.
Limitations
===========
The effectiveness of this patch is limited to run-time statistics collection.
The run-time overhead is proportional to the level of statistics enabled.
The space consumed is the same as before.
This patch was created against 2.6.16-rc5
Signed-off-by: Balbir Singh <[email protected]>
---
Documentation/kernel-parameters.txt | 11 ++
fs/proc/base.c | 4
include/linux/sched.h | 23 ++++
kernel/sched.c | 195 +++++++++++++++++++++++++-----------
4 files changed, 175 insertions(+), 58 deletions(-)
diff -puN kernel/sched.c~schedstats_refinement kernel/sched.c
--- linux-2.6.16-rc5/kernel/sched.c~schedstats_refinement 2006-03-07 16:14:59.000000000 +0530
+++ linux-2.6.16-rc5-balbir/kernel/sched.c 2006-03-07 20:43:46.000000000 +0530
@@ -386,7 +386,28 @@ static inline void task_rq_unlock(runque
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 12
+#define SCHEDSTAT_VERSION 13
+
+int schedstats_on __read_mostly;
+
+/*
+ * Parse the schedstats options passed at boot time
+ */
+static int __init schedstats_setup_enable(char *str)
+{
+ if (!str || !strcmp(str, "") || !strcmp(str, "=all"))
+ schedstats_on = SCHEDSTATS_ALL;
+ else if (!strcmp(str, "=tasks"))
+ schedstats_on = SCHEDSTATS_TASKS;
+ else if (!strcmp(str, "=sd"))
+ schedstats_on = SCHEDSTATS_SD;
+ else if (!strcmp(str, "=rq"))
+ schedstats_on = SCHEDSTATS_RQ;
+
+ return 1;
+}
+
+__setup("schedstats", schedstats_setup_enable);
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -394,26 +415,44 @@ static int show_schedstat(struct seq_fil
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
seq_printf(seq, "timestamp %lu\n", jiffies);
+
+ if (!schedstats_rq_on() && !schedstats_sd_on()) {
+ seq_printf(seq, "runqueue and scheddomain stats are not "
+ "enabled\n");
+ return 0;
+ }
+
for_each_online_cpu(cpu) {
runqueue_t *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcnt = 0;
#endif
+ seq_printf(seq, "\nstatistics for cpu%d\n\n", cpu);
- /* runqueue-specific stats */
- seq_printf(seq,
- "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
- cpu, rq->yld_both_empty,
- rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
- rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
- rq->ttwu_cnt, rq->ttwu_local,
- rq->rq_sched_info.cpu_time,
- rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
+ if (schedstats_rq_on()) {
+ /* runqueue-specific stats */
+ seq_printf(seq,
+ "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+ "%lu",
+ cpu, rq->yld_both_empty,
+ rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
+ rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
+ rq->ttwu_cnt, rq->ttwu_local,
+ rq->rq_sched_info.cpu_time,
+ rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
- seq_printf(seq, "\n");
+ seq_printf(seq, "\n");
+ } else
+ seq_printf(seq, "runqueue stats are not enabled\n");
#ifdef CONFIG_SMP
+
+ if (!schedstats_sd_on()) {
+ seq_printf(seq, "scheddomain stats are not enabled\n");
+ continue;
+ }
+
/* domain-specific stats */
preempt_disable();
for_each_domain(cpu, sd) {
@@ -472,11 +511,30 @@ struct file_operations proc_schedstat_op
.release = single_release,
};
-# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
+# define schedstat_sd_inc(sd, field) \
+do { \
+ if (unlikely(schedstats_sd_on())) \
+ (sd)->field++; \
+} while (0)
+
+# define schedstat_sd_add(sd, field, amt) \
+do { \
+ if (unlikely(schedstats_sd_on())) \
+ (sd)->field += (amt); \
+} while (0)
+
+# define schedstat_rq_inc(rq, field) \
+do { \
+ if (unlikely(schedstats_rq_on())) \
+ (rq)->field++; \
+} while (0)
+
#else /* !CONFIG_SCHEDSTATS */
-# define schedstat_inc(rq, field) do { } while (0)
-# define schedstat_add(rq, field, amt) do { } while (0)
+
+# define schedstat_sd_inc(rq, field) do { } while (0)
+# define schedstat_sd_add(rq, field, amt) do { } while (0)
+# define schedstat_rq_inc(rq, field) do { } while (0)
+
#endif
/*
@@ -515,6 +573,15 @@ static inline void sched_info_dequeued(t
t->sched_info.last_queued = 0;
}
+static void rq_sched_info_arrive(struct runqueue *rq, unsigned long diff)
+{
+ if (!schedstats_rq_on() || !rq)
+ return;
+
+ rq->rq_sched_info.run_delay += diff;
+ rq->rq_sched_info.pcnt++;
+}
+
/*
* Called when a task finally hits the cpu. We can now calculate how
* long it was waiting to run. We also note when it began so that we
@@ -523,20 +590,23 @@ static inline void sched_info_dequeued(t
static void sched_info_arrive(task_t *t)
{
unsigned long now = jiffies, diff = 0;
- struct runqueue *rq = task_rq(t);
+ if (!schedstats_tasks_on() && !schedstats_rq_on())
+ return;
+
+ /*
+ * diff is required in case schedstats is on for tasks or rq
+ */
if (t->sched_info.last_queued)
diff = now - t->sched_info.last_queued;
sched_info_dequeued(t);
- t->sched_info.run_delay += diff;
- t->sched_info.last_arrival = now;
- t->sched_info.pcnt++;
-
- if (!rq)
- return;
- rq->rq_sched_info.run_delay += diff;
- rq->rq_sched_info.pcnt++;
+ if (schedstats_tasks_on()) {
+ t->sched_info.run_delay += diff;
+ t->sched_info.last_arrival = now;
+ t->sched_info.pcnt++;
+ }
+ rq_sched_info_arrive(task_rq(t), diff);
}
/*
@@ -556,23 +626,32 @@ static void sched_info_arrive(task_t *t)
*/
static inline void sched_info_queued(task_t *t)
{
+ if (!schedstats_tasks_on() && !schedstats_rq_on())
+ return;
+
if (!t->sched_info.last_queued)
t->sched_info.last_queued = jiffies;
}
+static inline void rq_sched_info_depart(struct runqueue *rq, unsigned long diff)
+{
+ if (!schedstats_rq_on() || !rq)
+ return;
+
+ rq->rq_sched_info.cpu_time += diff;
+}
+
/*
* Called when a process ceases being the active-running process, either
* voluntarily or involuntarily. Now we can calculate how long we ran.
*/
static inline void sched_info_depart(task_t *t)
{
- struct runqueue *rq = task_rq(t);
unsigned long diff = jiffies - t->sched_info.last_arrival;
- t->sched_info.cpu_time += diff;
-
- if (rq)
- rq->rq_sched_info.cpu_time += diff;
+ if (schedstats_tasks_on())
+ t->sched_info.cpu_time += diff;
+ rq_sched_info_depart(task_rq(t), diff);
}
/*
@@ -1190,15 +1269,15 @@ static int try_to_wake_up(task_t *p, uns
new_cpu = cpu;
- schedstat_inc(rq, ttwu_cnt);
+ schedstat_rq_inc(rq, ttwu_cnt);
if (cpu == this_cpu) {
- schedstat_inc(rq, ttwu_local);
+ schedstat_rq_inc(rq, ttwu_local);
goto out_set_cpu;
}
for_each_domain(this_cpu, sd) {
if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_remote);
+ schedstat_sd_inc(sd, ttwu_wake_remote);
this_sd = sd;
break;
}
@@ -1239,7 +1318,7 @@ static int try_to_wake_up(task_t *p, uns
* p is cache cold in this domain, and
* there is no bad imbalance.
*/
- schedstat_inc(this_sd, ttwu_move_affine);
+ schedstat_sd_inc(this_sd, ttwu_move_affine);
goto out_set_cpu;
}
}
@@ -1250,7 +1329,7 @@ static int try_to_wake_up(task_t *p, uns
*/
if (this_sd->flags & SD_WAKE_BALANCE) {
if (imbalance*this_load <= 100*load) {
- schedstat_inc(this_sd, ttwu_move_balance);
+ schedstat_sd_inc(this_sd, ttwu_move_balance);
goto out_set_cpu;
}
}
@@ -1894,7 +1973,7 @@ skip_queue:
#ifdef CONFIG_SCHEDSTATS
if (task_hot(tmp, busiest->timestamp_last_tick, sd))
- schedstat_inc(sd, lb_hot_gained[idle]);
+ schedstat_sd_inc(sd, lb_hot_gained[idle]);
#endif
pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
@@ -1913,7 +1992,7 @@ out:
* so we can safely collect pull_task() stats here rather than
* inside pull_task().
*/
- schedstat_add(sd, lb_gained[idle], pulled);
+ schedstat_sd_add(sd, lb_gained[idle], pulled);
if (all_pinned)
*all_pinned = pinned;
@@ -2109,23 +2188,23 @@ static int load_balance(int this_cpu, ru
if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
sd_idle = 1;
- schedstat_inc(sd, lb_cnt[idle]);
+ schedstat_sd_inc(sd, lb_cnt[idle]);
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
if (!group) {
- schedstat_inc(sd, lb_nobusyg[idle]);
+ schedstat_sd_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(group, idle);
if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[idle]);
+ schedstat_sd_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == this_rq);
- schedstat_add(sd, lb_imbalance[idle], imbalance);
+ schedstat_sd_add(sd, lb_imbalance[idle], imbalance);
nr_moved = 0;
if (busiest->nr_running > 1) {
@@ -2146,7 +2225,7 @@ static int load_balance(int this_cpu, ru
}
if (!nr_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ schedstat_sd_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
@@ -2199,7 +2278,7 @@ static int load_balance(int this_cpu, ru
return nr_moved;
out_balanced:
- schedstat_inc(sd, lb_balanced[idle]);
+ schedstat_sd_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
@@ -2233,22 +2312,22 @@ static int load_balance_newidle(int this
if (sd->flags & SD_SHARE_CPUPOWER)
sd_idle = 1;
- schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
+ schedstat_sd_inc(sd, lb_cnt[NEWLY_IDLE]);
group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
if (!group) {
- schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
+ schedstat_sd_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
}
busiest = find_busiest_queue(group, NEWLY_IDLE);
if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
+ schedstat_sd_inc(sd, lb_nobusyq[NEWLY_IDLE]);
goto out_balanced;
}
BUG_ON(busiest == this_rq);
- schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
+ schedstat_sd_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
nr_moved = 0;
if (busiest->nr_running > 1) {
@@ -2260,7 +2339,7 @@ static int load_balance_newidle(int this
}
if (!nr_moved) {
- schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ schedstat_sd_inc(sd, lb_failed[NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
return -1;
} else
@@ -2269,7 +2348,7 @@ static int load_balance_newidle(int this
return nr_moved;
out_balanced:
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ schedstat_sd_inc(sd, lb_balanced[NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
return -1;
sd->nr_balance_failed = 0;
@@ -2333,12 +2412,12 @@ static void active_load_balance(runqueue
if (unlikely(sd == NULL))
goto out;
- schedstat_inc(sd, alb_cnt);
+ schedstat_sd_inc(sd, alb_cnt);
if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
- schedstat_inc(sd, alb_pushed);
+ schedstat_sd_inc(sd, alb_pushed);
else
- schedstat_inc(sd, alb_failed);
+ schedstat_sd_inc(sd, alb_failed);
out:
spin_unlock(&target_rq->lock);
}
@@ -2906,7 +2985,7 @@ need_resched_nonpreemptible:
dump_stack();
}
- schedstat_inc(rq, sched_cnt);
+ schedstat_rq_inc(rq, sched_cnt);
now = sched_clock();
if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
run_time = now - prev->timestamp;
@@ -2974,7 +3053,7 @@ go_idle:
/*
* Switch the active and expired arrays.
*/
- schedstat_inc(rq, sched_switch);
+ schedstat_rq_inc(rq, sched_switch);
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
@@ -3007,7 +3086,7 @@ go_idle:
next->activated = 0;
switch_tasks:
if (next == rq->idle)
- schedstat_inc(rq, sched_goidle);
+ schedstat_rq_inc(rq, sched_goidle);
prefetch(next);
prefetch_stack(next);
clear_tsk_need_resched(prev);
@@ -3979,7 +4058,7 @@ asmlinkage long sys_sched_yield(void)
prio_array_t *array = current->array;
prio_array_t *target = rq->expired;
- schedstat_inc(rq, yld_cnt);
+ schedstat_rq_inc(rq, yld_cnt);
/*
* We implement yielding by moving the task into the expired
* queue.
@@ -3991,11 +4070,11 @@ asmlinkage long sys_sched_yield(void)
target = rq->active;
if (array->nr_active == 1) {
- schedstat_inc(rq, yld_act_empty);
+ schedstat_rq_inc(rq, yld_act_empty);
if (!rq->expired->nr_active)
- schedstat_inc(rq, yld_both_empty);
+ schedstat_rq_inc(rq, yld_both_empty);
} else if (!rq->expired->nr_active)
- schedstat_inc(rq, yld_exp_empty);
+ schedstat_rq_inc(rq, yld_exp_empty);
if (array != target) {
dequeue_task(current, array);
diff -puN include/linux/sched.h~schedstats_refinement include/linux/sched.h
--- linux-2.6.16-rc5/include/linux/sched.h~schedstats_refinement 2006-03-07 16:14:59.000000000 +0530
+++ linux-2.6.16-rc5-balbir/include/linux/sched.h 2006-03-07 20:46:10.000000000 +0530
@@ -525,6 +525,29 @@ struct backing_dev_info;
struct reclaim_state;
#ifdef CONFIG_SCHEDSTATS
+
+#define SCHEDSTATS_TASKS 0x1
+#define SCHEDSTATS_RQ 0x2
+#define SCHEDSTATS_SD 0x4
+#define SCHEDSTATS_ALL (SCHEDSTATS_TASKS | SCHEDSTATS_RQ | SCHEDSTATS_SD)
+
+extern int schedstats_on;
+
+static inline int schedstats_tasks_on(void)
+{
+ return schedstats_on & SCHEDSTATS_TASKS;
+}
+
+static inline int schedstats_rq_on(void)
+{
+ return schedstats_on & SCHEDSTATS_RQ;
+}
+
+static inline int schedstats_sd_on(void)
+{
+ return schedstats_on & SCHEDSTATS_SD;
+}
+
struct sched_info {
/* cumulative counters */
unsigned long cpu_time, /* time spent on the cpu */
diff -puN Documentation/kernel-parameters.txt~schedstats_refinement Documentation/kernel-parameters.txt
--- linux-2.6.16-rc5/Documentation/kernel-parameters.txt~schedstats_refinement 2006-03-07 16:14:59.000000000 +0530
+++ linux-2.6.16-rc5-balbir/Documentation/kernel-parameters.txt 2006-03-07 20:48:44.000000000 +0530
@@ -1333,6 +1333,17 @@ running once the system is up.
sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver
Format: <io>[,<timeout>[,<isapnp>]]
+ schedstats [KNL]
+ Enable all schedstats if CONFIG_SCHEDSTATS is defined
+ same as the schedstats=all
+
+ schedstats= [KNL]
+ Format: {"all", "tasks", "sd", "rq"}
+ all -- turns on the complete schedstats
+ rq -- turns on schedstats only for runqueue
+ sd -- turns on schedstats only for scheddomains
+ tasks -- turns on schedstats only for tasks
+
scsi_debug_*= [SCSI]
See drivers/scsi/scsi_debug.c.
diff -puN fs/proc/base.c~schedstats_refinement fs/proc/base.c
--- linux-2.6.16-rc5/fs/proc/base.c~schedstats_refinement 2006-03-07 16:14:59.000000000 +0530
+++ linux-2.6.16-rc5-balbir/fs/proc/base.c 2006-03-07 16:17:39.000000000 +0530
@@ -72,6 +72,7 @@
#include <linux/cpuset.h>
#include <linux/audit.h>
#include <linux/poll.h>
+#include <linux/sched.h>
#include "internal.h"
/*
@@ -504,6 +505,9 @@ static int proc_pid_wchan(struct task_st
*/
static int proc_pid_schedstat(struct task_struct *task, char *buffer)
{
+ if (!schedstats_tasks_on())
+ return sprintf(buffer, "tasks schedstats is not enabled\n");
+
return sprintf(buffer, "%lu %lu %lu\n",
task->sched_info.cpu_time,
task->sched_info.run_delay,
_
jamal wrote:
>On Mon, 2006-06-03 at 12:00 -0500, Shailabh Nagar wrote:
>
>
>
>>My design was to have the listener get both responses (what I call
>>replies in the code) as well as events (data sent on exit of pid)
>>
>>
>>
>
>I think i may not be doing justice explaining this, so let me be more
>elaborate so we can be in sync.
>Here is the classical way of doing things:
>
>- Assume several apps in user space and a target in the kernel (this
>could be reversed or combined in many ways, but the sake of
>simplicity/clarity make the above assumption).
>- suppose we have five user space apps A, B, C, D, E; these processes
>would typically do one of the following class of activities:
>
>a) configure (ADD/NEW/DEL etc). This is issued towards the kernel to
>set/create/delete/flush some scalar attribute or vector. These sorts of
>commands are synchronous. i.e you issue them, you expect a response
>(which may indicate success/failure etc). The response is unicast; the
>effect of what they affected may cause an event which may be multicast.
>
>b) query(GET). This is issued towards the kernel to query state of
>configured items. These class of commands are also synchronous. There
>are special cases of the query which dump everything in the target -
>literally called "dumps". The response is unicast.
>
>c) events. These are _asynchronous_ messages issued by the kernel to
>indicate some happening in the kernel. The event may be caused by #a
>above or any other activity in the kernel. Events are multicast.
>To receive them you have to register for the multicast group. You do so
>via sockets. You can register to many multicast group.
>
>For clarity again assume we have a multicast group where announcements
>of pids exiting is seen and C and D are registered to such a multicast
>group.
>Suppose process A exits. That would fall under #c above. C and D will be
>notified.
>Suppose B configures something in the kernel that forces the kernel to
>have process E exit and that such an operation is successful. B will get
>acknowledgement it succeeded (unicast). C and D will get notified
>(multicast).
>Suppose C issued a GET to find details about a specific pid, then only C
>will get that info back (message is unicast).
>[A response message to a GET is typically designed to be the same as an
>ADD message i.e one should be able to take exactly the same message,
>change one or two things and shove back into the kernel to configure].
>Suppose D issued a GET with dump flag, then D will get the details of
>all pids (message is unicast).
>
>Is this clear? Is there more than the above you need?
>
>
Thanks for the clarification of the usage model. While our needs are
certainly much less complex,
it is useful to know the range of options.
>There are no hard rules on what you need to be multicasting and as an
>example you could send periodic(aka time based) samples from the kernel
>on a multicast channel and that would be received by all. It did seem
>odd that you want to have a semi-promiscous mode where a response to a
>GET is multicast. If that is still what you want to achieve, then you
>should.
>
>
Ok, we'll probably switch to the classical mode you suggest since the
"efficient processing"
rationale for choosing to operate in the semi-promiscuous mode earlier
can be overcome by
writing a multi-threaded userspace utility.
>>However, we could switch to the model you suggest and use a
>>multithreaded send/receive userspace utility.
>>
>>
>>
>
>This is more of the classical way of doing things.
>
>
>
>
>>>There is a recent netlink addition to make sure
>>>that events dont get sent if no listeners exist.
>>>genetlink needs to be extended. For now assume such a thing exists.
>>>
>>>
>>>
>>>
>>Ok. Will this addition work for both unicast and multicast modes ?
>>
>>
>>
>
>If you never open a connection to the kernel, nothing will be generated
>towards user space.
>There are other techniques to rate limit event generation as well (one
>such technique is a nagle-like algorithm used by xfrm).
>
>
>
>>Will this be necessary ? Isn't genl_rcv_msg() going to return a -EOPNOTSUPP
>>automatically for us since we've not registered the command ?
>>
>>
>>
>
>Yes, please in your doc feedback remind me of this,
>
>
>
>>>Also if you can provide feedback whether the doc i sent was any use
>>>and what wasnt clear etc.
>>>
>>>
>>>
>>>
>>Will do.
>>
>>
>>
>
>also take a look at the excellent documentation Thomas Graf has put in
>the kernel for all the utilities for manipulating netlink messages and
>tell me if that should also be put in this doc (It is listed as a TODO).
>
>
>
>
Ok.
Thanks,
Shailabh
>cheers,
>jamal
>
>
>
> Thanks for the clarification of the usage model. While our needs are
> certainly much less complex,
> it is useful to know the range of options.
>
> >There are no hard rules on what you need to be multicasting and as an
> >example you could send periodic(aka time based) samples from the kernel
> >on a multicast channel and that would be received by all. It did seem
> >odd that you want to have a semi-promiscous mode where a response to a
> >GET is multicast. If that is still what you want to achieve, then you
> >should.
> >
> >>>Also if you can provide feedback whether the doc i sent was any use
> >>>and what wasnt clear etc.
> >also take a look at the excellent documentation Thomas Graf has put in
> >the kernel for all the utilities for manipulating netlink messages and
> >tell me if that should also be put in this doc (It is listed as a TODO).
Hello, Jamal,
Please find the latest version of the patch for review. The genetlink
code has been updated as per your review comments. The changelog is provided
below
1. Eliminated TASKSTATS_CMD_LISTEN and TASKSTATS_CMD_IGNORE
2. Provide generic functions called genlmsg_data() and genlmsg_len()
in linux/net/genetlink.h
3. Do not multicast all replies, multicast only events generated due
to task exit.
4. The taskstats and taskstats_reply structures are now 64 bit aligned.
5. Family id is dynamically generated.
Please let us know if we missed something out.
Thanks,
Balbir
Signed-off-by: Shailabh Nagar <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
---
include/linux/delayacct.h | 2
include/linux/taskstats.h | 128 ++++++++++++++++++++++++
include/net/genetlink.h | 20 +++
init/Kconfig | 16 ++-
kernel/Makefile | 1
kernel/delayacct.c | 56 ++++++++++
kernel/taskstats.c | 244 ++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 464 insertions(+), 3 deletions(-)
diff -puN include/linux/delayacct.h~delayacct-genetlink include/linux/delayacct.h
--- linux-2.6.16-rc5/include/linux/delayacct.h~delayacct-genetlink 2006-03-09 17:15:31.000000000 +0530
+++ linux-2.6.16-rc5-balbir/include/linux/delayacct.h 2006-03-09 17:15:31.000000000 +0530
@@ -15,6 +15,7 @@
#define _LINUX_TASKDELAYS_H
#include <linux/sched.h>
+#include <linux/taskstats.h>
#ifdef CONFIG_TASK_DELAY_ACCT
extern int delayacct_on; /* Delay accounting turned on/off */
@@ -24,6 +25,7 @@ extern void __delayacct_tsk_init(struct
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio(void);
extern void __delayacct_swapin(void);
+extern int delayacct_add_tsk(struct taskstats_reply *, struct task_struct *);
static inline void delayacct_tsk_init(struct task_struct *tsk)
{
diff -puN /dev/null include/linux/taskstats.h
--- /dev/null 2004-06-24 23:34:38.000000000 +0530
+++ linux-2.6.16-rc5-balbir/include/linux/taskstats.h 2006-03-09 19:28:54.000000000 +0530
@@ -0,0 +1,128 @@
+/* taskstats.h - exporting per-task statistics
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef _LINUX_TASKSTATS_H
+#define _LINUX_TASKSTATS_H
+
+/* Format for per-task data returned to userland when
+ * - a task exits
+ * - listener requests stats for a task
+ *
+ * The struct is versioned. Newer versions should only add fields to
+ * the bottom of the struct to maintain backward compatibility.
+ *
+ * To create the next version, bump up the taskstats_version variable
+ * and delineate the start of newly added fields with a comment indicating
+ * the version number.
+ */
+
+#define TASKSTATS_VERSION 1
+
+struct taskstats {
+ /* Maintain 64-bit alignment while extending */
+
+ /* Version 1 */
+#define TASKSTATS_NOPID -1
+ __s64 pid;
+ __s64 tgid;
+
+ /* XXX_count is number of delay values recorded.
+ * XXX_total is corresponding cumulative delay in nanoseconds
+ */
+
+#define TASKSTATS_NOCPUSTATS 1
+ __u64 cpu_count;
+ __u64 cpu_delay_total; /* wait, while runnable, for cpu */
+ __u64 blkio_count;
+ __u64 blkio_delay_total; /* sync,block io completion wait*/
+ __u64 swapin_count;
+ __u64 swapin_delay_total; /* swapin page fault wait*/
+
+ __u64 cpu_run_total; /* cpu running time
+ * no count available/provided */
+};
+
+
+#define TASKSTATS_LISTEN_GROUP 0x1
+
+/*
+ * Commands sent from userspace
+ * Not versioned. New commands should only be inserted at the enum's end
+ */
+
+enum {
+ TASKSTATS_CMD_UNSPEC, /* Reserved */
+ TASKSTATS_CMD_NONE, /* Not a valid cmd to send
+ * Marks data sent on task/tgid exit */
+ TASKSTATS_CMD_LISTEN, /* Start listening */
+ TASKSTATS_CMD_IGNORE, /* Stop listening */
+ TASKSTATS_CMD_PID, /* Send stats for a pid */
+ TASKSTATS_CMD_TGID, /* Send stats for a tgid */
+};
+
+/* Parameters for commands
+ * New parameters should only be inserted at the struct's end
+ */
+
+struct taskstats_cmd_param {
+ /* Maintain 64-bit alignment while extending */
+ union {
+ __s64 pid;
+ __s64 tgid;
+ } id;
+};
+
+enum outtype {
+ TASKSTATS_REPLY_NONE = 1, /* Control cmd response */
+ TASKSTATS_REPLY_PID, /* per-pid data cmd response*/
+ TASKSTATS_REPLY_TGID, /* per-tgid data cmd response*/
+ TASKSTATS_REPLY_EXIT_PID, /* Exiting task's stats */
+ TASKSTATS_REPLY_EXIT_TGID, /* Exiting tgid's stats
+ * (sent on each tid's exit) */
+};
+
+/*
+ * Reply sent from kernel
+ * Version number affects size/format of struct taskstats only
+ */
+
+struct taskstats_reply {
+ /* Maintain 64-bit alignment while extending */
+ __u16 outtype; /* Must be one of enum outtype */
+ __u16 version;
+ __u32 err;
+ struct taskstats stats; /* Invalid if err != 0 */
+};
+
+/* NETLINK_GENERIC related info */
+
+#define TASKSTATS_GENL_NAME "TASKSTATS"
+#define TASKSTATS_GENL_VERSION 0x1
+
+#define TASKSTATS_HDRLEN (NLMSG_SPACE(GENL_HDRLEN))
+#define TASKSTATS_BODYLEN (sizeof(struct taskstats_reply))
+
+#ifdef __KERNEL__
+
+#include <linux/sched.h>
+
+#ifdef CONFIG_TASKSTATS
+extern void taskstats_exit_pid(struct task_struct *);
+#else
+static inline void taskstats_exit_pid(struct task_struct *tsk)
+{}
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_TASKSTATS_H */
diff -puN init/Kconfig~delayacct-genetlink init/Kconfig
--- linux-2.6.16-rc5/init/Kconfig~delayacct-genetlink 2006-03-09 17:15:31.000000000 +0530
+++ linux-2.6.16-rc5-balbir/init/Kconfig 2006-03-09 17:15:31.000000000 +0530
@@ -158,11 +158,21 @@ config TASK_DELAY_ACCT
in pages. Such statistics can help in setting a task's priorities
relative to other tasks for cpu, io, rss limits etc.
- Unlike BSD process accounting, this information is available
- continuously during the lifetime of a task.
-
Say N if unsure.
+config TASKSTATS
+ bool "Export task/process statistics through netlink (EXPERIMENTAL)"
+ depends on TASK_DELAY_ACCT
+ default y
+ help
+ Export selected statistics for tasks/processes through the
+ generic netlink interface. Unlike BSD process accounting, the
+ statistics are available during the lifetime of tasks/processes as
+ responses to commands. Like BSD accounting, they are sent to user
+ space on task exit.
+
+ Say Y if unsure.
+
config SYSCTL
bool "Sysctl support"
---help---
diff -puN kernel/delayacct.c~delayacct-genetlink kernel/delayacct.c
--- linux-2.6.16-rc5/kernel/delayacct.c~delayacct-genetlink 2006-03-09 17:15:31.000000000 +0530
+++ linux-2.6.16-rc5-balbir/kernel/delayacct.c 2006-03-09 17:15:31.000000000 +0530
@@ -16,9 +16,12 @@
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
+#include <linux/taskstats.h>
+#include <linux/mutex.h>
int delayacct_on = 0; /* Delay accounting turned on/off */
kmem_cache_t *delayacct_cache;
+static DEFINE_MUTEX(delayacct_exit_mutex);
static int __init delayacct_setup_enable(char *str)
{
@@ -51,8 +54,14 @@ void __delayacct_tsk_init(struct task_st
void __delayacct_tsk_exit(struct task_struct *tsk)
{
+ /*
+ * Protect against racing thread group exits: serialize the
+ * taskstats exit notification with the freeing of tsk->delays.
+ * NOTE(review): a single global mutex serializes every task exit
+ * in the system -- confirm the scalability impact is acceptable.
+ */
+ mutex_lock(&delayacct_exit_mutex);
+ taskstats_exit_pid(tsk);
kmem_cache_free(delayacct_cache, tsk->delays);
tsk->delays = NULL;
+ mutex_unlock(&delayacct_exit_mutex);
}
static inline nsec_t delayacct_measure(void)
@@ -97,3 +106,50 @@ void __delayacct_swapin(void)
current->delays->swapin_count++;
spin_unlock(¤t->delays->lock);
}
+
+#ifdef CONFIG_TASKSTATS
+
+/*
+ * delayacct_add_tsk - accumulate @tsk's delay statistics into @reply->stats
+ *
+ * Adds cpu run/delay times (from schedstats, when configured) and the
+ * block I/O and swapin delays of @tsk to the running totals in
+ * @reply->stats, so callers may sum over a thread group.  Totals
+ * saturate to 0 on overflow (see convention comments below).
+ * Returns -EINVAL if delay accounting is off or tsk->delays is gone.
+ * Callers must pin @tsk (reference or tasklist_lock held).
+ */
+int delayacct_add_tsk(struct taskstats_reply *reply, struct task_struct *tsk)
+{
+ struct taskstats *d = &reply->stats;
+ nsec_t tmp;
+ struct timespec ts;
+ unsigned long t1,t2;
+
+ if (!tsk->delays || !delayacct_on)
+ return -EINVAL;
+
+ /* zero XXX_total,non-zero XXX_count implies XXX stat overflowed */
+#ifdef CONFIG_SCHEDSTATS
+
+ tmp = (nsec_t)d->cpu_run_total ;
+ tmp += (u64)(tsk->utime+tsk->stime)*TICK_NSEC;
+ d->cpu_run_total = (tmp < (nsec_t)d->cpu_run_total)? 0:tmp;
+
+ /* No locking available for sched_info. Take snapshot first. */
+ t1 = tsk->sched_info.pcnt;
+ t2 = tsk->sched_info.run_delay;
+
+ d->cpu_count += t1;
+
+ jiffies_to_timespec(t2, &ts);
+ tmp = (nsec_t)d->cpu_delay_total + timespec_to_ns(&ts);
+ d->cpu_delay_total = (tmp < (nsec_t)d->cpu_delay_total)? 0:tmp;
+#else
+ /* Non-zero XXX_total,zero XXX_count implies XXX stat unavailable */
+ d->cpu_count = 0;
+ d->cpu_run_total = d->cpu_delay_total = TASKSTATS_NOCPUSTATS;
+#endif
+ /* blkio/swapin counters are updated under delays->lock elsewhere */
+ spin_lock(&tsk->delays->lock);
+ tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+ d->blkio_delay_total = (tmp < d->blkio_delay_total)? 0:tmp;
+ tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+ d->swapin_delay_total = (tmp < d->swapin_delay_total)? 0:tmp;
+ d->blkio_count += tsk->delays->blkio_count;
+ d->swapin_count += tsk->delays->swapin_count;
+ spin_unlock(&tsk->delays->lock);
+
+ return 0;
+}
+
+#endif /* CONFIG_TASKSTATS */
diff -puN kernel/Makefile~delayacct-genetlink kernel/Makefile
--- linux-2.6.16-rc5/kernel/Makefile~delayacct-genetlink 2006-03-09 17:15:31.000000000 +0530
+++ linux-2.6.16-rc5-balbir/kernel/Makefile 2006-03-09 17:15:31.000000000 +0530
@@ -35,6 +35,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
diff -puN /dev/null kernel/taskstats.c
--- /dev/null 2004-06-24 23:34:38.000000000 +0530
+++ linux-2.6.16-rc5-balbir/kernel/taskstats.c 2006-03-09 18:52:47.000000000 +0530
@@ -0,0 +1,244 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats.h>
+#include <linux/delayacct.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+const int taskstats_version = TASKSTATS_VERSION;
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static int family_registered = 0;
+
+
+static struct genl_family family = {
+ .id = GENL_ID_GENERATE,
+ .name = TASKSTATS_GENL_NAME,
+ .version = TASKSTATS_GENL_VERSION,
+ .hdrsize = 0,
+ .maxattr = 0,
+};
+
+/* Taskstat specific functions */
+/*
+ * prepare_reply - allocate an skb and place a zeroed taskstats_reply in it
+ * @info: request being answered, or NULL for an unsolicited exit event
+ * @cmd: genetlink command used when @info is NULL
+ * @skbp: out: the allocated skb
+ * @replyp: out: zeroed reply payload inside the skb
+ *
+ * Unsolicited events draw their netlink sequence number from a per-cpu
+ * counter; replies to requests echo the sender's pid, seq and flags.
+ * Returns 0, -ENOMEM on allocation failure, or -EINVAL if the
+ * genetlink header cannot be placed (skb freed in that case).
+ */
+static int prepare_reply(struct genl_info *info, u8 cmd,
+ struct sk_buff **skbp, struct taskstats_reply **replyp)
+{
+ struct sk_buff *skb;
+ struct taskstats_reply *reply;
+
+ skb = nlmsg_new(TASKSTATS_HDRLEN + TASKSTATS_BODYLEN);
+ if (!skb)
+ return -ENOMEM;
+
+ if (!info) {
+ int seq = get_cpu_var(taskstats_seqnum)++;
+ put_cpu_var(taskstats_seqnum);
+
+ reply = genlmsg_put(skb, 0, seq,
+ family.id, 0, NLM_F_REQUEST,
+ cmd, family.version);
+ } else
+ reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
+ family.id, 0, info->nlhdr->nlmsg_flags,
+ info->genlhdr->cmd, family.version);
+ if (reply == NULL) {
+ nlmsg_free(skb);
+ return -EINVAL;
+ }
+ /* reserve room for the fixed-size reply body */
+ skb_put(skb, TASKSTATS_BODYLEN);
+
+ memset(reply, 0, sizeof(*reply));
+ reply->version = taskstats_version;
+ reply->err = 0;
+
+ *skbp = skb;
+ *replyp = reply;
+ return 0;
+}
+
+/*
+ * send_reply - finalize and transmit a reply built by prepare_reply()
+ * @skb: message to send (consumed on all paths)
+ * @replytype: one of enum outtype, stamped into the reply payload
+ * @pid: destination pid for unicast replies
+ * @event: nonzero to multicast to TASKSTATS_LISTEN_GROUP (exit events),
+ * zero to unicast to @pid (query responses)
+ */
+static int send_reply(struct sk_buff *skb, int replytype, pid_t pid, int event)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+ struct taskstats_reply *reply;
+ int rc;
+
+ reply = (struct taskstats_reply *)genlmsg_data(genlhdr);
+ reply->outtype = replytype;
+
+ rc = genlmsg_end(skb, reply);
+ if (rc < 0) {
+ nlmsg_free(skb);
+ return rc;
+ }
+
+ if (event)
+ return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
+ else
+ return genlmsg_unicast(skb, pid);
+}
+
+/*
+ * fill_pid - fill @reply with delay stats for one task
+ * @pid: pid to look up when @pidtsk is NULL
+ * @pidtsk: task to use directly (exit path), or NULL to look up @pid
+ *
+ * Takes a task reference around the stats collection.  On lookup or
+ * collection failure, reply->err is set to a positive errno and
+ * reply->stats must be ignored.
+ */
+static inline void fill_pid(struct taskstats_reply *reply, pid_t pid,
+ struct task_struct *pidtsk)
+{
+ int rc;
+ struct task_struct *tsk = pidtsk;
+
+ if (!pidtsk) {
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid(pid);
+ if (!tsk) {
+ read_unlock(&tasklist_lock);
+ reply->err = EINVAL;
+ return;
+ }
+ /* pin tsk before dropping tasklist_lock */
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ } else
+ get_task_struct(tsk);
+
+ rc = delayacct_add_tsk(reply, tsk);
+ if (!rc) {
+ reply->stats.pid = (s64)tsk->pid;
+ reply->stats.tgid = (s64)tsk->tgid;
+ } else
+ reply->err = (rc < 0) ? -rc : rc ;
+
+ put_task_struct(tsk);
+}
+
+/*
+ * TASKSTATS_CMD_PID handler: unicast the stats of the pid carried in
+ * the fixed-format user header (struct taskstats_cmd_param) back to
+ * the requesting socket.
+ */
+static int taskstats_send_pid(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc;
+ struct sk_buff *rep_skb;
+ struct taskstats_reply *reply;
+ struct taskstats_cmd_param *param= info->userhdr;
+
+ rc = prepare_reply(info, info->genlhdr->cmd, &rep_skb, &reply);
+ if (rc)
+ return rc;
+ fill_pid(reply, param->id.pid, NULL);
+ return send_reply(rep_skb, TASKSTATS_REPLY_PID, info->snd_pid, 0);
+}
+
+/*
+ * fill_tgid - accumulate stats of every live thread in a thread group
+ * @tgid: thread group id to look up when @tgidtsk is NULL
+ * @tgidtsk: a member task to start from (exit path), or NULL
+ *
+ * Sums delayacct stats over all threads while holding tasklist_lock
+ * for reading.  stats.pid is set to TASKSTATS_NOPID to mark a per-tgid
+ * reply.  On failure reply->err holds a positive errno.
+ */
+static inline void fill_tgid(struct taskstats_reply *reply, pid_t tgid,
+ struct task_struct *tgidtsk)
+{
+ int rc;
+ struct task_struct *tsk, *first;
+
+ first = tgidtsk;
+ read_lock(&tasklist_lock);
+ if (!first) {
+ first = find_task_by_pid(tgid);
+ if (!first) {
+ read_unlock(&tasklist_lock);
+ reply->err = EINVAL;
+ return;
+ }
+ }
+ tsk = first;
+ do {
+ rc = delayacct_add_tsk(reply, tsk);
+ if (rc)
+ break;
+ } while_each_thread(first, tsk);
+ read_unlock(&tasklist_lock);
+
+ if (!rc) {
+ reply->stats.pid = (s64)TASKSTATS_NOPID;
+ reply->stats.tgid = (s64)tgid;
+ } else
+ reply->err = (rc < 0) ? -rc : rc ;
+}
+
+/*
+ * TASKSTATS_CMD_TGID handler: unicast the aggregated stats of the
+ * thread group named in the user header back to the requesting socket.
+ */
+static int taskstats_send_tgid(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc;
+ struct sk_buff *rep_skb;
+ struct taskstats_reply *reply;
+ struct taskstats_cmd_param *param= info->userhdr;
+
+ rc = prepare_reply(info, info->genlhdr->cmd, &rep_skb, &reply);
+ if (rc)
+ return rc;
+ fill_tgid(reply, param->id.tgid, NULL);
+ return send_reply(rep_skb, TASKSTATS_REPLY_TGID, info->snd_pid, 0);
+}
+
+/* Send pid data out on exit */
+void taskstats_exit_pid(struct task_struct *tsk)
+{
+ int rc;
+ struct sk_buff *rep_skb;
+ struct taskstats_reply *reply;
+
+ /*
+ * tasks can start to exit very early. Ensure that the family
+ * is registered before notifications are sent out
+ */
+ if (!family_registered)
+ return;
+
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NONE, &rep_skb, &reply);
+ if (rc)
+ return;
+ fill_pid(reply, tsk->pid, tsk);
+ rc = send_reply(rep_skb, TASKSTATS_REPLY_EXIT_PID, 0, 1);
+
+ if (rc || thread_group_empty(tsk))
+ return;
+
+ /* Send tgid data too */
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NONE, &rep_skb, &reply);
+ if (rc)
+ return;
+ fill_tgid(reply, tsk->tgid, tsk);
+ send_reply(rep_skb, TASKSTATS_REPLY_EXIT_TGID, 0, 1);
+}
+
+/* Handlers for the user->kernel query commands */
+static struct genl_ops pid_ops = {
+ .cmd = TASKSTATS_CMD_PID,
+ .doit = taskstats_send_pid,
+};
+
+static struct genl_ops tgid_ops = {
+ .cmd = TASKSTATS_CMD_TGID,
+ .doit = taskstats_send_tgid,
+};
+
+/*
+ * taskstats_init - register the taskstats genetlink family and command
+ * handlers.  Runs as a late initcall so the netlink core is up first.
+ *
+ * Propagates the real error from the netlink core instead of
+ * clobbering it with -EFAULT (which means "bad user address" and would
+ * mislead anyone debugging a failed registration).
+ */
+static int __init taskstats_init(void)
+{
+ int rc;
+
+ rc = genl_register_family(&family);
+ if (rc)
+ return rc;
+ family_registered = 1;
+
+ rc = genl_register_ops(&family, &pid_ops);
+ if (rc)
+ goto err;
+ rc = genl_register_ops(&family, &tgid_ops);
+ if (rc)
+ goto err;
+
+ return 0;
+err:
+ genl_unregister_family(&family);
+ family_registered = 0;
+ return rc;
+}
+
+late_initcall(taskstats_init);
+
diff -puN include/net/genetlink.h~delayacct-genetlink include/net/genetlink.h
--- linux-2.6.16-rc5/include/net/genetlink.h~delayacct-genetlink 2006-03-09 17:15:31.000000000 +0530
+++ linux-2.6.16-rc5-balbir/include/net/genetlink.h 2006-03-09 17:48:39.000000000 +0530
@@ -150,4 +150,24 @@ static inline int genlmsg_unicast(struct
return nlmsg_unicast(genl_sock, skb, pid);
}
+/**
+ * genlmsg_data - head of message payload
+ * @gnlh: genetlink message header
+ */
+static inline void *genlmsg_data(const struct genlmsghdr *gnlh)
+{
+ return ((unsigned char *) gnlh + GENL_HDRLEN);
+}
+
+/**
+ * genlmsg_len - length of message payload
+ * @gnlh: genetlink message header
+ *
+ * Returns the payload size excluding both the netlink and the
+ * genetlink headers.
+ */
+static inline int genlmsg_len(const struct genlmsghdr *gnlh)
+{
+ struct nlmsghdr *nlh = (struct nlmsghdr *)((unsigned char *)gnlh -
+ NLMSG_HDRLEN);
+ return (nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN);
+}
+
#endif /* __NET_GENERIC_NETLINK_H */
_
Balbir Singh wrote:
<snip>
>Hello, Jamal,
>
>Please find the latest version of the patch for review. The genetlink
>code has been updated as per your review comments. The changelog is provided
>below
>
>1. Eliminated TASKSTATS_CMD_LISTEN and TASKSTATS_CMD_IGNORE
>2. Provide generic functions called genlmsg_data() and genlmsg_len()
> in linux/net/genetlink.h
>
>
Balbir,
it might be a good idea to split 2. out separately, since it has generic
value beyond the
delay accounting patches (just like we did for the timespec_diff_ns change)
Thanks,
Shailabh
>3. Do not multicast all replies, multicast only events generated due
> to task exit.
>4. The taskstats and taskstats_reply structures are now 64 bit aligned.
>5. Family id is dynamically generated.
>
>Please let us know if we missed something out.
>
>Thanks,
>Balbir
>
>
>Signed-off-by: Shailabh Nagar <[email protected]>
>Signed-off-by: Balbir Singh <[email protected]>
>
>---
>
> include/linux/delayacct.h | 2
> include/linux/taskstats.h | 128 ++++++++++++++++++++++++
> include/net/genetlink.h | 20 +++
> init/Kconfig | 16 ++-
> kernel/Makefile | 1
> kernel/delayacct.c | 56 ++++++++++
> kernel/taskstats.c | 244 ++++++++++++++++++++++++++++++++++++++++++++++
> 7 files changed, 464 insertions(+), 3 deletions(-)
>
>
>
<snip>
On Thu, 2006-09-03 at 20:07 +0530, Balbir Singh wrote:
> Please find the latest version of the patch for review. The genetlink
> code has been updated as per your review comments. The changelog is provided
> below
>
> 1. Eliminated TASKSTATS_CMD_LISTEN and TASKSTATS_CMD_IGNORE
> 2. Provide generic functions called genlmsg_data() and genlmsg_len()
> in linux/net/genetlink.h
> 3. Do not multicast all replies, multicast only events generated due
> to task exit.
> 4. The taskstats and taskstats_reply structures are now 64 bit aligned.
> 5. Family id is dynamically generated.
>
> Please let us know if we missed something out.
Design still shaky IMO - now that i think i may understand what your end
goal is.
Using the principles i described in earlier email, the problem you are
trying to solve is:
a) shipping of the taskstats from kernel to user-space asynchronously to
all listeners on multicast channel/group TASKSTATS_LISTEN_GRP
at the point when some process exits.
b) responding to queries issued by the user to the kernel for taskstats
of a particular defined tgid and/or pid combination.
Did i summarize your goals correctly?
So let's start with #b:
i) the message is multicast; there has to be a user space app registered
to the multicast group otherwise nothing goes to user space.
ii) user space issues a GET and it seems to me the appropriate naming
for the response is a NEW.
Lets go to #a:
The issued multicast messages are also NEW and no different from the
ones sent in response to a GET.
Having said that then, you have the following commands:
enum {
TASKSTATS_CMD_UNSPEC, /* Reserved */
TASKSTATS_CMD_GET, /* user -> kernel query*/
TASKSTATS_CMD_NEW, /* kernel -> user update */
};
You also need the following TLVs
enum {
TASKSTATS_TYPE_UNSPEC, /* Reserved */
TASKSTATS_TYPE_TGID, /* The TGID */
TASKSTATS_TYPE_PID, /* The PID */
TASKSTATS_TYPE_STATS, /* carries the taskstats */
TASKSTATS_TYPE_VERS, /* carries the version */
};
Refer to the doc i passed you and provide feedback if how to use the
above is not obvious.
The use of TLVs above implies that any of these can be optionally
appearing.
So when you are going from user->kernel with a GET a in #a above, then
you can specify the PID and/or TGID and you dont need to specify the
STATS and this would be perfectly legal.
On kernel->user (in the case of response to #a or async notification as
in #b) you really dont need to specify the TG/PID since they appear in
the STATS etc.
I take it you dont want to configure the values of taskstats from user
space, otherwise user->kernel will send a NEW as well.
I also take it dumping doesnt apply to you, so you dont need a dump
callback in your kernel code.
>From what i described so far, you dont really need a header for yourself
either (maybe you need one to just store the VERSION?)
I didnt understand the point to the err field you had in the reply.
Netlink already does issue errors which can be read via perror. If this
is different from what you need, it may be an excuse to have your own
header.
I hope this helps.
cheers,
jamal
On Fri, 2006-10-03 at 09:53 -0500, jamal wrote:
>
> a) shipping of the taskstats from kernel to user-space asynchronously to
> all listeners on multicast channel/group TASKSTATS_LISTEN_GRP
> at the point when some process exits.
> b) responding to queries issued by the user to the kernel for taskstats
> of a particular defined tgid and/or pid combination.
>
> Did i summarize your goals correctly?
>
> So lets stat with #b:
> i) the message is multicast; there has to be a user space app registered
> to the multicast group otherwise nothing goes to user space.
I misspoke:
The above applies to #a.
For #b, the message from/to kernel to user is unicast.
cheers,
jamal
On Fri, Mar 10, 2006 at 09:53:53AM -0500, jamal wrote:
> On Thu, 2006-09-03 at 20:07 +0530, Balbir Singh wrote:
> > Please let us know if we missed something out.
>
> Design still shaky IMO - now that i think i may understand what your end
> goal is.
> Using the principles i described in earlier email, the problem you are
> trying to solve is:
>
> a) shipping of the taskstats from kernel to user-space asynchronously to
> all listeners on multicast channel/group TASKSTATS_LISTEN_GRP
> at the point when some process exits.
> b) responding to queries issued by the user to the kernel for taskstats
> of a particular defined tgid and/or pid combination.
>
> Did i summarize your goals correctly?
Yes, you did.
>
> So lets stat with #b:
> i) the message is multicast; there has to be a user space app registered
> to the multicast group otherwise nothing goes to user space.
> ii) user space issues a GET and it seems to me the appropriate naming
> for the response is a NEW.
>
> Lets go to #a:
> The issued multicast messages are also NEW and no different from the
> ones sent in response to a GET.
>
> Having said that then, you have the following commands:
>
> enum {
> TASKSTATS_CMD_UNSPEC, /* Reserved */
> TASKSTATS_CMD_GET, /* user -> kernel query*/
> TASKSTATS_CMD_NEW, /* kernel -> user update */
> };
>
> You also need the following TLVs
>
> enum {
> TASKSTATS_TYPE_UNSPEC, /* Reserved */
> TASKSTATS_TYPE_TGID, /* The TGID */
> TASKSTATS_TYPE_PID, /* The PID */
> TASKSTATS_TYPE_STATS, /* carries the taskstats */
> TASKSTATS_TYPE_VERS, /* carries the version */
> };
>
> Refer to the doc i passed you and provide feedback if how to use the
> above is not obvious.
>
I will look at the document, just got hold of it.
> The use of TLVs above implies that any of these can be optionally
> appearing.
> So when you are going from user->kernel with a GET a in #a above, then
> you can specify the PID and/or TGID and you dont need to specify the
> STATS and this would be perfectly legal.
>
> On kernel->user (in the case of response to #a or async notifiation as
> in #b) you really dont need to specify the TG/PID since they appear in
> the STATS etc.
I see your point now. I am looking at other users of netlink like
rtnetlink and I see the classical usage.
We can implement TLV's in our code, but for the most part the data we exchange
between the user <-> kernel has all the TLV's listed in the enum above.
The major difference is the type (pid/tgid). Hence we created a structure
(taskstats) instead of using TLV's.
> I take it you dont want to configure the values of taskstats from user
> space, otherwise user->kernel will send a NEW as well.
Your understanding is correct.
> I also take it dumping doesnt apply to you, so you dont need a dump
> callback in your kernel code.
Yes, this is correct as well.
> >From what i described so far, you dont really need a header for yourself
> either (maybe you need one to just store the VERSION?)
>
True, we do not need a header.
> I didnt understand the point to the err field you had in the reply.
> Netlink already does issue errors which can be read via perror. If this
> is different from what you need, it may be an excuse to have your own
> header.
>
Hmm.. Will look into this.
> I hope this helps.
>
Yes, it does immensely. Thanks for the detailed feedback.
> cheers,
> jamal
>
>
Warm Regards,
Balbir
On Fri, 2006-10-03 at 22:09 +0530, Balbir Singh wrote:
> On Fri, Mar 10, 2006 at 09:53:53AM -0500, jamal wrote:
> > On kernel->user (in the case of response to #a or async notifiation as
> > in #b) you really dont need to specify the TG/PID since they appear in
> > the STATS etc.
>
> I see your point now. I am looking at other users of netlink like
> rtnetlink and I see the classical usage.
>
> We can implement TLV's in our code, but for the most part the data we exchange
> between the user <-> kernel has all the TLV's listed in the enum above.
>
> The major differnece is the type (pid/tgid). Hence we created a structure
> (taskstats) instead of using TLV's.
Something to remember:
1) TLVs are essentially giving you the flexibility to send optionally
appearing elements. It is up to the receiver (in the kernel or user
space) to check for the presence of mandatory elements or execute things
depending on the presence of certain TLVs. Example in your case:
if the tgid TLV appears then the user is requesting for that TLV
if the pid appears then they are requesting for that
if both appear then it is the && of the two.
You should always ignore TLVs you dont understand - to allow for forward
compatibility.
2) The "T" part is essentially also encoding (semantically) what size
the value is; the "L" part is useful for validation. So the receiver
will always know what the size of the TLV is by definition and uses the
L to make sure it is the right size. Reject what is of the wrong size.
cheers,
jamal
On Sat, Mar 11, 2006 at 08:30:49AM -0500, jamal wrote:
> On Fri, 2006-10-03 at 22:09 +0530, Balbir Singh wrote:
> > On Fri, Mar 10, 2006 at 09:53:53AM -0500, jamal wrote:
>
> > > On kernel->user (in the case of response to #a or async notifiation as
> > > in #b) you really dont need to specify the TG/PID since they appear in
> > > the STATS etc.
> >
> > I see your point now. I am looking at other users of netlink like
> > rtnetlink and I see the classical usage.
> >
> > We can implement TLV's in our code, but for the most part the data we exchange
> > between the user <-> kernel has all the TLV's listed in the enum above.
> >
> > The major differnece is the type (pid/tgid). Hence we created a structure
> > (taskstats) instead of using TLV's.
>
> Something to remember:
>
> 1) TLVs are essentially giving you the flexibility to send optionally
> appearing elements. It is up to the receiver (in the kernel or user
> space) to check for the presence of mandatory elements or execute things
> depending on the presence of certain TLVs. Example in your case:
> if the tgid TLV appears then the user is requesting for that TLV
> if the pid appears then they are requesting for that
> if both appear then it is the && of the two.
> You should always ignore TLVs you dont understand - to allow for forward
> compatibility.
>
> 2) The "T" part is essentially also encoding (semantically) what size
> the value is; the "L" part is useful for validation. So the receiver
> will always know what the size of the TLV is by definition and uses the
> L to make sure it is the right size. Reject what is of the wrong size.
>
> cheers,
> jamal
Thanks for the clarification, I will try and adapt our genetlink to use
TLV's, I can see the benefits - we will work on this as an evolutionary change
to our code.
Warm Regards,
Balbir