Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S262176AbVEESn7 (ORCPT ); Thu, 5 May 2005 14:43:59 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S262172AbVEESn7 (ORCPT ); Thu, 5 May 2005 14:43:59 -0400 Received: from c-24-22-18-178.hsd1.or.comcast.net ([24.22.18.178]:41873 "EHLO w-gerrit.beaverton.ibm.com") by vger.kernel.org with ESMTP id S262176AbVEES3K (ORCPT ); Thu, 5 May 2005 14:29:10 -0400 Message-Id: <20050505180928.689701000@us.ibm.com> References: <20050505180731.010896000@us.ibm.com> Date: Thu, 05 May 2005 11:07:33 -0700 To: linux-kernel@vger.kernel.org, ckrm-tech@lists.sourceforge.net Subject: [patch 02/21] CKRM: Processor Delay Accounting From: gh@us.ibm.com Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 14068 Lines: 420 -- Content-Disposition: inline; filename=02-diff_delay_acct CKRM processor scheduling delay accounting - provides a mechanism to record the delays experienced by a task. In addition to counting frequency, the total delay in ns is also recorded. CPU delays are specified as cpu-wait and cpu-run. I/O delays are recorded for memory and regular I/O. Information is accessible through /proc/<pid>/delay. 
Signed-Off-By: Chandra Seetharaman Signed-Off-By: Hubertus Franke Signed-Off-By: Shailabh Nagar Signed-Off-By: Gerrit Huizenga fs/proc/array.c | 18 +++++++++ fs/proc/base.c | 17 ++++++++ fs/proc/internal.h | 1 include/linux/sched.h | 89 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/taskdelays.h | 35 +++++++++++++++++ init/Kconfig | 8 ++++ kernel/fork.c | 1 kernel/sched.c | 20 ++++++++++ mm/memory.c | 9 ++++ 9 files changed, 197 insertions(+), 1 deletion(-) Index: linux-2.6.12-rc3-ckrm5/fs/proc/array.c =================================================================== --- linux-2.6.12-rc3-ckrm5.orig/fs/proc/array.c 2005-05-05 09:32:56.000000000 -0700 +++ linux-2.6.12-rc3-ckrm5/fs/proc/array.c 2005-05-05 09:35:02.000000000 -0700 @@ -482,3 +482,21 @@ int proc_pid_statm(struct task_struct *t return sprintf(buffer,"%d %d %d %d %d %d %d\n", size, resident, shared, text, lib, data, 0); } + + +int proc_pid_delay(struct task_struct *task, char * buffer) +{ + int res; + + res = sprintf(buffer,"%u %llu %llu %u %llu %u %llu\n", + (unsigned int) get_delay(task,runs), + (uint64_t) get_delay(task,runcpu_total), + (uint64_t) get_delay(task,waitcpu_total), + (unsigned int) get_delay(task,num_iowaits), + (uint64_t) get_delay(task,iowait_total), + (unsigned int) get_delay(task,num_memwaits), + (uint64_t) get_delay(task,mem_iowait_total) + ); + return res; +} + Index: linux-2.6.12-rc3-ckrm5/fs/proc/base.c =================================================================== --- linux-2.6.12-rc3-ckrm5.orig/fs/proc/base.c 2005-05-05 09:32:56.000000000 -0700 +++ linux-2.6.12-rc3-ckrm5/fs/proc/base.c 2005-05-05 09:35:02.000000000 -0700 @@ -120,6 +120,10 @@ enum pid_directory_inos { #ifdef CONFIG_AUDITSYSCALL PROC_TID_LOGINUID, #endif +#ifdef CONFIG_DELAY_ACCT + PROC_TID_DELAY_ACCT, + PROC_TGID_DELAY_ACCT, +#endif PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ PROC_TID_OOM_SCORE, PROC_TID_OOM_ADJUST, @@ -155,6 +159,9 @@ static struct pid_entry tgid_base_stuff[ #ifdef 
CONFIG_SECURITY E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO), #endif +#ifdef CONFIG_DELAY_ACCT + E(PROC_TGID_DELAY_ACCT,"delay", S_IFREG|S_IRUGO), +#endif #ifdef CONFIG_KALLSYMS E(PROC_TGID_WCHAN, "wchan", S_IFREG|S_IRUGO), #endif @@ -191,6 +198,9 @@ static struct pid_entry tid_base_stuff[] #ifdef CONFIG_SECURITY E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO), #endif +#ifdef CONFIG_DELAY_ACCT + E(PROC_TGID_DELAY_ACCT,"delay", S_IFREG|S_IRUGO), +#endif #ifdef CONFIG_KALLSYMS E(PROC_TID_WCHAN, "wchan", S_IFREG|S_IRUGO), #endif @@ -1564,6 +1574,13 @@ static struct dentry *proc_pident_lookup ei->op.proc_read = proc_pid_wchan; break; #endif +#ifdef CONFIG_DELAY_ACCT + case PROC_TID_DELAY_ACCT: + case PROC_TGID_DELAY_ACCT: + inode->i_fop = &proc_info_file_operations; + ei->op.proc_read = proc_pid_delay; + break; +#endif #ifdef CONFIG_SCHEDSTATS case PROC_TID_SCHEDSTAT: case PROC_TGID_SCHEDSTAT: Index: linux-2.6.12-rc3-ckrm5/fs/proc/internal.h =================================================================== --- linux-2.6.12-rc3-ckrm5.orig/fs/proc/internal.h 2005-03-01 23:37:48.000000000 -0800 +++ linux-2.6.12-rc3-ckrm5/fs/proc/internal.h 2005-05-05 09:35:02.000000000 -0700 @@ -36,6 +36,7 @@ extern int proc_tid_stat(struct task_str extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern int proc_pid_delay(struct task_struct *, char*); static inline struct task_struct *proc_task(struct inode *inode) { Index: linux-2.6.12-rc3-ckrm5/include/linux/sched.h =================================================================== --- linux-2.6.12-rc3-ckrm5.orig/include/linux/sched.h 2005-05-05 09:33:00.000000000 -0700 +++ linux-2.6.12-rc3-ckrm5/include/linux/sched.h 2005-05-05 09:35:02.000000000 -0700 @@ -34,6 +34,7 @@ #include #include #include +#include struct exec_domain; @@ -737,6 +738,9 @@ struct task_struct { nodemask_t mems_allowed; int 
cpuset_mems_generation; #endif +#ifdef CONFIG_DELAY_ACCT + struct task_delay_info delays; +#endif }; static inline pid_t process_group(struct task_struct *tsk) @@ -1033,6 +1037,9 @@ task_t *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); extern void get_task_comm(char *to, struct task_struct *tsk); +#define PF_MEMIO 0x00400000 /* I am potentially doing I/O for mem */ +#define PF_IOWAIT 0x00800000 /* I am waiting on disk I/O */ + #ifdef CONFIG_SMP extern void wait_task_inactive(task_t * p); #else @@ -1267,6 +1274,88 @@ static inline int try_to_freeze(unsigned return 0; } #endif /* CONFIG_PM */ + +/* API for registering delay info */ +#ifdef CONFIG_DELAY_ACCT + +#define test_delay_flag(tsk,flg) ((tsk)->flags & (flg)) +#define set_delay_flag(tsk,flg) ((tsk)->flags |= (flg)) +#define clear_delay_flag(tsk,flg) ((tsk)->flags &= ~(flg)) + +#define def_delay_var(var) unsigned long long var +#define get_delay(tsk,field) ((tsk)->delays.field) + +#define start_delay(var) ((var) = sched_clock()) +#define start_delay_set(var,flg) (set_delay_flag(current,flg),(var) = \ + sched_clock()) + +#define inc_delay(tsk,field) (((tsk)->delays.field)++) + +/* Because of hardware timer drift on SMP systems, a task may continue on a + * different cpu than the one where start_ts was taken, so there is a possibility that + * end_ts < start_ts by some usecs. In this case we ignore the diff + * and add nothing to the total. 
+ */ +#ifdef CONFIG_SMP +#define test_ts_integrity(start_ts,end_ts) (likely((end_ts) > (start_ts))) +#else +#define test_ts_integrity(start_ts,end_ts) (1) +#endif + +#define add_delay_ts(tsk,field,start_ts,end_ts) \ + do { if (test_ts_integrity(start_ts,end_ts)) (tsk)->delays.field += ((end_ts)-(start_ts)); } while (0) + +#define add_delay_clear(tsk,field,start_ts,flg) \ + do { \ + unsigned long long now = sched_clock(); \ + add_delay_ts(tsk,field,start_ts,now); \ + clear_delay_flag(tsk,flg); \ + } while (0) + +static inline void add_io_delay(unsigned long long dstart) +{ + struct task_struct * tsk = current; + unsigned long long now = sched_clock(); + unsigned long long val; + + if (test_ts_integrity(dstart,now)) + val = now - dstart; + else + val = 0; + if (test_delay_flag(tsk,PF_MEMIO)) { + tsk->delays.mem_iowait_total += val; + tsk->delays.num_memwaits++; + } else { + tsk->delays.iowait_total += val; + tsk->delays.num_iowaits++; + } + clear_delay_flag(tsk,PF_IOWAIT); +} + +inline static void init_delays(struct task_struct *tsk) +{ + memset((void*)&tsk->delays,0,sizeof(tsk->delays)); +} + +#else + +#define test_delay_flag(tsk,flg) (0) +#define set_delay_flag(tsk,flg) do { } while (0) +#define clear_delay_flag(tsk,flg) do { } while (0) + +#define def_delay_var(var) +#define get_delay(tsk,field) (0) + +#define start_delay(var) do { } while (0) +#define start_delay_set(var,flg) do { } while (0) + +#define inc_delay(tsk,field) do { } while (0) +#define add_delay_ts(tsk,field,start_ts,now) do { } while (0) +#define add_delay_clear(tsk,field,start_ts,flg) do { } while (0) +#define add_io_delay(dstart) do { } while (0) +#define init_delays(tsk) do { } while (0) +#endif + #endif /* __KERNEL__ */ #endif Index: linux-2.6.12-rc3-ckrm5/include/linux/taskdelays.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.12-rc3-ckrm5/include/linux/taskdelays.h 2005-05-05 09:35:02.000000000 -0700 @@ -0,0 
+1,35 @@ +/* taskdelays.h - for delay accounting + * + * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004 + * + * Has the data structure for delay counting. + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef _LINUX_TASKDELAYS_H +#define _LINUX_TASKDELAYS_H + +#include +#include + +struct task_delay_info { + /* delay statistics in nsecs (accumulated sched_clock() deltas) */ + uint64_t waitcpu_total; + uint64_t runcpu_total; + uint64_t iowait_total; + uint64_t mem_iowait_total; + uint32_t runs; + uint32_t num_iowaits; + uint32_t num_memwaits; +}; + +#endif /* _LINUX_TASKDELAYS_H */ Index: linux-2.6.12-rc3-ckrm5/init/Kconfig =================================================================== --- linux-2.6.12-rc3-ckrm5.orig/init/Kconfig 2005-05-05 09:34:55.000000000 -0700 +++ linux-2.6.12-rc3-ckrm5/init/Kconfig 2005-05-05 09:35:02.000000000 -0700 @@ -261,6 +261,14 @@ menuconfig EMBEDDED environments which can tolerate a "non-standard" kernel. Only use this if you really know what you are doing. +config DELAY_ACCT + bool "Enable delay accounting (EXPERIMENTAL)" + help + In addition to counting frequency the total delay in ns is also + recorded. CPU delays are specified as cpu-wait and cpu-run. + I/O delays are recorded for memory and regular I/O. + Information is accessible through /proc/<pid>/delay. 
+ config KALLSYMS bool "Load all symbols for debugging/kksymoops" if EMBEDDED default y Index: linux-2.6.12-rc3-ckrm5/kernel/fork.c =================================================================== --- linux-2.6.12-rc3-ckrm5.orig/kernel/fork.c 2005-05-05 09:34:55.000000000 -0700 +++ linux-2.6.12-rc3-ckrm5/kernel/fork.c 2005-05-05 09:35:02.000000000 -0700 @@ -901,6 +901,7 @@ static task_t *copy_process(unsigned lon if (p->binfmt && !try_module_get(p->binfmt->module)) goto bad_fork_cleanup_put_domain; + init_delays(p); p->did_exec = 0; copy_flags(clone_flags, p); p->pid = pid; Index: linux-2.6.12-rc3-ckrm5/kernel/sched.c =================================================================== --- linux-2.6.12-rc3-ckrm5.orig/kernel/sched.c 2005-05-05 09:33:00.000000000 -0700 +++ linux-2.6.12-rc3-ckrm5/kernel/sched.c 2005-05-05 09:35:02.000000000 -0700 @@ -268,6 +268,8 @@ static DEFINE_PER_CPU(struct runqueue, r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define task_is_running(p) (this_rq() == task_rq(p)) + /* * Default context-switch locking: */ @@ -2749,6 +2751,7 @@ switch_tasks: update_cpu_clock(prev, rq, now); + add_delay_ts(prev, runcpu_total, prev->timestamp, now); prev->sleep_avg -= run_time; if ((long)prev->sleep_avg <= 0) prev->sleep_avg = 0; @@ -2756,6 +2759,8 @@ switch_tasks: sched_info_switch(prev, next); if (likely(prev != next)) { + add_delay_ts(next, waitcpu_total, next->timestamp, now); + inc_delay(next, runs); next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -3799,9 +3804,12 @@ void __sched io_schedule(void) { struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + def_delay_var(dstart); + start_delay_set(dstart, PF_IOWAIT); atomic_inc(&rq->nr_iowait); schedule(); atomic_dec(&rq->nr_iowait); + add_io_delay(dstart); } EXPORT_SYMBOL(io_schedule); @@ -3810,10 +3818,13 @@ long __sched io_schedule_timeout(long ti { struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); long ret; + 
def_delay_var(dstart); + start_delay_set(dstart,PF_IOWAIT); atomic_inc(&rq->nr_iowait); ret = schedule_timeout(timeout); atomic_dec(&rq->nr_iowait); + add_io_delay(dstart); return ret; } @@ -5002,3 +5013,12 @@ void normalize_rt_tasks(void) } #endif /* CONFIG_MAGIC_SYSRQ */ + +#ifdef CONFIG_DELAY_ACCT +int task_running_sys(struct task_struct *p) +{ + return task_is_running(p); +} +EXPORT_SYMBOL_GPL(task_running_sys); +#endif + Index: linux-2.6.12-rc3-ckrm5/mm/memory.c =================================================================== --- linux-2.6.12-rc3-ckrm5.orig/mm/memory.c 2005-05-05 09:33:00.000000000 -0700 +++ linux-2.6.12-rc3-ckrm5/mm/memory.c 2005-05-05 09:35:02.000000000 -0700 @@ -2031,6 +2031,7 @@ int handle_mm_fault(struct mm_struct *mm pud_t *pud; pmd_t *pmd; pte_t *pte; + int rc; __set_current_state(TASK_RUNNING); @@ -2044,6 +2045,9 @@ int handle_mm_fault(struct mm_struct *mm * and the SMP-safe atomic PTE updates. */ pgd = pgd_offset(mm, address); + + set_delay_flag(current, PF_MEMIO); + spin_lock(&mm->page_table_lock); pud = pud_alloc(mm, pgd, address); @@ -2058,10 +2062,13 @@ int handle_mm_fault(struct mm_struct *mm if (!pte) goto oom; - return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + rc = handle_pte_fault(mm, vma, address, write_access, pte, pmd); + clear_delay_flag(current, PF_MEMIO); + return rc; oom: spin_unlock(&mm->page_table_lock); + clear_delay_flag(current, PF_MEMIO); return VM_FAULT_OOM; } -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/