Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751326AbWC3Aw0 (ORCPT ); Wed, 29 Mar 2006 19:52:26 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1751322AbWC3AwZ (ORCPT ); Wed, 29 Mar 2006 19:52:25 -0500 Received: from mtagate2.uk.ibm.com ([195.212.29.135]:6082 "EHLO mtagate2.uk.ibm.com") by vger.kernel.org with ESMTP id S1751320AbWC3AwY (ORCPT ); Wed, 29 Mar 2006 19:52:24 -0500 Message-ID: <442B2BB6.9020309@watson.ibm.com> Date: Wed, 29 Mar 2006 19:52:06 -0500 From: Shailabh Nagar User-Agent: Debian Thunderbird 1.0.2 (X11/20051002) X-Accept-Language: en-us, en MIME-Version: 1.0 To: linux-kernel CC: netdev , Thomas Graf , Jamal Subject: [Patch 5/8] generic netlink interface for delay accounting References: <442B271D.10208@watson.ibm.com> In-Reply-To: <442B271D.10208@watson.ibm.com> Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 17333 Lines: 594 delayacct-genetlink.patch Create a generic netlink interface (NETLINK_GENERIC family), called "taskstats", for getting delay and cpu statistics of tasks and thread groups during their lifetime and when they exit. Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh include/linux/delayacct.h | 11 + include/linux/taskstats.h | 113 +++++++++++++++++ init/Kconfig | 16 ++ kernel/Makefile | 1 kernel/delayacct.c | 44 ++++++ kernel/taskstats.c | 292 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 474 insertions(+), 3 deletions(-) Index: linux-2.6.16/include/linux/delayacct.h =================================================================== --- linux-2.6.16.orig/include/linux/delayacct.h 2006-03-29 18:13:13.000000000 -0500 +++ linux-2.6.16/include/linux/delayacct.h 2006-03-29 18:13:18.000000000 -0500 @@ -18,6 +18,7 @@ #define _LINUX_TASKDELAYS_H #include +#include #ifdef CONFIG_TASK_DELAY_ACCT extern int delayacct_on; /* Delay accounting turned on/off */ @@ -27,6 +28,7 @@ extern void __delayacct_tsk_init(struct extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(void); +extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -64,4 +66,13 @@ static inline void delayacct_blkio_start static inline void delayacct_blkio_end(void) {} #endif /* CONFIG_TASK_DELAY_ACCT */ +#ifdef CONFIG_TASKSTATS +static inline int delayacct_add_tsk(struct taskstats *d, + struct task_struct *tsk) +{ + if (!tsk->delays) + return -EINVAL; + return __delayacct_add_tsk(d, tsk); +} +#endif #endif /* _LINUX_TASKDELAYS_H */ Index: linux-2.6.16/include/linux/taskstats.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.16/include/linux/taskstats.h 2006-03-29 18:13:18.000000000 -0500 @@ -0,0 +1,113 @@ +/* taskstats.h - exporting per-task statistics + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef _LINUX_TASKSTATS_H +#define _LINUX_TASKSTATS_H + +/* Format for per-task data returned to userland when + * - a task exits + * - listener requests stats for a task + * + * The struct is versioned. Newer versions should only add fields to + * the bottom of the struct to maintain backward compatibility. + * + * To create the next version, bump up the taskstats_version variable + * and delineate the start of newly added fields with a comment indicating + * the version number. + */ + +#define TASKSTATS_VERSION 1 +#define TASKSTATS_NOPID -1 + +struct taskstats { + /* Maintain 64-bit alignment while extending */ + /* Version 1 */ + + /* XXX_count is number of delay values recorded. + * XXX_total is corresponding cumulative delay in nanoseconds + */ + +#define TASKSTATS_NOCPUSTATS 1 + __u64 cpu_count; + __u64 cpu_delay_total; /* wait, while runnable, for cpu */ + __u64 blkio_count; + __u64 blkio_delay_total; /* sync,block io completion wait*/ + __u64 swapin_count; + __u64 swapin_delay_total; /* swapin page fault wait*/ + + __u64 cpu_run_total; /* cpu running time + * no count available/provided */ +}; + + +#define TASKSTATS_LISTEN_GROUP 0x1 + +/* + * Commands sent from userspace + * Not versioned. New commands should only be inserted at the enum's end + */ + +enum { + TASKSTATS_CMD_UNSPEC = 0, /* Reserved */ + TASKSTATS_CMD_GET, /* user->kernel request/get-response */ + TASKSTATS_CMD_NEW, /* kernel->user event */ + __TASKSTATS_CMD_MAX, +}; + +#define TASKSTATS_CMD_MAX (__TASKSTATS_CMD_MAX - 1) + +enum { + TASKSTATS_TYPE_UNSPEC = 0, /* Reserved */ + TASKSTATS_TYPE_PID, /* Process id */ + TASKSTATS_TYPE_TGID, /* Thread group id */ + TASKSTATS_TYPE_STATS, /* taskstats structure */ + TASKSTATS_TYPE_AGGR_PID, /* contains pid + stats */ + TASKSTATS_TYPE_AGGR_TGID, /* contains tgid + stats */ + __TASKSTATS_TYPE_MAX, +}; + +#define TASKSTATS_TYPE_MAX (__TASKSTATS_TYPE_MAX - 1) + +enum { + TASKSTATS_CMD_ATTR_UNSPEC = 0, + TASKSTATS_CMD_ATTR_PID, + TASKSTATS_CMD_ATTR_TGID, + __TASKSTATS_CMD_ATTR_MAX, +}; + +#define TASKSTATS_CMD_ATTR_MAX (__TASKSTATS_CMD_ATTR_MAX - 1) + +/* NETLINK_GENERIC related info */ + +#define TASKSTATS_GENL_NAME "TASKSTATS" +#define TASKSTATS_GENL_VERSION 0x1 + +#ifdef __KERNEL__ + +#include + +enum { + TASKSTATS_MSG_UNICAST, /* send data only to requester */ + TASKSTATS_MSG_MULTICAST, /* send data to a group */ +}; + +#ifdef CONFIG_TASKSTATS +extern void taskstats_exit_pid(struct task_struct *); +#else +static inline void taskstats_exit_pid(struct task_struct *tsk) +{} +#endif + +#endif /* __KERNEL__ */ +#endif /* _LINUX_TASKSTATS_H */ Index: linux-2.6.16/init/Kconfig =================================================================== --- linux-2.6.16.orig/init/Kconfig 2006-03-29 18:12:57.000000000 -0500 +++ linux-2.6.16/init/Kconfig 2006-03-29 18:13:18.000000000 -0500 @@ -158,11 +158,21 @@ config TASK_DELAY_ACCT in pages. Such statistics can help in setting a task's priorities relative to other tasks for cpu, io, rss limits etc. - Unlike BSD process accounting, this information is available - continuously during the lifetime of a task. - Say N if unsure. +config TASKSTATS + bool "Export task/process statistics through netlink (EXPERIMENTAL)" + depends on TASK_DELAY_ACCT + default y + help + Export selected statistics for tasks/processes through the + generic netlink interface. Unlike BSD process accounting, the + statistics are available during the lifetime of tasks/processes as + responses to commands. Like BSD accounting, they are sent to user + space on task exit. + + Say Y if unsure. + config SYSCTL bool "Sysctl support" ---help--- Index: linux-2.6.16/kernel/delayacct.c =================================================================== --- linux-2.6.16.orig/kernel/delayacct.c 2006-03-29 18:13:13.000000000 -0500 +++ linux-2.6.16/kernel/delayacct.c 2006-03-29 18:13:18.000000000 -0500 @@ -1,6 +1,7 @@ /* delayacct.c - per-task delay accounting * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * Copyright (C) Balbir Singh, IBM Corp. 2006 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,9 +19,12 @@ #include #include #include +#include +#include int delayacct_on __read_mostly = 0; /* Delay accounting turned on/off */ kmem_cache_t *delayacct_cache; +static DEFINE_MUTEX(delayacct_exit_mutex); static int __init delayacct_setup_enable(char *str) { @@ -50,10 +54,16 @@ void __delayacct_tsk_init(struct task_st void __delayacct_tsk_exit(struct task_struct *tsk) { + /* + * Protect against racing thread group exits + */ + mutex_lock(&delayacct_exit_mutex); + taskstats_exit_pid(tsk); if (tsk->delays) { kmem_cache_free(delayacct_cache, tsk->delays); tsk->delays = NULL; } + mutex_unlock(&delayacct_exit_mutex); } /* @@ -108,3 +118,37 @@ void __delayacct_blkio_end(void) ¤t->delays->blkio_delay, ¤t->delays->blkio_count); } +#ifdef CONFIG_TASKSTATS +int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +{ + nsec_t tmp; + struct timespec ts; + unsigned long t1,t2; + + /* zero XXX_total,non-zero XXX_count implies XXX stat overflowed */ + + tmp = (nsec_t)d->cpu_run_total ; + tmp += (u64)(tsk->utime+tsk->stime)*TICK_NSEC; + d->cpu_run_total = (tmp < (nsec_t)d->cpu_run_total)? 0: tmp; + + /* No locking available for sched_info. Take snapshot first. */ + t1 = tsk->sched_info.pcnt; + t2 = tsk->sched_info.run_delay; + + d->cpu_count += t1; + + jiffies_to_timespec(t2, &ts); + tmp = (nsec_t)d->cpu_delay_total + timespec_to_ns(&ts); + d->cpu_delay_total = (tmp < (nsec_t)d->cpu_delay_total)? 0: tmp; + + spin_lock(&tsk->delays->lock); + tmp = d->blkio_delay_total + tsk->delays->blkio_delay; + d->blkio_delay_total = (tmp < d->blkio_delay_total)? 0: tmp; + tmp = d->swapin_delay_total + tsk->delays->swapin_delay; + d->swapin_delay_total = (tmp < d->swapin_delay_total)? 0: tmp; + d->blkio_count += tsk->delays->blkio_count; + d->swapin_count += tsk->delays->swapin_count; + spin_unlock(&tsk->delays->lock); + return 0; +} +#endif /* CONFIG_TASKSTATS */ Index: linux-2.6.16/kernel/Makefile =================================================================== --- linux-2.6.16.orig/kernel/Makefile 2006-03-29 18:12:57.000000000 -0500 +++ linux-2.6.16/kernel/Makefile 2006-03-29 18:13:18.000000000 -0500 @@ -35,6 +35,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o +obj-$(CONFIG_TASKSTATS) += taskstats.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is Index: linux-2.6.16/kernel/taskstats.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.16/kernel/taskstats.c 2006-03-29 18:13:18.000000000 -0500 @@ -0,0 +1,292 @@ +/* + * taskstats.c - Export per-task statistics to userland + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include + +const int taskstats_version = TASKSTATS_VERSION; +static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; +static int family_registered = 0; + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, +}; + +static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] __read_mostly = { + [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, +}; + + +static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, + void **replyp, size_t size) +{ + struct sk_buff *skb; + void *reply; + + /* + * If new attributes are added, please revisit this allocation + */ + skb = nlmsg_new(size); + if (!skb) + return -ENOMEM; + + if (!info) { + int seq = get_cpu_var(taskstats_seqnum)++; + put_cpu_var(taskstats_seqnum); + + reply = genlmsg_put(skb, 0, seq, + family.id, 0, 0, + cmd, family.version); + } else + reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, + family.id, 0, 0, + cmd, family.version); + if (reply == NULL) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + *replyp = reply; + return 0; +} + +static int send_reply(struct sk_buff *skb, pid_t pid, int event) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *reply; + int rc; + + reply = genlmsg_data(genlhdr); + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + if (event == TASKSTATS_MSG_MULTICAST) + return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); + return genlmsg_unicast(skb, pid); +} + +static inline int fill_pid(pid_t pid, struct task_struct *pidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk = pidtsk; + + if (!pidtsk) { + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(tsk); + read_unlock(&tasklist_lock); + } else + get_task_struct(tsk); + + rc = delayacct_add_tsk(stats, tsk); + put_task_struct(tsk); + + return rc; + +} + +static inline int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk, *first; + + first = tgidtsk; + read_lock(&tasklist_lock); + if (!first) { + first = find_task_by_pid(tgid); + if (!first) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + } + tsk = first; + do { + rc = delayacct_add_tsk(stats, tsk); + if (rc) + break; + } while_each_thread(first, tsk); + read_unlock(&tasklist_lock); + + return rc; +} + +static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + struct sk_buff *rep_skb; + struct taskstats stats; + void *reply; + size_t size; + struct nlattr *na; + + /* + * Size includes space for nested attribute as well + * The returned data is of the format + * TASKSTATS_TYPE_AGGR_PID/TGID + * --> TASKSTATS_TYPE_PID/TGID + * --> TASKSTATS_TYPE_STATS + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + memset(&stats, 0, sizeof(stats)); + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + return rc; + + if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + rc = fill_pid((pid_t)pid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); + } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { + u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + rc = fill_tgid((pid_t)tgid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); + } else { + rc = -EINVAL; + goto err; + } + + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, stats); + nla_nest_end(rep_skb, na); + + return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); + +nla_put_failure: + return genlmsg_cancel(rep_skb, reply); +err: + nlmsg_free(rep_skb); + return rc; +} + + +/* Send pid data out on exit */ +void taskstats_exit_pid(struct task_struct *tsk) +{ + int rc = 0; + struct sk_buff *rep_skb; + void *reply; + struct taskstats stats; + size_t size; + int is_thread_group = !thread_group_empty(tsk); + struct nlattr *na; + + /* + * tasks can start to exit very early. Ensure that the family + * is registered before notifications are sent out + */ + if (!family_registered) + return; + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + if (is_thread_group) + size = 2 * size; // PID + STATS + TGID + STATS + + memset(&stats, 0, sizeof(stats)); + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + return; + + rc = fill_pid(tsk->pid, tsk, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, stats); + nla_nest_end(rep_skb, na); + + if (!is_thread_group) { + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + return; + } + + memset(&stats, 0, sizeof(stats)); + rc = fill_tgid(tsk->tgid, tsk, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, stats); + nla_nest_end(rep_skb, na); + + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + return; + +nla_put_failure: + genlmsg_cancel(rep_skb, reply); + return; +err: + nlmsg_free(rep_skb); +} + +static struct genl_ops taskstats_ops = { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_send_stats, + .policy = taskstats_cmd_get_policy, +}; + +static int __init taskstats_init(void) +{ + if (genl_register_family(&family)) + return -EFAULT; + family_registered = 1; + + if (genl_register_ops(&family, &taskstats_ops)) + goto err; + + return 0; +err: + genl_unregister_family(&family); + family_registered = 0; + return -EFAULT; +} + +late_initcall(taskstats_init); - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/