2006-08-03 04:20:48

by Jay Lan

[permalink] [raw]
Subject: [patch 1/3] basic accounting over taskstats

Index: linux/include/linux/taskstats.h
===================================================================
--- linux.orig/include/linux/taskstats.h 2006-07-31 11:38:54.132326042 -0700
+++ linux/include/linux/taskstats.h 2006-08-02 19:17:02.155010656 -0700
@@ -2,6 +2,7 @@
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
* (C) Balbir Singh, IBM Corp. 2006
+ * (C) Jay Lan, SGI, 2006
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
@@ -29,16 +30,18 @@
* c) add new fields after version comment; maintain 64-bit alignment
*/

-#define TASKSTATS_VERSION 1
+
+#define TASKSTATS_VERSION 2
+#define TS_COMM_LEN 16 /* should sync up with TASK_COMM_LEN
+ * in linux/sched.h */

struct taskstats {

/* Version 1 */
__u16 version;
- __u16 padding[3]; /* Userspace should not interpret the padding
- * field which can be replaced by useful
- * fields if struct taskstats is extended.
- */
+ __u32 ac_exitcode; /* Exit status */
+ __u8 ac_flag; /* Record flags */
+ __u8 ac_nice; /* task_nice */

/* Delay accounting fields start
*
@@ -88,6 +91,22 @@ struct taskstats {
__u64 cpu_run_virtual_total;
/* Delay accounting fields end */
/* version 1 ends here */
+
+ /* Basic Accounting Fields start */
+ char ac_comm[TS_COMM_LEN]; /* Command name */
+ __u8 ac_sched; /* Scheduling discipline */
+ __u8 ac_pad[3];
+ __u32 ac_uid; /* User ID */
+ __u32 ac_gid; /* Group ID */
+ __u32 ac_pid; /* Process ID */
+ __u32 ac_ppid; /* Parent process ID */
+ __u32 ac_btime; /* Begin time [sec since 1970] */
+ __u64 ac_etime; /* Elapsed time [usec] */
+ __u64 ac_utime; /* User CPU time [usec] */
+ __u64 ac_stime; /* SYstem CPU time [usec] */
+ __u64 ac_minflt; /* Minor Page Fault */
+ __u64 ac_majflt; /* Major Page Fault */
+ /* Basic Accounting Fields end */
};


Index: linux/kernel/taskstats.c
===================================================================
--- linux.orig/kernel/taskstats.c 2006-07-31 11:38:54.160326367 -0700
+++ linux/kernel/taskstats.c 2006-08-02 19:17:02.155010656 -0700
@@ -18,6 +18,7 @@

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
+#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
@@ -198,7 +199,10 @@ static int fill_pid(pid_t pid, struct ta
*/

delayacct_add_tsk(stats, tsk);
+
+ /* fill in basic acct fields */
stats->version = TASKSTATS_VERSION;
+ bacct_add_tsk(stats, tsk);

/* Define err: label here if needed */
put_task_struct(tsk);
Index: linux/kernel/Makefile
===================================================================
--- linux.orig/kernel/Makefile 2006-08-02 14:52:08.000000000 -0700
+++ linux/kernel/Makefile 2006-08-02 16:56:23.029588787 -0700
@@ -49,7 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
-obj-$(CONFIG_TASKSTATS) += taskstats.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o

ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
Index: linux/kernel/tsacct.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux/kernel/tsacct.c 2006-08-02 19:17:20.859257731 -0700
@@ -0,0 +1,69 @@
+/*
+ * tsacct.c - System accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan, <[email protected]>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/acct.h>
+
+
+#define USEC_PER_TICK (USEC_PER_SEC/HZ)
+/*
+ * fill in basic accounting fields
+ */
+void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+ struct timespec uptime, ts;
+
+ BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
+
+ /* calculate task elapsed time in timespec */
+ do_posix_clock_monotonic_gettime(&uptime);
+ ts = timespec_sub(uptime, current->group_leader->start_time);
+ /* rebase elapsed time to usec */
+ stats->ac_etime = (timespec_to_ns(&ts))/NSEC_PER_USEC;
+ stats->ac_btime = xtime.tv_sec - ts.tv_sec;
+ if (thread_group_leader(tsk)) {
+ stats->ac_exitcode = tsk->exit_code;
+ if (tsk->flags & PF_FORKNOEXEC)
+ stats->ac_flag |= AFORK;
+ }
+ if (tsk->flags & PF_SUPERPRIV)
+ stats->ac_flag |= ASU;
+ if (tsk->flags & PF_DUMPCORE)
+ stats->ac_flag |= ACORE;
+ if (tsk->flags & PF_SIGNALED)
+ stats->ac_flag |= AXSIG;
+ stats->ac_nice = task_nice(tsk);
+ stats->ac_sched = tsk->policy;
+ stats->ac_uid = tsk->uid;
+ stats->ac_gid = tsk->gid;
+ stats->ac_pid = tsk->pid;
+ stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0;
+ stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
+ stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
+ stats->ac_minflt = tsk->min_flt;
+ stats->ac_majflt = tsk->maj_flt;
+ /* Each process gets a minimum of one usec cpu time */
+ if ((stats->ac_utime == 0) && (stats->ac_stime == 0)) {
+ stats->ac_stime = 1;
+ }
+
+ strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+}
+
Index: linux/include/linux/tsacct_kern.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux/include/linux/tsacct_kern.h 2006-08-02 19:17:02.155010656 -0700
@@ -0,0 +1,19 @@
+/*
+ * tsacct_kern.h - kernel header for system accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan SGI
+ */
+
+#ifndef _LINUX_TSACCT_KERN_H
+#define _LINUX_TSACCT_KERN_H
+
+#include <linux/taskstats.h>
+
+#ifdef CONFIG_TASKSTATS
+extern void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk);
+#else
+static inline void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{}
+#endif /* CONFIG_TASKSTATS */
+
+#endif


Attachments:
taskstats-rev2.patch (6.31 kB)

2006-08-03 06:52:33

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 1/3] basic accounting over taskstats

On Wed, 02 Aug 2006 21:20:53 -0700
Jay Lan <[email protected]> wrote:

> This patch is to replace the "[patch 1/3] add basic accounting
> fields to taskstats" posted on 7/31.
>
> This patch adds some basic accounting fields to the taskstats
> struct, add a new kernel/tsacct.c to handle basic accounting
> data handling upon exit. A handle is added to taskstats.c
> to invoke the basic accounting data handling.
>

> +#define TS_COMM_LEN 16 /* should sync up with TASK_COMM_LEN
> + * in linux/sched.h */

There was a proposal recently to increase TASK_COMM_LENGTH from 16 to 20 so
that it was long enough to hold an entire IEEE(?) UUID so that the
operator can easily match up a kernel thread with the storage device which
it manages.

I don't know if/when that change will happen, but the message is that
TASK_COMM_LENGTH may increase.

> + BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);

And if it does, we'll need to increase TS_COMM_LEN as well. That will
amount to an non-compatible change to the interface which you are
proposing. We want to avoid that.

Hence I'd propose that you increase TS_COMM_LEN to 32 or something and if
TASK_COMM_LEN later becomes really big, we might just choose to truncate
it.

Or we remove this field altogether, perhaps. The same info is available
from /proc/pid/stat anyway. Is it really needed?

2006-08-03 13:56:21

by Shailabh Nagar

[permalink] [raw]
Subject: Re: [patch 1/3] basic accounting over taskstats

Andrew Morton wrote:

>
> Or we remove this field altogether, perhaps. The same info is available
> from /proc/pid/stat anyway. Is it really needed?
>

Gathering this data in userspace from /proc might be
difficult esp. for short-lived tasks.

Also, /proc may not be mounted ? I'd heard somewhere that
some sysadmins don't install /proc for security reasons.
Don't know how far thats true.

Several other fields, totalling 58 bytes, added by the CSA
patches are also duplicated in /proc/pid/stat. But all of them
could change in value during the lifetime of a task so I'm
guessing its not useful to get them from /proc
even if some kind of userspace polling of the value was
possible.

But if there is a way, it would sure save a lot of payload
sent over taskstats !

"duplicate" fields from CSA:
+ __u8 ac_nice; /* task_nice */
+ char ac_comm[TS_COMM_LEN]; /* Command name */
+ __u8 ac_sched; /* Scheduling discipline */
+ __u32 ac_pid; /* Process ID */
+ __u32 ac_ppid; /* Parent process ID */
+ __u64 ac_utime; /* User CPU time [usec] */
+ __u64 ac_stime; /* SYstem CPU time [usec] */
+ __u64 ac_minflt; /* Minor Page Fault */
+ __u64 ac_majflt; /* Major Page Fault */

2006-08-03 18:48:49

by Jay Lan

[permalink] [raw]
Subject: Re: [patch 1/3] basic accounting over taskstats

Shailabh Nagar wrote:
> Andrew Morton wrote:
>
>>
>> Or we remove this field altogether, perhaps. The same info is available
>> from /proc/pid/stat anyway. Is it really needed?
>>
>
> Gathering this data in userspace from /proc might be
> difficult esp. for short-lived tasks.

This is a serious concern. I think increasing TS_COMM_LEN
to 32 would be a good solution.

>
> Also, /proc may not be mounted ? I'd heard somewhere that
> some sysadmins don't install /proc for security reasons.
> Don't know how far thats true.
>
> Several other fields, totalling 58 bytes, added by the CSA
> patches are also duplicated in /proc/pid/stat. But all of them
> could change in value during the lifetime of a task so I'm
> guessing its not useful to get them from /proc
> even if some kind of userspace polling of the value was
> possible.

The same concern above applies to here, doesn't it?

Regards,
- jay


>
> But if there is a way, it would sure save a lot of payload
> sent over taskstats !
>
> "duplicate" fields from CSA:
> + __u8 ac_nice; /* task_nice */
> + char ac_comm[TS_COMM_LEN]; /* Command name */
> + __u8 ac_sched; /* Scheduling discipline */
> + __u32 ac_pid; /* Process ID */
> + __u32 ac_ppid; /* Parent process ID */
> + __u64 ac_utime; /* User CPU time [usec] */
> + __u64 ac_stime; /* SYstem CPU time [usec] */
> + __u64 ac_minflt; /* Minor Page Fault */
> + __u64 ac_majflt; /* Major Page Fault */

2006-08-03 19:30:00

by Shailabh Nagar

[permalink] [raw]
Subject: Re: [patch 1/3] basic accounting over taskstats

Jay Lan wrote:
> Shailabh Nagar wrote:
>
>> Andrew Morton wrote:
>>
>>>
>>> Or we remove this field altogether, perhaps. The same info is available
>>> from /proc/pid/stat anyway. Is it really needed?
>>>
>>
>> Gathering this data in userspace from /proc might be
>> difficult esp. for short-lived tasks.
>
>
> This is a serious concern. I think increasing TS_COMM_LEN
> to 32 would be a good solution.
>
>>
>> Also, /proc may not be mounted ? I'd heard somewhere that
>> some sysadmins don't install /proc for security reasons.
>> Don't know how far thats true.
>>
>> Several other fields, totalling 58 bytes, added by the CSA
>> patches are also duplicated in /proc/pid/stat. But all of them
>> could change in value during the lifetime of a task so I'm
>> guessing its not useful to get them from /proc
>> even if some kind of userspace polling of the value was
>> possible.
>
>
> The same concern above applies to here, doesn't it?

Yes. For some of the fields, pid/ppid/nice/sched you may not
really care about whether you get the value present sometime
during the lifetime vs. value at the time task exited, but for
others like utime/stime/minflt/majflt, getting the last value
is likely to matter. Either way, since there's no guarantee
that you can poll a short-lived task fast enough to get any value,
exporting the value from the kernel at exit seems to be the only safe
way.

--Shailabh

>
> Regards,
> - jay
>
>
>>
>> But if there is a way, it would sure save a lot of payload
>> sent over taskstats !
>>
>> "duplicate" fields from CSA:
>> + __u8 ac_nice; /* task_nice */
>> + char ac_comm[TS_COMM_LEN]; /* Command name */
>> + __u8 ac_sched; /* Scheduling discipline */
>> + __u32 ac_pid; /* Process ID */
>> + __u32 ac_ppid; /* Parent process ID */
>> + __u64 ac_utime; /* User CPU time [usec] */
>> + __u64 ac_stime; /* SYstem CPU time [usec] */
>> + __u64 ac_minflt; /* Minor Page Fault */
>> + __u64 ac_majflt; /* Major Page Fault */
>
>